add base build_db code

This commit is contained in:
Luna 2022-08-27 23:56:24 -03:00
parent 1ed3e60f16
commit 08f0dd27f7
4 changed files with 75 additions and 1 deletions

4
.gitignore vendored
View File

@ -160,3 +160,7 @@ cython_debug/
# option (not recommended) you can uncomment the following to ignore the entire idea folder. # option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/ #.idea/
*.csv
*.csv.gz
*.db

View File

@ -1,3 +1,15 @@
# e621_api_cloner # e621_api_cloner
use e621 db dumps to create your own e621 api server without needing to be ratelimited for too much scraping (this is a very specific use case i promise i am useful to society) use e621 db dumps to create your own e621 api server without needing to be ratelimited for too much scraping (this is a very specific use case i promise i am useful to society)
## how
```sh
python3 -m venv env
env/bin/pip install -Ur requirements.txt
# see https://e621.net/db_export/ for the list of available db dumps
# the resulting database file will be available at ./e621.db
env/bin/python ./build_database.py 2022-08-27
env/bin/python ./e621_api_cloner.py ./e621.db
```

55
build_database.py Normal file
View File

@ -0,0 +1,55 @@
import asyncio
import logging
import shutil
import tempfile
import sys
from dataclasses import dataclass
from urllib.parse import urlparse
from pathlib import Path
import aiohttp
log = logging.getLogger(__name__)
@dataclass
class Context:
    """Bundle of shared state handed around while the script runs."""

    # HTTP client session that every download request goes through.
    session: aiohttp.ClientSession
async def main():
    """Download the e621 database export files for one export date.

    The export date is read from the first command-line argument
    (e.g. ``2022-08-27``). The ``tags`` and ``posts`` ``.csv.gz`` exports
    for that date are saved into the current working directory under their
    remote file names. A file that already exists locally is not
    downloaded again.

    Raises:
        SystemExit: if no export date was given on the command line.
        RuntimeError: if the server replies with a status other than 200.
    """
    if len(sys.argv) < 2:
        raise SystemExit("usage: build_database.py <export date, e.g. 2022-08-27>")
    wanted_date = sys.argv[1]

    urls = (
        f"https://e621.net/db_export/tags-{wanted_date}.csv.gz",
        f"https://e621.net/db_export/posts-{wanted_date}.csv.gz",
    )

    async with aiohttp.ClientSession() as session:
        ctx = Context(session)

        for url in urls:
            parsed = urlparse(url)
            parsed_path = Path(parsed.path)
            output_path = Path.cwd() / parsed_path.name

            if output_path.exists():
                log.info("file %s already exists, ignoring", output_path)
                continue

            log.info("downloading %r into %s", url, output_path)
            async with ctx.session.get(url) as resp:
                # Explicit check instead of `assert`, which is removed
                # when Python runs with -O.
                if resp.status != 200:
                    raise RuntimeError(
                        f"unexpected HTTP status {resp.status} for {url!r}"
                    )

                # Stream into a temp file first so the output file is only
                # created once the whole response body has been received.
                with tempfile.TemporaryFile() as temp_fd:
                    async for chunk in resp.content.iter_chunked(8192):
                        temp_fd.write(chunk)

                    # Rewind: copyfileobj reads from the current position,
                    # which is end-of-file right after the writes above.
                    # Without this the output file would be empty.
                    temp_fd.seek(0)

                    # write to output
                    log.info("copying temp to output")
                    with output_path.open(mode="wb") as output_fd:
                        shutil.copyfileobj(temp_fd, output_fd)

    # TODO: now that everything's downloaded, compile the db
if __name__ == "__main__":
    # Script entry point: make INFO-level log messages visible, then
    # drive the async main() to completion.
    logging.basicConfig(level=logging.INFO)
    asyncio.run(main())

3
requirements.txt Normal file
View File

@ -0,0 +1,3 @@
aiohttp==3.8.1
aiosqlite==0.17.0
Quart==0.18.0