From 08f0dd27f766e198ae66ae8d7c6b6aac18d3e9d5 Mon Sep 17 00:00:00 2001
From: Luna
Date: Sat, 27 Aug 2022 23:56:24 -0300
Subject: [PATCH] add base build_db code

---
 .gitignore        |  4 +++
 README.md         | 14 ++++++++-
 build_database.py | 75 +++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt  |  3 ++
 4 files changed, 95 insertions(+), 1 deletion(-)
 create mode 100644 build_database.py
 create mode 100644 requirements.txt

diff --git a/.gitignore b/.gitignore
index 5d381cc..9b076fe 100644
--- a/.gitignore
+++ b/.gitignore
@@ -160,3 +160,7 @@ cython_debug/
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/

+
+*.csv
+*.csv.gz
+*.db
diff --git a/README.md b/README.md
index 0290b6e..5193d7b 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,15 @@
 # e621_api_cloner

-use e621 db dumps to create your own e621 api server without needing to be ratelimited for too much scraping (this is a very specific use case i promise i am useful to society)
\ No newline at end of file
+use e621 db dumps to create your own e621 api server without needing to be ratelimited for too much scraping (this is a very specific use case i promise i am useful to society)
+
+## how
+
+```sh
+python3 -m venv env
+env/bin/pip install -Ur requirements.txt
+
+# go to https://e621.net/db_export/ to find out the available db dumps
+# database file available on e621.db
+env/bin/python ./build_database.py 2022-08-27
+env/bin/python ./e621_api_cloner.py ./e621.db
+```
diff --git a/build_database.py b/build_database.py
new file mode 100644
index 0000000..6f521dc
--- /dev/null
+++ b/build_database.py
@@ -0,0 +1,75 @@
+import asyncio
+import logging
+import shutil
+import tempfile
+import sys
+from dataclasses import dataclass
+from urllib.parse import urlparse
+from pathlib import Path
+
+import aiohttp
+
+log = logging.getLogger(__name__)
+
+
+@dataclass
+class Context:
+    """Holds the aiohttp session used for the downloads."""
+
+    session: aiohttp.ClientSession
+
+
+async def main():
+    """Download the tags and posts exports for the date given in argv[1].
+
+    Each file is saved in the current working directory under its remote
+    name; a file that already exists there is not downloaded again.
+    """
+    if len(sys.argv) < 2:
+        raise SystemExit(f"usage: {sys.argv[0]} EXPORT_DATE (e.g. 2022-08-27)")
+
+    wanted_date = sys.argv[1]
+    urls = (
+        f"https://e621.net/db_export/tags-{wanted_date}.csv.gz",
+        f"https://e621.net/db_export/posts-{wanted_date}.csv.gz",
+    )
+
+    async with aiohttp.ClientSession() as session:
+        ctx = Context(session)
+
+        for url in urls:
+            parsed = urlparse(url)
+            parsed_path = Path(parsed.path)
+            output_path = Path.cwd() / parsed_path.name
+            if not output_path.exists():
+                log.info("downloading %r into %s", url, output_path)
+                async with ctx.session.get(url) as resp:
+                    # explicit check rather than assert, which python -O strips
+                    if resp.status != 200:
+                        raise RuntimeError(
+                            f"unexpected HTTP status {resp.status} for {url}"
+                        )
+
+                    # spool into a temp file first so a download that fails
+                    # midway does not leave a partial file at output_path
+                    with tempfile.TemporaryFile() as temp_fd:
+                        async for chunk in resp.content.iter_chunked(8192):
+                            temp_fd.write(chunk)
+
+                        # the writes left the position at end-of-file; rewind,
+                        # otherwise copyfileobj would copy zero bytes
+                        temp_fd.seek(0)
+
+                        # write to output
+                        log.info("copying temp to output")
+                        with output_path.open(mode="wb") as output_fd:
+                            shutil.copyfileobj(temp_fd, output_fd)
+            else:
+                log.info("file %s already exists, ignoring", output_path)
+
+    # now that everything is downloaded, compile the db
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)
+    asyncio.run(main())
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..7c15731
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+aiohttp==3.8.1
+aiosqlite==0.17.0
+Quart==0.18.0