add base build_db code
This commit is contained in:
parent
1ed3e60f16
commit
08f0dd27f7
4 changed files with 75 additions and 1 deletions
4
.gitignore
vendored
4
.gitignore
vendored
|
@ -160,3 +160,7 @@ cython_debug/
|
|||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
#.idea/
|
||||
|
||||
|
||||
*.csv
|
||||
*.csv.gz
|
||||
*.db
|
||||
|
|
12
README.md
12
README.md
|
@ -1,3 +1,15 @@
|
|||
# e621_api_cloner
|
||||
|
||||
Use e621 DB dumps to create your own e621 API server without being rate-limited for heavy scraping (this is a very specific use case, I promise I am useful to society).
|
||||
|
||||
## how
|
||||
|
||||
```sh
|
||||
python3 -m venv env
|
||||
env/bin/pip install -Ur requirements.txt
|
||||
|
||||
# go to https://e621.net/db_export/ to find out the available db dumps
|
||||
# the resulting database file will be available at e621.db
|
||||
env/bin/python ./build_database.py 2022-08-27
|
||||
env/bin/python ./e621_api_cloner.py ./e621.db
|
||||
```
|
||||
|
|
55
build_database.py
Normal file
55
build_database.py
Normal file
|
@ -0,0 +1,55 @@
|
|||
import asyncio
|
||||
import logging
|
||||
import shutil
|
||||
import tempfile
|
||||
import sys
|
||||
from dataclasses import dataclass
|
||||
from urllib.parse import urlparse
|
||||
from pathlib import Path
|
||||
|
||||
import aiohttp
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class Context:
    """Shared per-run state threaded through the download loop.

    Currently carries only the aiohttp session used for all HTTP
    requests, so a single connection pool is reused across downloads.
    """

    session: aiohttp.ClientSession
|
||||
|
||||
|
||||
async def main():
    """Download the e621 DB export dumps for a given date.

    Usage: ``build_database.py YYYY-MM-DD``

    Fetches the ``tags`` and ``posts`` CSV dumps from e621's
    ``/db_export/`` listing into the current working directory,
    skipping any file that already exists there.

    Raises:
        SystemExit: if no date argument was given.
        aiohttp.ClientResponseError: on a non-2xx HTTP response.
    """
    if len(sys.argv) < 2:
        raise SystemExit("usage: build_database.py <YYYY-MM-DD>")
    wanted_date = sys.argv[1]
    urls = (
        f"https://e621.net/db_export/tags-{wanted_date}.csv.gz",
        f"https://e621.net/db_export/posts-{wanted_date}.csv.gz",
    )

    async with aiohttp.ClientSession() as session:
        ctx = Context(session)

        for url in urls:
            parsed = urlparse(url)
            # Keep only the dump's filename; download next to the script's cwd.
            output_path = Path.cwd() / Path(parsed.path).name
            if output_path.exists():
                log.info("file %s already exists, ignoring", output_path)
                continue

            log.info("downloading %r into %s", url, output_path)
            async with ctx.session.get(url) as resp:
                # Fail loudly on HTTP errors. (An `assert` here would be
                # silently stripped when Python runs with -O.)
                resp.raise_for_status()

                # Spool the body to a temp file first so an interrupted
                # download never leaves a truncated file at output_path.
                with tempfile.TemporaryFile() as temp_fd:
                    async for chunk in resp.content.iter_chunked(8192):
                        temp_fd.write(chunk)

                    # BUG FIX: rewind before copying — after the writes the
                    # file position is at EOF, so copyfileobj would copy
                    # nothing and produce an empty output file.
                    temp_fd.seek(0)

                    # write to output
                    log.info("copying temp to output")
                    with output_path.open(mode="wb") as output_fd:
                        shutil.copyfileobj(temp_fd, output_fd)

        # now that everything's downloaded, compile the db
        # NOTE(review): DB compilation is not implemented in this commit.
||||
if __name__ == "__main__":
    # Configure root logging before running so the module-level logger's
    # download-progress messages are actually emitted.
    logging.basicConfig(level=logging.INFO)
    asyncio.run(main())
|
3
requirements.txt
Normal file
3
requirements.txt
Normal file
|
@ -0,0 +1,3 @@
|
|||
aiohttp==3.8.1
|
||||
aiosqlite==0.17.0
|
||||
Quart==0.18.0
|
Loading…
Reference in a new issue