"""Download and decompress the e621 database exports for a given date.

Usage: pass the export date (as it appears in the export filenames,
e.g. ``2023-01-31``) as the first command-line argument.
"""

import asyncio
import gzip
import logging
import shutil
import sys
import tempfile
from dataclasses import dataclass
from pathlib import Path
from urllib.parse import urlparse

import aiohttp

log = logging.getLogger(__name__)

# Size of each streamed read from the HTTP response body.
CHUNK_SIZE = 8192


@dataclass
class Context:
    """Shared resources for one download run."""

    session: aiohttp.ClientSession


async def _download(ctx: Context, url: str, output_path: Path) -> None:
    """Stream *url* to *output_path*, logging percentage progress.

    The body is written to an anonymous temporary file first and only
    copied to *output_path* once complete, so an interrupted download
    never leaves a truncated file at the final location.

    Raises:
        aiohttp.ClientResponseError: if the server replies with a
            non-2xx status (was an ``assert`` before, which is stripped
            under ``python -O``).
        KeyError: if the response has no ``content-length`` header.
    """
    log.info("downloading %r into %s", url, output_path)
    async with ctx.session.get(url) as resp:
        resp.raise_for_status()
        total_length = int(resp.headers["content-length"])
        downloaded_bytes = 0
        download_ratio = 0
        log.info("to download %d bytes", total_length)
        with tempfile.TemporaryFile() as temp_fd:
            async for chunk in resp.content.iter_chunked(CHUNK_SIZE):
                temp_fd.write(chunk)
                downloaded_bytes += len(chunk)
                new_download_ratio = round(
                    (downloaded_bytes / total_length) * 100
                )
                if new_download_ratio != download_ratio:
                    # Log the *new* ratio; the original logged the stale
                    # one, so the first message always read "0%".
                    log.info("download at %d%%", new_download_ratio)
                    download_ratio = new_download_ratio
            temp_fd.seek(0)
            # write to output
            log.info("copying temp to output")
            with output_path.open(mode="wb") as output_fd:
                shutil.copyfileobj(temp_fd, output_fd)


def _decompress(input_path: Path, output_path: Path) -> None:
    """Gunzip *input_path* into *output_path* (streaming copy)."""
    log.info("decompressing %s into %s", input_path.name, output_path.name)
    with gzip.open(input_path, "rb") as in_fd:
        with output_path.open(mode="wb") as out_fd:
            shutil.copyfileobj(in_fd, out_fd)


async def main():
    """Fetch the tags/posts exports for the requested date and unpack them.

    Both the download and the decompression steps are idempotent: files
    that already exist in the current working directory are skipped.
    """
    if len(sys.argv) < 2:
        # Friendlier than the bare IndexError the original raised.
        raise SystemExit("usage: download_db_export.py <date, e.g. 2023-01-31>")
    wanted_date = sys.argv[1]
    urls = (
        f"https://e621.net/db_export/tags-{wanted_date}.csv.gz",
        f"https://e621.net/db_export/posts-{wanted_date}.csv.gz",
    )

    async with aiohttp.ClientSession() as session:
        ctx = Context(session)
        for url in urls:
            parsed_path = Path(urlparse(url).path)
            output_path = Path.cwd() / parsed_path.name
            if output_path.exists():
                log.info("file %s already exists, ignoring", output_path)
                continue
            await _download(ctx, url, output_path)

    # decompress
    for url in urls:
        parsed_path = Path(urlparse(url).path)
        input_path = Path.cwd() / parsed_path.name
        # "tags-<date>.csv.gz" -> ("tags-<date>", "csv", "gz")
        original_name, original_extension, _gz = parsed_path.name.split(".")
        output_path = Path.cwd() / f"{original_name}.{original_extension}"
        if output_path.exists():
            log.info("decompressed file %s already exists, ignoring", output_path)
            continue
        _decompress(input_path, output_path)

    # now that everythings downloaded, compile the db


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    asyncio.run(main())