# e621_api_cloner/build_database.py
import asyncio
import gzip
import logging
import shutil
import tempfile
import sys
from dataclasses import dataclass
from urllib.parse import urlparse
from pathlib import Path
import aiohttp
log = logging.getLogger(__name__)
@dataclass
class Context:
session: aiohttp.ClientSession
async def main():
wanted_date = sys.argv[1]
urls = (
f"https://e621.net/db_export/tags-{wanted_date}.csv.gz",
f"https://e621.net/db_export/posts-{wanted_date}.csv.gz",
)
async with aiohttp.ClientSession() as session:
ctx = Context(session)
for url in urls:
parsed = urlparse(url)
parsed_path = Path(parsed.path)
output_path = Path.cwd() / parsed_path.name
if output_path.exists():
log.info("file %s already exists, ignoring", output_path)
continue
log.info("downloading %r into %s", url, output_path)
async with ctx.session.get(url) as resp:
assert resp.status == 200
total_length = int(resp.headers["content-length"])
downloaded_bytes = 0
download_ratio = 0
log.info("to download %d bytes", total_length)
with tempfile.TemporaryFile() as temp_fd:
async for chunk in resp.content.iter_chunked(8192):
temp_fd.write(chunk)
downloaded_bytes += len(chunk)
new_download_ratio = round(
(downloaded_bytes / total_length) * 100
)
if new_download_ratio != download_ratio:
log.info("download at %d%%", download_ratio)
download_ratio = new_download_ratio
temp_fd.seek(0)
# write to output
log.info("copying temp to output")
with output_path.open(mode="wb") as output_fd:
shutil.copyfileobj(temp_fd, output_fd)
# decompress
for url in urls:
parsed = urlparse(url)
parsed_path = Path(parsed.path)
input_path = Path.cwd() / parsed_path.name
original_name, original_extension, _gz = parsed_path.name.split(".")
output_path = Path.cwd() / f"{original_name}.{original_extension}"
if output_path.exists():
log.info("decompressed file %s already exists, ignoring", output_path)
continue
log.info("decompressing %s into %s", input_path.name, output_path.name)
with gzip.open(input_path, "rb") as in_fd:
with output_path.open(mode="wb") as out_fd:
shutil.copyfileobj(in_fd, out_fd)
# now that everythings downloaded, compile the db
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)
asyncio.run(main())