import asyncio
import gzip
import logging
import shutil
import tempfile
import sys
from dataclasses import dataclass
from urllib.parse import urlparse
from pathlib import Path

import aiohttp

log = logging.getLogger(__name__)


@dataclass
class Context:
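    """Shared state for one run; currently just the HTTP client session."""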
    session: aiohttp.ClientSession


async def main():
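    # the export date to fetch, taken from the first CLI argument
    # (e.g. "2022-08-27", matching the date embedded in e621's db_export filenames)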
    wanted_date = sys.argv[1]
    urls = (
        f"https://e621.net/db_export/tags-{wanted_date}.csv.gz",
        f"https://e621.net/db_export/posts-{wanted_date}.csv.gz",
    )

    async with aiohttp.ClientSession() as session:
        ctx = Context(session)
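
        # download each export into the current working directory,
        # skipping any file that is already on disk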
        for url in urls:
            parsed = urlparse(url)
            parsed_path = Path(parsed.path)
            output_path = Path.cwd() / parsed_path.name
            if output_path.exists():
                log.info("file %s already exists, ignoring", output_path)
                continue

            log.info("downloading %r into %s", url, output_path)
            async with ctx.session.get(url) as resp:
                assert resp.status == 200
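
                # the Content-Length header is assumed to be present;
                # it drives the percentage progress logging below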
                total_length = int(resp.headers["content-length"])
                downloaded_bytes = 0
                download_ratio = 0

                log.info("to download %d bytes", total_length)
                with tempfile.TemporaryFile() as temp_fd:
                    async for chunk in resp.content.iter_chunked(8192):
                        temp_fd.write(chunk)
                        downloaded_bytes += len(chunk)
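                        # only log when the integer percentage changes,
                        # instead of once per 8 KiB chunk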
                        new_download_ratio = round(
                            (downloaded_bytes / total_length) * 100
                        )
                        if new_download_ratio != download_ratio:
                            log.info("download at %d%%", new_download_ratio)
                            download_ratio = new_download_ratio

                    temp_fd.seek(0)

                    # write to output
                    log.info("copying temp to output")
                    with output_path.open(mode="wb") as output_fd:
                        shutil.copyfileobj(temp_fd, output_fd)

    # decompress
    for url in urls:
        parsed = urlparse(url)
        parsed_path = Path(parsed.path)
        input_path = Path.cwd() / parsed_path.name
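        # e.g. "tags-2022-08-27.csv.gz" splits into ("tags-2022-08-27", "csv", "gz")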
        original_name, original_extension, _gz = parsed_path.name.split(".")
        output_path = Path.cwd() / f"{original_name}.{original_extension}"
        if output_path.exists():
            log.info("decompressed file %s already exists, ignoring", output_path)
            continue

        log.info("decompressing %s into %s", input_path.name, output_path.name)
        with gzip.open(input_path, "rb") as in_fd:
            with output_path.open(mode="wb") as out_fd:
                shutil.copyfileobj(in_fd, out_fd)

    # now that everything's downloaded, compile the db


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    asyncio.run(main())