import asyncio
import gzip
import logging
import shutil
import tempfile
import sys
from dataclasses import dataclass
from urllib.parse import urlparse
from pathlib import Path

import aiohttp

log = logging.getLogger(__name__)


@dataclass
class Context:
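    """Shared state for one run; currently just the HTTP client session."""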
    session: aiohttp.ClientSession


async def main():
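    # the export date to fetch, taken from the first CLI argument
    # (e.g. "2022-08-27", matching the date embedded in e621's db_export filenames)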
    wanted_date = sys.argv[1]
    urls = (
        f"https://e621.net/db_export/tags-{wanted_date}.csv.gz",
        f"https://e621.net/db_export/posts-{wanted_date}.csv.gz",
    )

    async with aiohttp.ClientSession() as session:
        ctx = Context(session)
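
        # download each export into the current working directory,
        # skipping any file that is already on disk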
        for url in urls:
            parsed = urlparse(url)
            parsed_path = Path(parsed.path)
            output_path = Path.cwd() / parsed_path.name
            if output_path.exists():
                log.info("file %s already exists, ignoring", output_path)
                continue

            log.info("downloading %r into %s", url, output_path)
            async with ctx.session.get(url) as resp:
                assert resp.status == 200
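
                # the Content-Length header is assumed to be present;
                # it drives the percentage progress logging below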
                total_length = int(resp.headers["content-length"])
                downloaded_bytes = 0
                download_ratio = 0

                log.info("to download %d bytes", total_length)
                with tempfile.TemporaryFile() as temp_fd:
                    async for chunk in resp.content.iter_chunked(8192):
                        temp_fd.write(chunk)
                        downloaded_bytes += len(chunk)
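                        # only log when the integer percentage changes,
                        # instead of once per 8 KiB chunk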
                        new_download_ratio = round(
                            (downloaded_bytes / total_length) * 100
                        )
                        if new_download_ratio != download_ratio:
                            log.info("download at %d%%", new_download_ratio)
                            download_ratio = new_download_ratio

                    temp_fd.seek(0)

                    # write to output
                    log.info("copying temp to output")
                    with output_path.open(mode="wb") as output_fd:
                        shutil.copyfileobj(temp_fd, output_fd)

    # decompress
    for url in urls:
        parsed = urlparse(url)
        parsed_path = Path(parsed.path)
        input_path = Path.cwd() / parsed_path.name
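        # e.g. "tags-2022-08-27.csv.gz" splits into ("tags-2022-08-27", "csv", "gz")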
        original_name, original_extension, _gz = parsed_path.name.split(".")
        output_path = Path.cwd() / f"{original_name}.{original_extension}"
        if output_path.exists():
            log.info("decompressed file %s already exists, ignoring", output_path)
            continue

        log.info("decompressing %s into %s", input_path.name, output_path.name)
        with gzip.open(input_path, "rb") as in_fd:
            with output_path.open(mode="wb") as out_fd:
                shutil.copyfileobj(in_fd, out_fd)

    # now that everything's downloaded, compile the db


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    asyncio.run(main())