add base build_db code
This commit is contained in:
parent
1ed3e60f16
commit
08f0dd27f7
4 changed files with 75 additions and 1 deletions
55
build_database.py
Normal file
55
build_database.py
Normal file
|
@@ -0,0 +1,55 @@
|
|||
import asyncio
|
||||
import logging
|
||||
import shutil
|
||||
import tempfile
|
||||
import sys
|
||||
from dataclasses import dataclass
|
||||
from urllib.parse import urlparse
|
||||
from pathlib import Path
|
||||
|
||||
import aiohttp
|
||||
|
||||
# Module-level logger, named after this module, used for all progress messages.
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class Context:
    """Bundle of shared state handed around while the script runs."""

    # HTTP client session used to issue the download requests.
    session: aiohttp.ClientSession
|
||||
|
||||
|
||||
async def main():
    """Download the e621 database exports for the date given on the command line.

    The wanted export date is read from ``sys.argv[1]`` and substituted into
    the tags and posts export URLs. Each export is downloaded into the current
    working directory under the file name taken from the URL path; a file that
    already exists there is left untouched and its download is skipped.

    Raises:
        IndexError: if no date argument was passed on the command line.
        RuntimeError: if the server answers with a status other than 200.
    """
    wanted_date = sys.argv[1]
    urls = (
        f"https://e621.net/db_export/tags-{wanted_date}.csv.gz",
        f"https://e621.net/db_export/posts-{wanted_date}.csv.gz",
    )

    async with aiohttp.ClientSession() as session:
        ctx = Context(session)

        for url in urls:
            parsed = urlparse(url)
            parsed_path = Path(parsed.path)
            output_path = Path.cwd() / parsed_path.name

            if output_path.exists():
                log.info("file %s already exists, ignoring", output_path)
                continue

            log.info("downloading %r into %s", url, output_path)
            async with ctx.session.get(url) as resp:
                # Explicit check instead of ``assert``: asserts are stripped
                # under ``python -O``, which would let an error page be saved
                # as if it were the export.
                if resp.status != 200:
                    raise RuntimeError(
                        f"unexpected HTTP status {resp.status} for {url!r}"
                    )

                # Stage the body in a temporary file so that a download which
                # fails midway never creates the output file (an existing
                # output file would make later runs skip the download).
                with tempfile.TemporaryFile() as temp_fd:
                    async for chunk in resp.content.iter_chunked(8192):
                        temp_fd.write(chunk)

                    # Rewind before copying: copyfileobj reads from the current
                    # position, which is end-of-file right after the writes, so
                    # without this the output file would always be empty.
                    temp_fd.seek(0)

                    # write to output
                    log.info("copying temp to output")
                    with output_path.open(mode="wb") as output_fd:
                        shutil.copyfileobj(temp_fd, output_fd)

    # now that everything's downloaded, compile the db
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: enable INFO-level logging so the progress messages
    # are shown, then drive the async main() to completion.
    logging.basicConfig(level=logging.INFO)
    asyncio.run(main())
|
Loading…
Add table
Add a link
Reference in a new issue