diff --git a/build_database.py b/build_database.py index 0dc06b6..6f521dc 100644 --- a/build_database.py +++ b/build_database.py @@ -31,48 +31,21 @@ async def main(): parsed = urlparse(url) parsed_path = Path(parsed.path) output_path = Path.cwd() / parsed_path.name - if output_path.exists(): + if not output_path.exists(): + log.info("downloading %r into %s", url, output_path) + async with ctx.session.get(url) as resp: + assert resp.status == 200 + + with tempfile.TemporaryFile() as temp_fd: + async for chunk in resp.content.iter_chunked(8192): + temp_fd.write(chunk) + + # write to output + log.info("copying temp to output") + with output_path.open(mode="wb") as output_fd: + shutil.copyfileobj(temp_fd, output_fd) + else: log.info("file %s already exists, ignoring", output_path) - continue - - log.info("downloading %r into %s", url, output_path) - async with ctx.session.get(url) as resp: - assert resp.status == 200 - - total_length = int(resp.headers["content-length"]) - downloaded_bytes = 0 - download_ratio = 0 - - log.info("to download %d bytes", total_length) - - with tempfile.TemporaryFile() as temp_fd: - async for chunk in resp.content.iter_chunked(8192): - temp_fd.write(chunk) - downloaded_bytes += len(chunk) - new_download_ratio = round( - (downloaded_bytes / total_length) * 100 - ) - if new_download_ratio != download_ratio: - log.info("download at %d%%", download_ratio) - download_ratio = new_download_ratio - - # write to output - log.info("copying temp to output") - with output_path.open(mode="wb") as output_fd: - shutil.copyfileobj(temp_fd, output_fd) - - # decompress - for url in urls: - parsed = urlparse(url) - parsed_path = Path(parsed.path) - input_path = Path.cwd() / parsed_path.name - original_name, original_extension, _gz = parsed_path.name.split(".") - output_path = Path.cwd() / f"{original_name}.{original_extension}" - if output_path.exists(): - continue - with gzip.open(input_path, "rb") as in_fd: - with output_path.open(mode="wb") as out_fd: - shutil.copyfileobj(in_fd, out_fd) # now that everythings downloaded, compile the db