From 2c333ee00b83300b2ffdf03b89112f8318215ec9 Mon Sep 17 00:00:00 2001
From: Luna
Date: Sun, 28 Aug 2022 00:14:42 -0300
Subject: [PATCH] decompress csv files

---
 build_database.py | 68 ++++++++++++++++++++++++++++-------------------
 1 file changed, 41 insertions(+), 27 deletions(-)

diff --git a/build_database.py b/build_database.py
index 4d577e8..0dc06b6 100644
--- a/build_database.py
+++ b/build_database.py
@@ -31,34 +31,48 @@ async def main():
         parsed = urlparse(url)
         parsed_path = Path(parsed.path)
         output_path = Path.cwd() / parsed_path.name
-        if not output_path.exists():
-            log.info("downloading %r into %s", url, output_path)
-            async with ctx.session.get(url) as resp:
-                assert resp.status == 200
-
-                total_length = int(resp.headers["content-length"])
-                downloaded_bytes = 0
-                download_ratio = 0
-
-                log.info("to download %d bytes", total_length)
-
-                with tempfile.TemporaryFile() as temp_fd:
-                    async for chunk in resp.content.iter_chunked(8192):
-                        temp_fd.write(chunk)
-                        downloaded_bytes += len(chunk)
-                        new_download_ratio = round(
-                            (downloaded_bytes / total_length) * 100
-                        )
-                        if new_download_ratio != download_ratio:
-                            log.info("download at %d%%", download_ratio)
-                            download_ratio = new_download_ratio
-
-                    # write to output
-                    log.info("copying temp to output")
-                    with output_path.open(mode="wb") as output_fd:
-                        shutil.copyfileobj(temp_fd, output_fd)
-        else:
+        if output_path.exists():
             log.info("file %s already exists, ignoring", output_path)
+            continue
+
+        log.info("downloading %r into %s", url, output_path)
+        async with ctx.session.get(url) as resp:
+            assert resp.status == 200
+
+            total_length = int(resp.headers["content-length"])
+            downloaded_bytes = 0
+            download_ratio = 0
+
+            log.info("to download %d bytes", total_length)
+
+            with tempfile.TemporaryFile() as temp_fd:
+                async for chunk in resp.content.iter_chunked(8192):
+                    temp_fd.write(chunk)
+                    downloaded_bytes += len(chunk)
+                    new_download_ratio = round(
+                        (downloaded_bytes / total_length) * 100
+                    )
+                    if new_download_ratio != download_ratio:
+                        log.info("download at %d%%", download_ratio)
+                        download_ratio = new_download_ratio
+
+                # write to output
+                log.info("copying temp to output")
+                with output_path.open(mode="wb") as output_fd:
+                    shutil.copyfileobj(temp_fd, output_fd)
+
+    # decompress
+    for url in urls:
+        parsed = urlparse(url)
+        parsed_path = Path(parsed.path)
+        input_path = Path.cwd() / parsed_path.name
+        original_name, original_extension, _gz = parsed_path.name.split(".")
+        output_path = Path.cwd() / f"{original_name}.{original_extension}"
+        if output_path.exists():
+            continue
+        with gzip.open(input_path, "rb") as in_fd:
+            with output_path.open(mode="wb") as out_fd:
+                shutil.copyfileobj(in_fd, out_fd)
 
     # now that everythings downloaded, compile the db
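
Note on the decompress loop: `parsed_path.name.split(".")` unpacks into exactly
three parts, so it assumes filenames of the form `name.csv.gz`; a name with an
extra dot would raise a ValueError. A minimal sketch of a dot-tolerant variant,
using only the patch's own gzip/shutil imports (the helper name `decompress_gz`
is hypothetical, not part of the commit):

    import gzip
    import shutil
    from pathlib import Path

    def decompress_gz(input_path: Path) -> Path:
        # "data.csv.gz" -> "data.csv": with_suffix("") drops only the final
        # suffix, so filenames containing extra dots survive intact.
        output_path = input_path.with_suffix("")
        if not output_path.exists():
            with gzip.open(input_path, "rb") as in_fd:
                with output_path.open(mode="wb") as out_fd:
                    shutil.copyfileobj(in_fd, out_fd)
        return output_path

Like the patch, this skips work when the decompressed file already exists and
streams through shutil.copyfileobj rather than loading the whole CSV in memory.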