decompress csv files

2022-08-28 00:14:42 -03:00 · 2022-08-28 00:14:42 -03:00 · 2c333ee00b
commit 2c333ee00b
parent ed0696721f
1 changed files with 41 additions and 27 deletions
--- a/build_database.py
+++ b/build_database.py
@@ -31,34 +31,48 @@ async def main():
            parsed = urlparse(url)
            parsed_path = Path(parsed.path)
            output_path = Path.cwd() / parsed_path.name
-            if not output_path.exists():
-                log.info("downloading %r into %s", url, output_path)
-                async with ctx.session.get(url) as resp:
-                    assert resp.status == 200
-
-                    total_length = int(resp.headers["content-length"])
-                    downloaded_bytes = 0
-                    download_ratio = 0
-
-                    log.info("to download %d bytes", total_length)
-
-                    with tempfile.TemporaryFile() as temp_fd:
-                        async for chunk in resp.content.iter_chunked(8192):
-                            temp_fd.write(chunk)
-                            downloaded_bytes += len(chunk)
-                            new_download_ratio = round(
-                                (downloaded_bytes / total_length) * 100
-                            )
-                            if new_download_ratio != download_ratio:
-                                log.info("download at %d%%", download_ratio)
-                                download_ratio = new_download_ratio
-
-                        # write to output
-                        log.info("copying temp to output")
-                        with output_path.open(mode="wb") as output_fd:
-                            shutil.copyfileobj(temp_fd, output_fd)
-            else:
+            if output_path.exists():
                log.info("file %s already exists, ignoring", output_path)
+                continue
+
+            log.info("downloading %r into %s", url, output_path)
+            async with ctx.session.get(url) as resp:
+                assert resp.status == 200
+
+                total_length = int(resp.headers["content-length"])
+                downloaded_bytes = 0
+                download_ratio = 0
+
+                log.info("to download %d bytes", total_length)
+
+                with tempfile.TemporaryFile() as temp_fd:
+                    async for chunk in resp.content.iter_chunked(8192):
+                        temp_fd.write(chunk)
+                        downloaded_bytes += len(chunk)
+                        new_download_ratio = round(
+                            (downloaded_bytes / total_length) * 100
+                        )
+                        if new_download_ratio != download_ratio:
+                            log.info("download at %d%%", download_ratio)
+                            download_ratio = new_download_ratio
+
+                    # write to output
+                    log.info("copying temp to output")
+                    with output_path.open(mode="wb") as output_fd:
+                        shutil.copyfileobj(temp_fd, output_fd)
+
+        # decompress
+        for url in urls:
+            parsed = urlparse(url)
+            parsed_path = Path(parsed.path)
+            input_path = Path.cwd() / parsed_path.name
+            original_name, original_extension, _gz = parsed_path.name.split(".")
+            output_path = Path.cwd() / f"{original_name}.{original_extension}"
+            if output_path.exists():
+                continue
+            with gzip.open(input_path, "rb") as in_fd:
+                with output_path.open(mode="wb") as out_fd:
+                    shutil.copyfileobj(in_fd, out_fd)

        # now that everythings downloaded, compile the db