decompress csv files
parent ed0696721f
commit 2c333ee00b
1 changed file with 41 additions and 27 deletions
@@ -31,34 +31,48 @@ async def main():
         parsed = urlparse(url)
         parsed_path = Path(parsed.path)
         output_path = Path.cwd() / parsed_path.name
-        if not output_path.exists():
-            log.info("downloading %r into %s", url, output_path)
-            async with ctx.session.get(url) as resp:
-                assert resp.status == 200
-
-                total_length = int(resp.headers["content-length"])
-                downloaded_bytes = 0
-                download_ratio = 0
-
-                log.info("to download %d bytes", total_length)
-
-                with tempfile.TemporaryFile() as temp_fd:
-                    async for chunk in resp.content.iter_chunked(8192):
-                        temp_fd.write(chunk)
-                        downloaded_bytes += len(chunk)
-                        new_download_ratio = round(
-                            (downloaded_bytes / total_length) * 100
-                        )
-                        if new_download_ratio != download_ratio:
-                            log.info("download at %d%%", download_ratio)
-                            download_ratio = new_download_ratio
-
-                    # write to output
-                    log.info("copying temp to output")
-                    with output_path.open(mode="wb") as output_fd:
-                        shutil.copyfileobj(temp_fd, output_fd)
-        else:
+        if output_path.exists():
             log.info("file %s already exists, ignoring", output_path)
+            continue
+
+        log.info("downloading %r into %s", url, output_path)
+        async with ctx.session.get(url) as resp:
+            assert resp.status == 200
+
+            total_length = int(resp.headers["content-length"])
+            downloaded_bytes = 0
+            download_ratio = 0
+
+            log.info("to download %d bytes", total_length)
+
+            with tempfile.TemporaryFile() as temp_fd:
+                async for chunk in resp.content.iter_chunked(8192):
+                    temp_fd.write(chunk)
+                    downloaded_bytes += len(chunk)
+                    new_download_ratio = round(
+                        (downloaded_bytes / total_length) * 100
+                    )
+                    if new_download_ratio != download_ratio:
+                        log.info("download at %d%%", new_download_ratio)
+                        download_ratio = new_download_ratio
+
+                # write to output
+                log.info("copying temp to output")
+                temp_fd.seek(0)  # rewind first: writing left the cursor at EOF
+                with output_path.open(mode="wb") as output_fd:
+                    shutil.copyfileobj(temp_fd, output_fd)
+
+    # decompress
+    for url in urls:
+        parsed = urlparse(url)
+        parsed_path = Path(parsed.path)
+        input_path = Path.cwd() / parsed_path.name
+        original_name, original_extension, _gz = parsed_path.name.split(".")
+        output_path = Path.cwd() / f"{original_name}.{original_extension}"
+        if output_path.exists():
+            continue
+        with gzip.open(input_path, "rb") as in_fd:
+            with output_path.open(mode="wb") as out_fd:
+                shutil.copyfileobj(in_fd, out_fd)
 
     # now that everything's downloaded, compile the db
 
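For orientation, the hunk leans on names defined outside it (`urls`, `ctx.session`, `log`) plus stdlib imports (`gzip`, `shutil`, `tempfile`, `Path`, `urlparse`) that the diff does not show. A minimal sketch of what that surrounding scaffolding might look like; the `Context` class and the URL list here are placeholders, not the repository's actual definitions:

import asyncio
import gzip
import logging
import shutil
import tempfile
from pathlib import Path
from urllib.parse import urlparse

import aiohttp

log = logging.getLogger(__name__)

# Placeholder input; the real URL list lives outside the hunk.
urls = ["https://example.com/data/things.csv.gz"]


class Context:
    """Hypothetical stand-in for the `ctx` object the hunk references."""

    def __init__(self, session: aiohttp.ClientSession):
        self.session = session


async def main():
    logging.basicConfig(level=logging.INFO)
    async with aiohttp.ClientSession() as session:
        ctx = Context(session)
        for url in urls:
            ...  # per-URL download loop from the hunk
        # decompress loop and db compilation follow, as in the hunk


if __name__ == "__main__":
    asyncio.run(main())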
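The decompression step also stands alone. Here is a self-contained sketch of the same pattern, assuming the downloaded files sit in the working directory with names like `foo.csv.gz`; it swaps the hunk's three-way `split(".")` for `Path.with_suffix("")`, which strips only the trailing `.gz`:

import gzip
import shutil
from pathlib import Path

for input_path in Path.cwd().glob("*.csv.gz"):
    # "foo.csv.gz" -> "foo.csv": with_suffix("") drops only the last suffix
    output_path = input_path.with_suffix("")
    if output_path.exists():
        continue  # same skip-if-present behaviour as the hunk
    with gzip.open(input_path, "rb") as in_fd:
        with output_path.open(mode="wb") as out_fd:
            # stream in chunks so large CSVs never have to fit in memory
            shutil.copyfileobj(in_fd, out_fd)

The `with_suffix` variant also survives basenames containing extra dots, where unpacking `name.split(".")` into exactly three parts would raise `ValueError`.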