Review note: this patch was re-flowed from a single collapsed line. Indentation
and blank context lines are reconstructed -- verify them against
build_database.py before applying. The post-image blob id on the "index" line
predates the corrections made to the added lines below. The patch assumes
`subprocess` is already imported by build_database.py -- TODO confirm.

diff --git a/build_database.py b/build_database.py
index 54af997..249f9a3 100644
--- a/build_database.py
+++ b/build_database.py
@@ -194,13 +194,18 @@ async def main_with_ctx(ctx, wanted_date):
 
     await ctx.db.commit()
 
+    log.info("going to process posts")
+
+    # `wc -l <path>` prints "<count> <path>", so only the first field is parsed.
+    # Count the uncompressed CSV (the file opened below) and drop its header row.
+    line_count_output = subprocess.check_output(
+        ["wc", "-l", output_uncompressed_paths["posts"]]
+    )
+    line_count = int(line_count_output.split()[0]) - 1  # remove header
+
     with output_uncompressed_paths["posts"].open(
         mode="r", encoding="utf-8"
     ) as posts_csv_fd:
-        line_count = 0
-        for line in posts_csv_fd:
-            line_count += 1
-        line_count -= 1  # remove header
 
         log.info("%d posts to import", line_count)
 
@@ -208,7 +213,7 @@ async def main_with_ctx(ctx, wanted_date):
         posts_reader = csv.DictReader(posts_csv_fd)
 
         processed_count = 0
-        processed_ratio = 0
+        processed_ratio = 0.0
 
         for row in posts_reader:
             created_at_str = row["created_at"]
@@ -270,9 +275,10 @@ async def main_with_ctx(ctx, wanted_date):
                 ),
             )
             processed_count += 1
-            new_processed_ratio = round((processed_count / line_count) * 100)
-            if new_processed_ratio != processed_ratio:
-                log.info("posts processed at %d%%", processed_ratio)
+            # Report progress whenever the two-decimal percentage changes.
+            new_processed_ratio = round((processed_count / line_count) * 100, 2)
+            if new_processed_ratio != processed_ratio:
+                log.info("posts processed at %.2f%%", new_processed_ratio)
                 processed_ratio = new_processed_ratio
 
         log.info("posts done")