diff --git a/build_database.py b/build_database.py index 4136be5..8eb933f 100644 --- a/build_database.py +++ b/build_database.py @@ -196,6 +196,9 @@ async def main_with_ctx(ctx, wanted_date): await ctx.db.commit() log.info("going to process posts") + post_count_rows = await ctx.db.execute_fetchall("select count(*) from posts") + post_count = post_count_rows[0][0] + log.info("already have %d posts", post_count) with output_uncompressed_paths["posts"].open( mode="r", encoding="utf-8" @@ -207,80 +210,87 @@ async def main_with_ctx(ctx, wanted_date): line_count -= 1 # remove header log.info("%d posts to import", line_count) - posts_csv_fd.seek(0) - posts_reader = csv.DictReader(posts_csv_fd) + if line_count == post_count: + log.info("already imported everything, skipping") + else: + posts_csv_fd.seek(0) + posts_reader = csv.DictReader(posts_csv_fd) - processed_count = 0 - processed_ratio = 0.0 + processed_count = 0 + processed_ratio = 0.0 - for row in posts_reader: - created_at_str = row["created_at"] - created_at = datetime.strptime( - created_at_str[: created_at_str.find(".")], "%Y-%m-%d %H:%M:%S" - ) + for row in posts_reader: + created_at_str = row["created_at"] + created_at = datetime.strptime( + created_at_str[: created_at_str.find(".")], "%Y-%m-%d %H:%M:%S" + ) - post = Post( - id=int(row["id"]), - uploader_id=int(row["uploader_id"]), - created_at=int(created_at.timestamp()), - md5=row["md5"], - source=row["source"], - rating=row["rating"], - tag_string=row["tag_string"], - is_deleted=e621_bool(row["is_deleted"]), - is_pending=e621_bool(row["is_pending"]), - is_flagged=e621_bool(row["is_flagged"]), - score=int(row["score"]), - up_score=int(row["up_score"]), - down_score=int(row["down_score"]), - is_rating_locked=e621_bool(row["is_rating_locked"]), - ) + post = Post( + id=int(row["id"]), + uploader_id=int(row["uploader_id"]), + created_at=int(created_at.timestamp()), + md5=row["md5"], + source=row["source"], + rating=row["rating"], + tag_string=row["tag_string"], + is_deleted=e621_bool(row["is_deleted"]), + is_pending=e621_bool(row["is_pending"]), + is_flagged=e621_bool(row["is_flagged"]), + score=int(row["score"]), + up_score=int(row["up_score"]), + down_score=int(row["down_score"]), + is_rating_locked=e621_bool(row["is_rating_locked"]), + ) - await ctx.db.execute( - """ - insert into posts ( - id, - uploader_id, - created_at, - md5, - source, - rating, - tag_string, - is_deleted, - is_pending, - is_flagged, - score, - up_score, - down_score, - is_rating_locked - ) values (?,?,?,?,?,?,?,?,?,?,?,?,?,?) - """, - ( - post.id, - post.uploader_id, - post.created_at, - post.md5, - post.source, - post.rating, - post.tag_string, - post.is_deleted, - post.is_pending, - post.is_flagged, - post.score, - post.up_score, - post.down_score, - post.is_rating_locked, - ), - ) - processed_count += 1 - new_processed_ratio = round((processed_count / line_count) * 100, 2) - if str(new_processed_ratio) != str(processed_ratio): - log.info("posts processed at %.2f%%", processed_ratio) - processed_ratio = new_processed_ratio + await ctx.db.execute( + """ + insert into posts ( + id, + uploader_id, + created_at, + md5, + source, + rating, + tag_string, + is_deleted, + is_pending, + is_flagged, + score, + up_score, + down_score, + is_rating_locked + ) values (?,?,?,?,?,?,?,?,?,?,?,?,?,?) + """, + ( + post.id, + post.uploader_id, + post.created_at, + post.md5, + post.source, + post.rating, + post.tag_string, + post.is_deleted, + post.is_pending, + post.is_flagged, + post.score, + post.up_score, + post.down_score, + post.is_rating_locked, + ), + ) + processed_count += 1 + new_processed_ratio = round((processed_count / line_count) * 100, 2) + if str(new_processed_ratio) != str(processed_ratio): + log.info("posts processed at %.2f%%", processed_ratio) + processed_ratio = new_processed_ratio - log.info("posts done") + log.info("posts done") - await ctx.db.commit() + await ctx.db.commit() + + log.info("vacuuming db...") + await ctx.db.execute("vacuum") + log.info("database built") async def main():