don't reimport posts when not needed

Luna 2022-08-29 21:56:57 -03:00
parent 21e05ba546
commit c2576d271c
1 changed file with 78 additions and 68 deletions

@@ -196,6 +196,9 @@ async def main_with_ctx(ctx, wanted_date):
     await ctx.db.commit()
 
     log.info("going to process posts")
+    post_count_rows = await ctx.db.execute_fetchall("select count(*) from posts")
+    post_count = post_count_rows[0][0]
+    log.info("already have %d posts", post_count)
 
     with output_uncompressed_paths["posts"].open(
         mode="r", encoding="utf-8"
@@ -207,80 +210,87 @@ async def main_with_ctx(ctx, wanted_date):
         line_count -= 1  # remove header
         log.info("%d posts to import", line_count)
 
-        posts_csv_fd.seek(0)
-        posts_reader = csv.DictReader(posts_csv_fd)
-
-        processed_count = 0
-        processed_ratio = 0.0
-        for row in posts_reader:
-            created_at_str = row["created_at"]
-            created_at = datetime.strptime(
-                created_at_str[: created_at_str.find(".")], "%Y-%m-%d %H:%M:%S"
-            )
-            post = Post(
-                id=int(row["id"]),
-                uploader_id=int(row["uploader_id"]),
-                created_at=int(created_at.timestamp()),
-                md5=row["md5"],
-                source=row["source"],
-                rating=row["rating"],
-                tag_string=row["tag_string"],
-                is_deleted=e621_bool(row["is_deleted"]),
-                is_pending=e621_bool(row["is_pending"]),
-                is_flagged=e621_bool(row["is_flagged"]),
-                score=int(row["score"]),
-                up_score=int(row["up_score"]),
-                down_score=int(row["down_score"]),
-                is_rating_locked=e621_bool(row["is_rating_locked"]),
-            )
-            await ctx.db.execute(
-                """
-                insert into posts (
-                    id,
-                    uploader_id,
-                    created_at,
-                    md5,
-                    source,
-                    rating,
-                    tag_string,
-                    is_deleted,
-                    is_pending,
-                    is_flagged,
-                    score,
-                    up_score,
-                    down_score,
-                    is_rating_locked
-                ) values (?,?,?,?,?,?,?,?,?,?,?,?,?,?)
-                """,
-                (
-                    post.id,
-                    post.uploader_id,
-                    post.created_at,
-                    post.md5,
-                    post.source,
-                    post.rating,
-                    post.tag_string,
-                    post.is_deleted,
-                    post.is_pending,
-                    post.is_flagged,
-                    post.score,
-                    post.up_score,
-                    post.down_score,
-                    post.is_rating_locked,
-                ),
-            )
-            processed_count += 1
-            new_processed_ratio = round((processed_count / line_count) * 100, 2)
-            if str(new_processed_ratio) != str(processed_ratio):
-                log.info("posts processed at %.2f%%", processed_ratio)
-                processed_ratio = new_processed_ratio
+        if line_count == post_count:
+            log.info("already imported everything, skipping")
+        else:
+            posts_csv_fd.seek(0)
+            posts_reader = csv.DictReader(posts_csv_fd)
+
+            processed_count = 0
+            processed_ratio = 0.0
+            for row in posts_reader:
+                created_at_str = row["created_at"]
+                created_at = datetime.strptime(
+                    created_at_str[: created_at_str.find(".")], "%Y-%m-%d %H:%M:%S"
+                )
+                post = Post(
+                    id=int(row["id"]),
+                    uploader_id=int(row["uploader_id"]),
+                    created_at=int(created_at.timestamp()),
+                    md5=row["md5"],
+                    source=row["source"],
+                    rating=row["rating"],
+                    tag_string=row["tag_string"],
+                    is_deleted=e621_bool(row["is_deleted"]),
+                    is_pending=e621_bool(row["is_pending"]),
+                    is_flagged=e621_bool(row["is_flagged"]),
+                    score=int(row["score"]),
+                    up_score=int(row["up_score"]),
+                    down_score=int(row["down_score"]),
+                    is_rating_locked=e621_bool(row["is_rating_locked"]),
+                )
+                await ctx.db.execute(
+                    """
+                    insert into posts (
+                        id,
+                        uploader_id,
+                        created_at,
+                        md5,
+                        source,
+                        rating,
+                        tag_string,
+                        is_deleted,
+                        is_pending,
+                        is_flagged,
+                        score,
+                        up_score,
+                        down_score,
+                        is_rating_locked
+                    ) values (?,?,?,?,?,?,?,?,?,?,?,?,?,?)
+                    """,
+                    (
+                        post.id,
+                        post.uploader_id,
+                        post.created_at,
+                        post.md5,
+                        post.source,
+                        post.rating,
+                        post.tag_string,
+                        post.is_deleted,
+                        post.is_pending,
+                        post.is_flagged,
+                        post.score,
+                        post.up_score,
+                        post.down_score,
+                        post.is_rating_locked,
+                    ),
+                )
+                processed_count += 1
+                new_processed_ratio = round((processed_count / line_count) * 100, 2)
+                if str(new_processed_ratio) != str(processed_ratio):
+                    log.info("posts processed at %.2f%%", processed_ratio)
+                    processed_ratio = new_processed_ratio
 
     log.info("posts done")
     await ctx.db.commit()
 
+    log.info("vacuuming db...")
+    await ctx.db.execute("vacuum")
+    log.info("database built")
+
 async def main():
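
The change makes the import idempotent: before re-reading the CSV dump, it counts the rows already in the posts table, compares that against the number of data lines in the file, and skips the whole import when they match, then vacuums once the bulk insert is done. Below is a minimal self-contained sketch of that pattern, assuming the script uses aiosqlite (whose Connection provides the execute_fetchall helper seen in the diff); the one-column schema and file names here are illustrative, not the repository's actual ones.

# Sketch of the count-based skip-reimport check, assuming aiosqlite.
import asyncio
import csv

import aiosqlite


async def import_posts(db_path: str, csv_path: str) -> None:
    async with aiosqlite.connect(db_path) as db:
        # illustrative schema; the real table has many more columns
        await db.execute("create table if not exists posts (id integer primary key)")
        await db.commit()

        # how many posts did a previous run already import?
        rows = await db.execute_fetchall("select count(*) from posts")
        post_count = rows[0][0]

        with open(csv_path, mode="r", encoding="utf-8") as fd:
            line_count = sum(1 for _ in fd) - 1  # minus the CSV header
            if line_count == post_count:
                print("already imported everything, skipping")
            else:
                fd.seek(0)
                for row in csv.DictReader(fd):
                    await db.execute(
                        "insert or ignore into posts (id) values (?)",
                        (int(row["id"]),),
                    )
                await db.commit()

        # reclaim space after the bulk insert; vacuum must run outside
        # an open transaction, hence after the commit
        await db.execute("vacuum")


asyncio.run(import_posts("e621.db", "posts.csv"))

The check trades accuracy for speed: equal counts do not guarantee equal contents, but for an append-only dump it is a cheap way to avoid re-reading millions of CSV rows on every run.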
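
The import loop also uses a change-only progress pattern: it rounds the completion ratio to two decimals and logs only when the rounded value moves, so a large import emits at most a few thousand progress lines instead of one per row. A small sketch of the same idea follows; note that the diff as written logs the previous ratio at each change, while this sketch logs the updated value.

import logging

logging.basicConfig(level=logging.INFO)
log = logging.getLogger("import")


def log_progress(done: int, total: int, last_ratio: float) -> float:
    """Log the completion percentage, but only when its rounded value changes."""
    ratio = round(done / total * 100, 2)
    if ratio != last_ratio:
        log.info("posts processed at %.2f%%", ratio)
    return ratio


# e.g. inside the import loop:
# processed_ratio = log_progress(processed_count, line_count, processed_ratio)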