From 96c7194a1ab274da374fe733a211a0d5b4c259db Mon Sep 17 00:00:00 2001
From: Luna
Date: Thu, 11 Aug 2022 21:51:23 -0300
Subject: [PATCH] add codes

---
 .gitignore       |   1 +
 README.md        |  17 ++++-
 requirements.txt |   3 +
 timeliner.py     | 171 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 191 insertions(+), 1 deletion(-)
 create mode 100644 requirements.txt
 create mode 100755 timeliner.py

diff --git a/.gitignore b/.gitignore
index 55be276..2d3a88e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -152,3 +152,4 @@ cython_debug/
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+*.db
diff --git a/README.md b/README.md
index 84bfe47..3e825bd 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,18 @@
 # booru-tag-timeline
-graph amount of posts with a certain tag over time made in a booru (Danbooru and Gelbooru supported)
\ No newline at end of file
+Graph the number of posts with a given tag over time on a booru (Danbooru and Gelbooru supported).
+
+## use
+
+```sh
+pip install -Ur requirements.txt
+
+# THERE IS LITERALLY THE ONE THING I HAD TO ASK THAT MADE THIS
+# SOFTWARE INTO A THING.
+#
+# "HOW MUCH PORN OF BRIDGET WAS MADE BECAUSE OF THE TRANS ANNOUNCEMENT?"
+#
+# THIS IS AN IMPORTANT SCIENTIFIC QUESTION, AS A TRANS MYSELF, I NEED TO
+# KNOW HOW MUCH GIRLDICK WE GOT
+./timeliner.py gelbooru 'bridget_(guilty_gear)'
+```
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..7e8a37d
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+plotly>=5.10.0
+requests>=2.28.1
+pandas>=1.4.3
diff --git a/timeliner.py b/timeliner.py
new file mode 100755
index 0000000..8cfdbee
--- /dev/null
+++ b/timeliner.py
@@ -0,0 +1,171 @@
+#!/usr/bin/env python3
+
+import sys
+import time
+import sqlite3
+import requests
+import logging
+import datetime
+import plotly.express as px
+from collections import defaultdict
+from dataclasses import dataclass
+from typing import List, Optional
+
+log = logging.getLogger(__name__)
+
+
+@dataclass
+class Post:
+    hash: str
+    inserted_at: datetime.datetime
+
+
+@dataclass
+class GelbooruCursor:
+    query: str
+    page: int = 0
+    count: Optional[int] = None
+
+    def __iter__(self):
+        return self
+
+    def __next__(self) -> List[Post]:
+        resp = requests.get(
+            "https://gelbooru.com/index.php",
+            params={
+                "page": "dapi",
+                "s": "post",
+                "json": "1",
+                "q": "index",
+                "tags": self.query,
+                "limit": "100",
+                "pid": self.page,
+            },
+        )
+        log.debug("made request to %r", resp.url)
+        resp.raise_for_status()
+        rjson = resp.json()
+        attrs = rjson["@attributes"]
+
+        if "post" not in rjson:
+            log.info("page %d reached end of tag", self.page)
+            raise StopIteration()
+
+        self.count = self.count or attrs["count"]
+        log.info(
+            "page %d gave %d posts (total %d)",
+            self.page,
+            len(rjson["post"]),
+            self.count,
+        )
+
+        results = []
+        for entry in rjson["post"]:
+            parsed_time = time.strptime(entry["created_at"], "%a %b %d %H:%M:%S %z %Y")
+            results.append(
+                Post(
+                    entry["md5"],
+                    datetime.datetime.fromtimestamp(time.mktime(parsed_time)),
+                )
+            )
+        self.page += 1
+        return results
+
+
+@dataclass
+class Gelbooru:
+    typeid = 1
+    name = "Gelbooru"
+
+    def fetchall(self, query: str) -> GelbooruCursor:
+        return GelbooruCursor(query)
+
+
+def main():
+    logging.basicConfig(level=logging.DEBUG)
+    log.debug("%r", sys.argv)
+
+    try:
+        booru = sys.argv[1]
+        tags = sys.argv[2]
+    except IndexError:
+        log.error("expected booru and tags argument")
+        return 1
+
+    if booru == "gelbooru":
+        booru_client = Gelbooru()
+    elif booru == "danbooru":
+        # Danbooru client does not exist yet; fail loudly here instead of NameError
+        raise NotImplementedError()  # TODO
+    else:
+        log.error("booru must be one of {gelbooru, danbooru}")
+        return 1
+
+    db = sqlite3.connect("./timeliner-cache.db")
+    db.executescript(
+        """
+    CREATE TABLE IF NOT EXISTS file_store (
+        booru_type text not null,
+        query text not null,
+        file_hash text not null,
+        inserted_at int not null,
+        constraint file_store_pk primary key (booru_type, query, file_hash)
+    ) strict;
+    """
+    )
+
+    try:
+        cur = db.execute(
+            "select file_hash, inserted_at from file_store where booru_type = ? and query = ?",
+            (booru_client.typeid, tags),
+        )
+        post_entries = cur.fetchall()
+        if not post_entries:
+            cursor = booru_client.fetchall(tags)
+            posts = []  # final data
+            for incoming_posts in cursor:
+                posts.extend(incoming_posts)
+                for post in incoming_posts:
+                    db.execute(
+                        "insert into file_store values (?, ?, ?, ?)",
+                        (
+                            booru_client.typeid,
+                            tags,
+                            post.hash,
+                            post.inserted_at.timestamp(),
+                        ),
+                    )
+            log.info("fetched %d posts", len(posts))
+            db.commit()
+        else:
+            posts = [
+                Post(entry[0], datetime.datetime.fromtimestamp(entry[1]))
+                for entry in post_entries
+            ]
+            log.info("cached %d posts", len(posts))
+    finally:
+        db.close()
+
+    # now that we have data, bucket and plot?
+    # bucket data by day
+    buckets = defaultdict(int)
+    for post in posts:
+        date = (post.inserted_at.year, post.inserted_at.month, post.inserted_at.day)
+        buckets[date] += 1
+
+    post_frequencies = [
+        (f"{k[0]}/{k[1]}/{k[2]}", buckets[k]) for k in sorted(buckets)
+    ]
+    fig = px.line(
+        post_frequencies,
+        x=0,
+        y=1,
+        title=f"amount of posts per day for given query ({tags}) in booru ({booru_client.name})",
+    )
+    fig.show()
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())