use lru_cache to make journal date conversion less inefficient, cleanups

This commit is contained in:
Nik V 2020-11-27 20:06:40 -08:00
parent 7f93ee8173
commit 72b3316291
2 changed files with 20 additions and 13 deletions

View file

@@ -1,7 +1,7 @@
import os import os
import getpass import getpass
AGORA_PATH = os.getenv("AGORA_PATH", os.path.join("/home", getpass.getuser(), "agora")) AGORA_PATH = os.getenv('AGORA_PATH', os.path.join('/home', getpass.getuser(), 'agora'))
AGORA_VERSION = '0.5.3' AGORA_VERSION = '0.5.3'
# With trailing slash. # With trailing slash.

View file

@@ -15,18 +15,13 @@ import re
from dateparser import DateDataParser from dateparser import DateDataParser
from functools import lru_cache from functools import lru_cache
parser = DateDataParser(languages=['en'])
@lru_cache(maxsize=None)
def canonical_wikilink(wikilink): def canonical_wikilink(wikilink):
if is_journal(wikilink): if is_journal(wikilink):
try: try:
parser = DateDataParser(languages=['en']) wikilink = canonical_date(wikilink)
date = parser.get_date_data(wikilink).date_obj
new_wikilink = date.isoformat().split("T")[0]
if "nov" in wikilink:
print(f'>> Journal! "{wikilink}" -> "{new_wikilink}"')
wikilink = new_wikilink
except: except:
# TODO: if we add logging, maybe log that we couldn't parse a date here # TODO: if we add logging, maybe log that we couldn't parse a date here
pass pass
@@ -42,16 +37,25 @@ def canonical_wikilink(wikilink):
return wikilink return wikilink
@lru_cache(maxsize=None) @lru_cache(maxsize=None)
def is_journal(wikilink): def canonical_date(wikilink):
date = parser.get_date_data(wikilink).date_obj
try:
new_wikilink = date.isoformat().split("T")[0]
except:
pass
return new_wikilink
@lru_cache(maxsize=1) #memoize this
def get_combined_date_regex():
date_regexes = [ date_regexes = [
# iso format # iso format
'[0-9]{4}-[0-9]{2}-[0-9]{2}', '[0-9]{4}-[0-9]{2}-[0-9]{2}',
# roam format (what a monstrosity!) # roam format (what a monstrosity!)
'(January|February|March|April|May|June|July|August|September|October|November|December) [0-9]{1,2}(st|nd|th), [0-9]{4}', '(January|February|March|April|May|June|July|August|September|October|November|December) [0-9]{1,2}(st|nd|th), [0-9]{4}',
# roam format (sanitzed for filenames) # roam format (after filename sanitization)
'(january|february|march|april|may|june|july|august|september|october|november|december)-[0-9]{1,2}(st|nd|th)-[0-9]{4}', '(january|february|march|april|may|june|july|august|september|october|november|december)-[0-9]{1,2}(st|nd|th)-[0-9]{4}',
] ]
@@ -59,6 +63,9 @@ def is_journal(wikilink):
# TODO: it'd really be better to compile this regex once rather than on # TODO: it'd really be better to compile this regex once rather than on
# each request, but as the knuth would say premature optimization is the # each request, but as the knuth would say premature optimization is the
# root of all evil, etc. etc. # root of all evil, etc. etc.
combined_date_regex = re.compile(f'^({"|".join(date_regexes)})$') return re.compile(f'^({"|".join(date_regexes)})$')
return combined_date_regex.match(wikilink)
@lru_cache(maxsize=None)
def is_journal(wikilink):
return get_combined_date_regex().match(wikilink)