agora-server/app/util.py

# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import re
from functools import lru_cache

from dateparser import DateDataParser


_logger = logging.getLogger(__name__)

# Single-character rewrites applied to every wikilink after lowercasing:
# spaces and slashes become dashes (a '/' would otherwise create subfolders in
# a file-based layout), apostrophes and commas are dropped.
_WIKILINK_TRANSLATION = str.maketrans({
    ' ': '-',
    '/': '-',
    '\'': None,
    ',': None,
})


@lru_cache(maxsize=None)
def canonical_wikilink(wikilink):
    """Return the canonical form of *wikilink*.

    If the wikilink looks like a journal date (see ``is_journal``), it is
    first rewritten to its ISO ``YYYY-MM-DD`` form. The result is then
    lowercased, spaces and ``/`` are replaced by ``-``, and apostrophes and
    commas are removed.

    Date normalization is best-effort: when the date cannot be parsed the
    wikilink is left as-is and only the character sanitization is applied.

    Args:
        wikilink: The wikilink / page title as a string.

    Returns:
        The sanitized wikilink string.
    """
    if is_journal(wikilink):
        try:
            date = DateDataParser(languages=['en']).get_date_data(wikilink).date_obj
        except Exception:
            # Best-effort: a parser failure must not break link resolution.
            # Catch Exception (not a bare except) so KeyboardInterrupt and
            # SystemExit still propagate.
            _logger.warning(
                'Could not parse journal wikilink "%s" as a date',
                wikilink,
                exc_info=True,
            )
            date = None

        # date_obj is None when the parser could not extract a date; keep the
        # original text in that case.
        if date is not None:
            iso_date = date.isoformat().split("T")[0]
            _logger.debug('Journal wikilink "%s" -> "%s"', wikilink, iso_date)
            wikilink = iso_date

    return wikilink.lower().translate(_WIKILINK_TRANSLATION)


# Title shapes that count as journal (daily note) pages. The capturing groups
# are kept so the match object returned by is_journal() exposes the same group
# layout as before.
_JOURNAL_DATE_PATTERNS = (
    # iso format, e.g. "2020-11-16"
    r'[0-9]{4}-[0-9]{2}-[0-9]{2}',
    # roam format, e.g. "November 3rd, 2020"
    r'(January|February|March|April|May|June|July|August|September|October|November|December) [0-9]{1,2}(st|nd|rd|th), [0-9]{4}',
    # roam format (sanitized for filenames), e.g. "november-3rd-2020"
    r'(january|february|march|april|may|june|july|august|september|october|november|december)-[0-9]{1,2}(st|nd|rd|th)-[0-9]{4}',
)

# All date patterns combined into one anchored regex, compiled once at import
# time instead of on every call.
_JOURNAL_DATE_REGEX = re.compile(f'^({"|".join(_JOURNAL_DATE_PATTERNS)})$')


@lru_cache(maxsize=None)
def is_journal(wikilink):
    """Tell whether *wikilink* is formatted like a journal date.

    Recognized formats are ISO dates (``2020-11-16``), Roam-style dates
    (``November 16th, 2020``) and the filename-sanitized Roam variant
    (``november-16th-2020``). All four ordinal suffixes (``st``, ``nd``,
    ``rd``, ``th``) are accepted.

    Args:
        wikilink: The wikilink / page title as a string.

    Returns:
        A ``re.Match`` object (truthy) when the whole wikilink matches one of
        the date formats, otherwise ``None``.
    """
    return _JOURNAL_DATE_REGEX.match(wikilink)
Add util.py, a bag of holding (hack hack). 2020-11-16 14:26:03 +00:00			`# Copyright 2020 Google LLC`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
make journals in roam date format be recognized by journals page 2020-11-28 03:18:51 +00:00			`import re`
make wikilinks for journals all render in iso format (very inefficient implementation for now) 2020-11-28 03:56:19 +00:00			`from dateparser import DateDataParser`
			`from functools import lru_cache`
Add util.py, a bag of holding (hack hack). 2020-11-16 14:26:03 +00:00
make wikilinks for journals all render in iso format (very inefficient implementation for now) 2020-11-28 03:56:19 +00:00
			`@lru_cache(maxsize=None)`
Add util.py, a bag of holding (hack hack). 2020-11-16 14:26:03 +00:00			`def canonical_wikilink(wikilink):`
make wikilinks for journals all render in iso format (very inefficient implementation for now) 2020-11-28 03:56:19 +00:00
			`if is_journal(wikilink):`
			`try:`
			`parser = DateDataParser(languages=['en'])`
			`date = parser.get_date_data(wikilink).date_obj`
			`new_wikilink = date.isoformat().split("T")[0]`
			`if "nov" in wikilink:`
			`print(f'>> Journal! "{wikilink}" -> "{new_wikilink}"')`
			`wikilink = new_wikilink`
			`except:`
			`# TODO: if we add logging, maybe log that we couldn't parse a date here`
			`pass`

Add util.py, a bag of holding (hack hack). 2020-11-16 14:26:03 +00:00			`# hack hack`
protect against / in page title (common in roam exports) roam's namespace feature recommends the use of / in page titles, but since we are using a file-based approach here important to filter them out lest we end up with subfolder madness 2020-11-28 02:32:56 +00:00			`wikilink = (`
			`wikilink.lower()`
			`.replace(' ', '-')`
			`.replace('\'', '')`
			`.replace(',', '')`
			`.replace('/', '-')`
			`)`
Add util.py, a bag of holding (hack hack). 2020-11-16 14:26:03 +00:00			`return wikilink`
make journals in roam date format be recognized by journals page 2020-11-28 03:18:51 +00:00

make wikilinks for journals all render in iso format (very inefficient implementation for now) 2020-11-28 03:56:19 +00:00
			`@lru_cache(maxsize=None)`
make journals in roam date format be recognized by journals page 2020-11-28 03:18:51 +00:00			`def is_journal(wikilink):`
make wikilinks for journals all render in iso format (very inefficient implementation for now) 2020-11-28 03:56:19 +00:00
make journals in roam date format be recognized by journals page 2020-11-28 03:18:51 +00:00			`date_regexes = [`
			`# iso format`
			`'[0-9]{4}-[0-9]{2}-[0-9]{2}',`
			`# roam format (what a monstrosity!)`
			`'(January\|February\|March\|April\|May\|June\|July\|August\|September\|October\|November\|December) [0-9]{1,2}(st\|nd\|th), [0-9]{4}',`
			`# roam format (sanitzed for filenames)`
			`'(january\|february\|march\|april\|may\|june\|july\|august\|september\|october\|november\|december)-[0-9]{1,2}(st\|nd\|th)-[0-9]{4}',`
			`]`

			`# combine all the date regexes into one super regex`
make wikilinks for journals all render in iso format (very inefficient implementation for now) 2020-11-28 03:56:19 +00:00			`# TODO: it'd really be better to compile this regex once rather than on`
			`# each request, but as the knuth would say premature optimization is the`
			`# root of all evil, etc. etc.`
make journals in roam date format be recognized by journals page 2020-11-28 03:18:51 +00:00			`combined_date_regex = re.compile(f'^({"\|".join(date_regexes)})$')`

			`return combined_date_regex.match(wikilink)`