# Copyright 2020 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import cachetools.func import glob import itertools import re import os from . import config from . import render from . import util from collections import defaultdict from fuzzywuzzy import fuzz from operator import attrgetter # For [[push]] parsing, perhaps move elsewhere? import lxml.html import lxml.etree # TODO: move action extractor regex here as well. RE_WIKILINKS = re.compile('\[\[(.*?)\]\]') FUZZ_FACTOR = 95 # URIs are ids. # - In the case of nodes, their [[wikilink]]. # - Example: 'foo', meaning the node that is rendered when you click on [[foo]] somewhere. # - In the case of subnodes, a relative path within the Agora. # - Example: 'garden/flancian/README.md', meaning an actual file called README.md. # - Note the example subnode above gets rendered in node [[README]], so fetching node with uri README would yield it (and others). # TODO: implement. class Graph: def __init__(self): # Revisit. pass def edge(self, n0, n1): pass def edges(self): pass def node(self, uri): # looks up a node by uri (essentially [[wikilink]]). # this used to be even worse :) try: nodes = [node for node in G.nodes() if node.wikilink == uri] node = nodes[0] return node except (KeyError, IndexError): # We'll handle 404 in the template, as we want to show backlinks to non-existent nodes. # Return an empty. return Node(uri) def existing_permutations(self, uri, max_length=4): # looks up nodes matching a permutation of the tokenized uri. # # example use: if [[server-agora]] does not exist, serve [[agora-server]] # # this is, of course, a terrible implementation and dangerous :) # only enable up to 4 tokens as 24 permutations is manageable. tokens = uri.split('-') permutations = itertools.permutations(tokens, max_length) permutations = ['-'.join(permutation) for permutation in permutations] nodes = [node for node in G.nodes() if node.wikilink in permutations and node.subnodes] return nodes @cachetools.func.ttl_cache(maxsize=2, ttl=20) def nodes(self, include_journals=True): # returns a list of all nodes # first we fetch all subnodes, put them in a dict {wikilink -> [subnode]}. # hack hack -- there's something in itertools better than this. wikilink_to_subnodes = defaultdict(list) for subnode in self.subnodes(): wikilink_to_subnodes[subnode.wikilink].append(subnode) # then we iterate over its values and construct nodes for each list of subnodes. nodes = [] for wikilink in wikilink_to_subnodes: node = Node(wikilink) node.subnodes = wikilink_to_subnodes[wikilink] nodes.append(node) # remove journals if so desired. if not include_journals: nodes = [node for node in nodes if not util.is_journal(node.wikilink)] # TODO: experiment with other ranking. # return sorted(nodes, key=lambda x: -x.size()) return sorted(nodes, key=lambda x: x.wikilink.lower()) # The following method is unused; it is far too slow given the current control flow. # Running something like this would be ideal eventually though. # It might also work better once all pulling/pushing logic moves to Graph, where it belongs, # and can make use of more sensible algorithms. @cachetools.func.ttl_cache(maxsize=2, ttl=20) def compute_transclusion(self, include_journals=True): # Add artisanal virtual subnodes (resulting from transclusion/[[push]]) to all nodes. for node in self.nodes(): pushed_subnodes = node.pushed_subnodes() node.subnodes.extend(pushed_subnodes) # does this belong here? @cachetools.func.ttl_cache(maxsize=1, ttl=20) def subnodes(self, sort=lambda x: x.uri.lower()): # Markdown. subnodes = [Subnode(f) for f in glob.glob(os.path.join(config.AGORA_PATH, '**/*.md'), recursive=True)] # Org mode. subnodes.extend([Subnode(f) for f in glob.glob(os.path.join(config.AGORA_PATH, '**/*.org'), recursive=True)]) # Image formats. subnodes.extend([Subnode(f, mediatype='image/jpg') for f in glob.glob(os.path.join(config.AGORA_PATH, '**/*.jpg'), recursive=True)]) subnodes.extend([Subnode(f, mediatype='image/png') for f in glob.glob(os.path.join(config.AGORA_PATH, '**/*.png'), recursive=True)]) subnodes.extend([Subnode(f, mediatype='image/gif') for f in glob.glob(os.path.join(config.AGORA_PATH, '**/*.gif'), recursive=True)]) if sort: return sorted(subnodes, key=sort) else: return subnodes G = Graph() class Node: """Nodes map 1:1 to wikilinks. They resolve to a series of subnodes when being rendered (see below). It maps to a particular file in the Agora repository, stored (relative to the Agora root) in the attribute 'uri'.""" def __init__(self, wikilink): # Use a node's URI as its identifier. # Subnodes are attached to the node matching their wikilink. # i.e. if two users contribute subnodes titled [[foo]], they both show up when querying node [[foo]]. self.wikilink = wikilink self.uri = wikilink # ensure wikilinks to journal entries are all shown in iso format # (important to do it after self.uri = wikilink to avoid breaking # links) if util.is_journal(wikilink): self.wikilink = util.canonical_wikilink(wikilink) self.url = '/node/' + self.uri self.subnodes = [] def __lt__(self, other): return self.wikilink.lower() < other.wikilink.lower() def __gt__(self, other): return self.wikilink.lower() > other.wikilink.lower() def __str__(self): return self.wikilink.lower() def __repr__(self): return "node: {}".format(self.wikilink.lower()) def size(self): return len(self.subnodes) def go(self): # There's surely a much better way to do this. Alas :) links = [] for subnode in self.subnodes: links.extend(subnode.go()) return links def filter(self, other): # There's surely a much better way to do this. Alas :) links = [] for subnode in self.subnodes: links.extend(subnode.filter(other)) return links # The following section is particularly confusing. # Some functions return wikilinks, some return full blown nodes. # We probably want to converge on the latter. # TODO: fix. def forward_links(self): links = [] for subnode in self.subnodes: links.extend(subnode.forward_links) return sorted(set(links)) # Pattern: (subject).action_object. # Could be modeled with RDF? def pull_nodes(self): # the nodes *being pulled* by this node. nodes = [] for subnode in self.subnodes: nodes.extend(subnode.pull_nodes()) return sorted(set(nodes), key=lambda x: x.uri) def pulling_nodes(self): # the nodes pulling *this* node. # compare with: pull_nodes. nodes = [] for backlink in self.back_links(): n = G.node(backlink) if self.wikilink in [n.wikilink for n in n.pull_nodes()]: nodes.append(n) return nodes def push_nodes(self): # nodes pushed to from this node. links = [] for subnode in self.subnodes: links.extend(subnode.push_nodes()) return sorted(set(links)) def pushing_nodes(self): # the nodes pushing to *this* node. # compare with: push_nodes. nodes = [] for backlink in self.back_links(): n = G.node(backlink) if self.wikilink == n.wikilink: # ignore nodes pushing to themselves. continue if self.wikilink != n.wikilink and self.wikilink in [n.wikilink for n in n.push_nodes()]: nodes.append(n) return nodes def pushing(self, other): # returns the blocks that this node pushes to one other as "virtual subnodes" # [[push]] as in anagora.org/node/push. # # arg other should be a Node. # TODO: actually add type annotations, this is 2021. # # TLDR: # - [[push]] [[other]] # pushes all children (indented subitems) to [[other]]. # # TODO: implement also: # - [[push]] [[other]] foo # pushes foo to [[other]] # # Congratulations! You've gotten to the hackiest place in the [[agora]]. # ...as of the time of writing :) subnodes = [] if other.wikilink in [n.wikilink for n in self.push_nodes()]: for subnode in self.subnodes: if subnode.mediatype != 'text/plain': continue try: # I tried parsing the marko tree but honestly this seemed easier/simpler. html = render.markdown(subnode.content) except AssertionError: breakpoint() try: tree = lxml.html.fromstring(html) except lxml.etree.ParserError: continue for link in tree.iterlinks(): # link is of the form (element, attribute, link, pos) -- see https://lxml.de/3.1/lxmlhtml.html. if link[2] == 'push': # ugly, but hey, it works... for now. # this is *flaky* as it depends on an exact number of html elements to separate # [[push]] and its [[target node]]. # could be easily improved by just looking for the next . try: argument = link[0].getnext().getnext().getnext().text_content() if re.search(other.wikilink, argument, re.IGNORECASE) or re.search(other.wikilink.replace('-', ' '), argument, re.IGNORECASE): # go one level up to find the
  • parent = link[0].getparent() # the block to be pushed is this level and its children. # TODO: replace [[push]] [[other]] with something like [[pushed from]] [[node]], which makes more sense in the target. block = lxml.etree.tostring(parent) subnodes.append(VirtualSubnode(subnode, other, block)) except AttributeError: # Better luck next time -- or when I fix this code :) pass return subnodes def back_links(self): return sorted([x.wikilink for x in nodes_by_outlink(self.wikilink) if x.wikilink != self.wikilink]) def pushed_subnodes(self): subnodes = [] for node in self.pushing_nodes(): for subnode in node.pushing(self): subnodes.append(subnode) return subnodes class Subnode: """A subnode is a note or media resource volunteered by a user of the Agora. It maps to a particular file in the Agora repository, stored (relative to the Agora root) in the attribute 'uri'.""" def __init__(self, path, mediatype='text/plain'): # Use a subnode's URI as its identifier. self.uri = path_to_uri(path) self.url = '/subnode/' + path_to_uri(path) # Subnodes are attached to the node matching their wikilink. # i.e. if two users contribute subnodes titled [[foo]], they both show up when querying node [[foo]]. self.wikilink = util.canonical_wikilink(path_to_wikilink(path)) self.user = path_to_user(path) self.mediatype = mediatype if self.mediatype == 'text/plain': with open(path) as f: self.content = f.read() # Marko raises IndexError on render if the file doesn't terminate with a newline. if not self.content.endswith('\n'): self.content = self.content + '\n' self.forward_links = content_to_forward_links(self.content) elif self.mediatype.startswith('image'): with open(path, 'rb') as f: self.content = f.read() self.forward_links = [] else: raise ValueError self.mtime = os.path.getmtime(path) self.node = self.wikilink # Initiate node for wikilink if this is the first subnode, append otherwise. # G.addsubnode(self) def __eq__(self, other): # hack hack if fuzz.ratio(self.wikilink, other.wikilink) > FUZZ_FACTOR: return True else: return False def __sub__(self, other): # hack hack return 100-fuzz.ratio(self.wikilink, other.wikilink) def distance(self, other): # hack hack return 100-fuzz.ratio(self.wikilink, other.wikilink) def render(self): if self.mediatype != 'text/plain': # hack hack #return 'This is a subnode of type {}. You can view it.'.format(self.mediatype, self.uri) return '

    '.format(self.uri) if self.uri.endswith('md') or self.uri.endswith('MD'): content = render.markdown(self.content) if self.uri.endswith('org') or self.uri.endswith('ORG'): content = render.orgmode(self.content) return render.postprocess(content) def raw(self): return content def go(self): """ returns a set of go links contained in this subnode go links are blocks of the form: - [[go]] protocol://example.org/url protocol defaults to https. """ golinks = subnode_to_actions(self, 'go') sanitized_golinks = [] for golink in golinks: # should probably instead check for contains: // if golink.startswith('http'): sanitized_golinks.append(golink) else: # hack hack. sanitized_golinks.append('https://' + golink) return sanitized_golinks def filter(self, other): """ other is a string. returns a set of links contained in this subnode in blocks of the form: - [[other]] protocol://example.org/url protocol defaults to https. might pick up magic like resolving social network issues later :) """ links = subnode_to_actions(self, other, blocks_only=True) sanitized_links = [] for link in links: if '://' in link: sanitized_links.append(link) else: # hack hack. sanitized_links.append('https://' + link) return sanitized_links def pull_nodes(self): """ returns a set of nodes pulled (anagora.org/node/pull) in this subnode pulls are blocks of the form: - [[pull]] [[node]] """ # TODO: test. pull_blocks = subnode_to_actions(self, 'pull') pull_nodes = content_to_forward_links("\n".join(pull_blocks)) return [G.node(node) for node in pull_nodes] def push_nodes(self): """ returns a set of push links contained in this subnode push links are blocks of the form: - [[push]] [[node]] TODO: refactor with the above. """ # TODO: test. push_blocks = subnode_to_actions(self, 'push') push_nodes = content_to_forward_links("\n".join(push_blocks)) return [G.node(node) for node in push_nodes] class VirtualSubnode(Subnode): # For instantiating a virtual subnode -- a subnode derived from another subnode. # Used by [[push]] (transclusion). def __init__(self, source_subnode, target_node, block): """ source_subnode: where this virtual subnode came from. target_node: where this virtual subnode will attach (go to). block: the actual payload, as pre rendered html.""" self.uri = source_subnode.uri self.url = '/subnode/virtual' # Virtual subnodes are attached to their target self.wikilink = target_node.wikilink self.user = source_subnode.user # Only text transclusion supported. self.mediatype = 'text/plain' self.content = block.decode('UTF-8') self.forward_links = content_to_forward_links(self.content) self.mtime = source_subnode.mtime self.node = self.wikilink def subnode_to_actions(subnode, action, blocks_only=False): # hack hack. if subnode.mediatype != 'text/plain': return [] if blocks_only: action_regex ='- \[\[' + action + '\]\] (.*?)$' else: action_regex ='\[\[' + action + '\]\] (.*?)$' content = subnode.content actions = [] for line in content.splitlines(): m = re.search(action_regex, line) if m: actions.append(m.group(1)) return actions class User: def __init__(self, user): self.uri = user self.url = '/user/' + self.uri self.subnodes = subnodes_by_user(user) def size(self): return len(self.subnodes) def path_to_uri(path): return path.replace(config.AGORA_PATH + '/', '') def path_to_user(path): m = re.search('garden/(.+?)/', path) if m: return m.group(1) else: return 'agora' def path_to_wikilink(path): return os.path.splitext(os.path.basename(path))[0] def content_to_forward_links(content): # hack hack. match = RE_WIKILINKS.findall(content) if match: # Work around broken forward links due to org mode convention I didn't think of. # TODO: make link parsing format-aware. return [util.canonical_wikilink(m) for m in match if '][' not in m] else: return [] def latest(): return sorted(G.subnodes(), key=lambda x: -x.mtime) def all_users(): # hack hack. users = os.listdir(os.path.join(config.AGORA_PATH, 'garden')) return sorted([User(u) for u in users], key=lambda x: x.uri.lower()) def all_journals(): # hack hack. nodes = G.nodes() nodes = [node for node in nodes if util.is_journal(node.wikilink)] return sorted(nodes, key=attrgetter('wikilink'), reverse=True) # Deprecated. def nodes_by_wikilink(wikilink): nodes = [node for node in G.nodes() if node.wikilink == wikilink] return nodes # Deprecated. def wikilink_to_node(node): try: return nodes_by_wikilink(node)[0] except (KeyError, IndexError): # We'll handle 404 in the template, as we want to show backlinks to non-existent nodes. # Return an empty. return Node(node) def subnodes_by_wikilink(wikilink, fuzzy_matching=True): if fuzzy_matching: # TODO subnodes = [subnode for subnode in G.subnodes() if fuzz.ratio(subnode.wikilink, wikilink) > FUZZ_FACTOR] else: subnodes = [subnode for subnode in G.subnodes() if subnode.wikilink == wikilink] return subnodes def search_subnodes(query): subnodes = [subnode for subnode in G.subnodes() if subnode.mediatype == 'text/plain' and re.search(query, subnode.content, re.IGNORECASE)] return subnodes def search_subnodes_by_user(query, user): subnodes = [subnode for subnode in G.subnodes() if subnode.mediatype == 'text/plain' and subnode.user == user and re.search(query, subnode.content, re.IGNORECASE)] return subnodes def subnodes_by_user(user): subnodes = [subnode for subnode in G.subnodes() if subnode.user == user] return subnodes def user_readmes(user): # hack hack # fix duplication. subnodes = [subnode for subnode in G.subnodes() if subnode.mediatype == 'text/plain' and subnode.user == user and re.search('readme', subnode.wikilink, re.IGNORECASE)] return subnodes def subnode_by_uri(uri): subnode = [subnode for subnode in G.subnodes() if subnode.uri == uri] if subnode: return subnode[0] else: # TODO: handle. return False def nodes_by_outlink(wikilink): nodes = [node for node in G.nodes() if wikilink in node.forward_links()] return sorted(nodes, key=attrgetter('wikilink')) def subnodes_by_outlink(wikilink): # This doesn't work. It matches too much/too little for some reason. Debug someday? # subnodes = [subnode for subnode in all_subnodes() if [wikilink for wikilink in subnode.forward_links if fuzz.ratio(subnode.wikilink, wikilink) > FUZZ_FACTOR]] subnodes = [subnode for subnode in G.subnodes() if util.canonical_wikilink(wikilink) in subnode.forward_links] return subnodes