Rewrite text parsing to ensure correctness
This commit is contained in:
parent
83a651e732
commit
7b766b793b
7 changed files with 47 additions and 100 deletions
|
@ -6,12 +6,6 @@ import types, utils, query
|
|||
from unicode import Rune, `$`
|
||||
|
||||
const
  # Patterns used to locate linkable entities in text. Literal dots in host
  # names are escaped so that "." cannot match an arbitrary character.
  urlRegex = re"((https?|ftp)://(-\.)?([^\s/?\.#]+\.?)+([/\?][^\s\)]*)?)"
  emailRegex = re"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)"
  # "A-Za-z" rather than "A-z": the latter also spans the ASCII punctuation
  # that sits between 'Z' and 'a' ([ \ ] ^ _ `).
  usernameRegex = re"(^|[^A-Za-z0-9_?\/])@([A-Za-z0-9_]+)"
  picRegex = re"pic\.twitter\.com/[^ ]+"
  ellipsisRegex = re" ?…"
  hashtagRegex = re"([^\S]|^)([#$]\w+)"
  ytRegex = re"(www\.|m\.)?youtu(be\.com|\.be)"
  twRegex = re"(www\.|mobile\.)?twitter\.com"
  # U+00A0 (no-break space) as a string.
  nbsp = $Rune(0x000A0)
|
||||
|
@ -26,75 +20,14 @@ proc shortLink*(text: string; length=28): string =
|
|||
if result.len > length:
|
||||
result = result[0 ..< length] & "…"
|
||||
|
||||
proc toLink*(url, text: string): string =
  ## Builds an HTML anchor element that displays `text` and points at `url`.
  result = a(text, href = url)
|
||||
|
||||
proc reUrlToShortLink*(m: RegexMatch; s: string): string =
  ## Regex replacement callback: turns the text captured by group 0 into an
  ## anchor whose href is the captured URL and whose label is the result of
  ## `shortLink` on that URL.
  let matched = s[m.group(0)[0]]
  let label = shortLink(matched)
  result = toLink(matched, label)
|
||||
|
||||
proc reUrlToLink*(m: RegexMatch; s: string): string =
  ## Regex replacement callback: turns the text captured by group 0 into an
  ## anchor whose href is the captured URL and whose label is that URL without
  ## its leading "http(s)://" and optional "www." prefix.
  let url = s[m.group(0)[0]]
  # Anchored with "^" so only the leading scheme is stripped (not a URL
  # embedded later in the query string), and the dot is escaped so "www."
  # cannot swallow the first letter of a host such as "wwwhatever.com".
  toLink(url, url.replace(re"^https?://(www\.)?", ""))
|
||||
|
||||
proc reEmailToLink*(m: RegexMatch; s: string): string =
  ## Regex replacement callback: turns the e-mail address captured by group 0
  ## into a `mailto:` anchor labelled with the address itself.
  let address = s[m.group(0)[0]]
  # The mailto scheme has no authority component: the correct form is
  # "mailto:user@host", not "mailto://user@host".
  toLink("mailto:" & address, address)
|
||||
|
||||
proc reHashtagToLink*(m: RegexMatch; s: string): string =
  ## Regex replacement callback for hashtag/cashtag matches.
  ##
  ## Group 0 holds the (possibly empty) text preceding the tag and is copied
  ## to the output unchanged. Group 1 holds the tag itself; it becomes a
  ## "/search?q=..." link only when it contains at least one ASCII letter,
  ## otherwise it is emitted as plain text.
  let
    leading = m.group(0)
    tag = s[m.group(1)[0]]

  if leading.len > 0:
    result = s[leading[0]]

  if tag.any(isAlphaAscii):
    result.add toLink("/search?q=" & encodeUrl(tag), tag)
  else:
    result.add tag
|
||||
|
||||
proc reUsernameToLink*(m: RegexMatch; s: string): string =
  ## Regex replacement callback for "@username" mentions.
  ##
  ## Group 0 is the (possibly empty) text in front of the "@" and is kept as
  ## is; group 1 is the username, which is rendered as a link to "/<username>"
  ## labelled "@<username>".
  let
    leading = m.group(0)
    name = s[m.group(1)[0]]

  if leading.len > 0:
    result = s[leading[0]]

  result.add toLink("/" & name, "@" & name)
|
||||
|
||||
proc reUsernameToFullLink*(m: RegexMatch; s: string): string =
  ## Same as `reUsernameToLink`, but rewrites the root-relative href into an
  ## absolute "https://<hostname>/" one.
  let relative = reUsernameToLink(m, s)
  relative.replace("href=\"/", &"href=\"https://{hostname}/")
|
||||
|
||||
proc replaceUrl*(url: string; prefs: Prefs): string =
|
||||
proc replaceUrl*(url: string; prefs: Prefs; rss=false): string =
|
||||
result = url
|
||||
if prefs.replaceYouTube.len > 0:
|
||||
result = result.replace(ytRegex, prefs.replaceYouTube)
|
||||
if prefs.replaceTwitter.len > 0:
|
||||
result = result.replace(twRegex, prefs.replaceTwitter)
|
||||
|
||||
proc linkifyText*(text: string; prefs: Prefs; rss=false): string =
  ## Escapes `text` for HTML and converts e-mail addresses, URLs, @mentions
  ## and hashtags into anchors, then applies `replaceUrl` with `prefs`.
  ##
  ## With `rss` set, URLs are labelled via `reUrlToLink` and mentions get
  ## absolute links; otherwise URLs are labelled via `reUrlToShortLink` and
  ## mentions stay root-relative. The order of the passes is significant and
  ## must not be changed.
  var html = xmltree.escape(stripText(text))
  html = html.replace(ellipsisRegex, " ")
  html = html.replace(emailRegex, reEmailToLink)

  if not rss:
    html = html.replace(urlRegex, reUrlToShortLink)
    html = html.replace(usernameRegex, reUsernameToLink)
  else:
    html = html.replace(urlRegex, reUrlToLink)
    html = html.replace(usernameRegex, reUsernameToFullLink)

  html = html.replace(hashtagRegex, reHashtagToLink)

  # Whitespace clean-up around the generated anchors:
  # insert a space before "<a" unless preceded by whitespace, "(" or "%" ...
  html = html.replace(re"([^\s\(\n%])<a", "$1 <a")
  # ... drop whitespace between "</a>" and trailing punctuation ...
  html = html.replace(re"</a>\s+([;.,!\)'%]|')", "</a>$1")
  # ... and glue a leading ". " directly onto the first anchor.
  html = html.replace(re"^\. <a", ".<a")

  html.replaceUrl(prefs)
|
||||
|
||||
proc stripTwitterUrls*(text: string): string =
  ## Returns `text` with every `picRegex` match removed, followed by every
  ## `ellipsisRegex` match removed.
  text.replace(picRegex, "").replace(ellipsisRegex, "")
|
||||
result = result.replace("href=\"/", "href=\"" & hostname & "/")
|
||||
|
||||
proc proxifyVideo*(manifest: string; proxy: bool): string =
|
||||
proc cb(m: RegexMatch; s: string): string =
|
||||
|
|
|
@ -1,9 +1,10 @@
|
|||
import xmltree, strtabs, strformat, strutils, times
|
||||
import xmltree, strtabs, strformat, strutils, times, uri
|
||||
import regex
|
||||
|
||||
import types, formatters
|
||||
|
||||
from q import nil
|
||||
from htmlgen import a
|
||||
|
||||
const
|
||||
thumbRegex = re".+:url\('([^']+)'\)"
|
||||
|
@ -41,29 +42,41 @@ proc isVerified*(profile: XmlNode): bool =
|
|||
proc isProtected*(profile: XmlNode): bool =
  ## True when the profile header contains an element matching
  ## ".Icon.Icon--protected".
  let icon = getHeader(profile).select(".Icon.Icon--protected")
  result = not icon.isNil
|
||||
|
||||
proc emojify*(node: XmlNode) =
  ## For every ".Emoji" element under `node`, appends a text child holding
  ## the element's "alt" attribute. Mutates the tree in place.
  for emoji in node.selectAll(".Emoji"):
    let altText = emoji.attr("alt")
    emoji.add newText(altText)
|
||||
proc parseText*(text: XmlNode; skipLink=""): string =
  ## Renders the direct children of `text` into an HTML string.
  ##
  ## * Text nodes are appended via xmltree's `add`.
  ## * Attribute-less `<strong>` elements are kept verbatim; any other
  ##   attribute-less element is dropped.
  ## * Elements carrying "data-expanded-url" become anchors to that URL,
  ##   labelled with `shortLink(url)`. A URL equal to `skipLink` is omitted;
  ##   otherwise a newline is emitted first when the class contains "u-hidden".
  ## * Elements whose class contains "ashtag" become "/search?q=..." links
  ##   (presumably matching both "hashtag" and "Hashtag" - confirm).
  ## * Elements whose class contains "atreply" become anchors reusing their
  ##   own href.
  ## * Elements whose class contains "Emoji" contribute their "alt" text.
  ##
  ## Returns an empty string when `text` is nil.
  # Callers pass the result of `select`, which is nil when nothing matches;
  # iterating a nil node would crash.
  if text == nil: return

  for el in text:
    case el.kind
    of xnText:
      result.add el
    of xnElement:
      if el.attrs == nil:
        if el.tag == "strong":
          result.add $el
        # Without attributes none of the class/url checks below can apply.
        continue

      let class = el.attr("class")
      if "data-expanded-url" in el.attrs:
        let url = el.attr("data-expanded-url")
        if url == skipLink: continue
        elif "u-hidden" in class: result.add "\n"
        result.add a(shortLink(url), href=url)
      elif "ashtag" in class:
        let hash = el.innerText()
        result.add a(hash, href=("/search?q=" & encodeUrl(hash)))
      elif "atreply" in class:
        result.add a(el.innerText(), href=el.attr("href"))
      elif "Emoji" in class:
        result.add el.attr("alt")
    else: discard
|
||||
|
||||
proc getQuoteText*(tweet: XmlNode): string =
  ## Returns the quoted tweet's text, rendered by `parseText` from the
  ## ".QuoteTweet-text" element. Returns an empty string when that element
  ## is absent.
  # The earlier emojify/innerText/stripTwitterUrls pipeline is dropped: its
  # result was overridden by the parseText call that follows it.
  let text = tweet.select(".QuoteTweet-text")
  if text != nil:
    result = parseText(text)
|
||||
|
||||
proc getTweetText*(tweet: XmlNode): string =
  ## Returns the tweet's text, rendered by `parseText` from the ".tweet-text"
  ## element. When the tweet contains a ".QuoteTweet", the hidden timeline
  ## link's expanded URL is passed as `skipLink` so it is left out of the
  ## output. Returns an empty string when ".tweet-text" is absent.
  # The earlier emojify/innerText/stripTwitterUrls pipeline is dropped: its
  # result was overridden by the parseText call that follows it.
  let text = tweet.select(".tweet-text")
  if text == nil: return

  let
    quote = tweet.select(".QuoteTweet")
    link = text.selectAttr("a.twitter-timeline-link.u-hidden", "data-expanded-url")

  parseText(text, if quote != nil: link else: "")
|
||||
|
||||
proc getTime(tweet: XmlNode): XmlNode =
  ## Returns the node matched by ".js-short-timestamp" inside `tweet`.
  result = select(tweet, ".js-short-timestamp")
|
||||
|
@ -87,10 +100,10 @@ proc getUsername*(profile: XmlNode; selector: string): string =
|
|||
profile.selectText(selector).strip(chars={'@', ' ', '\n'})
|
||||
|
||||
proc getBio*(profile: XmlNode; selector: string; fallback=""): string =
  ## Returns the profile bio rendered by `parseText`.
  ##
  ## The node is looked up with `selector`; when that finds nothing and
  ## `fallback` is non-empty, `fallback` is tried instead. Returns an empty
  ## string when neither selector matches.
  # Only the node-based lookup is kept: the older selectText/stripText body
  # declared `bio` a second time and cannot coexist with it.
  var bio = profile.select(selector)
  if bio == nil and fallback.len > 0:
    bio = profile.select(fallback)
  if bio != nil:
    result = parseText(bio)
|
||||
|
||||
proc getLocation*(profile: XmlNode): string =
|
||||
let sel = ".ProfileHeaderCard-locationText"
|
||||
|
|
|
@ -25,7 +25,7 @@ proc renderProfileCard*(profile: Profile; prefs: Prefs): VNode =
|
|||
tdiv(class="profile-card-extra"):
|
||||
if profile.bio.len > 0:
|
||||
tdiv(class="profile-bio"):
|
||||
p: verbatim linkifyText(profile.bio, prefs)
|
||||
p: verbatim replaceUrl(profile.bio, prefs)
|
||||
|
||||
if profile.location.len > 0:
|
||||
tdiv(class="profile-location"):
|
||||
|
@ -39,8 +39,9 @@ proc renderProfileCard*(profile: Profile; prefs: Prefs): VNode =
|
|||
if profile.website.len > 0:
|
||||
tdiv(class="profile-website"):
|
||||
span:
|
||||
let url = replaceUrl(profile.website, prefs)
|
||||
icon "link"
|
||||
verbatim linkifyText(profile.website, prefs)
|
||||
a(href=url): text shortLink(url)
|
||||
|
||||
tdiv(class="profile-joindate"):
|
||||
span(title=getJoinDateFull(profile)):
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
#if tweet.pinned: result = "Pinned: "
|
||||
#elif tweet.retweet.isSome: result = "RT: "
|
||||
#end if
|
||||
#result &= xmltree.escape(replaceUrl(tweet.text, prefs))
|
||||
#result &= xmltree.escape(replaceUrl(tweet.text, prefs, rss=true))
|
||||
#if result.len > 0: return
|
||||
#end if
|
||||
#if tweet.photos.len > 0:
|
||||
|
@ -20,7 +20,7 @@
|
|||
#end proc
|
||||
#
|
||||
#proc renderRssTweet(tweet: Tweet; prefs: Prefs): string =
|
||||
#let text = linkifyText(tweet.text, prefs, rss=true)
|
||||
#let text = replaceUrl(tweet.text, prefs, rss=true)
|
||||
#if tweet.quote.isSome and get(tweet.quote).available:
|
||||
#let quoteLink = hostname & getLink(get(tweet.quote))
|
||||
<p>${text}<br><a href="https://${quoteLink}">${quoteLink}</a></p>
|
||||
|
@ -58,7 +58,7 @@
|
|||
#end proc
|
||||
#
|
||||
#proc renderTimelineRss*(timeline: Timeline; profile: Profile): string =
|
||||
#let prefs = Prefs(replaceTwitter: hostname)
|
||||
#let prefs = Prefs(replaceTwitter: hostname, replaceYoutube: "invidio.us")
|
||||
#result = ""
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<rss xmlns:atom="http://www.w3.org/2005/Atom" xmlns:dc="http://purl.org/dc/elements/1.1/" version="2.0">
|
||||
|
@ -84,7 +84,7 @@
|
|||
#end proc
|
||||
#
|
||||
#proc renderListRss*(tweets: seq[Tweet]; name, list: string): string =
|
||||
#let prefs = Prefs(replaceTwitter: hostname)
|
||||
#let prefs = Prefs(replaceTwitter: hostname, replaceYoutube: "invidio.us")
|
||||
#let link = &"https://{hostname}/{name}/lists/{list}"
|
||||
#result = ""
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
|
@ -102,7 +102,7 @@
|
|||
#end proc
|
||||
#
|
||||
#proc renderSearchRss*(tweets: seq[Tweet]; name, param: string): string =
|
||||
#let prefs = Prefs(replaceTwitter: hostname)
|
||||
#let prefs = Prefs(replaceTwitter: hostname, replaceYoutube: "invidio.us")
|
||||
#let link = &"https://{hostname}/search"
|
||||
#result = ""
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
|
|
|
@ -56,7 +56,7 @@ proc renderUser(user: Profile; prefs: Prefs): VNode =
|
|||
linkUser(user, class="username")
|
||||
|
||||
tdiv(class="tweet-content media-body"):
|
||||
verbatim linkifyText(user.bio, prefs)
|
||||
verbatim replaceUrl(user.bio, prefs)
|
||||
|
||||
proc renderTimelineUsers*(results: Result[Profile]; prefs: Prefs; path=""): VNode =
|
||||
buildHtml(tdiv(class="timeline")):
|
||||
|
|
|
@ -215,7 +215,7 @@ proc renderQuote(quote: Quote; prefs: Prefs): VNode =
|
|||
renderReply(quote)
|
||||
|
||||
tdiv(class="quote-text"):
|
||||
verbatim linkifyText(quote.text, prefs)
|
||||
verbatim replaceUrl(quote.text, prefs)
|
||||
|
||||
if quote.hasThread:
|
||||
a(class="show-thread", href=getLink(quote)):
|
||||
|
@ -248,7 +248,7 @@ proc renderTweet*(tweet: Tweet; prefs: Prefs; path: string; class="";
|
|||
renderReply(tweet)
|
||||
|
||||
tdiv(class="tweet-content media-body"):
|
||||
verbatim linkifyText(tweet.text, prefs)
|
||||
verbatim replaceUrl(tweet.text, prefs)
|
||||
|
||||
if tweet.quote.isSome:
|
||||
renderQuote(tweet.quote.get(), prefs)
|
||||
|
|
|
@ -51,7 +51,7 @@ link = [
|
|||
'old.reddit.com/r/programming…'
|
||||
]],
|
||||
['nim_lang/status/1125887775151140864', [
|
||||
'en.wikipedia.org/wiki/Nim_(p…)'
|
||||
'en.wikipedia.org/wiki/Nim_(p…'
|
||||
]],
|
||||
['hiankun_taioan/status/1086916335215341570', [
|
||||
'(hackernoon.com/interview-wit…)'
|
||||
|
|
Loading…
Reference in a new issue