Rewrite text parsing to ensure correctness
This commit is contained in:
parent
83a651e732
commit
7b766b793b
7 changed files with 47 additions and 100 deletions
|
@ -6,12 +6,6 @@ import types, utils, query
|
||||||
from unicode import Rune, `$`
|
from unicode import Rune, `$`
|
||||||
|
|
||||||
const
|
const
|
||||||
urlRegex = re"((https?|ftp)://(-\.)?([^\s/?\.#]+\.?)+([/\?][^\s\)]*)?)"
|
|
||||||
emailRegex = re"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)"
|
|
||||||
usernameRegex = re"(^|[^A-z0-9_?\/])@([A-z0-9_]+)"
|
|
||||||
picRegex = re"pic.twitter.com/[^ ]+"
|
|
||||||
ellipsisRegex = re" ?…"
|
|
||||||
hashtagRegex = re"([^\S]|^)([#$]\w+)"
|
|
||||||
ytRegex = re"(www.|m.)?youtu(be.com|.be)"
|
ytRegex = re"(www.|m.)?youtu(be.com|.be)"
|
||||||
twRegex = re"(www.|mobile.)?twitter.com"
|
twRegex = re"(www.|mobile.)?twitter.com"
|
||||||
nbsp = $Rune(0x000A0)
|
nbsp = $Rune(0x000A0)
|
||||||
|
@ -26,75 +20,14 @@ proc shortLink*(text: string; length=28): string =
|
||||||
if result.len > length:
|
if result.len > length:
|
||||||
result = result[0 ..< length] & "…"
|
result = result[0 ..< length] & "…"
|
||||||
|
|
||||||
proc toLink*(url, text: string): string =
|
proc replaceUrl*(url: string; prefs: Prefs; rss=false): string =
|
||||||
a(text, href=url)
|
|
||||||
|
|
||||||
proc reUrlToShortLink*(m: RegexMatch; s: string): string =
|
|
||||||
let url = s[m.group(0)[0]]
|
|
||||||
toLink(url, shortLink(url))
|
|
||||||
|
|
||||||
proc reUrlToLink*(m: RegexMatch; s: string): string =
|
|
||||||
let url = s[m.group(0)[0]]
|
|
||||||
toLink(url, url.replace(re"https?://(www.)?", ""))
|
|
||||||
|
|
||||||
proc reEmailToLink*(m: RegexMatch; s: string): string =
|
|
||||||
let url = s[m.group(0)[0]]
|
|
||||||
toLink("mailto://" & url, url)
|
|
||||||
|
|
||||||
proc reHashtagToLink*(m: RegexMatch; s: string): string =
|
|
||||||
result = if m.group(0).len > 0: s[m.group(0)[0]] else: ""
|
|
||||||
let hash = s[m.group(1)[0]]
|
|
||||||
let link = toLink("/search?q=" & encodeUrl(hash), hash)
|
|
||||||
if hash.any(isAlphaAscii):
|
|
||||||
result &= link
|
|
||||||
else:
|
|
||||||
result &= hash
|
|
||||||
|
|
||||||
proc reUsernameToLink*(m: RegexMatch; s: string): string =
|
|
||||||
var username = ""
|
|
||||||
var pretext = ""
|
|
||||||
|
|
||||||
let pre = m.group(0)
|
|
||||||
let match = m.group(1)
|
|
||||||
|
|
||||||
username = s[match[0]]
|
|
||||||
|
|
||||||
if pre.len > 0:
|
|
||||||
pretext = s[pre[0]]
|
|
||||||
|
|
||||||
pretext & toLink("/" & username, "@" & username)
|
|
||||||
|
|
||||||
proc reUsernameToFullLink*(m: RegexMatch; s: string): string =
|
|
||||||
result = reUsernameToLink(m, s)
|
|
||||||
result = result.replace("href=\"/", &"href=\"https://{hostname}/")
|
|
||||||
|
|
||||||
proc replaceUrl*(url: string; prefs: Prefs): string =
|
|
||||||
result = url
|
result = url
|
||||||
if prefs.replaceYouTube.len > 0:
|
if prefs.replaceYouTube.len > 0:
|
||||||
result = result.replace(ytRegex, prefs.replaceYouTube)
|
result = result.replace(ytRegex, prefs.replaceYouTube)
|
||||||
if prefs.replaceTwitter.len > 0:
|
if prefs.replaceTwitter.len > 0:
|
||||||
result = result.replace(twRegex, prefs.replaceTwitter)
|
result = result.replace(twRegex, prefs.replaceTwitter)
|
||||||
|
|
||||||
proc linkifyText*(text: string; prefs: Prefs; rss=false): string =
|
|
||||||
result = xmltree.escape(stripText(text))
|
|
||||||
result = result.replace(ellipsisRegex, " ")
|
|
||||||
result = result.replace(emailRegex, reEmailToLink)
|
|
||||||
if rss:
|
if rss:
|
||||||
result = result.replace(urlRegex, reUrlToLink)
|
result = result.replace("href=\"/", "href=\"" & hostname & "/")
|
||||||
result = result.replace(usernameRegex, reUsernameToFullLink)
|
|
||||||
else:
|
|
||||||
result = result.replace(urlRegex, reUrlToShortLink)
|
|
||||||
result = result.replace(usernameRegex, reUsernameToLink)
|
|
||||||
result = result.replace(hashtagRegex, reHashtagToLink)
|
|
||||||
result = result.replace(re"([^\s\(\n%])<a", "$1 <a")
|
|
||||||
result = result.replace(re"</a>\s+([;.,!\)'%]|')", "</a>$1")
|
|
||||||
result = result.replace(re"^\. <a", ".<a")
|
|
||||||
result = result.replaceUrl(prefs)
|
|
||||||
|
|
||||||
proc stripTwitterUrls*(text: string): string =
|
|
||||||
result = text
|
|
||||||
result = result.replace(picRegex, "")
|
|
||||||
result = result.replace(ellipsisRegex, "")
|
|
||||||
|
|
||||||
proc proxifyVideo*(manifest: string; proxy: bool): string =
|
proc proxifyVideo*(manifest: string; proxy: bool): string =
|
||||||
proc cb(m: RegexMatch; s: string): string =
|
proc cb(m: RegexMatch; s: string): string =
|
||||||
|
|
|
@ -1,9 +1,10 @@
|
||||||
import xmltree, strtabs, strformat, strutils, times
|
import xmltree, strtabs, strformat, strutils, times, uri
|
||||||
import regex
|
import regex
|
||||||
|
|
||||||
import types, formatters
|
import types, formatters
|
||||||
|
|
||||||
from q import nil
|
from q import nil
|
||||||
|
from htmlgen import a
|
||||||
|
|
||||||
const
|
const
|
||||||
thumbRegex = re".+:url\('([^']+)'\)"
|
thumbRegex = re".+:url\('([^']+)'\)"
|
||||||
|
@ -41,29 +42,41 @@ proc isVerified*(profile: XmlNode): bool =
|
||||||
proc isProtected*(profile: XmlNode): bool =
|
proc isProtected*(profile: XmlNode): bool =
|
||||||
getHeader(profile).select(".Icon.Icon--protected") != nil
|
getHeader(profile).select(".Icon.Icon--protected") != nil
|
||||||
|
|
||||||
proc emojify*(node: XmlNode) =
|
proc parseText*(text: XmlNode; skipLink=""): string =
|
||||||
for i in node.selectAll(".Emoji"):
|
for el in text:
|
||||||
i.add newText(i.attr("alt"))
|
case el.kind
|
||||||
|
of xnText:
|
||||||
|
result.add el
|
||||||
|
of xnElement:
|
||||||
|
if el.attrs == nil:
|
||||||
|
if el.tag == "strong":
|
||||||
|
result.add $el
|
||||||
|
continue
|
||||||
|
|
||||||
|
let class = el.attr("class")
|
||||||
|
if "data-expanded-url" in el.attrs:
|
||||||
|
let url = el.attr("data-expanded-url")
|
||||||
|
if url == skipLink: continue
|
||||||
|
elif "u-hidden" in class: result.add "\n"
|
||||||
|
result.add a(shortLink(url), href=url)
|
||||||
|
elif "ashtag" in class:
|
||||||
|
let hash = el.innerText()
|
||||||
|
result.add a(hash, href=("/search?q=" & encodeUrl(hash)))
|
||||||
|
elif "atreply" in class:
|
||||||
|
result.add a(el.innerText(), href=el.attr("href"))
|
||||||
|
elif "Emoji" in class:
|
||||||
|
result.add el.attr("alt")
|
||||||
|
else: discard
|
||||||
|
|
||||||
proc getQuoteText*(tweet: XmlNode): string =
|
proc getQuoteText*(tweet: XmlNode): string =
|
||||||
let text = tweet.select(".QuoteTweet-text")
|
parseText(tweet.select(".QuoteTweet-text"))
|
||||||
emojify(text)
|
|
||||||
result = stripText(text.innerText())
|
|
||||||
result = stripTwitterUrls(result)
|
|
||||||
|
|
||||||
proc getTweetText*(tweet: XmlNode): string =
|
proc getTweetText*(tweet: XmlNode): string =
|
||||||
let
|
let
|
||||||
quote = tweet.select(".QuoteTweet")
|
quote = tweet.select(".QuoteTweet")
|
||||||
text = tweet.select(".tweet-text")
|
text = tweet.select(".tweet-text")
|
||||||
link = text.selectAttr("a.twitter-timeline-link.u-hidden", "data-expanded-url")
|
link = text.selectAttr("a.twitter-timeline-link.u-hidden", "data-expanded-url")
|
||||||
|
parseText(text, if quote != nil: link else: "")
|
||||||
emojify(text)
|
|
||||||
result = stripText(text.innerText())
|
|
||||||
|
|
||||||
if quote != nil and link.len > 0:
|
|
||||||
result = result.replace(link, "")
|
|
||||||
|
|
||||||
result = stripTwitterUrls(result)
|
|
||||||
|
|
||||||
proc getTime(tweet: XmlNode): XmlNode =
|
proc getTime(tweet: XmlNode): XmlNode =
|
||||||
tweet.select(".js-short-timestamp")
|
tweet.select(".js-short-timestamp")
|
||||||
|
@ -87,10 +100,10 @@ proc getUsername*(profile: XmlNode; selector: string): string =
|
||||||
profile.selectText(selector).strip(chars={'@', ' ', '\n'})
|
profile.selectText(selector).strip(chars={'@', ' ', '\n'})
|
||||||
|
|
||||||
proc getBio*(profile: XmlNode; selector: string; fallback=""): string =
|
proc getBio*(profile: XmlNode; selector: string; fallback=""): string =
|
||||||
var bio = profile.selectText(selector)
|
var bio = profile.select(selector)
|
||||||
if bio.len == 0 and fallback.len > 0:
|
if bio == nil and fallback.len > 0:
|
||||||
bio = profile.selectText(fallback)
|
bio = profile.select(fallback)
|
||||||
stripText(bio)
|
parseText(bio)
|
||||||
|
|
||||||
proc getLocation*(profile: XmlNode): string =
|
proc getLocation*(profile: XmlNode): string =
|
||||||
let sel = ".ProfileHeaderCard-locationText"
|
let sel = ".ProfileHeaderCard-locationText"
|
||||||
|
|
|
@ -25,7 +25,7 @@ proc renderProfileCard*(profile: Profile; prefs: Prefs): VNode =
|
||||||
tdiv(class="profile-card-extra"):
|
tdiv(class="profile-card-extra"):
|
||||||
if profile.bio.len > 0:
|
if profile.bio.len > 0:
|
||||||
tdiv(class="profile-bio"):
|
tdiv(class="profile-bio"):
|
||||||
p: verbatim linkifyText(profile.bio, prefs)
|
p: verbatim replaceUrl(profile.bio, prefs)
|
||||||
|
|
||||||
if profile.location.len > 0:
|
if profile.location.len > 0:
|
||||||
tdiv(class="profile-location"):
|
tdiv(class="profile-location"):
|
||||||
|
@ -39,8 +39,9 @@ proc renderProfileCard*(profile: Profile; prefs: Prefs): VNode =
|
||||||
if profile.website.len > 0:
|
if profile.website.len > 0:
|
||||||
tdiv(class="profile-website"):
|
tdiv(class="profile-website"):
|
||||||
span:
|
span:
|
||||||
|
let url = replaceUrl(profile.website, prefs)
|
||||||
icon "link"
|
icon "link"
|
||||||
verbatim linkifyText(profile.website, prefs)
|
a(href=url): text shortLink(url)
|
||||||
|
|
||||||
tdiv(class="profile-joindate"):
|
tdiv(class="profile-joindate"):
|
||||||
span(title=getJoinDateFull(profile)):
|
span(title=getJoinDateFull(profile)):
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
#if tweet.pinned: result = "Pinned: "
|
#if tweet.pinned: result = "Pinned: "
|
||||||
#elif tweet.retweet.isSome: result = "RT: "
|
#elif tweet.retweet.isSome: result = "RT: "
|
||||||
#end if
|
#end if
|
||||||
#result &= xmltree.escape(replaceUrl(tweet.text, prefs))
|
#result &= xmltree.escape(replaceUrl(tweet.text, prefs, rss=true))
|
||||||
#if result.len > 0: return
|
#if result.len > 0: return
|
||||||
#end if
|
#end if
|
||||||
#if tweet.photos.len > 0:
|
#if tweet.photos.len > 0:
|
||||||
|
@ -20,7 +20,7 @@
|
||||||
#end proc
|
#end proc
|
||||||
#
|
#
|
||||||
#proc renderRssTweet(tweet: Tweet; prefs: Prefs): string =
|
#proc renderRssTweet(tweet: Tweet; prefs: Prefs): string =
|
||||||
#let text = linkifyText(tweet.text, prefs, rss=true)
|
#let text = replaceUrl(tweet.text, prefs, rss=true)
|
||||||
#if tweet.quote.isSome and get(tweet.quote).available:
|
#if tweet.quote.isSome and get(tweet.quote).available:
|
||||||
#let quoteLink = hostname & getLink(get(tweet.quote))
|
#let quoteLink = hostname & getLink(get(tweet.quote))
|
||||||
<p>${text}<br><a href="https://${quoteLink}">${quoteLink}</a></p>
|
<p>${text}<br><a href="https://${quoteLink}">${quoteLink}</a></p>
|
||||||
|
@ -58,7 +58,7 @@
|
||||||
#end proc
|
#end proc
|
||||||
#
|
#
|
||||||
#proc renderTimelineRss*(timeline: Timeline; profile: Profile): string =
|
#proc renderTimelineRss*(timeline: Timeline; profile: Profile): string =
|
||||||
#let prefs = Prefs(replaceTwitter: hostname)
|
#let prefs = Prefs(replaceTwitter: hostname, replaceYoutube: "invidio.us")
|
||||||
#result = ""
|
#result = ""
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
<rss xmlns:atom="http://www.w3.org/2005/Atom" xmlns:dc="http://purl.org/dc/elements/1.1/" version="2.0">
|
<rss xmlns:atom="http://www.w3.org/2005/Atom" xmlns:dc="http://purl.org/dc/elements/1.1/" version="2.0">
|
||||||
|
@ -84,7 +84,7 @@
|
||||||
#end proc
|
#end proc
|
||||||
#
|
#
|
||||||
#proc renderListRss*(tweets: seq[Tweet]; name, list: string): string =
|
#proc renderListRss*(tweets: seq[Tweet]; name, list: string): string =
|
||||||
#let prefs = Prefs(replaceTwitter: hostname)
|
#let prefs = Prefs(replaceTwitter: hostname, replaceYoutube: "invidio.us")
|
||||||
#let link = &"https://{hostname}/{name}/lists/{list}"
|
#let link = &"https://{hostname}/{name}/lists/{list}"
|
||||||
#result = ""
|
#result = ""
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
@ -102,7 +102,7 @@
|
||||||
#end proc
|
#end proc
|
||||||
#
|
#
|
||||||
#proc renderSearchRss*(tweets: seq[Tweet]; name, param: string): string =
|
#proc renderSearchRss*(tweets: seq[Tweet]; name, param: string): string =
|
||||||
#let prefs = Prefs(replaceTwitter: hostname)
|
#let prefs = Prefs(replaceTwitter: hostname, replaceYoutube: "invidio.us")
|
||||||
#let link = &"https://{hostname}/search"
|
#let link = &"https://{hostname}/search"
|
||||||
#result = ""
|
#result = ""
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
|
|
@ -56,7 +56,7 @@ proc renderUser(user: Profile; prefs: Prefs): VNode =
|
||||||
linkUser(user, class="username")
|
linkUser(user, class="username")
|
||||||
|
|
||||||
tdiv(class="tweet-content media-body"):
|
tdiv(class="tweet-content media-body"):
|
||||||
verbatim linkifyText(user.bio, prefs)
|
verbatim replaceUrl(user.bio, prefs)
|
||||||
|
|
||||||
proc renderTimelineUsers*(results: Result[Profile]; prefs: Prefs; path=""): VNode =
|
proc renderTimelineUsers*(results: Result[Profile]; prefs: Prefs; path=""): VNode =
|
||||||
buildHtml(tdiv(class="timeline")):
|
buildHtml(tdiv(class="timeline")):
|
||||||
|
|
|
@ -215,7 +215,7 @@ proc renderQuote(quote: Quote; prefs: Prefs): VNode =
|
||||||
renderReply(quote)
|
renderReply(quote)
|
||||||
|
|
||||||
tdiv(class="quote-text"):
|
tdiv(class="quote-text"):
|
||||||
verbatim linkifyText(quote.text, prefs)
|
verbatim replaceUrl(quote.text, prefs)
|
||||||
|
|
||||||
if quote.hasThread:
|
if quote.hasThread:
|
||||||
a(class="show-thread", href=getLink(quote)):
|
a(class="show-thread", href=getLink(quote)):
|
||||||
|
@ -248,7 +248,7 @@ proc renderTweet*(tweet: Tweet; prefs: Prefs; path: string; class="";
|
||||||
renderReply(tweet)
|
renderReply(tweet)
|
||||||
|
|
||||||
tdiv(class="tweet-content media-body"):
|
tdiv(class="tweet-content media-body"):
|
||||||
verbatim linkifyText(tweet.text, prefs)
|
verbatim replaceUrl(tweet.text, prefs)
|
||||||
|
|
||||||
if tweet.quote.isSome:
|
if tweet.quote.isSome:
|
||||||
renderQuote(tweet.quote.get(), prefs)
|
renderQuote(tweet.quote.get(), prefs)
|
||||||
|
|
|
@ -51,7 +51,7 @@ link = [
|
||||||
'old.reddit.com/r/programming…'
|
'old.reddit.com/r/programming…'
|
||||||
]],
|
]],
|
||||||
['nim_lang/status/1125887775151140864', [
|
['nim_lang/status/1125887775151140864', [
|
||||||
'en.wikipedia.org/wiki/Nim_(p…)'
|
'en.wikipedia.org/wiki/Nim_(p…'
|
||||||
]],
|
]],
|
||||||
['hiankun_taioan/status/1086916335215341570', [
|
['hiankun_taioan/status/1086916335215341570', [
|
||||||
'(hackernoon.com/interview-wit…)'
|
'(hackernoon.com/interview-wit…)'
|
||||||
|
|
Loading…
Reference in a new issue