nitter/src/parserutils.nim

168 lines
5.1 KiB
Nim
Raw Normal View History

2019-06-25 00:38:18 +00:00
import xmltree, htmlparser, strtabs, strformat, times
2019-06-26 16:51:21 +00:00
import regex
2019-06-23 23:34:30 +00:00
2019-06-24 03:14:14 +00:00
import ./types, ./formatters, ./api
2019-06-23 23:34:30 +00:00
2019-06-26 16:51:21 +00:00
from q import nil
2019-06-23 23:34:30 +00:00
const
  # Captures the quoted URL inside a CSS `:url('...')` value.
  thumbRegex = re".+:url\('([^']+)'\)"
  # Captures the segment between `thumb/` and `.jpg` in a thumbnail URL.
  gifRegex = re".+thumb/([^\.']+)\.jpg.*"
2019-06-26 16:51:21 +00:00
proc selectAll*(node: XmlNode; selector: string): seq[XmlNode] =
  ## Returns every node under `node` matched by `selector`, delegating to
  ## `q.select`. A nil `node` yields an empty seq.
  if not node.isNil:
    result = q.select(node, selector)
proc select*(node: XmlNode; selector: string): XmlNode =
  ## Returns the first node matched by `selector`, or nil when `node` is nil
  ## or nothing matches.
  if node.isNil:
    return nil
  let matches = node.selectAll(selector)
  if matches.len > 0:
    result = matches[0]
2019-06-26 17:59:28 +00:00
proc select*(node: XmlNode; parent, child: string): XmlNode =
  ## Finds the first `parent` match under `node` and returns the first
  ## `child` match inside it. Nil when either lookup finds nothing.
  let outer = node.select(parent)
  if not outer.isNil:
    result = outer.select(child)
2019-06-27 19:07:29 +00:00
proc selectAttr*(node: XmlNode; selector: string; attr: string): string =
  ## Returns the value of attribute `attr` on the first node matched by
  ## `selector`, or "" when nothing matches.
  let found = node.select(selector)
  if found.isNil:
    return ""
  result = found.attr(attr)
2019-06-23 23:34:30 +00:00
proc selectText*(node: XmlNode; selector: string): string =
  ## Returns the inner text of the first node matched by `selector`, or ""
  ## when nothing matches.
  let found = node.select(selector)
  if found.isNil:
    return ""
  result = found.innerText()
2019-06-24 07:30:34 +00:00
proc getHeader(profile: XmlNode): XmlNode =
  ## Returns the first container found in `profile`, trying each selector
  ## below in order. Nil when none of them match.
  const selectors = [
    ".permalink-header",
    ".stream-item-header",
    ".ProfileCard-userFields"
  ]
  for sel in selectors:
    result = profile.select(sel)
    if not result.isNil:
      return
2019-06-24 07:30:34 +00:00
2019-06-23 23:34:30 +00:00
proc isVerified*(profile: XmlNode): bool =
  ## True when the `.Icon.Icon--verified` node inside the header returned by
  ## `getHeader` yields non-empty text.
  let iconText = getHeader(profile).selectText(".Icon.Icon--verified")
  result = iconText.len > 0
2019-06-23 23:34:30 +00:00
proc isProtected*(profile: XmlNode): bool =
  ## True when the `.Icon.Icon--protected` node inside the header returned
  ## by `getHeader` yields non-empty text.
  let iconText = getHeader(profile).selectText(".Icon.Icon--protected")
  result = iconText.len > 0
2019-06-23 23:34:30 +00:00
proc getName*(profile: XmlNode; selector: string): string =
  ## Text of the first node matched by `selector`, passed through
  ## `stripText`.
  let raw = profile.selectText(selector)
  result = raw.stripText()
2019-06-23 23:34:30 +00:00
proc getUsername*(profile: XmlNode; selector: string): string =
  ## Text of the first node matched by `selector`, with '@' and space
  ## characters stripped from both ends.
  let raw = profile.selectText(selector)
  result = raw.strip(chars = {'@', ' '})
2019-06-25 00:38:18 +00:00
proc emojify*(node: XmlNode) =
  ## Appends a text child holding the `alt` attribute value to every
  ## `.Emoji` node under `node`. Callers in this file do this before
  ## reading `innerText`.
  for emoji in node.selectAll(".Emoji"):
    let alt = emoji.attr("alt")
    emoji.add newText(alt)
2019-06-25 00:38:18 +00:00
2019-06-25 02:52:38 +00:00
proc getQuoteText*(tweet: XmlNode): string =
  ## Returns the text of the `.QuoteTweet-text` node inside `tweet`, with
  ## emoji inlined and the result passed through `stripText` and
  ## `stripTwitterUrls`. Returns "" when the node is absent.
  let text = tweet.select(".QuoteTweet-text")
  # `innerText` dereferences its argument, so stop before touching nil.
  if text == nil:
    return ""

  emojify(text)
  result = stripText(text.innerText())
  result = stripTwitterUrls(result)
2019-06-23 23:34:30 +00:00
proc getTweetText*(tweet: XmlNode): string =
  ## Returns the text of the `.tweet-text` node inside `tweet`, with emoji
  ## inlined and the result passed through `stripText` and
  ## `stripTwitterUrls`. When the tweet also contains a `.QuoteTweet`, the
  ## hidden timeline link's expanded URL is removed from the text first.
  ## Returns "" when there is no `.tweet-text` node.
  let text = tweet.select(".tweet-text")
  # `innerText` dereferences its argument, so stop before touching nil.
  if text == nil:
    return ""

  let
    quote = tweet.select(".QuoteTweet")
    link = text.selectAttr("a.twitter-timeline-link.u-hidden",
                           "data-expanded-url")

  emojify(text)
  result = stripText(text.innerText())

  if quote != nil and link.len > 0:
    result = result.replace(link, "")

  result = stripTwitterUrls(result)
2019-06-23 23:34:30 +00:00
proc getTime(tweet: XmlNode): XmlNode =
  ## Returns the `.js-short-timestamp` node of `tweet`, or nil when absent.
  result = tweet.select(".js-short-timestamp")
2019-06-23 23:34:30 +00:00
proc getTimestamp*(tweet: XmlNode): Time =
  ## Converts the `data-time` attribute of the tweet's timestamp node to a
  ## `Time` via `fromUnix`. Falls back to Unix time 0 when the node or the
  ## attribute is missing. A non-numeric value is left for `parseInt` to
  ## reject.
  let node = getTime(tweet)
  # `attr` dereferences the node, so a missing timestamp is handled here.
  let raw = if node == nil: "" else: node.attr("data-time")
  fromUnix(if raw.len > 0: parseInt(raw) else: 0)
2019-06-23 23:34:30 +00:00
proc getShortTime*(tweet: XmlNode): string =
  ## Returns the inner text of the tweet's timestamp node, or "" when the
  ## node is absent.
  let node = getTime(tweet)
  # `innerText` dereferences the node, so only call it on a real match.
  if node != nil:
    result = node.innerText()
proc getBio*(profile: XmlNode; selector: string): string =
  ## Text of the first node matched by `selector`, passed through
  ## `stripText`.
  let raw = profile.selectText(selector)
  result = raw.stripText()
2019-06-23 23:34:30 +00:00
proc getAvatar*(profile: XmlNode; selector: string): string =
  ## `src` attribute of the first node matched by `selector`, passed
  ## through `getUserpic`.
  let src = profile.selectAttr(selector, "src")
  result = src.getUserpic()
proc getBanner*(tweet: XmlNode): string =
  ## Returns the banner for a profile node. Uses the `svg > image` link
  ## (with "600x200" swapped for "1500x500") when present, otherwise the
  ## `style` attribute of `.ProfileCard-bg`, otherwise a fixed
  ## background-color declaration.
  let url = tweet.selectAttr("svg > image", "xlink:href")
  if url.len > 0:
    return url.replace("600x200", "1500x500")

  let style = tweet.selectAttr(".ProfileCard-bg", "style")
  result =
    if style.len > 0: style
    else: "background-color: #161616"
proc getPopupStats*(profile: var Profile; node: XmlNode) =
  ## Fills `followers`, `following` and `tweets` on `profile` from the
  ## `.ProfileCardStats-statLink` nodes under `node`. The value is the
  ## first space-separated word of each link's `title`. The last path
  ## segment of its `href` picks the field; anything other than
  ## "followers" or "following" is stored as tweets.
  for link in node.selectAll(".ProfileCardStats-statLink"):
    let
      count = link.attr("title").split(" ")[0]
      kind = link.attr("href").split("/")[^1]
    if kind == "followers":
      profile.followers = count
    elif kind == "following":
      profile.following = count
    else:
      profile.tweets = count
proc getIntentStats*(profile: var Profile; node: XmlNode) =
  ## Sets `profile.tweets` to "?" and fills `followers` / `following` from
  ## the inner text of the `dd.count > a` links under `node`. The last
  ## path segment of each `href` picks the field; other links are ignored.
  profile.tweets = "?"
  for link in node.selectAll("dd.count > a"):
    let
      count = link.innerText()
      kind = link.attr("href").split("/")[^1]
    if kind == "followers":
      profile.followers = count
    elif kind == "following":
      profile.following = count
2019-06-24 03:14:14 +00:00
proc getTweetStats*(tweet: Tweet; node: XmlNode) =
  ## Resets `replies`, `retweets` and `likes` on `tweet` to "0", then
  ## overwrites them from the `.ProfileTweet-actionCountForAria` nodes
  ## under `node`. Each node's text is split on whitespace: the first word
  ## is the count and the first three characters of the second word pick
  ## the field. Nodes that do not fit that shape are skipped.
  tweet.replies = "0"
  tweet.retweets = "0"
  tweet.likes = "0"

  for action in node.selectAll(".ProfileTweet-actionCountForAria"):
    let text = action.innerText.split()
    # Indexing `text[1][0 .. 2]` on shorter input would raise an index
    # error, so skip malformed entries instead.
    if text.len < 2 or text[1].len < 3:
      continue

    case text[1][0 .. 2]
    of "ret": tweet.retweets = text[0]
    of "rep": tweet.replies = text[0]
    of "lik": tweet.likes = text[0]
    else: discard
2019-06-23 23:34:30 +00:00
2019-06-24 03:14:14 +00:00
proc getGif(player: XmlNode): Gif =
  ## Builds a `Gif` from the player's `style` attribute. The thumbnail is
  ## extracted with `thumbRegex`, the id with `gifRegex`, and the video
  ## URL is formatted from that id.
  let
    style = player.attr("style")
    thumbUrl = style.replace(thumbRegex, "$1")
    gifId = thumbUrl.replace(gifRegex, "$1")
  result = Gif(
    url: fmt"https://video.twimg.com/tweet_video/{gifId}.mp4",
    thumb: thumbUrl
  )
proc getTweetMedia*(tweet: Tweet; node: XmlNode) =
  ## Collects media for `tweet` from `node`. Every
  ## `.AdaptiveMedia-photoContainer` with a `data-image-url` is added to
  ## `tweet.photos`. If a `.PlayableMedia` node exists, its `class`
  ## decides whether `tweet.gif` (built by `getGif`) or `tweet.video` is
  ## set.
  for photo in node.selectAll(".AdaptiveMedia-photoContainer"):
    # `attr` returns "" for a missing attribute, whereas `attrs[...]`
    # raises KeyError; skip containers without an image URL.
    let url = photo.attr("data-image-url")
    if url.len > 0:
      tweet.photos.add url

  let player = node.select(".PlayableMedia")
  if player == nil:
    return

  let classes = player.attr("class")
  if "gif" in classes:
    # `getGif` reads attributes off the inner player, so it must exist.
    let gifPlayer = player.select(".PlayableMedia-player")
    if gifPlayer != nil:
      tweet.gif = some(getGif(gifPlayer))
  elif "video" in classes:
    tweet.video = some(Video())
2019-06-24 06:07:36 +00:00
proc getQuoteMedia*(quote: var Quote; node: XmlNode) =
  ## Fills media fields on `quote` from `node`. A `.QuoteTweet--sensitive`
  ## match only sets `quote.sensitive` and stops. Otherwise the `src` of
  ## the `img` inside `.QuoteMedia` becomes the thumb, and the badge is
  ## the `.AdaptiveMedia-badgeText` text or "GIF" when only an
  ## `.Icon--gifBadge` is present.
  if not node.select(".QuoteTweet--sensitive").isNil:
    quote.sensitive = true
    return

  let media = node.select(".QuoteMedia")
  if not media.isNil:
    quote.thumb = some(media.selectAttr("img", "src"))

  let badge = node.select(".AdaptiveMedia-badgeText")
  if not badge.isNil:
    quote.badge = some(badge.innerText())
  elif not node.select(".Icon--gifBadge").isNil:
    quote.badge = some("GIF")