Parse HTML properly instead of relying on regexes

This commit is contained in:
Omar Roth 2018-03-03 15:59:21 -06:00
parent b8fe82a7f7
commit d573461a67

View file

@@ -132,8 +132,19 @@ def fetch_video(id, client)
dislikes = dislikes ? dislikes.content.delete(",").to_i : 0 dislikes = dislikes ? dislikes.content.delete(",").to_i : 0
description = html.xpath_node(%q(//p[@id="eow-description"])) description = html.xpath_node(%q(//p[@id="eow-description"]))
if description
description.xpath_nodes(%q(//a/@href)).each do |match|
uri = URI.parse(match.content)
if uri.host =~ /(www\.)?youtube.com/
uri = uri.full_path
puts uri
end
match.content = uri.to_s
end
end
description = description ? description.to_xml : "" description = description ? description.to_xml : ""
description = description.gsub(/(https:\/\/)|(http:\/\/)?(www\.)?(youtube\.com)/, "")
wilson_score = ci_lower_bound(likes, likes + dislikes) wilson_score = ci_lower_bound(likes, likes + dislikes)
@@ -278,6 +289,20 @@ def template_comments(root)
author = child["data"]["author"] author = child["data"]["author"]
score = child["data"]["score"] score = child["data"]["score"]
body_html = HTML.unescape(child["data"]["body_html"].as_s) body_html = HTML.unescape(child["data"]["body_html"].as_s)
body_html = XML.parse_html(body_html)
body_html.xpath_nodes(%q(//a/@href)).each do |match|
uri = URI.parse(match.content)
if uri.host =~ /(www\.)?youtube.com/
uri = uri.full_path
puts uri
end
match.content = uri.to_s
end
body_html = body_html.to_s
replies_html = "" replies_html = ""
if child["data"]["replies"] != "" if child["data"]["replies"] != ""
@@ -317,8 +342,6 @@ def template_comments(root)
end end
end end
html = html.gsub(/(https:\/\/)|(http:\/\/)?(www\.)?(youtube\.com)/, "")
return html return html
end end