Use XML.parse instead of XML.parse_html

Due to recent changes to libxml2 (between 2.9.14 and 2.10.4;
see https://gitlab.gnome.org/GNOME/libxml2/-/issues/508), the
HTML parser doesn't take into account the namespaces (xmlns).

Because HTML shouldn't contain namespaces anyway, there is no
reason for us to keep using it. But switching to the XML
parser means that we have to pass the namespaces to every
single 'xpath_node(s)' method for it to be able to properly
navigate the XML structure.
This commit is contained in:
Samantaz Fox 2023-05-08 00:53:08 +02:00
parent 1eb1bae370
commit ce1fb8d08c
No known key found for this signature in database
GPG Key ID: F42821059186176E
1 changed file with 25 additions and 11 deletions

View File

@ -159,12 +159,18 @@ def fetch_channel(ucid, pull_all_videos : Bool)
LOGGER.debug("fetch_channel: #{ucid}")
LOGGER.trace("fetch_channel: #{ucid} : pull_all_videos = #{pull_all_videos}")
namespaces = {
"yt" => "http://www.youtube.com/xml/schemas/2015",
"media" => "http://search.yahoo.com/mrss/",
"default" => "http://www.w3.org/2005/Atom",
}
LOGGER.trace("fetch_channel: #{ucid} : Downloading RSS feed")
rss = YT_POOL.client &.get("/feeds/videos.xml?channel_id=#{ucid}").body
LOGGER.trace("fetch_channel: #{ucid} : Parsing RSS feed")
rss = XML.parse_html(rss)
rss = XML.parse(rss)
author = rss.xpath_node(%q(//feed/title))
author = rss.xpath_node("//default:feed/default:title", namespaces)
if !author
raise InfoException.new("Deleted or invalid channel")
end
@ -192,15 +198,23 @@ def fetch_channel(ucid, pull_all_videos : Bool)
videos, continuation = IV::Channel::Tabs.get_videos(channel)
LOGGER.trace("fetch_channel: #{ucid} : Extracting videos from channel RSS feed")
rss.xpath_nodes("//feed/entry").each do |entry|
video_id = entry.xpath_node("videoid").not_nil!.content
title = entry.xpath_node("title").not_nil!.content
published = Time.parse_rfc3339(entry.xpath_node("published").not_nil!.content)
updated = Time.parse_rfc3339(entry.xpath_node("updated").not_nil!.content)
author = entry.xpath_node("author/name").not_nil!.content
ucid = entry.xpath_node("channelid").not_nil!.content
views = entry.xpath_node("group/community/statistics").try &.["views"]?.try &.to_i64?
views ||= 0_i64
rss.xpath_nodes("//default:feed/default:entry", namespaces).each do |entry|
video_id = entry.xpath_node("yt:videoid", namespaces).not_nil!.content
title = entry.xpath_node("default:title", namespaces).not_nil!.content
published = Time.parse_rfc3339(
entry.xpath_node("default:published", namespaces).not_nil!.content
)
updated = Time.parse_rfc3339(
entry.xpath_node("default:updated", namespaces).not_nil!.content
)
author = entry.xpath_node("default:author/default:name", namespaces).not_nil!.content
ucid = entry.xpath_node("yt:channelid", namespaces).not_nil!.content
views = entry
.xpath_node("media:group/media:community/media:statistics", namespaces)
.try &.["views"]?.try &.to_i64? || 0_i64
channel_video = videos
.select(SearchVideo)