Videos: Add support for attributed descriptions (#3701)

This commit is contained in:
Samantaz Fox 2023-04-10 17:54:22 +02:00
commit adc605024f
No known key found for this signature in database
GPG key ID: F42821059186176E
5 changed files with 131 additions and 23 deletions

2
mocks

@ -1 +1 @@
Subproject commit cb16e0343c8f94182615610bfe3c503db89717a7 Subproject commit 11ec372f72747c09d48ffef04843f72be67d5b54

View file

@ -17,8 +17,8 @@ Spectator.describe "parse_video_info" do
# Basic video infos # Basic video infos
expect(info["title"].as_s).to eq("I Gave My 100,000,000th Subscriber An Island") expect(info["title"].as_s).to eq("I Gave My 100,000,000th Subscriber An Island")
expect(info["views"].as_i).to eq(115_784_415) expect(info["views"].as_i).to eq(126_573_823)
expect(info["likes"].as_i).to eq(4_932_790) expect(info["likes"].as_i).to eq(5_157_654)
# For some reason the video length from VideoDetails and the # For some reason the video length from VideoDetails and the
# one from microformat differs by 1s... # one from microformat differs by 1s...
@ -48,12 +48,12 @@ Spectator.describe "parse_video_info" do
expect(info["relatedVideos"].as_a.size).to eq(20) expect(info["relatedVideos"].as_a.size).to eq(20)
expect(info["relatedVideos"][0]["id"]).to eq("iogcY_4xGjo") expect(info["relatedVideos"][0]["id"]).to eq("Hwybp38GnZw")
expect(info["relatedVideos"][0]["title"]).to eq("$1 vs $1,000,000 Hotel Room!") expect(info["relatedVideos"][0]["title"]).to eq("I Built Willy Wonka's Chocolate Factory!")
expect(info["relatedVideos"][0]["author"]).to eq("MrBeast") expect(info["relatedVideos"][0]["author"]).to eq("MrBeast")
expect(info["relatedVideos"][0]["ucid"]).to eq("UCX6OQ3DkcsbYNE6H8uQQuVA") expect(info["relatedVideos"][0]["ucid"]).to eq("UCX6OQ3DkcsbYNE6H8uQQuVA")
expect(info["relatedVideos"][0]["view_count"]).to eq("172972109") expect(info["relatedVideos"][0]["view_count"]).to eq("179877630")
expect(info["relatedVideos"][0]["short_view_count"]).to eq("172M") expect(info["relatedVideos"][0]["short_view_count"]).to eq("179M")
expect(info["relatedVideos"][0]["author_verified"]).to eq("true") expect(info["relatedVideos"][0]["author_verified"]).to eq("true")
# Description # Description
@ -76,11 +76,11 @@ Spectator.describe "parse_video_info" do
expect(info["ucid"].as_s).to eq("UCX6OQ3DkcsbYNE6H8uQQuVA") expect(info["ucid"].as_s).to eq("UCX6OQ3DkcsbYNE6H8uQQuVA")
expect(info["authorThumbnail"].as_s).to eq( expect(info["authorThumbnail"].as_s).to eq(
"https://yt3.ggpht.com/ytc/AL5GRJUfhQdJS6n-YJtsAf-ouS2myDavDOq_zXBfebal3Q=s48-c-k-c0x00ffffff-no-rj" "https://yt3.ggpht.com/ytc/AL5GRJVuqw82ERvHzsmBxL7avr1dpBtsVIXcEzBPZaloFg=s48-c-k-c0x00ffffff-no-rj"
) )
expect(info["authorVerified"].as_bool).to be_true expect(info["authorVerified"].as_bool).to be_true
expect(info["subCountText"].as_s).to eq("135M") expect(info["subCountText"].as_s).to eq("143M")
end end
it "parses a regular video with no descrition/comments" do it "parses a regular video with no descrition/comments" do
@ -99,7 +99,7 @@ Spectator.describe "parse_video_info" do
# Basic video infos # Basic video infos
expect(info["title"].as_s).to eq("Chris Rea - Auberge") expect(info["title"].as_s).to eq("Chris Rea - Auberge")
expect(info["views"].as_i).to eq(10_698_554) expect(info["views"].as_i).to eq(10_943_126)
expect(info["likes"].as_i).to eq(0) expect(info["likes"].as_i).to eq(0)
expect(info["lengthSeconds"].as_i).to eq(283_i64) expect(info["lengthSeconds"].as_i).to eq(283_i64)
expect(info["published"].as_s).to eq("2012-05-21T00:00:00Z") expect(info["published"].as_s).to eq("2012-05-21T00:00:00Z")
@ -132,21 +132,21 @@ Spectator.describe "parse_video_info" do
# Related videos # Related videos
expect(info["relatedVideos"].as_a.size).to eq(18) expect(info["relatedVideos"].as_a.size).to eq(19)
expect(info["relatedVideos"][0]["id"]).to eq("rfyZrJUmzxU") expect(info["relatedVideos"][0]["id"]).to eq("Ww3KeZ2_Yv4")
expect(info["relatedVideos"][0]["title"]).to eq("cheb mami - bekatni") expect(info["relatedVideos"][0]["title"]).to eq("Chris Rea")
expect(info["relatedVideos"][0]["author"]).to eq("pelitovic") expect(info["relatedVideos"][0]["author"]).to eq("PanMusic")
expect(info["relatedVideos"][0]["ucid"]).to eq("UCsp6vFyJeGoLxgn-AsHp1tw") expect(info["relatedVideos"][0]["ucid"]).to eq("UCsKAPSuh1iNbLWUga_igPyA")
expect(info["relatedVideos"][0]["view_count"]).to eq("13863619") expect(info["relatedVideos"][0]["view_count"]).to eq("31581")
expect(info["relatedVideos"][0]["short_view_count"]).to eq("13M") expect(info["relatedVideos"][0]["short_view_count"]).to eq("31K")
expect(info["relatedVideos"][0]["author_verified"]).to eq("false") expect(info["relatedVideos"][0]["author_verified"]).to eq("false")
# Description # Description
expect(info["description"].as_s).to eq(" ") expect(info["description"].as_s).to eq(" ")
expect(info["shortDescription"].as_s).to be_empty expect(info["shortDescription"].as_s).to be_empty
expect(info["descriptionHtml"].as_s).to eq("<p></p>") expect(info["descriptionHtml"].as_s).to eq("")
# Video metadata # Video metadata

View file

@ -86,9 +86,10 @@ Spectator.describe "parse_video_info" do
expect(info["description"].as_s).to start_with(description_start_text) expect(info["description"].as_s).to start_with(description_start_text)
expect(info["shortDescription"].as_s).to start_with(description_start_text) expect(info["shortDescription"].as_s).to start_with(description_start_text)
expect(info["descriptionHtml"].as_s).to start_with( # TODO: Update mocks right before the start of PDB podcast, either on friday or saturday (time unknown)
"PBD Podcast Episode 241. The home team is ready and at it again with the latest news, interesting topics and trending conversations on topics that matter. Try our sponsor Aura for 14 days free - <a href=\"https://aura.com/pbd\">aura.com/pbd</a>" # expect(info["descriptionHtml"].as_s).to start_with(
) # "PBD Podcast Episode 241. The home team is ready and at it again with the latest news, interesting topics and trending conversations on topics that matter. Try our sponsor Aura for 14 days free - <a href=\"https://aura.com/pbd\">aura.com/pbd</a>"
# )
# Video metadata # Video metadata

View file

@ -0,0 +1,105 @@
require "html"
require "json"
require "uri"
# Converts one entry of an attributed description's "commandRuns" array into
# an HTML fragment.
#
# Arguments:
#  - command: the command run object. The tap action is read from its
#    "onTap" -> "innertubeCommand" member.
#  - string: the piece of the description text covered by this command.
#
# Returns an `<a>` element for the three endpoint kinds handled below
# ("urlEndpoint", "watchEndpoint" and anything carrying a
# "commandMetadata" -> "webCommandMetadata" object). When no link can be
# built, the HTML-escaped text is returned unchanged so that nothing
# written in the description is lost.
#
# Every value interpolated into the markup goes through HTML.escape, as both
# the text and the URLs come from the (untrusted) description.
def parse_command(command : JSON::Any?, string : String) : String?
  text = HTML.escape(string)

  on_tap = command.try &.dig?("onTap", "innertubeCommand")
  return text if on_tap.nil?

  # 3rd party URL, extract original URL from YouTube tracking URL
  if url_endpoint = on_tap["urlEndpoint"]?
    youtube_url = URI.parse url_endpoint["url"].as_s
    original_url = youtube_url.query_params["q"]?

    # No "q" parameter to extract: keep the text visible instead of
    # silently dropping that part of the description.
    return text if original_url.nil?

    href = HTML.escape(original_url)
    return "<a href=\"#{href}\">#{href}</a>"

    # 1st party watch URL
  elsif watch_endpoint = on_tap["watchEndpoint"]?
    video_id = watch_endpoint["videoId"].as_s

    # The start time is only appended when the endpoint provides one.
    start_time = watch_endpoint["startTimeSeconds"]?.try &.as_i

    url = "/watch?v=#{video_id}"
    url += "&t=#{start_time}s" unless start_time.nil?
    href = HTML.escape(url)

    # if string is a timestamp, use the string instead
    # this is a lazy regex for validating timestamps
    if /(?:\d{1,2}:){1,2}\d{2}/ =~ string
      return "<a href=\"#{href}\">#{text}</a>"
    else
      return "<a href=\"#{href}\">#{href}</a>"
    end

    # hashtag/other browse URLs
  elsif browse_endpoint = on_tap.dig?("commandMetadata", "webCommandMetadata")
    url = browse_endpoint["url"]?.try &.as_s
    return text if url.nil?

    href = HTML.escape(url)

    # remove unnecessary character in a channel name
    if browse_endpoint["webPageType"]?.try(&.as_s) == "WEB_PAGE_TYPE_CHANNEL"
      if handle = string.match(/@[\w\d.-]+/).try &.[0]
        return "<a href=\"#{href}\">#{HTML.escape(handle)}</a>"
      end
    end

    return "<a href=\"#{href}\">#{text}</a>"
  end

  # Unknown command type: fall back to the plain (escaped) text.
  return text
end
# Pulls codepoints from `iter` and appends them to `str` until at least
# `count` UTF-16 code units have been written, or until the iterator is
# exhausted, whichever comes first.
#
# A codepoint above 0xFFFF is worth two units (it would be a surrogate pair
# in UTF-16); every other codepoint is worth one.
#
# Returns the number of units actually written.
private def copy_string(str : String::Builder, iter : Iterator, count : Int) : Int
  units = 0

  until units >= count
    codepoint = iter.next
    break if codepoint.is_a?(Iterator::Stop)

    str << codepoint.chr

    # A codepoint from the SMP counts twice
    units += codepoint > 0xFFFF ? 2 : 1
  end

  units
end
# Renders an "attributedDescription" object as an HTML string.
#
# `desc` is expected to hold the raw description text under "content" and,
# optionally, an array of link commands under "commandRuns". Each command
# gives a "startIndex" and a "length" into the text and is rendered with
# `parse_command`; the text around the commands is HTML-escaped and copied
# as is.
#
# Returns an empty string when `desc` is nil or has no (or empty) content.
def parse_description(desc : JSON::Any?) : String?
  return "" if desc.nil?

  content = desc["content"]?.try(&.as_s) || ""
  return "" if content.empty?

  # The result is used as HTML, so plain text must never go out unescaped.
  commands = desc["commandRuns"]?.try &.as_a
  return HTML.escape(content) if commands.nil?

  # Not everything is stored in UTF-8 on youtube's side. The SMP codepoints
  # (0x10000 and above) are encoded as UTF-16 surrogate pairs, which are
  # automatically decoded by the JSON parser. It means that we need to count
  # copied units in a special manner, preventing the use of regular string copy.
  iter = content.each_codepoint

  # Position in the text, counted in UTF-16 code units like the indexes
  # found in the commands.
  index = 0

  return String.build do |str|
    commands.each do |command|
      cmd_start = command["startIndex"].as_i
      cmd_length = command["length"].as_i

      # Copy the text chunk between this command and the previous if needed.
      gap = cmd_start - index
      if gap > 0
        copied = 0
        chunk = String.build { |buf| copied = copy_string(buf, iter, gap) }
        HTML.escape(chunk, str)
        index += copied
      end

      # We need to copy the command's text using the iterator
      # and the special function defined above.
      cmd_content = String.build(cmd_length) do |str2|
        copy_string(str2, iter, cmd_length)
      end

      str << parse_command(command, cmd_content)
      index += cmd_length
    end

    # Copy the end of the string (past the last command). The iterator is
    # simply drained: `content.size` counts codepoints whereas `index` counts
    # UTF-16 units, so subtracting one from the other would cut the tail
    # short whenever the text contains SMP codepoints (e.g. emoji).
    tail = String.build do |buf|
      iter.each { |codepoint| buf << codepoint.chr }
    end
    HTML.escape(tail, str)
  end
end

View file

@ -284,8 +284,10 @@ def parse_video_info(video_id : String, player_response : Hash(String, JSON::Any
description = microformat.dig?("description", "simpleText").try &.as_s || "" description = microformat.dig?("description", "simpleText").try &.as_s || ""
short_description = player_response.dig?("videoDetails", "shortDescription") short_description = player_response.dig?("videoDetails", "shortDescription")
description_html = video_secondary_renderer.try &.dig?("description", "runs") # description_html = video_secondary_renderer.try &.dig?("description", "runs")
.try &.as_a.try { |t| content_to_comment_html(t, video_id) } # .try &.as_a.try { |t| content_to_comment_html(t, video_id) }
description_html = parse_description(video_secondary_renderer.try &.dig?("attributedDescription"))
# Video metadata # Video metadata