Rewrite transcript logic to be more generic (#4747)

The transcript logic in Invidious was written specifically as a workaround for
captions, and not transcripts as a feature.

This PR genericises the logic as so it can be used to implement transcripts
within Invidious.

The most notable change is the added parsing of section headings when it was
previously skipped over in favor of regular lines.
This commit is contained in:
Samantaz Fox 2024-07-10 22:14:56 +02:00
commit a56a724a55
No known key found for this signature in database
GPG key ID: F42821059186176E
2 changed files with 90 additions and 36 deletions

View file

@ -89,9 +89,14 @@ module Invidious::Routes::API::V1::Videos
if CONFIG.use_innertube_for_captions if CONFIG.use_innertube_for_captions
params = Invidious::Videos::Transcript.generate_param(id, caption.language_code, caption.auto_generated) params = Invidious::Videos::Transcript.generate_param(id, caption.language_code, caption.auto_generated)
initial_data = YoutubeAPI.get_transcript(params)
webvtt = Invidious::Videos::Transcript.convert_transcripts_to_vtt(initial_data, caption.language_code) transcript = Invidious::Videos::Transcript.from_raw(
YoutubeAPI.get_transcript(params),
caption.language_code,
caption.auto_generated
)
webvtt = transcript.to_vtt
else else
# Timedtext API handling # Timedtext API handling
url = URI.parse("#{caption.base_url}&tlang=#{tlang}").request_target url = URI.parse("#{caption.base_url}&tlang=#{tlang}").request_target

View file

@ -1,8 +1,26 @@
module Invidious::Videos module Invidious::Videos
# Namespace for methods primarily relating to Transcripts # A `Transcripts` struct encapsulates a sequence of lines that together forms the whole transcript for a given YouTube video.
module Transcript # These lines can be categorized into two types: section headings and regular lines representing content from the video.
record TranscriptLine, start_ms : Time::Span, end_ms : Time::Span, line : String struct Transcript
# Types
record HeadingLine, start_ms : Time::Span, end_ms : Time::Span, line : String
record RegularLine, start_ms : Time::Span, end_ms : Time::Span, line : String
alias TranscriptLine = HeadingLine | RegularLine
property lines : Array(TranscriptLine)
property language_code : String
property auto_generated : Bool
# User friendly label for the current transcript.
# Example: "English (auto-generated)"
property label : String
# Initializes a new Transcript struct with the contents and associated metadata describing it
def initialize(@lines : Array(TranscriptLine), @language_code : String, @auto_generated : Bool, @label : String)
end
# Generates a protobuf string to fetch the requested transcript from YouTube
def self.generate_param(video_id : String, language_code : String, auto_generated : Bool) : String def self.generate_param(video_id : String, language_code : String, auto_generated : Bool) : String
kind = auto_generated ? "asr" : "" kind = auto_generated ? "asr" : ""
@ -30,48 +48,79 @@ module Invidious::Videos
return params return params
end end
def self.convert_transcripts_to_vtt(initial_data : Hash(String, JSON::Any), target_language : String) : String # Constructs a Transcripts struct from the initial YouTube response
# Convert into array of TranscriptLine def self.from_raw(initial_data : Hash(String, JSON::Any), language_code : String, auto_generated : Bool)
lines = self.parse(initial_data) transcript_panel = initial_data.dig("actions", 0, "updateEngagementPanelAction", "content", "transcriptRenderer",
"content", "transcriptSearchPanelRenderer")
segment_list = transcript_panel.dig("body", "transcriptSegmentListRenderer")
if !segment_list["initialSegments"]?
raise NotFoundException.new("Requested transcript does not exist")
end
# Extract user-friendly label for the current transcript
footer_language_menu = transcript_panel.dig?(
"footer", "transcriptFooterRenderer", "languageMenu", "sortFilterSubMenuRenderer", "subMenuItems"
)
if footer_language_menu
label = footer_language_menu.as_a.select(&.["selected"].as_bool)[0]["title"].as_s
else
label = language_code
end
# Extract transcript lines
initial_segments = segment_list["initialSegments"].as_a
lines = [] of TranscriptLine
initial_segments.each do |line|
if unpacked_line = line["transcriptSectionHeaderRenderer"]?
line_type = HeadingLine
else
unpacked_line = line["transcriptSegmentRenderer"]
line_type = RegularLine
end
start_ms = unpacked_line["startMs"].as_s.to_i.millisecond
end_ms = unpacked_line["endMs"].as_s.to_i.millisecond
text = extract_text(unpacked_line["snippet"]) || ""
lines << line_type.new(start_ms, end_ms, text)
end
return Transcript.new(
lines: lines,
language_code: language_code,
auto_generated: auto_generated,
label: label
)
end
# Converts transcript lines to a WebVTT file
#
# This is used within Invidious to replace subtitles
# as to workaround YouTube's rate-limited timedtext endpoint.
def to_vtt
settings_field = { settings_field = {
"Kind" => "captions", "Kind" => "captions",
"Language" => target_language, "Language" => @language_code,
} }
# Taken from Invidious::Videos::Captions::Metadata.timedtext_to_vtt()
vtt = WebVTT.build(settings_field) do |vtt| vtt = WebVTT.build(settings_field) do |vtt|
lines.each do |line| @lines.each do |line|
# Section headers are excluded from the VTT conversion as to
# match the regular captions returned from YouTube as much as possible
next if line.is_a? HeadingLine
vtt.cue(line.start_ms, line.end_ms, line.line) vtt.cue(line.start_ms, line.end_ms, line.line)
end end
end end
return vtt return vtt
end end
private def self.parse(initial_data : Hash(String, JSON::Any))
body = initial_data.dig("actions", 0, "updateEngagementPanelAction", "content", "transcriptRenderer",
"content", "transcriptSearchPanelRenderer", "body", "transcriptSegmentListRenderer",
"initialSegments").as_a
lines = [] of TranscriptLine
body.each do |line|
# Transcript section headers. They are not apart of the captions and as such we can safely skip them.
if line.as_h.has_key?("transcriptSectionHeaderRenderer")
next
end
line = line["transcriptSegmentRenderer"]
start_ms = line["startMs"].as_s.to_i.millisecond
end_ms = line["endMs"].as_s.to_i.millisecond
text = extract_text(line["snippet"]) || ""
lines << TranscriptLine.new(start_ms, end_ms, text)
end
return lines
end
end end
end end