Fix extractor bugs (#2454)

* Add debug/trace logging to extract_items
* Handle invalid timestamps for livestreams extraction
* Make use of author_fallback in playlist extractor
* Don't use extract_text for video length extraction

The extract_text function attempts to extract from both the simpleText and
the runs route. This is typically what we'd want for text extraction, as
the text could appear in either location. However, while this still holds
true in general, the thumbnailOverlayTimeStatusRenderer writes a numerical
length (when present on the video) to the simpleText route and uses runs
for a text overlay like "LIVE" or "PREMIERE".

Therefore, when a video has a text overlay instead of a numerical one,
Invidious still passes it on to decode_length_seconds, which raises
because the overlay text cannot be converted into integers.

In the future, if more routes require one text route over the other, we
should go ahead and add an argument to extract_text itself. For now,
though, this is sufficient.

* Handle unsupported "special" categories
This commit is contained in:
syeopite 2021-10-07 21:39:21 +00:00 committed by GitHub
parent f85930700b
commit 21e29411af
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -74,7 +74,15 @@ private module Parsers
if length_container = item_contents["lengthText"]? if length_container = item_contents["lengthText"]?
length_seconds = decode_length_seconds(length_container["simpleText"].as_s) length_seconds = decode_length_seconds(length_container["simpleText"].as_s)
elsif length_container = item_contents["thumbnailOverlays"]?.try &.as_a.find(&.["thumbnailOverlayTimeStatusRenderer"]?) elsif length_container = item_contents["thumbnailOverlays"]?.try &.as_a.find(&.["thumbnailOverlayTimeStatusRenderer"]?)
length_seconds = extract_text(length_container["thumbnailOverlayTimeStatusRenderer"]["text"]).try { |t| decode_length_seconds(t) } || 0 # This needs to only go down the `simpleText` path (if possible). If more situations came up that requires
# a specific pathway then we should add an argument to extract_text that'll make this possible
length_seconds = length_container.dig?("thumbnailOverlayTimeStatusRenderer", "text", "simpleText")
if length_seconds
length_seconds = decode_length_seconds(length_seconds.as_s)
else
length_seconds = 0
end
else else
length_seconds = 0 length_seconds = 0
end end
@ -113,6 +121,10 @@ private module Parsers
premiere_timestamp: premiere_timestamp, premiere_timestamp: premiere_timestamp,
}) })
end end
def self.parser_name
return {{@type.name}}
end
end end
# Parses a InnerTube channelRenderer into a SearchChannel. Returns nil when the given object isn't a channelRenderer # Parses a InnerTube channelRenderer into a SearchChannel. Returns nil when the given object isn't a channelRenderer
@ -159,6 +171,10 @@ private module Parsers
auto_generated: auto_generated, auto_generated: auto_generated,
}) })
end end
def self.parser_name
return {{@type.name}}
end
end end
# Parses a InnerTube gridPlaylistRenderer into a SearchPlaylist. Returns nil when the given object isn't a gridPlaylistRenderer # Parses a InnerTube gridPlaylistRenderer into a SearchPlaylist. Returns nil when the given object isn't a gridPlaylistRenderer
@ -194,6 +210,10 @@ private module Parsers
thumbnail: playlist_thumbnail, thumbnail: playlist_thumbnail,
}) })
end end
def self.parser_name
return {{@type.name}}
end
end end
# Parses a InnerTube playlistRenderer into a SearchPlaylist. Returns nil when the given object isn't a playlistRenderer # Parses a InnerTube playlistRenderer into a SearchPlaylist. Returns nil when the given object isn't a playlistRenderer
@ -207,20 +227,20 @@ private module Parsers
module PlaylistRendererParser module PlaylistRendererParser
def self.process(item : JSON::Any, author_fallback : AuthorFallback) def self.process(item : JSON::Any, author_fallback : AuthorFallback)
if item_contents = item["playlistRenderer"]? if item_contents = item["playlistRenderer"]?
return self.parse(item_contents) return self.parse(item_contents, author_fallback)
end end
end end
private def self.parse(item_contents) private def self.parse(item_contents, author_fallback)
title = item_contents["title"]["simpleText"]?.try &.as_s || "" title = item_contents["title"]["simpleText"]?.try &.as_s || ""
plid = item_contents["playlistId"]?.try &.as_s || "" plid = item_contents["playlistId"]?.try &.as_s || ""
video_count = HelperExtractors.get_video_count(item_contents) video_count = HelperExtractors.get_video_count(item_contents)
playlist_thumbnail = HelperExtractors.get_thumbnails_plural(item_contents) playlist_thumbnail = HelperExtractors.get_thumbnails_plural(item_contents)
author_info = item_contents.dig("shortBylineText", "runs", 0) author_info = item_contents.dig?("shortBylineText", "runs", 0)
author = author_info["text"].as_s author = author_info.try &.["text"].as_s || author_fallback.name
author_id = HelperExtractors.get_browse_id(author_info) author_id = author_info.try { |x| HelperExtractors.get_browse_id(x) } || author_fallback.id
videos = item_contents["videos"]?.try &.as_a.map do |v| videos = item_contents["videos"]?.try &.as_a.map do |v|
v = v["childVideoRenderer"] v = v["childVideoRenderer"]
@ -246,6 +266,10 @@ private module Parsers
thumbnail: playlist_thumbnail, thumbnail: playlist_thumbnail,
}) })
end end
def self.parser_name
return {{@type.name}}
end
end end
# Parses a InnerTube shelfRenderer into a Category. Returns nil when the given object isn't a shelfRenderer # Parses a InnerTube shelfRenderer into a Category. Returns nil when the given object isn't a shelfRenderer
@ -283,11 +307,17 @@ private module Parsers
# Content parsing # Content parsing
contents = [] of SearchItem contents = [] of SearchItem
# Content could be in three locations. # InnerTube recognizes some "special" categories, which are organized differently.
if content_container = item_contents["content"]["horizontalListRenderer"]? if special_category_container = item_contents["content"]?
elsif content_container = item_contents["content"]["expandedShelfContentsRenderer"]? if content_container = special_category_container["horizontalListRenderer"]?
elsif content_container = item_contents["content"]["verticalListRenderer"]? elsif content_container = special_category_container["expandedShelfContentsRenderer"]?
elsif content_container = special_category_container["verticalListRenderer"]?
else
# Anything else, such as `horizontalMovieListRenderer` is currently unsupported.
return
end
else else
# "Normal" category.
content_container = item_contents["contents"] content_container = item_contents["contents"]
end end
@ -307,6 +337,10 @@ private module Parsers
badges: badges, badges: badges,
}) })
end end
def self.parser_name
return {{@type.name}}
end
end end
end end
@ -372,6 +406,10 @@ private module Extractors
return raw_items return raw_items
end end
def self.extractor_name
return {{@type.name}}
end
end end
# Extracts items from the InnerTube response for search results # Extracts items from the InnerTube response for search results
@ -409,6 +447,10 @@ private module Extractors
return raw_items.flatten return raw_items.flatten
end end
def self.extractor_name
return {{@type.name}}
end
end end
# Extracts continuation items from a InnerTube response # Extracts continuation items from a InnerTube response
@ -440,6 +482,10 @@ private module Extractors
return raw_items return raw_items
end end
def self.extractor_name
return {{@type.name}}
end
end end
end end
@ -529,8 +575,14 @@ def extract_item(item : JSON::Any, author_fallback : String? = "",
# Each parser automatically validates the data given to see if the data is # Each parser automatically validates the data given to see if the data is
# applicable to itself. If not nil is returned and the next parser is attemped. # applicable to itself. If not nil is returned and the next parser is attemped.
ITEM_PARSERS.each do |parser| ITEM_PARSERS.each do |parser|
LOGGER.trace("extract_item: Attempting to parse item using \"#{parser.parser_name}\" (cycling...)")
if result = parser.process(item, author_fallback) if result = parser.process(item, author_fallback)
LOGGER.debug("extract_item: Successfully parsed via #{parser.parser_name}")
return result return result
else
LOGGER.trace("extract_item: Parser \"#{parser.parser_name}\" does not apply. Cycling to the next one...")
end end
end end
end end
@ -550,7 +602,10 @@ def extract_items(initial_data : Hash(String, JSON::Any), author_fallback : Stri
# This is identical to the parser cycling of extract_item(). # This is identical to the parser cycling of extract_item().
ITEM_CONTAINER_EXTRACTOR.each do |extractor| ITEM_CONTAINER_EXTRACTOR.each do |extractor|
LOGGER.trace("extract_items: Attempting to extract item container using \"#{extractor.extractor_name}\" (cycling...)")
if container = extractor.process(unpackaged_data) if container = extractor.process(unpackaged_data)
LOGGER.debug("extract_items: Successfully unpacked container with \"#{extractor.extractor_name}\"")
# Extract items in container # Extract items in container
container.each do |item| container.each do |item|
if parsed_result = extract_item(item, author_fallback, author_id_fallback) if parsed_result = extract_item(item, author_fallback, author_id_fallback)
@ -559,6 +614,8 @@ def extract_items(initial_data : Hash(String, JSON::Any), author_fallback : Stri
end end
break break
else
LOGGER.trace("extract_items: Extractor \"#{extractor.extractor_name}\" does not apply. Cycling to the next one...")
end end
end end