From 1323b94b7a3a90a27a4353edddb7b9c103044e02 Mon Sep 17 00:00:00 2001 From: syeopite Date: Tue, 4 May 2021 01:48:51 -0700 Subject: [PATCH 01/22] Rewrite extract_item and extract_items functions This commit completely rewrites the extract_item and extract_items functions. Before this commit these two functions were an unreadable mess. The extract_item function was a lengthy if-elsif chain while the extract_items function contained an incomprehensible mess of .try, else and ||. With this commit both of these functions have been pulled into a separate file with the internal logic being moved to a few classes. This significantly reduces the size of these two methods, enhances readability and makes adding new extraction/parse rules much simpler. See diff for details. -- This cherry-picked commit also removes the code for parsing featured channels present on the original. (cherry picked from commit a027fbf7af1f96dc26fe5a610525ae52bcc40c28) --- src/invidious/helpers/extractors.cr | 317 ++++++++++++++++++++++++++++ src/invidious/helpers/helpers.cr | 162 +------------- 2 files changed, 320 insertions(+), 159 deletions(-) create mode 100644 src/invidious/helpers/extractors.cr diff --git a/src/invidious/helpers/extractors.cr b/src/invidious/helpers/extractors.cr new file mode 100644 index 00000000..e8daa913 --- /dev/null +++ b/src/invidious/helpers/extractors.cr @@ -0,0 +1,317 @@ +# This file contains helper methods to parse the Youtube API json data into +# neat little packages we can use + +# Tuple of Parsers/Extractors so we can easily cycle through them. +private ITEM_CONTAINER_EXTRACTOR = { + YoutubeTabsExtractor.new, + SearchResultsExtractor.new, + ContinuationExtractor.new, +} + +private ITEM_PARSERS = { + VideoParser.new, + ChannelParser.new, + GridPlaylistParser.new, + PlaylistParser.new, +} + +private struct AuthorFallback + property name, id + + def initialize(@name : String? = nil, @id : String? 
= nil) + end +end + +# The following are the parsers for parsing raw item data into neatly packaged structs. +# They're accessed through the process() method which validates the given data as applicable +# to their specific struct and then use the internal parse() method to assemble the struct +# specific to their category. +private class ItemParser + # Base type for all item parsers. + def process(item : JSON::Any, author_fallback : AuthorFallback) + end + + private def parse(item_contents : JSON::Any, author_fallback : AuthorFallback) + end +end + +private class VideoParser < ItemParser + def process(item, author_fallback) + if item_contents = (item["videoRenderer"]? || item["gridVideoRenderer"]?) + return self.parse(item_contents, author_fallback) + end + end + + private def parse(item_contents, author_fallback) + video_id = item_contents["videoId"].as_s + title = item_contents["title"].try { |t| t["simpleText"]?.try &.as_s || t["runs"]?.try &.as_a.map(&.["text"].as_s).join("") } || "" + + author_info = item_contents["ownerText"]?.try &.["runs"]?.try &.as_a?.try &.[0]? + author = author_info.try &.["text"].as_s || author_fallback.name || "" + author_id = author_info.try &.["navigationEndpoint"]?.try &.["browseEndpoint"]["browseId"].as_s || author_fallback.id || "" + + published = item_contents["publishedTimeText"]?.try &.["simpleText"]?.try { |t| decode_date(t.as_s) } || Time.local + view_count = item_contents["viewCountText"]?.try &.["simpleText"]?.try &.as_s.gsub(/\D+/, "").to_i64? || 0_i64 + description_html = item_contents["descriptionSnippet"]?.try { |t| parse_content(t) } || "" + length_seconds = item_contents["lengthText"]?.try &.["simpleText"]?.try &.as_s.try { |t| decode_length_seconds(t) } || + item_contents["thumbnailOverlays"]?.try &.as_a.find(&.["thumbnailOverlayTimeStatusRenderer"]?).try &.["thumbnailOverlayTimeStatusRenderer"]? 
+ .try &.["text"]?.try &.["simpleText"]?.try &.as_s.try { |t| decode_length_seconds(t) } || 0 + + live_now = false + paid = false + premium = false + + premiere_timestamp = item_contents["upcomingEventData"]?.try &.["startTime"]?.try { |t| Time.unix(t.as_s.to_i64) } + + item_contents["badges"]?.try &.as_a.each do |badge| + b = badge["metadataBadgeRenderer"] + case b["label"].as_s + when "LIVE NOW" + live_now = true + when "New", "4K", "CC" + # TODO + when "Premium" + # TODO: Potentially available as item_contents["topStandaloneBadge"]["metadataBadgeRenderer"] + premium = true + else nil # Ignore + end + end + + SearchVideo.new({ + title: title, + id: video_id, + author: author, + ucid: author_id, + published: published, + views: view_count, + description_html: description_html, + length_seconds: length_seconds, + live_now: live_now, + premium: premium, + premiere_timestamp: premiere_timestamp, + }) + end +end + +private class ChannelParser < ItemParser + def process(item, author_fallback) + if item_contents = item["channelRenderer"]? + return self.parse(item_contents, author_fallback) + end + end + + private def parse(item_contents, author_fallback) + author = item_contents["title"]["simpleText"]?.try &.as_s || author_fallback.name || "" + author_id = item_contents["channelId"]?.try &.as_s || author_fallback.id || "" + + author_thumbnail = item_contents["thumbnail"]["thumbnails"]?.try &.as_a[0]?.try &.["url"]?.try &.as_s || "" + subscriber_count = item_contents["subscriberCountText"]?.try &.["simpleText"]?.try &.as_s.try { |s| short_text_to_number(s.split(" ")[0]) } || 0 + + auto_generated = false + auto_generated = true if !item_contents["videoCountText"]? 
+ video_count = item_contents["videoCountText"]?.try &.["runs"].as_a[0]?.try &.["text"].as_s.gsub(/\D/, "").to_i || 0 + description_html = item_contents["descriptionSnippet"]?.try { |t| parse_content(t) } || "" + + SearchChannel.new({ + author: author, + ucid: author_id, + author_thumbnail: author_thumbnail, + subscriber_count: subscriber_count, + video_count: video_count, + description_html: description_html, + auto_generated: auto_generated, + }) + end +end + +private class GridPlaylistParser < ItemParser + def process(item, author_fallback) + if item_contents = item["gridPlaylistRenderer"]? + return self.parse(item_contents, author_fallback) + end + end + + private def parse(item_contents, author_fallback) + title = item_contents["title"]["runs"].as_a[0]?.try &.["text"].as_s || "" + plid = item_contents["playlistId"]?.try &.as_s || "" + + video_count = item_contents["videoCountText"]["runs"].as_a[0]?.try &.["text"].as_s.gsub(/\D/, "").to_i || 0 + playlist_thumbnail = item_contents["thumbnail"]["thumbnails"][0]?.try &.["url"]?.try &.as_s || "" + + SearchPlaylist.new({ + title: title, + id: plid, + author: author_fallback.name || "", + ucid: author_fallback.id || "", + video_count: video_count, + videos: [] of SearchPlaylistVideo, + thumbnail: playlist_thumbnail, + }) + end +end + +private class PlaylistParser < ItemParser + def process(item, author_fallback) + if item_contents = item["playlistRenderer"]? + return self.parse(item_contents, author_fallback) + end + end + + def parse(item_contents, author_fallback) + title = item_contents["title"]["simpleText"]?.try &.as_s || "" + plid = item_contents["playlistId"]?.try &.as_s || "" + + video_count = item_contents["videoCount"]?.try &.as_s.to_i || 0 + playlist_thumbnail = item_contents["thumbnails"].as_a[0]?.try &.["thumbnails"]?.try &.as_a[0]?.try &.["url"].as_s || "" + + author_info = item_contents["shortBylineText"]?.try &.["runs"]?.try &.as_a?.try &.[0]? 
+ author = author_info.try &.["text"].as_s || author_fallback.name || "" + author_id = author_info.try &.["navigationEndpoint"]?.try &.["browseEndpoint"]["browseId"].as_s || author_fallback.id || "" + + videos = item_contents["videos"]?.try &.as_a.map do |v| + v = v["childVideoRenderer"] + v_title = v["title"]["simpleText"]?.try &.as_s || "" + v_id = v["videoId"]?.try &.as_s || "" + v_length_seconds = v["lengthText"]?.try &.["simpleText"]?.try { |t| decode_length_seconds(t.as_s) } || 0 + SearchPlaylistVideo.new({ + title: v_title, + id: v_id, + length_seconds: v_length_seconds, + }) + end || [] of SearchPlaylistVideo + + # TODO: item_contents["publishedTimeText"]? + + SearchPlaylist.new({ + title: title, + id: plid, + author: author, + ucid: author_id, + video_count: video_count, + videos: videos, + thumbnail: playlist_thumbnail, + }) + end +end + +# The following are the extractors for extracting an array of items from +# the internal Youtube API's JSON response. The result is then packaged into +# a structure we can more easily use via the parsers above. Their internals are +# identical to the item parsers. + +private class ItemsContainerExtractor + def process(item : Hash(String, JSON::Any)) + end + + private def extract(target : JSON::Any) + end +end + +private class YoutubeTabsExtractor < ItemsContainerExtractor + def process(initial_data) + if target = initial_data["twoColumnBrowseResultsRenderer"]? + self.extract(target) + end + end + + private def extract(target) + raw_items = [] of JSON::Any + selected_tab = extract_selected_tab(target["tabs"]) + content = selected_tab["tabRenderer"]["content"] + + content["sectionListRenderer"]["contents"].as_a.each do |renderer_container| + renderer_container = renderer_container["itemSectionRenderer"] + renderer_container_contents = renderer_container["contents"].as_a[0] + + # Shelf renderer usually refer to a category and would need special handling once + # An extractor for categories are added. 
But for now it is just used to + # extract items for the trending page + if items_container = renderer_container_contents["shelfRenderer"]? + if items_container["content"]["expandedShelfContentsRenderer"]? + items_container = items_container["content"]["expandedShelfContentsRenderer"] + end + elsif items_container = renderer_container_contents["gridRenderer"]? + else + items_container = renderer_container_contents + end + + items_container["items"].as_a.each do |item| + raw_items << item + end + end + + return raw_items + end +end + +private class SearchResultsExtractor < ItemsContainerExtractor + def process(initial_data) + if target = initial_data["twoColumnSearchResultsRenderer"]? + self.extract(target) + end + end + + private def extract(target) + raw_items = [] of JSON::Any + content = target["primaryContents"] + renderer = content["sectionListRenderer"]["contents"].as_a[0]["itemSectionRenderer"] + raw_items = renderer["contents"].as_a + + return raw_items + end +end + +private class ContinuationExtractor < ItemsContainerExtractor + def process(initial_data) + if target = initial_data["continuationContents"]? + self.extract(target) + end + end + + private def extract(target) + raw_items = [] of JSON::Any + if content = target["gridContinuation"]? + raw_items = content["items"].as_a + end + + return raw_items + end +end + +def extract_item(item : JSON::Any, author_fallback : String? = nil, author_id_fallback : String? = nil) + # Parses an item from Youtube's JSON response into a more usable structure. + # The end result can either be a SearchVideo, SearchPlaylist or SearchChannel. + author_fallback = AuthorFallback.new(author_fallback, author_id_fallback) + + # Cycles through all of the item parsers and attempt to parse the raw YT JSON data. + # Each parser automatically validates the data given to see if the data is + # applicable to itself. If not nil is returned and the next parser is attemped. 
+ ITEM_PARSERS.each do |parser| + result = parser.process(item, author_fallback) + if !result.nil? + return result + end + end + # TODO radioRenderer, showRenderer, shelfRenderer, horizontalCardListRenderer, searchPyvRenderer +end + +def extract_items(initial_data : Hash(String, JSON::Any), author_fallback : String? = nil, author_id_fallback : String? = nil) + items = [] of SearchItem + initial_data = initial_data["contents"]?.try &.as_h || initial_data["response"]?.try &.as_h || initial_data + + # This is identicial to the parser cyling of extract_item(). + ITEM_CONTAINER_EXTRACTOR.each do |extractor| + results = extractor.process(initial_data) + if !results.nil? + results.each do |item| + parsed_result = extract_item(item, author_fallback, author_id_fallback) + + if !parsed_result.nil? + items << parsed_result + end + end + end + end + + return items +end diff --git a/src/invidious/helpers/helpers.cr b/src/invidious/helpers/helpers.cr index fb33df1c..1a058195 100644 --- a/src/invidious/helpers/helpers.cr +++ b/src/invidious/helpers/helpers.cr @@ -251,165 +251,9 @@ def extract_videos(initial_data : Hash(String, JSON::Any), author_fallback : Str extract_items(initial_data, author_fallback, author_id_fallback).select(&.is_a?(SearchVideo)).map(&.as(SearchVideo)) end -def extract_item(item : JSON::Any, author_fallback : String? = nil, author_id_fallback : String? = nil) - if i = (item["videoRenderer"]? || item["gridVideoRenderer"]?) - video_id = i["videoId"].as_s - title = i["title"].try { |t| t["simpleText"]?.try &.as_s || t["runs"]?.try &.as_a.map(&.["text"].as_s).join("") } || "" - - author_info = i["ownerText"]?.try &.["runs"]?.try &.as_a?.try &.[0]? 
- author = author_info.try &.["text"].as_s || author_fallback || "" - author_id = author_info.try &.["navigationEndpoint"]?.try &.["browseEndpoint"]["browseId"].as_s || author_id_fallback || "" - - published = i["publishedTimeText"]?.try &.["simpleText"]?.try { |t| decode_date(t.as_s) } || Time.local - view_count = i["viewCountText"]?.try &.["simpleText"]?.try &.as_s.gsub(/\D+/, "").to_i64? || 0_i64 - description_html = i["descriptionSnippet"]?.try { |t| parse_content(t) } || "" - length_seconds = i["lengthText"]?.try &.["simpleText"]?.try &.as_s.try { |t| decode_length_seconds(t) } || - i["thumbnailOverlays"]?.try &.as_a.find(&.["thumbnailOverlayTimeStatusRenderer"]?).try &.["thumbnailOverlayTimeStatusRenderer"]? - .try &.["text"]?.try &.["simpleText"]?.try &.as_s.try { |t| decode_length_seconds(t) } || 0 - - live_now = false - premium = false - - premiere_timestamp = i["upcomingEventData"]?.try &.["startTime"]?.try { |t| Time.unix(t.as_s.to_i64) } - - i["badges"]?.try &.as_a.each do |badge| - b = badge["metadataBadgeRenderer"] - case b["label"].as_s - when "LIVE NOW" - live_now = true - when "New", "4K", "CC" - # TODO - when "Premium" - # TODO: Potentially available as i["topStandaloneBadge"]["metadataBadgeRenderer"] - premium = true - else nil # Ignore - end - end - - SearchVideo.new({ - title: title, - id: video_id, - author: author, - ucid: author_id, - published: published, - views: view_count, - description_html: description_html, - length_seconds: length_seconds, - live_now: live_now, - premium: premium, - premiere_timestamp: premiere_timestamp, - }) - elsif i = item["channelRenderer"]? 
- author = i["title"]["simpleText"]?.try &.as_s || author_fallback || "" - author_id = i["channelId"]?.try &.as_s || author_id_fallback || "" - - author_thumbnail = i["thumbnail"]["thumbnails"]?.try &.as_a[0]?.try &.["url"]?.try &.as_s || "" - subscriber_count = i["subscriberCountText"]?.try &.["simpleText"]?.try &.as_s.try { |s| short_text_to_number(s.split(" ")[0]) } || 0 - - auto_generated = false - auto_generated = true if !i["videoCountText"]? - video_count = i["videoCountText"]?.try &.["runs"].as_a[0]?.try &.["text"].as_s.gsub(/\D/, "").to_i || 0 - description_html = i["descriptionSnippet"]?.try { |t| parse_content(t) } || "" - - SearchChannel.new({ - author: author, - ucid: author_id, - author_thumbnail: author_thumbnail, - subscriber_count: subscriber_count, - video_count: video_count, - description_html: description_html, - auto_generated: auto_generated, - }) - elsif i = item["gridPlaylistRenderer"]? - title = i["title"]["runs"].as_a[0]?.try &.["text"].as_s || "" - plid = i["playlistId"]?.try &.as_s || "" - - video_count = i["videoCountText"]["runs"].as_a[0]?.try &.["text"].as_s.gsub(/\D/, "").to_i || 0 - playlist_thumbnail = i["thumbnail"]["thumbnails"][0]?.try &.["url"]?.try &.as_s || "" - - SearchPlaylist.new({ - title: title, - id: plid, - author: author_fallback || "", - ucid: author_id_fallback || "", - video_count: video_count, - videos: [] of SearchPlaylistVideo, - thumbnail: playlist_thumbnail, - }) - elsif i = item["playlistRenderer"]? - title = i["title"]["simpleText"]?.try &.as_s || "" - plid = i["playlistId"]?.try &.as_s || "" - - video_count = i["videoCount"]?.try &.as_s.to_i || 0 - playlist_thumbnail = i["thumbnails"].as_a[0]?.try &.["thumbnails"]?.try &.as_a[0]?.try &.["url"].as_s || "" - - author_info = i["shortBylineText"]?.try &.["runs"]?.try &.as_a?.try &.[0]? 
- author = author_info.try &.["text"].as_s || author_fallback || "" - author_id = author_info.try &.["navigationEndpoint"]?.try &.["browseEndpoint"]["browseId"].as_s || author_id_fallback || "" - - videos = i["videos"]?.try &.as_a.map do |v| - v = v["childVideoRenderer"] - v_title = v["title"]["simpleText"]?.try &.as_s || "" - v_id = v["videoId"]?.try &.as_s || "" - v_length_seconds = v["lengthText"]?.try &.["simpleText"]?.try { |t| decode_length_seconds(t.as_s) } || 0 - SearchPlaylistVideo.new({ - title: v_title, - id: v_id, - length_seconds: v_length_seconds, - }) - end || [] of SearchPlaylistVideo - - # TODO: i["publishedTimeText"]? - - SearchPlaylist.new({ - title: title, - id: plid, - author: author, - ucid: author_id, - video_count: video_count, - videos: videos, - thumbnail: playlist_thumbnail, - }) - elsif i = item["radioRenderer"]? # Mix - # TODO - elsif i = item["showRenderer"]? # Show - # TODO - elsif i = item["shelfRenderer"]? - elsif i = item["horizontalCardListRenderer"]? - elsif i = item["searchPyvRenderer"]? # Ad - end -end - -def extract_items(initial_data : Hash(String, JSON::Any), author_fallback : String? = nil, author_id_fallback : String? = nil) - items = [] of SearchItem - - channel_v2_response = initial_data - .try &.["continuationContents"]? - .try &.["gridContinuation"]? - .try &.["items"]? - - if channel_v2_response - channel_v2_response.try &.as_a.each { |item| - extract_item(item, author_fallback, author_id_fallback) - .try { |t| items << t } - } - else - initial_data.try { |t| t["contents"]? || t["response"]? } - .try { |t| t["twoColumnBrowseResultsRenderer"]?.try &.["tabs"].as_a.select(&.["tabRenderer"]?.try &.["selected"].as_bool)[0]?.try &.["tabRenderer"]["content"] || - t["twoColumnSearchResultsRenderer"]?.try &.["primaryContents"] || - t["continuationContents"]? } - .try { |t| t["sectionListRenderer"]? || t["sectionListContinuation"]? 
} - .try &.["contents"].as_a - .each { |c| c.try &.["itemSectionRenderer"]?.try &.["contents"].as_a - .try { |t| t[0]?.try &.["shelfRenderer"]?.try &.["content"]["expandedShelfContentsRenderer"]?.try &.["items"].as_a || - t[0]?.try &.["gridRenderer"]?.try &.["items"].as_a || t } - .each { |item| - extract_item(item, author_fallback, author_id_fallback) - .try { |t| items << t } - } } - end - - items +def extract_selected_tab(tabs) + # Extract the selected tab from the array of tabs Youtube returns + return selected_target = tabs.as_a.select(&.["tabRenderer"]?.try &.["selected"].as_bool)[0] end def check_enum(db, enum_name, struct_type = nil) From a50f64f6e9ab55efa9301915817b11f152625f22 Mon Sep 17 00:00:00 2001 From: syeopite Date: Fri, 7 May 2021 05:13:53 -0700 Subject: [PATCH 02/22] Add parser for categories (shelfRenderer) This commit adds a new parser for YT's shelfRenderers which are typically used to denote different categories. The code for featured channels parsing has also been moved to use the new parser but some additional refactoring is needed there. The ContinuationExtractor has also been improved and is now capable of extracting continuation data that is packaged under "appendContinuationItemsAction". In addition, this commit adds some useful helper functions to extract the currently selected tab and the continuation token. This is mainly to reduce code size and repetition. -- This cherry-picked commit also removes the code for parsing featured channels present on the original. 
(cherry picked from commit 8000d538dbbf1eb9c78e000b1449926ba3b24da9) --- src/invidious/helpers/extractors.cr | 117 +++++++++-- src/invidious/helpers/helpers.cr | 29 ++- src/invidious/helpers/invidiousitems.cr | 256 ++++++++++++++++++++++++ src/invidious/search.cr | 230 --------------------- src/invidious/views/components/item.ecr | 1 + 5 files changed, 389 insertions(+), 244 deletions(-) create mode 100644 src/invidious/helpers/invidiousitems.cr diff --git a/src/invidious/helpers/extractors.cr b/src/invidious/helpers/extractors.cr index e8daa913..1fa06c91 100644 --- a/src/invidious/helpers/extractors.cr +++ b/src/invidious/helpers/extractors.cr @@ -13,6 +13,7 @@ private ITEM_PARSERS = { ChannelParser.new, GridPlaylistParser.new, PlaylistParser.new, + CategoryParser.new, } private struct AuthorFallback @@ -95,7 +96,7 @@ end private class ChannelParser < ItemParser def process(item, author_fallback) - if item_contents = item["channelRenderer"]? + if item_contents = (item["channelRenderer"]? || item["gridChannelRenderer"]?) return self.parse(item_contents, author_fallback) end end @@ -194,6 +195,88 @@ private class PlaylistParser < ItemParser end end +private class CategoryParser < ItemParser + def process(item, author_fallback) + if item_contents = item["shelfRenderer"]? + return self.parse(item_contents, author_fallback) + end + end + + def parse(item_contents, author_fallback) + # Title extraction is a bit complicated. There are two possible routes for it + # as well as times when the title attribute just isn't sent by YT. + + title_container = item_contents["title"]? || "" + if !title_container.is_a? String + if title = title_container["simpleText"]? 
+ title = title.as_s + else + title = title_container["runs"][0]["text"].as_s + end + else + title = "" + end + + browse_endpoint = item_contents["endpoint"]?.try &.["browseEndpoint"] || nil + browse_endpoint_data = "" + category_type = 0 # 0: Video, 1: Channels, 2: Playlist/feed, 3: trending + + # There's no endpoint data for video and trending category + if !item_contents["endpoint"]? + if !item_contents["videoId"]? + category_type = 3 + end + end + + if !browse_endpoint.nil? + # Playlist/feed categories doesn't need the params value (nor is it even included in yt response) + # instead it uses the browseId parameter. So if there isn't a params value we can assume the + # category is a playlist/feed + if browse_endpoint["params"]? + browse_endpoint_data = browse_endpoint["params"].as_s + category_type = 1 + else + browse_endpoint_data = browse_endpoint["browseId"].as_s + category_type = 2 + end + end + + # Sometimes a category can have badges. + badges = [] of Tuple(String, String) # (Badge style, label) + item_contents["badges"]?.try &.as_a.each do |badge| + badge = badge["metadataBadgeRenderer"] + badges << {badge["style"].as_s, badge["label"].as_s} + end + + # Content parsing + contents = [] of SearchItem + + # Content could be in three locations. + if content_container = item_contents["content"]["horizontalListRenderer"]? + elsif content_container = item_contents["content"]["expandedShelfContentsRenderer"] + elsif content_container = item_contents["content"]["verticalListRenderer"] + else + content_container = item_contents["contents"] + end + + raw_contents = content_container["items"].as_a + raw_contents.each do |item| + result = extract_item(item) + if !result.nil? 
+ contents << result + end + end + + Category.new({ + title: title, + contents: contents, + browse_endpoint_data: browse_endpoint_data, + continuation_token: nil, + badges: badges, + }) + end +end + # The following are the extractors for extracting an array of items from # the internal Youtube API's JSON response. The result is then packaged into # a structure we can more easily use via the parsers above. Their internals are @@ -217,19 +300,16 @@ private class YoutubeTabsExtractor < ItemsContainerExtractor private def extract(target) raw_items = [] of JSON::Any selected_tab = extract_selected_tab(target["tabs"]) - content = selected_tab["tabRenderer"]["content"] + content = selected_tab["content"] content["sectionListRenderer"]["contents"].as_a.each do |renderer_container| renderer_container = renderer_container["itemSectionRenderer"] renderer_container_contents = renderer_container["contents"].as_a[0] - # Shelf renderer usually refer to a category and would need special handling once - # An extractor for categories are added. But for now it is just used to - # extract items for the trending page + # Category extraction if items_container = renderer_container_contents["shelfRenderer"]? - if items_container["content"]["expandedShelfContentsRenderer"]? - items_container = items_container["content"]["expandedShelfContentsRenderer"] - end + raw_items << renderer_container_contents + next elsif items_container = renderer_container_contents["gridRenderer"]? else items_container = renderer_container_contents @@ -265,6 +345,8 @@ private class ContinuationExtractor < ItemsContainerExtractor def process(initial_data) if target = initial_data["continuationContents"]? self.extract(target) + elsif target = initial_data["appendContinuationItemsAction"]? + self.extract(target) end end @@ -272,13 +354,16 @@ private class ContinuationExtractor < ItemsContainerExtractor raw_items = [] of JSON::Any if content = target["gridContinuation"]? 
raw_items = content["items"].as_a + elsif content = target["continuationItems"]? + raw_items = content.as_a end return raw_items end end -def extract_item(item : JSON::Any, author_fallback : String? = nil, author_id_fallback : String? = nil) +def extract_item(item : JSON::Any, author_fallback : String? = nil, + author_id_fallback : String? = nil) # Parses an item from Youtube's JSON response into a more usable structure. # The end result can either be a SearchVideo, SearchPlaylist or SearchChannel. author_fallback = AuthorFallback.new(author_fallback, author_id_fallback) @@ -295,13 +380,20 @@ def extract_item(item : JSON::Any, author_fallback : String? = nil, author_id_fa # TODO radioRenderer, showRenderer, shelfRenderer, horizontalCardListRenderer, searchPyvRenderer end -def extract_items(initial_data : Hash(String, JSON::Any), author_fallback : String? = nil, author_id_fallback : String? = nil) +def extract_items(initial_data : Hash(String, JSON::Any), author_fallback : String? = nil, + author_id_fallback : String? = nil) items = [] of SearchItem - initial_data = initial_data["contents"]?.try &.as_h || initial_data["response"]?.try &.as_h || initial_data + + if unpackaged_data = initial_data["contents"]?.try &.as_h + elsif unpackaged_data = initial_data["response"]?.try &.as_h + elsif unpackaged_data = initial_data["onResponseReceivedActions"]?.try &.as_a.[0].as_h + else + unpackaged_data = initial_data + end # This is identicial to the parser cyling of extract_item(). ITEM_CONTAINER_EXTRACTOR.each do |extractor| - results = extractor.process(initial_data) + results = extractor.process(unpackaged_data) if !results.nil? 
results.each do |item| parsed_result = extract_item(item, author_fallback, author_id_fallback) @@ -310,6 +402,7 @@ def extract_items(initial_data : Hash(String, JSON::Any), author_fallback : Stri items << parsed_result end end + return items end end diff --git a/src/invidious/helpers/helpers.cr b/src/invidious/helpers/helpers.cr index 1a058195..a52c7bd4 100644 --- a/src/invidious/helpers/helpers.cr +++ b/src/invidious/helpers/helpers.cr @@ -248,12 +248,37 @@ def html_to_content(description_html : String) end def extract_videos(initial_data : Hash(String, JSON::Any), author_fallback : String? = nil, author_id_fallback : String? = nil) - extract_items(initial_data, author_fallback, author_id_fallback).select(&.is_a?(SearchVideo)).map(&.as(SearchVideo)) + extracted = extract_items(initial_data, author_fallback, author_id_fallback) + + if extracted.is_a?(Category) + target = extracted.contents + else + target = extracted + end + return target.select(&.is_a?(SearchVideo)).map(&.as(SearchVideo)) end def extract_selected_tab(tabs) # Extract the selected tab from the array of tabs Youtube returns - return selected_target = tabs.as_a.select(&.["tabRenderer"]?.try &.["selected"].as_bool)[0] + return selected_target = tabs.as_a.select(&.["tabRenderer"]?.try &.["selected"].as_bool)[0]["tabRenderer"] +end + +def fetch_continuation_token(items : Array(JSON::Any)) + # Fetches the continuation token from an array of items + return items.last["continuationItemRenderer"]? + .try &.["continuationEndpoint"]["continuationCommand"]["token"].as_s +end + +def fetch_continuation_token(initial_data : Hash(String, JSON::Any)) + # Fetches the continuation token from initial data + if initial_data["onResponseReceivedActions"]? 
+ continuation_items = initial_data["onResponseReceivedActions"][0]["appendContinuationItemsAction"]["continuationItems"] + else + tab = extract_selected_tab(initial_data["contents"]["twoColumnBrowseResultsRenderer"]["tabs"]) + continuation_items = tab["content"]["sectionListRenderer"]["contents"][0]["itemSectionRenderer"]["contents"][0]["gridRenderer"]["items"] + end + + return fetch_continuation_token(continuation_items.as_a) end def check_enum(db, enum_name, struct_type = nil) diff --git a/src/invidious/helpers/invidiousitems.cr b/src/invidious/helpers/invidiousitems.cr new file mode 100644 index 00000000..50a47726 --- /dev/null +++ b/src/invidious/helpers/invidiousitems.cr @@ -0,0 +1,256 @@ +struct SearchVideo + include DB::Serializable + + property title : String + property id : String + property author : String + property ucid : String + property published : Time + property views : Int64 + property description_html : String + property length_seconds : Int32 + property live_now : Bool + property premium : Bool + property premiere_timestamp : Time? 
+ + def to_xml(auto_generated, query_params, xml : XML::Builder) + query_params["v"] = self.id + + xml.element("entry") do + xml.element("id") { xml.text "yt:video:#{self.id}" } + xml.element("yt:videoId") { xml.text self.id } + xml.element("yt:channelId") { xml.text self.ucid } + xml.element("title") { xml.text self.title } + xml.element("link", rel: "alternate", href: "#{HOST_URL}/watch?#{query_params}") + + xml.element("author") do + if auto_generated + xml.element("name") { xml.text self.author } + xml.element("uri") { xml.text "#{HOST_URL}/channel/#{self.ucid}" } + else + xml.element("name") { xml.text author } + xml.element("uri") { xml.text "#{HOST_URL}/channel/#{ucid}" } + end + end + + xml.element("content", type: "xhtml") do + xml.element("div", xmlns: "http://www.w3.org/1999/xhtml") do + xml.element("a", href: "#{HOST_URL}/watch?#{query_params}") do + xml.element("img", src: "#{HOST_URL}/vi/#{self.id}/mqdefault.jpg") + end + + xml.element("p", style: "word-break:break-word;white-space:pre-wrap") { xml.text html_to_content(self.description_html) } + end + end + + xml.element("published") { xml.text self.published.to_s("%Y-%m-%dT%H:%M:%S%:z") } + + xml.element("media:group") do + xml.element("media:title") { xml.text self.title } + xml.element("media:thumbnail", url: "#{HOST_URL}/vi/#{self.id}/mqdefault.jpg", + width: "320", height: "180") + xml.element("media:description") { xml.text html_to_content(self.description_html) } + end + + xml.element("media:community") do + xml.element("media:statistics", views: self.views) + end + end + end + + def to_xml(auto_generated, query_params, xml : XML::Builder | Nil = nil) + if xml + to_xml(HOST_URL, auto_generated, query_params, xml) + else + XML.build do |json| + to_xml(HOST_URL, auto_generated, query_params, xml) + end + end + end + + def to_json(locale : Hash(String, JSON::Any), json : JSON::Builder) + json.object do + json.field "type", "video" + json.field "title", self.title + json.field "videoId", self.id + 
+ json.field "author", self.author + json.field "authorId", self.ucid + json.field "authorUrl", "/channel/#{self.ucid}" + + json.field "videoThumbnails" do + generate_thumbnails(json, self.id) + end + + json.field "description", html_to_content(self.description_html) + json.field "descriptionHtml", self.description_html + + json.field "viewCount", self.views + json.field "published", self.published.to_unix + json.field "publishedText", translate(locale, "`x` ago", recode_date(self.published, locale)) + json.field "lengthSeconds", self.length_seconds + json.field "liveNow", self.live_now + json.field "premium", self.premium + json.field "isUpcoming", self.is_upcoming + + if self.premiere_timestamp + json.field "premiereTimestamp", self.premiere_timestamp.try &.to_unix + end + end + end + + def to_json(locale, json : JSON::Builder | Nil = nil) + if json + to_json(locale, json) + else + JSON.build do |json| + to_json(locale, json) + end + end + end + + def is_upcoming + premiere_timestamp ? true : false + end +end + +struct SearchPlaylistVideo + include DB::Serializable + + property title : String + property id : String + property length_seconds : Int32 +end + +struct SearchPlaylist + include DB::Serializable + + property title : String + property id : String + property author : String + property ucid : String + property video_count : Int32 + property videos : Array(SearchPlaylistVideo) + property thumbnail : String? 
+ + def to_json(locale, json : JSON::Builder) + json.object do + json.field "type", "playlist" + json.field "title", self.title + json.field "playlistId", self.id + json.field "playlistThumbnail", self.thumbnail + + json.field "author", self.author + json.field "authorId", self.ucid + json.field "authorUrl", "/channel/#{self.ucid}" + + json.field "videoCount", self.video_count + json.field "videos" do + json.array do + self.videos.each do |video| + json.object do + json.field "title", video.title + json.field "videoId", video.id + json.field "lengthSeconds", video.length_seconds + + json.field "videoThumbnails" do + generate_thumbnails(json, video.id) + end + end + end + end + end + end + end + + def to_json(locale, json : JSON::Builder | Nil = nil) + if json + to_json(locale, json) + else + JSON.build do |json| + to_json(locale, json) + end + end + end +end + +struct SearchChannel + include DB::Serializable + + property author : String + property ucid : String + property author_thumbnail : String + property subscriber_count : Int32 + property video_count : Int32 + property description_html : String + property auto_generated : Bool + + def to_json(locale, json : JSON::Builder) + json.object do + json.field "type", "channel" + json.field "author", self.author + json.field "authorId", self.ucid + json.field "authorUrl", "/channel/#{self.ucid}" + + json.field "authorThumbnails" do + json.array do + qualities = {32, 48, 76, 100, 176, 512} + + qualities.each do |quality| + json.object do + json.field "url", self.author_thumbnail.gsub(/=\d+/, "=s#{quality}") + json.field "width", quality + json.field "height", quality + end + end + end + end + + json.field "autoGenerated", self.auto_generated + json.field "subCount", self.subscriber_count + json.field "videoCount", self.video_count + + json.field "description", html_to_content(self.description_html) + json.field "descriptionHtml", self.description_html + end + end + + def to_json(locale, json : JSON::Builder | Nil = nil) 
+ if json + to_json(locale, json) + else + JSON.build do |json| + to_json(locale, json) + end + end + end +end + +class Category + include DB::Serializable + + property title : String + property contents : Array(SearchItem) | SearchItem + property browse_endpoint_data : String? + property continuation_token : String? + property badges : Array(Tuple(String, String))? + + def to_json(locale, json : JSON::Builder) + json.object do + json.field "title", self.title + json.field "contents", self.contents + end + end + + def to_json(locale, json : JSON::Builder | Nil = nil) + if json + to_json(locale, json) + else + JSON.build do |json| + to_json(locale, json) + end + end + end +end + +alias SearchItem = SearchVideo | SearchChannel | SearchPlaylist | Category diff --git a/src/invidious/search.cr b/src/invidious/search.cr index a3fcc7a3..eb9c37c5 100644 --- a/src/invidious/search.cr +++ b/src/invidious/search.cr @@ -1,233 +1,3 @@ -struct SearchVideo - include DB::Serializable - - property title : String - property id : String - property author : String - property ucid : String - property published : Time - property views : Int64 - property description_html : String - property length_seconds : Int32 - property live_now : Bool - property premium : Bool - property premiere_timestamp : Time? 
- - def to_xml(auto_generated, query_params, xml : XML::Builder) - query_params["v"] = self.id - - xml.element("entry") do - xml.element("id") { xml.text "yt:video:#{self.id}" } - xml.element("yt:videoId") { xml.text self.id } - xml.element("yt:channelId") { xml.text self.ucid } - xml.element("title") { xml.text self.title } - xml.element("link", rel: "alternate", href: "#{HOST_URL}/watch?#{query_params}") - - xml.element("author") do - if auto_generated - xml.element("name") { xml.text self.author } - xml.element("uri") { xml.text "#{HOST_URL}/channel/#{self.ucid}" } - else - xml.element("name") { xml.text author } - xml.element("uri") { xml.text "#{HOST_URL}/channel/#{ucid}" } - end - end - - xml.element("content", type: "xhtml") do - xml.element("div", xmlns: "http://www.w3.org/1999/xhtml") do - xml.element("a", href: "#{HOST_URL}/watch?#{query_params}") do - xml.element("img", src: "#{HOST_URL}/vi/#{self.id}/mqdefault.jpg") - end - - xml.element("p", style: "word-break:break-word;white-space:pre-wrap") { xml.text html_to_content(self.description_html) } - end - end - - xml.element("published") { xml.text self.published.to_s("%Y-%m-%dT%H:%M:%S%:z") } - - xml.element("media:group") do - xml.element("media:title") { xml.text self.title } - xml.element("media:thumbnail", url: "#{HOST_URL}/vi/#{self.id}/mqdefault.jpg", - width: "320", height: "180") - xml.element("media:description") { xml.text html_to_content(self.description_html) } - end - - xml.element("media:community") do - xml.element("media:statistics", views: self.views) - end - end - end - - def to_xml(auto_generated, query_params, xml : XML::Builder | Nil = nil) - if xml - to_xml(HOST_URL, auto_generated, query_params, xml) - else - XML.build do |json| - to_xml(HOST_URL, auto_generated, query_params, xml) - end - end - end - - def to_json(locale, json : JSON::Builder) - json.object do - json.field "type", "video" - json.field "title", self.title - json.field "videoId", self.id - - json.field "author", 
self.author - json.field "authorId", self.ucid - json.field "authorUrl", "/channel/#{self.ucid}" - - json.field "videoThumbnails" do - generate_thumbnails(json, self.id) - end - - json.field "description", html_to_content(self.description_html) - json.field "descriptionHtml", self.description_html - - json.field "viewCount", self.views - json.field "published", self.published.to_unix - json.field "publishedText", translate(locale, "`x` ago", recode_date(self.published, locale)) - json.field "lengthSeconds", self.length_seconds - json.field "liveNow", self.live_now - json.field "premium", self.premium - json.field "isUpcoming", self.is_upcoming - - if self.premiere_timestamp - json.field "premiereTimestamp", self.premiere_timestamp.try &.to_unix - end - end - end - - def to_json(locale, json : JSON::Builder | Nil = nil) - if json - to_json(locale, json) - else - JSON.build do |json| - to_json(locale, json) - end - end - end - - def is_upcoming - premiere_timestamp ? true : false - end -end - -struct SearchPlaylistVideo - include DB::Serializable - - property title : String - property id : String - property length_seconds : Int32 -end - -struct SearchPlaylist - include DB::Serializable - - property title : String - property id : String - property author : String - property ucid : String - property video_count : Int32 - property videos : Array(SearchPlaylistVideo) - property thumbnail : String? 
- - def to_json(locale, json : JSON::Builder) - json.object do - json.field "type", "playlist" - json.field "title", self.title - json.field "playlistId", self.id - json.field "playlistThumbnail", self.thumbnail - - json.field "author", self.author - json.field "authorId", self.ucid - json.field "authorUrl", "/channel/#{self.ucid}" - - json.field "videoCount", self.video_count - json.field "videos" do - json.array do - self.videos.each do |video| - json.object do - json.field "title", video.title - json.field "videoId", video.id - json.field "lengthSeconds", video.length_seconds - - json.field "videoThumbnails" do - generate_thumbnails(json, video.id) - end - end - end - end - end - end - end - - def to_json(locale, json : JSON::Builder | Nil = nil) - if json - to_json(locale, json) - else - JSON.build do |json| - to_json(locale, json) - end - end - end -end - -struct SearchChannel - include DB::Serializable - - property author : String - property ucid : String - property author_thumbnail : String - property subscriber_count : Int32 - property video_count : Int32 - property description_html : String - property auto_generated : Bool - - def to_json(locale, json : JSON::Builder) - json.object do - json.field "type", "channel" - json.field "author", self.author - json.field "authorId", self.ucid - json.field "authorUrl", "/channel/#{self.ucid}" - - json.field "authorThumbnails" do - json.array do - qualities = {32, 48, 76, 100, 176, 512} - - qualities.each do |quality| - json.object do - json.field "url", self.author_thumbnail.gsub(/=\d+/, "=s#{quality}") - json.field "width", quality - json.field "height", quality - end - end - end - end - - json.field "autoGenerated", self.auto_generated - json.field "subCount", self.subscriber_count - json.field "videoCount", self.video_count - - json.field "description", html_to_content(self.description_html) - json.field "descriptionHtml", self.description_html - end - end - - def to_json(locale, json : JSON::Builder | Nil = nil) 
- if json - to_json(locale, json) - else - JSON.build do |json| - to_json(locale, json) - end - end - end -end - -alias SearchItem = SearchVideo | SearchChannel | SearchPlaylist - def channel_search(query, page, channel) response = YT_POOL.client &.get("/channel/#{channel}") diff --git a/src/invidious/views/components/item.ecr b/src/invidious/views/components/item.ecr index 68aa1812..ec282216 100644 --- a/src/invidious/views/components/item.ecr +++ b/src/invidious/views/components/item.ecr @@ -96,6 +96,7 @@ <% end %> + <% when Category %> <% else %> <% if !env.get("preferences").as(Preferences).thin_mode %> From ae30f32c36c738b85dc114a4bb4edaa95257a3c2 Mon Sep 17 00:00:00 2001 From: syeopite Date: Sat, 8 May 2021 03:43:26 -0700 Subject: [PATCH 03/22] Unpack search items that are embedded in categories This is a squash of a bunch of commits cherry-picked commits Fix category parse error on search (cherry picked from commit cc02fed4e69f0eb5f19e017173632b3a3f20519f) Fix category items not being extracted in search (cherry picked from commit 2605b9c609ff217b5a6ae09d22450596dcad90fc) Make search not include category items for now (cherry picked from commit ca4afd59f46b595e3c339f31432cad98a5771ee1) Change behavior of categories in search results (cherry picked from commit cc1067561051b1c113b490e79c4a71cd346f7b3f) Fix missing search results in extraction (cherry picked from commit abda6840d5bfe58f845128bdd1a3f4916dd3bb84) Fix miscount of search results (cherry picked from commit 491e33450eb1300d0234bb33df0d0e78a027114f) --- src/invidious/helpers/extractors.cr | 15 ++++++++++----- src/invidious/search.cr | 17 ++++++++++++++++- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/src/invidious/helpers/extractors.cr b/src/invidious/helpers/extractors.cr index 1fa06c91..ea9411d7 100644 --- a/src/invidious/helpers/extractors.cr +++ b/src/invidious/helpers/extractors.cr @@ -253,8 +253,8 @@ private class CategoryParser < ItemParser # Content could be in three 
locations. if content_container = item_contents["content"]["horizontalListRenderer"]? - elsif content_container = item_contents["content"]["expandedShelfContentsRenderer"] - elsif content_container = item_contents["content"]["verticalListRenderer"] + elsif content_container = item_contents["content"]["expandedShelfContentsRenderer"]? + elsif content_container = item_contents["content"]["verticalListRenderer"]? else content_container = item_contents["contents"] end @@ -332,10 +332,15 @@ private class SearchResultsExtractor < ItemsContainerExtractor end private def extract(target) - raw_items = [] of JSON::Any + raw_items = [] of Array(JSON::Any) content = target["primaryContents"] - renderer = content["sectionListRenderer"]["contents"].as_a[0]["itemSectionRenderer"] - raw_items = renderer["contents"].as_a + renderer = content["sectionListRenderer"]["contents"].as_a.each do |node| + if node = node["itemSectionRenderer"]? + raw_items << node["contents"].as_a + end + end + + raw_items = raw_items.flatten return raw_items end diff --git a/src/invidious/search.cr b/src/invidious/search.cr index eb9c37c5..3873b2dd 100644 --- a/src/invidious/search.cr +++ b/src/invidious/search.cr @@ -232,5 +232,20 @@ def process_search_query(query, page, user, region) count, items = search(search_query, search_params, region).as(Tuple) end - {search_query, count, items, operators} + # Light processing to flatten search results out of Categories. + # They should ideally be supported in the future. + items_without_cate_items = [] of SearchItem | ChannelVideo + items.each do |i| + if i.is_a? Category + i.contents.each do |nest_i| + if !nest_i.is_a? 
Video + items_without_cate_items << nest_i + end + end + else + items_without_cate_items << i + end + end + + {search_query, items_without_cate_items.size, items_without_cate_items, url_params} end From 57c63f3598867ce406b807923ea81352f9b1b384 Mon Sep 17 00:00:00 2001 From: syeopite Date: Mon, 28 Jun 2021 22:51:28 -0700 Subject: [PATCH 04/22] Rename "items_without_cate_items" to reflect usage --- src/invidious/search.cr | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/invidious/search.cr b/src/invidious/search.cr index 3873b2dd..adf079f3 100644 --- a/src/invidious/search.cr +++ b/src/invidious/search.cr @@ -234,18 +234,18 @@ def process_search_query(query, page, user, region) # Light processing to flatten search results out of Categories. # They should ideally be supported in the future. - items_without_cate_items = [] of SearchItem | ChannelVideo + items_without_category = [] of SearchItem | ChannelVideo items.each do |i| if i.is_a? Category i.contents.each do |nest_i| if !nest_i.is_a? 
Video - items_without_cate_items << nest_i + items_without_category << nest_i end end else - items_without_cate_items << i + items_without_category << i end end - {search_query, items_without_cate_items.size, items_without_cate_items, url_params} + {search_query, items_without_category.size, items_without_category, url_params} end From 0b7a108a59b2f1def6aea5b611f68b29abf59064 Mon Sep 17 00:00:00 2001 From: syeopite Date: Sat, 8 May 2021 04:54:12 -0700 Subject: [PATCH 05/22] Move continuation_token out of Category struct (cherry picked from commit 0e96eda28f25171a0344b972af1852a4d6fc3007) --- src/invidious/helpers/extractors.cr | 11 +++++++++-- src/invidious/helpers/invidiousitems.cr | 1 - 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/invidious/helpers/extractors.cr b/src/invidious/helpers/extractors.cr index ea9411d7..cd3b1f93 100644 --- a/src/invidious/helpers/extractors.cr +++ b/src/invidious/helpers/extractors.cr @@ -217,6 +217,7 @@ private class CategoryParser < ItemParser title = "" end + auxiliary_data = {} of String => String browse_endpoint = item_contents["endpoint"]?.try &.["browseEndpoint"] || nil browse_endpoint_data = "" category_type = 0 # 0: Video, 1: Channels, 2: Playlist/feed, 3: trending @@ -233,7 +234,14 @@ private class CategoryParser < ItemParser # instead it uses the browseId parameter. So if there isn't a params value we can assume the # category is a playlist/feed if browse_endpoint["params"]? - browse_endpoint_data = browse_endpoint["params"].as_s + # However, even though the channel category type returns the browse endpoint param + # we're not going to be using it in order to preserve compatablity with Youtube. 
+ # and for an URL that looks cleaner + url = item_contents["endpoint"]["commandMetadata"]["webCommandMetadata"]["url"] + url = URI.parse(url.as_s) + auxiliary_data["view"] = url.query_params["view"] + auxiliary_data["shelf_id"] = url.query_params["shelf_id"] + category_type = 1 else browse_endpoint_data = browse_endpoint["browseId"].as_s @@ -271,7 +279,6 @@ private class CategoryParser < ItemParser title: title, contents: contents, browse_endpoint_data: browse_endpoint_data, - continuation_token: nil, badges: badges, }) end diff --git a/src/invidious/helpers/invidiousitems.cr b/src/invidious/helpers/invidiousitems.cr index 50a47726..edcb2054 100644 --- a/src/invidious/helpers/invidiousitems.cr +++ b/src/invidious/helpers/invidiousitems.cr @@ -232,7 +232,6 @@ class Category property title : String property contents : Array(SearchItem) | SearchItem property browse_endpoint_data : String? - property continuation_token : String? property badges : Array(Tuple(String, String))? def to_json(locale, json : JSON::Builder) From ea6434662daf97e8710fe4d2a4943112994ce760 Mon Sep 17 00:00:00 2001 From: syeopite Date: Sat, 8 May 2021 06:01:17 -0700 Subject: [PATCH 06/22] Change typing of Category contents to only Array (cherry picked from commit d3384e17f10d0baca70db7993df14100485be9da) --- src/invidious/helpers/invidiousitems.cr | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/invidious/helpers/invidiousitems.cr b/src/invidious/helpers/invidiousitems.cr index edcb2054..65f755e6 100644 --- a/src/invidious/helpers/invidiousitems.cr +++ b/src/invidious/helpers/invidiousitems.cr @@ -230,7 +230,7 @@ class Category include DB::Serializable property title : String - property contents : Array(SearchItem) | SearchItem + property contents : Array(SearchItem) property browse_endpoint_data : String? property badges : Array(Tuple(String, String))? 
From 7b60dac526c5df118c39bf428c0778a7a7982c98 Mon Sep 17 00:00:00 2001 From: syeopite Date: Sat, 8 May 2021 20:07:07 -0700 Subject: [PATCH 07/22] Add description_html field to Category (cherry picked from commit aa8f15f795787113e56473f8e8fd606749a14bdd) --- src/invidious/helpers/extractors.cr | 4 ++++ src/invidious/helpers/invidiousitems.cr | 1 + 2 files changed, 5 insertions(+) diff --git a/src/invidious/helpers/extractors.cr b/src/invidious/helpers/extractors.cr index cd3b1f93..48885d48 100644 --- a/src/invidious/helpers/extractors.cr +++ b/src/invidious/helpers/extractors.cr @@ -256,6 +256,9 @@ private class CategoryParser < ItemParser badges << {badge["style"].as_s, badge["label"].as_s} end + # Category description + description_html = item_contents["subtitle"]?.try { |desc| parse_content(desc) } || "" + # Content parsing contents = [] of SearchItem @@ -278,6 +281,7 @@ private class CategoryParser < ItemParser Category.new({ title: title, contents: contents, + description_html: description_html, browse_endpoint_data: browse_endpoint_data, badges: badges, }) diff --git a/src/invidious/helpers/invidiousitems.cr b/src/invidious/helpers/invidiousitems.cr index 65f755e6..2db838ea 100644 --- a/src/invidious/helpers/invidiousitems.cr +++ b/src/invidious/helpers/invidiousitems.cr @@ -232,6 +232,7 @@ class Category property title : String property contents : Array(SearchItem) property browse_endpoint_data : String? + property description_html : String property badges : Array(Tuple(String, String))? 
def to_json(locale, json : JSON::Builder) From abca8f7a7ca043035459abce35d334013a71e957 Mon Sep 17 00:00:00 2001 From: syeopite Date: Mon, 24 May 2021 11:18:22 -0700 Subject: [PATCH 08/22] Rename invidiousitems.cr --- .../helpers/{invidiousitems.cr => serialized_yt_data.cr} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/invidious/helpers/{invidiousitems.cr => serialized_yt_data.cr} (100%) diff --git a/src/invidious/helpers/invidiousitems.cr b/src/invidious/helpers/serialized_yt_data.cr similarity index 100% rename from src/invidious/helpers/invidiousitems.cr rename to src/invidious/helpers/serialized_yt_data.cr From be1a43a3377c543b84fd9bd534fd2033b7223e62 Mon Sep 17 00:00:00 2001 From: syeopite Date: Mon, 28 Jun 2021 23:11:04 -0700 Subject: [PATCH 09/22] Manually extract category refactor from 1b569bbc99207cae7c20aa285f42477ae361dd30 Also fixes some errors caused by cherry-picking --- spec/helpers_spec.cr | 1 + src/invidious/helpers/extractors.cr | 43 +++------------------ src/invidious/helpers/serialized_yt_data.cr | 4 +- src/invidious/search.cr | 2 +- src/invidious/videos.cr | 2 +- 5 files changed, 11 insertions(+), 41 deletions(-) diff --git a/spec/helpers_spec.cr b/spec/helpers_spec.cr index ada5b28f..b17c8d73 100644 --- a/spec/helpers_spec.cr +++ b/spec/helpers_spec.cr @@ -6,6 +6,7 @@ require "spec" require "yaml" require "../src/invidious/helpers/*" require "../src/invidious/channels/*" +require "../src/invidious/videos" require "../src/invidious/comments" require "../src/invidious/playlists" require "../src/invidious/search" diff --git a/src/invidious/helpers/extractors.cr b/src/invidious/helpers/extractors.cr index 48885d48..c1f7205c 100644 --- a/src/invidious/helpers/extractors.cr +++ b/src/invidious/helpers/extractors.cr @@ -205,7 +205,6 @@ private class CategoryParser < ItemParser def parse(item_contents, author_fallback) # Title extraction is a bit complicated. 
There are two possible routes for it # as well as times when the title attribute just isn't sent by YT. - title_container = item_contents["title"]? || "" if !title_container.is_a? String if title = title_container["simpleText"]? @@ -217,37 +216,7 @@ private class CategoryParser < ItemParser title = "" end - auxiliary_data = {} of String => String - browse_endpoint = item_contents["endpoint"]?.try &.["browseEndpoint"] || nil - browse_endpoint_data = "" - category_type = 0 # 0: Video, 1: Channels, 2: Playlist/feed, 3: trending - - # There's no endpoint data for video and trending category - if !item_contents["endpoint"]? - if !item_contents["videoId"]? - category_type = 3 - end - end - - if !browse_endpoint.nil? - # Playlist/feed categories doesn't need the params value (nor is it even included in yt response) - # instead it uses the browseId parameter. So if there isn't a params value we can assume the - # category is a playlist/feed - if browse_endpoint["params"]? - # However, even though the channel category type returns the browse endpoint param - # we're not going to be using it in order to preserve compatablity with Youtube. - # and for an URL that looks cleaner - url = item_contents["endpoint"]["commandMetadata"]["webCommandMetadata"]["url"] - url = URI.parse(url.as_s) - auxiliary_data["view"] = url.query_params["view"] - auxiliary_data["shelf_id"] = url.query_params["shelf_id"] - - category_type = 1 - else - browse_endpoint_data = browse_endpoint["browseId"].as_s - category_type = 2 - end - end + url = item_contents["endpoint"]?.try &.["commandMetadata"]["webCommandMetadata"]["url"].as_s # Sometimes a category can have badges. 
badges = [] of Tuple(String, String) # (Badge style, label) @@ -279,11 +248,11 @@ private class CategoryParser < ItemParser end Category.new({ - title: title, - contents: contents, - description_html: description_html, - browse_endpoint_data: browse_endpoint_data, - badges: badges, + title: title, + contents: contents, + description_html: description_html, + url: url, + badges: badges, }) end end diff --git a/src/invidious/helpers/serialized_yt_data.cr b/src/invidious/helpers/serialized_yt_data.cr index 2db838ea..61356555 100644 --- a/src/invidious/helpers/serialized_yt_data.cr +++ b/src/invidious/helpers/serialized_yt_data.cr @@ -230,8 +230,8 @@ class Category include DB::Serializable property title : String - property contents : Array(SearchItem) - property browse_endpoint_data : String? + property contents : Array(SearchItem) | Array(Video) + property url : String? property description_html : String property badges : Array(Tuple(String, String))? diff --git a/src/invidious/search.cr b/src/invidious/search.cr index adf079f3..d95d802e 100644 --- a/src/invidious/search.cr +++ b/src/invidious/search.cr @@ -247,5 +247,5 @@ def process_search_query(query, page, user, region) end end - {search_query, items_without_category.size, items_without_category, url_params} + {search_query, items_without_category.size, items_without_category, operators} end diff --git a/src/invidious/videos.cr b/src/invidious/videos.cr index d9c07142..0e6bd77c 100644 --- a/src/invidious/videos.cr +++ b/src/invidious/videos.cr @@ -275,7 +275,7 @@ struct Video end end - def to_json(locale, json : JSON::Builder) + def to_json(locale : Hash(String, JSON::Any), json : JSON::Builder) json.object do json.field "type", "video" From 30e85b40f9b817c8620ef9536ad2d327da9ba83b Mon Sep 17 00:00:00 2001 From: syeopite Date: Mon, 28 Jun 2021 23:51:04 -0700 Subject: [PATCH 10/22] Fix extract_videos --- src/invidious/helpers/helpers.cr | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git 
a/src/invidious/helpers/helpers.cr b/src/invidious/helpers/helpers.cr index a52c7bd4..99adcd30 100644 --- a/src/invidious/helpers/helpers.cr +++ b/src/invidious/helpers/helpers.cr @@ -250,10 +250,13 @@ end def extract_videos(initial_data : Hash(String, JSON::Any), author_fallback : String? = nil, author_id_fallback : String? = nil) extracted = extract_items(initial_data, author_fallback, author_id_fallback) - if extracted.is_a?(Category) - target = extracted.contents - else - target = extracted + target = [] of SearchItem + extracted.each do |i| + if i.is_a?(Category) + i.contents.each { |cate_i| target << cate_i if !cate_i.is_a? Video } + else + target << i + end end return target.select(&.is_a?(SearchVideo)).map(&.as(SearchVideo)) end From 8435e7991337edcb007b82c148a372a0a678b5c1 Mon Sep 17 00:00:00 2001 From: syeopite Date: Tue, 29 Jun 2021 09:23:48 -0700 Subject: [PATCH 11/22] Improve documentation for extract_item(s) funcs --- src/invidious/helpers/extractors.cr | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/invidious/helpers/extractors.cr b/src/invidious/helpers/extractors.cr index c1f7205c..e8226888 100644 --- a/src/invidious/helpers/extractors.cr +++ b/src/invidious/helpers/extractors.cr @@ -347,10 +347,10 @@ private class ContinuationExtractor < ItemsContainerExtractor end end +# Parses an item from Youtube's JSON response into a more usable structure. +# The end result can either be a SearchVideo, SearchPlaylist or SearchChannel. def extract_item(item : JSON::Any, author_fallback : String? = nil, author_id_fallback : String? = nil) - # Parses an item from Youtube's JSON response into a more usable structure. - # The end result can either be a SearchVideo, SearchPlaylist or SearchChannel. author_fallback = AuthorFallback.new(author_fallback, author_id_fallback) # Cycles through all of the item parsers and attempt to parse the raw YT JSON data. @@ -365,8 +365,10 @@ def extract_item(item : JSON::Any, author_fallback : String? 
= nil, # TODO radioRenderer, showRenderer, shelfRenderer, horizontalCardListRenderer, searchPyvRenderer end +# Parses multiple items from Youtube's initial JSON response into a more usable structure. +# The end result is an array of SearchItem. def extract_items(initial_data : Hash(String, JSON::Any), author_fallback : String? = nil, - author_id_fallback : String? = nil) + author_id_fallback : String? = nil) : Array(SearchItem) items = [] of SearchItem if unpackaged_data = initial_data["contents"]?.try &.as_h From 3dea670091b0fc4a20d623c928292f7bd94892d8 Mon Sep 17 00:00:00 2001 From: syeopite Date: Mon, 19 Jul 2021 21:30:41 -0700 Subject: [PATCH 12/22] Switch to structs in extractors.cr for performance --- src/invidious/helpers/extractors.cr | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/invidious/helpers/extractors.cr b/src/invidious/helpers/extractors.cr index e8226888..68e84850 100644 --- a/src/invidious/helpers/extractors.cr +++ b/src/invidious/helpers/extractors.cr @@ -27,7 +27,7 @@ end # They're accessed through the process() method which validates the given data as applicable # to their specific struct and then use the internal parse() method to assemble the struct # specific to their category. -private class ItemParser +private abstract struct ItemParser # Base type for all item parsers. def process(item : JSON::Any, author_fallback : AuthorFallback) end @@ -36,7 +36,7 @@ private class ItemParser end end -private class VideoParser < ItemParser +private struct VideoParser < ItemParser def process(item, author_fallback) if item_contents = (item["videoRenderer"]? || item["gridVideoRenderer"]?) return self.parse(item_contents, author_fallback) @@ -94,7 +94,7 @@ private class VideoParser < ItemParser end end -private class ChannelParser < ItemParser +private struct ChannelParser < ItemParser def process(item, author_fallback) if item_contents = (item["channelRenderer"]? || item["gridChannelRenderer"]?) 
return self.parse(item_contents, author_fallback) @@ -125,7 +125,7 @@ private class ChannelParser < ItemParser end end -private class GridPlaylistParser < ItemParser +private struct GridPlaylistParser < ItemParser def process(item, author_fallback) if item_contents = item["gridPlaylistRenderer"]? return self.parse(item_contents, author_fallback) @@ -151,7 +151,7 @@ private class GridPlaylistParser < ItemParser end end -private class PlaylistParser < ItemParser +private struct PlaylistParser < ItemParser def process(item, author_fallback) if item_contents = item["playlistRenderer"]? return self.parse(item_contents, author_fallback) @@ -195,7 +195,7 @@ private class PlaylistParser < ItemParser end end -private class CategoryParser < ItemParser +private struct CategoryParser < ItemParser def process(item, author_fallback) if item_contents = item["shelfRenderer"]? return self.parse(item_contents, author_fallback) @@ -262,7 +262,7 @@ end # a structure we can more easily use via the parsers above. Their internals are # identical to the item parsers. -private class ItemsContainerExtractor +private abstract struct ItemsContainerExtractor def process(item : Hash(String, JSON::Any)) end @@ -270,7 +270,7 @@ private class ItemsContainerExtractor end end -private class YoutubeTabsExtractor < ItemsContainerExtractor +private struct YoutubeTabsExtractor < ItemsContainerExtractor def process(initial_data) if target = initial_data["twoColumnBrowseResultsRenderer"]? self.extract(target) @@ -304,7 +304,7 @@ private class YoutubeTabsExtractor < ItemsContainerExtractor end end -private class SearchResultsExtractor < ItemsContainerExtractor +private struct SearchResultsExtractor < ItemsContainerExtractor def process(initial_data) if target = initial_data["twoColumnSearchResultsRenderer"]? 
self.extract(target) @@ -326,7 +326,7 @@ private class SearchResultsExtractor < ItemsContainerExtractor end end -private class ContinuationExtractor < ItemsContainerExtractor +private struct ContinuationExtractor < ItemsContainerExtractor def process(initial_data) if target = initial_data["continuationContents"]? self.extract(target) From 142317c2be064f8114c7d75f9ae336eb6a6e96a3 Mon Sep 17 00:00:00 2001 From: syeopite Date: Tue, 3 Aug 2021 00:22:31 -0700 Subject: [PATCH 13/22] Overhaul extractors.cr to use modules --- src/invidious/helpers/extractors.cr | 556 ++++++++++++++-------------- 1 file changed, 269 insertions(+), 287 deletions(-) diff --git a/src/invidious/helpers/extractors.cr b/src/invidious/helpers/extractors.cr index 68e84850..cec0e728 100644 --- a/src/invidious/helpers/extractors.cr +++ b/src/invidious/helpers/extractors.cr @@ -3,257 +3,245 @@ # Tuple of Parsers/Extractors so we can easily cycle through them. private ITEM_CONTAINER_EXTRACTOR = { - YoutubeTabsExtractor.new, - SearchResultsExtractor.new, - ContinuationExtractor.new, + Extractors::YouTubeTabs, + Extractors::SearchResults, + Extractors::Continuation, } private ITEM_PARSERS = { - VideoParser.new, - ChannelParser.new, - GridPlaylistParser.new, - PlaylistParser.new, - CategoryParser.new, + Parsers::VideoRendererParser, + Parsers::ChannelRendererParser, + Parsers::GridPlaylistRendererParser, + Parsers::PlaylistRendererParser, + Parsers::CategoryRendererParser, } -private struct AuthorFallback - property name, id - - def initialize(@name : String? = nil, @id : String? = nil) - end -end +record AuthorFallback, name : String? = nil, id : String? = nil # The following are the parsers for parsing raw item data into neatly packaged structs. # They're accessed through the process() method which validates the given data as applicable # to their specific struct and then use the internal parse() method to assemble the struct # specific to their category. 
-private abstract struct ItemParser - # Base type for all item parsers. - def process(item : JSON::Any, author_fallback : AuthorFallback) - end - - private def parse(item_contents : JSON::Any, author_fallback : AuthorFallback) - end -end - -private struct VideoParser < ItemParser - def process(item, author_fallback) - if item_contents = (item["videoRenderer"]? || item["gridVideoRenderer"]?) - return self.parse(item_contents, author_fallback) - end - end - - private def parse(item_contents, author_fallback) - video_id = item_contents["videoId"].as_s - title = item_contents["title"].try { |t| t["simpleText"]?.try &.as_s || t["runs"]?.try &.as_a.map(&.["text"].as_s).join("") } || "" - - author_info = item_contents["ownerText"]?.try &.["runs"]?.try &.as_a?.try &.[0]? - author = author_info.try &.["text"].as_s || author_fallback.name || "" - author_id = author_info.try &.["navigationEndpoint"]?.try &.["browseEndpoint"]["browseId"].as_s || author_fallback.id || "" - - published = item_contents["publishedTimeText"]?.try &.["simpleText"]?.try { |t| decode_date(t.as_s) } || Time.local - view_count = item_contents["viewCountText"]?.try &.["simpleText"]?.try &.as_s.gsub(/\D+/, "").to_i64? || 0_i64 - description_html = item_contents["descriptionSnippet"]?.try { |t| parse_content(t) } || "" - length_seconds = item_contents["lengthText"]?.try &.["simpleText"]?.try &.as_s.try { |t| decode_length_seconds(t) } || - item_contents["thumbnailOverlays"]?.try &.as_a.find(&.["thumbnailOverlayTimeStatusRenderer"]?).try &.["thumbnailOverlayTimeStatusRenderer"]? 
- .try &.["text"]?.try &.["simpleText"]?.try &.as_s.try { |t| decode_length_seconds(t) } || 0 - - live_now = false - paid = false - premium = false - - premiere_timestamp = item_contents["upcomingEventData"]?.try &.["startTime"]?.try { |t| Time.unix(t.as_s.to_i64) } - - item_contents["badges"]?.try &.as_a.each do |badge| - b = badge["metadataBadgeRenderer"] - case b["label"].as_s - when "LIVE NOW" - live_now = true - when "New", "4K", "CC" - # TODO - when "Premium" - # TODO: Potentially available as item_contents["topStandaloneBadge"]["metadataBadgeRenderer"] - premium = true - else nil # Ignore +private module Parsers + module VideoRendererParser + def self.process(item : JSON::Any, author_fallback : AuthorFallback) + if item_contents = (item["videoRenderer"]? || item["gridVideoRenderer"]?) + return self.parse(item_contents, author_fallback) end end - SearchVideo.new({ - title: title, - id: video_id, - author: author, - ucid: author_id, - published: published, - views: view_count, - description_html: description_html, - length_seconds: length_seconds, - live_now: live_now, - premium: premium, - premiere_timestamp: premiere_timestamp, - }) - end -end + private def self.parse(item_contents, author_fallback) + video_id = item_contents["videoId"].as_s + title = item_contents["title"].try { |t| t["simpleText"]?.try &.as_s || t["runs"]?.try &.as_a.map(&.["text"].as_s).join("") } || "" -private struct ChannelParser < ItemParser - def process(item, author_fallback) - if item_contents = (item["channelRenderer"]? || item["gridChannelRenderer"]?) - return self.parse(item_contents, author_fallback) - end - end + author_info = item_contents["ownerText"]?.try &.["runs"]?.try &.as_a?.try &.[0]? 
+ author = author_info.try &.["text"].as_s || author_fallback.name || "" + author_id = author_info.try &.["navigationEndpoint"]?.try &.["browseEndpoint"]["browseId"].as_s || author_fallback.id || "" - private def parse(item_contents, author_fallback) - author = item_contents["title"]["simpleText"]?.try &.as_s || author_fallback.name || "" - author_id = item_contents["channelId"]?.try &.as_s || author_fallback.id || "" + published = item_contents["publishedTimeText"]?.try &.["simpleText"]?.try { |t| decode_date(t.as_s) } || Time.local + view_count = item_contents["viewCountText"]?.try &.["simpleText"]?.try &.as_s.gsub(/\D+/, "").to_i64? || 0_i64 + description_html = item_contents["descriptionSnippet"]?.try { |t| parse_content(t) } || "" + length_seconds = item_contents["lengthText"]?.try &.["simpleText"]?.try &.as_s.try { |t| decode_length_seconds(t) } || + item_contents["thumbnailOverlays"]?.try &.as_a.find(&.["thumbnailOverlayTimeStatusRenderer"]?).try &.["thumbnailOverlayTimeStatusRenderer"]? + .try &.["text"]?.try &.["simpleText"]?.try &.as_s.try { |t| decode_length_seconds(t) } || 0 - author_thumbnail = item_contents["thumbnail"]["thumbnails"]?.try &.as_a[0]?.try &.["url"]?.try &.as_s || "" - subscriber_count = item_contents["subscriberCountText"]?.try &.["simpleText"]?.try &.as_s.try { |s| short_text_to_number(s.split(" ")[0]) } || 0 + live_now = false + paid = false + premium = false - auto_generated = false - auto_generated = true if !item_contents["videoCountText"]? 
- video_count = item_contents["videoCountText"]?.try &.["runs"].as_a[0]?.try &.["text"].as_s.gsub(/\D/, "").to_i || 0 - description_html = item_contents["descriptionSnippet"]?.try { |t| parse_content(t) } || "" + premiere_timestamp = item_contents["upcomingEventData"]?.try &.["startTime"]?.try { |t| Time.unix(t.as_s.to_i64) } - SearchChannel.new({ - author: author, - ucid: author_id, - author_thumbnail: author_thumbnail, - subscriber_count: subscriber_count, - video_count: video_count, - description_html: description_html, - auto_generated: auto_generated, - }) - end -end + item_contents["badges"]?.try &.as_a.each do |badge| + b = badge["metadataBadgeRenderer"] + case b["label"].as_s + when "LIVE NOW" + live_now = true + when "New", "4K", "CC" + # TODO + when "Premium" + # TODO: Potentially available as item_contents["topStandaloneBadge"]["metadataBadgeRenderer"] + premium = true + else nil # Ignore + end + end -private struct GridPlaylistParser < ItemParser - def process(item, author_fallback) - if item_contents = item["gridPlaylistRenderer"]? - return self.parse(item_contents, author_fallback) - end - end - - private def parse(item_contents, author_fallback) - title = item_contents["title"]["runs"].as_a[0]?.try &.["text"].as_s || "" - plid = item_contents["playlistId"]?.try &.as_s || "" - - video_count = item_contents["videoCountText"]["runs"].as_a[0]?.try &.["text"].as_s.gsub(/\D/, "").to_i || 0 - playlist_thumbnail = item_contents["thumbnail"]["thumbnails"][0]?.try &.["url"]?.try &.as_s || "" - - SearchPlaylist.new({ - title: title, - id: plid, - author: author_fallback.name || "", - ucid: author_fallback.id || "", - video_count: video_count, - videos: [] of SearchPlaylistVideo, - thumbnail: playlist_thumbnail, - }) - end -end - -private struct PlaylistParser < ItemParser - def process(item, author_fallback) - if item_contents = item["playlistRenderer"]? 
- return self.parse(item_contents, author_fallback) - end - end - - def parse(item_contents, author_fallback) - title = item_contents["title"]["simpleText"]?.try &.as_s || "" - plid = item_contents["playlistId"]?.try &.as_s || "" - - video_count = item_contents["videoCount"]?.try &.as_s.to_i || 0 - playlist_thumbnail = item_contents["thumbnails"].as_a[0]?.try &.["thumbnails"]?.try &.as_a[0]?.try &.["url"].as_s || "" - - author_info = item_contents["shortBylineText"]?.try &.["runs"]?.try &.as_a?.try &.[0]? - author = author_info.try &.["text"].as_s || author_fallback.name || "" - author_id = author_info.try &.["navigationEndpoint"]?.try &.["browseEndpoint"]["browseId"].as_s || author_fallback.id || "" - - videos = item_contents["videos"]?.try &.as_a.map do |v| - v = v["childVideoRenderer"] - v_title = v["title"]["simpleText"]?.try &.as_s || "" - v_id = v["videoId"]?.try &.as_s || "" - v_length_seconds = v["lengthText"]?.try &.["simpleText"]?.try { |t| decode_length_seconds(t.as_s) } || 0 - SearchPlaylistVideo.new({ - title: v_title, - id: v_id, - length_seconds: v_length_seconds, + SearchVideo.new({ + title: title, + id: video_id, + author: author, + ucid: author_id, + published: published, + views: view_count, + description_html: description_html, + length_seconds: length_seconds, + live_now: live_now, + premium: premium, + premiere_timestamp: premiere_timestamp, }) - end || [] of SearchPlaylistVideo - - # TODO: item_contents["publishedTimeText"]? - - SearchPlaylist.new({ - title: title, - id: plid, - author: author, - ucid: author_id, - video_count: video_count, - videos: videos, - thumbnail: playlist_thumbnail, - }) - end -end - -private struct CategoryParser < ItemParser - def process(item, author_fallback) - if item_contents = item["shelfRenderer"]? - return self.parse(item_contents, author_fallback) end end - def parse(item_contents, author_fallback) - # Title extraction is a bit complicated. 
There are two possible routes for it - # as well as times when the title attribute just isn't sent by YT. - title_container = item_contents["title"]? || "" - if !title_container.is_a? String - if title = title_container["simpleText"]? - title = title.as_s + module ChannelRendererParser + def self.process(item : JSON::Any, author_fallback : AuthorFallback) + if item_contents = (item["channelRenderer"]? || item["gridChannelRenderer"]?) + return self.parse(item_contents, author_fallback) + end + end + + private def self.parse(item_contents, author_fallback) + author = item_contents["title"]["simpleText"]?.try &.as_s || author_fallback.name || "" + author_id = item_contents["channelId"]?.try &.as_s || author_fallback.id || "" + + author_thumbnail = item_contents["thumbnail"]["thumbnails"]?.try &.as_a[0]?.try &.["url"]?.try &.as_s || "" + subscriber_count = item_contents["subscriberCountText"]?.try &.["simpleText"]?.try &.as_s.try { |s| short_text_to_number(s.split(" ")[0]) } || 0 + + auto_generated = false + auto_generated = true if !item_contents["videoCountText"]? + video_count = item_contents["videoCountText"]?.try &.["runs"].as_a[0]?.try &.["text"].as_s.gsub(/\D/, "").to_i || 0 + description_html = item_contents["descriptionSnippet"]?.try { |t| parse_content(t) } || "" + + SearchChannel.new({ + author: author, + ucid: author_id, + author_thumbnail: author_thumbnail, + subscriber_count: subscriber_count, + video_count: video_count, + description_html: description_html, + auto_generated: auto_generated, + }) + end + end + + module GridPlaylistRendererParser + def self.process(item : JSON::Any, author_fallback : AuthorFallback) + if item_contents = item["gridPlaylistRenderer"]? 
+ return self.parse(item_contents, author_fallback) + end + end + + private def self.parse(item_contents, author_fallback) + title = item_contents["title"]["runs"].as_a[0]?.try &.["text"].as_s || "" + plid = item_contents["playlistId"]?.try &.as_s || "" + + video_count = item_contents["videoCountText"]["runs"].as_a[0]?.try &.["text"].as_s.gsub(/\D/, "").to_i || 0 + playlist_thumbnail = item_contents["thumbnail"]["thumbnails"][0]?.try &.["url"]?.try &.as_s || "" + + SearchPlaylist.new({ + title: title, + id: plid, + author: author_fallback.name || "", + ucid: author_fallback.id || "", + video_count: video_count, + videos: [] of SearchPlaylistVideo, + thumbnail: playlist_thumbnail, + }) + end + end + + module PlaylistRendererParser + def self.process(item : JSON::Any, author_fallback : AuthorFallback) + if item_contents = item["playlistRenderer"]? + return self.parse(item_contents, author_fallback) + end + end + + private def self.parse(item_contents, author_fallback) + title = item_contents["title"]["simpleText"]?.try &.as_s || "" + plid = item_contents["playlistId"]?.try &.as_s || "" + + video_count = item_contents["videoCount"]?.try &.as_s.to_i || 0 + playlist_thumbnail = item_contents["thumbnails"].as_a[0]?.try &.["thumbnails"]?.try &.as_a[0]?.try &.["url"].as_s || "" + + author_info = item_contents["shortBylineText"]?.try &.["runs"]?.try &.as_a?.try &.[0]? 
+ author = author_info.try &.["text"].as_s || author_fallback.name || "" + author_id = author_info.try &.["navigationEndpoint"]?.try &.["browseEndpoint"]["browseId"].as_s || author_fallback.id || "" + + videos = item_contents["videos"]?.try &.as_a.map do |v| + v = v["childVideoRenderer"] + v_title = v["title"]["simpleText"]?.try &.as_s || "" + v_id = v["videoId"]?.try &.as_s || "" + v_length_seconds = v["lengthText"]?.try &.["simpleText"]?.try { |t| decode_length_seconds(t.as_s) } || 0 + SearchPlaylistVideo.new({ + title: v_title, + id: v_id, + length_seconds: v_length_seconds, + }) + end || [] of SearchPlaylistVideo + + # TODO: item_contents["publishedTimeText"]? + + SearchPlaylist.new({ + title: title, + id: plid, + author: author, + ucid: author_id, + video_count: video_count, + videos: videos, + thumbnail: playlist_thumbnail, + }) + end + end + + module CategoryRendererParser + def self.process(item : JSON::Any, author_fallback : AuthorFallback) + if item_contents = item["shelfRenderer"]? + return self.parse(item_contents, author_fallback) + end + end + + private def self.parse(item_contents, author_fallback) + # Title extraction is a bit complicated. There are two possible routes for it + # as well as times when the title attribute just isn't sent by YT. + title_container = item_contents["title"]? || "" + if !title_container.is_a? String + if title = title_container["simpleText"]? + title = title.as_s + else + title = title_container["runs"][0]["text"].as_s + end else - title = title_container["runs"][0]["text"].as_s + title = "" end - else - title = "" - end - url = item_contents["endpoint"]?.try &.["commandMetadata"]["webCommandMetadata"]["url"].as_s + url = item_contents["endpoint"]?.try &.["commandMetadata"]["webCommandMetadata"]["url"].as_s - # Sometimes a category can have badges. 
- badges = [] of Tuple(String, String) # (Badge style, label) - item_contents["badges"]?.try &.as_a.each do |badge| - badge = badge["metadataBadgeRenderer"] - badges << {badge["style"].as_s, badge["label"].as_s} - end - - # Category description - description_html = item_contents["subtitle"]?.try { |desc| parse_content(desc) } || "" - - # Content parsing - contents = [] of SearchItem - - # Content could be in three locations. - if content_container = item_contents["content"]["horizontalListRenderer"]? - elsif content_container = item_contents["content"]["expandedShelfContentsRenderer"]? - elsif content_container = item_contents["content"]["verticalListRenderer"]? - else - content_container = item_contents["contents"] - end - - raw_contents = content_container["items"].as_a - raw_contents.each do |item| - result = extract_item(item) - if !result.nil? - contents << result + # Sometimes a category can have badges. + badges = [] of Tuple(String, String) # (Badge style, label) + item_contents["badges"]?.try &.as_a.each do |badge| + badge = badge["metadataBadgeRenderer"] + badges << {badge["style"].as_s, badge["label"].as_s} end - end - Category.new({ - title: title, - contents: contents, - description_html: description_html, - url: url, - badges: badges, - }) + # Category description + description_html = item_contents["subtitle"]?.try { |desc| parse_content(desc) } || "" + + # Content parsing + contents = [] of SearchItem + + # Content could be in three locations. + if content_container = item_contents["content"]["horizontalListRenderer"]? + elsif content_container = item_contents["content"]["expandedShelfContentsRenderer"]? + elsif content_container = item_contents["content"]["verticalListRenderer"]? + else + content_container = item_contents["contents"] + end + + raw_contents = content_container["items"].as_a + raw_contents.each do |item| + result = extract_item(item) + if !result.nil? 
+ contents << result + end + end + + Category.new({ + title: title, + contents: contents, + description_html: description_html, + url: url, + badges: badges, + }) + end end end @@ -262,88 +250,82 @@ end # a structure we can more easily use via the parsers above. Their internals are # identical to the item parsers. -private abstract struct ItemsContainerExtractor - def process(item : Hash(String, JSON::Any)) - end - - private def extract(target : JSON::Any) - end -end - -private struct YoutubeTabsExtractor < ItemsContainerExtractor - def process(initial_data) - if target = initial_data["twoColumnBrowseResultsRenderer"]? - self.extract(target) - end - end - - private def extract(target) - raw_items = [] of JSON::Any - selected_tab = extract_selected_tab(target["tabs"]) - content = selected_tab["content"] - - content["sectionListRenderer"]["contents"].as_a.each do |renderer_container| - renderer_container = renderer_container["itemSectionRenderer"] - renderer_container_contents = renderer_container["contents"].as_a[0] - - # Category extraction - if items_container = renderer_container_contents["shelfRenderer"]? - raw_items << renderer_container_contents - next - elsif items_container = renderer_container_contents["gridRenderer"]? - else - items_container = renderer_container_contents - end - - items_container["items"].as_a.each do |item| - raw_items << item +private module Extractors + module YouTubeTabs + def self.process(initial_data : Hash(String, JSON::Any)) + if target = initial_data["twoColumnBrowseResultsRenderer"]? + self.extract(target) end end - return raw_items - end -end + private def self.extract(target) + raw_items = [] of JSON::Any + selected_tab = extract_selected_tab(target["tabs"]) + content = selected_tab["content"] -private struct SearchResultsExtractor < ItemsContainerExtractor - def process(initial_data) - if target = initial_data["twoColumnSearchResultsRenderer"]? 
- self.extract(target) + content["sectionListRenderer"]["contents"].as_a.each do |renderer_container| + renderer_container = renderer_container["itemSectionRenderer"] + renderer_container_contents = renderer_container["contents"].as_a[0] + + # Category extraction + if items_container = renderer_container_contents["shelfRenderer"]? + raw_items << renderer_container_contents + next + elsif items_container = renderer_container_contents["gridRenderer"]? + else + items_container = renderer_container_contents + end + + items_container["items"].as_a.each do |item| + raw_items << item + end + end + + return raw_items end end - private def extract(target) - raw_items = [] of Array(JSON::Any) - content = target["primaryContents"] - renderer = content["sectionListRenderer"]["contents"].as_a.each do |node| - if node = node["itemSectionRenderer"]? - raw_items << node["contents"].as_a + module SearchResults + def self.process(initial_data : Hash(String, JSON::Any)) + if target = initial_data["twoColumnSearchResultsRenderer"]? + self.extract(target) end end - raw_items = raw_items.flatten + private def self.extract(target) + raw_items = [] of Array(JSON::Any) + content = target["primaryContents"] + renderer = content["sectionListRenderer"]["contents"].as_a.each do |node| + if node = node["itemSectionRenderer"]? + raw_items << node["contents"].as_a + end + end - return raw_items - end -end + raw_items = raw_items.flatten -private struct ContinuationExtractor < ItemsContainerExtractor - def process(initial_data) - if target = initial_data["continuationContents"]? - self.extract(target) - elsif target = initial_data["appendContinuationItemsAction"]? - self.extract(target) + return raw_items end end - private def extract(target) - raw_items = [] of JSON::Any - if content = target["gridContinuation"]? - raw_items = content["items"].as_a - elsif content = target["continuationItems"]? 
- raw_items = content.as_a + module Continuation + def self.process(initial_data : Hash(String, JSON::Any)) + if target = initial_data["continuationContents"]? + self.extract(target) + elsif target = initial_data["appendContinuationItemsAction"]? + self.extract(target) + end end - return raw_items + private def self.extract(target) + raw_items = [] of JSON::Any + if content = target["gridContinuation"]? + raw_items = content["items"].as_a + elsif content = target["continuationItems"]? + raw_items = content.as_a + end + + return raw_items + end end end From ca9eb0d5392743cd64c9e0c010ae9c507699bc7c Mon Sep 17 00:00:00 2001 From: syeopite Date: Tue, 3 Aug 2021 21:22:34 -0700 Subject: [PATCH 14/22] Bountiful extractor changes - Add extract_text to simplify extraction of InnerTube texts - Add helper extractor methods to reduce repetition in parsing InnerTube - Change [] more than 2 blocks long to use #dig or #dig? - Remove useless ?.try blocks for items that always exists - Add (some) documentation to VideoRendererParser --- src/invidious/helpers/extractors.cr | 178 ++++++++++++++++++++-------- 1 file changed, 127 insertions(+), 51 deletions(-) diff --git a/src/invidious/helpers/extractors.cr b/src/invidious/helpers/extractors.cr index cec0e728..dc46d40a 100644 --- a/src/invidious/helpers/extractors.cr +++ b/src/invidious/helpers/extractors.cr @@ -32,24 +32,49 @@ private module Parsers private def self.parse(item_contents, author_fallback) video_id = item_contents["videoId"].as_s - title = item_contents["title"].try { |t| t["simpleText"]?.try &.as_s || t["runs"]?.try &.as_a.map(&.["text"].as_s).join("") } || "" + title = extract_text(item_contents["title"]) || "" + # Extract author information author_info = item_contents["ownerText"]?.try &.["runs"]?.try &.as_a?.try &.[0]? 
- author = author_info.try &.["text"].as_s || author_fallback.name || "" - author_id = author_info.try &.["navigationEndpoint"]?.try &.["browseEndpoint"]["browseId"].as_s || author_fallback.id || "" + if author_info = item_contents.dig?("ownerText", "runs") + author_info = author_info[0] + author = author_info["text"].as_s + author_id = HelperExtractors.get_browse_endpoint(author_info) + else + author = author_fallback.name || "" + author_id = author_fallback.id || "" + end - published = item_contents["publishedTimeText"]?.try &.["simpleText"]?.try { |t| decode_date(t.as_s) } || Time.local - view_count = item_contents["viewCountText"]?.try &.["simpleText"]?.try &.as_s.gsub(/\D+/, "").to_i64? || 0_i64 + # For live videos (and possibly recently premiered videos) there is no published information. + # Instead, in its place is the amount of people currently watching. This behavior should be replicated + # on Invidious once all features of livestreams are supported. On an unrelated note, defaulting to the current + # time for publishing isn't a good idea. + published = item_contents["publishedTimeText"]?.try &.["simpleText"].try { |t| decode_date(t.as_s) } || Time.local + + # Typically views are stored under a "simpleText" in the "viewCountText". However, for + # livestreams and premiered it is stored under a "runs" array: [{"text":123}, {"text": "watching"}] + # When view count is disabled the "viewCountText" is not present on InnerTube data. + # TODO change default value to nil and typical encoding type to tuple storing type (watchers, views, etc) + # and count + view_count = item_contents.dig?("viewCountText", "simpleText").try &.as_s.gsub(/\D+/, "").to_i64? 
|| 0_i64 description_html = item_contents["descriptionSnippet"]?.try { |t| parse_content(t) } || "" - length_seconds = item_contents["lengthText"]?.try &.["simpleText"]?.try &.as_s.try { |t| decode_length_seconds(t) } || - item_contents["thumbnailOverlays"]?.try &.as_a.find(&.["thumbnailOverlayTimeStatusRenderer"]?).try &.["thumbnailOverlayTimeStatusRenderer"]? - .try &.["text"]?.try &.["simpleText"]?.try &.as_s.try { |t| decode_length_seconds(t) } || 0 + + # The length information *should* only always exist in "lengthText". However, the legacy Invidious code + # extracts from "thumbnailOverlays" when it doesn't. More testing is needed to see if this is + # actually needed + if length_container = item_contents["lengthText"]? + length_seconds = decode_length_seconds(length_container["simpleText"].as_s) + elsif length_container = item_contents["thumbnailOverlays"]?.try &.as_a.find(&.["thumbnailOverlayTimeStatusRenderer"]?) + length_seconds = extract_text(length_container["thumbnailOverlayTimeStatusRenderer"]["text"]).try { |t| decode_length_seconds(t) } || 0 + else + length_seconds = 0 + end live_now = false paid = false premium = false - premiere_timestamp = item_contents["upcomingEventData"]?.try &.["startTime"]?.try { |t| Time.unix(t.as_s.to_i64) } + premiere_timestamp = item_contents.dig?("upcomingEventData", "startTime").try { |t| Time.unix(t.as_s.to_i64) } item_contents["badges"]?.try &.as_a.each do |badge| b = badge["metadataBadgeRenderer"] @@ -89,15 +114,17 @@ private module Parsers end private def self.parse(item_contents, author_fallback) - author = item_contents["title"]["simpleText"]?.try &.as_s || author_fallback.name || "" + author = extract_text(item_contents["title"]) || author_fallback.name || "" author_id = item_contents["channelId"]?.try &.as_s || author_fallback.id || "" - author_thumbnail = item_contents["thumbnail"]["thumbnails"]?.try &.as_a[0]?.try &.["url"]?.try &.as_s || "" - subscriber_count = item_contents["subscriberCountText"]?.try 
&.["simpleText"]?.try &.as_s.try { |s| short_text_to_number(s.split(" ")[0]) } || 0 + author_thumbnail = HelperExtractors.get_thumbnails(item_contents) + # When public subscriber count is disabled, the subscriberCountText isn't sent by InnerTube. + # TODO change default value to nil + subscriber_count = item_contents.dig?("subscriberCountText").try &.["simpleText"].try { |s| short_text_to_number(s.as_s.split(" ")[0]) } || 0 - auto_generated = false - auto_generated = true if !item_contents["videoCountText"]? - video_count = item_contents["videoCountText"]?.try &.["runs"].as_a[0]?.try &.["text"].as_s.gsub(/\D/, "").to_i || 0 + auto_generated = !item_contents["videoCountText"]? ? true : false + + video_count = HelperExtractors.get_video_count(item_contents) description_html = item_contents["descriptionSnippet"]?.try { |t| parse_content(t) } || "" SearchChannel.new({ @@ -120,11 +147,11 @@ private module Parsers end private def self.parse(item_contents, author_fallback) - title = item_contents["title"]["runs"].as_a[0]?.try &.["text"].as_s || "" + title = extract_text(item_contents["title"]) || "" plid = item_contents["playlistId"]?.try &.as_s || "" - video_count = item_contents["videoCountText"]["runs"].as_a[0]?.try &.["text"].as_s.gsub(/\D/, "").to_i || 0 - playlist_thumbnail = item_contents["thumbnail"]["thumbnails"][0]?.try &.["url"]?.try &.as_s || "" + video_count = HelperExtractors.get_video_count(item_contents) + playlist_thumbnail = HelperExtractors.get_thumbnails(item_contents) SearchPlaylist.new({ title: title, @@ -141,26 +168,26 @@ private module Parsers module PlaylistRendererParser def self.process(item : JSON::Any, author_fallback : AuthorFallback) if item_contents = item["playlistRenderer"]? 
- return self.parse(item_contents, author_fallback) + return self.parse(item_contents) end end - private def self.parse(item_contents, author_fallback) + private def self.parse(item_contents) title = item_contents["title"]["simpleText"]?.try &.as_s || "" plid = item_contents["playlistId"]?.try &.as_s || "" - video_count = item_contents["videoCount"]?.try &.as_s.to_i || 0 - playlist_thumbnail = item_contents["thumbnails"].as_a[0]?.try &.["thumbnails"]?.try &.as_a[0]?.try &.["url"].as_s || "" + video_count = HelperExtractors.get_video_count(item_contents) + playlist_thumbnail = HelperExtractors.get_thumbnails_plural(item_contents) - author_info = item_contents["shortBylineText"]?.try &.["runs"]?.try &.as_a?.try &.[0]? - author = author_info.try &.["text"].as_s || author_fallback.name || "" - author_id = author_info.try &.["navigationEndpoint"]?.try &.["browseEndpoint"]["browseId"].as_s || author_fallback.id || "" + author_info = item_contents.dig("shortBylineText", "runs", 0) + author = author_info["text"].as_s + author_id = HelperExtractors.get_browse_endpoint(author_info) videos = item_contents["videos"]?.try &.as_a.map do |v| v = v["childVideoRenderer"] - v_title = v["title"]["simpleText"]?.try &.as_s || "" + v_title = v.dig?("title", "simpleText").try &.as_s || "" v_id = v["videoId"]?.try &.as_s || "" - v_length_seconds = v["lengthText"]?.try &.["simpleText"]?.try { |t| decode_length_seconds(t.as_s) } || 0 + v_length_seconds = v.dig?("lengthText", "simpleText").try { |t| decode_length_seconds(t.as_s) } || 0 SearchPlaylistVideo.new({ title: v_title, id: v_id, @@ -190,20 +217,8 @@ private module Parsers end private def self.parse(item_contents, author_fallback) - # Title extraction is a bit complicated. There are two possible routes for it - # as well as times when the title attribute just isn't sent by YT. - title_container = item_contents["title"]? || "" - if !title_container.is_a? String - if title = title_container["simpleText"]? 
- title = title.as_s - else - title = title_container["runs"][0]["text"].as_s - end - else - title = "" - end - - url = item_contents["endpoint"]?.try &.["commandMetadata"]["webCommandMetadata"]["url"].as_s + title = extract_text(item_contents["title"]?) || "" + url = item_contents["endpoint"]?.try &.dig("commandMetadata", "webCommandMetadata", "url").as_s # Sometimes a category can have badges. badges = [] of Tuple(String, String) # (Badge style, label) @@ -249,7 +264,6 @@ end # the internal Youtube API's JSON response. The result is then packaged into # a structure we can more easily use via the parsers above. Their internals are # identical to the item parsers. - private module Extractors module YouTubeTabs def self.process(initial_data : Hash(String, JSON::Any)) @@ -260,12 +274,10 @@ private module Extractors private def self.extract(target) raw_items = [] of JSON::Any - selected_tab = extract_selected_tab(target["tabs"]) - content = selected_tab["content"] + content = extract_selected_tab(target["tabs"])["content"] content["sectionListRenderer"]["contents"].as_a.each do |renderer_container| - renderer_container = renderer_container["itemSectionRenderer"] - renderer_container_contents = renderer_container["contents"].as_a[0] + renderer_container_contents = renderer_container["itemSectionRenderer"]["contents"].as_a[0] # Category extraction if items_container = renderer_container_contents["shelfRenderer"]? @@ -294,16 +306,14 @@ private module Extractors private def self.extract(target) raw_items = [] of Array(JSON::Any) - content = target["primaryContents"] - renderer = content["sectionListRenderer"]["contents"].as_a.each do |node| + + target.dig("primaryContents", "sectionListRenderer", "contents").as_a.each do |node| if node = node["itemSectionRenderer"]? 
raw_items << node["contents"].as_a end end - raw_items = raw_items.flatten - - return raw_items + return raw_items.flatten end end @@ -329,6 +339,72 @@ private module Extractors end end +# Helper methods to extract out certain stuff from InnerTube +private module HelperExtractors + # Retrieves the amount of videos present within the given InnerTube data. + # + # Returns a 0 when it's unable to do so + def self.get_video_count(container : JSON::Any) : Int32 + if box = container["videoCountText"]? + return extract_text(container["videoCountText"]?).try &.gsub(/\D/, "").to_i || 0 + elsif box = container["videoCount"]? + return box.as_s.to_i + else + return 0 + end + end + + # Retrieve lowest quality thumbnail from InnerTube data + # + # TODO allow configuration of image quality (-1 is highest) + # + # Raises when it's unable to parse from the given JSON data. + def self.get_thumbnails(container : JSON::Any) : String + return container.dig("thumbnail", "thumbnails", 0, "url").as_s + end + + # ditto + # YouTube sometimes sends the thumbnail as: + # {"thumbnails": [{"thumbnails": [{"url": "example.com"}, ...]}]} + def self.get_thumbnails_plural(container : JSON::Any) : String + return container.dig("thumbnails", 0, "thumbnails", 0, "url").as_s + end + + # Retrieves the ID required for querying the InnerTube browse endpoint + # + # Raises when it's unable to do so + def self.get_browse_endpoint(container) + return container.dig("navigationEndpoint", "browseEndpoint", "browseId").as_s + end +end + +# Extracts text from InnerTube response +# +# InnerTube can package text in three different formats +# "runs": [ +# {"text": "something"}, +# {"text": "cont"}, +# ... +# ] +# +# "SimpleText": "something" +# +# Or sometimes just none at all as with the data returned from +# category continuations. +def extract_text(item : JSON::Any?) : String? + if item.nil? + return nil + end + + if text_container = item["simpleText"]? 
+ return text_container.as_s + elsif text_container = item["runs"]? + return text_container.as_a.map(&.["text"].as_s).join("") + else + nil + end +end + # Parses an item from Youtube's JSON response into a more usable structure. # The end result can either be a SearchVideo, SearchPlaylist or SearchChannel. def extract_item(item : JSON::Any, author_fallback : String? = nil, From e5f07dedbf92459a237165f359d7565e638d4ffa Mon Sep 17 00:00:00 2001 From: syeopite Date: Wed, 4 Aug 2021 19:54:41 -0700 Subject: [PATCH 15/22] Typos and tiny styling changes --- src/invidious/helpers/extractors.cr | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/invidious/helpers/extractors.cr b/src/invidious/helpers/extractors.cr index dc46d40a..3a90f017 100644 --- a/src/invidious/helpers/extractors.cr +++ b/src/invidious/helpers/extractors.cr @@ -35,9 +35,7 @@ private module Parsers title = extract_text(item_contents["title"]) || "" # Extract author information - author_info = item_contents["ownerText"]?.try &.["runs"]?.try &.as_a?.try &.[0]? - if author_info = item_contents.dig?("ownerText", "runs") - author_info = author_info[0] + if author_info = item_contents.dig?("ownerText", "runs", 0) author = author_info["text"].as_s author_id = HelperExtractors.get_browse_endpoint(author_info) else @@ -49,7 +47,7 @@ private module Parsers # Instead, in its place is the amount of people currently watching. This behavior should be replicated # on Invidious once all features of livestreams are supported. On an unrelated note, defaulting to the current # time for publishing isn't a good idea. - published = item_contents["publishedTimeText"]?.try &.["simpleText"].try { |t| decode_date(t.as_s) } || Time.local + published = item_contents.dig?("publishedTimeText", "simpleText").try { |t| decode_date(t.as_s) } || Time.local # Typically views are stored under a "simpleText" in the "viewCountText". 
However, for # livestreams and premiered it is stored under a "runs" array: [{"text":123}, {"text": "watching"}] @@ -119,8 +117,10 @@ private module Parsers author_thumbnail = HelperExtractors.get_thumbnails(item_contents) # When public subscriber count is disabled, the subscriberCountText isn't sent by InnerTube. + # Always simpleText # TODO change default value to nil - subscriber_count = item_contents.dig?("subscriberCountText").try &.["simpleText"].try { |s| short_text_to_number(s.as_s.split(" ")[0]) } || 0 + subscriber_count = item_contents.dig?("subscriberCountText").try &.["simpleText"].try { \ + |s| short_text_to_number(s.as_s.split(" ")[0]) } || 0 auto_generated = !item_contents["videoCountText"]? ? true : false @@ -420,10 +420,9 @@ def extract_item(item : JSON::Any, author_fallback : String? = nil, return result end end - # TODO radioRenderer, showRenderer, shelfRenderer, horizontalCardListRenderer, searchPyvRenderer end -# Parses multiple items from Youtube's initial JSON response into a more usable structure. +# Parses multiple items from YouTube's initial JSON response into a more usable structure. # The end result is an array of SearchItem. def extract_items(initial_data : Hash(String, JSON::Any), author_fallback : String? = nil, author_id_fallback : String? = nil) : Array(SearchItem) @@ -436,7 +435,7 @@ def extract_items(initial_data : Hash(String, JSON::Any), author_fallback : Stri unpackaged_data = initial_data end - # This is identicial to the parser cyling of extract_item(). + # This is identical to the parser cyling of extract_item(). ITEM_CONTAINER_EXTRACTOR.each do |extractor| results = extractor.process(unpackaged_data) if !results.nil? 
From 092b8a4e5220cbe7e6eed45d1c331d5596dc68bc Mon Sep 17 00:00:00 2001 From: syeopite Date: Thu, 5 Aug 2021 20:31:48 -0700 Subject: [PATCH 16/22] Add documentation to extractors.cr --- src/invidious/helpers/extractors.cr | 122 ++++++++++++++++++++++++++-- 1 file changed, 115 insertions(+), 7 deletions(-) diff --git a/src/invidious/helpers/extractors.cr b/src/invidious/helpers/extractors.cr index 3a90f017..32134cc9 100644 --- a/src/invidious/helpers/extractors.cr +++ b/src/invidious/helpers/extractors.cr @@ -18,11 +18,22 @@ private ITEM_PARSERS = { record AuthorFallback, name : String? = nil, id : String? = nil -# The following are the parsers for parsing raw item data into neatly packaged structs. -# They're accessed through the process() method which validates the given data as applicable -# to their specific struct and then use the internal parse() method to assemble the struct -# specific to their category. +# Namespace for logic relating to parsing InnerTube data into various datastructs. +# +# Each of the parsers in this namespace are accessed through the #process() method +# which validates the given data as applicable to itself. If it is applicable the given +# data is passed to the private `#parse()` method which returns a datastruct of the given +# type. Otherwise, nil is returned. private module Parsers + # Parses a InnerTube videoRenderer into a SearchVideo. Returns nil when the given object isn't a videoRenderer + # + # A videoRenderer renders a video to click on within the YouTube and Invidious UI. It is **not** + # the watchable video itself. + # + # See specs for example. + # + # `videoRenderer`s can be found almost everywhere on YouTube. In categories, search results, channels, etc. + # module VideoRendererParser def self.process(item : JSON::Any, author_fallback : AuthorFallback) if item_contents = (item["videoRenderer"]? || item["gridVideoRenderer"]?) 
@@ -104,6 +115,15 @@ private module Parsers end end + # Parses a InnerTube channelRenderer into a SearchChannel. Returns nil when the given object isn't a channelRenderer + # + # A channelRenderer renders a channel to click on within the YouTube and Invidious UI. It is **not** + # the channel page itself. + # + # See specs for example. + # + # `channelRenderer`s can be found almost everywhere on YouTube. In categories, search results, channels, etc. + # module ChannelRendererParser def self.process(item : JSON::Any, author_fallback : AuthorFallback) if item_contents = (item["channelRenderer"]? || item["gridChannelRenderer"]?) @@ -139,6 +159,15 @@ private module Parsers end end + # Parses a InnerTube gridPlaylistRenderer into a SearchPlaylist. Returns nil when the given object isn't a gridPlaylistRenderer + # + # A gridPlaylistRenderer renders a playlist, that is located in a grid, to click on within the YouTube and Invidious UI. + # It is **not** the playlist itself. + # + # See specs for example. + # + # `gridPlaylistRenderer`s can be found on the playlist-tabs of channels and expanded categories. + # module GridPlaylistRendererParser def self.process(item : JSON::Any, author_fallback : AuthorFallback) if item_contents = item["gridPlaylistRenderer"]? @@ -165,6 +194,14 @@ private module Parsers end end + # Parses a InnerTube playlistRenderer into a SearchPlaylist. Returns nil when the given object isn't a playlistRenderer + # + # A playlistRenderer renders a playlist to click on within the YouTube and Invidious UI. It is **not** the playlist itself. + # + # See specs for example. + # + # `playlistRenderer`s can be found almost everywhere on YouTube. In categories, search results, recommended, etc. + # module PlaylistRendererParser def self.process(item : JSON::Any, author_fallback : AuthorFallback) if item_contents = item["playlistRenderer"]? @@ -209,6 +246,16 @@ private module Parsers end end + # Parses a InnerTube shelfRenderer into a Category. 
Returns nil when the given object isn't a shelfRenderer + # + # A shelfRenderer renders divided sections on YouTube. IE "People also watched" in search results and + # the various organizational sections in the channel home page. A separate one (richShelfRenderer) is used + # for YouTube home. A shelfRenderer can also sometimes be expanded to show more content within it. + # + # See specs for example. + # + # `shelfRenderer`s can be found almost everywhere on YouTube. In categories, search results, channels, etc. + # module CategoryRendererParser def self.process(item : JSON::Any, author_fallback : AuthorFallback) if item_contents = item["shelfRenderer"]? @@ -264,7 +311,34 @@ end # the internal Youtube API's JSON response. The result is then packaged into # a structure we can more easily use via the parsers above. Their internals are # identical to the item parsers. + +# Namespace for logic relating to extracting InnerTube's initial response to items we can parse. +# +# Each of the extractors in this namespace are accessed through the #process() method +# which validates the given data as applicable to itself. If it is applicable the given +# data is passed to the private `#extract()` method which returns an array of +# parsable items. Otherwise, nil is returned. +# +# NOTE perhaps the result from here should be abstracted into a struct in order to +# get additional metadata regarding the container of the item(s). private module Extractors + # Extracts items from the selected YouTube tab. + # + # YouTube tabs are typically stored under "twoColumnBrowseResultsRenderer" + # and is structured like this: + # + # "twoColumnBrowseResultsRenderer": { + # {"tabs": [ + # {"tabRenderer": { + # "endpoint": {...} + # "title": "Playlists", + # "selected": true, + # "content": {...}, + # ... + # }} + # ]} + # }] + # module YouTubeTabs def self.process(initial_data : Hash(String, JSON::Any)) if target = initial_data["twoColumnBrowseResultsRenderer"]? 
@@ -297,6 +371,23 @@ private module Extractors end end + # Extracts items from the InnerTube response for search results + # + # Search results are typically stored under "twoColumnSearchResultsRenderer" + # and is structured like this: + # + # "twoColumnSearchResultsRenderer": { + # {"primaryContents": { + # {"sectionListRenderer": { + # "contents": [...], + # ..., + # "subMenu": {...}, + # "hideBottomSeparator": true, + # "targetId": "search-feed" + # }} + # }} + # } + # module SearchResults def self.process(initial_data : Hash(String, JSON::Any)) if target = initial_data["twoColumnSearchResultsRenderer"]? @@ -317,6 +408,16 @@ private module Extractors end end + # Extracts continuation items from a InnerTube response + # + # Continuation items (on YouTube) are items which are appended to the + # end of the page for continuous scrolling. As such, in many cases, + # the items are lacking information such as author or category title, + # since the original results has already rendered them on the top of the page. + # + # The way they are structured is too varied to be accurately written down here. + # However, they all eventually lead to an array of parsable items after traversing + # through the JSON structure. module Continuation def self.process(initial_data : Hash(String, JSON::Any)) if target = initial_data["continuationContents"]? @@ -339,7 +440,10 @@ private module Extractors end end -# Helper methods to extract out certain stuff from InnerTube +# Helper methods to aid in the parsing of InnerTube to data structs. +# +# Mostly used to extract out repeated structures to deal with code +# repetition. private module HelperExtractors # Retrieves the amount of videos present within the given InnerTube data. 
# @@ -364,14 +468,14 @@ private module HelperExtractors end # ditto + # # YouTube sometimes sends the thumbnail as: # {"thumbnails": [{"thumbnails": [{"url": "example.com"}, ...]}]} def self.get_thumbnails_plural(container : JSON::Any) : String return container.dig("thumbnails", 0, "thumbnails", 0, "url").as_s end - # Retrieves the ID required for querying the InnerTube browse endpoint - # + # Retrieves the ID required for querying the InnerTube browse endpoint. # Raises when it's unable to do so def self.get_browse_endpoint(container) return container.dig("navigationEndpoint", "browseEndpoint", "browseId").as_s @@ -391,6 +495,10 @@ end # # Or sometimes just none at all as with the data returned from # category continuations. +# +# In order to facilitate calling this function with `#[]?`: +# A nil will be accepted. Of course, since nil cannot be parsed, +# another nil will be returned. def extract_text(item : JSON::Any?) : String? if item.nil? return nil From 6df85718e6dac2faa9037fcf2283aa6b5ab819a3 Mon Sep 17 00:00:00 2001 From: syeopite <70992037+syeopite@users.noreply.github.com> Date: Tue, 28 Sep 2021 15:23:36 +0000 Subject: [PATCH 17/22] Apply suggestions from code review Co-authored-by: Samantaz Fox --- src/invidious/helpers/extractors.cr | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/invidious/helpers/extractors.cr b/src/invidious/helpers/extractors.cr index 32134cc9..0c645868 100644 --- a/src/invidious/helpers/extractors.cr +++ b/src/invidious/helpers/extractors.cr @@ -139,8 +139,8 @@ private module Parsers # When public subscriber count is disabled, the subscriberCountText isn't sent by InnerTube. 
# Always simpleText # TODO change default value to nil - subscriber_count = item_contents.dig?("subscriberCountText").try &.["simpleText"].try { \ - |s| short_text_to_number(s.as_s.split(" ")[0]) } || 0 + subscriber_count = item_contents.dig?("subscriberCountText", "simpleText") + .try { |s| short_text_to_number(s.as_s.split(" ")[0]) } || 0 auto_generated = !item_contents["videoCountText"]? ? true : false @@ -265,7 +265,8 @@ private module Parsers private def self.parse(item_contents, author_fallback) title = extract_text(item_contents["title"]?) || "" - url = item_contents["endpoint"]?.try &.dig("commandMetadata", "webCommandMetadata", "url").as_s + url = item_contents.dig?("endpoint", "commandMetadata", "webCommandMetadata", "url") + .try &.as_s # Sometimes a category can have badges. badges = [] of Tuple(String, String) # (Badge style, label) @@ -450,7 +451,7 @@ private module HelperExtractors # Returns a 0 when it's unable to do so def self.get_video_count(container : JSON::Any) : Int32 if box = container["videoCountText"]? - return extract_text(container["videoCountText"]?).try &.gsub(/\D/, "").to_i || 0 + return extract_text(box).try &.gsub(/\D/, "").to_i || 0 elsif box = container["videoCount"]? return box.as_s.to_i else From 43ea8fa70698ef94701fdf9da419300b9a6a0710 Mon Sep 17 00:00:00 2001 From: syeopite Date: Tue, 28 Sep 2021 08:19:55 -0700 Subject: [PATCH 18/22] Convert nil for AuthorFallback to empty strings --- src/invidious/helpers/extractors.cr | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/invidious/helpers/extractors.cr b/src/invidious/helpers/extractors.cr index 0c645868..88248e8d 100644 --- a/src/invidious/helpers/extractors.cr +++ b/src/invidious/helpers/extractors.cr @@ -16,7 +16,7 @@ private ITEM_PARSERS = { Parsers::CategoryRendererParser, } -record AuthorFallback, name : String? = nil, id : String? 
= nil +record AuthorFallback, name : String, id : String # Namespace for logic relating to parsing InnerTube data into various datastructs. # @@ -50,8 +50,8 @@ private module Parsers author = author_info["text"].as_s author_id = HelperExtractors.get_browse_endpoint(author_info) else - author = author_fallback.name || "" - author_id = author_fallback.id || "" + author = author_fallback.name + author_id = author_fallback.id end # For live videos (and possibly recently premiered videos) there is no published information. @@ -132,8 +132,8 @@ private module Parsers end private def self.parse(item_contents, author_fallback) - author = extract_text(item_contents["title"]) || author_fallback.name || "" - author_id = item_contents["channelId"]?.try &.as_s || author_fallback.id || "" + author = extract_text(item_contents["title"]) || author_fallback.name + author_id = item_contents["channelId"]?.try &.as_s || author_fallback.id author_thumbnail = HelperExtractors.get_thumbnails(item_contents) # When public subscriber count is disabled, the subscriberCountText isn't sent by InnerTube. @@ -185,8 +185,8 @@ private module Parsers SearchPlaylist.new({ title: title, id: plid, - author: author_fallback.name || "", - ucid: author_fallback.id || "", + author: author_fallback.name, + ucid: author_fallback.id, video_count: video_count, videos: [] of SearchPlaylistVideo, thumbnail: playlist_thumbnail, @@ -516,9 +516,12 @@ end # Parses an item from Youtube's JSON response into a more usable structure. # The end result can either be a SearchVideo, SearchPlaylist or SearchChannel. -def extract_item(item : JSON::Any, author_fallback : String? = nil, - author_id_fallback : String? = nil) - author_fallback = AuthorFallback.new(author_fallback, author_id_fallback) +def extract_item(item : JSON::Any, author_fallback : String? = "", + author_id_fallback : String? = "") + # We "allow" nil values but secretly use empty strings instead. 
This is to save us the + # hassle of modifying every author_fallback and author_id_fallback arg usage + # which is more often than not nil. + author_fallback = AuthorFallback.new(author_fallback || "", author_id_fallback || "") # Cycles through all of the item parsers and attempt to parse the raw YT JSON data. # Each parser automatically validates the data given to see if the data is From aa59925374849a4e2aee09de5e65ba027e16f3be Mon Sep 17 00:00:00 2001 From: syeopite Date: Tue, 28 Sep 2021 08:39:00 -0700 Subject: [PATCH 19/22] Rename get_browse_endpoint to get_browse_id --- src/invidious/helpers/extractors.cr | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/invidious/helpers/extractors.cr b/src/invidious/helpers/extractors.cr index 88248e8d..13ffe1e4 100644 --- a/src/invidious/helpers/extractors.cr +++ b/src/invidious/helpers/extractors.cr @@ -48,7 +48,7 @@ private module Parsers # Extract author information if author_info = item_contents.dig?("ownerText", "runs", 0) author = author_info["text"].as_s - author_id = HelperExtractors.get_browse_endpoint(author_info) + author_id = HelperExtractors.get_browse_id(author_info) else author = author_fallback.name author_id = author_fallback.id @@ -218,7 +218,7 @@ private module Parsers author_info = item_contents.dig("shortBylineText", "runs", 0) author = author_info["text"].as_s - author_id = HelperExtractors.get_browse_endpoint(author_info) + author_id = HelperExtractors.get_browse_id(author_info) videos = item_contents["videos"]?.try &.as_a.map do |v| v = v["childVideoRenderer"] @@ -478,7 +478,7 @@ private module HelperExtractors # Retrieves the ID required for querying the InnerTube browse endpoint. 
# Raises when it's unable to do so - def self.get_browse_endpoint(container) + def self.get_browse_id(container) return container.dig("navigationEndpoint", "browseEndpoint", "browseId").as_s end end From 9ab242ca2e79ecc8a196a019619fa3ddab31b28a Mon Sep 17 00:00:00 2001 From: syeopite Date: Tue, 28 Sep 2021 08:50:23 -0700 Subject: [PATCH 20/22] Optimize routing logic of extract_item(s) funcs --- src/invidious/helpers/extractors.cr | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/invidious/helpers/extractors.cr b/src/invidious/helpers/extractors.cr index 13ffe1e4..c6929162 100644 --- a/src/invidious/helpers/extractors.cr +++ b/src/invidious/helpers/extractors.cr @@ -352,7 +352,7 @@ private module Extractors content = extract_selected_tab(target["tabs"])["content"] content["sectionListRenderer"]["contents"].as_a.each do |renderer_container| - renderer_container_contents = renderer_container["itemSectionRenderer"]["contents"].as_a[0] + renderer_container_contents = renderer_container["itemSectionRenderer"]["contents"][0] # Category extraction if items_container = renderer_container_contents["shelfRenderer"]? @@ -527,8 +527,7 @@ def extract_item(item : JSON::Any, author_fallback : String? = "", # Each parser automatically validates the data given to see if the data is # applicable to itself. If not nil is returned and the next parser is attemped. ITEM_PARSERS.each do |parser| - result = parser.process(item, author_fallback) - if !result.nil? 
+ if result = parser.process(item, author_fallback) return result end end @@ -542,22 +541,21 @@ def extract_items(initial_data : Hash(String, JSON::Any), author_fallback : Stri if unpackaged_data = initial_data["contents"]?.try &.as_h elsif unpackaged_data = initial_data["response"]?.try &.as_h - elsif unpackaged_data = initial_data["onResponseReceivedActions"]?.try &.as_a.[0].as_h + elsif unpackaged_data = initial_data.dig?("onResponseReceivedActions", 0).try &.as_h else unpackaged_data = initial_data end - # This is identical to the parser cyling of extract_item(). + # This is identical to the parser cycling of extract_item(). ITEM_CONTAINER_EXTRACTOR.each do |extractor| - results = extractor.process(unpackaged_data) - if !results.nil? - results.each do |item| - parsed_result = extract_item(item, author_fallback, author_id_fallback) - - if !parsed_result.nil? + if container = extractor.process(unpackaged_data) + # Extract items in container + container.each do |item| + if parsed_result = extract_item(item, author_fallback, author_id_fallback) items << parsed_result end end + return items end end From 23049e026f4c4f8fe02f8a911a717791345d44fa Mon Sep 17 00:00:00 2001 From: syeopite Date: Tue, 28 Sep 2021 08:55:02 -0700 Subject: [PATCH 21/22] Improve readability of SearchChannel auto-gen detect --- src/invidious/helpers/extractors.cr | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/invidious/helpers/extractors.cr b/src/invidious/helpers/extractors.cr index c6929162..83c751e0 100644 --- a/src/invidious/helpers/extractors.cr +++ b/src/invidious/helpers/extractors.cr @@ -142,7 +142,9 @@ private module Parsers subscriber_count = item_contents.dig?("subscriberCountText", "simpleText") .try { |s| short_text_to_number(s.as_s.split(" ")[0]) } || 0 - auto_generated = !item_contents["videoCountText"]? ?
true : false + # Auto-generated channels doesn't have videoCountText + # Taken from: https://github.com/iv-org/invidious/pull/2228#discussion_r717620922 + auto_generated = item_contents["videoCountText"]?.nil? video_count = HelperExtractors.get_video_count(item_contents) description_html = item_contents["descriptionSnippet"]?.try { |t| parse_content(t) } || "" From 26b28cea498f3d7be10907165e1f9d8322843911 Mon Sep 17 00:00:00 2001 From: syeopite Date: Fri, 1 Oct 2021 05:39:23 -0700 Subject: [PATCH 22/22] Use break instead of short-circuit return --- src/invidious/helpers/extractors.cr | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/invidious/helpers/extractors.cr b/src/invidious/helpers/extractors.cr index 83c751e0..850c93ec 100644 --- a/src/invidious/helpers/extractors.cr +++ b/src/invidious/helpers/extractors.cr @@ -558,7 +558,7 @@ def extract_items(initial_data : Hash(String, JSON::Any), author_fallback : Stri end end - return items + break end end