invidious-copy-2022-03-16/src/invidious/search.cr

474 lines
13 KiB
Crystal
Raw Normal View History

2019-03-29 21:30:02 +00:00
# A video entry in search results, serialisable to an XML feed entry and to JSON.
struct SearchVideo
  include DB::Serializable

  property title : String
  property id : String
  property author : String
  property ucid : String
  property published : Time
  property views : Int64
  property description_html : String
  property length_seconds : Int32
  property live_now : Bool
  property paid : Bool
  property premium : Bool
  property premiere_timestamp : Time?

  # Writes this video as an `entry` element into *xml*.
  #
  # NOTE: sets `query_params["v"]` to this video's id, so the caller's
  # *query_params* object is modified.
  #
  # *auto_generated* is accepted for interface compatibility; the author
  # element is the same whether or not it is set.
  def to_xml(auto_generated, query_params, xml : XML::Builder)
    query_params["v"] = self.id

    watch_url = "#{HOST_URL}/watch?#{query_params}"
    thumbnail_url = "#{HOST_URL}/vi/#{self.id}/mqdefault.jpg"

    xml.element("entry") do
      xml.element("id") { xml.text "yt:video:#{self.id}" }
      xml.element("yt:videoId") { xml.text self.id }
      xml.element("yt:channelId") { xml.text self.ucid }
      xml.element("title") { xml.text self.title }
      xml.element("link", rel: "alternate", href: watch_url)

      # The original code branched on `auto_generated` here, but both branches
      # emitted exactly these two elements, so the branch has been collapsed.
      xml.element("author") do
        xml.element("name") { xml.text self.author }
        xml.element("uri") { xml.text "#{HOST_URL}/channel/#{self.ucid}" }
      end

      xml.element("content", type: "xhtml") do
        xml.element("div", xmlns: "http://www.w3.org/1999/xhtml") do
          xml.element("a", href: watch_url) do
            xml.element("img", src: thumbnail_url)
          end

          xml.element("p", style: "word-break:break-word;white-space:pre-wrap") { xml.text html_to_content(self.description_html) }
        end
      end

      xml.element("published") { xml.text self.published.to_s("%Y-%m-%dT%H:%M:%S%:z") }

      xml.element("media:group") do
        xml.element("media:title") { xml.text self.title }
        xml.element("media:thumbnail", url: thumbnail_url,
          width: "320", height: "180")
        xml.element("media:description") { xml.text html_to_content(self.description_html) }
      end

      xml.element("media:community") do
        xml.element("media:statistics", views: self.views)
      end
    end
  end

  # Convenience overload: writes into *xml* when one is given, otherwise builds
  # a standalone XML document with `XML.build` and returns it.
  #
  # Fix: the previous version called `to_xml(HOST_URL, ...)` with four
  # arguments (no such overload exists) and, in the builder-less branch,
  # forwarded the nil *xml* instead of the builder yielded by `XML.build`.
  def to_xml(auto_generated, query_params, xml : XML::Builder | Nil = nil)
    if xml
      to_xml(auto_generated, query_params, xml)
    else
      XML.build do |builder|
        to_xml(auto_generated, query_params, builder)
      end
    end
  end

  # Writes this video as a JSON object into *json*.
  #
  # *locale* is used to translate the human-readable "publishedText" field.
  # "premiereTimestamp" is only emitted when a premiere timestamp is present.
  def to_json(locale, json : JSON::Builder)
    json.object do
      json.field "type", "video"
      json.field "title", self.title
      json.field "videoId", self.id

      json.field "author", self.author
      json.field "authorId", self.ucid
      json.field "authorUrl", "/channel/#{self.ucid}"

      json.field "videoThumbnails" do
        generate_thumbnails(json, self.id)
      end

      json.field "description", html_to_content(self.description_html)
      json.field "descriptionHtml", self.description_html

      json.field "viewCount", self.views
      json.field "published", self.published.to_unix
      json.field "publishedText", translate(locale, "`x` ago", recode_date(self.published, locale))
      json.field "lengthSeconds", self.length_seconds
      json.field "liveNow", self.live_now
      json.field "paid", self.paid
      json.field "premium", self.premium
      json.field "isUpcoming", self.is_upcoming

      if timestamp = self.premiere_timestamp
        json.field "premiereTimestamp", timestamp.to_unix
      end
    end
  end

  # Convenience overload: writes into *json* when one is given, otherwise
  # builds and returns a standalone JSON string.
  def to_json(locale, json : JSON::Builder | Nil = nil)
    if json
      to_json(locale, json)
    else
      JSON.build do |builder|
        to_json(locale, builder)
      end
    end
  end

  # Returns `true` when a premiere timestamp is set, `false` otherwise.
  def is_upcoming
    premiere_timestamp ? true : false
  end
end
2019-03-29 21:30:02 +00:00
# Minimal video record used as the element type of `SearchPlaylist#videos`.
struct SearchPlaylistVideo
include DB::Serializable
# Video title.
property title : String
# Video id.
property id : String
# Video length in seconds.
property length_seconds : Int32
end
2019-03-29 21:30:02 +00:00
# A playlist entry in search results, together with the videos listed for it.
struct SearchPlaylist
  include DB::Serializable

  property title : String
  property id : String
  property author : String
  property ucid : String
  property video_count : Int32
  property videos : Array(SearchPlaylistVideo)
  property thumbnail : String?

  # Writes this playlist as a JSON object into *json*, including one nested
  # object (title, id, length and thumbnails) per entry in `videos`.
  #
  # *locale* is accepted but not used by this method.
  def to_json(locale, json : JSON::Builder)
    json.object do
      json.field "type", "playlist"
      json.field "title", title
      json.field "playlistId", id
      json.field "playlistThumbnail", thumbnail

      json.field "author", author
      json.field "authorId", ucid
      json.field "authorUrl", "/channel/#{ucid}"

      json.field "videoCount", video_count
      json.field "videos" do
        json.array do
          videos.each do |entry|
            json.object do
              json.field "title", entry.title
              json.field "videoId", entry.id
              json.field "lengthSeconds", entry.length_seconds

              json.field "videoThumbnails" do
                generate_thumbnails(json, entry.id)
              end
            end
          end
        end
      end
    end
  end

  # Convenience overload: writes into *json* when one is given, otherwise
  # builds and returns a standalone JSON string.
  def to_json(locale, json : JSON::Builder | Nil = nil)
    return to_json(locale, json) if json

    JSON.build do |builder|
      to_json(locale, builder)
    end
  end
end
2019-03-29 21:30:02 +00:00
# A channel entry in search results.
struct SearchChannel
  include DB::Serializable

  property author : String
  property ucid : String
  property author_thumbnail : String
  property subscriber_count : Int32
  property video_count : Int32
  property description_html : String
  property auto_generated : Bool

  # Writes this channel as a JSON object into *json*.
  #
  # "authorThumbnails" holds one square entry per size; each URL is derived
  # from `author_thumbnail` by replacing every `=<digits>` match with
  # `=s<size>`.
  #
  # *locale* is accepted but not used by this method.
  def to_json(locale, json : JSON::Builder)
    json.object do
      json.field "type", "channel"
      json.field "author", author
      json.field "authorId", ucid
      json.field "authorUrl", "/channel/#{ucid}"

      json.field "authorThumbnails" do
        json.array do
          {32, 48, 76, 100, 176, 512}.each do |size|
            json.object do
              json.field "url", author_thumbnail.gsub(/=\d+/, "=s#{size}")
              json.field "width", size
              json.field "height", size
            end
          end
        end
      end

      json.field "autoGenerated", auto_generated
      json.field "subCount", subscriber_count
      json.field "videoCount", video_count

      json.field "description", html_to_content(description_html)
      json.field "descriptionHtml", description_html
    end
  end

  # Convenience overload: writes into *json* when one is given, otherwise
  # builds and returns a standalone JSON string.
  def to_json(locale, json : JSON::Builder | Nil = nil)
    return to_json(locale, json) if json

    JSON.build do |builder|
      to_json(locale, builder)
    end
  end
end
# Union of the result kinds a search can return; used as the element type of
# the item arrays returned by `channel_search` and `search`.
alias SearchItem = SearchVideo | SearchChannel | SearchPlaylist
2018-09-13 22:47:31 +00:00
# Searches within a single channel.
#
# *channel* is first requested as `/channel/<channel>`. If that returns 404 it
# is retried as `/user/<channel>` and then `/c/<channel>`, and the channel id
# is read from the returned page's initial data. Otherwise *channel* itself is
# used as the channel id.
#
# Returns a `{count, items}` tuple, where `count` is the number of items
# extracted for this page. Returns `{0, []}` when the response has no
# continuation items.
#
# Raises `InfoException` when the channel id cannot be extracted.
def channel_search(query, page, channel)
  response = YT_POOL.client &.get("/channel/#{channel}")

  if response.status_code == 404
    response = YT_POOL.client &.get("/user/#{channel}")
    if response.status_code == 404
      response = YT_POOL.client &.get("/c/#{channel}")
    end

    initial_data = extract_initial_data(response.body)
    ucid = initial_data["header"]["c4TabbedHeaderRenderer"]?.try &.["channelId"].as_s?
    raise InfoException.new("Impossible to extract channel ID from page") unless ucid
  else
    ucid = channel
  end

  continuation = produce_channel_search_continuation(ucid, query, page)
  response_json = request_youtube_api_browse(continuation)

  result = JSON.parse(response_json)
  continuation_items = result["onResponseReceivedActions"]?
    .try &.[0]["appendContinuationItemsAction"]["continuationItems"]
  return {0, [] of SearchItem} unless continuation_items

  items = [] of SearchItem
  continuation_items.as_a.each do |entry|
    # Only item sections carry results; other entries are skipped.
    next unless entry.as_h.has_key?("itemSectionRenderer")

    # Only the first element of each section's contents is considered.
    if item = extract_item(entry["itemSectionRenderer"]["contents"].as_a[0])
      items << item
    end
  end

  {items.size, items}
end
# Runs a search for *query* through `YT_POOL` and extracts the result items.
#
# *search_params* is the already-encoded value placed in the `sp` URL
# parameter; *region* is passed to `YT_POOL.client`.
#
# Returns a `{count, items}` tuple, where `count` is the number of extracted
# items. Returns `{0, []}` when *query* or the response body is empty.
def search(query, search_params = produce_search_params(content_type: "all"), region = nil)
  return {0, [] of SearchItem} if query.empty?

  body = YT_POOL.client(region) do |client|
    client.get("/results?search_query=#{URI.encode_www_form(query)}&sp=#{search_params}&hl=en").body
  end
  return {0, [] of SearchItem} if body.empty?

  items = extract_items(extract_initial_data(body))

  # The count is the number of extracted items; the response's
  # "estimatedResults" value is not used.
  {items.size, items}
end
# Builds the encoded search-parameter string for the given filters.
#
# The options are collected into a hash keyed by "<field>:<type>" strings,
# converted with `Protodec::Any.cast_json` and `Protodec::Any.from_json`, then
# URL-safe Base64-encoded and finally form-encoded.
#
# * *page* - 1-based page number; stored as the offset `(page - 1) * 20`.
# * *sort* - "relevance", "rating", "upload_date"/"date" or "view_count"/"views".
# * *date* - "hour", "today", "week", "month" or "year"; anything else is ignored.
# * *content_type* - "video", "channel", "playlist", "movie", "show" or "all"
#   (no filter); any other value is treated like "video".
# * *duration* - "short" or "long"; anything else is ignored.
# * *features* - list of feature names; unrecognised names are ignored.
#
# Raises when *sort* is not one of the recognised values.
def produce_search_params(page = 1, sort : String = "relevance", date : String = "", content_type : String = "",
                          duration : String = "", features : Array(String) = [] of String)
  offset = ((page - 1) * 20).to_i64

  sort_code = case sort
              when "relevance"           then 0_i64
              when "rating"              then 1_i64
              when "upload_date", "date" then 2_i64
              when "view_count", "views" then 3_i64
              else
                raise "No sort #{sort}"
              end

  # Filter fields are inserted in a fixed order: date, content type, duration,
  # then features in the order they were given.
  filters = {} of String => Int64

  date_codes = {"hour" => 1_i64, "today" => 2_i64, "week" => 3_i64, "month" => 4_i64, "year" => 5_i64}
  if date_code = date_codes[date]?
    filters["1:varint"] = date_code
  end

  unless content_type == "all"
    type_codes = {"video" => 1_i64, "channel" => 2_i64, "playlist" => 3_i64, "movie" => 4_i64, "show" => 5_i64}
    # Unknown content types fall back to the "video" code.
    filters["2:varint"] = type_codes.fetch(content_type, 1_i64)
  end

  duration_codes = {"short" => 1_i64, "long" => 2_i64}
  if duration_code = duration_codes[duration]?
    filters["3:varint"] = duration_code
  end

  feature_fields = {
    "hd"               => "4:varint",
    "subtitles"        => "5:varint",
    "creative_commons" => "6:varint",
    "cc"               => "6:varint",
    "3d"               => "7:varint",
    "live"             => "8:varint",
    "livestream"       => "8:varint",
    "purchased"        => "9:varint",
    "4k"               => "14:varint",
    "360"              => "15:varint",
    "location"         => "23:varint",
    "hdr"              => "25:varint",
  }
  features.each do |feature|
    if field = feature_fields[feature]?
      filters[field] = 1_i64
    end
  end

  object = {
    "1:varint"   => sort_code,
    "2:embedded" => filters,
    "9:varint"   => offset,
  }

  # The embedded filter message is left out when no filter was set.
  object.delete("2:embedded") if filters.empty?

  payload = Protodec::Any.cast_json(object)
  encoded = Protodec::Any.from_json(payload)
  URI.encode_www_form(Base64.urlsafe_encode(encoded))
end
2018-09-13 22:47:31 +00:00
2021-03-24 05:15:06 +00:00
# Builds the encoded continuation token used to search inside the channel
# *ucid* for *query*.
#
# *page* is converted to an item index of `30 * (page - 1)`; pages at or below
# 1 use index 0. The structure is converted with `Protodec::Any.cast_json` and
# `Protodec::Any.from_json`, then URL-safe Base64-encoded and form-encoded.
def produce_channel_search_continuation(ucid, query, page)
  idx = page <= 1 ? 0_i64 : 30_i64 * (page - 1)

  paging = {
    "3:varint" => idx,
  }

  browse_params = {
    "2:string"  => "search",
    "6:varint"  => 1_i64,
    "7:varint"  => 1_i64,
    "12:varint" => 1_i64,
    "15:base64" => paging,
    "23:varint" => 0_i64,
  }

  object = {
    "80226972:embedded" => {
      "2:string"  => ucid,
      "3:base64"  => browse_params,
      "11:string" => query,
      "35:string" => "browse-feed#{ucid}search",
    },
  }

  payload = Protodec::Any.cast_json(object)
  encoded = Protodec::Any.from_json(payload)
  URI.encode_www_form(Base64.urlsafe_encode(encoded))
end
2019-08-05 23:49:13 +00:00
# Parses inline `key:value` operators out of *query* and runs the matching
# search.
#
# Space-separated tokens matching `\w+:[\w,]+` are treated as operator
# candidates. Recognised keys are `channel`/`user`, `content_type`/`type`,
# `date`, `duration`, `feature`/`features`, `sort` and `subscriptions`.
# Tokens with any other key are not operators and stay in the search text.
#
# Dispatch:
# * a `channel`/`user` operator searches inside that channel;
# * `subscriptions:true` searches the user's subscription view in the
#   database (empty result when there is no *user*);
# * otherwise a regular search is performed with the collected filters.
#
# Returns `{search_query, count, items, operators}`, where `search_query` is
# *query* without the recognised operator tokens.
def process_search_query(query, page, user, region)
  if user
    user = user.as(User)
    view_name = "subscriptions_#{sha256(user.email)}"
  end

  channel = nil
  content_type = "all"
  date = ""
  duration = ""
  features = [] of String
  sort = "relevance"
  subscriptions = nil

  operators = query.split(" ").select { |a| a.match(/\w+:[\w,]+/) }

  # Iterate over a snapshot: unrecognised tokens are removed from `operators`
  # inside the loop. Deleting from the array being iterated would shift the
  # remaining elements, so the token after an unrecognised one would be
  # skipped: never applied, yet still stripped from the search text.
  operators.dup.each do |operator|
    key, value = operator.downcase.split(":")

    case key
    when "channel", "user"
      # Taken from the original token so the channel name keeps its case.
      channel = operator.split(":")[-1]
    when "content_type", "type"
      content_type = value
    when "date"
      date = value
    when "duration"
      duration = value
    when "feature", "features"
      features = value.split(",")
    when "sort"
      sort = value
    when "subscriptions"
      subscriptions = value == "true"
    else
      operators.delete(operator)
    end
  end

  search_query = (query.split(" ") - operators).join(" ")

  if channel
    count, items = channel_search(search_query, page, channel)
  elsif subscriptions
    if view_name
      # `view_name` is built from a SHA-256 of the user's e-mail, not from the
      # query text; the search text and offset are bound as parameters.
      items = PG_DB.query_all("SELECT id,title,published,updated,ucid,author,length_seconds FROM (
SELECT *,
to_tsvector(#{view_name}.title) ||
to_tsvector(#{view_name}.author)
as document
FROM #{view_name}
) v_search WHERE v_search.document @@ plainto_tsquery($1) LIMIT 20 OFFSET $2;", search_query, (page - 1) * 20, as: ChannelVideo)
      count = items.size
    else
      items = [] of ChannelVideo
      count = 0
    end
  else
    search_params = produce_search_params(page: page, sort: sort, date: date, content_type: content_type,
      duration: duration, features: features)

    count, items = search(search_query, search_params, region).as(Tuple)
  end

  {search_query, count, items, operators}
end