Update sub_count extractor

This commit is contained in:
Omar Roth 2019-09-12 21:09:23 -04:00
parent 50d793e49b
commit b1fc80b79a
No known key found for this signature in database
GPG key ID: B8254FB7EC3D37F2
7 changed files with 16 additions and 34 deletions

View file

@@ -118,7 +118,7 @@ struct AboutChannel
description_html: String, description_html: String,
paid: Bool, paid: Bool,
total_views: Int64, total_views: Int64,
sub_count: Int64, sub_count: Int32,
joined: Time, joined: Time,
is_family_friendly: Bool, is_family_friendly: Bool,
allowed_regions: Array(String), allowed_regions: Array(String),
@@ -951,12 +951,6 @@ def get_about_info(ucid, locale)
raise error_message raise error_message
end end
sub_count = about.xpath_node(%q(//span[contains(text(), "subscribers")]))
if sub_count
sub_count = sub_count.content.delete(", subscribers").to_i?
end
sub_count ||= 0
author = about.xpath_node(%q(//span[contains(@class,"qualified-channel-title-text")]/a)).not_nil!.content author = about.xpath_node(%q(//span[contains(@class,"qualified-channel-title-text")]/a)).not_nil!.content
author_url = about.xpath_node(%q(//span[contains(@class,"qualified-channel-title-text")]/a)).not_nil!["href"] author_url = about.xpath_node(%q(//span[contains(@class,"qualified-channel-title-text")]/a)).not_nil!["href"]
author_thumbnail = about.xpath_node(%q(//img[@class="channel-header-profile-image"])).not_nil!["src"] author_thumbnail = about.xpath_node(%q(//img[@class="channel-header-profile-image"])).not_nil!["src"]
@@ -1000,21 +994,14 @@ def get_about_info(ucid, locale)
) )
end end
total_views = 0_i64 joined = about.xpath_node(%q(//span[contains(., "Joined")]))
sub_count = 0_i64 .try &.content.try { |text| Time.parse(text, "Joined %b %-d, %Y", Time::Location.local) } || Time.unix(0)
joined = Time.unix(0) total_views = about.xpath_node(%q(//span[contains(., "views")]/b))
metadata = about.xpath_nodes(%q(//span[@class="about-stat"])) .try &.content.try &.gsub(/\D/, "").to_i64? || 0_i64
metadata.each do |item|
case item.content sub_count = about.xpath_node(%q(.//span[contains(@class, "subscriber-count")]))
when .includes? "views" .try &.["title"].try { |text| short_text_to_number(text) } || 0
total_views = item.content.gsub(/\D/, "").to_i64
when .includes? "subscribers"
sub_count = item.content.delete("subscribers").gsub(/\D/, "").to_i64
when .includes? "Joined"
joined = Time.parse(item.content.lchop("Joined "), "%b %-d, %Y", Time::Location.local)
end
end
# Auto-generated channels # Auto-generated channels
# https://support.google.com/youtube/answer/2579942 # https://support.google.com/youtube/answer/2579942
@@ -1026,7 +1013,7 @@ def get_about_info(ucid, locale)
tabs = about.xpath_nodes(%q(//ul[@id="channel-navigation-menu"]/li/a/span)).map { |node| node.content.downcase } tabs = about.xpath_nodes(%q(//ul[@id="channel-navigation-menu"]/li/a/span)).map { |node| node.content.downcase }
return AboutChannel.new( AboutChannel.new(
ucid: ucid, ucid: ucid,
author: author, author: author,
auto_generated: auto_generated, auto_generated: auto_generated,

View file

@@ -415,13 +415,8 @@ def extract_items(nodeset, ucid = nil, author_name = nil)
author_thumbnail ||= "" author_thumbnail ||= ""
subscriber_count_text = node.xpath_node(%q(.//span[contains(@class, "yt-subscriber-count")])).try &.["title"] subscriber_count = node.xpath_node(%q(.//span[contains(@class, "subscriber-count")]))
begin .try &.["title"].try { |text| short_text_to_number(text) } || 0
subscriber_count = subscriber_count_text.try { |text| short_text_to_number(text) }
rescue ex
subscriber_count = subscriber_count_text.try &.gsub(/\D/, "").to_i?
end
subscriber_count ||= 0
video_count = node.xpath_node(%q(.//ul[@class="yt-lockup-meta-info"]/li)).try &.content.split(" ")[0].gsub(/\D/, "").to_i? video_count = node.xpath_node(%q(.//ul[@class="yt-lockup-meta-info"]/li)).try &.content.split(" ")[0].gsub(/\D/, "").to_i?

View file

@@ -157,7 +157,7 @@ def number_with_separator(number)
number.to_s.reverse.gsub(/(\d{3})(?=\d)/, "\\1,").reverse number.to_s.reverse.gsub(/(\d{3})(?=\d)/, "\\1,").reverse
end end
def short_text_to_number(short_text) def short_text_to_number(short_text : String) : Int32
case short_text case short_text
when .ends_with? "M" when .ends_with? "M"
number = short_text.rstrip(" mM").to_f number = short_text.rstrip(" mM").to_f

View file

@@ -1262,7 +1262,7 @@ def fetch_video(id, region)
end end
license = html.xpath_node(%q(//h4[contains(text(),"License")]/parent::*/ul/li)).try &.content || "" license = html.xpath_node(%q(//h4[contains(text(),"License")]/parent::*/ul/li)).try &.content || ""
sub_count_text = html.xpath_node(%q(//span[contains(@class, "yt-subscriber-count")])).try &.["title"]? || "0" sub_count_text = html.xpath_node(%q(//span[contains(@class, "subscriber-count")])).try &.["title"]? || "0"
author_thumbnail = html.xpath_node(%(//span[@class="yt-thumb-clip"]/img)).try &.["data-thumb"]?.try &.gsub(/^\/\//, "https://") || "" author_thumbnail = html.xpath_node(%(//span[@class="yt-thumb-clip"]/img)).try &.["data-thumb"]?.try &.gsub(/^\/\//, "https://") || ""
video = Video.new(id, info, Time.utc, title, views, likes, dislikes, wilson_score, published, description_html, video = Video.new(id, info, Time.utc, title, views, likes, dislikes, wilson_score, published, description_html,

View file

@@ -34,7 +34,7 @@
<div class="h-box"> <div class="h-box">
<% ucid = channel.ucid %> <% ucid = channel.ucid %>
<% author = channel.author %> <% author = channel.author %>
<% sub_count_text = channel.sub_count.format %> <% sub_count_text = number_to_short_text(channel.sub_count) %>
<%= rendered "components/subscribe_widget" %> <%= rendered "components/subscribe_widget" %>
</div> </div>

View file

@@ -33,7 +33,7 @@
<div class="h-box"> <div class="h-box">
<% ucid = channel.ucid %> <% ucid = channel.ucid %>
<% author = channel.author %> <% author = channel.author %>
<% sub_count_text = channel.sub_count.format %> <% sub_count_text = number_to_short_text(channel.sub_count) %>
<%= rendered "components/subscribe_widget" %> <%= rendered "components/subscribe_widget" %>
</div> </div>

View file

@@ -33,7 +33,7 @@
<div class="h-box"> <div class="h-box">
<% ucid = channel.ucid %> <% ucid = channel.ucid %>
<% author = channel.author %> <% author = channel.author %>
<% sub_count_text = channel.sub_count.format %> <% sub_count_text = number_to_short_text(channel.sub_count) %>
<%= rendered "components/subscribe_widget" %> <%= rendered "components/subscribe_widget" %>
</div> </div>