From c4fe2183cecb7469df024893aeed96c8b789b1b7 Mon Sep 17 00:00:00 2001 From: Mauricio Colli Date: Tue, 29 Oct 2019 02:00:29 -0300 Subject: [PATCH] [YouTube] Improve detection of reCAPTCHA pages --- .../schabi/newpipe/extractor/Downloader.java | 3 +++ .../extractors/YoutubeChannelExtractor.java | 6 +++-- .../extractors/YoutubePlaylistExtractor.java | 6 +++-- .../extractors/YoutubeSearchExtractor.java | 10 ++++---- .../extractors/YoutubeStreamExtractor.java | 23 ++++++++----------- .../extractors/YoutubeTrendingExtractor.java | 6 +++-- .../linkHandler/YoutubeParsingHelper.java | 21 +++++++++++++++++ .../java/org/schabi/newpipe/Downloader.java | 10 ++++++++ 8 files changed, 59 insertions(+), 26 deletions(-) diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/Downloader.java b/extractor/src/main/java/org/schabi/newpipe/extractor/Downloader.java index 5f83c82b..f3526fce 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/Downloader.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/Downloader.java @@ -62,6 +62,9 @@ public interface Downloader { DownloadResponse head(String siteUrl) throws IOException, ReCaptchaException; + DownloadResponse get(String siteUrl, Localization localization) + throws IOException, ReCaptchaException; + DownloadResponse get(String siteUrl, DownloadRequest request) throws IOException, ReCaptchaException; diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeChannelExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeChannelExtractor.java index 38722fa5..9641d393 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeChannelExtractor.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeChannelExtractor.java @@ -7,6 +7,7 @@ import com.grack.nanojson.JsonParserException; import org.jsoup.Jsoup; import 
org.jsoup.nodes.Document; import org.jsoup.nodes.Element; +import org.schabi.newpipe.extractor.DownloadResponse; import org.schabi.newpipe.extractor.Downloader; import org.schabi.newpipe.extractor.NewPipe; import org.schabi.newpipe.extractor.StreamingService; @@ -14,6 +15,7 @@ import org.schabi.newpipe.extractor.channel.ChannelExtractor; import org.schabi.newpipe.extractor.exceptions.ExtractionException; import org.schabi.newpipe.extractor.exceptions.ParsingException; import org.schabi.newpipe.extractor.linkhandler.ListLinkHandler; +import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeParsingHelper; import org.schabi.newpipe.extractor.stream.StreamInfoItem; import org.schabi.newpipe.extractor.stream.StreamInfoItemsCollector; import org.schabi.newpipe.extractor.utils.DonationLinkHelper; @@ -60,8 +62,8 @@ public class YoutubeChannelExtractor extends ChannelExtractor { @Override public void onFetchPage(@Nonnull Downloader downloader) throws IOException, ExtractionException { String channelUrl = super.getUrl() + CHANNEL_URL_PARAMETERS; - String pageContent = downloader.download(channelUrl); - doc = Jsoup.parse(pageContent, channelUrl); + final DownloadResponse response = downloader.get(channelUrl); + doc = YoutubeParsingHelper.parseAndCheckPage(channelUrl, response); } @Override diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubePlaylistExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubePlaylistExtractor.java index 98a4c402..4480b38a 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubePlaylistExtractor.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubePlaylistExtractor.java @@ -6,6 +6,7 @@ import com.grack.nanojson.JsonParserException; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; +import 
org.schabi.newpipe.extractor.DownloadResponse; import org.schabi.newpipe.extractor.Downloader; import org.schabi.newpipe.extractor.StreamingService; import org.schabi.newpipe.extractor.exceptions.ExtractionException; @@ -35,8 +36,9 @@ public class YoutubePlaylistExtractor extends PlaylistExtractor { @Override public void onFetchPage(@Nonnull Downloader downloader) throws IOException, ExtractionException { - String pageContent = downloader.download(getUrl()); - doc = Jsoup.parse(pageContent, getUrl()); + final String url = getUrl(); + final DownloadResponse response = downloader.get(url); + doc = YoutubeParsingHelper.parseAndCheckPage(url, response); } @Override diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeSearchExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeSearchExtractor.java index 709e5f57..0a954607 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeSearchExtractor.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeSearchExtractor.java @@ -3,6 +3,7 @@ package org.schabi.newpipe.extractor.services.youtube.extractors; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; +import org.schabi.newpipe.extractor.DownloadResponse; import org.schabi.newpipe.extractor.Downloader; import org.schabi.newpipe.extractor.InfoItem; import org.schabi.newpipe.extractor.StreamingService; @@ -12,6 +13,7 @@ import org.schabi.newpipe.extractor.search.InfoItemsSearchCollector; import org.schabi.newpipe.extractor.search.SearchExtractor; import org.schabi.newpipe.extractor.linkhandler.SearchQueryHandler; import org.schabi.newpipe.extractor.utils.Localization; +import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeParsingHelper; import org.schabi.newpipe.extractor.utils.Parser; import javax.annotation.Nonnull; @@ -52,13 +54,9 @@ 
public class YoutubeSearchExtractor extends SearchExtractor { @Override public void onFetchPage(@Nonnull Downloader downloader) throws IOException, ExtractionException { - final String site; final String url = getUrl(); - //String url = builder.build().toString(); - //if we've been passed a valid language code, append it to the URL - site = downloader.download(url, getLocalization()); - - doc = Jsoup.parse(site, url); + final DownloadResponse response = downloader.get(url, getLocalization()); + doc = YoutubeParsingHelper.parseAndCheckPage(url, response); } @Override diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeStreamExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeStreamExtractor.java index ee20112a..fa866cd5 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeStreamExtractor.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeStreamExtractor.java @@ -18,6 +18,7 @@ import org.schabi.newpipe.extractor.exceptions.ParsingException; import org.schabi.newpipe.extractor.exceptions.ReCaptchaException; import org.schabi.newpipe.extractor.linkhandler.LinkHandler; import org.schabi.newpipe.extractor.services.youtube.ItagItem; +import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeParsingHelper; import org.schabi.newpipe.extractor.stream.*; import org.schabi.newpipe.extractor.utils.Localization; import org.schabi.newpipe.extractor.utils.Parser; @@ -536,7 +537,7 @@ public class YoutubeStreamExtractor extends StreamExtractor { if (watch.size() < 1) { return null;// prevent the snackbar notification "report error" on age-restricted videos } - + collector.commit(extractVideoPreviewInfo(watch.first().select("li").first())); return collector.getItems().get(0); } catch (Exception e) { @@ -611,18 +612,12 @@ public class YoutubeStreamExtractor extends 
StreamExtractor { private String pageHtml = null; - private String getPageHtml(Downloader downloader) throws IOException, ExtractionException { - final String verifiedUrl = getUrl() + VERIFIED_URL_PARAMS; - if (pageHtml == null) { - pageHtml = downloader.download(verifiedUrl); - } - return pageHtml; - } - @Override public void onFetchPage(@Nonnull Downloader downloader) throws IOException, ExtractionException { - final String pageContent = getPageHtml(downloader); - doc = Jsoup.parse(pageContent, getUrl()); + final String verifiedUrl = getUrl() + VERIFIED_URL_PARAMS; + final DownloadResponse response = downloader.get(verifiedUrl); + pageHtml = response.getResponseBody(); + doc = YoutubeParsingHelper.parseAndCheckPage(verifiedUrl, response); final String playerUrl; // Check if the video is age restricted @@ -634,7 +629,7 @@ public class YoutubeStreamExtractor extends StreamExtractor { playerUrl = info.url; isAgeRestricted = true; } else { - final JsonObject ytPlayerConfig = getPlayerConfig(pageContent); + final JsonObject ytPlayerConfig = getPlayerConfig(); playerArgs = getPlayerArgs(ytPlayerConfig); playerUrl = getPlayerUrl(ytPlayerConfig); isAgeRestricted = false; @@ -650,9 +645,9 @@ public class YoutubeStreamExtractor extends StreamExtractor { } } - private JsonObject getPlayerConfig(String pageContent) throws ParsingException { + private JsonObject getPlayerConfig() throws ParsingException { try { - String ytPlayerConfigRaw = Parser.matchGroup1("ytplayer.config\\s*=\\s*(\\{.*?\\});", pageContent); + String ytPlayerConfigRaw = Parser.matchGroup1("ytplayer.config\\s*=\\s*(\\{.*?\\});", pageHtml); return JsonParser.object().from(ytPlayerConfigRaw); } catch (Parser.RegexException e) { String errorReason = getErrorMessage(); diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeTrendingExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeTrendingExtractor.java index 
df75470e..dc7cc7e6 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeTrendingExtractor.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeTrendingExtractor.java @@ -24,12 +24,14 @@ import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; +import org.schabi.newpipe.extractor.DownloadResponse; import org.schabi.newpipe.extractor.Downloader; import org.schabi.newpipe.extractor.linkhandler.ListLinkHandler; import org.schabi.newpipe.extractor.StreamingService; import org.schabi.newpipe.extractor.exceptions.ExtractionException; import org.schabi.newpipe.extractor.exceptions.ParsingException; import org.schabi.newpipe.extractor.kiosk.KioskExtractor; +import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeParsingHelper; import org.schabi.newpipe.extractor.stream.StreamInfoItem; import org.schabi.newpipe.extractor.stream.StreamInfoItemsCollector; import org.schabi.newpipe.extractor.utils.Localization; @@ -56,8 +58,8 @@ public class YoutubeTrendingExtractor extends KioskExtractor { url += "?gl=" + contentCountry; } - String pageContent = downloader.download(url); - doc = Jsoup.parse(pageContent, url); + final DownloadResponse response = downloader.get(url); + doc = YoutubeParsingHelper.parseAndCheckPage(url, response); } @Override diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubeParsingHelper.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubeParsingHelper.java index 4c365534..120275ca 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubeParsingHelper.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubeParsingHelper.java @@ -1,7 +1,11 @@ package org.schabi.newpipe.extractor.services.youtube.linkHandler; 
+import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.schabi.newpipe.extractor.DownloadResponse; import org.schabi.newpipe.extractor.exceptions.ParsingException; +import org.schabi.newpipe.extractor.exceptions.ReCaptchaException; import java.net.URL; @@ -30,6 +34,23 @@ public class YoutubeParsingHelper { private YoutubeParsingHelper() { } + private static final String[] RECAPTCHA_DETECTION_SELECTORS = { + "form[action*=\"/das_captcha\"]", + "input[name*=\"action_recaptcha_verify\"]" + }; + + public static Document parseAndCheckPage(final String url, final DownloadResponse response) throws ReCaptchaException { + final Document document = Jsoup.parse(response.getResponseBody(), url); + + for (String detectionSelector : RECAPTCHA_DETECTION_SELECTORS) { + if (!document.select(detectionSelector).isEmpty()) { + throw new ReCaptchaException("reCAPTCHA challenge requested (detected with selector: \"" + detectionSelector + "\")", url); + } + } + + return document; + } + public static boolean isYoutubeURL(URL url) { String host = url.getHost(); return host.equalsIgnoreCase("youtube.com") || host.equalsIgnoreCase("www.youtube.com") diff --git a/extractor/src/test/java/org/schabi/newpipe/Downloader.java b/extractor/src/test/java/org/schabi/newpipe/Downloader.java index 1a7536ac..3091c74b 100644 --- a/extractor/src/test/java/org/schabi/newpipe/Downloader.java +++ b/extractor/src/test/java/org/schabi/newpipe/Downloader.java @@ -16,6 +16,8 @@ import org.schabi.newpipe.extractor.DownloadResponse; import org.schabi.newpipe.extractor.exceptions.ReCaptchaException; import org.schabi.newpipe.extractor.utils.Localization; +import static java.util.Collections.singletonList; + /* * Created by Christian Schabesberger on 28.01.16. 
* @@ -194,6 +196,14 @@ public class Downloader implements org.schabi.newpipe.extractor.Downloader { return new DownloadResponse(con.getResponseCode(), null, con.getHeaderFields()); } + @Override + public DownloadResponse get(String siteUrl, Localization localization) throws IOException, ReCaptchaException { + final Map<String, List<String>> requestHeaders = new HashMap<>(); + requestHeaders.put("Accept-Language", singletonList(localization.getLanguage())); + + return get(siteUrl, new DownloadRequest(null, requestHeaders)); + } + @Override public DownloadResponse get(String siteUrl, DownloadRequest request) throws IOException, ReCaptchaException {