Implement pagination in YoutubeSearchExtractor

This commit is contained in:
wb9688 2020-02-24 18:24:36 +01:00 committed by TobiGr
parent c0a8e01889
commit 5d883d100c
2 changed files with 62 additions and 42 deletions

View file

@ -5,9 +5,7 @@ import com.grack.nanojson.JsonObject;
import com.grack.nanojson.JsonParser;
import com.grack.nanojson.JsonParserException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.schabi.newpipe.extractor.InfoItem;
import org.schabi.newpipe.extractor.StreamingService;
import org.schabi.newpipe.extractor.downloader.Downloader;
@ -19,12 +17,12 @@ import org.schabi.newpipe.extractor.localization.TimeAgoParser;
import org.schabi.newpipe.extractor.search.InfoItemsSearchCollector;
import org.schabi.newpipe.extractor.search.SearchExtractor;
import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeParsingHelper;
import org.schabi.newpipe.extractor.utils.Parser;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.annotation.Nonnull;
@ -73,58 +71,70 @@ public class YoutubeSearchExtractor extends SearchExtractor {
@Override
public String getSearchSuggestion() {
final Element el = doc.select("div[class*=\"spell-correction\"]").first();
if (el != null) {
return el.select("a").first().text();
} else {
JsonObject showingResultsForRenderer = initialData.getObject("contents")
.getObject("twoColumnSearchResultsRenderer").getObject("primaryContents")
.getObject("sectionListRenderer").getArray("contents").getObject(0)
.getObject("itemSectionRenderer").getArray("contents").getObject(0)
.getObject("showingResultsForRenderer");
if (showingResultsForRenderer == null) {
return "";
} else {
return showingResultsForRenderer.getObject("correctedQuery").getArray("runs")
.getObject(0).getString("text");
}
}
@Nonnull
@Override
public InfoItemsPage<InfoItem> getInitialPage() throws ExtractionException {
return new InfoItemsPage<>(collectItems(doc), getNextPageUrl());
InfoItemsSearchCollector collector = getInfoItemSearchCollector();
JsonArray videos = initialData.getObject("contents").getObject("twoColumnSearchResultsRenderer")
.getObject("primaryContents").getObject("sectionListRenderer").getArray("contents")
.getObject(0).getObject("itemSectionRenderer").getArray("contents");
collectStreamsFrom(collector, videos);
return new InfoItemsPage<>(collector, getNextPageUrl());
}
@Override
public String getNextPageUrl() throws ExtractionException {
return getUrl() + "&page=" + 2;
return getNextPageUrlFrom(initialData.getObject("contents").getObject("twoColumnSearchResultsRenderer")
.getObject("primaryContents").getObject("sectionListRenderer").getArray("contents")
.getObject(0).getObject("itemSectionRenderer").getArray("continuations"));
}
@Override
public InfoItemsPage<InfoItem> getPage(String pageUrl) throws IOException, ExtractionException {
// TODO: Get extracting next pages working
final String response = getDownloader().get(pageUrl, getExtractorLocalization()).responseBody();
doc = Jsoup.parse(response, pageUrl);
if (pageUrl == null || pageUrl.isEmpty()) {
throw new ExtractionException(new IllegalArgumentException("Page url is empty or null"));
}
return new InfoItemsPage<>(collectItems(doc), getNextPageUrlFromCurrentUrl(pageUrl));
}
private String getNextPageUrlFromCurrentUrl(String currentUrl)
throws MalformedURLException, UnsupportedEncodingException {
final int pageNr = Integer.parseInt(
Parser.compatParseMap(
new URL(currentUrl)
.getQuery())
.get("page"));
return currentUrl.replace("&page=" + pageNr,
"&page=" + Integer.toString(pageNr + 1));
}
private InfoItemsSearchCollector collectItems(Document doc) throws NothingFoundException, ParsingException {
InfoItemsSearchCollector collector = getInfoItemSearchCollector();
JsonArray ajaxJson;
try {
Map<String, List<String>> headers = new HashMap<>();
headers.put("X-YouTube-Client-Name", Collections.singletonList("1"));
headers.put("X-YouTube-Client-Version", Collections.singletonList("2.20200221.03.00")); // TODO: Automatically get YouTube client version somehow
final String response = getDownloader().get(pageUrl, headers, getExtractorLocalization()).responseBody();
ajaxJson = JsonParser.array().from(response);
} catch (JsonParserException pe) {
throw new ParsingException("Could not parse json data for next streams", pe);
}
JsonObject itemSectionRenderer = ajaxJson.getObject(1).getObject("response")
.getObject("continuationContents").getObject("itemSectionContinuation");
collectStreamsFrom(collector, itemSectionRenderer.getArray("contents"));
return new InfoItemsPage<>(collector, getNextPageUrlFrom(itemSectionRenderer.getArray("continuations")));
}
private void collectStreamsFrom(InfoItemsSearchCollector collector, JsonArray videos) throws NothingFoundException, ParsingException {
collector.reset();
final TimeAgoParser timeAgoParser = getTimeAgoParser();
if (initialData == null) initialData = YoutubeParsingHelper.getInitialData(doc.toString());
JsonArray list = initialData.getObject("contents").getObject("twoColumnSearchResultsRenderer")
.getObject("primaryContents").getObject("sectionListRenderer").getArray("contents")
.getObject(0).getObject("itemSectionRenderer").getArray("contents");
for (Object item : list) {
for (Object item : videos) {
if (((JsonObject) item).getObject("backgroundPromoRenderer") != null) {
throw new NothingFoundException(((JsonObject) item).getObject("backgroundPromoRenderer")
.getObject("bodyText").getArray("runs").getObject(0).getString("text"));
@ -136,7 +146,17 @@ public class YoutubeSearchExtractor extends SearchExtractor {
collector.commit(new YoutubePlaylistInfoItemExtractor(((JsonObject) item).getObject("playlistRenderer")));
}
}
return collector;
}
private String getNextPageUrlFrom(JsonArray continuations) throws ParsingException {
if (continuations == null) {
return "";
}
JsonObject nextContinuationData = continuations.getObject(0).getObject("nextContinuationData");
String continuation = nextContinuationData.getString("continuation");
String clickTrackingParams = nextContinuationData.getString("clickTrackingParams");
return getUrl() + "&pbj=1&ctoken=" + continuation + "&continuation=" + continuation
+ "&itct=" + clickTrackingParams;
}
}

View file

@ -24,13 +24,13 @@ public class YoutubeSearchQueryHandlerFactory extends SearchQueryHandlerFactory
public String getUrl(String searchString, List<String> contentFilters, String sortFilter) throws ParsingException {
try {
final String url = "https://www.youtube.com/results"
+ "?q=" + URLEncoder.encode(searchString, CHARSET_UTF_8);
+ "?search_query=" + URLEncoder.encode(searchString, CHARSET_UTF_8);
if (contentFilters.size() > 0) {
switch (contentFilters.get(0)) {
case VIDEOS: return url + "&sp=EgIQAVAU";
case CHANNELS: return url + "&sp=EgIQAlAU";
case PLAYLISTS: return url + "&sp=EgIQA1AU";
case VIDEOS: return url + "&sp=EgIQAQ%253D%253D";
case CHANNELS: return url + "&sp=EgIQAg%253D%253D";
case PLAYLISTS: return url + "&sp=EgIQAw%253D%253D";
case ALL:
default:
}