Implement pagination in YoutubeSearchExtractor
This commit is contained in:
parent
c0a8e01889
commit
5d883d100c
2 changed files with 62 additions and 42 deletions
|
@ -5,9 +5,7 @@ import com.grack.nanojson.JsonObject;
|
||||||
import com.grack.nanojson.JsonParser;
|
import com.grack.nanojson.JsonParser;
|
||||||
import com.grack.nanojson.JsonParserException;
|
import com.grack.nanojson.JsonParserException;
|
||||||
|
|
||||||
import org.jsoup.Jsoup;
|
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
import org.jsoup.nodes.Element;
|
|
||||||
import org.schabi.newpipe.extractor.InfoItem;
|
import org.schabi.newpipe.extractor.InfoItem;
|
||||||
import org.schabi.newpipe.extractor.StreamingService;
|
import org.schabi.newpipe.extractor.StreamingService;
|
||||||
import org.schabi.newpipe.extractor.downloader.Downloader;
|
import org.schabi.newpipe.extractor.downloader.Downloader;
|
||||||
|
@ -19,12 +17,12 @@ import org.schabi.newpipe.extractor.localization.TimeAgoParser;
|
||||||
import org.schabi.newpipe.extractor.search.InfoItemsSearchCollector;
|
import org.schabi.newpipe.extractor.search.InfoItemsSearchCollector;
|
||||||
import org.schabi.newpipe.extractor.search.SearchExtractor;
|
import org.schabi.newpipe.extractor.search.SearchExtractor;
|
||||||
import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeParsingHelper;
|
import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeParsingHelper;
|
||||||
import org.schabi.newpipe.extractor.utils.Parser;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.UnsupportedEncodingException;
|
import java.util.Collections;
|
||||||
import java.net.MalformedURLException;
|
import java.util.HashMap;
|
||||||
import java.net.URL;
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
import javax.annotation.Nonnull;
|
import javax.annotation.Nonnull;
|
||||||
|
|
||||||
|
@ -73,58 +71,70 @@ public class YoutubeSearchExtractor extends SearchExtractor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String getSearchSuggestion() {
|
public String getSearchSuggestion() {
|
||||||
final Element el = doc.select("div[class*=\"spell-correction\"]").first();
|
JsonObject showingResultsForRenderer = initialData.getObject("contents")
|
||||||
if (el != null) {
|
.getObject("twoColumnSearchResultsRenderer").getObject("primaryContents")
|
||||||
return el.select("a").first().text();
|
.getObject("sectionListRenderer").getArray("contents").getObject(0)
|
||||||
} else {
|
.getObject("itemSectionRenderer").getArray("contents").getObject(0)
|
||||||
|
.getObject("showingResultsForRenderer");
|
||||||
|
if (showingResultsForRenderer == null) {
|
||||||
return "";
|
return "";
|
||||||
|
} else {
|
||||||
|
return showingResultsForRenderer.getObject("correctedQuery").getArray("runs")
|
||||||
|
.getObject(0).getString("text");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Nonnull
|
@Nonnull
|
||||||
@Override
|
@Override
|
||||||
public InfoItemsPage<InfoItem> getInitialPage() throws ExtractionException {
|
public InfoItemsPage<InfoItem> getInitialPage() throws ExtractionException {
|
||||||
return new InfoItemsPage<>(collectItems(doc), getNextPageUrl());
|
InfoItemsSearchCollector collector = getInfoItemSearchCollector();
|
||||||
|
JsonArray videos = initialData.getObject("contents").getObject("twoColumnSearchResultsRenderer")
|
||||||
|
.getObject("primaryContents").getObject("sectionListRenderer").getArray("contents")
|
||||||
|
.getObject(0).getObject("itemSectionRenderer").getArray("contents");
|
||||||
|
|
||||||
|
collectStreamsFrom(collector, videos);
|
||||||
|
return new InfoItemsPage<>(collector, getNextPageUrl());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String getNextPageUrl() throws ExtractionException {
|
public String getNextPageUrl() throws ExtractionException {
|
||||||
return getUrl() + "&page=" + 2;
|
return getNextPageUrlFrom(initialData.getObject("contents").getObject("twoColumnSearchResultsRenderer")
|
||||||
|
.getObject("primaryContents").getObject("sectionListRenderer").getArray("contents")
|
||||||
|
.getObject(0).getObject("itemSectionRenderer").getArray("continuations"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public InfoItemsPage<InfoItem> getPage(String pageUrl) throws IOException, ExtractionException {
|
public InfoItemsPage<InfoItem> getPage(String pageUrl) throws IOException, ExtractionException {
|
||||||
// TODO: Get extracting next pages working
|
if (pageUrl == null || pageUrl.isEmpty()) {
|
||||||
final String response = getDownloader().get(pageUrl, getExtractorLocalization()).responseBody();
|
throw new ExtractionException(new IllegalArgumentException("Page url is empty or null"));
|
||||||
doc = Jsoup.parse(response, pageUrl);
|
|
||||||
|
|
||||||
return new InfoItemsPage<>(collectItems(doc), getNextPageUrlFromCurrentUrl(pageUrl));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private String getNextPageUrlFromCurrentUrl(String currentUrl)
|
|
||||||
throws MalformedURLException, UnsupportedEncodingException {
|
|
||||||
final int pageNr = Integer.parseInt(
|
|
||||||
Parser.compatParseMap(
|
|
||||||
new URL(currentUrl)
|
|
||||||
.getQuery())
|
|
||||||
.get("page"));
|
|
||||||
|
|
||||||
return currentUrl.replace("&page=" + pageNr,
|
|
||||||
"&page=" + Integer.toString(pageNr + 1));
|
|
||||||
}
|
|
||||||
|
|
||||||
private InfoItemsSearchCollector collectItems(Document doc) throws NothingFoundException, ParsingException {
|
|
||||||
InfoItemsSearchCollector collector = getInfoItemSearchCollector();
|
InfoItemsSearchCollector collector = getInfoItemSearchCollector();
|
||||||
|
JsonArray ajaxJson;
|
||||||
|
try {
|
||||||
|
Map<String, List<String>> headers = new HashMap<>();
|
||||||
|
headers.put("X-YouTube-Client-Name", Collections.singletonList("1"));
|
||||||
|
headers.put("X-YouTube-Client-Version", Collections.singletonList("2.20200221.03.00")); // TODO: Automatically get YouTube client version somehow
|
||||||
|
final String response = getDownloader().get(pageUrl, headers, getExtractorLocalization()).responseBody();
|
||||||
|
ajaxJson = JsonParser.array().from(response);
|
||||||
|
} catch (JsonParserException pe) {
|
||||||
|
throw new ParsingException("Could not parse json data for next streams", pe);
|
||||||
|
}
|
||||||
|
|
||||||
|
JsonObject itemSectionRenderer = ajaxJson.getObject(1).getObject("response")
|
||||||
|
.getObject("continuationContents").getObject("itemSectionContinuation");
|
||||||
|
|
||||||
|
collectStreamsFrom(collector, itemSectionRenderer.getArray("contents"));
|
||||||
|
|
||||||
|
return new InfoItemsPage<>(collector, getNextPageUrlFrom(itemSectionRenderer.getArray("continuations")));
|
||||||
|
}
|
||||||
|
|
||||||
|
private void collectStreamsFrom(InfoItemsSearchCollector collector, JsonArray videos) throws NothingFoundException, ParsingException {
|
||||||
collector.reset();
|
collector.reset();
|
||||||
|
|
||||||
final TimeAgoParser timeAgoParser = getTimeAgoParser();
|
final TimeAgoParser timeAgoParser = getTimeAgoParser();
|
||||||
|
|
||||||
if (initialData == null) initialData = YoutubeParsingHelper.getInitialData(doc.toString());
|
for (Object item : videos) {
|
||||||
JsonArray list = initialData.getObject("contents").getObject("twoColumnSearchResultsRenderer")
|
|
||||||
.getObject("primaryContents").getObject("sectionListRenderer").getArray("contents")
|
|
||||||
.getObject(0).getObject("itemSectionRenderer").getArray("contents");
|
|
||||||
|
|
||||||
for (Object item : list) {
|
|
||||||
if (((JsonObject) item).getObject("backgroundPromoRenderer") != null) {
|
if (((JsonObject) item).getObject("backgroundPromoRenderer") != null) {
|
||||||
throw new NothingFoundException(((JsonObject) item).getObject("backgroundPromoRenderer")
|
throw new NothingFoundException(((JsonObject) item).getObject("backgroundPromoRenderer")
|
||||||
.getObject("bodyText").getArray("runs").getObject(0).getString("text"));
|
.getObject("bodyText").getArray("runs").getObject(0).getString("text"));
|
||||||
|
@ -136,7 +146,17 @@ public class YoutubeSearchExtractor extends SearchExtractor {
|
||||||
collector.commit(new YoutubePlaylistInfoItemExtractor(((JsonObject) item).getObject("playlistRenderer")));
|
collector.commit(new YoutubePlaylistInfoItemExtractor(((JsonObject) item).getObject("playlistRenderer")));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return collector;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private String getNextPageUrlFrom(JsonArray continuations) throws ParsingException {
|
||||||
|
if (continuations == null) {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
|
||||||
|
JsonObject nextContinuationData = continuations.getObject(0).getObject("nextContinuationData");
|
||||||
|
String continuation = nextContinuationData.getString("continuation");
|
||||||
|
String clickTrackingParams = nextContinuationData.getString("clickTrackingParams");
|
||||||
|
return getUrl() + "&pbj=1&ctoken=" + continuation + "&continuation=" + continuation
|
||||||
|
+ "&itct=" + clickTrackingParams;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -24,13 +24,13 @@ public class YoutubeSearchQueryHandlerFactory extends SearchQueryHandlerFactory
|
||||||
public String getUrl(String searchString, List<String> contentFilters, String sortFilter) throws ParsingException {
|
public String getUrl(String searchString, List<String> contentFilters, String sortFilter) throws ParsingException {
|
||||||
try {
|
try {
|
||||||
final String url = "https://www.youtube.com/results"
|
final String url = "https://www.youtube.com/results"
|
||||||
+ "?q=" + URLEncoder.encode(searchString, CHARSET_UTF_8);
|
+ "?search_query=" + URLEncoder.encode(searchString, CHARSET_UTF_8);
|
||||||
|
|
||||||
if (contentFilters.size() > 0) {
|
if (contentFilters.size() > 0) {
|
||||||
switch (contentFilters.get(0)) {
|
switch (contentFilters.get(0)) {
|
||||||
case VIDEOS: return url + "&sp=EgIQAVAU";
|
case VIDEOS: return url + "&sp=EgIQAQ%253D%253D";
|
||||||
case CHANNELS: return url + "&sp=EgIQAlAU";
|
case CHANNELS: return url + "&sp=EgIQAg%253D%253D";
|
||||||
case PLAYLISTS: return url + "&sp=EgIQA1AU";
|
case PLAYLISTS: return url + "&sp=EgIQAw%253D%253D";
|
||||||
case ALL:
|
case ALL:
|
||||||
default:
|
default:
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue