Merge pull request #222 from mauriciocolli/feed-extractor

Introduce FeedExtractor to fetch from dedicated feeds when available
This commit is contained in:
Tobias Groza 2020-02-01 18:41:26 +01:00 committed by GitHub
commit 62b81c3607
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 372 additions and 28 deletions

View file

@ -7,6 +7,7 @@ import org.schabi.newpipe.extractor.channel.ChannelExtractor;
import org.schabi.newpipe.extractor.comments.CommentsExtractor; import org.schabi.newpipe.extractor.comments.CommentsExtractor;
import org.schabi.newpipe.extractor.exceptions.ExtractionException; import org.schabi.newpipe.extractor.exceptions.ExtractionException;
import org.schabi.newpipe.extractor.exceptions.ParsingException; import org.schabi.newpipe.extractor.exceptions.ParsingException;
import org.schabi.newpipe.extractor.feed.FeedExtractor;
import org.schabi.newpipe.extractor.kiosk.KioskList; import org.schabi.newpipe.extractor.kiosk.KioskList;
import org.schabi.newpipe.extractor.linkhandler.LinkHandler; import org.schabi.newpipe.extractor.linkhandler.LinkHandler;
import org.schabi.newpipe.extractor.linkhandler.LinkHandlerFactory; import org.schabi.newpipe.extractor.linkhandler.LinkHandlerFactory;
@ -24,6 +25,8 @@ import org.schabi.newpipe.extractor.stream.StreamExtractor;
import org.schabi.newpipe.extractor.subscription.SubscriptionExtractor; import org.schabi.newpipe.extractor.subscription.SubscriptionExtractor;
import org.schabi.newpipe.extractor.suggestion.SuggestionExtractor; import org.schabi.newpipe.extractor.suggestion.SuggestionExtractor;
import javax.annotation.Nullable;
/* /*
* Copyright (C) Christian Schabesberger 2018 <chris.schabesberger@mailbox.org> * Copyright (C) Christian Schabesberger 2018 <chris.schabesberger@mailbox.org>
* StreamingService.java is part of NewPipe. * StreamingService.java is part of NewPipe.
@ -173,6 +176,19 @@ public abstract class StreamingService {
*/ */
public abstract SubscriptionExtractor getSubscriptionExtractor(); public abstract SubscriptionExtractor getSubscriptionExtractor();
/**
 * Returns the extractor used to fetch items through a dedicated feed, if the service has one.
 * <p>
 * Some services (YouTube, for example) expose a separate, lightweight feed made specifically to be used like this.
 * <p>
 * Services that have no such alternative keep this default implementation, which returns null.
 *
 * @param url the url handed to a service-specific implementation; not used by this default implementation
 * @return a {@link FeedExtractor} instance, or null when the service has no dedicated feed
 * @throws ExtractionException declared so that overriding implementations may throw it; this default never does
 */
@Nullable
public FeedExtractor getFeedExtractor(String url) throws ExtractionException {
return null;
}
/** /**
* Must create a new instance of a KioskList implementation. * Must create a new instance of a KioskList implementation.
* @return a new KioskList instance * @return a new KioskList instance

View file

@ -5,9 +5,7 @@ import org.schabi.newpipe.extractor.ListInfo;
import org.schabi.newpipe.extractor.NewPipe; import org.schabi.newpipe.extractor.NewPipe;
import org.schabi.newpipe.extractor.StreamingService; import org.schabi.newpipe.extractor.StreamingService;
import org.schabi.newpipe.extractor.exceptions.ExtractionException; import org.schabi.newpipe.extractor.exceptions.ExtractionException;
import org.schabi.newpipe.extractor.exceptions.ParsingException;
import org.schabi.newpipe.extractor.linkhandler.ListLinkHandler; import org.schabi.newpipe.extractor.linkhandler.ListLinkHandler;
import org.schabi.newpipe.extractor.localization.Localization;
import org.schabi.newpipe.extractor.stream.StreamInfoItem; import org.schabi.newpipe.extractor.stream.StreamInfoItem;
import org.schabi.newpipe.extractor.utils.ExtractorHelper; import org.schabi.newpipe.extractor.utils.ExtractorHelper;
@ -35,8 +33,8 @@ import java.io.IOException;
public class ChannelInfo extends ListInfo<StreamInfoItem> { public class ChannelInfo extends ListInfo<StreamInfoItem> {
public ChannelInfo(int serviceId, ListLinkHandler linkHandler, String name) throws ParsingException { public ChannelInfo(int serviceId, String id, String url, String originalUrl, String name, ListLinkHandler listLinkHandler) {
super(serviceId, linkHandler, name); super(serviceId, id, url, originalUrl, name, listLinkHandler.getContentFilters(), listLinkHandler.getSortFilter());
} }
public static ChannelInfo getInfo(String url) throws IOException, ExtractionException { public static ChannelInfo getInfo(String url) throws IOException, ExtractionException {
@ -57,15 +55,14 @@ public class ChannelInfo extends ListInfo<StreamInfoItem> {
public static ChannelInfo getInfo(ChannelExtractor extractor) throws IOException, ExtractionException { public static ChannelInfo getInfo(ChannelExtractor extractor) throws IOException, ExtractionException {
ChannelInfo info = new ChannelInfo(extractor.getServiceId(), final int serviceId = extractor.getServiceId();
extractor.getLinkHandler(), final String id = extractor.getId();
extractor.getName()); final String url = extractor.getUrl();
final String originalUrl = extractor.getOriginalUrl();
final String name = extractor.getName();
final ChannelInfo info = new ChannelInfo(serviceId, id, url, originalUrl, name, extractor.getLinkHandler());
try {
info.setOriginalUrl(extractor.getOriginalUrl());
} catch (Exception e) {
info.addError(e);
}
try { try {
info.setAvatarUrl(extractor.getAvatarUrl()); info.setAvatarUrl(extractor.getAvatarUrl());
} catch (Exception e) { } catch (Exception e) {

View file

@ -0,0 +1,17 @@
package org.schabi.newpipe.extractor.feed;
import org.schabi.newpipe.extractor.ListExtractor;
import org.schabi.newpipe.extractor.StreamingService;
import org.schabi.newpipe.extractor.linkhandler.ListLinkHandler;
import org.schabi.newpipe.extractor.stream.StreamInfoItem;
/**
 * This class helps to extract items from lightweight feeds that the services may provide.
 * <p>
 * YouTube is an example of a service that has this alternative available.
 * <p>
 * The class declares no members besides its constructor; it pins the item type of
 * {@link ListExtractor} to {@link StreamInfoItem}.
 */
public abstract class FeedExtractor extends ListExtractor<StreamInfoItem> {
/**
 * Forwards both arguments unchanged to the {@link ListExtractor} constructor.
 *
 * @param service         the service passed on to the superclass
 * @param listLinkHandler the link handler passed on to the superclass
 */
public FeedExtractor(StreamingService service, ListLinkHandler listLinkHandler) {
super(service, listLinkHandler);
}
}

View file

@ -0,0 +1,52 @@
package org.schabi.newpipe.extractor.feed;
import org.schabi.newpipe.extractor.ListExtractor.InfoItemsPage;
import org.schabi.newpipe.extractor.ListInfo;
import org.schabi.newpipe.extractor.NewPipe;
import org.schabi.newpipe.extractor.StreamingService;
import org.schabi.newpipe.extractor.exceptions.ExtractionException;
import org.schabi.newpipe.extractor.stream.StreamInfoItem;
import org.schabi.newpipe.extractor.utils.ExtractorHelper;
import java.io.IOException;
import java.util.List;
/**
 * A {@link ListInfo} of {@link StreamInfoItem}s that is filled from a {@link FeedExtractor}.
 * <p>
 * The static {@code getInfo} overloads resolve an extractor (when needed), fetch its page and copy
 * the basic attributes plus the first page of items into a new instance.
 */
public class FeedInfo extends ListInfo<StreamInfoItem> {

    /**
     * Forwards every argument unchanged to the {@link ListInfo} constructor.
     */
    public FeedInfo(int serviceId, String id, String url, String originalUrl, String name, List<String> contentFilter, String sortFilter) {
        super(serviceId, id, url, originalUrl, name, contentFilter, sortFilter);
    }

    /**
     * Looks up the service responsible for {@code url} and delegates to
     * {@link #getInfo(StreamingService, String)}.
     *
     * @param url the url to build the feed info for
     * @return the populated feed info
     * @throws IOException         propagated from the delegated call
     * @throws ExtractionException propagated from the service lookup or the delegated call
     */
    public static FeedInfo getInfo(String url) throws IOException, ExtractionException {
        return getInfo(NewPipe.getServiceByUrl(url), url);
    }

    /**
     * Asks {@code service} for a feed extractor for {@code url} and builds the info from it.
     *
     * @param service the service expected to provide a {@link FeedExtractor}
     * @param url     the url handed to {@link StreamingService#getFeedExtractor(String)}
     * @return the populated feed info
     * @throws IllegalArgumentException if the service returns no feed extractor (null)
     * @throws IOException              propagated from fetching the page
     * @throws ExtractionException      propagated from creating or using the extractor
     */
    public static FeedInfo getInfo(StreamingService service, String url) throws IOException, ExtractionException {
        final FeedExtractor extractor = service.getFeedExtractor(url);

        if (extractor == null) {
            throw new IllegalArgumentException("Service \"" + service.getServiceInfo().getName() + "\" doesn't support FeedExtractor.");
        }

        // getInfo(FeedExtractor) fetches the page as its first step, so it is not fetched here as well.
        return getInfo(extractor);
    }

    /**
     * Fetches the extractor's page and copies its data into a new {@link FeedInfo}.
     * <p>
     * The content filter and sort filter of the created info are both passed as null.
     *
     * @param extractor the extractor to read from; its page is fetched by this method
     * @return the populated feed info, carrying the items and next page url of the page obtained
     *         through {@link ExtractorHelper#getItemsPageOrLogError}
     * @throws IOException         propagated from fetching the page
     * @throws ExtractionException propagated from fetching the page or reading the basic attributes
     */
    public static FeedInfo getInfo(FeedExtractor extractor) throws IOException, ExtractionException {
        extractor.fetchPage();

        final int serviceId = extractor.getServiceId();
        final String id = extractor.getId();
        final String url = extractor.getUrl();
        final String originalUrl = extractor.getOriginalUrl();
        final String name = extractor.getName();

        final FeedInfo info = new FeedInfo(serviceId, id, url, originalUrl, name, null, null);

        final InfoItemsPage<StreamInfoItem> itemsPage = ExtractorHelper.getItemsPageOrLogError(info, extractor);
        info.setRelatedItems(itemsPage.getItems());
        info.setNextPageUrl(itemsPage.getNextPageUrl());

        return info;
    }
}

View file

@ -12,6 +12,7 @@ import org.schabi.newpipe.extractor.StreamingService;
import org.schabi.newpipe.extractor.channel.ChannelExtractor; import org.schabi.newpipe.extractor.channel.ChannelExtractor;
import org.schabi.newpipe.extractor.comments.CommentsExtractor; import org.schabi.newpipe.extractor.comments.CommentsExtractor;
import org.schabi.newpipe.extractor.exceptions.ExtractionException; import org.schabi.newpipe.extractor.exceptions.ExtractionException;
import org.schabi.newpipe.extractor.feed.FeedExtractor;
import org.schabi.newpipe.extractor.kiosk.KioskExtractor; import org.schabi.newpipe.extractor.kiosk.KioskExtractor;
import org.schabi.newpipe.extractor.kiosk.KioskList; import org.schabi.newpipe.extractor.kiosk.KioskList;
import org.schabi.newpipe.extractor.linkhandler.LinkHandler; import org.schabi.newpipe.extractor.linkhandler.LinkHandler;
@ -24,14 +25,7 @@ import org.schabi.newpipe.extractor.localization.ContentCountry;
import org.schabi.newpipe.extractor.localization.Localization; import org.schabi.newpipe.extractor.localization.Localization;
import org.schabi.newpipe.extractor.playlist.PlaylistExtractor; import org.schabi.newpipe.extractor.playlist.PlaylistExtractor;
import org.schabi.newpipe.extractor.search.SearchExtractor; import org.schabi.newpipe.extractor.search.SearchExtractor;
import org.schabi.newpipe.extractor.services.youtube.extractors.YoutubeChannelExtractor; import org.schabi.newpipe.extractor.services.youtube.extractors.*;
import org.schabi.newpipe.extractor.services.youtube.extractors.YoutubeCommentsExtractor;
import org.schabi.newpipe.extractor.services.youtube.extractors.YoutubePlaylistExtractor;
import org.schabi.newpipe.extractor.services.youtube.extractors.YoutubeSearchExtractor;
import org.schabi.newpipe.extractor.services.youtube.extractors.YoutubeStreamExtractor;
import org.schabi.newpipe.extractor.services.youtube.extractors.YoutubeSubscriptionExtractor;
import org.schabi.newpipe.extractor.services.youtube.extractors.YoutubeSuggestionExtractor;
import org.schabi.newpipe.extractor.services.youtube.extractors.YoutubeTrendingExtractor;
import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeChannelLinkHandlerFactory; import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeChannelLinkHandlerFactory;
import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeCommentsLinkHandlerFactory; import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeCommentsLinkHandlerFactory;
import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubePlaylistLinkHandlerFactory; import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubePlaylistLinkHandlerFactory;
@ -42,6 +36,8 @@ import org.schabi.newpipe.extractor.stream.StreamExtractor;
import org.schabi.newpipe.extractor.subscription.SubscriptionExtractor; import org.schabi.newpipe.extractor.subscription.SubscriptionExtractor;
import org.schabi.newpipe.extractor.suggestion.SuggestionExtractor; import org.schabi.newpipe.extractor.suggestion.SuggestionExtractor;
import javax.annotation.Nonnull;
/* /*
* Created by Christian Schabesberger on 23.08.15. * Created by Christian Schabesberger on 23.08.15.
* *
@ -147,6 +143,12 @@ public class YoutubeService extends StreamingService {
return new YoutubeSubscriptionExtractor(this); return new YoutubeSubscriptionExtractor(this);
} }
/**
 * Creates a {@link YoutubeFeedExtractor} for the given channel url.
 * <p>
 * The url is first converted into a link handler through {@code getChannelLHFactory().fromUrl(channelUrl)}.
 *
 * @param channelUrl the channel url to create the feed extractor for
 * @return a newly created {@link YoutubeFeedExtractor}
 * @throws ExtractionException presumably raised when the url cannot be converted into a link handler
 *                             (NOTE(review): confirm against the channel link handler factory)
 */
@Nonnull
@Override
public FeedExtractor getFeedExtractor(final String channelUrl) throws ExtractionException {
return new YoutubeFeedExtractor(this, getChannelLHFactory().fromUrl(channelUrl));
}
@Override @Override
public ListLinkHandlerFactory getCommentsLHFactory() { public ListLinkHandlerFactory getCommentsLHFactory() {
return YoutubeCommentsLinkHandlerFactory.getInstance(); return YoutubeCommentsLinkHandlerFactory.getInstance();

View file

@ -46,7 +46,6 @@ import java.io.IOException;
@SuppressWarnings("WeakerAccess") @SuppressWarnings("WeakerAccess")
public class YoutubeChannelExtractor extends ChannelExtractor { public class YoutubeChannelExtractor extends ChannelExtractor {
/*package-private*/ static final String CHANNEL_URL_BASE = "https://www.youtube.com/channel/"; /*package-private*/ static final String CHANNEL_URL_BASE = "https://www.youtube.com/channel/";
private static final String CHANNEL_FEED_BASE = "https://www.youtube.com/feeds/videos.xml?channel_id=";
private static final String CHANNEL_URL_PARAMETERS = "/videos?view=0&flow=list&sort=dd&live_view=10000"; private static final String CHANNEL_URL_PARAMETERS = "/videos?view=0&flow=list&sort=dd&live_view=10000";
private Document doc; private Document doc;
@ -130,7 +129,7 @@ public class YoutubeChannelExtractor extends ChannelExtractor {
@Override @Override
public String getFeedUrl() throws ParsingException { public String getFeedUrl() throws ParsingException {
try { try {
return CHANNEL_FEED_BASE + getId(); return YoutubeParsingHelper.getFeedUrlFrom(getId());
} catch (Exception e) { } catch (Exception e) {
throw new ParsingException("Could not get feed url", e); throw new ParsingException("Could not get feed url", e);
} }

View file

@ -0,0 +1,82 @@
package org.schabi.newpipe.extractor.services.youtube.extractors;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.schabi.newpipe.extractor.ListExtractor;
import org.schabi.newpipe.extractor.StreamingService;
import org.schabi.newpipe.extractor.downloader.Downloader;
import org.schabi.newpipe.extractor.downloader.Response;
import org.schabi.newpipe.extractor.exceptions.ExtractionException;
import org.schabi.newpipe.extractor.feed.FeedExtractor;
import org.schabi.newpipe.extractor.linkhandler.ListLinkHandler;
import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeParsingHelper;
import org.schabi.newpipe.extractor.stream.StreamInfoItem;
import org.schabi.newpipe.extractor.stream.StreamInfoItemsCollector;
import javax.annotation.Nonnull;
import java.io.IOException;
/**
 * {@link FeedExtractor} that reads a channel's items from the feed document whose url is built
 * by {@link YoutubeParsingHelper#getFeedUrlFrom(String)}.
 * <p>
 * The feed is exposed as a single page: there is never a next page.
 */
public class YoutubeFeedExtractor extends FeedExtractor {
    /** Parsed feed document; assigned by {@link #onFetchPage(Downloader)}. */
    private Document document;

    public YoutubeFeedExtractor(StreamingService service, ListLinkHandler linkHandler) {
        super(service, linkHandler);
    }

    /**
     * Downloads the feed that belongs to the link handler's id and parses the response body.
     */
    @Override
    public void onFetchPage(@Nonnull Downloader downloader) throws IOException, ExtractionException {
        final String feedUrl = YoutubeParsingHelper.getFeedUrlFrom(getLinkHandler().getId());
        final String feedBody = downloader.get(feedUrl).responseBody();
        document = Jsoup.parse(feedBody);
    }

    /**
     * Commits one {@link YoutubeFeedInfoItemExtractor} per {@code feed > entry} element.
     *
     * @return a page holding the collected items and a null next page url
     */
    @Nonnull
    @Override
    public ListExtractor.InfoItemsPage<StreamInfoItem> getInitialPage() {
        final Elements feedEntries = document.select("feed > entry");
        final StreamInfoItemsCollector itemsCollector = new StreamInfoItemsCollector(getServiceId());

        for (Element feedEntry : feedEntries) {
            itemsCollector.commit(new YoutubeFeedInfoItemExtractor(feedEntry));
        }

        return new InfoItemsPage<>(itemsCollector, null);
    }

    /** Text of the first {@code yt:channelId} element of the feed. */
    @Nonnull
    @Override
    public String getId() {
        final Element channelId = document.getElementsByTag("yt:channelId").first();
        return channelId.text();
    }

    /** Text of the feed author's {@code uri} element. */
    @Nonnull
    @Override
    public String getUrl() {
        return firstText("feed > author > uri");
    }

    /** Text of the feed author's {@code name} element. */
    @Nonnull
    @Override
    public String getName() {
        return firstText("feed > author > name");
    }

    /** Always null: the feed has no further pages. */
    @Override
    public String getNextPageUrl() {
        return null;
    }

    /** Always null: the feed has no further pages. */
    @Override
    public InfoItemsPage<StreamInfoItem> getPage(String pageUrl) {
        return null;
    }

    /** Always false: the feed has no further pages. */
    @Override
    public boolean hasNextPage() {
        return false;
    }

    // Text of the first element in the feed document matching the given selector.
    private String firstText(String cssQuery) {
        return document.select(cssQuery).first().text();
    }
}

View file

@ -0,0 +1,94 @@
package org.schabi.newpipe.extractor.services.youtube.extractors;
import org.jsoup.nodes.Element;
import org.schabi.newpipe.extractor.exceptions.ParsingException;
import org.schabi.newpipe.extractor.localization.DateWrapper;
import org.schabi.newpipe.extractor.stream.StreamInfoItemExtractor;
import org.schabi.newpipe.extractor.stream.StreamType;
import javax.annotation.Nullable;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.TimeZone;
/**
 * {@link StreamInfoItemExtractor} that reads its values from a single {@code entry} element of a feed.
 */
public class YoutubeFeedInfoItemExtractor implements StreamInfoItemExtractor {
    private final Element entryElement;

    public YoutubeFeedInfoItemExtractor(Element entryElement) {
        this.entryElement = entryElement;
    }

    @Override
    public StreamType getStreamType() {
        // It is not possible to determine the stream type using the feed endpoint.
        // All entries are considered a video stream.
        return StreamType.VIDEO_STREAM;
    }

    @Override
    public boolean isAd() {
        return false;
    }

    @Override
    public long getDuration() {
        // Not available when fetching through the feed endpoint.
        return -1;
    }

    /** Parses the {@code views} attribute of the entry's {@code media:statistics} element. */
    @Override
    public long getViewCount() {
        final String views = firstByTag("media:statistics").attr("views");
        return Long.parseLong(views);
    }

    @Override
    public String getUploaderName() {
        return firstSelectedText("author > name");
    }

    @Override
    public String getUploaderUrl() {
        return firstSelectedText("author > uri");
    }

    /** Text of the entry's {@code published} element, as written in the feed. */
    @Nullable
    @Override
    public String getTextualUploadDate() {
        return firstByTag("published").text();
    }

    /**
     * Parses the textual upload date with the fixed pattern {@code yyyy-MM-dd'T'HH:mm:ss+00:00},
     * read as UTC.
     *
     * @throws ParsingException if the text does not match the pattern
     */
    @Nullable
    @Override
    public DateWrapper getUploadDate() throws ParsingException {
        final String publishedText = getTextualUploadDate();

        final SimpleDateFormat feedDateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss+00:00");
        feedDateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));

        final Date parsedDate;
        try {
            parsedDate = feedDateFormat.parse(publishedText);
        } catch (ParseException e) {
            throw new ParsingException("Could not parse date (\"" + publishedText + "\")", e);
        }

        final Calendar uploadCalendar = Calendar.getInstance();
        uploadCalendar.setTime(parsedDate);
        return new DateWrapper(uploadCalendar);
    }

    @Override
    public String getName() {
        return firstByTag("title").text();
    }

    @Override
    public String getUrl() {
        return firstByTag("link").attr("href");
    }

    @Override
    public String getThumbnailUrl() {
        return firstByTag("media:thumbnail").attr("url");
    }

    // First element inside the entry carrying the given tag name.
    private Element firstByTag(String tagName) {
        return entryElement.getElementsByTag(tagName).first();
    }

    // Text of the first element inside the entry matching the given selector.
    private String firstSelectedText(String cssQuery) {
        return entryElement.select(cssQuery).first().text();
    }
}

View file

@ -22,8 +22,8 @@ import org.schabi.newpipe.extractor.exceptions.ExtractionException;
import org.schabi.newpipe.extractor.exceptions.ParsingException; import org.schabi.newpipe.extractor.exceptions.ParsingException;
import org.schabi.newpipe.extractor.exceptions.ReCaptchaException; import org.schabi.newpipe.extractor.exceptions.ReCaptchaException;
import org.schabi.newpipe.extractor.linkhandler.LinkHandler; import org.schabi.newpipe.extractor.linkhandler.LinkHandler;
import org.schabi.newpipe.extractor.localization.TimeAgoParser;
import org.schabi.newpipe.extractor.localization.DateWrapper; import org.schabi.newpipe.extractor.localization.DateWrapper;
import org.schabi.newpipe.extractor.localization.TimeAgoParser;
import org.schabi.newpipe.extractor.services.youtube.ItagItem; import org.schabi.newpipe.extractor.services.youtube.ItagItem;
import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeParsingHelper; import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeParsingHelper;
import org.schabi.newpipe.extractor.stream.*; import org.schabi.newpipe.extractor.stream.*;
@ -152,7 +152,7 @@ public class YoutubeStreamExtractor extends StreamExtractor {
return null; return null;
} }
return new DateWrapper(YoutubeParsingHelper.parseDateFrom(textualUploadDate)); return new DateWrapper(YoutubeParsingHelper.parseDateFrom(textualUploadDate), true);
} }
@Nonnull @Nonnull

View file

@ -38,6 +38,9 @@ public class YoutubeParsingHelper {
private YoutubeParsingHelper() { private YoutubeParsingHelper() {
} }
// Feed url prefix for lookups by channel id; the id is appended directly after "channel_id=".
private static final String FEED_BASE_CHANNEL_ID = "https://www.youtube.com/feeds/videos.xml?channel_id=";
// Feed url prefix for lookups by user name; the name is appended directly after "user=".
private static final String FEED_BASE_USER = "https://www.youtube.com/feeds/videos.xml?user=";
private static final String[] RECAPTCHA_DETECTION_SELECTORS = { private static final String[] RECAPTCHA_DETECTION_SELECTORS = {
"form[action*=\"/das_captcha\"]", "form[action*=\"/das_captcha\"]",
"input[name*=\"action_recaptcha_verify\"]" "input[name*=\"action_recaptcha_verify\"]"
@ -118,6 +121,16 @@ public class YoutubeParsingHelper {
+ Long.parseLong(seconds); + Long.parseLong(seconds);
} }
/**
 * Builds the feed url for a channel id or a user name.
 * <ul>
 * <li>{@code "user/NAME"} yields the user feed for {@code NAME};</li>
 * <li>{@code "channel/ID"} yields the channel feed for {@code ID};</li>
 * <li>anything else is treated as a bare channel id.</li>
 * </ul>
 * Only the leading {@code user/} or {@code channel/} marker is stripped; the remainder of the
 * value is appended to the feed base url exactly as given.
 *
 * @param channelIdOrUser the channel id or user name, optionally prefixed as described above;
 *                        must not be null
 * @return the feed url for the given channel id or user name
 */
public static String getFeedUrlFrom(final String channelIdOrUser) {
    final String userPrefix = "user/";
    final String channelPrefix = "channel/";

    if (channelIdOrUser.startsWith(userPrefix)) {
        // substring drops just the prefix; String.replace would also remove later occurrences.
        return FEED_BASE_USER + channelIdOrUser.substring(userPrefix.length());
    } else if (channelIdOrUser.startsWith(channelPrefix)) {
        return FEED_BASE_CHANNEL_ID + channelIdOrUser.substring(channelPrefix.length());
    } else {
        return FEED_BASE_CHANNEL_ID + channelIdOrUser;
    }
}
public static Calendar parseDateFrom(String textualUploadDate) throws ParsingException { public static Calendar parseDateFrom(String textualUploadDate) throws ParsingException {
Date date; Date date;
try { try {

View file

@ -0,0 +1,72 @@
package org.schabi.newpipe.extractor.services.youtube;
import org.junit.BeforeClass;
import org.junit.Test;
import org.schabi.newpipe.DownloaderTestImpl;
import org.schabi.newpipe.extractor.NewPipe;
import org.schabi.newpipe.extractor.exceptions.ParsingException;
import org.schabi.newpipe.extractor.services.BaseListExtractorTest;
import org.schabi.newpipe.extractor.services.youtube.extractors.YoutubeFeedExtractor;
import static org.junit.Assert.*;
import static org.schabi.newpipe.extractor.ServiceList.YouTube;
import static org.schabi.newpipe.extractor.services.DefaultTests.defaultTestRelatedItems;
/**
 * Tests for {@link YoutubeFeedExtractor}.
 */
public class YoutubeFeedExtractorTest {
/**
 * Runs the extractor against the feed obtained for the url
 * {@code https://www.youtube.com/user/Kurzgesagt}.
 */
public static class Kurzgesagt implements BaseListExtractorTest {
// Shared by all tests of this class; created and fetched once in setUp().
private static YoutubeFeedExtractor extractor;
@BeforeClass
public static void setUp() throws Exception {
NewPipe.init(DownloaderTestImpl.getInstance());
extractor = (YoutubeFeedExtractor) YouTube
.getFeedExtractor("https://www.youtube.com/user/Kurzgesagt");
extractor.fetchPage();
}
/*//////////////////////////////////////////////////////////////////////////
// Extractor
//////////////////////////////////////////////////////////////////////////*/
@Test
public void testServiceId() {
assertEquals(YouTube.getServiceId(), extractor.getServiceId());
}
@Test
public void testName() {
String name = extractor.getName();
// Only the beginning of the name is checked; the name itself serves as the failure message.
assertTrue(name, name.startsWith("Kurzgesagt"));
}
@Test
public void testId() {
// The extractor was created from a user url, yet the id is expected to be the channel id.
assertEquals("UCsXVk37bltHxD1rDPwtNM8Q", extractor.getId());
}
@Test
public void testUrl() {
// The url is expected in its channel form, not the user form the extractor was created with.
assertEquals("https://www.youtube.com/channel/UCsXVk37bltHxD1rDPwtNM8Q", extractor.getUrl());
}
@Test
public void testOriginalUrl() throws ParsingException {
// The original url is expected to be exactly the url passed in setUp().
assertEquals("https://www.youtube.com/user/Kurzgesagt", extractor.getOriginalUrl());
}
/*//////////////////////////////////////////////////////////////////////////
// ListExtractor
//////////////////////////////////////////////////////////////////////////*/
@Test
public void testRelatedItems() throws Exception {
defaultTestRelatedItems(extractor, YouTube.getServiceId());
}
@Test
public void testMoreRelatedItems() {
// The feed is a single page, so no further page may be reported.
assertFalse(extractor.hasNextPage());
assertNull(extractor.getNextPageUrl());
}
}
}