From a12c0e2a37ff2697aa528cb13d69713bc860997b Mon Sep 17 00:00:00 2001 From: B0pol Date: Fri, 17 Jan 2020 12:09:59 +0100 Subject: [PATCH] fix:invidious: getID and onAccpetURl for comments --- .../YoutubeCommentsLinkHandlerFactory.java | 222 ++++++++++++------ .../youtube/YoutubeCommentsExtractorTest.java | 4 +- 2 files changed, 157 insertions(+), 69 deletions(-) diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubeCommentsLinkHandlerFactory.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubeCommentsLinkHandlerFactory.java index 036529ab..8dc8715f 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubeCommentsLinkHandlerFactory.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubeCommentsLinkHandlerFactory.java @@ -1,13 +1,15 @@ package org.schabi.newpipe.extractor.services.youtube.linkHandler; -import java.io.UnsupportedEncodingException; -import java.net.URLDecoder; -import java.util.List; - import org.schabi.newpipe.extractor.exceptions.FoundAdException; import org.schabi.newpipe.extractor.exceptions.ParsingException; import org.schabi.newpipe.extractor.linkhandler.ListLinkHandlerFactory; -import org.schabi.newpipe.extractor.utils.Parser; +import org.schabi.newpipe.extractor.utils.Utils; + +import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.URL; +import java.util.List; public class YoutubeCommentsLinkHandlerFactory extends ListLinkHandlerFactory { @@ -18,84 +20,170 @@ public class YoutubeCommentsLinkHandlerFactory extends ListLinkHandlerFactory { return instance; } + private static String assertIsID(String id) throws ParsingException { + if (id == null || !id.matches("[a-zA-Z0-9_-]{11}")) { + throw new ParsingException("The given string is not a Youtube-Video-ID"); + } + + return id; + } + @Override public String getUrl(String id) { return "https://m.youtube.com/watch?v=" + id; } @Override - public String getId(String url) throws ParsingException, IllegalArgumentException { - if (url.isEmpty()) { - throw new IllegalArgumentException("The url parameter should not be empty"); - } + public String getId(String urlString) throws ParsingException, IllegalArgumentException { + try { + URI uri = new URI(urlString); + String scheme = uri.getScheme(); - String id; - String lowercaseUrl = url.toLowerCase(); - if (lowercaseUrl.contains("youtube")) { - if (url.contains("attribution_link")) { - try { - String escapedQuery = Parser.matchGroup1("u=(.[^&|$]*)", url); - String query = URLDecoder.decode(escapedQuery, "UTF-8"); - id = Parser.matchGroup1("v=" + ID_PATTERN, query); - } catch (UnsupportedEncodingException uee) { - throw new ParsingException("Could not parse attribution_link", uee); + if (scheme != null && (scheme.equals("vnd.youtube") || scheme.equals("vnd.youtube.launch"))) { + String schemeSpecificPart = uri.getSchemeSpecificPart(); + if (schemeSpecificPart.startsWith("//")) { + urlString = "https:" + schemeSpecificPart; + } else { + return assertIsID(schemeSpecificPart); } - } else if (url.contains("vnd.youtube")) { - id = Parser.matchGroup1(ID_PATTERN, url); - } else if (url.contains("embed")) { - id = Parser.matchGroup1("embed/" + ID_PATTERN, url); - } else if (url.contains("googleads")) { - throw new FoundAdException("Error found add: " + url); - } else { - id = Parser.matchGroup1("[?&]v=" + ID_PATTERN, url); } - } else if (lowercaseUrl.contains("youtu.be")) { - if (url.contains("v=")) { - id = Parser.matchGroup1("v=" + ID_PATTERN, url); - } else { - id = Parser.matchGroup1("[Yy][Oo][Uu][Tt][Uu]\\.[Bb][Ee]/" + ID_PATTERN, url); - } - } else if(lowercaseUrl.contains("hooktube")) { - if(lowercaseUrl.contains("&v=") - || lowercaseUrl.contains("?v=")) { - id = Parser.matchGroup1("[?&]v=" + ID_PATTERN, url); - } else if (url.contains("/embed/")) { - id = Parser.matchGroup1("embed/" + ID_PATTERN, url); - } else if (url.contains("/v/")) { - id = Parser.matchGroup1("v/" + ID_PATTERN, url); - } else if (url.contains("/watch/")) { - id = Parser.matchGroup1("watch/" + ID_PATTERN, url); - } else { - throw new ParsingException("Error no suitable url: " + url); - } - } else { - throw new ParsingException("Error no suitable url: " + url); + } catch (URISyntaxException ignored) { } - - if (!id.isEmpty()) { - return id; - } else { - throw new ParsingException("Error could not parse url: " + url); + URL url; + try { + url = Utils.stringToURL(urlString); + } catch (MalformedURLException e) { + throw new IllegalArgumentException("The given URL is not valid"); } + + String host = url.getHost(); + String path = url.getPath(); + // remove leading "/" of URL-path if URL-path is given + if (!path.isEmpty()) { + path = path.substring(1); + } + + if (!Utils.isHTTP(url) || !(YoutubeParsingHelper.isYoutubeURL(url) || + YoutubeParsingHelper.isYoutubeServiceURL(url) || YoutubeParsingHelper.isHooktubeURL(url) || + YoutubeParsingHelper.isInvidioURL(url))) { + if (host.equalsIgnoreCase("googleads.g.doubleclick.net")) { + throw new FoundAdException("Error found ad: " + urlString); + } + + throw new ParsingException("The url is not a Youtube-URL"); + } + + if (YoutubePlaylistLinkHandlerFactory.getInstance().acceptUrl(urlString)) { + throw new ParsingException("Error no suitable url: " + urlString); + } + + // using uppercase instead of lowercase, because toLowercase replaces some unicode characters + // with their lowercase ASCII equivalent. Using toLowercase could result in faultily matching unicode urls. + switch (host.toUpperCase()) { + case "WWW.YOUTUBE-NOCOOKIE.COM": { + if (path.startsWith("embed/")) { + String id = path.split("/")[1]; + + return assertIsID(id); + } + + break; + } + + case "YOUTUBE.COM": + case "WWW.YOUTUBE.COM": + case "M.YOUTUBE.COM": + case "MUSIC.YOUTUBE.COM": { + if (path.equals("attribution_link")) { + String uQueryValue = Utils.getQueryValue(url, "u"); + + URL decodedURL; + try { + decodedURL = Utils.stringToURL("http://www.youtube.com" + uQueryValue); + } catch (MalformedURLException e) { + throw new ParsingException("Error no suitable url: " + urlString); + } + + String viewQueryValue = Utils.getQueryValue(decodedURL, "v"); + return assertIsID(viewQueryValue); + } + + if (path.startsWith("embed/")) { + String id = path.split("/")[1]; + + return assertIsID(id); + } + + String viewQueryValue = Utils.getQueryValue(url, "v"); + return assertIsID(viewQueryValue); + } + + case "YOUTU.BE": { + String viewQueryValue = Utils.getQueryValue(url, "v"); + if (viewQueryValue != null) { + return assertIsID(viewQueryValue); + } + + return assertIsID(path); + } + + case "HOOKTUBE.COM": { + if (path.startsWith("v/")) { + String id = path.substring("v/".length()); + + return assertIsID(id); + } + if (path.startsWith("watch/")) { + String id = path.substring("watch/".length()); + + return assertIsID(id); + } + // there is no break-statement here on purpose so the next code-block gets also run for hooktube + } + + case "WWW.INVIDIO.US": + case "DEV.INVIDIO.US": + case "INVIDIO.US": + case "INVIDIOUS.SNOPYTA.ORG": + case "DE.INVIDIOUS.SNOPYTA.ORG": + case "FI.INVIDIOUS.SNOPYTA.ORG": + case "VID.WXZM.SX": + case "INVIDIOUS.KABI.TK": + case "INVIDIOU.SH": + case "WWW.INVIDIOU.SH": + case "NO.INVIDIOU.SH": + case "INVIDIOUS.ENKIRTON.NET": + case "TUBE.POAL.CO": + case "INVIDIOUS.13AD.DE": + case "YT.ELUKERIO.ORG": { // code-block for hooktube.com and Invidious instances + if (path.equals("watch")) { + String viewQueryValue = Utils.getQueryValue(url, "v"); + if (viewQueryValue != null) { + return assertIsID(viewQueryValue); + } + } + if (path.startsWith("embed/")) { + String id = path.substring("embed/".length()); + + return assertIsID(id); + } + + break; + } + } + + throw new ParsingException("Error no suitable url: " + urlString); } @Override public boolean onAcceptUrl(final String url) throws FoundAdException { - final String lowercaseUrl = url.toLowerCase(); - if (lowercaseUrl.contains("youtube") - || lowercaseUrl.contains("youtu.be") - || lowercaseUrl.contains("hooktube")) { - // bad programming I know - try { - getId(url); - return true; - } catch (FoundAdException fe) { - throw fe; - } catch (ParsingException e) { - return false; - } - } else { + try { + getId(url); + return true; + } catch (FoundAdException fe) { + throw fe; + } catch (ParsingException e) { return false; } } diff --git a/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/YoutubeCommentsExtractorTest.java b/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/YoutubeCommentsExtractorTest.java index 6bf76b61..79f13b6b 100644 --- a/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/YoutubeCommentsExtractorTest.java +++ b/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/YoutubeCommentsExtractorTest.java @@ -27,7 +27,7 @@ public class YoutubeCommentsExtractorTest { public static void setUp() throws Exception { NewPipe.init(DownloaderTestImpl.getInstance()); extractor = (YoutubeCommentsExtractor) YouTube - .getCommentsExtractor("https://www.youtube.com/watch?v=D00Au7k3i6o"); + .getCommentsExtractor("https://www.invidio.us/watch?v=D00Au7k3i6o"); } @Test @@ -47,7 +47,7 @@ public class YoutubeCommentsExtractorTest { @Test public void testGetCommentsFromCommentsInfo() throws IOException, ExtractionException { boolean result = false; - CommentsInfo commentsInfo = CommentsInfo.getInfo("https://www.youtube.com/watch?v=D00Au7k3i6o"); + CommentsInfo commentsInfo = CommentsInfo.getInfo("https://www.invidio.us/watch?v=D00Au7k3i6o"); assertTrue("what the fuck am i doing with my life".equals(commentsInfo.getName())); result = findInComments(commentsInfo.getRelatedItems(), "s1ck m3m3");