Fix timestamp links in YouTube video descriptions

For some reason, in NewPipeExtractor,
comments were loaded from JSON by YoutubeCommentsInfoItemExtractor as text,
sent via CommentsInfoItem#getCommentText to NewPipe,
where timestamps are converted to hyperlinks using Linkify:
https://github.com/TeamNewPipe/NewPipe/pull/2168

On the other hand, video descriptions are handled in NewPipeExtractor
by scraping the watch-page HTML.
There, timestamp links were previously mangled (and now properly parsed),
before being sent as HTML via YoutubeStreamExtractor#getDescription
to NewPipe (where HTML gets converted to Spanned).

The logic introduced in this commit is different from the above PR,
since it operates in the extractor, and mutates the HTML DOM
rather than identifying timestamps in plain text via regex.
This commit is contained in:
jimbo1qaz 2019-08-17 20:48:15 -07:00
parent 430da57350
commit e38d906ff9

View file

@ -30,6 +30,8 @@ import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.net.URL; import java.net.URL;
import java.util.*; import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/* /*
* Created by Christian Schabesberger on 06.08.15. * Created by Christian Schabesberger on 06.08.15.
@ -162,14 +164,54 @@ public class YoutubeStreamExtractor extends StreamExtractor {
} }
} }
// Matches the seek call found in the onclick attribute of a timestamp link, e.g.
// onclick="yt.www.watch.player.seekTo(0*3600+00*60+00);return false;"
// Capture groups: 1 = hours (null when the optional "N*3600+" term is absent),
// 2 = minutes, 3 = seconds.
// Observed behavior on YouTube:
// :00 is NOT recognized as a timestamp in description or comments.
// 0:00 is recognized in both description and comments.
// Example video: https://www.youtube.com/watch?v=4cccfDXu1vA
private final static Pattern DESCRIPTION_TIMESTAMP_ONCLICK_REGEX = Pattern.compile(
"seekTo\\("
+ "(?:(\\d+)\\*3600\\+)?" // optional hours term, e.g. "0*3600+"
+ "(\\d+)\\*60\\+" // minutes term, e.g. "00*60+"
+ "(\\d+)" // seconds
+ "\\)");
/**
 * Returns the first argument that is not {@code null}, scanning left to right.
 *
 * @param args candidate values, checked in the order given
 * @param <T> type shared by all candidates
 * @return the first non-null candidate
 * @throws IllegalArgumentException if every candidate is {@code null}
 *                                  (including when no candidates are passed)
 */
@SafeVarargs
private static <T> T coalesce(T... args) {
    int index = 0;
    while (index < args.length) {
        final T candidate = args[index];
        index++;
        if (candidate != null) {
            return candidate;
        }
    }
    // Nothing usable was supplied; callers are expected to pass a non-null fallback last.
    throw new IllegalArgumentException("all arguments to coalesce() were null");
}
private String parseHtmlAndGetFullLinks(String descriptionHtml) private String parseHtmlAndGetFullLinks(String descriptionHtml)
throws MalformedURLException, UnsupportedEncodingException, ParsingException { throws MalformedURLException, UnsupportedEncodingException, ParsingException {
final Document description = Jsoup.parse(descriptionHtml, getUrl()); final Document description = Jsoup.parse(descriptionHtml, getUrl());
for(Element a : description.select("a")) { for(Element a : description.select("a")) {
final String rawUrl = a.attr("abs:href"); final String rawUrl = a.attr("abs:href");
final URL redirectLink = new URL(rawUrl); final URL redirectLink = new URL(rawUrl);
final String queryString = redirectLink.getQuery();
if(queryString != null) { final Matcher onClickTimestamp;
final String queryString;
if ((onClickTimestamp = DESCRIPTION_TIMESTAMP_ONCLICK_REGEX.matcher(a.attr("onclick")))
.find()) {
a.removeAttr("onclick");
String hours = coalesce(onClickTimestamp.group(1), "0");
String minutes = onClickTimestamp.group(2);
String seconds = onClickTimestamp.group(3);
int timestamp = 0;
timestamp += Integer.parseInt(hours) * 3600;
timestamp += Integer.parseInt(minutes) * 60;
timestamp += Integer.parseInt(seconds);
String setTimestamp = "&t=" + timestamp;
// Even after clicking https://youtu.be/...?t=6,
// getUrl() is https://www.youtube.com/watch?v=..., never youtu.be, never &t=.
a.attr("href", getUrl() + setTimestamp);
} else if((queryString = redirectLink.getQuery()) != null) {
// if the query string is null we are not dealing with a redirect link, // if the query string is null we are not dealing with a redirect link,
// so we don't need to override it. // so we don't need to override it.
final String link = final String link =