[youtube] improve comments extraction performance

- do not parse responseBody twice for the continuation token:
instead, try to get commentsTokenInside with the new pattern ("sectionListRenderer")
and fall back to the old pattern ("commentSectionRenderer") on failure
- do not unescape responseBody multiple times
   -> parse responseBody fewer times
This commit is contained in:
bopol 2021-03-05 13:33:25 +01:00
parent b608587e4d
commit ed850d0688
2 changed files with 18 additions and 16 deletions

View File

@ -824,4 +824,14 @@ public class YoutubeParsingHelper {
return false; return false;
} }
/**
 * Replaces the hex escape sequences for quotes, braces and brackets in a document
 * with the characters they stand for.
 *
 * <p>The following literal, lower-case sequences are replaced, in this order:</p>
 * <ul>
 *     <li>backslash + "x22" becomes a double quote</li>
 *     <li>backslash + "x7b" becomes an opening curly brace</li>
 *     <li>backslash + "x7d" becomes a closing curly brace</li>
 *     <li>backslash + "x5b" becomes an opening square bracket</li>
 *     <li>backslash + "x5d" becomes a closing square bracket</li>
 * </ul>
 *
 * <p>Any other text, including upper-case variants of these sequences, is left
 * untouched.</p>
 *
 * @param doc the document to unescape; must not be {@code null}
 * @return a copy of {@code doc} with the sequences listed above replaced
 */
public static String unescapeDocument(final String doc) {
    // String.replace matches its target literally, so no regular expression has to be
    // compiled and the backslash does not need to be escaped a second time.
    return doc
            .replace("\\x22", "\"")
            .replace("\\x7b", "{")
            .replace("\\x7d", "}")
            .replace("\\x5b", "[")
            .replace("\\x5d", "]");
}
} }

View File

@ -15,6 +15,7 @@ import org.schabi.newpipe.extractor.exceptions.ExtractionException;
import org.schabi.newpipe.extractor.exceptions.ParsingException; import org.schabi.newpipe.extractor.exceptions.ParsingException;
import org.schabi.newpipe.extractor.exceptions.ReCaptchaException; import org.schabi.newpipe.extractor.exceptions.ReCaptchaException;
import org.schabi.newpipe.extractor.linkhandler.ListLinkHandler; import org.schabi.newpipe.extractor.linkhandler.ListLinkHandler;
import org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper;
import org.schabi.newpipe.extractor.utils.JsonUtils; import org.schabi.newpipe.extractor.utils.JsonUtils;
import org.schabi.newpipe.extractor.utils.Parser; import org.schabi.newpipe.extractor.utils.Parser;
@ -46,11 +47,9 @@ public class YoutubeCommentsExtractor extends CommentsExtractor {
@Override @Override
public InfoItemsPage<CommentsInfoItem> getInitialPage() throws IOException, ExtractionException { public InfoItemsPage<CommentsInfoItem> getInitialPage() throws IOException, ExtractionException {
final String commentsTokenInside; String commentsTokenInside = findValue(responseBody, "sectionListRenderer", "}");
if (responseBody.contains("commentSectionRenderer")) { if (!commentsTokenInside.contains("continuation")) {
commentsTokenInside = findValue(responseBody, "commentSectionRenderer", "}"); commentsTokenInside = findValue(responseBody, "commentSectionRenderer", "}");
} else {
commentsTokenInside = findValue(responseBody, "sectionListRenderer", "}");
} }
final String commentsToken = findValue(commentsTokenInside, "continuation\":\"", "\""); final String commentsToken = findValue(commentsTokenInside, "continuation\":\"", "\"");
return getPage(getNextPage(commentsToken)); return getPage(getNextPage(commentsToken));
@ -133,7 +132,7 @@ public class YoutubeCommentsExtractor extends CommentsExtractor {
final Map<String, List<String>> requestHeaders = new HashMap<>(); final Map<String, List<String>> requestHeaders = new HashMap<>();
requestHeaders.put("User-Agent", singletonList(USER_AGENT)); requestHeaders.put("User-Agent", singletonList(USER_AGENT));
final Response response = downloader.get(getUrl(), requestHeaders, getExtractorLocalization()); final Response response = downloader.get(getUrl(), requestHeaders, getExtractorLocalization());
responseBody = response.responseBody(); responseBody = YoutubeParsingHelper.unescapeDocument(response.responseBody());
ytClientVersion = findValue(responseBody, "INNERTUBE_CONTEXT_CLIENT_VERSION\":\"", "\""); ytClientVersion = findValue(responseBody, "INNERTUBE_CONTEXT_CLIENT_VERSION\":\"", "\"");
ytClientName = Parser.matchGroup1(YT_CLIENT_NAME_PATTERN, responseBody); ytClientName = Parser.matchGroup1(YT_CLIENT_NAME_PATTERN, responseBody);
} }
@ -163,16 +162,9 @@ public class YoutubeCommentsExtractor extends CommentsExtractor {
return result.toString(); return result.toString();
} }
private String findValue(String doc, String start, String end) { private String findValue(final String doc, final String start, final String end) {
final String unescaped = doc final int beginIndex = doc.indexOf(start) + start.length();
.replaceAll("\\\\x22", "\"") final int endIndex = doc.indexOf(end, beginIndex);
.replaceAll("\\\\x7b", "{") return doc.substring(beginIndex, endIndex);
.replaceAll("\\\\x7d", "}")
.replaceAll("\\\\x5b", "[")
.replaceAll("\\\\x5d", "]");
final int beginIndex = unescaped.indexOf(start) + start.length();
final int endIndex = unescaped.indexOf(end, beginIndex);
return unescaped.substring(beginIndex, endIndex);
} }
} }