Merge pull request #794 from FireMasterK/comments-count

[YouTube] Add support to extract total comment count
This commit is contained in:
Stypox 2023-01-11 15:32:19 +01:00 committed by GitHub
commit c1040bccac
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 126 additions and 59 deletions

View file

@ -22,6 +22,13 @@ public abstract class CommentsExtractor extends ListExtractor<CommentsInfoItem>
return false;
}
/**
* @return the total number of comments
*/
public int getCommentsCount() throws ExtractionException {
return -1;
}
@Nonnull
@Override
public String getName() throws ParsingException {

View file

@ -48,6 +48,11 @@ public final class CommentsInfo extends ListInfo<CommentsInfoItem> {
ExtractorHelper.getItemsPageOrLogError(commentsInfo, commentsExtractor);
commentsInfo.setCommentsDisabled(commentsExtractor.isCommentsDisabled());
commentsInfo.setRelatedItems(initialCommentsPage.getItems());
try {
commentsInfo.setCommentsCount(commentsExtractor.getCommentsCount());
} catch (final Exception e) {
commentsInfo.addError(e);
}
commentsInfo.setNextPage(initialCommentsPage.getNextPage());
return commentsInfo;
@ -76,6 +81,7 @@ public final class CommentsInfo extends ListInfo<CommentsInfoItem> {
private transient CommentsExtractor commentsExtractor;
private boolean commentsDisabled = false;
private int commentsCount;
public CommentsExtractor getCommentsExtractor() {
return commentsExtractor;
@ -86,7 +92,6 @@ public final class CommentsInfo extends ListInfo<CommentsInfoItem> {
}
/**
* @apiNote Warning: This method is experimental and may get removed in a future release.
* @return {@code true} if the comments are disabled otherwise {@code false} (default)
* @see CommentsExtractor#isCommentsDisabled()
*/
@ -95,10 +100,27 @@ public final class CommentsInfo extends ListInfo<CommentsInfoItem> {
}
/**
* @apiNote Warning: This method is experimental and may get removed in a future release.
* @param commentsDisabled {@code true} if the comments are disabled otherwise {@code false}
*/
public void setCommentsDisabled(final boolean commentsDisabled) {
this.commentsDisabled = commentsDisabled;
}
/**
* Returns the total number of comments.
*
* @return the total number of comments
*/
public int getCommentsCount() {
return commentsCount;
}
/**
* Sets the total number of comments.
*
* @param commentsCount the commentsCount to set.
*/
public void setCommentsCount(final int commentsCount) {
this.commentsCount = commentsCount;
}
}

View file

@ -1,18 +1,8 @@
package org.schabi.newpipe.extractor.services.youtube.extractors;
import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.getJsonPostResponse;
import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.prepareDesktopJsonBuilder;
import static org.schabi.newpipe.extractor.utils.Utils.isNullOrEmpty;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.List;
import java.util.Optional;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import com.grack.nanojson.JsonArray;
import com.grack.nanojson.JsonObject;
import com.grack.nanojson.JsonWriter;
import org.schabi.newpipe.extractor.Page;
import org.schabi.newpipe.extractor.StreamingService;
import org.schabi.newpipe.extractor.comments.CommentsExtractor;
@ -24,26 +14,31 @@ import org.schabi.newpipe.extractor.exceptions.ParsingException;
import org.schabi.newpipe.extractor.linkhandler.ListLinkHandler;
import org.schabi.newpipe.extractor.localization.Localization;
import org.schabi.newpipe.extractor.utils.JsonUtils;
import org.schabi.newpipe.extractor.utils.Utils;
import com.grack.nanojson.JsonArray;
import com.grack.nanojson.JsonObject;
import com.grack.nanojson.JsonWriter;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.List;
import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.getJsonPostResponse;
import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.getTextFromObject;
import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.prepareDesktopJsonBuilder;
import static org.schabi.newpipe.extractor.utils.Utils.isNullOrEmpty;
public class YoutubeCommentsExtractor extends CommentsExtractor {
private JsonObject nextResponse;
/**
* Whether comments are disabled on video.
*/
private boolean commentsDisabled;
/**
* Caching mechanism and holder of the commentsDisabled value.
* <br/>
* Initial value = empty -> unknown if comments are disabled or not<br/>
* Some method calls {@link #findInitialCommentsToken()}
* -> value is set<br/>
* If the method or another one that is depending on disabled comments
* is now called again, the method execution can avoid unnecessary calls
* The second ajax <b>/next</b> response.
*/
@SuppressWarnings("OptionalUsedAsFieldOrParameterType")
private Optional<Boolean> optCommentsDisabled = Optional.empty();
private JsonObject ajaxJson;
public YoutubeCommentsExtractor(
final StreamingService service,
@ -56,32 +51,25 @@ public class YoutubeCommentsExtractor extends CommentsExtractor {
public InfoItemsPage<CommentsInfoItem> getInitialPage()
throws IOException, ExtractionException {
// Check if findInitialCommentsToken was already called and optCommentsDisabled initialized
if (optCommentsDisabled.orElse(false)) {
if (commentsDisabled) {
return getInfoItemsPageForDisabledComments();
}
// Get the token
final String commentsToken = findInitialCommentsToken();
// Check if the comments have been disabled
if (optCommentsDisabled.get()) {
return getInfoItemsPageForDisabledComments();
}
return getPage(getNextPage(commentsToken));
return extractComments(ajaxJson);
}
/**
* Finds the initial comments token and initializes commentsDisabled.
* <br/>
* Also sets {@link #optCommentsDisabled}.
* Also sets {@link #commentsDisabled}.
*
* @return the continuation token or null if none was found
*/
@Nullable
private String findInitialCommentsToken() throws ExtractionException {
private String findInitialCommentsToken(final JsonObject nextResponse)
throws ExtractionException {
final String token = JsonUtils.getArray(nextResponse,
"contents.twoColumnWatchNextResults.results.results.contents")
"contents.twoColumnWatchNextResults.results.results.contents")
.stream()
// Only use JsonObjects
.filter(JsonObject.class::isInstance)
@ -112,7 +100,7 @@ public class YoutubeCommentsExtractor extends CommentsExtractor {
.orElse(null);
// The comments are disabled if we couldn't get a token
optCommentsDisabled = Optional.of(token == null);
commentsDisabled = token == null;
return token;
}
@ -123,9 +111,9 @@ public class YoutubeCommentsExtractor extends CommentsExtractor {
}
@Nullable
private Page getNextPage(@Nonnull final JsonObject ajaxJson) throws ExtractionException {
private Page getNextPage(@Nonnull final JsonObject jsonObject) throws ExtractionException {
final JsonArray onResponseReceivedEndpoints =
ajaxJson.getArray("onResponseReceivedEndpoints");
jsonObject.getArray("onResponseReceivedEndpoints");
// Prevent ArrayIndexOutOfBoundsException
if (onResponseReceivedEndpoints.isEmpty()) {
@ -173,30 +161,39 @@ public class YoutubeCommentsExtractor extends CommentsExtractor {
@Override
public InfoItemsPage<CommentsInfoItem> getPage(final Page page)
throws IOException, ExtractionException {
if (optCommentsDisabled.orElse(false)) {
if (commentsDisabled) {
return getInfoItemsPageForDisabledComments();
}
if (page == null || isNullOrEmpty(page.getId())) {
throw new IllegalArgumentException("Page doesn't have the continuation.");
}
final Localization localization = getExtractorLocalization();
// @formatter:off
final byte[] body = JsonWriter.string(
prepareDesktopJsonBuilder(localization, getExtractorContentCountry())
.value("continuation", page.getId())
.done())
.getBytes(StandardCharsets.UTF_8);
// @formatter:on
final JsonObject ajaxJson = getJsonPostResponse("next", body, localization);
final var jsonObject = getJsonPostResponse("next", body, localization);
final CommentsInfoItemsCollector collector = new CommentsInfoItemsCollector(
getServiceId());
collectCommentsFrom(collector, ajaxJson);
return new InfoItemsPage<>(collector, getNextPage(ajaxJson));
return extractComments(jsonObject);
}
private void collectCommentsFrom(final CommentsInfoItemsCollector collector,
@Nonnull final JsonObject ajaxJson) throws ParsingException {
private InfoItemsPage<CommentsInfoItem> extractComments(final JsonObject jsonObject)
throws ExtractionException {
final CommentsInfoItemsCollector collector = new CommentsInfoItemsCollector(
getServiceId());
collectCommentsFrom(collector);
return new InfoItemsPage<>(collector, getNextPage(jsonObject));
}
private void collectCommentsFrom(final CommentsInfoItemsCollector collector)
throws ParsingException {
final JsonArray onResponseReceivedEndpoints =
ajaxJson.getArray("onResponseReceivedEndpoints");
@ -254,24 +251,59 @@ public class YoutubeCommentsExtractor extends CommentsExtractor {
public void onFetchPage(@Nonnull final Downloader downloader)
throws IOException, ExtractionException {
final Localization localization = getExtractorLocalization();
// @formatter:off
final byte[] body = JsonWriter.string(
prepareDesktopJsonBuilder(localization, getExtractorContentCountry())
.value("videoId", getId())
.done())
.getBytes(StandardCharsets.UTF_8);
// @formatter:on
nextResponse = getJsonPostResponse("next", body, localization);
final String initialToken =
findInitialCommentsToken(getJsonPostResponse("next", body, localization));
if (initialToken == null) {
return;
}
// @formatter:off
final byte[] ajaxBody = JsonWriter.string(
prepareDesktopJsonBuilder(localization, getExtractorContentCountry())
.value("continuation", initialToken)
.done())
.getBytes(StandardCharsets.UTF_8);
// @formatter:on
ajaxJson = getJsonPostResponse("next", ajaxBody, localization);
}
@Override
public boolean isCommentsDisabled() throws ExtractionException {
// Check if commentsDisabled has to be initialized
if (!optCommentsDisabled.isPresent()) {
// Initialize commentsDisabled
this.findInitialCommentsToken();
public boolean isCommentsDisabled() {
return commentsDisabled;
}
@Override
public int getCommentsCount() throws ExtractionException {
assertPageFetched();
if (commentsDisabled) {
return -1;
}
return optCommentsDisabled.get();
final JsonObject countText = ajaxJson
.getArray("onResponseReceivedEndpoints").getObject(0)
.getObject("reloadContinuationItemsCommand")
.getArray("continuationItems").getObject(0)
.getObject("commentsHeaderRenderer")
.getObject("countText");
try {
return Integer.parseInt(
Utils.removeNonDigitCharacters(getTextFromObject(countText))
);
} catch (final Exception e) {
throw new ExtractionException("Unable to get comments count", e);
}
}
}

View file

@ -89,6 +89,7 @@ public class YoutubeCommentsExtractorTest {
@Test
public void testGetCommentsAllData() throws IOException, ExtractionException {
InfoItemsPage<CommentsInfoItem> comments = extractor.getInitialPage();
assertTrue(extractor.getCommentsCount() > 5); // at least 5 comments
DefaultTests.defaultTestListOfItems(YouTube, comments.getItems(), comments.getErrors());
for (CommentsInfoItem c : comments.getItems()) {
@ -344,6 +345,11 @@ public class YoutubeCommentsExtractorTest {
assertNotEquals(UNKNOWN_REPLY_COUNT, firstComment.getReplyCount(), "Could not get the reply count of the first comment");
assertGreater(300, firstComment.getReplyCount());
}
@Test
public void testCommentsCount() throws IOException, ExtractionException {
assertTrue(extractor.getCommentsCount() > 18800);
}
}
public static class FormattingTest {