Fix extractor

This commit is contained in:
Fynn Godau 2020-10-08 17:56:03 +02:00
parent cfe88a74c1
commit 81b5e7cf3d
4 changed files with 14 additions and 39 deletions

View file

@ -6,6 +6,8 @@ import com.grack.nanojson.JsonObject;
import com.grack.nanojson.JsonParser;
import com.grack.nanojson.JsonParserException;
import com.grack.nanojson.JsonWriter;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.schabi.newpipe.extractor.NewPipe;
import org.schabi.newpipe.extractor.exceptions.ParsingException;
import org.schabi.newpipe.extractor.exceptions.ReCaptchaException;
@ -20,7 +22,7 @@ import java.util.*;
public class BandcampExtractorHelper {
/**
* <p>Get JSON behind <code>var $variable = </code> out of web page</p>
* <p>Get an attribute of a web page as JSON</p>
*
* <p>Originally a part of bandcampDirect.</p>
*
@ -29,35 +31,10 @@ public class BandcampExtractorHelper {
* @param variable Name of the HTML attribute that holds the JSON
* @return The JsonObject stored in the attribute with this name
*/
public static JsonObject getJSONFromJavaScriptVariables(String html, String variable) throws JsonParserException, ArrayIndexOutOfBoundsException, ParsingException {
String[] part = html.split("var " + variable + " = ");
String firstHalfGone = part[1];
firstHalfGone = firstHalfGone.replaceAll("\" \\+ \"", "");
int position = -1;
int level = 0;
for (char character : firstHalfGone.toCharArray()) {
position++;
switch (character) {
case '{':
level++;
continue;
case '}':
level--;
if (level == 0) {
return JsonParser.object().from(firstHalfGone.substring(0, position + 1)
.replaceAll(" {4}//.+", "") // Remove "for the curious" in JSON
.replaceAll("// xxx: note - don't internationalize this variable", "") // Remove this comment
);
}
}
}
throw new ParsingException("Unexpected HTML: JSON never ends");
public static JsonObject getJsonData(String html, String variable) throws JsonParserException, ArrayIndexOutOfBoundsException, ParsingException {
    // The JSON is embedded as the value of the HTML attribute named by `variable`:
    // parse the page, select the elements carrying that attribute and read its value.
    final String attributeValue = Jsoup.parse(html)
            .getElementsByAttribute(variable)
            .attr(variable);

    // Hand the raw attribute text to the JSON parser; malformed content surfaces
    // as a JsonParserException for the caller to handle.
    return JsonParser.object().from(attributeValue);
}
/**

View file

@ -21,7 +21,7 @@ import javax.annotation.Nonnull;
import java.io.IOException;
import static org.schabi.newpipe.extractor.services.bandcamp.extractors.BandcampExtractorHelper.getImageUrl;
import static org.schabi.newpipe.extractor.services.bandcamp.extractors.BandcampExtractorHelper.getJSONFromJavaScriptVariables;
import static org.schabi.newpipe.extractor.services.bandcamp.extractors.BandcampExtractorHelper.getJsonData;
import static org.schabi.newpipe.extractor.services.bandcamp.extractors.BandcampStreamExtractor.getAlbumInfoJson;
public class BandcampPlaylistExtractor extends PlaylistExtractor {
@ -50,7 +50,7 @@ public class BandcampPlaylistExtractor extends PlaylistExtractor {
trackInfo = albumJson.getArray("trackinfo");
try {
name = getJSONFromJavaScriptVariables(html, "EmbedData").getString("album_title");
name = getJsonData(html, "data-embed").getString("album_title");
} catch (JsonParserException e) {
throw new ParsingException("Faulty JSON; page likely does not contain album data", e);
} catch (ArrayIndexOutOfBoundsException e) {

View file

@ -20,11 +20,7 @@ import org.schabi.newpipe.extractor.stream.*;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
import java.util.Locale;
@ -63,7 +59,7 @@ public class BandcampStreamExtractor extends StreamExtractor {
*/
public static JsonObject getAlbumInfoJson(String html) throws ParsingException {
try {
return BandcampExtractorHelper.getJSONFromJavaScriptVariables(html, "TralbumData");
return BandcampExtractorHelper.getJsonData(html, "data-tralbum");
} catch (JsonParserException e) {
throw new ParsingException("Faulty JSON; page likely does not contain album data", e);
} catch (ArrayIndexOutOfBoundsException e) {
@ -264,7 +260,9 @@ public class BandcampStreamExtractor extends StreamExtractor {
@Override
public String getCategory() {
// Get first tag from html, which is the artist's Genre
// NOTE(review): first() presumably returns null when no element matches, in which
// case the chained call below would throw a NullPointerException — confirm against
// the Jsoup documentation and decide whether pages without tags need handling.
return document.getElementsByAttributeValue("itemprop", "keywords").first().text();
// Take the first element with class "tralbum-tags", then the text of the first
// element with class "tag" found within it.
return document
.getElementsByClass("tralbum-tags").first()
.getElementsByClass("tag").first().text();
}
@Nonnull

View file

@ -25,7 +25,7 @@ public class BandcampChannelLinkHandlerFactory extends ListLinkHandlerFactory {
String response = NewPipe.getDownloader().get(url).responseBody();
// This variable contains band data!
JsonObject bandData = BandcampExtractorHelper.getJSONFromJavaScriptVariables(response, "BandData");
JsonObject bandData = BandcampExtractorHelper.getJsonData(response, "data-band");
return String.valueOf(bandData.getLong("id"));