disable image scraping

due to being IP flagged
This commit is contained in:
buzz-lightsnack-2007 2024-05-25 10:05:44 +08:00
parent 00c0069fa9
commit 349d16b06d

View file

@ -1,5 +1,5 @@
/* reader.js /* reader.js
Read the contents of the page. Read the contents of the page.
*/ */
import net from "/scripts/utils/net.js"; import net from "/scripts/utils/net.js";
@ -8,7 +8,7 @@ export default class scraper {
#options; #options;
/* /*
Scrape fields. Scrape fields.
@param {Object} scraper_fields the fields to scrape @param {Object} scraper_fields the fields to scrape
@param {Object} options the options @param {Object} options the options
@ -19,8 +19,8 @@ export default class scraper {
: false; : false;
this.#options = Object.assign({}, {"scroll": true, "duration": 125, "automatic": true, "background": true}, options); this.#options = Object.assign({}, {"scroll": true, "duration": 125, "automatic": true, "background": true}, options);
if (this.#options.automatic) { if (this.#options.automatic) {
// Quickly scroll down then to where the user already was to get automatically hidden content. // Quickly scroll down then to where the user already was to get automatically hidden content.
async function autoscroll(options) { async function autoscroll(options) {
let SCROLL = {"x": parseInt(window.scrollX), "y": parseInt(window.scrollY)}; let SCROLL = {"x": parseInt(window.scrollX), "y": parseInt(window.scrollY)};
let DURATION = Math.abs(options[`duration`]); let DURATION = Math.abs(options[`duration`]);
@ -35,7 +35,7 @@ export default class scraper {
}); });
} }
// Scroll two times to check for updated data. // Scroll two times to check for updated data.
for (let SCROLLS = 1; SCROLLS <= 2; SCROLLS++) { for (let SCROLLS = 1; SCROLLS <= 2; SCROLLS++) {
for (const POSITION of [{"top": document.body.scrollHeight, "left": document.body.scrollWidth}, {"top": 0, "left": 0}]) { for (const POSITION of [{"top": document.body.scrollHeight, "left": document.body.scrollWidth}, {"top": 0, "left": 0}]) {
await go(POSITION, DURATION); await go(POSITION, DURATION);
@ -73,7 +73,7 @@ export default class scraper {
this.getTexts(this.fields, this.#options); this.getTexts(this.fields, this.#options);
this.getImages(this.fields, this.#options); this.getImages(this.fields, this.#options);
}); });
// Observe the document. // Observe the document.
OBSERVER.observe(document.body, {"childList": true, "subtree": true}); OBSERVER.observe(document.body, {"childList": true, "subtree": true});
} }
@ -94,19 +94,19 @@ export default class scraper {
/* Read for the particular fields. */ /* Read for the particular fields. */
function read(fields) { function read(fields) {
let DATA = {}; // Store here the resulting data let DATA = {}; // Store here the resulting data
(Object.keys(fields)).forEach((NAME) => { (Object.keys(fields)).forEach((NAME) => {
// Remove trailing spaces within the name. // Remove trailing spaces within the name.
NAME = (typeof NAME).includes(`str`) ? NAME.trim() : NAME; NAME = (typeof NAME).includes(`str`) ? NAME.trim() : NAME;
// Set the referring value. // Set the referring value.
let VALUE = fields[NAME]; let VALUE = fields[NAME];
VALUE = (typeof VALUE).includes(`str`) ? VALUE.trim() : VALUE; VALUE = (typeof VALUE).includes(`str`) ? VALUE.trim() : VALUE;
if (VALUE && NAME) { if (VALUE && NAME) {
// Check if array. // Check if array.
if ((Array.isArray(VALUE)) ? VALUE.length : false) { if ((Array.isArray(VALUE)) ? VALUE.length : false) {
// Temporarily create an empty list. // Temporarily create an empty list.
DATA[NAME] = []; DATA[NAME] = [];
VALUE.forEach((PARTICULAR) => { VALUE.forEach((PARTICULAR) => {
@ -114,7 +114,7 @@ export default class scraper {
DATA[NAME].push(read(PARTICULAR)); DATA[NAME].push(read(PARTICULAR));
} else { } else {
let ELEMENTS = [...(document.querySelectorAll(PARTICULAR))]; let ELEMENTS = [...(document.querySelectorAll(PARTICULAR))];
(ELEMENTS && ELEMENTS.length) (ELEMENTS && ELEMENTS.length)
? (ELEMENTS).forEach((ELEMENT) => { ? (ELEMENTS).forEach((ELEMENT) => {
DATA[NAME].push(ELEMENT.textContent.trim()); DATA[NAME].push(ELEMENT.textContent.trim());
@ -135,18 +135,18 @@ export default class scraper {
return DATA; return DATA;
}; };
// Determine and set the appropriate field source. // Determine and set the appropriate field source.
let FIELDS = (((typeof fields).includes(`obj`) && fields) ? Object.keys(fields).length : false) ? fields : this.fields; let FIELDS = (((typeof fields).includes(`obj`) && fields) ? Object.keys(fields).length : false) ? fields : this.fields;
((((typeof options).includes(`obj`) && options) ? Object.hasOwn(`update`) : false) ? options[`update`] : true) ((((typeof options).includes(`obj`) && options) ? Object.hasOwn(`update`) : false) ? options[`update`] : true)
? this.fields = FIELDS ? this.fields = FIELDS
: null; : null;
// Read the fields. // Read the fields.
(FIELDS) (FIELDS)
? CONTENT = read(FIELDS) ? CONTENT = read(FIELDS)
: false; : false;
// Set the data if the options doesn't indicate otherwise. // Set the data if the options doesn't indicate otherwise.
(((((typeof options).includes(`obj`) && options) ? Object.hasOwn(`update`) : false) ? options[`update`] : true) && CONTENT) (((((typeof options).includes(`obj`) && options) ? Object.hasOwn(`update`) : false) ? options[`update`] : true) && CONTENT)
? this.texts = CONTENT ? this.texts = CONTENT
: false; : false;
@ -154,135 +154,140 @@ export default class scraper {
}; };
/* /*
Scrape the images from a page. Scrape the images from a page.
It's temporarily disabled because scraping the images resulted in the IP address being flagged. Also, its output is not yet implemented. This is a future point of expansion (Crit E).
@param {Object} fields the fields to scrape @param {Object} fields the fields to scrape
@param {Object} options the options @param {Object} options the options
@return {Object} the blob of the images @return {Object} the blob of the images
*/ */
async getImages(fields, options) { async getImages(fields, options) {
let CONTENT; let DISABLE = true // This is how to disable it
/* if (!DISABLE) {
Get the blob of the image in an element. let CONTENT;
@param {Element} element the element to get the blob from
@return {Blob} the blob of the image
*/
async function blobbify(element) {
/*
Get the URL of the image.
@param {Element} element the element to get the URL from
@return {String} the URL of the image
*/
function reference(element) {
let LOCATION;
// Get using standard attributes.
LOCATION = element.getAttribute(`src`);
if (!LOCATION) {
// Use the CSS background image.
(window.getComputedStyle(element).backgroundImage)
? LOCATION = window.getComputedStyle(element).backgroundImage.slice(4, -1).replace(/"/g, "")
: false;
}
// Return the location.
return LOCATION;
}
/* /*
Get the blob from the URL. Get the blob of the image in an element.
@param {String} URL the URL to get the blob from @param {Element} element the element to get the blob from
@return {Blob} the blob of the image @return {Blob} the blob of the image
*/ */
function getBlob(URL) { async function blobbify(element) {
return(net.download(URL, `blob`)); /*
} Get the URL of the image.
let LOCATION = reference(element); @param {Element} element the element to get the URL from
let BLOB = await getBlob(LOCATION); @return {String} the URL of the image
*/
function reference(element) {
let LOCATION;
return ((BLOB.type.includes(`image`)) ? BLOB : null); // Get using standard attributes.
} LOCATION = element.getAttribute(`src`);
/* Read for the particular fields. */ if (!LOCATION) {
async function read(fields) { // Use the CSS background image.
/* (window.getComputedStyle(element).backgroundImage)
Select all images from an element and get their blobs. ? LOCATION = window.getComputedStyle(element).backgroundImage.slice(4, -1).replace(/"/g, "")
: false;
@param {Element} element the element to get the images from
@return {Array} the blobs of the images
*/
async function select(element) {
let IMAGES = [...element.querySelectorAll(`*`)];
let BLOBS = [];
if (IMAGES && IMAGES.length) {
for (let IMAGE of IMAGES) {
let BLOB = await blobbify(IMAGE);
(BLOB) ? BLOBS.push(BLOB) : false;
} }
// Return the location.
return LOCATION;
} }
return BLOBS; /*
Get the blob from the URL.
@param {String} URL the URL to get the blob from
@return {Blob} the blob of the image
*/
function getBlob(URL) {
return(net.download(URL, `blob`));
}
let LOCATION = reference(element);
let BLOB = await getBlob(LOCATION);
return ((BLOB.type.includes(`image`)) ? BLOB : null);
} }
let DATA = []; // Store here the resulting data /* Read for the particular fields. */
async function read(fields) {
/*
Select all images from an element and get their blobs.
for (let NAME of Object.keys(fields)) { @param {Element} element the element to get the images from
// Remove trailing spaces within the name. @return {Array} the blobs of the images
NAME = (typeof NAME).includes(`str`) ? NAME.trim() : NAME; */
let VALUE = fields[NAME]; async function select(element) {
let IMAGES = [...element.querySelectorAll(`*`)];
let BLOBS = [];
if (VALUE && NAME) { if (IMAGES && IMAGES.length) {
// Check if array. for (let IMAGE of IMAGES) {
if (Array.isArray(VALUE)) { let BLOB = await blobbify(IMAGE);
// Temporarily create an empty list. (BLOB) ? BLOBS.push(BLOB) : false;
for (let PARTICULAR of VALUE) { }
if ((typeof PARTICULAR).includes(`obj`) && PARTICULAR && !Array.isArray(PARTICULAR)) { }
DATA = [...DATA, ...(await read(PARTICULAR))];
} else {
let ELEMENTS = [...(document.querySelectorAll(PARTICULAR))];
if (ELEMENTS && ELEMENTS.length) { return BLOBS;
for (let ELEMENT of ELEMENTS) { }
let BLOBS = await select(ELEMENT);
if (BLOBS && BLOBS.length) DATA = [...DATA, ...BLOBS]; let DATA = []; // Store here the resulting data
for (let NAME of Object.keys(fields)) {
// Remove trailing spaces within the name.
NAME = (typeof NAME).includes(`str`) ? NAME.trim() : NAME;
let VALUE = fields[NAME];
if (VALUE && NAME) {
// Check if array.
if (Array.isArray(VALUE)) {
// Temporarily create an empty list.
for (let PARTICULAR of VALUE) {
if ((typeof PARTICULAR).includes(`obj`) && PARTICULAR && !Array.isArray(PARTICULAR)) {
DATA = [...DATA, ...(await read(PARTICULAR))];
} else {
let ELEMENTS = [...(document.querySelectorAll(PARTICULAR))];
if (ELEMENTS && ELEMENTS.length) {
for (let ELEMENT of ELEMENTS) {
let BLOBS = await select(ELEMENT);
if (BLOBS && BLOBS.length) DATA = [...DATA, ...BLOBS];
}
} }
} }
} }
} } else if ((typeof VALUE).includes(`obj`) && VALUE) {
} else if ((typeof VALUE).includes(`obj`) && VALUE) { DATA = [...DATA, ...(await read(VALUE))];
DATA = [...DATA, ...(await read(VALUE))]; } else if (document.querySelector(VALUE)) {
} else if (document.querySelector(VALUE)) { let ELEMENTS = [...(document.querySelectorAll(VALUE))];
let ELEMENTS = [...(document.querySelectorAll(VALUE))];
if (ELEMENTS && ELEMENTS.length) { if (ELEMENTS && ELEMENTS.length) {
for (let ELEMENT of ELEMENTS) { for (let ELEMENT of ELEMENTS) {
let BLOBS = await select(ELEMENT); let BLOBS = await select(ELEMENT);
if (BLOBS && BLOBS.length) DATA = [...DATA, ...BLOBS]; if (BLOBS && BLOBS.length) DATA = [...DATA, ...BLOBS];
}
} }
} }
} }
} }
}
return (DATA); return (DATA);
}; };
// Read the fields. // Read the fields.
(((typeof fields).includes(`obj`) && fields) ? Object.keys(fields).length : false) (((typeof fields).includes(`obj`) && fields) ? Object.keys(fields).length : false)
? CONTENT = await read(fields) ? CONTENT = await read(fields)
: false; : false;
// Set the data if the options doesn't indicate otherwise. // Set the data if the options doesn't indicate otherwise.
(((((typeof options).includes(`obj`) && options) ? Object.hasOwn(`update`) : false) ? options[`update`] : true) && CONTENT) (((((typeof options).includes(`obj`) && options) ? Object.hasOwn(`update`) : false) ? options[`update`] : true) && CONTENT)
? this.images = CONTENT ? this.images = CONTENT
: false; : false;
return (CONTENT); return (CONTENT);
}
} }
} }