disable image scraping
due to being IP flagged
This commit is contained in:
parent
00c0069fa9
commit
349d16b06d
1 changed files with 117 additions and 112 deletions
|
@ -1,5 +1,5 @@
|
||||||
/* reader.js
|
/* reader.js
|
||||||
Read the contents of the page.
|
Read the contents of the page.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import net from "/scripts/utils/net.js";
|
import net from "/scripts/utils/net.js";
|
||||||
|
@ -8,7 +8,7 @@ export default class scraper {
|
||||||
#options;
|
#options;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Scrape fields.
|
Scrape fields.
|
||||||
|
|
||||||
@param {Object} scraper_fields the fields to scrape
|
@param {Object} scraper_fields the fields to scrape
|
||||||
@param {Object} options the options
|
@param {Object} options the options
|
||||||
|
@ -19,8 +19,8 @@ export default class scraper {
|
||||||
: false;
|
: false;
|
||||||
this.#options = Object.assign({}, {"scroll": true, "duration": 125, "automatic": true, "background": true}, options);
|
this.#options = Object.assign({}, {"scroll": true, "duration": 125, "automatic": true, "background": true}, options);
|
||||||
|
|
||||||
if (this.#options.automatic) {
|
if (this.#options.automatic) {
|
||||||
// Quickly scroll down then to where the user already was to get automatically hidden content.
|
// Quickly scroll down then to where the user already was to get automatically hidden content.
|
||||||
async function autoscroll(options) {
|
async function autoscroll(options) {
|
||||||
let SCROLL = {"x": parseInt(window.scrollX), "y": parseInt(window.scrollY)};
|
let SCROLL = {"x": parseInt(window.scrollX), "y": parseInt(window.scrollY)};
|
||||||
let DURATION = Math.abs(options[`duration`]);
|
let DURATION = Math.abs(options[`duration`]);
|
||||||
|
@ -35,7 +35,7 @@ export default class scraper {
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// Scroll two times to check for updated data.
|
// Scroll two times to check for updated data.
|
||||||
for (let SCROLLS = 1; SCROLLS <= 2; SCROLLS++) {
|
for (let SCROLLS = 1; SCROLLS <= 2; SCROLLS++) {
|
||||||
for (const POSITION of [{"top": document.body.scrollHeight, "left": document.body.scrollWidth}, {"top": 0, "left": 0}]) {
|
for (const POSITION of [{"top": document.body.scrollHeight, "left": document.body.scrollWidth}, {"top": 0, "left": 0}]) {
|
||||||
await go(POSITION, DURATION);
|
await go(POSITION, DURATION);
|
||||||
|
@ -73,7 +73,7 @@ export default class scraper {
|
||||||
this.getTexts(this.fields, this.#options);
|
this.getTexts(this.fields, this.#options);
|
||||||
this.getImages(this.fields, this.#options);
|
this.getImages(this.fields, this.#options);
|
||||||
});
|
});
|
||||||
|
|
||||||
// Observe the document.
|
// Observe the document.
|
||||||
OBSERVER.observe(document.body, {"childList": true, "subtree": true});
|
OBSERVER.observe(document.body, {"childList": true, "subtree": true});
|
||||||
}
|
}
|
||||||
|
@ -94,19 +94,19 @@ export default class scraper {
|
||||||
/* Read for the particular fields. */
|
/* Read for the particular fields. */
|
||||||
function read(fields) {
|
function read(fields) {
|
||||||
let DATA = {}; // Store here the resulting data
|
let DATA = {}; // Store here the resulting data
|
||||||
|
|
||||||
(Object.keys(fields)).forEach((NAME) => {
|
(Object.keys(fields)).forEach((NAME) => {
|
||||||
// Remove trailing spaces within the name.
|
// Remove trailing spaces within the name.
|
||||||
NAME = (typeof NAME).includes(`str`) ? NAME.trim() : NAME;
|
NAME = (typeof NAME).includes(`str`) ? NAME.trim() : NAME;
|
||||||
|
|
||||||
// Set the referring value.
|
// Set the referring value.
|
||||||
let VALUE = fields[NAME];
|
let VALUE = fields[NAME];
|
||||||
VALUE = (typeof VALUE).includes(`str`) ? VALUE.trim() : VALUE;
|
VALUE = (typeof VALUE).includes(`str`) ? VALUE.trim() : VALUE;
|
||||||
|
|
||||||
if (VALUE && NAME) {
|
if (VALUE && NAME) {
|
||||||
// Check if array.
|
// Check if array.
|
||||||
if ((Array.isArray(VALUE)) ? VALUE.length : false) {
|
if ((Array.isArray(VALUE)) ? VALUE.length : false) {
|
||||||
// Temporarily create an empty list.
|
// Temporarily create an empty list.
|
||||||
DATA[NAME] = [];
|
DATA[NAME] = [];
|
||||||
|
|
||||||
VALUE.forEach((PARTICULAR) => {
|
VALUE.forEach((PARTICULAR) => {
|
||||||
|
@ -114,7 +114,7 @@ export default class scraper {
|
||||||
DATA[NAME].push(read(PARTICULAR));
|
DATA[NAME].push(read(PARTICULAR));
|
||||||
} else {
|
} else {
|
||||||
let ELEMENTS = [...(document.querySelectorAll(PARTICULAR))];
|
let ELEMENTS = [...(document.querySelectorAll(PARTICULAR))];
|
||||||
|
|
||||||
(ELEMENTS && ELEMENTS.length)
|
(ELEMENTS && ELEMENTS.length)
|
||||||
? (ELEMENTS).forEach((ELEMENT) => {
|
? (ELEMENTS).forEach((ELEMENT) => {
|
||||||
DATA[NAME].push(ELEMENT.textContent.trim());
|
DATA[NAME].push(ELEMENT.textContent.trim());
|
||||||
|
@ -135,18 +135,18 @@ export default class scraper {
|
||||||
return DATA;
|
return DATA;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Determine and set the appropriate field source.
|
// Determine and set the appropriate field source.
|
||||||
let FIELDS = (((typeof fields).includes(`obj`) && fields) ? Object.keys(fields).length : false) ? fields : this.fields;
|
let FIELDS = (((typeof fields).includes(`obj`) && fields) ? Object.keys(fields).length : false) ? fields : this.fields;
|
||||||
((((typeof options).includes(`obj`) && options) ? Object.hasOwn(`update`) : false) ? options[`update`] : true)
|
((((typeof options).includes(`obj`) && options) ? Object.hasOwn(`update`) : false) ? options[`update`] : true)
|
||||||
? this.fields = FIELDS
|
? this.fields = FIELDS
|
||||||
: null;
|
: null;
|
||||||
|
|
||||||
// Read the fields.
|
// Read the fields.
|
||||||
(FIELDS)
|
(FIELDS)
|
||||||
? CONTENT = read(FIELDS)
|
? CONTENT = read(FIELDS)
|
||||||
: false;
|
: false;
|
||||||
|
|
||||||
// Set the data if the options doesn't indicate otherwise.
|
// Set the data if the options doesn't indicate otherwise.
|
||||||
(((((typeof options).includes(`obj`) && options) ? Object.hasOwn(`update`) : false) ? options[`update`] : true) && CONTENT)
|
(((((typeof options).includes(`obj`) && options) ? Object.hasOwn(`update`) : false) ? options[`update`] : true) && CONTENT)
|
||||||
? this.texts = CONTENT
|
? this.texts = CONTENT
|
||||||
: false;
|
: false;
|
||||||
|
@ -154,135 +154,140 @@ export default class scraper {
|
||||||
};
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Scrape the images from a page.
|
Scrape the images from a page.
|
||||||
|
It's temporarily disabled because scraping images caused the IP address to be flagged. Also, its output is not yet implemented. This is a future point of expansion (Crit E).
|
||||||
|
|
||||||
@param {Object} fields the fields to scrape
|
@param {Object} fields the fields to scrape
|
||||||
@param {Object} options the options
|
@param {Object} options the options
|
||||||
@return {Object} the blob of the images
|
@return {Object} the blob of the images
|
||||||
*/
|
*/
|
||||||
async getImages(fields, options) {
|
async getImages(fields, options) {
|
||||||
let CONTENT;
|
let DISABLE = true // This is how to disable it
|
||||||
|
|
||||||
/*
|
if (!DISABLE) {
|
||||||
Get the blob of the image in an element.
|
let CONTENT;
|
||||||
|
|
||||||
@param {Element} element the element to get the blob from
|
|
||||||
@return {Blob} the blob of the image
|
|
||||||
*/
|
|
||||||
async function blobbify(element) {
|
|
||||||
/*
|
|
||||||
Get the URL of the image.
|
|
||||||
|
|
||||||
@param {Element} element the element to get the URL from
|
|
||||||
@return {String} the URL of the image
|
|
||||||
*/
|
|
||||||
function reference(element) {
|
|
||||||
let LOCATION;
|
|
||||||
|
|
||||||
// Get using standard attributes.
|
|
||||||
LOCATION = element.getAttribute(`src`);
|
|
||||||
|
|
||||||
if (!LOCATION) {
|
|
||||||
// Use the CSS background image.
|
|
||||||
(window.getComputedStyle(element).backgroundImage)
|
|
||||||
? LOCATION = window.getComputedStyle(element).backgroundImage.slice(4, -1).replace(/"/g, "")
|
|
||||||
: false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Return the location.
|
|
||||||
return LOCATION;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Get the blob from the URL.
|
Get the blob of the image in an element.
|
||||||
|
|
||||||
@param {String} URL the URL to get the blob from
|
@param {Element} element the element to get the blob from
|
||||||
@return {Blob} the blob of the image
|
@return {Blob} the blob of the image
|
||||||
*/
|
*/
|
||||||
function getBlob(URL) {
|
async function blobbify(element) {
|
||||||
return(net.download(URL, `blob`));
|
/*
|
||||||
}
|
Get the URL of the image.
|
||||||
|
|
||||||
let LOCATION = reference(element);
|
@param {Element} element the element to get the URL from
|
||||||
let BLOB = await getBlob(LOCATION);
|
@return {String} the URL of the image
|
||||||
|
*/
|
||||||
|
function reference(element) {
|
||||||
|
let LOCATION;
|
||||||
|
|
||||||
return ((BLOB.type.includes(`image`)) ? BLOB : null);
|
// Get using standard attributes.
|
||||||
}
|
LOCATION = element.getAttribute(`src`);
|
||||||
|
|
||||||
/* Read for the particular fields. */
|
if (!LOCATION) {
|
||||||
async function read(fields) {
|
// Use the CSS background image.
|
||||||
/*
|
(window.getComputedStyle(element).backgroundImage)
|
||||||
Select all images from an element and get their blobs.
|
? LOCATION = window.getComputedStyle(element).backgroundImage.slice(4, -1).replace(/"/g, "")
|
||||||
|
: false;
|
||||||
@param {Element} element the element to get the images from
|
|
||||||
@return {Array} the blobs of the images
|
|
||||||
*/
|
|
||||||
async function select(element) {
|
|
||||||
let IMAGES = [...element.querySelectorAll(`*`)];
|
|
||||||
let BLOBS = [];
|
|
||||||
|
|
||||||
if (IMAGES && IMAGES.length) {
|
|
||||||
for (let IMAGE of IMAGES) {
|
|
||||||
let BLOB = await blobbify(IMAGE);
|
|
||||||
(BLOB) ? BLOBS.push(BLOB) : false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Return the location.
|
||||||
|
return LOCATION;
|
||||||
}
|
}
|
||||||
|
|
||||||
return BLOBS;
|
/*
|
||||||
|
Get the blob from the URL.
|
||||||
|
|
||||||
|
@param {String} URL the URL to get the blob from
|
||||||
|
@return {Blob} the blob of the image
|
||||||
|
*/
|
||||||
|
function getBlob(URL) {
|
||||||
|
return(net.download(URL, `blob`));
|
||||||
|
}
|
||||||
|
|
||||||
|
let LOCATION = reference(element);
|
||||||
|
let BLOB = await getBlob(LOCATION);
|
||||||
|
|
||||||
|
return ((BLOB.type.includes(`image`)) ? BLOB : null);
|
||||||
}
|
}
|
||||||
|
|
||||||
let DATA = []; // Store here the resulting data
|
/* Read for the particular fields. */
|
||||||
|
async function read(fields) {
|
||||||
|
/*
|
||||||
|
Select all images from an element and get their blobs.
|
||||||
|
|
||||||
for (let NAME of Object.keys(fields)) {
|
@param {Element} element the element to get the images from
|
||||||
// Remove trailing spaces within the name.
|
@return {Array} the blobs of the images
|
||||||
NAME = (typeof NAME).includes(`str`) ? NAME.trim() : NAME;
|
*/
|
||||||
let VALUE = fields[NAME];
|
async function select(element) {
|
||||||
|
let IMAGES = [...element.querySelectorAll(`*`)];
|
||||||
|
let BLOBS = [];
|
||||||
|
|
||||||
if (VALUE && NAME) {
|
if (IMAGES && IMAGES.length) {
|
||||||
// Check if array.
|
for (let IMAGE of IMAGES) {
|
||||||
if (Array.isArray(VALUE)) {
|
let BLOB = await blobbify(IMAGE);
|
||||||
// Temporarily create an empty list.
|
(BLOB) ? BLOBS.push(BLOB) : false;
|
||||||
for (let PARTICULAR of VALUE) {
|
}
|
||||||
if ((typeof PARTICULAR).includes(`obj`) && PARTICULAR && !Array.isArray(PARTICULAR)) {
|
}
|
||||||
DATA = [...DATA, ...(await read(PARTICULAR))];
|
|
||||||
} else {
|
|
||||||
let ELEMENTS = [...(document.querySelectorAll(PARTICULAR))];
|
|
||||||
|
|
||||||
if (ELEMENTS && ELEMENTS.length) {
|
return BLOBS;
|
||||||
for (let ELEMENT of ELEMENTS) {
|
}
|
||||||
let BLOBS = await select(ELEMENT);
|
|
||||||
if (BLOBS && BLOBS.length) DATA = [...DATA, ...BLOBS];
|
let DATA = []; // Store here the resulting data
|
||||||
|
|
||||||
|
for (let NAME of Object.keys(fields)) {
|
||||||
|
// Remove trailing spaces within the name.
|
||||||
|
NAME = (typeof NAME).includes(`str`) ? NAME.trim() : NAME;
|
||||||
|
let VALUE = fields[NAME];
|
||||||
|
|
||||||
|
if (VALUE && NAME) {
|
||||||
|
// Check if array.
|
||||||
|
if (Array.isArray(VALUE)) {
|
||||||
|
// Temporarily create an empty list.
|
||||||
|
for (let PARTICULAR of VALUE) {
|
||||||
|
if ((typeof PARTICULAR).includes(`obj`) && PARTICULAR && !Array.isArray(PARTICULAR)) {
|
||||||
|
DATA = [...DATA, ...(await read(PARTICULAR))];
|
||||||
|
} else {
|
||||||
|
let ELEMENTS = [...(document.querySelectorAll(PARTICULAR))];
|
||||||
|
|
||||||
|
if (ELEMENTS && ELEMENTS.length) {
|
||||||
|
for (let ELEMENT of ELEMENTS) {
|
||||||
|
let BLOBS = await select(ELEMENT);
|
||||||
|
if (BLOBS && BLOBS.length) DATA = [...DATA, ...BLOBS];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
} else if ((typeof VALUE).includes(`obj`) && VALUE) {
|
||||||
} else if ((typeof VALUE).includes(`obj`) && VALUE) {
|
DATA = [...DATA, ...(await read(VALUE))];
|
||||||
DATA = [...DATA, ...(await read(VALUE))];
|
} else if (document.querySelector(VALUE)) {
|
||||||
} else if (document.querySelector(VALUE)) {
|
let ELEMENTS = [...(document.querySelectorAll(VALUE))];
|
||||||
let ELEMENTS = [...(document.querySelectorAll(VALUE))];
|
|
||||||
|
|
||||||
if (ELEMENTS && ELEMENTS.length) {
|
if (ELEMENTS && ELEMENTS.length) {
|
||||||
for (let ELEMENT of ELEMENTS) {
|
for (let ELEMENT of ELEMENTS) {
|
||||||
let BLOBS = await select(ELEMENT);
|
let BLOBS = await select(ELEMENT);
|
||||||
if (BLOBS && BLOBS.length) DATA = [...DATA, ...BLOBS];
|
if (BLOBS && BLOBS.length) DATA = [...DATA, ...BLOBS];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
return (DATA);
|
return (DATA);
|
||||||
};
|
};
|
||||||
|
|
||||||
// Read the fields.
|
// Read the fields.
|
||||||
(((typeof fields).includes(`obj`) && fields) ? Object.keys(fields).length : false)
|
(((typeof fields).includes(`obj`) && fields) ? Object.keys(fields).length : false)
|
||||||
? CONTENT = await read(fields)
|
? CONTENT = await read(fields)
|
||||||
: false;
|
: false;
|
||||||
|
|
||||||
// Set the data if the options doesn't indicate otherwise.
|
// Set the data if the options doesn't indicate otherwise.
|
||||||
(((((typeof options).includes(`obj`) && options) ? Object.hasOwn(`update`) : false) ? options[`update`] : true) && CONTENT)
|
(((((typeof options).includes(`obj`) && options) ? Object.hasOwn(`update`) : false) ? options[`update`] : true) && CONTENT)
|
||||||
? this.images = CONTENT
|
? this.images = CONTENT
|
||||||
: false;
|
: false;
|
||||||
return (CONTENT);
|
return (CONTENT);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue