scrape texts stored differently
This commit is contained in:
parent
7ad7072456
commit
bd5d4ceef0
1 changed file with 133 additions and 59 deletions
152
scripts/external/scraper.js
vendored
152
scripts/external/scraper.js
vendored
|
@ -2,81 +2,155 @@
|
||||||
Read the contents of the page.
|
Read the contents of the page.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import net from "/scripts/utils/net.js";
|
||||||
|
|
||||||
export default class scraper {
|
export default class scraper {
|
||||||
|
#options;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Scrape fields.
|
Scrape fields.
|
||||||
|
|
||||||
@param {Object} scraper_fields the fields to scrape
|
@param {Object} scraper_fields the fields to scrape
|
||||||
@param {Object} options the options
|
@param {Object} options the options
|
||||||
*/
|
*/
|
||||||
constructor(scraper_fields, options = {"wait until available": true}) {
|
constructor(fields, options) {
|
||||||
let field_content;
|
(((typeof fields).includes(`obj`) && fields) ? Object.keys(fields).length : false)
|
||||||
|
? this.fields = fields
|
||||||
|
: false;
|
||||||
|
this.#options = Object.assign({}, {"scroll": true, "duration": 125, "automatic": true, "background": true}, options);
|
||||||
|
|
||||||
|
if (this.#options.automatic) {
|
||||||
// Quickly scroll down then to where the user already was to get automatically hidden content.
|
// Quickly scroll down then to where the user already was to get automatically hidden content.
|
||||||
function autoscroll() {
|
async function autoscroll(options) {
|
||||||
let SCROLL = {"x": parseInt(window.scrollX), "y": parseInt(window.scrollY)};
|
let SCROLL = {"x": parseInt(window.scrollX), "y": parseInt(window.scrollY)};
|
||||||
|
let DURATION = Math.abs(options[`duration`]);
|
||||||
|
|
||||||
// Repeat every ten milliseconds until 3 times.
|
// Repeat every ten milliseconds until 3 times.
|
||||||
|
function go(position, duration) {
|
||||||
|
Object.assign({}, position, {"behavior": `smooth`})
|
||||||
|
|
||||||
for (let SCROLLS = 1; SCROLLS <= 2; SCROLLS++) {
|
return new Promise(resolve => {
|
||||||
[{"top": document.body.scrollHeight, "left": document.body.scrollWidth}, {"top": 0, "left": 0}].forEach(POSITION => {
|
window.scrollTo(position);
|
||||||
setTimeout(() => {
|
setTimeout(resolve, duration);
|
||||||
window.scrollTo(POSITION);
|
|
||||||
}, 10);
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// Scroll back to user's previous position.
|
// Scroll two times to check for updated data.
|
||||||
setTimeout(() => {
|
for (let SCROLLS = 1; SCROLLS <= 2; SCROLLS++) {
|
||||||
window.scrollTo(SCROLL);
|
for (const POSITION of [{"top": document.body.scrollHeight, "left": document.body.scrollWidth}, {"top": 0, "left": 0}]) {
|
||||||
}, 5)
|
await go(POSITION, DURATION);
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
const read = () => {
|
// Scroll back to user's previous position.
|
||||||
if ((typeof scraper_fields).includes("object") && scraper_fields != null && scraper_fields) {
|
setTimeout(() => {window.scrollTo(SCROLL);}, DURATION)
|
||||||
|
};
|
||||||
|
|
||||||
|
// Check every second until autoscroll is done.
|
||||||
|
function wait(OPTIONS) {
|
||||||
|
return new Promise((resolve, reject) => {
|
||||||
|
// Check if autoscroll is done.
|
||||||
|
if (!((typeof window).includes(`undef`))) {
|
||||||
|
autoscroll(OPTIONS);
|
||||||
|
resolve();
|
||||||
|
} else if (OPTIONS[`scroll`]) {
|
||||||
|
setTimeout(() => {
|
||||||
|
wait(OPTIONS).then(resolve).catch(reject);
|
||||||
|
}, 1000);
|
||||||
|
} else {
|
||||||
|
reject();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
wait(this.#options).then(() => {
|
||||||
|
this.getTexts(this.fields, this.#options);
|
||||||
|
this.getImages(this.fields, this.#options);
|
||||||
|
|
||||||
|
if (this.#options.background) {
|
||||||
|
// Event listener when elements are added or removed.
|
||||||
|
const OBSERVER = new MutationObserver((mutations) => {
|
||||||
|
this.getTexts(this.fields, this.#options);
|
||||||
|
this.getImages(this.fields, this.#options);
|
||||||
|
});
|
||||||
|
|
||||||
|
// Observe the document.
|
||||||
|
OBSERVER.observe(document.body, {"childList": true, "subtree": true});
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
Scrape the texts of the page.
|
||||||
|
|
||||||
|
@param {Object} fields the fields to scrape
|
||||||
|
@param {Object} options the options
|
||||||
|
@return {Object} the texts
|
||||||
|
*/
|
||||||
|
getTexts(fields, options) {
|
||||||
|
let CONTENT;
|
||||||
|
|
||||||
/* Read for the particular fields. */
|
/* Read for the particular fields. */
|
||||||
function read(fields) {
|
function read(fields) {
|
||||||
let field_data = {};
|
let DATA = {}; // Store here the resulting data
|
||||||
|
|
||||||
(Object.keys(fields)).forEach((FIELD_NAME) => {
|
(Object.keys(fields)).forEach((NAME) => {
|
||||||
let FIELD = {"name": FIELD_NAME, "value": fields[FIELD_NAME]};
|
// Remove leading and trailing whitespace from the name.
|
||||||
|
NAME = (typeof NAME).includes(`str`) ? NAME.trim() : NAME;
|
||||||
|
|
||||||
if (FIELD[`value`]) {
|
// Set the referring value.
|
||||||
|
let VALUE = fields[NAME];
|
||||||
|
VALUE = (typeof VALUE).includes(`str`) ? VALUE.trim() : VALUE;
|
||||||
|
|
||||||
|
if (VALUE && NAME) {
|
||||||
// Check if array.
|
// Check if array.
|
||||||
if (Array.isArray(FIELD[`value`])) {
|
if ((Array.isArray(VALUE)) ? VALUE.length : false) {
|
||||||
// Temporarily create an empty list.
|
// Temporarily create an empty list.
|
||||||
field_data[FIELD[`name`]] = [];
|
DATA[NAME] = [];
|
||||||
|
|
||||||
if (typeof FIELD[`value`][0] == "object" && FIELD[`value`][0] != null && !Array.isArray(FIELD[`value`][0])) {
|
VALUE.forEach((PARTICULAR) => {
|
||||||
field_data[FIELD[`name`]].push(read(FIELD[`value`][0]));
|
if ((typeof PARTICULAR).includes("obj") && PARTICULAR && !Array.isArray(PARTICULAR)) {
|
||||||
|
DATA[NAME].push(read(PARTICULAR));
|
||||||
} else {
|
} else {
|
||||||
let ELEMENTS = (document.querySelectorAll(FIELD[`value`][0]));
|
let ELEMENTS = [...(document.querySelectorAll(PARTICULAR))];
|
||||||
|
|
||||||
if (ELEMENTS.length > 0) {
|
(ELEMENTS && ELEMENTS.length)
|
||||||
(ELEMENTS).forEach((ELEMENT) => {
|
? (ELEMENTS).forEach((ELEMENT) => {
|
||||||
field_data[FIELD[`name`]].push(ELEMENT.innerText);
|
DATA[NAME].push(ELEMENT.textContent.trim());
|
||||||
})
|
})
|
||||||
|
: false;
|
||||||
};
|
};
|
||||||
};
|
})
|
||||||
} else if ((typeof FIELD[`value`]).includes(`obj`) && FIELD[`value`] != null) {
|
} else if ((typeof VALUE).includes(`obj`) && VALUE && !Array.isArray(VALUE)) {
|
||||||
field_data[FIELD[`name`]] = read(FIELD[`value`]);
|
DATA[NAME] = read(VALUE);
|
||||||
} else if (document.querySelector(FIELD[`value`])) {
|
} else if (document.querySelector(VALUE)) {
|
||||||
field_data[FIELD[`name`]] = document.querySelector(FIELD[`value`]).innerText;
|
(document.querySelector(VALUE))
|
||||||
|
? DATA[NAME] = document.querySelector(VALUE).textContent.trim()
|
||||||
|
: false;
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
});
|
});
|
||||||
|
|
||||||
return field_data;
|
return DATA;
|
||||||
};
|
};
|
||||||
field_content = read(scraper_fields);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (Object.keys(field_content).length > 0) {
|
// Determine and set the appropriate field source.
|
||||||
(Object.keys(field_content)).forEach((field_name) => {
|
let FIELDS = (((typeof fields).includes(`obj`) && fields) ? Object.keys(fields).length : false) ? fields : this.fields;
|
||||||
this[field_name] = field_content[field_name];
|
((((typeof options).includes(`obj`) && options) ? Object.hasOwn(`update`) : false) ? options[`update`] : true)
|
||||||
});
|
? this.fields = FIELDS
|
||||||
}
|
: null;
|
||||||
|
|
||||||
|
// Read the fields.
|
||||||
|
(FIELDS)
|
||||||
|
? CONTENT = read(FIELDS)
|
||||||
|
: false;
|
||||||
|
|
||||||
|
// Set the data unless the options indicate otherwise.
|
||||||
|
(((((typeof options).includes(`obj`) && options) ? Object.hasOwn(`update`) : false) ? options[`update`] : true) && CONTENT)
|
||||||
|
? this.texts = CONTENT
|
||||||
|
: false;
|
||||||
|
return (CONTENT);
|
||||||
};
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue