From 7ad7072456c11640a706b198a9df6585855e7e49 Mon Sep 17 00:00:00 2001
From: buzz-lightsnack-2007 <73412182+buzz-lightsnack-2007@users.noreply.github.com>
Date: Sat, 11 May 2024 22:50:18 +0800
Subject: [PATCH] scrape images aside from texts

---
Review note: line breaks and indentation in this patch were rebuilt from a
whitespace-collapsed copy (tabs assumed). The two brace-only context lines
inside the old hunk are now listed as removed lines, and the blob ids on the
"index" line still refer to the original commit.

 scripts/external/scraper.js | 144 ++++++++++++++++++++++++++++++++----
 1 file changed, 127 insertions(+), 17 deletions(-)

diff --git a/scripts/external/scraper.js b/scripts/external/scraper.js
index 9f06293..b3210d5 100644
--- a/scripts/external/scraper.js
+++ b/scripts/external/scraper.js
@@ -79,22 +79,132 @@ export default class scraper {
 		}
 	};
 
-		// Check every 1 second to check until autosccroll is done.
-		function wait_autoscroll(OPTIONS) {
-			return new Promise((resolve, reject) => {
-				// Check if autoscroll is done.
-				if (!((typeof window).includes(`undef`))) {
-					autoscroll();
-					resolve();
-				} else if (OPTIONS[`wait until available`]) {
-					setTimeout(() => {
-						wait_autoscroll().then(resolve).catch(reject);
-					}, 1000);
-				} else {
-					reject();
-				}
-			});
-		}
-		wait_autoscroll(options).then(() => {read();});
+	/*
+	Scrape the images from a page.
+
+	@param {Object} fields the fields to scrape: CSS selectors, arrays of them, or nested field objects
+	@param {Object} options the options; set `update` to false to keep this.images as it is
+	@return {Array} the blobs of the images, or undefined when there are no fields
+	*/
+	async getImages(fields, options) {
+		let CONTENT;
+
+		/*
+		Get the blob of the image in an element.
+
+		@param {Element} element the element to get the blob from
+		@return {Blob} the blob of the image, or null if there is none
+		*/
+		async function blobbify(element) {
+			/*
+			Get the URL of the image.
+
+			@param {Element} element the element to get the URL from
+			@return {String} the URL of the image, or null if there is none
+			*/
+			function reference(element) {
+				// Get using standard attributes.
+				let LOCATION = element.getAttribute(`src`);
+
+				if (!LOCATION) {
+					// Use the CSS background image. The computed value is
+					// "none" or a gradient when no file is set, so only url(...) counts.
+					let MATCH = /url\((['"]?)(.*?)\1\)/.exec(window.getComputedStyle(element).backgroundImage);
+					LOCATION = (MATCH) ? MATCH[2] : null;
+				}
+
+				// Return the location.
+				return (LOCATION || null);
+			}
+
+			let LOCATION = reference(element);
+
+			// Most elements have no image; there is nothing to download for them.
+			if (!LOCATION) return null;
+
+			let BLOB = await net.download(LOCATION, `blob`);
+
+			// The download may give nothing back, or something that is not an image.
+			return ((BLOB?.type?.includes(`image`)) ? BLOB : null);
+		}
+
+		/*
+		Read for the particular fields.
+
+		@param {Object} fields the fields to read
+		@return {Array} the blobs of the images
+		*/
+		async function read(fields) {
+			/*
+			Select all images from an element and get their blobs.
+
+			@param {Element} element the element to get the images from
+			@return {Array} the blobs of the images
+			*/
+			async function select(element) {
+				let BLOBS = [];
+
+				// querySelectorAll only returns descendants; include the element
+				// itself, since a selector may point straight at an image.
+				for (let IMAGE of [element, ...element.querySelectorAll(`*`)]) {
+					let BLOB = await blobbify(IMAGE);
+					if (BLOB) BLOBS.push(BLOB);
+				}
+
+				return BLOBS;
+			}
+
+			/*
+			Get the blobs of the images in every element matching a selector.
+
+			@param {String} selector the CSS selector
+			@return {Array} the blobs of the images
+			*/
+			async function collect(selector) {
+				let BLOBS = [];
+
+				// An empty selector has nothing to match.
+				if (!selector) return BLOBS;
+
+				for (let ELEMENT of document.querySelectorAll(selector)) {
+					BLOBS.push(...(await select(ELEMENT)));
+				}
+
+				return BLOBS;
+			}
+
+			let DATA = []; // Store here the resulting data
+
+			// Values are looked up by their original names; a name is trimmed
+			// only to check that it is not blank.
+			for (let [NAME, VALUE] of Object.entries(fields)) {
+				if (!VALUE || !NAME.trim()) continue;
+
+				for (let PARTICULAR of (Array.isArray(VALUE) ? VALUE : [VALUE])) {
+					// Nested fields are read recursively; anything else is a selector.
+					if (PARTICULAR && typeof PARTICULAR === `object` && !Array.isArray(PARTICULAR)) {
+						DATA.push(...(await read(PARTICULAR)));
+					} else {
+						DATA.push(...(await collect(PARTICULAR)));
+					}
+				}
+			}
+
+			return (DATA);
+		};
+
+		// Read the fields.
+		if (fields && typeof fields === `object` && Object.keys(fields).length) {
+			CONTENT = await read(fields);
+		}
+
+		// Set the data if the options doesn't indicate otherwise.
+		// Object.hasOwn needs the object as well as the key.
+		let UPDATE = (options && typeof options === `object` && Object.hasOwn(options, `update`))
+			? options[`update`]
+			: true;
+		if (UPDATE && CONTENT) this.images = CONTENT;
+
+		return (CONTENT);
 	}
 }
\ No newline at end of file