scrape images aside from texts
This commit is contained in:
		
							parent
							
								
									af996cfc1d
								
							
						
					
					
						commit
						7ad7072456
					
				
					 1 changed files with 129 additions and 15 deletions
				
			
		
							
								
								
									
										146
									
								
								scripts/external/scraper.js
									
										
									
									
										vendored
									
									
								
							
							
						
						
									
										146
									
								
								scripts/external/scraper.js
									
										
									
									
										vendored
									
									
								
							|  | @ -79,22 +79,136 @@ export default class scraper { | |||
| 			} | ||||
| 		}; | ||||
| 
 | ||||
| 		// Check every second until autoscroll is done.
 | ||||
| 		function wait_autoscroll(OPTIONS) { | ||||
| 			return new Promise((resolve, reject) => { | ||||
| 				// Check if autoscroll is done.
 | ||||
| 				if (!((typeof window).includes(`undef`))) { | ||||
| 					autoscroll(); | ||||
| 					resolve(); | ||||
| 				} else if (OPTIONS[`wait until available`]) { | ||||
| 					setTimeout(() => { | ||||
| 						wait_autoscroll().then(resolve).catch(reject); | ||||
| 					}, 1000); | ||||
| 	/* | ||||
| 	Scrape the images from a page.  | ||||
| 
 | ||||
| 	@param {Object} fields the fields to scrape | ||||
| 	@param {Object} options the options | ||||
| 	@return {Array|undefined} the blobs of the images found, or undefined when no fields were given | ||||
| 	*/ | ||||
| 	async getImages(fields, options) { | ||||
| 		let CONTENT;  | ||||
| 
 | ||||
| 		/* | ||||
| 		Get the blob of the image in an element.  | ||||
| 
 | ||||
| 		@param {Element} element the element to get the blob from | ||||
| 		@return {Blob} the blob of the image | ||||
| 		*/ | ||||
| 		async function blobbify(element) { | ||||
| 			/* | ||||
| 			Get the URL of the image.  | ||||
| 
 | ||||
| 			@param {Element} element the element to get the URL from | ||||
| 			@return {String} the URL of the image | ||||
| 			*/ | ||||
| 			function reference(element) { | ||||
| 				let LOCATION; | ||||
| 
 | ||||
| 				// Get using standard attributes. 
 | ||||
| 				LOCATION = element.getAttribute(`src`); | ||||
| 
 | ||||
| 				if (!LOCATION) { | ||||
| 					// Use the CSS background image.
 | ||||
| 					(window.getComputedStyle(element).backgroundImage) | ||||
| 						? LOCATION = window.getComputedStyle(element).backgroundImage.slice(4, -1).replace(/"/g, "") | ||||
| 						: false; | ||||
| 				} | ||||
| 
 | ||||
| 				// Return the location. 
 | ||||
| 				return LOCATION; | ||||
| 			} | ||||
| 
 | ||||
| 			/* | ||||
| 			Get the blob from the URL.  | ||||
| 
 | ||||
| 			@param {String} URL the URL to get the blob from | ||||
| 			@return {Blob} the blob of the image | ||||
| 			*/ | ||||
| 			function getBlob(URL) { | ||||
| 				return(net.download(URL, `blob`)); | ||||
| 			} | ||||
| 
 | ||||
| 			let LOCATION = reference(element); | ||||
| 			let BLOB = await getBlob(LOCATION); | ||||
| 
 | ||||
| 			return ((BLOB.type.includes(`image`)) ? BLOB : null); | ||||
| 		} | ||||
| 
 | ||||
| 		/* Read for the particular fields. */ | ||||
| 		async function read(fields) { | ||||
| 			/* | ||||
| 			Select all images from an element and get their blobs.  | ||||
| 
 | ||||
| 			@param {Element} element the element to get the images from | ||||
| 			@return {Array} the blobs of the images | ||||
| 			*/ | ||||
| 			async function select(element) { | ||||
| 				let IMAGES = [...element.querySelectorAll(`*`)]; | ||||
| 				let BLOBS = []; | ||||
| 
 | ||||
| 				if (IMAGES && IMAGES.length) { | ||||
| 					for (let IMAGE of IMAGES) { | ||||
| 						let BLOB = await blobbify(IMAGE); | ||||
| 						(BLOB) ? BLOBS.push(BLOB) : false; | ||||
| 					} | ||||
| 				} | ||||
| 
 | ||||
| 				return BLOBS; | ||||
| 			} | ||||
| 
 | ||||
| 			let DATA = []; // Store here the resulting data
 | ||||
| 
 | ||||
| 			for (let NAME of Object.keys(fields)) { | ||||
| 				// Remove trailing spaces within the name. 
 | ||||
| 				NAME = (typeof NAME).includes(`str`) ? NAME.trim() : NAME; | ||||
| 				let VALUE = fields[NAME]; | ||||
| 
 | ||||
| 				if (VALUE && NAME) { | ||||
| 					// Check if array.
 | ||||
| 					if (Array.isArray(VALUE)) { | ||||
| 						// Temporarily create an empty list. 
 | ||||
| 						for (let PARTICULAR of VALUE) { | ||||
| 							if ((typeof PARTICULAR).includes(`obj`) && PARTICULAR && !Array.isArray(PARTICULAR)) { | ||||
| 								DATA = [...DATA, ...(await read(PARTICULAR))]; | ||||
| 				} else { | ||||
| 					reject(); | ||||
| 				} | ||||
| 			}); | ||||
| 		} | ||||
| 		wait_autoscroll(options).then(() => {read();}); | ||||
| 								let ELEMENTS = [...(document.querySelectorAll(PARTICULAR))]; | ||||
| 
 | ||||
| 								if (ELEMENTS && ELEMENTS.length) { | ||||
| 									for (let ELEMENT of ELEMENTS) { | ||||
| 										let BLOBS = await select(ELEMENT); | ||||
| 										if (BLOBS && BLOBS.length) DATA = [...DATA, ...BLOBS]; | ||||
| 									} | ||||
| 								} | ||||
| 							} | ||||
| 						} | ||||
| 					} else if ((typeof VALUE).includes(`obj`) && VALUE) { | ||||
| 						DATA = [...DATA, ...(await read(VALUE))]; | ||||
| 					} else if (document.querySelector(VALUE)) { | ||||
| 						let ELEMENTS = [...(document.querySelectorAll(VALUE))]; | ||||
| 
 | ||||
| 						if (ELEMENTS && ELEMENTS.length) { | ||||
| 							for (let ELEMENT of ELEMENTS) { | ||||
| 								let BLOBS = await select(ELEMENT); | ||||
| 								if (BLOBS && BLOBS.length) DATA = [...DATA, ...BLOBS]; | ||||
| 							} | ||||
| 						} | ||||
| 					} | ||||
| 				} | ||||
| 			} | ||||
| 
 | ||||
| 			return (DATA); | ||||
| 		}; | ||||
| 
 | ||||
| 		// Read the fields. 
 | ||||
| 		(((typeof fields).includes(`obj`) && fields) ? Object.keys(fields).length : false) | ||||
| 			? CONTENT = await read(fields) | ||||
| 			: false; | ||||
| 
 | ||||
| 		// Set the data if the options doesn't indicate otherwise. 
 | ||||
| 		(((((typeof options).includes(`obj`) && options) ? Object.hasOwn(`update`) : false) ? options[`update`] : true) && CONTENT) | ||||
| 			? this.images = CONTENT | ||||
| 			: false; | ||||
| 		return (CONTENT); | ||||
| 	} | ||||
| } | ||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue