forked from cadence/breezewiki
		
	Add download wiki names script
This commit is contained in:
		
							parent
							
								
									e709b3cea5
								
							
						
					
					
						commit
						bf055836cc
					
				
					 1 changed files with 45 additions and 0 deletions
				
			
		
							
								
								
									
										45
									
								
								misc/download-wiki-names.rkt
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										45
									
								
								misc/download-wiki-names.rkt
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,45 @@
 | 
				
			||||||
 | 
					#lang racket/base
 | 
				
			||||||
 | 
					(require racket/generator
 | 
				
			||||||
 | 
					         racket/list
 | 
				
			||||||
 | 
					         racket/string
 | 
				
			||||||
 | 
					         json
 | 
				
			||||||
 | 
					         net/http-easy
 | 
				
			||||||
 | 
					         html-parsing
 | 
				
			||||||
 | 
					         "src/xexpr-utils.rkt"
 | 
				
			||||||
 | 
					         "src/url-utils.rkt")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					; Path of the JSON file that the downloaded list of wikis is written to.
					(define output-file "wiki-names.json")
 | 
				
			||||||
 | 
					; Value sent as the "limit" query parameter on every Special:NewWikis request.
					; Kept as a string because it is placed directly into the query parameter list.
					(define limit "5000")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					; Fetch one page of Fandom's Special:NewWikis listing and parse it.
					;   offset : value for the "offset" query parameter ("" requests the first page)
					; Returns the parsed document as produced by html->xexp.
					; Raises an error when the server answers with anything other than HTTP 200, so
					; that an error page is never parsed as if it were the listing.
					(define (get-page offset)
					  (define url
					    (format "https://community.fandom.com/wiki/Special:NewWikis?~a"
					            (params->query `(("offset" . ,offset)
					                             ("limit" . ,limit)))))
					  (define res (get url))
					  (define status (response-status-code res))
					  (unless (= status 200)
					    ; release the connection before bailing out, since the body is never read
					    (response-close! res)
					    (error 'get-page "unexpected HTTP status ~a for ~a" status url))
					  (html->xexp (bytes->string/utf-8 (response-body res))))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					; Drain `gen`, a zero-argument producer that returns one `li` element per call and
					; #f once exhausted, and turn each element into a hash with the keys
					; 'title, 'link and 'lang.
					;
					; Each element is expected to have exactly this layout:
					;   '(li "\n" "\t" (a (@ (href "http://terra-hexalis.fandom.com/")) "Terra Hexalis Wiki") "\n" "\t\t\ten\t")
					; i.e. the anchor is the fourth member and the language text is the sixth.
					(define (convert-list-items gen)
					  (let collect ([li-element (gen)]
					                [converted null])
					    (cond
					      [(not li-element) (reverse converted)]
					      [else
					       (let* ([anchor (fourth li-element)]
					              [wiki-title (third anchor)]
					              ; (a (@ (href URL)) ...) -> (@ (href URL)) -> (href URL) -> URL
					              [attribute-list (second anchor)]
					              [href-entry (second attribute-list)]
					              [wiki-link (second href-entry)]
					              [wiki-lang (string-trim (sixth li-element))])
					         (collect (gen)
					                  (cons (hasheq 'title wiki-title
					                                'link wiki-link
					                                'lang wiki-lang)
					                        converted)))])))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					; Download the Special:NewWikis listing page by page, following the
					; "mw-nextlink" pagination link until there is none left.
					;   offset : "offset" query value of the page to fetch ("" for the first page)
					;   items  : items collected from earlier pages; this page's items are appended
					; Returns the list of all item hashes, in page order.
					; Raises an error when the page has no "mw-spcontent" element, when the next
					; link carries no numeric offset, or when the next offset does not advance
					; (which would otherwise request the same page forever).
					(define (get-items-recursive [offset ""] [items null])
					  (define page (get-page offset))
					  ; calling the generator once gives the first match, or #f when nothing matches
					  (define page-content ((query-selector (attribute-selector 'class "mw-spcontent") page)))
					  (unless page-content
					    (error 'get-items-recursive
					           "no element with class \"mw-spcontent\" on page with offset ~s" offset))
					  (define next ((query-selector (attribute-selector 'class "mw-nextlink") page-content)))
					  (define next-offset
					    (and next
					         (let* ([href (get-attribute 'href (bits->attributes next))]
					                ; require at least one digit: an empty offset would mean "first page"
					                [found (and href (regexp-match #rx"offset=([0-9]+)" href))])
					           (unless found
					             (error 'get-items-recursive
					                    "next link has no numeric offset (href ~s) on page with offset ~s"
					                    href offset))
					           (second found))))
					  (when (and next (equal? next-offset offset))
					    (error 'get-items-recursive
					           "pagination did not advance past offset ~s" offset))
					  (define list-item-generator (query-selector (λ (e a c) (eq? e 'li)) page-content))
					  (define these-items (convert-list-items list-item-generator))
					  (define all-items (append items these-items))
					  (printf "page offset \"~a\" has ~a items (~a so far)~n" offset (length these-items) (length all-items))
					  (if next
					      (get-items-recursive next-offset all-items)
					      all-items))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					; Script entry point: download every wiki entry, then write them as JSON.
					; The download runs BEFORE the output file is opened. Opening with
					; 'truncate/replace empties the file straight away, so doing it first would
					; destroy an existing output file whenever the download fails part-way.
					(let ([all-wikis (get-items-recursive)])
					  (call-with-output-file output-file #:exists 'truncate/replace
					    (λ (out)
					      (write-json all-wikis out))))
 | 
				
			||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue