2022-10-23 11:22:47 +00:00
|
|
|
#lang racket/base
|
|
|
|
(require racket/generator
|
|
|
|
racket/list
|
|
|
|
racket/string
|
|
|
|
json
|
|
|
|
net/http-easy
|
|
|
|
html-parsing
|
2022-10-24 05:13:14 +00:00
|
|
|
"../src/xexpr-utils.rkt"
|
|
|
|
"../src/url-utils.rkt")
|
2022-10-23 11:22:47 +00:00
|
|
|
|
|
|
|
(define output-file "wiki-names.json")
|
|
|
|
(define limit "5000")
|
|
|
|
|
|
|
|
(define (get-page offset)
|
|
|
|
(define res (get (format "https://community.fandom.com/wiki/Special:NewWikis?~a"
|
|
|
|
(params->query `(("offset" . ,offset)
|
|
|
|
("limit" . ,limit))))))
|
|
|
|
(html->xexp (bytes->string/utf-8 (response-body res))))
|
|
|
|
|
|
|
|
(define (convert-list-items gen)
|
|
|
|
(for/list ([item (in-producer gen #f)])
|
|
|
|
; '(li "\n" "\t" (a (@ (href "http://terra-hexalis.fandom.com/")) "Terra Hexalis Wiki") "\n" "\t\t\ten\t")
|
|
|
|
(hasheq 'title (third (fourth item))
|
|
|
|
'link (second (second (second (fourth item))))
|
|
|
|
'lang (string-trim (sixth item)))))
|
|
|
|
|
|
|
|
(define (get-items-recursive [offset ""] [items null])
|
|
|
|
(define page (get-page offset))
|
|
|
|
(define page-content ((query-selector (attribute-selector 'class "mw-spcontent") page)))
|
|
|
|
(define next ((query-selector (attribute-selector 'class "mw-nextlink") page-content)))
|
|
|
|
(define next-offset
|
|
|
|
(if next
|
|
|
|
(second (regexp-match #rx"offset=([0-9]*)" (get-attribute 'href (bits->attributes next))))
|
|
|
|
#f))
|
|
|
|
(define list-item-generator (query-selector (λ (e a c) (eq? e 'li)) page-content))
|
|
|
|
(define these-items (convert-list-items list-item-generator))
|
|
|
|
(define all-items (append items these-items))
|
|
|
|
(printf "page offset \"~a\" has ~a items (~a so far)~n" offset (length these-items) (length all-items))
|
|
|
|
(if next
|
|
|
|
(get-items-recursive next-offset all-items)
|
|
|
|
all-items))
|
|
|
|
|
|
|
|
(call-with-output-file output-file #:exists 'truncate/replace
|
|
|
|
(λ (out)
|
|
|
|
(write-json (get-items-recursive) out)))
|