Compare commits
2 commits: aea627b27f ... a57445abcb

Author | SHA1 | Date
---|---|---
 | a57445abcb |
 | 27c9680f5b |

9 changed files with 397 additions and 323 deletions
@@ -35,9 +35,12 @@
(output-lines? #t)]))
(define (update-width)
(when (output-progress?)
(with-charterm
(call-with-values (λ () (charterm-screen-size))
(λ (cols rows) (set! width cols))))))
(case (system-type 'os)
[(linux)
(with-charterm
(call-with-values (λ () (charterm-screen-size))
(λ (cols rows) (set! width cols))))]
[else 100])))
(update-width)
;; check
(when (or (not wikiname) (equal? wikiname ""))
@@ -56,8 +59,8 @@
(define real-width (min (string-length basename) rest))
(define spare-width (- rest real-width))
(define name-display (substring basename 0 real-width))
(define whitespace (make-string spare-width #\ ))
(printf "~a~a~a\r" prefix name-display whitespace)]))
(printf "\e[2K\r~a~a" prefix name-display)
(flush-output)]))
;; download all stages
(for ([stage all-stages]
[i (in-naturals 1)])
@@ -48,7 +48,8 @@
((query-exec slc* "alter table wiki add column sitename TEXT")
(query-exec slc* "alter table wiki add column basepage TEXT")
(query-exec slc* "alter table wiki add column license_text TEXT")
(query-exec slc* "alter table wiki add column license_url TEXT"))))
(query-exec slc* "alter table wiki add column license_url TEXT"))
((query-exec slc* "alter table page add column redirect"))))

(let do-migrate-step ()
(when (database-version . < . (length migrations))
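The `do-migrate-step` named let above drives the migration list one step at a time. A minimal sketch of the pattern it implies, assuming `migrations` is a list of thunks like the ones shown above and `database-version` is a variable mirroring the version stored in the database (persisting the bumped version back to the database is elided):

(let do-migrate-step ()
  (when (database-version . < . (length migrations))
    ;; apply the first migration that has not run yet
    ((list-ref migrations database-version))
    ;; bump the in-memory version, then loop in case more are pending
    (set! database-version (add1 database-version))
    (do-migrate-step)))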
@@ -1,5 +1,6 @@
#lang racket/base
(require racket/file
racket/format
racket/function
racket/list
racket/path
@@ -43,10 +44,18 @@
wikiname
(params->query '(("action" . "query")
("meta" . "siteinfo")
("siprop" . "general|rightsinfo|statistics")
("siprop" . "general|rightsinfo|statistics|namespaces")
("format" . "json")
("formatversion" . "2")))))
(define data (response-json (get dest-url)))
(define content-nss
(sort
(for/list ([(k v) (in-hash (jp "/query/namespaces" data))]
#:do [(define id (hash-ref v 'id))]
#:when (and (id . < . 2900) ; exclude maps namespace
(hash-ref v 'content))) ; exclude non-content and talk namespaces
id)
<))
(define exists? (query-maybe-value* "select progress from wiki where wikiname = ?" wikiname))
(if (and exists? (not (sql-null? exists?)))
(query-exec* "update wiki set sitename = ?, basepage = ?, license_text = ?, license_url = ? where wikiname = ?"
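`params->query` is a project helper; its behavior is assumed from how it is used here: it encodes an alist of string pairs into a URL query string appended after `api.php?`. Roughly:

;; illustrative only; exact percent-encoding (e.g. of "|") is assumed
(params->query '(("action" . "query")
                 ("meta" . "siteinfo")
                 ("format" . "json")))
;; => "action=query&meta=siteinfo&format=json"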
@@ -61,7 +70,8 @@
(second (regexp-match #rx"/wiki/(.*)" (jp "/query/general/base" data)))
(jp "/query/rightsinfo/text" data)
(jp "/query/rightsinfo/url" data)))
(jp "/query/statistics/articles" data))
(values (jp "/query/statistics/articles" data)
content-nss))


(define (check-style-for-images wikiname path)
@@ -131,48 +141,57 @@
;; done yet?
(unless (and (real? wiki-progress) (wiki-progress . >= . 1))
;; Count total pages
(define num-pages (insert-wiki-entry wikiname))
(define-values (num-pages namespaces) (insert-wiki-entry wikiname))
;; Download the entire index of pages
(define basenames
(let loop ([path-with-namefrom "/wiki/Local_Sitemap"]
[basenames-previous-pages null])
;; Download the current index page
(define url (format "https://~a.fandom.com~a" wikiname path-with-namefrom))
(define r (get url))
;; Metadata from this page (the link to the next page)
(define page (html->xexp (bytes->string/utf-8 (response-body r))))
(define link-namefrom
((query-selector (λ (t a c x) (and (eq? t 'a)
(pair? x)
(string-contains? (car x) "Next page")
(let ([href (get-attribute 'href a)] )
(and href (string-contains? href "/wiki/Local_Sitemap")))))
page #:include-text? #t)))
;; Content from this page
(define basenames-this-page
(for/list ([link (in-producer
(query-selector
(λ (t a c) (eq? t 'a))
((query-selector (λ (t a c) (has-class? "mw-allpages-chunk" a)) page)))
#f)])
(local-encoded-url->basename (get-attribute 'href (bits->attributes link)))))
;; Call the progress callback
(define all-basenames (append basenames-previous-pages basenames-this-page))
(callback (length all-basenames) num-pages (last all-basenames))
;; Recurse to download from the next page
(if link-namefrom
(loop (get-attribute 'href (bits->attributes link-namefrom)) all-basenames)
all-basenames)))
;; Save those pages into the database
;; SQLite can have a maximum of 32766 parameters in a single query
(for ([slice (in-slice 32760 basenames)])
(define query-template (string-join (make-list (length slice) "(?1, ?, 0)") ", " #:before-first "insert or ignore into page (wikiname, basename, progress) values "))
(call-with-transaction
(get-slc)
(λ ()
(apply query-exec* query-template wikiname slice)
;; Record that we have the complete list of pages
(query-exec* "update wiki set progress = 1 where wikiname = ?" wikiname))))))
(for*/fold ([total 0])
([namespace namespaces]
[redir-filter '("nonredirects" "redirects")])
(let loop ([apcontinue ""]
[basenames null])
(cond
[apcontinue
(define url (format "https://~a.fandom.com/api.php?~a"
wikiname
(params->query `(("action" . "query")
("list" . "allpages")
("apnamespace" . ,(~a namespace))
("apfilterredir" . ,redir-filter)
("aplimit" . "500")
("apcontinue" . ,apcontinue)
("format" . "json")
("formatversion" . "2")))))
;; Download the current listing page
(define res (get url))
(define json (response-json res))
;; Content from this page
(define current-basenames
(for/list ([page (jp "/query/allpages" json)])
(title->basename (jp "/title" page))))
(when ((length current-basenames) . > . 0)
;; Report
(if (equal? redir-filter "nonredirects")
(callback (+ (length basenames) (length current-basenames) total) num-pages (last current-basenames))
(callback total num-pages (last current-basenames))))
;; Loop
(loop (jp "/continue/apcontinue" json #f) (append basenames current-basenames))]
[else
;; All done with this (loop)! Save those pages into the database
;; SQLite can have a maximum of 32766 parameters in a single query
(begin0
;; next for*/fold
(if (equal? redir-filter "nonredirects")
(+ (length basenames) total)
total) ; redirects don't count for the site statistics total
(call-with-transaction
(get-slc)
(λ ()
(for ([slice (in-slice 32760 basenames)])
(define query-template
(string-join #:before-first "insert or ignore into page (wikiname, redirect, basename, progress) values "
(make-list (length slice) "(?1, ?2, ?, 0)") ", "))
(apply query-exec* query-template wikiname (if (equal? redir-filter "redirects") 1 sql-null) slice)))))])))
;; Record that we have the complete list of pages
(query-exec* "update wiki set progress = 1 where wikiname = ?" wikiname)))


;; 2. Download each page via API and:
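Both insert paths above rely on the same batching trick: SQLite allows at most 32766 bound parameters in one statement, and each row in the values list consumes one fresh positional `?` (the numbered `?1`/`?2` placeholders are shared across rows), so slices of 32760 rows stay safely under the cap. A condensed sketch of the batched insert, reusing the names from the hunk:

(define redirect-flag (if (equal? redir-filter "redirects") 1 sql-null))
(for ([slice (in-slice 32760 basenames)])
  ;; e.g. two rows produce:
  ;; "insert or ignore into page (wikiname, redirect, basename, progress)
  ;;  values (?1, ?2, ?, 0), (?1, ?2, ?, 0)"
  (define query-template
    (string-join (make-list (length slice) "(?1, ?2, ?, 0)") ", "
                 #:before-first "insert or ignore into page (wikiname, redirect, basename, progress) values "))
  (apply query-exec* query-template wikiname redirect-flag slice))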
@@ -183,7 +202,7 @@
(define save-dir (build-path archive-root wikiname))
(make-directory* save-dir)
;; gather list of basenames to download (that aren't yet complete)
(define basenames (query-list* "select basename from page where wikiname = ? and progress < ?"
(define basenames (query-list* "select basename from page where wikiname = ? and progress < ? and redirect is null"
wikiname max-page-progress))
;; counter of complete/incomplete basenames
(define already-done-count
@@ -222,10 +241,41 @@
(query-exec* "update page set progress = 1 where wikiname = ? and basename = ?"
wikiname basename)
(callback i total-count basename))
;; save redirects as well
(save-redirects wikiname callback (+ already-done-count (length basenames)) total-count)
;; saved all pages, register that fact in the database
(query-exec* "update wiki set progress = 2 where wikiname = ?" wikiname))


;; 2.5. Download each redirect-target via API and save mapping in database
(define (save-redirects wikiname callback already-done-count total-count)
(define basenames (query-list* "select basename from page where wikiname = ? and progress < ? and redirect = 1"
wikiname max-page-progress))
;; loop through basenames, in slices of 50 (MediaWiki API max per request), and download
(for ([basename basenames]
[i (in-naturals (add1 already-done-count))])
(define dest-url
(format "https://~a.fandom.com/api.php?~a"
wikiname
(params->query `(("action" . "query")
("prop" . "links")
("titles" . ,(basename->name-for-query basename))
("format" . "json")
("formatversion" . "2")))))
(define res (get dest-url))
(define json (response-json res))
(define dest-title (jp "/query/pages/0/links/0/title" json #f))
(callback i total-count basename)
(cond
[dest-title
;; store it
(define dest-basename (title->basename dest-title))
(query-exec* "update page set progress = 1, redirect = ? where wikiname = ? and basename = ?" dest-basename wikiname basename)]
[else
;; the page just doesn't exist
(query-exec* "delete from page where wikiname = ? and basename = ?" wikiname basename)])))


;; 3. Download CSS and:
;; * Save CSS to file
;; * Record style images to database
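`save-redirects` resolves each redirect by asking the API for the page's links: on a redirect page, the first (and normally only) link is the redirect target, which is why `/query/pages/0/links/0/title` suffices. A hedged sketch of the response shape this assumes:

;; assumed (abridged) formatversion=2 response:
;; {"query": {"pages": [{"title": "Old name", "links": [{"title": "Target page"}]}]}}
(define dest-title (jp "/query/pages/0/links/0/title" json #f)) ; "Target page", or #f if absent
(and dest-title (title->basename dest-title))                   ; => "Target_page"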
402 archiver/fts.rkt
@@ -1,202 +1,200 @@
#lang racket
(require racket/function
racket/future
racket/match
racket/path
racket/promise
racket/port
racket/string
file/gunzip
db
db/unsafe/sqlite3
net/http-easy
json
json-pointer
"../lib/html-parsing/main.rkt"
"../lib/xexpr-utils.rkt"
"../lib/tree-updater.rkt")

(define-syntax (seq stx)
(syntax-case stx ()
[(_ body ...)
#`(for ([op (list (lambda () body) ...)]
[i (in-naturals)])
(define res (op))
(when (>= (response-status-code res) 400)
(error 'seq "op #~a: status code was ~a: ~v" i (response-status-code res) (response-json res)))
(define taskuid (json-pointer-value "/taskUid" (response-json res)))
(for/or ([ticks (in-naturals)]
[res2 (in-producer (lambda () (get (format "http://localhost:7700/tasks/~a" taskuid))))])
(define status (json-pointer-value "/status" (response-json res2)))
(case status
[("enqueued" "processing")
(sleep 1)
#f]
[("succeeded")
(printf "op #~a: ~a (~a ticks)~n" i status ticks)
#t]
[else
(error 'seq "op #~a: task status was ~a: ~v" i status res2)])))]))
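;; Usage sketch for `seq` (hedged; the index name is illustrative): each body
;; expression must return an http-easy response whose JSON carries a
;; Meilisearch "/taskUid". `seq` then polls http://localhost:7700/tasks/<id>
;; once per second until the task succeeds, erroring on HTTP >= 400 or on a
;; failed task:
#;(seq
   (put "http://localhost:7700/indexes/bloons/settings/searchable-attributes"
        #:json '("title" "body"))
   (post "http://localhost:7700/indexes/bloons/documents" #:json data))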
(define (class-has? attributes substrs)
(define cl (or (get-attribute 'class attributes) ""))
(ormap (λ (substr) (string-contains? cl substr)) substrs))

(define (updater element element-type attributes children)
(cond
[(class-has? attributes '("collapsed" "selflink" "label" "toc" "editsection" "reviews"))
(list 'div '() '())]
[#t
(list element-type attributes children)]))

(define slc (sqlite3-connect #:database "../storage/fts-separate.db"))
(sqlite3-load-extension slc "fts5")

(define (writer tables-mode? page)
(define (writer-inner page)
(for ([bit page])
(cond
[(and tables-mode? (pair? bit) (memq (car bit) '(h1 h2 h3 p blockquote q))) (void)]
[(and (not tables-mode?) (pair? bit) (memq (car bit) '(ul ol dl table))) (void)]
[(memq bit '(div p li td dd dt br)) (displayln "")]
[(symbol? bit) (void)]
[(and (pair? bit) (eq? (car bit) '*COMMENT*)) (void)]
[(and (pair? bit) (eq? (car bit) '@)) (void)]
[(pair? bit) (writer-inner bit)]
[(string? bit) (display bit)])))
(writer-inner page))

(define (write-and-post-process tables-mode? page)
(define text (with-output-to-string (λ () (writer tables-mode? page))))
;; (define text-no-numbers (regexp-replace* #px"(?:-|[+$£€¥] *)?[0-9,.]{2,}%?\\s*" text ""))
(define shrink-text (regexp-replace* #px"([ \t]*\r?\n+)+" text "\n"))
shrink-text)

(define wikiname "bloons")
(define tablename (format "page_~a" wikiname))

(define ((extract f)) ; f - filename
(with-handlers
([exn:fail? (λ (err) (println f) (raise err))])
(define j
(case (path-get-extension f)
[(#".json")
(with-input-from-file f (λ () (read-json)))]
[(#".gz")
(define-values (in out) (make-pipe))
(with-input-from-file f (λ () (gunzip-through-ports (current-input-port) out)))
(read-json in)]
[else #f]))
(define title (json-pointer-value "/parse/title" j))
(define pageid (json-pointer-value "/parse/pageid" j))
(define page-html (preprocess-html-wiki (json-pointer-value "/parse/text" j)))
(define page (update-tree updater (html->xexp page-html)))
(define body (write-and-post-process #f page))
(define table (write-and-post-process #t page))
(values title body table pageid)))

(define results
(for/list ([f (directory-list (format "../storage/archive/~a" wikiname) #:build? #t)]
#:when (member (path-get-extension f) '(#".json" #".gz")))
(extract f)))

;; ***************************************************************************************************
;; TESTING WRITER
;; ***************************************************************************************************
#;(for/first ([fut results]
[i (in-naturals 1)]
#:when (i . >= . 4859))
(define-values (title body table pageid) (fut))
(println title)
(println body)
(println table))

(println "inserting...")

;; ***************************************************************************************************
;; SQLite FTS5
;; ***************************************************************************************************
#;(begin
(query-exec slc "begin transaction")
#;(query-exec slc (format "create virtual table \"~a\" using fts5 (title, body, tokenize='porter unicode61')" wikiname))
(time
(for ([fut results]
[i (in-naturals 1)])
(display "-")
(when (and (> i 0) (= (modulo i 100) 0))
(println i))
(define-values (title shrink-text) (fut))
(query-exec slc (format "insert into \"~a\" (title, body) values (?, ?)" tablename) title shrink-text)))

(println "running optimize...")
(query-exec slc (format "insert into \"~a\" (\"~a\") values ('optimize')" tablename tablename))

(println "committing...")
(query-exec slc "commit"))

;; ***************************************************************************************************
;; Solr
;; ***************************************************************************************************
(begin
(define data
(cond
#;[(file-exists? "cache.rkt")
(println "reading in...")
(with-input-from-file "cache.rkt" (λ () (read)))]
[else
(define data
(for/list ([fut results]
[i (in-naturals 1)])
(display "-")
(when (and (> i 0) (= (modulo i 100) 0))
(println i))
(define-values (title body table pageid) (fut))
(define len (string-length body))
`#hasheq((id . ,(number->string pageid))
(title . ,title)
(body . ,body)
(table . ,table)
(len . ,len))))

(println "writing out...")
(with-output-to-file "cache.rkt" (λ () (write data)) #:exists 'truncate/replace)
data]))

(println "posting...")
(define res
(post (format "http://localhost:8983/solr/~a/update?commit=true" wikiname)
#:json data)))

;; ***************************************************************************************************
;; Meilisearch
;; ***************************************************************************************************
#;(begin
(seq
(put (format "http://localhost:7700/indexes/~a/settings/searchable-attributes" wikiname)
#:json '("title" "body"))
(put (format "http://localhost:7700/indexes/~a/settings/ranking-rules" wikiname)
#:json '("words" "typo" #;"proximity" "attribute" "sort" "exactness" #;"len:desc"))
(call-with-input-file "stop-words.json"
(λ (in)
(put (format "http://localhost:7700/indexes/~a/settings/stop-words" wikiname)
#:headers '#hasheq((Content-Type . "application/json"))
#:data in))))
(define data
(for/list ([fut results]
[i (in-naturals 1)])
(display "-")
(when (and (> i 0) (= (modulo i 100) 0))
(println i))
(define-values (title body pageid) (fut))
(define len (string-length body))
`#hasheq((id . ,pageid)
(title . ,title)
(body . ,body)
(len . ,len))))
(define res
(post (format "http://localhost:7700/indexes/~a/documents" wikiname)
#:json data))
(seq res)
(println (response-json res)))

(disconnect slc)
#lang cli
(require (for-syntax racket/base))
(require racket/format
racket/function
racket/future
racket/match
racket/path
racket/promise
racket/port
racket/runtime-path
racket/string
file/gunzip
db
db/unsafe/sqlite3
net/http-easy
json
json-pointer
"../lib/html-parsing/main.rkt"
"../lib/xexpr-utils.rkt"
"../lib/tree-updater.rkt")

(flag (read-from-cache?)
("-c" "--read-from-cache" "read from last run cache instead of rebuilding documents")
(read-from-cache? #t))

(define-runtime-path storage-path "../storage/archive")

;; ***************************************************************************************************
;; Progress bar display
;; ***************************************************************************************************

(struct progress^ (n max title) #:transparent)

(define (make-m-s seconds)
(define-values (eta-m eta-s) (quotient/remainder seconds 60))
(format "~a:~a" eta-m (~a eta-s #:width 2 #:align 'right #:pad-string "0")))
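;; Worked examples for the formatter above:
;; (make-m-s 125) => "2:05"    (make-m-s 59) => "0:59"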
(define (make-progress get-p [history-size 20])
(define update-sleep 1)
(define name-width 30)
(define max-width 105)
(define history (make-vector history-size 0))
(define history-pointer 0)
(define elapsed 0)
(define (report-progress)
(define p (get-p))
(define history-cycle (vector-ref history history-pointer))
(vector-set! history history-pointer (progress^-n p))
(set! history-pointer (modulo (add1 history-pointer) history-size))
(set! elapsed (add1 elapsed))
(define-values (eta-display diff-per-second)
(cond
[((progress^-n p) . >= . (progress^-max p)) (values (format "~a **" (make-m-s elapsed)) (format "** ~a" (quotient (progress^-max p) (max elapsed 1))))]
[(= history-cycle 0) (values "-:--" "--")]
[else (define diff-per-second (/ (- (progress^-n p) history-cycle) (* history-size update-sleep)))
(define eta-total
(if (diff-per-second . > . 0)
(floor (round (/ (- (progress^-max p) (progress^-n p)) diff-per-second)))
0))
(values (make-m-s eta-total)
(round diff-per-second))]))
(define left (format "~a/~a ~a/s ~a ~a%"
(~a (progress^-n p) #:width (string-length (~a (progress^-max p))) #:align 'right #:pad-string " ")
(progress^-max p)
diff-per-second
eta-display
(floor (* 100 (/ (progress^-n p) (progress^-max p))))))
(define name-display (~a (progress^-title p) #:max-width name-width #:limit-marker "..."))
(define remaining-space (- max-width name-width (string-length left) 2))
(define bar-width
(floor (* (sub1 remaining-space)
(/ (progress^-n p) (progress^-max p)))))
(define bar (string-append (make-string bar-width #\=)
">"
(make-string (- remaining-space bar-width) #\ )))
(printf "\e[2K\r~a~a~a" left bar name-display)
(flush-output))
(define (report-progress-loop)
(sleep update-sleep)
(report-progress)
(report-progress-loop))
(define t (thread report-progress-loop))
(define (quit)
(kill-thread t)
(report-progress)
(displayln ""))
quit)

;; ***************************************************************************************************
;; Page text extractor
;; ***************************************************************************************************

(define (class-has? attributes substrs)
(define cl (or (get-attribute 'class attributes) ""))
(ormap (λ (substr) (string-contains? cl substr)) substrs))

(define (updater element element-type attributes children)
(cond
[(class-has? attributes '("collapsed" "selflink" "label" "toc" "editsection" "reviews"))
(list 'div '() '())]
[#t
(list element-type attributes children)]))

(define (writer tables-mode? page)
(define (writer-inner page)
(for ([bit page])
(cond
[(and tables-mode? (pair? bit) (memq (car bit) '(h1 h2 h3 p blockquote q))) (void)]
[(and (not tables-mode?) (pair? bit) (memq (car bit) '(ul ol dl table))) (void)]
[(memq bit '(div p li td dd dt br)) (displayln "")]
[(symbol? bit) (void)]
[(and (pair? bit) (eq? (car bit) '*COMMENT*)) (void)]
[(and (pair? bit) (eq? (car bit) '@)) (void)]
[(pair? bit) (writer-inner bit)]
[(string? bit) (display bit)])))
(writer-inner page))

(define (write-and-post-process tables-mode? page)
(define text (with-output-to-string (λ () (writer tables-mode? page))))
;; (define text-no-numbers (regexp-replace* #px"(?:-|[+$£€¥] *)?[0-9,.]{2,}%?\\s*" text ""))
(define shrink-text (regexp-replace* #px"([ \t]*\r?\n+)+" text "\n"))
shrink-text)

(define ((extract f)) ; f - filename
(with-handlers
([exn:fail? (λ (err) (printf "extract: ~a: ~v~n" f err))])
(define j
(case (path-get-extension f)
[(#".json")
(with-input-from-file f (λ () (read-json)))]
[(#".gz")
(define-values (in out) (make-pipe))
(with-input-from-file f (λ () (gunzip-through-ports (current-input-port) out)))
(read-json in)]
[else #f]))
(define title (json-pointer-value "/parse/title" j))
(define pageid (json-pointer-value "/parse/pageid" j))
(define page-html (preprocess-html-wiki (json-pointer-value "/parse/text" j)))
(define page (update-tree updater (html->xexp page-html)))
(define body (write-and-post-process #f page))
(define table (write-and-post-process #t page))
(list title body table pageid)))

;; ***************************************************************************************************
;; Program, loop, Solr APIs
;; ***************************************************************************************************

(program
(start [wikiname "wikiname to download"])

(define results
(for/list ([f (directory-list (build-path storage-path wikiname) #:build? #t)]
#:when (member (path-get-extension f) '(#".gz")))
(extract f)))

(define data
(cond
[(and (read-from-cache?) (file-exists? "cache.rkt"))
(displayln "Reading in...")
(with-input-from-file "cache.rkt" (λ () (read)))]
[else
(define x (box (progress^ 0 1 "...")))
(define quit (make-progress (λ () (unbox x))))
(define data
(for/list ([fut results]
[i (in-naturals 1)]
#:do [(define page (fut))]
#:when (not (void? page)))
(match-define (list title body table pageid) page)
(define len (string-length body))
(set-box! x (progress^ i (length results) title))
`#hasheq((id . ,(number->string pageid))
(title . ,title)
(body . ,body)
(table . ,table)
(len . ,len))))
(quit)

(display "Writing out... ")
(flush-output)
(with-output-to-file "cache.rkt" (λ () (write data)) #:exists 'truncate/replace)
data]))

(display "Converting... ")
(flush-output)
(define ser (jsexpr->bytes data))
(define ser-port (open-input-bytes ser))
(define quit (make-progress (λ () (progress^ (ceiling (/ (file-position ser-port) 64 1024))
(ceiling (/ (bytes-length ser) 64 1024))
"Posting..."))
2))
(define res
(post (format "http://localhost:8983/solr/~a/update?commit=true" wikiname)
#:data ser-port
#:headers '#hasheq((Content-Type . "application/json"))
#:timeouts (make-timeout-config #:lease 5 #:connect 5 #:request 300)))
(quit)
(displayln (response-status-line res)))

(run start)
@@ -7,6 +7,7 @@
local-encoded-url->segments
url-segments->basename
local-encoded-url->basename
title->basename
basename->name-for-query
url-segments->guess-title)
@@ -21,6 +22,12 @@
(define (local-encoded-url->basename str) ; '("wiki" "Page_title"), no extension or dir prefix
(url-segments->basename (local-encoded-url->segments str)))

(define (title->basename title) ; "Page title/Strategies" -> "Page_title#Strategies" filename encoded, no extension or dir prefix
(define elements (string-split (string-replace title " " "_") "/"))
(define extra-encoded (map (λ (s) (bytes->string/latin-1 (percent-encode s filename-set #f))) elements))
(define basic-filename (string-join extra-encoded "#"))
basic-filename)

(define (basename->name-for-query str)
(uri-decode (regexp-replace* #rx"#" str "/")))
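Worked round-trip for the two helpers above (outputs inferred from the code; the exact percent-encoding of unusual characters depends on `filename-set`):

(title->basename "Page title/Strategies")          ; => "Page_title#Strategies"
(basename->name-for-query "Page_title#Strategies") ; => "Page_title/Strategies"
;; underscores and spaces are interchangeable in MediaWiki title queries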
@@ -377,7 +377,7 @@
'("runescape") 'default
'RuneScape
"RuneScape Wiki"
"https://runescape.wiki/"
"https://runescape.wiki/w/Main_Page"
"https://runescape.wiki/images/Wiki.png"
(λ (props)
`((p "The RuneScape Wiki was founded on April 8, 2005. In October 2018, the wiki left Fandom (then Wikia), citing their apathy towards the wiki and excessive advertisements."))))
@@ -386,7 +386,7 @@
'("oldschoolrunescape") 'default
'RuneScape
"Old School RuneScape Wiki"
"https://oldschool.runescape.wiki/"
"https://oldschool.runescape.wiki/w/Main_Page"
"https://oldschool.runescape.wiki/images/Wiki.png"
(λ (props)
`((p "The Old School RuneScape Wiki was founded on February 14, 2013. In October 2018, the RuneScape Wiki left Fandom (then Wikia), citing their apathy towards the wiki and excessive advertisements, with the Old School RuneScape Wiki following suit."))))
@@ -395,7 +395,7 @@
'("runescapeclassic") 'default
'RuneScape
"RuneScape Classic Wiki"
"https://classic.runescape.wiki/"
"https://classic.runescape.wiki/w/Main_Page"
"https://classic.runescape.wiki/images/Wiki.png"
(λ (props)
`((p "The RuneScape Classic Wiki was founded on April 19, 2009. In October 2018, the RuneScape Wiki left Fandom (then Wikia), citing their apathy towards the wiki and excessive advertisements, with the RuneScape Classic Wiki following suit."))))
@@ -19,7 +19,7 @@
[(not wikiname)
(response/output
#:code 400
#:mime-type "text/plain"
#:mime-type #"text/plain"
(λ (out)
(displayln "Requires wikiname and q parameters." out)))]
[(or (not q) (equal? q ""))
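The change above matters because `response/output` takes `#:mime-type` as a byte string (`bytes?`), not a character string, hence the `#"..."` literal:

(response/output
 #:code 400
 #:mime-type #"text/plain" ; bytes literal, not "text/plain"
 (λ (out)
   (displayln "Requires wikiname and q parameters." out)))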
@@ -53,7 +53,8 @@
;; grab the part after ?q= which is the search terms
(define query (dict-ref params 'q #f))
;; figure out which search provider we're going to use
(define search-provider (hash-ref search-providers (config-get 'feature_offline::search)))
(define search-provider (hash-ref search-providers (config-get 'feature_offline::search)
(λ () (error 'search-provider "unknown search provider configured"))))

;; external special:search url to link at the bottom of the page as the upstream source
(define external-search-url
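The new call adds `hash-ref`'s third argument, a failure thunk invoked when the key is missing, replacing the generic "no value found for key" error with a descriptive one. A minimal illustration (the hash contents are hypothetical):

(hash-ref (hasheq 'sqlite 'sqlite-provider) 'solr
          (λ () (error 'search-provider "unknown search provider configured")))
;; => error: search-provider: unknown search provider configured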
@@ -20,6 +20,7 @@
web-server/dispatchers/dispatch
; my libs
"application-globals.rkt"
"../archiver/archiver-database.rkt"
"config.rkt"
"data.rkt"
"log.rkt"
@@ -40,6 +41,9 @@

(define path-archive (anytime-path ".." "storage/archive"))

(when (config-true? 'feature_offline::only)
(void (get-slc)))

(define (page-wiki-offline req)
(response-handler
(define wikiname (path/param-path (first (url-path (request-uri req)))))
@@ -64,84 +68,94 @@
[else (error 'archive-format "unknown archive format configured")]))
(define fs-path (build-path path-archive wikiname (format (car archive-format) maybe-hashed-basename)))
(define source-url (format "https://~a.fandom.com/wiki/~a" wikiname (basename->name-for-query basename)))
(cond
[(not (file-exists? fs-path))
(unless (config-true? 'feature_offline::only)
(next-dispatcher))
(define mirror-path (url->string (request-uri req)))
(cond/var

[(file-exists? fs-path)
(when (config-true? 'debug)
(printf "using offline mode for ~v~n" fs-path))
(response-handler
(define data (with-input-from-file fs-path (cdr archive-format)))
(define article-title (jp "/parse/title" data))
(define original-page (html->xexp (preprocess-html-wiki (jp "/parse/text" data))))
(define page ((query-selector (λ (t a c) (has-class? "mw-parser-output" a)) original-page)))
(define initial-head-data ((head-data-getter wikiname) data))
(define head-data
(case theme
[(light dark)
(struct-copy head-data^ initial-head-data
[body-class (regexp-replace #rx"(theme-fandomdesktop-)(light|dark)"
(head-data^-body-class initial-head-data)
(format "\\1~a" theme))])]
[else initial-head-data]))
(define body
(generate-wiki-page
`(div (@ (class "unsaved-page"))
(style ".unsaved-page a { text-decoration: underline !important }")
(p "breezewiki.com doesn't have this page saved.")
(p "You can see this page by visiting a BreezeWiki mirror:")
(ul
(li (a (@ (href ,(format "https://antifandom.com~a" mirror-path))) "View on antifandom.com"))
(li (a (@ (href ,(format "https://bw.artemislena.eu~a" mirror-path))) "View on artemislena.eu"))
(li (a (@ (href ,source-url)) "or, you can see the original page on Fandom (ugh)")))
(p "If you'd like " ,wikiname ".fandom.com to be added to breezewiki.com, " (a (@ (href "https://lists.sr.ht/~cadence/breezewiki-requests")) "let me know about it!")))
(update-tree-wiki page wikiname)
#:req req
#:source-url source-url
#:wikiname wikiname
#:title (url-segments->guess-title segments)
#:title article-title
#:online-styles #f
#:head-data head-data
#:siteinfo (siteinfo-fetch wikiname)
))
(define redirect-msg ((query-selector (attribute-selector 'class "redirectMsg") body)))
(define redirect-query-parameter (dict-ref (url-query (request-uri req)) 'redirect "yes"))
(define headers
(build-headers
always-headers
; redirect-query-parameter: only the string "no" is significant:
; https://github.com/Wikia/app/blob/fe60579a53f16816d65dad1644363160a63206a6/includes/Wiki.php#L367
(when (and redirect-msg
(not (equal? redirect-query-parameter "no")))
(let* ([dest (get-attribute 'href (bits->attributes ((query-selector (λ (t a c) (eq? t 'a)) redirect-msg))))]
[value (bytes-append #"0;url=" (string->bytes/utf-8 dest))])
(header #"Refresh" value)))))
(when (config-true? 'debug)
; used for its side effects
; convert to string with error checking, error will be raised if xexp is invalid
(xexp->html body))
(response/output
#:code 200
#:headers always-headers
#:headers headers
(λ (out)
(write-html body out)))]
[#t
(when (config-true? 'debug)
(printf "using offline mode for ~v~n" fs-path))
(response-handler
(define data (with-input-from-file fs-path (cdr archive-format)))
(define article-title (jp "/parse/title" data))
(define original-page (html->xexp (preprocess-html-wiki (jp "/parse/text" data))))
(define page ((query-selector (λ (t a c) (has-class? "mw-parser-output" a)) original-page)))
(define initial-head-data ((head-data-getter wikiname) data))
(define head-data
(case theme
[(light dark)
(struct-copy head-data^ initial-head-data
[body-class (regexp-replace #rx"(theme-fandomdesktop-)(light|dark)"
(head-data^-body-class initial-head-data)
(format "\\1~a" theme))])]
[else initial-head-data]))
(define body
(generate-wiki-page
(update-tree-wiki page wikiname)
#:req req
#:source-url source-url
#:wikiname wikiname
#:title article-title
#:online-styles #f
#:head-data head-data
#:siteinfo (siteinfo-fetch wikiname)
))
(define redirect-msg ((query-selector (attribute-selector 'class "redirectMsg") body)))
(define redirect-query-parameter (dict-ref (url-query (request-uri req)) 'redirect "yes"))
(define headers
(build-headers
always-headers
; redirect-query-parameter: only the string "no" is significant:
; https://github.com/Wikia/app/blob/fe60579a53f16816d65dad1644363160a63206a6/includes/Wiki.php#L367
(when (and redirect-msg
(not (equal? redirect-query-parameter "no")))
(let* ([dest (get-attribute 'href (bits->attributes ((query-selector (λ (t a c) (eq? t 'a)) redirect-msg))))]
[value (bytes-append #"0;url=" (string->bytes/utf-8 dest))])
(header #"Refresh" value)))))
(when (config-true? 'debug)
; used for its side effects
; convert to string with error checking, error will be raised if xexp is invalid
(xexp->html body))
(response/output
#:code 200
#:headers headers
(λ (out)
(write-html body out))))])))
(write-html body out))))]

;; page not found on disk, perhaps it's a redirect? redirects are stored in the database
(var target (query-maybe-value* "select redirect from page where wikiname = ? and basename = ?" wikiname basename))
[target
(generate-redirect (basename->name-for-query target))]

;; breezewiki doesn't have the page archived, see if we can make a network request for it
[(not (config-true? 'feature_offline::only))
(next-dispatcher)]

;; no possible way to provide the page
[else
(define mirror-path (url->string (request-uri req)))
(define body
(generate-wiki-page
`(div (@ (class "unsaved-page"))
(style ".unsaved-page a { text-decoration: underline !important }")
(p "breezewiki.com doesn't have this page saved.")
(p "You can see this page by visiting a BreezeWiki mirror:")
(ul
(li (a (@ (href ,(format "https://antifandom.com~a" mirror-path))) "View on antifandom.com"))
(li (a (@ (href ,(format "https://bw.artemislena.eu~a" mirror-path))) "View on artemislena.eu"))
(li (a (@ (href ,source-url)) "or, you can see the original page on Fandom (ugh)")))
(p "If you'd like " ,wikiname ".fandom.com to be added to breezewiki.com, " (a (@ (href "https://lists.sr.ht/~cadence/breezewiki-requests")) "let me know about it!")))
#:req req
#:source-url source-url
#:wikiname wikiname
#:title (url-segments->guess-title segments)
#:online-styles #f
#:siteinfo (siteinfo-fetch wikiname)
))
(when (config-true? 'debug)
; used for its side effects
; convert to string with error checking, error will be raised if xexp is invalid
(xexp->html body))
(response/output
#:code 200
#:headers always-headers
(λ (out)
(write-html body out)))])))