Compare commits

...

2 Commits

Author SHA1 Message Date
Cadence Ember a57445abcb Archiver now saves redirects 2023-12-12 11:10:47 +13:00
Cadence Ember 27c9680f5b Solr indexer code cleanup 2023-12-12 10:35:49 +13:00
9 changed files with 397 additions and 323 deletions

View File

@ -35,9 +35,12 @@
(output-lines? #t)])) (output-lines? #t)]))
(define (update-width) (define (update-width)
(when (output-progress?) (when (output-progress?)
(with-charterm (case (system-type 'os)
(call-with-values (λ () (charterm-screen-size)) [(linux)
(λ (cols rows) (set! width cols)))))) (with-charterm
(call-with-values (λ () (charterm-screen-size))
(λ (cols rows) (set! width cols))))]
[else 100])))
(update-width) (update-width)
;; check ;; check
(when (or (not wikiname) (equal? wikiname "")) (when (or (not wikiname) (equal? wikiname ""))
@ -56,8 +59,8 @@
(define real-width (min (string-length basename) rest)) (define real-width (min (string-length basename) rest))
(define spare-width (- rest real-width)) (define spare-width (- rest real-width))
(define name-display (substring basename 0 real-width)) (define name-display (substring basename 0 real-width))
(define whitespace (make-string spare-width #\ )) (printf "\e[2K\r~a~a" prefix name-display)
(printf "~a~a~a\r" prefix name-display whitespace)])) (flush-output)]))
;; download all stages ;; download all stages
(for ([stage all-stages] (for ([stage all-stages]
[i (in-naturals 1)]) [i (in-naturals 1)])

View File

@ -48,7 +48,8 @@
((query-exec slc* "alter table wiki add column sitename TEXT") ((query-exec slc* "alter table wiki add column sitename TEXT")
(query-exec slc* "alter table wiki add column basepage TEXT") (query-exec slc* "alter table wiki add column basepage TEXT")
(query-exec slc* "alter table wiki add column license_text TEXT") (query-exec slc* "alter table wiki add column license_text TEXT")
(query-exec slc* "alter table wiki add column license_url TEXT")))) (query-exec slc* "alter table wiki add column license_url TEXT"))
((query-exec slc* "alter table page add column redirect"))))
(let do-migrate-step () (let do-migrate-step ()
(when (database-version . < . (length migrations)) (when (database-version . < . (length migrations))

View File

@ -1,5 +1,6 @@
#lang racket/base #lang racket/base
(require racket/file (require racket/file
racket/format
racket/function racket/function
racket/list racket/list
racket/path racket/path
@ -43,10 +44,18 @@
wikiname wikiname
(params->query '(("action" . "query") (params->query '(("action" . "query")
("meta" . "siteinfo") ("meta" . "siteinfo")
("siprop" . "general|rightsinfo|statistics") ("siprop" . "general|rightsinfo|statistics|namespaces")
("format" . "json") ("format" . "json")
("formatversion" . "2"))))) ("formatversion" . "2")))))
(define data (response-json (get dest-url))) (define data (response-json (get dest-url)))
(define content-nss
(sort
(for/list ([(k v) (in-hash (jp "/query/namespaces" data))]
#:do [(define id (hash-ref v 'id))]
#:when (and (id . < . 2900) ; exclude maps namespace
(hash-ref v 'content))) ; exclude non-content and talk namespaces
id)
<))
(define exists? (query-maybe-value* "select progress from wiki where wikiname = ?" wikiname)) (define exists? (query-maybe-value* "select progress from wiki where wikiname = ?" wikiname))
(if (and exists? (not (sql-null? exists?))) (if (and exists? (not (sql-null? exists?)))
(query-exec* "update wiki set sitename = ?, basepage = ?, license_text = ?, license_url = ? where wikiname = ?" (query-exec* "update wiki set sitename = ?, basepage = ?, license_text = ?, license_url = ? where wikiname = ?"
@ -61,7 +70,8 @@
(second (regexp-match #rx"/wiki/(.*)" (jp "/query/general/base" data))) (second (regexp-match #rx"/wiki/(.*)" (jp "/query/general/base" data)))
(jp "/query/rightsinfo/text" data) (jp "/query/rightsinfo/text" data)
(jp "/query/rightsinfo/url" data))) (jp "/query/rightsinfo/url" data)))
(jp "/query/statistics/articles" data)) (values (jp "/query/statistics/articles" data)
content-nss))
(define (check-style-for-images wikiname path) (define (check-style-for-images wikiname path)
@ -131,48 +141,57 @@
;; done yet? ;; done yet?
(unless (and (real? wiki-progress) (wiki-progress . >= . 1)) (unless (and (real? wiki-progress) (wiki-progress . >= . 1))
;; Count total pages ;; Count total pages
(define num-pages (insert-wiki-entry wikiname)) (define-values (num-pages namespaces) (insert-wiki-entry wikiname))
;; Download the entire index of pages ;; Download the entire index of pages
(define basenames (for*/fold ([total 0])
(let loop ([path-with-namefrom "/wiki/Local_Sitemap"] ([namespace namespaces]
[basenames-previous-pages null]) [redir-filter '("nonredirects" "redirects")])
;; Download the current index page (let loop ([apcontinue ""]
(define url (format "https://~a.fandom.com~a" wikiname path-with-namefrom)) [basenames null])
(define r (get url)) (cond
;; Metadata from this page (the link to the next page) [apcontinue
(define page (html->xexp (bytes->string/utf-8 (response-body r)))) (define url (format "https://~a.fandom.com/api.php?~a"
(define link-namefrom wikiname
((query-selector (λ (t a c x) (and (eq? t 'a) (params->query `(("action" . "query")
(pair? x) ("list" . "allpages")
(string-contains? (car x) "Next page") ("apnamespace" . ,(~a namespace))
(let ([href (get-attribute 'href a)] ) ("apfilterredir" . ,redir-filter)
(and href (string-contains? href "/wiki/Local_Sitemap"))))) ("aplimit" . "500")
page #:include-text? #t))) ("apcontinue" . ,apcontinue)
;; Content from this page ("format" . "json")
(define basenames-this-page ("formatversion" . "2")))))
(for/list ([link (in-producer ;; Download the current listing page
(query-selector (define res (get url))
(λ (t a c) (eq? t 'a)) (define json (response-json res))
((query-selector (λ (t a c) (has-class? "mw-allpages-chunk" a)) page))) ;; Content from this page
#f)]) (define current-basenames
(local-encoded-url->basename (get-attribute 'href (bits->attributes link))))) (for/list ([page (jp "/query/allpages" json)])
;; Call the progress callback (title->basename (jp "/title" page))))
(define all-basenames (append basenames-previous-pages basenames-this-page)) (when ((length current-basenames) . > . 0)
(callback (length all-basenames) num-pages (last all-basenames)) ;; Report
;; Recurse to download from the next page (if (equal? redir-filter "nonredirects")
(if link-namefrom (callback (+ (length basenames) (length current-basenames) total) num-pages (last current-basenames))
(loop (get-attribute 'href (bits->attributes link-namefrom)) all-basenames) (callback total num-pages (last current-basenames))))
all-basenames))) ;; Loop
;; Save those pages into the database (loop (jp "/continue/apcontinue" json #f) (append basenames current-basenames))]
;; SQLite can have a maximum of 32766 parameters in a single query [else
(for ([slice (in-slice 32760 basenames)]) ;; All done with this (loop)! Save those pages into the database
(define query-template (string-join (make-list (length slice) "(?1, ?, 0)") ", " #:before-first "insert or ignore into page (wikiname, basename, progress) values ")) ;; SQLite can have a maximum of 32766 parameters in a single query
(call-with-transaction (begin0
(get-slc) ;; next for*/fold
(λ () (if (equal? redir-filter "nonredirects")
(apply query-exec* query-template wikiname slice) (+ (length basenames) total)
;; Record that we have the complete list of pages total) ; redirects don't count for the site statistics total
(query-exec* "update wiki set progress = 1 where wikiname = ?" wikiname)))))) (call-with-transaction
(get-slc)
(λ ()
(for ([slice (in-slice 32760 basenames)])
(define query-template
(string-join #:before-first "insert or ignore into page (wikiname, redirect, basename, progress) values "
(make-list (length slice) "(?1, ?2, ?, 0)") ", "))
(apply query-exec* query-template wikiname (if (equal? redir-filter "redirects") 1 sql-null) slice)))))])))
;; Record that we have the complete list of pages
(query-exec* "update wiki set progress = 1 where wikiname = ?" wikiname)))
;; 2. Download each page via API and: ;; 2. Download each page via API and:
@ -183,7 +202,7 @@
(define save-dir (build-path archive-root wikiname)) (define save-dir (build-path archive-root wikiname))
(make-directory* save-dir) (make-directory* save-dir)
;; gather list of basenames to download (that aren't yet complete) ;; gather list of basenames to download (that aren't yet complete)
(define basenames (query-list* "select basename from page where wikiname = ? and progress < ?" (define basenames (query-list* "select basename from page where wikiname = ? and progress < ? and redirect is null"
wikiname max-page-progress)) wikiname max-page-progress))
;; counter of complete/incomplete basenames ;; counter of complete/incomplete basenames
(define already-done-count (define already-done-count
@ -222,10 +241,41 @@
(query-exec* "update page set progress = 1 where wikiname = ? and basename = ?" (query-exec* "update page set progress = 1 where wikiname = ? and basename = ?"
wikiname basename) wikiname basename)
(callback i total-count basename)) (callback i total-count basename))
;; save redirects as well
(save-redirects wikiname callback (+ already-done-count (length basenames)) total-count)
;; saved all pages, register that fact in the database ;; saved all pages, register that fact in the database
(query-exec* "update wiki set progress = 2 where wikiname = ?" wikiname)) (query-exec* "update wiki set progress = 2 where wikiname = ?" wikiname))
;; 2.5. Download each redirect-target via API and save mapping in database
(define (save-redirects wikiname callback already-done-count total-count)
(define basenames (query-list* "select basename from page where wikiname = ? and progress < ? and redirect = 1"
wikiname max-page-progress))
;; loop through basenames, in slices of 50 (MediaWiki API max per request), and download
(for ([basename basenames]
[i (in-naturals (add1 already-done-count))])
(define dest-url
(format "https://~a.fandom.com/api.php?~a"
wikiname
(params->query `(("action" . "query")
("prop" . "links")
("titles" . ,(basename->name-for-query basename))
("format" . "json")
("formatversion" . "2")))))
(define res (get dest-url))
(define json (response-json res))
(define dest-title (jp "/query/pages/0/links/0/title" json #f))
(callback i total-count basename)
(cond
[dest-title
;; store it
(define dest-basename (title->basename dest-title))
(query-exec* "update page set progress = 1, redirect = ? where wikiname = ? and basename = ?" dest-basename wikiname basename)]
[else
;; the page just doesn't exist
(query-exec* "delete from page where wikiname = ? and basename = ?" wikiname basename)])))
;; 3. Download CSS and: ;; 3. Download CSS and:
;; * Save CSS to file ;; * Save CSS to file
;; * Record style images to database ;; * Record style images to database

View File

@ -1,202 +1,200 @@
#lang racket #lang cli
(require racket/function (require (for-syntax racket/base))
racket/future (require racket/format
racket/match racket/function
racket/path racket/future
racket/promise racket/match
racket/port racket/path
racket/string racket/promise
file/gunzip racket/port
db racket/runtime-path
db/unsafe/sqlite3 racket/string
net/http-easy file/gunzip
json db
json-pointer db/unsafe/sqlite3
"../lib/html-parsing/main.rkt" net/http-easy
"../lib/xexpr-utils.rkt" json
"../lib/tree-updater.rkt") json-pointer
"../lib/html-parsing/main.rkt"
(define-syntax (seq stx) "../lib/xexpr-utils.rkt"
(syntax-case stx () "../lib/tree-updater.rkt")
[(_ body ...)
#`(for ([op (list (lambda () body) ...)] (flag (read-from-cache?)
[i (in-naturals)]) ("-c" "--read-from-cache" "read from last run cache instead of rebuilding documents")
(define res (op)) (read-from-cache? #t))
(when (>= (response-status-code res) 400)
(error 'seq "op #~a: status code was ~a: ~v" i (response-status-code res) (response-json res))) (define-runtime-path storage-path "../storage/archive")
(define taskuid (json-pointer-value "/taskUid" (response-json res)))
(for/or ([ticks (in-naturals)] ;; ***************************************************************************************************
[res2 (in-producer (lambda () (get (format "http://localhost:7700/tasks/~a" taskuid))))]) ;; Progress bar display
(define status (json-pointer-value "/status" (response-json res2))) ;; ***************************************************************************************************
(case status
[("enqueued" "processing") (struct progress^ (n max title) #:transparent)
(sleep 1)
#f] (define (make-m-s seconds)
[("succeeded") (define-values (eta-m eta-s) (quotient/remainder seconds 60))
(printf "op #~a: ~a (~a ticks)~n" i status ticks) (format "~a:~a" eta-m (~a eta-s #:width 2 #:align 'right #:pad-string "0")))
#t]
[else (define (make-progress get-p [history-size 20])
(error 'seq "op #~a: task status was ~a: ~v" i status res2)])))])) (define update-sleep 1)
(define name-width 30)
(define (class-has? attributes substrs) (define max-width 105)
(define cl (or (get-attribute 'class attributes) "")) (define history (make-vector history-size 0))
(ormap (λ (substr) (string-contains? cl substr)) substrs)) (define history-pointer 0)
(define elapsed 0)
(define (updater element element-type attributes children) (define (report-progress)
(cond (define p (get-p))
[(class-has? attributes '("collapsed" "selflink" "label" "toc" "editsection" "reviews")) (define history-cycle (vector-ref history history-pointer))
(list 'div '() '())] (vector-set! history history-pointer (progress^-n p))
[#t (set! history-pointer (modulo (add1 history-pointer) history-size))
(list element-type attributes children)])) (set! elapsed (add1 elapsed))
(define-values (eta-display diff-per-second)
(define slc (sqlite3-connect #:database "../storage/fts-separate.db")) (cond
(sqlite3-load-extension slc "fts5") [((progress^-n p) . >= . (progress^-max p)) (values (format "~a **" (make-m-s elapsed)) (format "** ~a" (quotient (progress^-max p) (max elapsed 1))))]
[(= history-cycle 0) (values "-:--" "--")]
(define (writer tables-mode? page) [else (define diff-per-second (/ (- (progress^-n p) history-cycle) (* history-size update-sleep)))
(define (writer-inner page) (define eta-total
(for ([bit page]) (if (diff-per-second . > . 0)
(cond (floor (round (/ (- (progress^-max p) (progress^-n p)) diff-per-second)))
[(and tables-mode? (pair? bit) (memq (car bit) '(h1 h2 h3 p blockquote q))) (void)] 0))
[(and (not tables-mode?) (pair? bit) (memq (car bit) '(ul ol dl table))) (void)] (values (make-m-s eta-total)
[(memq bit '(div p li td dd dt br)) (displayln "")] (round diff-per-second))]))
[(symbol? bit) (void)] (define left (format "~a/~a ~a/s ~a ~a%"
[(and (pair? bit) (eq? (car bit) '*COMMENT*)) (void)] (~a (progress^-n p) #:width (string-length (~a (progress^-max p))) #:align 'right #:pad-string " ")
[(and (pair? bit) (eq? (car bit) '@)) (void)] (progress^-max p)
[(pair? bit) (writer-inner bit)] diff-per-second
[(string? bit) (display bit)]))) eta-display
(writer-inner page)) (floor (* 100 (/ (progress^-n p) (progress^-max p))))))
(define name-display (~a (progress^-title p) #:max-width name-width #:limit-marker "..."))
(define (write-and-post-process tables-mode? page) (define remaining-space (- max-width name-width (string-length left) 2))
(define text (with-output-to-string (λ () (writer tables-mode? page)))) (define bar-width
;; (define text-no-numbers (regexp-replace* #px"(?:-|[+$£€¥] *)?[0-9,.]{2,}%?\\s*" text "")) (floor (* (sub1 remaining-space)
(define shrink-text (regexp-replace* #px"([ \t]*\r?\n+)+" text "\n")) (/ (progress^-n p) (progress^-max p)))))
shrink-text) (define bar (string-append (make-string bar-width #\=)
">"
(define wikiname "bloons") (make-string (- remaining-space bar-width) #\ )))
(define tablename (format "page_~a" wikiname)) (printf "\e[2K\r~a~a~a" left bar name-display)
(flush-output))
(define ((extract f)) ; f - filename (define (report-progress-loop)
(with-handlers (sleep update-sleep)
([exn:fail? (λ (err) (println f) (raise err))]) (report-progress)
(define j (report-progress-loop))
(case (path-get-extension f) (define t (thread report-progress-loop))
[(#".json") (define (quit)
(with-input-from-file f (λ () (read-json)))] (kill-thread t)
[(#".gz") (report-progress)
(define-values (in out) (make-pipe)) (displayln ""))
(with-input-from-file f (λ () (gunzip-through-ports (current-input-port) out))) quit)
(read-json in)]
[else #f])) ;; ***************************************************************************************************
(define title (json-pointer-value "/parse/title" j)) ;; Page text extractor
(define pageid (json-pointer-value "/parse/pageid" j)) ;; ***************************************************************************************************
(define page-html (preprocess-html-wiki (json-pointer-value "/parse/text" j)))
(define page (update-tree updater (html->xexp page-html))) (define (class-has? attributes substrs)
(define body (write-and-post-process #f page)) (define cl (or (get-attribute 'class attributes) ""))
(define table (write-and-post-process #t page)) (ormap (λ (substr) (string-contains? cl substr)) substrs))
(values title body table pageid)))
(define (updater element element-type attributes children)
(define results (cond
(for/list ([f (directory-list (format "../storage/archive/~a" wikiname) #:build? #t)] [(class-has? attributes '("collapsed" "selflink" "label" "toc" "editsection" "reviews"))
#:when (member (path-get-extension f) '(#".json" #".gz"))) (list 'div '() '())]
(extract f))) [#t
(list element-type attributes children)]))
;; ***************************************************************************************************
;; TESTING WRITER (define (writer tables-mode? page)
;; *************************************************************************************************** (define (writer-inner page)
#;(for/first ([fut results] (for ([bit page])
[i (in-naturals 1)] (cond
#:when (i . >= . 4859)) [(and tables-mode? (pair? bit) (memq (car bit) '(h1 h2 h3 p blockquote q))) (void)]
(define-values (title body table pageid) (fut)) [(and (not tables-mode?) (pair? bit) (memq (car bit) '(ul ol dl table))) (void)]
(println title) [(memq bit '(div p li td dd dt br)) (displayln "")]
(println body) [(symbol? bit) (void)]
(println table)) [(and (pair? bit) (eq? (car bit) '*COMMENT*)) (void)]
[(and (pair? bit) (eq? (car bit) '@)) (void)]
(println "inserting...") [(pair? bit) (writer-inner bit)]
[(string? bit) (display bit)])))
;; *************************************************************************************************** (writer-inner page))
;; SQLite FTS5
;; *************************************************************************************************** (define (write-and-post-process tables-mode? page)
#;(begin (define text (with-output-to-string (λ () (writer tables-mode? page))))
(query-exec slc "begin transaction") ;; (define text-no-numbers (regexp-replace* #px"(?:-|[+$£€¥] *)?[0-9,.]{2,}%?\\s*" text ""))
#;(query-exec slc (format "create virtual table \"~a\" using fts5 (title, body, tokenize='porter unicode61')" wikiname)) (define shrink-text (regexp-replace* #px"([ \t]*\r?\n+)+" text "\n"))
(time shrink-text)
(for ([fut results]
[i (in-naturals 1)]) (define ((extract f)) ; f - filename
(display "-") (with-handlers
(when (and (> i 0) (= (modulo i 100) 0)) ([exn:fail? (λ (err) (printf "extract: ~a: ~v~n" f err))])
(println i)) (define j
(define-values (title shrink-text) (fut)) (case (path-get-extension f)
(query-exec slc (format "insert into \"~a\" (title, body) values (?, ?)" tablename) title shrink-text))) [(#".json")
(with-input-from-file f (λ () (read-json)))]
(println "running optimize...") [(#".gz")
(query-exec slc (format "insert into \"~a\" (\"~a\") values ('optimize')" tablename tablename)) (define-values (in out) (make-pipe))
(with-input-from-file f (λ () (gunzip-through-ports (current-input-port) out)))
(println "committing...") (read-json in)]
(query-exec slc "commit")) [else #f]))
(define title (json-pointer-value "/parse/title" j))
;; *************************************************************************************************** (define pageid (json-pointer-value "/parse/pageid" j))
;; Solr (define page-html (preprocess-html-wiki (json-pointer-value "/parse/text" j)))
;; *************************************************************************************************** (define page (update-tree updater (html->xexp page-html)))
(begin (define body (write-and-post-process #f page))
(define data (define table (write-and-post-process #t page))
(cond (list title body table pageid)))
#;[(file-exists? "cache.rkt")
(println "reading in...") ;; ***************************************************************************************************
(with-input-from-file "cache.rkt" (λ () (read)))] ;; Program, loop, Solr APIs
[else ;; ***************************************************************************************************
(define data
(for/list ([fut results] (program
[i (in-naturals 1)]) (start [wikiname "wikiname to download"])
(display "-")
(when (and (> i 0) (= (modulo i 100) 0)) (define results
(println i)) (for/list ([f (directory-list (build-path storage-path wikiname) #:build? #t)]
(define-values (title body table pageid) (fut)) #:when (member (path-get-extension f) '(#".gz")))
(define len (string-length body)) (extract f)))
`#hasheq((id . ,(number->string pageid))
(title . ,title) (define data
(body . ,body) (cond
(table . ,table) [(and (read-from-cache?) (file-exists? "cache.rkt"))
(len . ,len)))) (displayln "Reading in...")
(with-input-from-file "cache.rkt" (λ () (read)))]
(println "writing out...") [else
(with-output-to-file "cache.rkt" (λ () (write data)) #:exists 'truncate/replace) (define x (box (progress^ 0 1 "...")))
data])) (define quit (make-progress (λ () (unbox x))))
(define data
(println "posting...") (for/list ([fut results]
(define res [i (in-naturals 1)]
(post (format "http://localhost:8983/solr/~a/update?commit=true" wikiname) #:do [(define page (fut))]
#:json data))) #:when (not (void? page)))
(match-define (list title body table pageid) page)
;; *************************************************************************************************** (define len (string-length body))
;; Meilisearch (set-box! x (progress^ i (length results) title))
;; *************************************************************************************************** `#hasheq((id . ,(number->string pageid))
#;(begin (title . ,title)
(seq (body . ,body)
(put (format "http://localhost:7700/indexes/~a/settings/searchable-attributes" wikiname) (table . ,table)
#:json '("title" "body")) (len . ,len))))
(put (format "http://localhost:7700/indexes/~a/settings/ranking-rules" wikiname) (quit)
#:json '("words" "typo" #;"proximity" "attribute" "sort" "exactness" #;"len:desc"))
(call-with-input-file "stop-words.json" (display "Writing out... ")
(λ (in) (flush-output)
(put (format "http://localhost:7700/indexes/~a/settings/stop-words" wikiname) (with-output-to-file "cache.rkt" (λ () (write data)) #:exists 'truncate/replace)
#:headers '#hasheq((Content-Type . "application/json")) data]))
#:data in))))
(define data (display "Converting... ")
(for/list ([fut results] (flush-output)
[i (in-naturals 1)]) (define ser (jsexpr->bytes data))
(display "-") (define ser-port (open-input-bytes ser))
(when (and (> i 0) (= (modulo i 100) 0)) (define quit (make-progress (λ () (progress^ (ceiling (/ (file-position ser-port) 64 1024))
(println i)) (ceiling (/ (bytes-length ser) 64 1024))
(define-values (title body pageid) (fut)) "Posting..."))
(define len (string-length body)) 2))
`#hasheq((id . ,pageid) (define res
(title . ,title) (post (format "http://localhost:8983/solr/~a/update?commit=true" wikiname)
(body . ,body) #:data ser-port
(len . ,len)))) #:headers '#hasheq((Content-Type . "application/json"))
(define res #:timeouts (make-timeout-config #:lease 5 #:connect 5 #:request 300)))
(post (format "http://localhost:7700/indexes/~a/documents" wikiname) (quit)
#:json data)) (displayln (response-status-line res)))
(seq res)
(println (response-json res))) (run start)
(disconnect slc)

View File

@ -7,6 +7,7 @@
local-encoded-url->segments local-encoded-url->segments
url-segments->basename url-segments->basename
local-encoded-url->basename local-encoded-url->basename
title->basename
basename->name-for-query basename->name-for-query
url-segments->guess-title) url-segments->guess-title)
@ -21,6 +22,12 @@
(define (local-encoded-url->basename str) ; '("wiki" "Page_title"), no extension or dir prefix (define (local-encoded-url->basename str) ; '("wiki" "Page_title"), no extension or dir prefix
(url-segments->basename (local-encoded-url->segments str))) (url-segments->basename (local-encoded-url->segments str)))
(define (title->basename title) ; "Page title/Strategies" -> "Page_title#Strategies" filename encoded, no extension or dir prefi
(define elements (string-split (string-replace title " " "_") "/"))
(define extra-encoded (map (λ (s) (bytes->string/latin-1 (percent-encode s filename-set #f))) elements))
(define basic-filename (string-join extra-encoded "#"))
basic-filename)
(define (basename->name-for-query str) (define (basename->name-for-query str)
(uri-decode (regexp-replace* #rx"#" str "/"))) (uri-decode (regexp-replace* #rx"#" str "/")))

View File

@ -377,7 +377,7 @@
'("runescape") 'default '("runescape") 'default
'RuneScape 'RuneScape
"RuneScape Wiki" "RuneScape Wiki"
"https://runescape.wiki/" "https://runescape.wiki/w/Main_Page"
"https://runescape.wiki/images/Wiki.png" "https://runescape.wiki/images/Wiki.png"
(λ (props) (λ (props)
`((p "The RuneScape Wiki was founded on April 8, 2005. In October 2018, the wiki left Fandom (then Wikia), citing their apathy towards the wiki and excessive advertisements.")))) `((p "The RuneScape Wiki was founded on April 8, 2005. In October 2018, the wiki left Fandom (then Wikia), citing their apathy towards the wiki and excessive advertisements."))))
@ -386,7 +386,7 @@
'("oldschoolrunescape") 'default '("oldschoolrunescape") 'default
'RuneScape 'RuneScape
"Old School RuneScape Wiki" "Old School RuneScape Wiki"
"https://oldschool.runescape.wiki/" "https://oldschool.runescape.wiki/w/Main_Page"
"https://oldschool.runescape.wiki/images/Wiki.png" "https://oldschool.runescape.wiki/images/Wiki.png"
(λ (props) (λ (props)
`((p "The Old School RuneScape Wiki was founded on February 14, 2013. In October 2018, the RuneScape Wiki left Fandom (then Wikia), citing their apathy towards the wiki and excessive advertisements, with the Old School RuneScape Wiki following suit.")))) `((p "The Old School RuneScape Wiki was founded on February 14, 2013. In October 2018, the RuneScape Wiki left Fandom (then Wikia), citing their apathy towards the wiki and excessive advertisements, with the Old School RuneScape Wiki following suit."))))
@ -395,7 +395,7 @@
'("runescapeclassic") 'default '("runescapeclassic") 'default
'RuneScape 'RuneScape
"RuneScape Classic Wiki" "RuneScape Classic Wiki"
"https://classic.runescape.wiki/" "https://classic.runescape.wiki/w/Main_Page"
"https://classic.runescape.wiki/images/Wiki.png" "https://classic.runescape.wiki/images/Wiki.png"
(λ (props) (λ (props)
`((p "The Old School RuneScape Wiki was founded on April 19, 2009. In October 2018, the RuneScape Wiki left Fandom (then Wikia), citing their apathy towards the wiki and excessive advertisements, with the RuneScape Classic Wiki following suit.")))) `((p "The Old School RuneScape Wiki was founded on April 19, 2009. In October 2018, the RuneScape Wiki left Fandom (then Wikia), citing their apathy towards the wiki and excessive advertisements, with the RuneScape Classic Wiki following suit."))))

View File

@ -19,7 +19,7 @@
[(not wikiname) [(not wikiname)
(response/output (response/output
#:code 400 #:code 400
#:mime-type "text/plain" #:mime-type #"text/plain"
(λ (out) (λ (out)
(displayln "Requires wikiname and q parameters." out)))] (displayln "Requires wikiname and q parameters." out)))]
[(or (not q) (equal? q "")) [(or (not q) (equal? q ""))

View File

@ -53,7 +53,8 @@
;; grab the part after ?q= which is the search terms ;; grab the part after ?q= which is the search terms
(define query (dict-ref params 'q #f)) (define query (dict-ref params 'q #f))
;; figure out which search provider we're going to use ;; figure out which search provider we're going to use
(define search-provider (hash-ref search-providers (config-get 'feature_offline::search))) (define search-provider (hash-ref search-providers (config-get 'feature_offline::search)
(λ () (error 'search-provider "unknown search provider configured"))))
;; external special:search url to link at the bottom of the page as the upstream source ;; external special:search url to link at the bottom of the page as the upstream source
(define external-search-url (define external-search-url

View File

@ -20,6 +20,7 @@
web-server/dispatchers/dispatch web-server/dispatchers/dispatch
; my libs ; my libs
"application-globals.rkt" "application-globals.rkt"
"../archiver/archiver-database.rkt"
"config.rkt" "config.rkt"
"data.rkt" "data.rkt"
"log.rkt" "log.rkt"
@ -40,6 +41,9 @@
(define path-archive (anytime-path ".." "storage/archive")) (define path-archive (anytime-path ".." "storage/archive"))
(when (config-true? 'feature_offline::only)
(void (get-slc)))
(define (page-wiki-offline req) (define (page-wiki-offline req)
(response-handler (response-handler
(define wikiname (path/param-path (first (url-path (request-uri req))))) (define wikiname (path/param-path (first (url-path (request-uri req)))))
@ -64,84 +68,94 @@
[else (error 'archive-format "unknown archive format configured")])) [else (error 'archive-format "unknown archive format configured")]))
(define fs-path (build-path path-archive wikiname (format (car archive-format) maybe-hashed-basename))) (define fs-path (build-path path-archive wikiname (format (car archive-format) maybe-hashed-basename)))
(define source-url (format "https://~a.fandom.com/wiki/~a" wikiname (basename->name-for-query basename))) (define source-url (format "https://~a.fandom.com/wiki/~a" wikiname (basename->name-for-query basename)))
(cond (cond/var
[(not (file-exists? fs-path))
(unless (config-true? 'feature_offline::only) [(file-exists? fs-path)
(next-dispatcher)) (when (config-true? 'debug)
(define mirror-path (url->string (request-uri req))) (printf "using offline mode for ~v~n" fs-path))
(response-handler
(define data (with-input-from-file fs-path (cdr archive-format)))
(define article-title (jp "/parse/title" data))
(define original-page (html->xexp (preprocess-html-wiki (jp "/parse/text" data))))
(define page ((query-selector (λ (t a c) (has-class? "mw-parser-output" a)) original-page)))
(define initial-head-data ((head-data-getter wikiname) data))
(define head-data
(case theme
[(light dark)
(struct-copy head-data^ initial-head-data
[body-class (regexp-replace #rx"(theme-fandomdesktop-)(light|dark)"
(head-data^-body-class initial-head-data)
(format "\\1~a" theme))])]
[else initial-head-data]))
(define body (define body
(generate-wiki-page (generate-wiki-page
`(div (@ (class "unsaved-page")) (update-tree-wiki page wikiname)
(style ".unsaved-page a { text-decoration: underline !important }")
(p "breezewiki.com doesn't have this page saved.")
(p "You can see this page by visiting a BreezeWiki mirror:")
(ul
(li (a (@ (href ,(format "https://antifandom.com~a" mirror-path))) "View on antifandom.com"))
(li (a (@ (href ,(format "https://bw.artemislena.eu~a" mirror-path))) "View on artemislena.eu"))
(li (a (@ (href ,source-url)) "or, you can see the original page on Fandom (ugh)")))
(p "If you'd like " ,wikiname ".fandom.com to be added to breezewiki.com, " (a (@ (href "https://lists.sr.ht/~cadence/breezewiki-requests")) "let me know about it!")))
#:req req #:req req
#:source-url source-url #:source-url source-url
#:wikiname wikiname #:wikiname wikiname
#:title (url-segments->guess-title segments) #:title article-title
#:online-styles #f #:online-styles #f
#:head-data head-data
#:siteinfo (siteinfo-fetch wikiname) #:siteinfo (siteinfo-fetch wikiname)
)) ))
(define redirect-msg ((query-selector (attribute-selector 'class "redirectMsg") body)))
(define redirect-query-parameter (dict-ref (url-query (request-uri req)) 'redirect "yes"))
(define headers
(build-headers
always-headers
; redirect-query-parameter: only the string "no" is significant:
; https://github.com/Wikia/app/blob/fe60579a53f16816d65dad1644363160a63206a6/includes/Wiki.php#L367
(when (and redirect-msg
(not (equal? redirect-query-parameter "no")))
(let* ([dest (get-attribute 'href (bits->attributes ((query-selector (λ (t a c) (eq? t 'a)) redirect-msg))))]
[value (bytes-append #"0;url=" (string->bytes/utf-8 dest))])
(header #"Refresh" value)))))
(when (config-true? 'debug) (when (config-true? 'debug)
; used for its side effects ; used for its side effects
; convert to string with error checking, error will be raised if xexp is invalid ; convert to string with error checking, error will be raised if xexp is invalid
(xexp->html body)) (xexp->html body))
(response/output (response/output
#:code 200 #:code 200
#:headers always-headers #:headers headers
(λ (out) (λ (out)
(write-html body out)))] (write-html body out))))]
[#t
(when (config-true? 'debug) ;; page not found on disk, perhaps it's a redirect? redirects are stored in the database
(printf "using offline mode for ~v~n" fs-path)) (var target (query-maybe-value* "select redirect from page where wikiname = ? and basename = ?" wikiname basename))
(response-handler [target
(define data (with-input-from-file fs-path (cdr archive-format))) (generate-redirect (basename->name-for-query target))]
(define article-title (jp "/parse/title" data))
(define original-page (html->xexp (preprocess-html-wiki (jp "/parse/text" data)))) ;; breezewiki doesn't have the page archived, see if we can make a network request for it
(define page ((query-selector (λ (t a c) (has-class? "mw-parser-output" a)) original-page))) [(not (config-true? 'feature_offline::only))
(define initial-head-data ((head-data-getter wikiname) data)) (next-dispatcher)]
(define head-data
(case theme ;; no possible way to provide the page
[(light dark) [else
(struct-copy head-data^ initial-head-data (define mirror-path (url->string (request-uri req)))
[body-class (regexp-replace #rx"(theme-fandomdesktop-)(light|dark)" (define body
(head-data^-body-class initial-head-data) (generate-wiki-page
(format "\\1~a" theme))])] `(div (@ (class "unsaved-page"))
[else initial-head-data])) (style ".unsaved-page a { text-decoration: underline !important }")
(define body (p "breezewiki.com doesn't have this page saved.")
(generate-wiki-page (p "You can see this page by visiting a BreezeWiki mirror:")
(update-tree-wiki page wikiname) (ul
#:req req (li (a (@ (href ,(format "https://antifandom.com~a" mirror-path))) "View on antifandom.com"))
#:source-url source-url (li (a (@ (href ,(format "https://bw.artemislena.eu~a" mirror-path))) "View on artemislena.eu"))
#:wikiname wikiname (li (a (@ (href ,source-url)) "or, you can see the original page on Fandom (ugh)")))
#:title article-title (p "If you'd like " ,wikiname ".fandom.com to be added to breezewiki.com, " (a (@ (href "https://lists.sr.ht/~cadence/breezewiki-requests")) "let me know about it!")))
#:online-styles #f #:req req
#:head-data head-data #:source-url source-url
#:siteinfo (siteinfo-fetch wikiname) #:wikiname wikiname
)) #:title (url-segments->guess-title segments)
(define redirect-msg ((query-selector (attribute-selector 'class "redirectMsg") body))) #:online-styles #f
(define redirect-query-parameter (dict-ref (url-query (request-uri req)) 'redirect "yes")) #:siteinfo (siteinfo-fetch wikiname)
(define headers ))
(build-headers (when (config-true? 'debug)
always-headers ; used for its side effects
; redirect-query-parameter: only the string "no" is significant: ; convert to string with error checking, error will be raised if xexp is invalid
; https://github.com/Wikia/app/blob/fe60579a53f16816d65dad1644363160a63206a6/includes/Wiki.php#L367 (xexp->html body))
(when (and redirect-msg (response/output
(not (equal? redirect-query-parameter "no"))) #:code 200
(let* ([dest (get-attribute 'href (bits->attributes ((query-selector (λ (t a c) (eq? t 'a)) redirect-msg))))] #:headers always-headers
[value (bytes-append #"0;url=" (string->bytes/utf-8 dest))]) (λ (out)
(header #"Refresh" value))))) (write-html body out)))])))
(when (config-true? 'debug)
; used for its side effects
; convert to string with error checking, error will be raised if xexp is invalid
(xexp->html body))
(response/output
#:code 200
#:headers headers
(λ (out)
(write-html body out))))])))