Create archiver and offline code handlers
Somewhat messy. Will clean up gradually in future commits.
This commit is contained in:
parent
b8e5fb8dc5
commit
c7cce5479d
46 changed files with 4274 additions and 407 deletions
3
archiver/archive-info.rkt
Normal file
3
archiver/archive-info.rkt
Normal file
|
@@ -0,0 +1,3 @@
|
|||
#lang info
|
||||
|
||||
(define build-deps '("rackunit-lib" "web-server-lib" "http-easy-lib" "html-parsing" "html-writing" "json-pointer" "ini-lib" "memo" "net-cookies-lib" "gui-easy-lib" "sql" "charterm" "cli"))
|
71
archiver/archiver-cli.rkt
Normal file
71
archiver/archiver-cli.rkt
Normal file
|
@@ -0,0 +1,71 @@
|
|||
#lang cli
|
||||
(require charterm
|
||||
"archiver.rkt")
|
||||
|
||||
(help (usage "Downloads a single Fandom wiki in BreezeWiki offline format."
|
||||
""
|
||||
"Downloaded pages go into `archive/` next to the executable."
|
||||
"Database goes into `archiver.db*` next to the executable."
|
||||
"The database is necessary to store your download progress and resume where you left off if the process is interrupted.")
|
||||
(ps ""
|
||||
"Default output style is `progress` in a tty and `lines` otherwise."))
|
||||
|
||||
(flag (output-quiet?)
|
||||
("-q" "--output-quiet" "disable progress output")
|
||||
(output-quiet? #t))
|
||||
|
||||
(flag (output-lines?)
|
||||
("-l" "--output-lines" "output the name of each file downloaded")
|
||||
(output-lines? #t))
|
||||
|
||||
(flag (output-progress?)
|
||||
("-p" "--output-progress" "progress output for terminals")
|
||||
(output-progress? #t))
|
||||
|
||||
(constraint (one-of output-quiet? output-lines? output-progress?))
|
||||
|
||||
(program
 (start [wikiname "wikiname to download"])
 ;; Entry point: download the given wiki, reporting progress according to the
 ;; output-quiet?/output-lines?/output-progress? flags declared above.
 ;; set up arguments
 (define width 80)
 ;; If no output style was chosen, default to `progress` on a tty, `lines` otherwise.
 (when (not (or (output-quiet?) (output-lines?) (output-progress?)))
   ;; BUG FIX: was (terminal-port? current-input-port) — that passes the parameter
   ;; object itself, which is never a terminal port; the parameter must be called.
   (cond [(terminal-port? (current-input-port))
          (output-progress? #t)]
         [else
          (output-lines? #t)]))
 (define (update-width)
   ;; Re-query the terminal width so progress lines can be padded to fill it.
   (when (output-progress?)
     (with-charterm
      (call-with-values (λ () (charterm-screen-size))
                        (λ (cols rows) (set! width cols))))))
 (update-width)
 ;; check
 (when (or (not wikiname) (equal? wikiname ""))
   (raise-user-error "Please specify the wikiname to download on the command line."))
 ;; stage 1
 (cond [(output-lines?) (displayln "Downloading list of pages...")]
       [(output-progress?) (printf "Downloading list of pages... \r")])
 (if-necessary-download-list-of-pages
  wikiname
  (λ (a b c)
    ;; BUG FIX: was "[~a/~b]" — ~b is Racket's binary-number directive, so the
    ;; total was printed in base 2; both counts should use ~a.
    (cond [(output-progress?) (printf "Downloading list of pages... [~a/~a]\r" a b)])))
 ;; stage 2
 (save-each-page
  wikiname
  (λ (a b c)
    (define basename (basename->name-for-query c))
    (cond
      [(output-lines?)
       (displayln basename)]
      [(output-progress?)
       ;; Refresh the terminal width every 20 pages, in a thread so the
       ;; (potentially slow) charterm round-trip doesn't stall downloads.
       (when (eq? (modulo a 20) 0)
         (thread (λ () (update-width))))
       ;; Render "[done/total] name" trimmed/padded to exactly the terminal width.
       (define prefix (format "[~a/~a] " a b))
       (define rest (- width (string-length prefix)))
       (define real-width (min (string-length basename) rest))
       (define spare-width (- rest real-width))
       (define name-display (substring basename 0 real-width))
       (define whitespace (make-string spare-width #\ ))
       (printf "~a~a~a\r" prefix name-display whitespace)]))))

(run start)
|
47
archiver/archiver-database.rkt
Normal file
47
archiver/archiver-database.rkt
Normal file
|
@@ -0,0 +1,47 @@
|
|||
#lang racket/base
|
||||
(require racket/list
|
||||
racket/path
|
||||
racket/runtime-path
|
||||
racket/string
|
||||
json
|
||||
json-pointer
|
||||
db
|
||||
"../lib/syntax.rkt")
|
||||
|
||||
(provide
|
||||
slc)
|
||||
|
||||
(define-runtime-path database-file "../storage/archiver.db")
|
||||
|
||||
(define migrations
  ;; Ordered schema migrations; each entry runs once inside a transaction,
  ;; tracked by the database_version table (see the do-migrate-step loop below).
  (wrap-sql
   ((query-exec slc "create table page (wikiname TEXT NOT NULL, basename TEXT NOT NULL, progress INTEGER NOT NULL, PRIMARY KEY (wikiname, basename))")
    (query-exec slc "create table wiki (wikiname TEXT NOT NULL, progress INTEGER, PRIMARY KEY (wikiname))"))
   ((query-exec slc "create table special_page (wikiname TEXT NOT NULL, key TEXT NOT NULL, basename TEXT NOT NULL, PRIMARY KEY (wikiname, key))"))
   ((query-exec slc "update wiki set progress = 2 where wikiname in (select wikiname from wiki inner join page using (wikiname) group by wikiname having min(page.progress) = 1)"))
   ;; BUG FIX: was "hash TEXT NTO NULL" — SQLite accepted it by parsing "TEXT NTO"
   ;; as the column type with a bare (no-op) NULL constraint, leaving hash nullable.
   ;; NOTE: databases created before this fix keep the nullable column; only fresh
   ;; databases get the NOT NULL constraint.
   ((query-exec slc "create table image (wikiname TEXT NOT NULL, hash TEXT NOT NULL, url TEXT NOT NULL, ext TEXT, source INTEGER NOT NULL, progress INTEGER NOT NULL, PRIMARY KEY (wikiname, hash))"))
   ((query-exec slc "alter table wiki add column sitename TEXT")
    (query-exec slc "alter table wiki add column basepage TEXT")
    (query-exec slc "alter table wiki add column license_text TEXT")
    (query-exec slc "alter table wiki add column license_url TEXT"))))
|
||||
|
||||
(define slc (sqlite3-connect #:database database-file #:mode 'create))
|
||||
(query-exec slc "PRAGMA journal_mode=WAL")
|
||||
(define database-version
|
||||
(with-handlers ([exn:fail:sql?
|
||||
(λ (exn)
|
||||
; need to set up the database
|
||||
(query-exec slc "create table database_version (version integer, primary key (version))")
|
||||
(query-exec slc "insert into database_version values (0)")
|
||||
0)])
|
||||
(query-value slc "select version from database_version")))
|
||||
|
||||
(let do-migrate-step ()
|
||||
(when (database-version . < . (length migrations))
|
||||
(call-with-transaction
|
||||
slc
|
||||
(list-ref migrations database-version))
|
||||
(set! database-version (add1 database-version))
|
||||
(query-exec slc "update database_version set version = $1" database-version)
|
||||
(do-migrate-step)))
|
||||
|
204
archiver/archiver-gui.rkt
Normal file
204
archiver/archiver-gui.rkt
Normal file
|
@@ -0,0 +1,204 @@
|
|||
#lang racket/base
|
||||
(require racket/class
|
||||
racket/list
|
||||
racket/port
|
||||
racket/set
|
||||
racket/string
|
||||
db
|
||||
net/http-easy
|
||||
racket/gui/easy
|
||||
racket/gui/easy/operator
|
||||
"archiver-database.rkt"
|
||||
"archiver.rkt"
|
||||
"../lib/url-utils.rkt"
|
||||
"../lib/xexpr-utils.rkt")
|
||||
|
||||
(define active-threads (mutable-seteq))
|
||||
|
||||
(define/obs @auto-retry #f)
|
||||
(define/obs @wikiname "")
|
||||
(define/obs @state 'waiting)
|
||||
(define/obs @num-pages 1)
|
||||
(define/obs @done-pages 0)
|
||||
(define/obs @just-done "")
|
||||
(define/obs @queue '())
|
||||
(define @title
|
||||
(obs-combine
|
||||
(λ (state queue num-pages done-pages)
|
||||
(define suffix (if (pair? queue)
|
||||
(format " +~a" (length queue))
|
||||
""))
|
||||
(define progress (if (eq? num-pages 0)
|
||||
" 0%"
|
||||
(format " ~a%" (round (inexact->exact (* (/ done-pages num-pages) 100))))))
|
||||
(case state
|
||||
[(waiting stage-0) (format "Fandom Archiver~a" suffix)]
|
||||
[(stage-1) (format "Fandom Archiver 0%~a" suffix)]
|
||||
[(stage-2) (format "Fandom Archiver~a~a" progress suffix)]
|
||||
[(err) "ERROR Fandom Archiver"]
|
||||
[(done) "Fandom Archiver 100%"]))
|
||||
@state @queue @num-pages (obs-throttle @done-pages #:duration 5000)))
|
||||
|
||||
(define-syntax-rule (t body ...)
|
||||
(set-add! active-threads (thread (λ () body ...))))
|
||||
|
||||
(define (do-start-or-queue)
|
||||
(define wikiname (obs-peek @wikiname))
|
||||
(:= @wikiname "")
|
||||
(when (not (equal? (string-trim wikiname) ""))
|
||||
(@queue . <~ . (λ (q) (append q (list wikiname))))
|
||||
(shift-queue-maybe)))
|
||||
|
||||
(define (shift-queue-maybe)
|
||||
(when (memq (obs-peek @state) '(waiting done))
|
||||
(define q (obs-peek @queue))
|
||||
(cond
|
||||
[(pair? q)
|
||||
(define wikiname (car q))
|
||||
(:= @queue (cdr q))
|
||||
(do-start-stage1 wikiname)]
|
||||
[#t (:= @state 'done)])))
|
||||
|
||||
(define (do-start-stage1 wikiname)
|
||||
(:= @just-done "")
|
||||
(:= @done-pages 0)
|
||||
(:= @num-pages 1)
|
||||
(t (with-handlers ([exn:fail? (handle-graphical-exn wikiname)])
|
||||
(:= @state 'stage-0)
|
||||
(if-necessary-download-list-of-pages wikiname (λ (now-done num-pages just-done-name)
|
||||
(:= @num-pages num-pages)
|
||||
(:= @done-pages now-done)
|
||||
(:= @just-done just-done-name)
|
||||
(:= @state 'stage-1)))
|
||||
(do-start-stage2 wikiname))))
|
||||
|
||||
(define (do-start-stage2 wikiname)
|
||||
(:= @just-done "")
|
||||
(:= @num-pages 1)
|
||||
(:= @done-pages 0)
|
||||
(t (with-handlers ([exn:fail? (handle-graphical-exn wikiname)])
|
||||
(save-each-page wikiname (λ (now-done num-pages just-done-path)
|
||||
(:= @num-pages num-pages)
|
||||
(:= @done-pages now-done)
|
||||
(:= @just-done just-done-path)))
|
||||
(:= @state 'waiting)
|
||||
(shift-queue-maybe)))
|
||||
(:= @state 'stage-2))
|
||||
|
||||
(define (exn->string e)
|
||||
(with-output-to-string
|
||||
(λ ()
|
||||
(displayln (exn-message e))
|
||||
(displayln "context:")
|
||||
(for ([item (continuation-mark-set->context (exn-continuation-marks e))])
|
||||
(printf " ~a" (srcloc->string (cdr item)))
|
||||
(when (car item)
|
||||
(printf ": ~a" (car item)))
|
||||
(displayln "")))))
|
||||
|
||||
(define ((handle-graphical-exn wikiname) e)
|
||||
(displayln (exn->string e) (current-error-port))
|
||||
(cond
|
||||
[(obs-peek @auto-retry)
|
||||
(do-retry-end wikiname)]
|
||||
[#t
|
||||
(:= @state 'err)
|
||||
(thread
|
||||
(λ ()
|
||||
(define/obs @visible? #t)
|
||||
(render
|
||||
(dialog #:title "Download Error"
|
||||
#:style '(resize-border)
|
||||
#:mixin (λ (%) (class % (super-new)
|
||||
(obs-observe! @visible? (λ (visible?) (send this show visible?)))))
|
||||
(vpanel #:margin '(15 15)
|
||||
(text "Encountered this error while downloading:")
|
||||
(input #:style '(multiple hscroll)
|
||||
#:min-size '(#f 200)
|
||||
(exn->string e))
|
||||
(button "Retry Now" (λ () (:= @visible? #f) (do-retry-now wikiname)))
|
||||
(button "Retry Round-Robin" (λ () (:= @visible? #f) (do-retry-end wikiname)))
|
||||
(button "Skip Wiki" (λ () (:= @visible? #f) (do-continue)))
|
||||
(button "Use Auto-Retry" (λ ()
|
||||
(:= @auto-retry #t)
|
||||
(:= @visible? #f)
|
||||
(do-retry-end wikiname)))
|
||||
(text "Be careful not to auto-retry an infinite loop!")))
|
||||
main-window)))
|
||||
(sleep)
|
||||
; make sure the old broken threads are all gone
|
||||
(for ([th active-threads]) (kill-thread th))
|
||||
(set-clear! active-threads)]))
|
||||
|
||||
(define (do-retry-now wikiname)
|
||||
(@queue . <~ . (λ (q) (append (list wikiname) q)))
|
||||
(:= @state 'waiting)
|
||||
(shift-queue-maybe))
|
||||
|
||||
(define (do-retry-end wikiname)
|
||||
(@queue . <~ . (λ (q) (append q (list wikiname))))
|
||||
(:= @state 'waiting)
|
||||
(shift-queue-maybe))
|
||||
|
||||
(define (do-continue)
|
||||
(:= @state 'waiting)
|
||||
(shift-queue-maybe))
|
||||
|
||||
(define (display-basename basename)
|
||||
(define limit 40)
|
||||
(cond [(string? basename)
|
||||
(define query (basename->name-for-query basename))
|
||||
(define segments (string-split query "/"))
|
||||
(when (and ((string-length query) . > . limit) ((length segments) . >= . 2))
|
||||
(set! query (string-append ".../" (last segments))))
|
||||
(when ((string-length query) . > . limit)
|
||||
(set! query (string-append (substring query 0 (- limit 3)) "...")))
|
||||
query]
|
||||
[#t "?"]))
|
||||
|
||||
(define main-window
|
||||
(render
|
||||
(window #:title @title
|
||||
#:size '(360 200)
|
||||
#:mixin (λ (%) (class %
|
||||
(super-new)
|
||||
(define/augment (on-close)
|
||||
(for ([th active-threads]) (kill-thread th))
|
||||
(disconnect slc))))
|
||||
;; input box at the top
|
||||
(hpanel (text "https://")
|
||||
(input @wikiname
|
||||
(λ (event data) (cond
|
||||
[(eq? event 'input) (:= @wikiname data)]
|
||||
[(eq? event 'return) (do-start-or-queue)])))
|
||||
(text ".fandom.com"))
|
||||
(button (@queue . ~> . (λ (q) (if (null? q) "Start" "Queue"))) (λ () (do-start-or-queue)))
|
||||
(text (@queue . ~> . (λ (q) (if (null? q) "" (string-join #:before-first "Queue: " q ", ")))))
|
||||
;; show status based on overall application state
|
||||
(case-view
|
||||
@state
|
||||
;; waiting for wikiname entry
|
||||
((waiting) (vpanel
|
||||
(text "Fill in the wikiname and click start.")))
|
||||
((stage-0) (vpanel
|
||||
(text "Checking data...")))
|
||||
((stage-1) (vpanel
|
||||
(text "Gathering list of pages...")
|
||||
(text (@just-done . ~> . display-basename))
|
||||
(text (@done-pages . ~> . (λ (x) (if (eq? x 0)
|
||||
"0/?"
|
||||
(format "~a/~a" x (obs-peek @num-pages))))))))
|
||||
;; downloading contents
|
||||
((stage-2) (vpanel
|
||||
(text "Downloading page text...")
|
||||
(progress @done-pages #:range @num-pages)
|
||||
(text (@done-pages . ~> . (λ (x) (format "~a/~a" x (obs-peek @num-pages)))))
|
||||
(text (@just-done . ~> . display-basename))))
|
||||
((done) (vpanel
|
||||
(text "All wikis downloaded!")))
|
||||
((err) (vpanel
|
||||
(text "Error. Check the popup window.")))
|
||||
(else (text (@state . ~> . (λ (state) (format "invalid state: ~a" state))))))
|
||||
(checkbox #:label "Auto-retry on error? (Dangerous)"
|
||||
#:checked? @auto-retry
|
||||
(λ:= @auto-retry)))))
|
309
archiver/archiver.rkt
Normal file
309
archiver/archiver.rkt
Normal file
|
@@ -0,0 +1,309 @@
|
|||
#lang racket/base
|
||||
(require racket/file
|
||||
racket/function
|
||||
racket/list
|
||||
racket/runtime-path
|
||||
racket/string
|
||||
net/url
|
||||
net/mime
|
||||
file/sha1
|
||||
net/http-easy
|
||||
db
|
||||
"../lib/html-parsing/main.rkt"
|
||||
json
|
||||
"archiver-database.rkt"
|
||||
"../lib/mime-types.rkt"
|
||||
"../lib/tree-updater.rkt"
|
||||
"../lib/url-utils.rkt"
|
||||
"../lib/xexpr-utils.rkt"
|
||||
"../lib/archive-file-mappings.rkt")
|
||||
|
||||
(define archive-slc slc)
|
||||
|
||||
(provide
|
||||
if-necessary-download-list-of-pages
|
||||
download-list-of-pages
|
||||
save-each-page
|
||||
basename->name-for-query
|
||||
image-url->values
|
||||
hash->save-dir
|
||||
archive-slc)
|
||||
|
||||
(module+ test
|
||||
(require rackunit))
|
||||
|
||||
(define-runtime-path archive-root "../storage/archive")
|
||||
#;(define archive-root "archive")
|
||||
|
||||
(define sources '#hasheq((style . 1) (page . 2)))
|
||||
|
||||
(define (get-origin wikiname)
|
||||
(format "https://~a.fandom.com" wikiname))
|
||||
|
||||
(define (insert-wiki-entry wikiname)
|
||||
(define dest-url
|
||||
(format "https://~a.fandom.com/api.php?~a"
|
||||
wikiname
|
||||
(params->query '(("action" . "query")
|
||||
("meta" . "siteinfo")
|
||||
("siprop" . "general|rightsinfo")
|
||||
("format" . "json")
|
||||
("formatversion" . "2")))))
|
||||
(define data (response-json (get dest-url)))
|
||||
(define exists? (query-maybe-value slc "select progress from wiki where wikiname = ?" wikiname))
|
||||
(if exists?
|
||||
(query-exec slc "update wiki set sitename = ?, basepage = ?, license_text = ?, license_url = ? where wikiname = ?"
|
||||
(jp "/query/general/sitename" data)
|
||||
(second (regexp-match #rx"/wiki/(.*)" (jp "/query/general/base" data)))
|
||||
(jp "/query/rightsinfo/text" data)
|
||||
(jp "/query/rightsinfo/url" data)
|
||||
wikiname)
|
||||
(query-exec slc "insert into wiki (wikiname, progress, sitename, basepage, license_text, license_url) values (?, 1, ?, ?, ?, ?)"
|
||||
wikiname
|
||||
(jp "/query/general/sitename" data)
|
||||
(second (regexp-match #rx"/wiki/(.*)" (jp "/query/general/base" data)))
|
||||
(jp "/query/rightsinfo/text" data)
|
||||
(jp "/query/rightsinfo/url" data))))
|
||||
|
||||
;; call 1 if not yet done for that wiki
|
||||
(define (if-necessary-download-list-of-pages wikiname callback)
|
||||
(define wiki-progress (query-maybe-value slc "select progress from wiki where wikiname = ?" wikiname))
|
||||
;; done yet?
|
||||
(unless (and (real? wiki-progress) (wiki-progress . >= . 1))
|
||||
;; count total pages
|
||||
(define dest-url
|
||||
(format "https://~a.fandom.com/api.php?~a"
|
||||
wikiname
|
||||
(params->query `(("action" . "query") ("meta" . "siteinfo") ("siprop" . "statistics") ("format" . "json")))))
|
||||
(define num-pages (jp "/query/statistics/articles" (response-json (get dest-url))))
|
||||
(download-list-of-pages wikiname callback 0 num-pages #f)))
|
||||
|
||||
;; 1. Download list of wiki pages and store in database
|
||||
(define (download-list-of-pages wikiname callback total-so-far grand-total path-with-namefrom)
  ;; Fetch one Local_Sitemap chunk, insert its page basenames into the database,
  ;; then recurse onto the "Next page" link until the sitemap is exhausted.
  ;; callback receives (pages-so-far grand-total last-basename-of-this-chunk).
  (define url (if path-with-namefrom
                  (format "https://~a.fandom.com~a" wikiname path-with-namefrom)
                  (format "https://~a.fandom.com/wiki/Local_Sitemap" wikiname)))
  (define r (get url))
  (define page (html->xexp (bytes->string/utf-8 (response-body r))))
  ;; The "Next page" anchor, if present (matched on link text and sitemap href).
  (define link-namefrom
    ((query-selector (λ (t a c x) (and (eq? t 'a)
                                       (pair? x)
                                       (string-contains? (car x) "Next page")
                                       (let ([href (get-attribute 'href a)])
                                         (and href (string-contains? href "/wiki/Local_Sitemap")))))
                     page #:include-text? #t)))
  ;; One (wikiname basename 0) row per page link in this chunk.
  (define row-values
    (for/list ([link (in-producer
                      (query-selector
                       (λ (t a c) (eq? t 'a))
                       ((query-selector (λ (t a c) (has-class? "mw-allpages-chunk" a)) page)))
                      #f)])
      (list wikiname (local-encoded-url->basename (get-attribute 'href (bits->attributes link))) 0)))
  ;; BUG FIX: an empty chunk previously built invalid SQL ("... values " with no
  ;; tuples) and crashed on (last row-values); both are now guarded.
  (unless (null? row-values)
    (let ([query-template
           (string-join (make-list (length row-values) "(?, ?, ?)") ", "
                        #:before-first "insert or ignore into page (wikiname, basename, progress) values ")])
      (apply query-exec slc query-template (flatten row-values))))
  (define new-total (+ (length row-values) total-so-far))
  (unless (null? row-values)
    (callback new-total grand-total (second (last row-values))))
  (cond
    [link-namefrom ; repeat on the next page
     (download-list-of-pages wikiname callback new-total grand-total (get-attribute 'href (bits->attributes link-namefrom)))]
    [#t ; all done downloading sitemap
     (insert-wiki-entry wikiname)]))
|
||||
|
||||
;; 2. Download each page via API and:
|
||||
;; * Save API response to file
|
||||
(define max-page-progress 1)
|
||||
(define (save-each-page wikiname callback)
  ;; Stage 2: download every not-yet-complete page of the wiki via the parse API
  ;; and save each JSON response to disk, updating per-page progress as we go.
  ;; callback receives (done-count total-count just-done-basename).
  ;; prepare destination folder
  (define save-dir (build-path archive-root wikiname))
  (make-directory* save-dir)
  ;; gather list of basenames to download (that aren't yet complete)
  (define basenames (query-list slc "select basename from page where wikiname = ? and progress < ?"
                                wikiname max-page-progress))
  ;; counter of complete/incomplete basenames
  (define already-done-count
    (query-value slc "select count(*) from page where wikiname = ? and progress = ?"
                 wikiname max-page-progress))
  (define not-done-count
    (query-value slc "select count(*) from page where wikiname = ? and progress < ?"
                 wikiname max-page-progress))
  ;; set initial progress
  (callback already-done-count (+ already-done-count not-done-count) "")
  ;; loop through basenames and download
  (for ([basename basenames]
        [i (in-naturals 1)])
    (define name-for-query (basename->name-for-query basename))
    (define dest-url
      (format "https://~a.fandom.com/api.php?~a"
              wikiname
              (params->query `(("action" . "parse")
                               ("page" . ,name-for-query)
                               ("prop" . "text|headhtml|langlinks")
                               ("formatversion" . "2")
                               ("format" . "json")))))
    (define r (get dest-url))
    (define body (response-body r))
    ;; Filesystems cap filename length, so overlong basenames are stored under
    ;; their sha1 with the real basename recorded in special_page.
    (define save-path
      (cond [((string-length basename) . > . 240)
             (define key (sha1 (string->bytes/latin-1 basename)))
             ;; BUG FIX: "insert or ignore" — a plain insert violated the
             ;; (wikiname, key) primary key when resuming an interrupted run
             ;; that had already recorded this page.
             (query-exec slc "insert or ignore into special_page (wikiname, key, basename) values (?, ?, ?)"
                         wikiname key basename)
             (build-path save-dir (string-append key ".json"))]
            [#t
             (build-path save-dir (string-append basename ".json"))]))
    (display-to-file body save-path #:exists 'replace)
    (query-exec slc "update page set progress = 1 where wikiname = ? and basename = ?"
                wikiname basename)
    (callback (+ already-done-count i) (+ already-done-count not-done-count) basename))
  ;; saved all pages, register that fact in the database
  (query-exec slc "update wiki set progress = 2 where wikiname = ?" wikiname))
|
||||
|
||||
;; 3. Download CSS and:
|
||||
;; * Save CSS to file
|
||||
;; * Record style images to database
|
||||
(define (check-style-for-images wikiname path)
|
||||
(define content (file->string path))
|
||||
(define urls (regexp-match* #rx"url\\(\"?'?([^)]*)'?\"?\\)" content #:match-select cadr))
|
||||
(for/list ([url urls]
|
||||
#:when (not (or (equal? url "")
|
||||
(equal? url "'")
|
||||
(string-contains? url "/resources-ucp/")
|
||||
(string-contains? url "/fonts/")
|
||||
(string-contains? url "/drm_fonts/")
|
||||
(string-contains? url "//db.onlinewebfonts.com/")
|
||||
(string-contains? url "//bits.wikimedia.org/")
|
||||
(string-contains? url "dropbox")
|
||||
(string-contains? url "only=styles")
|
||||
(string-contains? url "https://https://")
|
||||
(regexp-match? #rx"^%20" url)
|
||||
(regexp-match? #rx"^data:" url))))
|
||||
(cond
|
||||
[(string-prefix? url "https://") url]
|
||||
[(string-prefix? url "http://") (regexp-replace #rx"http:" url "https:")]
|
||||
[(string-prefix? url "//") (string-append "https:" url)]
|
||||
[(string-prefix? url "/") (format "https://~a.fandom.com~a" wikiname url)]
|
||||
[else (raise-user-error "While calling check-style-for-images, this URL had an unknown format and couldn't be saved:" url path)])))
|
||||
|
||||
(define (download-styles-for-wiki wikiname)
|
||||
(define save-dir (build-path archive-root wikiname "styles"))
|
||||
(make-directory* save-dir)
|
||||
(define theme (λ (theme-name)
|
||||
(cons (format "https://~a.fandom.com/wikia.php?controller=ThemeApi&method=themeVariables&variant=~a" wikiname theme-name)
|
||||
(build-path save-dir (format "themeVariables-~a.css" theme-name)))))
|
||||
;; (Listof (Pair url save-path))
|
||||
(define styles
|
||||
(list
|
||||
(theme "default")
|
||||
(theme "light")
|
||||
(theme "dark")
|
||||
(cons (format "https://~a.fandom.com/load.php?lang=en&modules=skin.fandomdesktop.styles%7Cext.fandom.PortableInfoboxFandomDesktop.css%7Cext.fandom.GlobalComponents.CommunityHeaderBackground.css%7Cext.gadget.site-styles%2Csound-styles%7Csite.styles&only=styles&skin=fandomdesktop" wikiname)
|
||||
(build-path save-dir "site.css"))))
|
||||
(for ([style styles])
|
||||
(define r (get (car style)))
|
||||
(define body (response-body r))
|
||||
(display-to-file body (cdr style) #:exists 'replace)
|
||||
;; XXX: how the HELL do I deal with @import?? would need some kind of recursion here. how will the page server know where to look up the style file to be able to serve them again? do I add another link-stylesheet tag to the main page? what about the remaining stuck @import url?
|
||||
)
|
||||
styles)
|
||||
|
||||
(define (do-step-3 wikiname)
|
||||
(define wiki-progress (query-maybe-value slc "select progress from wiki where wikiname = ?" wikiname))
|
||||
(unless (and (number? wiki-progress) (wiki-progress . >= . 3))
|
||||
(define styles (download-styles-for-wiki wikiname))
|
||||
(define unique-image-urls
|
||||
(remove-duplicates
|
||||
(map image-url->values
|
||||
(flatten
|
||||
(for/list ([style styles])
|
||||
(check-style-for-images wikiname (cdr style)))))
|
||||
#:key cdr))
|
||||
(println unique-image-urls)
|
||||
(for ([pair unique-image-urls])
|
||||
(query-exec slc "insert or ignore into image (wikiname, url, hash, ext, source, progress) values (?, ?, ?, NULL, 1, 0)" wikiname (car pair) (cdr pair)))
|
||||
(query-exec slc "update wiki set progress = 3 where wikiname = ?" wikiname)))
|
||||
|
||||
;; 4: From downloaded pages, record URLs of image sources and inline style images to database
|
||||
(define (hash->save-dir wikiname hash)
|
||||
(build-path archive-root wikiname "images" (substring hash 0 1) (substring hash 0 2)))
|
||||
|
||||
;; Normalize an image URL into (cons canonical-url sha1-hash); the hash is the
;; image's identity in the `image` table and its on-disk filename.
(define (image-url->values i)
|
||||
;; TODO: handle case where there is multiple cb parameter on minecraft wiki
|
||||
;; TODO: ensure it still "works" with broken & on minecraft wiki
|
||||
;; NOTE(review): "\\cb=" looks wrong — \c is an illegal alphabetic escape in
;; Racket regexps; presumably "\\?cb=" (literal "?cb=") was intended. Verify
;; before changing: the regexp feeds the sha1 key, so fixing it would change
;; every stored image hash.
(define no-cb (regexp-replace #rx"\\cb=[0-9]+&?" i "")) ; remove cb url parameter which does nothing
|
||||
(define key (regexp-replace #rx"[&?]$" no-cb "")) ; remove extra separator if necessary
|
||||
(define hash (sha1 (string->bytes/utf-8 key)))
|
||||
(cons key hash))
|
||||
|
||||
(define (check-json-for-images wikiname path)
|
||||
(define data (with-input-from-file path (λ () (read-json))))
|
||||
(define page (html->xexp (preprocess-html-wiki (jp "/parse/text" data))))
|
||||
(define tree (update-tree-wiki page wikiname))
|
||||
(remove-duplicates
|
||||
(for/list ([element (in-producer
|
||||
(query-selector
|
||||
(λ (t a c)
|
||||
(and (eq? t 'img)
|
||||
(get-attribute 'src a)))
|
||||
tree)
|
||||
#f)])
|
||||
(image-url->values (get-attribute 'src (bits->attributes element))))))
|
||||
|
||||
;; 5. Download image sources and style images according to database
|
||||
(define (save-each-image wikiname source callback)
|
||||
;; gather list of basenames to download (that aren't yet complete)
|
||||
(define rows (query-rows slc "select url, hash from image where wikiname = ? and source <= ? and progress < 1"
|
||||
wikiname source))
|
||||
;; counter of complete/incomplete basenames
|
||||
(define already-done-count
|
||||
(query-value slc "select count(*) from image where wikiname = ? and source <= ? and progress = 1"
|
||||
wikiname source))
|
||||
(define not-done-count
|
||||
(query-value slc "select count(*) from image where wikiname = ? and source <= ? and progress < 1"
|
||||
wikiname source))
|
||||
;; set initial progress
|
||||
(callback already-done-count (+ already-done-count not-done-count) "")
|
||||
;; loop through urls and download
|
||||
(for ([row rows]
|
||||
[i (in-naturals 1)])
|
||||
;; row fragments
|
||||
(define url (vector-ref row 0))
|
||||
(define hash (vector-ref row 1))
|
||||
;; check
|
||||
(printf "~a -> ~a~n" url hash)
|
||||
(define r (get url))
|
||||
(define declared-type (response-headers-ref r 'content-type))
|
||||
(define final-type (if (equal? declared-type #"application/octet-stream")
|
||||
(let ([sniff-entity (message-entity (mime-analyze (response-body r)))])
|
||||
(string->bytes/latin-1 (format "~a/~a" (entity-type sniff-entity) (entity-subtype sniff-entity))))
|
||||
declared-type))
|
||||
(define ext (bytes->string/latin-1 (mime-type->ext final-type)))
|
||||
;; save
|
||||
(define save-dir (hash->save-dir wikiname hash))
|
||||
(make-directory* save-dir)
|
||||
(define save-path (build-path save-dir (string-append hash "." ext)))
|
||||
(define body (response-body r))
|
||||
(display-to-file body save-path #:exists 'replace)
|
||||
(query-exec slc "update image set progress = 1, ext = ? where wikiname = ? and hash = ?"
|
||||
ext wikiname hash)
|
||||
(callback (+ already-done-count i) (+ already-done-count not-done-count) (string-append hash "." ext)))
|
||||
;; TODO: saved all images, register that fact in the database
|
||||
)
|
||||
|
||||
|
||||
(module+ test
|
||||
(check-equal? (html->xexp "<img src=\"https://example.com/images?src=Blah.jpg&width=150\">")
|
||||
'(*TOP* (img (@ (src "https://example.com/images?src=Blah.jpg&width=150")))))
|
||||
#;(download-list-of-pages "minecraft" values)
|
||||
#;(save-each-page "minecraft" values)
|
||||
#;(check-json-for-images "chiki" (build-path archive-root "chiki" "Fiona.json"))
|
||||
#;(do-step-3 "gallowmere")
|
||||
#;(save-each-image "gallowmere" (hash-ref sources 'style) (λ (a b c) (printf "~a/~a ~a~n" a b c)))
|
||||
|
||||
#;(for ([wikiname (query-list slc "select wikiname from wiki")])
|
||||
(println wikiname)
|
||||
(insert-wiki-entry wikiname))
|
||||
|
||||
#;(for ([wikiname (query-list slc "select wikiname from wiki")])
|
||||
(println wikiname)
|
||||
(do-step-3 wikiname)
|
||||
(save-each-image wikiname (hash-ref sources 'style) (λ (a b c) (printf "~a/~a ~a~n" a b c)))))
|
|
@@ -19,8 +19,10 @@
|
|||
(require-reloadable "src/page-search.rkt" page-search)
|
||||
(require-reloadable "src/page-set-user-settings.rkt" page-set-user-settings)
|
||||
(require-reloadable "src/page-static.rkt" static-dispatcher)
|
||||
(require-reloadable "src/page-static-archive.rkt" page-static-archive)
|
||||
(require-reloadable "src/page-subdomain.rkt" subdomain-dispatcher)
|
||||
(require-reloadable "src/page-wiki.rkt" page-wiki)
|
||||
(require-reloadable "src/page-wiki-offline.rkt" page-wiki-offline)
|
||||
(require-reloadable "src/page-file.rkt" page-file)
|
||||
|
||||
(reload!)
|
||||
|
@@ -42,7 +44,9 @@
|
|||
page-proxy
|
||||
page-search
|
||||
page-set-user-settings
|
||||
page-static-archive
|
||||
page-wiki
|
||||
page-wiki-offline
|
||||
page-file
|
||||
redirect-wiki-home
|
||||
static-dispatcher
|
||||
|
|
4
dist.rkt
4
dist.rkt
|
@@ -13,8 +13,10 @@
|
|||
(require (only-in "src/page-search.rkt" page-search))
|
||||
(require (only-in "src/page-set-user-settings.rkt" page-set-user-settings))
|
||||
(require (only-in "src/page-static.rkt" static-dispatcher))
|
||||
(require (only-in "src/page-static-archive.rkt" page-static-archive))
|
||||
(require (only-in "src/page-subdomain.rkt" subdomain-dispatcher))
|
||||
(require (only-in "src/page-wiki.rkt" page-wiki))
|
||||
(require (only-in "src/page-wiki-offline.rkt" page-wiki-offline))
|
||||
(require (only-in "src/page-file.rkt" page-file))
|
||||
|
||||
(serve/launch/wait
|
||||
|
@@ -31,7 +33,9 @@
|
|||
page-proxy
|
||||
page-search
|
||||
page-set-user-settings
|
||||
page-static-archive
|
||||
page-wiki
|
||||
page-wiki-offline
|
||||
page-file
|
||||
redirect-wiki-home
|
||||
static-dispatcher
|
||||
|
|
28
lib/archive-file-mappings.rkt
Normal file
28
lib/archive-file-mappings.rkt
Normal file
|
@@ -0,0 +1,28 @@
|
|||
#lang racket/base
|
||||
(require racket/string
|
||||
net/url
|
||||
(only-in net/uri-codec uri-decode)
|
||||
"url-utils.rkt")
|
||||
(provide
|
||||
local-encoded-url->segments
|
||||
url-segments->basename
|
||||
local-encoded-url->basename
|
||||
basename->name-for-query
|
||||
url-segments->guess-title)
|
||||
|
||||
(define (local-encoded-url->segments str) ; '("wiki" "Page_title")
|
||||
(map path/param-path (url-path (string->url str))))
|
||||
|
||||
(define (url-segments->basename segments) ; "Page_title" filename encoded, no extension or dir prefix
|
||||
(define extra-encoded (map (λ (s) (bytes->string/latin-1 (percent-encode s filename-set #f))) (cdr segments)))
|
||||
(define basic-filename (string-join extra-encoded "#"))
|
||||
basic-filename)
|
||||
|
||||
(define (local-encoded-url->basename str) ; '("wiki" "Page_title"), no extension or dir prefix
|
||||
(url-segments->basename (local-encoded-url->segments str)))
|
||||
|
||||
(define (basename->name-for-query str)
|
||||
(uri-decode (regexp-replace* #rx"#" str "/")))
|
||||
|
||||
(define (url-segments->guess-title segments)
|
||||
(regexp-replace* #rx"_" (cadr segments) " "))
|
1887
lib/html-parsing/main.rkt
Normal file
1887
lib/html-parsing/main.rkt
Normal file
File diff suppressed because it is too large
Load diff
34
lib/mime-types.rkt
Normal file
34
lib/mime-types.rkt
Normal file
|
@@ -0,0 +1,34 @@
|
|||
#lang racket/base
|
||||
(require racket/contract
|
||||
racket/match
|
||||
racket/path
|
||||
racket/runtime-path
|
||||
racket/string)
|
||||
|
||||
(provide
|
||||
(contract-out
|
||||
[ext->mime-type (-> bytes? bytes?)]
|
||||
[mime-type->ext (-> bytes? bytes?)]))
|
||||
|
||||
(define-runtime-path mime.types-path "mime.types")
|
||||
|
||||
(define ls
|
||||
(call-with-input-file mime.types-path
|
||||
(λ (in) (for/list ([line (in-lines in)]
|
||||
#:when (not (regexp-match? #rx"^ *($|#)" line)))
|
||||
(match line
|
||||
[(regexp #rx"^([^ ]+) +(.+)$" (list _ mime ext))
|
||||
(cons (string->bytes/utf-8 ext) (string->bytes/utf-8 mime))]
|
||||
[(regexp #rx"^ *#") (void)]
|
||||
[_ (log-warning "mime-types: failed to parse line ~s" line)])))))
|
||||
|
||||
(define forward-hash (make-immutable-hash ls))
|
||||
(define reverse-hash (make-immutable-hash (map (λ (x) (cons (cdr x) (car x))) ls)))
|
||||
|
||||
(define (ext->mime-type ext-in)
|
||||
(define ext (regexp-replace #rx"^\\." ext-in #""))
|
||||
(hash-ref forward-hash ext))
|
||||
|
||||
(define (mime-type->ext m-in)
|
||||
(define m (regexp-replace #rx";.*" m-in #""))
|
||||
(hash-ref reverse-hash m))
|
85
lib/mime.types
Normal file
85
lib/mime.types
Normal file
|
@ -0,0 +1,85 @@
|
|||
text/html html
|
||||
text/css css
|
||||
text/xml xml
|
||||
image/gif gif
|
||||
image/jpeg jpeg
|
||||
application/javascript js
|
||||
text/javascript js
|
||||
application/atom+xml atom
|
||||
application/rss+xml rss
|
||||
|
||||
text/mathml mml
|
||||
text/plain txt
|
||||
text/x-component htc
|
||||
|
||||
image/png png
|
||||
image/tiff tiff
|
||||
image/vnd.wap.wbmp wbmp
|
||||
image/x-icon ico
|
||||
image/x-jng jng
|
||||
image/x-ms-bmp bmp
|
||||
image/svg+xml svg
|
||||
image/webp webp
|
||||
|
||||
application/font-woff2 woff2
|
||||
application/acad woff2
|
||||
font/woff2 woff2
|
||||
application/font-woff woff
|
||||
application/x-font-ttf ttf
|
||||
application/x-font-truetype ttf
|
||||
application/x-truetype-font ttf
|
||||
application/font-sfnt ttf
|
||||
font/sfnt ttf
|
||||
application/vnd.oasis.opendocument.formula-template otf
|
||||
application/x-font-opentype otf
|
||||
application/vnd.ms-opentype otf
|
||||
font/otf otf
|
||||
application/java-archive jar
|
||||
application/json json
|
||||
application/mac-binhex40 hqx
|
||||
application/msword doc
|
||||
application/pdf pdf
|
||||
application/postscript ps
|
||||
application/rtf rtf
|
||||
application/vnd.apple.mpegurl m3u8
|
||||
application/vnd.ms-excel xls
|
||||
application/vnd.ms-fontobject eot
|
||||
application/vnd.ms-powerpoint ppt
|
||||
application/vnd.wap.wmlc wmlc
|
||||
application/vnd.google-earth.kml+xml kml
|
||||
application/vnd.google-earth.kmz kmz
|
||||
application/x-7z-compressed 7z
|
||||
application/x-cocoa cco
|
||||
application/x-java-archive-diff jardiff
|
||||
application/x-java-jnlp-file jnlp
|
||||
application/x-makeself run
|
||||
application/x-perl pl
|
||||
application/x-rar-compressed rar
|
||||
application/x-redhat-package-manager rpm
|
||||
application/x-sea sea
|
||||
application/x-shockwave-flash swf
|
||||
application/x-stuffit sit
|
||||
application/x-tcl tcl
|
||||
application/x-x509-ca-cert pem
|
||||
application/x-xpinstall xpi
|
||||
application/xhtml+xml xhtml
|
||||
application/xspf+xml xspf
|
||||
application/zip zip
|
||||
application/gzip gz
|
||||
|
||||
audio/midi mid midi kar
|
||||
audio/mpeg mp3
|
||||
audio/ogg ogg
|
||||
audio/x-m4a m4a
|
||||
audio/x-realaudio ra
|
||||
|
||||
video/mp2t ts
|
||||
video/mp4 mp4
|
||||
video/mpeg mpeg
|
||||
video/quicktime mov
|
||||
video/webm webm
|
||||
video/x-flv flv
|
||||
video/x-m4v m4v
|
||||
video/x-mng mng
|
||||
video/x-ms-wmv wmv
|
||||
video/x-msvideo avi
|
|
@ -4,13 +4,19 @@
|
|||
; call the updater on the dictionary key only if it has that key
|
||||
alist-maybe-update
|
||||
; update a value only if a condition succeeds on it
|
||||
u)
|
||||
u
|
||||
; like string-join, but for lists
|
||||
list-join
|
||||
u-counter)
|
||||
|
||||
(module+ test
|
||||
(require "typed-rackunit.rkt"))
|
||||
|
||||
(define u-counter (box 0))
|
||||
|
||||
(: alist-maybe-update (∀ (A B) ((Listof (Pairof A B)) A (B -> B) -> (Listof (Pairof A B)))))
|
||||
(define (alist-maybe-update alist key updater)
|
||||
(set-box! u-counter (add1 (unbox u-counter)))
|
||||
(map (λ ([p : (Pairof A B)])
|
||||
(if (eq? (car p) key)
|
||||
(cons (car p) (updater (cdr p)))
|
||||
|
@ -24,7 +30,16 @@
|
|||
|
||||
(: u (∀ (A) ((A -> Any) (A -> A) A -> A)))
|
||||
(define (u condition updater value)
|
||||
(set-box! u-counter (add1 (unbox u-counter)))
|
||||
(if (condition value) (updater value) value))
|
||||
(module+ test
|
||||
(check-equal? (u (λ ([x : Integer]) (< x 5)) (λ ([x : Integer]) (* x -1)) 4) -4)
|
||||
(check-equal? (u (λ ([x : Integer]) (< x 5)) (λ ([x : Integer]) (* x -1)) 8) 8))
|
||||
|
||||
(: list-join (∀ (A B) (A (Listof B) -> (Listof (U A B)))))
|
||||
(define (list-join element ls)
|
||||
(if (pair? (cdr ls))
|
||||
(list* (car ls) element (list-join element (cdr ls)))
|
||||
(list (car ls))))
|
||||
(module+ test
|
||||
(check-equal? (list-join "h" '(2 3 4 5)) '(2 "h" 3 "h" 4 "h" 5)))
|
|
@ -5,7 +5,11 @@
|
|||
; help make a nested if. if/in will gain the same false form of its containing if/out.
|
||||
if/out
|
||||
; let, but the value for each variable is evaluated within a thread
|
||||
thread-let)
|
||||
thread-let
|
||||
; cond, but values can be defined between conditions
|
||||
cond/var
|
||||
; wrap sql statements into lambdas so they can be executed during migration
|
||||
wrap-sql)
|
||||
|
||||
(module+ test
|
||||
(require rackunit)
|
||||
|
@ -17,9 +21,12 @@
|
|||
;; it's in a submodule so that it can be required in both levels, for testing
|
||||
|
||||
(module transform racket/base
|
||||
(require racket/list)
|
||||
|
||||
(provide
|
||||
transform-if/out
|
||||
transform-thread-let)
|
||||
transform-thread-let
|
||||
transform/out-cond/var)
|
||||
|
||||
(define (transform-if/out stx)
|
||||
(define tree (cdr (syntax->datum stx))) ; condition true false
|
||||
|
@ -62,12 +69,46 @@
|
|||
(define def (list-ref defs n))
|
||||
`(,(car def) (channel-get (vector-ref chv ,n))))
|
||||
counter)
|
||||
,@forms)))))
|
||||
,@forms))))
|
||||
|
||||
(define (transform/out-cond/var stx)
|
||||
(define tree (transform-cond/var (cdr (syntax->datum stx))))
|
||||
(datum->syntax
|
||||
stx
|
||||
tree))
|
||||
|
||||
(define (transform-cond/var tree)
|
||||
(define-values (els temp) (splitf-at tree (λ (el) (and (pair? el) (not (eq? (car el) 'var))))))
|
||||
(define-values (vars rest) (splitf-at temp (λ (el) (and (pair? el) (eq? (car el) 'var)))))
|
||||
(if (null? rest)
|
||||
`(cond ,@els)
|
||||
`(cond
|
||||
,@els
|
||||
[#t
|
||||
(let ,(for/list ([var vars])
|
||||
(cdr var))
|
||||
,(transform-cond/var rest))]))))
|
||||
|
||||
;; the syntax definitions and their tests go below here
|
||||
|
||||
(require 'transform (for-syntax 'transform))
|
||||
|
||||
(define-syntax (wrap-sql stx)
|
||||
; the arguments
|
||||
(define xs (cdr (syntax->list stx)))
|
||||
; wrap each argument
|
||||
(define wrapped (map (λ (xe) ; xe is the syntax of an argument
|
||||
(if (list? (car (syntax->datum xe)))
|
||||
; it's a list of lists (a list of sql migration steps)
|
||||
; return instead syntax of a lambda that will call everything in xe
|
||||
(datum->syntax stx `(λ () ,@xe))
|
||||
; it's just a single sql migration step
|
||||
; return instead syntax of a lambda that will call xe
|
||||
(datum->syntax stx `(λ () ,xe))))
|
||||
xs))
|
||||
; since I'm returning *code*, I need to return the form (list ...) so that runtime makes a list
|
||||
(datum->syntax stx `(list ,@wrapped)))
|
||||
|
||||
(define-syntax (if/out stx)
|
||||
(transform-if/out stx))
|
||||
(module+ test
|
||||
|
@ -106,3 +147,15 @@
|
|||
; check that it assigns the correct value to the correct variable
|
||||
(check-equal? (thread-let ([a (sleep 0) 'a] [b 'b]) (list a b))
|
||||
'(a b)))
|
||||
|
||||
(define-syntax (cond/var stx)
|
||||
(transform/out-cond/var stx))
|
||||
(module+ test
|
||||
(check-syntax-equal? (transform/out-cond/var #'(cond/def [#f 0] (var d (* a 2)) [(eq? d 8) d] [#t "not 4"]))
|
||||
#'(cond
|
||||
[#f 0]
|
||||
[#t
|
||||
(let ([d (* a 2)])
|
||||
(cond
|
||||
[(eq? d 8) d]
|
||||
[#t "not 4"]))])))
|
|
@ -3,17 +3,35 @@
|
|||
racket/function
|
||||
racket/match
|
||||
racket/string
|
||||
"config.rkt"
|
||||
"pure-utils.rkt"
|
||||
"url-utils.rkt"
|
||||
"xexpr-utils.rkt")
|
||||
|
||||
(provide
|
||||
preprocess-html-wiki
|
||||
update-tree-wiki)
|
||||
|
||||
(define (preprocess-html-wiki html)
|
||||
(define ((rr* find replace) contents)
|
||||
(regexp-replace* find contents replace))
|
||||
((compose1
|
||||
; fix navbox list nesting
|
||||
; navbox on right of page has incorrect html "<td ...><li>" and the xexpr parser puts the <li> much further up the tree
|
||||
; add a <ul> to make the parser happy
|
||||
; usage: /fallout/wiki/Fallout:_New_Vegas_achievements_and_trophies
|
||||
(rr* #rx"(<td[^>]*>\n?)(<li>)" "\\1<ul>\\2")
|
||||
; change <figcaption><p> to <figcaption><span> to make the parser happy
|
||||
(rr* #rx"(<figcaption[^>]*>)[ \t]*<p class=\"caption\">([^<]*)</p>" "\\1<span class=\"caption\">\\2</span>"))
|
||||
html))
|
||||
(module+ test
|
||||
(check-equal? (preprocess-html-wiki "<td class=\"va-navbox-column\" style=\"width: 33%\">\n<li>Hey</li>")
|
||||
"<td class=\"va-navbox-column\" style=\"width: 33%\">\n<ul><li>Hey</li>")
|
||||
(check-equal? (preprocess-html-wiki "<figure class=\"thumb tright\" style=\"width: 150px\"><a class=\"image\"><img></a><noscript><a><img></a></noscript><figcaption class=\"thumbcaption\"> <p class=\"caption\">Caption text.</p></figcaption></figure>")
|
||||
"<figure class=\"thumb tright\" style=\"width: 150px\"><a class=\"image\"><img></a><noscript><a><img></a></noscript><figcaption class=\"thumbcaption\"><span class=\"caption\">Caption text.</span></figcaption></figure>"))
|
||||
|
||||
(module+ test
|
||||
(require rackunit
|
||||
html-parsing)
|
||||
"html-parsing/main.rkt")
|
||||
(define wiki-document
|
||||
'(*TOP*
|
||||
(div (@ (class "mw-parser-output"))
|
||||
|
@ -47,7 +65,7 @@
|
|||
(figcaption "Test figure!"))
|
||||
(iframe (@ (src "https://example.com/iframe-src")))))))
|
||||
|
||||
(define (updater wikiname)
|
||||
(define (updater wikiname #:strict-proxy? [strict-proxy? #f])
|
||||
(define classlist-updater
|
||||
(compose1
|
||||
; uncollapse all navbox items (bottom of page mass navigation)
|
||||
|
@ -101,7 +119,7 @@
|
|||
'(""))))
|
||||
; proxy images from inline styles, if strict_proxy is set
|
||||
(curry u
|
||||
(λ (v) (config-true? 'strict_proxy))
|
||||
(λ (v) strict-proxy?)
|
||||
(λ (v) (attribute-maybe-update
|
||||
'style
|
||||
(λ (style)
|
||||
|
@ -114,14 +132,14 @@
|
|||
; and also their links, if strict_proxy is set
|
||||
(curry u
|
||||
(λ (v)
|
||||
(and (config-true? 'strict_proxy)
|
||||
(and strict-proxy?
|
||||
#;(eq? element-type 'a)
|
||||
(or (has-class? "image-thumbnail" v)
|
||||
(has-class? "image" v))))
|
||||
(λ (v) (attribute-maybe-update 'href u-proxy-url v)))
|
||||
; proxy images from src attributes, if strict_proxy is set
|
||||
(curry u
|
||||
(λ (v) (config-true? 'strict_proxy))
|
||||
(λ (v) strict-proxy?)
|
||||
(λ (v) (attribute-maybe-update 'src u-proxy-url v)))
|
||||
; don't lazyload images
|
||||
(curry u
|
||||
|
@ -208,13 +226,12 @@
|
|||
|
||||
updater)
|
||||
|
||||
(define (update-tree-wiki tree wikiname)
|
||||
(update-tree (updater wikiname) tree))
|
||||
(define (update-tree-wiki tree wikiname #:strict-proxy? [strict-proxy? #f])
|
||||
(update-tree (updater wikiname #:strict-proxy? strict-proxy?) tree))
|
||||
|
||||
(module+ test
|
||||
(define transformed
|
||||
(parameterize ([(config-parameter 'strict_proxy) "true"])
|
||||
(update-tree-wiki wiki-document "test")))
|
||||
(update-tree-wiki wiki-document "test" #:strict-proxy? #t))
|
||||
; check that wikilinks are changed to be local
|
||||
(check-equal? (get-attribute 'href (bits->attributes
|
||||
((query-selector
|
||||
|
@ -260,8 +277,8 @@
|
|||
; check that noscript images are removed
|
||||
(check-equal? ((query-selector (λ (t a c) (eq? t 'noscript)) transformed)) #f)
|
||||
; benchmark
|
||||
(when (file-exists? "Frog.html2")
|
||||
(with-input-from-file "Frog.html2"
|
||||
(when (file-exists? "../misc/Frog.html")
|
||||
(with-input-from-file "../misc/Frog.html"
|
||||
(λ ()
|
||||
(define tree (html->xexp (current-input-port)))
|
||||
(time (length (update-tree-wiki tree "minecraft")))))))
|
|
@ -1,6 +1,5 @@
|
|||
#lang typed/racket/base
|
||||
(require racket/string
|
||||
"config.rkt"
|
||||
"pure-utils.rkt")
|
||||
(require/typed web-server/http/request-structs
|
||||
[#:opaque Header header?])
|
||||
|
@ -10,12 +9,14 @@
|
|||
px-wikiname
|
||||
; make a query string from an association list of strings
|
||||
params->query
|
||||
; custom percent encoding (you probably want params->query instead)
|
||||
percent-encode
|
||||
; sets for custom percent encoding
|
||||
path-set urlencoded-set filename-set
|
||||
; make a proxied version of a fandom url
|
||||
u-proxy-url
|
||||
; check whether a url is on a domain controlled by fandom
|
||||
is-fandom-url?
|
||||
; prints "out: <url>"
|
||||
log-outgoing
|
||||
; pass in a header, headers, or something useless. they'll all combine into a list
|
||||
build-headers
|
||||
; try to follow wikimedia's format for which characters should be encoded/replaced in page titles for the url
|
||||
|
@ -41,6 +42,8 @@
|
|||
)
|
||||
path-set))
|
||||
|
||||
(define filename-set '(#\< #\> #\: #\" #\/ #\\ #\| #\? #\* #\# #\~ #\&))
|
||||
|
||||
(: percent-encode (String (Listof Char) Boolean -> Bytes))
|
||||
(define (percent-encode value set space-as-plus)
|
||||
(define b (string->bytes/utf-8 value))
|
||||
|
@ -87,11 +90,6 @@
|
|||
(λ ([v : String]) (string-append "/proxy?" (params->query `(("dest" . ,url)))))
|
||||
url))
|
||||
|
||||
(: log-outgoing (String -> Void))
|
||||
(define (log-outgoing url-string)
|
||||
(when (config-true? 'log_outgoing)
|
||||
(printf "out: ~a~n" url-string)))
|
||||
|
||||
(: build-headers ((U Header (Listof Header) False Void) * -> (Listof Header)))
|
||||
(define (build-headers . fs)
|
||||
(apply
|
|
@ -129,7 +129,7 @@
|
|||
(λ (element-type attributes children)
|
||||
(equal? (get-attribute name attributes) value)))
|
||||
|
||||
(define (query-selector selector element)
|
||||
(define (query-selector selector element #:include-text? [include-text? #f])
|
||||
(generator
|
||||
()
|
||||
(let loop ([element element])
|
||||
|
@ -140,7 +140,9 @@
|
|||
[(equal? element-type '*DECL*) #f]
|
||||
[(equal? element-type '@) #f]
|
||||
[#t
|
||||
(when (selector element-type attributes children)
|
||||
(when (if include-text?
|
||||
(selector element-type attributes children (filter string? (cdr element)))
|
||||
(selector element-type attributes children))
|
||||
(yield element))
|
||||
(for ([child children]) (loop child))]))
|
||||
#f))
|
|
@ -4,7 +4,7 @@
|
|||
racket/string
|
||||
json
|
||||
net/http-easy
|
||||
html-parsing
|
||||
"../lib/html-parsing/main.rkt"
|
||||
"../src/xexpr-utils.rkt"
|
||||
"../src/url-utils.rkt")
|
||||
|
||||
|
|
|
@ -8,13 +8,16 @@
|
|||
html-parsing
|
||||
html-writing
|
||||
web-server/http
|
||||
web-server/http/bindings
|
||||
"config.rkt"
|
||||
"data.rkt"
|
||||
"niwa-data.rkt"
|
||||
"extwiki-data.rkt"
|
||||
"extwiki-generic.rkt"
|
||||
"static-data.rkt"
|
||||
"pure-utils.rkt"
|
||||
"xexpr-utils.rkt"
|
||||
"url-utils.rkt")
|
||||
"../lib/syntax.rkt"
|
||||
"../lib/pure-utils.rkt"
|
||||
"../lib/xexpr-utils.rkt"
|
||||
"../lib/url-utils.rkt")
|
||||
|
||||
(provide
|
||||
; headers to always send on all http responses
|
||||
|
@ -79,32 +82,69 @@
|
|||
|
||||
;; generate a notice with a link if a fandom wiki has a replacement as part of NIWA or similar
|
||||
;; if the wiki has no replacement, display nothing
|
||||
(define (niwa-notice wikiname title)
|
||||
(define ind (findf (λ (item) (member wikiname (first item))) niwa-data))
|
||||
(if ind
|
||||
(let* ([search-page (format "/Special:Search?~a"
|
||||
(params->query `(("search" . ,title)
|
||||
("go" . "Go"))))]
|
||||
[go (if (string-suffix? (third ind) "/")
|
||||
(regexp-replace #rx"/$" (third ind) (λ (_) search-page))
|
||||
(let* ([joiner (second (regexp-match #rx"/(w[^./]*)/" (third ind)))])
|
||||
(regexp-replace #rx"/w[^./]*/.*$" (third ind) (λ (_) (format "/~a~a" joiner search-page)))))])
|
||||
`(aside (@ (class "niwa__notice"))
|
||||
(h1 (@ (class "niwa__header")) ,(second ind) " has its own website separate from Fandom.")
|
||||
(a (@ (class "niwa__go") (href ,go)) "Read " ,title " on " ,(second ind) " →")
|
||||
(div (@ (class "niwa__cols"))
|
||||
(div (@ (class "niwa__left"))
|
||||
(p "Most major Nintendo wikis are part of the "
|
||||
(a (@ (href "https://www.niwanetwork.org/about/")) "Nintendo Independent Wiki Alliance")
|
||||
" and have their own wikis off Fandom. You can help this wiki by "
|
||||
(a (@ (href ,go)) "visiting it directly."))
|
||||
(p ,(fifth ind))
|
||||
(div (@ (class "niwa__divider")))
|
||||
(p "Why are you seeing this message? Fandom refuses to delete or archive their copy of this wiki, so that means their pages will appear high up in search results. Fandom hopes to get clicks from readers who don't know any better.")
|
||||
(p (@ (class "niwa__feedback")) "This notice brought to you by BreezeWiki / " (a (@ (href "https://www.kotaku.com.au/2022/10/massive-zelda-wiki-reclaims-independence-six-months-before-tears-of-the-kingdom/")) "Info & Context") " / " (a (@ (href "https://docs.breezewiki.com/Reporting_Bugs.html")) "Feedback?")))
|
||||
(div (@ (class "niwa__right"))
|
||||
(img (@ (class "niwa__logo") (src ,(format "https://www.niwanetwork.org~a" (fourth ind)))))))))
|
||||
""))
|
||||
(define (extwiki-notice wikiname title)
|
||||
(define xt (findf (λ (item) (member wikiname (extwiki^-wikinames item))) extwikis))
|
||||
(cond/var
|
||||
[xt
|
||||
(let* ([group (hash-ref extwiki-groups (extwiki^-group xt))]
|
||||
[search-page (format "/Special:Search?~a"
|
||||
(params->query `(("search" . ,title)
|
||||
("go" . "Go"))))]
|
||||
[go (if (string-suffix? (extwiki^-home xt) "/")
|
||||
(regexp-replace #rx"/$" (extwiki^-home xt) (λ (_) search-page))
|
||||
(let* ([joiner (second (regexp-match #rx"/(w[^./]*)/" (extwiki^-home xt)))])
|
||||
(regexp-replace #rx"/w[^./]*/.*$" (extwiki^-home xt) (λ (_) (format "/~a~a" joiner search-page)))))]
|
||||
[props (extwiki-props^ go)])
|
||||
(cond
|
||||
[(eq? (extwiki^-banner xt) 'default)
|
||||
`(aside (@ (class "niwa__notice"))
|
||||
(h1 (@ (class "niwa__header")) ,(extwiki^-name xt) " has its own website separate from Fandom.")
|
||||
(a (@ (class "niwa__go") (href ,go)) "Read " ,title " on " ,(extwiki^-name xt) " →")
|
||||
(div (@ (class "niwa__cols"))
|
||||
(div (@ (class "niwa__left"))
|
||||
(p ,((extwiki-group^-description group) props))
|
||||
(p ,((extwiki^-description xt) props))
|
||||
(p "This wiki's core community has wholly migrated away from Fandom. You should "
|
||||
(a (@ (href ,go)) "go to " ,(extwiki^-name xt) " now!"))
|
||||
(p (@ (class "niwa__feedback"))
|
||||
,@(add-between
|
||||
`(,@(for/list ([link (extwiki-group^-links group)])
|
||||
`(a (@ (href ,(cdr link))) ,(car link)))
|
||||
"This notice is from BreezeWiki"
|
||||
(a (@ (href "https://docs.breezewiki.com/Reporting_Bugs.html")) "Feedback?"))
|
||||
" / ")))
|
||||
(div (@ (class "niwa__right"))
|
||||
(img (@ (class "niwa__logo") (src ,(extwiki^-logo xt)))))))]
|
||||
[(eq? (extwiki^-banner xt) 'parallel)
|
||||
`(aside (@ (class "niwa__parallel"))
|
||||
(h1 (@ (class "niwa__header-mini"))
|
||||
"See also "
|
||||
(a (@ (href ,go)) ,(extwiki^-name xt)))
|
||||
(p "This topic has multiple communities of editors, some active on the Fandom wiki, others active on " ,(extwiki^-name xt) ".")
|
||||
(p "For thorough research, be sure to check both communities since they may have different information!")
|
||||
(p (@ (class "niwa__feedback"))
|
||||
,@(add-between
|
||||
`(,@(for/list ([link (extwiki-group^-links group)])
|
||||
`(a (@ (href ,(cdr link))) ,(car link)))
|
||||
"This notice is from BreezeWiki"
|
||||
(a (@ (href "https://docs.breezewiki.com/Reporting_Bugs.html")) "Feedback?"))
|
||||
" / ")))]
|
||||
[(eq? (extwiki^-banner xt) 'empty)
|
||||
`(aside (@ (class "niwa__notice niwa__notice--alt"))
|
||||
(h1 (@ (class "niwa__header")) "You will be redirected to " ,(extwiki^-name xt) ".")
|
||||
(p (@ (style "position: relative; top: -12px;")) "This independent wiki community has its own site separate from Fandom.")
|
||||
(a (@ (class "niwa__go") (href ,go)) "Take me there! →")
|
||||
|
||||
(p (@ (class "niwa__feedback") (style "text-align: left"))
|
||||
,@(add-between
|
||||
`(,@(for/list ([link (extwiki-group^-links group)])
|
||||
`(a (@ (href ,(cdr link))) ,(car link)))
|
||||
"This notice is from BreezeWiki")
|
||||
" / ")))]))]
|
||||
(var fetched-callback (get-redirect-content wikiname))
|
||||
[fetched-callback
|
||||
(fetched-callback title)]
|
||||
[#t ""]))
|
||||
|
||||
(define (generate-wiki-page
|
||||
content
|
||||
|
@ -114,22 +154,26 @@
|
|||
#:title title
|
||||
#:head-data [head-data-in #f]
|
||||
#:siteinfo [siteinfo-in #f]
|
||||
#:user-cookies [user-cookies-in #f])
|
||||
#:user-cookies [user-cookies-in #f]
|
||||
#:online-styles [online-styles #t])
|
||||
(define siteinfo (or siteinfo-in siteinfo-default))
|
||||
(define head-data (or head-data-in ((head-data-getter wikiname))))
|
||||
(define user-cookies (or user-cookies-in (user-cookies-getter req)))
|
||||
(define (required-styles origin)
|
||||
(map (λ (dest-path)
|
||||
(define url (format dest-path origin))
|
||||
(if (config-true? 'strict_proxy)
|
||||
(u-proxy-url url)
|
||||
url))
|
||||
`(#;"~a/load.php?lang=en&modules=skin.fandomdesktop.styles&only=styles&skin=fandomdesktop"
|
||||
#;"~a/load.php?lang=en&modules=ext.gadget.dungeonsWiki%2CearthWiki%2Csite-styles%2Csound-styles&only=styles&skin=fandomdesktop"
|
||||
#;"~a/load.php?lang=en&modules=site.styles&only=styles&skin=fandomdesktop"
|
||||
; combine the above entries into a single request for potentially extra speed - fandom.com doesn't even do this!
|
||||
,(format "~~a/wikia.php?controller=ThemeApi&method=themeVariables&variant=~a" (user-cookies^-theme user-cookies))
|
||||
"~a/load.php?lang=en&modules=skin.fandomdesktop.styles%7Cext.fandom.PortableInfoboxFandomDesktop.css%7Cext.fandom.GlobalComponents.CommunityHeaderBackground.css%7Cext.gadget.site-styles%2Csound-styles%7Csite.styles&only=styles&skin=fandomdesktop")))
|
||||
(define origin (format "https://~a.fandom.com" wikiname))
|
||||
(define required-styles
|
||||
(cond
|
||||
[online-styles
|
||||
(define styles
|
||||
(list
|
||||
(format "~a/wikia.php?controller=ThemeApi&method=themeVariables&variant=~a" origin (user-cookies^-theme user-cookies))
|
||||
(format "~a/load.php?lang=en&modules=skin.fandomdesktop.styles%7Cext.fandom.PortableInfoboxFandomDesktop.css%7Cext.fandom.GlobalComponents.CommunityHeaderBackground.css%7Cext.gadget.site-styles%2Csound-styles%7Csite.styles&only=styles&skin=fandomdesktop" origin)))
|
||||
(if (config-true? 'strict_proxy)
|
||||
(map u-proxy-url styles)
|
||||
styles)]
|
||||
[#t
|
||||
(list
|
||||
(format "/archive/~a/styles/themeVariables-~a.css" wikiname (user-cookies^-theme user-cookies))
|
||||
(format "/archive/~a/styles/site.css" wikiname))]))
|
||||
`(*TOP*
|
||||
(*DECL* DOCTYPE html)
|
||||
(html
|
||||
|
@ -141,7 +185,7 @@
|
|||
(config-get 'application_name)))
|
||||
,@(map (λ (url)
|
||||
`(link (@ (rel "stylesheet") (type "text/css") (href ,url))))
|
||||
(required-styles (format "https://~a.fandom.com" wikiname)))
|
||||
required-styles)
|
||||
(link (@ (rel "stylesheet") (type "text/css") (href ,(get-static-url "main.css"))))
|
||||