diff --git a/archiver/archiver-cli.rkt b/archiver/archiver-cli.rkt
new file mode 100644
index 0000000..11f25d0
--- /dev/null
+++ b/archiver/archiver-cli.rkt
@@ -0,0 +1,68 @@
+#lang cli
+(require charterm
+ "archiver.rkt")
+
+(help (usage "Downloads a single Fandom wiki in BreezeWiki offline format."
+ ""
+ "Downloaded pages go into `archive/` next to the executable."
+ "Database goes into `archiver.db*` next to the executable."
+ "The database is necessary to store your download progress and resume where you left off if the process is interrupted."))
+
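+;; Example invocation (illustrative wikiname; flags as defined below):
+;;   racket archiver-cli.rkt --output-progress minecraft
+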
+(flag (output-quiet?)
+ ("-q" "--output-quiet" "disable progress output")
+ (output-quiet? #t))
+
+(flag (output-progress?)
+ ("-p" "--output-progress" "progress output for terminals (default in a tty)")
+ (output-progress? #t))
+
+(flag (output-lines?)
+ ("-l" "--output-lines" "output the name of each file downloaded (default outside of a tty)")
+ (output-lines? #t))
+
+(constraint (one-of output-quiet? output-lines? output-progress?))
+
+
+
+(program
+ (start [wikiname "wikiname to download"])
+ ;; set up arguments
+ (define width 80)
+ (when (not (or (output-quiet?) (output-lines?) (output-progress?)))
+    (cond [(terminal-port? (current-input-port))
+ (output-progress? #t)]
+ [else
+ (output-lines? #t)]))
+ (define (update-width)
+ (when (output-progress?)
+ (with-charterm
+ (call-with-values (λ () (charterm-screen-size))
+ (λ (cols rows) (set! width cols))))))
+ (update-width)
+ ;; check
+ (when (or (not wikiname) (equal? wikiname ""))
+ (raise-user-error "Please specify the wikiname to download on the command line."))
+ ;; progress reporting based on selected mode
+ (define (report-progress a b c)
+ (define basename (basename->name-for-query c))
+ (cond
+ [(output-lines?)
+ (displayln basename)]
+ [(output-progress?)
+       (when (zero? (modulo a 20))
+ (thread (λ () (update-width))))
+ (define prefix (format "[~a] [~a/~a] " wikiname a b))
+ (define rest (- width (string-length prefix)))
+ (define real-width (min (string-length basename) rest))
+ (define spare-width (- rest real-width))
+ (define name-display (substring basename 0 real-width))
+ (define whitespace (make-string spare-width #\ ))
+ (printf "~a~a~a\r" prefix name-display whitespace)]))
+ ;; download all stages
+ (for ([stage all-stages]
+ [i (in-naturals 1)])
+ (printf "> Stage ~a/~a~n" i (length all-stages))
+ (stage wikiname report-progress)
+ (displayln "")))
+
+(run start)
diff --git a/archiver/archiver-database.rkt b/archiver/archiver-database.rkt
new file mode 100644
index 0000000..b81ad6c
--- /dev/null
+++ b/archiver/archiver-database.rkt
@@ -0,0 +1,81 @@
+#lang racket/base
+(require racket/file
+ racket/list
+ racket/path
+ racket/string
+ json
+ json-pointer
+ db
+ "../lib/syntax.rkt")
+
+(provide
+ get-slc
+ query-exec*
+ query-rows*
+ query-list*
+ query-value*
+ query-maybe-value*
+ query-maybe-row*)
+
+(define storage-path (anytime-path ".." "storage"))
+(define database-file (build-path storage-path "archiver.db"))
+
+(define slc (box #f))
+(define (get-slc)
+ (define slc* (unbox slc))
+ (cond
+ [slc* slc*]
+ [else
+ (make-directory* storage-path)
+ (define slc* (sqlite3-connect #:database database-file #:mode 'create))
+ (query-exec slc* "PRAGMA journal_mode=WAL")
+ (define database-version
+ (with-handlers ([exn:fail:sql?
+ (λ (exn)
+ ; need to set up the database
+ (query-exec slc* "create table database_version (version integer, primary key (version))")
+ (query-exec slc* "insert into database_version values (0)")
+ 0)])
+ (query-value slc* "select version from database_version")))
+
+ (define migrations
+ (wrap-sql
+ ((query-exec slc* "create table page (wikiname TEXT NOT NULL, basename TEXT NOT NULL, progress INTEGER NOT NULL, PRIMARY KEY (wikiname, basename))")
+ (query-exec slc* "create table wiki (wikiname TEXT NOT NULL, progress INTEGER, PRIMARY KEY (wikiname))"))
+ ((query-exec slc* "create table special_page (wikiname TEXT NOT NULL, key TEXT NOT NULL, basename TEXT NOT NULL, PRIMARY KEY (wikiname, key))"))
+ ((query-exec slc* "update wiki set progress = 2 where wikiname in (select wikiname from wiki inner join page using (wikiname) group by wikiname having min(page.progress) = 1)"))
+ ((query-exec slc* "create table image (wikiname TEXT NOT NULL, hash TEXT NTO NULL, url TEXT NOT NULL, ext TEXT, source INTEGER NOT NULL, progress INTEGER NOT NULL, PRIMARY KEY (wikiname, hash))"))
+ ((query-exec slc* "alter table wiki add column sitename TEXT")
+ (query-exec slc* "alter table wiki add column basepage TEXT")
+ (query-exec slc* "alter table wiki add column license_text TEXT")
+ (query-exec slc* "alter table wiki add column license_url TEXT"))))
+
+ (let do-migrate-step ()
+ (when (database-version . < . (length migrations))
+ (call-with-transaction
+ slc*
+ (list-ref migrations database-version))
+ (set! database-version (add1 database-version))
+ (query-exec slc* "update database_version set version = $1" database-version)
+ (do-migrate-step)))
+
+ (set-box! slc slc*)
+ slc*]))
+
+(define (query-exec* . args)
+ (apply query-exec (get-slc) args))
+
+(define (query-rows* . args)
+ (apply query-rows (get-slc) args))
+
+(define (query-list* . args)
+ (apply query-list (get-slc) args))
+
+(define (query-value* . args)
+ (apply query-value (get-slc) args))
+
+(define (query-maybe-value* . args)
+ (apply query-maybe-value (get-slc) args))
+
+(define (query-maybe-row* . args)
+ (apply query-maybe-row (get-slc) args))
diff --git a/archiver/archiver-gui.rkt b/archiver/archiver-gui.rkt
new file mode 100644
index 0000000..6f09cb8
--- /dev/null
+++ b/archiver/archiver-gui.rkt
@@ -0,0 +1,347 @@
+#lang racket/base
+(require racket/class
+ racket/draw
+ racket/format
+ racket/list
+ racket/port
+ racket/set
+ racket/splicing
+ racket/string
+ db
+ net/http-easy
+ memo
+ (only-in racket/gui timer%)
+ racket/gui/easy
+ racket/gui/easy/operator
+ (only-in pict bitmap)
+ images/icons/arrow
+ images/icons/control
+ images/icons/stickman
+ images/icons/style
+ images/icons/symbol
+ "archiver-database.rkt"
+ "archiver.rkt"
+ "../lib/url-utils.rkt"
+ "../lib/xexpr-utils.rkt")
+
+(default-icon-material rubber-icon-material)
+
+(require (for-syntax racket/base racket/match racket/set racket/string))
+
+(define-syntax (@> stx)
+ (define form (cdr (syntax->datum stx)))
+ (match form
+ [(list form) ; (@> (fn @obs))
+ ;; identify the observables and replace with non-@ symbols
+ (define collection (mutable-set))
+ (define updated
+ (let loop ([sexp form])
+ (cond [(symbol? sexp)
+ (let ([as-s (symbol->string sexp)])
+ (if (string-prefix? as-s "@")
+ (let ([without-@ (string->symbol (substring as-s 1))])
+ (set-add! collection (cons sexp without-@))
+ without-@)
+ sexp))]
+ [(pair? sexp) (cons (loop (car sexp)) (loop (cdr sexp)))]
+ [#t sexp])))
+ (define collection-l (set->list collection))
+ ;; return obs-combine -> updated-form
+ (datum->syntax stx `(obs-combine (λ (,@(map cdr collection-l)) ,updated) ,@(map car collection-l)))]
+ [(list (? string? str) args ...) ; (@> "Blah: ~a/~a" @arg1 arg2)
+ ;; identify the observables and replace with non-@ symbols
+ (define collection-l
+ (for/list ([arg args])
+ (if (symbol? arg)
+ (let ([as-s (symbol->string arg)])
+ (if (string-prefix? as-s "@")
+ (let ([without-@ (string->symbol (substring as-s 1))])
+ (cons arg without-@))
+ (cons #f arg)))
+ (cons #f arg))))
+ (define collection-lo (filter car collection-l))
+ ;; return obs-combine -> format
+ (datum->syntax stx `(obs-combine (λ (,@(map cdr collection-lo)) (format ,str ,@(map cdr collection-l))) ,@(map car collection-lo)))]))
+
+(define/obs @auto-retry #f)
+
+(define-struct qi^ (wikiname st stage progress max-progress eta th) #:transparent) ;; queue item
+
+(define rows (query-rows* "select wikiname, progress from wiki where progress < 4"))
+(define/obs @queue null)
+(define (add-wikiname-to-queue wikiname st stage)
+ (@queue . <~ . (λ (queue)
+ (define already-exists? (findf (λ (qi) (equal? (qi^-wikiname qi) wikiname)) queue))
+ (if already-exists?
+ queue
+ (append queue (list (qi^ wikiname st stage 0 1 "..." #f)))))))
+(for ([row rows])
+ (add-wikiname-to-queue (vector-ref row 0)
+ (if (= (vector-ref row 1) 4)
+ 'complete
+ 'queued)
+ (vector-ref row 1)))
+
+(define status-icon-size 32)
+(define status-icon-min-width 36)
+(define button-icon-size 12)
+
+(define color-green (make-color 90 212 68))
+
+(define/obs @input "")
+
+(splicing-let ([frame-count 30])
+ (define stickman-frames
+ (for/vector ([s (in-range 0 1 (/ 1 frame-count))])
+ (running-stickman-icon
+ s
+ #:height status-icon-size
+ #:material (default-icon-material))))
+
+ (define/obs @stick-frame-no 0)
+ (define stick-timer
+ (new timer%
+ [notify-callback (λ () (@stick-frame-no . <~ . add1))]
+ [interval (truncate (/ 1000 frame-count))]))
+ (define/obs @stick
+ (@stick-frame-no . ~> . (λ (n) (vector-ref stickman-frames
+ (modulo n (vector-length stickman-frames)))))))
+
+(define status-icons
+ (hasheq 'queued (stop-icon #:color syntax-icon-color #:height status-icon-size)
+ 'paused (continue-forward-icon #:color syntax-icon-color #:height status-icon-size)
+ 'running @stick
+ 'error (x-icon #:height status-icon-size)
+ 'complete (check-icon #:color color-green #:height status-icon-size)))
+
+(define action-icons
+ (hasheq 'pause (pause-icon #:color syntax-icon-color #:height button-icon-size)
+ 'resume (play-icon #:color color-green #:height button-icon-size)
+ 'reset (left-over-arrow-icon #:color halt-icon-color #:height button-icon-size)))
+
+(define (bitmap-view @the-bitmap [min-width 1])
+ (pict-canvas #:min-size (@> (list (max min-width (send @the-bitmap get-width)) (send @the-bitmap get-height))) #;(if min-size (list min-size min-size) #f)
+ #:stretch '(#f #f)
+ #:style '(transparent)
+ @the-bitmap
+ bitmap))
+
+(define (exn->string e)
+ (with-output-to-string
+ (λ ()
+ (displayln (exn-message e))
+ (displayln "context:")
+ (for ([item (continuation-mark-set->context (exn-continuation-marks e))])
+ (printf " ~a" (srcloc->string (cdr item)))
+ (when (car item)
+ (printf ": ~a" (car item)))
+ (displayln "")))))
+
+(define ((handle-graphical-exn @qi) e)
+ (displayln (exn->string e) (current-error-port))
+ (cond
+ [(obs-peek @auto-retry)
+ (void) ;; TODO
+ #;(do-retry-end wikiname)]
+ [#t
+ (update-qi @qi [st 'error])
+ (do-try-unpause-next-entry)
+ (thread
+ (λ ()
+ (define/obs @visible? #t)
+ (render
+ (dialog #:title "Download Error"
+ #:style '(resize-border)
+ #:mixin (λ (%) (class % (super-new)
+ (obs-observe! @visible? (λ (visible?) (send this show visible?)))))
+ (vpanel #:margin '(15 15)
+ (text "Encountered this error while downloading:")
+ (input #:style '(multiple hscroll)
+ #:min-size '(#f 200)
+ (exn->string e))
+ ;; TODO
+ #;(button "Retry Now" (λ () (:= @visible? #f) (do-retry-now wikiname)))
+ #;(button "Retry Round-Robin" (λ () (:= @visible? #f) (do-retry-end wikiname)))
+ #;(button "Skip Wiki" (λ () (:= @visible? #f) (do-continue)))
+ #;(button "Use Auto-Retry" (λ ()
+ (:= @auto-retry #t)
+ (:= @visible? #f)
+ (do-retry-end wikiname)))
+ #;(text "Be careful not to auto-retry an infinite loop!")))
+ main-window)))
+ (sleep)
+ ; make sure the broken thread is gone
+ (define th (qi^-th (obs-peek @qi)))
+ (when th (kill-thread th))]))
+
+(define segments
+ (list
+ (list 5/100 (make-color 0 223 217))
+ (list 88/100 color-green)
+ (list 2/100 (make-color 0 223 217))
+ (list 5/100 color-green)))
+(define segment-spacing 2)
+(unless (= (apply + (map car segments)) 1)
+ (error 'segments "segments add up to ~a, not 1" (apply + (map car segments))))
+
+;; return the new bitmap, which can be drawn on a dc<%>
+(define/memoize (ray-trace width height stage progress max-progress)
+ ;; (printf "rendering ~a ~a/~a at ~a~n" stage progress max-progress (current-inexact-milliseconds))
+ (define bm (make-object bitmap% width height #f #t))
+ (define dc (make-object bitmap-dc% bm))
+ (define width-available (- width (* (length segments) segment-spacing)))
+ (send dc set-smoothing 'unsmoothed)
+ (send dc set-pen "black" 0 'transparent)
+ (for/fold ([offset 0])
+ ([segment segments]
+ [i (in-naturals 0)]) ;; zero indexed stages?
+ ;; calculate start and end locations of grey bar
+ (define-values (segment-proportion segment-color) (apply values segment))
+ (define segment-start (if (= offset 0) 0 (+ offset segment-spacing)))
+ (define segment-width (* width-available segment-proportion))
+ ;; draw grey bar
+ (send dc set-brush (make-color 180 180 180 0.4) 'solid)
+ (send dc draw-rectangle segment-start 0 segment-width height)
+ ;; draw solid bar according to the current item's progress
+ (define proportion
+ (cond [(stage . < . i) 0]
+ [(stage . > . i) 1]
+ [(max-progress . <= . 0) 0]
+ [(progress . < . 0) 0]
+ [(progress . >= . max-progress) 1]
+ [else (progress . / . max-progress)]))
+ (send dc set-brush segment-color 'solid)
+ (send dc draw-rectangle segment-start 0 (* proportion segment-width) height)
+ (+ segment-start segment-width))
+ (bitmap-render-icon bm 6/8))
+
+;; get ray traced bitmap (possibly from cache) and draw on dc<%>
+(define (draw-bar orig-dc qi)
+ ;; (println ray-traced)
+ (define-values (width height) (send orig-dc get-size))
+ (send orig-dc draw-bitmap (ray-trace width height (qi^-stage qi) (qi^-progress qi) (qi^-max-progress qi)) 0 0))
+
+(define ((make-progress-updater @qi) a b c)
+ ;; (printf "~a: ~a/~a ~a~n" (qi^-wikiname (obs-peek @qi)) a b c)
+ (update-qi @qi [progress a] [max-progress b]))
+
+(define (do-add-to-queue)
+ (define wikiname (string-trim (obs-peek @input)))
+ (when ((string-length wikiname) . > . 0)
+ (add-wikiname-to-queue wikiname 'queued 0)) ;; TODO: automatically start?
+ (:= @input ""))
+
+(define-syntax-rule (update-qi @qi args ...)
+ (let ([wikiname (qi^-wikiname (obs-peek @qi))])
+ (@queue . <~ . (λ (queue)
+ (for/list ([qi queue])
+ (if (equal? (qi^-wikiname qi) wikiname)
+ (struct-copy qi^ qi args ...)
+ qi))))))
+
+(define (do-start-qi @qi)
+ (define th
+ (thread (λ ()
+ (with-handlers ([exn? (handle-graphical-exn @qi)])
+ (define last-stage
+ (for/last ([stage all-stages]
+ [i (in-naturals)])
+ (update-qi @qi [stage i])
+ (stage (qi^-wikiname (obs-peek @qi)) (make-progress-updater @qi))
+ i))
+ (update-qi @qi [st 'complete] [stage (add1 last-stage)])
+ (do-try-unpause-next-entry)))))
+ (update-qi @qi [st 'running] [th th]))
+
+(define (do-stop-qi @qi)
+ (define th (qi^-th (obs-peek @qi)))
+ (when th (kill-thread th))
+ (update-qi @qi [th #f] [st 'paused]))
+
+(define (do-reset-qi @qi)
+ (define th (qi^-th (obs-peek @qi)))
+ (when th (kill-thread th))
+ (update-qi @qi [th #f] [st 'queued] [stage 0] [progress 0] [max-progress 0])
+ (query-exec* "update wiki set progress = 0 where wikiname = ?" (qi^-wikiname (obs-peek @qi))))
+
+(define (do-try-unpause-next-entry)
+ (define queue (obs-peek @queue))
+ (define next-qi (for/first ([qi queue]
+ #:when (memq (qi^-st qi) '(paused queued error)))
+ qi))
+ (when next-qi
+ (define @qi (@queue . ~> . (λ (queue) (findf (λ (qi) (equal? (qi^-wikiname qi) (qi^-wikiname next-qi))) queue))))
+ (do-start-qi @qi)))
+
+(define main-window
+ (render
+ (window
+ #:title "Fandom Archiver"
+ #:size '(400 300)
+ #:mixin (λ (%) (class %
+ (super-new)
+ (define/augment (on-close)
+ (send stick-timer stop)
+ (for ([qi (obs-peek @queue)])
+ (when (qi^-th qi)
+ (kill-thread (qi^-th qi))))
+ #;(disconnect*))))
+ (vpanel
+ #:spacing 10
+ #:margin '(5 5)
+ (hpanel
+ #:stretch '(#t #f)
+ #:spacing 10
+ (hpanel
+ (text "https://")
+ (input @input
+ (λ (event data) (cond
+ [(eq? event 'input) (:= @input data)]
+ [(eq? event 'return) (do-add-to-queue)])))
+ (text ".fandom.com"))
+ (button "Download Wiki" do-add-to-queue))
+ (list-view
+ #:style '(vertical)
+ @queue
+ #:key qi^-wikiname
+ (λ (k @qi)
+ (define @status-icons
+ (@> (case (qi^-st @qi)
+ [(running) @stick]
+ [else (hash-ref status-icons (qi^-st @qi))])))
+ (define @is-running?
+ (@> (memq (qi^-st @qi) '(running))))
+ (define @is-complete?
+ (@> (eq? (qi^-st @qi) 'complete)))
+ ;; state icon at the left side
+ (hpanel #:stretch '(#t #f)
+ #:alignment '(left center)
+ #:spacing 8
+ (bitmap-view @status-icons status-icon-min-width)
+ (vpanel
+ ;; name and buttons (top half)
+ (hpanel #:alignment '(left bottom)
+ (text (@> (qi^-wikiname @qi)))
+ (spacer)
+ (hpanel
+ #:stretch '(#f #f)
+ (if-view @is-complete?
+ (button (hash-ref action-icons 'reset)
+ (λ () (do-reset-qi @qi)))
+ (spacer))
+ (if-view @is-running?
+ (button (hash-ref action-icons 'pause)
+ (λ () (do-stop-qi @qi)))
+ (button (hash-ref action-icons 'resume)
+ (λ () (do-start-qi @qi))))))
+ ;; progress bar (bottom half)
+ (hpanel
+ (canvas
+ @qi
+ #:style '(transparent)
+ #:margin '(3 3)
+ draw-bar)
+ (hpanel #:min-size '(68 #f)
+ #:stretch '(#f #f)
+ #:alignment '(right center)
+ (text (@> (format "eta ~a" (qi^-eta @qi))))))))))))))
diff --git a/archiver/archiver.rkt b/archiver/archiver.rkt
new file mode 100644
index 0000000..edd0d2b
--- /dev/null
+++ b/archiver/archiver.rkt
@@ -0,0 +1,335 @@
+#lang racket/base
+(require racket/file
+ racket/function
+ racket/list
+ racket/path
+ racket/sequence
+ racket/string
+ net/url
+ net/mime
+ file/sha1
+ net/http-easy
+ db
+ json
+ "archiver-database.rkt"
+ "../lib/html-parsing/main.rkt"
+ "../lib/mime-types.rkt"
+ "../lib/syntax.rkt"
+ "../lib/tree-updater.rkt"
+ "../lib/url-utils.rkt"
+ "../lib/xexpr-utils.rkt"
+ "../lib/archive-file-mappings.rkt")
+
+(provide
+ basename->name-for-query
+ image-url->values
+ hash->save-dir
+ all-stages)
+
+(module+ test
+ (require rackunit))
+
+(define archive-root (anytime-path ".." "storage/archive"))
+(make-directory* archive-root)
+
+(define sources '#hasheq((style . 1) (page . 2)))
+
+(define (get-origin wikiname)
+ (format "https://~a.fandom.com" wikiname))
+
+(define (insert-wiki-entry wikiname)
+ (define dest-url
+ (format "https://~a.fandom.com/api.php?~a"
+ wikiname
+ (params->query '(("action" . "query")
+ ("meta" . "siteinfo")
+ ("siprop" . "general|rightsinfo|statistics")
+ ("format" . "json")
+ ("formatversion" . "2")))))
+ (define data (response-json (get dest-url)))
+ (define exists? (query-maybe-value* "select progress from wiki where wikiname = ?" wikiname))
+ (if (and exists? (not (sql-null? exists?)))
+ (query-exec* "update wiki set sitename = ?, basepage = ?, license_text = ?, license_url = ? where wikiname = ?"
+ (jp "/query/general/sitename" data)
+ (second (regexp-match #rx"/wiki/(.*)" (jp "/query/general/base" data)))
+ (jp "/query/rightsinfo/text" data)
+ (jp "/query/rightsinfo/url" data)
+ wikiname)
+ (query-exec* "insert into wiki (wikiname, progress, sitename, basepage, license_text, license_url) values (?, 0, ?, ?, ?, ?)"
+ wikiname
+ (jp "/query/general/sitename" data)
+ (second (regexp-match #rx"/wiki/(.*)" (jp "/query/general/base" data)))
+ (jp "/query/rightsinfo/text" data)
+ (jp "/query/rightsinfo/url" data)))
+ (jp "/query/statistics/articles" data))
+
+
+(define (check-style-for-images wikiname path)
+ (define content (file->string path))
+ (define urls (regexp-match* #rx"url\\(\"?'?([^)]*)'?\"?\\)" content #:match-select cadr))
+ (for/list ([url urls]
+ #:when (not (or (equal? url "")
+ (equal? url "'")
+ (string-suffix? url "\"")
+ (string-contains? url "/resources-ucp/")
+ (string-contains? url "/fonts/")
+ (string-contains? url "/drm_fonts/")
+ (string-contains? url "//db.onlinewebfonts.com/")
+ (string-contains? url "//bits.wikimedia.org/")
+ (string-contains? url "dropbox")
+ (string-contains? url "only=styles")
+ (string-contains? url "https://https://")
+ (regexp-match? #rx"^%20" url)
+ (regexp-match? #rx"^data:" url))))
+ (cond
+ [(string-prefix? url "https://") url]
+ [(string-prefix? url "http://") (regexp-replace #rx"http:" url "https:")]
+ [(string-prefix? url "//") (string-append "https:" url)]
+ [(string-prefix? url "/") (format "https://~a.fandom.com~a" wikiname url)]
+ [else (raise-user-error "While calling check-style-for-images, this URL had an unknown format and couldn't be saved:" url path)])))
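+;; e.g. a stylesheet reference url(//static.wikia.nocookie.net/x.png) is
+;; normalized to "https://static.wikia.nocookie.net/x.png" (illustrative URL).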
+
+(define (download-styles-for-wiki wikiname callback)
+ (define save-dir (build-path archive-root wikiname "styles"))
+ (make-directory* save-dir)
+ (define theme (λ (theme-name)
+ (cons (format "https://~a.fandom.com/wikia.php?controller=ThemeApi&method=themeVariables&variant=~a" wikiname theme-name)
+ (build-path save-dir (format "themeVariables-~a.css" theme-name)))))
+ ;; (Listof (Pair url save-path))
+ (define styles
+ (list
+ (theme "default")
+ (theme "light")
+ (theme "dark")
+ (cons (format "https://~a.fandom.com/load.php?lang=en&modules=site.styles%7Cskin.fandomdesktop.styles%7Cext.fandom.PortableInfoboxFandomDesktop.css%7Cext.fandom.GlobalComponents.CommunityHeaderBackground.css%7Cext.gadget.site-styles%2Csound-styles&only=styles&skin=fandomdesktop" wikiname)
+ (build-path save-dir "site.css"))))
+ (for ([style styles]
+ [i (in-naturals)])
+ (callback i (length styles) "styles...")
+ (define r (get (car style)))
+ (define body (response-body r))
+ (display-to-file body (cdr style) #:exists 'replace)
+ ;; XXX: how the HELL do I deal with @import?? would need some kind of recursion here. how will the page server know where to look up the style file to be able to serve them again? do I add another link-stylesheet tag to the main page? what about the remaining stuck @import url?
+ )
+ (callback (length styles) (length styles) "styles...")
+ styles)
+
+(define (hash->save-dir wikiname hash)
+ (build-path archive-root wikiname "images" (substring hash 0 1) (substring hash 0 2)))
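+;; e.g. an image with hash "ab12..." for wiki "minecraft" is saved under
+;; storage/archive/minecraft/images/a/ab/ (illustrative values).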
+
+(define (image-url->values i)
+ ;; TODO: handle case where there is multiple broken cb parameter on minecraft wiki
+ ;; TODO: ensure it still "works" with broken & on minecraft wiki
+ (define no-cb (regexp-replace #rx"\\cb=[0-9]+&?" i "")) ; remove cb url parameter which does nothing
+ (define key (regexp-replace #rx"[&?]$" no-cb "")) ; remove extra separator if necessary
+ (define hash (sha1 (string->bytes/utf-8 key)))
+ (cons key hash))
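+;; Illustrative example (hypothetical URL):
+;;   (image-url->values "https://static.example/a.png?cb=123&f=x")
+;;   ;; => '("https://static.example/a.png?f=x" . "<sha1 hex of that key>")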
+
+
+;; 1. Download list of wiki pages and store in database, if not done yet for that wiki
+(define (if-necessary-download-list-of-pages wikiname callback)
+ (define wiki-progress (query-maybe-value* "select progress from wiki where wikiname = ?" wikiname))
+ ;; done yet?
+ (unless (and (real? wiki-progress) (wiki-progress . >= . 1))
+ ;; Count total pages
+ (define num-pages (insert-wiki-entry wikiname))
+ ;; Download the entire index of pages
+ (define basenames
+ (let loop ([path-with-namefrom "/wiki/Local_Sitemap"]
+ [basenames-previous-pages null])
+ ;; Download the current index page
+ (define url (format "https://~a.fandom.com~a" wikiname path-with-namefrom))
+ (define r (get url))
+ ;; Metadata from this page (the link to the next page)
+ (define page (html->xexp (bytes->string/utf-8 (response-body r))))
+ (define link-namefrom
+ ((query-selector (λ (t a c x) (and (eq? t 'a)
+ (pair? x)
+ (string-contains? (car x) "Next page")
+                                       (let ([href (get-attribute 'href a)])
+ (and href (string-contains? href "/wiki/Local_Sitemap")))))
+ page #:include-text? #t)))
+ ;; Content from this page
+ (define basenames-this-page
+ (for/list ([link (in-producer
+ (query-selector
+ (λ (t a c) (eq? t 'a))
+ ((query-selector (λ (t a c) (has-class? "mw-allpages-chunk" a)) page)))
+ #f)])
+ (local-encoded-url->basename (get-attribute 'href (bits->attributes link)))))
+ ;; Call the progress callback
+ (define all-basenames (append basenames-previous-pages basenames-this-page))
+ (callback (length all-basenames) num-pages (last all-basenames))
+ ;; Recurse to download from the next page
+ (if link-namefrom
+ (loop (get-attribute 'href (bits->attributes link-namefrom)) all-basenames)
+ all-basenames)))
+ ;; Save those pages into the database
+ ;; SQLite can have a maximum of 32766 parameters in a single query
+ (for ([slice (in-slice 32760 basenames)])
+ (define query-template (string-join (make-list (length slice) "(?1, ?, 0)") ", " #:before-first "insert or ignore into page (wikiname, basename, progress) values "))
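+    ;; `?1` reuses the first bound parameter (the wikiname) for every row;
+    ;; the bare `?`s then number from 2, so each row costs only one of
+    ;; SQLite's parameter slots for its basename.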
+ (call-with-transaction
+ (get-slc)
+ (λ ()
+ (apply query-exec* query-template wikiname slice)
+ ;; Record that we have the complete list of pages
+ (query-exec* "update wiki set progress = 1 where wikiname = ?" wikiname))))))
+
+
+;; 2. Download each page via API and:
+;; * Save API response to file
+(define max-page-progress 1)
+(define (save-each-page wikiname callback)
+ ;; prepare destination folder
+ (define save-dir (build-path archive-root wikiname))
+ (make-directory* save-dir)
+ ;; gather list of basenames to download (that aren't yet complete)
+ (define basenames (query-list* "select basename from page where wikiname = ? and progress < ?"
+ wikiname max-page-progress))
+ ;; counter of complete/incomplete basenames
+ (define already-done-count
+ (query-value* "select count(*) from page where wikiname = ? and progress = ?"
+ wikiname max-page-progress))
+ (define not-done-count
+ (query-value* "select count(*) from page where wikiname = ? and progress < ?"
+ wikiname max-page-progress))
+ (define total-count (+ already-done-count not-done-count))
+ ;; set initial progress
+ (callback already-done-count total-count "")
+ ;; loop through basenames and download
+ (for ([basename basenames]
+ [i (in-naturals (add1 already-done-count))])
+ (define name-for-query (basename->name-for-query basename))
+ (define dest-url
+ (format "https://~a.fandom.com/api.php?~a"
+ wikiname
+ (params->query `(("action" . "parse")
+ ("page" . ,name-for-query)
+ ("prop" . "text|headhtml|langlinks")
+ ("formatversion" . "2")
+ ("format" . "json")))))
+ (define r (get dest-url))
+ (define body (response-body r))
+ (define filename (string-append basename ".json"))
+ (define save-path
+ (cond [((string-length basename) . > . 240)
+ (define key (sha1 (string->bytes/latin-1 basename)))
+ (query-exec* "insert into special_page (wikiname, key, basename) values (?, ?, ?)"
+ wikiname key basename)
+ (build-path save-dir (string-append key ".json"))]
+ [#t
+ (build-path save-dir (string-append basename ".json"))]))
+ (display-to-file body save-path #:exists 'replace)
+ (query-exec* "update page set progress = 1 where wikiname = ? and basename = ?"
+ wikiname basename)
+ (callback i total-count basename))
+ ;; saved all pages, register that fact in the database
+ (query-exec* "update wiki set progress = 2 where wikiname = ?" wikiname))
+
+
+;; 3. Download CSS and:
+;; * Save CSS to file
+;; * Record style images to database
+(define (if-necessary-download-and-check-styles wikiname callback)
+ (define wiki-progress (query-maybe-value* "select progress from wiki where wikiname = ?" wikiname))
+ (unless (and (number? wiki-progress) (wiki-progress . >= . 3))
+ (define styles (download-styles-for-wiki wikiname callback))
+ (define unique-image-urls
+ (remove-duplicates
+ (map image-url->values
+ (flatten
+ (for/list ([style styles])
+ (check-style-for-images wikiname (cdr style)))))
+ #:key cdr))
+ (for ([pair unique-image-urls])
+ (query-exec* "insert or ignore into image (wikiname, url, hash, ext, source, progress) values (?, ?, ?, NULL, 1, 0)" wikiname (car pair) (cdr pair)))
+ (query-exec* "update wiki set progress = 3 where wikiname = ?" wikiname)))
+
+
+;; 4: From downloaded pages, record URLs of image sources and inline style images to database
+(define (check-json-for-images wikiname path)
+ (define data (with-input-from-file path (λ () (read-json))))
+ (define page (html->xexp (preprocess-html-wiki (jp "/parse/text" data))))
+ (define tree (update-tree-wiki page wikiname))
+ null
+ #;(remove-duplicates
+ (for/list ([element (in-producer
+ (query-selector
+ (λ (t a c)
+ (and (eq? t 'img)
+ (get-attribute 'src a)))
+ tree)
+ #f)])
+ (image-url->values (get-attribute 'src (bits->attributes element))))))
+
+
+;; 5. Download image sources and style images according to database
+(define (save-each-image wikiname callback)
+ (define source (hash-ref sources 'style)) ;; TODO: download entire wiki images instead?
+ ;; gather list of basenames to download (that aren't yet complete)
+ (define rows (query-rows* "select url, hash from image where wikiname = ? and source <= ? and progress < 1"
+ wikiname source))
+ ;; counter of complete/incomplete basenames
+ (define already-done-count
+ (query-value* "select count(*) from image where wikiname = ? and source <= ? and progress = 1"
+ wikiname source))
+ (define not-done-count
+ (query-value* "select count(*) from image where wikiname = ? and source <= ? and progress < 1"
+ wikiname source))
+ ;; set initial progress
+ (callback already-done-count (+ already-done-count not-done-count) "")
+ ;; loop through urls and download
+ (for ([row rows]
+ [i (in-naturals 1)])
+ ;; row fragments
+ (define url (vector-ref row 0))
+ (define hash (vector-ref row 1))
+ ;; check
+ #; (printf "~a -> ~a~n" url hash)
+ (define r (get url))
+ (define declared-type (response-headers-ref r 'content-type))
+ (define final-type (if (equal? declared-type #"application/octet-stream")
+ (let ([sniff-entity (message-entity (mime-analyze (response-body r)))])
+ (string->bytes/latin-1 (format "~a/~a" (entity-type sniff-entity) (entity-subtype sniff-entity))))
+ declared-type))
+ (define ext
+ (with-handlers ([exn:fail:contract? (λ _ (error 'save-each-image "no ext found for mime type `~a` in file ~a" final-type url))])
+ (bytes->string/latin-1 (mime-type->ext final-type))))
+ ;; save
+ (define save-dir (hash->save-dir wikiname hash))
+ (make-directory* save-dir)
+ (define save-path (build-path save-dir (string-append hash "." ext)))
+ (define body (response-body r))
+ (display-to-file body save-path #:exists 'replace)
+ (query-exec* "update image set progress = 1, ext = ? where wikiname = ? and hash = ?"
+ ext wikiname hash)
+ (callback (+ already-done-count i) (+ already-done-count not-done-count) (string-append (substring hash 0 6) "..." ext)))
+ ;; saved all images, register that fact in the database
+ (query-exec* "update wiki set progress = 4 where wikiname = ?" wikiname))
+
+(define all-stages
+ (list
+ if-necessary-download-list-of-pages
+ save-each-page
+ if-necessary-download-and-check-styles
+ ;; check-json-for-images
+ save-each-image))
+
+(module+ test
+ (check-equal? (html->xexp "")
+ '(*TOP* (img (@ (src "https://example.com/images?src=Blah.jpg&width=150")))))
+ #;(download-list-of-pages "minecraft" values)
+ #;(save-each-page "minecraft" values)
+ #;(check-json-for-images "chiki" (build-path archive-root "chiki" "Fiona.json"))
+ #;(do-step-3 "gallowmere")
+ #;(save-each-image "gallowmere" (hash-ref sources 'style) (λ (a b c) (printf "~a/~a ~a~n" a b c)))
+
+ #;(for ([wikiname (query-list* "select wikiname from wiki")])
+ (println wikiname)
+ (insert-wiki-entry wikiname))
+
+ #;(for ([wikiname (query-list* "select wikiname from wiki")])
+ (println wikiname)
+ (do-step-3 wikiname)
+ (save-each-image wikiname (hash-ref sources 'style) (λ (a b c) (printf "~a/~a ~a~n" a b c)))))
+
+; (for ([stage all-stages]) (stage "create" (λ (a b c) (printf "~a/~a ~a~n" a b c))))
diff --git a/archiver/info.rkt b/archiver/info.rkt
new file mode 100644
index 0000000..17bb747
--- /dev/null
+++ b/archiver/info.rkt
@@ -0,0 +1,3 @@
+#lang info
+
+(define build-deps '("rackunit-lib" "web-server-lib" "http-easy-lib" "html-parsing" "html-writing" "json-pointer" "ini-lib" "memo" "net-cookies-lib" "gui-easy-lib" "sql" "charterm" "cli"))
diff --git a/archiver/req.rktd b/archiver/req.rktd
new file mode 100644
index 0000000..e2d2fc2
--- /dev/null
+++ b/archiver/req.rktd
@@ -0,0 +1 @@
+((local (".")))
diff --git a/breezewiki.rkt b/breezewiki.rkt
index a8b8c28..5fd34b2 100644
--- a/breezewiki.rkt
+++ b/breezewiki.rkt
@@ -17,9 +17,12 @@
(require-reloadable "src/page-proxy.rkt" page-proxy)
(require-reloadable "src/page-redirect-wiki-home.rkt" redirect-wiki-home)
(require-reloadable "src/page-search.rkt" page-search)
+(require-reloadable "src/page-set-user-settings.rkt" page-set-user-settings)
(require-reloadable "src/page-static.rkt" static-dispatcher)
+(require-reloadable "src/page-static-archive.rkt" page-static-archive)
(require-reloadable "src/page-subdomain.rkt" subdomain-dispatcher)
(require-reloadable "src/page-wiki.rkt" page-wiki)
+(require-reloadable "src/page-wiki-offline.rkt" page-wiki-offline)
(require-reloadable "src/page-file.rkt" page-file)
(reload!)
@@ -27,7 +30,9 @@
(define ch (make-channel))
(define (start)
(serve/launch/wait
- #:listen-ip (if (config-true? 'debug) "127.0.0.1" #f)
+ #:listen-ip (if (equal? (config-get 'bind_host) "auto")
+ (if (config-true? 'debug) "127.0.0.1" #f)
+ (config-get 'bind_host))
#:port (string->number (config-get 'port))
(λ (quit)
(channel-put ch (lambda () (semaphore-post quit)))
@@ -40,7 +45,10 @@
page-not-found
page-proxy
page-search
+ page-set-user-settings
+ page-static-archive
page-wiki
+ page-wiki-offline
page-file
redirect-wiki-home
static-dispatcher
diff --git a/dist.rkt b/dist.rkt
index 777e81a..2e46f8c 100644
--- a/dist.rkt
+++ b/dist.rkt
@@ -11,13 +11,18 @@
(require (only-in "src/page-proxy.rkt" page-proxy))
(require (only-in "src/page-redirect-wiki-home.rkt" redirect-wiki-home))
(require (only-in "src/page-search.rkt" page-search))
+(require (only-in "src/page-set-user-settings.rkt" page-set-user-settings))
(require (only-in "src/page-static.rkt" static-dispatcher))
+(require (only-in "src/page-static-archive.rkt" page-static-archive))
(require (only-in "src/page-subdomain.rkt" subdomain-dispatcher))
(require (only-in "src/page-wiki.rkt" page-wiki))
+(require (only-in "src/page-wiki-offline.rkt" page-wiki-offline))
(require (only-in "src/page-file.rkt" page-file))
(serve/launch/wait
- #:listen-ip (if (config-true? 'debug) "127.0.0.1" #f)
+ #:listen-ip (if (equal? (config-get 'bind_host) "auto")
+ (if (config-true? 'debug) "127.0.0.1" #f)
+ (config-get 'bind_host))
#:port (string->number (config-get 'port))
(λ (quit)
(dispatcher-tree
@@ -29,7 +34,10 @@
page-not-found
page-proxy
page-search
+ page-set-user-settings
+ page-static-archive
page-wiki
+ page-wiki-offline
page-file
redirect-wiki-home
static-dispatcher
diff --git a/info.rkt b/info.rkt
index 46512df..c290d5b 100644
--- a/info.rkt
+++ b/info.rkt
@@ -1,3 +1,3 @@
#lang info
-(define build-deps '("rackunit-lib" "web-server-lib" "http-easy-lib" "html-parsing" "html-writing" "json-pointer" "ini-lib" "memo"))
+(define build-deps '("rackunit-lib" "web-server-lib" "http-easy-lib" "html-parsing" "html-writing" "json-pointer" "typed-ini-lib" "memo" "net-cookies-lib" "db"))
diff --git a/lib/archive-file-mappings.rkt b/lib/archive-file-mappings.rkt
new file mode 100644
index 0000000..4aa8a69
--- /dev/null
+++ b/lib/archive-file-mappings.rkt
@@ -0,0 +1,28 @@
+#lang racket/base
+(require racket/string
+ net/url
+ (only-in net/uri-codec uri-decode)
+ "url-utils.rkt")
+(provide
+ local-encoded-url->segments
+ url-segments->basename
+ local-encoded-url->basename
+ basename->name-for-query
+ url-segments->guess-title)
+
+(define (local-encoded-url->segments str) ; '("wiki" "Page_title")
+ (map path/param-path (url-path (string->url str))))
+
+(define (url-segments->basename segments) ; "Page_title" filename encoded, no extension or dir prefix
+ (define extra-encoded (map (λ (s) (bytes->string/latin-1 (percent-encode s filename-set #f))) (cdr segments)))
+ (define basic-filename (string-join extra-encoded "#"))
+ basic-filename)
+
+(define (local-encoded-url->basename str) ; '("wiki" "Page_title"), no extension or dir prefix
+ (url-segments->basename (local-encoded-url->segments str)))
+
+(define (basename->name-for-query str)
+ (uri-decode (regexp-replace* #rx"#" str "/")))
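+;; e.g. (basename->name-for-query "User#Sandbox") ; => "User/Sandbox"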
+
+(define (url-segments->guess-title segments)
+ (regexp-replace* #rx"_" (cadr segments) " "))
diff --git a/lib/html-parsing/main.rkt b/lib/html-parsing/main.rkt
new file mode 100644
index 0000000..bdc09b1
--- /dev/null
+++ b/lib/html-parsing/main.rkt
@@ -0,0 +1,1887 @@
+#lang racket/base
+;; Copyright Neil Van Dyke. For legal info, see file "info.rkt".
+
+(require mcfly)
+
+(module+ test
+ (require overeasy))
+
+(doc (section "Introduction")
+
+ (para "The "
+ (code "html-parsing")
+ " library provides a permissive HTML parser. The parser is useful
+for software agent extraction of information from Web pages, for
+programmatically transforming HTML files, and for implementing interactive Web
+browsers. "
+ (code "html-parsing")
+ " emits "
+ ;; TODO: 2016-02-21 Once create sxml-doc package, reference that.
+ (seclink "top"
+ #:doc '(lib "sxml-intro/sxml-intro.scrbl")
+ #:indirect? #true
+ "SXML/xexp")
+ ", so that conventional HTML may be processed with XML tools such as
+SXPath. Like Oleg Kiselyov's "
+ (hyperlink "http://pobox.com/~oleg/ftp/Scheme/xml.html#HTML-parser"
+ "SSAX-based HTML parser")
+ ", "
+ (code "html-parsing")
+ " provides a permissive tokenizer, but "
+ (code "html-parsing")
+ " extends this by attempting to recover syntactic structure.")
+
+ (para "The "
+ (code "html-parsing")
+ " parsing behavior is permissive in that it accepts erroneous HTML,
+handling several classes of HTML syntax errors gracefully, without yielding a
+parse error. This is crucial for parsing arbitrary real-world Web pages, since
+many pages actually contain syntax errors that would defeat a strict or
+validating parser. "
+ (code "html-parsing")
+ "'s handling of errors is intended to generally emulate popular Web
+browsers' interpretation of the structure of erroneous HTML.")
+ (para (code "html-parsing")
+ " also has some support for XHTML, although XML namespace qualifiers
+are accepted but stripped from the resulting SXML/xexp. Note that "
+ (italic "valid")
+ " XHTML input might be better handled by a validating XML parser
+like Kiselyov's SSAX."))
+
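+;; For example, the parser recovers structure from unclosed tags (a sketch;
+;; `html->xexp` is the top-level entry point defined later in this file):
+;;
+;;   (html->xexp "<p>foo<p>bar")
+;;   ;; => '(*TOP* (p "foo") (p "bar"))
+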
+;; BEGIN COPIED FROM XEXP PACKAGE
+
+(define (%html-parsing:make-xexp-char-ref val)
+ (if (or (symbol? val) (integer? val))
+ `(& ,val)
+ (error 'make-xexp-char-ref
+ "invalid xexp reference value: ~S"
+ val)))
+
+(define %html-parsing:always-empty-html-elements
+ '(area base br frame hr img input isindex keygen link meta param
+ spacer wbr))
+
+;; END COPIED FROM XEXP PACKAGE
+
+(define %html-parsing:empty-token-symbol '*empty*)
+(define %html-parsing:end-token-symbol '*end*)
+(define %html-parsing:start-token-symbol '*start*)
+(define %html-parsing:entity-token-symbol '*entity*)
+(define %html-parsing:text-string-token-symbol '*text-string*)
+(define %html-parsing:text-char-token-symbol '*text-char*)
+
+(define %html-parsing:make-html-tokenizer
+ ;; TODO: Have the tokenizer replace contiguous whitespace within individual
+ ;; text tokens with single space characters (except for when in `pre' and
+ ;; verbatim elements). The parser will introduce new contiguous whitespace
+ ;; (e.g., when text tokens are concatenated, invalid end tags are removed,
+ ;; whitespace is irrelevant between certain elements), but then the parser
+ ;; only has to worry about the first and last character of each string.
+ ;; Perhaps the text tokens should have both leading and trailing whitespace
+ ;; stripped, and contain flags for whether or not leading and trailing
+ ;; whitespace occurred.
+ (letrec ((no-token '())
+
+ ;; TODO: Maybe make these three variables options.
+
+ (verbatim-to-eof-elems '(plaintext))
+
+ (verbatim-pair-elems '(script server style xmp))
+
+ (ws-chars (list #\space
+ (integer->char 9)
+ (integer->char 10)
+ (integer->char 11)
+ (integer->char 12)
+ (integer->char 13)))
+
+ (gosc/string-or-false
+ (lambda (os)
+ (let ((s (get-output-string os)))
+ (if (string=? s "") #f s))))
+
+ (gosc/symbol-or-false
+ (lambda (os)
+ (let ((s (gosc/string-or-false os)))
+ (if s (string->symbol s) #f))))
+ )
+ (lambda (in normalized?)
+ ;; TODO: Make a tokenizer option that causes XML namespace qualifiers to
+ ;; be ignored.
+ (letrec
+ (
+ ;; Port buffer with inexpensive unread of one character and slightly
+ ;; more expensive pushback of second character to unread. The
+ ;; procedures themselves do no consing. The tokenizer currently
+ ;; needs two-symbol lookahead, due to ambiguous "/" while parsing
+ ;; element and attribute names, which could be either empty-tag
+ ;; syntax or XML qualified names.
+ (c #f)
+ (next-c #f)
+ (c-consumed? #t)
+ (read-c (lambda ()
+ (if c-consumed?
+ (if next-c
+ (begin (set! c next-c)
+ (set! next-c #f))
+ (set! c (read-char in)))
+ (set! c-consumed? #t))))
+ (unread-c (lambda ()
+ (if c-consumed?
+ (set! c-consumed? #f)
+ ;; TODO: Procedure name in error message really
+ ;; isn't "%html-parsing:make-html-tokenizer"...
+ (error '%html-parsing:make-html-tokenizer
+ "already unread: ~S"
+ c))))
+ (push-c (lambda (new-c)
+ (if c-consumed?
+ (begin (set! c new-c)
+ (set! c-consumed? #f))
+ (if next-c
+ (error '%html-parsing:make-html-tokenizer
+ "pushback full: ~S"
+ c)
+ (begin (set! next-c c)
+ (set! c new-c)
+ (set! c-consumed? #f))))))
+
+ ;; TODO: These procedures are a temporary convenience for
+ ;; enumerating the pertinent character classes, with an eye towards
+ ;; removing redundant tests of character class. These procedures
+ ;; should be eliminated in a future version.
+ (c-eof? (lambda () (eof-object? c)))
+ (c-amp? (lambda () (eqv? c #\&)))
+ (c-apos? (lambda () (eqv? c #\')))
+ (c-bang? (lambda () (eqv? c #\!)))
+ (c-colon? (lambda () (eqv? c #\:)))
+ (c-quot? (lambda () (eqv? c #\")))
+ (c-equals? (lambda () (eqv? c #\=)))
+ (c-gt? (lambda () (eqv? c #\>)))
+ (c-lsquare? (lambda () (eqv? c #\[)))
+ (c-lt? (lambda () (eqv? c #\<)))
+ (c-minus? (lambda () (eqv? c #\-)))
+ (c-pound? (lambda () (eqv? c #\#)))
+ (c-ques? (lambda () (eqv? c #\?)))
+ (c-semi? (lambda () (eqv? c #\;)))
+ (c-slash? (lambda () (eqv? c #\/)))
+ (c-splat? (lambda () (eqv? c #\*)))
+ (c-lf? (lambda () (eqv? c #\newline)))
+ (c-angle? (lambda () (memv c '(#\< #\>))))
+ (c-ws? (lambda () (memv c ws-chars)))
+ (c-alpha? (lambda () (char-alphabetic? c)))
+ (c-digit? (lambda () (char-numeric? c)))
+ (c-alphanum? (lambda () (or (c-alpha?) (c-digit?))))
+ (c-hexlet? (lambda () (memv c '(#\a #\b #\c #\d #\e #\f
+ #\A #\B #\C #\D #\E #\F))))
+
+ (skip-ws (lambda () (read-c) (if (c-ws?) (skip-ws) (unread-c))))
+
+ (if-read-chars
+ (lambda (match-chars yes-thunk no-proc)
+ (let loop ((chars match-chars)
+ (match-count 0))
+ (if (null? chars)
+ (yes-thunk)
+ (begin (read-c)
+ (if (eqv? c (car chars))
+ (begin (loop (cdr chars) (+ 1 match-count)))
+ (begin (unread-c)
+ (no-proc match-chars match-count))))))))
+
+ (write-chars-count
+ (lambda (chars count port)
+ (let loop ((chars chars)
+ (count count))
+ (or (zero? count)
+ (begin (write-char (car chars) port)
+ (loop (cdr chars)
+ (- count 1)))))))
+
+ (make-start-token
+ (if normalized?
+ (lambda (name ns attrs)
+ (list name (cons '@ attrs)))
+ (lambda (name ns attrs)
+ (if (null? attrs)
+ (list name)
+ (list name (cons '@ attrs))))))
+
+ (make-empty-token
+ (lambda (name ns attrs)
+ (cons %html-parsing:empty-token-symbol
+ (make-start-token name ns attrs))))
+
+ (make-end-token
+ (if normalized?
+ (lambda (name ns attrs)
+ (list %html-parsing:end-token-symbol
+ name
+ (cons '@ attrs)))
+ (lambda (name ns attrs)
+ (if (null? attrs)
+ (list %html-parsing:end-token-symbol name)
+ (list %html-parsing:end-token-symbol
+ name
+ (cons '@ attrs))))))
+
+ (make-comment-token
+ (lambda (str) (list '*COMMENT* str)))
+
+ (make-decl-token
+ (lambda (parts) (cons '*DECL* parts)))
+
+ (scan-qname
+ ;; TODO: Make sure we don't accept local names that have "*", since
+ ;; this can break SXML tools. Have to validate this afterwards if
+ ;; "verbatim-safe?". Also check for "@" and maybe "@@". Check
+ ;; qname parsing code, especially for verbatim mode. This is
+ ;; important!
+ (lambda (verbatim-safe?)
+ ;; Note: If we accept some invalid local names, we only need two
+ ;; symbols of lookahead to determine the end of a qname.
+ (letrec ((os #f)
+ (ns '())
+ (vcolons 0)
+ (good-os (lambda ()
+ (or os
+ (begin (set! os (open-output-string))
+ os)))))
+ (let loop ()
+ (read-c)
+ (cond ((c-eof?) #f)
+ ((or (c-ws?) (c-splat?))
+ (if verbatim-safe?
+ (unread-c)
+ #f))
+ ((or (c-angle?) (c-equals?) (c-quot?) (c-apos?))
+ (unread-c))
+ ((c-colon?)
+ (or (null? ns)
+ (set! ns (cons ":" ns)))
+ (if os
+ (begin
+ (set! ns (cons (get-output-string os)
+ ns))
+ (set! os #f))
+ #f)
+ (loop))
+ ((c-slash?)
+ (read-c)
+ (cond ((or (c-eof?)
+ (c-ws?)
+ (c-equals?)
+ (c-apos?)
+ (c-quot?)
+ (c-angle?)
+ (c-splat?))
+ (unread-c)
+ (push-c #\/))
+ (else (write-char #\/ (good-os))
+ (write-char c os)
+ (loop))))
+ (else (write-char c (good-os))
+ (loop))))
+ (let ((ns (if (null? ns)
+ #f
+ (apply string-append
+ (reverse ns))))
+ (localname (if os (get-output-string os) #f)))
+ (if verbatim-safe?
+ ;; TODO: Make sure we don't have ambiguous ":" or drop
+ ;; any characters!
+ (cons ns localname)
+ ;; Note: We represent "xml:" and "xmlns:" syntax as
+ ;; normal qnames, for lack of something better to do with
+ ;; them when we don't support XML namespaces.
+ ;;
+ ;; TODO: Local names are currently forced to lowercase,
+ ;; since HTML is usually case-insensitive. If XML
+ ;; namespaces are used, we might wish to keep local names
+ ;; case-sensitive.
+ (if localname
+ (if ns
+ (if (or (string=? ns "xml")
+ (string=? ns "xmlns"))
+ (string->symbol (string-append ns
+ ":"
+ localname))
+ (cons ns
+ (string->symbol (string-downcase
+ localname))))
+ (string->symbol (string-downcase localname)))
+ (if ns
+ (string->symbol (string-downcase ns))
+ ;; TODO: Ensure in rest of code that returning #f
+ ;; as a name here is OK.
+ #f)))))))
+
+ (scan-tag
+ (lambda (start?)
+ (skip-ws)
+ (let ((tag-name (scan-qname #f))
+ (tag-ns #f)
+ (tag-attrs #f)
+ (tag-empty? #f))
+ ;; Scan element name.
+ (if (pair? tag-name)
+ (begin (set! tag-ns (car tag-name))
+ (set! tag-name (cdr tag-name)))
+ #f)
+ ;; TODO: Ensure there's no case in which a #f tag-name isn't
+ ;; compensated for later.
+ ;;
+ ;; Scan element attributes.
+ (set! tag-attrs
+ (let scan-attr-list ()
+ (read-c)
+ (cond ((c-eof?) '())
+ ((c-angle?) (unread-c) '())
+ ((c-slash?)
+ (set! tag-empty? #t)
+ (scan-attr-list))
+ ((c-alpha?)
+ (unread-c)
+ (let ((attr (scan-attr)))
+ (cons attr (scan-attr-list))))
+ (else (scan-attr-list)))))
+ ;; Find ">" or unnatural end.
+ (let loop ()
+ (read-c)
+ (cond ((c-eof?) no-token)
+ ((c-slash?) (set! tag-empty? #t) (loop))
+ ((c-gt?) #f)
+ ((c-ws?) (loop))
+ (else (unread-c))))
+ ;; Change the tokenizer mode if necessary.
+ (cond ((not start?) #f)
+ (tag-empty? #f)
+ ;; TODO: Maybe make one alist lookup here, instead of
+ ;; two.
+ ((memq tag-name verbatim-to-eof-elems)
+ (set! nexttok verbeof-nexttok))
+ ((memq tag-name verbatim-pair-elems)
+ (set! nexttok (make-verbpair-nexttok tag-name))))
+ ;; Return a token object.
+ (if start?
+ (if tag-empty?
+ (make-empty-token tag-name tag-ns tag-attrs)
+ (make-start-token tag-name tag-ns tag-attrs))
+ (make-end-token tag-name tag-ns tag-attrs)))))
+
+ (scan-attr
+ (lambda ()
+ (let ((name (scan-qname #f))
+ (val #f))
+ (if (pair? name)
+ (set! name (cdr name))
+ #f)
+ (let loop-equals-or-end ()
+ (read-c)
+ (cond ((c-eof?) no-token)
+ ((c-ws?) (loop-equals-or-end))
+ ((c-equals?)
+ (let loop-quote-or-unquoted ()
+ (read-c)
+ (cond ((c-eof?) no-token)
+ ((c-ws?) (loop-quote-or-unquoted))
+ ((or (c-apos?) (c-quot?))
+ (let ((term c))
+ (set! val (open-output-string))
+ (let loop-quoted-val ()
+ (read-c)
+ (cond ((c-eof?) #f)
+ ((eqv? c term) #f)
+ ((c-amp?) (let ((entity (scan-entity)))
+ (display entity val)
+ (loop-quoted-val)))
+ (else (write-char c val)
+ (loop-quoted-val))))))
+ ((c-angle?) (unread-c))
+ (else
+ (set! val (open-output-string))
+ (write-char c val)
+ (let loop-unquoted-val ()
+ (read-c)
+ (cond ((c-eof?) no-token)
+ ((c-apos?) #f)
+ ((c-quot?) #f)
+ ((or (c-ws?) (c-angle?)
+ ;;(c-slash?)
+ )
+ (unread-c))
+ ;; Note: We can treat a slash in an
+ ;; unquoted attribute value as a
+ ;; value constituent because the
+ ;; slash is specially-handled only
+ ;; for XHTML, and XHTML attribute
+ ;; values must always be quoted. We
+ ;; could do lookahead for "/>", but
+ ;; that wouldn't let us parse HTML
+ ;; "" correctly, so this is
+ ;; an easier and more correct way to
+ ;; do things.
+ (else (write-char c val)
+ (loop-unquoted-val))))))))
+ (else (unread-c))))
+ (if normalized?
+ (list name (if val
+ (get-output-string val)
+ (symbol->string name)))
+ (if val
+ (list name (get-output-string val))
+ (list name))))))
+
+ (scan-comment
+ ;; TODO: Rewrite this to use tail recursion rather than a state
+ ;; variable.
+ (lambda ()
+ (let ((os (open-output-string))
+ (state 'start-minus))
+ (let loop ()
+ (read-c)
+ (cond ((c-eof?) #f)
+ ((c-minus?)
+ (set! state
+ (case state
+ ((start-minus) 'start-minus-minus)
+ ((start-minus-minus body) 'end-minus)
+ ((end-minus) 'end-minus-minus)
+ ((end-minus-minus) (write-char #\- os) state)
+ (else (error '<%html-parsing:make-html-tokenizer>
+ "invalid state: ~S"
+ state))))
+ (loop))
+ ((and (c-gt?) (eq? state 'end-minus-minus)) #f)
+ (else (case state
+ ((end-minus) (write-char #\- os))
+ ((end-minus-minus) (display "--" os)))
+ (set! state 'body)
+ (write-char c os)
+ (loop))))
+ (make-comment-token (get-output-string os)))))
+
+         (scan-possible-cdata
+          (lambda ()
+            ;; Read "<![" so far; if the rest of a "<![CDATA[" start
+            ;; delimiter follows, scan the contents up to the "]]>" end
+            ;; delimiter, else return the characters read as text.
+            (if-read-chars
+             '(#\C #\D #\A #\T #\A #\[)
+             (lambda ()
+               (let ((os (open-output-string)))
+                 (let loop ()
+                   (if-read-chars
+                    '(#\] #\] #\>)
+                    (lambda () (get-output-string os))
+                    (lambda (chars count)
+                      (if (zero? count)
+                          (if (eof-object? c)
+                              (get-output-string os)
+                              (begin (write-char c os)
+                                     (read-c)
+                                     (loop)))
+                          (begin (write-char #\] os)
+                                 (if (= count 2)
+                                     (push-c #\])
+                                     #f)
+                                 (loop))))))))
+             (lambda (chars count)
+               (let ((os (open-output-string)))
+                 (display "<![" os)
+                 (write-chars-count chars count os)
+                 (get-output-string os))))))
+
+ (scan-pi
+ (lambda ()
+ (skip-ws)
+ (let ((name (open-output-string))
+ (val (open-output-string)))
+ (let scan-name ()
+ (read-c)
+ (cond ((c-eof?) #f)
+ ((c-ws?) #f)
+ ((c-alpha?) (write-char c name) (scan-name))
+ (else (unread-c))))
+ ;; TODO: Do we really want to emit #f for PI name?
+ (set! name (gosc/symbol-or-false name))
+ (let scan-val ()
+ (read-c)
+ (cond ((c-eof?) #f)
+ ;; ((c-amp?) (display (scan-entity) val)
+ ;; (scan-val))
+ ((c-ques?)
+ (read-c)
+ (cond ((c-eof?) (write-char #\? val))
+ ((c-gt?) #f)
+ (else (write-char #\? val)
+ (unread-c)
+ (scan-val))))
+ (else (write-char c val) (scan-val))))
+ (list '*PI*
+ name
+ (get-output-string val)))))
+
+ (scan-decl
+ ;; TODO: Find if SXML includes declaration forms, and if so, use
+ ;; whatever format SXML wants.
+ ;;
+ ;; TODO: Rewrite to eliminate state variables.
+ (letrec
+ ((scan-parts
+ (lambda ()
+ (let ((part (open-output-string))
+ (nonsymbol? #f)
+ (state 'before)
+ (last? #f))
+ (let loop ()
+ (read-c)
+ (cond ((c-eof?) #f)
+ ((c-ws?)
+ (case state
+ ((before) (loop))
+ ((quoted) (write-char c part) (loop))))
+ ((and (c-gt?) (not (eq? state 'quoted)))
+ (set! last? #t))
+ ((and (c-lt?) (not (eq? state 'quoted)))
+ (unread-c))
+ ((c-quot?)
+ (case state
+ ((before) (set! state 'quoted) (loop))
+ ((unquoted) (unread-c))
+ ((quoted) #f)))
+ (else
+ (if (eq? state 'before)
+ (set! state 'unquoted)
+ #f)
+ (set! nonsymbol? (or nonsymbol?
+ (not (c-alphanum?))))
+ (write-char c part)
+ (loop))))
+ (set! part (get-output-string part))
+ (if (string=? part "")
+ '()
+ (cons (if (or (eq? state 'quoted) nonsymbol?)
+ part
+ ;; TODO: Normalize case of things we make
+ ;; into symbols here.
+ (string->symbol part))
+ (if last?
+ '()
+ (scan-parts))))))))
+ (lambda () (make-decl-token (scan-parts)))))
+
+ (special-entity-reverse-chars-to-string-alist
+ '(((#\p #\m #\a) . "&")
+ ((#\s #\o #\p #\a) . "'")
+ ((#\t #\g) . ">")
+ ((#\t #\l) . "<")
+ ((#\t #\o #\u #\q) . "\"")))
+
+ (finish-terminated-named-entity
+ (lambda (reverse-name-chars)
+ (cond ((equal? '() reverse-name-chars)
+ "&")
+ ((assoc reverse-name-chars
+ special-entity-reverse-chars-to-string-alist)
+ => (lambda (p)
+ (cdr p)))
+ (else (%html-parsing:make-xexp-char-ref
+ (string->symbol (apply string (reverse reverse-name-chars))))))))
+
+ (finish-unterminated-named-entity
+ (lambda (reverse-name-chars)
+ (apply string (cons #\& (reverse reverse-name-chars)))))
+
+ (scan-entity
+ (lambda ()
+ (read-c)
+ (cond ((c-eof?) "&")
+ ((c-alpha?)
+ ;; TODO: Do entity names have a maximum length?
+ (let loop ((reverse-name-chars (cons c '())))
+ (read-c)
+ (cond ((c-eof?) (finish-unterminated-named-entity
+ reverse-name-chars))
+ ((c-alpha?) (let ((reverse-name-chars (cons c reverse-name-chars)))
+ (cond ((assoc reverse-name-chars
+ special-entity-reverse-chars-to-string-alist)
+ => (lambda (p)
+ (read-c)
+ (or (c-semi?)
+ (unread-c))
+ (cdr p)))
+ (else (loop reverse-name-chars)))))
+ ((c-semi?) (finish-terminated-named-entity
+ reverse-name-chars))
+ (else (unread-c)
+ (finish-unterminated-named-entity
+ reverse-name-chars)))))
+ ((c-pound?)
+ (let ((num (open-output-string))
+ (hex? #f))
+ (read-c)
+ (cond ((c-eof?) #f)
+ ((memv c '(#\x #\X)) (set! hex? #t) (read-c)))
+ (let loop ()
+ (cond ((c-eof?) #f)
+ ((c-semi?) #f)
+ ((or (c-digit?) (and hex? (c-hexlet?)))
+ (write-char c num)
+ (read-c)
+ (loop))
+ (else (unread-c))))
+ (set! num (get-output-string num))
+ (if (string=? num "")
+ ""
+ (let ((n (string->number num (if hex? 16 10))))
+ (if (<= 32 n 126)
+ (string (integer->char n))
+ (string (integer->char n)))))))
+ (else (unread-c) "&"))))
+
+ (normal-nexttok
+ (lambda ()
+ (read-c)
+ (cond ((c-eof?) no-token)
+ ((c-lt?)
+ (let loop ()
+ (read-c)
+ (cond ((c-eof?) "<")
+ ;; ((c-ws?) (loop))
+ ((c-slash?) (scan-tag #f))
+ ((c-ques?) (scan-pi))
+ ((c-alpha?) (unread-c) (scan-tag #t))
+ ((c-bang?)
+ (read-c)
+ (if (c-lsquare?)
+ (scan-possible-cdata)
+ (let loop ()
+ (cond ((c-eof?) no-token)
+ ((c-ws?) (read-c) (loop))
+ ((c-minus?) (scan-comment))
+ (else (unread-c)
+ (scan-decl))))))
+ (else (unread-c) "<"))))
+ ((c-gt?) ">")
+ (else (let ((os (open-output-string)))
+ (let loop ()
+ (cond ((c-eof?) #f)
+ ((c-angle?) (unread-c))
+ ((c-amp?)
+ (let ((entity (scan-entity)))
+ (if (string? entity)
+ (begin (display entity os)
+ (read-c)
+ (loop))
+ (let ((saved-nexttok nexttok))
+ (set! nexttok
+ (lambda ()
+ (set! nexttok
+ saved-nexttok)
+ entity))))))
+ (else (write-char c os)
+ (or (c-lf?)
+ (begin (read-c) (loop))))))
+ (let ((text (get-output-string os)))
+ (if (equal? text "")
+ (nexttok)
+ text)))))))
+
+ (verbeof-nexttok
+ (lambda ()
+ (read-c)
+ (if (c-eof?)
+ no-token
+ (let ((os (open-output-string)))
+ (let loop ()
+ (or (c-eof?)
+ (begin (write-char c os)
+ (or (c-lf?)
+ (begin (read-c) (loop))))))
+ (get-output-string os)))))
+
+ (make-verbpair-nexttok
+ (lambda (elem-name)
+ (lambda ()
+ (let ((os (open-output-string)))
+ ;; Accumulate up to a newline-terminated line.
+ (let loop ()
+ (read-c)
+ (cond ((c-eof?)
+ ;; Got EOF in verbatim context, so set the normal
+                    ;; nexttok procedure, then fall out of the loop.
+ (set! nexttok normal-nexttok))
+ ((c-lt?)
+ ;; Got "<" in verbatim context, so get next
+ ;; character.
+ (read-c)
+ (cond ((c-eof?)
+ ;; Got "<" then EOF, so set to the normal
+ ;; nexttok procedure, add the "<" to the
+ ;; verbatim string, and fall out of loop.
+ (set! nexttok normal-nexttok)
+ (write-char #\< os))
+ ((c-slash?)
+ ;; Got "", so...
+ (read-c)
+ (cond
+ ((c-eof?)
+ (display "" os))
+ ((c-alpha?)
+ ;; Got "" followed by alpha, so unread
+ ;; the alpha, scan qname, compare...
+ (unread-c)
+ (let* ((vqname (scan-qname #t))
+ (ns (car vqname))
+ (local (cdr vqname)))
+ ;; Note: We ignore XML namespace
+ ;; qualifier for purposes of comparison.
+ ;;
+ ;; Note: We're interning strings here for
+ ;; comparison when in theory there could
+ ;; be many such unique interned strings
+ ;; in a valid HTML document, although in
+ ;; practice this should not be a problem.
+ (if (and local
+ (eqv? (string->symbol
+ (string-downcase local))
+ elem-name))
+ ;; This is the terminator tag, so
+ ;; scan to the end of it, set the
+ ;; nexttok, and fall out of the loop.
+ (begin
+ (let scan-to-end ()
+ (read-c)
+ (cond ((c-eof?) #f)
+ ((c-gt?) #f)
+ ((c-lt?) (unread-c))
+ ((c-alpha?)
+ (unread-c)
+ ;; Note: This is an
+ ;; expensive way to skip
+ ;; over an attribute, but
+ ;; in practice more
+ ;; verbatim end tags will
+ ;; not have attributes.
+ (scan-attr)
+ (scan-to-end))
+ (else (scan-to-end))))
+ (set! nexttok
+ (lambda ()
+ (set! nexttok
+ normal-nexttok)
+ (make-end-token
+ elem-name #f '()))))
+ ;; This isn't the terminator tag, so
+ ;; add to the verbatim string the
+ ;; "" and the characters of what we
+ ;; were scanning as a qname, and
+ ;; recurse in the loop.
+ (begin
+ (display "" os)
+ (if ns
+ (begin (display ns os)
+ (display ":" os))
+ #f)
+ (if local
+ (display local os)
+ #f)
+ (loop)))))
+ (else
+ ;; Got "" and non-alpha, so unread new
+ ;; character, add the "" to verbatim
+ ;; string, then loop.
+ (unread-c)
+ (display "" os)
+ (loop))))
+ (else
+ ;; Got "<" and non-slash, so unread the new
+ ;; character, write the "<" to the verbatim
+ ;; string, then loop.
+ (unread-c)
+ (write-char #\< os)
+ (loop))))
+ (else
+ ;; Got non-"<" in verbatim context, so just add it
+ ;; to the buffer, then, if it's not a linefeed, fall
+ ;; out of the loop so that the token can be
+ ;; returned.
+ (write-char c os)
+ (or (c-lf?) (loop)))))
+ ;; Return the accumulated line string, if non-null, or call
+ ;; nexttok.
+ (or (gosc/string-or-false os) (nexttok))))))
+
+ (nexttok #f))
+
+ (set! nexttok normal-nexttok)
+ (lambda () (nexttok))))))
+
+(define (%html-parsing:tokenize-html in normalized?)
+ (let ((next-tok (%html-parsing:make-html-tokenizer in normalized?)))
+ (let loop ((tok (next-tok)))
+ (if (null? tok)
+ '()
+ (cons tok (loop (next-tok)))))))
+
+(define (%html-parsing:xexp-token-kind token)
+ (cond ((string? token) %html-parsing:text-string-token-symbol)
+ ((char? token) %html-parsing:text-char-token-symbol)
+ ((list? token)
+ (let ((s (car token)))
+ (if (memq s `(*COMMENT*
+ *DECL*
+ *PI*
+ ,%html-parsing:empty-token-symbol
+ ,%html-parsing:end-token-symbol
+ ,%html-parsing:entity-token-symbol))
+ s
+ %html-parsing:start-token-symbol)))
+ (else (error '%html-parsing:xexp-token-kind
+ "unrecognized token kind: ~S"
+ token))))
+
+(doc (section "Interface"))
+
+;; @defvar %html-parsing:empty-elements
+;;
+;; List of names of HTML element types that have no content, represented as a
+;; list of symbols. This is used internally by the parser and encoder. The
+;; effect of mutating this list is undefined.
+
+;; TODO: Document exactly which elements these are, after we make the new
+;; parameterized parser constructor.
+
+(define %html-parsing:empty-elements
+ (cons '& %html-parsing:always-empty-html-elements))
+
+;; @defproc parse-html/tokenizer tokenizer normalized?
+;;
+;; Emits a parse tree like @code{html->xexp} and related procedures, except
+;; using @var{tokenizer} as a source of tokens, rather than tokenizing from an
+;; input port. This procedure is used internally, and generally should not be
+;; called directly.
+
+(define %html-parsing:parse-html/tokenizer
+ ;; Note: This algorithm was originally written in 2001 (as part of the first
+ ;; Scheme library the author ever wrote), and then on 2009-08-16 was revamped
+ ;; to not use mutable pairs, for PLT 4 compatibility. It could still use
+ ;; some work to be more FP, but it works for now.
+ (letrec ((empty-elements
+ ;; TODO: Maybe make this an option.
+ %html-parsing:empty-elements)
+ (h-elem-parents
+ ;; Am doing this kludge mainly for mid-1990s HTML that uses the `p`
+ ;; element wrong. Trying to get all appropriate parents other than
+ ;; `p` that I can, to reduce breaking other code.
+ '(a article aside blink blockquote body center footer form header html li main nav section slot template))
+ (parent-constraints
+ ;; TODO: Maybe make this an option.
+ `((area . (map span))
+ (body . (html))
+ (caption . (table))
+ (colgroup . (table))
+ (dd . (dl))
+ (dt . (dl))
+ (frame . (frameset))
+ (head . (html))
+ (h1 . ,h-elem-parents)
+ (h2 . ,h-elem-parents)
+ (h3 . ,h-elem-parents)
+ (h4 . ,h-elem-parents)
+ (h5 . ,h-elem-parents)
+ (h6 . ,h-elem-parents)
+ (isindex . (head))
+ (li . (dir menu ol ul))
+ (meta . (head))
+ (noframes . (frameset))
+ (option . (select))
+ (p . (blockquote body details figcaption html li td th))
+ (param . (applet))
+ (tbody . (table))
+ (td . (tr))
+ (th . (tr))
+ (thead . (table))
+ (title . (head))
+ (tr . (table tbody thead))))
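+ ;; For example, since `td` may only have `tr` as its parent, a
+ ;; second `td` start tag implicitly closes a still-open `td`
+ ;; (a sketch):
+ ;;
+ ;;   (html->xexp "<table><tr><td>a<td>b</table>")
+ ;;   ; => (*TOP* (table (tr (td "a") (td "b"))))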
+ (token-kinds-that-always-get-added
+ `(*COMMENT*
+ *DECL*
+ *PI*
+ ,%html-parsing:entity-token-symbol
+ ,%html-parsing:text-string-token-symbol
+ ,%html-parsing:text-char-token-symbol))
+ (start-tag-name (lambda (tag-token) (car tag-token)))
+ (end-tag-name (lambda (tag-token) (list-ref tag-token 1))))
+ (lambda (tokenizer normalized?)
+ ;;(log-html-parsing-debug "(%html-parsing:parse-html/tokenizer ~S ~S)" tokenizer normalized?)
+ (let ((begs (list (vector #f '()))))
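+ ;; `begs` is the stack of currently-open ("begun") elements,
+ ;; innermost first. Each entry is a vector of the element name and
+ ;; its children collected so far in reverse order; the sentinel
+ ;; #(#f ()) stands for the document root.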
+ (letrec ((add-thing-as-child-of-current-beg
+ (lambda (tok)
+ (let ((beg (car begs)))
+ (vector-set! beg 1 (cons tok (vector-ref beg 1))))))
+
+ (beg->elem
+ (lambda (beg)
+ (let ((elem-name (vector-ref beg 0))
+ (attrs-and-contents (reverse (vector-ref beg 1))))
+ (cons elem-name attrs-and-contents))))
+
+ (finish-current-beg-and-return-elem
+ (lambda ()
+ (let ((elem (beg->elem (car begs))))
+ (set! begs (cdr begs))
+ (or (null? begs)
+ (add-thing-as-child-of-current-beg elem))
+ elem)))
+
+ (finish-current-beg
+ (lambda ()
+ (finish-current-beg-and-return-elem)))
+
+ (finish-all-begs-and-return-top
+ (lambda ()
+ (let loop ()
+ (let ((elem (finish-current-beg-and-return-elem)))
+ (if (car elem)
+ (loop)
+ (cdr elem))))))
+
+ (finish-begs-up-to-and-including-name
+ (lambda (name)
+ ;; (log-html-parsing-debug "(finish-begs-up-to-and-including-name ~S)" name)
+ (let loop-find-name ((find-begs begs)
+ (depth 1))
+ (let ((beg-name (vector-ref (car find-begs) 0)))
+ (cond ((not beg-name)
+ ;; We reached the root without finding a
+ ;; matching beg, so don't finish anything.
+ ;;
+ ;; TODO: 2022-04-02: Consider having a `*TOP*`
+ ;; kludge in `parent-constraints` that's checked
+ ;; here, especially for handling mid-1990s HTML
+ ;; `p` element (so that we can keep `p` from
+ ;; being a child of `p` even when there's no
+ ;; parent `body` or `html` element).
+ (void))
+ ((eqv? name beg-name)
+ ;; We found a match, so finish the begs up to
+ ;; depth.
+ (let loop-finish ((depth depth))
+ (or (zero? depth)
+ (begin
+ (finish-current-beg)
+ (loop-finish (- depth 1))))))
+ (else
+ ;; Didn't find a match yet, and there's still at
+ ;; least one more beg to look at, so recur.
+ (loop-find-name (cdr find-begs)
+ (+ depth 1))))))))
+
+ (finish-begs-upto-but-not-including-names
+ (lambda (names)
+ ;; (log-html-parsing-debug "(finish-begs-upto-but-not-including-names ~S)" names)
+ ;; (log-html-parsing-debug "begs = ~S" begs)
+ (let loop-find-name ((find-begs begs)
+ (depth 0))
+ (let ((beg-name (vector-ref (car find-begs) 0)))
+ (cond ((not beg-name)
+ ;; We reached the root without finding a
+ ;; matching beg, so simply discard it.
+ (void))
+ ((memq beg-name names)
+ ;; We found a match, so finish the begs up to
+ ;; depth.
+ (let loop-finish ((depth depth))
+ (or (zero? depth)
+ (begin
+ (finish-current-beg)
+ (loop-finish (- depth 1))))))
+ (else
+ ;; Didn't find a match yet, and there's still at
+ ;; least one more beg to look at, so recur.
+ (loop-find-name (cdr find-begs)
+ (+ depth 1)))))))))
+
+ (let loop ()
+ (let ((tok (tokenizer)))
+ (if (null? tok)
+ (finish-all-begs-and-return-top)
+ (let ((kind (%html-parsing:xexp-token-kind tok)))
+ ;; (log-html-parsing-debug "kind = ~S" kind)
+ (cond ((memv kind token-kinds-that-always-get-added)
+ (add-thing-as-child-of-current-beg tok))
+ ((eqv? kind %html-parsing:start-token-symbol)
+ ;; (log-html-parsing-debug "is-start-token-symbol")
+ (let* ((name (start-tag-name tok))
+ (cell (assq name parent-constraints)))
+ ;; (log-html-parsing-debug "name = ~S cell = ~S" name cell)
+ (and cell
+ (finish-begs-upto-but-not-including-names
+ (cons 'div (cdr cell))))
+ (if (memq name empty-elements)
+ (add-thing-as-child-of-current-beg tok)
+ (set! begs (cons (vector (car tok)
+ (cdr tok))
+ begs)))))
+ ((eqv? kind %html-parsing:empty-token-symbol)
+ ;; Empty tag token, so just add it to current
+ ;; beginning while stripping off leading `*EMPTY*'
+ ;; symbol so that the token becomes normal SXML
+ ;; element syntax.
+ (add-thing-as-child-of-current-beg (cdr tok)))
+ ((eqv? kind %html-parsing:end-token-symbol)
+ (let ((name (end-tag-name tok)))
+ (if name
+ ;; Try to finish to a start tag matching this
+ ;; end tag. If none, just drop the token,
+ ;; though we used to add it to the current
+ ;; beginning.
+ (finish-begs-up-to-and-including-name
+ name)
+ ;; We have an anonymous end tag, so match it
+ ;; with the most recent beginning. If no
+ ;; beginning to match, then just drop the
+ ;; token, though we used to add it to the
+ ;; current beginning.
+ (and (vector-ref (car begs) 0)
+ (finish-current-beg)))))
+ (else (error 'parse-html/tokenizer
+ "unknown tag kind: ~S"
+ kind)))
+ (loop))))))))))
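+
+;; A sketch of driving the parser stage directly (html->xexp is the normal
+;; entry point; the #f arguments mean non-normalized):
+;;
+;;   (%html-parsing:parse-html/tokenizer
+;;    (%html-parsing:make-html-tokenizer (open-input-string "<p>hi") #f)
+;;    #f)
+;;   ; => ((p "hi"))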
+
+;; TODO: Quote of message to a user:
+;;
+;; >I think this behavior is due to HtmlPrag's use in "parse-html/tokenizer"
+;; >of its local "parent-constraints" variable.
+;; >
+;; >The following line of code from the variable binding expresses the
+;; >constraint that any "p" element can have as immediate parent element
+;; >only "body", "td", or "th":
+;; >
+;; > (p . (body td th))
+;; >
+;; >I think I know a good heuristic for dealing with unfamiliar but
+;; >seemingly well-formed elements, like "page" in this case, but I'm afraid
+;; >I don't have time to implement it right now. (I am job-hunting right
+;; >now, and there are many other coding things I need to do first.)
+;; >
+;; >Would adding "page" to the above line of the HtmlPrag source code work
+;; >around the current problem, or do you need a better solution right now?
+
+;; @defproc %parse-html input normalized? top?
+;;
+;; This procedure is now used internally by @code{html->xexp} and its
+;; variants, and should not be used directly by programs. The interface is
+;; likely to change in future versions of HtmlPrag.
+
+(define (%html-parsing:parse-html input normalized? top?)
+ (let ((parse
+ (lambda ()
+ (%html-parsing:parse-html/tokenizer
+ (%html-parsing:make-html-tokenizer
+ (cond ((input-port? input) input)
+ ((string? input) (open-input-string input))
+ (else (error
+ '%html-parsing:parse-html
+ "invalid input type: ~E"
+ input)))
+ normalized?)
+ normalized?))))
+ (if top?
+ (cons '*TOP* (parse))
+ (parse))))
+
+;; @defproc html->sxml-0nf input
+;; @defprocx html->sxml-1nf input
+;; @defprocx html->sxml-2nf input
+;; @defprocx html->sxml input
+(doc (defproc (html->xexp (input (or/c input-port? string?)))
+ xexp
+
+ (para "Parse HTML permissively from "
+ (racket input)
+ ", which is either an input port or a string, and emit an
+SXML/xexp equivalent or approximation. To borrow and slightly modify an
+example from Kiselyov's discussion of his HTML parser:")
+
+ (racketinput
+ (html->xexp
+ (string-append
+ "
" + "
BLah" + " italic bold ened still < bold " + "
But not done yet...")))
+ (racketresultblock
+ (*TOP* (html (head (title) (title "whatever"))
+ (body "\n"
+ (a (@ (href "url")) "link")
+ (p (@ (align "center"))
+ (ul (@ (compact) (style "aa")) "\n"))
+ (p "BLah"
+ (*COMMENT* " comment <comment> ")
+ " "
+ (i " italic " (b " bold " (tt " ened")))
+ " still < bold ")
+ (p " But not done yet...")))))
+
+ (test (html->xexp "<hr>") `(*TOP* (hr)))
+ (test (html->xexp "<hr/>") `(*TOP* (hr)))
+ (test (html->xexp "<hr />") `(*TOP* (hr)))
+
+ (test (html->xexp "<hr noshade>")
+ `(*TOP* (hr (@ (noshade)))))
+ (test (html->xexp "<hr noshade/>")
+ `(*TOP* (hr (@ (noshade)))))
+ (test (html->xexp "<hr noshade />")
+ `(*TOP* (hr (@ (noshade)))))
+ (test (html->xexp "<hr noshade / >")
+ `(*TOP* (hr (@ (noshade)))))
+ (test (html->xexp "<hr noshade=1 />")
+ `(*TOP* (hr (@ (noshade "1")))))
+ (test (html->xexp "<hr noshade=1/>")
+ `(*TOP* (hr (@ (noshade "1/")))))
+
+ (test (html->xexp "aaabbb
ccc
BLah italic bold ened " + " still < bold
But not done yet..."))
+ `(*TOP*
+ (html (head (title) (title "whatever"))
+ (body (a (@ (href "url")) "link")
+ (p (@ (align "center"))
+ (ul (@ (compact) (style "aa"))))
+ (p "BLah"
+ (*COMMENT* " comment b a&z a&aposz a<z a>z a"z a&rarrz AT&T Bell Labs a&z a'z a<z a>z a"z B D text in details foo bar")
+ '(*TOP* (html (p "foo") (p "bar"))))
+
+ (test 'non-p-parent-in-which-p-shouldnt-nest
+ (html->xexp " bar")
+ '(*TOP* (html (xyz "foo") (p "bar"))))
+
+ (test 'p-shouldnt-nest-even-without-html-element
+ (html->xexp " foo bar")
+ '(*TOP* (p "foo") (p "bar"))
+ #:fail "see code comment about parent-constraints when no top elements"))
+
+ (test-section 'break-the-world-with-h-elem-mid-90s-handling
+
+ (test 'initial
+ (html->xexp " foo words ")
+ " was used as a separator or terminator, rather than a
+start delimeter. There is a chance that change this will break a real-world
+scraper or other tool.")))
+
+ (#:planet 7:1 #:date "2022-04-02"
+ (itemlist
+ (item "Include a test case #:fail that was unsaved in DrRacket.")))
+
+ (#:planet 7:0 #:date "2022-04-02"
+ (itemlist
+
+ (item "Fixed a quirks-handling bug in which "
+ (code "p")
+ " elements would be (directly or indirectly) nested under other "
+ (code "p")
+ " elements in cases in which there was no "
+ (code "body")
+ " element, but there was an "
+ (code "html")
+ " element. (Thanks to Jonathan Simpson for reporting.)")))
+
+ (#:planet 6:1 #:date "2022-01-22"
+ (itemlist
+ (item "Permit "
+ (code "details")
+ " element to be parent of "
+ (code "p")
+ " element in quirks handling. (Thanks to Jacder for reporting.)")))
+
+ (#:planet 6:0 #:date "2018-05-22"
+ (itemlist
+ (item "Fix to permit "
+ (code "p")
+ " elements as children of "
+ (code "blockquote")
+ " elements. Incrementing major version number because
+this is a breaking change of 17 years, but seems an appropriate change for
+modern HTML, and fixes a particular real-world problem. (Thanks to
+Sorawee Porncharoenwase for reporting.)")))
+
+ (#:planet 5:0 #:date "2018-05-15"
+ (itemlist
+ (item "In a breaking change of handing invalid HTML, most named
+character entity references that are invalid because (possibly among multiple
+reasons) they are not terminated by semicolon, now are treated as literal
+strings (including the ampersand), rather than as named character entites. For
+example, parser input string "
+ (racket " A&B Co. ")
+ " will now parse as "
+ (racket (p "A&B Co."))
+ " rather than as "
+ (racket (p "A" (& B) " Co."))
+ ". (Thanks for Greg Hendershott for suggesting this, and discussing.)")
+ (item "For support of historical quirks handling, five early
+HTML named character entities (specifically, "
+ (tt "amp")
+ ", "
+ (tt "apos")
+ ", "
+ (tt "lt")
+ ", "
+ (tt "gt")
+ ", "
+ (tt "quot")
+ ") do not need to be terminated with a semicolon, and
+will even be recognized if followed immediately by an alphabetic. For
+example, "
+ (racket " a<z to ([^<]*) Caption text. to ([^<]*) Caption text.
F
")
+ '(*TOP* (blockquote (tr (td (blockquote (p)))
+ (div))))))
+
+ (test-section 'p-elem-can-be-child-of-details-elem
+
+ (test 'initial-jacder-example-modified
+ (html->xexp "bar
")
+ '(*TOP* (html (p "foo") (h3 "bar")))))
+
+ (test-section 'twrpme-h2-in-li-elements
+
+ (test 'simple
+ (html->xexp "
")
+ '(*TOP* (html (body (ul (li (h2 "My Header Item"))
+ (li "My Non-Header Item"))))))
+
+ (test 'simon-budinsky-example
+ (html->xexp
+ (string-append
+ ""
+ ""
+ ""
+ "My Header Item
"
+ "
"
+ ""
+ ""))
+ '(*TOP* (html (head)
+ (body (ul (@ (class "post-list"))
+ (li (span (@ (class "post-meta"))
+ "Mar 10, 2022")
+ (h2 (a (@ (class "post-link")
+ (href "/site/update/2022/03/10/twrp-3.6.1-released.html"))
+ "TWRP 3.6.1 Released")))
+
+ (li (span (@ (class "post-meta"))
+ "Mar 10, 2022")
+ (h2 (a (@ (class "post-link")
+ (href "/site/update/2022/03/10/twrp-3.6.1-released.html"))
+ "TWRP 3.6.1 Released")))))))))
+
+ (test-section 'area-in-span-in-microformats
+
+ (test 'area-from-jacob-hall
+ (html->xexp ""
+ "TWRP 3.6.1 Released"
+ "
"
+ ""
+ "TWRP 3.6.1 Released"
+ "
"
+ "")
+ (test 'attribute-html-entities-1
+ (html->xexp "<img src=\"https://example.com/images?src=Blah.jpg&amp;width=150\">")
+ '(*TOP* (img (@ (src "https://example.com/images?src=Blah.jpg&width=150")))))
+
+ (test 'attribute-html-entities-2
+ (html->xexp "https://example.com/phpbb/viewtopic.php?f=31&t=1125")
+ '(*TOP* (a (@ (href "https://example.com/phpbb/viewtopic.php?f=31&t=1125"))
+ "https://example.com/phpbb/viewtopic.php?f=31&t=1125"))))
+
+ ;; TODO: Document this.
+ ;;
+ ;; (define html-1 "
to make the parser happy
+ ; usage: /fallout/wiki/Fallout:_New_Vegas_achievements_and_trophies
+ (rr* #rx"(
]*>\n?)( \\2")
+ ; change
\n \n
"))
+ '#hasheq((links . ((a (@ (target "_blank") (rel "nofollow noreferrer noopener") (class "external text") (href "https://sirdanielfortesque.proboards.com/")) "Forum"))))))
+
+(define (table->links table)
+ (define v (hash-ref table 'links #f))
+ (cond/var
+ [(not v) (values null '("Data table must have a \"Links\" column"))]
+ (var links (filter (λ (a) (and (pair? a) (eq? (car a) 'a))) v)) ; <a> elements
+ [(null? links) (values null '("Links column must have at least one link"))]
+ [#t (values links null)]))
+
+(define (table->logo table)
+ (define logo (hash-ref table 'logo #f))
+ (cond/var
+ [(not logo) (values #f '("Data table must have a \"Logo\" column"))]
+ [(null? logo) (values #f '("Logo table column must have a link"))]
+ (var href (get-attribute 'href (bits->attributes (car (hash-ref table 'logo)))))
+ (var src (get-attribute 'src (bits->attributes (car (hash-ref table 'logo)))))
+ (var true-src (or href src))
+ [(not true-src) (values #f '("Logo table column must have a link"))]
+ [#t (values true-src null)]))
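+
+;; A sketch of both accessors on hand-written tables (hypothetical values;
+;; assuming bits->attributes pulls out the (@ ...) attribute list):
+;;
+;;   (table->links '#hasheq((links . ((a (@ (href "https://example.com/forum")) "Forum")))))
+;;   ; => (values '((a (@ (href "https://example.com/forum")) "Forum")) '())
+;;
+;;   (table->logo '#hasheq((logo . ((a (@ (href "https://example.com/logo.png")) "Logo")))))
+;;   ; => (values "https://example.com/logo.png" '())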
+
+(define (get-api-endpoint wiki)
+ (define main-page (third wiki))
+ (define override (fifth wiki))
+ (or override
+ (match main-page
+ [(regexp #rx"/$") (string-append main-page "api.php")]
+ [(regexp #rx"^(.*)/wiki/" (list _ domain)) (string-append domain "/w/api.php")]
+ [(regexp #rx"^(.*)/w/" (list _ domain)) (string-append domain "/api.php")]
+ [_ (error 'get-api-endpoint "unknown url format: ~a" main-page)])))
+
+(define (get-search-page wiki)
+ (define main-page (third wiki))
+ (define override (fourth wiki))
+ (or override
+ (match main-page
+ [(regexp #rx"/$") (string-append main-page "Special:Search")]
+ [(regexp #rx"^(.*/(?:en|w[^./]*)/)" (list _ wiki-prefix)) (string-append wiki-prefix "Special:Search")]
+ [_ (error 'get-search-page "unknown url format: ~a" main-page)])))
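+
+;; A sketch with a hypothetical wiki entry whose main page is
+;; "https://example.org/wiki/Main_Page" and which sets no overrides:
+;;
+;;   (get-api-endpoint wiki)  ; => "https://example.org/w/api.php"
+;;   (get-search-page wiki)   ; => "https://example.org/wiki/Special:Search"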
+
+(define/memoize (get-redirect-content wikiname) #:hash hash
+ (define wiki (hash-ref wikis-hash wikiname #f))
+ (cond
+ [wiki
+ (define display-name (cadr wiki))
+ (define endpoint (string-append (get-api-endpoint wiki) "?action=parse&page=MediaWiki:BreezeWikiRedirect&prop=text&formatversion=2&format=json"))
+ (define res (get endpoint))
+ (define html (jp "/parse/text" (response-json res)))
+ (define content ((query-selector (λ (t a c) (has-class? "mw-parser-output" a))
+ (html->xexp html))))
+ (define body (for/list ([p (in-producer (query-selector (λ (t a c) (eq? t 'p)) content) #f)]) p))
+ (define table (parse-table ((query-selector (λ (t a c) (eq? t 'table)) content))))
+ (define-values (links links-errors) (table->links table))
+ (define-values (logo logo-errors) (table->logo table))
+ (define construct-errors (append links-errors logo-errors))
+ (λ (title)
+ (define go
+ (string-append (get-search-page wiki)
+ "?"
+ (params->query `(("search" . ,title)
+ ("go" . "Go")))))
+ `(aside (@ (class "niwa__notice"))
+ (h1 (@ (class "niwa__header")) ,display-name " has its own website separate from Fandom.")
+ (div (@ (class "niwa__cols"))
+ (div (@ (class "niwa__left"))
+ (a (@ (class "niwa__go") (href ,go)) "Read " ,title " on " ,display-name " →")
+ ,@body
+ (p "This wiki's core community has wholly migrated away from Fandom. You should "
+ (a (@ (href ,go)) "go to " ,display-name " now!")))
+ ,(if logo
+ `(div (@ (class "niwa__right"))
+ (img (@ (class "niwa__logo") (src ,logo))))
+ ""))
+ ,(if (pair? links)
+ `(p (@ (class "niwa__feedback"))
+ ,@(add-between links " / "))
+ "")
+ ,(if (pair? construct-errors)
+ `(ul
+ ,@(for/list ([error construct-errors])
+ `(li ,error)))
+ "")))]
+ [#t #f]))
+(module+ test
+ (check-not-false ((get-redirect-content "gallowmere") "MediEvil Wiki")))
diff --git a/src/log.rkt b/src/log.rkt
new file mode 100644
index 0000000..047c8aa
--- /dev/null
+++ b/src/log.rkt
@@ -0,0 +1,63 @@
+#lang typed/racket/base
+(require racket/file
+ racket/path
+ racket/port
+ racket/string
+ typed/srfi/19
+ "config.rkt")
+
+(provide
+ log-page-request
+ log-styles-request
+ log-set-settings-request)
+
+(define last-flush 0)
+(define flush-every-millis 60000)
+
+;; anytime-path macro expansion only works in an untyped submodule for reasons I cannot comprehend
+(module define-log-dir racket/base
+ (require racket/path
+ "../lib/syntax.rkt")
+ (provide log-dir)
+ (define log-dir (anytime-path ".." "storage/logs")))
+(require/typed (submod "." define-log-dir)
+ [log-dir Path])
+
+(define log-file (build-path log-dir "access-0.log"))
+(define log-port
+ (if (config-true? 'access_log::enabled)
+ (begin
+ (make-directory* log-dir)
+ (open-output-file log-file #:exists 'append))
+ (open-output-nowhere)))
+
+(: get-date-iso8601 (-> String))
+(define (get-date-iso8601)
+ (date->string (current-date 0) "~5"))
+
+(: offline-string (Boolean -> String))
+(define (offline-string offline?)
+ (if offline? "---" "ooo"))
+
+(: log (String * -> Void))
+(define (log . entry)
+ ;; create log entry string
+ (define full-entry (cons (get-date-iso8601) entry))
+ ;; write to output port
+ (displayln (string-join full-entry ";") log-port)
+ ;; flush output port to file (don't do this too frequently)
+ (when ((- (current-milliseconds) last-flush) . >= . flush-every-millis)
+ (flush-output log-port)
+ (set! last-flush (current-milliseconds))))
+
+(: log-page-request (Boolean String String (U 'light 'dark 'default) -> Void))
+(define (log-page-request offline? wikiname title theme)
+ (log "page" (offline-string offline?) wikiname title (symbol->string theme)))
+
+(: log-styles-request (Boolean String String -> Void))
+(define (log-styles-request offline? wikiname basename)
+ (log "style" (offline-string offline?) wikiname basename))
+
+(: log-set-settings-request (Symbol -> Void))
+(define (log-set-settings-request theme)
+ (log "settings" (symbol->string theme)))
diff --git a/src/niwa-data.rkt b/src/niwa-data.rkt
deleted file mode 100644
index a1036af..0000000
--- a/src/niwa-data.rkt
+++ /dev/null
@@ -1,156 +0,0 @@
-#lang racket/base
-
-(provide
- niwa-data)
-
-;; wikiname, niwa-name, url, logo-url
-(define niwa-data
- '((("arms" "armsgame")
- "ARMS Institute"
- "https://armswiki.org/wiki/Home"
- "/images/logos/armswiki.png"
- "ARMS Institute is a comprehensive resource for information about the Nintendo Switch game, ARMS. Founded on May 1, 2017 and growing rapidly, the wiki strives to offer in-depth coverage of ARMS from both a competitive and casual perspective. Join us and ARM yourself with knowledge!")
- (("pokemon" "monster")
- "Bulbapedia"
- "https://bulbapedia.bulbagarden.net/wiki/Main_Page"
- "/images/logos/bulbapedia.png"
- "A part of the Bulbagarden community, Bulbapedia was founded on December 21, 2004 by Liam Pomfret. Everything you need to know about Pokémon can be found at Bulbapedia, whether about the games, the anime, the manga, or something else entirely. With its Bulbanews section and the Bulbagarden forums, it's your one-stop online place for Pokémon.")
- (("dragalialost")
- "Dragalia Lost Wiki"
- "https://dragalialost.wiki/w/Dragalia_Lost_Wiki"
- "/images/logos/dragalialost.png"
- "The Dragalia Lost Wiki was originally founded in September 2018 on the Gamepedia platform but went independent in January 2021. The Wiki aims to document anything and everything Dragalia Lost, from in-game data to mechanics, story, guides, and more!")
- (("dragonquest")
- "Dragon Quest Wiki"
- "https://dragon-quest.org/wiki/Main_Page"
- "/images/logos/dragonquestwiki.png"
- "Originally founded on Wikia, the Dragon Quest Wiki was largely inactive until FlyingRagnar became an admin in late 2009. The wiki went independent about a year later when it merged with the Dragon Quest Dictionary/Encyclopedia which was run by Zenithian and supported by the Dragon's Den. The Dragon Quest Wiki aims to be the most complete resource for Dragon Quest information on the web. It continues to grow in the hope that one day the series will be as popular in the rest of the world as it is in Japan.")
- (("fireemblem")
- "Fire Emblem Wiki"
- "https://fireemblemwiki.org/wiki/Main_Page"
- "/images/logos/fireemblemwiki.png"
- "Growing since August 26, 2010, Fire Emblem Wiki is a project whose goal is to cover all information pertaining to the Fire Emblem series. It aspires to become the most complete and accurate independent source of information on this series.")
- (("fzero" "f-zero")
- "F-Zero Wiki"
- "https://mutecity.org/wiki/F-Zero_Wiki"
- "/images/logos/fzerowiki.png"
- "Founded on Wikia in November 2007, F-Zero Wiki became independent with NIWA's help in 2011. F-Zero Wiki is quickly growing into the Internet's definitive source for the world of 2200 km/h+, from pilots to machines, and is the founding part of MuteCity.org, the web's first major F-Zero community.")
- (("goldensun")
- "Golden Sun Universe"
- "https://www.goldensunwiki.net/wiki/Main_Page"
- "/images/logos/goldensununiverse.png"
- "Originally founded on Wikia in late 2006, Golden Sun Universe has always worked hard to meet one particular goal: to be the single most comprehensive yet accessible resource on the Internet for Nintendo's RPG series Golden Sun. It became an independent wiki four years later. Covering characters and plot, documenting all aspects of the gameplay, featuring walkthroughs both thorough and bare-bones, and packed with all manner of odd and fascinating minutiae, Golden Sun Universe leaves no stone unturned!")
- (("tetris")
- "Hard Drop - Tetris Wiki"
- "https://harddrop.com/wiki/Main_Page"
- "/images/logos/harddrop.png"
- "The Tetris Wiki was founded by Tetris fans for Tetris fans on tetrisconcept.com in March 2006. The Tetris Wiki torch was passed to harddrop.com in July 2009. Hard Drop is a Tetris community for all Tetris players, regardless of skill or what version of Tetris you play.")
- (("kidicarus")
- "Icaruspedia"
- "https://www.kidicaruswiki.org/wiki/Main_Page"
- "/images/logos/icaruspedia.png"
- "Icaruspedia is the Kid Icarus wiki that keeps flying to new heights. After going independent on January 8, 2012, Icaruspedia has worked to become the largest and most trusted independent source of Kid Icarus information. Just like Pit, they'll keep on fighting until the job is done.")
- (("splatoon" "uk-splatoon" "splatoon3" "splatoon2")
- "Inkipedia"
- "https://splatoonwiki.org/wiki/Main_Page"
- "/images/logos/inkipedia.png"
- "Inkipedia is your ever-growing go-to source for all things Splatoon related. Though founded on Wikia on June 10, 2014, Inkipedia went independent on May 18, 2015, just days before Splatoon's release. Our aim is to cover all aspects of the series, both high and low. Come splat with us now!")
- (("starfox")
- "Lylat Wiki"
- "https://starfoxwiki.info/wiki/Lylat_Wiki"
- "/images/logos/lylatwiki.png"
- "Out of seemingly nowhere, Lylat Wiki sprung up one day in early 2010. Led by creator, Justin Folvarcik, and project head, Tacopill, the wiki has reached stability since the move to its own domain. The staff of Lylat Wiki are glad to help out the NIWA wikis and are even prouder to join NIWA's ranks as the source for information on the Star Fox series.")
- (("metroid" "themetroid")
- "Metroid Wiki"
- "https://www.metroidwiki.org/wiki/Main_Page"
- "/images/logos/metroidwiki.png"
- "Metroid Wiki, founded on January 27, 2010 by Nathanial Rumphol-Janc and Zelda Informer, is a rapidly expanding wiki that covers everything Metroid, from the games, to every suit, vehicle and weapon.")
- (("nintendo" "nintendoseries" "nintendogames")
- "Nintendo Wiki"
- "http://niwanetwork.org/wiki/Main_Page"
- "/images/logos/nintendowiki.png"
- "Created on May 12, 2010, NintendoWiki (N-Wiki) is a collaborative project by the NIWA team to create an encyclopedia dedicated to Nintendo, being the company around which all other NIWA content is focused. It ranges from mainstream information such as the games and people who work for the company, to the most obscure info like patents and interesting trivia.")
- (("animalcrossing" "animalcrossingcf" "acnh")
- "Nookipedia"
- "https://nookipedia.com/wiki/Main_Page"
- "/images/logos/nookipedia.png"
- "Founded in August 2005 on Wikia, Nookipedia was originally known as Animal Crossing City. Shortly after its five-year anniversary, Animal Crossing City decided to merge with the independent Animal Crossing Wiki, which in January 2011 was renamed to Nookipedia. Covering everything from the series including characters, items, critters, and much more, Nookipedia is your number one resource for everything Animal Crossing!")
- (("pikmin")
- "Pikipedia"
- "https://www.pikminwiki.com/"
- "/images/logos/pikipedia.png"
- "Pikipedia, also known as Pikmin Wiki, was founded by Dark Lord Revan on Wikia in December 2005. In September 2010, with NIWA's help, Pikipedia moved away from Wikia to become independent. Pikipedia is working towards their goal of being the foremost source for everything Pikmin.")
- (("pikmin-fan" "pikpikpedia")
- "Pimkin Fanon"
- "https://www.pikminfanon.com/wiki/Main_Page"
- "/images/logos/pikifanon.png"
- "Pikmin Fanon is a Pikmin wiki for fan stories (fanon). Founded back on November 1, 2008 by Rocky0718 as a part of Wikia, Pikmin Fanon has been independent since September 14, 2010. Check them out for fan created stories based around the Pikmin series.")
- (("supersmashbros")
- "SmashWiki"
- "https://www.ssbwiki.com/"
- "/images/logos/smashwiki.png"
- "Originally two separate wikis (one on SmashBoards, the other on Wikia), SmashWiki as we know it was formed out of a merge on February 29th, 2008, becoming independent on September 28th, 2010. SmashWiki is the premier source of Smash Bros. information, from simple tidbits to detailed mechanics, and also touches on the origins of its wealth of content from its sibling franchises.")
- (("starfy")
- "Starfy Wiki"
- "https://www.starfywiki.org/wiki/Main_Page"
- "/images/logos/starfywiki.png"
- "Founded on May 30, 2009, Starfy Wiki's one goal is to become the best source on Nintendo's elusive game series The Legendary Starfy. After gaining independence in 2011 with the help of Tappy and the wiki's original administrative team, the wiki still hopes to achieve its goal and be the best source of Starfy info for all present and future fans.")
- (()
- "StrategyWiki"
- "https://www.strategywiki.org/wiki/Main_Page"
- "/images/logos/strategywiki.png"
- "StrategyWiki was founded in December 2005 by former member Brandon Suit with the idea that the existing strategy guides on the Internet could be improved. Three years later, in December 2008, Scott Jacobi officially established Abxy LLC for the purpose of owning and operating StrategyWiki as a community. Their vision is to bring free, collaborative video game strategy guides to the masses, including Nintendo franchise strategy guides.")
- (("mario" "themario" "imario" "supermarionintendo" "mariokart" "luigi-kart" "mario3")
- "Super Mario Wiki"
- "https://www.mariowiki.com/"
- "/images/logos/mariowiki.png"
- "Online since August 12, 2005, when it was founded by Steve Shinn, Super Mario Wiki has you covered for anything Mario, Donkey Kong, Wario, Luigi, Yoshi—the whole gang, in fact. With its own large community in its accompanying forum, Super Mario Wiki is not only a great encyclopedia, but a fansite for you to talk anything Mario.")
- (("mario64")
- "Ukikipedia"
- "https://ukikipedia.net/wiki/Main_Page"
- "/images/logos/ukikipedia.png"
- "Founded in 2018, Ukikipedia is a wiki focused on expert level knowledge of Super Mario 64, including detailed coverage of game mechanics, glitches, speedrunning, and challenges.")
- (("advancewars")
- "Wars Wiki"
- "https://www.warswiki.org/wiki/Main_Page"
- "/images/logos/warswiki.png"
- "Created in February 2009, Wars Wiki is a small wiki community with a large heart. Founded by JoJo and Wars Central, Wars Wiki is going strong on one of Nintendo's lesser known franchises. Wars Wiki is keen to contribute to NIWA, and we're proud to be able to support them. With the Wars Central community, including forums, it's definitely worth checking out.")
- (("earthbound")
- "WikiBound"
- "https://www.wikibound.info/wiki/WikiBound"
- "/images/logos/wikibound.png"
- "Founded in early 2010 by Tacopill, WikiBound strives to create a detailed database on the Mother/EarthBound games, a quaint series only having two games officially released outside of Japan. Help spread the PK Love by editing WikiBound!")
- (("kirby")
- "WiKirby"
- "https://wikirby.com/wiki/Kirby_Wiki"
- "/images/logos/wikirby.png"
- "WiKirby. It's a wiki. About Kirby! Amidst the excitement of NIWA being founded, Josh LeJeune decided to create a Kirby Wiki, due to lack of a strong independent one online. Coming online on January 24, 2010, WiKirby continues its strong launch with a dedicated community and a daily growing source of Kirby based knowledge.")
- (("xenoblade" "xenoseries" "xenogears" "xenosaga")
- "Xeno Series Wiki"
- "https://www.xenoserieswiki.org/wiki/Main_Page"
- "/images/logos/xenoserieswiki.png"
- "Xeno Series Wiki was created February 4, 2020 by Sir Teatei Moonlight. While founded by the desire to have an independent wiki for Xenoblade, there was an interest in including the Xenogears and Xenosaga games within its focus as well. This wide range of coverage means it's always in need of new editors to help bolster its many subjects.")
- (("zelda" "zelda-archive")
- "Zeldapedia"
- "https://zeldapedia.wiki/wiki/Main_Page"
- "/images/logos/zeldapedia.png"
- "Founded on April 23, 2005 as Zelda Wiki, today's Zeldapedia is your definitive source for encyclopedic information on The Legend of Zelda series, as well as all of the latest Zelda news.")))
-
-;; get the current dataset so it can be stored above
-(module+ fetch
- (require racket/generator
- racket/list
- net/http-easy
- html-parsing
- "xexpr-utils.rkt")
- (define r (get "https://www.niwanetwork.org/members/"))
- (define x (html->xexp (bytes->string/utf-8 (response-body r))))
- (define english ((query-selector (λ (e a c) (equal? (get-attribute 'id a) "content1")) x)))
- (define gen (query-selector (λ (e a c) (has-class? "member" a)) english))
- (for/list ([item (in-producer gen #f)])
- (define links (query-selector (λ (e a c) (eq? e 'a)) item))
- (define url (get-attribute 'href (bits->attributes (links))))
- (define title (third (links)))
- (define icon (get-attribute 'src (bits->attributes ((query-selector (λ (e a c) (eq? e 'img)) item)))))
- (define description (second ((query-selector (λ (e a c) (eq? e 'p)) item))))
- (list '() title url icon description)))
diff --git a/src/page-category.rkt b/src/page-category.rkt
index 89bc45a..29b541c 100644
--- a/src/page-category.rkt
+++ b/src/page-category.rkt
@@ -16,19 +16,23 @@
"config.rkt"
"data.rkt"
"page-wiki.rkt"
- "syntax.rkt"
- "url-utils.rkt"
- "xexpr-utils.rkt")
+ "../lib/syntax.rkt"
+ "../lib/thread-utils.rkt"
+ "../lib/url-utils.rkt"
+ "whole-utils.rkt"
+ "../lib/xexpr-utils.rkt")
(provide
page-category)
(module+ test
- (require rackunit)
+ (require rackunit
+ "test-utils.rkt")
(define category-json-data
'#hasheq((batchcomplete . #t) (continue . #hasheq((cmcontinue . "page|4150504c45|41473") (continue . "-||"))) (query . #hasheq((categorymembers . (#hasheq((ns . 0) (pageid . 25049) (title . "Item (entity)")) #hasheq((ns . 0) (pageid . 128911) (title . "3D")) #hasheq((ns . 0) (pageid . 124018) (title . "A Very Fine Item")) #hasheq((ns . 0) (pageid . 142208) (title . "Amethyst Shard")) #hasheq((ns . 0) (pageid . 121612) (title . "Ankle Monitor")))))))))
(define (generate-results-page
+ #:req req
#:source-url source-url
#:wikiname wikiname
#:title title
@@ -38,6 +42,7 @@
#:siteinfo [siteinfo #f])
(define members (jp "/query/categorymembers" members-data))
(generate-wiki-page
+ #:req req
#:source-url source-url
#:wikiname wikiname
#:title title
@@ -52,7 +57,7 @@
,@(map
(λ (result)
(define title (jp "/title" result))
- (define page-path (regexp-replace* #rx" " title "_"))
+ (define page-path (page-title->path title))
`(li
(a (@ (href ,(format "/~a/wiki/~a" wikiname page-path)))
,title)))
@@ -65,57 +70,63 @@
(define origin (format "https://~a.fandom.com" wikiname))
(define source-url (format "~a/wiki/~a" origin prefixed-category))
- (thread-let
- ([members-data (define dest-url
- (format "~a/api.php?~a"
- origin
- (params->query `(("action" . "query")
- ("list" . "categorymembers")
- ("cmtitle" . ,prefixed-category)
- ("cmlimit" . "max")
- ("formatversion" . "2")
- ("format" . "json")))))
- (log-outgoing dest-url)
- (define dest-res (easy:get dest-url #:timeouts timeouts))
- (easy:response-json dest-res)]
- [page-data (define dest-url
- (format "~a/api.php?~a"
- origin
- (params->query `(("action" . "parse")
- ("page" . ,prefixed-category)
- ("prop" . "text|headhtml|langlinks")
- ("formatversion" . "2")
- ("format" . "json")))))
- (log-outgoing dest-url)
- (define dest-res (easy:get dest-url #:timeouts timeouts))
- (easy:response-json dest-res)]
- [siteinfo (siteinfo-fetch wikiname)])
+ (define-values (members-data page-data siteinfo)
+ (thread-values
+ (λ ()
+ (define dest-url
+ (format "~a/api.php?~a"
+ origin
+ (params->query `(("action" . "query")
+ ("list" . "categorymembers")
+ ("cmtitle" . ,prefixed-category)
+ ("cmlimit" . "max")
+ ("formatversion" . "2")
+ ("format" . "json")))))
+ (log-outgoing dest-url)
+ (define dest-res (easy:get dest-url #:timeouts timeouts))
+ (easy:response-json dest-res))
+ (λ ()
+ (define dest-url
+ (format "~a/api.php?~a"
+ origin
+ (params->query `(("action" . "parse")
+ ("page" . ,prefixed-category)
+ ("prop" . "text|headhtml|langlinks")
+ ("formatversion" . "2")
+ ("format" . "json")))))
+ (log-outgoing dest-url)
+ (define dest-res (easy:get dest-url #:timeouts timeouts))
+ (easy:response-json dest-res))
+ (λ ()
+ (siteinfo-fetch wikiname))))
- (define title (preprocess-html-wiki (jp "/parse/title" page-data prefixed-category)))
- (define page-html (preprocess-html-wiki (jp "/parse/text" page-data "")))
- (define page (html->xexp page-html))
- (define head-data ((head-data-getter wikiname) page-data))
- (define body (generate-results-page
- #:source-url source-url
- #:wikiname wikiname
- #:title title
- #:members-data members-data
- #:page page
- #:head-data head-data
- #:siteinfo siteinfo))
+ (define title (preprocess-html-wiki (jp "/parse/title" page-data prefixed-category)))
+ (define page-html (preprocess-html-wiki (jp "/parse/text" page-data "")))
+ (define page (html->xexp page-html))
+ (define head-data ((head-data-getter wikiname) page-data))
+ (define body (generate-results-page
+ #:req req
+ #:source-url source-url
+ #:wikiname wikiname
+ #:title title
+ #:members-data members-data
+ #:page page
+ #:head-data head-data
+ #:siteinfo siteinfo))
- (when (config-true? 'debug)
- ; used for its side effects
- ; convert to string with error checking, error will be raised if xexp is invalid
- (xexp->html body))
- (response/output
- #:code 200
- #:headers (build-headers always-headers)
- (λ (out)
- (write-html body out))))))
+ (when (config-true? 'debug)
+ ; used for its side effects
+ ; convert to string with error checking, error will be raised if xexp is invalid
+ (xexp->html body))
+ (response/output
+ #:code 200
+ #:headers (build-headers always-headers)
+ (λ (out)
+ (write-html body out)))))
(module+ test
(check-not-false ((query-selector (attribute-selector 'href "/test/wiki/Ankle_Monitor")
(generate-results-page
+ #:req test-req
#:source-url ""
#:wikiname "test"
#:title "Category:Items"
diff --git a/src/page-file.rkt b/src/page-file.rkt
index 1802568..2a7332c 100644
--- a/src/page-file.rkt
+++ b/src/page-file.rkt
@@ -16,14 +16,17 @@
"config.rkt"
"data.rkt"
"page-wiki.rkt"
- "syntax.rkt"
- "url-utils.rkt"
- "xexpr-utils.rkt")
+ "../lib/syntax.rkt"
+ "../lib/thread-utils.rkt"
+ "../lib/url-utils.rkt"
+ "whole-utils.rkt"
+ "../lib/xexpr-utils.rkt")
(provide page-file)
(module+ test
- (require rackunit)
+ (require rackunit
+ "test-utils.rkt")
(define test-media-detail
'#hasheq((fileTitle . "Example file")
(videoEmbedCode . "")
@@ -51,7 +54,8 @@
[(regexp-match? #rx"(?i:^video/)" content-type) `(video (@ (src ,maybe-proxied-url) (controls)))]
[else `""]))
-(define (generate-results-page #:source-url source-url
+(define (generate-results-page #:req req
+ #:source-url source-url
#:wikiname wikiname
#:title title
#:media-detail media-detail
@@ -68,6 +72,7 @@
(define maybe-proxied-raw-image-url
(if (config-true? 'strict_proxy) (u-proxy-url raw-image-url) raw-image-url))
(generate-wiki-page
+ #:req req
#:source-url source-url
#:wikiname wikiname
#:title title
@@ -98,46 +103,51 @@
`""))))
(define (page-file req)
- (define wikiname (path/param-path (first (url-path (request-uri req)))))
- (define prefixed-title (path/param-path (caddr (url-path (request-uri req)))))
- (define origin (format "https://~a.fandom.com" wikiname))
- (define source-url (format "~a/wiki/~a" origin prefixed-title))
+ (response-handler
+ (define wikiname (path/param-path (first (url-path (request-uri req)))))
+ (define prefixed-title (path/param-path (caddr (url-path (request-uri req)))))
+ (define origin (format "https://~a.fandom.com" wikiname))
+ (define source-url (format "~a/wiki/~a" origin prefixed-title))
- (thread-let ([media-detail
- (define dest-url
- (format "~a/wikia.php?~a"
- origin
- (params->query `(("format" . "json") ("controller" . "Lightbox")
- ("method" . "getMediaDetail")
- ("fileTitle" . ,prefixed-title)))))
- (log-outgoing dest-url)
- (define dest-res (easy:get dest-url #:timeouts timeouts))
- (easy:response-json dest-res)]
- [siteinfo (siteinfo-fetch wikiname)])
- (if (not (jp "/exists" media-detail #f))
- (next-dispatcher)
- (response-handler
- (define file-title (jp "/fileTitle" media-detail ""))
- (define title
- (if (non-empty-string? file-title) (format "File:~a" file-title) prefixed-title))
- (define image-content-type
- (if (non-empty-string? (jp "/videoEmbedCode" media-detail ""))
- #f
- (url-content-type (jp "/imageUrl" media-detail))))
- (define body
- (generate-results-page #:source-url source-url
- #:wikiname wikiname
- #:title title
- #:media-detail media-detail
- #:image-content-type image-content-type
- #:siteinfo siteinfo))
- (when (config-true? 'debug)
- ; used for its side effects
- ; convert to string with error checking, error will be raised if xexp is invalid
- (xexp->html body))
- (response/output #:code 200
- #:headers (build-headers always-headers)
- (λ (out) (write-html body out)))))))
+ (define-values (media-detail siteinfo)
+ (thread-values
+ (λ ()
+ (define dest-url
+ (format "~a/wikia.php?~a"
+ origin
+ (params->query `(("format" . "json") ("controller" . "Lightbox")
+ ("method" . "getMediaDetail")
+ ("fileTitle" . ,prefixed-title)))))
+ (log-outgoing dest-url)
+ (define dest-res (easy:get dest-url #:timeouts timeouts))
+ (easy:response-json dest-res))
+ (λ ()
+ (siteinfo-fetch wikiname))))
+ (if (not (jp "/exists" media-detail #f))
+ (next-dispatcher)
+ (response-handler
+ (define file-title (jp "/fileTitle" media-detail ""))
+ (define title
+ (if (non-empty-string? file-title) (format "File:~a" file-title) prefixed-title))
+ (define image-content-type
+ (if (non-empty-string? (jp "/videoEmbedCode" media-detail ""))
+ #f
+ (url-content-type (jp "/imageUrl" media-detail))))
+ (define body
+ (generate-results-page #:req req
+ #:source-url source-url
+ #:wikiname wikiname
+ #:title title
+ #:media-detail media-detail
+ #:image-content-type image-content-type
+ #:siteinfo siteinfo))
+ (when (config-true? 'debug)
+ ; used for its side effects
+ ; convert to string with error checking, error will be raised if xexp is invalid
+ (xexp->html body))
+ (response/output #:code 200
+ #:headers (build-headers always-headers)
+ (λ (out) (write-html body out)))))))
(module+ test
(parameterize ([(config-parameter 'strict_proxy) "true"])
(check-equal? (get-media-html "https://static.wikia.nocookie.net/a" "image/jpeg")
@@ -159,7 +169,8 @@
(check-not-false
((query-selector
(attribute-selector 'src "/proxy?dest=https%3A%2F%2Fstatic.wikia.nocookie.net%2Fexamplefile")
- (generate-results-page #:source-url ""
+ (generate-results-page #:req test-req
+ #:source-url ""
#:wikiname "test"
#:title "File:Example file"
#:media-detail test-media-detail
diff --git a/src/page-global-search.rkt b/src/page-global-search.rkt
index a9c79bb..a7748d5 100644
--- a/src/page-global-search.rkt
+++ b/src/page-global-search.rkt
@@ -5,8 +5,8 @@
web-server/http
"application-globals.rkt"
"data.rkt"
- "url-utils.rkt"
- "xexpr-utils.rkt")
+ "../lib/url-utils.rkt"
+ "../lib/xexpr-utils.rkt")
(provide
page-global-search)
diff --git a/src/page-home.rkt b/src/page-home.rkt
index 6037d9a..95793d3 100644
--- a/src/page-home.rkt
+++ b/src/page-home.rkt
@@ -6,8 +6,8 @@
"application-globals.rkt"
"data.rkt"
"static-data.rkt"
- "url-utils.rkt"
- "xexpr-utils.rkt"
+ "../lib/url-utils.rkt"
+ "../lib/xexpr-utils.rkt"
"config.rkt")
(provide
@@ -18,21 +18,24 @@
(define examples
'(("minecraft" "Bricks")
- ("crosscode" "CrossCode_Wiki")
- ("undertale" "Hot_Dog...%3F")
- ("tardis" "Eleanor_Blake")
+ ("crosscode" "CrossCode Wiki")
+ ("undertale" "Hot Dog...?")
+ ("tardis" "Eleanor Blake")
("zelda" "Boomerang")))
(define content
`((h2 "BreezeWiki makes wiki pages on Fandom readable")
(p "It removes ads, videos, and suggested content, leaving you with a clean page that doesn't slow down your device or use up your data.")
- (p "BreezeWiki can also be called an \"alternative frontend for Fandom\".")
(p ,(format "To use BreezeWiki, just replace \"fandom.com\" with \"~a\", and you'll instantly be teleported to a better world."
(if (config-true? 'canonical_origin)
(url-host (string->url (config-get 'canonical_origin)))
"breezewiki.com")))
(p "If you'd like to be automatically sent to BreezeWiki every time in the future, "
+ (a (@ (href "https://getindie.wiki")) "get our affiliated browser extension (NEW!)")
+ " or "
(a (@ (href "https://docs.breezewiki.com/Automatic_Redirection.html")) "check out the tutorial in the manual."))
+ (p "BreezeWiki is available on several different websites called " (a (@ (href "https://en.wikipedia.org/wiki/Mirror_site")) "mirrors") ". Each is independently run. If one mirror is offline, the others still work. "
+ (a (@ (href "https://docs.breezewiki.com/Links.html#%28part._.Mirrors%29")) "See the list."))
(h2 "Find a page")
(form (@ (action "/search"))
(label (@ (class "paired__label"))
@@ -45,12 +48,12 @@
(h2 "Example pages")
(ul
,@(map (λ (x)
- `(li (a (@ (href ,(apply format "/~a/wiki/~a" x)))
+ `(li (a (@ (href ,(format "/~a/wiki/~a" (car x) (page-title->path (cadr x)))))
,(apply format "~a: ~a" x))))
examples))
(h2 "Testimonials")
(p (@ (class "testimonial")) ">so glad someone introduced me to a F*ndom alternative (BreezeWiki) because that x-factorized spillway of an ad-infested radioactive dumpsite can go die in a fire —RB")
- (p (@ (class "testimonial")) ">you are so right that fandom still sucks even with adblock somehow. even zapping all the stupid padding it still sucks —Minimus")
+ (p (@ (class "testimonial")) ">apparently there are thousands of people essentially running our company " (em "for free") " right now, creating tons of content, and we just put ads on top of it and they're not even employees. thousands of people we can't lay off. thousands! —" (a (@ (href "https://hard-drive.net/fandom-ceo-frustrated-its-impossible-to-lay-off-unpaid-users-who-update-wikias-for-fun/?utm_source=breezewiki") (target "_blank")) "Perkins Miller, Fandom CEO"))
(p (@ (class "testimonial")) ">attempting to go to a wiki's forum page with breezewiki doesn't work, which is based honestly —Tom Skeleton")
(p (@ (class "testimonial")) ">Fandom pages crashing and closing, taking forever to load and locking up as they load the ads on the site... they are causing the site to crash because they are trying to load video ads both at the top and bottom of the site as well as two or three banner ads, then a massive top of site ad and eventually my anti-virus shuts the whole site down because it's literally pulling more resources than WoW in ultra settings... —Anonymous")
(p (@ (class "testimonial")) ">reblogs EXTREMELY appreciated I want that twink* (*fandom wiki) obliterated —footlong")
diff --git a/src/page-proxy.rkt b/src/page-proxy.rkt
index 3c22e1e..cd94603 100644
--- a/src/page-proxy.rkt
+++ b/src/page-proxy.rkt
@@ -9,8 +9,8 @@
web-server/http
(only-in web-server/dispatchers/dispatch next-dispatcher)
"application-globals.rkt"
- "url-utils.rkt"
- "xexpr-utils.rkt")
+ "../lib/url-utils.rkt"
+ "../lib/xexpr-utils.rkt")
(provide
page-proxy)
diff --git a/src/page-redirect-wiki-home.rkt b/src/page-redirect-wiki-home.rkt
index c8e6dde..255f625 100644
--- a/src/page-redirect-wiki-home.rkt
+++ b/src/page-redirect-wiki-home.rkt
@@ -3,8 +3,8 @@
web-server/http
"application-globals.rkt"
"data.rkt"
- "url-utils.rkt"
- "xexpr-utils.rkt")
+ "../lib/url-utils.rkt"
+ "../lib/xexpr-utils.rkt")
(provide
redirect-wiki-home)
diff --git a/src/page-search.rkt b/src/page-search.rkt
index f4d1ce3..ce527c0 100644
--- a/src/page-search.rkt
+++ b/src/page-search.rkt
@@ -13,21 +13,25 @@
"application-globals.rkt"
"config.rkt"
"data.rkt"
- "syntax.rkt"
- "url-utils.rkt"
- "xexpr-utils.rkt")
+ "../lib/syntax.rkt"
+ "../lib/thread-utils.rkt"
+ "../lib/url-utils.rkt"
+ "whole-utils.rkt"
+ "../lib/xexpr-utils.rkt")
(provide
page-search)
(module+ test
- (require rackunit)
+ (require rackunit
+ "test-utils.rkt")
(define search-json-data
'#hasheq((batchcomplete . #t) (query . #hasheq((search . (#hasheq((ns . 0) (pageid . 219) (size . 1482) (snippet . "") (timestamp . "2022-08-21T08:54:23Z") (title . "Gacha Capsule") (wordcount . 214)) #hasheq((ns . 0) (pageid . 201) (size . 1198) (snippet . "") (timestamp . "2022-07-11T17:52:47Z") (title . "Badges") (wordcount . 181)))))))))
-(define (generate-results-page dest-url wikiname query data #:siteinfo [siteinfo #f])
+(define (generate-results-page req dest-url wikiname query data #:siteinfo [siteinfo #f])
(define search-results (jp "/query/search" data))
(generate-wiki-page
+ #:req req
#:source-url dest-url
#:wikiname wikiname
#:title query
@@ -38,7 +42,7 @@
(ul ,@(map
(λ (result)
(let* ([title (jp "/title" result)]
- [page-path (regexp-replace* #rx" " title "_")]
+ [page-path (page-title->path title)]
[timestamp (jp "/timestamp" result)]
[wordcount (jp "/wordcount" result)]
[size (jp "/size" result)])
@@ -58,6 +62,8 @@
(define wikiname (path/param-path (first (url-path (request-uri req)))))
(define query (dict-ref (url-query (request-uri req)) 'q #f))
(define origin (format "https://~a.fandom.com" wikiname))
+ (when (config-true? 'feature_offline::only)
+ (raise-user-error "Full search is currently not available on breezewiki.com - for now, please use the pop-up search suggestions or wait for me to fix it! Thanks <3"))
(define dest-url
(format "~a/api.php?~a"
origin
@@ -67,23 +73,27 @@
("formatversion" . "2")
("format" . "json")))))
- (thread-let
- ([dest-res (log-outgoing dest-url)
- (easy:get dest-url #:timeouts timeouts)]
- [siteinfo (siteinfo-fetch wikiname)])
+ (define-values (dest-res siteinfo)
+ (thread-values
+ (λ ()
+ (log-outgoing dest-url)
+ (easy:get dest-url #:timeouts timeouts))
+ (λ ()
+ (siteinfo-fetch wikiname))))
- (define data (easy:response-json dest-res))
+ (define data (easy:response-json dest-res))
- (define body (generate-results-page dest-url wikiname query data #:siteinfo siteinfo))
- (when (config-true? 'debug)
- ; used for its side effects
- ; convert to string with error checking, error will be raised if xexp is invalid
- (xexp->html body))
- (response/output
- #:code 200
- #:headers (build-headers always-headers)
- (λ (out)
- (write-html body out))))))
+ (define body (generate-results-page req dest-url wikiname query data #:siteinfo siteinfo))
+ (when (config-true? 'debug)
+ ; used for its side effects
+ ; convert to string with error checking, error will be raised if xexp is invalid
+ (xexp->html body))
+ (response/output
+ #:code 200
+ #:headers (build-headers always-headers)
+ (λ (out)
+ (write-html body out)))))
(module+ test
- (check-not-false ((query-selector (attribute-selector 'href "/test/wiki/Gacha_Capsule")
- (generate-results-page "" "test" "Gacha" search-json-data)))))
+ (parameterize ([(config-parameter 'feature_offline::only) "false"])
+ (check-not-false ((query-selector (attribute-selector 'href "/test/wiki/Gacha_Capsule")
+ (generate-results-page test-req "" "test" "Gacha" search-json-data))))))
diff --git a/src/page-set-user-settings.rkt b/src/page-set-user-settings.rkt
new file mode 100644
index 0000000..b949142
--- /dev/null
+++ b/src/page-set-user-settings.rkt
@@ -0,0 +1,20 @@
+#lang racket/base
+(require racket/dict
+ net/url
+ web-server/http
+ "application-globals.rkt"
+ "data.rkt"
+ "log.rkt"
+ "../lib/url-utils.rkt"
+ "../lib/xexpr-utils.rkt")
+
+(provide
+ page-set-user-settings)
+
+(define (page-set-user-settings req)
+ (response-handler
+ (define next-location (dict-ref (url-query (request-uri req)) 'next_location))
+ (define new-settings (read (open-input-string (dict-ref (url-query (request-uri req)) 'new_settings))))
+ (log-set-settings-request (user-cookies^-theme new-settings))
+ (define headers (user-cookies-setter new-settings))
+ (generate-redirect next-location #:headers headers)))
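+
+;; A sketch of the flow (hypothetical URL): a request like
+;;   /set-user-settings?next_location=/minecraft/wiki/Bricks&new_settings=...
+;; has its printed settings value `read` back in, the chosen theme logged,
+;; and the browser redirected to next_location carrying the new cookies.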
diff --git a/src/page-static-archive.rkt b/src/page-static-archive.rkt
new file mode 100644
index 0000000..c0c2e09
--- /dev/null
+++ b/src/page-static-archive.rkt
@@ -0,0 +1,91 @@
+#lang racket/base
+(require racket/file
+ racket/path
+ racket/port
+ racket/string
+ net/url
+ web-server/http
+ web-server/servlet-dispatch
+ web-server/dispatchers/filesystem-map
+ (only-in web-server/dispatchers/dispatch next-dispatcher)
+ "../archiver/archiver.rkt"
+ "../lib/mime-types.rkt"
+ "../lib/syntax.rkt"
+ "../lib/xexpr-utils.rkt"
+ "config.rkt"
+ "log.rkt")
+
+(provide
+ page-static-archive)
+
+(define path-archive (anytime-path ".." "storage/archive"))
+
+(define ((replacer wikiname) whole url)
+ (format
+ "url(~a)"
+ (if (or (equal? url "")
+ (equal? url "'")
+ (string-contains? url "/resources-ucp/")
+ (string-contains? url "/fonts/")
+ (string-contains? url "/drm_fonts/")
+ (string-contains? url "//db.onlinewebfonts.com/")
+ (string-contains? url "//bits.wikimedia.org/")
+ (string-contains? url "dropbox")
+ (string-contains? url "only=styles")
+ (string-contains? url "https://https://")
+ (regexp-match? #rx"^%20|^'" url)
+ (regexp-match? #rx"^\"?data:" url))
+ url
+ (let* ([norm-url
+ (cond
+ [(string-prefix? url "https://") url]
+ [(string-prefix? url "http://") (regexp-replace #rx"http:" url "https:")]
+ [(string-prefix? url "//") (string-append "https:" url)]
+ [(string-prefix? url "/") (format "https://~a.fandom.com~a" wikiname url)]
+ [else (error 'replace-style-for-images "unknown URL format: ~a" url)])])
+ (define p (image-url->values norm-url))
+ ;; (printf "hashed: ~a~n -> ~a~n #-> ~a~n" url (car p) (cdr p))
+ (format "/archive/~a/images/~a" wikiname (cdr p))))))
+
+(define (replace-style-for-images wikiname path)
+ (define content (file->string path))
+ (regexp-replace* #rx"url\\(\"?'?([^)]*)'?\"?\\)" content (replacer wikiname)))
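+
+;; A sketch of the rewrite on a hypothetical stylesheet rule:
+;;   url(//static.wikia.nocookie.net/minecraft/images/a/ab/Logo.png)
+;; becomes
+;;   url(/archive/minecraft/images/<hash>)
+;; where <hash> is the content-addressed name from image-url->values.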
+
+(define (handle-style wikiname dest)
+ (when (config-true? 'debug)
+ (printf "using offline mode for style ~a ~a~n" wikiname dest))
+ (log-styles-request #t wikiname dest)
+ (define fs-path (build-path path-archive wikiname "styles" dest))
+ (unless (file-exists? fs-path)
+ (next-dispatcher))
+ (response-handler
+ (define new-content (replace-style-for-images wikiname fs-path))
+ (response/output
+ #:code 200
+ #:headers (list (header #"Content-Type" #"text/css")
+ (header #"Referrer-Policy" #"same-origin"))
+ (λ (out) (displayln new-content out)))))
+
+(define (handle-image wikiname dest) ;; dest is the hash with no extension
+ (unless ((string-length dest) . >= . 40) (next-dispatcher))
+ (response-handler
+ (define dir (build-path path-archive wikiname "images" (substring dest 0 1) (substring dest 0 2)))
+ (unless (directory-exists? dir) (next-dispatcher))
+ (define candidates (directory-list dir))
+ (define target-path (findf (λ (f) (string-prefix? (path->string f) dest)) candidates))
+ (unless target-path (next-dispatcher))
+ (define target (path->string target-path))
+ (define ext (substring target 41))
+ (response/output
+ #:code 200
+ #:headers (list (header #"Content-Type" (ext->mime-type (string->bytes/latin-1 ext))))
+ (λ (out)
+ (call-with-input-file (build-path dir target)
+ (λ (in)
+ (copy-port in out)))))))
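+
+;; Archived images are fanned out by hash prefix, e.g. (a sketch)
+;;   storage/archive/<wikiname>/images/d/da/<40-char-hash>.<ext>
+;; so the first one and two characters of the hash select the directory and
+;; the extension is recovered from the matching filename.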
+
+(define (page-static-archive req)
+ (define path (url-path (request-uri req)))
+ (define-values (_ wikiname kind dest) (apply values (map path/param-path path)))
+ (cond [(equal? kind "styles") (handle-style wikiname dest)]
+ [(equal? kind "images") (handle-image wikiname dest)]
+ [else (response-handler (raise-user-error "page-static-archive: how did we get here?" kind))]))
diff --git a/src/page-static.rkt b/src/page-static.rkt
index 2bf684c..0311229 100644
--- a/src/page-static.rkt
+++ b/src/page-static.rkt
@@ -7,6 +7,8 @@
web-server/dispatchers/filesystem-map
(only-in web-server/dispatchers/dispatch next-dispatcher)
(prefix-in files: web-server/dispatchers/dispatch-files)
+ "../lib/mime-types.rkt"
+ "../lib/syntax.rkt"
"config.rkt")
(provide
@@ -16,6 +18,7 @@
(require rackunit))
(define-runtime-path path-static "../static")
+(define path-archive (anytime-path ".." "storage/archive"))
(define hash-ext-mime-type
(hash #".css" #"text/css"
@@ -25,45 +28,49 @@
#".woff2" #"font/woff2"
#".txt" #"text/plain"))
-(define (ext->mime-type ext)
- (hash-ref hash-ext-mime-type ext))
-(module+ test
- (check-equal? (ext->mime-type #".png") #"image/png"))
-
(define (make-path segments)
(map (λ (seg) (path/param seg '())) segments))
(module+ test
(check-equal? (make-path '("static" "main.css"))
(list (path/param "static" '()) (path/param "main.css" '()))))
+;; given a request path, return a rewritten request path and the source directory on the filesystem to serve it from
(define (path-rewriter p)
(cond
; url is ^/static/... ?
[(equal? (path/param-path (car p)) "static")
; rewrite to ^/... which will be treated as relative to static/ on the filesystem
- (cdr p)]
+ (values (cdr p) path-static)]
+ ; url is ^/archive/... ?
+ [(equal? (path/param-path (car p)) "archive")
+ ; rewrite req to ^/... which will be treated as relative to storage/archive/ on the filesystem
+ (values (cdr p) path-archive)]
- ; usage: /fallout/wiki/Fallout:_New_Vegas_achievements_and_trophies
- (curry rr* #rx"(
]*>\n?)( \\2")
- ; change
\n \n