diff --git a/archiver/archiver-cli.rkt b/archiver/archiver-cli.rkt new file mode 100644 index 0000000..11f25d0 --- /dev/null +++ b/archiver/archiver-cli.rkt @@ -0,0 +1,68 @@ +#lang cli +(require charterm + "archiver.rkt") + +(help (usage "Downloads a single Fandom wiki in BreezeWiki offline format." + "" + "Downloaded pages go into `archive/` next to the executable." + "Database goes into `archiver.db*` next to the executable." + "The database is necessary to store your download progress and resume where you left off if the process is interrupted.")) + +(flag (output-quiet?) + ("-q" "--output-quiet" "disable progress output") + (output-quiet? #t)) + +(flag (output-progress?) + ("-p" "--output-progress" "progress output for terminals (default in a tty)") + (output-progress? #t)) + +(flag (output-lines?) + ("-l" "--output-lines" "output the name of each file downloaded (default outside of a tty)") + (output-lines? #t)) + +(constraint (one-of output-quiet? output-lines? output-progress?)) + + + +(program + (start [wikiname "wikiname to download"]) + ;; set up arguments + (define width 80) + (when (not (or (output-quiet?) (output-lines?) (output-progress?))) + (cond [(terminal-port? current-input-port) + (output-progress? #t)] + [else + (output-lines? #t)])) + (define (update-width) + (when (output-progress?) + (with-charterm + (call-with-values (λ () (charterm-screen-size)) + (λ (cols rows) (set! width cols)))))) + (update-width) + ;; check + (when (or (not wikiname) (equal? wikiname "")) + (raise-user-error "Please specify the wikiname to download on the command line.")) + ;; progress reporting based on selected mode + (define (report-progress a b c) + (define basename (basename->name-for-query c)) + (cond + [(output-lines?) + (displayln basename)] + [(output-progress?) + (when (eq? (modulo a 20) 0) + (thread (λ () (update-width)))) + (define prefix (format "[~a] [~a/~a] " wikiname a b)) + (define rest (- width (string-length prefix))) + (define real-width (min (string-length basename) rest)) + (define spare-width (- rest real-width)) + (define name-display (substring basename 0 real-width)) + (define whitespace (make-string spare-width #\ )) + (printf "~a~a~a\r" prefix name-display whitespace)])) + ;; download all stages + (for ([stage all-stages] + [i (in-naturals 1)]) + (printf "> Stage ~a/~a~n" i (length all-stages)) + (stage wikiname report-progress) + (displayln ""))) + +(run start) diff --git a/archiver/archiver-database.rkt b/archiver/archiver-database.rkt new file mode 100644 index 0000000..b81ad6c --- /dev/null +++ b/archiver/archiver-database.rkt @@ -0,0 +1,81 @@ +#lang racket/base +(require racket/file + racket/list + racket/path + racket/string + json + json-pointer + db + "../lib/syntax.rkt") + +(provide + get-slc + query-exec* + query-rows* + query-list* + query-value* + query-maybe-value* + query-maybe-row*) + +(define storage-path (anytime-path ".." "storage")) +(define database-file (build-path storage-path "archiver.db")) + +(define slc (box #f)) +(define (get-slc) + (define slc* (unbox slc)) + (cond + [slc* slc*] + [else + (make-directory* storage-path) + (define slc* (sqlite3-connect #:database database-file #:mode 'create)) + (query-exec slc* "PRAGMA journal_mode=WAL") + (define database-version + (with-handlers ([exn:fail:sql? 
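+      ;; Fresh databases have no database_version table yet, so the version
+      ;; lookup below raises exn:fail:sql?; the handler creates the table and
+      ;; reports version 0. do-migrate-step then runs each remaining group in
+      ;; `migrations` (wrapped by wrap-sql, presumably into a callable step)
+      ;; inside a transaction, bumping the stored version after each one.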
+ (λ (exn) + ; need to set up the database + (query-exec slc* "create table database_version (version integer, primary key (version))") + (query-exec slc* "insert into database_version values (0)") + 0)]) + (query-value slc* "select version from database_version"))) + + (define migrations + (wrap-sql + ((query-exec slc* "create table page (wikiname TEXT NOT NULL, basename TEXT NOT NULL, progress INTEGER NOT NULL, PRIMARY KEY (wikiname, basename))") + (query-exec slc* "create table wiki (wikiname TEXT NOT NULL, progress INTEGER, PRIMARY KEY (wikiname))")) + ((query-exec slc* "create table special_page (wikiname TEXT NOT NULL, key TEXT NOT NULL, basename TEXT NOT NULL, PRIMARY KEY (wikiname, key))")) + ((query-exec slc* "update wiki set progress = 2 where wikiname in (select wikiname from wiki inner join page using (wikiname) group by wikiname having min(page.progress) = 1)")) + ((query-exec slc* "create table image (wikiname TEXT NOT NULL, hash TEXT NTO NULL, url TEXT NOT NULL, ext TEXT, source INTEGER NOT NULL, progress INTEGER NOT NULL, PRIMARY KEY (wikiname, hash))")) + ((query-exec slc* "alter table wiki add column sitename TEXT") + (query-exec slc* "alter table wiki add column basepage TEXT") + (query-exec slc* "alter table wiki add column license_text TEXT") + (query-exec slc* "alter table wiki add column license_url TEXT")))) + + (let do-migrate-step () + (when (database-version . < . (length migrations)) + (call-with-transaction + slc* + (list-ref migrations database-version)) + (set! database-version (add1 database-version)) + (query-exec slc* "update database_version set version = $1" database-version) + (do-migrate-step))) + + (set-box! slc slc*) + slc*])) + +(define (query-exec* . args) + (apply query-exec (get-slc) args)) + +(define (query-rows* . args) + (apply query-rows (get-slc) args)) + +(define (query-list* . args) + (apply query-list (get-slc) args)) + +(define (query-value* . args) + (apply query-value (get-slc) args)) + +(define (query-maybe-value* . args) + (apply query-maybe-value (get-slc) args)) + +(define (query-maybe-row* . args) + (apply query-maybe-row (get-slc) args)) diff --git a/archiver/archiver-gui.rkt b/archiver/archiver-gui.rkt new file mode 100644 index 0000000..6f09cb8 --- /dev/null +++ b/archiver/archiver-gui.rkt @@ -0,0 +1,347 @@ +#lang racket/base +(require racket/class + racket/draw + racket/format + racket/list + racket/port + racket/set + racket/splicing + racket/string + db + net/http-easy + memo + (only-in racket/gui timer%) + racket/gui/easy + racket/gui/easy/operator + (only-in pict bitmap) + images/icons/arrow + images/icons/control + images/icons/stickman + images/icons/style + images/icons/symbol + "archiver-database.rkt" + "archiver.rkt" + "../lib/url-utils.rkt" + "../lib/xexpr-utils.rkt") + +(default-icon-material rubber-icon-material) + +(require (for-syntax racket/base racket/match racket/set racket/string)) + +(define-syntax (@> stx) + (define form (cdr (syntax->datum stx))) + (match form + [(list form) ; (@> (fn @obs)) + ;; identify the observables and replace with non-@ symbols + (define collection (mutable-set)) + (define updated + (let loop ([sexp form]) + (cond [(symbol? sexp) + (let ([as-s (symbol->string sexp)]) + (if (string-prefix? as-s "@") + (let ([without-@ (string->symbol (substring as-s 1))]) + (set-add! collection (cons sexp without-@)) + without-@) + sexp))] + [(pair? 
sexp) (cons (loop (car sexp)) (loop (cdr sexp)))] + [#t sexp]))) + (define collection-l (set->list collection)) + ;; return obs-combine -> updated-form + (datum->syntax stx `(obs-combine (λ (,@(map cdr collection-l)) ,updated) ,@(map car collection-l)))] + [(list (? string? str) args ...) ; (@> "Blah: ~a/~a" @arg1 arg2) + ;; identify the observables and replace with non-@ symbols + (define collection-l + (for/list ([arg args]) + (if (symbol? arg) + (let ([as-s (symbol->string arg)]) + (if (string-prefix? as-s "@") + (let ([without-@ (string->symbol (substring as-s 1))]) + (cons arg without-@)) + (cons #f arg))) + (cons #f arg)))) + (define collection-lo (filter car collection-l)) + ;; return obs-combine -> format + (datum->syntax stx `(obs-combine (λ (,@(map cdr collection-lo)) (format ,str ,@(map cdr collection-l))) ,@(map car collection-lo)))])) + +(define/obs @auto-retry #f) + +(define-struct qi^ (wikiname st stage progress max-progress eta th) #:transparent) ;; queue item + +(define rows (query-rows* "select wikiname, progress from wiki where progress < 4")) +(define/obs @queue null) +(define (add-wikiname-to-queue wikiname st stage) + (@queue . <~ . (λ (queue) + (define already-exists? (findf (λ (qi) (equal? (qi^-wikiname qi) wikiname)) queue)) + (if already-exists? + queue + (append queue (list (qi^ wikiname st stage 0 1 "..." #f))))))) +(for ([row rows]) + (add-wikiname-to-queue (vector-ref row 0) + (if (= (vector-ref row 1) 4) + 'complete + 'queued) + (vector-ref row 1))) + +(define status-icon-size 32) +(define status-icon-min-width 36) +(define button-icon-size 12) + +(define color-green (make-color 90 212 68)) + +(define/obs @input "") + +(splicing-let ([frame-count 30]) + (define stickman-frames + (for/vector ([s (in-range 0 1 (/ 1 frame-count))]) + (running-stickman-icon + s + #:height status-icon-size + #:material (default-icon-material)))) + + (define/obs @stick-frame-no 0) + (define stick-timer + (new timer% + [notify-callback (λ () (@stick-frame-no . <~ . add1))] + [interval (truncate (/ 1000 frame-count))])) + (define/obs @stick + (@stick-frame-no . ~> . 
(λ (n) (vector-ref stickman-frames + (modulo n (vector-length stickman-frames))))))) + +(define status-icons + (hasheq 'queued (stop-icon #:color syntax-icon-color #:height status-icon-size) + 'paused (continue-forward-icon #:color syntax-icon-color #:height status-icon-size) + 'running @stick + 'error (x-icon #:height status-icon-size) + 'complete (check-icon #:color color-green #:height status-icon-size))) + +(define action-icons + (hasheq 'pause (pause-icon #:color syntax-icon-color #:height button-icon-size) + 'resume (play-icon #:color color-green #:height button-icon-size) + 'reset (left-over-arrow-icon #:color halt-icon-color #:height button-icon-size))) + +(define (bitmap-view @the-bitmap [min-width 1]) + (pict-canvas #:min-size (@> (list (max min-width (send @the-bitmap get-width)) (send @the-bitmap get-height))) #;(if min-size (list min-size min-size) #f) + #:stretch '(#f #f) + #:style '(transparent) + @the-bitmap + bitmap)) + +(define (exn->string e) + (with-output-to-string + (λ () + (displayln (exn-message e)) + (displayln "context:") + (for ([item (continuation-mark-set->context (exn-continuation-marks e))]) + (printf " ~a" (srcloc->string (cdr item))) + (when (car item) + (printf ": ~a" (car item))) + (displayln ""))))) + +(define ((handle-graphical-exn @qi) e) + (displayln (exn->string e) (current-error-port)) + (cond + [(obs-peek @auto-retry) + (void) ;; TODO + #;(do-retry-end wikiname)] + [#t + (update-qi @qi [st 'error]) + (do-try-unpause-next-entry) + (thread + (λ () + (define/obs @visible? #t) + (render + (dialog #:title "Download Error" + #:style '(resize-border) + #:mixin (λ (%) (class % (super-new) + (obs-observe! @visible? (λ (visible?) (send this show visible?))))) + (vpanel #:margin '(15 15) + (text "Encountered this error while downloading:") + (input #:style '(multiple hscroll) + #:min-size '(#f 200) + (exn->string e)) + ;; TODO + #;(button "Retry Now" (λ () (:= @visible? #f) (do-retry-now wikiname))) + #;(button "Retry Round-Robin" (λ () (:= @visible? #f) (do-retry-end wikiname))) + #;(button "Skip Wiki" (λ () (:= @visible? #f) (do-continue))) + #;(button "Use Auto-Retry" (λ () + (:= @auto-retry #t) + (:= @visible? #f) + (do-retry-end wikiname))) + #;(text "Be careful not to auto-retry an infinite loop!"))) + main-window))) + (sleep) + ; make sure the broken thread is gone + (define th (qi^-th (obs-peek @qi))) + (when th (kill-thread th))])) + +(define segments + (list + (list 5/100 (make-color 0 223 217)) + (list 88/100 color-green) + (list 2/100 (make-color 0 223 217)) + (list 5/100 color-green))) +(define segment-spacing 2) +(unless (= (apply + (map car segments)) 1) + (error 'segments "segments add up to ~a, not 1" (apply + (map car segments)))) + +;; return the new bitmap, which can be drawn on a dc<%> +(define/memoize (ray-trace width height stage progress max-progress) + ;; (printf "rendering ~a ~a/~a at ~a~n" stage progress max-progress (current-inexact-milliseconds)) + (define bm (make-object bitmap% width height #f #t)) + (define dc (make-object bitmap-dc% bm)) + (define width-available (- width (* (length segments) segment-spacing))) + (send dc set-smoothing 'unsmoothed) + (send dc set-pen "black" 0 'transparent) + (for/fold ([offset 0]) + ([segment segments] + [i (in-naturals 0)]) ;; zero indexed stages? 
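+    ;; Two passes per segment below: a translucent grey track spanning the
+    ;; segment's full width, then a coloured fill scaled by `proportion` --
+    ;; 1 for segments whose stage has already finished, 0 for stages not yet
+    ;; reached, and progress/max-progress for the stage currently running.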
+ ;; calculate start and end locations of grey bar + (define-values (segment-proportion segment-color) (apply values segment)) + (define segment-start (if (= offset 0) 0 (+ offset segment-spacing))) + (define segment-width (* width-available segment-proportion)) + ;; draw grey bar + (send dc set-brush (make-color 180 180 180 0.4) 'solid) + (send dc draw-rectangle segment-start 0 segment-width height) + ;; draw solid bar according to the current item's progress + (define proportion + (cond [(stage . < . i) 0] + [(stage . > . i) 1] + [(max-progress . <= . 0) 0] + [(progress . < . 0) 0] + [(progress . >= . max-progress) 1] + [else (progress . / . max-progress)])) + (send dc set-brush segment-color 'solid) + (send dc draw-rectangle segment-start 0 (* proportion segment-width) height) + (+ segment-start segment-width)) + (bitmap-render-icon bm 6/8)) + +;; get ray traced bitmap (possibly from cache) and draw on dc<%> +(define (draw-bar orig-dc qi) + ;; (println ray-traced) + (define-values (width height) (send orig-dc get-size)) + (send orig-dc draw-bitmap (ray-trace width height (qi^-stage qi) (qi^-progress qi) (qi^-max-progress qi)) 0 0)) + +(define ((make-progress-updater @qi) a b c) + ;; (printf "~a: ~a/~a ~a~n" (qi^-wikiname (obs-peek @qi)) a b c) + (update-qi @qi [progress a] [max-progress b])) + +(define (do-add-to-queue) + (define wikiname (string-trim (obs-peek @input))) + (when ((string-length wikiname) . > . 0) + (add-wikiname-to-queue wikiname 'queued 0)) ;; TODO: automatically start? + (:= @input "")) + +(define-syntax-rule (update-qi @qi args ...) + (let ([wikiname (qi^-wikiname (obs-peek @qi))]) + (@queue . <~ . (λ (queue) + (for/list ([qi queue]) + (if (equal? (qi^-wikiname qi) wikiname) + (struct-copy qi^ qi args ...) + qi)))))) + +(define (do-start-qi @qi) + (define th + (thread (λ () + (with-handlers ([exn? (handle-graphical-exn @qi)]) + (define last-stage + (for/last ([stage all-stages] + [i (in-naturals)]) + (update-qi @qi [stage i]) + (stage (qi^-wikiname (obs-peek @qi)) (make-progress-updater @qi)) + i)) + (update-qi @qi [st 'complete] [stage (add1 last-stage)]) + (do-try-unpause-next-entry))))) + (update-qi @qi [st 'running] [th th])) + +(define (do-stop-qi @qi) + (define th (qi^-th (obs-peek @qi))) + (when th (kill-thread th)) + (update-qi @qi [th #f] [st 'paused])) + +(define (do-reset-qi @qi) + (define th (qi^-th (obs-peek @qi))) + (when th (kill-thread th)) + (update-qi @qi [th #f] [st 'queued] [stage 0] [progress 0] [max-progress 0]) + (query-exec* "update wiki set progress = 0 where wikiname = ?" (qi^-wikiname (obs-peek @qi)))) + +(define (do-try-unpause-next-entry) + (define queue (obs-peek @queue)) + (define next-qi (for/first ([qi queue] + #:when (memq (qi^-st qi) '(paused queued error))) + qi)) + (when next-qi + (define @qi (@queue . ~> . (λ (queue) (findf (λ (qi) (equal? (qi^-wikiname qi) (qi^-wikiname next-qi))) queue)))) + (do-start-qi @qi))) + +(define main-window + (render + (window + #:title "Fandom Archiver" + #:size '(400 300) + #:mixin (λ (%) (class % + (super-new) + (define/augment (on-close) + (send stick-timer stop) + (for ([qi (obs-peek @queue)]) + (when (qi^-th qi) + (kill-thread (qi^-th qi)))) + #;(disconnect*)))) + (vpanel + #:spacing 10 + #:margin '(5 5) + (hpanel + #:stretch '(#t #f) + #:spacing 10 + (hpanel + (text "https://") + (input @input + (λ (event data) (cond + [(eq? event 'input) (:= @input data)] + [(eq? 
event 'return) (do-add-to-queue)]))) + (text ".fandom.com")) + (button "Download Wiki" do-add-to-queue)) + (list-view + #:style '(vertical) + @queue + #:key qi^-wikiname + (λ (k @qi) + (define @status-icons + (@> (case (qi^-st @qi) + [(running) @stick] + [else (hash-ref status-icons (qi^-st @qi))]))) + (define @is-running? + (@> (memq (qi^-st @qi) '(running)))) + (define @is-complete? + (@> (eq? (qi^-st @qi) 'complete))) + ;; state icon at the left side + (hpanel #:stretch '(#t #f) + #:alignment '(left center) + #:spacing 8 + (bitmap-view @status-icons status-icon-min-width) + (vpanel + ;; name and buttons (top half) + (hpanel #:alignment '(left bottom) + (text (@> (qi^-wikiname @qi))) + (spacer) + (hpanel + #:stretch '(#f #f) + (if-view @is-complete? + (button (hash-ref action-icons 'reset) + (λ () (do-reset-qi @qi))) + (spacer)) + (if-view @is-running? + (button (hash-ref action-icons 'pause) + (λ () (do-stop-qi @qi))) + (button (hash-ref action-icons 'resume) + (λ () (do-start-qi @qi)))))) + ;; progress bar (bottom half) + (hpanel + (canvas + @qi + #:style '(transparent) + #:margin '(3 3) + draw-bar) + (hpanel #:min-size '(68 #f) + #:stretch '(#f #f) + #:alignment '(right center) + (text (@> (format "eta ~a" (qi^-eta @qi)))))))))))))) diff --git a/archiver/archiver.rkt b/archiver/archiver.rkt new file mode 100644 index 0000000..edd0d2b --- /dev/null +++ b/archiver/archiver.rkt @@ -0,0 +1,335 @@ +#lang racket/base +(require racket/file + racket/function + racket/list + racket/path + racket/sequence + racket/string + net/url + net/mime + file/sha1 + net/http-easy + db + json + "archiver-database.rkt" + "../lib/html-parsing/main.rkt" + "../lib/mime-types.rkt" + "../lib/syntax.rkt" + "../lib/tree-updater.rkt" + "../lib/url-utils.rkt" + "../lib/xexpr-utils.rkt" + "../lib/archive-file-mappings.rkt") + +(provide + basename->name-for-query + image-url->values + hash->save-dir + all-stages) + +(module+ test + (require rackunit)) + +(define archive-root (anytime-path ".." "storage/archive")) +(make-directory* archive-root) + +(define sources '#hasheq((style . 1) (page . 2))) + +(define (get-origin wikiname) + (format "https://~a.fandom.com" wikiname)) + +(define (insert-wiki-entry wikiname) + (define dest-url + (format "https://~a.fandom.com/api.php?~a" + wikiname + (params->query '(("action" . "query") + ("meta" . "siteinfo") + ("siprop" . "general|rightsinfo|statistics") + ("format" . "json") + ("formatversion" . "2"))))) + (define data (response-json (get dest-url))) + (define exists? (query-maybe-value* "select progress from wiki where wikiname = ?" wikiname)) + (if (and exists? (not (sql-null? exists?))) + (query-exec* "update wiki set sitename = ?, basepage = ?, license_text = ?, license_url = ? where wikiname = ?" 
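+                 ;; Fill the wiki row from the siteinfo response fetched above:
+                 ;; sitename, the base page (the part of /query/general/base
+                 ;; after "/wiki/"), and the license text/url, looked up with
+                 ;; `jp` (presumably a JSON-pointer accessor from the shared
+                 ;; lib helpers). The /query/statistics/articles count is
+                 ;; returned at the end so stage 1 can report a page total.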
+ (jp "/query/general/sitename" data) + (second (regexp-match #rx"/wiki/(.*)" (jp "/query/general/base" data))) + (jp "/query/rightsinfo/text" data) + (jp "/query/rightsinfo/url" data) + wikiname) + (query-exec* "insert into wiki (wikiname, progress, sitename, basepage, license_text, license_url) values (?, 0, ?, ?, ?, ?)" + wikiname + (jp "/query/general/sitename" data) + (second (regexp-match #rx"/wiki/(.*)" (jp "/query/general/base" data))) + (jp "/query/rightsinfo/text" data) + (jp "/query/rightsinfo/url" data))) + (jp "/query/statistics/articles" data)) + + +(define (check-style-for-images wikiname path) + (define content (file->string path)) + (define urls (regexp-match* #rx"url\\(\"?'?([^)]*)'?\"?\\)" content #:match-select cadr)) + (for/list ([url urls] + #:when (not (or (equal? url "") + (equal? url "'") + (string-suffix? url "\"") + (string-contains? url "/resources-ucp/") + (string-contains? url "/fonts/") + (string-contains? url "/drm_fonts/") + (string-contains? url "//db.onlinewebfonts.com/") + (string-contains? url "//bits.wikimedia.org/") + (string-contains? url "dropbox") + (string-contains? url "only=styles") + (string-contains? url "https://https://") + (regexp-match? #rx"^%20" url) + (regexp-match? #rx"^data:" url)))) + (cond + [(string-prefix? url "https://") url] + [(string-prefix? url "http://") (regexp-replace #rx"http:" url "https:")] + [(string-prefix? url "//") (string-append "https:" url)] + [(string-prefix? url "/") (format "https://~a.fandom.com~a" wikiname url)] + [else (raise-user-error "While calling check-style-for-images, this URL had an unknown format and couldn't be saved:" url path)]))) + +(define (download-styles-for-wiki wikiname callback) + (define save-dir (build-path archive-root wikiname "styles")) + (make-directory* save-dir) + (define theme (λ (theme-name) + (cons (format "https://~a.fandom.com/wikia.php?controller=ThemeApi&method=themeVariables&variant=~a" wikiname theme-name) + (build-path save-dir (format "themeVariables-~a.css" theme-name))))) + ;; (Listof (Pair url save-path)) + (define styles + (list + (theme "default") + (theme "light") + (theme "dark") + (cons (format "https://~a.fandom.com/load.php?lang=en&modules=site.styles%7Cskin.fandomdesktop.styles%7Cext.fandom.PortableInfoboxFandomDesktop.css%7Cext.fandom.GlobalComponents.CommunityHeaderBackground.css%7Cext.gadget.site-styles%2Csound-styles&only=styles&skin=fandomdesktop" wikiname) + (build-path save-dir "site.css")))) + (for ([style styles] + [i (in-naturals)]) + (callback i (length styles) "styles...") + (define r (get (car style))) + (define body (response-body r)) + (display-to-file body (cdr style) #:exists 'replace) + ;; XXX: how the HELL do I deal with @import?? would need some kind of recursion here. how will the page server know where to look up the style file to be able to serve them again? do I add another link-stylesheet tag to the main page? what about the remaining stuck @import url? + ) + (callback (length styles) (length styles) "styles...") + styles) + +(define (hash->save-dir wikiname hash) + (build-path archive-root wikiname "images" (substring hash 0 1) (substring hash 0 2))) + +(define (image-url->values i) + ;; TODO: handle case where there is multiple broken cb parameter on minecraft wiki + ;; TODO: ensure it still "works" with broken & on minecraft wiki + (define no-cb (regexp-replace #rx"\\cb=[0-9]+&?" 
i "")) ; remove cb url parameter which does nothing + (define key (regexp-replace #rx"[&?]$" no-cb "")) ; remove extra separator if necessary + (define hash (sha1 (string->bytes/utf-8 key))) + (cons key hash)) + + +;; 1. Download list of wiki pages and store in database, if not done yet for that wiki +(define (if-necessary-download-list-of-pages wikiname callback) + (define wiki-progress (query-maybe-value* "select progress from wiki where wikiname = ?" wikiname)) + ;; done yet? + (unless (and (real? wiki-progress) (wiki-progress . >= . 1)) + ;; Count total pages + (define num-pages (insert-wiki-entry wikiname)) + ;; Download the entire index of pages + (define basenames + (let loop ([path-with-namefrom "/wiki/Local_Sitemap"] + [basenames-previous-pages null]) + ;; Download the current index page + (define url (format "https://~a.fandom.com~a" wikiname path-with-namefrom)) + (define r (get url)) + ;; Metadata from this page (the link to the next page) + (define page (html->xexp (bytes->string/utf-8 (response-body r)))) + (define link-namefrom + ((query-selector (λ (t a c x) (and (eq? t 'a) + (pair? x) + (string-contains? (car x) "Next page") + (let ([href (get-attribute 'href a)] ) + (and href (string-contains? href "/wiki/Local_Sitemap"))))) + page #:include-text? #t))) + ;; Content from this page + (define basenames-this-page + (for/list ([link (in-producer + (query-selector + (λ (t a c) (eq? t 'a)) + ((query-selector (λ (t a c) (has-class? "mw-allpages-chunk" a)) page))) + #f)]) + (local-encoded-url->basename (get-attribute 'href (bits->attributes link))))) + ;; Call the progress callback + (define all-basenames (append basenames-previous-pages basenames-this-page)) + (callback (length all-basenames) num-pages (last all-basenames)) + ;; Recurse to download from the next page + (if link-namefrom + (loop (get-attribute 'href (bits->attributes link-namefrom)) all-basenames) + all-basenames))) + ;; Save those pages into the database + ;; SQLite can have a maximum of 32766 parameters in a single query + (for ([slice (in-slice 32760 basenames)]) + (define query-template (string-join (make-list (length slice) "(?1, ?, 0)") ", " #:before-first "insert or ignore into page (wikiname, basename, progress) values ")) + (call-with-transaction + (get-slc) + (λ () + (apply query-exec* query-template wikiname slice) + ;; Record that we have the complete list of pages + (query-exec* "update wiki set progress = 1 where wikiname = ?" wikiname)))))) + + +;; 2. Download each page via API and: +;; * Save API response to file +(define max-page-progress 1) +(define (save-each-page wikiname callback) + ;; prepare destination folder + (define save-dir (build-path archive-root wikiname)) + (make-directory* save-dir) + ;; gather list of basenames to download (that aren't yet complete) + (define basenames (query-list* "select basename from page where wikiname = ? and progress < ?" + wikiname max-page-progress)) + ;; counter of complete/incomplete basenames + (define already-done-count + (query-value* "select count(*) from page where wikiname = ? and progress = ?" + wikiname max-page-progress)) + (define not-done-count + (query-value* "select count(*) from page where wikiname = ? and progress < ?" 
+ wikiname max-page-progress)) + (define total-count (+ already-done-count not-done-count)) + ;; set initial progress + (callback already-done-count total-count "") + ;; loop through basenames and download + (for ([basename basenames] + [i (in-naturals (add1 already-done-count))]) + (define name-for-query (basename->name-for-query basename)) + (define dest-url + (format "https://~a.fandom.com/api.php?~a" + wikiname + (params->query `(("action" . "parse") + ("page" . ,name-for-query) + ("prop" . "text|headhtml|langlinks") + ("formatversion" . "2") + ("format" . "json"))))) + (define r (get dest-url)) + (define body (response-body r)) + (define filename (string-append basename ".json")) + (define save-path + (cond [((string-length basename) . > . 240) + (define key (sha1 (string->bytes/latin-1 basename))) + (query-exec* "insert into special_page (wikiname, key, basename) values (?, ?, ?)" + wikiname key basename) + (build-path save-dir (string-append key ".json"))] + [#t + (build-path save-dir (string-append basename ".json"))])) + (display-to-file body save-path #:exists 'replace) + (query-exec* "update page set progress = 1 where wikiname = ? and basename = ?" + wikiname basename) + (callback i total-count basename)) + ;; saved all pages, register that fact in the database + (query-exec* "update wiki set progress = 2 where wikiname = ?" wikiname)) + + +;; 3. Download CSS and: +;; * Save CSS to file +;; * Record style images to database +(define (if-necessary-download-and-check-styles wikiname callback) + (define wiki-progress (query-maybe-value* "select progress from wiki where wikiname = ?" wikiname)) + (unless (and (number? wiki-progress) (wiki-progress . >= . 3)) + (define styles (download-styles-for-wiki wikiname callback)) + (define unique-image-urls + (remove-duplicates + (map image-url->values + (flatten + (for/list ([style styles]) + (check-style-for-images wikiname (cdr style))))) + #:key cdr)) + (for ([pair unique-image-urls]) + (query-exec* "insert or ignore into image (wikiname, url, hash, ext, source, progress) values (?, ?, ?, NULL, 1, 0)" wikiname (car pair) (cdr pair))) + (query-exec* "update wiki set progress = 3 where wikiname = ?" wikiname))) + + +;; 4: From downloaded pages, record URLs of image sources and inline style images to database +(define (check-json-for-images wikiname path) + (define data (with-input-from-file path (λ () (read-json)))) + (define page (html->xexp (preprocess-html-wiki (jp "/parse/text" data)))) + (define tree (update-tree-wiki page wikiname)) + null + #;(remove-duplicates + (for/list ([element (in-producer + (query-selector + (λ (t a c) + (and (eq? t 'img) + (get-attribute 'src a))) + tree) + #f)]) + (image-url->values (get-attribute 'src (bits->attributes element)))))) + + +;; 5. Download image sources and style images according to database +(define (save-each-image wikiname callback) + (define source (hash-ref sources 'style)) ;; TODO: download entire wiki images instead? + ;; gather list of basenames to download (that aren't yet complete) + (define rows (query-rows* "select url, hash from image where wikiname = ? and source <= ? and progress < 1" + wikiname source)) + ;; counter of complete/incomplete basenames + (define already-done-count + (query-value* "select count(*) from image where wikiname = ? and source <= ? and progress = 1" + wikiname source)) + (define not-done-count + (query-value* "select count(*) from image where wikiname = ? and source <= ? 
and progress < 1" + wikiname source)) + ;; set initial progress + (callback already-done-count (+ already-done-count not-done-count) "") + ;; loop through urls and download + (for ([row rows] + [i (in-naturals 1)]) + ;; row fragments + (define url (vector-ref row 0)) + (define hash (vector-ref row 1)) + ;; check + #; (printf "~a -> ~a~n" url hash) + (define r (get url)) + (define declared-type (response-headers-ref r 'content-type)) + (define final-type (if (equal? declared-type #"application/octet-stream") + (let ([sniff-entity (message-entity (mime-analyze (response-body r)))]) + (string->bytes/latin-1 (format "~a/~a" (entity-type sniff-entity) (entity-subtype sniff-entity)))) + declared-type)) + (define ext + (with-handlers ([exn:fail:contract? (λ _ (error 'save-each-image "no ext found for mime type `~a` in file ~a" final-type url))]) + (bytes->string/latin-1 (mime-type->ext final-type)))) + ;; save + (define save-dir (hash->save-dir wikiname hash)) + (make-directory* save-dir) + (define save-path (build-path save-dir (string-append hash "." ext))) + (define body (response-body r)) + (display-to-file body save-path #:exists 'replace) + (query-exec* "update image set progress = 1, ext = ? where wikiname = ? and hash = ?" + ext wikiname hash) + (callback (+ already-done-count i) (+ already-done-count not-done-count) (string-append (substring hash 0 6) "..." ext))) + ;; saved all images, register that fact in the database + (query-exec* "update wiki set progress = 4 where wikiname = ?" wikiname)) + +(define all-stages + (list + if-necessary-download-list-of-pages + save-each-page + if-necessary-download-and-check-styles + ;; check-json-for-images + save-each-image)) + +(module+ test + (check-equal? (html->xexp "") + '(*TOP* (img (@ (src "https://example.com/images?src=Blah.jpg&width=150"))))) + #;(download-list-of-pages "minecraft" values) + #;(save-each-page "minecraft" values) + #;(check-json-for-images "chiki" (build-path archive-root "chiki" "Fiona.json")) + #;(do-step-3 "gallowmere") + #;(save-each-image "gallowmere" (hash-ref sources 'style) (λ (a b c) (printf "~a/~a ~a~n" a b c))) + + #;(for ([wikiname (query-list* "select wikiname from wiki")]) + (println wikiname) + (insert-wiki-entry wikiname)) + + #;(for ([wikiname (query-list* "select wikiname from wiki")]) + (println wikiname) + (do-step-3 wikiname) + (save-each-image wikiname (hash-ref sources 'style) (λ (a b c) (printf "~a/~a ~a~n" a b c))))) + +; (for ([stage all-stages]) (stage "create" (λ (a b c) (printf "~a/~a ~a~n" a b c)))) diff --git a/archiver/info.rkt b/archiver/info.rkt new file mode 100644 index 0000000..17bb747 --- /dev/null +++ b/archiver/info.rkt @@ -0,0 +1,3 @@ +#lang info + +(define build-deps '("rackunit-lib" "web-server-lib" "http-easy-lib" "html-parsing" "html-writing" "json-pointer" "ini-lib" "memo" "net-cookies-lib" "gui-easy-lib" "sql" "charterm" "cli")) diff --git a/archiver/req.rktd b/archiver/req.rktd new file mode 100644 index 0000000..e2d2fc2 --- /dev/null +++ b/archiver/req.rktd @@ -0,0 +1 @@ +((local ("."))) diff --git a/breezewiki.rkt b/breezewiki.rkt index a8b8c28..5fd34b2 100644 --- a/breezewiki.rkt +++ b/breezewiki.rkt @@ -17,9 +17,12 @@ (require-reloadable "src/page-proxy.rkt" page-proxy) (require-reloadable "src/page-redirect-wiki-home.rkt" redirect-wiki-home) (require-reloadable "src/page-search.rkt" page-search) +(require-reloadable "src/page-set-user-settings.rkt" page-set-user-settings) (require-reloadable "src/page-static.rkt" static-dispatcher) +(require-reloadable 
"src/page-static-archive.rkt" page-static-archive) (require-reloadable "src/page-subdomain.rkt" subdomain-dispatcher) (require-reloadable "src/page-wiki.rkt" page-wiki) +(require-reloadable "src/page-wiki-offline.rkt" page-wiki-offline) (require-reloadable "src/page-file.rkt" page-file) (reload!) @@ -27,7 +30,9 @@ (define ch (make-channel)) (define (start) (serve/launch/wait - #:listen-ip (if (config-true? 'debug) "127.0.0.1" #f) + #:listen-ip (if (equal? (config-get 'bind_host) "auto") + (if (config-true? 'debug) "127.0.0.1" #f) + (config-get 'bind_host)) #:port (string->number (config-get 'port)) (λ (quit) (channel-put ch (lambda () (semaphore-post quit))) @@ -40,7 +45,10 @@ page-not-found page-proxy page-search + page-set-user-settings + page-static-archive page-wiki + page-wiki-offline page-file redirect-wiki-home static-dispatcher diff --git a/dist.rkt b/dist.rkt index 777e81a..2e46f8c 100644 --- a/dist.rkt +++ b/dist.rkt @@ -11,13 +11,18 @@ (require (only-in "src/page-proxy.rkt" page-proxy)) (require (only-in "src/page-redirect-wiki-home.rkt" redirect-wiki-home)) (require (only-in "src/page-search.rkt" page-search)) +(require (only-in "src/page-set-user-settings.rkt" page-set-user-settings)) (require (only-in "src/page-static.rkt" static-dispatcher)) +(require (only-in "src/page-static-archive.rkt" page-static-archive)) (require (only-in "src/page-subdomain.rkt" subdomain-dispatcher)) (require (only-in "src/page-wiki.rkt" page-wiki)) +(require (only-in "src/page-wiki-offline.rkt" page-wiki-offline)) (require (only-in "src/page-file.rkt" page-file)) (serve/launch/wait - #:listen-ip (if (config-true? 'debug) "127.0.0.1" #f) + #:listen-ip (if (equal? (config-get 'bind_host) "auto") + (if (config-true? 'debug) "127.0.0.1" #f) + (config-get 'bind_host)) #:port (string->number (config-get 'port)) (λ (quit) (dispatcher-tree @@ -29,7 +34,10 @@ page-not-found page-proxy page-search + page-set-user-settings + page-static-archive page-wiki + page-wiki-offline page-file redirect-wiki-home static-dispatcher diff --git a/info.rkt b/info.rkt index 46512df..c290d5b 100644 --- a/info.rkt +++ b/info.rkt @@ -1,3 +1,3 @@ #lang info -(define build-deps '("rackunit-lib" "web-server-lib" "http-easy-lib" "html-parsing" "html-writing" "json-pointer" "ini-lib" "memo")) +(define build-deps '("rackunit-lib" "web-server-lib" "http-easy-lib" "html-parsing" "html-writing" "json-pointer" "typed-ini-lib" "memo" "net-cookies-lib" "db")) diff --git a/lib/archive-file-mappings.rkt b/lib/archive-file-mappings.rkt new file mode 100644 index 0000000..4aa8a69 --- /dev/null +++ b/lib/archive-file-mappings.rkt @@ -0,0 +1,28 @@ +#lang racket/base +(require racket/string + net/url + (only-in net/uri-codec uri-decode) + "url-utils.rkt") +(provide + local-encoded-url->segments + url-segments->basename + local-encoded-url->basename + basename->name-for-query + url-segments->guess-title) + +(define (local-encoded-url->segments str) ; '("wiki" "Page_title") + (map path/param-path (url-path (string->url str)))) + +(define (url-segments->basename segments) ; "Page_title" filename encoded, no extension or dir prefix + (define extra-encoded (map (λ (s) (bytes->string/latin-1 (percent-encode s filename-set #f))) (cdr segments))) + (define basic-filename (string-join extra-encoded "#")) + basic-filename) + +(define (local-encoded-url->basename str) ; '("wiki" "Page_title"), no extension or dir prefix + (url-segments->basename (local-encoded-url->segments str))) + +(define (basename->name-for-query str) + (uri-decode (regexp-replace* 
#rx"#" str "/"))) + +(define (url-segments->guess-title segments) + (regexp-replace* #rx"_" (cadr segments) " ")) diff --git a/lib/html-parsing/main.rkt b/lib/html-parsing/main.rkt new file mode 100644 index 0000000..bdc09b1 --- /dev/null +++ b/lib/html-parsing/main.rkt @@ -0,0 +1,1887 @@ +#lang racket/base +;; Copyright Neil Van Dyke. For legal info, see file "info.rkt". + +(require mcfly) + +(module+ test + (require overeasy)) + +(doc (section "Introduction") + + (para "The " + (code "html-parsing") + " library provides a permissive HTML parser. The parser is useful +for software agent extraction of information from Web pages, for +programmatically transforming HTML files, and for implementing interactive Web +browsers. " + (code "html-parsing") + " emits " + ;; TODO: 2016-02-21 Once create sxml-doc package, reference that. + (seclink "top" + #:doc '(lib "sxml-intro/sxml-intro.scrbl") + #:indirect? #true + "SXML/xexp") + ", so that conventional HTML may be processed with XML tools such as +SXPath. Like Oleg Kiselyov's " + (hyperlink "http://pobox.com/~oleg/ftp/Scheme/xml.html#HTML-parser" + "SSAX-based HTML parser") + ", " + (code "html-parsing") + " provides a permissive tokenizer, but " + (code "html-parsing") + " extends this by attempting to recover syntactic structure.") + + (para "The " + (code "html-parsing") + " parsing behavior is permissive in that it accepts erroneous HTML, +handling several classes of HTML syntax errors gracefully, without yielding a +parse error. This is crucial for parsing arbitrary real-world Web pages, since +many pages actually contain syntax errors that would defeat a strict or +validating parser. " + (code "html-parsing") + "'s handling of errors is intended to generally emulate popular Web +browsers' interpretation of the structure of erroneous HTML.") + (para (code "html-parsing") + " also has some support for XHTML, although XML namespace qualifiers +are accepted but stripped from the resulting SXML/xexp. Note that " + (italic "valid") + " XHTML input might be better handled by a validating XML parser +like Kiselyov's SSAX.")) + +;; BEGIN COPIED FROM XEXP PACKAGE + +(define (%html-parsing:make-xexp-char-ref val) + (if (or (symbol? val) (integer? val)) + `(& ,val) + (error 'make-xexp-char-ref + "invalid xexp reference value: ~S" + val))) + +(define %html-parsing:always-empty-html-elements + '(area base br frame hr img input isindex keygen link meta param + spacer wbr)) + +;; END COPIED FROM XEXP PACKAGE + +(define %html-parsing:empty-token-symbol '*empty*) +(define %html-parsing:end-token-symbol '*end*) +(define %html-parsing:start-token-symbol '*start*) +(define %html-parsing:entity-token-symbol '*entity*) +(define %html-parsing:text-string-token-symbol '*text-string*) +(define %html-parsing:text-char-token-symbol '*text-char*) + +(define %html-parsing:make-html-tokenizer + ;; TODO: Have the tokenizer replace contiguous whitespace within individual + ;; text tokens with single space characters (except for when in `pre' and + ;; verbatim elements). The parser will introduce new contiguous whitespace + ;; (e.g., when text tokens are concatenated, invalid end tags are removed, + ;; whitespace is irrelevant between certain elements), but then the parser + ;; only has to worry about the first and last character of each string. + ;; Perhaps the text tokens should have both leading and trailing whitespace + ;; stripped, and contain flags for whether or not leading and trailing + ;; whitespace occurred. 
+ (letrec ((no-token '()) + + ;; TODO: Maybe make these three variables options. + + (verbatim-to-eof-elems '(plaintext)) + + (verbatim-pair-elems '(script server style xmp)) + + (ws-chars (list #\space + (integer->char 9) + (integer->char 10) + (integer->char 11) + (integer->char 12) + (integer->char 13))) + + (gosc/string-or-false + (lambda (os) + (let ((s (get-output-string os))) + (if (string=? s "") #f s)))) + + (gosc/symbol-or-false + (lambda (os) + (let ((s (gosc/string-or-false os))) + (if s (string->symbol s) #f)))) + ) + (lambda (in normalized?) + ;; TODO: Make a tokenizer option that causes XML namespace qualifiers to + ;; be ignored. + (letrec + ( + ;; Port buffer with inexpensive unread of one character and slightly + ;; more expensive pushback of second character to unread. The + ;; procedures themselves do no consing. The tokenizer currently + ;; needs two-symbol lookahead, due to ambiguous "/" while parsing + ;; element and attribute names, which could be either empty-tag + ;; syntax or XML qualified names. + (c #f) + (next-c #f) + (c-consumed? #t) + (read-c (lambda () + (if c-consumed? + (if next-c + (begin (set! c next-c) + (set! next-c #f)) + (set! c (read-char in))) + (set! c-consumed? #t)))) + (unread-c (lambda () + (if c-consumed? + (set! c-consumed? #f) + ;; TODO: Procedure name in error message really + ;; isn't "%html-parsing:make-html-tokenizer"... + (error '%html-parsing:make-html-tokenizer + "already unread: ~S" + c)))) + (push-c (lambda (new-c) + (if c-consumed? + (begin (set! c new-c) + (set! c-consumed? #f)) + (if next-c + (error '%html-parsing:make-html-tokenizer + "pushback full: ~S" + c) + (begin (set! next-c c) + (set! c new-c) + (set! c-consumed? #f)))))) + + ;; TODO: These procedures are a temporary convenience for + ;; enumerating the pertinent character classes, with an eye towards + ;; removing redundant tests of character class. These procedures + ;; should be eliminated in a future version. + (c-eof? (lambda () (eof-object? c))) + (c-amp? (lambda () (eqv? c #\&))) + (c-apos? (lambda () (eqv? c #\'))) + (c-bang? (lambda () (eqv? c #\!))) + (c-colon? (lambda () (eqv? c #\:))) + (c-quot? (lambda () (eqv? c #\"))) + (c-equals? (lambda () (eqv? c #\=))) + (c-gt? (lambda () (eqv? c #\>))) + (c-lsquare? (lambda () (eqv? c #\[))) + (c-lt? (lambda () (eqv? c #\<))) + (c-minus? (lambda () (eqv? c #\-))) + (c-pound? (lambda () (eqv? c #\#))) + (c-ques? (lambda () (eqv? c #\?))) + (c-semi? (lambda () (eqv? c #\;))) + (c-slash? (lambda () (eqv? c #\/))) + (c-splat? (lambda () (eqv? c #\*))) + (c-lf? (lambda () (eqv? c #\newline))) + (c-angle? (lambda () (memv c '(#\< #\>)))) + (c-ws? (lambda () (memv c ws-chars))) + (c-alpha? (lambda () (char-alphabetic? c))) + (c-digit? (lambda () (char-numeric? c))) + (c-alphanum? (lambda () (or (c-alpha?) (c-digit?)))) + (c-hexlet? (lambda () (memv c '(#\a #\b #\c #\d #\e #\f + #\A #\B #\C #\D #\E #\F)))) + + (skip-ws (lambda () (read-c) (if (c-ws?) (skip-ws) (unread-c)))) + + (if-read-chars + (lambda (match-chars yes-thunk no-proc) + (let loop ((chars match-chars) + (match-count 0)) + (if (null? chars) + (yes-thunk) + (begin (read-c) + (if (eqv? c (car chars)) + (begin (loop (cdr chars) (+ 1 match-count))) + (begin (unread-c) + (no-proc match-chars match-count)))))))) + + (write-chars-count + (lambda (chars count port) + (let loop ((chars chars) + (count count)) + (or (zero? count) + (begin (write-char (car chars) port) + (loop (cdr chars) + (- count 1))))))) + + (make-start-token + (if normalized? 
+ (lambda (name ns attrs) + (list name (cons '@ attrs))) + (lambda (name ns attrs) + (if (null? attrs) + (list name) + (list name (cons '@ attrs)))))) + + (make-empty-token + (lambda (name ns attrs) + (cons %html-parsing:empty-token-symbol + (make-start-token name ns attrs)))) + + (make-end-token + (if normalized? + (lambda (name ns attrs) + (list %html-parsing:end-token-symbol + name + (cons '@ attrs))) + (lambda (name ns attrs) + (if (null? attrs) + (list %html-parsing:end-token-symbol name) + (list %html-parsing:end-token-symbol + name + (cons '@ attrs)))))) + + (make-comment-token + (lambda (str) (list '*COMMENT* str))) + + (make-decl-token + (lambda (parts) (cons '*DECL* parts))) + + (scan-qname + ;; TODO: Make sure we don't accept local names that have "*", since + ;; this can break SXML tools. Have to validate this afterwards if + ;; "verbatim-safe?". Also check for "@" and maybe "@@". Check + ;; qname parsing code, especially for verbatim mode. This is + ;; important! + (lambda (verbatim-safe?) + ;; Note: If we accept some invalid local names, we only need two + ;; symbols of lookahead to determine the end of a qname. + (letrec ((os #f) + (ns '()) + (vcolons 0) + (good-os (lambda () + (or os + (begin (set! os (open-output-string)) + os))))) + (let loop () + (read-c) + (cond ((c-eof?) #f) + ((or (c-ws?) (c-splat?)) + (if verbatim-safe? + (unread-c) + #f)) + ((or (c-angle?) (c-equals?) (c-quot?) (c-apos?)) + (unread-c)) + ((c-colon?) + (or (null? ns) + (set! ns (cons ":" ns))) + (if os + (begin + (set! ns (cons (get-output-string os) + ns)) + (set! os #f)) + #f) + (loop)) + ((c-slash?) + (read-c) + (cond ((or (c-eof?) + (c-ws?) + (c-equals?) + (c-apos?) + (c-quot?) + (c-angle?) + (c-splat?)) + (unread-c) + (push-c #\/)) + (else (write-char #\/ (good-os)) + (write-char c os) + (loop)))) + (else (write-char c (good-os)) + (loop)))) + (let ((ns (if (null? ns) + #f + (apply string-append + (reverse ns)))) + (localname (if os (get-output-string os) #f))) + (if verbatim-safe? + ;; TODO: Make sure we don't have ambiguous ":" or drop + ;; any characters! + (cons ns localname) + ;; Note: We represent "xml:" and "xmlns:" syntax as + ;; normal qnames, for lack of something better to do with + ;; them when we don't support XML namespaces. + ;; + ;; TODO: Local names are currently forced to lowercase, + ;; since HTML is usually case-insensitive. If XML + ;; namespaces are used, we might wish to keep local names + ;; case-sensitive. + (if localname + (if ns + (if (or (string=? ns "xml") + (string=? ns "xmlns")) + (string->symbol (string-append ns + ":" + localname)) + (cons ns + (string->symbol (string-downcase + localname)))) + (string->symbol (string-downcase localname))) + (if ns + (string->symbol (string-downcase ns)) + ;; TODO: Ensure in rest of code that returning #f + ;; as a name here is OK. + #f))))))) + + (scan-tag + (lambda (start?) + (skip-ws) + (let ((tag-name (scan-qname #f)) + (tag-ns #f) + (tag-attrs #f) + (tag-empty? #f)) + ;; Scan element name. + (if (pair? tag-name) + (begin (set! tag-ns (car tag-name)) + (set! tag-name (cdr tag-name))) + #f) + ;; TODO: Ensure there's no case in which a #f tag-name isn't + ;; compensated for later. + ;; + ;; Scan element attributes. + (set! tag-attrs + (let scan-attr-list () + (read-c) + (cond ((c-eof?) '()) + ((c-angle?) (unread-c) '()) + ((c-slash?) + (set! tag-empty? #t) + (scan-attr-list)) + ((c-alpha?) + (unread-c) + (let ((attr (scan-attr))) + (cons attr (scan-attr-list)))) + (else (scan-attr-list))))) + ;; Find ">" or unnatural end. 
+ (let loop () + (read-c) + (cond ((c-eof?) no-token) + ((c-slash?) (set! tag-empty? #t) (loop)) + ((c-gt?) #f) + ((c-ws?) (loop)) + (else (unread-c)))) + ;; Change the tokenizer mode if necessary. + (cond ((not start?) #f) + (tag-empty? #f) + ;; TODO: Maybe make one alist lookup here, instead of + ;; two. + ((memq tag-name verbatim-to-eof-elems) + (set! nexttok verbeof-nexttok)) + ((memq tag-name verbatim-pair-elems) + (set! nexttok (make-verbpair-nexttok tag-name)))) + ;; Return a token object. + (if start? + (if tag-empty? + (make-empty-token tag-name tag-ns tag-attrs) + (make-start-token tag-name tag-ns tag-attrs)) + (make-end-token tag-name tag-ns tag-attrs))))) + + (scan-attr + (lambda () + (let ((name (scan-qname #f)) + (val #f)) + (if (pair? name) + (set! name (cdr name)) + #f) + (let loop-equals-or-end () + (read-c) + (cond ((c-eof?) no-token) + ((c-ws?) (loop-equals-or-end)) + ((c-equals?) + (let loop-quote-or-unquoted () + (read-c) + (cond ((c-eof?) no-token) + ((c-ws?) (loop-quote-or-unquoted)) + ((or (c-apos?) (c-quot?)) + (let ((term c)) + (set! val (open-output-string)) + (let loop-quoted-val () + (read-c) + (cond ((c-eof?) #f) + ((eqv? c term) #f) + ((c-amp?) (let ((entity (scan-entity))) + (display entity val) + (loop-quoted-val))) + (else (write-char c val) + (loop-quoted-val)))))) + ((c-angle?) (unread-c)) + (else + (set! val (open-output-string)) + (write-char c val) + (let loop-unquoted-val () + (read-c) + (cond ((c-eof?) no-token) + ((c-apos?) #f) + ((c-quot?) #f) + ((or (c-ws?) (c-angle?) + ;;(c-slash?) + ) + (unread-c)) + ;; Note: We can treat a slash in an + ;; unquoted attribute value as a + ;; value constituent because the + ;; slash is specially-handled only + ;; for XHTML, and XHTML attribute + ;; values must always be quoted. We + ;; could do lookahead for "/>", but + ;; that wouldn't let us parse HTML + ;; "" correctly, so this is + ;; an easier and more correct way to + ;; do things. + (else (write-char c val) + (loop-unquoted-val)))))))) + (else (unread-c)))) + (if normalized? + (list name (if val + (get-output-string val) + (symbol->string name))) + (if val + (list name (get-output-string val)) + (list name)))))) + + (scan-comment + ;; TODO: Rewrite this to use tail recursion rather than a state + ;; variable. + (lambda () + (let ((os (open-output-string)) + (state 'start-minus)) + (let loop () + (read-c) + (cond ((c-eof?) #f) + ((c-minus?) + (set! state + (case state + ((start-minus) 'start-minus-minus) + ((start-minus-minus body) 'end-minus) + ((end-minus) 'end-minus-minus) + ((end-minus-minus) (write-char #\- os) state) + (else (error '<%html-parsing:make-html-tokenizer> + "invalid state: ~S" + state)))) + (loop)) + ((and (c-gt?) (eq? state 'end-minus-minus)) #f) + (else (case state + ((end-minus) (write-char #\- os)) + ((end-minus-minus) (display "--" os))) + (set! state 'body) + (write-char c os) + (loop)))) + (make-comment-token (get-output-string os))))) + + (scan-possible-cdata + (lambda () + ;; Read ") + (lambda () (get-output-string os)) + (lambda (chars count) + (if (zero? count) + (if (eof-object? c) + (get-output-string os) + (begin (write-char c os) + (read-c) + (loop))) + (begin (write-char #\] os) + (if (= count 2) + (push-c #\]) + #f) + (loop))))))))) + + (scan-pi + (lambda () + (skip-ws) + (let ((name (open-output-string)) + (val (open-output-string))) + (let scan-name () + (read-c) + (cond ((c-eof?) #f) + ((c-ws?) #f) + ((c-alpha?) (write-char c name) (scan-name)) + (else (unread-c)))) + ;; TODO: Do we really want to emit #f for PI name? 
+ (set! name (gosc/symbol-or-false name)) + (let scan-val () + (read-c) + (cond ((c-eof?) #f) + ;; ((c-amp?) (display (scan-entity) val) + ;; (scan-val)) + ((c-ques?) + (read-c) + (cond ((c-eof?) (write-char #\? val)) + ((c-gt?) #f) + (else (write-char #\? val) + (unread-c) + (scan-val)))) + (else (write-char c val) (scan-val)))) + (list '*PI* + name + (get-output-string val))))) + + (scan-decl + ;; TODO: Find if SXML includes declaration forms, and if so, use + ;; whatever format SXML wants. + ;; + ;; TODO: Rewrite to eliminate state variables. + (letrec + ((scan-parts + (lambda () + (let ((part (open-output-string)) + (nonsymbol? #f) + (state 'before) + (last? #f)) + (let loop () + (read-c) + (cond ((c-eof?) #f) + ((c-ws?) + (case state + ((before) (loop)) + ((quoted) (write-char c part) (loop)))) + ((and (c-gt?) (not (eq? state 'quoted))) + (set! last? #t)) + ((and (c-lt?) (not (eq? state 'quoted))) + (unread-c)) + ((c-quot?) + (case state + ((before) (set! state 'quoted) (loop)) + ((unquoted) (unread-c)) + ((quoted) #f))) + (else + (if (eq? state 'before) + (set! state 'unquoted) + #f) + (set! nonsymbol? (or nonsymbol? + (not (c-alphanum?)))) + (write-char c part) + (loop)))) + (set! part (get-output-string part)) + (if (string=? part "") + '() + (cons (if (or (eq? state 'quoted) nonsymbol?) + part + ;; TODO: Normalize case of things we make + ;; into symbols here. + (string->symbol part)) + (if last? + '() + (scan-parts)))))))) + (lambda () (make-decl-token (scan-parts))))) + + (special-entity-reverse-chars-to-string-alist + '(((#\p #\m #\a) . "&") + ((#\s #\o #\p #\a) . "'") + ((#\t #\g) . ">") + ((#\t #\l) . "<") + ((#\t #\o #\u #\q) . "\""))) + + (finish-terminated-named-entity + (lambda (reverse-name-chars) + (cond ((equal? '() reverse-name-chars) + "&") + ((assoc reverse-name-chars + special-entity-reverse-chars-to-string-alist) + => (lambda (p) + (cdr p))) + (else (%html-parsing:make-xexp-char-ref + (string->symbol (apply string (reverse reverse-name-chars)))))))) + + (finish-unterminated-named-entity + (lambda (reverse-name-chars) + (apply string (cons #\& (reverse reverse-name-chars))))) + + (scan-entity + (lambda () + (read-c) + (cond ((c-eof?) "&") + ((c-alpha?) + ;; TODO: Do entity names have a maximum length? + (let loop ((reverse-name-chars (cons c '()))) + (read-c) + (cond ((c-eof?) (finish-unterminated-named-entity + reverse-name-chars)) + ((c-alpha?) (let ((reverse-name-chars (cons c reverse-name-chars))) + (cond ((assoc reverse-name-chars + special-entity-reverse-chars-to-string-alist) + => (lambda (p) + (read-c) + (or (c-semi?) + (unread-c)) + (cdr p))) + (else (loop reverse-name-chars))))) + ((c-semi?) (finish-terminated-named-entity + reverse-name-chars)) + (else (unread-c) + (finish-unterminated-named-entity + reverse-name-chars))))) + ((c-pound?) + (let ((num (open-output-string)) + (hex? #f)) + (read-c) + (cond ((c-eof?) #f) + ((memv c '(#\x #\X)) (set! hex? #t) (read-c))) + (let loop () + (cond ((c-eof?) #f) + ((c-semi?) #f) + ((or (c-digit?) (and hex? (c-hexlet?))) + (write-char c num) + (read-c) + (loop)) + (else (unread-c)))) + (set! num (get-output-string num)) + (if (string=? num "") + "&#;" + (let ((n (string->number num (if hex? 16 10)))) + (if (<= 32 n 126) + (string (integer->char n)) + (string (integer->char n))))))) + (else (unread-c) "&")))) + + (normal-nexttok + (lambda () + (read-c) + (cond ((c-eof?) no-token) + ((c-lt?) + (let loop () + (read-c) + (cond ((c-eof?) "<") + ;; ((c-ws?) (loop)) + ((c-slash?) (scan-tag #f)) + ((c-ques?) 
(scan-pi)) + ((c-alpha?) (unread-c) (scan-tag #t)) + ((c-bang?) + (read-c) + (if (c-lsquare?) + (scan-possible-cdata) + (let loop () + (cond ((c-eof?) no-token) + ((c-ws?) (read-c) (loop)) + ((c-minus?) (scan-comment)) + (else (unread-c) + (scan-decl)))))) + (else (unread-c) "<")))) + ((c-gt?) ">") + (else (let ((os (open-output-string))) + (let loop () + (cond ((c-eof?) #f) + ((c-angle?) (unread-c)) + ((c-amp?) + (let ((entity (scan-entity))) + (if (string? entity) + (begin (display entity os) + (read-c) + (loop)) + (let ((saved-nexttok nexttok)) + (set! nexttok + (lambda () + (set! nexttok + saved-nexttok) + entity)))))) + (else (write-char c os) + (or (c-lf?) + (begin (read-c) (loop)))))) + (let ((text (get-output-string os))) + (if (equal? text "") + (nexttok) + text))))))) + + (verbeof-nexttok + (lambda () + (read-c) + (if (c-eof?) + no-token + (let ((os (open-output-string))) + (let loop () + (or (c-eof?) + (begin (write-char c os) + (or (c-lf?) + (begin (read-c) (loop)))))) + (get-output-string os))))) + + (make-verbpair-nexttok + (lambda (elem-name) + (lambda () + (let ((os (open-output-string))) + ;; Accumulate up to a newline-terminated line. + (let loop () + (read-c) + (cond ((c-eof?) + ;; Got EOF in verbatim context, so set the normal + ;; nextok procedure, then fall out of loop. + (set! nexttok normal-nexttok)) + ((c-lt?) + ;; Got "<" in verbatim context, so get next + ;; character. + (read-c) + (cond ((c-eof?) + ;; Got "<" then EOF, so set to the normal + ;; nexttok procedure, add the "<" to the + ;; verbatim string, and fall out of loop. + (set! nexttok normal-nexttok) + (write-char #\< os)) + ((c-slash?) + ;; Got "symbol + (string-downcase local)) + elem-name)) + ;; This is the terminator tag, so + ;; scan to the end of it, set the + ;; nexttok, and fall out of the loop. + (begin + (let scan-to-end () + (read-c) + (cond ((c-eof?) #f) + ((c-gt?) #f) + ((c-lt?) (unread-c)) + ((c-alpha?) + (unread-c) + ;; Note: This is an + ;; expensive way to skip + ;; over an attribute, but + ;; in practice more + ;; verbatim end tags will + ;; not have attributes. + (scan-attr) + (scan-to-end)) + (else (scan-to-end)))) + (set! nexttok + (lambda () + (set! nexttok + normal-nexttok) + (make-end-token + elem-name #f '())))) + ;; This isn't the terminator tag, so + ;; add to the verbatim string the + ;; "xexp} and related procedures, except +;; using @var{tokenizer} as a source of tokens, rather than tokenizing from an +;; input port. This procedure is used internally, and generally should not be +;; called directly. + +(define %html-parsing:parse-html/tokenizer + ;; Note: This algorithm was originally written in 2001 (as part of the first + ;; Scheme library the author ever wrote), and then on 2009-08-16 was revamped + ;; to not use mutable pairs, for PLT 4 compatibility. It could still use + ;; some work to be more FP, but it works for now. + (letrec ((empty-elements + ;; TODO: Maybe make this an option. + %html-parsing:empty-elements) + (h-elem-parents + ;; Am doing this kludge mainly for mid-1990s HTML that uses the `p` + ;; element wrong. Trying to get all appropriate parents other than + ;; `p` that I can, to reduce breaking other code. + '(a article aside blink blockquote body center footer form header html li main nav section slot template)) + (parent-constraints + ;; TODO: Maybe make this an option. + `((area . (map span)) + (body . (html)) + (caption . (table)) + (colgroup . (table)) + (dd . (dl)) + (dt . (dl)) + (frame . (frameset)) + (head . (html)) + (h1 . ,h-elem-parents) + (h2 . 
,h-elem-parents) + (h3 . ,h-elem-parents) + (h4 . ,h-elem-parents) + (h5 . ,h-elem-parents) + (h6 . ,h-elem-parents) + (isindex . (head)) + (li . (dir menu ol ul)) + (meta . (head)) + (noframes . (frameset)) + (option . (select)) + (p . (blockquote body details figcaption html li td th)) + (param . (applet)) + (tbody . (table)) + (td . (tr)) + (th . (tr)) + (thead . (table)) + (title . (head)) + (tr . (table tbody thead)))) + (token-kinds-that-always-get-added + `(*COMMENT* + *DECL* + *PI* + ,%html-parsing:entity-token-symbol + ,%html-parsing:text-string-token-symbol + ,%html-parsing:text-char-token-symbol)) + (start-tag-name (lambda (tag-token) (car tag-token))) + (end-tag-name (lambda (tag-token) (list-ref tag-token 1)))) + (lambda (tokenizer normalized?) + ;;(log-html-parsing-debug "(%html-parsing:parse-html/tokenizer ~S ~S)" tokenizer normalized?) + (let ((begs (list (vector #f '())))) + (letrec ((add-thing-as-child-of-current-beg + (lambda (tok) + (let ((beg (car begs))) + (vector-set! beg 1 (cons tok (vector-ref beg 1)))))) + + (beg->elem + (lambda (beg) + (let ((elem-name (vector-ref beg 0)) + (attrs-and-contents (reverse (vector-ref beg 1)))) + (cons elem-name attrs-and-contents)))) + + (finish-current-beg-and-return-elem + (lambda () + (let ((elem (beg->elem (car begs)))) + (set! begs (cdr begs)) + (or (null? begs) + (add-thing-as-child-of-current-beg elem)) + elem))) + + (finish-current-beg + (lambda () + (finish-current-beg-and-return-elem))) + + (finish-all-begs-and-return-top + (lambda () + (let loop () + (let ((elem (finish-current-beg-and-return-elem))) + (if (car elem) + (loop) + (cdr elem)))))) + + (finish-begs-up-to-and-including-name + (lambda (name) + ;; (log-html-parsing-debug "(finish-begs-up-to-and-including-name ~S)" name) + (let loop-find-name ((find-begs begs) + (depth 1)) + (let ((beg-name (vector-ref (car find-begs) 0))) + (cond ((not beg-name) + ;; We reached the root without finding a + ;; matching beg, so don't finish anything. + ;; + ;; TODO: 2022-04-02: Consider having a `*TOP*` + ;; kludge in `parent-constraints` that's checked + ;; here, especially for handling mid-1990s HTML + ;; `p` element (so that we can keep `p` from + ;; being a child of `p` even when there's no + ;; parent `body` or `html` element). + (void)) + ((eqv? name beg-name) + ;; We found a match, so finish the begs up to + ;; depth. + (let loop-finish ((depth depth)) + (or (zero? depth) + (begin + (finish-current-beg) + (loop-finish (- depth 1)))))) + (else + ;; Didn't find a match yet, and there's still at + ;; least one more beg to look at, so recur. + (loop-find-name (cdr find-begs) + (+ depth 1)))))))) + + (finish-begs-upto-but-not-including-names + (lambda (names) + ;; (log-html-parsing-debug "(finish-begs-upto-but-not-including-names ~S)" names) + ;; (log-html-parsing-debug "begs = ~S" begs) + (let loop-find-name ((find-begs begs) + (depth 0)) + (let ((beg-name (vector-ref (car find-begs) 0))) + (cond ((not beg-name) + ;; We reached the root without finding a + ;; matching beg, so simply discard it. + (void)) + ((memq beg-name names) + ;; We found a match, so finish the begs up to + ;; depth. + (let loop-finish ((depth depth)) + (or (zero? depth) + (begin + (finish-current-beg) + (loop-finish (- depth 1)))))) + (else + ;; Didn't find a match yet, and there's still at + ;; least one more beg to look at, so recur. + (loop-find-name (cdr find-begs) + (+ depth 1))))))))) + + (let loop () + (let ((tok (tokenizer))) + (if (null? 
tok) + (finish-all-begs-and-return-top) + (let ((kind (%html-parsing:xexp-token-kind tok))) + ;; (log-html-parsing-debug "kind = ~S" kind) + (cond ((memv kind token-kinds-that-always-get-added) + (add-thing-as-child-of-current-beg tok)) + ((eqv? kind %html-parsing:start-token-symbol) + ;; (log-html-parsing-debug "is-start-token-symbol") + (let* ((name (start-tag-name tok)) + (cell (assq name parent-constraints))) + ;; (log-html-parsing-debug "name = ~S cell = ~S" name cell) + (and cell + (finish-begs-upto-but-not-including-names + (cons 'div (cdr cell)))) + (if (memq name empty-elements) + (add-thing-as-child-of-current-beg tok) + (set! begs (cons (vector (car tok) + (cdr tok)) + begs))))) + ((eqv? kind %html-parsing:empty-token-symbol) + ;; Empty tag token, so just add it to current + ;; beginning while stripping off leading `*EMPTY*' + ;; symbol so that the token becomes normal SXML + ;; element syntax. + (add-thing-as-child-of-current-beg (cdr tok))) + ((eqv? kind %html-parsing:end-token-symbol) + (let ((name (end-tag-name tok))) + (if name + ;; Try to finish to a start tag matching this + ;; end tag. If none, just drop the token, + ;; though we used to add it to the current + ;; beginning. + (finish-begs-up-to-and-including-name + name) + ;; We have an anonymous end tag, so match it + ;; with the most recent beginning. If no + ;; beginning to match, then just drop the + ;; token, though we used to add it to the + ;; current beginning. + (and (vector-ref (car begs) 0) + (finish-current-beg))))) + (else (error 'parse-html/tokenizer + "unknown tag kind: ~S" + kind))) + (loop)))))))))) + +;; TODO: Quote of message to a user: +;; +;; >I think this behavior is due to HtmlPrag's use in "parse-html/tokenizer" +;; >of its local "parent-constraints" variable. +;; > +;; >The following line of code from the variable binding expresses the +;; >constraint that any "p" element can have as immediate parent element +;; >only "body", "td", or "th": +;; > +;; > (p . (body td th)) +;; > +;; >I think I know a good heuristic for dealing with unfamiliar but +;; >seemingly well-formed elements, like "page" in this case, but I'm afraid +;; >I don't have time to implement it right now. (I am job-hunting right +;; >now, and there are many other coding things I need to do first.) +;; > +;; >Would adding "page" to the above line of the HtmlPrag source code work +;; >around the current problem, or do you need a better solution right now? + +;; @defproc %parse-html input normalized? top? +;; +;; This procedure is now used internally by @code{html->xexp} and its +;; variants, and should not be used directly by programs. The interface is +;; likely to change in future versions of HtmlPrag. + +(define (%html-parsing:parse-html input normalized? top?) + (let ((parse + (lambda () + (%html-parsing:parse-html/tokenizer + (%html-parsing:make-html-tokenizer + (cond ((input-port? input) input) + ((string? input) (open-input-string input)) + (else (error + '%html-parsing:parse-html + "invalid input type: ~E" + input))) + normalized?) + normalized?)))) + (if top? + (cons '*TOP* (parse)) + (parse)))) + +;; @defproc html->sxml-0nf input +;; @defprocx html->sxml-1nf input +;; @defprocx html->sxml-2nf input +;; @defprocx html->sxml input +(doc (defproc (html->xexp (input (or/c input-port? string?))) + xexp + + (para "Parse HTML permissively from " + (racket input) + ", which is either an input port or a string, and emit an +SXML/xexp equivalent or approximation. 
To borrow and slightly modify an +example from Kiselyov's discussion of his HTML parser:") + + (racketinput + (html->xexp + (string-append + "whatever" + " link

" + "