diff --git a/archiver/archive-info.rkt b/archiver/archive-info.rkt new file mode 100644 index 0000000..17bb747 --- /dev/null +++ b/archiver/archive-info.rkt @@ -0,0 +1,3 @@ +#lang info + +(define build-deps '("rackunit-lib" "web-server-lib" "http-easy-lib" "html-parsing" "html-writing" "json-pointer" "ini-lib" "memo" "net-cookies-lib" "gui-easy-lib" "sql" "charterm" "cli")) diff --git a/archiver/archiver-cli.rkt b/archiver/archiver-cli.rkt new file mode 100644 index 0000000..d96aa65 --- /dev/null +++ b/archiver/archiver-cli.rkt @@ -0,0 +1,71 @@ +#lang cli +(require charterm + "archiver.rkt") + +(help (usage "Downloads a single Fandom wiki in BreezeWiki offline format." + "" + "Downloaded pages go into `archive/` next to the executable." + "Database goes into `archiver.db*` next to the executable." + "The database is necessary to store your download progress and resume where you left off if the process is interrupted.") + (ps "" + "Default output style is `progress` in a tty and `lines` otherwise.")) + +(flag (output-quiet?) + ("-q" "--output-quiet" "disable progress output") + (output-quiet? #t)) + +(flag (output-lines?) + ("-l" "--output-lines" "output the name of each file downloaded") + (output-lines? #t)) + +(flag (output-progress?) + ("-p" "--output-progress" "progress output for terminals") + (output-progress? #t)) + +(constraint (one-of output-quiet? output-lines? output-progress?)) + +(program + (start [wikiname "wikiname to download"]) + ;; set up arguments + (define width 80) + (when (not (or (output-quiet?) (output-lines?) (output-progress?))) + (cond [(terminal-port? (current-input-port)) + (output-progress? #t)] + [else + (output-lines? #t)])) + (define (update-width) + (when (output-progress?) + (with-charterm + (call-with-values (λ () (charterm-screen-size)) + (λ (cols rows) (set! width cols)))))) + (update-width) + ;; check + (when (or (not wikiname) (equal? 
wikiname "")) + (raise-user-error "Please specify the wikiname to download on the command line.")) + ;; stage 1 + (cond [(output-lines?) (displayln "Downloading list of pages...")] + [(output-progress?) (printf "Downloading list of pages... \r")]) + (if-necessary-download-list-of-pages + wikiname + (λ (a b c) + (cond [(output-progress?) (printf "Downloading list of pages... [~a/~a]\r" a b)]))) + ;; stage 2 + (save-each-page + wikiname + (λ (a b c) + (define basename (basename->name-for-query c)) + (cond + [(output-lines?) + (displayln basename)] + [(output-progress?) + (when (eq? (modulo a 20) 0) + (thread (λ () (update-width)))) + (define prefix (format "[~a/~a] " a b)) + (define rest (- width (string-length prefix))) + (define real-width (min (string-length basename) rest)) + (define spare-width (- rest real-width)) + (define name-display (substring basename 0 real-width)) + (define whitespace (make-string spare-width #\ )) + (printf "~a~a~a\r" prefix name-display whitespace)])))) + +(run start) diff --git a/archiver/archiver-database.rkt b/archiver/archiver-database.rkt new file mode 100644 index 0000000..2defb0e --- /dev/null +++ b/archiver/archiver-database.rkt @@ -0,0 +1,47 @@ +#lang racket/base +(require racket/list + racket/path + racket/runtime-path + racket/string + json + json-pointer + db + "../lib/syntax.rkt") + +(provide + slc) + +(define-runtime-path database-file "../storage/archiver.db") + +(define migrations + (wrap-sql + ((query-exec slc "create table page (wikiname TEXT NOT NULL, basename TEXT NOT NULL, progress INTEGER NOT NULL, PRIMARY KEY (wikiname, basename))") + (query-exec slc "create table wiki (wikiname TEXT NOT NULL, progress INTEGER, PRIMARY KEY (wikiname))")) + ((query-exec slc "create table special_page (wikiname TEXT NOT NULL, key TEXT NOT NULL, basename TEXT NOT NULL, PRIMARY KEY (wikiname, key))")) + ((query-exec slc "update wiki set progress = 2 where wikiname in (select wikiname from wiki inner join page using (wikiname) group 
by wikiname having min(page.progress) = 1)")) + ((query-exec slc "create table image (wikiname TEXT NOT NULL, hash TEXT NOT NULL, url TEXT NOT NULL, ext TEXT, source INTEGER NOT NULL, progress INTEGER NOT NULL, PRIMARY KEY (wikiname, hash))")) + ((query-exec slc "alter table wiki add column sitename TEXT") + (query-exec slc "alter table wiki add column basepage TEXT") + (query-exec slc "alter table wiki add column license_text TEXT") + (query-exec slc "alter table wiki add column license_url TEXT")))) + +(define slc (sqlite3-connect #:database database-file #:mode 'create)) +(query-exec slc "PRAGMA journal_mode=WAL") +(define database-version + (with-handlers ([exn:fail:sql? + (λ (exn) + ; need to set up the database + (query-exec slc "create table database_version (version integer, primary key (version))") + (query-exec slc "insert into database_version values (0)") + 0)]) + (query-value slc "select version from database_version"))) + +(let do-migrate-step () + (when (database-version . < . (length migrations)) + (call-with-transaction + slc + (list-ref migrations database-version)) + (set! 
database-version (add1 database-version)) + (query-exec slc "update database_version set version = $1" database-version) + (do-migrate-step))) + diff --git a/archiver/archiver-gui.rkt b/archiver/archiver-gui.rkt new file mode 100644 index 0000000..1badf65 --- /dev/null +++ b/archiver/archiver-gui.rkt @@ -0,0 +1,204 @@ +#lang racket/base +(require racket/class + racket/list + racket/port + racket/set + racket/string + db + net/http-easy + racket/gui/easy + racket/gui/easy/operator + "archiver-database.rkt" + "archiver.rkt" + "../lib/url-utils.rkt" + "../lib/xexpr-utils.rkt") + +(define active-threads (mutable-seteq)) + +(define/obs @auto-retry #f) +(define/obs @wikiname "") +(define/obs @state 'waiting) +(define/obs @num-pages 1) +(define/obs @done-pages 0) +(define/obs @just-done "") +(define/obs @queue '()) +(define @title + (obs-combine + (λ (state queue num-pages done-pages) + (define suffix (if (pair? queue) + (format " +~a" (length queue)) + "")) + (define progress (if (eq? num-pages 0) + " 0%" + (format " ~a%" (round (inexact->exact (* (/ done-pages num-pages) 100)))))) + (case state + [(waiting stage-0) (format "Fandom Archiver~a" suffix)] + [(stage-1) (format "Fandom Archiver 0%~a" suffix)] + [(stage-2) (format "Fandom Archiver~a~a" progress suffix)] + [(err) "ERROR Fandom Archiver"] + [(done) "Fandom Archiver 100%"])) + @state @queue @num-pages (obs-throttle @done-pages #:duration 5000))) + +(define-syntax-rule (t body ...) + (set-add! active-threads (thread (λ () body ...)))) + +(define (do-start-or-queue) + (define wikiname (obs-peek @wikiname)) + (:= @wikiname "") + (when (not (equal? (string-trim wikiname) "")) + (@queue . <~ . (λ (q) (append q (list wikiname)))) + (shift-queue-maybe))) + +(define (shift-queue-maybe) + (when (memq (obs-peek @state) '(waiting done)) + (define q (obs-peek @queue)) + (cond + [(pair? 
q) + (define wikiname (car q)) + (:= @queue (cdr q)) + (do-start-stage1 wikiname)] + [#t (:= @state 'done)]))) + +(define (do-start-stage1 wikiname) + (:= @just-done "") + (:= @done-pages 0) + (:= @num-pages 1) + (t (with-handlers ([exn:fail? (handle-graphical-exn wikiname)]) + (:= @state 'stage-0) + (if-necessary-download-list-of-pages wikiname (λ (now-done num-pages just-done-name) + (:= @num-pages num-pages) + (:= @done-pages now-done) + (:= @just-done just-done-name) + (:= @state 'stage-1))) + (do-start-stage2 wikiname)))) + +(define (do-start-stage2 wikiname) + (:= @just-done "") + (:= @num-pages 1) + (:= @done-pages 0) + (t (with-handlers ([exn:fail? (handle-graphical-exn wikiname)]) + (save-each-page wikiname (λ (now-done num-pages just-done-path) + (:= @num-pages num-pages) + (:= @done-pages now-done) + (:= @just-done just-done-path))) + (:= @state 'waiting) + (shift-queue-maybe))) + (:= @state 'stage-2)) + +(define (exn->string e) + (with-output-to-string + (λ () + (displayln (exn-message e)) + (displayln "context:") + (for ([item (continuation-mark-set->context (exn-continuation-marks e))]) + (printf " ~a" (srcloc->string (cdr item))) + (when (car item) + (printf ": ~a" (car item))) + (displayln ""))))) + +(define ((handle-graphical-exn wikiname) e) + (displayln (exn->string e) (current-error-port)) + (cond + [(obs-peek @auto-retry) + (do-retry-end wikiname)] + [#t + (:= @state 'err) + (thread + (λ () + (define/obs @visible? #t) + (render + (dialog #:title "Download Error" + #:style '(resize-border) + #:mixin (λ (%) (class % (super-new) + (obs-observe! @visible? (λ (visible?) (send this show visible?))))) + (vpanel #:margin '(15 15) + (text "Encountered this error while downloading:") + (input #:style '(multiple hscroll) + #:min-size '(#f 200) + (exn->string e)) + (button "Retry Now" (λ () (:= @visible? #f) (do-retry-now wikiname))) + (button "Retry Round-Robin" (λ () (:= @visible? #f) (do-retry-end wikiname))) + (button "Skip Wiki" (λ () (:= @visible? 
#f) (do-continue))) + (button "Use Auto-Retry" (λ () + (:= @auto-retry #t) + (:= @visible? #f) + (do-retry-end wikiname))) + (text "Be careful not to auto-retry an infinite loop!"))) + main-window))) + (sleep) + ; make sure the old broken threads are all gone + (for ([th active-threads]) (kill-thread th)) + (set-clear! active-threads)])) + +(define (do-retry-now wikiname) + (@queue . <~ . (λ (q) (append (list wikiname) q))) + (:= @state 'waiting) + (shift-queue-maybe)) + +(define (do-retry-end wikiname) + (@queue . <~ . (λ (q) (append q (list wikiname)))) + (:= @state 'waiting) + (shift-queue-maybe)) + +(define (do-continue) + (:= @state 'waiting) + (shift-queue-maybe)) + +(define (display-basename basename) + (define limit 40) + (cond [(string? basename) + (define query (basename->name-for-query basename)) + (define segments (string-split query "/")) + (when (and ((string-length query) . > . limit) ((length segments) . >= . 2)) + (set! query (string-append ".../" (last segments)))) + (when ((string-length query) . > . limit) + (set! query (string-append (substring query 0 (- limit 3)) "..."))) + query] + [#t "?"])) + +(define main-window + (render + (window #:title @title + #:size '(360 200) + #:mixin (λ (%) (class % + (super-new) + (define/augment (on-close) + (for ([th active-threads]) (kill-thread th)) + (disconnect slc)))) + ;; input box at the top + (hpanel (text "https://") + (input @wikiname + (λ (event data) (cond + [(eq? event 'input) (:= @wikiname data)] + [(eq? event 'return) (do-start-or-queue)]))) + (text ".fandom.com")) + (button (@queue . ~> . (λ (q) (if (null? q) "Start" "Queue"))) (λ () (do-start-or-queue))) + (text (@queue . ~> . (λ (q) (if (null? 
q) "" (string-join #:before-first "Queue: " q ", "))))) + ;; show status based on overall application state + (case-view + @state + ;; waiting for wikiname entry + ((waiting) (vpanel + (text "Fill in the wikiname and click start."))) + ((stage-0) (vpanel + (text "Checking data..."))) + ((stage-1) (vpanel + (text "Gathering list of pages...") + (text (@just-done . ~> . display-basename)) + (text (@done-pages . ~> . (λ (x) (if (eq? x 0) + "0/?" + (format "~a/~a" x (obs-peek @num-pages)))))))) + ;; downloading contents + ((stage-2) (vpanel + (text "Downloading page text...") + (progress @done-pages #:range @num-pages) + (text (@done-pages . ~> . (λ (x) (format "~a/~a" x (obs-peek @num-pages))))) + (text (@just-done . ~> . display-basename)))) + ((done) (vpanel + (text "All wikis downloaded!"))) + ((err) (vpanel + (text "Error. Check the popup window."))) + (else (text (@state . ~> . (λ (state) (format "invalid state: ~a" state)))))) + (checkbox #:label "Auto-retry on error? (Dangerous)" + #:checked? @auto-retry + (λ:= @auto-retry))))) diff --git a/archiver/archiver.rkt b/archiver/archiver.rkt new file mode 100644 index 0000000..fbeec46 --- /dev/null +++ b/archiver/archiver.rkt @@ -0,0 +1,309 @@ +#lang racket/base +(require racket/file + racket/function + racket/list + racket/runtime-path + racket/string + net/url + net/mime + file/sha1 + net/http-easy + db + "../lib/html-parsing/main.rkt" + json + "archiver-database.rkt" + "../lib/mime-types.rkt" + "../lib/tree-updater.rkt" + "../lib/url-utils.rkt" + "../lib/xexpr-utils.rkt" + "../lib/archive-file-mappings.rkt") + +(define archive-slc slc) + +(provide + if-necessary-download-list-of-pages + download-list-of-pages + save-each-page + basename->name-for-query + image-url->values + hash->save-dir + archive-slc) + +(module+ test + (require rackunit)) + +(define-runtime-path archive-root "../storage/archive") +#;(define archive-root "archive") + +(define sources '#hasheq((style . 1) (page . 
2))) + +(define (get-origin wikiname) + (format "https://~a.fandom.com" wikiname)) + +(define (insert-wiki-entry wikiname) + (define dest-url + (format "https://~a.fandom.com/api.php?~a" + wikiname + (params->query '(("action" . "query") + ("meta" . "siteinfo") + ("siprop" . "general|rightsinfo") + ("format" . "json") + ("formatversion" . "2"))))) + (define data (response-json (get dest-url))) + (define exists? (query-maybe-value slc "select progress from wiki where wikiname = ?" wikiname)) + (if exists? + (query-exec slc "update wiki set sitename = ?, basepage = ?, license_text = ?, license_url = ? where wikiname = ?" + (jp "/query/general/sitename" data) + (second (regexp-match #rx"/wiki/(.*)" (jp "/query/general/base" data))) + (jp "/query/rightsinfo/text" data) + (jp "/query/rightsinfo/url" data) + wikiname) + (query-exec slc "insert into wiki (wikiname, progress, sitename, basepage, license_text, license_url) values (?, 1, ?, ?, ?, ?)" + wikiname + (jp "/query/general/sitename" data) + (second (regexp-match #rx"/wiki/(.*)" (jp "/query/general/base" data))) + (jp "/query/rightsinfo/text" data) + (jp "/query/rightsinfo/url" data)))) + +;; call 1 if not yet done for that wiki +(define (if-necessary-download-list-of-pages wikiname callback) + (define wiki-progress (query-maybe-value slc "select progress from wiki where wikiname = ?" wikiname)) + ;; done yet? + (unless (and (real? wiki-progress) (wiki-progress . >= . 1)) + ;; count total pages + (define dest-url + (format "https://~a.fandom.com/api.php?~a" + wikiname + (params->query `(("action" . "query") ("meta" . "siteinfo") ("siprop" . "statistics") ("format" . "json"))))) + (define num-pages (jp "/query/statistics/articles" (response-json (get dest-url)))) + (download-list-of-pages wikiname callback 0 num-pages #f))) + +;; 1. 
Download list of wiki pages and store in database +(define (download-list-of-pages wikiname callback total-so-far grand-total path-with-namefrom) + (define url (if path-with-namefrom + (format "https://~a.fandom.com~a" wikiname path-with-namefrom) + (format "https://~a.fandom.com/wiki/Local_Sitemap" wikiname))) + (define r (get url)) + (define page (html->xexp (bytes->string/utf-8 (response-body r)))) + (define link-namefrom + ((query-selector (λ (t a c x) (and (eq? t 'a) + (pair? x) + (string-contains? (car x) "Next page") + (let ([href (get-attribute 'href a)] ) + (and href (string-contains? href "/wiki/Local_Sitemap"))))) + page #:include-text? #t))) + (define row-values + (for/list ([link (in-producer + (query-selector + (λ (t a c) (eq? t 'a)) + ((query-selector (λ (t a c) (has-class? "mw-allpages-chunk" a)) page))) + #f)]) + (list wikiname (local-encoded-url->basename (get-attribute 'href (bits->attributes link))) 0))) + (define query-template (string-join (make-list (length row-values) "(?, ?, ?)") ", " #:before-first "insert or ignore into page (wikiname, basename, progress) values ")) + (apply query-exec slc query-template (flatten row-values)) + (define new-total (+ (length row-values) total-so-far)) + (callback new-total grand-total (second (last row-values))) + (cond + [link-namefrom ; repeat on the next page + (download-list-of-pages wikiname callback new-total grand-total (get-attribute 'href (bits->attributes link-namefrom)))] + [#t ; all done downloading sitemap + (insert-wiki-entry wikiname)])) + +;; 2. Download each page via API and: +;; * Save API response to file +(define max-page-progress 1) +(define (save-each-page wikiname callback) + ;; prepare destination folder + (define save-dir (build-path archive-root wikiname)) + (make-directory* save-dir) + ;; gather list of basenames to download (that aren't yet complete) + (define basenames (query-list slc "select basename from page where wikiname = ? and progress < ?" 
+ wikiname max-page-progress)) + ;; counter of complete/incomplete basenames + (define already-done-count + (query-value slc "select count(*) from page where wikiname = ? and progress = ?" + wikiname max-page-progress)) + (define not-done-count + (query-value slc "select count(*) from page where wikiname = ? and progress < ?" + wikiname max-page-progress)) + ;; set initial progress + (callback already-done-count (+ already-done-count not-done-count) "") + ;; loop through basenames and download + (for ([basename basenames] + [i (in-naturals 1)]) + (define name-for-query (basename->name-for-query basename)) + (define dest-url + (format "https://~a.fandom.com/api.php?~a" + wikiname + (params->query `(("action" . "parse") + ("page" . ,name-for-query) + ("prop" . "text|headhtml|langlinks") + ("formatversion" . "2") + ("format" . "json"))))) + (define r (get dest-url)) + (define body (response-body r)) + (define filename (string-append basename ".json")) + (define save-path + (cond [((string-length basename) . > . 240) + (define key (sha1 (string->bytes/latin-1 basename))) + (query-exec slc "insert into special_page (wikiname, key, basename) values (?, ?, ?)" + wikiname key basename) + (build-path save-dir (string-append key ".json"))] + [#t + (build-path save-dir (string-append basename ".json"))])) + (display-to-file body save-path #:exists 'replace) + (query-exec slc "update page set progress = 1 where wikiname = ? and basename = ?" + wikiname basename) + (callback (+ already-done-count i) (+ already-done-count not-done-count) basename)) + ;; saved all pages, register that fact in the database + (query-exec slc "update wiki set progress = 2 where wikiname = ?" wikiname)) + +;; 3. 
Download CSS and: +;; * Save CSS to file +;; * Record style images to database +(define (check-style-for-images wikiname path) + (define content (file->string path)) + (define urls (regexp-match* #rx"url\\(\"?'?([^)]*)'?\"?\\)" content #:match-select cadr)) + (for/list ([url urls] + #:when (not (or (equal? url "") + (equal? url "'") + (string-contains? url "/resources-ucp/") + (string-contains? url "/fonts/") + (string-contains? url "/drm_fonts/") + (string-contains? url "//db.onlinewebfonts.com/") + (string-contains? url "//bits.wikimedia.org/") + (string-contains? url "dropbox") + (string-contains? url "only=styles") + (string-contains? url "https://https://") + (regexp-match? #rx"^%20" url) + (regexp-match? #rx"^data:" url)))) + (cond + [(string-prefix? url "https://") url] + [(string-prefix? url "http://") (regexp-replace #rx"http:" url "https:")] + [(string-prefix? url "//") (string-append "https:" url)] + [(string-prefix? url "/") (format "https://~a.fandom.com~a" wikiname url)] + [else (raise-user-error "While calling check-style-for-images, this URL had an unknown format and couldn't be saved:" url path)]))) + +(define (download-styles-for-wiki wikiname) + (define save-dir (build-path archive-root wikiname "styles")) + (make-directory* save-dir) + (define theme (λ (theme-name) + (cons (format "https://~a.fandom.com/wikia.php?controller=ThemeApi&method=themeVariables&variant=~a" wikiname theme-name) + (build-path save-dir (format "themeVariables-~a.css" theme-name))))) + ;; (Listof (Pair url save-path)) + (define styles + (list + (theme "default") + (theme "light") + (theme "dark") + (cons (format "https://~a.fandom.com/load.php?lang=en&modules=skin.fandomdesktop.styles%7Cext.fandom.PortableInfoboxFandomDesktop.css%7Cext.fandom.GlobalComponents.CommunityHeaderBackground.css%7Cext.gadget.site-styles%2Csound-styles%7Csite.styles&only=styles&skin=fandomdesktop" wikiname) + (build-path save-dir "site.css")))) + (for ([style styles]) + (define r (get (car 
style))) + (define body (response-body r)) + (display-to-file body (cdr style) #:exists 'replace) + ;; XXX: how the HELL do I deal with @import?? would need some kind of recursion here. how will the page server know where to look up the style file to be able to serve them again? do I add another link-stylesheet tag to the main page? what about the remaining stuck @import url? + ) + styles) + +(define (do-step-3 wikiname) + (define wiki-progress (query-maybe-value slc "select progress from wiki where wikiname = ?" wikiname)) + (unless (and (number? wiki-progress) (wiki-progress . >= . 3)) + (define styles (download-styles-for-wiki wikiname)) + (define unique-image-urls + (remove-duplicates + (map image-url->values + (flatten + (for/list ([style styles]) + (check-style-for-images wikiname (cdr style))))) + #:key cdr)) + (println unique-image-urls) + (for ([pair unique-image-urls]) + (query-exec slc "insert or ignore into image (wikiname, url, hash, ext, source, progress) values (?, ?, ?, NULL, 1, 0)" wikiname (car pair) (cdr pair))) + (query-exec slc "update wiki set progress = 3 where wikiname = ?" wikiname))) + +;; 4: From downloaded pages, record URLs of image sources and inline style images to database +(define (hash->save-dir wikiname hash) + (build-path archive-root wikiname "images" (substring hash 0 1) (substring hash 0 2))) + +(define (image-url->values i) + ;; TODO: handle case where there is multiple cb parameter on minecraft wiki + ;; TODO: ensure it still "works" with broken & on minecraft wiki + (define no-cb (regexp-replace #rx"\\cb=[0-9]+&?" 
i "")) ; remove cb url parameter which does nothing + (define key (regexp-replace #rx"[&?]$" no-cb "")) ; remove extra separator if necessary + (define hash (sha1 (string->bytes/utf-8 key))) + (cons key hash)) + +(define (check-json-for-images wikiname path) + (define data (with-input-from-file path (λ () (read-json)))) + (define page (html->xexp (preprocess-html-wiki (jp "/parse/text" data)))) + (define tree (update-tree-wiki page wikiname)) + (remove-duplicates + (for/list ([element (in-producer + (query-selector + (λ (t a c) + (and (eq? t 'img) + (get-attribute 'src a))) + tree) + #f)]) + (image-url->values (get-attribute 'src (bits->attributes element)))))) + +;; 5. Download image sources and style images according to database +(define (save-each-image wikiname source callback) + ;; gather list of basenames to download (that aren't yet complete) + (define rows (query-rows slc "select url, hash from image where wikiname = ? and source <= ? and progress < 1" + wikiname source)) + ;; counter of complete/incomplete basenames + (define already-done-count + (query-value slc "select count(*) from image where wikiname = ? and source <= ? and progress = 1" + wikiname source)) + (define not-done-count + (query-value slc "select count(*) from image where wikiname = ? and source <= ? and progress < 1" + wikiname source)) + ;; set initial progress + (callback already-done-count (+ already-done-count not-done-count) "") + ;; loop through urls and download + (for ([row rows] + [i (in-naturals 1)]) + ;; row fragments + (define url (vector-ref row 0)) + (define hash (vector-ref row 1)) + ;; check + (printf "~a -> ~a~n" url hash) + (define r (get url)) + (define declared-type (response-headers-ref r 'content-type)) + (define final-type (if (equal? 
declared-type #"application/octet-stream") + (let ([sniff-entity (message-entity (mime-analyze (response-body r)))]) + (string->bytes/latin-1 (format "~a/~a" (entity-type sniff-entity) (entity-subtype sniff-entity)))) + declared-type)) + (define ext (bytes->string/latin-1 (mime-type->ext final-type))) + ;; save + (define save-dir (hash->save-dir wikiname hash)) + (make-directory* save-dir) + (define save-path (build-path save-dir (string-append hash "." ext))) + (define body (response-body r)) + (display-to-file body save-path #:exists 'replace) + (query-exec slc "update image set progress = 1, ext = ? where wikiname = ? and hash = ?" + ext wikiname hash) + (callback (+ already-done-count i) (+ already-done-count not-done-count) (string-append hash "." ext))) + ;; TODO: saved all images, register that fact in the database + ) + + +(module+ test + (check-equal? (html->xexp "") + '(*TOP* (img (@ (src "https://example.com/images?src=Blah.jpg&width=150"))))) + #;(download-list-of-pages "minecraft" values) + #;(save-each-page "minecraft" values) + #;(check-json-for-images "chiki" (build-path archive-root "chiki" "Fiona.json")) + #;(do-step-3 "gallowmere") + #;(save-each-image "gallowmere" (hash-ref sources 'style) (λ (a b c) (printf "~a/~a ~a~n" a b c))) + + #;(for ([wikiname (query-list slc "select wikiname from wiki")]) + (println wikiname) + (insert-wiki-entry wikiname)) + + #;(for ([wikiname (query-list slc "select wikiname from wiki")]) + (println wikiname) + (do-step-3 wikiname) + (save-each-image wikiname (hash-ref sources 'style) (λ (a b c) (printf "~a/~a ~a~n" a b c))))) diff --git a/breezewiki.rkt b/breezewiki.rkt index 44d6771..2e2772f 100644 --- a/breezewiki.rkt +++ b/breezewiki.rkt @@ -19,8 +19,10 @@ (require-reloadable "src/page-search.rkt" page-search) (require-reloadable "src/page-set-user-settings.rkt" page-set-user-settings) (require-reloadable "src/page-static.rkt" static-dispatcher) +(require-reloadable "src/page-static-archive.rkt" 
page-static-archive) (require-reloadable "src/page-subdomain.rkt" subdomain-dispatcher) (require-reloadable "src/page-wiki.rkt" page-wiki) +(require-reloadable "src/page-wiki-offline.rkt" page-wiki-offline) (require-reloadable "src/page-file.rkt" page-file) (reload!) @@ -42,7 +44,9 @@ page-proxy page-search page-set-user-settings + page-static-archive page-wiki + page-wiki-offline page-file redirect-wiki-home static-dispatcher diff --git a/dist.rkt b/dist.rkt index 35e1824..deb08a8 100644 --- a/dist.rkt +++ b/dist.rkt @@ -13,8 +13,10 @@ (require (only-in "src/page-search.rkt" page-search)) (require (only-in "src/page-set-user-settings.rkt" page-set-user-settings)) (require (only-in "src/page-static.rkt" static-dispatcher)) +(require (only-in "src/page-static-archive.rkt" page-static-archive)) (require (only-in "src/page-subdomain.rkt" subdomain-dispatcher)) (require (only-in "src/page-wiki.rkt" page-wiki)) +(require (only-in "src/page-wiki-offline.rkt" page-wiki-offline)) (require (only-in "src/page-file.rkt" page-file)) (serve/launch/wait @@ -31,7 +33,9 @@ page-proxy page-search page-set-user-settings + page-static-archive page-wiki + page-wiki-offline page-file redirect-wiki-home static-dispatcher diff --git a/lib/archive-file-mappings.rkt b/lib/archive-file-mappings.rkt new file mode 100644 index 0000000..4aa8a69 --- /dev/null +++ b/lib/archive-file-mappings.rkt @@ -0,0 +1,28 @@ +#lang racket/base +(require racket/string + net/url + (only-in net/uri-codec uri-decode) + "url-utils.rkt") +(provide + local-encoded-url->segments + url-segments->basename + local-encoded-url->basename + basename->name-for-query + url-segments->guess-title) + +(define (local-encoded-url->segments str) ; '("wiki" "Page_title") + (map path/param-path (url-path (string->url str)))) + +(define (url-segments->basename segments) ; "Page_title" filename encoded, no extension or dir prefix + (define extra-encoded (map (λ (s) (bytes->string/latin-1 (percent-encode s filename-set #f))) (cdr 
segments))) + (define basic-filename (string-join extra-encoded "#")) + basic-filename) + +(define (local-encoded-url->basename str) ; '("wiki" "Page_title"), no extension or dir prefix + (url-segments->basename (local-encoded-url->segments str))) + +(define (basename->name-for-query str) + (uri-decode (regexp-replace* #rx"#" str "/"))) + +(define (url-segments->guess-title segments) + (regexp-replace* #rx"_" (cadr segments) " ")) diff --git a/lib/html-parsing/main.rkt b/lib/html-parsing/main.rkt new file mode 100644 index 0000000..bdc09b1 --- /dev/null +++ b/lib/html-parsing/main.rkt @@ -0,0 +1,1887 @@ +#lang racket/base +;; Copyright Neil Van Dyke. For legal info, see file "info.rkt". + +(require mcfly) + +(module+ test + (require overeasy)) + +(doc (section "Introduction") + + (para "The " + (code "html-parsing") + " library provides a permissive HTML parser. The parser is useful +for software agent extraction of information from Web pages, for +programmatically transforming HTML files, and for implementing interactive Web +browsers. " + (code "html-parsing") + " emits " + ;; TODO: 2016-02-21 Once create sxml-doc package, reference that. + (seclink "top" + #:doc '(lib "sxml-intro/sxml-intro.scrbl") + #:indirect? #true + "SXML/xexp") + ", so that conventional HTML may be processed with XML tools such as +SXPath. Like Oleg Kiselyov's " + (hyperlink "http://pobox.com/~oleg/ftp/Scheme/xml.html#HTML-parser" + "SSAX-based HTML parser") + ", " + (code "html-parsing") + " provides a permissive tokenizer, but " + (code "html-parsing") + " extends this by attempting to recover syntactic structure.") + + (para "The " + (code "html-parsing") + " parsing behavior is permissive in that it accepts erroneous HTML, +handling several classes of HTML syntax errors gracefully, without yielding a +parse error. This is crucial for parsing arbitrary real-world Web pages, since +many pages actually contain syntax errors that would defeat a strict or +validating parser. 
" + (code "html-parsing") + "'s handling of errors is intended to generally emulate popular Web +browsers' interpretation of the structure of erroneous HTML.") + (para (code "html-parsing") + " also has some support for XHTML, although XML namespace qualifiers +are accepted but stripped from the resulting SXML/xexp. Note that " + (italic "valid") + " XHTML input might be better handled by a validating XML parser +like Kiselyov's SSAX.")) + +;; BEGIN COPIED FROM XEXP PACKAGE + +(define (%html-parsing:make-xexp-char-ref val) + (if (or (symbol? val) (integer? val)) + `(& ,val) + (error 'make-xexp-char-ref + "invalid xexp reference value: ~S" + val))) + +(define %html-parsing:always-empty-html-elements + '(area base br frame hr img input isindex keygen link meta param + spacer wbr)) + +;; END COPIED FROM XEXP PACKAGE + +(define %html-parsing:empty-token-symbol '*empty*) +(define %html-parsing:end-token-symbol '*end*) +(define %html-parsing:start-token-symbol '*start*) +(define %html-parsing:entity-token-symbol '*entity*) +(define %html-parsing:text-string-token-symbol '*text-string*) +(define %html-parsing:text-char-token-symbol '*text-char*) + +(define %html-parsing:make-html-tokenizer + ;; TODO: Have the tokenizer replace contiguous whitespace within individual + ;; text tokens with single space characters (except for when in `pre' and + ;; verbatim elements). The parser will introduce new contiguous whitespace + ;; (e.g., when text tokens are concatenated, invalid end tags are removed, + ;; whitespace is irrelevant between certain elements), but then the parser + ;; only has to worry about the first and last character of each string. + ;; Perhaps the text tokens should have both leading and trailing whitespace + ;; stripped, and contain flags for whether or not leading and trailing + ;; whitespace occurred. + (letrec ((no-token '()) + + ;; TODO: Maybe make these three variables options. 
+ + (verbatim-to-eof-elems '(plaintext)) + + (verbatim-pair-elems '(script server style xmp)) + + (ws-chars (list #\space + (integer->char 9) + (integer->char 10) + (integer->char 11) + (integer->char 12) + (integer->char 13))) + + (gosc/string-or-false + (lambda (os) + (let ((s (get-output-string os))) + (if (string=? s "") #f s)))) + + (gosc/symbol-or-false + (lambda (os) + (let ((s (gosc/string-or-false os))) + (if s (string->symbol s) #f)))) + ) + (lambda (in normalized?) + ;; TODO: Make a tokenizer option that causes XML namespace qualifiers to + ;; be ignored. + (letrec + ( + ;; Port buffer with inexpensive unread of one character and slightly + ;; more expensive pushback of second character to unread. The + ;; procedures themselves do no consing. The tokenizer currently + ;; needs two-symbol lookahead, due to ambiguous "/" while parsing + ;; element and attribute names, which could be either empty-tag + ;; syntax or XML qualified names. + (c #f) + (next-c #f) + (c-consumed? #t) + (read-c (lambda () + (if c-consumed? + (if next-c + (begin (set! c next-c) + (set! next-c #f)) + (set! c (read-char in))) + (set! c-consumed? #t)))) + (unread-c (lambda () + (if c-consumed? + (set! c-consumed? #f) + ;; TODO: Procedure name in error message really + ;; isn't "%html-parsing:make-html-tokenizer"... + (error '%html-parsing:make-html-tokenizer + "already unread: ~S" + c)))) + (push-c (lambda (new-c) + (if c-consumed? + (begin (set! c new-c) + (set! c-consumed? #f)) + (if next-c + (error '%html-parsing:make-html-tokenizer + "pushback full: ~S" + c) + (begin (set! next-c c) + (set! c new-c) + (set! c-consumed? #f)))))) + + ;; TODO: These procedures are a temporary convenience for + ;; enumerating the pertinent character classes, with an eye towards + ;; removing redundant tests of character class. These procedures + ;; should be eliminated in a future version. + (c-eof? (lambda () (eof-object? c))) + (c-amp? (lambda () (eqv? c #\&))) + (c-apos? (lambda () (eqv? 
c #\'))) + (c-bang? (lambda () (eqv? c #\!))) + (c-colon? (lambda () (eqv? c #\:))) + (c-quot? (lambda () (eqv? c #\"))) + (c-equals? (lambda () (eqv? c #\=))) + (c-gt? (lambda () (eqv? c #\>))) + (c-lsquare? (lambda () (eqv? c #\[))) + (c-lt? (lambda () (eqv? c #\<))) + (c-minus? (lambda () (eqv? c #\-))) + (c-pound? (lambda () (eqv? c #\#))) + (c-ques? (lambda () (eqv? c #\?))) + (c-semi? (lambda () (eqv? c #\;))) + (c-slash? (lambda () (eqv? c #\/))) + (c-splat? (lambda () (eqv? c #\*))) + (c-lf? (lambda () (eqv? c #\newline))) + (c-angle? (lambda () (memv c '(#\< #\>)))) + (c-ws? (lambda () (memv c ws-chars))) + (c-alpha? (lambda () (char-alphabetic? c))) + (c-digit? (lambda () (char-numeric? c))) + (c-alphanum? (lambda () (or (c-alpha?) (c-digit?)))) + (c-hexlet? (lambda () (memv c '(#\a #\b #\c #\d #\e #\f + #\A #\B #\C #\D #\E #\F)))) + + (skip-ws (lambda () (read-c) (if (c-ws?) (skip-ws) (unread-c)))) + + (if-read-chars + (lambda (match-chars yes-thunk no-proc) + (let loop ((chars match-chars) + (match-count 0)) + (if (null? chars) + (yes-thunk) + (begin (read-c) + (if (eqv? c (car chars)) + (begin (loop (cdr chars) (+ 1 match-count))) + (begin (unread-c) + (no-proc match-chars match-count)))))))) + + (write-chars-count + (lambda (chars count port) + (let loop ((chars chars) + (count count)) + (or (zero? count) + (begin (write-char (car chars) port) + (loop (cdr chars) + (- count 1))))))) + + (make-start-token + (if normalized? + (lambda (name ns attrs) + (list name (cons '@ attrs))) + (lambda (name ns attrs) + (if (null? attrs) + (list name) + (list name (cons '@ attrs)))))) + + (make-empty-token + (lambda (name ns attrs) + (cons %html-parsing:empty-token-symbol + (make-start-token name ns attrs)))) + + (make-end-token + (if normalized? + (lambda (name ns attrs) + (list %html-parsing:end-token-symbol + name + (cons '@ attrs))) + (lambda (name ns attrs) + (if (null? 
attrs) + (list %html-parsing:end-token-symbol name) + (list %html-parsing:end-token-symbol + name + (cons '@ attrs)))))) + + (make-comment-token + (lambda (str) (list '*COMMENT* str))) + + (make-decl-token + (lambda (parts) (cons '*DECL* parts))) + + (scan-qname + ;; TODO: Make sure we don't accept local names that have "*", since + ;; this can break SXML tools. Have to validate this afterwards if + ;; "verbatim-safe?". Also check for "@" and maybe "@@". Check + ;; qname parsing code, especially for verbatim mode. This is + ;; important! + (lambda (verbatim-safe?) + ;; Note: If we accept some invalid local names, we only need two + ;; symbols of lookahead to determine the end of a qname. + (letrec ((os #f) + (ns '()) + (vcolons 0) + (good-os (lambda () + (or os + (begin (set! os (open-output-string)) + os))))) + (let loop () + (read-c) + (cond ((c-eof?) #f) + ((or (c-ws?) (c-splat?)) + (if verbatim-safe? + (unread-c) + #f)) + ((or (c-angle?) (c-equals?) (c-quot?) (c-apos?)) + (unread-c)) + ((c-colon?) + (or (null? ns) + (set! ns (cons ":" ns))) + (if os + (begin + (set! ns (cons (get-output-string os) + ns)) + (set! os #f)) + #f) + (loop)) + ((c-slash?) + (read-c) + (cond ((or (c-eof?) + (c-ws?) + (c-equals?) + (c-apos?) + (c-quot?) + (c-angle?) + (c-splat?)) + (unread-c) + (push-c #\/)) + (else (write-char #\/ (good-os)) + (write-char c os) + (loop)))) + (else (write-char c (good-os)) + (loop)))) + (let ((ns (if (null? ns) + #f + (apply string-append + (reverse ns)))) + (localname (if os (get-output-string os) #f))) + (if verbatim-safe? + ;; TODO: Make sure we don't have ambiguous ":" or drop + ;; any characters! + (cons ns localname) + ;; Note: We represent "xml:" and "xmlns:" syntax as + ;; normal qnames, for lack of something better to do with + ;; them when we don't support XML namespaces. + ;; + ;; TODO: Local names are currently forced to lowercase, + ;; since HTML is usually case-insensitive. 
If XML + ;; namespaces are used, we might wish to keep local names + ;; case-sensitive. + (if localname + (if ns + (if (or (string=? ns "xml") + (string=? ns "xmlns")) + (string->symbol (string-append ns + ":" + localname)) + (cons ns + (string->symbol (string-downcase + localname)))) + (string->symbol (string-downcase localname))) + (if ns + (string->symbol (string-downcase ns)) + ;; TODO: Ensure in rest of code that returning #f + ;; as a name here is OK. + #f))))))) + + (scan-tag + (lambda (start?) + (skip-ws) + (let ((tag-name (scan-qname #f)) + (tag-ns #f) + (tag-attrs #f) + (tag-empty? #f)) + ;; Scan element name. + (if (pair? tag-name) + (begin (set! tag-ns (car tag-name)) + (set! tag-name (cdr tag-name))) + #f) + ;; TODO: Ensure there's no case in which a #f tag-name isn't + ;; compensated for later. + ;; + ;; Scan element attributes. + (set! tag-attrs + (let scan-attr-list () + (read-c) + (cond ((c-eof?) '()) + ((c-angle?) (unread-c) '()) + ((c-slash?) + (set! tag-empty? #t) + (scan-attr-list)) + ((c-alpha?) + (unread-c) + (let ((attr (scan-attr))) + (cons attr (scan-attr-list)))) + (else (scan-attr-list))))) + ;; Find ">" or unnatural end. + (let loop () + (read-c) + (cond ((c-eof?) no-token) + ((c-slash?) (set! tag-empty? #t) (loop)) + ((c-gt?) #f) + ((c-ws?) (loop)) + (else (unread-c)))) + ;; Change the tokenizer mode if necessary. + (cond ((not start?) #f) + (tag-empty? #f) + ;; TODO: Maybe make one alist lookup here, instead of + ;; two. + ((memq tag-name verbatim-to-eof-elems) + (set! nexttok verbeof-nexttok)) + ((memq tag-name verbatim-pair-elems) + (set! nexttok (make-verbpair-nexttok tag-name)))) + ;; Return a token object. + (if start? + (if tag-empty? + (make-empty-token tag-name tag-ns tag-attrs) + (make-start-token tag-name tag-ns tag-attrs)) + (make-end-token tag-name tag-ns tag-attrs))))) + + (scan-attr + (lambda () + (let ((name (scan-qname #f)) + (val #f)) + (if (pair? name) + (set! 
name (cdr name)) + #f) + (let loop-equals-or-end () + (read-c) + (cond ((c-eof?) no-token) + ((c-ws?) (loop-equals-or-end)) + ((c-equals?) + (let loop-quote-or-unquoted () + (read-c) + (cond ((c-eof?) no-token) + ((c-ws?) (loop-quote-or-unquoted)) + ((or (c-apos?) (c-quot?)) + (let ((term c)) + (set! val (open-output-string)) + (let loop-quoted-val () + (read-c) + (cond ((c-eof?) #f) + ((eqv? c term) #f) + ((c-amp?) (let ((entity (scan-entity))) + (display entity val) + (loop-quoted-val))) + (else (write-char c val) + (loop-quoted-val)))))) + ((c-angle?) (unread-c)) + (else + (set! val (open-output-string)) + (write-char c val) + (let loop-unquoted-val () + (read-c) + (cond ((c-eof?) no-token) + ((c-apos?) #f) + ((c-quot?) #f) + ((or (c-ws?) (c-angle?) + ;;(c-slash?) + ) + (unread-c)) + ;; Note: We can treat a slash in an + ;; unquoted attribute value as a + ;; value constituent because the + ;; slash is specially-handled only + ;; for XHTML, and XHTML attribute + ;; values must always be quoted. We + ;; could do lookahead for "/>", but + ;; that wouldn't let us parse HTML + ;; "" correctly, so this is + ;; an easier and more correct way to + ;; do things. + (else (write-char c val) + (loop-unquoted-val)))))))) + (else (unread-c)))) + (if normalized? + (list name (if val + (get-output-string val) + (symbol->string name))) + (if val + (list name (get-output-string val)) + (list name)))))) + + (scan-comment + ;; TODO: Rewrite this to use tail recursion rather than a state + ;; variable. + (lambda () + (let ((os (open-output-string)) + (state 'start-minus)) + (let loop () + (read-c) + (cond ((c-eof?) #f) + ((c-minus?) + (set! state + (case state + ((start-minus) 'start-minus-minus) + ((start-minus-minus body) 'end-minus) + ((end-minus) 'end-minus-minus) + ((end-minus-minus) (write-char #\- os) state) + (else (error '<%html-parsing:make-html-tokenizer> + "invalid state: ~S" + state)))) + (loop)) + ((and (c-gt?) (eq? 
state 'end-minus-minus)) #f) + (else (case state + ((end-minus) (write-char #\- os)) + ((end-minus-minus) (display "--" os))) + (set! state 'body) + (write-char c os) + (loop)))) + (make-comment-token (get-output-string os))))) + + (scan-possible-cdata + (lambda () + ;; Read ") + (lambda () (get-output-string os)) + (lambda (chars count) + (if (zero? count) + (if (eof-object? c) + (get-output-string os) + (begin (write-char c os) + (read-c) + (loop))) + (begin (write-char #\] os) + (if (= count 2) + (push-c #\]) + #f) + (loop))))))))) + + (scan-pi + (lambda () + (skip-ws) + (let ((name (open-output-string)) + (val (open-output-string))) + (let scan-name () + (read-c) + (cond ((c-eof?) #f) + ((c-ws?) #f) + ((c-alpha?) (write-char c name) (scan-name)) + (else (unread-c)))) + ;; TODO: Do we really want to emit #f for PI name? + (set! name (gosc/symbol-or-false name)) + (let scan-val () + (read-c) + (cond ((c-eof?) #f) + ;; ((c-amp?) (display (scan-entity) val) + ;; (scan-val)) + ((c-ques?) + (read-c) + (cond ((c-eof?) (write-char #\? val)) + ((c-gt?) #f) + (else (write-char #\? val) + (unread-c) + (scan-val)))) + (else (write-char c val) (scan-val)))) + (list '*PI* + name + (get-output-string val))))) + + (scan-decl + ;; TODO: Find if SXML includes declaration forms, and if so, use + ;; whatever format SXML wants. + ;; + ;; TODO: Rewrite to eliminate state variables. + (letrec + ((scan-parts + (lambda () + (let ((part (open-output-string)) + (nonsymbol? #f) + (state 'before) + (last? #f)) + (let loop () + (read-c) + (cond ((c-eof?) #f) + ((c-ws?) + (case state + ((before) (loop)) + ((quoted) (write-char c part) (loop)))) + ((and (c-gt?) (not (eq? state 'quoted))) + (set! last? #t)) + ((and (c-lt?) (not (eq? state 'quoted))) + (unread-c)) + ((c-quot?) + (case state + ((before) (set! state 'quoted) (loop)) + ((unquoted) (unread-c)) + ((quoted) #f))) + (else + (if (eq? state 'before) + (set! state 'unquoted) + #f) + (set! nonsymbol? (or nonsymbol? 
+ (not (c-alphanum?)))) + (write-char c part) + (loop)))) + (set! part (get-output-string part)) + (if (string=? part "") + '() + (cons (if (or (eq? state 'quoted) nonsymbol?) + part + ;; TODO: Normalize case of things we make + ;; into symbols here. + (string->symbol part)) + (if last? + '() + (scan-parts)))))))) + (lambda () (make-decl-token (scan-parts))))) + + (special-entity-reverse-chars-to-string-alist + '(((#\p #\m #\a) . "&") + ((#\s #\o #\p #\a) . "'") + ((#\t #\g) . ">") + ((#\t #\l) . "<") + ((#\t #\o #\u #\q) . "\""))) + + (finish-terminated-named-entity + (lambda (reverse-name-chars) + (cond ((equal? '() reverse-name-chars) + "&") + ((assoc reverse-name-chars + special-entity-reverse-chars-to-string-alist) + => (lambda (p) + (cdr p))) + (else (%html-parsing:make-xexp-char-ref + (string->symbol (apply string (reverse reverse-name-chars)))))))) + + (finish-unterminated-named-entity + (lambda (reverse-name-chars) + (apply string (cons #\& (reverse reverse-name-chars))))) + + (scan-entity + (lambda () + (read-c) + (cond ((c-eof?) "&") + ((c-alpha?) + ;; TODO: Do entity names have a maximum length? + (let loop ((reverse-name-chars (cons c '()))) + (read-c) + (cond ((c-eof?) (finish-unterminated-named-entity + reverse-name-chars)) + ((c-alpha?) (let ((reverse-name-chars (cons c reverse-name-chars))) + (cond ((assoc reverse-name-chars + special-entity-reverse-chars-to-string-alist) + => (lambda (p) + (read-c) + (or (c-semi?) + (unread-c)) + (cdr p))) + (else (loop reverse-name-chars))))) + ((c-semi?) (finish-terminated-named-entity + reverse-name-chars)) + (else (unread-c) + (finish-unterminated-named-entity + reverse-name-chars))))) + ((c-pound?) + (let ((num (open-output-string)) + (hex? #f)) + (read-c) + (cond ((c-eof?) #f) + ((memv c '(#\x #\X)) (set! hex? #t) (read-c))) + (let loop () + (cond ((c-eof?) #f) + ((c-semi?) #f) + ((or (c-digit?) (and hex? (c-hexlet?))) + (write-char c num) + (read-c) + (loop)) + (else (unread-c)))) + (set! 
num (get-output-string num)) + (if (string=? num "") + "&#;" + (let ((n (string->number num (if hex? 16 10)))) + (if (<= 32 n 126) + (string (integer->char n)) + (string (integer->char n))))))) + (else (unread-c) "&")))) + + (normal-nexttok + (lambda () + (read-c) + (cond ((c-eof?) no-token) + ((c-lt?) + (let loop () + (read-c) + (cond ((c-eof?) "<") + ;; ((c-ws?) (loop)) + ((c-slash?) (scan-tag #f)) + ((c-ques?) (scan-pi)) + ((c-alpha?) (unread-c) (scan-tag #t)) + ((c-bang?) + (read-c) + (if (c-lsquare?) + (scan-possible-cdata) + (let loop () + (cond ((c-eof?) no-token) + ((c-ws?) (read-c) (loop)) + ((c-minus?) (scan-comment)) + (else (unread-c) + (scan-decl)))))) + (else (unread-c) "<")))) + ((c-gt?) ">") + (else (let ((os (open-output-string))) + (let loop () + (cond ((c-eof?) #f) + ((c-angle?) (unread-c)) + ((c-amp?) + (let ((entity (scan-entity))) + (if (string? entity) + (begin (display entity os) + (read-c) + (loop)) + (let ((saved-nexttok nexttok)) + (set! nexttok + (lambda () + (set! nexttok + saved-nexttok) + entity)))))) + (else (write-char c os) + (or (c-lf?) + (begin (read-c) (loop)))))) + (let ((text (get-output-string os))) + (if (equal? text "") + (nexttok) + text))))))) + + (verbeof-nexttok + (lambda () + (read-c) + (if (c-eof?) + no-token + (let ((os (open-output-string))) + (let loop () + (or (c-eof?) + (begin (write-char c os) + (or (c-lf?) + (begin (read-c) (loop)))))) + (get-output-string os))))) + + (make-verbpair-nexttok + (lambda (elem-name) + (lambda () + (let ((os (open-output-string))) + ;; Accumulate up to a newline-terminated line. + (let loop () + (read-c) + (cond ((c-eof?) + ;; Got EOF in verbatim context, so set the normal + ;; nextok procedure, then fall out of loop. + (set! nexttok normal-nexttok)) + ((c-lt?) + ;; Got "<" in verbatim context, so get next + ;; character. + (read-c) + (cond ((c-eof?) + ;; Got "<" then EOF, so set to the normal + ;; nexttok procedure, add the "<" to the + ;; verbatim string, and fall out of loop. 
+ (set! nexttok normal-nexttok) + (write-char #\< os)) + ((c-slash?) + ;; Got "symbol + (string-downcase local)) + elem-name)) + ;; This is the terminator tag, so + ;; scan to the end of it, set the + ;; nexttok, and fall out of the loop. + (begin + (let scan-to-end () + (read-c) + (cond ((c-eof?) #f) + ((c-gt?) #f) + ((c-lt?) (unread-c)) + ((c-alpha?) + (unread-c) + ;; Note: This is an + ;; expensive way to skip + ;; over an attribute, but + ;; in practice more + ;; verbatim end tags will + ;; not have attributes. + (scan-attr) + (scan-to-end)) + (else (scan-to-end)))) + (set! nexttok + (lambda () + (set! nexttok + normal-nexttok) + (make-end-token + elem-name #f '())))) + ;; This isn't the terminator tag, so + ;; add to the verbatim string the + ;; "xexp} and related procedures, except +;; using @var{tokenizer} as a source of tokens, rather than tokenizing from an +;; input port. This procedure is used internally, and generally should not be +;; called directly. + +(define %html-parsing:parse-html/tokenizer + ;; Note: This algorithm was originally written in 2001 (as part of the first + ;; Scheme library the author ever wrote), and then on 2009-08-16 was revamped + ;; to not use mutable pairs, for PLT 4 compatibility. It could still use + ;; some work to be more FP, but it works for now. + (letrec ((empty-elements + ;; TODO: Maybe make this an option. + %html-parsing:empty-elements) + (h-elem-parents + ;; Am doing this kludge mainly for mid-1990s HTML that uses the `p` + ;; element wrong. Trying to get all appropriate parents other than + ;; `p` that I can, to reduce breaking other code. + '(a article aside blink blockquote body center footer form header html li main nav section slot template)) + (parent-constraints + ;; TODO: Maybe make this an option. + `((area . (map span)) + (body . (html)) + (caption . (table)) + (colgroup . (table)) + (dd . (dl)) + (dt . (dl)) + (frame . (frameset)) + (head . (html)) + (h1 . ,h-elem-parents) + (h2 . ,h-elem-parents) + (h3 . 
,h-elem-parents) + (h4 . ,h-elem-parents) + (h5 . ,h-elem-parents) + (h6 . ,h-elem-parents) + (isindex . (head)) + (li . (dir menu ol ul)) + (meta . (head)) + (noframes . (frameset)) + (option . (select)) + (p . (blockquote body details figcaption html li td th)) + (param . (applet)) + (tbody . (table)) + (td . (tr)) + (th . (tr)) + (thead . (table)) + (title . (head)) + (tr . (table tbody thead)))) + (token-kinds-that-always-get-added + `(*COMMENT* + *DECL* + *PI* + ,%html-parsing:entity-token-symbol + ,%html-parsing:text-string-token-symbol + ,%html-parsing:text-char-token-symbol)) + (start-tag-name (lambda (tag-token) (car tag-token))) + (end-tag-name (lambda (tag-token) (list-ref tag-token 1)))) + (lambda (tokenizer normalized?) + ;;(log-html-parsing-debug "(%html-parsing:parse-html/tokenizer ~S ~S)" tokenizer normalized?) + (let ((begs (list (vector #f '())))) + (letrec ((add-thing-as-child-of-current-beg + (lambda (tok) + (let ((beg (car begs))) + (vector-set! beg 1 (cons tok (vector-ref beg 1)))))) + + (beg->elem + (lambda (beg) + (let ((elem-name (vector-ref beg 0)) + (attrs-and-contents (reverse (vector-ref beg 1)))) + (cons elem-name attrs-and-contents)))) + + (finish-current-beg-and-return-elem + (lambda () + (let ((elem (beg->elem (car begs)))) + (set! begs (cdr begs)) + (or (null? begs) + (add-thing-as-child-of-current-beg elem)) + elem))) + + (finish-current-beg + (lambda () + (finish-current-beg-and-return-elem))) + + (finish-all-begs-and-return-top + (lambda () + (let loop () + (let ((elem (finish-current-beg-and-return-elem))) + (if (car elem) + (loop) + (cdr elem)))))) + + (finish-begs-up-to-and-including-name + (lambda (name) + ;; (log-html-parsing-debug "(finish-begs-up-to-and-including-name ~S)" name) + (let loop-find-name ((find-begs begs) + (depth 1)) + (let ((beg-name (vector-ref (car find-begs) 0))) + (cond ((not beg-name) + ;; We reached the root without finding a + ;; matching beg, so don't finish anything. 
+ ;; + ;; TODO: 2022-04-02: Consider having a `*TOP*` + ;; kludge in `parent-constraints` that's checked + ;; here, especially for handling mid-1990s HTML + ;; `p` element (so that we can keep `p` from + ;; being a child of `p` even when there's no + ;; parent `body` or `html` element). + (void)) + ((eqv? name beg-name) + ;; We found a match, so finish the begs up to + ;; depth. + (let loop-finish ((depth depth)) + (or (zero? depth) + (begin + (finish-current-beg) + (loop-finish (- depth 1)))))) + (else + ;; Didn't find a match yet, and there's still at + ;; least one more beg to look at, so recur. + (loop-find-name (cdr find-begs) + (+ depth 1)))))))) + + (finish-begs-upto-but-not-including-names + (lambda (names) + ;; (log-html-parsing-debug "(finish-begs-upto-but-not-including-names ~S)" names) + ;; (log-html-parsing-debug "begs = ~S" begs) + (let loop-find-name ((find-begs begs) + (depth 0)) + (let ((beg-name (vector-ref (car find-begs) 0))) + (cond ((not beg-name) + ;; We reached the root without finding a + ;; matching beg, so simply discard it. + (void)) + ((memq beg-name names) + ;; We found a match, so finish the begs up to + ;; depth. + (let loop-finish ((depth depth)) + (or (zero? depth) + (begin + (finish-current-beg) + (loop-finish (- depth 1)))))) + (else + ;; Didn't find a match yet, and there's still at + ;; least one more beg to look at, so recur. + (loop-find-name (cdr find-begs) + (+ depth 1))))))))) + + (let loop () + (let ((tok (tokenizer))) + (if (null? tok) + (finish-all-begs-and-return-top) + (let ((kind (%html-parsing:xexp-token-kind tok))) + ;; (log-html-parsing-debug "kind = ~S" kind) + (cond ((memv kind token-kinds-that-always-get-added) + (add-thing-as-child-of-current-beg tok)) + ((eqv? 
kind %html-parsing:start-token-symbol) + ;; (log-html-parsing-debug "is-start-token-symbol") + (let* ((name (start-tag-name tok)) + (cell (assq name parent-constraints))) + ;; (log-html-parsing-debug "name = ~S cell = ~S" name cell) + (and cell + (finish-begs-upto-but-not-including-names + (cons 'div (cdr cell)))) + (if (memq name empty-elements) + (add-thing-as-child-of-current-beg tok) + (set! begs (cons (vector (car tok) + (cdr tok)) + begs))))) + ((eqv? kind %html-parsing:empty-token-symbol) + ;; Empty tag token, so just add it to current + ;; beginning while stripping off leading `*EMPTY*' + ;; symbol so that the token becomes normal SXML + ;; element syntax. + (add-thing-as-child-of-current-beg (cdr tok))) + ((eqv? kind %html-parsing:end-token-symbol) + (let ((name (end-tag-name tok))) + (if name + ;; Try to finish to a start tag matching this + ;; end tag. If none, just drop the token, + ;; though we used to add it to the current + ;; beginning. + (finish-begs-up-to-and-including-name + name) + ;; We have an anonymous end tag, so match it + ;; with the most recent beginning. If no + ;; beginning to match, then just drop the + ;; token, though we used to add it to the + ;; current beginning. + (and (vector-ref (car begs) 0) + (finish-current-beg))))) + (else (error 'parse-html/tokenizer + "unknown tag kind: ~S" + kind))) + (loop)))))))))) + +;; TODO: Quote of message to a user: +;; +;; >I think this behavior is due to HtmlPrag's use in "parse-html/tokenizer" +;; >of its local "parent-constraints" variable. +;; > +;; >The following line of code from the variable binding expresses the +;; >constraint that any "p" element can have as immediate parent element +;; >only "body", "td", or "th": +;; > +;; > (p . (body td th)) +;; > +;; >I think I know a good heuristic for dealing with unfamiliar but +;; >seemingly well-formed elements, like "page" in this case, but I'm afraid +;; >I don't have time to implement it right now. 
(I am job-hunting right +;; >now, and there are many other coding things I need to do first.) +;; > +;; >Would adding "page" to the above line of the HtmlPrag source code work +;; >around the current problem, or do you need a better solution right now? + +;; @defproc %parse-html input normalized? top? +;; +;; This procedure is now used internally by @code{html->xexp} and its +;; variants, and should not be used directly by programs. The interface is +;; likely to change in future versions of HtmlPrag. + +(define (%html-parsing:parse-html input normalized? top?) + (let ((parse + (lambda () + (%html-parsing:parse-html/tokenizer + (%html-parsing:make-html-tokenizer + (cond ((input-port? input) input) + ((string? input) (open-input-string input)) + (else (error + '%html-parsing:parse-html + "invalid input type: ~E" + input))) + normalized?) + normalized?)))) + (if top? + (cons '*TOP* (parse)) + (parse)))) + +;; @defproc html->sxml-0nf input +;; @defprocx html->sxml-1nf input +;; @defprocx html->sxml-2nf input +;; @defprocx html->sxml input +(doc (defproc (html->xexp (input (or/c input-port? string?))) + xexp + + (para "Parse HTML permissively from " + (racket input) + ", which is either an input port or a string, and emit an +SXML/xexp equivalent or approximation. To borrow and slightly modify an +example from Kiselyov's discussion of his HTML parser:") + + (racketinput + (html->xexp + (string-append + "whatever" + " link

" + "