diff --git a/archiver/archiver-cli.rkt b/archiver/archiver-cli.rkt deleted file mode 100644 index 11f25d0..0000000 --- a/archiver/archiver-cli.rkt +++ /dev/null @@ -1,68 +0,0 @@ -#lang cli -(require charterm - "archiver.rkt") - -(help (usage "Downloads a single Fandom wiki in BreezeWiki offline format." - "" - "Downloaded pages go into `archive/` next to the executable." - "Database goes into `archiver.db*` next to the executable." - "The database is necessary to store your download progress and resume where you left off if the process is interrupted.")) - -(flag (output-quiet?) - ("-q" "--output-quiet" "disable progress output") - (output-quiet? #t)) - -(flag (output-progress?) - ("-p" "--output-progress" "progress output for terminals (default in a tty)") - (output-progress? #t)) - -(flag (output-lines?) - ("-l" "--output-lines" "output the name of each file downloaded (default outside of a tty)") - (output-lines? #t)) - -(constraint (one-of output-quiet? output-lines? output-progress?)) - - - -(program - (start [wikiname "wikiname to download"]) - ;; set up arguments - (define width 80) - (when (not (or (output-quiet?) (output-lines?) (output-progress?))) - (cond [(terminal-port? current-input-port) - (output-progress? #t)] - [else - (output-lines? #t)])) - (define (update-width) - (when (output-progress?) - (with-charterm - (call-with-values (λ () (charterm-screen-size)) - (λ (cols rows) (set! width cols)))))) - (update-width) - ;; check - (when (or (not wikiname) (equal? wikiname "")) - (raise-user-error "Please specify the wikiname to download on the command line.")) - ;; progress reporting based on selected mode - (define (report-progress a b c) - (define basename (basename->name-for-query c)) - (cond - [(output-lines?) - (displayln basename)] - [(output-progress?) - (when (eq? (modulo a 20) 0) - (thread (λ () (update-width)))) - (define prefix (format "[~a] [~a/~a] " wikiname a b)) - (define rest (- width (string-length prefix))) - (define real-width (min (string-length basename) rest)) - (define spare-width (- rest real-width)) - (define name-display (substring basename 0 real-width)) - (define whitespace (make-string spare-width #\ )) - (printf "~a~a~a\r" prefix name-display whitespace)])) - ;; download all stages - (for ([stage all-stages] - [i (in-naturals 1)]) - (printf "> Stage ~a/~a~n" i (length all-stages)) - (stage wikiname report-progress) - (displayln ""))) - -(run start) diff --git a/archiver/archiver-database.rkt b/archiver/archiver-database.rkt deleted file mode 100644 index b81ad6c..0000000 --- a/archiver/archiver-database.rkt +++ /dev/null @@ -1,81 +0,0 @@ -#lang racket/base -(require racket/file - racket/list - racket/path - racket/string - json - json-pointer - db - "../lib/syntax.rkt") - -(provide - get-slc - query-exec* - query-rows* - query-list* - query-value* - query-maybe-value* - query-maybe-row*) - -(define storage-path (anytime-path ".." "storage")) -(define database-file (build-path storage-path "archiver.db")) - -(define slc (box #f)) -(define (get-slc) - (define slc* (unbox slc)) - (cond - [slc* slc*] - [else - (make-directory* storage-path) - (define slc* (sqlite3-connect #:database database-file #:mode 'create)) - (query-exec slc* "PRAGMA journal_mode=WAL") - (define database-version - (with-handlers ([exn:fail:sql? - (λ (exn) - ; need to set up the database - (query-exec slc* "create table database_version (version integer, primary key (version))") - (query-exec slc* "insert into database_version values (0)") - 0)]) - (query-value slc* "select version from database_version"))) - - (define migrations - (wrap-sql - ((query-exec slc* "create table page (wikiname TEXT NOT NULL, basename TEXT NOT NULL, progress INTEGER NOT NULL, PRIMARY KEY (wikiname, basename))") - (query-exec slc* "create table wiki (wikiname TEXT NOT NULL, progress INTEGER, PRIMARY KEY (wikiname))")) - ((query-exec slc* "create table special_page (wikiname TEXT NOT NULL, key TEXT NOT NULL, basename TEXT NOT NULL, PRIMARY KEY (wikiname, key))")) - ((query-exec slc* "update wiki set progress = 2 where wikiname in (select wikiname from wiki inner join page using (wikiname) group by wikiname having min(page.progress) = 1)")) - ((query-exec slc* "create table image (wikiname TEXT NOT NULL, hash TEXT NTO NULL, url TEXT NOT NULL, ext TEXT, source INTEGER NOT NULL, progress INTEGER NOT NULL, PRIMARY KEY (wikiname, hash))")) - ((query-exec slc* "alter table wiki add column sitename TEXT") - (query-exec slc* "alter table wiki add column basepage TEXT") - (query-exec slc* "alter table wiki add column license_text TEXT") - (query-exec slc* "alter table wiki add column license_url TEXT")))) - - (let do-migrate-step () - (when (database-version . < . (length migrations)) - (call-with-transaction - slc* - (list-ref migrations database-version)) - (set! database-version (add1 database-version)) - (query-exec slc* "update database_version set version = $1" database-version) - (do-migrate-step))) - - (set-box! slc slc*) - slc*])) - -(define (query-exec* . args) - (apply query-exec (get-slc) args)) - -(define (query-rows* . args) - (apply query-rows (get-slc) args)) - -(define (query-list* . args) - (apply query-list (get-slc) args)) - -(define (query-value* . args) - (apply query-value (get-slc) args)) - -(define (query-maybe-value* . args) - (apply query-maybe-value (get-slc) args)) - -(define (query-maybe-row* . args) - (apply query-maybe-row (get-slc) args)) diff --git a/archiver/archiver-gui.rkt b/archiver/archiver-gui.rkt deleted file mode 100644 index 6f09cb8..0000000 --- a/archiver/archiver-gui.rkt +++ /dev/null @@ -1,347 +0,0 @@ -#lang racket/base -(require racket/class - racket/draw - racket/format - racket/list - racket/port - racket/set - racket/splicing - racket/string - db - net/http-easy - memo - (only-in racket/gui timer%) - racket/gui/easy - racket/gui/easy/operator - (only-in pict bitmap) - images/icons/arrow - images/icons/control - images/icons/stickman - images/icons/style - images/icons/symbol - "archiver-database.rkt" - "archiver.rkt" - "../lib/url-utils.rkt" - "../lib/xexpr-utils.rkt") - -(default-icon-material rubber-icon-material) - -(require (for-syntax racket/base racket/match racket/set racket/string)) - -(define-syntax (@> stx) - (define form (cdr (syntax->datum stx))) - (match form - [(list form) ; (@> (fn @obs)) - ;; identify the observables and replace with non-@ symbols - (define collection (mutable-set)) - (define updated - (let loop ([sexp form]) - (cond [(symbol? sexp) - (let ([as-s (symbol->string sexp)]) - (if (string-prefix? as-s "@") - (let ([without-@ (string->symbol (substring as-s 1))]) - (set-add! collection (cons sexp without-@)) - without-@) - sexp))] - [(pair? sexp) (cons (loop (car sexp)) (loop (cdr sexp)))] - [#t sexp]))) - (define collection-l (set->list collection)) - ;; return obs-combine -> updated-form - (datum->syntax stx `(obs-combine (λ (,@(map cdr collection-l)) ,updated) ,@(map car collection-l)))] - [(list (? string? str) args ...) ; (@> "Blah: ~a/~a" @arg1 arg2) - ;; identify the observables and replace with non-@ symbols - (define collection-l - (for/list ([arg args]) - (if (symbol? arg) - (let ([as-s (symbol->string arg)]) - (if (string-prefix? as-s "@") - (let ([without-@ (string->symbol (substring as-s 1))]) - (cons arg without-@)) - (cons #f arg))) - (cons #f arg)))) - (define collection-lo (filter car collection-l)) - ;; return obs-combine -> format - (datum->syntax stx `(obs-combine (λ (,@(map cdr collection-lo)) (format ,str ,@(map cdr collection-l))) ,@(map car collection-lo)))])) - -(define/obs @auto-retry #f) - -(define-struct qi^ (wikiname st stage progress max-progress eta th) #:transparent) ;; queue item - -(define rows (query-rows* "select wikiname, progress from wiki where progress < 4")) -(define/obs @queue null) -(define (add-wikiname-to-queue wikiname st stage) - (@queue . <~ . (λ (queue) - (define already-exists? (findf (λ (qi) (equal? (qi^-wikiname qi) wikiname)) queue)) - (if already-exists? - queue - (append queue (list (qi^ wikiname st stage 0 1 "..." #f))))))) -(for ([row rows]) - (add-wikiname-to-queue (vector-ref row 0) - (if (= (vector-ref row 1) 4) - 'complete - 'queued) - (vector-ref row 1))) - -(define status-icon-size 32) -(define status-icon-min-width 36) -(define button-icon-size 12) - -(define color-green (make-color 90 212 68)) - -(define/obs @input "") - -(splicing-let ([frame-count 30]) - (define stickman-frames - (for/vector ([s (in-range 0 1 (/ 1 frame-count))]) - (running-stickman-icon - s - #:height status-icon-size - #:material (default-icon-material)))) - - (define/obs @stick-frame-no 0) - (define stick-timer - (new timer% - [notify-callback (λ () (@stick-frame-no . <~ . add1))] - [interval (truncate (/ 1000 frame-count))])) - (define/obs @stick - (@stick-frame-no . ~> . (λ (n) (vector-ref stickman-frames - (modulo n (vector-length stickman-frames))))))) - -(define status-icons - (hasheq 'queued (stop-icon #:color syntax-icon-color #:height status-icon-size) - 'paused (continue-forward-icon #:color syntax-icon-color #:height status-icon-size) - 'running @stick - 'error (x-icon #:height status-icon-size) - 'complete (check-icon #:color color-green #:height status-icon-size))) - -(define action-icons - (hasheq 'pause (pause-icon #:color syntax-icon-color #:height button-icon-size) - 'resume (play-icon #:color color-green #:height button-icon-size) - 'reset (left-over-arrow-icon #:color halt-icon-color #:height button-icon-size))) - -(define (bitmap-view @the-bitmap [min-width 1]) - (pict-canvas #:min-size (@> (list (max min-width (send @the-bitmap get-width)) (send @the-bitmap get-height))) #;(if min-size (list min-size min-size) #f) - #:stretch '(#f #f) - #:style '(transparent) - @the-bitmap - bitmap)) - -(define (exn->string e) - (with-output-to-string - (λ () - (displayln (exn-message e)) - (displayln "context:") - (for ([item (continuation-mark-set->context (exn-continuation-marks e))]) - (printf " ~a" (srcloc->string (cdr item))) - (when (car item) - (printf ": ~a" (car item))) - (displayln ""))))) - -(define ((handle-graphical-exn @qi) e) - (displayln (exn->string e) (current-error-port)) - (cond - [(obs-peek @auto-retry) - (void) ;; TODO - #;(do-retry-end wikiname)] - [#t - (update-qi @qi [st 'error]) - (do-try-unpause-next-entry) - (thread - (λ () - (define/obs @visible? #t) - (render - (dialog #:title "Download Error" - #:style '(resize-border) - #:mixin (λ (%) (class % (super-new) - (obs-observe! @visible? (λ (visible?) (send this show visible?))))) - (vpanel #:margin '(15 15) - (text "Encountered this error while downloading:") - (input #:style '(multiple hscroll) - #:min-size '(#f 200) - (exn->string e)) - ;; TODO - #;(button "Retry Now" (λ () (:= @visible? #f) (do-retry-now wikiname))) - #;(button "Retry Round-Robin" (λ () (:= @visible? #f) (do-retry-end wikiname))) - #;(button "Skip Wiki" (λ () (:= @visible? #f) (do-continue))) - #;(button "Use Auto-Retry" (λ () - (:= @auto-retry #t) - (:= @visible? #f) - (do-retry-end wikiname))) - #;(text "Be careful not to auto-retry an infinite loop!"))) - main-window))) - (sleep) - ; make sure the broken thread is gone - (define th (qi^-th (obs-peek @qi))) - (when th (kill-thread th))])) - -(define segments - (list - (list 5/100 (make-color 0 223 217)) - (list 88/100 color-green) - (list 2/100 (make-color 0 223 217)) - (list 5/100 color-green))) -(define segment-spacing 2) -(unless (= (apply + (map car segments)) 1) - (error 'segments "segments add up to ~a, not 1" (apply + (map car segments)))) - -;; return the new bitmap, which can be drawn on a dc<%> -(define/memoize (ray-trace width height stage progress max-progress) - ;; (printf "rendering ~a ~a/~a at ~a~n" stage progress max-progress (current-inexact-milliseconds)) - (define bm (make-object bitmap% width height #f #t)) - (define dc (make-object bitmap-dc% bm)) - (define width-available (- width (* (length segments) segment-spacing))) - (send dc set-smoothing 'unsmoothed) - (send dc set-pen "black" 0 'transparent) - (for/fold ([offset 0]) - ([segment segments] - [i (in-naturals 0)]) ;; zero indexed stages? - ;; calculate start and end locations of grey bar - (define-values (segment-proportion segment-color) (apply values segment)) - (define segment-start (if (= offset 0) 0 (+ offset segment-spacing))) - (define segment-width (* width-available segment-proportion)) - ;; draw grey bar - (send dc set-brush (make-color 180 180 180 0.4) 'solid) - (send dc draw-rectangle segment-start 0 segment-width height) - ;; draw solid bar according to the current item's progress - (define proportion - (cond [(stage . < . i) 0] - [(stage . > . i) 1] - [(max-progress . <= . 0) 0] - [(progress . < . 0) 0] - [(progress . >= . max-progress) 1] - [else (progress . / . max-progress)])) - (send dc set-brush segment-color 'solid) - (send dc draw-rectangle segment-start 0 (* proportion segment-width) height) - (+ segment-start segment-width)) - (bitmap-render-icon bm 6/8)) - -;; get ray traced bitmap (possibly from cache) and draw on dc<%> -(define (draw-bar orig-dc qi) - ;; (println ray-traced) - (define-values (width height) (send orig-dc get-size)) - (send orig-dc draw-bitmap (ray-trace width height (qi^-stage qi) (qi^-progress qi) (qi^-max-progress qi)) 0 0)) - -(define ((make-progress-updater @qi) a b c) - ;; (printf "~a: ~a/~a ~a~n" (qi^-wikiname (obs-peek @qi)) a b c) - (update-qi @qi [progress a] [max-progress b])) - -(define (do-add-to-queue) - (define wikiname (string-trim (obs-peek @input))) - (when ((string-length wikiname) . > . 0) - (add-wikiname-to-queue wikiname 'queued 0)) ;; TODO: automatically start? - (:= @input "")) - -(define-syntax-rule (update-qi @qi args ...) - (let ([wikiname (qi^-wikiname (obs-peek @qi))]) - (@queue . <~ . (λ (queue) - (for/list ([qi queue]) - (if (equal? (qi^-wikiname qi) wikiname) - (struct-copy qi^ qi args ...) - qi)))))) - -(define (do-start-qi @qi) - (define th - (thread (λ () - (with-handlers ([exn? (handle-graphical-exn @qi)]) - (define last-stage - (for/last ([stage all-stages] - [i (in-naturals)]) - (update-qi @qi [stage i]) - (stage (qi^-wikiname (obs-peek @qi)) (make-progress-updater @qi)) - i)) - (update-qi @qi [st 'complete] [stage (add1 last-stage)]) - (do-try-unpause-next-entry))))) - (update-qi @qi [st 'running] [th th])) - -(define (do-stop-qi @qi) - (define th (qi^-th (obs-peek @qi))) - (when th (kill-thread th)) - (update-qi @qi [th #f] [st 'paused])) - -(define (do-reset-qi @qi) - (define th (qi^-th (obs-peek @qi))) - (when th (kill-thread th)) - (update-qi @qi [th #f] [st 'queued] [stage 0] [progress 0] [max-progress 0]) - (query-exec* "update wiki set progress = 0 where wikiname = ?" (qi^-wikiname (obs-peek @qi)))) - -(define (do-try-unpause-next-entry) - (define queue (obs-peek @queue)) - (define next-qi (for/first ([qi queue] - #:when (memq (qi^-st qi) '(paused queued error))) - qi)) - (when next-qi - (define @qi (@queue . ~> . (λ (queue) (findf (λ (qi) (equal? (qi^-wikiname qi) (qi^-wikiname next-qi))) queue)))) - (do-start-qi @qi))) - -(define main-window - (render - (window - #:title "Fandom Archiver" - #:size '(400 300) - #:mixin (λ (%) (class % - (super-new) - (define/augment (on-close) - (send stick-timer stop) - (for ([qi (obs-peek @queue)]) - (when (qi^-th qi) - (kill-thread (qi^-th qi)))) - #;(disconnect*)))) - (vpanel - #:spacing 10 - #:margin '(5 5) - (hpanel - #:stretch '(#t #f) - #:spacing 10 - (hpanel - (text "https://") - (input @input - (λ (event data) (cond - [(eq? event 'input) (:= @input data)] - [(eq? event 'return) (do-add-to-queue)]))) - (text ".fandom.com")) - (button "Download Wiki" do-add-to-queue)) - (list-view - #:style '(vertical) - @queue - #:key qi^-wikiname - (λ (k @qi) - (define @status-icons - (@> (case (qi^-st @qi) - [(running) @stick] - [else (hash-ref status-icons (qi^-st @qi))]))) - (define @is-running? - (@> (memq (qi^-st @qi) '(running)))) - (define @is-complete? - (@> (eq? (qi^-st @qi) 'complete))) - ;; state icon at the left side - (hpanel #:stretch '(#t #f) - #:alignment '(left center) - #:spacing 8 - (bitmap-view @status-icons status-icon-min-width) - (vpanel - ;; name and buttons (top half) - (hpanel #:alignment '(left bottom) - (text (@> (qi^-wikiname @qi))) - (spacer) - (hpanel - #:stretch '(#f #f) - (if-view @is-complete? - (button (hash-ref action-icons 'reset) - (λ () (do-reset-qi @qi))) - (spacer)) - (if-view @is-running? - (button (hash-ref action-icons 'pause) - (λ () (do-stop-qi @qi))) - (button (hash-ref action-icons 'resume) - (λ () (do-start-qi @qi)))))) - ;; progress bar (bottom half) - (hpanel - (canvas - @qi - #:style '(transparent) - #:margin '(3 3) - draw-bar) - (hpanel #:min-size '(68 #f) - #:stretch '(#f #f) - #:alignment '(right center) - (text (@> (format "eta ~a" (qi^-eta @qi)))))))))))))) diff --git a/archiver/archiver.rkt b/archiver/archiver.rkt deleted file mode 100644 index edd0d2b..0000000 --- a/archiver/archiver.rkt +++ /dev/null @@ -1,335 +0,0 @@ -#lang racket/base -(require racket/file - racket/function - racket/list - racket/path - racket/sequence - racket/string - net/url - net/mime - file/sha1 - net/http-easy - db - json - "archiver-database.rkt" - "../lib/html-parsing/main.rkt" - "../lib/mime-types.rkt" - "../lib/syntax.rkt" - "../lib/tree-updater.rkt" - "../lib/url-utils.rkt" - "../lib/xexpr-utils.rkt" - "../lib/archive-file-mappings.rkt") - -(provide - basename->name-for-query - image-url->values - hash->save-dir - all-stages) - -(module+ test - (require rackunit)) - -(define archive-root (anytime-path ".." "storage/archive")) -(make-directory* archive-root) - -(define sources '#hasheq((style . 1) (page . 2))) - -(define (get-origin wikiname) - (format "https://~a.fandom.com" wikiname)) - -(define (insert-wiki-entry wikiname) - (define dest-url - (format "https://~a.fandom.com/api.php?~a" - wikiname - (params->query '(("action" . "query") - ("meta" . "siteinfo") - ("siprop" . "general|rightsinfo|statistics") - ("format" . "json") - ("formatversion" . "2"))))) - (define data (response-json (get dest-url))) - (define exists? (query-maybe-value* "select progress from wiki where wikiname = ?" wikiname)) - (if (and exists? (not (sql-null? exists?))) - (query-exec* "update wiki set sitename = ?, basepage = ?, license_text = ?, license_url = ? where wikiname = ?" - (jp "/query/general/sitename" data) - (second (regexp-match #rx"/wiki/(.*)" (jp "/query/general/base" data))) - (jp "/query/rightsinfo/text" data) - (jp "/query/rightsinfo/url" data) - wikiname) - (query-exec* "insert into wiki (wikiname, progress, sitename, basepage, license_text, license_url) values (?, 0, ?, ?, ?, ?)" - wikiname - (jp "/query/general/sitename" data) - (second (regexp-match #rx"/wiki/(.*)" (jp "/query/general/base" data))) - (jp "/query/rightsinfo/text" data) - (jp "/query/rightsinfo/url" data))) - (jp "/query/statistics/articles" data)) - - -(define (check-style-for-images wikiname path) - (define content (file->string path)) - (define urls (regexp-match* #rx"url\\(\"?'?([^)]*)'?\"?\\)" content #:match-select cadr)) - (for/list ([url urls] - #:when (not (or (equal? url "") - (equal? url "'") - (string-suffix? url "\"") - (string-contains? url "/resources-ucp/") - (string-contains? url "/fonts/") - (string-contains? url "/drm_fonts/") - (string-contains? url "//db.onlinewebfonts.com/") - (string-contains? url "//bits.wikimedia.org/") - (string-contains? url "dropbox") - (string-contains? url "only=styles") - (string-contains? url "https://https://") - (regexp-match? #rx"^%20" url) - (regexp-match? #rx"^data:" url)))) - (cond - [(string-prefix? url "https://") url] - [(string-prefix? url "http://") (regexp-replace #rx"http:" url "https:")] - [(string-prefix? url "//") (string-append "https:" url)] - [(string-prefix? url "/") (format "https://~a.fandom.com~a" wikiname url)] - [else (raise-user-error "While calling check-style-for-images, this URL had an unknown format and couldn't be saved:" url path)]))) - -(define (download-styles-for-wiki wikiname callback) - (define save-dir (build-path archive-root wikiname "styles")) - (make-directory* save-dir) - (define theme (λ (theme-name) - (cons (format "https://~a.fandom.com/wikia.php?controller=ThemeApi&method=themeVariables&variant=~a" wikiname theme-name) - (build-path save-dir (format "themeVariables-~a.css" theme-name))))) - ;; (Listof (Pair url save-path)) - (define styles - (list - (theme "default") - (theme "light") - (theme "dark") - (cons (format "https://~a.fandom.com/load.php?lang=en&modules=site.styles%7Cskin.fandomdesktop.styles%7Cext.fandom.PortableInfoboxFandomDesktop.css%7Cext.fandom.GlobalComponents.CommunityHeaderBackground.css%7Cext.gadget.site-styles%2Csound-styles&only=styles&skin=fandomdesktop" wikiname) - (build-path save-dir "site.css")))) - (for ([style styles] - [i (in-naturals)]) - (callback i (length styles) "styles...") - (define r (get (car style))) - (define body (response-body r)) - (display-to-file body (cdr style) #:exists 'replace) - ;; XXX: how the HELL do I deal with @import?? would need some kind of recursion here. how will the page server know where to look up the style file to be able to serve them again? do I add another link-stylesheet tag to the main page? what about the remaining stuck @import url? - ) - (callback (length styles) (length styles) "styles...") - styles) - -(define (hash->save-dir wikiname hash) - (build-path archive-root wikiname "images" (substring hash 0 1) (substring hash 0 2))) - -(define (image-url->values i) - ;; TODO: handle case where there is multiple broken cb parameter on minecraft wiki - ;; TODO: ensure it still "works" with broken & on minecraft wiki - (define no-cb (regexp-replace #rx"\\cb=[0-9]+&?" i "")) ; remove cb url parameter which does nothing - (define key (regexp-replace #rx"[&?]$" no-cb "")) ; remove extra separator if necessary - (define hash (sha1 (string->bytes/utf-8 key))) - (cons key hash)) - - -;; 1. Download list of wiki pages and store in database, if not done yet for that wiki -(define (if-necessary-download-list-of-pages wikiname callback) - (define wiki-progress (query-maybe-value* "select progress from wiki where wikiname = ?" wikiname)) - ;; done yet? - (unless (and (real? wiki-progress) (wiki-progress . >= . 1)) - ;; Count total pages - (define num-pages (insert-wiki-entry wikiname)) - ;; Download the entire index of pages - (define basenames - (let loop ([path-with-namefrom "/wiki/Local_Sitemap"] - [basenames-previous-pages null]) - ;; Download the current index page - (define url (format "https://~a.fandom.com~a" wikiname path-with-namefrom)) - (define r (get url)) - ;; Metadata from this page (the link to the next page) - (define page (html->xexp (bytes->string/utf-8 (response-body r)))) - (define link-namefrom - ((query-selector (λ (t a c x) (and (eq? t 'a) - (pair? x) - (string-contains? (car x) "Next page") - (let ([href (get-attribute 'href a)] ) - (and href (string-contains? href "/wiki/Local_Sitemap"))))) - page #:include-text? #t))) - ;; Content from this page - (define basenames-this-page - (for/list ([link (in-producer - (query-selector - (λ (t a c) (eq? t 'a)) - ((query-selector (λ (t a c) (has-class? "mw-allpages-chunk" a)) page))) - #f)]) - (local-encoded-url->basename (get-attribute 'href (bits->attributes link))))) - ;; Call the progress callback - (define all-basenames (append basenames-previous-pages basenames-this-page)) - (callback (length all-basenames) num-pages (last all-basenames)) - ;; Recurse to download from the next page - (if link-namefrom - (loop (get-attribute 'href (bits->attributes link-namefrom)) all-basenames) - all-basenames))) - ;; Save those pages into the database - ;; SQLite can have a maximum of 32766 parameters in a single query - (for ([slice (in-slice 32760 basenames)]) - (define query-template (string-join (make-list (length slice) "(?1, ?, 0)") ", " #:before-first "insert or ignore into page (wikiname, basename, progress) values ")) - (call-with-transaction - (get-slc) - (λ () - (apply query-exec* query-template wikiname slice) - ;; Record that we have the complete list of pages - (query-exec* "update wiki set progress = 1 where wikiname = ?" wikiname)))))) - - -;; 2. Download each page via API and: -;; * Save API response to file -(define max-page-progress 1) -(define (save-each-page wikiname callback) - ;; prepare destination folder - (define save-dir (build-path archive-root wikiname)) - (make-directory* save-dir) - ;; gather list of basenames to download (that aren't yet complete) - (define basenames (query-list* "select basename from page where wikiname = ? and progress < ?" - wikiname max-page-progress)) - ;; counter of complete/incomplete basenames - (define already-done-count - (query-value* "select count(*) from page where wikiname = ? and progress = ?" - wikiname max-page-progress)) - (define not-done-count - (query-value* "select count(*) from page where wikiname = ? and progress < ?" - wikiname max-page-progress)) - (define total-count (+ already-done-count not-done-count)) - ;; set initial progress - (callback already-done-count total-count "") - ;; loop through basenames and download - (for ([basename basenames] - [i (in-naturals (add1 already-done-count))]) - (define name-for-query (basename->name-for-query basename)) - (define dest-url - (format "https://~a.fandom.com/api.php?~a" - wikiname - (params->query `(("action" . "parse") - ("page" . ,name-for-query) - ("prop" . "text|headhtml|langlinks") - ("formatversion" . "2") - ("format" . "json"))))) - (define r (get dest-url)) - (define body (response-body r)) - (define filename (string-append basename ".json")) - (define save-path - (cond [((string-length basename) . > . 240) - (define key (sha1 (string->bytes/latin-1 basename))) - (query-exec* "insert into special_page (wikiname, key, basename) values (?, ?, ?)" - wikiname key basename) - (build-path save-dir (string-append key ".json"))] - [#t - (build-path save-dir (string-append basename ".json"))])) - (display-to-file body save-path #:exists 'replace) - (query-exec* "update page set progress = 1 where wikiname = ? and basename = ?" - wikiname basename) - (callback i total-count basename)) - ;; saved all pages, register that fact in the database - (query-exec* "update wiki set progress = 2 where wikiname = ?" wikiname)) - - -;; 3. Download CSS and: -;; * Save CSS to file -;; * Record style images to database -(define (if-necessary-download-and-check-styles wikiname callback) - (define wiki-progress (query-maybe-value* "select progress from wiki where wikiname = ?" wikiname)) - (unless (and (number? wiki-progress) (wiki-progress . >= . 3)) - (define styles (download-styles-for-wiki wikiname callback)) - (define unique-image-urls - (remove-duplicates - (map image-url->values - (flatten - (for/list ([style styles]) - (check-style-for-images wikiname (cdr style))))) - #:key cdr)) - (for ([pair unique-image-urls]) - (query-exec* "insert or ignore into image (wikiname, url, hash, ext, source, progress) values (?, ?, ?, NULL, 1, 0)" wikiname (car pair) (cdr pair))) - (query-exec* "update wiki set progress = 3 where wikiname = ?" wikiname))) - - -;; 4: From downloaded pages, record URLs of image sources and inline style images to database -(define (check-json-for-images wikiname path) - (define data (with-input-from-file path (λ () (read-json)))) - (define page (html->xexp (preprocess-html-wiki (jp "/parse/text" data)))) - (define tree (update-tree-wiki page wikiname)) - null - #;(remove-duplicates - (for/list ([element (in-producer - (query-selector - (λ (t a c) - (and (eq? t 'img) - (get-attribute 'src a))) - tree) - #f)]) - (image-url->values (get-attribute 'src (bits->attributes element)))))) - - -;; 5. Download image sources and style images according to database -(define (save-each-image wikiname callback) - (define source (hash-ref sources 'style)) ;; TODO: download entire wiki images instead? - ;; gather list of basenames to download (that aren't yet complete) - (define rows (query-rows* "select url, hash from image where wikiname = ? and source <= ? and progress < 1" - wikiname source)) - ;; counter of complete/incomplete basenames - (define already-done-count - (query-value* "select count(*) from image where wikiname = ? and source <= ? and progress = 1" - wikiname source)) - (define not-done-count - (query-value* "select count(*) from image where wikiname = ? and source <= ? and progress < 1" - wikiname source)) - ;; set initial progress - (callback already-done-count (+ already-done-count not-done-count) "") - ;; loop through urls and download - (for ([row rows] - [i (in-naturals 1)]) - ;; row fragments - (define url (vector-ref row 0)) - (define hash (vector-ref row 1)) - ;; check - #; (printf "~a -> ~a~n" url hash) - (define r (get url)) - (define declared-type (response-headers-ref r 'content-type)) - (define final-type (if (equal? declared-type #"application/octet-stream") - (let ([sniff-entity (message-entity (mime-analyze (response-body r)))]) - (string->bytes/latin-1 (format "~a/~a" (entity-type sniff-entity) (entity-subtype sniff-entity)))) - declared-type)) - (define ext - (with-handlers ([exn:fail:contract? (λ _ (error 'save-each-image "no ext found for mime type `~a` in file ~a" final-type url))]) - (bytes->string/latin-1 (mime-type->ext final-type)))) - ;; save - (define save-dir (hash->save-dir wikiname hash)) - (make-directory* save-dir) - (define save-path (build-path save-dir (string-append hash "." ext))) - (define body (response-body r)) - (display-to-file body save-path #:exists 'replace) - (query-exec* "update image set progress = 1, ext = ? where wikiname = ? and hash = ?" - ext wikiname hash) - (callback (+ already-done-count i) (+ already-done-count not-done-count) (string-append (substring hash 0 6) "..." ext))) - ;; saved all images, register that fact in the database - (query-exec* "update wiki set progress = 4 where wikiname = ?" wikiname)) - -(define all-stages - (list - if-necessary-download-list-of-pages - save-each-page - if-necessary-download-and-check-styles - ;; check-json-for-images - save-each-image)) - -(module+ test - (check-equal? (html->xexp "") - '(*TOP* (img (@ (src "https://example.com/images?src=Blah.jpg&width=150"))))) - #;(download-list-of-pages "minecraft" values) - #;(save-each-page "minecraft" values) - #;(check-json-for-images "chiki" (build-path archive-root "chiki" "Fiona.json")) - #;(do-step-3 "gallowmere") - #;(save-each-image "gallowmere" (hash-ref sources 'style) (λ (a b c) (printf "~a/~a ~a~n" a b c))) - - #;(for ([wikiname (query-list* "select wikiname from wiki")]) - (println wikiname) - (insert-wiki-entry wikiname)) - - #;(for ([wikiname (query-list* "select wikiname from wiki")]) - (println wikiname) - (do-step-3 wikiname) - (save-each-image wikiname (hash-ref sources 'style) (λ (a b c) (printf "~a/~a ~a~n" a b c))))) - -; (for ([stage all-stages]) (stage "create" (λ (a b c) (printf "~a/~a ~a~n" a b c)))) diff --git a/archiver/info.rkt b/archiver/info.rkt deleted file mode 100644 index 17bb747..0000000 --- a/archiver/info.rkt +++ /dev/null @@ -1,3 +0,0 @@ -#lang info - -(define build-deps '("rackunit-lib" "web-server-lib" "http-easy-lib" "html-parsing" "html-writing" "json-pointer" "ini-lib" "memo" "net-cookies-lib" "gui-easy-lib" "sql" "charterm" "cli")) diff --git a/archiver/req.rktd b/archiver/req.rktd deleted file mode 100644 index e2d2fc2..0000000 --- a/archiver/req.rktd +++ /dev/null @@ -1 +0,0 @@ -((local ("."))) diff --git a/breezewiki.rkt b/breezewiki.rkt index 5fd34b2..a8b8c28 100644 --- a/breezewiki.rkt +++ b/breezewiki.rkt @@ -17,12 +17,9 @@ (require-reloadable "src/page-proxy.rkt" page-proxy) (require-reloadable "src/page-redirect-wiki-home.rkt" redirect-wiki-home) (require-reloadable "src/page-search.rkt" page-search) -(require-reloadable "src/page-set-user-settings.rkt" page-set-user-settings) (require-reloadable "src/page-static.rkt" static-dispatcher) -(require-reloadable "src/page-static-archive.rkt" page-static-archive) (require-reloadable "src/page-subdomain.rkt" subdomain-dispatcher) (require-reloadable "src/page-wiki.rkt" page-wiki) -(require-reloadable "src/page-wiki-offline.rkt" page-wiki-offline) (require-reloadable "src/page-file.rkt" page-file) (reload!) @@ -30,9 +27,7 @@ (define ch (make-channel)) (define (start) (serve/launch/wait - #:listen-ip (if (equal? (config-get 'bind_host) "auto") - (if (config-true? 'debug) "127.0.0.1" #f) - (config-get 'bind_host)) + #:listen-ip (if (config-true? 'debug) "127.0.0.1" #f) #:port (string->number (config-get 'port)) (λ (quit) (channel-put ch (lambda () (semaphore-post quit))) @@ -45,10 +40,7 @@ page-not-found page-proxy page-search - page-set-user-settings - page-static-archive page-wiki - page-wiki-offline page-file redirect-wiki-home static-dispatcher diff --git a/dist.rkt b/dist.rkt index 2e46f8c..777e81a 100644 --- a/dist.rkt +++ b/dist.rkt @@ -11,18 +11,13 @@ (require (only-in "src/page-proxy.rkt" page-proxy)) (require (only-in "src/page-redirect-wiki-home.rkt" redirect-wiki-home)) (require (only-in "src/page-search.rkt" page-search)) -(require (only-in "src/page-set-user-settings.rkt" page-set-user-settings)) (require (only-in "src/page-static.rkt" static-dispatcher)) -(require (only-in "src/page-static-archive.rkt" page-static-archive)) (require (only-in "src/page-subdomain.rkt" subdomain-dispatcher)) (require (only-in "src/page-wiki.rkt" page-wiki)) -(require (only-in "src/page-wiki-offline.rkt" page-wiki-offline)) (require (only-in "src/page-file.rkt" page-file)) (serve/launch/wait - #:listen-ip (if (equal? (config-get 'bind_host) "auto") - (if (config-true? 'debug) "127.0.0.1" #f) - (config-get 'bind_host)) + #:listen-ip (if (config-true? 'debug) "127.0.0.1" #f) #:port (string->number (config-get 'port)) (λ (quit) (dispatcher-tree @@ -34,10 +29,7 @@ page-not-found page-proxy page-search - page-set-user-settings - page-static-archive page-wiki - page-wiki-offline page-file redirect-wiki-home static-dispatcher diff --git a/info.rkt b/info.rkt index c290d5b..46512df 100644 --- a/info.rkt +++ b/info.rkt @@ -1,3 +1,3 @@ #lang info -(define build-deps '("rackunit-lib" "web-server-lib" "http-easy-lib" "html-parsing" "html-writing" "json-pointer" "typed-ini-lib" "memo" "net-cookies-lib" "db")) +(define build-deps '("rackunit-lib" "web-server-lib" "http-easy-lib" "html-parsing" "html-writing" "json-pointer" "ini-lib" "memo")) diff --git a/lib/archive-file-mappings.rkt b/lib/archive-file-mappings.rkt deleted file mode 100644 index 4aa8a69..0000000 --- a/lib/archive-file-mappings.rkt +++ /dev/null @@ -1,28 +0,0 @@ -#lang racket/base -(require racket/string - net/url - (only-in net/uri-codec uri-decode) - "url-utils.rkt") -(provide - local-encoded-url->segments - url-segments->basename - local-encoded-url->basename - basename->name-for-query - url-segments->guess-title) - -(define (local-encoded-url->segments str) ; '("wiki" "Page_title") - (map path/param-path (url-path (string->url str)))) - -(define (url-segments->basename segments) ; "Page_title" filename encoded, no extension or dir prefix - (define extra-encoded (map (λ (s) (bytes->string/latin-1 (percent-encode s filename-set #f))) (cdr segments))) - (define basic-filename (string-join extra-encoded "#")) - basic-filename) - -(define (local-encoded-url->basename str) ; '("wiki" "Page_title"), no extension or dir prefix - (url-segments->basename (local-encoded-url->segments str))) - -(define (basename->name-for-query str) - (uri-decode (regexp-replace* #rx"#" str "/"))) - -(define (url-segments->guess-title segments) - (regexp-replace* #rx"_" (cadr segments) " ")) diff --git a/lib/html-parsing/main.rkt b/lib/html-parsing/main.rkt deleted file mode 100644 index bdc09b1..0000000 --- a/lib/html-parsing/main.rkt +++ /dev/null @@ -1,1887 +0,0 @@ -#lang racket/base -;; Copyright Neil Van Dyke. For legal info, see file "info.rkt". - -(require mcfly) - -(module+ test - (require overeasy)) - -(doc (section "Introduction") - - (para "The " - (code "html-parsing") - " library provides a permissive HTML parser. The parser is useful -for software agent extraction of information from Web pages, for -programmatically transforming HTML files, and for implementing interactive Web -browsers. " - (code "html-parsing") - " emits " - ;; TODO: 2016-02-21 Once create sxml-doc package, reference that. - (seclink "top" - #:doc '(lib "sxml-intro/sxml-intro.scrbl") - #:indirect? #true - "SXML/xexp") - ", so that conventional HTML may be processed with XML tools such as -SXPath. Like Oleg Kiselyov's " - (hyperlink "http://pobox.com/~oleg/ftp/Scheme/xml.html#HTML-parser" - "SSAX-based HTML parser") - ", " - (code "html-parsing") - " provides a permissive tokenizer, but " - (code "html-parsing") - " extends this by attempting to recover syntactic structure.") - - (para "The " - (code "html-parsing") - " parsing behavior is permissive in that it accepts erroneous HTML, -handling several classes of HTML syntax errors gracefully, without yielding a -parse error. This is crucial for parsing arbitrary real-world Web pages, since -many pages actually contain syntax errors that would defeat a strict or -validating parser. " - (code "html-parsing") - "'s handling of errors is intended to generally emulate popular Web -browsers' interpretation of the structure of erroneous HTML.") - (para (code "html-parsing") - " also has some support for XHTML, although XML namespace qualifiers -are accepted but stripped from the resulting SXML/xexp. Note that " - (italic "valid") - " XHTML input might be better handled by a validating XML parser -like Kiselyov's SSAX.")) - -;; BEGIN COPIED FROM XEXP PACKAGE - -(define (%html-parsing:make-xexp-char-ref val) - (if (or (symbol? val) (integer? val)) - `(& ,val) - (error 'make-xexp-char-ref - "invalid xexp reference value: ~S" - val))) - -(define %html-parsing:always-empty-html-elements - '(area base br frame hr img input isindex keygen link meta param - spacer wbr)) - -;; END COPIED FROM XEXP PACKAGE - -(define %html-parsing:empty-token-symbol '*empty*) -(define %html-parsing:end-token-symbol '*end*) -(define %html-parsing:start-token-symbol '*start*) -(define %html-parsing:entity-token-symbol '*entity*) -(define %html-parsing:text-string-token-symbol '*text-string*) -(define %html-parsing:text-char-token-symbol '*text-char*) - -(define %html-parsing:make-html-tokenizer - ;; TODO: Have the tokenizer replace contiguous whitespace within individual - ;; text tokens with single space characters (except for when in `pre' and - ;; verbatim elements). The parser will introduce new contiguous whitespace - ;; (e.g., when text tokens are concatenated, invalid end tags are removed, - ;; whitespace is irrelevant between certain elements), but then the parser - ;; only has to worry about the first and last character of each string. - ;; Perhaps the text tokens should have both leading and trailing whitespace - ;; stripped, and contain flags for whether or not leading and trailing - ;; whitespace occurred. - (letrec ((no-token '()) - - ;; TODO: Maybe make these three variables options. - - (verbatim-to-eof-elems '(plaintext)) - - (verbatim-pair-elems '(script server style xmp)) - - (ws-chars (list #\space - (integer->char 9) - (integer->char 10) - (integer->char 11) - (integer->char 12) - (integer->char 13))) - - (gosc/string-or-false - (lambda (os) - (let ((s (get-output-string os))) - (if (string=? s "") #f s)))) - - (gosc/symbol-or-false - (lambda (os) - (let ((s (gosc/string-or-false os))) - (if s (string->symbol s) #f)))) - ) - (lambda (in normalized?) - ;; TODO: Make a tokenizer option that causes XML namespace qualifiers to - ;; be ignored. - (letrec - ( - ;; Port buffer with inexpensive unread of one character and slightly - ;; more expensive pushback of second character to unread. The - ;; procedures themselves do no consing. The tokenizer currently - ;; needs two-symbol lookahead, due to ambiguous "/" while parsing - ;; element and attribute names, which could be either empty-tag - ;; syntax or XML qualified names. - (c #f) - (next-c #f) - (c-consumed? #t) - (read-c (lambda () - (if c-consumed? - (if next-c - (begin (set! c next-c) - (set! next-c #f)) - (set! c (read-char in))) - (set! c-consumed? #t)))) - (unread-c (lambda () - (if c-consumed? - (set! c-consumed? #f) - ;; TODO: Procedure name in error message really - ;; isn't "%html-parsing:make-html-tokenizer"... - (error '%html-parsing:make-html-tokenizer - "already unread: ~S" - c)))) - (push-c (lambda (new-c) - (if c-consumed? - (begin (set! c new-c) - (set! c-consumed? #f)) - (if next-c - (error '%html-parsing:make-html-tokenizer - "pushback full: ~S" - c) - (begin (set! next-c c) - (set! c new-c) - (set! c-consumed? #f)))))) - - ;; TODO: These procedures are a temporary convenience for - ;; enumerating the pertinent character classes, with an eye towards - ;; removing redundant tests of character class. These procedures - ;; should be eliminated in a future version. - (c-eof? (lambda () (eof-object? c))) - (c-amp? (lambda () (eqv? c #\&))) - (c-apos? (lambda () (eqv? c #\'))) - (c-bang? (lambda () (eqv? c #\!))) - (c-colon? (lambda () (eqv? c #\:))) - (c-quot? (lambda () (eqv? c #\"))) - (c-equals? (lambda () (eqv? c #\=))) - (c-gt? (lambda () (eqv? c #\>))) - (c-lsquare? (lambda () (eqv? c #\[))) - (c-lt? (lambda () (eqv? c #\<))) - (c-minus? (lambda () (eqv? c #\-))) - (c-pound? (lambda () (eqv? c #\#))) - (c-ques? (lambda () (eqv? c #\?))) - (c-semi? (lambda () (eqv? c #\;))) - (c-slash? (lambda () (eqv? c #\/))) - (c-splat? (lambda () (eqv? c #\*))) - (c-lf? (lambda () (eqv? c #\newline))) - (c-angle? (lambda () (memv c '(#\< #\>)))) - (c-ws? (lambda () (memv c ws-chars))) - (c-alpha? (lambda () (char-alphabetic? c))) - (c-digit? (lambda () (char-numeric? c))) - (c-alphanum? (lambda () (or (c-alpha?) (c-digit?)))) - (c-hexlet? (lambda () (memv c '(#\a #\b #\c #\d #\e #\f - #\A #\B #\C #\D #\E #\F)))) - - (skip-ws (lambda () (read-c) (if (c-ws?) (skip-ws) (unread-c)))) - - (if-read-chars - (lambda (match-chars yes-thunk no-proc) - (let loop ((chars match-chars) - (match-count 0)) - (if (null? chars) - (yes-thunk) - (begin (read-c) - (if (eqv? c (car chars)) - (begin (loop (cdr chars) (+ 1 match-count))) - (begin (unread-c) - (no-proc match-chars match-count)))))))) - - (write-chars-count - (lambda (chars count port) - (let loop ((chars chars) - (count count)) - (or (zero? count) - (begin (write-char (car chars) port) - (loop (cdr chars) - (- count 1))))))) - - (make-start-token - (if normalized? - (lambda (name ns attrs) - (list name (cons '@ attrs))) - (lambda (name ns attrs) - (if (null? attrs) - (list name) - (list name (cons '@ attrs)))))) - - (make-empty-token - (lambda (name ns attrs) - (cons %html-parsing:empty-token-symbol - (make-start-token name ns attrs)))) - - (make-end-token - (if normalized? - (lambda (name ns attrs) - (list %html-parsing:end-token-symbol - name - (cons '@ attrs))) - (lambda (name ns attrs) - (if (null? attrs) - (list %html-parsing:end-token-symbol name) - (list %html-parsing:end-token-symbol - name - (cons '@ attrs)))))) - - (make-comment-token - (lambda (str) (list '*COMMENT* str))) - - (make-decl-token - (lambda (parts) (cons '*DECL* parts))) - - (scan-qname - ;; TODO: Make sure we don't accept local names that have "*", since - ;; this can break SXML tools. Have to validate this afterwards if - ;; "verbatim-safe?". Also check for "@" and maybe "@@". Check - ;; qname parsing code, especially for verbatim mode. This is - ;; important! - (lambda (verbatim-safe?) - ;; Note: If we accept some invalid local names, we only need two - ;; symbols of lookahead to determine the end of a qname. - (letrec ((os #f) - (ns '()) - (vcolons 0) - (good-os (lambda () - (or os - (begin (set! os (open-output-string)) - os))))) - (let loop () - (read-c) - (cond ((c-eof?) #f) - ((or (c-ws?) (c-splat?)) - (if verbatim-safe? - (unread-c) - #f)) - ((or (c-angle?) (c-equals?) (c-quot?) (c-apos?)) - (unread-c)) - ((c-colon?) - (or (null? ns) - (set! ns (cons ":" ns))) - (if os - (begin - (set! ns (cons (get-output-string os) - ns)) - (set! os #f)) - #f) - (loop)) - ((c-slash?) - (read-c) - (cond ((or (c-eof?) - (c-ws?) - (c-equals?) - (c-apos?) - (c-quot?) - (c-angle?) - (c-splat?)) - (unread-c) - (push-c #\/)) - (else (write-char #\/ (good-os)) - (write-char c os) - (loop)))) - (else (write-char c (good-os)) - (loop)))) - (let ((ns (if (null? ns) - #f - (apply string-append - (reverse ns)))) - (localname (if os (get-output-string os) #f))) - (if verbatim-safe? - ;; TODO: Make sure we don't have ambiguous ":" or drop - ;; any characters! - (cons ns localname) - ;; Note: We represent "xml:" and "xmlns:" syntax as - ;; normal qnames, for lack of something better to do with - ;; them when we don't support XML namespaces. - ;; - ;; TODO: Local names are currently forced to lowercase, - ;; since HTML is usually case-insensitive. If XML - ;; namespaces are used, we might wish to keep local names - ;; case-sensitive. - (if localname - (if ns - (if (or (string=? ns "xml") - (string=? ns "xmlns")) - (string->symbol (string-append ns - ":" - localname)) - (cons ns - (string->symbol (string-downcase - localname)))) - (string->symbol (string-downcase localname))) - (if ns - (string->symbol (string-downcase ns)) - ;; TODO: Ensure in rest of code that returning #f - ;; as a name here is OK. - #f))))))) - - (scan-tag - (lambda (start?) - (skip-ws) - (let ((tag-name (scan-qname #f)) - (tag-ns #f) - (tag-attrs #f) - (tag-empty? #f)) - ;; Scan element name. - (if (pair? tag-name) - (begin (set! tag-ns (car tag-name)) - (set! tag-name (cdr tag-name))) - #f) - ;; TODO: Ensure there's no case in which a #f tag-name isn't - ;; compensated for later. - ;; - ;; Scan element attributes. - (set! tag-attrs - (let scan-attr-list () - (read-c) - (cond ((c-eof?) '()) - ((c-angle?) (unread-c) '()) - ((c-slash?) - (set! tag-empty? #t) - (scan-attr-list)) - ((c-alpha?) - (unread-c) - (let ((attr (scan-attr))) - (cons attr (scan-attr-list)))) - (else (scan-attr-list))))) - ;; Find ">" or unnatural end. - (let loop () - (read-c) - (cond ((c-eof?) no-token) - ((c-slash?) (set! tag-empty? #t) (loop)) - ((c-gt?) #f) - ((c-ws?) (loop)) - (else (unread-c)))) - ;; Change the tokenizer mode if necessary. - (cond ((not start?) #f) - (tag-empty? #f) - ;; TODO: Maybe make one alist lookup here, instead of - ;; two. - ((memq tag-name verbatim-to-eof-elems) - (set! nexttok verbeof-nexttok)) - ((memq tag-name verbatim-pair-elems) - (set! nexttok (make-verbpair-nexttok tag-name)))) - ;; Return a token object. - (if start? - (if tag-empty? - (make-empty-token tag-name tag-ns tag-attrs) - (make-start-token tag-name tag-ns tag-attrs)) - (make-end-token tag-name tag-ns tag-attrs))))) - - (scan-attr - (lambda () - (let ((name (scan-qname #f)) - (val #f)) - (if (pair? name) - (set! name (cdr name)) - #f) - (let loop-equals-or-end () - (read-c) - (cond ((c-eof?) no-token) - ((c-ws?) (loop-equals-or-end)) - ((c-equals?) - (let loop-quote-or-unquoted () - (read-c) - (cond ((c-eof?) no-token) - ((c-ws?) (loop-quote-or-unquoted)) - ((or (c-apos?) (c-quot?)) - (let ((term c)) - (set! val (open-output-string)) - (let loop-quoted-val () - (read-c) - (cond ((c-eof?) #f) - ((eqv? c term) #f) - ((c-amp?) (let ((entity (scan-entity))) - (display entity val) - (loop-quoted-val))) - (else (write-char c val) - (loop-quoted-val)))))) - ((c-angle?) (unread-c)) - (else - (set! val (open-output-string)) - (write-char c val) - (let loop-unquoted-val () - (read-c) - (cond ((c-eof?) no-token) - ((c-apos?) #f) - ((c-quot?) #f) - ((or (c-ws?) (c-angle?) - ;;(c-slash?) - ) - (unread-c)) - ;; Note: We can treat a slash in an - ;; unquoted attribute value as a - ;; value constituent because the - ;; slash is specially-handled only - ;; for XHTML, and XHTML attribute - ;; values must always be quoted. We - ;; could do lookahead for "/>", but - ;; that wouldn't let us parse HTML - ;; "" correctly, so this is - ;; an easier and more correct way to - ;; do things. - (else (write-char c val) - (loop-unquoted-val)))))))) - (else (unread-c)))) - (if normalized? - (list name (if val - (get-output-string val) - (symbol->string name))) - (if val - (list name (get-output-string val)) - (list name)))))) - - (scan-comment - ;; TODO: Rewrite this to use tail recursion rather than a state - ;; variable. - (lambda () - (let ((os (open-output-string)) - (state 'start-minus)) - (let loop () - (read-c) - (cond ((c-eof?) #f) - ((c-minus?) - (set! state - (case state - ((start-minus) 'start-minus-minus) - ((start-minus-minus body) 'end-minus) - ((end-minus) 'end-minus-minus) - ((end-minus-minus) (write-char #\- os) state) - (else (error '<%html-parsing:make-html-tokenizer> - "invalid state: ~S" - state)))) - (loop)) - ((and (c-gt?) (eq? state 'end-minus-minus)) #f) - (else (case state - ((end-minus) (write-char #\- os)) - ((end-minus-minus) (display "--" os))) - (set! state 'body) - (write-char c os) - (loop)))) - (make-comment-token (get-output-string os))))) - - (scan-possible-cdata - (lambda () - ;; Read ") - (lambda () (get-output-string os)) - (lambda (chars count) - (if (zero? count) - (if (eof-object? c) - (get-output-string os) - (begin (write-char c os) - (read-c) - (loop))) - (begin (write-char #\] os) - (if (= count 2) - (push-c #\]) - #f) - (loop))))))))) - - (scan-pi - (lambda () - (skip-ws) - (let ((name (open-output-string)) - (val (open-output-string))) - (let scan-name () - (read-c) - (cond ((c-eof?) #f) - ((c-ws?) #f) - ((c-alpha?) (write-char c name) (scan-name)) - (else (unread-c)))) - ;; TODO: Do we really want to emit #f for PI name? - (set! name (gosc/symbol-or-false name)) - (let scan-val () - (read-c) - (cond ((c-eof?) #f) - ;; ((c-amp?) (display (scan-entity) val) - ;; (scan-val)) - ((c-ques?) - (read-c) - (cond ((c-eof?) (write-char #\? val)) - ((c-gt?) #f) - (else (write-char #\? val) - (unread-c) - (scan-val)))) - (else (write-char c val) (scan-val)))) - (list '*PI* - name - (get-output-string val))))) - - (scan-decl - ;; TODO: Find if SXML includes declaration forms, and if so, use - ;; whatever format SXML wants. - ;; - ;; TODO: Rewrite to eliminate state variables. - (letrec - ((scan-parts - (lambda () - (let ((part (open-output-string)) - (nonsymbol? #f) - (state 'before) - (last? #f)) - (let loop () - (read-c) - (cond ((c-eof?) #f) - ((c-ws?) - (case state - ((before) (loop)) - ((quoted) (write-char c part) (loop)))) - ((and (c-gt?) (not (eq? state 'quoted))) - (set! last? #t)) - ((and (c-lt?) (not (eq? state 'quoted))) - (unread-c)) - ((c-quot?) - (case state - ((before) (set! state 'quoted) (loop)) - ((unquoted) (unread-c)) - ((quoted) #f))) - (else - (if (eq? state 'before) - (set! state 'unquoted) - #f) - (set! nonsymbol? (or nonsymbol? - (not (c-alphanum?)))) - (write-char c part) - (loop)))) - (set! part (get-output-string part)) - (if (string=? part "") - '() - (cons (if (or (eq? state 'quoted) nonsymbol?) - part - ;; TODO: Normalize case of things we make - ;; into symbols here. - (string->symbol part)) - (if last? - '() - (scan-parts)))))))) - (lambda () (make-decl-token (scan-parts))))) - - (special-entity-reverse-chars-to-string-alist - '(((#\p #\m #\a) . "&") - ((#\s #\o #\p #\a) . "'") - ((#\t #\g) . ">") - ((#\t #\l) . "<") - ((#\t #\o #\u #\q) . "\""))) - - (finish-terminated-named-entity - (lambda (reverse-name-chars) - (cond ((equal? '() reverse-name-chars) - "&") - ((assoc reverse-name-chars - special-entity-reverse-chars-to-string-alist) - => (lambda (p) - (cdr p))) - (else (%html-parsing:make-xexp-char-ref - (string->symbol (apply string (reverse reverse-name-chars)))))))) - - (finish-unterminated-named-entity - (lambda (reverse-name-chars) - (apply string (cons #\& (reverse reverse-name-chars))))) - - (scan-entity - (lambda () - (read-c) - (cond ((c-eof?) "&") - ((c-alpha?) - ;; TODO: Do entity names have a maximum length? - (let loop ((reverse-name-chars (cons c '()))) - (read-c) - (cond ((c-eof?) (finish-unterminated-named-entity - reverse-name-chars)) - ((c-alpha?) (let ((reverse-name-chars (cons c reverse-name-chars))) - (cond ((assoc reverse-name-chars - special-entity-reverse-chars-to-string-alist) - => (lambda (p) - (read-c) - (or (c-semi?) - (unread-c)) - (cdr p))) - (else (loop reverse-name-chars))))) - ((c-semi?) (finish-terminated-named-entity - reverse-name-chars)) - (else (unread-c) - (finish-unterminated-named-entity - reverse-name-chars))))) - ((c-pound?) - (let ((num (open-output-string)) - (hex? #f)) - (read-c) - (cond ((c-eof?) #f) - ((memv c '(#\x #\X)) (set! hex? #t) (read-c))) - (let loop () - (cond ((c-eof?) #f) - ((c-semi?) #f) - ((or (c-digit?) (and hex? (c-hexlet?))) - (write-char c num) - (read-c) - (loop)) - (else (unread-c)))) - (set! num (get-output-string num)) - (if (string=? num "") - "&#;" - (let ((n (string->number num (if hex? 16 10)))) - (if (<= 32 n 126) - (string (integer->char n)) - (string (integer->char n))))))) - (else (unread-c) "&")))) - - (normal-nexttok - (lambda () - (read-c) - (cond ((c-eof?) no-token) - ((c-lt?) - (let loop () - (read-c) - (cond ((c-eof?) "<") - ;; ((c-ws?) (loop)) - ((c-slash?) (scan-tag #f)) - ((c-ques?) (scan-pi)) - ((c-alpha?) (unread-c) (scan-tag #t)) - ((c-bang?) - (read-c) - (if (c-lsquare?) - (scan-possible-cdata) - (let loop () - (cond ((c-eof?) no-token) - ((c-ws?) (read-c) (loop)) - ((c-minus?) (scan-comment)) - (else (unread-c) - (scan-decl)))))) - (else (unread-c) "<")))) - ((c-gt?) ">") - (else (let ((os (open-output-string))) - (let loop () - (cond ((c-eof?) #f) - ((c-angle?) (unread-c)) - ((c-amp?) - (let ((entity (scan-entity))) - (if (string? entity) - (begin (display entity os) - (read-c) - (loop)) - (let ((saved-nexttok nexttok)) - (set! nexttok - (lambda () - (set! nexttok - saved-nexttok) - entity)))))) - (else (write-char c os) - (or (c-lf?) - (begin (read-c) (loop)))))) - (let ((text (get-output-string os))) - (if (equal? text "") - (nexttok) - text))))))) - - (verbeof-nexttok - (lambda () - (read-c) - (if (c-eof?) - no-token - (let ((os (open-output-string))) - (let loop () - (or (c-eof?) - (begin (write-char c os) - (or (c-lf?) - (begin (read-c) (loop)))))) - (get-output-string os))))) - - (make-verbpair-nexttok - (lambda (elem-name) - (lambda () - (let ((os (open-output-string))) - ;; Accumulate up to a newline-terminated line. - (let loop () - (read-c) - (cond ((c-eof?) - ;; Got EOF in verbatim context, so set the normal - ;; nextok procedure, then fall out of loop. - (set! nexttok normal-nexttok)) - ((c-lt?) - ;; Got "<" in verbatim context, so get next - ;; character. - (read-c) - (cond ((c-eof?) - ;; Got "<" then EOF, so set to the normal - ;; nexttok procedure, add the "<" to the - ;; verbatim string, and fall out of loop. - (set! nexttok normal-nexttok) - (write-char #\< os)) - ((c-slash?) - ;; Got "symbol - (string-downcase local)) - elem-name)) - ;; This is the terminator tag, so - ;; scan to the end of it, set the - ;; nexttok, and fall out of the loop. - (begin - (let scan-to-end () - (read-c) - (cond ((c-eof?) #f) - ((c-gt?) #f) - ((c-lt?) (unread-c)) - ((c-alpha?) - (unread-c) - ;; Note: This is an - ;; expensive way to skip - ;; over an attribute, but - ;; in practice more - ;; verbatim end tags will - ;; not have attributes. - (scan-attr) - (scan-to-end)) - (else (scan-to-end)))) - (set! nexttok - (lambda () - (set! nexttok - normal-nexttok) - (make-end-token - elem-name #f '())))) - ;; This isn't the terminator tag, so - ;; add to the verbatim string the - ;; "xexp} and related procedures, except -;; using @var{tokenizer} as a source of tokens, rather than tokenizing from an -;; input port. This procedure is used internally, and generally should not be -;; called directly. - -(define %html-parsing:parse-html/tokenizer - ;; Note: This algorithm was originally written in 2001 (as part of the first - ;; Scheme library the author ever wrote), and then on 2009-08-16 was revamped - ;; to not use mutable pairs, for PLT 4 compatibility. It could still use - ;; some work to be more FP, but it works for now. - (letrec ((empty-elements - ;; TODO: Maybe make this an option. - %html-parsing:empty-elements) - (h-elem-parents - ;; Am doing this kludge mainly for mid-1990s HTML that uses the `p` - ;; element wrong. Trying to get all appropriate parents other than - ;; `p` that I can, to reduce breaking other code. - '(a article aside blink blockquote body center footer form header html li main nav section slot template)) - (parent-constraints - ;; TODO: Maybe make this an option. - `((area . (map span)) - (body . (html)) - (caption . (table)) - (colgroup . (table)) - (dd . (dl)) - (dt . (dl)) - (frame . (frameset)) - (head . (html)) - (h1 . ,h-elem-parents) - (h2 . ,h-elem-parents) - (h3 . ,h-elem-parents) - (h4 . ,h-elem-parents) - (h5 . ,h-elem-parents) - (h6 . ,h-elem-parents) - (isindex . (head)) - (li . (dir menu ol ul)) - (meta . (head)) - (noframes . (frameset)) - (option . (select)) - (p . (blockquote body details figcaption html li td th)) - (param . (applet)) - (tbody . (table)) - (td . (tr)) - (th . (tr)) - (thead . (table)) - (title . (head)) - (tr . (table tbody thead)))) - (token-kinds-that-always-get-added - `(*COMMENT* - *DECL* - *PI* - ,%html-parsing:entity-token-symbol - ,%html-parsing:text-string-token-symbol - ,%html-parsing:text-char-token-symbol)) - (start-tag-name (lambda (tag-token) (car tag-token))) - (end-tag-name (lambda (tag-token) (list-ref tag-token 1)))) - (lambda (tokenizer normalized?) - ;;(log-html-parsing-debug "(%html-parsing:parse-html/tokenizer ~S ~S)" tokenizer normalized?) - (let ((begs (list (vector #f '())))) - (letrec ((add-thing-as-child-of-current-beg - (lambda (tok) - (let ((beg (car begs))) - (vector-set! beg 1 (cons tok (vector-ref beg 1)))))) - - (beg->elem - (lambda (beg) - (let ((elem-name (vector-ref beg 0)) - (attrs-and-contents (reverse (vector-ref beg 1)))) - (cons elem-name attrs-and-contents)))) - - (finish-current-beg-and-return-elem - (lambda () - (let ((elem (beg->elem (car begs)))) - (set! begs (cdr begs)) - (or (null? begs) - (add-thing-as-child-of-current-beg elem)) - elem))) - - (finish-current-beg - (lambda () - (finish-current-beg-and-return-elem))) - - (finish-all-begs-and-return-top - (lambda () - (let loop () - (let ((elem (finish-current-beg-and-return-elem))) - (if (car elem) - (loop) - (cdr elem)))))) - - (finish-begs-up-to-and-including-name - (lambda (name) - ;; (log-html-parsing-debug "(finish-begs-up-to-and-including-name ~S)" name) - (let loop-find-name ((find-begs begs) - (depth 1)) - (let ((beg-name (vector-ref (car find-begs) 0))) - (cond ((not beg-name) - ;; We reached the root without finding a - ;; matching beg, so don't finish anything. - ;; - ;; TODO: 2022-04-02: Consider having a `*TOP*` - ;; kludge in `parent-constraints` that's checked - ;; here, especially for handling mid-1990s HTML - ;; `p` element (so that we can keep `p` from - ;; being a child of `p` even when there's no - ;; parent `body` or `html` element). - (void)) - ((eqv? name beg-name) - ;; We found a match, so finish the begs up to - ;; depth. - (let loop-finish ((depth depth)) - (or (zero? depth) - (begin - (finish-current-beg) - (loop-finish (- depth 1)))))) - (else - ;; Didn't find a match yet, and there's still at - ;; least one more beg to look at, so recur. - (loop-find-name (cdr find-begs) - (+ depth 1)))))))) - - (finish-begs-upto-but-not-including-names - (lambda (names) - ;; (log-html-parsing-debug "(finish-begs-upto-but-not-including-names ~S)" names) - ;; (log-html-parsing-debug "begs = ~S" begs) - (let loop-find-name ((find-begs begs) - (depth 0)) - (let ((beg-name (vector-ref (car find-begs) 0))) - (cond ((not beg-name) - ;; We reached the root without finding a - ;; matching beg, so simply discard it. - (void)) - ((memq beg-name names) - ;; We found a match, so finish the begs up to - ;; depth. - (let loop-finish ((depth depth)) - (or (zero? depth) - (begin - (finish-current-beg) - (loop-finish (- depth 1)))))) - (else - ;; Didn't find a match yet, and there's still at - ;; least one more beg to look at, so recur. - (loop-find-name (cdr find-begs) - (+ depth 1))))))))) - - (let loop () - (let ((tok (tokenizer))) - (if (null? tok) - (finish-all-begs-and-return-top) - (let ((kind (%html-parsing:xexp-token-kind tok))) - ;; (log-html-parsing-debug "kind = ~S" kind) - (cond ((memv kind token-kinds-that-always-get-added) - (add-thing-as-child-of-current-beg tok)) - ((eqv? kind %html-parsing:start-token-symbol) - ;; (log-html-parsing-debug "is-start-token-symbol") - (let* ((name (start-tag-name tok)) - (cell (assq name parent-constraints))) - ;; (log-html-parsing-debug "name = ~S cell = ~S" name cell) - (and cell - (finish-begs-upto-but-not-including-names - (cons 'div (cdr cell)))) - (if (memq name empty-elements) - (add-thing-as-child-of-current-beg tok) - (set! begs (cons (vector (car tok) - (cdr tok)) - begs))))) - ((eqv? kind %html-parsing:empty-token-symbol) - ;; Empty tag token, so just add it to current - ;; beginning while stripping off leading `*EMPTY*' - ;; symbol so that the token becomes normal SXML - ;; element syntax. - (add-thing-as-child-of-current-beg (cdr tok))) - ((eqv? kind %html-parsing:end-token-symbol) - (let ((name (end-tag-name tok))) - (if name - ;; Try to finish to a start tag matching this - ;; end tag. If none, just drop the token, - ;; though we used to add it to the current - ;; beginning. - (finish-begs-up-to-and-including-name - name) - ;; We have an anonymous end tag, so match it - ;; with the most recent beginning. If no - ;; beginning to match, then just drop the - ;; token, though we used to add it to the - ;; current beginning. - (and (vector-ref (car begs) 0) - (finish-current-beg))))) - (else (error 'parse-html/tokenizer - "unknown tag kind: ~S" - kind))) - (loop)))))))))) - -;; TODO: Quote of message to a user: -;; -;; >I think this behavior is due to HtmlPrag's use in "parse-html/tokenizer" -;; >of its local "parent-constraints" variable. -;; > -;; >The following line of code from the variable binding expresses the -;; >constraint that any "p" element can have as immediate parent element -;; >only "body", "td", or "th": -;; > -;; > (p . (body td th)) -;; > -;; >I think I know a good heuristic for dealing with unfamiliar but -;; >seemingly well-formed elements, like "page" in this case, but I'm afraid -;; >I don't have time to implement it right now. (I am job-hunting right -;; >now, and there are many other coding things I need to do first.) -;; > -;; >Would adding "page" to the above line of the HtmlPrag source code work -;; >around the current problem, or do you need a better solution right now? - -;; @defproc %parse-html input normalized? top? -;; -;; This procedure is now used internally by @code{html->xexp} and its -;; variants, and should not be used directly by programs. The interface is -;; likely to change in future versions of HtmlPrag. - -(define (%html-parsing:parse-html input normalized? top?) - (let ((parse - (lambda () - (%html-parsing:parse-html/tokenizer - (%html-parsing:make-html-tokenizer - (cond ((input-port? input) input) - ((string? input) (open-input-string input)) - (else (error - '%html-parsing:parse-html - "invalid input type: ~E" - input))) - normalized?) - normalized?)))) - (if top? - (cons '*TOP* (parse)) - (parse)))) - -;; @defproc html->sxml-0nf input -;; @defprocx html->sxml-1nf input -;; @defprocx html->sxml-2nf input -;; @defprocx html->sxml input -(doc (defproc (html->xexp (input (or/c input-port? string?))) - xexp - - (para "Parse HTML permissively from " - (racket input) - ", which is either an input port or a string, and emit an -SXML/xexp equivalent or approximation. To borrow and slightly modify an -example from Kiselyov's discussion of his HTML parser:") - - (racketinput - (html->xexp - (string-append - "whatever" - " link

" - "