diff --git a/archiver/archiver-cli.rkt b/archiver/archiver-cli.rkt index d96aa65..11f25d0 100644 --- a/archiver/archiver-cli.rkt +++ b/archiver/archiver-cli.rkt @@ -6,24 +6,24 @@ "" "Downloaded pages go into `archive/` next to the executable." "Database goes into `archiver.db*` next to the executable." - "The database is necessary to store your download progress and resume where you left off if the process is interrupted.") - (ps "" - "Default output style is `progress` in a tty and `lines` otherwise.")) + "The database is necessary to store your download progress and resume where you left off if the process is interrupted.")) (flag (output-quiet?) ("-q" "--output-quiet" "disable progress output") (output-quiet? #t)) -(flag (output-lines?) - ("-l" "--output-lines" "output the name of each file downloaded") - (output-lines? #t)) - (flag (output-progress?) - ("-p" "--output-progress" "progress output for terminals") + ("-p" "--output-progress" "progress output for terminals (default in a tty)") (output-progress? #t)) +(flag (output-lines?) + ("-l" "--output-lines" "output the name of each file downloaded (default outside of a tty)") + (output-lines? #t)) + (constraint (one-of output-quiet? output-lines? output-progress?)) + + (program (start [wikiname "wikiname to download"]) ;; set up arguments @@ -42,30 +42,27 @@ ;; check (when (or (not wikiname) (equal? wikiname "")) (raise-user-error "Please specify the wikiname to download on the command line.")) - ;; stage 1 - (cond [(output-lines?) (displayln "Downloading list of pages...")] - [(output-progress?) (printf "Downloading list of pages... \r")]) - (if-necessary-download-list-of-pages - wikiname - (λ (a b c) - (cond [(output-progress?) (printf "Downloading list of pages... [~a/~b]\r" a b)]))) - ;; stage 2 - (save-each-page - wikiname - (λ (a b c) - (define basename (basename->name-for-query c)) - (cond - [(output-lines?) - (displayln basename)] - [(output-progress?) - (when (eq? (modulo a 20) 0) - (thread (λ () (update-width)))) - (define prefix (format "[~a/~a] " a b)) - (define rest (- width (string-length prefix))) - (define real-width (min (string-length basename) rest)) - (define spare-width (- rest real-width)) - (define name-display (substring basename 0 real-width)) - (define whitespace (make-string spare-width #\ )) - (printf "~a~a~a\r" prefix name-display whitespace)])))) + ;; progress reporting based on selected mode + (define (report-progress a b c) + (define basename (basename->name-for-query c)) + (cond + [(output-lines?) + (displayln basename)] + [(output-progress?) + (when (eq? (modulo a 20) 0) + (thread (λ () (update-width)))) + (define prefix (format "[~a] [~a/~a] " wikiname a b)) + (define rest (- width (string-length prefix))) + (define real-width (min (string-length basename) rest)) + (define spare-width (- rest real-width)) + (define name-display (substring basename 0 real-width)) + (define whitespace (make-string spare-width #\ )) + (printf "~a~a~a\r" prefix name-display whitespace)])) + ;; download all stages + (for ([stage all-stages] + [i (in-naturals 1)]) + (printf "> Stage ~a/~a~n" i (length all-stages)) + (stage wikiname report-progress) + (displayln ""))) (run start) diff --git a/archiver/archiver-database.rkt b/archiver/archiver-database.rkt index 2defb0e..aaea4d6 100644 --- a/archiver/archiver-database.rkt +++ b/archiver/archiver-database.rkt @@ -1,7 +1,7 @@ #lang racket/base -(require racket/list +(require racket/file + racket/list racket/path - racket/runtime-path racket/string json json-pointer @@ -9,9 +9,16 @@ "../lib/syntax.rkt") (provide - slc) + get-slc + query-exec* + query-rows* + query-list* + query-value* + query-maybe-value* + query-maybe-row*) -(define-runtime-path database-file "../storage/archiver.db") +(define storage-path (anytime-path ".." "storage")) +(define database-file (build-path storage-path "archiver.db")) (define migrations (wrap-sql @@ -25,23 +32,50 @@ (query-exec slc "alter table wiki add column license_text TEXT") (query-exec slc "alter table wiki add column license_url TEXT")))) -(define slc (sqlite3-connect #:database database-file #:mode 'create)) -(query-exec slc "PRAGMA journal_mode=WAL") -(define database-version - (with-handlers ([exn:fail:sql? - (λ (exn) - ; need to set up the database - (query-exec slc "create table database_version (version integer, primary key (version))") - (query-exec slc "insert into database_version values (0)") - 0)]) - (query-value slc "select version from database_version"))) +(define slc (box #f)) +(define (get-slc) + (define slc* (unbox slc)) + (cond + [slc* slc*] + [else + (make-directory* storage-path) + (define slc* (sqlite3-connect #:database database-file #:mode 'create)) + (query-exec slc* "PRAGMA journal_mode=WAL") + (define database-version + (with-handlers ([exn:fail:sql? + (λ (exn) + ; need to set up the database + (query-exec slc* "create table database_version (version integer, primary key (version))") + (query-exec slc* "insert into database_version values (0)") + 0)]) + (query-value slc* "select version from database_version"))) -(let do-migrate-step () - (when (database-version . < . (length migrations)) - (call-with-transaction - slc - (list-ref migrations database-version)) - (set! database-version (add1 database-version)) - (query-exec slc "update database_version set version = $1" database-version) - (do-migrate-step))) + (let do-migrate-step () + (when (database-version . < . (length migrations)) + (call-with-transaction + slc* + (list-ref migrations database-version)) + (set! database-version (add1 database-version)) + (query-exec slc* "update database_version set version = $1" database-version) + (do-migrate-step))) + (set-box! slc slc*) + slc*])) + +(define (query-exec* . args) + (apply query-exec (get-slc) args)) + +(define (query-rows* . args) + (apply query-rows (get-slc) args)) + +(define (query-list* . args) + (apply query-list (get-slc) args)) + +(define (query-value* . args) + (apply query-value (get-slc) args)) + +(define (query-maybe-value* . args) + (apply query-maybe-value (get-slc) args)) + +(define (query-maybe-row* . args) + (apply query-maybe-row (get-slc) args)) diff --git a/archiver/archiver-gui.rkt b/archiver/archiver-gui.rkt index 1badf65..3e476a9 100644 --- a/archiver/archiver-gui.rkt +++ b/archiver/archiver-gui.rkt @@ -1,89 +1,128 @@ #lang racket/base (require racket/class + racket/draw + racket/format racket/list racket/port racket/set + racket/splicing racket/string db net/http-easy + memo + (only-in racket/gui timer%) racket/gui/easy racket/gui/easy/operator + (only-in pict bitmap) + images/icons/style + images/icons/control + images/icons/stickman + images/icons/symbol "archiver-database.rkt" "archiver.rkt" "../lib/url-utils.rkt" "../lib/xexpr-utils.rkt") -(define active-threads (mutable-seteq)) +(default-icon-material rubber-icon-material) + +(require (for-syntax racket/base racket/match racket/set racket/string)) + +(define-syntax (@> stx) + (define form (cdr (syntax->datum stx))) + (match form + [(list form) ; (@> (fn @obs)) + ;; identify the observables and replace with non-@ symbols + (define collection (mutable-set)) + (define updated + (let loop ([sexp form]) + (cond [(symbol? sexp) + (let ([as-s (symbol->string sexp)]) + (if (string-prefix? as-s "@") + (let ([without-@ (string->symbol (substring as-s 1))]) + (set-add! collection (cons sexp without-@)) + without-@) + sexp))] + [(pair? sexp) (cons (loop (car sexp)) (loop (cdr sexp)))] + [#t sexp]))) + (define collection-l (set->list collection)) + ;; return obs-combine -> updated-form + (datum->syntax stx `(obs-combine (λ (,@(map cdr collection-l)) ,updated) ,@(map car collection-l)))] + [(list (? string? str) args ...) ; (@> "Blah: ~a/~a" @arg1 arg2) + ;; identify the observables and replace with non-@ symbols + (define collection-l + (for/list ([arg args]) + (if (symbol? arg) + (let ([as-s (symbol->string arg)]) + (if (string-prefix? as-s "@") + (let ([without-@ (string->symbol (substring as-s 1))]) + (cons arg without-@)) + (cons #f arg))) + (cons #f arg)))) + (define collection-lo (filter car collection-l)) + ;; return obs-combine -> format + (datum->syntax stx `(obs-combine (λ (,@(map cdr collection-lo)) (format ,str ,@(map cdr collection-l))) ,@(map car collection-lo)))])) (define/obs @auto-retry #f) -(define/obs @wikiname "") -(define/obs @state 'waiting) -(define/obs @num-pages 1) -(define/obs @done-pages 0) -(define/obs @just-done "") -(define/obs @queue '()) -(define @title - (obs-combine - (λ (state queue num-pages done-pages) - (define suffix (if (pair? queue) - (format " +~a" (length queue)) - "")) - (define progress (if (eq? num-pages 0) - " 0%" - (format " ~a%" (round (inexact->exact (* (/ done-pages num-pages) 100)))))) - (case state - [(waiting stage-0) (format "Fandom Archiver~a" suffix)] - [(stage-1) (format "Fandom Archiver 0%~a" suffix)] - [(stage-2) (format "Fandom Archiver~a~a" progress suffix)] - [(err) "ERROR Fandom Archiver"] - [(done) "Fandom Archiver 100%"])) - @state @queue @num-pages (obs-throttle @done-pages #:duration 5000))) -(define-syntax-rule (t body ...) - (set-add! active-threads (thread (λ () body ...)))) +(define-struct qi^ (wikiname st stage progress max-progress eta th) #:transparent) ;; queue item -(define (do-start-or-queue) - (define wikiname (obs-peek @wikiname)) - (:= @wikiname "") - (when (not (equal? (string-trim wikiname) "")) - (@queue . <~ . (λ (q) (append q (list wikiname)))) - (shift-queue-maybe))) +(define rows (query-rows* "select wikiname, progress from wiki where progress < 4")) +(define/obs @queue null) +(define (add-wikiname-to-queue wikiname st stage) + (@queue . <~ . (λ (queue) + (define already-exists? (findf (λ (qi) (equal? (qi^-wikiname qi) wikiname)) queue)) + (if already-exists? + queue + (append queue (list (qi^ wikiname st stage 0 1 "..." #f))))))) +(for ([row rows]) + (add-wikiname-to-queue (vector-ref row 0) + (if (= (vector-ref row 1) 4) + 'complete + 'queued) + (vector-ref row 1))) -(define (shift-queue-maybe) - (when (memq (obs-peek @state) '(waiting done)) - (define q (obs-peek @queue)) - (cond - [(pair? q) - (define wikiname (car q)) - (:= @queue (cdr q)) - (do-start-stage1 wikiname)] - [#t (:= @state 'done)]))) +(define status-icon-size 32) +(define status-icon-min-width 36) +(define button-icon-size 12) -(define (do-start-stage1 wikiname) - (:= @just-done "") - (:= @done-pages 0) - (:= @num-pages 1) - (t (with-handlers ([exn:fail? (handle-graphical-exn wikiname)]) - (:= @state 'stage-0) - (if-necessary-download-list-of-pages wikiname (λ (now-done num-pages just-done-name) - (:= @num-pages num-pages) - (:= @done-pages now-done) - (:= @just-done just-done-name) - (:= @state 'stage-1))) - (do-start-stage2 wikiname)))) +(define color-green (make-color 90 212 68)) -(define (do-start-stage2 wikiname) - (:= @just-done "") - (:= @num-pages 1) - (:= @done-pages 0) - (t (with-handlers ([exn:fail? (handle-graphical-exn wikiname)]) - (save-each-page wikiname (λ (now-done num-pages just-done-path) - (:= @num-pages num-pages) - (:= @done-pages now-done) - (:= @just-done just-done-path))) - (:= @state 'waiting) - (shift-queue-maybe))) - (:= @state 'stage-2)) +(define/obs @input "") + +(splicing-let ([frame-count 30]) + (define stickman-frames + (for/vector ([s (in-range 0 1 (/ 1 frame-count))]) + (running-stickman-icon + s + #:height status-icon-size + #:material (default-icon-material)))) + + (define/obs @stick-frame-no 0) + (define stick-timer + (new timer% + [notify-callback (λ () (@stick-frame-no . <~ . add1))] + [interval (truncate (/ 1000 frame-count))])) + (define/obs @stick + (@stick-frame-no . ~> . (λ (n) (vector-ref stickman-frames + (modulo n (vector-length stickman-frames))))))) + +(define status-icons + (hasheq 'queued (stop-icon #:color syntax-icon-color #:height status-icon-size) + 'paused (continue-forward-icon #:color syntax-icon-color #:height status-icon-size) + 'running @stick + 'error (x-icon #:height status-icon-size) + 'complete (check-icon #:color color-green #:height status-icon-size))) + +(define action-icons + (hasheq 'pause (pause-icon #:color syntax-icon-color #:height button-icon-size) + 'resume (play-icon #:color color-green #:height button-icon-size))) + +(define (bitmap-view @the-bitmap [min-width 1]) + (pict-canvas #:min-size (@> (list (max min-width (send @the-bitmap get-width)) (send @the-bitmap get-height))) #;(if min-size (list min-size min-size) #f) + #:stretch '(#f #f) + #:style '(transparent) + @the-bitmap + bitmap)) (define (exn->string e) (with-output-to-string @@ -96,13 +135,15 @@ (printf ": ~a" (car item))) (displayln ""))))) -(define ((handle-graphical-exn wikiname) e) +(define ((handle-graphical-exn @qi) e) (displayln (exn->string e) (current-error-port)) (cond [(obs-peek @auto-retry) - (do-retry-end wikiname)] + (void) ;; TODO + #;(do-retry-end wikiname)] [#t - (:= @state 'err) + (update-qi @qi [st 'error]) + (do-try-unpause-next-entry) (thread (λ () (define/obs @visible? #t) @@ -116,89 +157,177 @@ (input #:style '(multiple hscroll) #:min-size '(#f 200) (exn->string e)) - (button "Retry Now" (λ () (:= @visible? #f) (do-retry-now wikiname))) - (button "Retry Round-Robin" (λ () (:= @visible? #f) (do-retry-end wikiname))) - (button "Skip Wiki" (λ () (:= @visible? #f) (do-continue))) - (button "Use Auto-Retry" (λ () - (:= @auto-retry #t) - (:= @visible? #f) - (do-retry-end wikiname))) - (text "Be careful not to auto-retry an infinite loop!"))) + ;; TODO + #;(button "Retry Now" (λ () (:= @visible? #f) (do-retry-now wikiname))) + #;(button "Retry Round-Robin" (λ () (:= @visible? #f) (do-retry-end wikiname))) + #;(button "Skip Wiki" (λ () (:= @visible? #f) (do-continue))) + #;(button "Use Auto-Retry" (λ () + (:= @auto-retry #t) + (:= @visible? #f) + (do-retry-end wikiname))) + #;(text "Be careful not to auto-retry an infinite loop!"))) main-window))) (sleep) - ; make sure the old broken threads are all gone - (for ([th active-threads]) (kill-thread th)) - (set-clear! active-threads)])) + ; make sure the broken thread is gone + (define th (qi^-th (obs-peek @qi))) + (when th (kill-thread th))])) -(define (do-retry-now wikiname) - (@queue . <~ . (λ (q) (append (list wikiname) q))) - (:= @state 'waiting) - (shift-queue-maybe)) +(define segments + (list + (list 5/100 (make-color 0 223 217)) + (list 88/100 color-green) + (list 2/100 (make-color 0 223 217)) + (list 5/100 color-green))) +(define segment-spacing 2) +(unless (= (apply + (map car segments)) 1) + (error 'segments "segments add up to ~a, not 1" (apply + (map car segments)))) -(define (do-retry-end wikiname) - (@queue . <~ . (λ (q) (append q (list wikiname)))) - (:= @state 'waiting) - (shift-queue-maybe)) +;; return the new bitmap, which can be drawn on a dc<%> +(define/memoize (ray-trace width height stage progress max-progress) + ;; (printf "rendering ~a ~a/~a at ~a~n" stage progress max-progress (current-inexact-milliseconds)) + (define bm (make-object bitmap% width height #f #t)) + (define dc (make-object bitmap-dc% bm)) + (define width-available (- width (* (length segments) segment-spacing))) + (send dc set-smoothing 'unsmoothed) + (send dc set-pen "black" 0 'transparent) + (for/fold ([offset 0]) + ([segment segments] + [i (in-naturals 0)]) ;; zero indexed stages? + ;; calculate start and end locations of grey bar + (define-values (segment-proportion segment-color) (apply values segment)) + (define segment-start (if (= offset 0) 0 (+ offset segment-spacing))) + (define segment-width (* width-available segment-proportion)) + ;; draw grey bar + (send dc set-brush (make-color 180 180 180 0.4) 'solid) + (send dc draw-rectangle segment-start 0 segment-width height) + ;; draw solid bar according to the current item's progress + (define proportion + (cond [(stage . < . i) 0] + [(stage . > . i) 1] + [(max-progress . <= . 0) 0] + [(progress . < . 0) 0] + [(progress . >= . max-progress) 1] + [else (progress . / . max-progress)])) + (send dc set-brush segment-color 'solid) + (send dc draw-rectangle segment-start 0 (* proportion segment-width) height) + (+ segment-start segment-width)) + (bitmap-render-icon bm 6/8)) -(define (do-continue) - (:= @state 'waiting) - (shift-queue-maybe)) +;; get ray traced bitmap (possibly from cache) and draw on dc<%> +(define (draw-bar orig-dc qi) + ;; (println ray-traced) + (define-values (width height) (send orig-dc get-size)) + (send orig-dc draw-bitmap (ray-trace width height (qi^-stage qi) (qi^-progress qi) (qi^-max-progress qi)) 0 0)) -(define (display-basename basename) - (define limit 40) - (cond [(string? basename) - (define query (basename->name-for-query basename)) - (define segments (string-split query "/")) - (when (and ((string-length query) . > . limit) ((length segments) . >= . 2)) - (set! query (string-append ".../" (last segments)))) - (when ((string-length query) . > . limit) - (set! query (string-append (substring query 0 (- limit 3)) "..."))) - query] - [#t "?"])) +(define ((make-progress-updater @qi) a b c) + ;; (printf "~a: ~a/~a ~a~n" (qi^-wikiname (obs-peek @qi)) a b c) + (update-qi @qi [progress a] [max-progress b])) + +(define (do-add-to-queue) + (define wikiname (string-trim (obs-peek @input))) + (when ((string-length wikiname) . > . 0) + (add-wikiname-to-queue wikiname 'queued 0)) ;; TODO: automatically start? + (:= @input "")) + +(define-syntax-rule (update-qi @qi args ...) + (let ([wikiname (qi^-wikiname (obs-peek @qi))]) + (@queue . <~ . (λ (queue) + (for/list ([qi queue]) + (if (equal? (qi^-wikiname qi) wikiname) + (struct-copy qi^ qi args ...) + qi)))))) + +(define (do-start-qi @qi) + (define th + (thread (λ () + (with-handlers ([exn? (handle-graphical-exn @qi)]) + (define last-stage + (for/last ([stage all-stages] + [i (in-naturals)]) + (update-qi @qi [stage i]) + (stage (qi^-wikiname (obs-peek @qi)) (make-progress-updater @qi)) + i)) + (update-qi @qi [st 'complete] [stage (add1 last-stage)]) + (do-try-unpause-next-entry))))) + (update-qi @qi [st 'running] [th th])) + +(define (do-stop-qi @qi) + (define th (qi^-th (obs-peek @qi))) + (when th (kill-thread th)) + (update-qi @qi [th #f] [st 'paused])) + +(define (do-try-unpause-next-entry) + (define queue (obs-peek @queue)) + (define next-qi (for/first ([qi queue] + #:when (memq (qi^-st qi) '(paused queued error))) + qi)) + (when next-qi + (define @qi (@queue . ~> . (λ (queue) (findf (λ (qi) (equal? (qi^-wikiname qi) (qi^-wikiname next-qi))) queue)))) + (do-start-qi @qi))) (define main-window (render - (window #:title @title - #:size '(360 200) - #:mixin (λ (%) (class % - (super-new) - (define/augment (on-close) - (for ([th active-threads]) (kill-thread th)) - (disconnect slc)))) - ;; input box at the top - (hpanel (text "https://") - (input @wikiname - (λ (event data) (cond - [(eq? event 'input) (:= @wikiname data)] - [(eq? event 'return) (do-start-or-queue)]))) - (text ".fandom.com")) - (button (@queue . ~> . (λ (q) (if (null? q) "Start" "Queue"))) (λ () (do-start-or-queue))) - (text (@queue . ~> . (λ (q) (if (null? q) "" (string-join #:before-first "Queue: " q ", "))))) - ;; show status based on overall application state - (case-view - @state - ;; waiting for wikiname entry - ((waiting) (vpanel - (text "Fill in the wikiname and click start."))) - ((stage-0) (vpanel - (text "Checking data..."))) - ((stage-1) (vpanel - (text "Gathering list of pages...") - (text (@just-done . ~> . display-basename)) - (text (@done-pages . ~> . (λ (x) (if (eq? x 0) - "0/?" - (format "~a/~a" x (obs-peek @num-pages)))))))) - ;; downloading contents - ((stage-2) (vpanel - (text "Downloading page text...") - (progress @done-pages #:range @num-pages) - (text (@done-pages . ~> . (λ (x) (format "~a/~a" x (obs-peek @num-pages))))) - (text (@just-done . ~> . display-basename)))) - ((done) (vpanel - (text "All wikis downloaded!"))) - ((err) (vpanel - (text "Error. Check the popup window."))) - (else (text (@state . ~> . (λ (state) (format "invalid state: ~a" state)))))) - (checkbox #:label "Auto-retry on error? (Dangerous)" - #:checked? @auto-retry - (λ:= @auto-retry))))) + (window + #:title "Fandom Archiver" + #:size '(400 300) + #:mixin (λ (%) (class % + (super-new) + (define/augment (on-close) + (send stick-timer stop) + (for ([qi (obs-peek @queue)]) + (when (qi^-th qi) + (kill-thread (qi^-th qi)))) + #;(disconnect*)))) + (vpanel + #:spacing 10 + #:margin '(5 5) + (hpanel + #:stretch '(#t #f) + #:spacing 10 + (hpanel + (text "https://") + (input @input + (λ (event data) (cond + [(eq? event 'input) (:= @input data)] + [(eq? event 'return) (do-add-to-queue)]))) + (text ".fandom.com")) + (button "Download Wiki" do-add-to-queue)) + (list-view + #:style '(vertical) + @queue + #:key qi^-wikiname + (λ (k @qi) + (define @status-icons + (@> (case (qi^-st @qi) + [(running) @stick] + [else (hash-ref status-icons (qi^-st @qi))]))) + (define @is-running? + (@> (memq (qi^-st @qi) '(running)))) + ;; state icon at the left side + (hpanel #:stretch '(#t #f) + #:alignment '(left center) + #:spacing 8 + (bitmap-view @status-icons status-icon-min-width) + (vpanel + ;; name and buttons (top half) + (hpanel #:alignment '(left bottom) + (text (@> (qi^-wikiname @qi))) + (spacer) + (hpanel + #:stretch '(#f #f) + (if-view @is-running? + (button (hash-ref action-icons 'pause) + (λ () (do-stop-qi @qi))) + (button (hash-ref action-icons 'resume) + (λ () (do-start-qi @qi)))))) + ;; progress bar (bottom half) + (hpanel + (canvas + @qi + #:style '(transparent) + #:margin '(3 3) + draw-bar) + (hpanel #:min-size '(68 #f) + #:stretch '(#f #f) + #:alignment '(right center) + (text (@> (format "eta ~a" (qi^-eta @qi)))))))))))))) diff --git a/archiver/archiver.rkt b/archiver/archiver.rkt index fbeec46..5eb56c8 100644 --- a/archiver/archiver.rkt +++ b/archiver/archiver.rkt @@ -2,38 +2,35 @@ (require racket/file racket/function racket/list - racket/runtime-path + racket/path + racket/sequence racket/string net/url net/mime file/sha1 net/http-easy db - "../lib/html-parsing/main.rkt" json "archiver-database.rkt" + "../lib/html-parsing/main.rkt" "../lib/mime-types.rkt" + "../lib/syntax.rkt" "../lib/tree-updater.rkt" "../lib/url-utils.rkt" "../lib/xexpr-utils.rkt" "../lib/archive-file-mappings.rkt") -(define archive-slc slc) - (provide - if-necessary-download-list-of-pages - download-list-of-pages - save-each-page basename->name-for-query image-url->values hash->save-dir - archive-slc) + all-stages) (module+ test (require rackunit)) -(define-runtime-path archive-root "../storage/archive") -#;(define archive-root "archive") +(define archive-root (anytime-path ".." "storage/archive")) +(make-directory* archive-root) (define sources '#hasheq((style . 1) (page . 2))) @@ -46,127 +43,34 @@ wikiname (params->query '(("action" . "query") ("meta" . "siteinfo") - ("siprop" . "general|rightsinfo") + ("siprop" . "general|rightsinfo|statistics") ("format" . "json") ("formatversion" . "2"))))) (define data (response-json (get dest-url))) - (define exists? (query-maybe-value slc "select progress from wiki where wikiname = ?" wikiname)) - (if exists? - (query-exec slc "update wiki set sitename = ?, basepage = ?, license_text = ?, license_url = ? where wikiname = ?" + (define exists? (query-maybe-value* "select progress from wiki where wikiname = ?" wikiname)) + (if (and exists? (not (sql-null? exists?))) + (query-exec* "update wiki set sitename = ?, basepage = ?, license_text = ?, license_url = ? where wikiname = ?" (jp "/query/general/sitename" data) (second (regexp-match #rx"/wiki/(.*)" (jp "/query/general/base" data))) (jp "/query/rightsinfo/text" data) (jp "/query/rightsinfo/url" data) wikiname) - (query-exec slc "insert into wiki (wikiname, progress, sitename, basepage, license_text, license_url) values (?, 1, ?, ?, ?, ?)" + (query-exec* "insert into wiki (wikiname, progress, sitename, basepage, license_text, license_url) values (?, 0, ?, ?, ?, ?)" wikiname (jp "/query/general/sitename" data) (second (regexp-match #rx"/wiki/(.*)" (jp "/query/general/base" data))) (jp "/query/rightsinfo/text" data) - (jp "/query/rightsinfo/url" data)))) + (jp "/query/rightsinfo/url" data))) + (jp "/query/statistics/articles" data)) -;; call 1 if not yet done for that wiki -(define (if-necessary-download-list-of-pages wikiname callback) - (define wiki-progress (query-maybe-value slc "select progress from wiki where wikiname = ?" wikiname)) - ;; done yet? - (unless (and (real? wiki-progress) (wiki-progress . >= . 1)) - ;; count total pages - (define dest-url - (format "https://~a.fandom.com/api.php?~a" - wikiname - (params->query `(("action" . "query") ("meta" . "siteinfo") ("siprop" . "statistics") ("format" . "json"))))) - (define num-pages (jp "/query/statistics/articles" (response-json (get dest-url)))) - (download-list-of-pages wikiname callback 0 num-pages #f))) -;; 1. Download list of wiki pages and store in database -(define (download-list-of-pages wikiname callback total-so-far grand-total path-with-namefrom) - (define url (if path-with-namefrom - (format "https://~a.fandom.com~a" wikiname path-with-namefrom) - (format "https://~a.fandom.com/wiki/Local_Sitemap" wikiname))) - (define r (get url)) - (define page (html->xexp (bytes->string/utf-8 (response-body r)))) - (define link-namefrom - ((query-selector (λ (t a c x) (and (eq? t 'a) - (pair? x) - (string-contains? (car x) "Next page") - (let ([href (get-attribute 'href a)] ) - (and href (string-contains? href "/wiki/Local_Sitemap"))))) - page #:include-text? #t))) - (define row-values - (for/list ([link (in-producer - (query-selector - (λ (t a c) (eq? t 'a)) - ((query-selector (λ (t a c) (has-class? "mw-allpages-chunk" a)) page))) - #f)]) - (list wikiname (local-encoded-url->basename (get-attribute 'href (bits->attributes link))) 0))) - (define query-template (string-join (make-list (length row-values) "(?, ?, ?)") ", " #:before-first "insert or ignore into page (wikiname, basename, progress) values ")) - (apply query-exec slc query-template (flatten row-values)) - (define new-total (+ (length row-values) total-so-far)) - (callback new-total grand-total (second (last row-values))) - (cond - [link-namefrom ; repeat on the next page - (download-list-of-pages wikiname callback new-total grand-total (get-attribute 'href (bits->attributes link-namefrom)))] - [#t ; all done downloading sitemap - (insert-wiki-entry wikiname)])) - -;; 2. Download each page via API and: -;; * Save API response to file -(define max-page-progress 1) -(define (save-each-page wikiname callback) - ;; prepare destination folder - (define save-dir (build-path archive-root wikiname)) - (make-directory* save-dir) - ;; gather list of basenames to download (that aren't yet complete) - (define basenames (query-list slc "select basename from page where wikiname = ? and progress < ?" - wikiname max-page-progress)) - ;; counter of complete/incomplete basenames - (define already-done-count - (query-value slc "select count(*) from page where wikiname = ? and progress = ?" - wikiname max-page-progress)) - (define not-done-count - (query-value slc "select count(*) from page where wikiname = ? and progress < ?" - wikiname max-page-progress)) - ;; set initial progress - (callback already-done-count (+ already-done-count not-done-count) "") - ;; loop through basenames and download - (for ([basename basenames] - [i (in-naturals 1)]) - (define name-for-query (basename->name-for-query basename)) - (define dest-url - (format "https://~a.fandom.com/api.php?~a" - wikiname - (params->query `(("action" . "parse") - ("page" . ,name-for-query) - ("prop" . "text|headhtml|langlinks") - ("formatversion" . "2") - ("format" . "json"))))) - (define r (get dest-url)) - (define body (response-body r)) - (define filename (string-append basename ".json")) - (define save-path - (cond [((string-length basename) . > . 240) - (define key (sha1 (string->bytes/latin-1 basename))) - (query-exec slc "insert into special_page (wikiname, key, basename) values (?, ?, ?)" - wikiname key basename) - (build-path save-dir (string-append key ".json"))] - [#t - (build-path save-dir (string-append basename ".json"))])) - (display-to-file body save-path #:exists 'replace) - (query-exec slc "update page set progress = 1 where wikiname = ? and basename = ?" - wikiname basename) - (callback (+ already-done-count i) (+ already-done-count not-done-count) basename)) - ;; saved all pages, register that fact in the database - (query-exec slc "update wiki set progress = 2 where wikiname = ?" wikiname)) - -;; 3. Download CSS and: -;; * Save CSS to file -;; * Record style images to database (define (check-style-for-images wikiname path) (define content (file->string path)) (define urls (regexp-match* #rx"url\\(\"?'?([^)]*)'?\"?\\)" content #:match-select cadr)) (for/list ([url urls] #:when (not (or (equal? url "") (equal? url "'") + (string-suffix? url "\"") (string-contains? url "/resources-ucp/") (string-contains? url "/fonts/") (string-contains? url "/drm_fonts/") @@ -184,7 +88,7 @@ [(string-prefix? url "/") (format "https://~a.fandom.com~a" wikiname url)] [else (raise-user-error "While calling check-style-for-images, this URL had an unknown format and couldn't be saved:" url path)]))) -(define (download-styles-for-wiki wikiname) +(define (download-styles-for-wiki wikiname callback) (define save-dir (build-path archive-root wikiname "styles")) (make-directory* save-dir) (define theme (λ (theme-name) @@ -198,18 +102,137 @@ (theme "dark") (cons (format "https://~a.fandom.com/load.php?lang=en&modules=skin.fandomdesktop.styles%7Cext.fandom.PortableInfoboxFandomDesktop.css%7Cext.fandom.GlobalComponents.CommunityHeaderBackground.css%7Cext.gadget.site-styles%2Csound-styles%7Csite.styles&only=styles&skin=fandomdesktop" wikiname) (build-path save-dir "site.css")))) - (for ([style styles]) + (for ([style styles] + [i (in-naturals)]) + (callback i (length styles) "styles...") (define r (get (car style))) (define body (response-body r)) (display-to-file body (cdr style) #:exists 'replace) ;; XXX: how the HELL do I deal with @import?? would need some kind of recursion here. how will the page server know where to look up the style file to be able to serve them again? do I add another link-stylesheet tag to the main page? what about the remaining stuck @import url? ) + (callback (length styles) (length styles) "styles...") styles) -(define (do-step-3 wikiname) - (define wiki-progress (query-maybe-value slc "select progress from wiki where wikiname = ?" wikiname)) +(define (hash->save-dir wikiname hash) + (build-path archive-root wikiname "images" (substring hash 0 1) (substring hash 0 2))) + +(define (image-url->values i) + ;; TODO: handle case where there is multiple broken cb parameter on minecraft wiki + ;; TODO: ensure it still "works" with broken & on minecraft wiki + (define no-cb (regexp-replace #rx"\\cb=[0-9]+&?" i "")) ; remove cb url parameter which does nothing + (define key (regexp-replace #rx"[&?]$" no-cb "")) ; remove extra separator if necessary + (define hash (sha1 (string->bytes/utf-8 key))) + (cons key hash)) + + +;; 1. Download list of wiki pages and store in database, if not done yet for that wiki +(define (if-necessary-download-list-of-pages wikiname callback) + (define wiki-progress (query-maybe-value* "select progress from wiki where wikiname = ?" wikiname)) + ;; done yet? + (unless (and (real? wiki-progress) (wiki-progress . >= . 1)) + ;; Count total pages + (define num-pages (insert-wiki-entry wikiname)) + ;; Download the entire index of pages + (define basenames + (let loop ([path-with-namefrom "/wiki/Local_Sitemap"] + [basenames-previous-pages null]) + ;; Download the current index page + (define url (format "https://~a.fandom.com~a" wikiname path-with-namefrom)) + (define r (get url)) + ;; Metadata from this page (the link to the next page) + (define page (html->xexp (bytes->string/utf-8 (response-body r)))) + (define link-namefrom + ((query-selector (λ (t a c x) (and (eq? t 'a) + (pair? x) + (string-contains? (car x) "Next page") + (let ([href (get-attribute 'href a)] ) + (and href (string-contains? href "/wiki/Local_Sitemap"))))) + page #:include-text? #t))) + ;; Content from this page + (define basenames-this-page + (for/list ([link (in-producer + (query-selector + (λ (t a c) (eq? t 'a)) + ((query-selector (λ (t a c) (has-class? "mw-allpages-chunk" a)) page))) + #f)]) + (local-encoded-url->basename (get-attribute 'href (bits->attributes link))))) + ;; Call the progress callback + (define all-basenames (append basenames-previous-pages basenames-this-page)) + (callback (length all-basenames) num-pages (last all-basenames)) + ;; Recurse to download from the next page + (if link-namefrom + (loop (get-attribute 'href (bits->attributes link-namefrom)) all-basenames) + all-basenames))) + ;; Save those pages into the database + ;; SQLite can have a maximum of 32766 parameters in a single query + (for ([slice (in-slice 32760 basenames)]) + (define query-template (string-join (make-list (length slice) "(?1, ?, 0)") ", " #:before-first "insert or ignore into page (wikiname, basename, progress) values ")) + (call-with-transaction + (get-slc) + (λ () + (apply query-exec* query-template wikiname slice) + ;; Record that we have the complete list of pages + (query-exec* "update wiki set progress = 1 where wikiname = ?" wikiname)))))) + + +;; 2. Download each page via API and: +;; * Save API response to file +(define max-page-progress 1) +(define (save-each-page wikiname callback) + ;; prepare destination folder + (define save-dir (build-path archive-root wikiname)) + (make-directory* save-dir) + ;; gather list of basenames to download (that aren't yet complete) + (define basenames (query-list* "select basename from page where wikiname = ? and progress < ?" + wikiname max-page-progress)) + ;; counter of complete/incomplete basenames + (define already-done-count + (query-value* "select count(*) from page where wikiname = ? and progress = ?" + wikiname max-page-progress)) + (define not-done-count + (query-value* "select count(*) from page where wikiname = ? and progress < ?" + wikiname max-page-progress)) + (define total-count (+ already-done-count not-done-count)) + ;; set initial progress + (callback already-done-count total-count "") + ;; loop through basenames and download + (for ([basename basenames] + [i (in-naturals (add1 already-done-count))]) + (define name-for-query (basename->name-for-query basename)) + (define dest-url + (format "https://~a.fandom.com/api.php?~a" + wikiname + (params->query `(("action" . "parse") + ("page" . ,name-for-query) + ("prop" . "text|headhtml|langlinks") + ("formatversion" . "2") + ("format" . "json"))))) + (define r (get dest-url)) + (define body (response-body r)) + (define filename (string-append basename ".json")) + (define save-path + (cond [((string-length basename) . > . 240) + (define key (sha1 (string->bytes/latin-1 basename))) + (query-exec* "insert into special_page (wikiname, key, basename) values (?, ?, ?)" + wikiname key basename) + (build-path save-dir (string-append key ".json"))] + [#t + (build-path save-dir (string-append basename ".json"))])) + (display-to-file body save-path #:exists 'replace) + (query-exec* "update page set progress = 1 where wikiname = ? and basename = ?" + wikiname basename) + (callback i total-count basename)) + ;; saved all pages, register that fact in the database + (query-exec* "update wiki set progress = 2 where wikiname = ?" wikiname)) + + +;; 3. Download CSS and: +;; * Save CSS to file +;; * Record style images to database +(define (if-necessary-download-and-check-styles wikiname callback) + (define wiki-progress (query-maybe-value* "select progress from wiki where wikiname = ?" wikiname)) (unless (and (number? wiki-progress) (wiki-progress . >= . 3)) - (define styles (download-styles-for-wiki wikiname)) + (define styles (download-styles-for-wiki wikiname callback)) (define unique-image-urls (remove-duplicates (map image-url->values @@ -217,48 +240,40 @@ (for/list ([style styles]) (check-style-for-images wikiname (cdr style))))) #:key cdr)) - (println unique-image-urls) (for ([pair unique-image-urls]) - (query-exec slc "insert or ignore into image (wikiname, url, hash, ext, source, progress) values (?, ?, ?, NULL, 1, 0)" wikiname (car pair) (cdr pair))) - (query-exec slc "update wiki set progress = 3 where wikiname = ?" wikiname))) + (query-exec* "insert or ignore into image (wikiname, url, hash, ext, source, progress) values (?, ?, ?, NULL, 1, 0)" wikiname (car pair) (cdr pair))) + (query-exec* "update wiki set progress = 3 where wikiname = ?" wikiname))) + ;; 4: From downloaded pages, record URLs of image sources and inline style images to database -(define (hash->save-dir wikiname hash) - (build-path archive-root wikiname "images" (substring hash 0 1) (substring hash 0 2))) - -(define (image-url->values i) - ;; TODO: handle case where there is multiple cb parameter on minecraft wiki - ;; TODO: ensure it still "works" with broken & on minecraft wiki - (define no-cb (regexp-replace #rx"\\cb=[0-9]+&?" i "")) ; remove cb url parameter which does nothing - (define key (regexp-replace #rx"[&?]$" no-cb "")) ; remove extra separator if necessary - (define hash (sha1 (string->bytes/utf-8 key))) - (cons key hash)) - (define (check-json-for-images wikiname path) (define data (with-input-from-file path (λ () (read-json)))) (define page (html->xexp (preprocess-html-wiki (jp "/parse/text" data)))) (define tree (update-tree-wiki page wikiname)) - (remove-duplicates - (for/list ([element (in-producer - (query-selector - (λ (t a c) - (and (eq? t 'img) - (get-attribute 'src a))) - tree) - #f)]) - (image-url->values (get-attribute 'src (bits->attributes element)))))) + null + #;(remove-duplicates + (for/list ([element (in-producer + (query-selector + (λ (t a c) + (and (eq? t 'img) + (get-attribute 'src a))) + tree) + #f)]) + (image-url->values (get-attribute 'src (bits->attributes element)))))) + ;; 5. Download image sources and style images according to database -(define (save-each-image wikiname source callback) +(define (save-each-image wikiname callback) + (define source (hash-ref sources 'style)) ;; TODO: download entire wiki images instead? ;; gather list of basenames to download (that aren't yet complete) - (define rows (query-rows slc "select url, hash from image where wikiname = ? and source <= ? and progress < 1" + (define rows (query-rows* "select url, hash from image where wikiname = ? and source <= ? and progress < 1" wikiname source)) ;; counter of complete/incomplete basenames (define already-done-count - (query-value slc "select count(*) from image where wikiname = ? and source <= ? and progress = 1" + (query-value* "select count(*) from image where wikiname = ? and source <= ? and progress = 1" wikiname source)) (define not-done-count - (query-value slc "select count(*) from image where wikiname = ? and source <= ? and progress < 1" + (query-value* "select count(*) from image where wikiname = ? and source <= ? and progress < 1" wikiname source)) ;; set initial progress (callback already-done-count (+ already-done-count not-done-count) "") @@ -269,26 +284,35 @@ (define url (vector-ref row 0)) (define hash (vector-ref row 1)) ;; check - (printf "~a -> ~a~n" url hash) + #; (printf "~a -> ~a~n" url hash) (define r (get url)) (define declared-type (response-headers-ref r 'content-type)) (define final-type (if (equal? declared-type #"application/octet-stream") (let ([sniff-entity (message-entity (mime-analyze (response-body r)))]) (string->bytes/latin-1 (format "~a/~a" (entity-type sniff-entity) (entity-subtype sniff-entity)))) declared-type)) - (define ext (bytes->string/latin-1 (mime-type->ext final-type))) + (define ext + (with-handlers ([exn:fail:contract? (λ _ (error 'save-each-image "no ext found for mime type `~a` in file ~a" final-type url))]) + (bytes->string/latin-1 (mime-type->ext final-type)))) ;; save (define save-dir (hash->save-dir wikiname hash)) (make-directory* save-dir) (define save-path (build-path save-dir (string-append hash "." ext))) (define body (response-body r)) (display-to-file body save-path #:exists 'replace) - (query-exec slc "update image set progress = 1, ext = ? where wikiname = ? and hash = ?" + (query-exec* "update image set progress = 1, ext = ? where wikiname = ? and hash = ?" ext wikiname hash) - (callback (+ already-done-count i) (+ already-done-count not-done-count) (string-append hash "." ext))) - ;; TODO: saved all images, register that fact in the database - ) + (callback (+ already-done-count i) (+ already-done-count not-done-count) (string-append (substring hash 0 6) "..." ext))) + ;; saved all images, register that fact in the database + (query-exec* "update wiki set progress = 4 where wikiname = ?" wikiname)) +(define all-stages + (list + if-necessary-download-list-of-pages + save-each-page + if-necessary-download-and-check-styles + ;; check-json-for-images + save-each-image)) (module+ test (check-equal? (html->xexp "") @@ -299,11 +323,13 @@ #;(do-step-3 "gallowmere") #;(save-each-image "gallowmere" (hash-ref sources 'style) (λ (a b c) (printf "~a/~a ~a~n" a b c))) - #;(for ([wikiname (query-list slc "select wikiname from wiki")]) + #;(for ([wikiname (query-list* "select wikiname from wiki")]) (println wikiname) (insert-wiki-entry wikiname)) - #;(for ([wikiname (query-list slc "select wikiname from wiki")]) + #;(for ([wikiname (query-list* "select wikiname from wiki")]) (println wikiname) (do-step-3 wikiname) (save-each-image wikiname (hash-ref sources 'style) (λ (a b c) (printf "~a/~a ~a~n" a b c))))) + +; (for ([stage all-stages]) (stage "create" (λ (a b c) (printf "~a/~a ~a~n" a b c))))