Rewrite archiver project

* Rewrite archiver.rkt to manage the stages order
* Rewrite archiver-gui.rkt:
  * Remembers the previous incomplete queue items
  * Pretty graphics for icons and progress bars
  * Segmented progress bars to indicate different stages
* Fix archiver-cli.rkt to use new stages
* Switch to req -d, so it doesn't auto-install gui libs
This commit is contained in:
Cadence Ember 2023-03-08 22:58:57 +13:00
parent 453570bdc9
commit cf74ffb0e2
Signed by: cadence
GPG Key ID: BC1C2C61CF521B17
4 changed files with 536 additions and 350 deletions

View File

@ -6,24 +6,24 @@
"" ""
"Downloaded pages go into `archive/` next to the executable." "Downloaded pages go into `archive/` next to the executable."
"Database goes into `archiver.db*` next to the executable." "Database goes into `archiver.db*` next to the executable."
"The database is necessary to store your download progress and resume where you left off if the process is interrupted.") "The database is necessary to store your download progress and resume where you left off if the process is interrupted."))
(ps ""
"Default output style is `progress` in a tty and `lines` otherwise."))
(flag (output-quiet?) (flag (output-quiet?)
("-q" "--output-quiet" "disable progress output") ("-q" "--output-quiet" "disable progress output")
(output-quiet? #t)) (output-quiet? #t))
(flag (output-lines?)
("-l" "--output-lines" "output the name of each file downloaded")
(output-lines? #t))
(flag (output-progress?) (flag (output-progress?)
("-p" "--output-progress" "progress output for terminals") ("-p" "--output-progress" "progress output for terminals (default in a tty)")
(output-progress? #t)) (output-progress? #t))
(flag (output-lines?)
("-l" "--output-lines" "output the name of each file downloaded (default outside of a tty)")
(output-lines? #t))
(constraint (one-of output-quiet? output-lines? output-progress?)) (constraint (one-of output-quiet? output-lines? output-progress?))
(program (program
(start [wikiname "wikiname to download"]) (start [wikiname "wikiname to download"])
;; set up arguments ;; set up arguments
@ -42,30 +42,27 @@
;; check ;; check
(when (or (not wikiname) (equal? wikiname "")) (when (or (not wikiname) (equal? wikiname ""))
(raise-user-error "Please specify the wikiname to download on the command line.")) (raise-user-error "Please specify the wikiname to download on the command line."))
;; stage 1 ;; progress reporting based on selected mode
(cond [(output-lines?) (displayln "Downloading list of pages...")] (define (report-progress a b c)
[(output-progress?) (printf "Downloading list of pages... \r")]) (define basename (basename->name-for-query c))
(if-necessary-download-list-of-pages (cond
wikiname [(output-lines?)
(λ (a b c) (displayln basename)]
(cond [(output-progress?) (printf "Downloading list of pages... [~a/~b]\r" a b)]))) [(output-progress?)
;; stage 2 (when (eq? (modulo a 20) 0)
(save-each-page (thread (λ () (update-width))))
wikiname (define prefix (format "[~a] [~a/~a] " wikiname a b))
(λ (a b c) (define rest (- width (string-length prefix)))
(define basename (basename->name-for-query c)) (define real-width (min (string-length basename) rest))
(cond (define spare-width (- rest real-width))
[(output-lines?) (define name-display (substring basename 0 real-width))
(displayln basename)] (define whitespace (make-string spare-width #\ ))
[(output-progress?) (printf "~a~a~a\r" prefix name-display whitespace)]))
(when (eq? (modulo a 20) 0) ;; download all stages
(thread (λ () (update-width)))) (for ([stage all-stages]
(define prefix (format "[~a/~a] " a b)) [i (in-naturals 1)])
(define rest (- width (string-length prefix))) (printf "> Stage ~a/~a~n" i (length all-stages))
(define real-width (min (string-length basename) rest)) (stage wikiname report-progress)
(define spare-width (- rest real-width)) (displayln "")))
(define name-display (substring basename 0 real-width))
(define whitespace (make-string spare-width #\ ))
(printf "~a~a~a\r" prefix name-display whitespace)]))))
(run start) (run start)

View File

@ -1,7 +1,7 @@
#lang racket/base #lang racket/base
(require racket/list (require racket/file
racket/list
racket/path racket/path
racket/runtime-path
racket/string racket/string
json json
json-pointer json-pointer
@ -9,9 +9,16 @@
"../lib/syntax.rkt") "../lib/syntax.rkt")
(provide (provide
slc) get-slc
query-exec*
query-rows*
query-list*
query-value*
query-maybe-value*
query-maybe-row*)
(define-runtime-path database-file "../storage/archiver.db") (define storage-path (anytime-path ".." "storage"))
(define database-file (build-path storage-path "archiver.db"))
(define migrations (define migrations
(wrap-sql (wrap-sql
@ -25,23 +32,50 @@
(query-exec slc "alter table wiki add column license_text TEXT") (query-exec slc "alter table wiki add column license_text TEXT")
(query-exec slc "alter table wiki add column license_url TEXT")))) (query-exec slc "alter table wiki add column license_url TEXT"))))
(define slc (sqlite3-connect #:database database-file #:mode 'create)) (define slc (box #f))
(query-exec slc "PRAGMA journal_mode=WAL") (define (get-slc)
(define database-version (define slc* (unbox slc))
(with-handlers ([exn:fail:sql? (cond
(λ (exn) [slc* slc*]
; need to set up the database [else
(query-exec slc "create table database_version (version integer, primary key (version))") (make-directory* storage-path)
(query-exec slc "insert into database_version values (0)") (define slc* (sqlite3-connect #:database database-file #:mode 'create))
0)]) (query-exec slc* "PRAGMA journal_mode=WAL")
(query-value slc "select version from database_version"))) (define database-version
(with-handlers ([exn:fail:sql?
(λ (exn)
; need to set up the database
(query-exec slc* "create table database_version (version integer, primary key (version))")
(query-exec slc* "insert into database_version values (0)")
0)])
(query-value slc* "select version from database_version")))
(let do-migrate-step () (let do-migrate-step ()
(when (database-version . < . (length migrations)) (when (database-version . < . (length migrations))
(call-with-transaction (call-with-transaction
slc slc*
(list-ref migrations database-version)) (list-ref migrations database-version))
(set! database-version (add1 database-version)) (set! database-version (add1 database-version))
(query-exec slc "update database_version set version = $1" database-version) (query-exec slc* "update database_version set version = $1" database-version)
(do-migrate-step))) (do-migrate-step)))
(set-box! slc slc*)
slc*]))
(define (query-exec* . args)
(apply query-exec (get-slc) args))
(define (query-rows* . args)
(apply query-rows (get-slc) args))
(define (query-list* . args)
(apply query-list (get-slc) args))
(define (query-value* . args)
(apply query-value (get-slc) args))
(define (query-maybe-value* . args)
(apply query-maybe-value (get-slc) args))
(define (query-maybe-row* . args)
(apply query-maybe-row (get-slc) args))

View File

@ -1,89 +1,128 @@
#lang racket/base #lang racket/base
(require racket/class (require racket/class
racket/draw
racket/format
racket/list racket/list
racket/port racket/port
racket/set racket/set
racket/splicing
racket/string racket/string
db db
net/http-easy net/http-easy
memo
(only-in racket/gui timer%)
racket/gui/easy racket/gui/easy
racket/gui/easy/operator racket/gui/easy/operator
(only-in pict bitmap)
images/icons/style
images/icons/control
images/icons/stickman
images/icons/symbol
"archiver-database.rkt" "archiver-database.rkt"
"archiver.rkt" "archiver.rkt"
"../lib/url-utils.rkt" "../lib/url-utils.rkt"
"../lib/xexpr-utils.rkt") "../lib/xexpr-utils.rkt")
(define active-threads (mutable-seteq)) (default-icon-material rubber-icon-material)
(require (for-syntax racket/base racket/match racket/set racket/string))
(define-syntax (@> stx)
(define form (cdr (syntax->datum stx)))
(match form
[(list form) ; (@> (fn @obs))
;; identify the observables and replace with non-@ symbols
(define collection (mutable-set))
(define updated
(let loop ([sexp form])
(cond [(symbol? sexp)
(let ([as-s (symbol->string sexp)])
(if (string-prefix? as-s "@")
(let ([without-@ (string->symbol (substring as-s 1))])
(set-add! collection (cons sexp without-@))
without-@)
sexp))]
[(pair? sexp) (cons (loop (car sexp)) (loop (cdr sexp)))]
[#t sexp])))
(define collection-l (set->list collection))
;; return obs-combine -> updated-form
(datum->syntax stx `(obs-combine (λ (,@(map cdr collection-l)) ,updated) ,@(map car collection-l)))]
[(list (? string? str) args ...) ; (@> "Blah: ~a/~a" @arg1 arg2)
;; identify the observables and replace with non-@ symbols
(define collection-l
(for/list ([arg args])
(if (symbol? arg)
(let ([as-s (symbol->string arg)])
(if (string-prefix? as-s "@")
(let ([without-@ (string->symbol (substring as-s 1))])
(cons arg without-@))
(cons #f arg)))
(cons #f arg))))
(define collection-lo (filter car collection-l))
;; return obs-combine -> format
(datum->syntax stx `(obs-combine (λ (,@(map cdr collection-lo)) (format ,str ,@(map cdr collection-l))) ,@(map car collection-lo)))]))
(define/obs @auto-retry #f) (define/obs @auto-retry #f)
(define/obs @wikiname "")
(define/obs @state 'waiting)
(define/obs @num-pages 1)
(define/obs @done-pages 0)
(define/obs @just-done "")
(define/obs @queue '())
(define @title
(obs-combine
(λ (state queue num-pages done-pages)
(define suffix (if (pair? queue)
(format " +~a" (length queue))
""))
(define progress (if (eq? num-pages 0)
" 0%"
(format " ~a%" (round (inexact->exact (* (/ done-pages num-pages) 100))))))
(case state
[(waiting stage-0) (format "Fandom Archiver~a" suffix)]
[(stage-1) (format "Fandom Archiver 0%~a" suffix)]
[(stage-2) (format "Fandom Archiver~a~a" progress suffix)]
[(err) "ERROR Fandom Archiver"]
[(done) "Fandom Archiver 100%"]))
@state @queue @num-pages (obs-throttle @done-pages #:duration 5000)))
(define-syntax-rule (t body ...) (define-struct qi^ (wikiname st stage progress max-progress eta th) #:transparent) ;; queue item
(set-add! active-threads (thread (λ () body ...))))
(define (do-start-or-queue) (define rows (query-rows* "select wikiname, progress from wiki where progress < 4"))
(define wikiname (obs-peek @wikiname)) (define/obs @queue null)
(:= @wikiname "") (define (add-wikiname-to-queue wikiname st stage)
(when (not (equal? (string-trim wikiname) "")) (@queue . <~ . (λ (queue)
(@queue . <~ . (λ (q) (append q (list wikiname)))) (define already-exists? (findf (λ (qi) (equal? (qi^-wikiname qi) wikiname)) queue))
(shift-queue-maybe))) (if already-exists?
queue
(append queue (list (qi^ wikiname st stage 0 1 "..." #f)))))))
(for ([row rows])
(add-wikiname-to-queue (vector-ref row 0)
(if (= (vector-ref row 1) 4)
'complete
'queued)
(vector-ref row 1)))
(define (shift-queue-maybe) (define status-icon-size 32)
(when (memq (obs-peek @state) '(waiting done)) (define status-icon-min-width 36)
(define q (obs-peek @queue)) (define button-icon-size 12)
(cond
[(pair? q)
(define wikiname (car q))
(:= @queue (cdr q))
(do-start-stage1 wikiname)]
[#t (:= @state 'done)])))
(define (do-start-stage1 wikiname) (define color-green (make-color 90 212 68))
(:= @just-done "")
(:= @done-pages 0)
(:= @num-pages 1)
(t (with-handlers ([exn:fail? (handle-graphical-exn wikiname)])
(:= @state 'stage-0)
(if-necessary-download-list-of-pages wikiname (λ (now-done num-pages just-done-name)
(:= @num-pages num-pages)
(:= @done-pages now-done)
(:= @just-done just-done-name)
(:= @state 'stage-1)))
(do-start-stage2 wikiname))))
(define (do-start-stage2 wikiname) (define/obs @input "")
(:= @just-done "")
(:= @num-pages 1) (splicing-let ([frame-count 30])
(:= @done-pages 0) (define stickman-frames
(t (with-handlers ([exn:fail? (handle-graphical-exn wikiname)]) (for/vector ([s (in-range 0 1 (/ 1 frame-count))])
(save-each-page wikiname (λ (now-done num-pages just-done-path) (running-stickman-icon
(:= @num-pages num-pages) s
(:= @done-pages now-done) #:height status-icon-size
(:= @just-done just-done-path))) #:material (default-icon-material))))
(:= @state 'waiting)
(shift-queue-maybe))) (define/obs @stick-frame-no 0)
(:= @state 'stage-2)) (define stick-timer
(new timer%
[notify-callback (λ () (@stick-frame-no . <~ . add1))]
[interval (truncate (/ 1000 frame-count))]))
(define/obs @stick
(@stick-frame-no . ~> . (λ (n) (vector-ref stickman-frames
(modulo n (vector-length stickman-frames)))))))
(define status-icons
(hasheq 'queued (stop-icon #:color syntax-icon-color #:height status-icon-size)
'paused (continue-forward-icon #:color syntax-icon-color #:height status-icon-size)
'running @stick
'error (x-icon #:height status-icon-size)
'complete (check-icon #:color color-green #:height status-icon-size)))
(define action-icons
(hasheq 'pause (pause-icon #:color syntax-icon-color #:height button-icon-size)
'resume (play-icon #:color color-green #:height button-icon-size)))
(define (bitmap-view @the-bitmap [min-width 1])
(pict-canvas #:min-size (@> (list (max min-width (send @the-bitmap get-width)) (send @the-bitmap get-height))) #;(if min-size (list min-size min-size) #f)
#:stretch '(#f #f)
#:style '(transparent)
@the-bitmap
bitmap))
(define (exn->string e) (define (exn->string e)
(with-output-to-string (with-output-to-string
@ -96,13 +135,15 @@
(printf ": ~a" (car item))) (printf ": ~a" (car item)))
(displayln ""))))) (displayln "")))))
(define ((handle-graphical-exn wikiname) e) (define ((handle-graphical-exn @qi) e)
(displayln (exn->string e) (current-error-port)) (displayln (exn->string e) (current-error-port))
(cond (cond
[(obs-peek @auto-retry) [(obs-peek @auto-retry)
(do-retry-end wikiname)] (void) ;; TODO
#;(do-retry-end wikiname)]
[#t [#t
(:= @state 'err) (update-qi @qi [st 'error])
(do-try-unpause-next-entry)
(thread (thread
(λ () (λ ()
(define/obs @visible? #t) (define/obs @visible? #t)
@ -116,89 +157,177 @@
(input #:style '(multiple hscroll) (input #:style '(multiple hscroll)
#:min-size '(#f 200) #:min-size '(#f 200)
(exn->string e)) (exn->string e))
(button "Retry Now" (λ () (:= @visible? #f) (do-retry-now wikiname))) ;; TODO
(button "Retry Round-Robin" (λ () (:= @visible? #f) (do-retry-end wikiname))) #;(button "Retry Now" (λ () (:= @visible? #f) (do-retry-now wikiname)))
(button "Skip Wiki" (λ () (:= @visible? #f) (do-continue))) #;(button "Retry Round-Robin" (λ () (:= @visible? #f) (do-retry-end wikiname)))
(button "Use Auto-Retry" (λ () #;(button "Skip Wiki" (λ () (:= @visible? #f) (do-continue)))
(:= @auto-retry #t) #;(button "Use Auto-Retry" (λ ()
(:= @visible? #f) (:= @auto-retry #t)
(do-retry-end wikiname))) (:= @visible? #f)
(text "Be careful not to auto-retry an infinite loop!"))) (do-retry-end wikiname)))
#;(text "Be careful not to auto-retry an infinite loop!")))
main-window))) main-window)))
(sleep) (sleep)
; make sure the old broken threads are all gone ; make sure the broken thread is gone
(for ([th active-threads]) (kill-thread th)) (define th (qi^-th (obs-peek @qi)))
(set-clear! active-threads)])) (when th (kill-thread th))]))
(define (do-retry-now wikiname) (define segments
(@queue . <~ . (λ (q) (append (list wikiname) q))) (list
(:= @state 'waiting) (list 5/100 (make-color 0 223 217))
(shift-queue-maybe)) (list 88/100 color-green)
(list 2/100 (make-color 0 223 217))
(list 5/100 color-green)))
(define segment-spacing 2)
(unless (= (apply + (map car segments)) 1)
(error 'segments "segments add up to ~a, not 1" (apply + (map car segments))))
(define (do-retry-end wikiname) ;; return the new bitmap, which can be drawn on a dc<%>
(@queue . <~ . (λ (q) (append q (list wikiname)))) (define/memoize (ray-trace width height stage progress max-progress)
(:= @state 'waiting) ;; (printf "rendering ~a ~a/~a at ~a~n" stage progress max-progress (current-inexact-milliseconds))
(shift-queue-maybe)) (define bm (make-object bitmap% width height #f #t))
(define dc (make-object bitmap-dc% bm))
(define width-available (- width (* (length segments) segment-spacing)))
(send dc set-smoothing 'unsmoothed)
(send dc set-pen "black" 0 'transparent)
(for/fold ([offset 0])
([segment segments]
[i (in-naturals 0)]) ;; zero indexed stages?
;; calculate start and end locations of grey bar
(define-values (segment-proportion segment-color) (apply values segment))
(define segment-start (if (= offset 0) 0 (+ offset segment-spacing)))
(define segment-width (* width-available segment-proportion))
;; draw grey bar
(send dc set-brush (make-color 180 180 180 0.4) 'solid)
(send dc draw-rectangle segment-start 0 segment-width height)
;; draw solid bar according to the current item's progress
(define proportion
(cond [(stage . < . i) 0]
[(stage . > . i) 1]
[(max-progress . <= . 0) 0]
[(progress . < . 0) 0]
[(progress . >= . max-progress) 1]
[else (progress . / . max-progress)]))
(send dc set-brush segment-color 'solid)
(send dc draw-rectangle segment-start 0 (* proportion segment-width) height)
(+ segment-start segment-width))
(bitmap-render-icon bm 6/8))
(define (do-continue) ;; get ray traced bitmap (possibly from cache) and draw on dc<%>
(:= @state 'waiting) (define (draw-bar orig-dc qi)
(shift-queue-maybe)) ;; (println ray-traced)
(define-values (width height) (send orig-dc get-size))
(send orig-dc draw-bitmap (ray-trace width height (qi^-stage qi) (qi^-progress qi) (qi^-max-progress qi)) 0 0))
(define (display-basename basename) (define ((make-progress-updater @qi) a b c)
(define limit 40) ;; (printf "~a: ~a/~a ~a~n" (qi^-wikiname (obs-peek @qi)) a b c)
(cond [(string? basename) (update-qi @qi [progress a] [max-progress b]))
(define query (basename->name-for-query basename))
(define segments (string-split query "/")) (define (do-add-to-queue)
(when (and ((string-length query) . > . limit) ((length segments) . >= . 2)) (define wikiname (string-trim (obs-peek @input)))
(set! query (string-append ".../" (last segments)))) (when ((string-length wikiname) . > . 0)
(when ((string-length query) . > . limit) (add-wikiname-to-queue wikiname 'queued 0)) ;; TODO: automatically start?
(set! query (string-append (substring query 0 (- limit 3)) "..."))) (:= @input ""))
query]
[#t "?"])) (define-syntax-rule (update-qi @qi args ...)
(let ([wikiname (qi^-wikiname (obs-peek @qi))])
(@queue . <~ . (λ (queue)
(for/list ([qi queue])
(if (equal? (qi^-wikiname qi) wikiname)
(struct-copy qi^ qi args ...)
qi))))))
(define (do-start-qi @qi)
(define th
(thread (λ ()
(with-handlers ([exn? (handle-graphical-exn @qi)])
(define last-stage
(for/last ([stage all-stages]
[i (in-naturals)])
(update-qi @qi [stage i])
(stage (qi^-wikiname (obs-peek @qi)) (make-progress-updater @qi))
i))
(update-qi @qi [st 'complete] [stage (add1 last-stage)])
(do-try-unpause-next-entry)))))
(update-qi @qi [st 'running] [th th]))
(define (do-stop-qi @qi)
(define th (qi^-th (obs-peek @qi)))
(when th (kill-thread th))
(update-qi @qi [th #f] [st 'paused]))
(define (do-try-unpause-next-entry)
(define queue (obs-peek @queue))
(define next-qi (for/first ([qi queue]
#:when (memq (qi^-st qi) '(paused queued error)))
qi))
(when next-qi
(define @qi (@queue . ~> . (λ (queue) (findf (λ (qi) (equal? (qi^-wikiname qi) (qi^-wikiname next-qi))) queue))))
(do-start-qi @qi)))
(define main-window (define main-window
(render (render
(window #:title @title (window
#:size '(360 200) #:title "Fandom Archiver"
#:mixin (λ (%) (class % #:size '(400 300)
(super-new) #:mixin (λ (%) (class %
(define/augment (on-close) (super-new)
(for ([th active-threads]) (kill-thread th)) (define/augment (on-close)
(disconnect slc)))) (send stick-timer stop)
;; input box at the top (for ([qi (obs-peek @queue)])
(hpanel (text "https://") (when (qi^-th qi)
(input @wikiname (kill-thread (qi^-th qi))))
(λ (event data) (cond #;(disconnect*))))
[(eq? event 'input) (:= @wikiname data)] (vpanel
[(eq? event 'return) (do-start-or-queue)]))) #:spacing 10
(text ".fandom.com")) #:margin '(5 5)
(button (@queue . ~> . (λ (q) (if (null? q) "Start" "Queue"))) (λ () (do-start-or-queue))) (hpanel
(text (@queue . ~> . (λ (q) (if (null? q) "" (string-join #:before-first "Queue: " q ", "))))) #:stretch '(#t #f)
;; show status based on overall application state #:spacing 10
(case-view (hpanel
@state (text "https://")
;; waiting for wikiname entry (input @input
((waiting) (vpanel (λ (event data) (cond
(text "Fill in the wikiname and click start."))) [(eq? event 'input) (:= @input data)]
((stage-0) (vpanel [(eq? event 'return) (do-add-to-queue)])))
(text "Checking data..."))) (text ".fandom.com"))
((stage-1) (vpanel (button "Download Wiki" do-add-to-queue))
(text "Gathering list of pages...") (list-view
(text (@just-done . ~> . display-basename)) #:style '(vertical)
(text (@done-pages . ~> . (λ (x) (if (eq? x 0) @queue
"0/?" #:key qi^-wikiname
(format "~a/~a" x (obs-peek @num-pages)))))))) (λ (k @qi)
;; downloading contents (define @status-icons
((stage-2) (vpanel (@> (case (qi^-st @qi)
(text "Downloading page text...") [(running) @stick]
(progress @done-pages #:range @num-pages) [else (hash-ref status-icons (qi^-st @qi))])))
(text (@done-pages . ~> . (λ (x) (format "~a/~a" x (obs-peek @num-pages))))) (define @is-running?
(text (@just-done . ~> . display-basename)))) (@> (memq (qi^-st @qi) '(running))))
((done) (vpanel ;; state icon at the left side
(text "All wikis downloaded!"))) (hpanel #:stretch '(#t #f)
((err) (vpanel #:alignment '(left center)
(text "Error. Check the popup window."))) #:spacing 8
(else (text (@state . ~> . (λ (state) (format "invalid state: ~a" state)))))) (bitmap-view @status-icons status-icon-min-width)
(checkbox #:label "Auto-retry on error? (Dangerous)" (vpanel
#:checked? @auto-retry ;; name and buttons (top half)
(λ:= @auto-retry))))) (hpanel #:alignment '(left bottom)
(text (@> (qi^-wikiname @qi)))
(spacer)
(hpanel
#:stretch '(#f #f)
(if-view @is-running?
(button (hash-ref action-icons 'pause)
(λ () (do-stop-qi @qi)))
(button (hash-ref action-icons 'resume)
(λ () (do-start-qi @qi))))))
;; progress bar (bottom half)
(hpanel
(canvas
@qi
#:style '(transparent)
#:margin '(3 3)
draw-bar)
(hpanel #:min-size '(68 #f)
#:stretch '(#f #f)
#:alignment '(right center)
(text (@> (format "eta ~a" (qi^-eta @qi))))))))))))))

View File

@ -2,38 +2,35 @@
(require racket/file (require racket/file
racket/function racket/function
racket/list racket/list
racket/runtime-path racket/path
racket/sequence
racket/string racket/string
net/url net/url
net/mime net/mime
file/sha1 file/sha1
net/http-easy net/http-easy
db db
"../lib/html-parsing/main.rkt"
json json
"archiver-database.rkt" "archiver-database.rkt"
"../lib/html-parsing/main.rkt"
"../lib/mime-types.rkt" "../lib/mime-types.rkt"
"../lib/syntax.rkt"
"../lib/tree-updater.rkt" "../lib/tree-updater.rkt"
"../lib/url-utils.rkt" "../lib/url-utils.rkt"
"../lib/xexpr-utils.rkt" "../lib/xexpr-utils.rkt"
"../lib/archive-file-mappings.rkt") "../lib/archive-file-mappings.rkt")
(define archive-slc slc)
(provide (provide
if-necessary-download-list-of-pages
download-list-of-pages
save-each-page
basename->name-for-query basename->name-for-query
image-url->values image-url->values
hash->save-dir hash->save-dir
archive-slc) all-stages)
(module+ test (module+ test
(require rackunit)) (require rackunit))
(define-runtime-path archive-root "../storage/archive") (define archive-root (anytime-path ".." "storage/archive"))
#;(define archive-root "archive") (make-directory* archive-root)
(define sources '#hasheq((style . 1) (page . 2))) (define sources '#hasheq((style . 1) (page . 2)))
@ -46,127 +43,34 @@
wikiname wikiname
(params->query '(("action" . "query") (params->query '(("action" . "query")
("meta" . "siteinfo") ("meta" . "siteinfo")
("siprop" . "general|rightsinfo") ("siprop" . "general|rightsinfo|statistics")
("format" . "json") ("format" . "json")
("formatversion" . "2"))))) ("formatversion" . "2")))))
(define data (response-json (get dest-url))) (define data (response-json (get dest-url)))
(define exists? (query-maybe-value slc "select progress from wiki where wikiname = ?" wikiname)) (define exists? (query-maybe-value* "select progress from wiki where wikiname = ?" wikiname))
(if exists? (if (and exists? (not (sql-null? exists?)))
(query-exec slc "update wiki set sitename = ?, basepage = ?, license_text = ?, license_url = ? where wikiname = ?" (query-exec* "update wiki set sitename = ?, basepage = ?, license_text = ?, license_url = ? where wikiname = ?"
(jp "/query/general/sitename" data) (jp "/query/general/sitename" data)
(second (regexp-match #rx"/wiki/(.*)" (jp "/query/general/base" data))) (second (regexp-match #rx"/wiki/(.*)" (jp "/query/general/base" data)))
(jp "/query/rightsinfo/text" data) (jp "/query/rightsinfo/text" data)
(jp "/query/rightsinfo/url" data) (jp "/query/rightsinfo/url" data)
wikiname) wikiname)
(query-exec slc "insert into wiki (wikiname, progress, sitename, basepage, license_text, license_url) values (?, 1, ?, ?, ?, ?)" (query-exec* "insert into wiki (wikiname, progress, sitename, basepage, license_text, license_url) values (?, 0, ?, ?, ?, ?)"
wikiname wikiname
(jp "/query/general/sitename" data) (jp "/query/general/sitename" data)
(second (regexp-match #rx"/wiki/(.*)" (jp "/query/general/base" data))) (second (regexp-match #rx"/wiki/(.*)" (jp "/query/general/base" data)))
(jp "/query/rightsinfo/text" data) (jp "/query/rightsinfo/text" data)
(jp "/query/rightsinfo/url" data)))) (jp "/query/rightsinfo/url" data)))
(jp "/query/statistics/articles" data))
;; call 1 if not yet done for that wiki
(define (if-necessary-download-list-of-pages wikiname callback)
(define wiki-progress (query-maybe-value slc "select progress from wiki where wikiname = ?" wikiname))
;; done yet?
(unless (and (real? wiki-progress) (wiki-progress . >= . 1))
;; count total pages
(define dest-url
(format "https://~a.fandom.com/api.php?~a"
wikiname
(params->query `(("action" . "query") ("meta" . "siteinfo") ("siprop" . "statistics") ("format" . "json")))))
(define num-pages (jp "/query/statistics/articles" (response-json (get dest-url))))
(download-list-of-pages wikiname callback 0 num-pages #f)))
;; 1. Download list of wiki pages and store in database
(define (download-list-of-pages wikiname callback total-so-far grand-total path-with-namefrom)
(define url (if path-with-namefrom
(format "https://~a.fandom.com~a" wikiname path-with-namefrom)
(format "https://~a.fandom.com/wiki/Local_Sitemap" wikiname)))
(define r (get url))
(define page (html->xexp (bytes->string/utf-8 (response-body r))))
(define link-namefrom
((query-selector (λ (t a c x) (and (eq? t 'a)
(pair? x)
(string-contains? (car x) "Next page")
(let ([href (get-attribute 'href a)] )
(and href (string-contains? href "/wiki/Local_Sitemap")))))
page #:include-text? #t)))
(define row-values
(for/list ([link (in-producer
(query-selector
(λ (t a c) (eq? t 'a))
((query-selector (λ (t a c) (has-class? "mw-allpages-chunk" a)) page)))
#f)])
(list wikiname (local-encoded-url->basename (get-attribute 'href (bits->attributes link))) 0)))
(define query-template (string-join (make-list (length row-values) "(?, ?, ?)") ", " #:before-first "insert or ignore into page (wikiname, basename, progress) values "))
(apply query-exec slc query-template (flatten row-values))
(define new-total (+ (length row-values) total-so-far))
(callback new-total grand-total (second (last row-values)))
(cond
[link-namefrom ; repeat on the next page
(download-list-of-pages wikiname callback new-total grand-total (get-attribute 'href (bits->attributes link-namefrom)))]
[#t ; all done downloading sitemap
(insert-wiki-entry wikiname)]))
;; 2. Download each page via API and:
;; * Save API response to file
(define max-page-progress 1)
(define (save-each-page wikiname callback)
;; prepare destination folder
(define save-dir (build-path archive-root wikiname))
(make-directory* save-dir)
;; gather list of basenames to download (that aren't yet complete)
(define basenames (query-list slc "select basename from page where wikiname = ? and progress < ?"
wikiname max-page-progress))
;; counter of complete/incomplete basenames
(define already-done-count
(query-value slc "select count(*) from page where wikiname = ? and progress = ?"
wikiname max-page-progress))
(define not-done-count
(query-value slc "select count(*) from page where wikiname = ? and progress < ?"
wikiname max-page-progress))
;; set initial progress
(callback already-done-count (+ already-done-count not-done-count) "")
;; loop through basenames and download
(for ([basename basenames]
[i (in-naturals 1)])
(define name-for-query (basename->name-for-query basename))
(define dest-url
(format "https://~a.fandom.com/api.php?~a"
wikiname
(params->query `(("action" . "parse")
("page" . ,name-for-query)
("prop" . "text|headhtml|langlinks")
("formatversion" . "2")
("format" . "json")))))
(define r (get dest-url))
(define body (response-body r))
(define filename (string-append basename ".json"))
(define save-path
(cond [((string-length basename) . > . 240)
(define key (sha1 (string->bytes/latin-1 basename)))
(query-exec slc "insert into special_page (wikiname, key, basename) values (?, ?, ?)"
wikiname key basename)
(build-path save-dir (string-append key ".json"))]
[#t
(build-path save-dir (string-append basename ".json"))]))
(display-to-file body save-path #:exists 'replace)
(query-exec slc "update page set progress = 1 where wikiname = ? and basename = ?"
wikiname basename)
(callback (+ already-done-count i) (+ already-done-count not-done-count) basename))
;; saved all pages, register that fact in the database
(query-exec slc "update wiki set progress = 2 where wikiname = ?" wikiname))
;; 3. Download CSS and:
;; * Save CSS to file
;; * Record style images to database
(define (check-style-for-images wikiname path) (define (check-style-for-images wikiname path)
(define content (file->string path)) (define content (file->string path))
(define urls (regexp-match* #rx"url\\(\"?'?([^)]*)'?\"?\\)" content #:match-select cadr)) (define urls (regexp-match* #rx"url\\(\"?'?([^)]*)'?\"?\\)" content #:match-select cadr))
(for/list ([url urls] (for/list ([url urls]
#:when (not (or (equal? url "") #:when (not (or (equal? url "")
(equal? url "'") (equal? url "'")
(string-suffix? url "\"")
(string-contains? url "/resources-ucp/") (string-contains? url "/resources-ucp/")
(string-contains? url "/fonts/") (string-contains? url "/fonts/")
(string-contains? url "/drm_fonts/") (string-contains? url "/drm_fonts/")
@ -184,7 +88,7 @@
[(string-prefix? url "/") (format "https://~a.fandom.com~a" wikiname url)] [(string-prefix? url "/") (format "https://~a.fandom.com~a" wikiname url)]
[else (raise-user-error "While calling check-style-for-images, this URL had an unknown format and couldn't be saved:" url path)]))) [else (raise-user-error "While calling check-style-for-images, this URL had an unknown format and couldn't be saved:" url path)])))
(define (download-styles-for-wiki wikiname) (define (download-styles-for-wiki wikiname callback)
(define save-dir (build-path archive-root wikiname "styles")) (define save-dir (build-path archive-root wikiname "styles"))
(make-directory* save-dir) (make-directory* save-dir)
(define theme (λ (theme-name) (define theme (λ (theme-name)
@ -198,18 +102,137 @@
(theme "dark") (theme "dark")
(cons (format "https://~a.fandom.com/load.php?lang=en&modules=skin.fandomdesktop.styles%7Cext.fandom.PortableInfoboxFandomDesktop.css%7Cext.fandom.GlobalComponents.CommunityHeaderBackground.css%7Cext.gadget.site-styles%2Csound-styles%7Csite.styles&only=styles&skin=fandomdesktop" wikiname) (cons (format "https://~a.fandom.com/load.php?lang=en&modules=skin.fandomdesktop.styles%7Cext.fandom.PortableInfoboxFandomDesktop.css%7Cext.fandom.GlobalComponents.CommunityHeaderBackground.css%7Cext.gadget.site-styles%2Csound-styles%7Csite.styles&only=styles&skin=fandomdesktop" wikiname)
(build-path save-dir "site.css")))) (build-path save-dir "site.css"))))
(for ([style styles]) (for ([style styles]
[i (in-naturals)])
(callback i (length styles) "styles...")
(define r (get (car style))) (define r (get (car style)))
(define body (response-body r)) (define body (response-body r))
(display-to-file body (cdr style) #:exists 'replace) (display-to-file body (cdr style) #:exists 'replace)
;; XXX: how the HELL do I deal with @import?? would need some kind of recursion here. how will the page server know where to look up the style file to be able to serve them again? do I add another link-stylesheet tag to the main page? what about the remaining stuck @import url? ;; XXX: how the HELL do I deal with @import?? would need some kind of recursion here. how will the page server know where to look up the style file to be able to serve them again? do I add another link-stylesheet tag to the main page? what about the remaining stuck @import url?
) )
(callback (length styles) (length styles) "styles...")
styles) styles)
(define (do-step-3 wikiname) (define (hash->save-dir wikiname hash)
(define wiki-progress (query-maybe-value slc "select progress from wiki where wikiname = ?" wikiname)) (build-path archive-root wikiname "images" (substring hash 0 1) (substring hash 0 2)))
(define (image-url->values i)
;; TODO: handle case where there is multiple broken cb parameter on minecraft wiki
;; TODO: ensure it still "works" with broken &amp; on minecraft wiki
(define no-cb (regexp-replace #rx"\\cb=[0-9]+&?" i "")) ; remove cb url parameter which does nothing
(define key (regexp-replace #rx"[&?]$" no-cb "")) ; remove extra separator if necessary
(define hash (sha1 (string->bytes/utf-8 key)))
(cons key hash))
;; 1. Download list of wiki pages and store in database, if not done yet for that wiki
(define (if-necessary-download-list-of-pages wikiname callback)
(define wiki-progress (query-maybe-value* "select progress from wiki where wikiname = ?" wikiname))
;; done yet?
(unless (and (real? wiki-progress) (wiki-progress . >= . 1))
;; Count total pages
(define num-pages (insert-wiki-entry wikiname))
;; Download the entire index of pages
(define basenames
(let loop ([path-with-namefrom "/wiki/Local_Sitemap"]
[basenames-previous-pages null])
;; Download the current index page
(define url (format "https://~a.fandom.com~a" wikiname path-with-namefrom))
(define r (get url))
;; Metadata from this page (the link to the next page)
(define page (html->xexp (bytes->string/utf-8 (response-body r))))
(define link-namefrom
((query-selector (λ (t a c x) (and (eq? t 'a)
(pair? x)
(string-contains? (car x) "Next page")
(let ([href (get-attribute 'href a)] )
(and href (string-contains? href "/wiki/Local_Sitemap")))))
page #:include-text? #t)))
;; Content from this page
(define basenames-this-page
(for/list ([link (in-producer
(query-selector
(λ (t a c) (eq? t 'a))
((query-selector (λ (t a c) (has-class? "mw-allpages-chunk" a)) page)))
#f)])
(local-encoded-url->basename (get-attribute 'href (bits->attributes link)))))
;; Call the progress callback
(define all-basenames (append basenames-previous-pages basenames-this-page))
(callback (length all-basenames) num-pages (last all-basenames))
;; Recurse to download from the next page
(if link-namefrom
(loop (get-attribute 'href (bits->attributes link-namefrom)) all-basenames)
all-basenames)))
;; Save those pages into the database
;; SQLite can have a maximum of 32766 parameters in a single query
(for ([slice (in-slice 32760 basenames)])
(define query-template (string-join (make-list (length slice) "(?1, ?, 0)") ", " #:before-first "insert or ignore into page (wikiname, basename, progress) values "))
(call-with-transaction
(get-slc)
(λ ()
(apply query-exec* query-template wikiname slice)
;; Record that we have the complete list of pages
(query-exec* "update wiki set progress = 1 where wikiname = ?" wikiname))))))
;; 2. Download each page via API and:
;; * Save API response to file
(define max-page-progress 1)
(define (save-each-page wikiname callback)
;; prepare destination folder
(define save-dir (build-path archive-root wikiname))
(make-directory* save-dir)
;; gather list of basenames to download (that aren't yet complete)
(define basenames (query-list* "select basename from page where wikiname = ? and progress < ?"
wikiname max-page-progress))
;; counter of complete/incomplete basenames
(define already-done-count
(query-value* "select count(*) from page where wikiname = ? and progress = ?"
wikiname max-page-progress))
(define not-done-count
(query-value* "select count(*) from page where wikiname = ? and progress < ?"
wikiname max-page-progress))
(define total-count (+ already-done-count not-done-count))
;; set initial progress
(callback already-done-count total-count "")
;; loop through basenames and download
(for ([basename basenames]
[i (in-naturals (add1 already-done-count))])
(define name-for-query (basename->name-for-query basename))
(define dest-url
(format "https://~a.fandom.com/api.php?~a"
wikiname
(params->query `(("action" . "parse")
("page" . ,name-for-query)
("prop" . "text|headhtml|langlinks")
("formatversion" . "2")
("format" . "json")))))
(define r (get dest-url))
(define body (response-body r))
(define filename (string-append basename ".json"))
(define save-path
(cond [((string-length basename) . > . 240)
(define key (sha1 (string->bytes/latin-1 basename)))
(query-exec* "insert into special_page (wikiname, key, basename) values (?, ?, ?)"
wikiname key basename)
(build-path save-dir (string-append key ".json"))]
[#t
(build-path save-dir (string-append basename ".json"))]))
(display-to-file body save-path #:exists 'replace)
(query-exec* "update page set progress = 1 where wikiname = ? and basename = ?"
wikiname basename)
(callback i total-count basename))
;; saved all pages, register that fact in the database
(query-exec* "update wiki set progress = 2 where wikiname = ?" wikiname))
;; 3. Download CSS and:
;; * Save CSS to file
;; * Record style images to database
(define (if-necessary-download-and-check-styles wikiname callback)
(define wiki-progress (query-maybe-value* "select progress from wiki where wikiname = ?" wikiname))
(unless (and (number? wiki-progress) (wiki-progress . >= . 3)) (unless (and (number? wiki-progress) (wiki-progress . >= . 3))
(define styles (download-styles-for-wiki wikiname)) (define styles (download-styles-for-wiki wikiname callback))
(define unique-image-urls (define unique-image-urls
(remove-duplicates (remove-duplicates
(map image-url->values (map image-url->values
@ -217,48 +240,40 @@
(for/list ([style styles]) (for/list ([style styles])
(check-style-for-images wikiname (cdr style))))) (check-style-for-images wikiname (cdr style)))))
#:key cdr)) #:key cdr))
(println unique-image-urls)
(for ([pair unique-image-urls]) (for ([pair unique-image-urls])
(query-exec slc "insert or ignore into image (wikiname, url, hash, ext, source, progress) values (?, ?, ?, NULL, 1, 0)" wikiname (car pair) (cdr pair))) (query-exec* "insert or ignore into image (wikiname, url, hash, ext, source, progress) values (?, ?, ?, NULL, 1, 0)" wikiname (car pair) (cdr pair)))
(query-exec slc "update wiki set progress = 3 where wikiname = ?" wikiname))) (query-exec* "update wiki set progress = 3 where wikiname = ?" wikiname)))
;; 4: From downloaded pages, record URLs of image sources and inline style images to database ;; 4: From downloaded pages, record URLs of image sources and inline style images to database
(define (hash->save-dir wikiname hash)
(build-path archive-root wikiname "images" (substring hash 0 1) (substring hash 0 2)))
(define (image-url->values i)
;; TODO: handle case where there is multiple cb parameter on minecraft wiki
;; TODO: ensure it still "works" with broken &amp; on minecraft wiki
(define no-cb (regexp-replace #rx"\\cb=[0-9]+&?" i "")) ; remove cb url parameter which does nothing
(define key (regexp-replace #rx"[&?]$" no-cb "")) ; remove extra separator if necessary
(define hash (sha1 (string->bytes/utf-8 key)))
(cons key hash))
(define (check-json-for-images wikiname path) (define (check-json-for-images wikiname path)
(define data (with-input-from-file path (λ () (read-json)))) (define data (with-input-from-file path (λ () (read-json))))
(define page (html->xexp (preprocess-html-wiki (jp "/parse/text" data)))) (define page (html->xexp (preprocess-html-wiki (jp "/parse/text" data))))
(define tree (update-tree-wiki page wikiname)) (define tree (update-tree-wiki page wikiname))
(remove-duplicates null
(for/list ([element (in-producer #;(remove-duplicates
(query-selector (for/list ([element (in-producer
(λ (t a c) (query-selector
(and (eq? t 'img) (λ (t a c)
(get-attribute 'src a))) (and (eq? t 'img)
tree) (get-attribute 'src a)))
#f)]) tree)
(image-url->values (get-attribute 'src (bits->attributes element)))))) #f)])
(image-url->values (get-attribute 'src (bits->attributes element))))))
;; 5. Download image sources and style images according to database ;; 5. Download image sources and style images according to database
(define (save-each-image wikiname source callback) (define (save-each-image wikiname callback)
(define source (hash-ref sources 'style)) ;; TODO: download entire wiki images instead?
;; gather list of basenames to download (that aren't yet complete) ;; gather list of basenames to download (that aren't yet complete)
(define rows (query-rows slc "select url, hash from image where wikiname = ? and source <= ? and progress < 1" (define rows (query-rows* "select url, hash from image where wikiname = ? and source <= ? and progress < 1"
wikiname source)) wikiname source))
;; counter of complete/incomplete basenames ;; counter of complete/incomplete basenames
(define already-done-count (define already-done-count
(query-value slc "select count(*) from image where wikiname = ? and source <= ? and progress = 1" (query-value* "select count(*) from image where wikiname = ? and source <= ? and progress = 1"
wikiname source)) wikiname source))
(define not-done-count (define not-done-count
(query-value slc "select count(*) from image where wikiname = ? and source <= ? and progress < 1" (query-value* "select count(*) from image where wikiname = ? and source <= ? and progress < 1"
wikiname source)) wikiname source))
;; set initial progress ;; set initial progress
(callback already-done-count (+ already-done-count not-done-count) "") (callback already-done-count (+ already-done-count not-done-count) "")
@ -269,26 +284,35 @@
(define url (vector-ref row 0)) (define url (vector-ref row 0))
(define hash (vector-ref row 1)) (define hash (vector-ref row 1))
;; check ;; check
(printf "~a -> ~a~n" url hash) #; (printf "~a -> ~a~n" url hash)
(define r (get url)) (define r (get url))
(define declared-type (response-headers-ref r 'content-type)) (define declared-type (response-headers-ref r 'content-type))
(define final-type (if (equal? declared-type #"application/octet-stream") (define final-type (if (equal? declared-type #"application/octet-stream")
(let ([sniff-entity (message-entity (mime-analyze (response-body r)))]) (let ([sniff-entity (message-entity (mime-analyze (response-body r)))])
(string->bytes/latin-1 (format "~a/~a" (entity-type sniff-entity) (entity-subtype sniff-entity)))) (string->bytes/latin-1 (format "~a/~a" (entity-type sniff-entity) (entity-subtype sniff-entity))))
declared-type)) declared-type))
(define ext (bytes->string/latin-1 (mime-type->ext final-type))) (define ext
(with-handlers ([exn:fail:contract? (λ _ (error 'save-each-image "no ext found for mime type `~a` in file ~a" final-type url))])
(bytes->string/latin-1 (mime-type->ext final-type))))
;; save ;; save
(define save-dir (hash->save-dir wikiname hash)) (define save-dir (hash->save-dir wikiname hash))
(make-directory* save-dir) (make-directory* save-dir)
(define save-path (build-path save-dir (string-append hash "." ext))) (define save-path (build-path save-dir (string-append hash "." ext)))
(define body (response-body r)) (define body (response-body r))
(display-to-file body save-path #:exists 'replace) (display-to-file body save-path #:exists 'replace)
(query-exec slc "update image set progress = 1, ext = ? where wikiname = ? and hash = ?" (query-exec* "update image set progress = 1, ext = ? where wikiname = ? and hash = ?"
ext wikiname hash) ext wikiname hash)
(callback (+ already-done-count i) (+ already-done-count not-done-count) (string-append hash "." ext))) (callback (+ already-done-count i) (+ already-done-count not-done-count) (string-append (substring hash 0 6) "..." ext)))
;; TODO: saved all images, register that fact in the database ;; saved all images, register that fact in the database
) (query-exec* "update wiki set progress = 4 where wikiname = ?" wikiname))
(define all-stages
(list
if-necessary-download-list-of-pages
save-each-page
if-necessary-download-and-check-styles
;; check-json-for-images
save-each-image))
(module+ test (module+ test
(check-equal? (html->xexp "<img src=\"https://example.com/images?src=Blah.jpg&amp;width=150\">") (check-equal? (html->xexp "<img src=\"https://example.com/images?src=Blah.jpg&amp;width=150\">")
@ -299,11 +323,13 @@
#;(do-step-3 "gallowmere") #;(do-step-3 "gallowmere")
#;(save-each-image "gallowmere" (hash-ref sources 'style) (λ (a b c) (printf "~a/~a ~a~n" a b c))) #;(save-each-image "gallowmere" (hash-ref sources 'style) (λ (a b c) (printf "~a/~a ~a~n" a b c)))
#;(for ([wikiname (query-list slc "select wikiname from wiki")]) #;(for ([wikiname (query-list* "select wikiname from wiki")])
(println wikiname) (println wikiname)
(insert-wiki-entry wikiname)) (insert-wiki-entry wikiname))
#;(for ([wikiname (query-list slc "select wikiname from wiki")]) #;(for ([wikiname (query-list* "select wikiname from wiki")])
(println wikiname) (println wikiname)
(do-step-3 wikiname) (do-step-3 wikiname)
(save-each-image wikiname (hash-ref sources 'style) (λ (a b c) (printf "~a/~a ~a~n" a b c))))) (save-each-image wikiname (hash-ref sources 'style) (λ (a b c) (printf "~a/~a ~a~n" a b c)))))
; (for ([stage all-stages]) (stage "create" (λ (a b c) (printf "~a/~a ~a~n" a b c))))