Compare commits

..

No commits in common. "cf74ffb0e26e257f03fc467f2f0d1846533ba0df" and "29007b0e29c5ecff6a7b6824086bb08c6758ac71" have entirely different histories.

13 changed files with 364 additions and 594 deletions

View file

@ -6,24 +6,24 @@
""
"Downloaded pages go into `archive/` next to the executable."
"Database goes into `archiver.db*` next to the executable."
"The database is necessary to store your download progress and resume where you left off if the process is interrupted."))
"The database is necessary to store your download progress and resume where you left off if the process is interrupted.")
(ps ""
"Default output style is `progress` in a tty and `lines` otherwise."))
(flag (output-quiet?)
("-q" "--output-quiet" "disable progress output")
(output-quiet? #t))
(flag (output-progress?)
("-p" "--output-progress" "progress output for terminals (default in a tty)")
(output-progress? #t))
(flag (output-lines?)
("-l" "--output-lines" "output the name of each file downloaded (default outside of a tty)")
("-l" "--output-lines" "output the name of each file downloaded")
(output-lines? #t))
(flag (output-progress?)
("-p" "--output-progress" "progress output for terminals")
(output-progress? #t))
(constraint (one-of output-quiet? output-lines? output-progress?))
(program
(start [wikiname "wikiname to download"])
;; set up arguments
@ -42,27 +42,30 @@
;; check
(when (or (not wikiname) (equal? wikiname ""))
(raise-user-error "Please specify the wikiname to download on the command line."))
;; progress reporting based on selected mode
(define (report-progress a b c)
(define basename (basename->name-for-query c))
(cond
[(output-lines?)
(displayln basename)]
[(output-progress?)
(when (eq? (modulo a 20) 0)
(thread (λ () (update-width))))
(define prefix (format "[~a] [~a/~a] " wikiname a b))
(define rest (- width (string-length prefix)))
(define real-width (min (string-length basename) rest))
(define spare-width (- rest real-width))
(define name-display (substring basename 0 real-width))
(define whitespace (make-string spare-width #\ ))
(printf "~a~a~a\r" prefix name-display whitespace)]))
;; download all stages
(for ([stage all-stages]
[i (in-naturals 1)])
(printf "> Stage ~a/~a~n" i (length all-stages))
(stage wikiname report-progress)
(displayln "")))
;; stage 1
(cond [(output-lines?) (displayln "Downloading list of pages...")]
[(output-progress?) (printf "Downloading list of pages... \r")])
(if-necessary-download-list-of-pages
wikiname
(λ (a b c)
(cond [(output-progress?) (printf "Downloading list of pages... [~a/~b]\r" a b)])))
;; stage 2
(save-each-page
wikiname
(λ (a b c)
(define basename (basename->name-for-query c))
(cond
[(output-lines?)
(displayln basename)]
[(output-progress?)
(when (eq? (modulo a 20) 0)
(thread (λ () (update-width))))
(define prefix (format "[~a/~a] " a b))
(define rest (- width (string-length prefix)))
(define real-width (min (string-length basename) rest))
(define spare-width (- rest real-width))
(define name-display (substring basename 0 real-width))
(define whitespace (make-string spare-width #\ ))
(printf "~a~a~a\r" prefix name-display whitespace)]))))
(run start)

View file

@ -1,7 +1,7 @@
#lang racket/base
(require racket/file
racket/list
(require racket/list
racket/path
racket/runtime-path
racket/string
json
json-pointer
@ -9,16 +9,9 @@
"../lib/syntax.rkt")
(provide
get-slc
query-exec*
query-rows*
query-list*
query-value*
query-maybe-value*
query-maybe-row*)
slc)
(define storage-path (anytime-path ".." "storage"))
(define database-file (build-path storage-path "archiver.db"))
(define-runtime-path database-file "../storage/archiver.db")
(define migrations
(wrap-sql
@ -32,50 +25,23 @@
(query-exec slc "alter table wiki add column license_text TEXT")
(query-exec slc "alter table wiki add column license_url TEXT"))))
(define slc (box #f))
(define (get-slc)
(define slc* (unbox slc))
(cond
[slc* slc*]
[else
(make-directory* storage-path)
(define slc* (sqlite3-connect #:database database-file #:mode 'create))
(query-exec slc* "PRAGMA journal_mode=WAL")
(define database-version
(with-handlers ([exn:fail:sql?
(λ (exn)
; need to set up the database
(query-exec slc* "create table database_version (version integer, primary key (version))")
(query-exec slc* "insert into database_version values (0)")
0)])
(query-value slc* "select version from database_version")))
(define slc (sqlite3-connect #:database database-file #:mode 'create))
(query-exec slc "PRAGMA journal_mode=WAL")
(define database-version
(with-handlers ([exn:fail:sql?
(λ (exn)
; need to set up the database
(query-exec slc "create table database_version (version integer, primary key (version))")
(query-exec slc "insert into database_version values (0)")
0)])
(query-value slc "select version from database_version")))
(let do-migrate-step ()
(when (database-version . < . (length migrations))
(call-with-transaction
slc*
(list-ref migrations database-version))
(set! database-version (add1 database-version))
(query-exec slc* "update database_version set version = $1" database-version)
(do-migrate-step)))
(let do-migrate-step ()
(when (database-version . < . (length migrations))
(call-with-transaction
slc
(list-ref migrations database-version))
(set! database-version (add1 database-version))
(query-exec slc "update database_version set version = $1" database-version)
(do-migrate-step)))
(set-box! slc slc*)
slc*]))
(define (query-exec* . args)
(apply query-exec (get-slc) args))
(define (query-rows* . args)
(apply query-rows (get-slc) args))
(define (query-list* . args)
(apply query-list (get-slc) args))
(define (query-value* . args)
(apply query-value (get-slc) args))
(define (query-maybe-value* . args)
(apply query-maybe-value (get-slc) args))
(define (query-maybe-row* . args)
(apply query-maybe-row (get-slc) args))

View file

@ -1,128 +1,89 @@
#lang racket/base
(require racket/class
racket/draw
racket/format
racket/list
racket/port
racket/set
racket/splicing
racket/string
db
net/http-easy
memo
(only-in racket/gui timer%)
racket/gui/easy
racket/gui/easy/operator
(only-in pict bitmap)
images/icons/style
images/icons/control
images/icons/stickman
images/icons/symbol
"archiver-database.rkt"
"archiver.rkt"
"../lib/url-utils.rkt"
"../lib/xexpr-utils.rkt")
(default-icon-material rubber-icon-material)
(require (for-syntax racket/base racket/match racket/set racket/string))
(define-syntax (@> stx)
(define form (cdr (syntax->datum stx)))
(match form
[(list form) ; (@> (fn @obs))
;; identify the observables and replace with non-@ symbols
(define collection (mutable-set))
(define updated
(let loop ([sexp form])
(cond [(symbol? sexp)
(let ([as-s (symbol->string sexp)])
(if (string-prefix? as-s "@")
(let ([without-@ (string->symbol (substring as-s 1))])
(set-add! collection (cons sexp without-@))
without-@)
sexp))]
[(pair? sexp) (cons (loop (car sexp)) (loop (cdr sexp)))]
[#t sexp])))
(define collection-l (set->list collection))
;; return obs-combine -> updated-form
(datum->syntax stx `(obs-combine (λ (,@(map cdr collection-l)) ,updated) ,@(map car collection-l)))]
[(list (? string? str) args ...) ; (@> "Blah: ~a/~a" @arg1 arg2)
;; identify the observables and replace with non-@ symbols
(define collection-l
(for/list ([arg args])
(if (symbol? arg)
(let ([as-s (symbol->string arg)])
(if (string-prefix? as-s "@")
(let ([without-@ (string->symbol (substring as-s 1))])
(cons arg without-@))
(cons #f arg)))
(cons #f arg))))
(define collection-lo (filter car collection-l))
;; return obs-combine -> format
(datum->syntax stx `(obs-combine (λ (,@(map cdr collection-lo)) (format ,str ,@(map cdr collection-l))) ,@(map car collection-lo)))]))
(define active-threads (mutable-seteq))
(define/obs @auto-retry #f)
(define/obs @wikiname "")
(define/obs @state 'waiting)
(define/obs @num-pages 1)
(define/obs @done-pages 0)
(define/obs @just-done "")
(define/obs @queue '())
(define @title
(obs-combine
(λ (state queue num-pages done-pages)
(define suffix (if (pair? queue)
(format " +~a" (length queue))
""))
(define progress (if (eq? num-pages 0)
" 0%"
(format " ~a%" (round (inexact->exact (* (/ done-pages num-pages) 100))))))
(case state
[(waiting stage-0) (format "Fandom Archiver~a" suffix)]
[(stage-1) (format "Fandom Archiver 0%~a" suffix)]
[(stage-2) (format "Fandom Archiver~a~a" progress suffix)]
[(err) "ERROR Fandom Archiver"]
[(done) "Fandom Archiver 100%"]))
@state @queue @num-pages (obs-throttle @done-pages #:duration 5000)))
(define-struct qi^ (wikiname st stage progress max-progress eta th) #:transparent) ;; queue item
(define-syntax-rule (t body ...)
(set-add! active-threads (thread (λ () body ...))))
(define rows (query-rows* "select wikiname, progress from wiki where progress < 4"))
(define/obs @queue null)
(define (add-wikiname-to-queue wikiname st stage)
(@queue . <~ . (λ (queue)
(define already-exists? (findf (λ (qi) (equal? (qi^-wikiname qi) wikiname)) queue))
(if already-exists?
queue
(append queue (list (qi^ wikiname st stage 0 1 "..." #f)))))))
(for ([row rows])
(add-wikiname-to-queue (vector-ref row 0)
(if (= (vector-ref row 1) 4)
'complete
'queued)
(vector-ref row 1)))
(define (do-start-or-queue)
(define wikiname (obs-peek @wikiname))
(:= @wikiname "")
(when (not (equal? (string-trim wikiname) ""))
(@queue . <~ . (λ (q) (append q (list wikiname))))
(shift-queue-maybe)))
(define status-icon-size 32)
(define status-icon-min-width 36)
(define button-icon-size 12)
(define (shift-queue-maybe)
(when (memq (obs-peek @state) '(waiting done))
(define q (obs-peek @queue))
(cond
[(pair? q)
(define wikiname (car q))
(:= @queue (cdr q))
(do-start-stage1 wikiname)]
[#t (:= @state 'done)])))
(define color-green (make-color 90 212 68))
(define (do-start-stage1 wikiname)
(:= @just-done "")
(:= @done-pages 0)
(:= @num-pages 1)
(t (with-handlers ([exn:fail? (handle-graphical-exn wikiname)])
(:= @state 'stage-0)
(if-necessary-download-list-of-pages wikiname (λ (now-done num-pages just-done-name)
(:= @num-pages num-pages)
(:= @done-pages now-done)
(:= @just-done just-done-name)
(:= @state 'stage-1)))
(do-start-stage2 wikiname))))
(define/obs @input "")
(splicing-let ([frame-count 30])
(define stickman-frames
(for/vector ([s (in-range 0 1 (/ 1 frame-count))])
(running-stickman-icon
s
#:height status-icon-size
#:material (default-icon-material))))
(define/obs @stick-frame-no 0)
(define stick-timer
(new timer%
[notify-callback (λ () (@stick-frame-no . <~ . add1))]
[interval (truncate (/ 1000 frame-count))]))
(define/obs @stick
(@stick-frame-no . ~> . (λ (n) (vector-ref stickman-frames
(modulo n (vector-length stickman-frames)))))))
(define status-icons
(hasheq 'queued (stop-icon #:color syntax-icon-color #:height status-icon-size)
'paused (continue-forward-icon #:color syntax-icon-color #:height status-icon-size)
'running @stick
'error (x-icon #:height status-icon-size)
'complete (check-icon #:color color-green #:height status-icon-size)))
(define action-icons
(hasheq 'pause (pause-icon #:color syntax-icon-color #:height button-icon-size)
'resume (play-icon #:color color-green #:height button-icon-size)))
(define (bitmap-view @the-bitmap [min-width 1])
(pict-canvas #:min-size (@> (list (max min-width (send @the-bitmap get-width)) (send @the-bitmap get-height))) #;(if min-size (list min-size min-size) #f)
#:stretch '(#f #f)
#:style '(transparent)
@the-bitmap
bitmap))
(define (do-start-stage2 wikiname)
(:= @just-done "")
(:= @num-pages 1)
(:= @done-pages 0)
(t (with-handlers ([exn:fail? (handle-graphical-exn wikiname)])
(save-each-page wikiname (λ (now-done num-pages just-done-path)
(:= @num-pages num-pages)
(:= @done-pages now-done)
(:= @just-done just-done-path)))
(:= @state 'waiting)
(shift-queue-maybe)))
(:= @state 'stage-2))
(define (exn->string e)
(with-output-to-string
@ -135,15 +96,13 @@
(printf ": ~a" (car item)))
(displayln "")))))
(define ((handle-graphical-exn @qi) e)
(define ((handle-graphical-exn wikiname) e)
(displayln (exn->string e) (current-error-port))
(cond
[(obs-peek @auto-retry)
(void) ;; TODO
#;(do-retry-end wikiname)]
(do-retry-end wikiname)]
[#t
(update-qi @qi [st 'error])
(do-try-unpause-next-entry)
(:= @state 'err)
(thread
(λ ()
(define/obs @visible? #t)
@ -157,177 +116,89 @@
(input #:style '(multiple hscroll)
#:min-size '(#f 200)
(exn->string e))
;; TODO
#;(button "Retry Now" (λ () (:= @visible? #f) (do-retry-now wikiname)))
#;(button "Retry Round-Robin" (λ () (:= @visible? #f) (do-retry-end wikiname)))
#;(button "Skip Wiki" (λ () (:= @visible? #f) (do-continue)))
#;(button "Use Auto-Retry" (λ ()
(:= @auto-retry #t)
(:= @visible? #f)
(do-retry-end wikiname)))
#;(text "Be careful not to auto-retry an infinite loop!")))
(button "Retry Now" (λ () (:= @visible? #f) (do-retry-now wikiname)))
(button "Retry Round-Robin" (λ () (:= @visible? #f) (do-retry-end wikiname)))
(button "Skip Wiki" (λ () (:= @visible? #f) (do-continue)))
(button "Use Auto-Retry" (λ ()
(:= @auto-retry #t)
(:= @visible? #f)
(do-retry-end wikiname)))
(text "Be careful not to auto-retry an infinite loop!")))
main-window)))
(sleep)
; make sure the broken thread is gone
(define th (qi^-th (obs-peek @qi)))
(when th (kill-thread th))]))
; make sure the old broken threads are all gone
(for ([th active-threads]) (kill-thread th))
(set-clear! active-threads)]))
(define segments
(list
(list 5/100 (make-color 0 223 217))
(list 88/100 color-green)
(list 2/100 (make-color 0 223 217))
(list 5/100 color-green)))
(define segment-spacing 2)
(unless (= (apply + (map car segments)) 1)
(error 'segments "segments add up to ~a, not 1" (apply + (map car segments))))
(define (do-retry-now wikiname)
(@queue . <~ . (λ (q) (append (list wikiname) q)))
(:= @state 'waiting)
(shift-queue-maybe))
;; return the new bitmap, which can be drawn on a dc<%>
(define/memoize (ray-trace width height stage progress max-progress)
;; (printf "rendering ~a ~a/~a at ~a~n" stage progress max-progress (current-inexact-milliseconds))
(define bm (make-object bitmap% width height #f #t))
(define dc (make-object bitmap-dc% bm))
(define width-available (- width (* (length segments) segment-spacing)))
(send dc set-smoothing 'unsmoothed)
(send dc set-pen "black" 0 'transparent)
(for/fold ([offset 0])
([segment segments]
[i (in-naturals 0)]) ;; zero indexed stages?
;; calculate start and end locations of grey bar
(define-values (segment-proportion segment-color) (apply values segment))
(define segment-start (if (= offset 0) 0 (+ offset segment-spacing)))
(define segment-width (* width-available segment-proportion))
;; draw grey bar
(send dc set-brush (make-color 180 180 180 0.4) 'solid)
(send dc draw-rectangle segment-start 0 segment-width height)
;; draw solid bar according to the current item's progress
(define proportion
(cond [(stage . < . i) 0]
[(stage . > . i) 1]
[(max-progress . <= . 0) 0]
[(progress . < . 0) 0]
[(progress . >= . max-progress) 1]
[else (progress . / . max-progress)]))
(send dc set-brush segment-color 'solid)
(send dc draw-rectangle segment-start 0 (* proportion segment-width) height)
(+ segment-start segment-width))
(bitmap-render-icon bm 6/8))
(define (do-retry-end wikiname)
(@queue . <~ . (λ (q) (append q (list wikiname))))
(:= @state 'waiting)
(shift-queue-maybe))
;; get ray traced bitmap (possibly from cache) and draw on dc<%>
(define (draw-bar orig-dc qi)
;; (println ray-traced)
(define-values (width height) (send orig-dc get-size))
(send orig-dc draw-bitmap (ray-trace width height (qi^-stage qi) (qi^-progress qi) (qi^-max-progress qi)) 0 0))
(define (do-continue)
(:= @state 'waiting)
(shift-queue-maybe))
(define ((make-progress-updater @qi) a b c)
;; (printf "~a: ~a/~a ~a~n" (qi^-wikiname (obs-peek @qi)) a b c)
(update-qi @qi [progress a] [max-progress b]))
(define (do-add-to-queue)
(define wikiname (string-trim (obs-peek @input)))
(when ((string-length wikiname) . > . 0)
(add-wikiname-to-queue wikiname 'queued 0)) ;; TODO: automatically start?
(:= @input ""))
(define-syntax-rule (update-qi @qi args ...)
(let ([wikiname (qi^-wikiname (obs-peek @qi))])
(@queue . <~ . (λ (queue)
(for/list ([qi queue])
(if (equal? (qi^-wikiname qi) wikiname)
(struct-copy qi^ qi args ...)
qi))))))
(define (do-start-qi @qi)
(define th
(thread (λ ()
(with-handlers ([exn? (handle-graphical-exn @qi)])
(define last-stage
(for/last ([stage all-stages]
[i (in-naturals)])
(update-qi @qi [stage i])
(stage (qi^-wikiname (obs-peek @qi)) (make-progress-updater @qi))
i))
(update-qi @qi [st 'complete] [stage (add1 last-stage)])
(do-try-unpause-next-entry)))))
(update-qi @qi [st 'running] [th th]))
(define (do-stop-qi @qi)
(define th (qi^-th (obs-peek @qi)))
(when th (kill-thread th))
(update-qi @qi [th #f] [st 'paused]))
(define (do-try-unpause-next-entry)
(define queue (obs-peek @queue))
(define next-qi (for/first ([qi queue]
#:when (memq (qi^-st qi) '(paused queued error)))
qi))
(when next-qi
(define @qi (@queue . ~> . (λ (queue) (findf (λ (qi) (equal? (qi^-wikiname qi) (qi^-wikiname next-qi))) queue))))
(do-start-qi @qi)))
(define (display-basename basename)
(define limit 40)
(cond [(string? basename)
(define query (basename->name-for-query basename))
(define segments (string-split query "/"))
(when (and ((string-length query) . > . limit) ((length segments) . >= . 2))
(set! query (string-append ".../" (last segments))))
(when ((string-length query) . > . limit)
(set! query (string-append (substring query 0 (- limit 3)) "...")))
query]
[#t "?"]))
(define main-window
(render
(window
#:title "Fandom Archiver"
#:size '(400 300)
#:mixin (λ (%) (class %
(super-new)
(define/augment (on-close)
(send stick-timer stop)
(for ([qi (obs-peek @queue)])
(when (qi^-th qi)
(kill-thread (qi^-th qi))))
#;(disconnect*))))
(vpanel
#:spacing 10
#:margin '(5 5)
(hpanel
#:stretch '(#t #f)
#:spacing 10
(hpanel
(text "https://")
(input @input
(λ (event data) (cond
[(eq? event 'input) (:= @input data)]
[(eq? event 'return) (do-add-to-queue)])))
(text ".fandom.com"))
(button "Download Wiki" do-add-to-queue))
(list-view
#:style '(vertical)
@queue
#:key qi^-wikiname
(λ (k @qi)
(define @status-icons
(@> (case (qi^-st @qi)
[(running) @stick]
[else (hash-ref status-icons (qi^-st @qi))])))
(define @is-running?
(@> (memq (qi^-st @qi) '(running))))
;; state icon at the left side
(hpanel #:stretch '(#t #f)
#:alignment '(left center)
#:spacing 8
(bitmap-view @status-icons status-icon-min-width)
(vpanel
;; name and buttons (top half)
(hpanel #:alignment '(left bottom)
(text (@> (qi^-wikiname @qi)))
(spacer)
(hpanel
#:stretch '(#f #f)
(if-view @is-running?
(button (hash-ref action-icons 'pause)
(λ () (do-stop-qi @qi)))
(button (hash-ref action-icons 'resume)
(λ () (do-start-qi @qi))))))
;; progress bar (bottom half)
(hpanel
(canvas
@qi
#:style '(transparent)
#:margin '(3 3)
draw-bar)
(hpanel #:min-size '(68 #f)
#:stretch '(#f #f)
#:alignment '(right center)
(text (@> (format "eta ~a" (qi^-eta @qi))))))))))))))
(window #:title @title
#:size '(360 200)
#:mixin (λ (%) (class %
(super-new)
(define/augment (on-close)
(for ([th active-threads]) (kill-thread th))
(disconnect slc))))
;; input box at the top
(hpanel (text "https://")
(input @wikiname
(λ (event data) (cond
[(eq? event 'input) (:= @wikiname data)]
[(eq? event 'return) (do-start-or-queue)])))
(text ".fandom.com"))
(button (@queue . ~> . (λ (q) (if (null? q) "Start" "Queue"))) (λ () (do-start-or-queue)))
(text (@queue . ~> . (λ (q) (if (null? q) "" (string-join #:before-first "Queue: " q ", ")))))
;; show status based on overall application state
(case-view
@state
;; waiting for wikiname entry
((waiting) (vpanel
(text "Fill in the wikiname and click start.")))
((stage-0) (vpanel
(text "Checking data...")))
((stage-1) (vpanel
(text "Gathering list of pages...")
(text (@just-done . ~> . display-basename))
(text (@done-pages . ~> . (λ (x) (if (eq? x 0)
"0/?"
(format "~a/~a" x (obs-peek @num-pages))))))))
;; downloading contents
((stage-2) (vpanel
(text "Downloading page text...")
(progress @done-pages #:range @num-pages)
(text (@done-pages . ~> . (λ (x) (format "~a/~a" x (obs-peek @num-pages)))))
(text (@just-done . ~> . display-basename))))
((done) (vpanel
(text "All wikis downloaded!")))
((err) (vpanel
(text "Error. Check the popup window.")))
(else (text (@state . ~> . (λ (state) (format "invalid state: ~a" state))))))
(checkbox #:label "Auto-retry on error? (Dangerous)"
#:checked? @auto-retry
(λ:= @auto-retry)))))

View file

@ -2,35 +2,38 @@
(require racket/file
racket/function
racket/list
racket/path
racket/sequence
racket/runtime-path
racket/string
net/url
net/mime
file/sha1
net/http-easy
db
"../lib/html-parsing/main.rkt"
json
"archiver-database.rkt"
"../lib/html-parsing/main.rkt"
"../lib/mime-types.rkt"
"../lib/syntax.rkt"
"../lib/tree-updater.rkt"
"../lib/url-utils.rkt"
"../lib/xexpr-utils.rkt"
"../lib/archive-file-mappings.rkt")
(define archive-slc slc)
(provide
if-necessary-download-list-of-pages
download-list-of-pages
save-each-page
basename->name-for-query
image-url->values
hash->save-dir
all-stages)
archive-slc)
(module+ test
(require rackunit))
(define archive-root (anytime-path ".." "storage/archive"))
(make-directory* archive-root)
(define-runtime-path archive-root "../storage/archive")
#;(define archive-root "archive")
(define sources '#hasheq((style . 1) (page . 2)))
@ -43,34 +46,127 @@
wikiname
(params->query '(("action" . "query")
("meta" . "siteinfo")
("siprop" . "general|rightsinfo|statistics")
("siprop" . "general|rightsinfo")
("format" . "json")
("formatversion" . "2")))))
(define data (response-json (get dest-url)))
(define exists? (query-maybe-value* "select progress from wiki where wikiname = ?" wikiname))
(if (and exists? (not (sql-null? exists?)))
(query-exec* "update wiki set sitename = ?, basepage = ?, license_text = ?, license_url = ? where wikiname = ?"
(define exists? (query-maybe-value slc "select progress from wiki where wikiname = ?" wikiname))
(if exists?
(query-exec slc "update wiki set sitename = ?, basepage = ?, license_text = ?, license_url = ? where wikiname = ?"
(jp "/query/general/sitename" data)
(second (regexp-match #rx"/wiki/(.*)" (jp "/query/general/base" data)))
(jp "/query/rightsinfo/text" data)
(jp "/query/rightsinfo/url" data)
wikiname)
(query-exec* "insert into wiki (wikiname, progress, sitename, basepage, license_text, license_url) values (?, 0, ?, ?, ?, ?)"
(query-exec slc "insert into wiki (wikiname, progress, sitename, basepage, license_text, license_url) values (?, 1, ?, ?, ?, ?)"
wikiname
(jp "/query/general/sitename" data)
(second (regexp-match #rx"/wiki/(.*)" (jp "/query/general/base" data)))
(jp "/query/rightsinfo/text" data)
(jp "/query/rightsinfo/url" data)))
(jp "/query/statistics/articles" data))
(jp "/query/rightsinfo/url" data))))
;; call 1 if not yet done for that wiki
(define (if-necessary-download-list-of-pages wikiname callback)
(define wiki-progress (query-maybe-value slc "select progress from wiki where wikiname = ?" wikiname))
;; done yet?
(unless (and (real? wiki-progress) (wiki-progress . >= . 1))
;; count total pages
(define dest-url
(format "https://~a.fandom.com/api.php?~a"
wikiname
(params->query `(("action" . "query") ("meta" . "siteinfo") ("siprop" . "statistics") ("format" . "json")))))
(define num-pages (jp "/query/statistics/articles" (response-json (get dest-url))))
(download-list-of-pages wikiname callback 0 num-pages #f)))
;; 1. Download list of wiki pages and store in database
(define (download-list-of-pages wikiname callback total-so-far grand-total path-with-namefrom)
(define url (if path-with-namefrom
(format "https://~a.fandom.com~a" wikiname path-with-namefrom)
(format "https://~a.fandom.com/wiki/Local_Sitemap" wikiname)))
(define r (get url))
(define page (html->xexp (bytes->string/utf-8 (response-body r))))
(define link-namefrom
((query-selector (λ (t a c x) (and (eq? t 'a)
(pair? x)
(string-contains? (car x) "Next page")
(let ([href (get-attribute 'href a)] )
(and href (string-contains? href "/wiki/Local_Sitemap")))))
page #:include-text? #t)))
(define row-values
(for/list ([link (in-producer
(query-selector
(λ (t a c) (eq? t 'a))
((query-selector (λ (t a c) (has-class? "mw-allpages-chunk" a)) page)))
#f)])
(list wikiname (local-encoded-url->basename (get-attribute 'href (bits->attributes link))) 0)))
(define query-template (string-join (make-list (length row-values) "(?, ?, ?)") ", " #:before-first "insert or ignore into page (wikiname, basename, progress) values "))
(apply query-exec slc query-template (flatten row-values))
(define new-total (+ (length row-values) total-so-far))
(callback new-total grand-total (second (last row-values)))
(cond
[link-namefrom ; repeat on the next page
(download-list-of-pages wikiname callback new-total grand-total (get-attribute 'href (bits->attributes link-namefrom)))]
[#t ; all done downloading sitemap
(insert-wiki-entry wikiname)]))
;; 2. Download each page via API and:
;; * Save API response to file
(define max-page-progress 1)
(define (save-each-page wikiname callback)
;; prepare destination folder
(define save-dir (build-path archive-root wikiname))
(make-directory* save-dir)
;; gather list of basenames to download (that aren't yet complete)
(define basenames (query-list slc "select basename from page where wikiname = ? and progress < ?"
wikiname max-page-progress))
;; counter of complete/incomplete basenames
(define already-done-count
(query-value slc "select count(*) from page where wikiname = ? and progress = ?"
wikiname max-page-progress))
(define not-done-count
(query-value slc "select count(*) from page where wikiname = ? and progress < ?"
wikiname max-page-progress))
;; set initial progress
(callback already-done-count (+ already-done-count not-done-count) "")
;; loop through basenames and download
(for ([basename basenames]
[i (in-naturals 1)])
(define name-for-query (basename->name-for-query basename))
(define dest-url
(format "https://~a.fandom.com/api.php?~a"
wikiname
(params->query `(("action" . "parse")
("page" . ,name-for-query)
("prop" . "text|headhtml|langlinks")
("formatversion" . "2")
("format" . "json")))))
(define r (get dest-url))
(define body (response-body r))
(define filename (string-append basename ".json"))
(define save-path
(cond [((string-length basename) . > . 240)
(define key (sha1 (string->bytes/latin-1 basename)))
(query-exec slc "insert into special_page (wikiname, key, basename) values (?, ?, ?)"
wikiname key basename)
(build-path save-dir (string-append key ".json"))]
[#t
(build-path save-dir (string-append basename ".json"))]))
(display-to-file body save-path #:exists 'replace)
(query-exec slc "update page set progress = 1 where wikiname = ? and basename = ?"
wikiname basename)
(callback (+ already-done-count i) (+ already-done-count not-done-count) basename))
;; saved all pages, register that fact in the database
(query-exec slc "update wiki set progress = 2 where wikiname = ?" wikiname))
;; 3. Download CSS and:
;; * Save CSS to file
;; * Record style images to database
(define (check-style-for-images wikiname path)
(define content (file->string path))
(define urls (regexp-match* #rx"url\\(\"?'?([^)]*)'?\"?\\)" content #:match-select cadr))
(for/list ([url urls]
#:when (not (or (equal? url "")
(equal? url "'")
(string-suffix? url "\"")
(string-contains? url "/resources-ucp/")
(string-contains? url "/fonts/")
(string-contains? url "/drm_fonts/")
@ -88,7 +184,7 @@
[(string-prefix? url "/") (format "https://~a.fandom.com~a" wikiname url)]
[else (raise-user-error "While calling check-style-for-images, this URL had an unknown format and couldn't be saved:" url path)])))
(define (download-styles-for-wiki wikiname callback)
(define (download-styles-for-wiki wikiname)
(define save-dir (build-path archive-root wikiname "styles"))
(make-directory* save-dir)
(define theme (λ (theme-name)
@ -102,137 +198,18 @@
(theme "dark")
(cons (format "https://~a.fandom.com/load.php?lang=en&modules=skin.fandomdesktop.styles%7Cext.fandom.PortableInfoboxFandomDesktop.css%7Cext.fandom.GlobalComponents.CommunityHeaderBackground.css%7Cext.gadget.site-styles%2Csound-styles%7Csite.styles&only=styles&skin=fandomdesktop" wikiname)
(build-path save-dir "site.css"))))
(for ([style styles]
[i (in-naturals)])
(callback i (length styles) "styles...")
(for ([style styles])
(define r (get (car style)))
(define body (response-body r))
(display-to-file body (cdr style) #:exists 'replace)
;; XXX: how the HELL do I deal with @import?? would need some kind of recursion here. how will the page server know where to look up the style file to be able to serve them again? do I add another link-stylesheet tag to the main page? what about the remaining stuck @import url?
)
(callback (length styles) (length styles) "styles...")
styles)
(define (hash->save-dir wikiname hash)
(build-path archive-root wikiname "images" (substring hash 0 1) (substring hash 0 2)))
(define (image-url->values i)
;; TODO: handle case where there is multiple broken cb parameter on minecraft wiki
;; TODO: ensure it still "works" with broken &amp; on minecraft wiki
(define no-cb (regexp-replace #rx"\\cb=[0-9]+&?" i "")) ; remove cb url parameter which does nothing
(define key (regexp-replace #rx"[&?]$" no-cb "")) ; remove extra separator if necessary
(define hash (sha1 (string->bytes/utf-8 key)))
(cons key hash))
;; 1. Download list of wiki pages and store in database, if not done yet for that wiki
(define (if-necessary-download-list-of-pages wikiname callback)
(define wiki-progress (query-maybe-value* "select progress from wiki where wikiname = ?" wikiname))
;; done yet?
(unless (and (real? wiki-progress) (wiki-progress . >= . 1))
;; Count total pages
(define num-pages (insert-wiki-entry wikiname))
;; Download the entire index of pages
(define basenames
(let loop ([path-with-namefrom "/wiki/Local_Sitemap"]
[basenames-previous-pages null])
;; Download the current index page
(define url (format "https://~a.fandom.com~a" wikiname path-with-namefrom))
(define r (get url))
;; Metadata from this page (the link to the next page)
(define page (html->xexp (bytes->string/utf-8 (response-body r))))
(define link-namefrom
((query-selector (λ (t a c x) (and (eq? t 'a)
(pair? x)
(string-contains? (car x) "Next page")
(let ([href (get-attribute 'href a)] )
(and href (string-contains? href "/wiki/Local_Sitemap")))))
page #:include-text? #t)))
;; Content from this page
(define basenames-this-page
(for/list ([link (in-producer
(query-selector
(λ (t a c) (eq? t 'a))
((query-selector (λ (t a c) (has-class? "mw-allpages-chunk" a)) page)))
#f)])
(local-encoded-url->basename (get-attribute 'href (bits->attributes link)))))
;; Call the progress callback
(define all-basenames (append basenames-previous-pages basenames-this-page))
(callback (length all-basenames) num-pages (last all-basenames))
;; Recurse to download from the next page
(if link-namefrom
(loop (get-attribute 'href (bits->attributes link-namefrom)) all-basenames)
all-basenames)))
;; Save those pages into the database
;; SQLite can have a maximum of 32766 parameters in a single query
(for ([slice (in-slice 32760 basenames)])
(define query-template (string-join (make-list (length slice) "(?1, ?, 0)") ", " #:before-first "insert or ignore into page (wikiname, basename, progress) values "))
(call-with-transaction
(get-slc)
(λ ()
(apply query-exec* query-template wikiname slice)
;; Record that we have the complete list of pages
(query-exec* "update wiki set progress = 1 where wikiname = ?" wikiname))))))
;; 2. Download each page via API and:
;; * Save API response to file
(define max-page-progress 1)
(define (save-each-page wikiname callback)
;; prepare destination folder
(define save-dir (build-path archive-root wikiname))
(make-directory* save-dir)
;; gather list of basenames to download (that aren't yet complete)
(define basenames (query-list* "select basename from page where wikiname = ? and progress < ?"
wikiname max-page-progress))
;; counter of complete/incomplete basenames
(define already-done-count
(query-value* "select count(*) from page where wikiname = ? and progress = ?"
wikiname max-page-progress))
(define not-done-count
(query-value* "select count(*) from page where wikiname = ? and progress < ?"
wikiname max-page-progress))
(define total-count (+ already-done-count not-done-count))
;; set initial progress
(callback already-done-count total-count "")
;; loop through basenames and download
(for ([basename basenames]
[i (in-naturals (add1 already-done-count))])
(define name-for-query (basename->name-for-query basename))
(define dest-url
(format "https://~a.fandom.com/api.php?~a"
wikiname
(params->query `(("action" . "parse")
("page" . ,name-for-query)
("prop" . "text|headhtml|langlinks")
("formatversion" . "2")
("format" . "json")))))
(define r (get dest-url))
(define body (response-body r))
(define filename (string-append basename ".json"))
(define save-path
(cond [((string-length basename) . > . 240)
(define key (sha1 (string->bytes/latin-1 basename)))
(query-exec* "insert into special_page (wikiname, key, basename) values (?, ?, ?)"
wikiname key basename)
(build-path save-dir (string-append key ".json"))]
[#t
(build-path save-dir (string-append basename ".json"))]))
(display-to-file body save-path #:exists 'replace)
(query-exec* "update page set progress = 1 where wikiname = ? and basename = ?"
wikiname basename)
(callback i total-count basename))
;; saved all pages, register that fact in the database
(query-exec* "update wiki set progress = 2 where wikiname = ?" wikiname))
;; 3. Download CSS and:
;; * Save CSS to file
;; * Record style images to database
(define (if-necessary-download-and-check-styles wikiname callback)
(define wiki-progress (query-maybe-value* "select progress from wiki where wikiname = ?" wikiname))
(define (do-step-3 wikiname)
(define wiki-progress (query-maybe-value slc "select progress from wiki where wikiname = ?" wikiname))
(unless (and (number? wiki-progress) (wiki-progress . >= . 3))
(define styles (download-styles-for-wiki wikiname callback))
(define styles (download-styles-for-wiki wikiname))
(define unique-image-urls
(remove-duplicates
(map image-url->values
@ -240,40 +217,48 @@
(for/list ([style styles])
(check-style-for-images wikiname (cdr style)))))
#:key cdr))
(println unique-image-urls)
(for ([pair unique-image-urls])
(query-exec* "insert or ignore into image (wikiname, url, hash, ext, source, progress) values (?, ?, ?, NULL, 1, 0)" wikiname (car pair) (cdr pair)))
(query-exec* "update wiki set progress = 3 where wikiname = ?" wikiname)))
(query-exec slc "insert or ignore into image (wikiname, url, hash, ext, source, progress) values (?, ?, ?, NULL, 1, 0)" wikiname (car pair) (cdr pair)))
(query-exec slc "update wiki set progress = 3 where wikiname = ?" wikiname)))
;; 4: From downloaded pages, record URLs of image sources and inline style images to database
(define (hash->save-dir wikiname hash)
(build-path archive-root wikiname "images" (substring hash 0 1) (substring hash 0 2)))
(define (image-url->values i)
;; TODO: handle case where there is multiple cb parameter on minecraft wiki
;; TODO: ensure it still "works" with broken &amp; on minecraft wiki
(define no-cb (regexp-replace #rx"\\cb=[0-9]+&?" i "")) ; remove cb url parameter which does nothing
(define key (regexp-replace #rx"[&?]$" no-cb "")) ; remove extra separator if necessary
(define hash (sha1 (string->bytes/utf-8 key)))
(cons key hash))
(define (check-json-for-images wikiname path)
(define data (with-input-from-file path (λ () (read-json))))
(define page (html->xexp (preprocess-html-wiki (jp "/parse/text" data))))
(define tree (update-tree-wiki page wikiname))
null
#;(remove-duplicates
(for/list ([element (in-producer
(query-selector
(λ (t a c)
(and (eq? t 'img)
(get-attribute 'src a)))
tree)
#f)])
(image-url->values (get-attribute 'src (bits->attributes element))))))
(remove-duplicates
(for/list ([element (in-producer
(query-selector
(λ (t a c)
(and (eq? t 'img)
(get-attribute 'src a)))
tree)
#f)])
(image-url->values (get-attribute 'src (bits->attributes element))))))
;; 5. Download image sources and style images according to database
(define (save-each-image wikiname callback)
(define source (hash-ref sources 'style)) ;; TODO: download entire wiki images instead?
(define (save-each-image wikiname source callback)
;; gather list of basenames to download (that aren't yet complete)
(define rows (query-rows* "select url, hash from image where wikiname = ? and source <= ? and progress < 1"
(define rows (query-rows slc "select url, hash from image where wikiname = ? and source <= ? and progress < 1"
wikiname source))
;; counter of complete/incomplete basenames
(define already-done-count
(query-value* "select count(*) from image where wikiname = ? and source <= ? and progress = 1"
(query-value slc "select count(*) from image where wikiname = ? and source <= ? and progress = 1"
wikiname source))
(define not-done-count
(query-value* "select count(*) from image where wikiname = ? and source <= ? and progress < 1"
(query-value slc "select count(*) from image where wikiname = ? and source <= ? and progress < 1"
wikiname source))
;; set initial progress
(callback already-done-count (+ already-done-count not-done-count) "")
@ -284,35 +269,26 @@
(define url (vector-ref row 0))
(define hash (vector-ref row 1))
;; check
#; (printf "~a -> ~a~n" url hash)
(printf "~a -> ~a~n" url hash)
(define r (get url))
(define declared-type (response-headers-ref r 'content-type))
(define final-type (if (equal? declared-type #"application/octet-stream")
(let ([sniff-entity (message-entity (mime-analyze (response-body r)))])
(string->bytes/latin-1 (format "~a/~a" (entity-type sniff-entity) (entity-subtype sniff-entity))))
declared-type))
(define ext
(with-handlers ([exn:fail:contract? (λ _ (error 'save-each-image "no ext found for mime type `~a` in file ~a" final-type url))])
(bytes->string/latin-1 (mime-type->ext final-type))))
(define ext (bytes->string/latin-1 (mime-type->ext final-type)))
;; save
(define save-dir (hash->save-dir wikiname hash))
(make-directory* save-dir)
(define save-path (build-path save-dir (string-append hash "." ext)))
(define body (response-body r))
(display-to-file body save-path #:exists 'replace)
(query-exec* "update image set progress = 1, ext = ? where wikiname = ? and hash = ?"
(query-exec slc "update image set progress = 1, ext = ? where wikiname = ? and hash = ?"
ext wikiname hash)
(callback (+ already-done-count i) (+ already-done-count not-done-count) (string-append (substring hash 0 6) "..." ext)))
;; saved all images, register that fact in the database
(query-exec* "update wiki set progress = 4 where wikiname = ?" wikiname))
(callback (+ already-done-count i) (+ already-done-count not-done-count) (string-append hash "." ext)))
;; TODO: saved all images, register that fact in the database
)
(define all-stages
(list
if-necessary-download-list-of-pages
save-each-page
if-necessary-download-and-check-styles
;; check-json-for-images
save-each-image))
(module+ test
(check-equal? (html->xexp "<img src=\"https://example.com/images?src=Blah.jpg&amp;width=150\">")
@ -323,13 +299,11 @@
#;(do-step-3 "gallowmere")
#;(save-each-image "gallowmere" (hash-ref sources 'style) (λ (a b c) (printf "~a/~a ~a~n" a b c)))
#;(for ([wikiname (query-list* "select wikiname from wiki")])
#;(for ([wikiname (query-list slc "select wikiname from wiki")])
(println wikiname)
(insert-wiki-entry wikiname))
#;(for ([wikiname (query-list* "select wikiname from wiki")])
#;(for ([wikiname (query-list slc "select wikiname from wiki")])
(println wikiname)
(do-step-3 wikiname)
(save-each-image wikiname (hash-ref sources 'style) (λ (a b c) (printf "~a/~a ~a~n" a b c)))))
; (for ([stage all-stages]) (stage "create" (λ (a b c) (printf "~a/~a ~a~n" a b c))))

View file

@ -30,9 +30,7 @@
(define ch (make-channel))
(define (start)
(serve/launch/wait
#:listen-ip (if (equal? (config-get 'bind_host) "auto")
(if (config-true? 'debug) "127.0.0.1" #f)
(config-get 'bind_host))
#:listen-ip (if (config-true? 'debug) "127.0.0.1" #f)
#:port (string->number (config-get 'port))
(λ (quit)
(channel-put ch (lambda () (semaphore-post quit)))

View file

@ -20,9 +20,7 @@
(require (only-in "src/page-file.rkt" page-file))
(serve/launch/wait
#:listen-ip (if (equal? (config-get 'bind_host) "auto")
(if (config-true? 'debug) "127.0.0.1" #f)
(config-get 'bind_host))
#:listen-ip (if (config-true? 'debug) "127.0.0.1" #f)
#:port (string->number (config-get 'port))
(λ (quit)
(dispatcher-tree

View file

@ -1,6 +1,5 @@
text/html html
text/css css
application/xml xml
text/xml xml
image/gif gif
image/jpeg jpeg
@ -26,7 +25,6 @@ application/font-woff2 woff2
application/acad woff2
font/woff2 woff2
application/font-woff woff
font/woff woff
application/x-font-ttf ttf
application/x-font-truetype ttf
application/x-truetype-font ttf

View file

@ -1,5 +1,5 @@
#lang racket/base
(require (for-syntax racket/base syntax/location))
(require (for-syntax racket/base))
(provide
; help make a nested if. if/in will gain the same false form of its containing if/out.
@ -7,12 +7,7 @@
; cond, but values can be defined between conditions
cond/var
; wrap sql statements into lambdas so they can be executed during migration
wrap-sql
; get the name of the file that contains the currently evaluating form
this-directory
this-file
; replacement for define-runtime-path
anytime-path)
wrap-sql)
(module+ test
(require rackunit)
@ -101,16 +96,6 @@
(check-equal? (if/out #t (if/in #f 'yes) 'no) 'no)
(check-equal? (if/out #f (if/in #f 'yes) 'no) 'no))
(define-syntax (this-directory stx)
(datum->syntax stx (syntax-source-directory stx)))
(define-syntax (this-file stx)
(datum->syntax stx (build-path (or (syntax-source-directory stx) 'same) (syntax-source-file-name stx))))
(module+ test
(require racket/path)
(check-equal? (file-name-from-path (this-file)) (build-path "syntax.rkt")))
(define-syntax (cond/var stx)
(transform/out-cond/var stx))
(module+ test
@ -118,28 +103,7 @@
#'(cond
[#f 0]
[#t
(let* ([d (* a 2)])
(let ([d (* a 2)])
(cond
[(eq? d 8) d]
[#t "not 4"]))])))
;;; Replacement for define-runtime-path that usually works well and doesn't include the files/folder contents into the distribution.
;;; When running from source, should always work appropriately.
;;; When running from a distribution, (current-directory) is treated as the root.
;;; Usage:
;;; * to-root : Path-String * relative path from the source file to the project root
;;; * to-dest : Path-String * relative path from the root to the desired file/folder
(define-syntax (anytime-path stx)
(define-values (_ to-root to-dest) (apply values (syntax->list stx)))
(define source (syntax-source stx))
(unless (complete-path? source)
(error 'anytime-path "syntax source has no directory: ~v" stx))
(datum->syntax
stx
`(let* ([syntax-to-root (build-path (path-only ,source) ,to-root)]
[root (if (directory-exists? syntax-to-root)
;; running on the same filesystem it was compiled on, i.e. it's running the source code out of a directory, and the complication is the intermediate compilation
syntax-to-root
;; not running on the same filesystem, i.e. it's a distribution. we assume that the current working directory is where the executable is, and treat this as the root.
(current-directory))])
(simple-form-path (build-path root ,to-dest)))))

View file

@ -29,13 +29,12 @@
(define default-config
'((application_name . "BreezeWiki")
(bind_host . "auto")
(port . "10416")
(canonical_origin . "")
(debug . "false")
(feature_search_suggestions . "true")
(instance_is_official . "false") ; please don't turn this on, or you will make me very upset
(log_outgoing . "true")
(port . "10416")
(strict_proxy . "false")
(feature_offline::enabled . "false")

View file

@ -8,8 +8,8 @@
db
memo
"static-data.rkt"
"whole-utils.rkt"
"../lib/url-utils.rkt"
"whole-utils.rkt"
"../lib/xexpr-utils.rkt"
"../archiver/archiver-database.rkt"
"config.rkt")
@ -42,8 +42,8 @@
[(config-true? 'feature_offline::only)
(when (config-true? 'debug)
(printf "using offline mode for siteinfo ~a~n" wikiname))
(define row (query-maybe-row* "select sitename, basepage, license_text, license_url from wiki where wikiname = ?"
wikiname))
(define row (query-maybe-row slc "select sitename, basepage, license_text, license_url from wiki where wikiname = ?"
wikiname))
(if row
(siteinfo^ (vector-ref row 0)
(vector-ref row 1)

View file

@ -2,6 +2,7 @@
(require racket/file
racket/path
racket/port
racket/runtime-path
racket/string
net/url
web-server/http
@ -10,7 +11,6 @@
(only-in web-server/dispatchers/dispatch next-dispatcher)
"../archiver/archiver.rkt"
"../lib/mime-types.rkt"
"../lib/syntax.rkt"
"../lib/xexpr-utils.rkt"
"config.rkt"
"log.rkt")
@ -18,7 +18,7 @@
(provide
page-static-archive)
(define path-archive (anytime-path ".." "storage/archive"))
(define-runtime-path path-archive "../storage/archive")
(define ((replacer wikiname) whole url)
(format

View file

@ -8,7 +8,6 @@
(only-in web-server/dispatchers/dispatch next-dispatcher)
(prefix-in files: web-server/dispatchers/dispatch-files)
"../lib/mime-types.rkt"
"../lib/syntax.rkt"
"config.rkt")
(provide
@ -18,7 +17,7 @@
(require rackunit))
(define-runtime-path path-static "../static")
(define path-archive (anytime-path ".." "storage/archive"))
(define-runtime-path path-archive "../storage/archive")
(define hash-ext-mime-type
(hash #".css" #"text/css"

View file

@ -4,7 +4,7 @@
racket/function
racket/list
racket/match
racket/path
racket/runtime-path
racket/string
; libs
(prefix-in easy: net/http-easy)
@ -38,7 +38,7 @@
(module+ test
(require rackunit))
(define path-archive (anytime-path ".." "storage/archive"))
(define-runtime-path path-archive "../storage/archive")
(define (page-wiki-offline req)
(response-handler