forked from cadence/breezewiki
		
	Rewrite archiver project
* Rewrite archiver.rkt to manage the stages order * Rewrite archiver-gui.rkt: * Remembers the previous incomplete queue items * Pretty graphics for icons and progress bars * Segmented progress bars to indicate different stages * Fix archiver-cli.rkt to use new stages * Switch to req -d, so it doesn't auto-install gui libs
This commit is contained in:
		
							parent
							
								
									453570bdc9
								
							
						
					
					
						commit
						cf74ffb0e2
					
				
					 4 changed files with 536 additions and 350 deletions
				
			
		|  | @ -6,24 +6,24 @@ | ||||||
|              "" |              "" | ||||||
|              "Downloaded pages go into `archive/` next to the executable." |              "Downloaded pages go into `archive/` next to the executable." | ||||||
|              "Database goes into `archiver.db*` next to the executable." |              "Database goes into `archiver.db*` next to the executable." | ||||||
|              "The database is necessary to store your download progress and resume where you left off if the process is interrupted.") |              "The database is necessary to store your download progress and resume where you left off if the process is interrupted.")) | ||||||
|       (ps "" |  | ||||||
|           "Default output style is `progress` in a tty and `lines` otherwise.")) |  | ||||||
| 
 | 
 | ||||||
| (flag (output-quiet?) | (flag (output-quiet?) | ||||||
|       ("-q" "--output-quiet" "disable progress output") |       ("-q" "--output-quiet" "disable progress output") | ||||||
|       (output-quiet? #t)) |       (output-quiet? #t)) | ||||||
| 
 | 
 | ||||||
| (flag (output-lines?) |  | ||||||
|       ("-l" "--output-lines" "output the name of each file downloaded") |  | ||||||
|       (output-lines? #t)) |  | ||||||
| 
 |  | ||||||
| (flag (output-progress?) | (flag (output-progress?) | ||||||
|       ("-p" "--output-progress" "progress output for terminals") |       ("-p" "--output-progress" "progress output for terminals (default in a tty)") | ||||||
|       (output-progress? #t)) |       (output-progress? #t)) | ||||||
| 
 | 
 | ||||||
|  | (flag (output-lines?) | ||||||
|  |       ("-l" "--output-lines" "output the name of each file downloaded (default outside of a tty)") | ||||||
|  |       (output-lines? #t)) | ||||||
|  | 
 | ||||||
| (constraint (one-of output-quiet? output-lines? output-progress?)) | (constraint (one-of output-quiet? output-lines? output-progress?)) | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| (program | (program | ||||||
|  (start [wikiname "wikiname to download"]) |  (start [wikiname "wikiname to download"]) | ||||||
|  ;; set up arguments |  ;; set up arguments | ||||||
|  | @ -42,17 +42,8 @@ | ||||||
|  ;; check |  ;; check | ||||||
|  (when (or (not wikiname) (equal? wikiname "")) |  (when (or (not wikiname) (equal? wikiname "")) | ||||||
|    (raise-user-error "Please specify the wikiname to download on the command line.")) |    (raise-user-error "Please specify the wikiname to download on the command line.")) | ||||||
|  ;; stage 1 |  ;; progress reporting based on selected mode | ||||||
|  (cond [(output-lines?) (displayln "Downloading list of pages...")] |  (define (report-progress a b c) | ||||||
|        [(output-progress?) (printf "Downloading list of pages... \r")]) |  | ||||||
|  (if-necessary-download-list-of-pages |  | ||||||
|   wikiname |  | ||||||
|   (λ (a b c) |  | ||||||
|     (cond [(output-progress?) (printf "Downloading list of pages... [~a/~b]\r" a b)]))) |  | ||||||
|  ;; stage 2 |  | ||||||
|  (save-each-page |  | ||||||
|   wikiname |  | ||||||
|   (λ (a b c) |  | ||||||
|    (define basename (basename->name-for-query c)) |    (define basename (basename->name-for-query c)) | ||||||
|    (cond |    (cond | ||||||
|      [(output-lines?) |      [(output-lines?) | ||||||
|  | @ -60,12 +51,18 @@ | ||||||
|      [(output-progress?) |      [(output-progress?) | ||||||
|       (when (eq? (modulo a 20) 0) |       (when (eq? (modulo a 20) 0) | ||||||
|         (thread (λ () (update-width)))) |         (thread (λ () (update-width)))) | ||||||
|        (define prefix (format "[~a/~a] " a b)) |       (define prefix (format "[~a] [~a/~a] " wikiname a b)) | ||||||
|       (define rest (- width (string-length prefix))) |       (define rest (- width (string-length prefix))) | ||||||
|       (define real-width (min (string-length basename) rest)) |       (define real-width (min (string-length basename) rest)) | ||||||
|       (define spare-width (- rest real-width)) |       (define spare-width (- rest real-width)) | ||||||
|       (define name-display (substring basename 0 real-width)) |       (define name-display (substring basename 0 real-width)) | ||||||
|       (define whitespace (make-string spare-width #\ )) |       (define whitespace (make-string spare-width #\ )) | ||||||
|        (printf "~a~a~a\r" prefix name-display whitespace)])))) |       (printf "~a~a~a\r" prefix name-display whitespace)])) | ||||||
|  |  ;; download all stages | ||||||
|  |  (for ([stage all-stages] | ||||||
|  |        [i (in-naturals 1)]) | ||||||
|  |    (printf "> Stage ~a/~a~n" i (length all-stages)) | ||||||
|  |    (stage wikiname report-progress) | ||||||
|  |    (displayln ""))) | ||||||
| 
 | 
 | ||||||
| (run start) | (run start) | ||||||
|  |  | ||||||
|  | @ -1,7 +1,7 @@ | ||||||
| #lang racket/base | #lang racket/base | ||||||
| (require racket/list | (require racket/file | ||||||
|  |          racket/list | ||||||
|          racket/path |          racket/path | ||||||
|          racket/runtime-path |  | ||||||
|          racket/string |          racket/string | ||||||
|          json |          json | ||||||
|          json-pointer |          json-pointer | ||||||
|  | @ -9,9 +9,16 @@ | ||||||
|          "../lib/syntax.rkt") |          "../lib/syntax.rkt") | ||||||
| 
 | 
 | ||||||
| (provide | (provide | ||||||
|  slc) |  get-slc | ||||||
|  |  query-exec* | ||||||
|  |  query-rows* | ||||||
|  |  query-list* | ||||||
|  |  query-value* | ||||||
|  |  query-maybe-value* | ||||||
|  |  query-maybe-row*) | ||||||
| 
 | 
 | ||||||
| (define-runtime-path database-file "../storage/archiver.db") | (define storage-path (anytime-path ".." "storage")) | ||||||
|  | (define database-file (build-path storage-path "archiver.db")) | ||||||
| 
 | 
 | ||||||
| (define migrations | (define migrations | ||||||
|   (wrap-sql |   (wrap-sql | ||||||
|  | @ -25,23 +32,50 @@ | ||||||
|     (query-exec slc "alter table wiki add column license_text TEXT") |     (query-exec slc "alter table wiki add column license_text TEXT") | ||||||
|     (query-exec slc "alter table wiki add column license_url TEXT")))) |     (query-exec slc "alter table wiki add column license_url TEXT")))) | ||||||
| 
 | 
 | ||||||
| (define slc (sqlite3-connect #:database database-file #:mode 'create)) | (define slc (box #f)) | ||||||
| (query-exec slc "PRAGMA journal_mode=WAL") | (define (get-slc) | ||||||
|  |   (define slc* (unbox slc)) | ||||||
|  |   (cond | ||||||
|  |     [slc* slc*] | ||||||
|  |     [else | ||||||
|  |      (make-directory* storage-path) | ||||||
|  |      (define slc* (sqlite3-connect #:database database-file #:mode 'create)) | ||||||
|  |      (query-exec slc* "PRAGMA journal_mode=WAL") | ||||||
|      (define database-version |      (define database-version | ||||||
|        (with-handlers ([exn:fail:sql? |        (with-handlers ([exn:fail:sql? | ||||||
|                         (λ (exn) |                         (λ (exn) | ||||||
|                           ; need to set up the database |                           ; need to set up the database | ||||||
|                      (query-exec slc "create table database_version (version integer, primary key (version))") |                           (query-exec slc* "create table database_version (version integer, primary key (version))") | ||||||
|                      (query-exec slc "insert into database_version values (0)") |                           (query-exec slc* "insert into database_version values (0)") | ||||||
|                           0)]) |                           0)]) | ||||||
|     (query-value slc "select version from database_version"))) |          (query-value slc* "select version from database_version"))) | ||||||
| 
 | 
 | ||||||
|      (let do-migrate-step () |      (let do-migrate-step () | ||||||
|        (when (database-version . < . (length migrations)) |        (when (database-version . < . (length migrations)) | ||||||
|          (call-with-transaction |          (call-with-transaction | ||||||
|      slc |           slc* | ||||||
|           (list-ref migrations database-version)) |           (list-ref migrations database-version)) | ||||||
|          (set! database-version (add1 database-version)) |          (set! database-version (add1 database-version)) | ||||||
|     (query-exec slc "update database_version set version = $1" database-version) |          (query-exec slc* "update database_version set version = $1" database-version) | ||||||
|          (do-migrate-step))) |          (do-migrate-step))) | ||||||
| 
 | 
 | ||||||
|  |      (set-box! slc slc*) | ||||||
|  |      slc*])) | ||||||
|  | 
 | ||||||
|  | (define (query-exec* . args) | ||||||
|  |   (apply query-exec (get-slc) args)) | ||||||
|  | 
 | ||||||
|  | (define (query-rows* . args) | ||||||
|  |   (apply query-rows (get-slc) args)) | ||||||
|  | 
 | ||||||
|  | (define (query-list* . args) | ||||||
|  |   (apply query-list (get-slc) args)) | ||||||
|  | 
 | ||||||
|  | (define (query-value* . args) | ||||||
|  |   (apply query-value (get-slc) args)) | ||||||
|  | 
 | ||||||
|  | (define (query-maybe-value* . args) | ||||||
|  |   (apply query-maybe-value (get-slc) args)) | ||||||
|  | 
 | ||||||
|  | (define (query-maybe-row* . args) | ||||||
|  |   (apply query-maybe-row (get-slc) args)) | ||||||
|  |  | ||||||
|  | @ -1,89 +1,128 @@ | ||||||
| #lang racket/base | #lang racket/base | ||||||
| (require racket/class | (require racket/class | ||||||
|  |          racket/draw | ||||||
|  |          racket/format | ||||||
|          racket/list |          racket/list | ||||||
|          racket/port |          racket/port | ||||||
|          racket/set |          racket/set | ||||||
|  |          racket/splicing | ||||||
|          racket/string |          racket/string | ||||||
|          db |          db | ||||||
|          net/http-easy |          net/http-easy | ||||||
|  |          memo | ||||||
|  |          (only-in racket/gui timer%) | ||||||
|          racket/gui/easy |          racket/gui/easy | ||||||
|          racket/gui/easy/operator |          racket/gui/easy/operator | ||||||
|  |          (only-in pict bitmap) | ||||||
|  |          images/icons/style | ||||||
|  |          images/icons/control | ||||||
|  |          images/icons/stickman | ||||||
|  |          images/icons/symbol | ||||||
|          "archiver-database.rkt" |          "archiver-database.rkt" | ||||||
|          "archiver.rkt" |          "archiver.rkt" | ||||||
|          "../lib/url-utils.rkt" |          "../lib/url-utils.rkt" | ||||||
|          "../lib/xexpr-utils.rkt") |          "../lib/xexpr-utils.rkt") | ||||||
| 
 | 
 | ||||||
| (define active-threads (mutable-seteq)) | (default-icon-material rubber-icon-material) | ||||||
|  | 
 | ||||||
|  | (require (for-syntax racket/base racket/match racket/set racket/string)) | ||||||
|  | 
 | ||||||
|  | (define-syntax (@> stx) | ||||||
|  |   (define form (cdr (syntax->datum stx))) | ||||||
|  |   (match form | ||||||
|  |     [(list form) ; (@> (fn @obs)) | ||||||
|  |      ;; identify the observables and replace with non-@ symbols | ||||||
|  |      (define collection (mutable-set)) | ||||||
|  |      (define updated | ||||||
|  |        (let loop ([sexp form]) | ||||||
|  |          (cond [(symbol? sexp) | ||||||
|  |                 (let ([as-s (symbol->string sexp)]) | ||||||
|  |                   (if (string-prefix? as-s "@") | ||||||
|  |                       (let ([without-@ (string->symbol (substring as-s 1))]) | ||||||
|  |                         (set-add! collection (cons sexp without-@)) | ||||||
|  |                         without-@) | ||||||
|  |                       sexp))] | ||||||
|  |                [(pair? sexp) (cons (loop (car sexp)) (loop (cdr sexp)))] | ||||||
|  |                [#t sexp]))) | ||||||
|  |      (define collection-l (set->list collection)) | ||||||
|  |      ;; return obs-combine -> updated-form | ||||||
|  |      (datum->syntax stx `(obs-combine (λ (,@(map cdr collection-l)) ,updated) ,@(map car collection-l)))] | ||||||
|  |     [(list (? string? str) args ...) ; (@> "Blah: ~a/~a" @arg1 arg2) | ||||||
|  |      ;; identify the observables and replace with non-@ symbols | ||||||
|  |      (define collection-l | ||||||
|  |        (for/list ([arg args]) | ||||||
|  |          (if (symbol? arg) | ||||||
|  |              (let ([as-s (symbol->string arg)]) | ||||||
|  |                (if (string-prefix? as-s "@") | ||||||
|  |                    (let ([without-@ (string->symbol (substring as-s 1))]) | ||||||
|  |                      (cons arg without-@)) | ||||||
|  |                    (cons #f arg))) | ||||||
|  |              (cons #f arg)))) | ||||||
|  |      (define collection-lo (filter car collection-l)) | ||||||
|  |      ;; return obs-combine -> format | ||||||
|  |      (datum->syntax stx `(obs-combine (λ (,@(map cdr collection-lo)) (format ,str ,@(map cdr collection-l))) ,@(map car collection-lo)))])) | ||||||
| 
 | 
 | ||||||
| (define/obs @auto-retry #f) | (define/obs @auto-retry #f) | ||||||
| (define/obs @wikiname "") |  | ||||||
| (define/obs @state 'waiting) |  | ||||||
| (define/obs @num-pages 1) |  | ||||||
| (define/obs @done-pages 0) |  | ||||||
| (define/obs @just-done "") |  | ||||||
| (define/obs @queue '()) |  | ||||||
| (define @title |  | ||||||
|   (obs-combine |  | ||||||
|    (λ (state queue num-pages done-pages) |  | ||||||
|      (define suffix (if (pair? queue) |  | ||||||
|                         (format " +~a" (length queue)) |  | ||||||
|                         "")) |  | ||||||
|      (define progress (if (eq? num-pages 0) |  | ||||||
|                           " 0%" |  | ||||||
|                           (format " ~a%" (round (inexact->exact (* (/ done-pages num-pages) 100)))))) |  | ||||||
|      (case state |  | ||||||
|        [(waiting stage-0) (format "Fandom Archiver~a" suffix)] |  | ||||||
|        [(stage-1) (format "Fandom Archiver 0%~a" suffix)] |  | ||||||
|        [(stage-2) (format "Fandom Archiver~a~a" progress suffix)] |  | ||||||
|        [(err) "ERROR Fandom Archiver"] |  | ||||||
|        [(done) "Fandom Archiver 100%"])) |  | ||||||
|    @state @queue @num-pages (obs-throttle @done-pages #:duration 5000))) |  | ||||||
| 
 | 
 | ||||||
| (define-syntax-rule (t body ...) | (define-struct qi^ (wikiname st stage progress max-progress eta th) #:transparent) ;; queue item | ||||||
|   (set-add! active-threads (thread (λ () body ...)))) |  | ||||||
| 
 | 
 | ||||||
| (define (do-start-or-queue) | (define rows (query-rows* "select wikiname, progress from wiki where progress < 4")) | ||||||
|   (define wikiname (obs-peek @wikiname)) | (define/obs @queue null) | ||||||
|   (:= @wikiname "") | (define (add-wikiname-to-queue wikiname st stage) | ||||||
|   (when (not (equal? (string-trim wikiname) "")) |   (@queue . <~ . (λ (queue) | ||||||
|     (@queue . <~ . (λ (q) (append q (list wikiname)))) |                    (define already-exists? (findf (λ (qi) (equal? (qi^-wikiname qi) wikiname)) queue)) | ||||||
|     (shift-queue-maybe))) |                    (if already-exists? | ||||||
|  |                        queue | ||||||
|  |                        (append queue (list (qi^ wikiname st stage 0 1 "..." #f))))))) | ||||||
|  | (for ([row rows]) | ||||||
|  |   (add-wikiname-to-queue (vector-ref row 0) | ||||||
|  |                          (if (= (vector-ref row 1) 4) | ||||||
|  |                              'complete | ||||||
|  |                              'queued) | ||||||
|  |                          (vector-ref row 1))) | ||||||
| 
 | 
 | ||||||
| (define (shift-queue-maybe) | (define status-icon-size 32) | ||||||
|   (when (memq (obs-peek @state) '(waiting done)) | (define status-icon-min-width 36) | ||||||
|     (define q (obs-peek @queue)) | (define button-icon-size 12) | ||||||
|     (cond |  | ||||||
|       [(pair? q) |  | ||||||
|        (define wikiname (car q)) |  | ||||||
|        (:= @queue (cdr q)) |  | ||||||
|        (do-start-stage1 wikiname)] |  | ||||||
|       [#t (:= @state 'done)]))) |  | ||||||
| 
 | 
 | ||||||
| (define (do-start-stage1 wikiname) | (define color-green (make-color 90 212 68)) | ||||||
|   (:= @just-done "") |  | ||||||
|   (:= @done-pages 0) |  | ||||||
|   (:= @num-pages 1) |  | ||||||
|   (t (with-handlers ([exn:fail? (handle-graphical-exn wikiname)]) |  | ||||||
|        (:= @state 'stage-0) |  | ||||||
|        (if-necessary-download-list-of-pages wikiname (λ (now-done num-pages just-done-name) |  | ||||||
|                                                        (:= @num-pages num-pages) |  | ||||||
|                                                        (:= @done-pages now-done) |  | ||||||
|                                                        (:= @just-done just-done-name) |  | ||||||
|                                                        (:= @state 'stage-1))) |  | ||||||
|        (do-start-stage2 wikiname)))) |  | ||||||
| 
 | 
 | ||||||
| (define (do-start-stage2 wikiname) | (define/obs @input "") | ||||||
|   (:= @just-done "") | 
 | ||||||
|   (:= @num-pages 1) | (splicing-let ([frame-count 30]) | ||||||
|   (:= @done-pages 0) |   (define stickman-frames | ||||||
|   (t (with-handlers ([exn:fail? (handle-graphical-exn wikiname)]) |     (for/vector ([s (in-range 0 1 (/ 1 frame-count))]) | ||||||
|        (save-each-page wikiname (λ (now-done num-pages just-done-path) |       (running-stickman-icon | ||||||
|                                   (:= @num-pages num-pages) |        s | ||||||
|                                   (:= @done-pages now-done) |        #:height status-icon-size | ||||||
|                                   (:= @just-done just-done-path))) |        #:material (default-icon-material)))) | ||||||
|        (:= @state 'waiting) | 
 | ||||||
|        (shift-queue-maybe))) |   (define/obs @stick-frame-no 0) | ||||||
|   (:= @state 'stage-2)) |   (define stick-timer | ||||||
|  |     (new timer% | ||||||
|  |          [notify-callback (λ () (@stick-frame-no . <~ . add1))] | ||||||
|  |          [interval (truncate (/ 1000 frame-count))])) | ||||||
|  |   (define/obs @stick | ||||||
|  |     (@stick-frame-no . ~> . (λ (n) (vector-ref stickman-frames | ||||||
|  |                                                (modulo n (vector-length stickman-frames))))))) | ||||||
|  | 
 | ||||||
|  | (define status-icons | ||||||
|  |   (hasheq 'queued (stop-icon #:color syntax-icon-color #:height status-icon-size) | ||||||
|  |           'paused (continue-forward-icon #:color syntax-icon-color #:height status-icon-size) | ||||||
|  |           'running @stick | ||||||
|  |           'error (x-icon #:height status-icon-size) | ||||||
|  |           'complete (check-icon #:color color-green #:height status-icon-size))) | ||||||
|  | 
 | ||||||
|  | (define action-icons | ||||||
|  |   (hasheq 'pause (pause-icon #:color syntax-icon-color #:height button-icon-size) | ||||||
|  |           'resume (play-icon #:color color-green #:height button-icon-size))) | ||||||
|  | 
 | ||||||
|  | (define (bitmap-view @the-bitmap [min-width 1]) | ||||||
|  |   (pict-canvas #:min-size (@> (list (max min-width (send @the-bitmap get-width)) (send @the-bitmap get-height))) #;(if min-size (list min-size min-size) #f) | ||||||
|  |                #:stretch '(#f #f) | ||||||
|  |                #:style '(transparent) | ||||||
|  |                @the-bitmap | ||||||
|  |                bitmap)) | ||||||
| 
 | 
 | ||||||
| (define (exn->string e) | (define (exn->string e) | ||||||
|   (with-output-to-string |   (with-output-to-string | ||||||
|  | @ -96,13 +135,15 @@ | ||||||
|           (printf ": ~a" (car item))) |           (printf ": ~a" (car item))) | ||||||
|         (displayln ""))))) |         (displayln ""))))) | ||||||
| 
 | 
 | ||||||
| (define ((handle-graphical-exn wikiname) e) | (define ((handle-graphical-exn @qi) e) | ||||||
|   (displayln (exn->string e) (current-error-port)) |   (displayln (exn->string e) (current-error-port)) | ||||||
|   (cond |   (cond | ||||||
|     [(obs-peek @auto-retry) |     [(obs-peek @auto-retry) | ||||||
|      (do-retry-end wikiname)] |      (void) ;; TODO | ||||||
|  |      #;(do-retry-end wikiname)] | ||||||
|     [#t |     [#t | ||||||
|      (:= @state 'err) |      (update-qi @qi [st 'error]) | ||||||
|  |      (do-try-unpause-next-entry) | ||||||
|      (thread |      (thread | ||||||
|       (λ () |       (λ () | ||||||
|         (define/obs @visible? #t) |         (define/obs @visible? #t) | ||||||
|  | @ -116,89 +157,177 @@ | ||||||
|                          (input #:style '(multiple hscroll) |                          (input #:style '(multiple hscroll) | ||||||
|                                 #:min-size '(#f 200) |                                 #:min-size '(#f 200) | ||||||
|                                 (exn->string e)) |                                 (exn->string e)) | ||||||
|                          (button "Retry Now" (λ () (:= @visible? #f) (do-retry-now wikiname))) |                          ;; TODO | ||||||
|                          (button "Retry Round-Robin" (λ () (:= @visible? #f) (do-retry-end wikiname))) |                          #;(button "Retry Now" (λ () (:= @visible? #f) (do-retry-now wikiname))) | ||||||
|                          (button "Skip Wiki" (λ () (:= @visible? #f) (do-continue))) |                          #;(button "Retry Round-Robin" (λ () (:= @visible? #f) (do-retry-end wikiname))) | ||||||
|                          (button "Use Auto-Retry" (λ () |                          #;(button "Skip Wiki" (λ () (:= @visible? #f) (do-continue))) | ||||||
|  |                          #;(button "Use Auto-Retry" (λ () | ||||||
|                                                       (:= @auto-retry #t) |                                                       (:= @auto-retry #t) | ||||||
|                                                       (:= @visible? #f) |                                                       (:= @visible? #f) | ||||||
|                                                       (do-retry-end wikiname))) |                                                       (do-retry-end wikiname))) | ||||||
|                          (text "Be careful not to auto-retry an infinite loop!"))) |                          #;(text "Be careful not to auto-retry an infinite loop!"))) | ||||||
|          main-window))) |          main-window))) | ||||||
|      (sleep) |      (sleep) | ||||||
|      ; make sure the old broken threads are all gone |      ; make sure the broken thread is gone | ||||||
|      (for ([th active-threads]) (kill-thread th)) |      (define th (qi^-th (obs-peek @qi))) | ||||||
|      (set-clear! active-threads)])) |      (when th (kill-thread th))])) | ||||||
| 
 | 
 | ||||||
| (define (do-retry-now wikiname) | (define segments | ||||||
|   (@queue . <~ . (λ (q) (append (list wikiname) q))) |   (list | ||||||
|   (:= @state 'waiting) |    (list 5/100 (make-color 0 223 217)) | ||||||
|   (shift-queue-maybe)) |    (list 88/100 color-green) | ||||||
|  |    (list 2/100 (make-color 0 223 217)) | ||||||
|  |    (list 5/100 color-green))) | ||||||
|  | (define segment-spacing 2) | ||||||
|  | (unless (= (apply + (map car segments)) 1) | ||||||
|  |   (error 'segments "segments add up to ~a, not 1" (apply + (map car segments)))) | ||||||
| 
 | 
 | ||||||
| (define (do-retry-end wikiname) | ;; return the new bitmap, which can be drawn on a dc<%> | ||||||
|   (@queue . <~ . (λ (q) (append q (list wikiname)))) | (define/memoize (ray-trace width height stage progress max-progress) | ||||||
|   (:= @state 'waiting) |   ;; (printf "rendering ~a ~a/~a at ~a~n" stage progress max-progress (current-inexact-milliseconds)) | ||||||
|   (shift-queue-maybe)) |   (define bm (make-object bitmap% width height #f #t)) | ||||||
|  |   (define dc (make-object bitmap-dc% bm)) | ||||||
|  |   (define width-available (- width (* (length segments) segment-spacing))) | ||||||
|  |   (send dc set-smoothing 'unsmoothed) | ||||||
|  |   (send dc set-pen "black" 0 'transparent) | ||||||
|  |   (for/fold ([offset 0]) | ||||||
|  |             ([segment segments] | ||||||
|  |              [i (in-naturals 0)]) ;; zero indexed stages? | ||||||
|  |     ;; calculate start and end locations of grey bar | ||||||
|  |     (define-values (segment-proportion segment-color) (apply values segment)) | ||||||
|  |     (define segment-start (if (= offset 0) 0 (+ offset segment-spacing))) | ||||||
|  |     (define segment-width (* width-available segment-proportion)) | ||||||
|  |     ;; draw grey bar | ||||||
|  |     (send dc set-brush (make-color 180 180 180 0.4) 'solid) | ||||||
|  |     (send dc draw-rectangle segment-start 0 segment-width height) | ||||||
|  |     ;; draw solid bar according to the current item's progress | ||||||
|  |     (define proportion | ||||||
|  |       (cond [(stage . < . i) 0] | ||||||
|  |             [(stage . > . i) 1] | ||||||
|  |             [(max-progress . <= . 0) 0] | ||||||
|  |             [(progress . < . 0) 0] | ||||||
|  |             [(progress . >= . max-progress) 1] | ||||||
|  |             [else (progress . / . max-progress)])) | ||||||
|  |     (send dc set-brush segment-color 'solid) | ||||||
|  |     (send dc draw-rectangle segment-start 0 (* proportion segment-width) height) | ||||||
|  |     (+ segment-start segment-width)) | ||||||
|  |   (bitmap-render-icon bm 6/8)) | ||||||
| 
 | 
 | ||||||
| (define (do-continue) | ;; get ray traced bitmap (possibly from cache) and draw on dc<%> | ||||||
|   (:= @state 'waiting) | (define (draw-bar orig-dc qi) | ||||||
|   (shift-queue-maybe)) |   ;; (println ray-traced) | ||||||
|  |   (define-values (width height) (send orig-dc get-size)) | ||||||
|  |   (send orig-dc draw-bitmap (ray-trace width height  (qi^-stage qi) (qi^-progress qi) (qi^-max-progress qi)) 0 0)) | ||||||
| 
 | 
 | ||||||
| (define (display-basename basename) | (define ((make-progress-updater @qi) a b c) | ||||||
|   (define limit 40) |   ;; (printf "~a: ~a/~a ~a~n" (qi^-wikiname (obs-peek @qi)) a b c) | ||||||
|   (cond [(string? basename) |   (update-qi @qi [progress a] [max-progress b])) | ||||||
|          (define query (basename->name-for-query basename)) | 
 | ||||||
|          (define segments (string-split query "/")) | (define (do-add-to-queue) | ||||||
|          (when (and ((string-length query) . > . limit) ((length segments) . >= . 2)) |   (define wikiname (string-trim (obs-peek @input))) | ||||||
|            (set! query (string-append ".../" (last segments)))) |   (when ((string-length wikiname) . > . 0) | ||||||
|          (when ((string-length query) . > . limit) |     (add-wikiname-to-queue wikiname 'queued 0)) ;; TODO: automatically start? | ||||||
|            (set! query (string-append (substring query 0 (- limit 3)) "..."))) |   (:= @input "")) | ||||||
|          query] | 
 | ||||||
|         [#t "?"])) | (define-syntax-rule (update-qi @qi args ...) | ||||||
|  |   (let ([wikiname (qi^-wikiname (obs-peek @qi))]) | ||||||
|  |     (@queue . <~ . (λ (queue) | ||||||
|  |                      (for/list ([qi queue]) | ||||||
|  |                        (if (equal? (qi^-wikiname qi) wikiname) | ||||||
|  |                            (struct-copy qi^ qi args ...) | ||||||
|  |                            qi)))))) | ||||||
|  | 
 | ||||||
|  | (define (do-start-qi @qi) | ||||||
|  |   (define th | ||||||
|  |     (thread (λ () | ||||||
|  |               (with-handlers ([exn? (handle-graphical-exn @qi)]) | ||||||
|  |                 (define last-stage | ||||||
|  |                   (for/last ([stage all-stages] | ||||||
|  |                              [i (in-naturals)]) | ||||||
|  |                     (update-qi @qi [stage i]) | ||||||
|  |                     (stage (qi^-wikiname (obs-peek @qi)) (make-progress-updater @qi)) | ||||||
|  |                     i)) | ||||||
|  |                 (update-qi @qi [st 'complete] [stage (add1 last-stage)]) | ||||||
|  |                 (do-try-unpause-next-entry))))) | ||||||
|  |   (update-qi @qi [st 'running] [th th])) | ||||||
|  | 
 | ||||||
|  | (define (do-stop-qi @qi) | ||||||
|  |   (define th (qi^-th (obs-peek @qi))) | ||||||
|  |   (when th (kill-thread th)) | ||||||
|  |   (update-qi @qi [th #f] [st 'paused])) | ||||||
|  | 
 | ||||||
|  | (define (do-try-unpause-next-entry) | ||||||
|  |   (define queue (obs-peek @queue)) | ||||||
|  |   (define next-qi (for/first ([qi queue] | ||||||
|  |                               #:when (memq (qi^-st qi) '(paused queued error))) | ||||||
|  |                     qi)) | ||||||
|  |   (when next-qi | ||||||
|  |     (define @qi (@queue . ~> . (λ (queue) (findf (λ (qi) (equal? (qi^-wikiname qi) (qi^-wikiname next-qi))) queue)))) | ||||||
|  |     (do-start-qi @qi))) | ||||||
| 
 | 
 | ||||||
| (define main-window | (define main-window | ||||||
|   (render |   (render | ||||||
|    (window #:title @title |    (window | ||||||
|            #:size '(360 200) |     #:title "Fandom Archiver" | ||||||
|  |     #:size '(400 300) | ||||||
|     #:mixin (λ (%) (class % |     #:mixin (λ (%) (class % | ||||||
|                      (super-new) |                      (super-new) | ||||||
|                      (define/augment (on-close) |                      (define/augment (on-close) | ||||||
|                               (for ([th active-threads]) (kill-thread th)) |                        (send stick-timer stop) | ||||||
|                               (disconnect slc)))) |                        (for ([qi (obs-peek @queue)]) | ||||||
|            ;; input box at the top |                          (when (qi^-th qi) | ||||||
|            (hpanel (text "https://") |                            (kill-thread (qi^-th qi)))) | ||||||
|                    (input @wikiname |                        #;(disconnect*)))) | ||||||
|  |     (vpanel | ||||||
|  |      #:spacing 10 | ||||||
|  |      #:margin '(5 5) | ||||||
|  |      (hpanel | ||||||
|  |       #:stretch '(#t #f) | ||||||
|  |       #:spacing 10 | ||||||
|  |       (hpanel | ||||||
|  |        (text "https://") | ||||||
|  |        (input @input | ||||||
|               (λ (event data) (cond |               (λ (event data) (cond | ||||||
|                                             [(eq? event 'input) (:= @wikiname data)] |                                 [(eq? event 'input) (:= @input data)] | ||||||
|                                             [(eq? event 'return) (do-start-or-queue)]))) |                                 [(eq? event 'return) (do-add-to-queue)]))) | ||||||
|        (text ".fandom.com")) |        (text ".fandom.com")) | ||||||
|            (button (@queue . ~> . (λ (q) (if (null? q) "Start" "Queue"))) (λ () (do-start-or-queue))) |       (button "Download Wiki" do-add-to-queue)) | ||||||
|            (text (@queue . ~> . (λ (q) (if (null? q) "" (string-join #:before-first "Queue: " q ", "))))) |      (list-view | ||||||
|            ;; show status based on overall application state |       #:style '(vertical) | ||||||
|            (case-view |       @queue | ||||||
|             @state |       #:key qi^-wikiname | ||||||
|             ;; waiting for wikiname entry |       (λ (k @qi) | ||||||
|             ((waiting) (vpanel |         (define @status-icons | ||||||
|                         (text "Fill in the wikiname and click start."))) |           (@> (case (qi^-st @qi) | ||||||
|             ((stage-0) (vpanel |                 [(running) @stick] | ||||||
|                         (text "Checking data..."))) |                 [else (hash-ref status-icons (qi^-st @qi))]))) | ||||||
|             ((stage-1) (vpanel |         (define @is-running? | ||||||
|                         (text "Gathering list of pages...") |           (@> (memq (qi^-st @qi) '(running)))) | ||||||
|                         (text (@just-done . ~> . display-basename)) |         ;; state icon at the left side | ||||||
|                         (text (@done-pages . ~> . (λ (x) (if (eq? x 0) |         (hpanel #:stretch '(#t #f) | ||||||
|                                                              "0/?" |                 #:alignment '(left center) | ||||||
|                                                              (format "~a/~a" x (obs-peek @num-pages)))))))) |                 #:spacing 8 | ||||||
|             ;; downloading contents |                 (bitmap-view @status-icons status-icon-min-width) | ||||||
|             ((stage-2) (vpanel |                 (vpanel | ||||||
|                         (text "Downloading page text...") |                  ;; name and buttons (top half) | ||||||
|                         (progress @done-pages #:range @num-pages) |                  (hpanel #:alignment '(left bottom) | ||||||
|                         (text (@done-pages . ~> . (λ (x) (format "~a/~a" x (obs-peek @num-pages))))) |                          (text (@> (qi^-wikiname @qi))) | ||||||
|                         (text (@just-done . ~> . display-basename)))) |                          (spacer) | ||||||
|             ((done) (vpanel |                          (hpanel | ||||||
|                      (text "All wikis downloaded!"))) |                           #:stretch '(#f #f) | ||||||
|             ((err) (vpanel |                           (if-view @is-running? | ||||||
|                       (text "Error. Check the popup window."))) |                                    (button (hash-ref action-icons 'pause) | ||||||
|             (else (text (@state . ~> . (λ (state) (format "invalid state: ~a" state)))))) |                                            (λ () (do-stop-qi @qi))) | ||||||
|            (checkbox #:label "Auto-retry on error? (Dangerous)" |                                    (button (hash-ref action-icons 'resume) | ||||||
|                      #:checked? @auto-retry |                                            (λ () (do-start-qi @qi)))))) | ||||||
|                      (λ:= @auto-retry))))) |                  ;; progress bar (bottom half) | ||||||
|  |                  (hpanel | ||||||
|  |                   (canvas | ||||||
|  |                    @qi | ||||||
|  |                    #:style '(transparent) | ||||||
|  |                    #:margin '(3 3) | ||||||
|  |                    draw-bar) | ||||||
|  |                   (hpanel #:min-size '(68 #f) | ||||||
|  |                           #:stretch '(#f #f) | ||||||
|  |                           #:alignment '(right center) | ||||||
|  |                           (text (@> (format "eta ~a" (qi^-eta @qi)))))))))))))) | ||||||
|  |  | ||||||
|  | @ -2,38 +2,35 @@ | ||||||
| (require racket/file | (require racket/file | ||||||
|          racket/function |          racket/function | ||||||
|          racket/list |          racket/list | ||||||
|          racket/runtime-path |          racket/path | ||||||
|  |          racket/sequence | ||||||
|          racket/string |          racket/string | ||||||
|          net/url |          net/url | ||||||
|          net/mime |          net/mime | ||||||
|          file/sha1 |          file/sha1 | ||||||
|          net/http-easy |          net/http-easy | ||||||
|          db |          db | ||||||
|          "../lib/html-parsing/main.rkt" |  | ||||||
|          json |          json | ||||||
|          "archiver-database.rkt" |          "archiver-database.rkt" | ||||||
|  |          "../lib/html-parsing/main.rkt" | ||||||
|          "../lib/mime-types.rkt" |          "../lib/mime-types.rkt" | ||||||
|  |          "../lib/syntax.rkt" | ||||||
|          "../lib/tree-updater.rkt" |          "../lib/tree-updater.rkt" | ||||||
|          "../lib/url-utils.rkt" |          "../lib/url-utils.rkt" | ||||||
|          "../lib/xexpr-utils.rkt" |          "../lib/xexpr-utils.rkt" | ||||||
|          "../lib/archive-file-mappings.rkt") |          "../lib/archive-file-mappings.rkt") | ||||||
| 
 | 
 | ||||||
| (define archive-slc slc) |  | ||||||
| 
 |  | ||||||
| (provide | (provide | ||||||
|  if-necessary-download-list-of-pages |  | ||||||
|  download-list-of-pages |  | ||||||
|  save-each-page |  | ||||||
|  basename->name-for-query |  basename->name-for-query | ||||||
|  image-url->values |  image-url->values | ||||||
|  hash->save-dir |  hash->save-dir | ||||||
|  archive-slc) |  all-stages) | ||||||
| 
 | 
 | ||||||
| (module+ test | (module+ test | ||||||
|   (require rackunit)) |   (require rackunit)) | ||||||
| 
 | 
 | ||||||
| (define-runtime-path archive-root "../storage/archive") | (define archive-root (anytime-path ".." "storage/archive")) | ||||||
| #;(define archive-root "archive") | (make-directory* archive-root) | ||||||
| 
 | 
 | ||||||
| (define sources '#hasheq((style . 1) (page . 2))) | (define sources '#hasheq((style . 1) (page . 2))) | ||||||
| 
 | 
 | ||||||
|  | @ -46,127 +43,34 @@ | ||||||
|             wikiname |             wikiname | ||||||
|             (params->query '(("action" . "query") |             (params->query '(("action" . "query") | ||||||
|                              ("meta" . "siteinfo") |                              ("meta" . "siteinfo") | ||||||
|                              ("siprop" . "general|rightsinfo") |                              ("siprop" . "general|rightsinfo|statistics") | ||||||
|                              ("format" . "json") |                              ("format" . "json") | ||||||
|                              ("formatversion" . "2"))))) |                              ("formatversion" . "2"))))) | ||||||
|   (define data (response-json (get dest-url))) |   (define data (response-json (get dest-url))) | ||||||
|   (define exists? (query-maybe-value slc "select progress from wiki where wikiname = ?" wikiname)) |   (define exists? (query-maybe-value* "select progress from wiki where wikiname = ?" wikiname)) | ||||||
|   (if exists? |   (if (and exists? (not (sql-null? exists?))) | ||||||
|       (query-exec slc "update wiki set sitename = ?, basepage = ?, license_text = ?, license_url = ? where wikiname = ?" |       (query-exec* "update wiki set sitename = ?, basepage = ?, license_text = ?, license_url = ? where wikiname = ?" | ||||||
|                   (jp "/query/general/sitename" data) |                   (jp "/query/general/sitename" data) | ||||||
|                   (second (regexp-match #rx"/wiki/(.*)" (jp "/query/general/base" data))) |                   (second (regexp-match #rx"/wiki/(.*)" (jp "/query/general/base" data))) | ||||||
|                   (jp "/query/rightsinfo/text" data) |                   (jp "/query/rightsinfo/text" data) | ||||||
|                   (jp "/query/rightsinfo/url" data) |                   (jp "/query/rightsinfo/url" data) | ||||||
|                   wikiname) |                   wikiname) | ||||||
|       (query-exec slc "insert into wiki (wikiname, progress, sitename, basepage, license_text, license_url) values (?, 1, ?, ?, ?, ?)" |       (query-exec* "insert into wiki (wikiname, progress, sitename, basepage, license_text, license_url) values (?, 0, ?, ?, ?, ?)" | ||||||
|                   wikiname |                   wikiname | ||||||
|                   (jp "/query/general/sitename" data) |                   (jp "/query/general/sitename" data) | ||||||
|                   (second (regexp-match #rx"/wiki/(.*)" (jp "/query/general/base" data))) |                   (second (regexp-match #rx"/wiki/(.*)" (jp "/query/general/base" data))) | ||||||
|                   (jp "/query/rightsinfo/text" data) |                   (jp "/query/rightsinfo/text" data) | ||||||
|                   (jp "/query/rightsinfo/url" data)))) |                   (jp "/query/rightsinfo/url" data))) | ||||||
|  |   (jp "/query/statistics/articles" data)) | ||||||
| 
 | 
 | ||||||
| ;; call 1 if not yet done for that wiki |  | ||||||
| (define (if-necessary-download-list-of-pages wikiname callback) |  | ||||||
|   (define wiki-progress (query-maybe-value slc "select progress from wiki where wikiname = ?" wikiname)) |  | ||||||
|   ;; done yet? |  | ||||||
|   (unless (and (real? wiki-progress) (wiki-progress . >= . 1)) |  | ||||||
|     ;; count total pages |  | ||||||
|     (define dest-url |  | ||||||
|       (format "https://~a.fandom.com/api.php?~a" |  | ||||||
|               wikiname |  | ||||||
|               (params->query `(("action" . "query") ("meta" . "siteinfo") ("siprop" . "statistics") ("format" . "json"))))) |  | ||||||
|     (define num-pages (jp "/query/statistics/articles" (response-json (get dest-url)))) |  | ||||||
|     (download-list-of-pages wikiname callback 0 num-pages #f))) |  | ||||||
| 
 | 
 | ||||||
| ;; 1. Download list of wiki pages and store in database |  | ||||||
| (define (download-list-of-pages wikiname callback total-so-far grand-total path-with-namefrom) |  | ||||||
|   (define url (if path-with-namefrom |  | ||||||
|                   (format "https://~a.fandom.com~a" wikiname path-with-namefrom) |  | ||||||
|                   (format "https://~a.fandom.com/wiki/Local_Sitemap" wikiname))) |  | ||||||
|   (define r (get url)) |  | ||||||
|   (define page (html->xexp (bytes->string/utf-8 (response-body r)))) |  | ||||||
|   (define link-namefrom |  | ||||||
|     ((query-selector (λ (t a c x) (and (eq? t 'a) |  | ||||||
|                                        (pair? x) |  | ||||||
|                                        (string-contains? (car x) "Next page") |  | ||||||
|                                        (let ([href (get-attribute 'href a)] ) |  | ||||||
|                                          (and href (string-contains? href "/wiki/Local_Sitemap"))))) |  | ||||||
|                      page #:include-text? #t))) |  | ||||||
|   (define row-values |  | ||||||
|     (for/list ([link (in-producer |  | ||||||
|                       (query-selector |  | ||||||
|                        (λ (t a c) (eq? t 'a)) |  | ||||||
|                        ((query-selector (λ (t a c) (has-class? "mw-allpages-chunk" a)) page))) |  | ||||||
|                       #f)]) |  | ||||||
|       (list wikiname (local-encoded-url->basename (get-attribute 'href (bits->attributes link))) 0))) |  | ||||||
|   (define query-template (string-join (make-list (length row-values) "(?, ?, ?)") ", " #:before-first "insert or ignore into page (wikiname, basename, progress) values ")) |  | ||||||
|   (apply query-exec slc query-template (flatten row-values)) |  | ||||||
|   (define new-total (+ (length row-values) total-so-far)) |  | ||||||
|   (callback new-total grand-total (second (last row-values))) |  | ||||||
|   (cond |  | ||||||
|     [link-namefrom ; repeat on the next page |  | ||||||
|      (download-list-of-pages wikiname callback new-total grand-total (get-attribute 'href (bits->attributes link-namefrom)))] |  | ||||||
|     [#t ; all done downloading sitemap |  | ||||||
|      (insert-wiki-entry wikiname)])) |  | ||||||
| 
 |  | ||||||
| ;; 2. Download each page via API and: |  | ||||||
| ;; * Save API response to file |  | ||||||
| (define max-page-progress 1) |  | ||||||
| (define (save-each-page wikiname callback) |  | ||||||
|   ;; prepare destination folder |  | ||||||
|   (define save-dir (build-path archive-root wikiname)) |  | ||||||
|   (make-directory* save-dir) |  | ||||||
|   ;; gather list of basenames to download (that aren't yet complete) |  | ||||||
|   (define basenames (query-list slc "select basename from page where wikiname = ? and progress < ?" |  | ||||||
|                                 wikiname max-page-progress)) |  | ||||||
|   ;; counter of complete/incomplete basenames |  | ||||||
|   (define already-done-count |  | ||||||
|     (query-value slc "select count(*) from page where wikiname = ? and progress = ?" |  | ||||||
|                  wikiname max-page-progress)) |  | ||||||
|   (define not-done-count |  | ||||||
|     (query-value slc "select count(*) from page where wikiname = ? and progress < ?" |  | ||||||
|                  wikiname max-page-progress)) |  | ||||||
|   ;; set initial progress |  | ||||||
|   (callback already-done-count (+ already-done-count not-done-count) "") |  | ||||||
|   ;; loop through basenames and download |  | ||||||
|   (for ([basename basenames] |  | ||||||
|         [i (in-naturals 1)]) |  | ||||||
|     (define name-for-query (basename->name-for-query basename)) |  | ||||||
|     (define dest-url |  | ||||||
|       (format "https://~a.fandom.com/api.php?~a" |  | ||||||
|               wikiname |  | ||||||
|               (params->query `(("action" . "parse") |  | ||||||
|                                ("page" . ,name-for-query) |  | ||||||
|                                ("prop" . "text|headhtml|langlinks") |  | ||||||
|                                ("formatversion" . "2") |  | ||||||
|                                ("format" . "json"))))) |  | ||||||
|     (define r (get dest-url)) |  | ||||||
|     (define body (response-body r)) |  | ||||||
|     (define filename (string-append basename ".json")) |  | ||||||
|     (define save-path |  | ||||||
|       (cond [((string-length basename) . > . 240) |  | ||||||
|              (define key (sha1 (string->bytes/latin-1 basename))) |  | ||||||
|              (query-exec slc "insert into special_page (wikiname, key, basename) values (?, ?, ?)" |  | ||||||
|                          wikiname key basename) |  | ||||||
|              (build-path save-dir (string-append key ".json"))] |  | ||||||
|             [#t |  | ||||||
|              (build-path save-dir (string-append basename ".json"))])) |  | ||||||
|     (display-to-file body save-path #:exists 'replace) |  | ||||||
|     (query-exec slc "update page set progress = 1 where wikiname = ? and basename = ?" |  | ||||||
|                 wikiname basename) |  | ||||||
|     (callback (+ already-done-count i) (+ already-done-count not-done-count) basename)) |  | ||||||
|   ;; saved all pages, register that fact in the database |  | ||||||
|   (query-exec slc "update wiki set progress = 2 where wikiname = ?" wikiname)) |  | ||||||
| 
 |  | ||||||
| ;; 3. Download CSS and: |  | ||||||
| ;; * Save CSS to file |  | ||||||
| ;; * Record style images to database |  | ||||||
| (define (check-style-for-images wikiname path) | (define (check-style-for-images wikiname path) | ||||||
|   (define content (file->string path)) |   (define content (file->string path)) | ||||||
|   (define urls (regexp-match* #rx"url\\(\"?'?([^)]*)'?\"?\\)" content #:match-select cadr)) |   (define urls (regexp-match* #rx"url\\(\"?'?([^)]*)'?\"?\\)" content #:match-select cadr)) | ||||||
|   (for/list ([url urls] |   (for/list ([url urls] | ||||||
|              #:when (not (or (equal? url "") |              #:when (not (or (equal? url "") | ||||||
|                              (equal? url "'") |                              (equal? url "'") | ||||||
|  |                              (string-suffix? url "\"") | ||||||
|                              (string-contains? url "/resources-ucp/") |                              (string-contains? url "/resources-ucp/") | ||||||
|                              (string-contains? url "/fonts/") |                              (string-contains? url "/fonts/") | ||||||
|                              (string-contains? url "/drm_fonts/") |                              (string-contains? url "/drm_fonts/") | ||||||
|  | @ -184,7 +88,7 @@ | ||||||
|       [(string-prefix? url "/") (format "https://~a.fandom.com~a" wikiname url)] |       [(string-prefix? url "/") (format "https://~a.fandom.com~a" wikiname url)] | ||||||
|       [else (raise-user-error "While calling check-style-for-images, this URL had an unknown format and couldn't be saved:" url path)]))) |       [else (raise-user-error "While calling check-style-for-images, this URL had an unknown format and couldn't be saved:" url path)]))) | ||||||
| 
 | 
 | ||||||
| (define (download-styles-for-wiki wikiname) | (define (download-styles-for-wiki wikiname callback) | ||||||
|   (define save-dir (build-path archive-root wikiname "styles")) |   (define save-dir (build-path archive-root wikiname "styles")) | ||||||
|   (make-directory* save-dir) |   (make-directory* save-dir) | ||||||
|   (define theme (λ (theme-name) |   (define theme (λ (theme-name) | ||||||
|  | @ -198,18 +102,137 @@ | ||||||
|      (theme "dark") |      (theme "dark") | ||||||
|      (cons (format "https://~a.fandom.com/load.php?lang=en&modules=skin.fandomdesktop.styles%7Cext.fandom.PortableInfoboxFandomDesktop.css%7Cext.fandom.GlobalComponents.CommunityHeaderBackground.css%7Cext.gadget.site-styles%2Csound-styles%7Csite.styles&only=styles&skin=fandomdesktop" wikiname) |      (cons (format "https://~a.fandom.com/load.php?lang=en&modules=skin.fandomdesktop.styles%7Cext.fandom.PortableInfoboxFandomDesktop.css%7Cext.fandom.GlobalComponents.CommunityHeaderBackground.css%7Cext.gadget.site-styles%2Csound-styles%7Csite.styles&only=styles&skin=fandomdesktop" wikiname) | ||||||
|            (build-path save-dir "site.css")))) |            (build-path save-dir "site.css")))) | ||||||
|   (for ([style styles]) |   (for ([style styles] | ||||||
|  |         [i (in-naturals)]) | ||||||
|  |     (callback i (length styles) "styles...") | ||||||
|     (define r (get (car style))) |     (define r (get (car style))) | ||||||
|     (define body (response-body r)) |     (define body (response-body r)) | ||||||
|     (display-to-file body (cdr style) #:exists 'replace) |     (display-to-file body (cdr style) #:exists 'replace) | ||||||
|     ;; XXX: how the HELL do I deal with @import?? would need some kind of recursion here. how will the page server know where to look up the style file to be able to serve them again? do I add another link-stylesheet tag to the main page? what about the remaining stuck @import url? |     ;; XXX: how the HELL do I deal with @import?? would need some kind of recursion here. how will the page server know where to look up the style file to be able to serve them again? do I add another link-stylesheet tag to the main page? what about the remaining stuck @import url? | ||||||
|     ) |     ) | ||||||
|  |   (callback (length styles) (length styles) "styles...") | ||||||
|   styles) |   styles) | ||||||
| 
 | 
 | ||||||
| (define (do-step-3 wikiname) | (define (hash->save-dir wikiname hash) | ||||||
|   (define wiki-progress (query-maybe-value slc "select progress from wiki where wikiname = ?" wikiname)) |   (build-path archive-root wikiname "images" (substring hash 0 1) (substring hash 0 2))) | ||||||
|  | 
 | ||||||
|  | (define (image-url->values i) | ||||||
|  |   ;; TODO: handle case where there is multiple broken cb parameter on minecraft wiki | ||||||
|  |   ;; TODO: ensure it still "works" with broken & on minecraft wiki | ||||||
|  |   (define no-cb (regexp-replace #rx"\\cb=[0-9]+&?" i "")) ; remove cb url parameter which does nothing | ||||||
|  |   (define key (regexp-replace #rx"[&?]$" no-cb "")) ; remove extra separator if necessary | ||||||
|  |   (define hash (sha1 (string->bytes/utf-8 key))) | ||||||
|  |   (cons key hash)) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | ;; 1. Download list of wiki pages and store in database, if not done yet for that wiki | ||||||
|  | (define (if-necessary-download-list-of-pages wikiname callback) | ||||||
|  |   (define wiki-progress (query-maybe-value* "select progress from wiki where wikiname = ?" wikiname)) | ||||||
|  |   ;; done yet? | ||||||
|  |   (unless (and (real? wiki-progress) (wiki-progress . >= . 1)) | ||||||
|  |     ;; Count total pages | ||||||
|  |     (define num-pages (insert-wiki-entry wikiname)) | ||||||
|  |     ;; Download the entire index of pages | ||||||
|  |     (define basenames | ||||||
|  |       (let loop ([path-with-namefrom "/wiki/Local_Sitemap"] | ||||||
|  |                  [basenames-previous-pages null]) | ||||||
|  |         ;; Download the current index page | ||||||
|  |         (define url (format "https://~a.fandom.com~a" wikiname path-with-namefrom)) | ||||||
|  |         (define r (get url)) | ||||||
|  |         ;; Metadata from this page (the link to the next page) | ||||||
|  |         (define page (html->xexp (bytes->string/utf-8 (response-body r)))) | ||||||
|  |         (define link-namefrom | ||||||
|  |           ((query-selector (λ (t a c x) (and (eq? t 'a) | ||||||
|  |                                              (pair? x) | ||||||
|  |                                              (string-contains? (car x) "Next page") | ||||||
|  |                                              (let ([href (get-attribute 'href a)] ) | ||||||
|  |                                                (and href (string-contains? href "/wiki/Local_Sitemap"))))) | ||||||
|  |                            page #:include-text? #t))) | ||||||
|  |         ;; Content from this page | ||||||
|  |         (define basenames-this-page | ||||||
|  |           (for/list ([link (in-producer | ||||||
|  |                             (query-selector | ||||||
|  |                              (λ (t a c) (eq? t 'a)) | ||||||
|  |                              ((query-selector (λ (t a c) (has-class? "mw-allpages-chunk" a)) page))) | ||||||
|  |                             #f)]) | ||||||
|  |             (local-encoded-url->basename (get-attribute 'href (bits->attributes link))))) | ||||||
|  |         ;; Call the progress callback | ||||||
|  |         (define all-basenames (append basenames-previous-pages basenames-this-page)) | ||||||
|  |         (callback (length all-basenames) num-pages (last all-basenames)) | ||||||
|  |         ;; Recurse to download from the next page | ||||||
|  |         (if link-namefrom | ||||||
|  |             (loop (get-attribute 'href (bits->attributes link-namefrom)) all-basenames) | ||||||
|  |             all-basenames))) | ||||||
|  |     ;; Save those pages into the database | ||||||
|  |     ;; SQLite can have a maximum of 32766 parameters in a single query | ||||||
|  |     (for ([slice (in-slice 32760 basenames)]) | ||||||
|  |       (define query-template (string-join (make-list (length slice) "(?1, ?, 0)") ", " #:before-first "insert or ignore into page (wikiname, basename, progress) values ")) | ||||||
|  |       (call-with-transaction | ||||||
|  |        (get-slc) | ||||||
|  |        (λ () | ||||||
|  |          (apply query-exec* query-template wikiname slice) | ||||||
|  |          ;; Record that we have the complete list of pages | ||||||
|  |          (query-exec* "update wiki set progress = 1 where wikiname = ?" wikiname)))))) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | ;; 2. Download each page via API and: | ||||||
|  | ;; * Save API response to file | ||||||
|  | (define max-page-progress 1) | ||||||
|  | (define (save-each-page wikiname callback) | ||||||
|  |   ;; prepare destination folder | ||||||
|  |   (define save-dir (build-path archive-root wikiname)) | ||||||
|  |   (make-directory* save-dir) | ||||||
|  |   ;; gather list of basenames to download (that aren't yet complete) | ||||||
|  |   (define basenames (query-list* "select basename from page where wikiname = ? and progress < ?" | ||||||
|  |                                 wikiname max-page-progress)) | ||||||
|  |   ;; counter of complete/incomplete basenames | ||||||
|  |   (define already-done-count | ||||||
|  |     (query-value* "select count(*) from page where wikiname = ? and progress = ?" | ||||||
|  |                  wikiname max-page-progress)) | ||||||
|  |   (define not-done-count | ||||||
|  |     (query-value* "select count(*) from page where wikiname = ? and progress < ?" | ||||||
|  |                   wikiname max-page-progress)) | ||||||
|  |   (define total-count (+ already-done-count not-done-count)) | ||||||
|  |   ;; set initial progress | ||||||
|  |   (callback already-done-count total-count "") | ||||||
|  |   ;; loop through basenames and download | ||||||
|  |   (for ([basename basenames] | ||||||
|  |         [i (in-naturals (add1 already-done-count))]) | ||||||
|  |     (define name-for-query (basename->name-for-query basename)) | ||||||
|  |     (define dest-url | ||||||
|  |       (format "https://~a.fandom.com/api.php?~a" | ||||||
|  |               wikiname | ||||||
|  |               (params->query `(("action" . "parse") | ||||||
|  |                                ("page" . ,name-for-query) | ||||||
|  |                                ("prop" . "text|headhtml|langlinks") | ||||||
|  |                                ("formatversion" . "2") | ||||||
|  |                                ("format" . "json"))))) | ||||||
|  |     (define r (get dest-url)) | ||||||
|  |     (define body (response-body r)) | ||||||
|  |     (define filename (string-append basename ".json")) | ||||||
|  |     (define save-path | ||||||
|  |       (cond [((string-length basename) . > . 240) | ||||||
|  |              (define key (sha1 (string->bytes/latin-1 basename))) | ||||||
|  |              (query-exec* "insert into special_page (wikiname, key, basename) values (?, ?, ?)" | ||||||
|  |                          wikiname key basename) | ||||||
|  |              (build-path save-dir (string-append key ".json"))] | ||||||
|  |             [#t | ||||||
|  |              (build-path save-dir (string-append basename ".json"))])) | ||||||
|  |     (display-to-file body save-path #:exists 'replace) | ||||||
|  |     (query-exec* "update page set progress = 1 where wikiname = ? and basename = ?" | ||||||
|  |                 wikiname basename) | ||||||
|  |     (callback i total-count basename)) | ||||||
|  |   ;; saved all pages, register that fact in the database | ||||||
|  |   (query-exec* "update wiki set progress = 2 where wikiname = ?" wikiname)) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | ;; 3. Download CSS and: | ||||||
|  | ;; * Save CSS to file | ||||||
|  | ;; * Record style images to database | ||||||
|  | (define (if-necessary-download-and-check-styles wikiname callback) | ||||||
|  |   (define wiki-progress (query-maybe-value* "select progress from wiki where wikiname = ?" wikiname)) | ||||||
|   (unless (and (number? wiki-progress) (wiki-progress . >= . 3)) |   (unless (and (number? wiki-progress) (wiki-progress . >= . 3)) | ||||||
|     (define styles (download-styles-for-wiki wikiname)) |     (define styles (download-styles-for-wiki wikiname callback)) | ||||||
|     (define unique-image-urls |     (define unique-image-urls | ||||||
|       (remove-duplicates |       (remove-duplicates | ||||||
|        (map image-url->values |        (map image-url->values | ||||||
|  | @ -217,28 +240,18 @@ | ||||||
|              (for/list ([style styles]) |              (for/list ([style styles]) | ||||||
|                (check-style-for-images wikiname (cdr style))))) |                (check-style-for-images wikiname (cdr style))))) | ||||||
|        #:key cdr)) |        #:key cdr)) | ||||||
|     (println unique-image-urls) |  | ||||||
|     (for ([pair unique-image-urls]) |     (for ([pair unique-image-urls]) | ||||||
|       (query-exec slc "insert or ignore into image (wikiname, url, hash, ext, source, progress) values (?, ?, ?, NULL, 1, 0)" wikiname (car pair) (cdr pair))) |       (query-exec* "insert or ignore into image (wikiname, url, hash, ext, source, progress) values (?, ?, ?, NULL, 1, 0)" wikiname (car pair) (cdr pair))) | ||||||
|     (query-exec slc "update wiki set progress = 3 where wikiname = ?" wikiname))) |     (query-exec* "update wiki set progress = 3 where wikiname = ?" wikiname))) | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| ;; 4: From downloaded pages, record URLs of image sources and inline style images to database | ;; 4: From downloaded pages, record URLs of image sources and inline style images to database | ||||||
| (define (hash->save-dir wikiname hash) |  | ||||||
|   (build-path archive-root wikiname "images" (substring hash 0 1) (substring hash 0 2))) |  | ||||||
| 
 |  | ||||||
| (define (image-url->values i) |  | ||||||
|   ;; TODO: handle case where there is multiple cb parameter on minecraft wiki |  | ||||||
|   ;; TODO: ensure it still "works" with broken & on minecraft wiki |  | ||||||
|   (define no-cb (regexp-replace #rx"\\cb=[0-9]+&?" i "")) ; remove cb url parameter which does nothing |  | ||||||
|   (define key (regexp-replace #rx"[&?]$" no-cb "")) ; remove extra separator if necessary |  | ||||||
|   (define hash (sha1 (string->bytes/utf-8 key))) |  | ||||||
|   (cons key hash)) |  | ||||||
| 
 |  | ||||||
| (define (check-json-for-images wikiname path) | (define (check-json-for-images wikiname path) | ||||||
|   (define data (with-input-from-file path (λ () (read-json)))) |   (define data (with-input-from-file path (λ () (read-json)))) | ||||||
|   (define page (html->xexp (preprocess-html-wiki (jp "/parse/text" data)))) |   (define page (html->xexp (preprocess-html-wiki (jp "/parse/text" data)))) | ||||||
|   (define tree (update-tree-wiki page wikiname)) |   (define tree (update-tree-wiki page wikiname)) | ||||||
|   (remove-duplicates |   null | ||||||
|  |   #;(remove-duplicates | ||||||
|      (for/list ([element (in-producer |      (for/list ([element (in-producer | ||||||
|                           (query-selector |                           (query-selector | ||||||
|                            (λ (t a c) |                            (λ (t a c) | ||||||
|  | @ -248,17 +261,19 @@ | ||||||
|                           #f)]) |                           #f)]) | ||||||
|        (image-url->values (get-attribute 'src (bits->attributes element)))))) |        (image-url->values (get-attribute 'src (bits->attributes element)))))) | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| ;; 5. Download image sources and style images according to database | ;; 5. Download image sources and style images according to database | ||||||
| (define (save-each-image wikiname source callback) | (define (save-each-image wikiname callback) | ||||||
|  |   (define source (hash-ref sources 'style)) ;; TODO: download entire wiki images instead? | ||||||
|   ;; gather list of basenames to download (that aren't yet complete) |   ;; gather list of basenames to download (that aren't yet complete) | ||||||
|   (define rows (query-rows slc "select url, hash from image where wikiname = ? and source <= ? and progress < 1" |   (define rows (query-rows* "select url, hash from image where wikiname = ? and source <= ? and progress < 1" | ||||||
|                                 wikiname source)) |                                 wikiname source)) | ||||||
|   ;; counter of complete/incomplete basenames |   ;; counter of complete/incomplete basenames | ||||||
|   (define already-done-count |   (define already-done-count | ||||||
|     (query-value slc "select count(*) from image where wikiname = ? and source <= ? and progress = 1" |     (query-value* "select count(*) from image where wikiname = ? and source <= ? and progress = 1" | ||||||
|                  wikiname source)) |                  wikiname source)) | ||||||
|   (define not-done-count |   (define not-done-count | ||||||
|     (query-value slc "select count(*) from image where wikiname = ? and source <= ? and progress < 1" |     (query-value* "select count(*) from image where wikiname = ? and source <= ? and progress < 1" | ||||||
|                  wikiname source)) |                  wikiname source)) | ||||||
|   ;; set initial progress |   ;; set initial progress | ||||||
|   (callback already-done-count (+ already-done-count not-done-count) "") |   (callback already-done-count (+ already-done-count not-done-count) "") | ||||||
|  | @ -269,26 +284,35 @@ | ||||||
|     (define url (vector-ref row 0)) |     (define url (vector-ref row 0)) | ||||||
|     (define hash (vector-ref row 1)) |     (define hash (vector-ref row 1)) | ||||||
|     ;; check |     ;; check | ||||||
|     (printf "~a -> ~a~n" url hash) |     #; (printf "~a -> ~a~n" url hash) | ||||||
|     (define r (get url)) |     (define r (get url)) | ||||||
|     (define declared-type (response-headers-ref r 'content-type)) |     (define declared-type (response-headers-ref r 'content-type)) | ||||||
|     (define final-type (if (equal? declared-type #"application/octet-stream") |     (define final-type (if (equal? declared-type #"application/octet-stream") | ||||||
|                            (let ([sniff-entity (message-entity (mime-analyze (response-body r)))]) |                            (let ([sniff-entity (message-entity (mime-analyze (response-body r)))]) | ||||||
|                              (string->bytes/latin-1 (format "~a/~a" (entity-type sniff-entity) (entity-subtype sniff-entity)))) |                              (string->bytes/latin-1 (format "~a/~a" (entity-type sniff-entity) (entity-subtype sniff-entity)))) | ||||||
|                            declared-type)) |                            declared-type)) | ||||||
|     (define ext (bytes->string/latin-1 (mime-type->ext final-type))) |     (define ext | ||||||
|  |       (with-handlers ([exn:fail:contract? (λ _ (error 'save-each-image "no ext found for mime type `~a` in file ~a" final-type url))]) | ||||||
|  |         (bytes->string/latin-1 (mime-type->ext final-type)))) | ||||||
|     ;; save |     ;; save | ||||||
|     (define save-dir (hash->save-dir wikiname hash)) |     (define save-dir (hash->save-dir wikiname hash)) | ||||||
|     (make-directory* save-dir) |     (make-directory* save-dir) | ||||||
|     (define save-path (build-path save-dir (string-append hash "." ext))) |     (define save-path (build-path save-dir (string-append hash "." ext))) | ||||||
|     (define body (response-body r)) |     (define body (response-body r)) | ||||||
|     (display-to-file body save-path #:exists 'replace) |     (display-to-file body save-path #:exists 'replace) | ||||||
|     (query-exec slc "update image set progress = 1, ext = ? where wikiname = ? and hash = ?" |     (query-exec* "update image set progress = 1, ext = ? where wikiname = ? and hash = ?" | ||||||
|                 ext wikiname hash) |                 ext wikiname hash) | ||||||
|     (callback (+ already-done-count i) (+ already-done-count not-done-count) (string-append hash "." ext))) |     (callback (+ already-done-count i) (+ already-done-count not-done-count) (string-append (substring hash 0 6) "..." ext))) | ||||||
|   ;; TODO: saved all images, register that fact in the database |   ;; saved all images, register that fact in the database | ||||||
|   ) |   (query-exec* "update wiki set progress = 4 where wikiname = ?" wikiname)) | ||||||
| 
 | 
 | ||||||
|  | (define all-stages | ||||||
|  |   (list | ||||||
|  |    if-necessary-download-list-of-pages | ||||||
|  |    save-each-page | ||||||
|  |    if-necessary-download-and-check-styles | ||||||
|  |    ;; check-json-for-images | ||||||
|  |    save-each-image)) | ||||||
| 
 | 
 | ||||||
| (module+ test | (module+ test | ||||||
|   (check-equal? (html->xexp "<img src=\"https://example.com/images?src=Blah.jpg&width=150\">") |   (check-equal? (html->xexp "<img src=\"https://example.com/images?src=Blah.jpg&width=150\">") | ||||||
|  | @ -299,11 +323,13 @@ | ||||||
|   #;(do-step-3 "gallowmere") |   #;(do-step-3 "gallowmere") | ||||||
|   #;(save-each-image "gallowmere" (hash-ref sources 'style) (λ (a b c) (printf "~a/~a ~a~n" a b c))) |   #;(save-each-image "gallowmere" (hash-ref sources 'style) (λ (a b c) (printf "~a/~a ~a~n" a b c))) | ||||||
| 
 | 
 | ||||||
|   #;(for ([wikiname (query-list slc "select wikiname from wiki")]) |   #;(for ([wikiname (query-list* "select wikiname from wiki")]) | ||||||
|       (println wikiname) |       (println wikiname) | ||||||
|       (insert-wiki-entry wikiname)) |       (insert-wiki-entry wikiname)) | ||||||
| 
 | 
 | ||||||
|   #;(for ([wikiname (query-list slc "select wikiname from wiki")]) |   #;(for ([wikiname (query-list* "select wikiname from wiki")]) | ||||||
|       (println wikiname) |       (println wikiname) | ||||||
|       (do-step-3 wikiname) |       (do-step-3 wikiname) | ||||||
|       (save-each-image wikiname (hash-ref sources 'style) (λ (a b c) (printf "~a/~a ~a~n" a b c))))) |       (save-each-image wikiname (hash-ref sources 'style) (λ (a b c) (printf "~a/~a ~a~n" a b c))))) | ||||||
|  | 
 | ||||||
|  | ; (for ([stage all-stages]) (stage "create" (λ (a b c) (printf "~a/~a ~a~n" a b c)))) | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue