forked from cadence/breezewiki
		
	Rewrite archiver project
* Rewrite archiver.rkt to manage the stages order * Rewrite archiver-gui.rkt: * Remembers the previous incomplete queue items * Pretty graphics for icons and progress bars * Segmented progress bars to indicate different stages * Fix archiver-cli.rkt to use new stages * Switch to req -d, so it doesn't auto-install gui libs
This commit is contained in:
		
							parent
							
								
									453570bdc9
								
							
						
					
					
						commit
						cf74ffb0e2
					
				
					 4 changed files with 536 additions and 350 deletions
				
			
		| 
						 | 
				
			
			@ -6,24 +6,24 @@
 | 
			
		|||
             ""
 | 
			
		||||
             "Downloaded pages go into `archive/` next to the executable."
 | 
			
		||||
             "Database goes into `archiver.db*` next to the executable."
 | 
			
		||||
             "The database is necessary to store your download progress and resume where you left off if the process is interrupted.")
 | 
			
		||||
      (ps ""
 | 
			
		||||
          "Default output style is `progress` in a tty and `lines` otherwise."))
 | 
			
		||||
             "The database is necessary to store your download progress and resume where you left off if the process is interrupted."))
 | 
			
		||||
 | 
			
		||||
(flag (output-quiet?)
 | 
			
		||||
      ("-q" "--output-quiet" "disable progress output")
 | 
			
		||||
      (output-quiet? #t))
 | 
			
		||||
 | 
			
		||||
(flag (output-lines?)
 | 
			
		||||
      ("-l" "--output-lines" "output the name of each file downloaded")
 | 
			
		||||
      (output-lines? #t))
 | 
			
		||||
 | 
			
		||||
(flag (output-progress?)
 | 
			
		||||
      ("-p" "--output-progress" "progress output for terminals")
 | 
			
		||||
      ("-p" "--output-progress" "progress output for terminals (default in a tty)")
 | 
			
		||||
      (output-progress? #t))
 | 
			
		||||
 | 
			
		||||
(flag (output-lines?)
 | 
			
		||||
      ("-l" "--output-lines" "output the name of each file downloaded (default outside of a tty)")
 | 
			
		||||
      (output-lines? #t))
 | 
			
		||||
 | 
			
		||||
(constraint (one-of output-quiet? output-lines? output-progress?))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
(program
 | 
			
		||||
 (start [wikiname "wikiname to download"])
 | 
			
		||||
 ;; set up arguments
 | 
			
		||||
| 
						 | 
				
			
			@ -42,30 +42,27 @@
 | 
			
		|||
 ;; check
 | 
			
		||||
 (when (or (not wikiname) (equal? wikiname ""))
 | 
			
		||||
   (raise-user-error "Please specify the wikiname to download on the command line."))
 | 
			
		||||
 ;; stage 1
 | 
			
		||||
 (cond [(output-lines?) (displayln "Downloading list of pages...")]
 | 
			
		||||
       [(output-progress?) (printf "Downloading list of pages... \r")])
 | 
			
		||||
 (if-necessary-download-list-of-pages
 | 
			
		||||
  wikiname
 | 
			
		||||
  (λ (a b c)
 | 
			
		||||
    (cond [(output-progress?) (printf "Downloading list of pages... [~a/~b]\r" a b)])))
 | 
			
		||||
 ;; stage 2
 | 
			
		||||
 (save-each-page
 | 
			
		||||
  wikiname
 | 
			
		||||
  (λ (a b c)
 | 
			
		||||
    (define basename (basename->name-for-query c))
 | 
			
		||||
    (cond
 | 
			
		||||
      [(output-lines?)
 | 
			
		||||
       (displayln basename)]
 | 
			
		||||
      [(output-progress?)
 | 
			
		||||
       (when (eq? (modulo a 20) 0)
 | 
			
		||||
         (thread (λ () (update-width))))
 | 
			
		||||
       (define prefix (format "[~a/~a] " a b))
 | 
			
		||||
       (define rest (- width (string-length prefix)))
 | 
			
		||||
       (define real-width (min (string-length basename) rest))
 | 
			
		||||
       (define spare-width (- rest real-width))
 | 
			
		||||
       (define name-display (substring basename 0 real-width))
 | 
			
		||||
       (define whitespace (make-string spare-width #\ ))
 | 
			
		||||
       (printf "~a~a~a\r" prefix name-display whitespace)]))))
 | 
			
		||||
 ;; progress reporting based on selected mode
 | 
			
		||||
 (define (report-progress a b c)
 | 
			
		||||
   (define basename (basename->name-for-query c))
 | 
			
		||||
   (cond
 | 
			
		||||
     [(output-lines?)
 | 
			
		||||
      (displayln basename)]
 | 
			
		||||
     [(output-progress?)
 | 
			
		||||
      (when (eq? (modulo a 20) 0)
 | 
			
		||||
        (thread (λ () (update-width))))
 | 
			
		||||
      (define prefix (format "[~a] [~a/~a] " wikiname a b))
 | 
			
		||||
      (define rest (- width (string-length prefix)))
 | 
			
		||||
      (define real-width (min (string-length basename) rest))
 | 
			
		||||
      (define spare-width (- rest real-width))
 | 
			
		||||
      (define name-display (substring basename 0 real-width))
 | 
			
		||||
      (define whitespace (make-string spare-width #\ ))
 | 
			
		||||
      (printf "~a~a~a\r" prefix name-display whitespace)]))
 | 
			
		||||
 ;; download all stages
 | 
			
		||||
 (for ([stage all-stages]
 | 
			
		||||
       [i (in-naturals 1)])
 | 
			
		||||
   (printf "> Stage ~a/~a~n" i (length all-stages))
 | 
			
		||||
   (stage wikiname report-progress)
 | 
			
		||||
   (displayln "")))
 | 
			
		||||
 | 
			
		||||
(run start)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,7 +1,7 @@
 | 
			
		|||
#lang racket/base
 | 
			
		||||
(require racket/list
 | 
			
		||||
(require racket/file
 | 
			
		||||
         racket/list
 | 
			
		||||
         racket/path
 | 
			
		||||
         racket/runtime-path
 | 
			
		||||
         racket/string
 | 
			
		||||
         json
 | 
			
		||||
         json-pointer
 | 
			
		||||
| 
						 | 
				
			
			@ -9,9 +9,16 @@
 | 
			
		|||
         "../lib/syntax.rkt")
 | 
			
		||||
 | 
			
		||||
(provide
 | 
			
		||||
 slc)
 | 
			
		||||
 get-slc
 | 
			
		||||
 query-exec*
 | 
			
		||||
 query-rows*
 | 
			
		||||
 query-list*
 | 
			
		||||
 query-value*
 | 
			
		||||
 query-maybe-value*
 | 
			
		||||
 query-maybe-row*)
 | 
			
		||||
 | 
			
		||||
(define-runtime-path database-file "../storage/archiver.db")
 | 
			
		||||
(define storage-path (anytime-path ".." "storage"))
 | 
			
		||||
(define database-file (build-path storage-path "archiver.db"))
 | 
			
		||||
 | 
			
		||||
(define migrations
 | 
			
		||||
  (wrap-sql
 | 
			
		||||
| 
						 | 
				
			
			@ -25,23 +32,50 @@
 | 
			
		|||
    (query-exec slc "alter table wiki add column license_text TEXT")
 | 
			
		||||
    (query-exec slc "alter table wiki add column license_url TEXT"))))
 | 
			
		||||
 | 
			
		||||
(define slc (sqlite3-connect #:database database-file #:mode 'create))
 | 
			
		||||
(query-exec slc "PRAGMA journal_mode=WAL")
 | 
			
		||||
(define database-version
 | 
			
		||||
  (with-handlers ([exn:fail:sql?
 | 
			
		||||
                   (λ (exn)
 | 
			
		||||
                     ; need to set up the database
 | 
			
		||||
                     (query-exec slc "create table database_version (version integer, primary key (version))")
 | 
			
		||||
                     (query-exec slc "insert into database_version values (0)")
 | 
			
		||||
                     0)])
 | 
			
		||||
    (query-value slc "select version from database_version")))
 | 
			
		||||
(define slc (box #f))
 | 
			
		||||
(define (get-slc)
 | 
			
		||||
  (define slc* (unbox slc))
 | 
			
		||||
  (cond
 | 
			
		||||
    [slc* slc*]
 | 
			
		||||
    [else
 | 
			
		||||
     (make-directory* storage-path)
 | 
			
		||||
     (define slc* (sqlite3-connect #:database database-file #:mode 'create))
 | 
			
		||||
     (query-exec slc* "PRAGMA journal_mode=WAL")
 | 
			
		||||
     (define database-version
 | 
			
		||||
       (with-handlers ([exn:fail:sql?
 | 
			
		||||
                        (λ (exn)
 | 
			
		||||
                          ; need to set up the database
 | 
			
		||||
                          (query-exec slc* "create table database_version (version integer, primary key (version))")
 | 
			
		||||
                          (query-exec slc* "insert into database_version values (0)")
 | 
			
		||||
                          0)])
 | 
			
		||||
         (query-value slc* "select version from database_version")))
 | 
			
		||||
 | 
			
		||||
(let do-migrate-step ()
 | 
			
		||||
  (when (database-version . < . (length migrations))
 | 
			
		||||
    (call-with-transaction
 | 
			
		||||
     slc
 | 
			
		||||
     (list-ref migrations database-version))
 | 
			
		||||
    (set! database-version (add1 database-version))
 | 
			
		||||
    (query-exec slc "update database_version set version = $1" database-version)
 | 
			
		||||
    (do-migrate-step)))
 | 
			
		||||
     (let do-migrate-step ()
 | 
			
		||||
       (when (database-version . < . (length migrations))
 | 
			
		||||
         (call-with-transaction
 | 
			
		||||
          slc*
 | 
			
		||||
          (list-ref migrations database-version))
 | 
			
		||||
         (set! database-version (add1 database-version))
 | 
			
		||||
         (query-exec slc* "update database_version set version = $1" database-version)
 | 
			
		||||
         (do-migrate-step)))
 | 
			
		||||
 | 
			
		||||
     (set-box! slc slc*)
 | 
			
		||||
     slc*]))
 | 
			
		||||
 | 
			
		||||
(define (query-exec* . args)
 | 
			
		||||
  (apply query-exec (get-slc) args))
 | 
			
		||||
 | 
			
		||||
(define (query-rows* . args)
 | 
			
		||||
  (apply query-rows (get-slc) args))
 | 
			
		||||
 | 
			
		||||
(define (query-list* . args)
 | 
			
		||||
  (apply query-list (get-slc) args))
 | 
			
		||||
 | 
			
		||||
(define (query-value* . args)
 | 
			
		||||
  (apply query-value (get-slc) args))
 | 
			
		||||
 | 
			
		||||
(define (query-maybe-value* . args)
 | 
			
		||||
  (apply query-maybe-value (get-slc) args))
 | 
			
		||||
 | 
			
		||||
(define (query-maybe-row* . args)
 | 
			
		||||
  (apply query-maybe-row (get-slc) args))
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,89 +1,128 @@
 | 
			
		|||
#lang racket/base
 | 
			
		||||
(require racket/class
 | 
			
		||||
         racket/draw
 | 
			
		||||
         racket/format
 | 
			
		||||
         racket/list
 | 
			
		||||
         racket/port
 | 
			
		||||
         racket/set
 | 
			
		||||
         racket/splicing
 | 
			
		||||
         racket/string
 | 
			
		||||
         db
 | 
			
		||||
         net/http-easy
 | 
			
		||||
         memo
 | 
			
		||||
         (only-in racket/gui timer%)
 | 
			
		||||
         racket/gui/easy
 | 
			
		||||
         racket/gui/easy/operator
 | 
			
		||||
         (only-in pict bitmap)
 | 
			
		||||
         images/icons/style
 | 
			
		||||
         images/icons/control
 | 
			
		||||
         images/icons/stickman
 | 
			
		||||
         images/icons/symbol
 | 
			
		||||
         "archiver-database.rkt"
 | 
			
		||||
         "archiver.rkt"
 | 
			
		||||
         "../lib/url-utils.rkt"
 | 
			
		||||
         "../lib/xexpr-utils.rkt")
 | 
			
		||||
 | 
			
		||||
(define active-threads (mutable-seteq))
 | 
			
		||||
(default-icon-material rubber-icon-material)
 | 
			
		||||
 | 
			
		||||
(require (for-syntax racket/base racket/match racket/set racket/string))
 | 
			
		||||
 | 
			
		||||
(define-syntax (@> stx)
 | 
			
		||||
  (define form (cdr (syntax->datum stx)))
 | 
			
		||||
  (match form
 | 
			
		||||
    [(list form) ; (@> (fn @obs))
 | 
			
		||||
     ;; identify the observables and replace with non-@ symbols
 | 
			
		||||
     (define collection (mutable-set))
 | 
			
		||||
     (define updated
 | 
			
		||||
       (let loop ([sexp form])
 | 
			
		||||
         (cond [(symbol? sexp)
 | 
			
		||||
                (let ([as-s (symbol->string sexp)])
 | 
			
		||||
                  (if (string-prefix? as-s "@")
 | 
			
		||||
                      (let ([without-@ (string->symbol (substring as-s 1))])
 | 
			
		||||
                        (set-add! collection (cons sexp without-@))
 | 
			
		||||
                        without-@)
 | 
			
		||||
                      sexp))]
 | 
			
		||||
               [(pair? sexp) (cons (loop (car sexp)) (loop (cdr sexp)))]
 | 
			
		||||
               [#t sexp])))
 | 
			
		||||
     (define collection-l (set->list collection))
 | 
			
		||||
     ;; return obs-combine -> updated-form
 | 
			
		||||
     (datum->syntax stx `(obs-combine (λ (,@(map cdr collection-l)) ,updated) ,@(map car collection-l)))]
 | 
			
		||||
    [(list (? string? str) args ...) ; (@> "Blah: ~a/~a" @arg1 arg2)
 | 
			
		||||
     ;; identify the observables and replace with non-@ symbols
 | 
			
		||||
     (define collection-l
 | 
			
		||||
       (for/list ([arg args])
 | 
			
		||||
         (if (symbol? arg)
 | 
			
		||||
             (let ([as-s (symbol->string arg)])
 | 
			
		||||
               (if (string-prefix? as-s "@")
 | 
			
		||||
                   (let ([without-@ (string->symbol (substring as-s 1))])
 | 
			
		||||
                     (cons arg without-@))
 | 
			
		||||
                   (cons #f arg)))
 | 
			
		||||
             (cons #f arg))))
 | 
			
		||||
     (define collection-lo (filter car collection-l))
 | 
			
		||||
     ;; return obs-combine -> format
 | 
			
		||||
     (datum->syntax stx `(obs-combine (λ (,@(map cdr collection-lo)) (format ,str ,@(map cdr collection-l))) ,@(map car collection-lo)))]))
 | 
			
		||||
 | 
			
		||||
(define/obs @auto-retry #f)
 | 
			
		||||
(define/obs @wikiname "")
 | 
			
		||||
(define/obs @state 'waiting)
 | 
			
		||||
(define/obs @num-pages 1)
 | 
			
		||||
(define/obs @done-pages 0)
 | 
			
		||||
(define/obs @just-done "")
 | 
			
		||||
(define/obs @queue '())
 | 
			
		||||
(define @title
 | 
			
		||||
  (obs-combine
 | 
			
		||||
   (λ (state queue num-pages done-pages)
 | 
			
		||||
     (define suffix (if (pair? queue)
 | 
			
		||||
                        (format " +~a" (length queue))
 | 
			
		||||
                        ""))
 | 
			
		||||
     (define progress (if (eq? num-pages 0)
 | 
			
		||||
                          " 0%"
 | 
			
		||||
                          (format " ~a%" (round (inexact->exact (* (/ done-pages num-pages) 100))))))
 | 
			
		||||
     (case state
 | 
			
		||||
       [(waiting stage-0) (format "Fandom Archiver~a" suffix)]
 | 
			
		||||
       [(stage-1) (format "Fandom Archiver 0%~a" suffix)]
 | 
			
		||||
       [(stage-2) (format "Fandom Archiver~a~a" progress suffix)]
 | 
			
		||||
       [(err) "ERROR Fandom Archiver"]
 | 
			
		||||
       [(done) "Fandom Archiver 100%"]))
 | 
			
		||||
   @state @queue @num-pages (obs-throttle @done-pages #:duration 5000)))
 | 
			
		||||
 | 
			
		||||
(define-syntax-rule (t body ...)
 | 
			
		||||
  (set-add! active-threads (thread (λ () body ...))))
 | 
			
		||||
(define-struct qi^ (wikiname st stage progress max-progress eta th) #:transparent) ;; queue item
 | 
			
		||||
 | 
			
		||||
(define (do-start-or-queue)
 | 
			
		||||
  (define wikiname (obs-peek @wikiname))
 | 
			
		||||
  (:= @wikiname "")
 | 
			
		||||
  (when (not (equal? (string-trim wikiname) ""))
 | 
			
		||||
    (@queue . <~ . (λ (q) (append q (list wikiname))))
 | 
			
		||||
    (shift-queue-maybe)))
 | 
			
		||||
(define rows (query-rows* "select wikiname, progress from wiki where progress < 4"))
 | 
			
		||||
(define/obs @queue null)
 | 
			
		||||
(define (add-wikiname-to-queue wikiname st stage)
 | 
			
		||||
  (@queue . <~ . (λ (queue)
 | 
			
		||||
                   (define already-exists? (findf (λ (qi) (equal? (qi^-wikiname qi) wikiname)) queue))
 | 
			
		||||
                   (if already-exists?
 | 
			
		||||
                       queue
 | 
			
		||||
                       (append queue (list (qi^ wikiname st stage 0 1 "..." #f)))))))
 | 
			
		||||
(for ([row rows])
 | 
			
		||||
  (add-wikiname-to-queue (vector-ref row 0)
 | 
			
		||||
                         (if (= (vector-ref row 1) 4)
 | 
			
		||||
                             'complete
 | 
			
		||||
                             'queued)
 | 
			
		||||
                         (vector-ref row 1)))
 | 
			
		||||
 | 
			
		||||
(define (shift-queue-maybe)
 | 
			
		||||
  (when (memq (obs-peek @state) '(waiting done))
 | 
			
		||||
    (define q (obs-peek @queue))
 | 
			
		||||
    (cond
 | 
			
		||||
      [(pair? q)
 | 
			
		||||
       (define wikiname (car q))
 | 
			
		||||
       (:= @queue (cdr q))
 | 
			
		||||
       (do-start-stage1 wikiname)]
 | 
			
		||||
      [#t (:= @state 'done)])))
 | 
			
		||||
(define status-icon-size 32)
 | 
			
		||||
(define status-icon-min-width 36)
 | 
			
		||||
(define button-icon-size 12)
 | 
			
		||||
 | 
			
		||||
(define (do-start-stage1 wikiname)
 | 
			
		||||
  (:= @just-done "")
 | 
			
		||||
  (:= @done-pages 0)
 | 
			
		||||
  (:= @num-pages 1)
 | 
			
		||||
  (t (with-handlers ([exn:fail? (handle-graphical-exn wikiname)])
 | 
			
		||||
       (:= @state 'stage-0)
 | 
			
		||||
       (if-necessary-download-list-of-pages wikiname (λ (now-done num-pages just-done-name)
 | 
			
		||||
                                                       (:= @num-pages num-pages)
 | 
			
		||||
                                                       (:= @done-pages now-done)
 | 
			
		||||
                                                       (:= @just-done just-done-name)
 | 
			
		||||
                                                       (:= @state 'stage-1)))
 | 
			
		||||
       (do-start-stage2 wikiname))))
 | 
			
		||||
(define color-green (make-color 90 212 68))
 | 
			
		||||
 | 
			
		||||
(define (do-start-stage2 wikiname)
 | 
			
		||||
  (:= @just-done "")
 | 
			
		||||
  (:= @num-pages 1)
 | 
			
		||||
  (:= @done-pages 0)
 | 
			
		||||
  (t (with-handlers ([exn:fail? (handle-graphical-exn wikiname)])
 | 
			
		||||
       (save-each-page wikiname (λ (now-done num-pages just-done-path)
 | 
			
		||||
                                  (:= @num-pages num-pages)
 | 
			
		||||
                                  (:= @done-pages now-done)
 | 
			
		||||
                                  (:= @just-done just-done-path)))
 | 
			
		||||
       (:= @state 'waiting)
 | 
			
		||||
       (shift-queue-maybe)))
 | 
			
		||||
  (:= @state 'stage-2))
 | 
			
		||||
(define/obs @input "")
 | 
			
		||||
 | 
			
		||||
(splicing-let ([frame-count 30])
 | 
			
		||||
  (define stickman-frames
 | 
			
		||||
    (for/vector ([s (in-range 0 1 (/ 1 frame-count))])
 | 
			
		||||
      (running-stickman-icon
 | 
			
		||||
       s
 | 
			
		||||
       #:height status-icon-size
 | 
			
		||||
       #:material (default-icon-material))))
 | 
			
		||||
 | 
			
		||||
  (define/obs @stick-frame-no 0)
 | 
			
		||||
  (define stick-timer
 | 
			
		||||
    (new timer%
 | 
			
		||||
         [notify-callback (λ () (@stick-frame-no . <~ . add1))]
 | 
			
		||||
         [interval (truncate (/ 1000 frame-count))]))
 | 
			
		||||
  (define/obs @stick
 | 
			
		||||
    (@stick-frame-no . ~> . (λ (n) (vector-ref stickman-frames
 | 
			
		||||
                                               (modulo n (vector-length stickman-frames)))))))
 | 
			
		||||
 | 
			
		||||
(define status-icons
 | 
			
		||||
  (hasheq 'queued (stop-icon #:color syntax-icon-color #:height status-icon-size)
 | 
			
		||||
          'paused (continue-forward-icon #:color syntax-icon-color #:height status-icon-size)
 | 
			
		||||
          'running @stick
 | 
			
		||||
          'error (x-icon #:height status-icon-size)
 | 
			
		||||
          'complete (check-icon #:color color-green #:height status-icon-size)))
 | 
			
		||||
 | 
			
		||||
(define action-icons
 | 
			
		||||
  (hasheq 'pause (pause-icon #:color syntax-icon-color #:height button-icon-size)
 | 
			
		||||
          'resume (play-icon #:color color-green #:height button-icon-size)))
 | 
			
		||||
 | 
			
		||||
(define (bitmap-view @the-bitmap [min-width 1])
 | 
			
		||||
  (pict-canvas #:min-size (@> (list (max min-width (send @the-bitmap get-width)) (send @the-bitmap get-height))) #;(if min-size (list min-size min-size) #f)
 | 
			
		||||
               #:stretch '(#f #f)
 | 
			
		||||
               #:style '(transparent)
 | 
			
		||||
               @the-bitmap
 | 
			
		||||
               bitmap))
 | 
			
		||||
 | 
			
		||||
(define (exn->string e)
 | 
			
		||||
  (with-output-to-string
 | 
			
		||||
| 
						 | 
				
			
			@ -96,13 +135,15 @@
 | 
			
		|||
          (printf ": ~a" (car item)))
 | 
			
		||||
        (displayln "")))))
 | 
			
		||||
 | 
			
		||||
(define ((handle-graphical-exn wikiname) e)
 | 
			
		||||
(define ((handle-graphical-exn @qi) e)
 | 
			
		||||
  (displayln (exn->string e) (current-error-port))
 | 
			
		||||
  (cond
 | 
			
		||||
    [(obs-peek @auto-retry)
 | 
			
		||||
     (do-retry-end wikiname)]
 | 
			
		||||
     (void) ;; TODO
 | 
			
		||||
     #;(do-retry-end wikiname)]
 | 
			
		||||
    [#t
 | 
			
		||||
     (:= @state 'err)
 | 
			
		||||
     (update-qi @qi [st 'error])
 | 
			
		||||
     (do-try-unpause-next-entry)
 | 
			
		||||
     (thread
 | 
			
		||||
      (λ ()
 | 
			
		||||
        (define/obs @visible? #t)
 | 
			
		||||
| 
						 | 
				
			
			@ -116,89 +157,177 @@
 | 
			
		|||
                         (input #:style '(multiple hscroll)
 | 
			
		||||
                                #:min-size '(#f 200)
 | 
			
		||||
                                (exn->string e))
 | 
			
		||||
                         (button "Retry Now" (λ () (:= @visible? #f) (do-retry-now wikiname)))
 | 
			
		||||
                         (button "Retry Round-Robin" (λ () (:= @visible? #f) (do-retry-end wikiname)))
 | 
			
		||||
                         (button "Skip Wiki" (λ () (:= @visible? #f) (do-continue)))
 | 
			
		||||
                         (button "Use Auto-Retry" (λ ()
 | 
			
		||||
                                                    (:= @auto-retry #t)
 | 
			
		||||
                                                    (:= @visible? #f)
 | 
			
		||||
                                                    (do-retry-end wikiname)))
 | 
			
		||||
                         (text "Be careful not to auto-retry an infinite loop!")))
 | 
			
		||||
                         ;; TODO
 | 
			
		||||
                         #;(button "Retry Now" (λ () (:= @visible? #f) (do-retry-now wikiname)))
 | 
			
		||||
                         #;(button "Retry Round-Robin" (λ () (:= @visible? #f) (do-retry-end wikiname)))
 | 
			
		||||
                         #;(button "Skip Wiki" (λ () (:= @visible? #f) (do-continue)))
 | 
			
		||||
                         #;(button "Use Auto-Retry" (λ ()
 | 
			
		||||
                                                      (:= @auto-retry #t)
 | 
			
		||||
                                                      (:= @visible? #f)
 | 
			
		||||
                                                      (do-retry-end wikiname)))
 | 
			
		||||
                         #;(text "Be careful not to auto-retry an infinite loop!")))
 | 
			
		||||
         main-window)))
 | 
			
		||||
     (sleep)
 | 
			
		||||
     ; make sure the old broken threads are all gone
 | 
			
		||||
     (for ([th active-threads]) (kill-thread th))
 | 
			
		||||
     (set-clear! active-threads)]))
 | 
			
		||||
     ; make sure the broken thread is gone
 | 
			
		||||
     (define th (qi^-th (obs-peek @qi)))
 | 
			
		||||
     (when th (kill-thread th))]))
 | 
			
		||||
 | 
			
		||||
(define (do-retry-now wikiname)
 | 
			
		||||
  (@queue . <~ . (λ (q) (append (list wikiname) q)))
 | 
			
		||||
  (:= @state 'waiting)
 | 
			
		||||
  (shift-queue-maybe))
 | 
			
		||||
(define segments
 | 
			
		||||
  (list
 | 
			
		||||
   (list 5/100 (make-color 0 223 217))
 | 
			
		||||
   (list 88/100 color-green)
 | 
			
		||||
   (list 2/100 (make-color 0 223 217))
 | 
			
		||||
   (list 5/100 color-green)))
 | 
			
		||||
(define segment-spacing 2)
 | 
			
		||||
(unless (= (apply + (map car segments)) 1)
 | 
			
		||||
  (error 'segments "segments add up to ~a, not 1" (apply + (map car segments))))
 | 
			
		||||
 | 
			
		||||
(define (do-retry-end wikiname)
 | 
			
		||||
  (@queue . <~ . (λ (q) (append q (list wikiname))))
 | 
			
		||||
  (:= @state 'waiting)
 | 
			
		||||
  (shift-queue-maybe))
 | 
			
		||||
;; return the new bitmap, which can be drawn on a dc<%>
 | 
			
		||||
(define/memoize (ray-trace width height stage progress max-progress)
 | 
			
		||||
  ;; (printf "rendering ~a ~a/~a at ~a~n" stage progress max-progress (current-inexact-milliseconds))
 | 
			
		||||
  (define bm (make-object bitmap% width height #f #t))
 | 
			
		||||
  (define dc (make-object bitmap-dc% bm))
 | 
			
		||||
  (define width-available (- width (* (length segments) segment-spacing)))
 | 
			
		||||
  (send dc set-smoothing 'unsmoothed)
 | 
			
		||||
  (send dc set-pen "black" 0 'transparent)
 | 
			
		||||
  (for/fold ([offset 0])
 | 
			
		||||
            ([segment segments]
 | 
			
		||||
             [i (in-naturals 0)]) ;; zero indexed stages?
 | 
			
		||||
    ;; calculate start and end locations of grey bar
 | 
			
		||||
    (define-values (segment-proportion segment-color) (apply values segment))
 | 
			
		||||
    (define segment-start (if (= offset 0) 0 (+ offset segment-spacing)))
 | 
			
		||||
    (define segment-width (* width-available segment-proportion))
 | 
			
		||||
    ;; draw grey bar
 | 
			
		||||
    (send dc set-brush (make-color 180 180 180 0.4) 'solid)
 | 
			
		||||
    (send dc draw-rectangle segment-start 0 segment-width height)
 | 
			
		||||
    ;; draw solid bar according to the current item's progress
 | 
			
		||||
    (define proportion
 | 
			
		||||
      (cond [(stage . < . i) 0]
 | 
			
		||||
            [(stage . > . i) 1]
 | 
			
		||||
            [(max-progress . <= . 0) 0]
 | 
			
		||||
            [(progress . < . 0) 0]
 | 
			
		||||
            [(progress . >= . max-progress) 1]
 | 
			
		||||
            [else (progress . / . max-progress)]))
 | 
			
		||||
    (send dc set-brush segment-color 'solid)
 | 
			
		||||
    (send dc draw-rectangle segment-start 0 (* proportion segment-width) height)
 | 
			
		||||
    (+ segment-start segment-width))
 | 
			
		||||
  (bitmap-render-icon bm 6/8))
 | 
			
		||||
 | 
			
		||||
(define (do-continue)
 | 
			
		||||
  (:= @state 'waiting)
 | 
			
		||||
  (shift-queue-maybe))
 | 
			
		||||
;; get ray traced bitmap (possibly from cache) and draw on dc<%>
 | 
			
		||||
(define (draw-bar orig-dc qi)
 | 
			
		||||
  ;; (println ray-traced)
 | 
			
		||||
  (define-values (width height) (send orig-dc get-size))
 | 
			
		||||
  (send orig-dc draw-bitmap (ray-trace width height  (qi^-stage qi) (qi^-progress qi) (qi^-max-progress qi)) 0 0))
 | 
			
		||||
 | 
			
		||||
(define (display-basename basename)
 | 
			
		||||
  (define limit 40)
 | 
			
		||||
  (cond [(string? basename)
 | 
			
		||||
         (define query (basename->name-for-query basename))
 | 
			
		||||
         (define segments (string-split query "/"))
 | 
			
		||||
         (when (and ((string-length query) . > . limit) ((length segments) . >= . 2))
 | 
			
		||||
           (set! query (string-append ".../" (last segments))))
 | 
			
		||||
         (when ((string-length query) . > . limit)
 | 
			
		||||
           (set! query (string-append (substring query 0 (- limit 3)) "...")))
 | 
			
		||||
         query]
 | 
			
		||||
        [#t "?"]))
 | 
			
		||||
(define ((make-progress-updater @qi) a b c)
 | 
			
		||||
  ;; (printf "~a: ~a/~a ~a~n" (qi^-wikiname (obs-peek @qi)) a b c)
 | 
			
		||||
  (update-qi @qi [progress a] [max-progress b]))
 | 
			
		||||
 | 
			
		||||
(define (do-add-to-queue)
 | 
			
		||||
  (define wikiname (string-trim (obs-peek @input)))
 | 
			
		||||
  (when ((string-length wikiname) . > . 0)
 | 
			
		||||
    (add-wikiname-to-queue wikiname 'queued 0)) ;; TODO: automatically start?
 | 
			
		||||
  (:= @input ""))
 | 
			
		||||
 | 
			
		||||
(define-syntax-rule (update-qi @qi args ...)
 | 
			
		||||
  (let ([wikiname (qi^-wikiname (obs-peek @qi))])
 | 
			
		||||
    (@queue . <~ . (λ (queue)
 | 
			
		||||
                     (for/list ([qi queue])
 | 
			
		||||
                       (if (equal? (qi^-wikiname qi) wikiname)
 | 
			
		||||
                           (struct-copy qi^ qi args ...)
 | 
			
		||||
                           qi))))))
 | 
			
		||||
 | 
			
		||||
(define (do-start-qi @qi)
 | 
			
		||||
  (define th
 | 
			
		||||
    (thread (λ ()
 | 
			
		||||
              (with-handlers ([exn? (handle-graphical-exn @qi)])
 | 
			
		||||
                (define last-stage
 | 
			
		||||
                  (for/last ([stage all-stages]
 | 
			
		||||
                             [i (in-naturals)])
 | 
			
		||||
                    (update-qi @qi [stage i])
 | 
			
		||||
                    (stage (qi^-wikiname (obs-peek @qi)) (make-progress-updater @qi))
 | 
			
		||||
                    i))
 | 
			
		||||
                (update-qi @qi [st 'complete] [stage (add1 last-stage)])
 | 
			
		||||
                (do-try-unpause-next-entry)))))
 | 
			
		||||
  (update-qi @qi [st 'running] [th th]))
 | 
			
		||||
 | 
			
		||||
(define (do-stop-qi @qi)
 | 
			
		||||
  (define th (qi^-th (obs-peek @qi)))
 | 
			
		||||
  (when th (kill-thread th))
 | 
			
		||||
  (update-qi @qi [th #f] [st 'paused]))
 | 
			
		||||
 | 
			
		||||
(define (do-try-unpause-next-entry)
 | 
			
		||||
  (define queue (obs-peek @queue))
 | 
			
		||||
  (define next-qi (for/first ([qi queue]
 | 
			
		||||
                              #:when (memq (qi^-st qi) '(paused queued error)))
 | 
			
		||||
                    qi))
 | 
			
		||||
  (when next-qi
 | 
			
		||||
    (define @qi (@queue . ~> . (λ (queue) (findf (λ (qi) (equal? (qi^-wikiname qi) (qi^-wikiname next-qi))) queue))))
 | 
			
		||||
    (do-start-qi @qi)))
 | 
			
		||||
 | 
			
		||||
(define main-window
 | 
			
		||||
  (render
 | 
			
		||||
   (window #:title @title
 | 
			
		||||
           #:size '(360 200)
 | 
			
		||||
           #:mixin (λ (%) (class %
 | 
			
		||||
                            (super-new)
 | 
			
		||||
                            (define/augment (on-close)
 | 
			
		||||
                              (for ([th active-threads]) (kill-thread th))
 | 
			
		||||
                              (disconnect slc))))
 | 
			
		||||
           ;; input box at the top
 | 
			
		||||
           (hpanel (text "https://")
 | 
			
		||||
                   (input @wikiname
 | 
			
		||||
                          (λ (event data) (cond
 | 
			
		||||
                                            [(eq? event 'input) (:= @wikiname data)]
 | 
			
		||||
                                            [(eq? event 'return) (do-start-or-queue)])))
 | 
			
		||||
                   (text ".fandom.com"))
 | 
			
		||||
           (button (@queue . ~> . (λ (q) (if (null? q) "Start" "Queue"))) (λ () (do-start-or-queue)))
 | 
			
		||||
           (text (@queue . ~> . (λ (q) (if (null? q) "" (string-join #:before-first "Queue: " q ", ")))))
 | 
			
		||||
           ;; show status based on overall application state
 | 
			
		||||
           (case-view
 | 
			
		||||
            @state
 | 
			
		||||
            ;; waiting for wikiname entry
 | 
			
		||||
            ((waiting) (vpanel
 | 
			
		||||
                        (text "Fill in the wikiname and click start.")))
 | 
			
		||||
            ((stage-0) (vpanel
 | 
			
		||||
                        (text "Checking data...")))
 | 
			
		||||
            ((stage-1) (vpanel
 | 
			
		||||
                        (text "Gathering list of pages...")
 | 
			
		||||
                        (text (@just-done . ~> . display-basename))
 | 
			
		||||
                        (text (@done-pages . ~> . (λ (x) (if (eq? x 0)
 | 
			
		||||
                                                             "0/?"
 | 
			
		||||
                                                             (format "~a/~a" x (obs-peek @num-pages))))))))
 | 
			
		||||
            ;; downloading contents
 | 
			
		||||
            ((stage-2) (vpanel
 | 
			
		||||
                        (text "Downloading page text...")
 | 
			
		||||
                        (progress @done-pages #:range @num-pages)
 | 
			
		||||
                        (text (@done-pages . ~> . (λ (x) (format "~a/~a" x (obs-peek @num-pages)))))
 | 
			
		||||
                        (text (@just-done . ~> . display-basename))))
 | 
			
		||||
            ((done) (vpanel
 | 
			
		||||
                     (text "All wikis downloaded!")))
 | 
			
		||||
            ((err) (vpanel
 | 
			
		||||
                      (text "Error. Check the popup window.")))
 | 
			
		||||
            (else (text (@state . ~> . (λ (state) (format "invalid state: ~a" state))))))
 | 
			
		||||
           (checkbox #:label "Auto-retry on error? (Dangerous)"
 | 
			
		||||
                     #:checked? @auto-retry
 | 
			
		||||
                     (λ:= @auto-retry)))))
 | 
			
		||||
   (window
 | 
			
		||||
    #:title "Fandom Archiver"
 | 
			
		||||
    #:size '(400 300)
 | 
			
		||||
    #:mixin (λ (%) (class %
 | 
			
		||||
                     (super-new)
 | 
			
		||||
                     (define/augment (on-close)
 | 
			
		||||
                       (send stick-timer stop)
 | 
			
		||||
                       (for ([qi (obs-peek @queue)])
 | 
			
		||||
                         (when (qi^-th qi)
 | 
			
		||||
                           (kill-thread (qi^-th qi))))
 | 
			
		||||
                       #;(disconnect*))))
 | 
			
		||||
    (vpanel
 | 
			
		||||
     #:spacing 10
 | 
			
		||||
     #:margin '(5 5)
 | 
			
		||||
     (hpanel
 | 
			
		||||
      #:stretch '(#t #f)
 | 
			
		||||
      #:spacing 10
 | 
			
		||||
      (hpanel
 | 
			
		||||
       (text "https://")
 | 
			
		||||
       (input @input
 | 
			
		||||
              (λ (event data) (cond
 | 
			
		||||
                                [(eq? event 'input) (:= @input data)]
 | 
			
		||||
                                [(eq? event 'return) (do-add-to-queue)])))
 | 
			
		||||
       (text ".fandom.com"))
 | 
			
		||||
      (button "Download Wiki" do-add-to-queue))
 | 
			
		||||
     (list-view
 | 
			
		||||
      #:style '(vertical)
 | 
			
		||||
      @queue
 | 
			
		||||
      #:key qi^-wikiname
 | 
			
		||||
      (λ (k @qi)
 | 
			
		||||
        (define @status-icons
 | 
			
		||||
          (@> (case (qi^-st @qi)
 | 
			
		||||
                [(running) @stick]
 | 
			
		||||
                [else (hash-ref status-icons (qi^-st @qi))])))
 | 
			
		||||
        (define @is-running?
 | 
			
		||||
          (@> (memq (qi^-st @qi) '(running))))
 | 
			
		||||
        ;; state icon at the left side
 | 
			
		||||
        (hpanel #:stretch '(#t #f)
 | 
			
		||||
                #:alignment '(left center)
 | 
			
		||||
                #:spacing 8
 | 
			
		||||
                (bitmap-view @status-icons status-icon-min-width)
 | 
			
		||||
                (vpanel
 | 
			
		||||
                 ;; name and buttons (top half)
 | 
			
		||||
                 (hpanel #:alignment '(left bottom)
 | 
			
		||||
                         (text (@> (qi^-wikiname @qi)))
 | 
			
		||||
                         (spacer)
 | 
			
		||||
                         (hpanel
 | 
			
		||||
                          #:stretch '(#f #f)
 | 
			
		||||
                          (if-view @is-running?
 | 
			
		||||
                                   (button (hash-ref action-icons 'pause)
 | 
			
		||||
                                           (λ () (do-stop-qi @qi)))
 | 
			
		||||
                                   (button (hash-ref action-icons 'resume)
 | 
			
		||||
                                           (λ () (do-start-qi @qi))))))
 | 
			
		||||
                 ;; progress bar (bottom half)
 | 
			
		||||
                 (hpanel
 | 
			
		||||
                  (canvas
 | 
			
		||||
                   @qi
 | 
			
		||||
                   #:style '(transparent)
 | 
			
		||||
                   #:margin '(3 3)
 | 
			
		||||
                   draw-bar)
 | 
			
		||||
                  (hpanel #:min-size '(68 #f)
 | 
			
		||||
                          #:stretch '(#f #f)
 | 
			
		||||
                          #:alignment '(right center)
 | 
			
		||||
                          (text (@> (format "eta ~a" (qi^-eta @qi))))))))))))))
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -2,38 +2,35 @@
 | 
			
		|||
(require racket/file
 | 
			
		||||
         racket/function
 | 
			
		||||
         racket/list
 | 
			
		||||
         racket/runtime-path
 | 
			
		||||
         racket/path
 | 
			
		||||
         racket/sequence
 | 
			
		||||
         racket/string
 | 
			
		||||
         net/url
 | 
			
		||||
         net/mime
 | 
			
		||||
         file/sha1
 | 
			
		||||
         net/http-easy
 | 
			
		||||
         db
 | 
			
		||||
         "../lib/html-parsing/main.rkt"
 | 
			
		||||
         json
 | 
			
		||||
         "archiver-database.rkt"
 | 
			
		||||
         "../lib/html-parsing/main.rkt"
 | 
			
		||||
         "../lib/mime-types.rkt"
 | 
			
		||||
         "../lib/syntax.rkt"
 | 
			
		||||
         "../lib/tree-updater.rkt"
 | 
			
		||||
         "../lib/url-utils.rkt"
 | 
			
		||||
         "../lib/xexpr-utils.rkt"
 | 
			
		||||
         "../lib/archive-file-mappings.rkt")
 | 
			
		||||
 | 
			
		||||
(define archive-slc slc)
 | 
			
		||||
 | 
			
		||||
(provide
 | 
			
		||||
 if-necessary-download-list-of-pages
 | 
			
		||||
 download-list-of-pages
 | 
			
		||||
 save-each-page
 | 
			
		||||
 basename->name-for-query
 | 
			
		||||
 image-url->values
 | 
			
		||||
 hash->save-dir
 | 
			
		||||
 archive-slc)
 | 
			
		||||
 all-stages)
 | 
			
		||||
 | 
			
		||||
(module+ test
 | 
			
		||||
  (require rackunit))
 | 
			
		||||
 | 
			
		||||
(define-runtime-path archive-root "../storage/archive")
 | 
			
		||||
#;(define archive-root "archive")
 | 
			
		||||
(define archive-root (anytime-path ".." "storage/archive"))
 | 
			
		||||
(make-directory* archive-root)
 | 
			
		||||
 | 
			
		||||
(define sources '#hasheq((style . 1) (page . 2)))
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -46,127 +43,34 @@
 | 
			
		|||
            wikiname
 | 
			
		||||
            (params->query '(("action" . "query")
 | 
			
		||||
                             ("meta" . "siteinfo")
 | 
			
		||||
                             ("siprop" . "general|rightsinfo")
 | 
			
		||||
                             ("siprop" . "general|rightsinfo|statistics")
 | 
			
		||||
                             ("format" . "json")
 | 
			
		||||
                             ("formatversion" . "2")))))
 | 
			
		||||
  (define data (response-json (get dest-url)))
 | 
			
		||||
  (define exists? (query-maybe-value slc "select progress from wiki where wikiname = ?" wikiname))
 | 
			
		||||
  (if exists?
 | 
			
		||||
      (query-exec slc "update wiki set sitename = ?, basepage = ?, license_text = ?, license_url = ? where wikiname = ?"
 | 
			
		||||
  (define exists? (query-maybe-value* "select progress from wiki where wikiname = ?" wikiname))
 | 
			
		||||
  (if (and exists? (not (sql-null? exists?)))
 | 
			
		||||
      (query-exec* "update wiki set sitename = ?, basepage = ?, license_text = ?, license_url = ? where wikiname = ?"
 | 
			
		||||
                  (jp "/query/general/sitename" data)
 | 
			
		||||
                  (second (regexp-match #rx"/wiki/(.*)" (jp "/query/general/base" data)))
 | 
			
		||||
                  (jp "/query/rightsinfo/text" data)
 | 
			
		||||
                  (jp "/query/rightsinfo/url" data)
 | 
			
		||||
                  wikiname)
 | 
			
		||||
      (query-exec slc "insert into wiki (wikiname, progress, sitename, basepage, license_text, license_url) values (?, 1, ?, ?, ?, ?)"
 | 
			
		||||
      (query-exec* "insert into wiki (wikiname, progress, sitename, basepage, license_text, license_url) values (?, 0, ?, ?, ?, ?)"
 | 
			
		||||
                  wikiname
 | 
			
		||||
                  (jp "/query/general/sitename" data)
 | 
			
		||||
                  (second (regexp-match #rx"/wiki/(.*)" (jp "/query/general/base" data)))
 | 
			
		||||
                  (jp "/query/rightsinfo/text" data)
 | 
			
		||||
                  (jp "/query/rightsinfo/url" data))))
 | 
			
		||||
                  (jp "/query/rightsinfo/url" data)))
 | 
			
		||||
  (jp "/query/statistics/articles" data))
 | 
			
		||||
 | 
			
		||||
;; call 1 if not yet done for that wiki
 | 
			
		||||
(define (if-necessary-download-list-of-pages wikiname callback)
 | 
			
		||||
  (define wiki-progress (query-maybe-value slc "select progress from wiki where wikiname = ?" wikiname))
 | 
			
		||||
  ;; done yet?
 | 
			
		||||
  (unless (and (real? wiki-progress) (wiki-progress . >= . 1))
 | 
			
		||||
    ;; count total pages
 | 
			
		||||
    (define dest-url
 | 
			
		||||
      (format "https://~a.fandom.com/api.php?~a"
 | 
			
		||||
              wikiname
 | 
			
		||||
              (params->query `(("action" . "query") ("meta" . "siteinfo") ("siprop" . "statistics") ("format" . "json")))))
 | 
			
		||||
    (define num-pages (jp "/query/statistics/articles" (response-json (get dest-url))))
 | 
			
		||||
    (download-list-of-pages wikiname callback 0 num-pages #f)))
 | 
			
		||||
 | 
			
		||||
;; 1. Download list of wiki pages and store in database
 | 
			
		||||
(define (download-list-of-pages wikiname callback total-so-far grand-total path-with-namefrom)
 | 
			
		||||
  (define url (if path-with-namefrom
 | 
			
		||||
                  (format "https://~a.fandom.com~a" wikiname path-with-namefrom)
 | 
			
		||||
                  (format "https://~a.fandom.com/wiki/Local_Sitemap" wikiname)))
 | 
			
		||||
  (define r (get url))
 | 
			
		||||
  (define page (html->xexp (bytes->string/utf-8 (response-body r))))
 | 
			
		||||
  (define link-namefrom
 | 
			
		||||
    ((query-selector (λ (t a c x) (and (eq? t 'a)
 | 
			
		||||
                                       (pair? x)
 | 
			
		||||
                                       (string-contains? (car x) "Next page")
 | 
			
		||||
                                       (let ([href (get-attribute 'href a)] )
 | 
			
		||||
                                         (and href (string-contains? href "/wiki/Local_Sitemap")))))
 | 
			
		||||
                     page #:include-text? #t)))
 | 
			
		||||
  (define row-values
 | 
			
		||||
    (for/list ([link (in-producer
 | 
			
		||||
                      (query-selector
 | 
			
		||||
                       (λ (t a c) (eq? t 'a))
 | 
			
		||||
                       ((query-selector (λ (t a c) (has-class? "mw-allpages-chunk" a)) page)))
 | 
			
		||||
                      #f)])
 | 
			
		||||
      (list wikiname (local-encoded-url->basename (get-attribute 'href (bits->attributes link))) 0)))
 | 
			
		||||
  (define query-template (string-join (make-list (length row-values) "(?, ?, ?)") ", " #:before-first "insert or ignore into page (wikiname, basename, progress) values "))
 | 
			
		||||
  (apply query-exec slc query-template (flatten row-values))
 | 
			
		||||
  (define new-total (+ (length row-values) total-so-far))
 | 
			
		||||
  (callback new-total grand-total (second (last row-values)))
 | 
			
		||||
  (cond
 | 
			
		||||
    [link-namefrom ; repeat on the next page
 | 
			
		||||
     (download-list-of-pages wikiname callback new-total grand-total (get-attribute 'href (bits->attributes link-namefrom)))]
 | 
			
		||||
    [#t ; all done downloading sitemap
 | 
			
		||||
     (insert-wiki-entry wikiname)]))
 | 
			
		||||
 | 
			
		||||
;; 2. Download each page via API and:
 | 
			
		||||
;; * Save API response to file
 | 
			
		||||
(define max-page-progress 1)
 | 
			
		||||
(define (save-each-page wikiname callback)
 | 
			
		||||
  ;; prepare destination folder
 | 
			
		||||
  (define save-dir (build-path archive-root wikiname))
 | 
			
		||||
  (make-directory* save-dir)
 | 
			
		||||
  ;; gather list of basenames to download (that aren't yet complete)
 | 
			
		||||
  (define basenames (query-list slc "select basename from page where wikiname = ? and progress < ?"
 | 
			
		||||
                                wikiname max-page-progress))
 | 
			
		||||
  ;; counter of complete/incomplete basenames
 | 
			
		||||
  (define already-done-count
 | 
			
		||||
    (query-value slc "select count(*) from page where wikiname = ? and progress = ?"
 | 
			
		||||
                 wikiname max-page-progress))
 | 
			
		||||
  (define not-done-count
 | 
			
		||||
    (query-value slc "select count(*) from page where wikiname = ? and progress < ?"
 | 
			
		||||
                 wikiname max-page-progress))
 | 
			
		||||
  ;; set initial progress
 | 
			
		||||
  (callback already-done-count (+ already-done-count not-done-count) "")
 | 
			
		||||
  ;; loop through basenames and download
 | 
			
		||||
  (for ([basename basenames]
 | 
			
		||||
        [i (in-naturals 1)])
 | 
			
		||||
    (define name-for-query (basename->name-for-query basename))
 | 
			
		||||
    (define dest-url
 | 
			
		||||
      (format "https://~a.fandom.com/api.php?~a"
 | 
			
		||||
              wikiname
 | 
			
		||||
              (params->query `(("action" . "parse")
 | 
			
		||||
                               ("page" . ,name-for-query)
 | 
			
		||||
                               ("prop" . "text|headhtml|langlinks")
 | 
			
		||||
                               ("formatversion" . "2")
 | 
			
		||||
                               ("format" . "json")))))
 | 
			
		||||
    (define r (get dest-url))
 | 
			
		||||
    (define body (response-body r))
 | 
			
		||||
    (define filename (string-append basename ".json"))
 | 
			
		||||
    (define save-path
 | 
			
		||||
      (cond [((string-length basename) . > . 240)
 | 
			
		||||
             (define key (sha1 (string->bytes/latin-1 basename)))
 | 
			
		||||
             (query-exec slc "insert into special_page (wikiname, key, basename) values (?, ?, ?)"
 | 
			
		||||
                         wikiname key basename)
 | 
			
		||||
             (build-path save-dir (string-append key ".json"))]
 | 
			
		||||
            [#t
 | 
			
		||||
             (build-path save-dir (string-append basename ".json"))]))
 | 
			
		||||
    (display-to-file body save-path #:exists 'replace)
 | 
			
		||||
    (query-exec slc "update page set progress = 1 where wikiname = ? and basename = ?"
 | 
			
		||||
                wikiname basename)
 | 
			
		||||
    (callback (+ already-done-count i) (+ already-done-count not-done-count) basename))
 | 
			
		||||
  ;; saved all pages, register that fact in the database
 | 
			
		||||
  (query-exec slc "update wiki set progress = 2 where wikiname = ?" wikiname))
 | 
			
		||||
 | 
			
		||||
;; 3. Download CSS and:
 | 
			
		||||
;; * Save CSS to file
 | 
			
		||||
;; * Record style images to database
 | 
			
		||||
(define (check-style-for-images wikiname path)
 | 
			
		||||
  (define content (file->string path))
 | 
			
		||||
  (define urls (regexp-match* #rx"url\\(\"?'?([^)]*)'?\"?\\)" content #:match-select cadr))
 | 
			
		||||
  (for/list ([url urls]
 | 
			
		||||
             #:when (not (or (equal? url "")
 | 
			
		||||
                             (equal? url "'")
 | 
			
		||||
                             (string-suffix? url "\"")
 | 
			
		||||
                             (string-contains? url "/resources-ucp/")
 | 
			
		||||
                             (string-contains? url "/fonts/")
 | 
			
		||||
                             (string-contains? url "/drm_fonts/")
 | 
			
		||||
| 
						 | 
				
			
			@ -184,7 +88,7 @@
 | 
			
		|||
      [(string-prefix? url "/") (format "https://~a.fandom.com~a" wikiname url)]
 | 
			
		||||
      [else (raise-user-error "While calling check-style-for-images, this URL had an unknown format and couldn't be saved:" url path)])))
 | 
			
		||||
 | 
			
		||||
(define (download-styles-for-wiki wikiname)
 | 
			
		||||
(define (download-styles-for-wiki wikiname callback)
 | 
			
		||||
  (define save-dir (build-path archive-root wikiname "styles"))
 | 
			
		||||
  (make-directory* save-dir)
 | 
			
		||||
  (define theme (λ (theme-name)
 | 
			
		||||
| 
						 | 
				
			
			@ -198,18 +102,137 @@
 | 
			
		|||
     (theme "dark")
 | 
			
		||||
     (cons (format "https://~a.fandom.com/load.php?lang=en&modules=skin.fandomdesktop.styles%7Cext.fandom.PortableInfoboxFandomDesktop.css%7Cext.fandom.GlobalComponents.CommunityHeaderBackground.css%7Cext.gadget.site-styles%2Csound-styles%7Csite.styles&only=styles&skin=fandomdesktop" wikiname)
 | 
			
		||||
           (build-path save-dir "site.css"))))
 | 
			
		||||
  (for ([style styles])
 | 
			
		||||
  (for ([style styles]
 | 
			
		||||
        [i (in-naturals)])
 | 
			
		||||
    (callback i (length styles) "styles...")
 | 
			
		||||
    (define r (get (car style)))
 | 
			
		||||
    (define body (response-body r))
 | 
			
		||||
    (display-to-file body (cdr style) #:exists 'replace)
 | 
			
		||||
    ;; XXX: how the HELL do I deal with @import?? would need some kind of recursion here. how will the page server know where to look up the style file to be able to serve them again? do I add another link-stylesheet tag to the main page? what about the remaining stuck @import url?
 | 
			
		||||
    )
 | 
			
		||||
  (callback (length styles) (length styles) "styles...")
 | 
			
		||||
  styles)
 | 
			
		||||
 | 
			
		||||
(define (do-step-3 wikiname)
 | 
			
		||||
  (define wiki-progress (query-maybe-value slc "select progress from wiki where wikiname = ?" wikiname))
 | 
			
		||||
(define (hash->save-dir wikiname hash)
 | 
			
		||||
  (build-path archive-root wikiname "images" (substring hash 0 1) (substring hash 0 2)))
 | 
			
		||||
 | 
			
		||||
(define (image-url->values i)
 | 
			
		||||
  ;; TODO: handle case where there is multiple broken cb parameter on minecraft wiki
 | 
			
		||||
  ;; TODO: ensure it still "works" with broken & on minecraft wiki
 | 
			
		||||
  (define no-cb (regexp-replace #rx"\\cb=[0-9]+&?" i "")) ; remove cb url parameter which does nothing
 | 
			
		||||
  (define key (regexp-replace #rx"[&?]$" no-cb "")) ; remove extra separator if necessary
 | 
			
		||||
  (define hash (sha1 (string->bytes/utf-8 key)))
 | 
			
		||||
  (cons key hash))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
;; 1. Download list of wiki pages and store in database, if not done yet for that wiki
 | 
			
		||||
(define (if-necessary-download-list-of-pages wikiname callback)
 | 
			
		||||
  (define wiki-progress (query-maybe-value* "select progress from wiki where wikiname = ?" wikiname))
 | 
			
		||||
  ;; done yet?
 | 
			
		||||
  (unless (and (real? wiki-progress) (wiki-progress . >= . 1))
 | 
			
		||||
    ;; Count total pages
 | 
			
		||||
    (define num-pages (insert-wiki-entry wikiname))
 | 
			
		||||
    ;; Download the entire index of pages
 | 
			
		||||
    (define basenames
 | 
			
		||||
      (let loop ([path-with-namefrom "/wiki/Local_Sitemap"]
 | 
			
		||||
                 [basenames-previous-pages null])
 | 
			
		||||
        ;; Download the current index page
 | 
			
		||||
        (define url (format "https://~a.fandom.com~a" wikiname path-with-namefrom))
 | 
			
		||||
        (define r (get url))
 | 
			
		||||
        ;; Metadata from this page (the link to the next page)
 | 
			
		||||
        (define page (html->xexp (bytes->string/utf-8 (response-body r))))
 | 
			
		||||
        (define link-namefrom
 | 
			
		||||
          ((query-selector (λ (t a c x) (and (eq? t 'a)
 | 
			
		||||
                                             (pair? x)
 | 
			
		||||
                                             (string-contains? (car x) "Next page")
 | 
			
		||||
                                             (let ([href (get-attribute 'href a)] )
 | 
			
		||||
                                               (and href (string-contains? href "/wiki/Local_Sitemap")))))
 | 
			
		||||
                           page #:include-text? #t)))
 | 
			
		||||
        ;; Content from this page
 | 
			
		||||
        (define basenames-this-page
 | 
			
		||||
          (for/list ([link (in-producer
 | 
			
		||||
                            (query-selector
 | 
			
		||||
                             (λ (t a c) (eq? t 'a))
 | 
			
		||||
                             ((query-selector (λ (t a c) (has-class? "mw-allpages-chunk" a)) page)))
 | 
			
		||||
                            #f)])
 | 
			
		||||
            (local-encoded-url->basename (get-attribute 'href (bits->attributes link)))))
 | 
			
		||||
        ;; Call the progress callback
 | 
			
		||||
        (define all-basenames (append basenames-previous-pages basenames-this-page))
 | 
			
		||||
        (callback (length all-basenames) num-pages (last all-basenames))
 | 
			
		||||
        ;; Recurse to download from the next page
 | 
			
		||||
        (if link-namefrom
 | 
			
		||||
            (loop (get-attribute 'href (bits->attributes link-namefrom)) all-basenames)
 | 
			
		||||
            all-basenames)))
 | 
			
		||||
    ;; Save those pages into the database
 | 
			
		||||
    ;; SQLite can have a maximum of 32766 parameters in a single query
 | 
			
		||||
    (for ([slice (in-slice 32760 basenames)])
 | 
			
		||||
      (define query-template (string-join (make-list (length slice) "(?1, ?, 0)") ", " #:before-first "insert or ignore into page (wikiname, basename, progress) values "))
 | 
			
		||||
      (call-with-transaction
 | 
			
		||||
       (get-slc)
 | 
			
		||||
       (λ ()
 | 
			
		||||
         (apply query-exec* query-template wikiname slice)
 | 
			
		||||
         ;; Record that we have the complete list of pages
 | 
			
		||||
         (query-exec* "update wiki set progress = 1 where wikiname = ?" wikiname))))))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
;; 2. Download each page via API and:
 | 
			
		||||
;; * Save API response to file
 | 
			
		||||
(define max-page-progress 1)
 | 
			
		||||
(define (save-each-page wikiname callback)
 | 
			
		||||
  ;; prepare destination folder
 | 
			
		||||
  (define save-dir (build-path archive-root wikiname))
 | 
			
		||||
  (make-directory* save-dir)
 | 
			
		||||
  ;; gather list of basenames to download (that aren't yet complete)
 | 
			
		||||
  (define basenames (query-list* "select basename from page where wikiname = ? and progress < ?"
 | 
			
		||||
                                wikiname max-page-progress))
 | 
			
		||||
  ;; counter of complete/incomplete basenames
 | 
			
		||||
  (define already-done-count
 | 
			
		||||
    (query-value* "select count(*) from page where wikiname = ? and progress = ?"
 | 
			
		||||
                 wikiname max-page-progress))
 | 
			
		||||
  (define not-done-count
 | 
			
		||||
    (query-value* "select count(*) from page where wikiname = ? and progress < ?"
 | 
			
		||||
                  wikiname max-page-progress))
 | 
			
		||||
  (define total-count (+ already-done-count not-done-count))
 | 
			
		||||
  ;; set initial progress
 | 
			
		||||
  (callback already-done-count total-count "")
 | 
			
		||||
  ;; loop through basenames and download
 | 
			
		||||
  (for ([basename basenames]
 | 
			
		||||
        [i (in-naturals (add1 already-done-count))])
 | 
			
		||||
    (define name-for-query (basename->name-for-query basename))
 | 
			
		||||
    (define dest-url
 | 
			
		||||
      (format "https://~a.fandom.com/api.php?~a"
 | 
			
		||||
              wikiname
 | 
			
		||||
              (params->query `(("action" . "parse")
 | 
			
		||||
                               ("page" . ,name-for-query)
 | 
			
		||||
                               ("prop" . "text|headhtml|langlinks")
 | 
			
		||||
                               ("formatversion" . "2")
 | 
			
		||||
                               ("format" . "json")))))
 | 
			
		||||
    (define r (get dest-url))
 | 
			
		||||
    (define body (response-body r))
 | 
			
		||||
    (define filename (string-append basename ".json"))
 | 
			
		||||
    (define save-path
 | 
			
		||||
      (cond [((string-length basename) . > . 240)
 | 
			
		||||
             (define key (sha1 (string->bytes/latin-1 basename)))
 | 
			
		||||
             (query-exec* "insert into special_page (wikiname, key, basename) values (?, ?, ?)"
 | 
			
		||||
                         wikiname key basename)
 | 
			
		||||
             (build-path save-dir (string-append key ".json"))]
 | 
			
		||||
            [#t
 | 
			
		||||
             (build-path save-dir (string-append basename ".json"))]))
 | 
			
		||||
    (display-to-file body save-path #:exists 'replace)
 | 
			
		||||
    (query-exec* "update page set progress = 1 where wikiname = ? and basename = ?"
 | 
			
		||||
                wikiname basename)
 | 
			
		||||
    (callback i total-count basename))
 | 
			
		||||
  ;; saved all pages, register that fact in the database
 | 
			
		||||
  (query-exec* "update wiki set progress = 2 where wikiname = ?" wikiname))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
;; 3. Download CSS and:
 | 
			
		||||
;; * Save CSS to file
 | 
			
		||||
;; * Record style images to database
 | 
			
		||||
(define (if-necessary-download-and-check-styles wikiname callback)
 | 
			
		||||
  (define wiki-progress (query-maybe-value* "select progress from wiki where wikiname = ?" wikiname))
 | 
			
		||||
  (unless (and (number? wiki-progress) (wiki-progress . >= . 3))
 | 
			
		||||
    (define styles (download-styles-for-wiki wikiname))
 | 
			
		||||
    (define styles (download-styles-for-wiki wikiname callback))
 | 
			
		||||
    (define unique-image-urls
 | 
			
		||||
      (remove-duplicates
 | 
			
		||||
       (map image-url->values
 | 
			
		||||
| 
						 | 
				
			
			@ -217,48 +240,40 @@
 | 
			
		|||
             (for/list ([style styles])
 | 
			
		||||
               (check-style-for-images wikiname (cdr style)))))
 | 
			
		||||
       #:key cdr))
 | 
			
		||||
    (println unique-image-urls)
 | 
			
		||||
    (for ([pair unique-image-urls])
 | 
			
		||||
      (query-exec slc "insert or ignore into image (wikiname, url, hash, ext, source, progress) values (?, ?, ?, NULL, 1, 0)" wikiname (car pair) (cdr pair)))
 | 
			
		||||
    (query-exec slc "update wiki set progress = 3 where wikiname = ?" wikiname)))
 | 
			
		||||
      (query-exec* "insert or ignore into image (wikiname, url, hash, ext, source, progress) values (?, ?, ?, NULL, 1, 0)" wikiname (car pair) (cdr pair)))
 | 
			
		||||
    (query-exec* "update wiki set progress = 3 where wikiname = ?" wikiname)))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
;; 4: From downloaded pages, record URLs of image sources and inline style images to database
 | 
			
		||||
(define (hash->save-dir wikiname hash)
 | 
			
		||||
  (build-path archive-root wikiname "images" (substring hash 0 1) (substring hash 0 2)))
 | 
			
		||||
 | 
			
		||||
(define (image-url->values i)
 | 
			
		||||
  ;; TODO: handle case where there is multiple cb parameter on minecraft wiki
 | 
			
		||||
  ;; TODO: ensure it still "works" with broken & on minecraft wiki
 | 
			
		||||
  (define no-cb (regexp-replace #rx"\\cb=[0-9]+&?" i "")) ; remove cb url parameter which does nothing
 | 
			
		||||
  (define key (regexp-replace #rx"[&?]$" no-cb "")) ; remove extra separator if necessary
 | 
			
		||||
  (define hash (sha1 (string->bytes/utf-8 key)))
 | 
			
		||||
  (cons key hash))
 | 
			
		||||
 | 
			
		||||
(define (check-json-for-images wikiname path)
 | 
			
		||||
  (define data (with-input-from-file path (λ () (read-json))))
 | 
			
		||||
  (define page (html->xexp (preprocess-html-wiki (jp "/parse/text" data))))
 | 
			
		||||
  (define tree (update-tree-wiki page wikiname))
 | 
			
		||||
  (remove-duplicates
 | 
			
		||||
   (for/list ([element (in-producer
 | 
			
		||||
                        (query-selector
 | 
			
		||||
                         (λ (t a c)
 | 
			
		||||
                           (and (eq? t 'img)
 | 
			
		||||
                                (get-attribute 'src a)))
 | 
			
		||||
                         tree)
 | 
			
		||||
                        #f)])
 | 
			
		||||
     (image-url->values (get-attribute 'src (bits->attributes element))))))
 | 
			
		||||
  null
 | 
			
		||||
  #;(remove-duplicates
 | 
			
		||||
     (for/list ([element (in-producer
 | 
			
		||||
                          (query-selector
 | 
			
		||||
                           (λ (t a c)
 | 
			
		||||
                             (and (eq? t 'img)
 | 
			
		||||
                                  (get-attribute 'src a)))
 | 
			
		||||
                           tree)
 | 
			
		||||
                          #f)])
 | 
			
		||||
       (image-url->values (get-attribute 'src (bits->attributes element))))))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
;; 5. Download image sources and style images according to database
 | 
			
		||||
(define (save-each-image wikiname source callback)
 | 
			
		||||
(define (save-each-image wikiname callback)
 | 
			
		||||
  (define source (hash-ref sources 'style)) ;; TODO: download entire wiki images instead?
 | 
			
		||||
  ;; gather list of basenames to download (that aren't yet complete)
 | 
			
		||||
  (define rows (query-rows slc "select url, hash from image where wikiname = ? and source <= ? and progress < 1"
 | 
			
		||||
  (define rows (query-rows* "select url, hash from image where wikiname = ? and source <= ? and progress < 1"
 | 
			
		||||
                                wikiname source))
 | 
			
		||||
  ;; counter of complete/incomplete basenames
 | 
			
		||||
  (define already-done-count
 | 
			
		||||
    (query-value slc "select count(*) from image where wikiname = ? and source <= ? and progress = 1"
 | 
			
		||||
    (query-value* "select count(*) from image where wikiname = ? and source <= ? and progress = 1"
 | 
			
		||||
                 wikiname source))
 | 
			
		||||
  (define not-done-count
 | 
			
		||||
    (query-value slc "select count(*) from image where wikiname = ? and source <= ? and progress < 1"
 | 
			
		||||
    (query-value* "select count(*) from image where wikiname = ? and source <= ? and progress < 1"
 | 
			
		||||
                 wikiname source))
 | 
			
		||||
  ;; set initial progress
 | 
			
		||||
  (callback already-done-count (+ already-done-count not-done-count) "")
 | 
			
		||||
| 
						 | 
				
			
			@ -269,26 +284,35 @@
 | 
			
		|||
    (define url (vector-ref row 0))
 | 
			
		||||
    (define hash (vector-ref row 1))
 | 
			
		||||
    ;; check
 | 
			
		||||
    (printf "~a -> ~a~n" url hash)
 | 
			
		||||
    #; (printf "~a -> ~a~n" url hash)
 | 
			
		||||
    (define r (get url))
 | 
			
		||||
    (define declared-type (response-headers-ref r 'content-type))
 | 
			
		||||
    (define final-type (if (equal? declared-type #"application/octet-stream")
 | 
			
		||||
                           (let ([sniff-entity (message-entity (mime-analyze (response-body r)))])
 | 
			
		||||
                             (string->bytes/latin-1 (format "~a/~a" (entity-type sniff-entity) (entity-subtype sniff-entity))))
 | 
			
		||||
                           declared-type))
 | 
			
		||||
    (define ext (bytes->string/latin-1 (mime-type->ext final-type)))
 | 
			
		||||
    (define ext
 | 
			
		||||
      (with-handlers ([exn:fail:contract? (λ _ (error 'save-each-image "no ext found for mime type `~a` in file ~a" final-type url))])
 | 
			
		||||
        (bytes->string/latin-1 (mime-type->ext final-type))))
 | 
			
		||||
    ;; save
 | 
			
		||||
    (define save-dir (hash->save-dir wikiname hash))
 | 
			
		||||
    (make-directory* save-dir)
 | 
			
		||||
    (define save-path (build-path save-dir (string-append hash "." ext)))
 | 
			
		||||
    (define body (response-body r))
 | 
			
		||||
    (display-to-file body save-path #:exists 'replace)
 | 
			
		||||
    (query-exec slc "update image set progress = 1, ext = ? where wikiname = ? and hash = ?"
 | 
			
		||||
    (query-exec* "update image set progress = 1, ext = ? where wikiname = ? and hash = ?"
 | 
			
		||||
                ext wikiname hash)
 | 
			
		||||
    (callback (+ already-done-count i) (+ already-done-count not-done-count) (string-append hash "." ext)))
 | 
			
		||||
  ;; TODO: saved all images, register that fact in the database
 | 
			
		||||
  )
 | 
			
		||||
    (callback (+ already-done-count i) (+ already-done-count not-done-count) (string-append (substring hash 0 6) "..." ext)))
 | 
			
		||||
  ;; saved all images, register that fact in the database
 | 
			
		||||
  (query-exec* "update wiki set progress = 4 where wikiname = ?" wikiname))
 | 
			
		||||
 | 
			
		||||
(define all-stages
 | 
			
		||||
  (list
 | 
			
		||||
   if-necessary-download-list-of-pages
 | 
			
		||||
   save-each-page
 | 
			
		||||
   if-necessary-download-and-check-styles
 | 
			
		||||
   ;; check-json-for-images
 | 
			
		||||
   save-each-image))
 | 
			
		||||
 | 
			
		||||
(module+ test
 | 
			
		||||
  (check-equal? (html->xexp "<img src=\"https://example.com/images?src=Blah.jpg&width=150\">")
 | 
			
		||||
| 
						 | 
				
			
			@ -299,11 +323,13 @@
 | 
			
		|||
  #;(do-step-3 "gallowmere")
 | 
			
		||||
  #;(save-each-image "gallowmere" (hash-ref sources 'style) (λ (a b c) (printf "~a/~a ~a~n" a b c)))
 | 
			
		||||
 | 
			
		||||
  #;(for ([wikiname (query-list slc "select wikiname from wiki")])
 | 
			
		||||
  #;(for ([wikiname (query-list* "select wikiname from wiki")])
 | 
			
		||||
      (println wikiname)
 | 
			
		||||
      (insert-wiki-entry wikiname))
 | 
			
		||||
 | 
			
		||||
  #;(for ([wikiname (query-list slc "select wikiname from wiki")])
 | 
			
		||||
  #;(for ([wikiname (query-list* "select wikiname from wiki")])
 | 
			
		||||
      (println wikiname)
 | 
			
		||||
      (do-step-3 wikiname)
 | 
			
		||||
      (save-each-image wikiname (hash-ref sources 'style) (λ (a b c) (printf "~a/~a ~a~n" a b c)))))
 | 
			
		||||
 | 
			
		||||
; (for ([stage all-stages]) (stage "create" (λ (a b c) (printf "~a/~a ~a~n" a b c))))
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue