Compare commits
No commits in common. "main" and "main" have entirely different histories.
112 changed files with 616 additions and 16467 deletions
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -16,5 +16,3 @@ compiled
|
|||
|
||||
# Personal
|
||||
/config.ini
|
||||
misc
|
||||
storage
|
||||
|
|
|
@ -1,71 +0,0 @@
|
|||
#lang cli
|
||||
(require charterm
|
||||
"archiver.rkt")
|
||||
|
||||
(help (usage "Downloads a single Fandom wiki in BreezeWiki offline format."
|
||||
""
|
||||
"Downloaded pages go into `archive/` next to the executable."
|
||||
"Database goes into `archiver.db*` next to the executable."
|
||||
"The database is necessary to store your download progress and resume where you left off if the process is interrupted."))
|
||||
|
||||
(flag (output-quiet?)
|
||||
("-q" "--output-quiet" "disable progress output")
|
||||
(output-quiet? #t))
|
||||
|
||||
(flag (output-progress?)
|
||||
("-p" "--output-progress" "progress output for terminals (default in a tty)")
|
||||
(output-progress? #t))
|
||||
|
||||
(flag (output-lines?)
|
||||
("-l" "--output-lines" "output the name of each file downloaded (default outside of a tty)")
|
||||
(output-lines? #t))
|
||||
|
||||
(constraint (one-of output-quiet? output-lines? output-progress?))
|
||||
|
||||
|
||||
|
||||
(program
|
||||
(start [wikiname "wikiname to download"])
|
||||
;; set up arguments
|
||||
(define width 80)
|
||||
(when (not (or (output-quiet?) (output-lines?) (output-progress?)))
|
||||
(cond [(terminal-port? current-input-port)
|
||||
(output-progress? #t)]
|
||||
[else
|
||||
(output-lines? #t)]))
|
||||
(define (update-width)
|
||||
(when (output-progress?)
|
||||
(case (system-type 'os)
|
||||
[(linux)
|
||||
(with-charterm
|
||||
(call-with-values (λ () (charterm-screen-size))
|
||||
(λ (cols rows) (set! width cols))))]
|
||||
[else 100])))
|
||||
(update-width)
|
||||
;; check
|
||||
(when (or (not wikiname) (equal? wikiname ""))
|
||||
(raise-user-error "Please specify the wikiname to download on the command line."))
|
||||
;; progress reporting based on selected mode
|
||||
(define (report-progress a b c)
|
||||
(define basename (basename->name-for-query c))
|
||||
(cond
|
||||
[(output-lines?)
|
||||
(displayln basename)]
|
||||
[(output-progress?)
|
||||
(when (eq? (modulo a 20) 0)
|
||||
(thread (λ () (update-width))))
|
||||
(define prefix (format "[~a] [~a/~a] " wikiname a b))
|
||||
(define rest (- width (string-length prefix)))
|
||||
(define real-width (min (string-length basename) rest))
|
||||
(define spare-width (- rest real-width))
|
||||
(define name-display (substring basename 0 real-width))
|
||||
(printf "\e[2K\r~a~a" prefix name-display)
|
||||
(flush-output)]))
|
||||
;; download all stages
|
||||
(for ([stage all-stages]
|
||||
[i (in-naturals 1)])
|
||||
(printf "> Stage ~a/~a~n" i (length all-stages))
|
||||
(stage wikiname report-progress)
|
||||
(displayln "")))
|
||||
|
||||
(run start)
|
|
@ -1,82 +0,0 @@
|
|||
#lang racket/base
|
||||
(require racket/file
|
||||
racket/list
|
||||
racket/path
|
||||
racket/string
|
||||
json
|
||||
json-pointer
|
||||
db
|
||||
"../lib/syntax.rkt")
|
||||
|
||||
(provide
|
||||
get-slc
|
||||
query-exec*
|
||||
query-rows*
|
||||
query-list*
|
||||
query-value*
|
||||
query-maybe-value*
|
||||
query-maybe-row*)
|
||||
|
||||
(define storage-path (anytime-path ".." "storage"))
|
||||
(define database-file (build-path storage-path "archiver.db"))
|
||||
|
||||
(define slc (box #f))
|
||||
(define (get-slc)
|
||||
(define slc* (unbox slc))
|
||||
(cond
|
||||
[slc* slc*]
|
||||
[else
|
||||
(make-directory* storage-path)
|
||||
(define slc* (sqlite3-connect #:database database-file #:mode 'create))
|
||||
(query-exec slc* "PRAGMA journal_mode=WAL")
|
||||
(define database-version
|
||||
(with-handlers ([exn:fail:sql?
|
||||
(λ (exn)
|
||||
; need to set up the database
|
||||
(query-exec slc* "create table database_version (version integer, primary key (version))")
|
||||
(query-exec slc* "insert into database_version values (0)")
|
||||
0)])
|
||||
(query-value slc* "select version from database_version")))
|
||||
|
||||
(define migrations
|
||||
(wrap-sql
|
||||
((query-exec slc* "create table page (wikiname TEXT NOT NULL, basename TEXT NOT NULL, progress INTEGER NOT NULL, PRIMARY KEY (wikiname, basename))")
|
||||
(query-exec slc* "create table wiki (wikiname TEXT NOT NULL, progress INTEGER, PRIMARY KEY (wikiname))"))
|
||||
((query-exec slc* "create table special_page (wikiname TEXT NOT NULL, key TEXT NOT NULL, basename TEXT NOT NULL, PRIMARY KEY (wikiname, key))"))
|
||||
((query-exec slc* "update wiki set progress = 2 where wikiname in (select wikiname from wiki inner join page using (wikiname) group by wikiname having min(page.progress) = 1)"))
|
||||
((query-exec slc* "create table image (wikiname TEXT NOT NULL, hash TEXT NTO NULL, url TEXT NOT NULL, ext TEXT, source INTEGER NOT NULL, progress INTEGER NOT NULL, PRIMARY KEY (wikiname, hash))"))
|
||||
((query-exec slc* "alter table wiki add column sitename TEXT")
|
||||
(query-exec slc* "alter table wiki add column basepage TEXT")
|
||||
(query-exec slc* "alter table wiki add column license_text TEXT")
|
||||
(query-exec slc* "alter table wiki add column license_url TEXT"))
|
||||
((query-exec slc* "alter table page add column redirect"))))
|
||||
|
||||
(let do-migrate-step ()
|
||||
(when (database-version . < . (length migrations))
|
||||
(call-with-transaction
|
||||
slc*
|
||||
(list-ref migrations database-version))
|
||||
(set! database-version (add1 database-version))
|
||||
(query-exec slc* "update database_version set version = $1" database-version)
|
||||
(do-migrate-step)))
|
||||
|
||||
(set-box! slc slc*)
|
||||
slc*]))
|
||||
|
||||
(define (query-exec* . args)
|
||||
(apply query-exec (get-slc) args))
|
||||
|
||||
(define (query-rows* . args)
|
||||
(apply query-rows (get-slc) args))
|
||||
|
||||
(define (query-list* . args)
|
||||
(apply query-list (get-slc) args))
|
||||
|
||||
(define (query-value* . args)
|
||||
(apply query-value (get-slc) args))
|
||||
|
||||
(define (query-maybe-value* . args)
|
||||
(apply query-maybe-value (get-slc) args))
|
||||
|
||||
(define (query-maybe-row* . args)
|
||||
(apply query-maybe-row (get-slc) args))
|
|
@ -1,390 +0,0 @@
|
|||
#lang racket/base
|
||||
(require racket/class
|
||||
racket/draw
|
||||
racket/format
|
||||
racket/function
|
||||
racket/list
|
||||
racket/math
|
||||
racket/port
|
||||
racket/set
|
||||
racket/splicing
|
||||
racket/string
|
||||
(except-in pict text table)
|
||||
db
|
||||
net/http-easy
|
||||
memo
|
||||
(only-in racket/gui timer%)
|
||||
racket/gui/easy
|
||||
racket/gui/easy/operator
|
||||
(only-in pict bitmap)
|
||||
images/icons/arrow
|
||||
images/icons/control
|
||||
images/icons/stickman
|
||||
images/icons/style
|
||||
images/icons/symbol
|
||||
"archiver-database.rkt"
|
||||
"archiver.rkt"
|
||||
"../lib/url-utils.rkt"
|
||||
"../lib/xexpr-utils.rkt")
|
||||
|
||||
(default-icon-material rubber-icon-material)
|
||||
|
||||
(require (for-syntax racket/base racket/match racket/set racket/string))
|
||||
|
||||
(define-syntax (@> stx)
|
||||
(define form (cdr (syntax->datum stx)))
|
||||
(match form
|
||||
[(list form) ; (@> (fn @obs))
|
||||
;; identify the observables and replace with non-@ symbols
|
||||
(define collection (mutable-set))
|
||||
(define updated
|
||||
(let loop ([sexp form])
|
||||
(cond [(symbol? sexp)
|
||||
(let ([as-s (symbol->string sexp)])
|
||||
(if (string-prefix? as-s "@")
|
||||
(let ([without-@ (string->symbol (substring as-s 1))])
|
||||
(set-add! collection (cons sexp without-@))
|
||||
without-@)
|
||||
sexp))]
|
||||
[(pair? sexp) (cons (loop (car sexp)) (loop (cdr sexp)))]
|
||||
[#t sexp])))
|
||||
(define collection-l (set->list collection))
|
||||
;; return obs-combine -> updated-form
|
||||
(datum->syntax stx `(obs-combine (λ (,@(map cdr collection-l)) ,updated) ,@(map car collection-l)))]
|
||||
[(list (? string? str) args ...) ; (@> "Blah: ~a/~a" @arg1 arg2)
|
||||
;; identify the observables and replace with non-@ symbols
|
||||
(define collection-l
|
||||
(for/list ([arg args])
|
||||
(if (symbol? arg)
|
||||
(let ([as-s (symbol->string arg)])
|
||||
(if (string-prefix? as-s "@")
|
||||
(let ([without-@ (string->symbol (substring as-s 1))])
|
||||
(cons arg without-@))
|
||||
(cons #f arg)))
|
||||
(cons #f arg))))
|
||||
(define collection-lo (filter car collection-l))
|
||||
;; return obs-combine -> format
|
||||
(datum->syntax stx `(obs-combine (λ (,@(map cdr collection-lo)) (format ,str ,@(map cdr collection-l))) ,@(map car collection-lo)))]))
|
||||
|
||||
(define/obs @auto-retry #f)
|
||||
|
||||
(define-struct qi^ (wikiname st stage progress max-progress ticks eta th) #:transparent) ;; queue item
|
||||
|
||||
(define rows (query-rows* "select wikiname, progress from wiki where progress < 4"))
|
||||
(define/obs @queue null)
|
||||
(define (add-wikiname-to-queue wikiname st stage)
|
||||
(@queue . <~ . (λ (queue)
|
||||
(define already-exists? (findf (λ (qi) (equal? (qi^-wikiname qi) wikiname)) queue))
|
||||
(if already-exists?
|
||||
queue
|
||||
(append queue (list (qi^ wikiname st stage 0 1 0 "..." #f)))))))
|
||||
(for ([row rows])
|
||||
(add-wikiname-to-queue (vector-ref row 0)
|
||||
(if (= (vector-ref row 1) 4)
|
||||
'complete
|
||||
'queued)
|
||||
(vector-ref row 1)))
|
||||
|
||||
(define status-icon-size 32)
|
||||
(define status-icon-min-width 36)
|
||||
(define button-icon-size 12)
|
||||
|
||||
(define color-green (make-color 90 212 68))
|
||||
|
||||
(define (resize coords fraction)
|
||||
(for/list ([coord (in-list coords)])
|
||||
(cons (* fraction (car coord))
|
||||
(* fraction (cdr coord)))))
|
||||
|
||||
(define (flat-right-arrow #:height [height 32] #:color [color #f])
|
||||
((if color
|
||||
(curryr colorize color)
|
||||
values)
|
||||
(dc (λ (dc dx dy)
|
||||
(send dc draw-polygon (resize
|
||||
(list '(0 . 9) '(15 . 9) '(14 . 0)
|
||||
'(31 . 15.5)
|
||||
'(14 . 31) '(15 . 22) '(0 . 22))
|
||||
(/ height 32))))
|
||||
height height)))
|
||||
|
||||
(define (double-left-arrow-icon #:height [height 32])
|
||||
(define shift (/ height 48))
|
||||
(pict->bitmap
|
||||
(scale-to-fit
|
||||
(panorama
|
||||
(pin-under
|
||||
(bitmap
|
||||
(left-over-arrow-icon #:color halt-icon-color #:height height
|
||||
#:material rubber-icon-material))
|
||||
(- (* -20 shift) 2) (+ (* 6 shift) 1)
|
||||
(bitmap
|
||||
(bitmap-render-icon
|
||||
(pict->bitmap
|
||||
(rotate
|
||||
(flat-right-arrow #:color (make-object color% 255 64 64) #:height (/ height 1.26))
|
||||
(* pi 1.23)))))
|
||||
#;(rotate
|
||||
(flat-right-arrow #:color (make-object color% 255 64 64) #:height (/ height 1.26))
|
||||
(* pi 1.23))))
|
||||
height height #:mode 'preserve/max)))
|
||||
|
||||
(splicing-let ([frame-count 20])
|
||||
(define stickman-frames
|
||||
(for/vector ([s (in-range 0 1 (/ 1 frame-count))])
|
||||
(running-stickman-icon
|
||||
s
|
||||
#:height status-icon-size
|
||||
#:material (default-icon-material)))))
|
||||
|
||||
(define (stick n)
|
||||
(vector-ref stickman-frames (modulo n (vector-length stickman-frames))))
|
||||
|
||||
(define status-icons
|
||||
(hasheq 'queued (stop-icon #:color syntax-icon-color #:height status-icon-size)
|
||||
'paused (continue-forward-icon #:color syntax-icon-color #:height status-icon-size)
|
||||
'running (stick 0)
|
||||
'error (x-icon #:height status-icon-size)
|
||||
'complete (check-icon #:color color-green #:height status-icon-size)))
|
||||
|
||||
(define action-icons
|
||||
(hasheq 'pause (pause-icon #:color syntax-icon-color #:height button-icon-size)
|
||||
'resume (play-icon #:color color-green #:height button-icon-size)
|
||||
'reset (left-over-arrow-icon #:color halt-icon-color #:height button-icon-size)
|
||||
'reseter (double-left-arrow-icon #:height button-icon-size)))
|
||||
|
||||
(define (bitmap-view @the-bitmap [min-width 1])
|
||||
(pict-canvas #:min-size (@> (list (max min-width (send @the-bitmap get-width)) (send @the-bitmap get-height))) #;(if min-size (list min-size min-size) #f)
|
||||
#:stretch '(#f #f)
|
||||
#:style '(transparent)
|
||||
@the-bitmap
|
||||
bitmap))
|
||||
|
||||
(define (exn->string e)
|
||||
(with-output-to-string
|
||||
(λ ()
|
||||
(displayln (exn-message e))
|
||||
(displayln "context:")
|
||||
(for ([item (continuation-mark-set->context (exn-continuation-marks e))])
|
||||
(printf " ~a" (srcloc->string (cdr item)))
|
||||
(when (car item)
|
||||
(printf ": ~a" (car item)))
|
||||
(displayln "")))))
|
||||
|
||||
(define ((handle-graphical-exn @qi) e)
|
||||
(displayln (exn->string e) (current-error-port))
|
||||
(cond
|
||||
[(obs-peek @auto-retry)
|
||||
(void) ;; TODO
|
||||
#;(do-retry-end wikiname)]
|
||||
[#t
|
||||
(update-qi @qi [st 'error])
|
||||
(do-try-unpause-next-entry)
|
||||
(thread
|
||||
(λ ()
|
||||
(define/obs @visible? #t)
|
||||
(render
|
||||
(dialog #:title "Download Error"
|
||||
#:style '(resize-border)
|
||||
#:mixin (λ (%) (class % (super-new)
|
||||
(obs-observe! @visible? (λ (visible?) (send this show visible?)))))
|
||||
(vpanel #:margin '(15 15)
|
||||
(text (format "Encountered this error while downloading ~a:" (qi^-wikiname (obs-peek @qi))))
|
||||
(input #:style '(multiple hscroll)
|
||||
#:min-size '(#f 200)
|
||||
(exn->string e))
|
||||
;; TODO
|
||||
#;(button "Retry Now" (λ () (:= @visible? #f) (do-retry-now wikiname)))
|
||||
#;(button "Retry Round-Robin" (λ () (:= @visible? #f) (do-retry-end wikiname)))
|
||||
#;(button "Skip Wiki" (λ () (:= @visible? #f) (do-continue)))
|
||||
#;(button "Use Auto-Retry" (λ ()
|
||||
(:= @auto-retry #t)
|
||||
(:= @visible? #f)
|
||||
(do-retry-end wikiname)))
|
||||
#;(text "Be careful not to auto-retry an infinite loop!")))
|
||||
main-window)))
|
||||
(sleep)
|
||||
; make sure the broken thread is gone
|
||||
(define th (qi^-th (obs-peek @qi)))
|
||||
(when th (kill-thread th))]))
|
||||
|
||||
(define segments
|
||||
(list
|
||||
(list 5/100 (make-color 0 223 217))
|
||||
(list 88/100 color-green)
|
||||
(list 2/100 (make-color 0 223 217))
|
||||
(list 5/100 color-green)))
|
||||
(define segment-spacing 2)
|
||||
(unless (= (apply + (map car segments)) 1)
|
||||
(error 'segments "segments add up to ~a, not 1" (apply + (map car segments))))
|
||||
|
||||
;; return the new bitmap, which can be drawn on a dc<%>
|
||||
(define/memoize (ray-trace width height stage progress max-progress)
|
||||
;; (printf "rendering ~a ~a/~a at ~a~n" stage progress max-progress (current-inexact-milliseconds))
|
||||
(define bm (make-object bitmap% width height #f #t))
|
||||
(define dc (make-object bitmap-dc% bm))
|
||||
(define width-available (- width (* (length segments) segment-spacing)))
|
||||
(send dc set-smoothing 'unsmoothed)
|
||||
(send dc set-pen "black" 0 'transparent)
|
||||
(for/fold ([offset 0])
|
||||
([segment segments]
|
||||
[i (in-naturals 0)]) ;; zero indexed stages?
|
||||
;; calculate start and end locations of grey bar
|
||||
(define-values (segment-proportion segment-color) (apply values segment))
|
||||
(define segment-start (if (= offset 0) 0 (+ offset segment-spacing)))
|
||||
(define segment-width (* width-available segment-proportion))
|
||||
;; draw grey bar
|
||||
(send dc set-brush (make-color 180 180 180 0.4) 'solid)
|
||||
(send dc draw-rectangle segment-start 0 segment-width height)
|
||||
;; draw solid bar according to the current item's progress
|
||||
(define proportion
|
||||
(cond [(stage . < . i) 0]
|
||||
[(stage . > . i) 1]
|
||||
[(max-progress . <= . 0) 0]
|
||||
[(progress . < . 0) 0]
|
||||
[(progress . >= . max-progress) 1]
|
||||
[else (progress . / . max-progress)]))
|
||||
(send dc set-brush segment-color 'solid)
|
||||
(send dc draw-rectangle segment-start 0 (* proportion segment-width) height)
|
||||
(+ segment-start segment-width))
|
||||
(bitmap-render-icon bm 6/8))
|
||||
|
||||
;; get ray traced bitmap (possibly from cache) and draw on dc<%>
|
||||
(define (draw-bar orig-dc qi)
|
||||
;; (println ray-traced)
|
||||
(define-values (width height) (send orig-dc get-size))
|
||||
(send orig-dc draw-bitmap (ray-trace width height (qi^-stage qi) (qi^-progress qi) (qi^-max-progress qi)) 0 0))
|
||||
|
||||
(define ((make-progress-updater @qi) a b c)
|
||||
;; (printf "~a: ~a/~a ~a~n" (qi^-wikiname (obs-peek @qi)) a b c)
|
||||
(update-qi @qi [progress a] [max-progress b] [ticks (add1 (qi^-ticks (obs-peek @qi)))]))
|
||||
|
||||
(define/obs @input "")
|
||||
|
||||
(define (do-add-to-queue)
|
||||
(define wikiname (string-trim (obs-peek @input)))
|
||||
(when ((string-length wikiname) . > . 0)
|
||||
(add-wikiname-to-queue wikiname 'queued 0)) ;; TODO: automatically start?
|
||||
(:= @input ""))
|
||||
|
||||
(define-syntax-rule (update-qi @qi args ...)
|
||||
(let ([wikiname (qi^-wikiname (obs-peek @qi))])
|
||||
(@queue . <~ . (λ (queue)
|
||||
(for/list ([qi queue])
|
||||
(if (equal? (qi^-wikiname qi) wikiname)
|
||||
(struct-copy qi^ qi args ...)
|
||||
qi))))))
|
||||
|
||||
(define (do-start-qi @qi)
|
||||
(define th
|
||||
(thread (λ ()
|
||||
(with-handlers ([exn? (handle-graphical-exn @qi)])
|
||||
(define last-stage
|
||||
(for/last ([stage all-stages]
|
||||
[i (in-naturals)])
|
||||
(update-qi @qi [stage i])
|
||||
(stage (qi^-wikiname (obs-peek @qi)) (make-progress-updater @qi))
|
||||
i))
|
||||
(update-qi @qi [st 'complete] [stage (add1 last-stage)])
|
||||
(do-try-unpause-next-entry)))))
|
||||
(update-qi @qi [st 'running] [th th]))
|
||||
|
||||
(define (do-stop-qi @qi)
|
||||
(define th (qi^-th (obs-peek @qi)))
|
||||
(when th (kill-thread th))
|
||||
(update-qi @qi [th #f] [st 'paused]))
|
||||
|
||||
(define (do-reset-qi @qi)
|
||||
(define reset-progress-to 0)
|
||||
(define th (qi^-th (obs-peek @qi)))
|
||||
(when th (kill-thread th))
|
||||
(update-qi @qi [th #f] [st 'queued] [stage reset-progress-to] [progress 0] [max-progress 0])
|
||||
(query-exec* "update wiki set progress = ? where wikiname = ?" reset-progress-to (qi^-wikiname (obs-peek @qi))))
|
||||
|
||||
(define (do-reseter-qi @qi)
|
||||
(do-reset-qi @qi)
|
||||
(query-exec* "delete from page where wikiname = ?" (qi^-wikiname (obs-peek @qi))))
|
||||
|
||||
(define (do-try-unpause-next-entry)
|
||||
(define queue (obs-peek @queue))
|
||||
(define next-qi (for/last ([qi queue]
|
||||
#:when (memq (qi^-st qi) '(paused queued)))
|
||||
qi))
|
||||
(when next-qi
|
||||
(define @qi (@queue . ~> . (λ (queue) (findf (λ (qi) (equal? (qi^-wikiname qi) (qi^-wikiname next-qi))) queue))))
|
||||
(do-start-qi @qi)))
|
||||
|
||||
(define main-window
|
||||
(render
|
||||
(window
|
||||
#:title "Fandom Archiver"
|
||||
#:size '(400 300)
|
||||
#:mixin (λ (%) (class %
|
||||
(super-new)
|
||||
(define/augment (on-close)
|
||||
(for ([qi (obs-peek @queue)])
|
||||
(when (qi^-th qi)
|
||||
(kill-thread (qi^-th qi))))
|
||||
#;(disconnect*))))
|
||||
(vpanel
|
||||
#:spacing 10
|
||||
#:margin '(5 5)
|
||||
(hpanel
|
||||
#:stretch '(#t #f)
|
||||
#:spacing 10
|
||||
(hpanel
|
||||
(text "https://")
|
||||
(input @input
|
||||
(λ (event data) (cond
|
||||
[(eq? event 'input) (:= @input data)]
|
||||
[(eq? event 'return) (do-add-to-queue)])))
|
||||
(text ".fandom.com"))
|
||||
(button "Download Wiki" do-add-to-queue))
|
||||
(list-view
|
||||
#:style '(vertical)
|
||||
@queue
|
||||
#:key qi^-wikiname
|
||||
(λ (k @qi)
|
||||
(define @status-icons
|
||||
(@> (case (qi^-st @qi)
|
||||
[(running) (stick (qi^-ticks @qi))]
|
||||
[else (hash-ref status-icons (qi^-st @qi))])))
|
||||
(define @is-running?
|
||||
(@> (memq (qi^-st @qi) '(running))))
|
||||
(define @is-complete?
|
||||
(@> (eq? (qi^-st @qi) 'complete)))
|
||||
;; state icon at the left side
|
||||
(hpanel #:stretch '(#t #f)
|
||||
#:alignment '(left center)
|
||||
#:spacing 8
|
||||
(bitmap-view @status-icons status-icon-min-width)
|
||||
(vpanel
|
||||
;; name and buttons (top half)
|
||||
(hpanel #:alignment '(left bottom)
|
||||
(text (@> (qi^-wikiname @qi)))
|
||||
(spacer)
|
||||
(hpanel
|
||||
#:stretch '(#f #f)
|
||||
|
||||
(if-view @is-running?
|
||||
(button (hash-ref action-icons 'pause)
|
||||
(λ () (do-stop-qi @qi)))
|
||||
(hpanel
|
||||
#:stretch '(#f #f)
|
||||
(button (hash-ref action-icons 'reseter)
|
||||
(λ () (do-reseter-qi @qi)))
|
||||
(button (hash-ref action-icons 'reset)
|
||||
(λ () (do-reset-qi @qi)))
|
||||
(button (hash-ref action-icons 'resume)
|
||||
(λ () (do-start-qi @qi)))))))
|
||||
;; progress bar (bottom half)
|
||||
(hpanel
|
||||
(canvas
|
||||
@qi
|
||||
#:style '(transparent)
|
||||
#:margin '(3 3)
|
||||
draw-bar)
|
||||
(hpanel #:min-size '(68 #f)
|
||||
#:stretch '(#f #f)
|
||||
#:alignment '(right center)
|
||||
(text (@> (format "eta ~a" (qi^-eta @qi))))))))))))))
|
|
@ -1,388 +0,0 @@
|
|||
#lang racket/base
|
||||
(require racket/file
|
||||
racket/format
|
||||
racket/function
|
||||
racket/list
|
||||
racket/path
|
||||
racket/sequence
|
||||
racket/string
|
||||
net/url
|
||||
net/mime
|
||||
file/sha1
|
||||
net/http-easy
|
||||
db
|
||||
json
|
||||
"archiver-database.rkt"
|
||||
"../lib/html-parsing/main.rkt"
|
||||
"../lib/mime-types.rkt"
|
||||
"../lib/syntax.rkt"
|
||||
"../lib/tree-updater.rkt"
|
||||
"../lib/url-utils.rkt"
|
||||
"../lib/xexpr-utils.rkt"
|
||||
"../lib/archive-file-mappings.rkt")
|
||||
|
||||
(provide
|
||||
basename->name-for-query
|
||||
image-url->values
|
||||
hash->save-dir
|
||||
all-stages)
|
||||
|
||||
(module+ test
|
||||
(require rackunit))
|
||||
|
||||
(define archive-root (anytime-path ".." "storage/archive"))
|
||||
(make-directory* archive-root)
|
||||
|
||||
(define sources '#hasheq((style . 1) (page . 2)))
|
||||
|
||||
(define (get-origin wikiname)
|
||||
(format "https://~a.fandom.com" wikiname))
|
||||
|
||||
(define (insert-wiki-entry wikiname)
|
||||
(define dest-url
|
||||
(format "https://~a.fandom.com/api.php?~a"
|
||||
wikiname
|
||||
(params->query '(("action" . "query")
|
||||
("meta" . "siteinfo")
|
||||
("siprop" . "general|rightsinfo|statistics|namespaces")
|
||||
("format" . "json")
|
||||
("formatversion" . "2")))))
|
||||
(define data (response-json (get dest-url)))
|
||||
(define content-nss
|
||||
(sort
|
||||
(for/list ([(k v) (in-hash (jp "/query/namespaces" data))]
|
||||
#:do [(define id (hash-ref v 'id))]
|
||||
#:when (and (id . < . 2900) ; exclude maps namespace
|
||||
(hash-ref v 'content))) ; exclude non-content and talk namespaces
|
||||
id)
|
||||
<))
|
||||
(define exists? (query-maybe-value* "select progress from wiki where wikiname = ?" wikiname))
|
||||
(if (and exists? (not (sql-null? exists?)))
|
||||
(query-exec* "update wiki set sitename = ?, basepage = ?, license_text = ?, license_url = ? where wikiname = ?"
|
||||
(jp "/query/general/sitename" data)
|
||||
(second (regexp-match #rx"/wiki/(.*)" (jp "/query/general/base" data)))
|
||||
(jp "/query/rightsinfo/text" data)
|
||||
(jp "/query/rightsinfo/url" data)
|
||||
wikiname)
|
||||
(query-exec* "insert into wiki (wikiname, progress, sitename, basepage, license_text, license_url) values (?, 0, ?, ?, ?, ?)"
|
||||
wikiname
|
||||
(jp "/query/general/sitename" data)
|
||||
(second (regexp-match #rx"/wiki/(.*)" (jp "/query/general/base" data)))
|
||||
(jp "/query/rightsinfo/text" data)
|
||||
(jp "/query/rightsinfo/url" data)))
|
||||
(values (jp "/query/statistics/articles" data)
|
||||
content-nss))
|
||||
|
||||
|
||||
(define (check-style-for-images wikiname path)
|
||||
(define content (file->string path))
|
||||
(define urls (regexp-match* #rx"url\\(\"?'?([^)]*)'?\"?\\)" content #:match-select cadr))
|
||||
(for/list ([url urls]
|
||||
#:when (not (or (equal? url "")
|
||||
(equal? url "'")
|
||||
(string-suffix? url "\"")
|
||||
(string-contains? url "/resources-ucp/")
|
||||
(string-contains? url "/fonts/")
|
||||
(string-contains? url "/drm_fonts/")
|
||||
(string-contains? url "//db.onlinewebfonts.com/")
|
||||
(string-contains? url "//bits.wikimedia.org/")
|
||||
(string-contains? url "mygamercard.net/")
|
||||
(string-contains? url "dropbox")
|
||||
(string-contains? url "only=styles")
|
||||
(string-contains? url "https://https://")
|
||||
(regexp-match? #rx"^%20" url)
|
||||
(regexp-match? #rx"^data:" url)
|
||||
(regexp-match? #rx"^file:" url))))
|
||||
(cond
|
||||
[(string-prefix? url "https://") url]
|
||||
[(string-prefix? url "http://") (regexp-replace #rx"http:" url "https:")]
|
||||
[(string-prefix? url "httpshttps://") (regexp-replace #rx"httpshttps://" url "https://")]
|
||||
[(string-prefix? url "//") (string-append "https:" url)]
|
||||
[(string-prefix? url "/") (format "https://~a.fandom.com~a" wikiname url)]
|
||||
[else (raise-user-error "While calling check-style-for-images, this URL had an unknown format and couldn't be saved:" url path)])))
|
||||
|
||||
(define (download-styles-for-wiki wikiname callback)
|
||||
(define save-dir (build-path archive-root wikiname "styles"))
|
||||
(make-directory* save-dir)
|
||||
(define theme (λ (theme-name)
|
||||
(cons (format "https://~a.fandom.com/wikia.php?controller=ThemeApi&method=themeVariables&variant=~a" wikiname theme-name)
|
||||
(build-path save-dir (format "themeVariables-~a.css" theme-name)))))
|
||||
;; (Listof (Pair url save-path))
|
||||
(define styles
|
||||
(list
|
||||
(theme "default")
|
||||
(theme "light")
|
||||
(theme "dark")
|
||||
(cons (format "https://~a.fandom.com/load.php?lang=en&modules=site.styles%7Cskin.fandomdesktop.styles%7Cext.fandom.PortableInfoboxFandomDesktop.css%7Cext.fandom.GlobalComponents.CommunityHeaderBackground.css%7Cext.gadget.site-styles%2Csound-styles&only=styles&skin=fandomdesktop" wikiname)
|
||||
(build-path save-dir "site.css"))))
|
||||
(for ([style styles]
|
||||
[i (in-naturals)])
|
||||
(callback i (length styles) "styles...")
|
||||
(define r (get (car style)))
|
||||
(define body (response-body r))
|
||||
(display-to-file body (cdr style) #:exists 'replace)
|
||||
;; XXX: how the HELL do I deal with @import?? would need some kind of recursion here. how will the page server know where to look up the style file to be able to serve them again? do I add another link-stylesheet tag to the main page? what about the remaining stuck @import url?
|
||||
)
|
||||
(callback (length styles) (length styles) "styles...")
|
||||
styles)
|
||||
|
||||
(define (hash->save-dir wikiname hash)
|
||||
(build-path archive-root wikiname "images" (substring hash 0 1) (substring hash 0 2)))
|
||||
|
||||
(define (image-url->values i)
|
||||
;; TODO: handle case where there is multiple broken cb parameter on minecraft wiki
|
||||
;; TODO: ensure it still "works" with broken & on minecraft wiki
|
||||
(define no-cb (regexp-replace #rx"\\cb=[0-9]+&?" i "")) ; remove cb url parameter which does nothing
|
||||
(define key (regexp-replace #rx"[&?]$" no-cb "")) ; remove extra separator if necessary
|
||||
(define hash (sha1 (string->bytes/utf-8 key)))
|
||||
(cons key hash))
|
||||
|
||||
|
||||
;; 1. Download list of wiki pages and store in database, if not done yet for that wiki
|
||||
(define (if-necessary-download-list-of-pages wikiname callback)
|
||||
(define wiki-progress (query-maybe-value* "select progress from wiki where wikiname = ?" wikiname))
|
||||
;; done yet?
|
||||
(unless (and (real? wiki-progress) (wiki-progress . >= . 1))
|
||||
;; Count total pages
|
||||
(define-values (num-pages namespaces) (insert-wiki-entry wikiname))
|
||||
;; Download the entire index of pages
|
||||
(for*/fold ([total 0])
|
||||
([namespace namespaces]
|
||||
[redir-filter '("nonredirects" "redirects")])
|
||||
(let loop ([apcontinue ""]
|
||||
[basenames null])
|
||||
(cond
|
||||
[apcontinue
|
||||
(define url (format "https://~a.fandom.com/api.php?~a"
|
||||
wikiname
|
||||
(params->query `(("action" . "query")
|
||||
("list" . "allpages")
|
||||
("apnamespace" . ,(~a namespace))
|
||||
("apfilterredir" . ,redir-filter)
|
||||
("aplimit" . "500")
|
||||
("apcontinue" . ,apcontinue)
|
||||
("format" . "json")
|
||||
("formatversion" . "2")))))
|
||||
;; Download the current listing page
|
||||
(define res (get url))
|
||||
(define json (response-json res))
|
||||
;; Content from this page
|
||||
(define current-basenames
|
||||
(for/list ([page (jp "/query/allpages" json)])
|
||||
(title->basename (jp "/title" page))))
|
||||
(when ((length current-basenames) . > . 0)
|
||||
;; Report
|
||||
(if (equal? redir-filter "nonredirects")
|
||||
(callback (+ (length basenames) (length current-basenames) total) num-pages (last current-basenames))
|
||||
(callback total num-pages (last current-basenames))))
|
||||
;; Loop
|
||||
(loop (jp "/continue/apcontinue" json #f) (append basenames current-basenames))]
|
||||
[else
|
||||
;; All done with this (loop)! Save those pages into the database
|
||||
;; SQLite can have a maximum of 32766 parameters in a single query
|
||||
(begin0
|
||||
;; next for*/fold
|
||||
(if (equal? redir-filter "nonredirects")
|
||||
(+ (length basenames) total)
|
||||
total) ; redirects don't count for the site statistics total
|
||||
(call-with-transaction
|
||||
(get-slc)
|
||||
(λ ()
|
||||
(for ([slice (in-slice 32760 basenames)])
|
||||
(define query-template
|
||||
(string-join #:before-first "insert or ignore into page (wikiname, redirect, basename, progress) values "
|
||||
(make-list (length slice) "(?1, ?2, ?, 0)") ", "))
|
||||
(apply query-exec* query-template wikiname (if (equal? redir-filter "redirects") 1 sql-null) slice)))))])))
|
||||
;; Record that we have the complete list of pages
|
||||
(query-exec* "update wiki set progress = 1 where wikiname = ?" wikiname)))
|
||||
|
||||
|
||||
;; 2. Download each page via API and:
|
||||
;; * Save API response to file
|
||||
(define max-page-progress 1)
|
||||
(define (save-each-page wikiname callback)
|
||||
;; prepare destination folder
|
||||
(define save-dir (build-path archive-root wikiname))
|
||||
(make-directory* save-dir)
|
||||
;; gather list of basenames to download (that aren't yet complete)
|
||||
(define basenames (query-list* "select basename from page where wikiname = ? and progress < ? and redirect is null"
|
||||
wikiname max-page-progress))
|
||||
;; counter of complete/incomplete basenames
|
||||
(define already-done-count
|
||||
(query-value* "select count(*) from page where wikiname = ? and progress = ?"
|
||||
wikiname max-page-progress))
|
||||
(define not-done-count
|
||||
(query-value* "select count(*) from page where wikiname = ? and progress < ?"
|
||||
wikiname max-page-progress))
|
||||
(define total-count (+ already-done-count not-done-count))
|
||||
;; set initial progress
|
||||
(callback already-done-count total-count "")
|
||||
;; loop through basenames and download
|
||||
(for ([basename basenames]
|
||||
[i (in-naturals (add1 already-done-count))])
|
||||
(define name-for-query (basename->name-for-query basename))
|
||||
(define dest-url
|
||||
(format "https://~a.fandom.com/api.php?~a"
|
||||
wikiname
|
||||
(params->query `(("action" . "parse")
|
||||
("page" . ,name-for-query)
|
||||
("prop" . "text|headhtml|langlinks")
|
||||
("formatversion" . "2")
|
||||
("format" . "json")))))
|
||||
(define r (get dest-url))
|
||||
(define body (response-body r))
|
||||
(define filename (string-append basename ".json"))
|
||||
(define save-path
|
||||
(cond [((string-length basename) . > . 240)
|
||||
(define key (sha1 (string->bytes/latin-1 basename)))
|
||||
(query-exec* "insert into special_page (wikiname, key, basename) values (?, ?, ?)"
|
||||
wikiname key basename)
|
||||
(build-path save-dir (string-append key ".json"))]
|
||||
[#t
|
||||
(build-path save-dir (string-append basename ".json"))]))
|
||||
(display-to-file body save-path #:exists 'replace)
|
||||
(query-exec* "update page set progress = 1 where wikiname = ? and basename = ?"
|
||||
wikiname basename)
|
||||
(callback i total-count basename))
|
||||
;; save redirects as well
|
||||
(save-redirects wikiname callback (+ already-done-count (length basenames)) total-count)
|
||||
;; saved all pages, register that fact in the database
|
||||
(query-exec* "update wiki set progress = 2 where wikiname = ? and progress <= 2" wikiname))
|
||||
|
||||
|
||||
;; 2.5. Download each redirect-target via API and save mapping in database
|
||||
(define (save-redirects wikiname callback already-done-count total-count)
|
||||
(define basenames (query-list* "select basename from page where wikiname = ? and progress < ? and redirect = 1"
|
||||
wikiname max-page-progress))
|
||||
;; loop through basenames, in slices of 50 (MediaWiki API max per request), and download
|
||||
(for ([basename basenames]
|
||||
[i (in-naturals (add1 already-done-count))])
|
||||
(define dest-url
|
||||
(format "https://~a.fandom.com/api.php?~a"
|
||||
wikiname
|
||||
(params->query `(("action" . "query")
|
||||
("prop" . "links")
|
||||
("titles" . ,(basename->name-for-query basename))
|
||||
("format" . "json")
|
||||
("formatversion" . "2")))))
|
||||
(define res (get dest-url))
|
||||
(define json (response-json res))
|
||||
(define dest-title (jp "/query/pages/0/links/0/title" json #f))
|
||||
(callback i total-count basename)
|
||||
(cond
|
||||
[dest-title
|
||||
;; store it
|
||||
(define dest-basename (title->basename dest-title))
|
||||
(query-exec* "update page set progress = 1, redirect = ? where wikiname = ? and basename = ?" dest-basename wikiname basename)]
|
||||
[else
|
||||
;; the page just doesn't exist
|
||||
(query-exec* "delete from page where wikiname = ? and basename = ?" wikiname basename)])))
|
||||
|
||||
|
||||
;; 3. Download CSS and:
|
||||
;; * Save CSS to file
|
||||
;; * Record style images to database
|
||||
(define (if-necessary-download-and-check-styles wikiname callback)
|
||||
(define wiki-progress (query-maybe-value* "select progress from wiki where wikiname = ?" wikiname))
|
||||
(unless (and (number? wiki-progress) (wiki-progress . >= . 3))
|
||||
(define styles (download-styles-for-wiki wikiname callback))
|
||||
(define unique-image-urls
|
||||
(remove-duplicates
|
||||
(map image-url->values
|
||||
(flatten
|
||||
(for/list ([style styles])
|
||||
(check-style-for-images wikiname (cdr style)))))
|
||||
#:key cdr))
|
||||
(for ([pair unique-image-urls])
|
||||
(query-exec* "insert or ignore into image (wikiname, url, hash, ext, source, progress) values (?, ?, ?, NULL, 1, 0)" wikiname (car pair) (cdr pair)))
|
||||
(query-exec* "update wiki set progress = 3 where wikiname = ?" wikiname)))
|
||||
|
||||
|
||||
;; 4: From downloaded pages, record URLs of image sources and inline style images to database
|
||||
(define (check-json-for-images wikiname path)
|
||||
(define data (with-input-from-file path (λ () (read-json))))
|
||||
(define page (html->xexp (preprocess-html-wiki (jp "/parse/text" data))))
|
||||
(define tree (update-tree-wiki page wikiname))
|
||||
null
|
||||
#;(remove-duplicates
|
||||
(for/list ([element (in-producer
|
||||
(query-selector
|
||||
(λ (t a c)
|
||||
(and (eq? t 'img)
|
||||
(get-attribute 'src a)))
|
||||
tree)
|
||||
#f)])
|
||||
(image-url->values (get-attribute 'src (bits->attributes element))))))
|
||||
|
||||
|
||||
;; 5. Download image sources and style images according to database
|
||||
(define (save-each-image wikiname callback)
|
||||
(define source (hash-ref sources 'style)) ;; TODO: download entire wiki images instead?
|
||||
;; gather list of basenames to download (that aren't yet complete)
|
||||
(define rows (query-rows* "select url, hash from image where wikiname = ? and source <= ? and progress < 1"
|
||||
wikiname source))
|
||||
;; counter of complete/incomplete basenames
|
||||
(define already-done-count
|
||||
(query-value* "select count(*) from image where wikiname = ? and source <= ? and progress = 1"
|
||||
wikiname source))
|
||||
(define not-done-count
|
||||
(query-value* "select count(*) from image where wikiname = ? and source <= ? and progress < 1"
|
||||
wikiname source))
|
||||
;; set initial progress
|
||||
(callback already-done-count (+ already-done-count not-done-count) "")
|
||||
;; loop through urls and download
|
||||
(for ([row rows]
|
||||
[i (in-naturals 1)])
|
||||
;; row fragments
|
||||
(define url (vector-ref row 0))
|
||||
(define hash (vector-ref row 1))
|
||||
;; check
|
||||
#;(printf "~a -> ~a~n" url hash)
|
||||
(define r (get url #:timeouts (make-timeout-config #:connect 15)))
|
||||
(define declared-type (response-headers-ref r 'content-type))
|
||||
(define final-type (if (equal? declared-type #"application/octet-stream")
|
||||
(let ([sniff-entity (message-entity (mime-analyze (response-body r)))])
|
||||
(string->bytes/latin-1 (format "~a/~a" (entity-type sniff-entity) (entity-subtype sniff-entity))))
|
||||
declared-type))
|
||||
(define ext
|
||||
(with-handlers ([exn:fail:contract? (λ _ (error 'save-each-image "no ext found for mime type `~a` in file ~a" final-type url))])
|
||||
(bytes->string/latin-1 (mime-type->ext final-type))))
|
||||
;; save
|
||||
(define save-dir (hash->save-dir wikiname hash))
|
||||
(make-directory* save-dir)
|
||||
(define save-path (build-path save-dir (string-append hash "." ext)))
|
||||
(define body (response-body r))
|
||||
(display-to-file body save-path #:exists 'replace)
|
||||
(query-exec* "update image set progress = 1, ext = ? where wikiname = ? and hash = ?"
|
||||
ext wikiname hash)
|
||||
(callback (+ already-done-count i) (+ already-done-count not-done-count) (string-append (substring hash 0 6) "..." ext)))
|
||||
;; saved all images, register that fact in the database
|
||||
(query-exec* "update wiki set progress = 4 where wikiname = ?" wikiname))
|
||||
|
||||
(define all-stages
|
||||
(list
|
||||
if-necessary-download-list-of-pages
|
||||
save-each-page
|
||||
if-necessary-download-and-check-styles
|
||||
;; check-json-for-images
|
||||
save-each-image))
|
||||
|
||||
(module+ test
|
||||
(check-equal? (html->xexp "<img src=\"https://example.com/images?src=Blah.jpg&width=150\">")
|
||||
'(*TOP* (img (@ (src "https://example.com/images?src=Blah.jpg&width=150")))))
|
||||
#;(download-list-of-pages "minecraft" values)
|
||||
#;(save-each-page "minecraft" values)
|
||||
#;(check-json-for-images "chiki" (build-path archive-root "chiki" "Fiona.json"))
|
||||
#;(do-step-3 "gallowmere")
|
||||
#;(save-each-image "gallowmere" (hash-ref sources 'style) (λ (a b c) (printf "~a/~a ~a~n" a b c)))
|
||||
|
||||
#;(for ([wikiname (query-list* "select wikiname from wiki")])
|
||||
(println wikiname)
|
||||
(insert-wiki-entry wikiname))
|
||||
|
||||
#;(for ([wikiname (query-list* "select wikiname from wiki")])
|
||||
(println wikiname)
|
||||
(do-step-3 wikiname)
|
||||
(save-each-image wikiname (hash-ref sources 'style) (λ (a b c) (printf "~a/~a ~a~n" a b c)))))
|
||||
|
||||
; (for ([stage all-stages]) (stage "create" (λ (a b c) (printf "~a/~a ~a~n" a b c))))
|
213
archiver/fts.rkt
213
archiver/fts.rkt
|
@ -1,213 +0,0 @@
|
|||
#lang cli
|
||||
(require (for-syntax racket/base))
|
||||
(require racket/format
|
||||
racket/function
|
||||
racket/future
|
||||
racket/match
|
||||
racket/path
|
||||
racket/promise
|
||||
racket/port
|
||||
racket/runtime-path
|
||||
racket/sequence
|
||||
racket/string
|
||||
file/gunzip
|
||||
db
|
||||
db/unsafe/sqlite3
|
||||
net/http-easy
|
||||
json
|
||||
json-pointer
|
||||
"../lib/html-parsing/main.rkt"
|
||||
"../lib/xexpr-utils.rkt"
|
||||
"../lib/tree-updater.rkt")
|
||||
|
||||
(flag (read-from-cache?)
|
||||
("-c" "--read-from-cache" "read from last run cache instead of rebuilding documents")
|
||||
(read-from-cache? #t))
|
||||
|
||||
(define-runtime-path storage-path "../storage/archive")
|
||||
|
||||
;; ***************************************************************************************************
|
||||
;; Progress bar display
|
||||
;; ***************************************************************************************************
|
||||
|
||||
(struct progress^ (n max title) #:transparent)
|
||||
|
||||
(define (make-m-s seconds)
|
||||
(define-values (eta-m eta-s) (quotient/remainder seconds 60))
|
||||
(format "~a:~a" eta-m (~a eta-s #:width 2 #:align 'right #:pad-string "0")))
|
||||
|
||||
(define (make-progress get-p [history-size 20])
|
||||
(define update-sleep 1)
|
||||
(define name-width 30)
|
||||
(define max-width 105)
|
||||
(define history (make-vector history-size 0))
|
||||
(define history-pointer 0)
|
||||
(define elapsed 0)
|
||||
(define (report-progress)
|
||||
(define p (get-p))
|
||||
(define history-cycle (vector-ref history history-pointer))
|
||||
(vector-set! history history-pointer (progress^-n p))
|
||||
(set! history-pointer (modulo (add1 history-pointer) history-size))
|
||||
(set! elapsed (add1 elapsed))
|
||||
(define-values (eta-display diff-per-second)
|
||||
(cond
|
||||
[((progress^-n p) . >= . (progress^-max p)) (values (format "~a **" (make-m-s elapsed)) (format "** ~a" (quotient (progress^-max p) (max elapsed 1))))]
|
||||
[(= history-cycle 0) (values "-:--" "--")]
|
||||
[else (define diff-per-second (/ (- (progress^-n p) history-cycle) (* history-size update-sleep)))
|
||||
(define eta-total
|
||||
(if (diff-per-second . > . 0)
|
||||
(floor (round (/ (- (progress^-max p) (progress^-n p)) diff-per-second)))
|
||||
0))
|
||||
(values (make-m-s eta-total)
|
||||
(round diff-per-second))]))
|
||||
(define left (format "~a/~a ~a/s ~a ~a%"
|
||||
(~a (progress^-n p) #:width (string-length (~a (progress^-max p))) #:align 'right #:pad-string " ")
|
||||
(progress^-max p)
|
||||
diff-per-second
|
||||
eta-display
|
||||
(floor (* 100 (/ (progress^-n p) (progress^-max p))))))
|
||||
(define name-display (~a (progress^-title p) #:max-width name-width #:limit-marker "..."))
|
||||
(define remaining-space (- max-width name-width (string-length left) 2))
|
||||
(define bar-width
|
||||
(floor (* (sub1 remaining-space)
|
||||
(/ (progress^-n p) (progress^-max p)))))
|
||||
(define bar (string-append (make-string bar-width #\=)
|
||||
">"
|
||||
(make-string (- remaining-space bar-width) #\ )))
|
||||
(printf "\e[2K\r~a~a~a" left bar name-display)
|
||||
(flush-output))
|
||||
(define (report-progress-loop)
|
||||
(sleep update-sleep)
|
||||
(report-progress)
|
||||
(report-progress-loop))
|
||||
(define t (thread report-progress-loop))
|
||||
(define (quit)
|
||||
(kill-thread t)
|
||||
(report-progress)
|
||||
(displayln ""))
|
||||
quit)
|
||||
|
||||
;; ***************************************************************************************************
|
||||
;; Page text extractor
|
||||
;; ***************************************************************************************************
|
||||
|
||||
(define (class-has? attributes substrs)
|
||||
(define cl (or (get-attribute 'class attributes) ""))
|
||||
(ormap (λ (substr) (string-contains? cl substr)) substrs))
|
||||
|
||||
(define (updater element element-type attributes children)
|
||||
(cond
|
||||
[(class-has? attributes '("collapsed" "selflink" "label" "toc" "editsection" "reviews"))
|
||||
(list 'div '() '())]
|
||||
[#t
|
||||
(list element-type attributes children)]))
|
||||
|
||||
(define (writer tables-mode? page)
|
||||
(define (writer-inner page)
|
||||
(for ([bit page])
|
||||
(cond
|
||||
[(and tables-mode? (pair? bit) (memq (car bit) '(h1 h2 h3 p blockquote q))) (void)]
|
||||
[(and (not tables-mode?) (pair? bit) (memq (car bit) '(ul ol dl table))) (void)]
|
||||
[(memq bit '(div p li td dd dt br)) (displayln "")]
|
||||
[(symbol? bit) (void)]
|
||||
[(and (pair? bit) (eq? (car bit) '*COMMENT*)) (void)]
|
||||
[(and (pair? bit) (eq? (car bit) '@)) (void)]
|
||||
[(pair? bit) (writer-inner bit)]
|
||||
[(string? bit) (display bit)])))
|
||||
(writer-inner page))
|
||||
|
||||
(define (write-and-post-process tables-mode? page)
|
||||
(define text (with-output-to-string (λ () (writer tables-mode? page))))
|
||||
;; (define text-no-numbers (regexp-replace* #px"(?:-|[+$£€¥] *)?[0-9,.]{2,}%?\\s*" text ""))
|
||||
(define shrink-text (regexp-replace* #px"([ \t]*\r?\n+)+" text "\n"))
|
||||
shrink-text)
|
||||
|
||||
(define ((extract f)) ; f - filename
|
||||
(with-handlers
|
||||
([exn:fail? (λ (err) (printf "extract: ~a: ~v~n" f err))])
|
||||
(define j
|
||||
(case (path-get-extension f)
|
||||
[(#".json")
|
||||
(with-input-from-file f (λ () (read-json)))]
|
||||
[(#".gz")
|
||||
(define-values (in out) (make-pipe))
|
||||
(with-input-from-file f (λ () (gunzip-through-ports (current-input-port) out)))
|
||||
(read-json in)]
|
||||
[else #f]))
|
||||
(define title (json-pointer-value "/parse/title" j))
|
||||
(define pageid (json-pointer-value "/parse/pageid" j))
|
||||
(define page-html (preprocess-html-wiki (json-pointer-value "/parse/text" j)))
|
||||
(define page (update-tree updater (html->xexp page-html)))
|
||||
(define body (write-and-post-process #f page))
|
||||
(define table (write-and-post-process #t page))
|
||||
(list title body table pageid)))
|
||||
|
||||
;; ***************************************************************************************************
|
||||
;; Program, loop, Solr APIs
|
||||
;; ***************************************************************************************************
|
||||
|
||||
(program
|
||||
(start [wikiname "wikiname to download"])
|
||||
|
||||
(define results
|
||||
(for/list ([f (directory-list (build-path storage-path wikiname) #:build? #t)]
|
||||
#:when (member (path-get-extension f) '(#".gz")))
|
||||
(extract f)))
|
||||
|
||||
(define data
|
||||
(cond
|
||||
[(and (read-from-cache?) (file-exists? "cache.rkt"))
|
||||
(define size (file-size "cache.rkt"))
|
||||
(call-with-input-file "cache.rkt"
|
||||
(λ (in)
|
||||
(define quit (make-progress (λ () (progress^ (ceiling (/ (file-position in) 64 1024))
|
||||
(ceiling (/ size 64 1024))
|
||||
"Reading in..."))
|
||||
2))
|
||||
(begin0
|
||||
(read in)
|
||||
(quit))))]
|
||||
[else
|
||||
(define x (box (progress^ 0 1 "...")))
|
||||
(define quit (make-progress (λ () (unbox x))))
|
||||
(define data
|
||||
(for/list ([fut results]
|
||||
[i (in-naturals 1)]
|
||||
#:do [(define page (fut))]
|
||||
#:when (not (void? page)))
|
||||
(match-define (list title body table pageid) page)
|
||||
(define len (string-length body))
|
||||
(set-box! x (progress^ i (length results) title))
|
||||
`#hasheq((id . ,(number->string pageid))
|
||||
(title . ,title)
|
||||
(body . ,body)
|
||||
(table . ,table)
|
||||
(len . ,len))))
|
||||
(quit)
|
||||
|
||||
(display "Writing out... ")
|
||||
(flush-output)
|
||||
(with-output-to-file "cache.rkt" (λ () (write data)) #:exists 'truncate/replace)
|
||||
data]))
|
||||
|
||||
(display "Converting... ")
|
||||
(flush-output)
|
||||
(define slice-size 30000)
|
||||
(define slices (ceiling (/ (length data) slice-size)))
|
||||
(for ([slice (in-slice slice-size data)]
|
||||
[i (in-naturals 1)])
|
||||
(define ser (jsexpr->bytes slice))
|
||||
(define ser-port (open-input-bytes ser))
|
||||
(define quit (make-progress (λ () (progress^ (ceiling (/ (file-position ser-port) 64 1024))
|
||||
(ceiling (/ (bytes-length ser) 64 1024))
|
||||
(format "Posting... (~a/~a)" i slices)))
|
||||
2))
|
||||
(define res
|
||||
(post (format "http://localhost:8983/solr/~a/update?commit=true" wikiname)
|
||||
#:data ser-port
|
||||
#:headers '#hasheq((Content-Type . "application/json"))
|
||||
#:timeouts (make-timeout-config #:lease 5 #:connect 5 #:request 300)))
|
||||
(quit)
|
||||
(displayln (response-status-line res))))
|
||||
|
||||
(run start)
|
|
@ -1,3 +0,0 @@
|
|||
#lang info
|
||||
|
||||
(define build-deps '("rackunit-lib" "web-server-lib" "http-easy-lib" "html-parsing" "html-writing" "json-pointer" "ini-lib" "memo" "net-cookies-lib" "gui-easy-lib" "sql" "charterm" "cli"))
|
|
@ -1 +0,0 @@
|
|||
((local (".")))
|
|
@ -1,8 +0,0 @@
|
|||
# Set of Catalan contractions for ElisionFilter
|
||||
# TODO: load this as a resource from the analyzer and sync it in build.xml
|
||||
d
|
||||
l
|
||||
m
|
||||
n
|
||||
s
|
||||
t
|
|
@ -1,15 +0,0 @@
|
|||
# Set of French contractions for ElisionFilter
|
||||
# TODO: load this as a resource from the analyzer and sync it in build.xml
|
||||
l
|
||||
m
|
||||
t
|
||||
qu
|
||||
n
|
||||
s
|
||||
j
|
||||
d
|
||||
c
|
||||
jusqu
|
||||
quoiqu
|
||||
lorsqu
|
||||
puisqu
|
|
@ -1,5 +0,0 @@
|
|||
# Set of Irish contractions for ElisionFilter
|
||||
# TODO: load this as a resource from the analyzer and sync it in build.xml
|
||||
d
|
||||
m
|
||||
b
|
|
@ -1,23 +0,0 @@
|
|||
# Set of Italian contractions for ElisionFilter
|
||||
# TODO: load this as a resource from the analyzer and sync it in build.xml
|
||||
c
|
||||
l
|
||||
all
|
||||
dall
|
||||
dell
|
||||
nell
|
||||
sull
|
||||
coll
|
||||
pell
|
||||
gl
|
||||
agl
|
||||
dagl
|
||||
degl
|
||||
negl
|
||||
sugl
|
||||
un
|
||||
m
|
||||
t
|
||||
s
|
||||
v
|
||||
d
|
|
@ -1,5 +0,0 @@
|
|||
# Set of Irish hyphenations for StopFilter
|
||||
# TODO: load this as a resource from the analyzer and sync it in build.xml
|
||||
h
|
||||
n
|
||||
t
|
|
@ -1,6 +0,0 @@
|
|||
# Set of overrides for the dutch stemmer
|
||||
# TODO: load this as a resource from the analyzer and sync it in build.xml
|
||||
fiets fiets
|
||||
bromfiets bromfiets
|
||||
ei eier
|
||||
kind kinder
|
|
@ -1,420 +0,0 @@
|
|||
#
|
||||
# This file defines a Japanese stoptag set for JapanesePartOfSpeechStopFilter.
|
||||
#
|
||||
# Any token with a part-of-speech tag that exactly matches those defined in this
|
||||
# file are removed from the token stream.
|
||||
#
|
||||
# Set your own stoptags by uncommenting the lines below. Note that comments are
|
||||
# not allowed on the same line as a stoptag. See LUCENE-3745 for frequency lists,
|
||||
# etc. that can be useful for building you own stoptag set.
|
||||
#
|
||||
# The entire possible tagset is provided below for convenience.
|
||||
#
|
||||
#####
|
||||
# noun: unclassified nouns
|
||||
#名詞
|
||||
#
|
||||
# noun-common: Common nouns or nouns where the sub-classification is undefined
|
||||
#名詞-一般
|
||||
#
|
||||
# noun-proper: Proper nouns where the sub-classification is undefined
|
||||
#名詞-固有名詞
|
||||
#
|
||||
# noun-proper-misc: miscellaneous proper nouns
|
||||
#名詞-固有名詞-一般
|
||||
#
|
||||
# noun-proper-person: Personal names where the sub-classification is undefined
|
||||
#名詞-固有名詞-人名
|
||||
#
|
||||
# noun-proper-person-misc: names that cannot be divided into surname and
|
||||
# given name; foreign names; names where the surname or given name is unknown.
|
||||
# e.g. お市の方
|
||||
#名詞-固有名詞-人名-一般
|
||||
#
|
||||
# noun-proper-person-surname: Mainly Japanese surnames.
|
||||
# e.g. 山田
|
||||
#名詞-固有名詞-人名-姓
|
||||
#
|
||||
# noun-proper-person-given_name: Mainly Japanese given names.
|
||||
# e.g. 太郎
|
||||
#名詞-固有名詞-人名-名
|
||||
#
|
||||
# noun-proper-organization: Names representing organizations.
|
||||
# e.g. 通産省, NHK
|
||||
#名詞-固有名詞-組織
|
||||
#
|
||||
# noun-proper-place: Place names where the sub-classification is undefined
|
||||
#名詞-固有名詞-地域
|
||||
#
|
||||
# noun-proper-place-misc: Place names excluding countries.
|
||||
# e.g. アジア, バルセロナ, 京都
|
||||
#名詞-固有名詞-地域-一般
|
||||
#
|
||||
# noun-proper-place-country: Country names.
|
||||
# e.g. 日本, オーストラリア
|
||||
#名詞-固有名詞-地域-国
|
||||
#
|
||||
# noun-pronoun: Pronouns where the sub-classification is undefined
|
||||
#名詞-代名詞
|
||||
#
|
||||
# noun-pronoun-misc: miscellaneous pronouns:
|
||||
# e.g. それ, ここ, あいつ, あなた, あちこち, いくつ, どこか, なに, みなさん, みんな, わたくし, われわれ
|
||||
#名詞-代名詞-一般
|
||||
#
|
||||
# noun-pronoun-contraction: Spoken language contraction made by combining a
|
||||
# pronoun and the particle 'wa'.
|
||||
# e.g. ありゃ, こりゃ, こりゃあ, そりゃ, そりゃあ
|
||||
#名詞-代名詞-縮約
|
||||
#
|
||||
# noun-adverbial: Temporal nouns such as names of days or months that behave
|
||||
# like adverbs. Nouns that represent amount or ratios and can be used adverbially,
|
||||
# e.g. 金曜, 一月, 午後, 少量
|
||||
#名詞-副詞可能
|
||||
#
|
||||
# noun-verbal: Nouns that take arguments with case and can appear followed by
|
||||
# 'suru' and related verbs (する, できる, なさる, くださる)
|
||||
# e.g. インプット, 愛着, 悪化, 悪戦苦闘, 一安心, 下取り
|
||||
#名詞-サ変接続
|
||||
#
|
||||
# noun-adjective-base: The base form of adjectives, words that appear before な ("na")
|
||||
# e.g. 健康, 安易, 駄目, だめ
|
||||
#名詞-形容動詞語幹
|
||||
#
|
||||
# noun-numeric: Arabic numbers, Chinese numerals, and counters like 何 (回), 数.
|
||||
# e.g. 0, 1, 2, 何, 数, 幾
|
||||
#名詞-数
|
||||
#
|
||||
# noun-affix: noun affixes where the sub-classification is undefined
|
||||
#名詞-非自立
|
||||
#
|
||||
# noun-affix-misc: Of adnominalizers, the case-marker の ("no"), and words that
|
||||
# attach to the base form of inflectional words, words that cannot be classified
|
||||
# into any of the other categories below. This category includes indefinite nouns.
|
||||
# e.g. あかつき, 暁, かい, 甲斐, 気, きらい, 嫌い, くせ, 癖, こと, 事, ごと, 毎, しだい, 次第,
|
||||
# 順, せい, 所為, ついで, 序で, つもり, 積もり, 点, どころ, の, はず, 筈, はずみ, 弾み,
|
||||
# 拍子, ふう, ふり, 振り, ほう, 方, 旨, もの, 物, 者, ゆえ, 故, ゆえん, 所以, わけ, 訳,
|
||||
# わり, 割り, 割, ん-口語/, もん-口語/
|
||||
#名詞-非自立-一般
|
||||
#
|
||||
# noun-affix-adverbial: noun affixes that that can behave as adverbs.
|
||||
# e.g. あいだ, 間, あげく, 挙げ句, あと, 後, 余り, 以外, 以降, 以後, 以上, 以前, 一方, うえ,
|
||||
# 上, うち, 内, おり, 折り, かぎり, 限り, きり, っきり, 結果, ころ, 頃, さい, 際, 最中, さなか,
|
||||
# 最中, じたい, 自体, たび, 度, ため, 為, つど, 都度, とおり, 通り, とき, 時, ところ, 所,
|
||||
# とたん, 途端, なか, 中, のち, 後, ばあい, 場合, 日, ぶん, 分, ほか, 他, まえ, 前, まま,
|
||||
# 儘, 侭, みぎり, 矢先
|
||||
#名詞-非自立-副詞可能
|
||||
#
|
||||
# noun-affix-aux: noun affixes treated as 助動詞 ("auxiliary verb") in school grammars
|
||||
# with the stem よう(だ) ("you(da)").
|
||||
# e.g. よう, やう, 様 (よう)
|
||||
#名詞-非自立-助動詞語幹
|
||||
#
|
||||
# noun-affix-adjective-base: noun affixes that can connect to the indeclinable
|
||||
# connection form な (aux "da").
|
||||
# e.g. みたい, ふう
|
||||
#名詞-非自立-形容動詞語幹
|
||||
#
|
||||
# noun-special: special nouns where the sub-classification is undefined.
|
||||
#名詞-特殊
|
||||
#
|
||||
# noun-special-aux: The そうだ ("souda") stem form that is used for reporting news, is
|
||||
# treated as 助動詞 ("auxiliary verb") in school grammars, and attach to the base
|
||||
# form of inflectional words.
|
||||
# e.g. そう
|
||||
#名詞-特殊-助動詞語幹
|
||||
#
|
||||
# noun-suffix: noun suffixes where the sub-classification is undefined.
|
||||
#名詞-接尾
|
||||
#
|
||||
# noun-suffix-misc: Of the nouns or stem forms of other parts of speech that connect
|
||||
# to ガル or タイ and can combine into compound nouns, words that cannot be classified into
|
||||
# any of the other categories below. In general, this category is more inclusive than
|
||||
# 接尾語 ("suffix") and is usually the last element in a compound noun.
|
||||
# e.g. おき, かた, 方, 甲斐 (がい), がかり, ぎみ, 気味, ぐるみ, (~した) さ, 次第, 済 (ず) み,
|
||||
# よう, (でき)っこ, 感, 観, 性, 学, 類, 面, 用
|
||||
#名詞-接尾-一般
|
||||
#
|
||||
# noun-suffix-person: Suffixes that form nouns and attach to person names more often
|
||||
# than other nouns.
|
||||
# e.g. 君, 様, 著
|
||||
#名詞-接尾-人名
|
||||
#
|
||||
# noun-suffix-place: Suffixes that form nouns and attach to place names more often
|
||||
# than other nouns.
|
||||
# e.g. 町, 市, 県
|
||||
#名詞-接尾-地域
|
||||
#
|
||||
# noun-suffix-verbal: Of the suffixes that attach to nouns and form nouns, those that
|
||||
# can appear before スル ("suru").
|
||||
# e.g. 化, 視, 分け, 入り, 落ち, 買い
|
||||
#名詞-接尾-サ変接続
|
||||
#
|
||||
# noun-suffix-aux: The stem form of そうだ (様態) that is used to indicate conditions,
|
||||
# is treated as 助動詞 ("auxiliary verb") in school grammars, and attach to the
|
||||
# conjunctive form of inflectional words.
|
||||