forked from cadence/breezewiki
Create archiver and offline code handlers
Somewhat messy. Will clean up gradually in future commits.
This commit is contained in:
parent
b8e5fb8dc5
commit
c7cce5479d
46 changed files with 4274 additions and 407 deletions
28
lib/archive-file-mappings.rkt
Normal file
28
lib/archive-file-mappings.rkt
Normal file
|
@ -0,0 +1,28 @@
|
|||
#lang racket/base
|
||||
(require racket/string
|
||||
net/url
|
||||
(only-in net/uri-codec uri-decode)
|
||||
"url-utils.rkt")
|
||||
(provide
|
||||
local-encoded-url->segments
|
||||
url-segments->basename
|
||||
local-encoded-url->basename
|
||||
basename->name-for-query
|
||||
url-segments->guess-title)
|
||||
|
||||
(define (local-encoded-url->segments str) ; '("wiki" "Page_title")
|
||||
(map path/param-path (url-path (string->url str))))
|
||||
|
||||
(define (url-segments->basename segments) ; "Page_title" filename encoded, no extension or dir prefix
|
||||
(define extra-encoded (map (λ (s) (bytes->string/latin-1 (percent-encode s filename-set #f))) (cdr segments)))
|
||||
(define basic-filename (string-join extra-encoded "#"))
|
||||
basic-filename)
|
||||
|
||||
(define (local-encoded-url->basename str) ; '("wiki" "Page_title"), no extension or dir prefix
|
||||
(url-segments->basename (local-encoded-url->segments str)))
|
||||
|
||||
(define (basename->name-for-query str)
|
||||
(uri-decode (regexp-replace* #rx"#" str "/")))
|
||||
|
||||
(define (url-segments->guess-title segments)
|
||||
(regexp-replace* #rx"_" (cadr segments) " "))
|
1887
lib/html-parsing/main.rkt
Normal file
1887
lib/html-parsing/main.rkt
Normal file
File diff suppressed because it is too large
Load diff
34
lib/mime-types.rkt
Normal file
34
lib/mime-types.rkt
Normal file
|
@ -0,0 +1,34 @@
|
|||
#lang racket/base
|
||||
(require racket/contract
|
||||
racket/match
|
||||
racket/path
|
||||
racket/runtime-path
|
||||
racket/string)
|
||||
|
||||
(provide
|
||||
(contract-out
|
||||
[ext->mime-type (-> bytes? bytes?)]
|
||||
[mime-type->ext (-> bytes? bytes?)]))
|
||||
|
||||
(define-runtime-path mime.types-path "mime.types")
|
||||
|
||||
(define ls
|
||||
(call-with-input-file mime.types-path
|
||||
(λ (in) (for/list ([line (in-lines in)]
|
||||
#:when (not (regexp-match? #rx"^ *($|#)" line)))
|
||||
(match line
|
||||
[(regexp #rx"^([^ ]+) +(.+)$" (list _ mime ext))
|
||||
(cons (string->bytes/utf-8 ext) (string->bytes/utf-8 mime))]
|
||||
[(regexp #rx"^ *#") (void)]
|
||||
[_ (log-warning "mime-types: failed to parse line ~s" line)])))))
|
||||
|
||||
(define forward-hash (make-immutable-hash ls))
|
||||
(define reverse-hash (make-immutable-hash (map (λ (x) (cons (cdr x) (car x))) ls)))
|
||||
|
||||
(define (ext->mime-type ext-in)
|
||||
(define ext (regexp-replace #rx"^\\." ext-in #""))
|
||||
(hash-ref forward-hash ext))
|
||||
|
||||
(define (mime-type->ext m-in)
|
||||
(define m (regexp-replace #rx";.*" m-in #""))
|
||||
(hash-ref reverse-hash m))
|
85
lib/mime.types
Normal file
85
lib/mime.types
Normal file
|
@ -0,0 +1,85 @@
|
|||
text/html html
|
||||
text/css css
|
||||
text/xml xml
|
||||
image/gif gif
|
||||
image/jpeg jpeg
|
||||
application/javascript js
|
||||
text/javascript js
|
||||
application/atom+xml atom
|
||||
application/rss+xml rss
|
||||
|
||||
text/mathml mml
|
||||
text/plain txt
|
||||
text/x-component htc
|
||||
|
||||
image/png png
|
||||
image/tiff tiff
|
||||
image/vnd.wap.wbmp wbmp
|
||||
image/x-icon ico
|
||||
image/x-jng jng
|
||||
image/x-ms-bmp bmp
|
||||
image/svg+xml svg
|
||||
image/webp webp
|
||||
|
||||
application/font-woff2 woff2
|
||||
application/acad woff2
|
||||
font/woff2 woff2
|
||||
application/font-woff woff
|
||||
application/x-font-ttf ttf
|
||||
application/x-font-truetype ttf
|
||||
application/x-truetype-font ttf
|
||||
application/font-sfnt ttf
|
||||
font/sfnt ttf
|
||||
application/vnd.oasis.opendocument.formula-template otf
|
||||
application/x-font-opentype otf
|
||||
application/vnd.ms-opentype otf
|
||||
font/otf otf
|
||||
application/java-archive jar
|
||||
application/json json
|
||||
application/mac-binhex40 hqx
|
||||
application/msword doc
|
||||
application/pdf pdf
|
||||
application/postscript ps
|
||||
application/rtf rtf
|
||||
application/vnd.apple.mpegurl m3u8
|
||||
application/vnd.ms-excel xls
|
||||
application/vnd.ms-fontobject eot
|
||||
application/vnd.ms-powerpoint ppt
|
||||
application/vnd.wap.wmlc wmlc
|
||||
application/vnd.google-earth.kml+xml kml
|
||||
application/vnd.google-earth.kmz kmz
|
||||
application/x-7z-compressed 7z
|
||||
application/x-cocoa cco
|
||||
application/x-java-archive-diff jardiff
|
||||
application/x-java-jnlp-file jnlp
|
||||
application/x-makeself run
|
||||
application/x-perl pl
|
||||
application/x-rar-compressed rar
|
||||
application/x-redhat-package-manager rpm
|
||||
application/x-sea sea
|
||||
application/x-shockwave-flash swf
|
||||
application/x-stuffit sit
|
||||
application/x-tcl tcl
|
||||
application/x-x509-ca-cert pem
|
||||
application/x-xpinstall xpi
|
||||
application/xhtml+xml xhtml
|
||||
application/xspf+xml xspf
|
||||
application/zip zip
|
||||
application/gzip gz
|
||||
|
||||
audio/midi mid midi kar
|
||||
audio/mpeg mp3
|
||||
audio/ogg ogg
|
||||
audio/x-m4a m4a
|
||||
audio/x-realaudio ra
|
||||
|
||||
video/mp2t ts
|
||||
video/mp4 mp4
|
||||
video/mpeg mpeg
|
||||
video/quicktime mov
|
||||
video/webm webm
|
||||
video/x-flv flv
|
||||
video/x-m4v m4v
|
||||
video/x-mng mng
|
||||
video/x-ms-wmv wmv
|
||||
video/x-msvideo avi
|
45
lib/pure-utils.rkt
Normal file
45
lib/pure-utils.rkt
Normal file
|
@ -0,0 +1,45 @@
|
|||
#lang typed/racket/base
|
||||
|
||||
(provide
|
||||
; call the updater on the dictionary key only if it has that key
|
||||
alist-maybe-update
|
||||
; update a value only if a condition succeeds on it
|
||||
u
|
||||
; like string-join, but for lists
|
||||
list-join
|
||||
u-counter)
|
||||
|
||||
(module+ test
|
||||
(require "typed-rackunit.rkt"))
|
||||
|
||||
(define u-counter (box 0))
|
||||
|
||||
(: alist-maybe-update (∀ (A B) ((Listof (Pairof A B)) A (B -> B) -> (Listof (Pairof A B)))))
|
||||
(define (alist-maybe-update alist key updater)
|
||||
(set-box! u-counter (add1 (unbox u-counter)))
|
||||
(map (λ ([p : (Pairof A B)])
|
||||
(if (eq? (car p) key)
|
||||
(cons (car p) (updater (cdr p)))
|
||||
p))
|
||||
alist))
|
||||
(module+ test
|
||||
(check-equal? (alist-maybe-update '((a . 5) (b . 6)) 'a (λ ([x : Number]) (+ x 10)))
|
||||
'((a . 15) (b . 6)))
|
||||
(check-equal? (alist-maybe-update '((b . 6)) 'a (λ ([x : Number]) (+ x 10)))
|
||||
'((b . 6))))
|
||||
|
||||
(: u (∀ (A) ((A -> Any) (A -> A) A -> A)))
|
||||
(define (u condition updater value)
|
||||
(set-box! u-counter (add1 (unbox u-counter)))
|
||||
(if (condition value) (updater value) value))
|
||||
(module+ test
|
||||
(check-equal? (u (λ ([x : Integer]) (< x 5)) (λ ([x : Integer]) (* x -1)) 4) -4)
|
||||
(check-equal? (u (λ ([x : Integer]) (< x 5)) (λ ([x : Integer]) (* x -1)) 8) 8))
|
||||
|
||||
(: list-join (∀ (A B) (A (Listof B) -> (Listof (U A B)))))
|
||||
(define (list-join element ls)
|
||||
(if (pair? (cdr ls))
|
||||
(list* (car ls) element (list-join element (cdr ls)))
|
||||
(list (car ls))))
|
||||
(module+ test
|
||||
(check-equal? (list-join "h" '(2 3 4 5)) '(2 "h" 3 "h" 4 "h" 5)))
|
161
lib/syntax.rkt
Normal file
161
lib/syntax.rkt
Normal file
|
@ -0,0 +1,161 @@
|
|||
#lang racket/base
|
||||
(require (for-syntax racket/base))
|
||||
|
||||
(provide
|
||||
; help make a nested if. if/in will gain the same false form of its containing if/out.
|
||||
if/out
|
||||
; let, but the value for each variable is evaluated within a thread
|
||||
thread-let
|
||||
; cond, but values can be defined between conditions
|
||||
cond/var
|
||||
; wrap sql statements into lambdas so they can be executed during migration
|
||||
wrap-sql)
|
||||
|
||||
(module+ test
|
||||
(require rackunit)
|
||||
(define (check-syntax-equal? s1 s2)
|
||||
(check-equal? (syntax->datum s1)
|
||||
(syntax->datum s2))))
|
||||
|
||||
;; actual transforming goes on in here.
|
||||
;; it's in a submodule so that it can be required in both levels, for testing
|
||||
|
||||
(module transform racket/base
|
||||
(require racket/list)
|
||||
|
||||
(provide
|
||||
transform-if/out
|
||||
transform-thread-let
|
||||
transform/out-cond/var)
|
||||
|
||||
(define (transform-if/out stx)
|
||||
(define tree (cdr (syntax->datum stx))) ; condition true false
|
||||
(define else (cddr tree)) ; the else branch cons cell
|
||||
(define result
|
||||
(let walk ([node tree])
|
||||
(cond
|
||||
; normally, node should be a full cons cell (a pair) but it might be something else.
|
||||
; situation: reached the end of a list, empty cons cell
|
||||
[(null? node) node]
|
||||
; situation: reached the end of a list, cons cdr was non-list
|
||||
[(symbol? node) node]
|
||||
; normal situation, full cons cell
|
||||
; -- don't go replacing through nested if/out
|
||||
[(and (pair? node) (eq? 'if/out (car node))) node]
|
||||
; -- replace if/in
|
||||
[(and (pair? node) (eq? 'if/in (car node)))
|
||||
(append '(if) (walk (cdr node)) else)]
|
||||
; recurse down pair head and tail
|
||||
[(pair? node) (cons (walk (car node)) (walk (cdr node)))]
|
||||
; something else that can't be recursed into, so pass it through
|
||||
[#t node])))
|
||||
(datum->syntax stx (cons 'if result)))
|
||||
|
||||
(define (transform-thread-let stx)
|
||||
(define tree (cdr (syntax->datum stx)))
|
||||
(define defs (car tree))
|
||||
(define forms (cdr tree))
|
||||
(when (eq? (length forms) 0)
|
||||
(error (format "thread-let: bad syntax (need some forms to execute after the threads)~n forms: ~a" forms)))
|
||||
(define counter (build-list (length defs) values))
|
||||
(datum->syntax
|
||||
stx
|
||||
`(let ([chv (build-vector ,(length defs) (λ (_) (make-channel)))])
|
||||
,@(map (λ (n)
|
||||
(define def (list-ref defs n))
|
||||
`(thread (λ () (channel-put (vector-ref chv ,n) (let _ () ,@(cdr def))))))
|
||||
counter)
|
||||
(let ,(map (λ (n)
|
||||
(define def (list-ref defs n))
|
||||
`(,(car def) (channel-get (vector-ref chv ,n))))
|
||||
counter)
|
||||
,@forms))))
|
||||
|
||||
(define (transform/out-cond/var stx)
|
||||
(define tree (transform-cond/var (cdr (syntax->datum stx))))
|
||||
(datum->syntax
|
||||
stx
|
||||
tree))
|
||||
|
||||
(define (transform-cond/var tree)
|
||||
(define-values (els temp) (splitf-at tree (λ (el) (and (pair? el) (not (eq? (car el) 'var))))))
|
||||
(define-values (vars rest) (splitf-at temp (λ (el) (and (pair? el) (eq? (car el) 'var)))))
|
||||
(if (null? rest)
|
||||
`(cond ,@els)
|
||||
`(cond
|
||||
,@els
|
||||
[#t
|
||||
(let ,(for/list ([var vars])
|
||||
(cdr var))
|
||||
,(transform-cond/var rest))]))))
|
||||
|
||||
;; the syntax definitions and their tests go below here
|
||||
|
||||
(require 'transform (for-syntax 'transform))
|
||||
|
||||
(define-syntax (wrap-sql stx)
|
||||
; the arguments
|
||||
(define xs (cdr (syntax->list stx)))
|
||||
; wrap each argument
|
||||
(define wrapped (map (λ (xe) ; xe is the syntax of an argument
|
||||
(if (list? (car (syntax->datum xe)))
|
||||
; it's a list of lists (a list of sql migration steps)
|
||||
; return instead syntax of a lambda that will call everything in xe
|
||||
(datum->syntax stx `(λ () ,@xe))
|
||||
; it's just a single sql migration step
|
||||
; return instead syntax of a lambda that will call xe
|
||||
(datum->syntax stx `(λ () ,xe))))
|
||||
xs))
|
||||
; since I'm returning *code*, I need to return the form (list ...) so that runtime makes a list
|
||||
(datum->syntax stx `(list ,@wrapped)))
|
||||
|
||||
(define-syntax (if/out stx)
|
||||
(transform-if/out stx))
|
||||
(module+ test
|
||||
(check-syntax-equal? (transform-if/out #'(if/out (condition 1) (if/in (condition 2) (do-yes)) (do-no)))
|
||||
#'(if (condition 1) (if (condition 2) (do-yes) (do-no)) (do-no)))
|
||||
(check-equal? (if/out #t (if/in #t 'yes) 'no) 'yes)
|
||||
(check-equal? (if/out #f (if/in #t 'yes) 'no) 'no)
|
||||
(check-equal? (if/out #t (if/in #f 'yes) 'no) 'no)
|
||||
(check-equal? (if/out #f (if/in #f 'yes) 'no) 'no))
|
||||
|
||||
(define-syntax (thread-let stx)
|
||||
(transform-thread-let stx))
|
||||
(module+ test
|
||||
; check that it is transformed as expected
|
||||
(check-syntax-equal?
|
||||
(transform-thread-let
|
||||
#'(thread-let ([a (hey "this is a")]
|
||||
[b (hey "this is b")])
|
||||
(list a b)))
|
||||
#'(let ([chv (build-vector 2 (λ (_) (make-channel)))])
|
||||
(thread (λ () (channel-put (vector-ref chv 0) (let _ () (hey "this is a")))))
|
||||
(thread (λ () (channel-put (vector-ref chv 1) (let _ () (hey "this is b")))))
|
||||
(let ([a (channel-get (vector-ref chv 0))]
|
||||
[b (channel-get (vector-ref chv 1))])
|
||||
(list a b))))
|
||||
; check that they actually execute concurrently
|
||||
(define ch (make-channel))
|
||||
(check-equal? (thread-let ([a (begin
|
||||
(channel-put ch 'a)
|
||||
(channel-get ch))]
|
||||
[b (begin0
|
||||
(channel-get ch)
|
||||
(channel-put ch 'b))])
|
||||
(list a b))
|
||||
'(b a))
|
||||
; check that it assigns the correct value to the correct variable
|
||||
(check-equal? (thread-let ([a (sleep 0) 'a] [b 'b]) (list a b))
|
||||
'(a b)))
|
||||
|
||||
(define-syntax (cond/var stx)
|
||||
(transform/out-cond/var stx))
|
||||
(module+ test
|
||||
(check-syntax-equal? (transform/out-cond/var #'(cond/def [#f 0] (var d (* a 2)) [(eq? d 8) d] [#t "not 4"]))
|
||||
#'(cond
|
||||
[#f 0]
|
||||
[#t
|
||||
(let ([d (* a 2)])
|
||||
(cond
|
||||
[(eq? d 8) d]
|
||||
[#t "not 4"]))])))
|
284
lib/tree-updater.rkt
Normal file
284
lib/tree-updater.rkt
Normal file
|
@ -0,0 +1,284 @@
|
|||
#lang racket/base
|
||||
(require racket/dict
|
||||
racket/function
|
||||
racket/match
|
||||
racket/string
|
||||
"pure-utils.rkt"
|
||||
"url-utils.rkt"
|
||||
"xexpr-utils.rkt")
|
||||
|
||||
(provide
|
||||
preprocess-html-wiki
|
||||
update-tree-wiki)
|
||||
|
||||
(define (preprocess-html-wiki html)
|
||||
(define ((rr* find replace) contents)
|
||||
(regexp-replace* find contents replace))
|
||||
((compose1
|
||||
; fix navbox list nesting
|
||||
; navbox on right of page has incorrect html "<td ...><li>" and the xexpr parser puts the <li> much further up the tree
|
||||
; add a <ul> to make the parser happy
|
||||
; usage: /fallout/wiki/Fallout:_New_Vegas_achievements_and_trophies
|
||||
(rr* #rx"(<td[^>]*>\n?)(<li>)" "\\1<ul>\\2")
|
||||
; change <figcaption><p> to <figcaption><span> to make the parser happy
|
||||
(rr* #rx"(<figcaption[^>]*>)[ \t]*<p class=\"caption\">([^<]*)</p>" "\\1<span class=\"caption\">\\2</span>"))
|
||||
html))
|
||||
(module+ test
|
||||
(check-equal? (preprocess-html-wiki "<td class=\"va-navbox-column\" style=\"width: 33%\">\n<li>Hey</li>")
|
||||
"<td class=\"va-navbox-column\" style=\"width: 33%\">\n<ul><li>Hey</li>")
|
||||
(check-equal? (preprocess-html-wiki "<figure class=\"thumb tright\" style=\"width: 150px\"><a class=\"image\"><img></a><noscript><a><img></a></noscript><figcaption class=\"thumbcaption\"> <p class=\"caption\">Caption text.</p></figcaption></figure>")
|
||||
"<figure class=\"thumb tright\" style=\"width: 150px\"><a class=\"image\"><img></a><noscript><a><img></a></noscript><figcaption class=\"thumbcaption\"><span class=\"caption\">Caption text.</span></figcaption></figure>"))
|
||||
|
||||
(module+ test
|
||||
(require rackunit
|
||||
"html-parsing/main.rkt")
|
||||
(define wiki-document
|
||||
'(*TOP*
|
||||
(div (@ (class "mw-parser-output"))
|
||||
(aside (@ (role "region") (class "portable-infobox pi-theme-wikia pi-layout-default"))
|
||||
(h2 (@ (class "pi-item pi-title") (data-source "title"))
|
||||
"Infobox Title")
|
||||
(figure (@ (class "pi-item pi-image") (data-source "image"))
|
||||
(a (@ (href "https://static.wikia.nocookie.net/nice-image.png") (class "image image-thumbnail") (title ""))
|
||||
(img (@ (src "https://static.wikia.nocookie.net/nice-image-thumbnail.png") (class "pi-image-thumbnail")))))
|
||||
(div (@ (class "pi-item pi-data") (data-source "description"))
|
||||
(h3 (@ (class "pi-data-label"))
|
||||
"Description")
|
||||
(div (@ (class "pi-data-value"))
|
||||
"Mystery infobox!")))
|
||||
(div (@ (data-test-collapsesection) (class "collapsible collapsetoggle-inline collapsed"))
|
||||
(i (b "This section is hidden for dramatic effect."))
|
||||
(div (@ (class "collapsible-content"))
|
||||
(p "Another page link: "
|
||||
(a (@ (data-test-wikilink) (href "https://test.fandom.com/wiki/Another_Page") (title "Another Page"))
|
||||
"Another Page"))))
|
||||
(figure (@ (class "thumb tnone"))
|
||||
(a (@ (href "https://static.wikia.nocookie.net/nice-image.png") (class "image") (data-test-figure-a))
|
||||
(img (@ (src "%3D%3D")
|
||||
(data-src "https://static.wikia.nocookie.net/nice-image-thumbnail.png")
|
||||
(class "thumbimage lazyload"))))
|
||||
(noscript
|
||||
(a (@ (href "https://static.wikia.nocookie.net/nice-image.png") (class "image"))
|
||||
(img (@ (src "https://static.wikia.nocookie.net/nice-image-thumbnail.png")
|
||||
(data-src "https://static.wikia.nocookie.net/nice-image-thumbnail.png")
|
||||
(class "thumbimage")))))
|
||||
(figcaption "Test figure!"))
|
||||
(iframe (@ (src "https://example.com/iframe-src")))))))
|
||||
|
||||
(define (updater wikiname #:strict-proxy? [strict-proxy? #f])
|
||||
(define classlist-updater
|
||||
(compose1
|
||||
; uncollapse all navbox items (bottom of page mass navigation)
|
||||
(curry u
|
||||
(λ (classlist) (and ; removed due to scoping, would improve peformance (eq? element-type 'table)
|
||||
(member "navbox" classlist)
|
||||
(member "collapsed" classlist)))
|
||||
(λ (classlist) (filter (curry (negate equal?) "collapsed") classlist)))
|
||||
; uncollapse portable-infobox sections
|
||||
(curry u
|
||||
(λ (classlist) (and ; removed due to scoping, would improve performance (eq? element-type 'section)
|
||||
(member "pi-collapse" classlist)))
|
||||
(λ (classlist) (filter (λ (v)
|
||||
(and (not (equal? v "pi-collapse-closed"))
|
||||
(not (equal? v "pi-collapse"))))
|
||||
classlist)))
|
||||
; generic: includes article sections and tables, probably more
|
||||
(curry u
|
||||
(λ (classlist) (and (member "collapsible" classlist)
|
||||
(member "collapsed" classlist)))
|
||||
(λ (classlist) (filter (curry (negate equal?) "collapsed") classlist)))))
|
||||
|
||||
(define ((string-replace-curried from to) str)
|
||||
(string-replace str from to))
|
||||
|
||||
(define class-updater
|
||||
(compose1
|
||||
(string-replace-curried " collapsed" "")
|
||||
(string-replace-curried "pi-collapse-closed" "")
|
||||
(string-replace-curried "pi-collapse" "")))
|
||||
|
||||
(define attributes-updater
|
||||
(compose1
|
||||
; uncollapsing
|
||||
#;(curry attribute-maybe-update 'class
|
||||
(λ (class) (string-join (classlist-updater (string-split class " ")) " ")))
|
||||
(curry attribute-maybe-update 'class class-updater)
|
||||
; change links to stay on the same wiki
|
||||
(curry attribute-maybe-update 'href
|
||||
(λ (href)
|
||||
((compose1
|
||||
(λ (href) (regexp-replace #rx"^(/wiki/.*)" href (format "/~a\\1" wikiname)))
|
||||
(λ (href) (regexp-replace (pregexp (format "^https://(~a)\\.fandom\\.com(/wiki/.*)" px-wikiname)) href "/\\1\\2")))
|
||||
href)))
|
||||
; add noreferrer to a.image
|
||||
(curry u
|
||||
(λ (v) (and #;(eq? element-type 'a)
|
||||
(has-class? "image" v)))
|
||||
(λ (v) (dict-update v 'rel (λ (s)
|
||||
(list (string-append (car s) " noreferrer")))
|
||||
'(""))))
|
||||
; proxy images from inline styles, if strict_proxy is set
|
||||
(curry u
|
||||
(λ (v) strict-proxy?)
|
||||
(λ (v) (attribute-maybe-update
|
||||
'style
|
||||
(λ (style)
|
||||
(regexp-replace #rx"url\\(['\"]?(.*?)['\"]?\\)" style
|
||||
(λ (whole url)
|
||||
(string-append
|
||||
"url("
|
||||
(u-proxy-url url)
|
||||
")")))) v)))
|
||||
; and also their links, if strict_proxy is set
|
||||
(curry u
|
||||
(λ (v)
|
||||
(and strict-proxy?
|
||||
#;(eq? element-type 'a)
|
||||
(or (has-class? "image-thumbnail" v)
|
||||
(has-class? "image" v))))
|
||||
(λ (v) (attribute-maybe-update 'href u-proxy-url v)))
|
||||
; proxy images from src attributes, if strict_proxy is set
|
||||
(curry u
|
||||
(λ (v) strict-proxy?)
|
||||
(λ (v) (attribute-maybe-update 'src u-proxy-url v)))
|
||||
; don't lazyload images
|
||||
(curry u
|
||||
(λ (v) (dict-has-key? v 'data-src))
|
||||
(λ (v) (attribute-maybe-update 'src (λ (_) (car (dict-ref v 'data-src))) v)))
|
||||
; don't use srcset - TODO: use srcset?
|
||||
(λ (v) (dict-remove v 'srcset))))
|
||||
|
||||
(define (children-updater attributes children)
|
||||
; more uncollapsing - sample: bandori/wiki/BanG_Dream!_Wikia
|
||||
((λ (children)
|
||||
(u
|
||||
(λ (v) (has-class? "mw-collapsible-content" attributes))
|
||||
(λ (v) (for/list ([element v])
|
||||
(u (λ (element) (pair? element))
|
||||
(λ (element)
|
||||
`(,(car element)
|
||||
(@ ,@(attribute-maybe-update 'style (λ (a) (regexp-replace #rx"display: *none" a "display:inline")) (bits->attributes element)))
|
||||
,@(filter element-is-content? (cdr element))))
|
||||
element)))
|
||||
children))
|
||||
; wrap blinking animated images in a slot so they can be animated with CSS
|
||||
((λ (children)
|
||||
(u
|
||||
(λ (v) (and (has-class? "animated" attributes)
|
||||
((length v) . > . 1)))
|
||||
(λ (v)
|
||||
`((span (@ (class "animated-slot__outer") (style ,(format "--steps: ~a" (length v))))
|
||||
(span (@ (class "animated-slot__inner"))
|
||||
,@v))))
|
||||
children))
|
||||
children)))
|
||||
|
||||
(define (updater element element-type attributes children)
|
||||
;; replace whole element?
|
||||
(cond
|
||||
; wrap tables in a div.table-scroller
|
||||
[(and (eq? element-type 'table)
|
||||
(has-class? "wikitable" attributes)
|
||||
(not (dict-has-key? attributes 'data-scrolling)))
|
||||
`(div
|
||||
((class "table-scroller"))
|
||||
((,element-type (@ (data-scrolling) ,@attributes)
|
||||
,@children)))]
|
||||
; exclude empty figcaptions
|
||||
[(and (eq? element-type 'figcaption)
|
||||
(or (eq? (length (filter element-is-element? children)) 0)
|
||||
((query-selector (λ (element-type attributes children)
|
||||
(eq? element-type 'use))
|
||||
element))))
|
||||
return-no-element]
|
||||
; exclude infobox items that are videos, and gallery items that are videos
|
||||
[(and (or (has-class? "pi-item" attributes)
|
||||
(has-class? "wikia-gallery-item" attributes))
|
||||
((query-selector (λ (element-type attributes children)
|
||||
(has-class? "video-thumbnail" attributes))
|
||||
element)))
|
||||
return-no-element]
|
||||
; exclude the invisible brackets after headings
|
||||
[(and (eq? element-type 'span)
|
||||
(has-class? "mw-editsection" attributes))
|
||||
return-no-element]
|
||||
; display a link instead of an iframe
|
||||
[(eq? element-type 'iframe)
|
||||
(define src (car (dict-ref attributes 'src null)))
|
||||
`(a
|
||||
((class "iframe-alternative") (href ,src))
|
||||
(,(format "Embedded media: ~a" src)))]
|
||||
; remove noscript versions of images because they are likely lower quality than the script versions
|
||||
[(and (eq? element-type 'noscript)
|
||||
(match children
|
||||
; either the noscript has a.image as a first child...
|
||||
[(list (list 'a (list '@ a-att ...) _)) (has-class? "image" a-att)]
|
||||
; or the noscript has img as a first child
|
||||
[(list (list 'img _)) #t]
|
||||
[_ #f]))
|
||||
return-no-element]
|
||||
[#t
|
||||
(list element-type
|
||||
;; attributes
|
||||
(attributes-updater #; element-type attributes)
|
||||
;; children
|
||||
(children-updater attributes children))]))
|
||||
|
||||
updater)
|
||||
|
||||
(define (update-tree-wiki tree wikiname #:strict-proxy? [strict-proxy? #f])
|
||||
(update-tree (updater wikiname #:strict-proxy? strict-proxy?) tree))
|
||||
|
||||
(module+ test
|
||||
(define transformed
|
||||
(update-tree-wiki wiki-document "test" #:strict-proxy? #t))
|
||||
; check that wikilinks are changed to be local
|
||||
(check-equal? (get-attribute 'href (bits->attributes
|
||||
((query-selector
|
||||
(λ (t a c) (dict-has-key? a 'data-test-wikilink))
|
||||
transformed))))
|
||||
"/test/wiki/Another_Page")
|
||||
; check that a.image has noreferrer
|
||||
(check-equal? (get-attribute 'rel (bits->attributes
|
||||
((query-selector
|
||||
(λ (t a c) (and (eq? t 'a)
|
||||
(has-class? "image" a)))
|
||||
transformed))))
|
||||
" noreferrer")
|
||||
; check that article collapse sections become uncollapsed
|
||||
(check-equal? (get-attribute 'class (bits->attributes
|
||||
((query-selector
|
||||
(λ (t a c) (dict-has-key? a 'data-test-collapsesection))
|
||||
transformed))))
|
||||
"collapsible collapsetoggle-inline")
|
||||
; check that iframes are gone
|
||||
(check-false ((query-selector (λ (t a c) (eq? t 'iframe)) transformed)))
|
||||
(check-equal? (let* ([alternative ((query-selector (λ (t a c) (has-class? "iframe-alternative" a)) transformed))]
|
||||
[link ((query-selector (λ (t a c) (eq? t 'a)) alternative))])
|
||||
(get-attribute 'href (bits->attributes link)))
|
||||
"https://example.com/iframe-src")
|
||||
; check that images are proxied
|
||||
(check-equal? (get-attribute 'src (bits->attributes
|
||||
((query-selector
|
||||
(λ (t a c) (eq? t 'img))
|
||||
transformed))))
|
||||
"/proxy?dest=https%3A%2F%2Fstatic.wikia.nocookie.net%2Fnice-image-thumbnail.png")
|
||||
; check that links to images are proxied
|
||||
(check-equal? (get-attribute 'href (bits->attributes
|
||||
((query-selector
|
||||
(λ (t a c) (and (eq? t 'a) (has-class? "image-thumbnail" a)))
|
||||
transformed))))
|
||||
"/proxy?dest=https%3A%2F%2Fstatic.wikia.nocookie.net%2Fnice-image.png")
|
||||
(check-equal? (get-attribute 'href (bits->attributes
|
||||
((query-selector
|
||||
(λ (t a c) (member '(data-test-figure-a) a))
|
||||
transformed))))
|
||||
"/proxy?dest=https%3A%2F%2Fstatic.wikia.nocookie.net%2Fnice-image.png")
|
||||
; check that noscript images are removed
|
||||
(check-equal? ((query-selector (λ (t a c) (eq? t 'noscript)) transformed)) #f)
|
||||
; benchmark
|
||||
(when (file-exists? "../misc/Frog.html")
|
||||
(with-input-from-file "../misc/Frog.html"
|
||||
(λ ()
|
||||
(define tree (html->xexp (current-input-port)))
|
||||
(time (length (update-tree-wiki tree "minecraft")))))))
|
11
lib/typed-rackunit.rkt
Normal file
11
lib/typed-rackunit.rkt
Normal file
|
@ -0,0 +1,11 @@
|
|||
#lang typed/racket/base
|
||||
|
||||
(provide
|
||||
check-equal?
|
||||
check-true
|
||||
check-false)
|
||||
|
||||
(require/typed rackunit
|
||||
[check-equal? (Any Any -> Void)]
|
||||
[check-true (Any -> Void)]
|
||||
[check-false (Any -> Void)])
|
108
lib/url-utils.rkt
Normal file
108
lib/url-utils.rkt
Normal file
|
@ -0,0 +1,108 @@
|
|||
#lang typed/racket/base
|
||||
(require racket/string
|
||||
"pure-utils.rkt")
|
||||
(require/typed web-server/http/request-structs
|
||||
[#:opaque Header header?])
|
||||
|
||||
(provide
|
||||
; regex to match wiki names
|
||||
px-wikiname
|
||||
; make a query string from an association list of strings
|
||||
params->query
|
||||
; custom percent encoding (you probably want params->query instead)
|
||||
percent-encode
|
||||
; sets for custom percent encoding
|
||||
path-set urlencoded-set filename-set
|
||||
; make a proxied version of a fandom url
|
||||
u-proxy-url
|
||||
; check whether a url is on a domain controlled by fandom
|
||||
is-fandom-url?
|
||||
; pass in a header, headers, or something useless. they'll all combine into a list
|
||||
build-headers
|
||||
; try to follow wikimedia's format for which characters should be encoded/replaced in page titles for the url
|
||||
page-title->path)
|
||||
|
||||
(module+ test
|
||||
(require "typed-rackunit.rkt"))
|
||||
|
||||
(define px-wikiname "[a-zA-Z0-9-]{1,50}")
|
||||
|
||||
;; https://url.spec.whatwg.org/#urlencoded-serializing
|
||||
|
||||
(define path-set '(#\; ; semicolon is part of the userinfo set in the URL standard, but I'm putting it here
|
||||
#\? #\` #\{ #\} ; path set
|
||||
#\ #\" #\# #\< #\> ; query set
|
||||
; c0 controls included elsewhere
|
||||
; higher ranges included elsewhere
|
||||
))
|
||||
(define urlencoded-set (append
|
||||
'(#\! #\' #\( #\) #\~ ; urlencoded set
|
||||
#\$ #\% #\& #\+ #\, ; component set
|
||||
#\/ #\: #\= #\@ #\[ #\\ #\] #\^ #\| ; userinfo set
|
||||
)
|
||||
path-set))
|
||||
|
||||
(define filename-set '(#\< #\> #\: #\" #\/ #\\ #\| #\? #\* #\# #\~ #\&))
|
||||
|
||||
(: percent-encode (String (Listof Char) Boolean -> Bytes))
|
||||
(define (percent-encode value set space-as-plus)
|
||||
(define b (string->bytes/utf-8 value))
|
||||
(apply bytes-append
|
||||
(for/list ([char b]) : (Listof Bytes)
|
||||
(cond
|
||||
[(and space-as-plus (eq? char 32))
|
||||
#"+"]
|
||||
[(or (member (integer->char char) set)
|
||||
(char . > . #x7E)
|
||||
(char . <= . #x1F))
|
||||
(bytes-append #"%" (string->bytes/latin-1
|
||||
(string-upcase (number->string char 16))))]
|
||||
[#t
|
||||
(bytes char)]))))
|
||||
|
||||
(: params->query ((Listof (Pair String String)) -> String))
|
||||
(define (params->query params)
|
||||
(string-join
|
||||
(map (λ ([p : (Pair String String)])
|
||||
(format "~a=~a"
|
||||
(percent-encode (car p) urlencoded-set #t)
|
||||
(percent-encode (cdr p) urlencoded-set #t)))
|
||||
params)
|
||||
"&"))
|
||||
(module+ test
|
||||
(check-equal? (params->query '(("hello" . "world")))
|
||||
"hello=world")
|
||||
(check-equal? (params->query '(("a" . "hello world''") ("utf8" . "✓")))
|
||||
"a=hello+world%27%27&utf8=%E2%9C%93"))
|
||||
|
||||
(: is-fandom-url? (String -> Boolean))
|
||||
(define (is-fandom-url? url)
|
||||
(regexp-match? (pregexp (format "^https://static\\.wikia\\.nocookie\\.net/|^https://~a\\.fandom\\.com/" px-wikiname)) url))
|
||||
(module+ test
|
||||
(check-true (is-fandom-url? "https://static.wikia.nocookie.net/wikiname/images/2/2f/SomeImage.jpg/revision/latest?cb=20110210094136"))
|
||||
(check-true (is-fandom-url? "https://test.fandom.com/wiki/Some_Page"))
|
||||
(check-false (is-fandom-url? "https://cadence.moe")))
|
||||
|
||||
(: u-proxy-url (String -> String))
|
||||
(define (u-proxy-url url)
|
||||
(u
|
||||
is-fandom-url?
|
||||
(λ ([v : String]) (string-append "/proxy?" (params->query `(("dest" . ,url)))))
|
||||
url))
|
||||
|
||||
(: build-headers ((U Header (Listof Header) False Void) * -> (Listof Header)))
|
||||
(define (build-headers . fs)
|
||||
(apply
|
||||
append
|
||||
(map (λ ([f : (U Header (Listof Header) False Void)])
|
||||
(cond
|
||||
[(not f) null]
|
||||
[(void? f) null]
|
||||
[(null? f) null]
|
||||
[(header? f) (list f)]
|
||||
[(pair? f) f]))
|
||||
fs)))
|
||||
|
||||
(: page-title->path (String -> Bytes))
|
||||
(define (page-title->path title)
|
||||
(percent-encode (regexp-replace* " " title "_") path-set #f))
|
217
lib/xexpr-utils.rkt
Normal file
217
lib/xexpr-utils.rkt
Normal file
|
@ -0,0 +1,217 @@
|
|||
#lang racket/base
|
||||
(require racket/dict
|
||||
racket/function
|
||||
racket/generator
|
||||
racket/match
|
||||
racket/string
|
||||
(only-in json-pointer json-pointer-value)
|
||||
(only-in web-server/http response/output)
|
||||
"pure-utils.rkt")
|
||||
|
||||
(provide
|
||||
;; with whole xexprs
|
||||
; xexpr for an "empty" element, which in reality uses <template>
|
||||
return-no-element
|
||||
; query a tree for elements matching a condition
|
||||
query-selector
|
||||
; update a tree with a function called on each element
|
||||
update-tree
|
||||
|
||||
;; with bits of xexprs
|
||||
; predicates
|
||||
element-is-bits?
|
||||
element-is-xattributes?
|
||||
element-is-element?
|
||||
element-is-content?
|
||||
|
||||
;; with attributes
|
||||
; find the attributes in some bits of an element
|
||||
bits->attributes
|
||||
; get attribute value from some attributes
|
||||
get-attribute
|
||||
; update an attribute if it is present (otherwise no change)
|
||||
attribute-maybe-update
|
||||
; make an attribute selector for use in query-selector
|
||||
attribute-selector
|
||||
; do these attributes have a certain value in their class?
|
||||
has-class?
|
||||
|
||||
;; with json
|
||||
; get value in json structure using a *j*son *p*ointer, optionally with default value for not present
|
||||
jp
|
||||
|
||||
; error catching for http responses
|
||||
response-handler)
|
||||
|
||||
(module+ test
|
||||
(require rackunit)
|
||||
(define demo-attributes
|
||||
'(span (@ (title "Inside joke."))
|
||||
"To get to the other side."
|
||||
(@ (style "color: blue"))))
|
||||
(define demo-document
|
||||
'(html
|
||||
(@ (lang "en"))
|
||||
(head
|
||||
(title "Hello world!"))
|
||||
(body
|
||||
(h1 "Hello world!")
|
||||
(p "Welcome to my "
|
||||
(span (@ (style "color: yellow")
|
||||
(title "Really."))
|
||||
(em "cool"))
|
||||
"website.")))))
|
||||
|
||||
; replacing with a template element removes it from the rendered document
|
||||
(define return-no-element '(template
|
||||
()
|
||||
()))
|
||||
|
||||
; "bits" is attributes or real elements (non-string)
|
||||
(define (element-is-bits? element)
|
||||
(pair? element))
|
||||
(module+ test
|
||||
(check-true (element-is-bits? '(span "hi")))
|
||||
(check-true (element-is-bits? '(@ (alt "Cute cat."))))
|
||||
(check-false (element-is-bits? "hi")))
|
||||
|
||||
; "xattributes" is attributes hugged by @
|
||||
(define (element-is-xattributes? element)
|
||||
(and (element-is-bits? element) (eq? '@ (car element))))
|
||||
(module+ test
|
||||
(check-false (element-is-xattributes? '(span "hi")))
|
||||
(check-true (element-is-xattributes? '(@ (alt "Cute cat."))))
|
||||
(check-false (element-is-xattributes? '((alt "Cute cat."))))
|
||||
(check-false (element-is-xattributes? "hi")))
|
||||
|
||||
; "element" is a real element with a type and everything (non-string, non-attributes)
|
||||
(define (element-is-element? element)
|
||||
(and (element-is-bits? element) (not (element-is-xattributes? element))))
|
||||
(module+ test
|
||||
(check-true (element-is-element? '(span "hi")))
|
||||
(check-false (element-is-element? '(@ (alt "Cute cat."))))
|
||||
(check-false (element-is-element? "hi")))
|
||||
|
||||
; "element content" is a real element or a string
|
||||
(define (element-is-content? element)
|
||||
(or (string? element) (element-is-element? element)))
|
||||
(module+ test
|
||||
(check-true (element-is-content? '(span "hi")))
|
||||
(check-false (element-is-content? '(@ (alt "Cute cat."))))
|
||||
(check-true (element-is-content? "hi")))
|
||||
|
||||
; get the actual attributes, leaving out the @ signs
|
||||
(define (xattributes->attributes xattrs)
|
||||
(filter pair? xattrs))
|
||||
|
||||
(define (bits->attributes bits)
|
||||
; (append) is a clean and general approach to finding and combining any attributes
|
||||
(xattributes->attributes (apply append (filter element-is-xattributes? bits))))
|
||||
(module+ test
|
||||
(check-equal? (bits->attributes demo-attributes)
|
||||
'((title "Inside joke.") (style "color: blue"))))
|
||||
|
||||
(define (get-attribute name attributes)
|
||||
(define a (assq name attributes))
|
||||
(if (pair? a)
|
||||
(cadr a)
|
||||
#f))
|
||||
(module+ test
|
||||
(check-equal? (get-attribute 'title (bits->attributes demo-attributes)) "Inside joke."))
|
||||
|
||||
(define (attribute-maybe-update key updater attributes)
|
||||
(alist-maybe-update attributes key (λ (v) (map updater v))))
|
||||
(module+ test
|
||||
(check-equal? (attribute-maybe-update 'a (λ (x) (+ x 10)) '((a 5) (b 6)))
|
||||
'((a 15) (b 6))))
|
||||
|
||||
(define (attribute-selector name value)
|
||||
(λ (element-type attributes children)
|
||||
(equal? (get-attribute name attributes) value)))
|
||||
|
||||
(define (query-selector selector element #:include-text? [include-text? #f])
|
||||
(generator
|
||||
()
|
||||
(let loop ([element element])
|
||||
(define element-type (car element))
|
||||
(define attributes (bits->attributes (cdr element)))
|
||||
(define children (filter element-is-element? (cdr element))) ; only recurse through real children
|
||||
(cond
|
||||
[(equal? element-type '*DECL*) #f]
|
||||
[(equal? element-type '@) #f]
|
||||
[#t
|
||||
(when (if include-text?
|
||||
(selector element-type attributes children (filter string? (cdr element)))
|
||||
(selector element-type attributes children))
|
||||
(yield element))
|
||||
(for ([child children]) (loop child))]))
|
||||
#f))
|
||||
(module+ test
|
||||
(let ([result (query-selector (attribute-selector 'title "Really.")
|
||||
demo-document)])
|
||||
(check-equal? (result) '(span (@ (style "color: yellow")
|
||||
(title "Really."))
|
||||
(em "cool")))
|
||||
(check-equal? (result) #f)))
|
||||
|
||||
(define (update-tree transformer element)
|
||||
(let loop ([element element])
|
||||
(define element-type (car element))
|
||||
(define attributes (bits->attributes (cdr element)))
|
||||
(define contents (filter element-is-content? (cdr element))) ; provide elements and strings
|
||||
(cond
|
||||
[(equal? element-type '*DECL*)
|
||||
; declarations like <!DOCTYPE html> get mapped as attributes as if the element were (*DECL* (@ (DOCTYPE) (html)))
|
||||
(match (transformer element element-type (map list (cdr element)) null)
|
||||
[(list element-type attributes contents)
|
||||
`(*DECL* ,@(map car attributes))]
|
||||
[#f ""])]
|
||||
[(member element-type '(@ &))
|
||||
; special element, do nothing
|
||||
element]
|
||||
[#t
|
||||
; regular element, transform it
|
||||
(match (transformer element element-type attributes contents)
|
||||
[(list element-type attributes contents)
|
||||
(append (list element-type)
|
||||
(if (pair? attributes) (list (append '(@) attributes)) (list))
|
||||
(map (λ (content)
|
||||
(if (element-is-element? content) (loop content) content))
|
||||
contents))])])))
|
||||
(module+ test
|
||||
; check doctype is preserved when present
|
||||
(check-equal? (update-tree (λ (e t a c) (list t a c)) '(*TOP* (*DECL* DOCTYPE html) (html (body "Hey"))))
|
||||
'(*TOP* (*DECL* DOCTYPE html) (html (body "Hey"))))
|
||||
; check doctype can be removed if desirable
|
||||
(check-equal? (update-tree (λ (e t a c) (if (eq? t '*DECL*) #f (list t a c))) '(*TOP* (*DECL* DOCTYPE html) (html (body "Hey"))))
|
||||
'(*TOP* "" (html (body "Hey"))))
|
||||
; check (& x) sequences are preserved
|
||||
(check-equal? (update-tree (λ (e t a c) (list t a c)) '(body "Hey" (& nbsp) (a (@ (href "/")))))
|
||||
'(body "Hey" (& nbsp) (a (@ (href "/"))))))
|
||||
|
||||
(define (has-class? name attributes)
|
||||
(and (member name (string-split (or (get-attribute 'class attributes) "") " ")) #t))
|
||||
(module+ test
|
||||
(check-true (has-class? "red" '((class "yellow red blue"))))
|
||||
(check-false (has-class? "red" '((class "yellow blue"))))
|
||||
(check-false (has-class? "red" '((title "Inside joke.")))))
|
||||
|
||||
(define (jp pointer document [else null])
|
||||
(with-handlers ([exn:fail:contract? (λ (exn) (cond
|
||||
[(null? else) (raise exn)]
|
||||
[(procedure? else) (else)]
|
||||
[#t else]))])
|
||||
(json-pointer-value pointer document)))
|
||||
|
||||
(define-syntax-rule (response-handler body ...)
|
||||
(with-handlers ([exn:fail? (λ (e)
|
||||
(response/output
|
||||
#:code 500
|
||||
#:mime-type #"text/plain"
|
||||
(λ (out)
|
||||
(for ([port (list (current-error-port) out)])
|
||||
(parameterize ([current-error-port port])
|
||||
(with-handlers ([exn:fail? (λ (e) (void))])
|
||||
(displayln "Exception raised in Racket code at response generation time:" (current-error-port))
|
||||
((error-display-handler) (exn-message e) e)))))))])
|
||||
body ...))
|
Loading…
Add table
Add a link
Reference in a new issue