diff --git a/src/page-wiki.rkt b/src/page-wiki.rkt
index 2cd14dc..6f22ad8 100644
--- a/src/page-wiki.rkt
+++ b/src/page-wiki.rkt
@@ -19,6 +19,7 @@
"data.rkt"
"pure-utils.rkt"
"syntax.rkt"
+ "tree-updater.rkt"
"xexpr-utils.rkt"
"url-utils.rkt")
@@ -30,51 +31,19 @@
preprocess-html-wiki)
(module+ test
- (require rackunit)
- (define wiki-document
- '(*TOP*
- (div (@ (class "mw-parser-output"))
- (aside (@ (role "region") (class "portable-infobox pi-theme-wikia pi-layout-default"))
- (h2 (@ (class "pi-item pi-title") (data-source "title"))
- "Infobox Title")
- (figure (@ (class "pi-item pi-image") (data-source "image"))
- (a (@ (href "https://static.wikia.nocookie.net/nice-image.png") (class "image image-thumbnail") (title ""))
- (img (@ (src "https://static.wikia.nocookie.net/nice-image-thumbnail.png") (class "pi-image-thumbnail")))))
- (div (@ (class "pi-item pi-data") (data-source "description"))
- (h3 (@ (class "pi-data-label"))
- "Description")
- (div (@ (class "pi-data-value"))
- "Mystery infobox!")))
- (div (@ (data-test-collapsesection) (class "collapsible collapsetoggle-inline collapsed"))
- (i (b "This section is hidden for dramatic effect."))
- (div (@ (class "collapsible-content"))
- (p "Another page link: "
- (a (@ (data-test-wikilink) (href "https://test.fandom.com/wiki/Another_Page") (title "Another Page"))
- "Another Page"))))
- (figure (@ (class "thumb tnone"))
- (a (@ (href "https://static.wikia.nocookie.net/nice-image.png") (class "image") (data-test-figure-a))
- (img (@ (src "data:image/gif;base64,R0lGODlhAQABAIABAAAAAP///yH5BAEAAAEALAAAAAABAAEAQAICTAEAOw%3D%3D")
- (data-src "https://static.wikia.nocookie.net/nice-image-thumbnail.png")
- (class "thumbimage lazyload"))))
- (noscript
- (a (@ (href "https://static.wikia.nocookie.net/nice-image.png") (class "image"))
- (img (@ (src "https://static.wikia.nocookie.net/nice-image-thumbnail.png")
- (data-src "https://static.wikia.nocookie.net/nice-image-thumbnail.png")
- (class "thumbimage")))))
- (figcaption "Test figure!"))
- (iframe (@ (src "https://example.com/iframe-src")))))))
+ (require rackunit))
(define (preprocess-html-wiki html)
- (define (rr* find replace contents)
+ (define ((rr* find replace) contents)
(regexp-replace* find contents replace))
((compose1
; fix navbox list nesting
; navbox on right of page has incorrect html "
" and the xexpr parser puts the much further up the tree
; add a to make the parser happy
; usage: /fallout/wiki/Fallout:_New_Vegas_achievements_and_trophies
- (curry rr* #rx"(]*>\n?)(- )" "\\1
\\2")
+ (rr* #rx"(]*>\n?)(- )" "\\1
\\2")
; change to to make the parser happy
- (curry rr* #rx"(]*>)[ \t]*([^<]*) " "\\1\\2"))
+ (rr* #rx"(]*>)[ \t]*([^<]*) " "\\1\\2"))
html))
(module+ test
(check-equal? (preprocess-html-wiki "\n- Hey
")
@@ -82,198 +51,6 @@
(check-equal? (preprocess-html-wiki "")
""))
-(define (update-tree-wiki tree wikiname)
- (update-tree
- (λ (element element-type attributes children)
- ;; replace whole element?
- (cond
- ; wrap tables in a div.table-scroller
- [(and (eq? element-type 'table)
- (has-class? "wikitable" attributes)
- (not (dict-has-key? attributes 'data-scrolling)))
- `(div
- ((class "table-scroller"))
- ((,element-type (@ (data-scrolling) ,@attributes)
- ,@children)))]
- ; exclude empty figcaptions
- [(and (eq? element-type 'figcaption)
- (or (eq? (length (filter element-is-element? children)) 0)
- ((query-selector (λ (element-type attributes children)
- (eq? element-type 'use))
- element))))
- return-no-element]
- ; exclude infobox items that are videos, and gallery items that are videos
- [(and (or (has-class? "pi-item" attributes)
- (has-class? "wikia-gallery-item" attributes))
- ((query-selector (λ (element-type attributes children)
- (has-class? "video-thumbnail" attributes))
- element)))
- return-no-element]
- ; exclude the invisible brackets after headings
- [(and (eq? element-type 'span)
- (has-class? "mw-editsection" attributes))
- return-no-element]
- ; display a link instead of an iframe
- [(eq? element-type 'iframe)
- (define src (car (dict-ref attributes 'src null)))
- `(a
- ((class "iframe-alternative") (href ,src))
- (,(format "Embedded media: ~a" src)))]
- ; remove noscript versions of images because they are likely lower quality than the script versions
- [(and (eq? element-type 'noscript)
- (match children
- ; either the noscript has a.image as a first child...
- [(list (list 'a (list '@ a-att ...) _)) (has-class? "image" a-att)]
- ; or the noscript has img as a first child
- [(list (list 'img _)) #t]
- [_ #f]))
- return-no-element]
- [#t
- (list element-type
- ;; attributes
- ((compose1
- ; uncollapsing
- (curry attribute-maybe-update 'class
- (λ (class)
- (string-join
- ((compose1
- ; uncollapse all navbox items (bottom of page mass navigation)
- (curry u
- (λ (classlist) (and (eq? element-type 'table)
- (member "navbox" classlist)
- (member "collapsed" classlist)))
- (λ (classlist) (filter (curry (negate equal?) "collapsed") classlist)))
- ; uncollapse portable-infobox sections
- (curry u
- (λ (classlist) (and (eq? element-type 'section)
- (member "pi-collapse" classlist)))
- (λ (classlist) (filter (λ (v)
- (and (not (equal? v "pi-collapse-closed"))
- (not (equal? v "pi-collapse"))))
- classlist)))
- ; generic: includes article sections and tables, probably more
- (curry u
- (λ (classlist) (and (member "collapsible" classlist)
- (member "collapsed" classlist)))
- (λ (classlist) (filter (curry (negate equal?) "collapsed") classlist))))
- (string-split class " "))
- " ")))
- ; change links to stay on the same wiki
- (curry attribute-maybe-update 'href
- (λ (href)
- ((compose1
- (λ (href) (regexp-replace #rx"^(/wiki/.*)" href (format "/~a\\1" wikiname)))
- (λ (href) (regexp-replace (pregexp (format "^https://(~a)\\.fandom\\.com(/wiki/.*)" px-wikiname)) href "/\\1\\2")))
- href)))
- ; add noreferrer to a.image
- (curry u
- (λ (v) (and (eq? element-type 'a)
- (has-class? "image" v)))
- (λ (v) (dict-update v 'rel (λ (s)
- (list (string-append (car s) " noreferrer")))
- '(""))))
- ; proxy images from inline styles, if strict_proxy is set
- (curry u
- (λ (v) (config-true? 'strict_proxy))
- (λ (v) (attribute-maybe-update
- 'style
- (λ (style)
- (regexp-replace #rx"url\\(['\"]?(.*?)['\"]?\\)" style
- (λ (whole url)
- (string-append
- "url("
- (u-proxy-url url)
- ")")))) v)))
- ; and also their links, if strict_proxy is set
- (curry u
- (λ (v)
- (and (config-true? 'strict_proxy)
- (eq? element-type 'a)
- (or (has-class? "image-thumbnail" v)
- (has-class? "image" v))))
- (λ (v) (attribute-maybe-update 'href u-proxy-url v)))
- ; proxy images from src attributes, if strict_proxy is set
- (curry u
- (λ (v) (config-true? 'strict_proxy))
- (λ (v) (attribute-maybe-update 'src u-proxy-url v)))
- ; don't lazyload images
- (curry u
- (λ (v) (dict-has-key? v 'data-src))
- (λ (v) (attribute-maybe-update 'src (λ (_) (car (dict-ref v 'data-src))) v)))
- ; don't use srcset - TODO: use srcset?
- (λ (v) (dict-remove v 'srcset)))
- attributes)
- ;; children
- ((compose1
- ; more uncollapsing - sample: bandori/wiki/BanG_Dream!_Wikia
- (curry u
- (λ (v) (has-class? "mw-collapsible-content" attributes))
- (λ (v) (for/list ([element v])
- (u (λ (element) (pair? element))
- (λ (element)
- `(,(car element)
- (@ ,@(attribute-maybe-update 'style (λ (a) (regexp-replace #rx"display: *none" a "display:inline")) (bits->attributes element)))
- ,@(filter element-is-content? (cdr element))))
- element))))
- ; wrap blinking animated images in a slot so they can be animated with CSS
- (curry u
- (λ (v) (and (has-class? "animated" attributes)
- ((length v) . > . 1)))
- (λ (v)
- `((span (@ (class "animated-slot__outer") (style ,(format "--steps: ~a" (length v))))
- (span (@ (class "animated-slot__inner"))
- ,@v))))))
- children))]))
- tree))
-(module+ test
- (define transformed
- (parameterize ([(config-parameter 'strict_proxy) "true"])
- (update-tree-wiki wiki-document "test")))
- ; check that wikilinks are changed to be local
- (check-equal? (get-attribute 'href (bits->attributes
- ((query-selector
- (λ (t a c) (dict-has-key? a 'data-test-wikilink))
- transformed))))
- "/test/wiki/Another_Page")
- ; check that a.image has noreferrer
- (check-equal? (get-attribute 'rel (bits->attributes
- ((query-selector
- (λ (t a c) (and (eq? t 'a)
- (has-class? "image" a)))
- transformed))))
- " noreferrer")
- ; check that article collapse sections become uncollapsed
- (check-equal? (get-attribute 'class (bits->attributes
- ((query-selector
- (λ (t a c) (dict-has-key? a 'data-test-collapsesection))
- transformed))))
- "collapsible collapsetoggle-inline")
- ; check that iframes are gone
- (check-false ((query-selector (λ (t a c) (eq? t 'iframe)) transformed)))
- (check-equal? (let* ([alternative ((query-selector (λ (t a c) (has-class? "iframe-alternative" a)) transformed))]
- [link ((query-selector (λ (t a c) (eq? t 'a)) alternative))])
- (get-attribute 'href (bits->attributes link)))
- "https://example.com/iframe-src")
- ; check that images are proxied
- (check-equal? (get-attribute 'src (bits->attributes
- ((query-selector
- (λ (t a c) (eq? t 'img))
- transformed))))
- "/proxy?dest=https%3A%2F%2Fstatic.wikia.nocookie.net%2Fnice-image-thumbnail.png")
- ; check that links to images are proxied
- (check-equal? (get-attribute 'href (bits->attributes
- ((query-selector
- (λ (t a c) (and (eq? t 'a) (has-class? "image-thumbnail" a)))
- transformed))))
- "/proxy?dest=https%3A%2F%2Fstatic.wikia.nocookie.net%2Fnice-image.png")
- (check-equal? (get-attribute 'href (bits->attributes
- ((query-selector
- (λ (t a c) (member '(data-test-figure-a) a))
- transformed))))
- "/proxy?dest=https%3A%2F%2Fstatic.wikia.nocookie.net%2Fnice-image.png")
- ; check that noscript images are removed
- (check-equal? ((query-selector (λ (t a c) (eq? t 'noscript)) transformed)) #f))
-
(define (page-wiki req)
(define wikiname (path/param-path (first (url-path (request-uri req)))))
(define user-cookies (user-cookies-getter req))
diff --git a/src/tree-updater.rkt b/src/tree-updater.rkt
new file mode 100644
index 0000000..815ef8a
--- /dev/null
+++ b/src/tree-updater.rkt
@@ -0,0 +1,267 @@
+#lang racket/base
+(require racket/dict
+ racket/function
+ racket/match
+ racket/string
+ "config.rkt"
+ "pure-utils.rkt"
+ "url-utils.rkt"
+ "xexpr-utils.rkt")
+
+(provide
+ update-tree-wiki)
+
+(module+ test
+ (require rackunit
+ html-parsing)
+ (define wiki-document
+ '(*TOP*
+ (div (@ (class "mw-parser-output"))
+ (aside (@ (role "region") (class "portable-infobox pi-theme-wikia pi-layout-default"))
+ (h2 (@ (class "pi-item pi-title") (data-source "title"))
+ "Infobox Title")
+ (figure (@ (class "pi-item pi-image") (data-source "image"))
+ (a (@ (href "https://static.wikia.nocookie.net/nice-image.png") (class "image image-thumbnail") (title ""))
+ (img (@ (src "https://static.wikia.nocookie.net/nice-image-thumbnail.png") (class "pi-image-thumbnail")))))
+ (div (@ (class "pi-item pi-data") (data-source "description"))
+ (h3 (@ (class "pi-data-label"))
+ "Description")
+ (div (@ (class "pi-data-value"))
+ "Mystery infobox!")))
+ (div (@ (data-test-collapsesection) (class "collapsible collapsetoggle-inline collapsed"))
+ (i (b "This section is hidden for dramatic effect."))
+ (div (@ (class "collapsible-content"))
+ (p "Another page link: "
+ (a (@ (data-test-wikilink) (href "https://test.fandom.com/wiki/Another_Page") (title "Another Page"))
+ "Another Page"))))
+ (figure (@ (class "thumb tnone"))
+ (a (@ (href "https://static.wikia.nocookie.net/nice-image.png") (class "image") (data-test-figure-a))
+ (img (@ (src "data:image/gif;base64,R0lGODlhAQABAIABAAAAAP///yH5BAEAAAEALAAAAAABAAEAQAICTAEAOw%3D%3D")
+ (data-src "https://static.wikia.nocookie.net/nice-image-thumbnail.png")
+ (class "thumbimage lazyload"))))
+ (noscript
+ (a (@ (href "https://static.wikia.nocookie.net/nice-image.png") (class "image"))
+ (img (@ (src "https://static.wikia.nocookie.net/nice-image-thumbnail.png")
+ (data-src "https://static.wikia.nocookie.net/nice-image-thumbnail.png")
+ (class "thumbimage")))))
+ (figcaption "Test figure!"))
+ (iframe (@ (src "https://example.com/iframe-src")))))))
+
+(define (updater wikiname)
+ (define classlist-updater
+ (compose1
+ ; uncollapse all navbox items (bottom of page mass navigation)
+ (curry u
+ (λ (classlist) (and ; removed due to scoping, would improve peformance (eq? element-type 'table)
+ (member "navbox" classlist)
+ (member "collapsed" classlist)))
+ (λ (classlist) (filter (curry (negate equal?) "collapsed") classlist)))
+ ; uncollapse portable-infobox sections
+ (curry u
+ (λ (classlist) (and ; removed due to scoping, would improve performance (eq? element-type 'section)
+ (member "pi-collapse" classlist)))
+ (λ (classlist) (filter (λ (v)
+ (and (not (equal? v "pi-collapse-closed"))
+ (not (equal? v "pi-collapse"))))
+ classlist)))
+ ; generic: includes article sections and tables, probably more
+ (curry u
+ (λ (classlist) (and (member "collapsible" classlist)
+ (member "collapsed" classlist)))
+ (λ (classlist) (filter (curry (negate equal?) "collapsed") classlist)))))
+
+ (define ((string-replace-curried from to) str)
+ (string-replace str from to))
+
+ (define class-updater
+ (compose1
+ (string-replace-curried " collapsed" "")
+ (string-replace-curried "pi-collapse-closed" "")
+ (string-replace-curried "pi-collapse" "")))
+
+ (define attributes-updater
+ (compose1
+ ; uncollapsing
+ #;(curry attribute-maybe-update 'class
+ (λ (class) (string-join (classlist-updater (string-split class " ")) " ")))
+ (curry attribute-maybe-update 'class class-updater)
+ ; change links to stay on the same wiki
+ (curry attribute-maybe-update 'href
+ (λ (href)
+ ((compose1
+ (λ (href) (regexp-replace #rx"^(/wiki/.*)" href (format "/~a\\1" wikiname)))
+ (λ (href) (regexp-replace (pregexp (format "^https://(~a)\\.fandom\\.com(/wiki/.*)" px-wikiname)) href "/\\1\\2")))
+ href)))
+ ; add noreferrer to a.image
+ (curry u
+ (λ (v) (and #;(eq? element-type 'a)
+ (has-class? "image" v)))
+ (λ (v) (dict-update v 'rel (λ (s)
+ (list (string-append (car s) " noreferrer")))
+ '(""))))
+ ; proxy images from inline styles, if strict_proxy is set
+ (curry u
+ (λ (v) (config-true? 'strict_proxy))
+ (λ (v) (attribute-maybe-update
+ 'style
+ (λ (style)
+ (regexp-replace #rx"url\\(['\"]?(.*?)['\"]?\\)" style
+ (λ (whole url)
+ (string-append
+ "url("
+ (u-proxy-url url)
+ ")")))) v)))
+ ; and also their links, if strict_proxy is set
+ (curry u
+ (λ (v)
+ (and (config-true? 'strict_proxy)
+ #;(eq? element-type 'a)
+ (or (has-class? "image-thumbnail" v)
+ (has-class? "image" v))))
+ (λ (v) (attribute-maybe-update 'href u-proxy-url v)))
+ ; proxy images from src attributes, if strict_proxy is set
+ (curry u
+ (λ (v) (config-true? 'strict_proxy))
+ (λ (v) (attribute-maybe-update 'src u-proxy-url v)))
+ ; don't lazyload images
+ (curry u
+ (λ (v) (dict-has-key? v 'data-src))
+ (λ (v) (attribute-maybe-update 'src (λ (_) (car (dict-ref v 'data-src))) v)))
+ ; don't use srcset - TODO: use srcset?
+ (λ (v) (dict-remove v 'srcset))))
+
+ (define (children-updater attributes children)
+ ; more uncollapsing - sample: bandori/wiki/BanG_Dream!_Wikia
+ ((λ (children)
+ (u
+ (λ (v) (has-class? "mw-collapsible-content" attributes))
+ (λ (v) (for/list ([element v])
+ (u (λ (element) (pair? element))
+ (λ (element)
+ `(,(car element)
+ (@ ,@(attribute-maybe-update 'style (λ (a) (regexp-replace #rx"display: *none" a "display:inline")) (bits->attributes element)))
+ ,@(filter element-is-content? (cdr element))))
+ element)))
+ children))
+ ; wrap blinking animated images in a slot so they can be animated with CSS
+ ((λ (children)
+ (u
+ (λ (v) (and (has-class? "animated" attributes)
+ ((length v) . > . 1)))
+ (λ (v)
+ `((span (@ (class "animated-slot__outer") (style ,(format "--steps: ~a" (length v))))
+ (span (@ (class "animated-slot__inner"))
+ ,@v))))
+ children))
+ children)))
+
+ (define (updater element element-type attributes children)
+ ;; replace whole element?
+ (cond
+ ; wrap tables in a div.table-scroller
+ [(and (eq? element-type 'table)
+ (has-class? "wikitable" attributes)
+ (not (dict-has-key? attributes 'data-scrolling)))
+ `(div
+ ((class "table-scroller"))
+ ((,element-type (@ (data-scrolling) ,@attributes)
+ ,@children)))]
+ ; exclude empty figcaptions
+ [(and (eq? element-type 'figcaption)
+ (or (eq? (length (filter element-is-element? children)) 0)
+ ((query-selector (λ (element-type attributes children)
+ (eq? element-type 'use))
+ element))))
+ return-no-element]
+ ; exclude infobox items that are videos, and gallery items that are videos
+ [(and (or (has-class? "pi-item" attributes)
+ (has-class? "wikia-gallery-item" attributes))
+ ((query-selector (λ (element-type attributes children)
+ (has-class? "video-thumbnail" attributes))
+ element)))
+ return-no-element]
+ ; exclude the invisible brackets after headings
+ [(and (eq? element-type 'span)
+ (has-class? "mw-editsection" attributes))
+ return-no-element]
+ ; display a link instead of an iframe
+ [(eq? element-type 'iframe)
+ (define src (car (dict-ref attributes 'src null)))
+ `(a
+ ((class "iframe-alternative") (href ,src))
+ (,(format "Embedded media: ~a" src)))]
+ ; remove noscript versions of images because they are likely lower quality than the script versions
+ [(and (eq? element-type 'noscript)
+ (match children
+ ; either the noscript has a.image as a first child...
+ [(list (list 'a (list '@ a-att ...) _)) (has-class? "image" a-att)]
+ ; or the noscript has img as a first child
+ [(list (list 'img _)) #t]
+ [_ #f]))
+ return-no-element]
+ [#t
+ (list element-type
+ ;; attributes
+ (attributes-updater #; element-type attributes)
+ ;; children
+ (children-updater attributes children))]))
+
+ updater)
+
+(define (update-tree-wiki tree wikiname)
+ (update-tree (updater wikiname) tree))
+
+(module+ test
+ (define transformed
+ (parameterize ([(config-parameter 'strict_proxy) "true"])
+ (update-tree-wiki wiki-document "test")))
+ ; check that wikilinks are changed to be local
+ (check-equal? (get-attribute 'href (bits->attributes
+ ((query-selector
+ (λ (t a c) (dict-has-key? a 'data-test-wikilink))
+ transformed))))
+ "/test/wiki/Another_Page")
+ ; check that a.image has noreferrer
+ (check-equal? (get-attribute 'rel (bits->attributes
+ ((query-selector
+ (λ (t a c) (and (eq? t 'a)
+ (has-class? "image" a)))
+ transformed))))
+ " noreferrer")
+ ; check that article collapse sections become uncollapsed
+ (check-equal? (get-attribute 'class (bits->attributes
+ ((query-selector
+ (λ (t a c) (dict-has-key? a 'data-test-collapsesection))
+ transformed))))
+ "collapsible collapsetoggle-inline")
+ ; check that iframes are gone
+ (check-false ((query-selector (λ (t a c) (eq? t 'iframe)) transformed)))
+ (check-equal? (let* ([alternative ((query-selector (λ (t a c) (has-class? "iframe-alternative" a)) transformed))]
+ [link ((query-selector (λ (t a c) (eq? t 'a)) alternative))])
+ (get-attribute 'href (bits->attributes link)))
+ "https://example.com/iframe-src")
+ ; check that images are proxied
+ (check-equal? (get-attribute 'src (bits->attributes
+ ((query-selector
+ (λ (t a c) (eq? t 'img))
+ transformed))))
+ "/proxy?dest=https%3A%2F%2Fstatic.wikia.nocookie.net%2Fnice-image-thumbnail.png")
+ ; check that links to images are proxied
+ (check-equal? (get-attribute 'href (bits->attributes
+ ((query-selector
+ (λ (t a c) (and (eq? t 'a) (has-class? "image-thumbnail" a)))
+ transformed))))
+ "/proxy?dest=https%3A%2F%2Fstatic.wikia.nocookie.net%2Fnice-image.png")
+ (check-equal? (get-attribute 'href (bits->attributes
+ ((query-selector
+ (λ (t a c) (member '(data-test-figure-a) a))
+ transformed))))
+ "/proxy?dest=https%3A%2F%2Fstatic.wikia.nocookie.net%2Fnice-image.png")
+ ; check that noscript images are removed
+ (check-equal? ((query-selector (λ (t a c) (eq? t 'noscript)) transformed)) #f)
+ ; benchmark
+ (when (file-exists? "Frog.html2")
+ (with-input-from-file "Frog.html2"
+ (λ ()
+ (define tree (html->xexp (current-input-port)))
+ (time (length (update-tree-wiki tree "minecraft")))))))
| | | |