From ba3b39242cef005f60ddf9e50a63ba0c190ce9a2 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Sun, 4 Dec 2022 23:44:35 +1300 Subject: [PATCH 1/2] Rename ref parameter to next_location --- src/data.rkt | 2 +- src/page-set-user-settings.rkt | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/data.rkt b/src/data.rkt index e8a9e69..cdd8c39 100644 --- a/src/data.rkt +++ b/src/data.rkt @@ -91,5 +91,5 @@ #:max-age (* 60 60 24 365 10))))) (define (user-cookies-setter-url req new-settings) - (format "/set-user-settings?~a" (params->query `(("ref" . ,(url->string (request-uri req))) + (format "/set-user-settings?~a" (params->query `(("next_location" . ,(url->string (request-uri req))) ("new_settings" . ,(format "~a" new-settings)))))) diff --git a/src/page-set-user-settings.rkt b/src/page-set-user-settings.rkt index 3f37692..2fbc573 100644 --- a/src/page-set-user-settings.rkt +++ b/src/page-set-user-settings.rkt @@ -12,7 +12,7 @@ (define (page-set-user-settings req) (response-handler - (define ref (dict-ref (url-query (request-uri req)) 'ref)) + (define next-location (dict-ref (url-query (request-uri req)) 'next_location)) (define new-settings (read (open-input-string (dict-ref (url-query (request-uri req)) 'new_settings)))) (define headers (user-cookies-setter new-settings)) - (generate-redirect ref #:headers headers))) + (generate-redirect next-location #:headers headers))) From 9fd2b4699d9697b87fabcf982184ce6c381634f0 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Sun, 4 Dec 2022 23:46:24 +1300 Subject: [PATCH 2/2] Optimise update-tree-wiki somewhat --- src/page-wiki.rkt | 233 +------------------------------------ src/tree-updater.rkt | 267 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 272 insertions(+), 228 deletions(-) create mode 100644 src/tree-updater.rkt diff --git a/src/page-wiki.rkt b/src/page-wiki.rkt index 2cd14dc..6f22ad8 100644 --- a/src/page-wiki.rkt +++ b/src/page-wiki.rkt @@ -19,6 +19,7 @@ "data.rkt" "pure-utils.rkt" "syntax.rkt" + "tree-updater.rkt" "xexpr-utils.rkt" "url-utils.rkt") @@ -30,51 +31,19 @@ preprocess-html-wiki) (module+ test - (require rackunit) - (define wiki-document - '(*TOP* - (div (@ (class "mw-parser-output")) - (aside (@ (role "region") (class "portable-infobox pi-theme-wikia pi-layout-default")) - (h2 (@ (class "pi-item pi-title") (data-source "title")) - "Infobox Title") - (figure (@ (class "pi-item pi-image") (data-source "image")) - (a (@ (href "https://static.wikia.nocookie.net/nice-image.png") (class "image image-thumbnail") (title "")) - (img (@ (src "https://static.wikia.nocookie.net/nice-image-thumbnail.png") (class "pi-image-thumbnail"))))) - (div (@ (class "pi-item pi-data") (data-source "description")) - (h3 (@ (class "pi-data-label")) - "Description") - (div (@ (class "pi-data-value")) - "Mystery infobox!"))) - (div (@ (data-test-collapsesection) (class "collapsible collapsetoggle-inline collapsed")) - (i (b "This section is hidden for dramatic effect.")) - (div (@ (class "collapsible-content")) - (p "Another page link: " - (a (@ (data-test-wikilink) (href "https://test.fandom.com/wiki/Another_Page") (title "Another Page")) - "Another Page")))) - (figure (@ (class "thumb tnone")) - (a (@ (href "https://static.wikia.nocookie.net/nice-image.png") (class "image") (data-test-figure-a)) - (img (@ (src "data:image/gif;base64,R0lGODlhAQABAIABAAAAAP///yH5BAEAAAEALAAAAAABAAEAQAICTAEAOw%3D%3D") - (data-src "https://static.wikia.nocookie.net/nice-image-thumbnail.png") - (class "thumbimage lazyload")))) - (noscript - (a (@ (href "https://static.wikia.nocookie.net/nice-image.png") (class "image")) - (img (@ (src "https://static.wikia.nocookie.net/nice-image-thumbnail.png") - (data-src "https://static.wikia.nocookie.net/nice-image-thumbnail.png") - (class "thumbimage"))))) - (figcaption "Test figure!")) - (iframe (@ (src "https://example.com/iframe-src"))))))) + (require rackunit)) (define (preprocess-html-wiki html) - (define (rr* find replace contents) + (define ((rr* find replace) contents) (regexp-replace* find contents replace)) ((compose1 ; fix navbox list nesting ; navbox on right of page has incorrect html "
  • " and the xexpr parser puts the
  • much further up the tree ; add a
      to make the parser happy ; usage: /fallout/wiki/Fallout:_New_Vegas_achievements_and_trophies - (curry rr* #rx"(]*>\n?)(
    • )" "\\1
        \\2") + (rr* #rx"(]*>\n?)(
      • )" "\\1
          \\2") ; change

          to

          to make the parser happy - (curry rr* #rx"(]*>)[ \t]*

          ([^<]*)

          " "\\1\\2")) + (rr* #rx"(]*>)[ \t]*

          ([^<]*)

          " "\\1\\2")) html)) (module+ test (check-equal? (preprocess-html-wiki "\n
        • Hey
        • ") @@ -82,198 +51,6 @@ (check-equal? (preprocess-html-wiki "

          Caption text.

          ") "
          Caption text.
          ")) -(define (update-tree-wiki tree wikiname) - (update-tree - (λ (element element-type attributes children) - ;; replace whole element? - (cond - ; wrap tables in a div.table-scroller - [(and (eq? element-type 'table) - (has-class? "wikitable" attributes) - (not (dict-has-key? attributes 'data-scrolling))) - `(div - ((class "table-scroller")) - ((,element-type (@ (data-scrolling) ,@attributes) - ,@children)))] - ; exclude empty figcaptions - [(and (eq? element-type 'figcaption) - (or (eq? (length (filter element-is-element? children)) 0) - ((query-selector (λ (element-type attributes children) - (eq? element-type 'use)) - element)))) - return-no-element] - ; exclude infobox items that are videos, and gallery items that are videos - [(and (or (has-class? "pi-item" attributes) - (has-class? "wikia-gallery-item" attributes)) - ((query-selector (λ (element-type attributes children) - (has-class? "video-thumbnail" attributes)) - element))) - return-no-element] - ; exclude the invisible brackets after headings - [(and (eq? element-type 'span) - (has-class? "mw-editsection" attributes)) - return-no-element] - ; display a link instead of an iframe - [(eq? element-type 'iframe) - (define src (car (dict-ref attributes 'src null))) - `(a - ((class "iframe-alternative") (href ,src)) - (,(format "Embedded media: ~a" src)))] - ; remove noscript versions of images because they are likely lower quality than the script versions - [(and (eq? element-type 'noscript) - (match children - ; either the noscript has a.image as a first child... - [(list (list 'a (list '@ a-att ...) _)) (has-class? "image" a-att)] - ; or the noscript has img as a first child - [(list (list 'img _)) #t] - [_ #f])) - return-no-element] - [#t - (list element-type - ;; attributes - ((compose1 - ; uncollapsing - (curry attribute-maybe-update 'class - (λ (class) - (string-join - ((compose1 - ; uncollapse all navbox items (bottom of page mass navigation) - (curry u - (λ (classlist) (and (eq? element-type 'table) - (member "navbox" classlist) - (member "collapsed" classlist))) - (λ (classlist) (filter (curry (negate equal?) "collapsed") classlist))) - ; uncollapse portable-infobox sections - (curry u - (λ (classlist) (and (eq? element-type 'section) - (member "pi-collapse" classlist))) - (λ (classlist) (filter (λ (v) - (and (not (equal? v "pi-collapse-closed")) - (not (equal? v "pi-collapse")))) - classlist))) - ; generic: includes article sections and tables, probably more - (curry u - (λ (classlist) (and (member "collapsible" classlist) - (member "collapsed" classlist))) - (λ (classlist) (filter (curry (negate equal?) "collapsed") classlist)))) - (string-split class " ")) - " "))) - ; change links to stay on the same wiki - (curry attribute-maybe-update 'href - (λ (href) - ((compose1 - (λ (href) (regexp-replace #rx"^(/wiki/.*)" href (format "/~a\\1" wikiname))) - (λ (href) (regexp-replace (pregexp (format "^https://(~a)\\.fandom\\.com(/wiki/.*)" px-wikiname)) href "/\\1\\2"))) - href))) - ; add noreferrer to a.image - (curry u - (λ (v) (and (eq? element-type 'a) - (has-class? "image" v))) - (λ (v) (dict-update v 'rel (λ (s) - (list (string-append (car s) " noreferrer"))) - '("")))) - ; proxy images from inline styles, if strict_proxy is set - (curry u - (λ (v) (config-true? 'strict_proxy)) - (λ (v) (attribute-maybe-update - 'style - (λ (style) - (regexp-replace #rx"url\\(['\"]?(.*?)['\"]?\\)" style - (λ (whole url) - (string-append - "url(" - (u-proxy-url url) - ")")))) v))) - ; and also their links, if strict_proxy is set - (curry u - (λ (v) - (and (config-true? 'strict_proxy) - (eq? element-type 'a) - (or (has-class? "image-thumbnail" v) - (has-class? "image" v)))) - (λ (v) (attribute-maybe-update 'href u-proxy-url v))) - ; proxy images from src attributes, if strict_proxy is set - (curry u - (λ (v) (config-true? 'strict_proxy)) - (λ (v) (attribute-maybe-update 'src u-proxy-url v))) - ; don't lazyload images - (curry u - (λ (v) (dict-has-key? v 'data-src)) - (λ (v) (attribute-maybe-update 'src (λ (_) (car (dict-ref v 'data-src))) v))) - ; don't use srcset - TODO: use srcset? - (λ (v) (dict-remove v 'srcset))) - attributes) - ;; children - ((compose1 - ; more uncollapsing - sample: bandori/wiki/BanG_Dream!_Wikia - (curry u - (λ (v) (has-class? "mw-collapsible-content" attributes)) - (λ (v) (for/list ([element v]) - (u (λ (element) (pair? element)) - (λ (element) - `(,(car element) - (@ ,@(attribute-maybe-update 'style (λ (a) (regexp-replace #rx"display: *none" a "display:inline")) (bits->attributes element))) - ,@(filter element-is-content? (cdr element)))) - element)))) - ; wrap blinking animated images in a slot so they can be animated with CSS - (curry u - (λ (v) (and (has-class? "animated" attributes) - ((length v) . > . 1))) - (λ (v) - `((span (@ (class "animated-slot__outer") (style ,(format "--steps: ~a" (length v)))) - (span (@ (class "animated-slot__inner")) - ,@v)))))) - children))])) - tree)) -(module+ test - (define transformed - (parameterize ([(config-parameter 'strict_proxy) "true"]) - (update-tree-wiki wiki-document "test"))) - ; check that wikilinks are changed to be local - (check-equal? (get-attribute 'href (bits->attributes - ((query-selector - (λ (t a c) (dict-has-key? a 'data-test-wikilink)) - transformed)))) - "/test/wiki/Another_Page") - ; check that a.image has noreferrer - (check-equal? (get-attribute 'rel (bits->attributes - ((query-selector - (λ (t a c) (and (eq? t 'a) - (has-class? "image" a))) - transformed)))) - " noreferrer") - ; check that article collapse sections become uncollapsed - (check-equal? (get-attribute 'class (bits->attributes - ((query-selector - (λ (t a c) (dict-has-key? a 'data-test-collapsesection)) - transformed)))) - "collapsible collapsetoggle-inline") - ; check that iframes are gone - (check-false ((query-selector (λ (t a c) (eq? t 'iframe)) transformed))) - (check-equal? (let* ([alternative ((query-selector (λ (t a c) (has-class? "iframe-alternative" a)) transformed))] - [link ((query-selector (λ (t a c) (eq? t 'a)) alternative))]) - (get-attribute 'href (bits->attributes link))) - "https://example.com/iframe-src") - ; check that images are proxied - (check-equal? (get-attribute 'src (bits->attributes - ((query-selector - (λ (t a c) (eq? t 'img)) - transformed)))) - "/proxy?dest=https%3A%2F%2Fstatic.wikia.nocookie.net%2Fnice-image-thumbnail.png") - ; check that links to images are proxied - (check-equal? (get-attribute 'href (bits->attributes - ((query-selector - (λ (t a c) (and (eq? t 'a) (has-class? "image-thumbnail" a))) - transformed)))) - "/proxy?dest=https%3A%2F%2Fstatic.wikia.nocookie.net%2Fnice-image.png") - (check-equal? (get-attribute 'href (bits->attributes - ((query-selector - (λ (t a c) (member '(data-test-figure-a) a)) - transformed)))) - "/proxy?dest=https%3A%2F%2Fstatic.wikia.nocookie.net%2Fnice-image.png") - ; check that noscript images are removed - (check-equal? ((query-selector (λ (t a c) (eq? t 'noscript)) transformed)) #f)) - (define (page-wiki req) (define wikiname (path/param-path (first (url-path (request-uri req))))) (define user-cookies (user-cookies-getter req)) diff --git a/src/tree-updater.rkt b/src/tree-updater.rkt new file mode 100644 index 0000000..815ef8a --- /dev/null +++ b/src/tree-updater.rkt @@ -0,0 +1,267 @@ +#lang racket/base +(require racket/dict + racket/function + racket/match + racket/string + "config.rkt" + "pure-utils.rkt" + "url-utils.rkt" + "xexpr-utils.rkt") + +(provide + update-tree-wiki) + +(module+ test + (require rackunit + html-parsing) + (define wiki-document + '(*TOP* + (div (@ (class "mw-parser-output")) + (aside (@ (role "region") (class "portable-infobox pi-theme-wikia pi-layout-default")) + (h2 (@ (class "pi-item pi-title") (data-source "title")) + "Infobox Title") + (figure (@ (class "pi-item pi-image") (data-source "image")) + (a (@ (href "https://static.wikia.nocookie.net/nice-image.png") (class "image image-thumbnail") (title "")) + (img (@ (src "https://static.wikia.nocookie.net/nice-image-thumbnail.png") (class "pi-image-thumbnail"))))) + (div (@ (class "pi-item pi-data") (data-source "description")) + (h3 (@ (class "pi-data-label")) + "Description") + (div (@ (class "pi-data-value")) + "Mystery infobox!"))) + (div (@ (data-test-collapsesection) (class "collapsible collapsetoggle-inline collapsed")) + (i (b "This section is hidden for dramatic effect.")) + (div (@ (class "collapsible-content")) + (p "Another page link: " + (a (@ (data-test-wikilink) (href "https://test.fandom.com/wiki/Another_Page") (title "Another Page")) + "Another Page")))) + (figure (@ (class "thumb tnone")) + (a (@ (href "https://static.wikia.nocookie.net/nice-image.png") (class "image") (data-test-figure-a)) + (img (@ (src "data:image/gif;base64,R0lGODlhAQABAIABAAAAAP///yH5BAEAAAEALAAAAAABAAEAQAICTAEAOw%3D%3D") + (data-src "https://static.wikia.nocookie.net/nice-image-thumbnail.png") + (class "thumbimage lazyload")))) + (noscript + (a (@ (href "https://static.wikia.nocookie.net/nice-image.png") (class "image")) + (img (@ (src "https://static.wikia.nocookie.net/nice-image-thumbnail.png") + (data-src "https://static.wikia.nocookie.net/nice-image-thumbnail.png") + (class "thumbimage"))))) + (figcaption "Test figure!")) + (iframe (@ (src "https://example.com/iframe-src"))))))) + +(define (updater wikiname) + (define classlist-updater + (compose1 + ; uncollapse all navbox items (bottom of page mass navigation) + (curry u + (λ (classlist) (and ; removed due to scoping, would improve peformance (eq? element-type 'table) + (member "navbox" classlist) + (member "collapsed" classlist))) + (λ (classlist) (filter (curry (negate equal?) "collapsed") classlist))) + ; uncollapse portable-infobox sections + (curry u + (λ (classlist) (and ; removed due to scoping, would improve performance (eq? element-type 'section) + (member "pi-collapse" classlist))) + (λ (classlist) (filter (λ (v) + (and (not (equal? v "pi-collapse-closed")) + (not (equal? v "pi-collapse")))) + classlist))) + ; generic: includes article sections and tables, probably more + (curry u + (λ (classlist) (and (member "collapsible" classlist) + (member "collapsed" classlist))) + (λ (classlist) (filter (curry (negate equal?) "collapsed") classlist))))) + + (define ((string-replace-curried from to) str) + (string-replace str from to)) + + (define class-updater + (compose1 + (string-replace-curried " collapsed" "") + (string-replace-curried "pi-collapse-closed" "") + (string-replace-curried "pi-collapse" ""))) + + (define attributes-updater + (compose1 + ; uncollapsing + #;(curry attribute-maybe-update 'class + (λ (class) (string-join (classlist-updater (string-split class " ")) " "))) + (curry attribute-maybe-update 'class class-updater) + ; change links to stay on the same wiki + (curry attribute-maybe-update 'href + (λ (href) + ((compose1 + (λ (href) (regexp-replace #rx"^(/wiki/.*)" href (format "/~a\\1" wikiname))) + (λ (href) (regexp-replace (pregexp (format "^https://(~a)\\.fandom\\.com(/wiki/.*)" px-wikiname)) href "/\\1\\2"))) + href))) + ; add noreferrer to a.image + (curry u + (λ (v) (and #;(eq? element-type 'a) + (has-class? "image" v))) + (λ (v) (dict-update v 'rel (λ (s) + (list (string-append (car s) " noreferrer"))) + '("")))) + ; proxy images from inline styles, if strict_proxy is set + (curry u + (λ (v) (config-true? 'strict_proxy)) + (λ (v) (attribute-maybe-update + 'style + (λ (style) + (regexp-replace #rx"url\\(['\"]?(.*?)['\"]?\\)" style + (λ (whole url) + (string-append + "url(" + (u-proxy-url url) + ")")))) v))) + ; and also their links, if strict_proxy is set + (curry u + (λ (v) + (and (config-true? 'strict_proxy) + #;(eq? element-type 'a) + (or (has-class? "image-thumbnail" v) + (has-class? "image" v)))) + (λ (v) (attribute-maybe-update 'href u-proxy-url v))) + ; proxy images from src attributes, if strict_proxy is set + (curry u + (λ (v) (config-true? 'strict_proxy)) + (λ (v) (attribute-maybe-update 'src u-proxy-url v))) + ; don't lazyload images + (curry u + (λ (v) (dict-has-key? v 'data-src)) + (λ (v) (attribute-maybe-update 'src (λ (_) (car (dict-ref v 'data-src))) v))) + ; don't use srcset - TODO: use srcset? + (λ (v) (dict-remove v 'srcset)))) + + (define (children-updater attributes children) + ; more uncollapsing - sample: bandori/wiki/BanG_Dream!_Wikia + ((λ (children) + (u + (λ (v) (has-class? "mw-collapsible-content" attributes)) + (λ (v) (for/list ([element v]) + (u (λ (element) (pair? element)) + (λ (element) + `(,(car element) + (@ ,@(attribute-maybe-update 'style (λ (a) (regexp-replace #rx"display: *none" a "display:inline")) (bits->attributes element))) + ,@(filter element-is-content? (cdr element)))) + element))) + children)) + ; wrap blinking animated images in a slot so they can be animated with CSS + ((λ (children) + (u + (λ (v) (and (has-class? "animated" attributes) + ((length v) . > . 1))) + (λ (v) + `((span (@ (class "animated-slot__outer") (style ,(format "--steps: ~a" (length v)))) + (span (@ (class "animated-slot__inner")) + ,@v)))) + children)) + children))) + + (define (updater element element-type attributes children) + ;; replace whole element? + (cond + ; wrap tables in a div.table-scroller + [(and (eq? element-type 'table) + (has-class? "wikitable" attributes) + (not (dict-has-key? attributes 'data-scrolling))) + `(div + ((class "table-scroller")) + ((,element-type (@ (data-scrolling) ,@attributes) + ,@children)))] + ; exclude empty figcaptions + [(and (eq? element-type 'figcaption) + (or (eq? (length (filter element-is-element? children)) 0) + ((query-selector (λ (element-type attributes children) + (eq? element-type 'use)) + element)))) + return-no-element] + ; exclude infobox items that are videos, and gallery items that are videos + [(and (or (has-class? "pi-item" attributes) + (has-class? "wikia-gallery-item" attributes)) + ((query-selector (λ (element-type attributes children) + (has-class? "video-thumbnail" attributes)) + element))) + return-no-element] + ; exclude the invisible brackets after headings + [(and (eq? element-type 'span) + (has-class? "mw-editsection" attributes)) + return-no-element] + ; display a link instead of an iframe + [(eq? element-type 'iframe) + (define src (car (dict-ref attributes 'src null))) + `(a + ((class "iframe-alternative") (href ,src)) + (,(format "Embedded media: ~a" src)))] + ; remove noscript versions of images because they are likely lower quality than the script versions + [(and (eq? element-type 'noscript) + (match children + ; either the noscript has a.image as a first child... + [(list (list 'a (list '@ a-att ...) _)) (has-class? "image" a-att)] + ; or the noscript has img as a first child + [(list (list 'img _)) #t] + [_ #f])) + return-no-element] + [#t + (list element-type + ;; attributes + (attributes-updater #; element-type attributes) + ;; children + (children-updater attributes children))])) + + updater) + +(define (update-tree-wiki tree wikiname) + (update-tree (updater wikiname) tree)) + +(module+ test + (define transformed + (parameterize ([(config-parameter 'strict_proxy) "true"]) + (update-tree-wiki wiki-document "test"))) + ; check that wikilinks are changed to be local + (check-equal? (get-attribute 'href (bits->attributes + ((query-selector + (λ (t a c) (dict-has-key? a 'data-test-wikilink)) + transformed)))) + "/test/wiki/Another_Page") + ; check that a.image has noreferrer + (check-equal? (get-attribute 'rel (bits->attributes + ((query-selector + (λ (t a c) (and (eq? t 'a) + (has-class? "image" a))) + transformed)))) + " noreferrer") + ; check that article collapse sections become uncollapsed + (check-equal? (get-attribute 'class (bits->attributes + ((query-selector + (λ (t a c) (dict-has-key? a 'data-test-collapsesection)) + transformed)))) + "collapsible collapsetoggle-inline") + ; check that iframes are gone + (check-false ((query-selector (λ (t a c) (eq? t 'iframe)) transformed))) + (check-equal? (let* ([alternative ((query-selector (λ (t a c) (has-class? "iframe-alternative" a)) transformed))] + [link ((query-selector (λ (t a c) (eq? t 'a)) alternative))]) + (get-attribute 'href (bits->attributes link))) + "https://example.com/iframe-src") + ; check that images are proxied + (check-equal? (get-attribute 'src (bits->attributes + ((query-selector + (λ (t a c) (eq? t 'img)) + transformed)))) + "/proxy?dest=https%3A%2F%2Fstatic.wikia.nocookie.net%2Fnice-image-thumbnail.png") + ; check that links to images are proxied + (check-equal? (get-attribute 'href (bits->attributes + ((query-selector + (λ (t a c) (and (eq? t 'a) (has-class? "image-thumbnail" a))) + transformed)))) + "/proxy?dest=https%3A%2F%2Fstatic.wikia.nocookie.net%2Fnice-image.png") + (check-equal? (get-attribute 'href (bits->attributes + ((query-selector + (λ (t a c) (member '(data-test-figure-a) a)) + transformed)))) + "/proxy?dest=https%3A%2F%2Fstatic.wikia.nocookie.net%2Fnice-image.png") + ; check that noscript images are removed + (check-equal? ((query-selector (λ (t a c) (eq? t 'noscript)) transformed)) #f) + ; benchmark + (when (file-exists? "Frog.html2") + (with-input-from-file "Frog.html2" + (λ () + (define tree (html->xexp (current-input-port))) + (time (length (update-tree-wiki tree "minecraft")))))))