diff --git a/lib/archive-file-mappings.rkt b/lib/archive-file-mappings.rkt
index 4aa8a69..ba013ab 100644
--- a/lib/archive-file-mappings.rkt
+++ b/lib/archive-file-mappings.rkt
@@ -11,7 +11,7 @@
          url-segments->guess-title)

 (define (local-encoded-url->segments str) ; '("wiki" "Page_title")
-  (map path/param-path (url-path (string->url str))))
+  (map path/param-path (fix-semicolons-url-path (url-path (string->url str)))))

 (define (url-segments->basename segments) ; "Page_title" filename encoded, no extension or dir prefix
   (define extra-encoded (map (λ (s) (bytes->string/latin-1 (percent-encode s filename-set #f))) (cdr segments)))
diff --git a/lib/tree-updater.rkt b/lib/tree-updater.rkt
index 0dbf695..098af3d 100644
--- a/lib/tree-updater.rkt
+++ b/lib/tree-updater.rkt
@@ -58,7 +58,11 @@
                            (data-src "https://static.wikia.nocookie.net/nice-image-thumbnail.png")
                            (class "thumbimage")))))
              (figcaption "Test figure!"))
-        (iframe (@ (src "https://example.com/iframe-src")))))))
+        (iframe (@ (src "https://example.com/iframe-src")))
+        (div (@ (class "reviews"))
+             (header "GameSpot Expert Reviews"))
+        (div (@ (data-test-ampersand) (class "mw-collapsible-content"))
+             (& ndash))))))

 (define (updater wikiname #:strict-proxy? [strict-proxy? #f])
   ;; precompute wikiurl regex for efficency
@@ -157,7 +161,7 @@
        (u (λ (v) (has-class? "mw-collapsible-content" attributes))
           (λ (v)
             (for/list ([element v])
-              (u (λ (element) (pair? element))
+              (u (λ (element) (element-is-element? element))
                  (λ (element)
                    `(,(car element)
                      (@ ,@(attribute-maybe-update 'style (λ (a) (regexp-replace #rx"display: *none" a "display:inline")) (bits->attributes element)))
@@ -238,6 +242,9 @@
                [(list (list 'img _)) #t]
                [_ #f]))
         return-no-element]
+       ; remove gamespot reviews/ads
+       [(has-class? "reviews" attributes)
+        return-no-element]
        [#t
         (list element-type
               ;; attributes
@@ -297,6 +304,12 @@
                 "/proxy?dest=https%3A%2F%2Fstatic.wikia.nocookie.net%2Fnice-image.png")
   ; check that noscript images are removed
   (check-equal? ((query-selector (λ (t a c) (eq? t 'noscript)) transformed)) #f)
+  ; check that gamespot reviews/ads are removed
+  (check-equal? ((query-selector (λ (t a c) (has-class? "reviews" a)) transformed)) #f)
+  ; check that (& x) sequences are not broken
+  (check-equal? ((query-selector (λ (t a c) (dict-has-key? a 'data-test-ampersand)) transformed))
+                '(div (@ (data-test-ampersand) (class "mw-collapsible-content"))
+                      (& ndash)))
   ; benchmark
   (when (file-exists? "../storage/Frog.html")
     (with-input-from-file "../storage/Frog.html"
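A note on the pair? -> element-is-element? change above: in x-expression form, an HTML entity like &ndash; is written as (& ndash), which is a pair but not a real element. The old predicate therefore sent entities through the attribute-rewriting branch, which rebuilt them around an attribute list and corrupted them. A quick sketch of the distinction (hypothetical REPL session; the behaviour matches the new tests in lib/xexpr-utils.rkt below):

    > (pair? '(& ndash))
    #t   ; old check: the entity is wrongly treated as an element
    > (element-is-element? '(& ndash))
    #f   ; new check: entities are left alone
    > (element-is-element? '(span "hi"))
    #t   ; real elements still match
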
diff --git a/lib/url-utils.rkt b/lib/url-utils.rkt
index 4722d49..3fb4310 100644
--- a/lib/url-utils.rkt
+++ b/lib/url-utils.rkt
@@ -1,5 +1,6 @@
 #lang typed/racket/base
 (require racket/string
+         typed/net/url-structs
          "pure-utils.rkt")
 (require/typed web-server/http/request-structs
                [#:opaque Header header?])
@@ -20,7 +21,10 @@
          ; pass in a header, headers, or something useless. they'll all combine into a list
          build-headers
          ; try to follow wikimedia's format for which characters should be encoded/replaced in page titles for the url
-         page-title->path)
+         page-title->path
+         ; path/param parsing eats semicolons into params; they need to be joined back into the path
+         fix-semicolons-url-path
+         fix-semicolons-url)

 (module+ test
   (require "typed-rackunit.rkt"))
@@ -106,3 +110,20 @@
 (: page-title->path (String -> Bytes))
 (define (page-title->path title)
   (percent-encode (regexp-replace* " " title "_") path-set #f))
+
+(: fix-semicolons-url-path ((Listof Path/Param) -> (Listof Path/Param)))
+(define (fix-semicolons-url-path pps)
+  (for/list ([pp pps])
+    (define path (path/param-path pp))
+    (if (or (null? (path/param-param pp))
+            (symbol? path))
+        pp
+        ;; this path/param does have params, which need to be joined back into the path with semicolons.
+        (path/param
+         (string-append path ";" (string-join (path/param-param pp) ";"))
+         null))))
+
+(: fix-semicolons-url (URL -> URL))
+(define (fix-semicolons-url orig-url)
+  (struct-copy url orig-url [path (fix-semicolons-url-path (url-path orig-url))]))
+
diff --git a/lib/xexpr-utils.rkt b/lib/xexpr-utils.rkt
index cb40510..e1ac957 100644
--- a/lib/xexpr-utils.rkt
+++ b/lib/xexpr-utils.rkt
@@ -86,15 +86,16 @@
 ; "element" is a real element with a type and everything (non-string, non-attributes)
 (define (element-is-element? element)
-  (and (element-is-bits? element) (not (element-is-xattributes? element))))
+  (and (element-is-bits? element) (not (eq? (car element) '&)) (not (element-is-xattributes? element))))
 (module+ test
   (check-true (element-is-element? '(span "hi")))
   (check-false (element-is-element? '(@ (alt "Cute cat."))))
-  (check-false (element-is-element? "hi")))
+  (check-false (element-is-element? "hi"))
+  (check-false (element-is-element? '(& ndash))))

-; "element content" is a real element or a string
+; "element content" is a real element or a string or a (& x) sequence
 (define (element-is-content? element)
-  (or (string? element) (element-is-element? element)))
+  (or (string? element) (element-is-element? element) (and (pair? element) (eq? (car element) '&))))
 (module+ test
   (check-true (element-is-content? '(span "hi")))
   (check-false (element-is-content? '(@ (alt "Cute cat."))))
diff --git a/src/dispatcher-tree.rkt b/src/dispatcher-tree.rkt
index 315638a..48e8ebb 100644
--- a/src/dispatcher-tree.rkt
+++ b/src/dispatcher-tree.rkt
@@ -59,16 +59,5 @@
   (make-semicolon-fixer-dispatcher tree))

 (define ((make-semicolon-fixer-dispatcher orig-dispatcher) conn orig-req)
-  (define orig-uri (request-uri orig-req))
-  (define pps (url-path orig-uri)) ; list of path/param structs
-  (define new-path
-    (for/list ([pp pps])
-      (if (null? (path/param-param pp))
-          pp
-          ;; path/param does have params, which need to be fixed into a semicolon.
-          (path/param
-           (string-append (path/param-path pp) ";" (string-join (path/param-param pp) ";"))
-           null))))
-  (define new-uri (struct-copy url orig-uri [path new-path]))
-  (define new-req (struct-copy request orig-req [uri new-uri]))
+  (define new-req (struct-copy request orig-req [uri (fix-semicolons-url (request-uri orig-req))]))
   (orig-dispatcher conn new-req))
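For context on what the new helpers undo: net/url's parser treats ";" as a matrix-parameter separator, so a path segment containing a semicolon loses its tail into path/param-param. This is exactly the case local-encoded-url->segments (first hunk above) now routes through fix-semicolons-url-path. A rough sketch with a hypothetical URL (REPL session invented for illustration, output trimmed to the relevant field):

    > (require net/url)            ; plus the url-utils module providing fix-semicolons-url-path
    > (define u (string->url "https://example.fandom.com/wiki/Example;Title"))
    > (map path/param-path (url-path u))
    '("wiki" "Example")            ; ";Title" was eaten into path/param-param
    > (map path/param-path (fix-semicolons-url-path (url-path u)))
    '("wiki" "Example;Title")      ; semicolon joined back into the segment, params now null
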
diff --git a/src/page-search.rkt b/src/page-search.rkt
index ce527c0..e4960d8 100644
--- a/src/page-search.rkt
+++ b/src/page-search.rkt
@@ -28,28 +28,35 @@
 (define search-json-data '#hasheq((batchcomplete . #t) (query . #hasheq((search . (#hasheq((ns . 0) (pageid . 219) (size . 1482) (snippet . "") (timestamp . "2022-08-21T08:54:23Z") (title . "Gacha Capsule") (wordcount . 214)) #hasheq((ns . 0) (pageid . 201) (size . 1198) (snippet . "") (timestamp . "2022-07-11T17:52:47Z") (title . "Badges") (wordcount . 181)))))))))

+;; this takes the info we gathered from fandom and makes the big fat x-expression page
 (define (generate-results-page req dest-url wikiname query data #:siteinfo [siteinfo #f])
   (define search-results (jp "/query/search" data))
+  ;; this is *another* helper that builds the wiki page UI and lets me put the search results (or whatever else) in the middle
   (generate-wiki-page
+   ;; so I provide my helper function with the necessary context...
    #:req req
    #:source-url dest-url
    #:wikiname wikiname
    #:title query
    #:siteinfo siteinfo
+   ;; and here are the actual results to display in the wiki page layout
   `(div (@ (class "mw-parser-output"))
+        ;; header before the search results showing how many we found
         (p ,(format "~a results found for " (length search-results))
            (strong ,query))
+        ;; *u*nordered *l*ist of matching search results
         (ul ,@(map
-               (λ (result)
+               (λ (result) ;; for each result, run this code...
                  (let* ([title (jp "/title" result)]
                         [page-path (page-title->path title)]
                         [timestamp (jp "/timestamp" result)]
                         [wordcount (jp "/wordcount" result)]
                         [size (jp "/size" result)])
+                   ;; and make this x-expression...
                    `(li (@ (class "my-result"))
-                        (a (@ (class "my-result__link") (href ,(format "/~a/wiki/~a" wikiname page-path)))
-                           ,title)
-                        (div (@ (class "my-result__info"))
+                        (a (@ (class "my-result__link") (href ,(format "/~a/wiki/~a" wikiname page-path))) ; using unquote to insert the result page URL
+                           ,title) ; using unquote to insert the result page title
+                        (div (@ (class "my-result__info")) ; constructing the line under the search result
                             "last edited "
                             (time (@ (datetime ,timestamp)) ,(list-ref (string-split timestamp "T") 0))
                             ,(format ", ~a words, ~a kb"
@@ -57,13 +64,18 @@
                                      (exact->inexact (/ (round (/ size 100)) 10)))))))
                search-results)))))

+;; will be called when the web browser asks to load the page
 (define (page-search req)
+  ;; this just means: catch any errors and display them in the browser. it's a function defined somewhere else
  (response-handler
+  ;; the URL will look like "/minecraft/wiki/Special:Search?q=Spawner"
+  ;; grab the first part to use as the wikiname, in this case "minecraft"
   (define wikiname (path/param-path (first (url-path (request-uri req)))))
+  ;; grab the part after ?q=, which is the search terms
   (define query (dict-ref (url-query (request-uri req)) 'q #f))
+  ;; constructing the URL where I want to get fandom data from...
   (define origin (format "https://~a.fandom.com" wikiname))
-  (when (config-true? 'feature_offline::only)
-    (raise-user-error "Full search is currently not available on breezewiki.com - for now, please use the pop-up search suggestions or wait for me to fix it! Thanks <3"))
+  ;; the dest-url will look something like https://minecraft.fandom.com/api.php?action=query&list=search&srsearch=Spawner&formatversion=2&format=json
   (define dest-url
     (format "~a/api.php?~a"
             origin
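Between these two hunks sits one unchanged line (old line 56), presumably the wordcount argument to the format call, which is why it does not appear above. For a concrete sense of the output, running the first fixture result through the li template (hypothetically, with wikiname "minecraft"; 1482 bytes rounds to 1.5 kb) would produce roughly:

    '(li (@ (class "my-result"))
         (a (@ (class "my-result__link") (href "/minecraft/wiki/Gacha_Capsule"))
            "Gacha Capsule")
         (div (@ (class "my-result__info"))
              "last edited "
              (time (@ (datetime "2022-08-21T08:54:23Z")) "2022-08-21")
              ", 214 words, 1.5 kb"))
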
"json"))))) + ;; simultaneously get the search results from the fandom API, as well as information about the wiki as a whole (its license, icon, name) (define-values (dest-res siteinfo) (thread-values (λ () (log-outgoing dest-url) - (easy:get dest-url #:timeouts timeouts)) + (easy:get dest-url #:timeouts timeouts)) ;; HTTP request to dest-url for search results (λ () - (siteinfo-fetch wikiname)))) + (siteinfo-fetch wikiname)))) ;; helper function in another file to get information about the wiki + ;; search results are a JSON string. parse JSON into racket data structures (define data (easy:response-json dest-res)) - + ;; calling my generate-results-page function with the information so far in order to get a big fat x-expression + ;; big fat x-expression goes into the body variable (define body (generate-results-page req dest-url wikiname query data #:siteinfo siteinfo)) + ;; error checking (when (config-true? 'debug) ; used for its side effects ; convert to string with error checking, error will be raised if xexp is invalid (xexp->html body)) + ;; convert body to HTML and send to browser (response/output #:code 200 #:headers (build-headers always-headers)