Compare commits

...

4 commits

6 changed files with 70 additions and 29 deletions

View file

@ -11,7 +11,7 @@
url-segments->guess-title) url-segments->guess-title)
(define (local-encoded-url->segments str) ; '("wiki" "Page_title") (define (local-encoded-url->segments str) ; '("wiki" "Page_title")
(map path/param-path (url-path (string->url str)))) (map path/param-path (fix-semicolons-url-path (url-path (string->url str)))))
(define (url-segments->basename segments) ; "Page_title" filename encoded, no extension or dir prefix (define (url-segments->basename segments) ; "Page_title" filename encoded, no extension or dir prefix
(define extra-encoded (map (λ (s) (bytes->string/latin-1 (percent-encode s filename-set #f))) (cdr segments))) (define extra-encoded (map (λ (s) (bytes->string/latin-1 (percent-encode s filename-set #f))) (cdr segments)))

View file

@ -58,7 +58,11 @@
(data-src "https://static.wikia.nocookie.net/nice-image-thumbnail.png") (data-src "https://static.wikia.nocookie.net/nice-image-thumbnail.png")
(class "thumbimage"))))) (class "thumbimage")))))
(figcaption "Test figure!")) (figcaption "Test figure!"))
(iframe (@ (src "https://example.com/iframe-src"))))))) (iframe (@ (src "https://example.com/iframe-src")))
(div (@ (class "reviews"))
(header "GameSpot Expert Reviews"))
(div (@ (data-test-ampersand) (class "mw-collapsible-content"))
(& ndash))))))
(define (updater wikiname #:strict-proxy? [strict-proxy? #f]) (define (updater wikiname #:strict-proxy? [strict-proxy? #f])
;; precompute wikiurl regex for efficiency ;; precompute wikiurl regex for efficiency
@ -157,7 +161,7 @@
(u (u
(λ (v) (has-class? "mw-collapsible-content" attributes)) (λ (v) (has-class? "mw-collapsible-content" attributes))
(λ (v) (for/list ([element v]) (λ (v) (for/list ([element v])
(u (λ (element) (pair? element)) (u (λ (element) (element-is-element? element))
(λ (element) (λ (element)
`(,(car element) `(,(car element)
(@ ,@(attribute-maybe-update 'style (λ (a) (regexp-replace #rx"display: *none" a "display:inline")) (bits->attributes element))) (@ ,@(attribute-maybe-update 'style (λ (a) (regexp-replace #rx"display: *none" a "display:inline")) (bits->attributes element)))
@ -238,6 +242,9 @@
[(list (list 'img _)) #t] [(list (list 'img _)) #t]
[_ #f])) [_ #f]))
return-no-element] return-no-element]
; remove gamespot reviews/ads
[(has-class? "reviews" attributes)
return-no-element]
[#t [#t
(list element-type (list element-type
;; attributes ;; attributes
@ -297,6 +304,12 @@
"/proxy?dest=https%3A%2F%2Fstatic.wikia.nocookie.net%2Fnice-image.png") "/proxy?dest=https%3A%2F%2Fstatic.wikia.nocookie.net%2Fnice-image.png")
; check that noscript images are removed ; check that noscript images are removed
(check-equal? ((query-selector (λ (t a c) (eq? t 'noscript)) transformed)) #f) (check-equal? ((query-selector (λ (t a c) (eq? t 'noscript)) transformed)) #f)
; check that gamespot reviews/ads are removed
(check-equal? ((query-selector (λ (t a c) (has-class? "reviews" a)) transformed)) #f)
; check that (& x) sequences are not broken
(check-equal? ((query-selector (λ (t a c) (dict-has-key? a 'data-test-ampersand)) transformed))
'(div (@ (data-test-ampersand) (class "mw-collapsible-content"))
(& ndash)))
; benchmark ; benchmark
(when (file-exists? "../storage/Frog.html") (when (file-exists? "../storage/Frog.html")
(with-input-from-file "../storage/Frog.html" (with-input-from-file "../storage/Frog.html"

View file

@ -1,5 +1,6 @@
#lang typed/racket/base #lang typed/racket/base
(require racket/string (require racket/string
typed/net/url-structs
"pure-utils.rkt") "pure-utils.rkt")
(require/typed web-server/http/request-structs (require/typed web-server/http/request-structs
[#:opaque Header header?]) [#:opaque Header header?])
@ -20,7 +21,10 @@
; pass in a header, headers, or something useless. they'll all combine into a list ; pass in a header, headers, or something useless. they'll all combine into a list
build-headers build-headers
; try to follow wikimedia's format for which characters should be encoded/replaced in page titles for the url ; try to follow wikimedia's format for which characters should be encoded/replaced in page titles for the url
page-title->path) page-title->path
; path/param eats semicolons into params, which need to be fixed back into semicolons
fix-semicolons-url-path
fix-semicolons-url)
(module+ test (module+ test
(require "typed-rackunit.rkt")) (require "typed-rackunit.rkt"))
@ -106,3 +110,20 @@
(: page-title->path (String -> Bytes)) (: page-title->path (String -> Bytes))
(define (page-title->path title) (define (page-title->path title)
(percent-encode (regexp-replace* " " title "_") path-set #f)) (percent-encode (regexp-replace* " " title "_") path-set #f))
;; Undo the semicolon splitting done by path/param: every segment that carries
;; params gets them glued back onto its path text with ";" and is rebuilt with
;; an empty param list. Segments that have no params, or whose path is a symbol
;; rather than a string, are returned untouched.
(: fix-semicolons-url-path ((Listof Path/Param) -> (Listof Path/Param)))
(define (fix-semicolons-url-path pps)
  (map (λ ([segment : Path/Param])
         (define text (path/param-path segment))
         (define params (path/param-param segment))
         (cond
           ;; a symbol path cannot be string-appended, so leave it alone
           [(symbol? text) segment]
           ;; nothing was split off this segment
           [(null? params) segment]
           [else
            ;; re-attach each param after the path, separated by semicolons
            (path/param (string-append text ";" (string-join params ";"))
                        null)]))
       pps))
;; Return a copy of orig-url whose path has been passed through
;; fix-semicolons-url-path; every other field is carried over by struct-copy.
(: fix-semicolons-url (URL -> URL))
(define (fix-semicolons-url orig-url)
  (define repaired-path (fix-semicolons-url-path (url-path orig-url)))
  (struct-copy url orig-url [path repaired-path]))

View file

@ -86,15 +86,16 @@
; "element" is a real element with a type and everything (non-string, non-attributes) ; "element" is a real element with a type and everything (non-string, non-attributes)
(define (element-is-element? element) (define (element-is-element? element)
(and (element-is-bits? element) (not (element-is-xattributes? element)))) (and (element-is-bits? element) (not (eq? (car element) '&))(not (element-is-xattributes? element))))
(module+ test (module+ test
(check-true (element-is-element? '(span "hi"))) (check-true (element-is-element? '(span "hi")))
(check-false (element-is-element? '(@ (alt "Cute cat.")))) (check-false (element-is-element? '(@ (alt "Cute cat."))))
(check-false (element-is-element? "hi"))) (check-false (element-is-element? "hi"))
(check-false (element-is-element? '(& ndash))))
; "element content" is a real element or a string ; "element content" is a real element or a string or a (& x) sequence
(define (element-is-content? element) (define (element-is-content? element)
(or (string? element) (element-is-element? element))) (or (string? element) (element-is-element? element) (and (pair? element) (eq? (car element) '&))))
(module+ test (module+ test
(check-true (element-is-content? '(span "hi"))) (check-true (element-is-content? '(span "hi")))
(check-false (element-is-content? '(@ (alt "Cute cat.")))) (check-false (element-is-content? '(@ (alt "Cute cat."))))

View file

@ -59,16 +59,5 @@
(make-semicolon-fixer-dispatcher tree)) (make-semicolon-fixer-dispatcher tree))
(define ((make-semicolon-fixer-dispatcher orig-dispatcher) conn orig-req) (define ((make-semicolon-fixer-dispatcher orig-dispatcher) conn orig-req)
(define orig-uri (request-uri orig-req)) (define new-req (struct-copy request orig-req [uri (fix-semicolons-url (request-uri orig-req))]))
(define pps (url-path orig-uri)) ; list of path/param structs
(define new-path
(for/list ([pp pps])
(if (null? (path/param-param pp))
pp
;; path/param does have params, which need to be fixed into a semicolon.
(path/param
(string-append (path/param-path pp) ";" (string-join (path/param-param pp) ";"))
null))))
(define new-uri (struct-copy url orig-uri [path new-path]))
(define new-req (struct-copy request orig-req [uri new-uri]))
(orig-dispatcher conn new-req)) (orig-dispatcher conn new-req))

View file

@ -28,28 +28,35 @@
(define search-json-data (define search-json-data
'#hasheq((batchcomplete . #t) (query . #hasheq((search . (#hasheq((ns . 0) (pageid . 219) (size . 1482) (snippet . "") (timestamp . "2022-08-21T08:54:23Z") (title . "Gacha Capsule") (wordcount . 214)) #hasheq((ns . 0) (pageid . 201) (size . 1198) (snippet . "") (timestamp . "2022-07-11T17:52:47Z") (title . "Badges") (wordcount . 181))))))))) '#hasheq((batchcomplete . #t) (query . #hasheq((search . (#hasheq((ns . 0) (pageid . 219) (size . 1482) (snippet . "") (timestamp . "2022-08-21T08:54:23Z") (title . "Gacha Capsule") (wordcount . 214)) #hasheq((ns . 0) (pageid . 201) (size . 1198) (snippet . "") (timestamp . "2022-07-11T17:52:47Z") (title . "Badges") (wordcount . 181)))))))))
;; this takes the info we gathered from fandom and makes the big fat x-expression page
(define (generate-results-page req dest-url wikiname query data #:siteinfo [siteinfo #f]) (define (generate-results-page req dest-url wikiname query data #:siteinfo [siteinfo #f])
(define search-results (jp "/query/search" data)) (define search-results (jp "/query/search" data))
;; this is *another* helper that builds the wiki page UI and lets me put the search results (or whatever else) in the middle
(generate-wiki-page (generate-wiki-page
;; so I provide my helper function with the necessary context...
#:req req #:req req
#:source-url dest-url #:source-url dest-url
#:wikiname wikiname #:wikiname wikiname
#:title query #:title query
#:siteinfo siteinfo #:siteinfo siteinfo
;; and here's the actual results to display in the wiki page layout
`(div (@ (class "mw-parser-output")) `(div (@ (class "mw-parser-output"))
;; header before the search results showing how many we found
(p ,(format "~a results found for " (length search-results)) (p ,(format "~a results found for " (length search-results))
(strong ,query)) (strong ,query))
;; *u*nordered *l*ist of matching search results
(ul ,@(map (ul ,@(map
(λ (result) (λ (result) ;; for each result, run this code...
(let* ([title (jp "/title" result)] (let* ([title (jp "/title" result)]
[page-path (page-title->path title)] [page-path (page-title->path title)]
[timestamp (jp "/timestamp" result)] [timestamp (jp "/timestamp" result)]
[wordcount (jp "/wordcount" result)] [wordcount (jp "/wordcount" result)]
[size (jp "/size" result)]) [size (jp "/size" result)])
;; and make this x-expression...
`(li (@ (class "my-result")) `(li (@ (class "my-result"))
(a (@ (class "my-result__link") (href ,(format "/~a/wiki/~a" wikiname page-path))) (a (@ (class "my-result__link") (href ,(format "/~a/wiki/~a" wikiname page-path))) ; using unquote to insert the result page URL
,title) ,title) ; using unquote to insert the result page title
(div (@ (class "my-result__info")) (div (@ (class "my-result__info")) ; constructing the line under the search result
"last edited " "last edited "
(time (@ (datetime ,timestamp)) ,(list-ref (string-split timestamp "T") 0)) (time (@ (datetime ,timestamp)) ,(list-ref (string-split timestamp "T") 0))
,(format ", ~a words, ~a kb" ,(format ", ~a words, ~a kb"
@ -57,13 +64,18 @@
(exact->inexact (/ (round (/ size 100)) 10))))))) (exact->inexact (/ (round (/ size 100)) 10)))))))
search-results))))) search-results)))))
;; will be called when the web browser asks to load the page
(define (page-search req) (define (page-search req)
;; this just means, catch any errors and display them in the browser. it's a function somewhere else
(response-handler (response-handler
;; the URL will look like "/minecraft/wiki/Special:Search?q=Spawner"
;; grab the first part to use as the wikiname, in this case, "minecraft"
(define wikiname (path/param-path (first (url-path (request-uri req))))) (define wikiname (path/param-path (first (url-path (request-uri req)))))
;; grab the part after ?q= which is the search terms
(define query (dict-ref (url-query (request-uri req)) 'q #f)) (define query (dict-ref (url-query (request-uri req)) 'q #f))
;; constructing the URL where I want to get fandom data from...
(define origin (format "https://~a.fandom.com" wikiname)) (define origin (format "https://~a.fandom.com" wikiname))
(when (config-true? 'feature_offline::only) ;; the dest-URL will look something like https://minecraft.fandom.com/api.php?action=query&list=search&srsearch=Spawner&formatversion=2&format=json
(raise-user-error "Full search is currently not available on breezewiki.com - for now, please use the pop-up search suggestions or wait for me to fix it! Thanks <3"))
(define dest-url (define dest-url
(format "~a/api.php?~a" (format "~a/api.php?~a"
origin origin
@ -73,21 +85,26 @@
("formatversion" . "2") ("formatversion" . "2")
("format" . "json"))))) ("format" . "json")))))
;; simultaneously get the search results from the fandom API, as well as information about the wiki as a whole (its license, icon, name)
(define-values (dest-res siteinfo) (define-values (dest-res siteinfo)
(thread-values (thread-values
(λ () (λ ()
(log-outgoing dest-url) (log-outgoing dest-url)
(easy:get dest-url #:timeouts timeouts)) (easy:get dest-url #:timeouts timeouts)) ;; HTTP request to dest-url for search results
(λ () (λ ()
(siteinfo-fetch wikiname)))) (siteinfo-fetch wikiname)))) ;; helper function in another file to get information about the wiki
;; search results are a JSON string. parse JSON into racket data structures
(define data (easy:response-json dest-res)) (define data (easy:response-json dest-res))
;; calling my generate-results-page function with the information so far in order to get a big fat x-expression
;; big fat x-expression goes into the body variable
(define body (generate-results-page req dest-url wikiname query data #:siteinfo siteinfo)) (define body (generate-results-page req dest-url wikiname query data #:siteinfo siteinfo))
;; error checking
(when (config-true? 'debug) (when (config-true? 'debug)
; used for its side effects ; used for its side effects
; convert to string with error checking, error will be raised if xexp is invalid ; convert to string with error checking, error will be raised if xexp is invalid
(xexp->html body)) (xexp->html body))
;; convert body to HTML and send to browser
(response/output (response/output
#:code 200 #:code 200
#:headers (build-headers always-headers) #:headers (build-headers always-headers)