breezewiki/lib/url-utils.rkt

130 lines
4.7 KiB
Racket
Raw Permalink Normal View History

#lang typed/racket/base
(require racket/string
2023-05-27 11:37:45 +00:00
typed/net/url-structs
"pure-utils.rkt")
(require/typed web-server/http/request-structs
[#:opaque Header header?])
(provide
2022-10-02 09:44:44 +00:00
; regex to match wiki names
px-wikiname
; make a query string from an association list of strings
params->query
; custom percent encoding (you probably want params->query instead)
percent-encode
; sets for custom percent encoding
path-set urlencoded-set filename-set
; make a proxied version of a fandom url
u-proxy-url
; check whether a url is on a domain controlled by fandom
is-fandom-url?
; pass in a header, headers, or something useless. they'll all combine into a list
build-headers
; try to follow wikimedia's format for which characters should be encoded/replaced in page titles for the url
2023-05-27 11:37:45 +00:00
page-title->path
; path/param eats semicolons into params, which need to be fixed back into semicolons
fix-semicolons-url-path
fix-semicolons-url)
(module+ test
(require "typed-rackunit.rkt"))
(define px-wikiname "[a-zA-Z0-9-]{1,50}")
2022-10-02 09:44:44 +00:00
;; https://url.spec.whatwg.org/#urlencoded-serializing
(define path-set '(#\; ; semicolon is part of the userinfo set in the URL standard, but I'm putting it here
#\? #\` #\{ #\} ; path set
#\ #\" #\# #\< #\> ; query set
; c0 controls included elsewhere
; higher ranges included elsewhere
))
(define urlencoded-set (append
'(#\! #\' #\( #\) #\~ ; urlencoded set
#\$ #\% #\& #\+ #\, ; component set
#\/ #\: #\= #\@ #\[ #\\ #\] #\^ #\| ; userinfo set
)
path-set))
(define filename-set '(#\< #\> #\: #\" #\/ #\\ #\| #\? #\* #\# #\~ #\&))
(: percent-encode (String (Listof Char) Boolean -> Bytes))
(define (percent-encode value set space-as-plus)
(define b (string->bytes/utf-8 value))
(apply bytes-append
(for/list ([char b]) : (Listof Bytes)
(cond
[(and space-as-plus (eq? char 32))
#"+"]
[(or (member (integer->char char) set)
(char . > . #x7E)
(char . <= . #x1F))
(bytes-append #"%" (string->bytes/latin-1
(string-upcase (number->string char 16))))]
[#t
(bytes char)]))))
(: params->query ((Listof (Pair String String)) -> String))
(define (params->query params)
(string-join
(map (λ ([p : (Pair String String)])
(format "~a=~a"
(percent-encode (car p) urlencoded-set #t)
(percent-encode (cdr p) urlencoded-set #t)))
params)
"&"))
(module+ test
(check-equal? (params->query '(("hello" . "world")))
"hello=world")
(check-equal? (params->query '(("a" . "hello world''") ("utf8" . "")))
"a=hello+world%27%27&utf8=%E2%9C%93"))
(: is-fandom-url? (String -> Boolean))
(define (is-fandom-url? url)
(regexp-match? (pregexp (format "^https://static\\.wikia\\.nocookie\\.net/|^https://~a\\.fandom\\.com/" px-wikiname)) url))
(module+ test
(check-true (is-fandom-url? "https://static.wikia.nocookie.net/wikiname/images/2/2f/SomeImage.jpg/revision/latest?cb=20110210094136"))
(check-true (is-fandom-url? "https://test.fandom.com/wiki/Some_Page"))
(check-false (is-fandom-url? "https://cadence.moe")))
(: u-proxy-url (String -> String))
(define (u-proxy-url url)
(u
is-fandom-url?
(λ ([v : String]) (string-append "/proxy?" (params->query `(("dest" . ,url)))))
url))
(: build-headers ((U Header (Listof Header) False Void) * -> (Listof Header)))
(define (build-headers . fs)
(apply
append
(map (λ ([f : (U Header (Listof Header) False Void)])
(cond
[(not f) null]
[(void? f) null]
[(null? f) null]
[(header? f) (list f)]
[(pair? f) f]))
fs)))
(: page-title->path (String -> Bytes))
(define (page-title->path title)
(percent-encode (regexp-replace* " " title "_") path-set #f))
2023-05-27 11:37:45 +00:00
(: fix-semicolons-url-path ((Listof Path/Param) -> (Listof Path/Param)))
(define (fix-semicolons-url-path pps)
(for/list ([pp pps])
(define path (path/param-path pp))
(if (or (null? (path/param-param pp))
(symbol? path))
pp
;; path/param does have params, which need to be fixed into a semicolon.
(path/param
(string-append path ";" (string-join (path/param-param pp) ";"))
null))))
(: fix-semicolons-url (URL -> URL))
(define (fix-semicolons-url orig-url)
(struct-copy url orig-url [path (fix-semicolons-url-path (url-path orig-url))]))