From 92591a5eab3653ad3c742288cb040d320199f599 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Thu, 17 Nov 2022 23:25:06 +1300 Subject: [PATCH] Fix some special characters in titles like ? and ; --- src/page-category.rkt | 2 +- src/page-home.rkt | 8 ++++---- src/page-search.rkt | 2 +- src/url-utils.rkt | 28 +++++++++++++++++++--------- 4 files changed, 25 insertions(+), 15 deletions(-) diff --git a/src/page-category.rkt b/src/page-category.rkt index 89bc45a0..69d9f427 100644 --- a/src/page-category.rkt +++ b/src/page-category.rkt @@ -52,7 +52,7 @@ ,@(map (λ (result) (define title (jp "/title" result)) - (define page-path (regexp-replace* #rx" " title "_")) + (define page-path (page-title->path title)) `(li (a (@ (href ,(format "/~a/wiki/~a" wikiname page-path))) ,title))) diff --git a/src/page-home.rkt b/src/page-home.rkt index 6037d9a8..23ede8b4 100644 --- a/src/page-home.rkt +++ b/src/page-home.rkt @@ -18,9 +18,9 @@ (define examples '(("minecraft" "Bricks") - ("crosscode" "CrossCode_Wiki") - ("undertale" "Hot_Dog...%3F") - ("tardis" "Eleanor_Blake") + ("crosscode" "CrossCode Wiki") + ("undertale" "Hot Dog...?") + ("tardis" "Eleanor Blake") ("zelda" "Boomerang"))) (define content @@ -45,7 +45,7 @@ (h2 "Example pages") (ul ,@(map (λ (x) - `(li (a (@ (href ,(apply format "/~a/wiki/~a" x))) + `(li (a (@ (href ,(format "/~a/wiki/~a" (car x) (page-title->path (cadr x))))) ,(apply format "~a: ~a" x)))) examples)) (h2 "Testimonials") diff --git a/src/page-search.rkt b/src/page-search.rkt index f4d1ce36..04042333 100644 --- a/src/page-search.rkt +++ b/src/page-search.rkt @@ -38,7 +38,7 @@ (ul ,@(map (λ (result) (let* ([title (jp "/title" result)] - [page-path (regexp-replace* #rx" " title "_")] + [page-path (page-title->path title)] [timestamp (jp "/timestamp" result)] [wordcount (jp "/wordcount" result)] [size (jp "/size" result)]) diff --git a/src/url-utils.rkt b/src/url-utils.rkt index b70b245f..1df933c5 100644 --- a/src/url-utils.rkt +++ b/src/url-utils.rkt @@ -17,7 +17,9 @@ ; prints "out: " log-outgoing ; pass in a header, headers, or something useless. they'll all combine into a list - build-headers) + build-headers + ; try to follow wikimedia's format for which characters should be encoded/replaced in page titles for the url + page-title->path) (module+ test (require "typed-rackunit.rkt")) @@ -26,14 +28,18 @@ ;; https://url.spec.whatwg.org/#urlencoded-serializing -(define urlencoded-set '(#\! #\' #\( #\) #\~ ; urlencoded set - #\$ #\% #\& #\+ #\, ; component set - #\/ #\: #\; #\= #\@ #\[ #\\ #\] #\^ #\| ; userinfo set - #\? #\` #\{ #\} ; path set - #\ #\" #\# #\< #\> ; query set - ; c0 controls included elsewhere - ; higher ranges included elsewhere - )) +(define path-set '(#\; ; semicolon is part of the userinfo set in the URL standard, but I'm putting it here + #\? #\` #\{ #\} ; path set + #\ #\" #\# #\< #\> ; query set + ; c0 controls included elsewhere + ; higher ranges included elsewhere + )) +(define urlencoded-set (append + '(#\! #\' #\( #\) #\~ ; urlencoded set + #\$ #\% #\& #\+ #\, ; component set + #\/ #\: #\= #\@ #\[ #\\ #\] #\^ #\| ; userinfo set + ) + path-set)) (: percent-encode (String (Listof Char) Boolean -> Bytes)) (define (percent-encode value set space-as-plus) @@ -98,3 +104,7 @@ [(header? f) (list f)] [(pair? f) f])) fs))) + +(: page-title->path (String -> Bytes)) +(define (page-title->path title) + (percent-encode (regexp-replace* " " title "_") path-set #f))