Add user agent and detect blocked pages

This commit is contained in:
Cadence Ember 2024-10-23 22:52:00 +13:00
parent 97c4e54f38
commit 443f1eecbc
2 changed files with 59 additions and 9 deletions

View file

@ -1,12 +1,16 @@
#lang typed/racket/base #lang typed/racket/base
(require "config.rkt" (require racket/format
racket/string
"config.rkt"
"../lib/url-utils.rkt") "../lib/url-utils.rkt")
(define-type Headers (HashTable Symbol (U Bytes String))) (define-type Headers (HashTable Symbol (U Bytes String)))
(require/typed net/http-easy (require/typed net/http-easy
[#:opaque Timeout-Config timeout-config?] [#:opaque Timeout-Config timeout-config?]
[#:opaque Response response?] [#:opaque Response response?]
[#:opaque Session session?] [#:opaque Session session?]
[response-status-code (Response -> Natural)]
[current-session (Parameter Session)] [current-session (Parameter Session)]
[current-user-agent (Parameter (U Bytes String))]
[make-timeout-config ([#:lease Positive-Real] [#:connect Positive-Real] -> Timeout-Config)] [make-timeout-config ([#:lease Positive-Real] [#:connect Positive-Real] -> Timeout-Config)]
[get ((U Bytes String) [get ((U Bytes String)
[#:close? Boolean] [#:close? Boolean]
@ -22,19 +26,41 @@
fandom-get-api fandom-get-api
timeouts) timeouts)
;; Identify outgoing requests as BreezeWiki by rewriting net/http-easy's
;; current-user-agent parameter at module load.
;; The guard skips the rewrite when the current value already contains
;; "BreezeWiki", so the prefix is not added a second time.
(unless (string-contains? (~a (current-user-agent)) "BreezeWiki")
(current-user-agent
;; Resulting shape: "BreezeWiki/1.0 (<origin>) <previous user agent>"
(format "BreezeWiki/1.0 (~a) ~a"
;; <origin> is the configured canonical_origin when
;; (config-true? 'canonical_origin) holds, otherwise the literal "local".
(if (config-true? 'canonical_origin)
(config-get 'canonical_origin)
"local")
(current-user-agent))))
;; Timeout configuration (#:lease 5, #:connect 5) passed as #:timeouts to
;; `get` in fandom-get.
(define timeouts (make-timeout-config #:lease 5 #:connect 5)) (define timeouts (make-timeout-config #:lease 5 #:connect 5))
;; Module-level state for remembering a blocked response from Fandom.
;; last-failure: value of (current-inexact-milliseconds) at the moment
;; fandom-get last saw a 403 or 406 status; 0.0 until that happens.
(: last-failure Flonum)
(define last-failure 0.0)
;; stored-failure: the Response that carried that 403/406 status, or #f when
;; none has been recorded yet.
(: stored-failure (Option Response))
(define stored-failure #f)
;; How long a stored failure keeps being returned by fandom-get. It is added to
;; last-failure and compared with (current-inexact-milliseconds), so the value
;; is in milliseconds.
(define failure-persist-time 30000)
;; Empty header table used by fandom-get when the caller passes no #:headers.
(: no-headers Headers) (: no-headers Headers)
(define no-headers '#hasheq()) (define no-headers '#hasheq())
;; fandom-get: fetch `path` for the wiki `wikiname` and return the Response.
;;   wikiname   - used to build the Host header "<wikiname>.fandom.com"
;;   path       - appended to "https://www.fandom.com" to form the request URL
;;   #:headers  - optional extra headers; the Host header is set on top of them
;; If a 403/406 response was recorded less than failure-persist-time
;; milliseconds ago, that stored response is returned and no request is made.
;; NOTE(review): the stored failure is a single module-level value, so it is
;; returned for every wikiname/path during that window - confirm this is the
;; intended scope.
(: fandom-get (String String [#:headers (Option Headers)] -> Response)) (: fandom-get (String String [#:headers (Option Headers)] -> Response))
(define (fandom-get wikiname path #:headers [headers #f]) (define (fandom-get wikiname path #:headers [headers #f])
(or
;; Still inside the failure window: yields stored-failure. While
;; stored-failure is #f this is false and `or` falls through to the request.
(and ((current-inexact-milliseconds) . < . (+ last-failure failure-persist-time)) stored-failure)
(let ()
(define dest-url (string-append "https://www.fandom.com" path)) (define dest-url (string-append "https://www.fandom.com" path))
(define host (string-append wikiname ".fandom.com")) (define host (string-append wikiname ".fandom.com"))
(log-outgoing wikiname path) (log-outgoing wikiname path)
(define res
(get dest-url (get dest-url
#:timeouts timeouts #:timeouts timeouts
#:headers (hash-set (or headers no-headers) 'Host host))) #:headers (hash-set (or headers no-headers) 'Host host)))
;; A 403 or 406 status is recorded with its timestamp so later calls can
;; return it without contacting Fandom again.
(when (memq (response-status-code res) '(403 406))
(set! last-failure (current-inexact-milliseconds))
(set! stored-failure res))
res)))
(: fandom-get-api (String (Listof (Pair String String)) [#:headers (Option Headers)] -> Response)) (: fandom-get-api (String (Listof (Pair String String)) [#:headers (Option Headers)] -> Response))
(define (fandom-get-api wikiname params #:headers [headers #f]) (define (fandom-get-api wikiname params #:headers [headers #f])

View file

@ -18,6 +18,7 @@
"config.rkt" "config.rkt"
"data.rkt" "data.rkt"
"fandom-request.rkt" "fandom-request.rkt"
"../lib/archive-file-mappings.rkt"
"../lib/pure-utils.rkt" "../lib/pure-utils.rkt"
"../lib/syntax.rkt" "../lib/syntax.rkt"
"../lib/thread-utils.rkt" "../lib/thread-utils.rkt"
@ -37,8 +38,9 @@
(define (page-wiki req) (define (page-wiki req)
(define wikiname (path/param-path (first (url-path (request-uri req))))) (define wikiname (path/param-path (first (url-path (request-uri req)))))
(define segments (map path/param-path (cdr (url-path (request-uri req)))))
(define user-cookies (user-cookies-getter req)) (define user-cookies (user-cookies-getter req))
(define path (string-join (map path/param-path (cddr (url-path (request-uri req)))) "/")) (define path (string-join (cdr segments) "/"))
(define source-url (format "https://~a.fandom.com/wiki/~a" wikiname path)) (define source-url (format "https://~a.fandom.com/wiki/~a" wikiname path))
(define-values (dest-res siteinfo) (define-values (dest-res siteinfo)
@ -101,9 +103,31 @@
(write-html body out))))))] (write-html body out))))))]
[(eq? 404 (easy:response-status-code dest-res)) [(eq? 404 (easy:response-status-code dest-res))
(next-dispatcher)] (next-dispatcher)]
[(memq (easy:response-status-code dest-res) '(403 406))
(response-handler
(define body
(generate-wiki-page
`(div
(p "Sorry! Fandom isn't allowing BreezeWiki to show pages right now.")
(p "We'll automatically try again in 30 seconds, so please stay on this page and be patient.")
(p (small "In a hurry? " (a (@ (href ,source-url)) "Click here to read the page on Fandom."))))
#:req req
#:source-url source-url
#:wikiname wikiname
#:title (url-segments->guess-title segments)
#:siteinfo siteinfo))
(response/output
#:code 503
#:headers (build-headers
always-headers
(header #"Retry-After" #"30")
(header #"Cache-Control" #"max-age=30, public")
(header #"Refresh" #"35"))
(λ (out)
(write-html body out))))]
[else [else
(response-handler (response-handler
(error 'page-wiki "Tried to load page ~a/~v~nSadly, the page didn't load because Fandom returned status code ~a with response:~n~a" (error 'page-wiki "Tried to load page ~a/~a~nSadly, the page didn't load because Fandom returned status code ~a with response:~n~a"
wikiname wikiname
path path
(easy:response-status-code dest-res) (easy:response-status-code dest-res)