From 443f1eecbc4c15c8038920027e62fccbdcb0bbe7 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Wed, 23 Oct 2024 22:52:00 +1300 Subject: [PATCH] Add user agent and detect blocked pages --- src/fandom-request.rkt | 40 +++++++++++++++++++++++++++++++++------- src/page-wiki.rkt | 28 ++++++++++++++++++++++++++-- 2 files changed, 59 insertions(+), 9 deletions(-) diff --git a/src/fandom-request.rkt b/src/fandom-request.rkt index 966eeee..c306b04 100644 --- a/src/fandom-request.rkt +++ b/src/fandom-request.rkt @@ -1,12 +1,16 @@ #lang typed/racket/base -(require "config.rkt" +(require racket/format + racket/string + "config.rkt" "../lib/url-utils.rkt") (define-type Headers (HashTable Symbol (U Bytes String))) (require/typed net/http-easy [#:opaque Timeout-Config timeout-config?] [#:opaque Response response?] [#:opaque Session session?] + [response-status-code (Response -> Natural)] [current-session (Parameter Session)] + [current-user-agent (Parameter (U Bytes String))] [make-timeout-config ([#:lease Positive-Real] [#:connect Positive-Real] -> Timeout-Config)] [get ((U Bytes String) [#:close? Boolean] @@ -22,19 +26,41 @@ fandom-get-api timeouts) +(unless (string-contains? (~a (current-user-agent)) "BreezeWiki") + (current-user-agent + (format "BreezeWiki/1.0 (~a) ~a" + (if (config-true? 'canonical_origin) + (config-get 'canonical_origin) + "local") + (current-user-agent)))) + (define timeouts (make-timeout-config #:lease 5 #:connect 5)) +(: last-failure Flonum) +(define last-failure 0.0) +(: stored-failure (Option Response)) +(define stored-failure #f) +(define failure-persist-time 30000) + (: no-headers Headers) (define no-headers '#hasheq()) (: fandom-get (String String [#:headers (Option Headers)] -> Response)) (define (fandom-get wikiname path #:headers [headers #f]) - (define dest-url (string-append "https://www.fandom.com" path)) - (define host (string-append wikiname ".fandom.com")) - (log-outgoing wikiname path) - (get dest-url - #:timeouts timeouts - #:headers (hash-set (or headers no-headers) 'Host host))) + (or + (and ((current-inexact-milliseconds) . < . (+ last-failure failure-persist-time)) stored-failure) + (let () + (define dest-url (string-append "https://www.fandom.com" path)) + (define host (string-append wikiname ".fandom.com")) + (log-outgoing wikiname path) + (define res + (get dest-url + #:timeouts timeouts + #:headers (hash-set (or headers no-headers) 'Host host))) + (when (memq (response-status-code res) '(403 406)) + (set! last-failure (current-inexact-milliseconds)) + (set! stored-failure res)) + res))) (: fandom-get-api (String (Listof (Pair String String)) [#:headers (Option Headers)] -> Response)) (define (fandom-get-api wikiname params #:headers [headers #f]) diff --git a/src/page-wiki.rkt b/src/page-wiki.rkt index f16792c..da63617 100644 --- a/src/page-wiki.rkt +++ b/src/page-wiki.rkt @@ -18,6 +18,7 @@ "config.rkt" "data.rkt" "fandom-request.rkt" + "../lib/archive-file-mappings.rkt" "../lib/pure-utils.rkt" "../lib/syntax.rkt" "../lib/thread-utils.rkt" @@ -37,8 +38,9 @@ (define (page-wiki req) (define wikiname (path/param-path (first (url-path (request-uri req))))) + (define segments (map path/param-path (cdr (url-path (request-uri req))))) (define user-cookies (user-cookies-getter req)) - (define path (string-join (map path/param-path (cddr (url-path (request-uri req)))) "/")) + (define path (string-join (cdr segments) "/")) (define source-url (format "https://~a.fandom.com/wiki/~a" wikiname path)) (define-values (dest-res siteinfo) @@ -101,9 +103,31 @@ (write-html body out))))))] [(eq? 404 (easy:response-status-code dest-res)) (next-dispatcher)] + [(memq (easy:response-status-code dest-res) '(403 406)) + (response-handler + (define body + (generate-wiki-page + `(div + (p "Sorry! Fandom isn't allowing BreezeWiki to show pages right now.") + (p "We'll automatically try again in 30 seconds, so please stay on this page and be patient.") + (p (small "In a hurry? " (a (@ (href ,source-url)) "Click here to read the page on Fandom.")))) + #:req req + #:source-url source-url + #:wikiname wikiname + #:title (url-segments->guess-title segments) + #:siteinfo siteinfo)) + (response/output + #:code 503 + #:headers (build-headers + always-headers + (header #"Retry-After" #"30") + (header #"Cache-Control" #"max-age=30, public") + (header #"Refresh" #"35")) + (λ (out) + (write-html body out))))] [else (response-handler - (error 'page-wiki "Tried to load page ~a/~v~nSadly, the page didn't load because Fandom returned status code ~a with response:~n~a" + (error 'page-wiki "Tried to load page ~a/~a~nSadly, the page didn't load because Fandom returned status code ~a with response:~n~a" wikiname path (easy:response-status-code dest-res)