Add user agent and detect blocked pages

This commit is contained in:
Cadence Ember 2024-10-23 22:52:00 +13:00
parent 97c4e54f38
commit 443f1eecbc
2 changed files with 59 additions and 9 deletions

View file

@ -1,12 +1,16 @@
#lang typed/racket/base
(require "config.rkt"
(require racket/format
racket/string
"config.rkt"
"../lib/url-utils.rkt")
(define-type Headers (HashTable Symbol (U Bytes String)))
(require/typed net/http-easy
[#:opaque Timeout-Config timeout-config?]
[#:opaque Response response?]
[#:opaque Session session?]
[response-status-code (Response -> Natural)]
[current-session (Parameter Session)]
[current-user-agent (Parameter (U Bytes String))]
[make-timeout-config ([#:lease Positive-Real] [#:connect Positive-Real] -> Timeout-Config)]
[get ((U Bytes String)
[#:close? Boolean]
@ -22,19 +26,41 @@
fandom-get-api
timeouts)
;; Tag outgoing requests with a BreezeWiki user agent, once.
;; If the current user agent already mentions "BreezeWiki" nothing is changed;
;; otherwise the new value is "BreezeWiki/1.0 (<origin>) <previous agent>", where
;; <origin> is the configured canonical_origin, or "local" when that option is not set.
(unless (string-contains? (~a (current-user-agent)) "BreezeWiki")
  (let ([origin-label (if (config-true? 'canonical_origin)
                          (config-get 'canonical_origin)
                          "local")]
        [previous-agent (current-user-agent)])
    (current-user-agent
     (format "BreezeWiki/1.0 (~a) ~a" origin-label previous-agent))))
;; Timeout configuration passed to every `get` call made by `fandom-get`.
;; NOTE(review): the 5s are presumably seconds, per net/http-easy's make-timeout-config — confirm.
(define timeouts (make-timeout-config #:lease 5 #:connect 5))
;; Timestamp (from `current-inexact-milliseconds`) of the most recent 403/406 response
;; seen by `fandom-get`. Starts at 0.0, i.e. no failure recorded yet.
(: last-failure Flonum)
(define last-failure 0.0)
;; The response object recorded together with `last-failure`; #f until a 403/406 has been seen.
(: stored-failure (Option Response))
(define stored-failure #f)
;; How long, in milliseconds, `fandom-get` keeps returning `stored-failure` instead of
;; making a new request (it is added to `last-failure` and compared against
;; `current-inexact-milliseconds`).
(define failure-persist-time 30000)
;; Empty header table used by `fandom-get` when the caller passes no #:headers.
(: no-headers Headers)
(define no-headers '#hasheq())
;; Request `path` for the wiki `wikiname` and return the response.
;;
;; The request goes to https://www.fandom.com<path> with the Host header set to
;; <wikiname>.fandom.com; any caller-supplied #:headers are kept, except that
;; 'Host is always overwritten.
;;
;; A 403 or 406 response is remembered in `last-failure` / `stored-failure`. While the
;; current time is within `failure-persist-time` milliseconds of that failure, the
;; remembered response is returned directly and no request is made or logged.
(: fandom-get (String String [#:headers (Option Headers)] -> Response))
(define (fandom-get wikiname path #:headers [headers #f])
  ;; #f unless a failure was recorded recently enough to still be reused.
  (define cached-failure
    (and (< (current-inexact-milliseconds)
            (+ last-failure failure-persist-time))
         stored-failure))
  (cond
    [cached-failure cached-failure]
    [else
     (define target-url (string-append "https://www.fandom.com" path))
     (define host-header (string-append wikiname ".fandom.com"))
     (log-outgoing wikiname path)
     (define fresh-res
       (get target-url
            #:timeouts timeouts
            #:headers (hash-set (or headers no-headers) 'Host host-header)))
     (define status (response-status-code fresh-res))
     ;; Record the failure so later calls can reuse it without contacting Fandom.
     (when (memq status '(403 406))
       (set! last-failure (current-inexact-milliseconds))
       (set! stored-failure fresh-res))
     fresh-res]))
(: fandom-get-api (String (Listof (Pair String String)) [#:headers (Option Headers)] -> Response))
(define (fandom-get-api wikiname params #:headers [headers #f])

View file

@ -18,6 +18,7 @@
"config.rkt"
"data.rkt"
"fandom-request.rkt"
"../lib/archive-file-mappings.rkt"
"../lib/pure-utils.rkt"
"../lib/syntax.rkt"
"../lib/thread-utils.rkt"
@ -37,8 +38,9 @@
(define (page-wiki req)
(define wikiname (path/param-path (first (url-path (request-uri req)))))
(define segments (map path/param-path (cdr (url-path (request-uri req)))))
(define user-cookies (user-cookies-getter req))
(define path (string-join (map path/param-path (cddr (url-path (request-uri req)))) "/"))
(define path (string-join (cdr segments) "/"))
(define source-url (format "https://~a.fandom.com/wiki/~a" wikiname path))
(define-values (dest-res siteinfo)
@ -101,9 +103,31 @@
(write-html body out))))))]
[(eq? 404 (easy:response-status-code dest-res))
(next-dispatcher)]
[(memq (easy:response-status-code dest-res) '(403 406))
(response-handler
(define body
(generate-wiki-page
`(div
(p "Sorry! Fandom isn't allowing BreezeWiki to show pages right now.")
(p "We'll automatically try again in 30 seconds, so please stay on this page and be patient.")
(p (small "In a hurry? " (a (@ (href ,source-url)) "Click here to read the page on Fandom."))))
#:req req
#:source-url source-url
#:wikiname wikiname
#:title (url-segments->guess-title segments)
#:siteinfo siteinfo))
(response/output
#:code 503
#:headers (build-headers
always-headers
(header #"Retry-After" #"30")
(header #"Cache-Control" #"max-age=30, public")
(header #"Refresh" #"35"))
(λ (out)
(write-html body out))))]
[else
(response-handler
(error 'page-wiki "Tried to load page ~a/~v~nSadly, the page didn't load because Fandom returned status code ~a with response:~n~a"
(error 'page-wiki "Tried to load page ~a/~a~nSadly, the page didn't load because Fandom returned status code ~a with response:~n~a"
wikiname
path
(easy:response-status-code dest-res)