From 2694eb724547970e364dc95a632dfe2374b6bf84 Mon Sep 17 00:00:00 2001 From: Artemis Everfree Date: Wed, 22 Mar 2023 21:44:40 -0700 Subject: [PATCH 01/58] use faster string split --- lib/xexpr-utils.rkt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/xexpr-utils.rkt b/lib/xexpr-utils.rkt index 018d8c3..b1538d6 100644 --- a/lib/xexpr-utils.rkt +++ b/lib/xexpr-utils.rkt @@ -190,7 +190,9 @@ '(body "Hey" (& nbsp) (a (@ (href "/")))))) (define (has-class? name attributes) - (and (member name (string-split (or (get-attribute 'class attributes) "") " ")) #t)) + ; splitting without specifying separator or splitting on #px"\\s+" makes + ; string-split use a faster whitespace-specialized implementation. + (and (member name (string-split (or (get-attribute 'class attributes) "") #px"\\s+")) #t)) (module+ test (check-true (has-class? "red" '((class "yellow red blue")))) (check-false (has-class? "red" '((class "yellow blue")))) From dd1b672c4d55d4b7ff560980feb5bf9756e750c7 Mon Sep 17 00:00:00 2001 From: Artemis Everfree Date: Wed, 22 Mar 2023 21:45:09 -0700 Subject: [PATCH 02/58] precompile regexp patterns --- lib/tree-updater.rkt | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/lib/tree-updater.rkt b/lib/tree-updater.rkt index 074c5a1..bf9f82c 100644 --- a/lib/tree-updater.rkt +++ b/lib/tree-updater.rkt @@ -100,6 +100,9 @@ (define (cardimage-class-updater c) (string-append c " bw-updated-cardtable-cardimage")) + ; precompute wikiurl regex for efficency + (define wikiurl-regex (pregexp (format "^https://(~a)\\.fandom\\.com(/wiki/.*)$" px-wikiname))) + (define attributes-updater (compose1 ; uncollapsing @@ -107,12 +110,15 @@ (λ (class) (string-join (classlist-updater (string-split class " ")) " "))) (curry attribute-maybe-update 'class class-updater) ; change links to stay on the same wiki - (curry attribute-maybe-update 'href - (λ (href) - ((compose1 - (λ (href) (regexp-replace #rx"^(/wiki/.*)" href (format "/~a\\1" 
wikiname))) - (λ (href) (regexp-replace (pregexp (format "^https://(~a)\\.fandom\\.com(/wiki/.*)" px-wikiname)) href "/\\1\\2"))) - href))) + (let + ; precompute wikiname replacement pattern + ([wiki-substitution (format "/~a\\1" wikiname)]) + (curry attribute-maybe-update 'href + (λ (href) + ((compose1 + (λ (href) (regexp-replace #rx"^(/wiki/.*)$" href wiki-substitution)) + (λ (href) (regexp-replace wikiurl-regex href "/\\1\\2"))) + href)))) ; add noreferrer to a.image (curry u (λ (v) (and #;(eq? element-type 'a) From 8274e6cf1fe2ed8df7190d1e48b3665fe1f51fb2 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Fri, 24 Mar 2023 22:54:46 +1300 Subject: [PATCH 03/58] Add RuneScape Classic redirect; merge RS category --- src/extwiki-data.rkt | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/extwiki-data.rkt b/src/extwiki-data.rkt index 82978b1..4852541 100644 --- a/src/extwiki-data.rkt +++ b/src/extwiki-data.rkt @@ -61,14 +61,6 @@ ("Browser Extension" . "https://runescape.wiki/w/RuneScape:Finding_the_wikis_with_ease#Extensions")) (λ (props) '())) - 'OSRS - (extwiki-group^ - "OSRS" - '(("Leaving Wikia" . "https://runescape.wiki/w/Forum:Leaving_Wikia") - ("In the media" . "https://kotaku.com/video-game-wikis-abandon-their-platform-after-year-of-p-1829401866") - ("Browser Extension" . "https://oldschool.runescape.wiki/w/RuneScape:Finding_the_wikis_with_ease#Extensions")) - (λ (props) '())) - 'empty (extwiki-group^ "Misc" @@ -382,13 +374,22 @@ (extwiki^ '("oldschoolrunescape") 'default - 'OSRS + 'RuneScape "Old School RuneScape Wiki" "https://oldschool.runescape.wiki/" "https://oldschool.runescape.wiki/images/Wiki.png" (λ (props) `((p "The Old School RuneScape Wiki was founded on February 14, 2013. 
In October 2018, the RuneScape Wiki left Fandom (then Wikia), citing their apathy towards the wiki and excessive advertisements, with the Old School RuneScape Wiki following suit.")))) + (extwiki^ + '("runescapeclassic") 'default + 'RuneScape + "RuneScape Classic Wiki" + "https://classic.runescape.wiki/" + "https://classic.runescape.wiki/images/Wiki.png" + (λ (props) + `((p "The RuneScape Classic Wiki was founded on April 19, 2009. In October 2018, the RuneScape Wiki left Fandom (then Wikia), citing their apathy towards the wiki and excessive advertisements, with the RuneScape Classic Wiki following suit.")))) + (extwiki^ '("astroneer") 'default 'Astroneer From 3f1946a3b8b061b7f8a03e1681471212ff79ac71 Mon Sep 17 00:00:00 2001 From: Artemis Everfree Date: Wed, 22 Mar 2023 21:44:40 -0700 Subject: [PATCH 04/58] use faster string split --- lib/xexpr-utils.rkt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/xexpr-utils.rkt b/lib/xexpr-utils.rkt index 018d8c3..cb40510 100644 --- a/lib/xexpr-utils.rkt +++ b/lib/xexpr-utils.rkt @@ -190,7 +190,9 @@ '(body "Hey" (& nbsp) (a (@ (href "/")))))) (define (has-class? name attributes) - (and (member name (string-split (or (get-attribute 'class attributes) "") " ")) #t)) + ;; splitting without specifying separator or splitting on #px"\\s+" makes + ;; string-split use a faster whitespace-specialized implementation. + (and (member name (string-split (or (get-attribute 'class attributes) "") #px"\\s+")) #t)) (module+ test (check-true (has-class? "red" '((class "yellow red blue")))) (check-false (has-class? 
"red" '((class "yellow blue")))) From f5529ed12a3b6de478e30fdfbfe74ed787f65234 Mon Sep 17 00:00:00 2001 From: Artemis Everfree Date: Wed, 22 Mar 2023 21:45:09 -0700 Subject: [PATCH 05/58] precompile regexp patterns --- lib/tree-updater.rkt | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/lib/tree-updater.rkt b/lib/tree-updater.rkt index 074c5a1..6c369b5 100644 --- a/lib/tree-updater.rkt +++ b/lib/tree-updater.rkt @@ -66,6 +66,11 @@ (iframe (@ (src "https://example.com/iframe-src"))))))) (define (updater wikiname #:strict-proxy? [strict-proxy? #f]) + ;; precompute wikiurl regex for efficency + (define wikiurl-regex (pregexp (format "^https://(~a)\\.fandom\\.com(/wiki/.*)$" px-wikiname))) + ;; precompute link replacement string for efficiency + (define wiki-substitution (format "/~a\\1" wikiname)) + (define classlist-updater (compose1 ; uncollapse all navbox items (bottom of page mass navigation) @@ -110,8 +115,8 @@ (curry attribute-maybe-update 'href (λ (href) ((compose1 - (λ (href) (regexp-replace #rx"^(/wiki/.*)" href (format "/~a\\1" wikiname))) - (λ (href) (regexp-replace (pregexp (format "^https://(~a)\\.fandom\\.com(/wiki/.*)" px-wikiname)) href "/\\1\\2"))) + (λ (href) (regexp-replace #rx"^(/wiki/.*)$" href wiki-substitution)) + (λ (href) (regexp-replace wikiurl-regex href "/\\1\\2"))) href))) ; add noreferrer to a.image (curry u From ba6c5be9905529019c011b0d2de45eb451fb45ba Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Sun, 2 Apr 2023 00:04:35 +1300 Subject: [PATCH 06/58] Optimise pre-processing regular expression --- lib/tree-updater.rkt | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/lib/tree-updater.rkt b/lib/tree-updater.rkt index 6c369b5..0dbf695 100644 --- a/lib/tree-updater.rkt +++ b/lib/tree-updater.rkt @@ -12,17 +12,12 @@ update-tree-wiki) (define (preprocess-html-wiki html) - (define ((rr* find replace) contents) - (regexp-replace* find contents replace)) - ((compose1 - ; fix 
navbox list nesting - ; navbox on right of page has incorrect html "
  • " and the xexpr parser puts the
  • much further up the tree - ; add a
      to make the parser happy - ; usage: /fallout/wiki/Fallout:_New_Vegas_achievements_and_trophies - (rr* #rx"(]*>\n?)(
    • )" "\\1
        \\2") - ; change

        to

        to make the parser happy - (rr* #rx"(]*>)[ \t]*

        ([^<]*)

        " "\\1\\2")) - html)) + (regexp-replace* #rx"(<(?:td|figcaption)[^>]*?>\n?)(?:
      • |[ \t]*?

        (.*?)

        )" + html (λ (whole first-tag [contents #f]) + (if (eq? (string-ref whole 1) #\f) ;; figcaption + (string-append first-tag "" contents "") + (string-append first-tag "
        • "))))) + (module+ test (check-equal? (preprocess-html-wiki "\n
        • Hey
        • ") "\n
          • Hey
          • ") From d3187cc310a4f622c88dcb9df7f2859bd101d87a Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Sun, 2 Apr 2023 00:02:07 +1300 Subject: [PATCH 07/58] Tweak extwiki-generic migration notice --- src/extwiki-generic.rkt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/extwiki-generic.rkt b/src/extwiki-generic.rkt index fe30b14..113b139 100644 --- a/src/extwiki-generic.rkt +++ b/src/extwiki-generic.rkt @@ -110,8 +110,8 @@ (div (@ (class "niwa__left")) (a (@ (class "niwa__go") (href ,go)) "Read " ,title " on " ,display-name " →") ,@body - (p "This wiki's core community has wholly migrated away from Fandom. You should " - (a (@ (href ,go)) "go to " ,display-name " now!"))) + (p "This external wiki is a helpful alternative to Fandom. You should " + (a (@ (href ,go)) "check it out now!"))) ,(if logo `(div (@ (class "niwa__right")) (img (@ (class "niwa__logo") (src ,logo)))) From b5fb99c8abb49a796ae9de28a8753f1f5d51ea1e Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Mon, 10 Apr 2023 17:05:58 +1200 Subject: [PATCH 08/58] Fix category pages with slashes --- src/page-category.rkt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/page-category.rkt b/src/page-category.rkt index 29b541c..213d423 100644 --- a/src/page-category.rkt +++ b/src/page-category.rkt @@ -66,7 +66,7 @@ (define (page-category req) (response-handler (define wikiname (path/param-path (first (url-path (request-uri req))))) - (define prefixed-category (path/param-path (caddr (url-path (request-uri req))))) + (define prefixed-category (string-join (map path/param-path (cddr (url-path (request-uri req)))) "/")) (define origin (format "https://~a.fandom.com" wikiname)) (define source-url (format "~a/wiki/~a" origin prefixed-category)) From 040d9b94dec02c260b871caeceed80ae4d3fd7c4 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Sun, 16 Apr 2023 00:05:54 +1200 Subject: [PATCH 09/58] New option: promotions::indie_wiki_buddy --- 
src/application-globals.rkt | 12 +++++++++++- src/config.rkt | 16 +++++++++++++--- src/page-home.rkt | 6 ++++-- 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/src/application-globals.rkt b/src/application-globals.rkt index b6777ef..10b4530 100644 --- a/src/application-globals.rkt +++ b/src/application-globals.rkt @@ -62,10 +62,19 @@ (p (a (@ (href "https://lists.sr.ht/~cadence/breezewiki-discuss")) "Chat / Bug reports / Feature requests")) + ,(if (config-member? 'promotions::indie_wiki_buddy "footer") + `(p + (a (@ (href "https://getindie.wiki/")) + "Get Indie Wiki Buddy browser extension - be redirected to BreezeWiki every time!")) + "") ,(if (config-true? 'instance_is_official) `(p ,(format "This instance is run by the ~a developer, " (config-get 'application_name)) (a (@ (href "https://cadence.moe/contact")) - "Cadence.")) + "Cadence") + ". Proudly hosted by " + (a (@ (href "http://alphamethyl.barr0w.net")) + "Barrow Network Solutions" (sup "XD")) + ".") `(p ,(format "This unofficial instance is based off the ~a source code, but is not controlled by the code developer." (config-get 'application_name))))) ,(if source-url @@ -201,6 +210,7 @@ ,(let ([extension-eligible? (cond/var [(not req) #f] + [(not (config-member? 'promotions::indie_wiki_buddy "banner")) #f] (var ua-pair (assq 'user-agent (request-headers req))) [(not ua-pair) #f] (var ua (string-downcase (cdr ua-pair))) diff --git a/src/config.rkt b/src/config.rkt index 09407c4..2158e77 100644 --- a/src/config.rkt +++ b/src/config.rkt @@ -8,6 +8,7 @@ (provide config-parameter config-true? + config-member? config-get) (module+ test @@ -23,6 +24,11 @@ (define (config-true? key) (not (member ((config-parameter key)) '("" "false")))) +(: config-member? (Symbol String [#:sep String] -> Boolean)) +(define (config-member? key item #:sep [sep #px"\\s+"]) + (and (config-true? 
key) + (not (not (member item (string-split (config-get key) sep)))))) + (: config-get (Symbol -> String)) (define (config-get key) ((config-parameter key))) @@ -42,7 +48,9 @@ (feature_offline::format . "json.gz") (feature_offline::only . "false") - (access_log::enabled . "false"))) + (access_log::enabled . "false") + + (promotions::indie_wiki_buddy . "banner home"))) (define loaded-alist (with-handlers @@ -109,8 +117,10 @@ (module+ test ; this is just a sanity check (parameterize ([(config-parameter 'application_name) "JeffWiki"] - [(config-parameter 'strict_proxy) ""]) + [(config-parameter 'strict_proxy) ""] + [(config-parameter 'promotions::indie_wiki_buddy) "a b c"]) (check-equal? (config-get 'application_name) "JeffWiki") (check-false (config-true? 'strict_proxy)) - (check-equal? (string? (config-get 'feature_offline::format)) #t))) + (check-equal? (string? (config-get 'feature_offline::format)) #t) + (check-true (config-member? 'promotions::indie_wiki_buddy "b")))) diff --git a/src/page-home.rkt b/src/page-home.rkt index 95793d3..24f7393 100644 --- a/src/page-home.rkt +++ b/src/page-home.rkt @@ -31,8 +31,10 @@ (url-host (string->url (config-get 'canonical_origin))) "breezewiki.com"))) (p "If you'd like to be automatically sent to BreezeWiki every time in the future, " - (a (@ (href "https://getindie.wiki")) "get our affiliated browser extension (NEW!)") - " or " + ,@(if (config-member? 'promotions::indie_wiki_buddy "home") + `((a (@ (href "https://getindie.wiki")) "get our affiliated browser extension (NEW!)") + " or ") + null) (a (@ (href "https://docs.breezewiki.com/Automatic_Redirection.html")) "check out the tutorial in the manual.")) (p "BreezeWiki is available on several different websites called " (a (@ (href "https://en.wikipedia.org/wiki/Mirror_site")) "mirrors") ". Each is independently run. If one mirror is offline, the others still work. 
" (a (@ (href "https://docs.breezewiki.com/Links.html#%28part._.Mirrors%29")) "See the list.")) From a1bba220547ae43ac148652a1fbe3163bf354059 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Mon, 17 Apr 2023 00:46:15 +1200 Subject: [PATCH 10/58] Really fix semicolons in URL --- src/dispatcher-tree.rkt | 57 ++++++++++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/src/dispatcher-tree.rkt b/src/dispatcher-tree.rkt index 288634a..315638a 100644 --- a/src/dispatcher-tree.rkt +++ b/src/dispatcher-tree.rkt @@ -35,23 +35,40 @@ (define (make-dispatcher-tree ds) (define subdomain-dispatcher (hash-ref ds 'subdomain-dispatcher)) - (sequencer:make - subdomain-dispatcher - (pathprocedure:make "/" (hash-ref ds 'page-home)) - (pathprocedure:make "/proxy" (hash-ref ds 'page-proxy)) - (pathprocedure:make "/search" (hash-ref ds 'page-global-search)) - (pathprocedure:make "/set-user-settings" (hash-ref ds 'page-set-user-settings)) - (pathprocedure:make "/buddyfight/wiki/It_Doesn't_Work!!" (hash-ref ds 'page-it-works)) - (filter:make (pregexp (format "^/~a/wiki/Category:.+$" px-wikiname)) (lift:make (hash-ref ds 'page-category))) - (filter:make (pregexp (format "^/~a/wiki/File:.+$" px-wikiname)) (lift:make (hash-ref ds 'page-file))) - (if (config-true? 'feature_offline::enabled) - (filter:make (pregexp (format "^/~a/wiki/.+$" px-wikiname)) (lift:make (hash-ref ds 'page-wiki-offline))) - (λ (_conn _req) (next-dispatcher))) - (filter:make (pregexp (format "^/~a/wiki/.+$" px-wikiname)) (lift:make (hash-ref ds 'page-wiki))) - (filter:make (pregexp (format "^/~a/search$" px-wikiname)) (lift:make (hash-ref ds 'page-search))) - (filter:make (pregexp (format "^/~a(/(wiki(/)?)?)?$" px-wikiname)) (lift:make (hash-ref ds 'redirect-wiki-home))) - (if (config-true? 
'feature_offline::enabled) - (filter:make (pregexp (format "^/archive/~a/(styles|images)/.+$" px-wikiname)) (lift:make (hash-ref ds 'page-static-archive))) - (λ (_conn _req) (next-dispatcher))) - (hash-ref ds 'static-dispatcher) - (lift:make (hash-ref ds 'page-not-found)))) + (define tree + (sequencer:make + subdomain-dispatcher + (pathprocedure:make "/" (hash-ref ds 'page-home)) + (pathprocedure:make "/proxy" (hash-ref ds 'page-proxy)) + (pathprocedure:make "/search" (hash-ref ds 'page-global-search)) + (pathprocedure:make "/set-user-settings" (hash-ref ds 'page-set-user-settings)) + (pathprocedure:make "/buddyfight/wiki/It_Doesn't_Work!!" (hash-ref ds 'page-it-works)) + (filter:make (pregexp (format "^/~a/wiki/Category:.+$" px-wikiname)) (lift:make (hash-ref ds 'page-category))) + (filter:make (pregexp (format "^/~a/wiki/File:.+$" px-wikiname)) (lift:make (hash-ref ds 'page-file))) + (if (config-true? 'feature_offline::enabled) + (filter:make (pregexp (format "^/~a/wiki/.+$" px-wikiname)) (lift:make (hash-ref ds 'page-wiki-offline))) + (λ (_conn _req) (next-dispatcher))) + (filter:make (pregexp (format "^/~a/wiki/.+$" px-wikiname)) (lift:make (hash-ref ds 'page-wiki))) + (filter:make (pregexp (format "^/~a/search$" px-wikiname)) (lift:make (hash-ref ds 'page-search))) + (filter:make (pregexp (format "^/~a(/(wiki(/)?)?)?$" px-wikiname)) (lift:make (hash-ref ds 'redirect-wiki-home))) + (if (config-true? 'feature_offline::enabled) + (filter:make (pregexp (format "^/archive/~a/(styles|images)/.+$" px-wikiname)) (lift:make (hash-ref ds 'page-static-archive))) + (λ (_conn _req) (next-dispatcher))) + (hash-ref ds 'static-dispatcher) + (lift:make (hash-ref ds 'page-not-found)))) + (make-semicolon-fixer-dispatcher tree)) + +(define ((make-semicolon-fixer-dispatcher orig-dispatcher) conn orig-req) + (define orig-uri (request-uri orig-req)) + (define pps (url-path orig-uri)) ; list of path/param structs + (define new-path + (for/list ([pp pps]) + (if (null? 
(path/param-param pp)) + pp + ;; path/param does have params, which need to be fixed into a semicolon. + (path/param + (string-append (path/param-path pp) ";" (string-join (path/param-param pp) ";")) + null)))) + (define new-uri (struct-copy url orig-uri [path new-path])) + (define new-req (struct-copy request orig-req [uri new-uri])) + (orig-dispatcher conn new-req)) From 2e292b4f8074974c8d58558c248771fc1e4e1203 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Thu, 18 May 2023 00:30:37 +1200 Subject: [PATCH 11/58] Add Terraria Mods redirect --- src/extwiki-data.rkt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/extwiki-data.rkt b/src/extwiki-data.rkt index 4852541..8878845 100644 --- a/src/extwiki-data.rkt +++ b/src/extwiki-data.rkt @@ -403,7 +403,6 @@ ;; fandom wikinames * empty * empty * Name * Home Page (extwiki^ '("aether") 'empty 'empty "Aether Wiki" "https://aether.wiki.gg/wiki/Aether_Wiki" #f #f) (extwiki^ '("before-darkness-falls") 'empty 'empty "Before Darkness Falls Wiki" "https://beforedarknessfalls.wiki.gg/wiki/Before_Darkness_Falls_Wiki" #f #f) - (extwiki^ '("calamitymod") 'empty 'empty "Calamity Mod Wiki" "https://calamitymod.wiki.gg/wiki/Calamity_Mod_Wiki" #f #f) (extwiki^ '("chivalry" "chivalry2") 'empty 'empty "Official Chivalry Wiki" "https://chivalry.wiki.gg/wiki/Chivalry_Wiki" #f #f) (extwiki^ '("clockup") 'empty 'empty "CLOCKUP WIKI" "https://en.clockupwiki.org/wiki/CLOCKUP_WIKI:Plan" #f #f) (extwiki^ '("half-life") 'empty 'empty "Combine OverWiki" "https://combineoverwiki.net/wiki/Main_Page" #f #f) @@ -431,6 +430,7 @@ (extwiki^ '("steamworld") 'empty 'empty "Official SteamWorld Wiki" "https://steamworld.wiki.gg/wiki/SteamWorld_Wiki" #f #f) (extwiki^ '("teamfortress") 'empty 'empty "Official Team Fortress Wiki" "https://wiki.teamfortress.com/wiki/Main_Page" #f #f) (extwiki^ '("temtem") 'empty 'empty "Official Temtem Wiki" "https://temtem.wiki.gg/wiki/Temtem_Wiki" #f #f) + (extwiki^ '("terrariamods") 'empty 'empty 
"Official Terraria Mods Wiki" "https://terrariamods.wiki.gg/wiki/Terraria_Mods_Wiki" #f #f) (extwiki^ '("thoriummod") 'empty 'empty "Official Thorium Mod Wiki" "https://thoriummod.wiki.gg/wiki/Thorium_Mod_Wiki" #f #f) (extwiki^ '("totherescue") 'empty 'empty "To The Rescue!" "https://totherescue.wiki.gg/wiki/To_The_Rescue%21_Wiki" #f #f) (extwiki^ '("touhou") 'empty 'empty "Touhou Wiki" "https://en.touhouwiki.net/wiki/Touhou_Wiki" #f #f) From ca13aea547ab4c7eeef66495ce9a2b45823a26e8 Mon Sep 17 00:00:00 2001 From: blankie Date: Wed, 24 May 2023 21:42:29 +0700 Subject: [PATCH 12/58] Remove Gamespot reviews/ads https://lists.sr.ht/~cadence/breezewiki-discuss/%3C0ad4dbad07a1be394eefe43e33fff860b2d6176e%40disroot.org%3E --- lib/tree-updater.rkt | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/lib/tree-updater.rkt b/lib/tree-updater.rkt index 0dbf695..e7d48b4 100644 --- a/lib/tree-updater.rkt +++ b/lib/tree-updater.rkt @@ -58,7 +58,9 @@ (data-src "https://static.wikia.nocookie.net/nice-image-thumbnail.png") (class "thumbimage"))))) (figcaption "Test figure!")) - (iframe (@ (src "https://example.com/iframe-src"))))))) + (iframe (@ (src "https://example.com/iframe-src"))) + (div (@ (class "reviews")) + (header "GameSpot Expert Reviews")))))) (define (updater wikiname #:strict-proxy? [strict-proxy? #f]) ;; precompute wikiurl regex for efficency @@ -238,6 +240,9 @@ [(list (list 'img _)) #t] [_ #f])) return-no-element] + ; remove gamespot reviews/ads + [(has-class? "reviews" attributes) + return-no-element] [#t (list element-type ;; attributes @@ -297,6 +302,8 @@ "/proxy?dest=https%3A%2F%2Fstatic.wikia.nocookie.net%2Fnice-image.png") ; check that noscript images are removed (check-equal? ((query-selector (λ (t a c) (eq? t 'noscript)) transformed)) #f) + ; check that gamespot reviews/ads are removed + (check-equal? ((query-selector (λ (t a c) (has-class? "reviews" a)) transformed)) #f) ; benchmark (when (file-exists? 
"../storage/Frog.html") (with-input-from-file "../storage/Frog.html" From 6fef9281c3afcb4efe9c18198e02585226e45cce Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Sat, 27 May 2023 23:37:45 +1200 Subject: [PATCH 13/58] Move the semicolon fixing code again --- lib/archive-file-mappings.rkt | 2 +- lib/url-utils.rkt | 23 ++++++++++++++++++++++- src/dispatcher-tree.rkt | 13 +------------ 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/lib/archive-file-mappings.rkt b/lib/archive-file-mappings.rkt index 4aa8a69..ba013ab 100644 --- a/lib/archive-file-mappings.rkt +++ b/lib/archive-file-mappings.rkt @@ -11,7 +11,7 @@ url-segments->guess-title) (define (local-encoded-url->segments str) ; '("wiki" "Page_title") - (map path/param-path (url-path (string->url str)))) + (map path/param-path (fix-semicolons-url-path (url-path (string->url str))))) (define (url-segments->basename segments) ; "Page_title" filename encoded, no extension or dir prefix (define extra-encoded (map (λ (s) (bytes->string/latin-1 (percent-encode s filename-set #f))) (cdr segments))) diff --git a/lib/url-utils.rkt b/lib/url-utils.rkt index 4722d49..3fb4310 100644 --- a/lib/url-utils.rkt +++ b/lib/url-utils.rkt @@ -1,5 +1,6 @@ #lang typed/racket/base (require racket/string + typed/net/url-structs "pure-utils.rkt") (require/typed web-server/http/request-structs [#:opaque Header header?]) @@ -20,7 +21,10 @@ ; pass in a header, headers, or something useless. 
they'll all combine into a list build-headers ; try to follow wikimedia's format for which characters should be encoded/replaced in page titles for the url - page-title->path) + page-title->path + ; path/param eats semicolons into params, which need to be fixed back into semicolons + fix-semicolons-url-path + fix-semicolons-url) (module+ test (require "typed-rackunit.rkt")) @@ -106,3 +110,20 @@ (: page-title->path (String -> Bytes)) (define (page-title->path title) (percent-encode (regexp-replace* " " title "_") path-set #f)) + +(: fix-semicolons-url-path ((Listof Path/Param) -> (Listof Path/Param))) +(define (fix-semicolons-url-path pps) + (for/list ([pp pps]) + (define path (path/param-path pp)) + (if (or (null? (path/param-param pp)) + (symbol? path)) + pp + ;; path/param does have params, which need to be fixed into a semicolon. + (path/param + (string-append path ";" (string-join (path/param-param pp) ";")) + null)))) + +(: fix-semicolons-url (URL -> URL)) +(define (fix-semicolons-url orig-url) + (struct-copy url orig-url [path (fix-semicolons-url-path (url-path orig-url))])) + diff --git a/src/dispatcher-tree.rkt b/src/dispatcher-tree.rkt index 315638a..48e8ebb 100644 --- a/src/dispatcher-tree.rkt +++ b/src/dispatcher-tree.rkt @@ -59,16 +59,5 @@ (make-semicolon-fixer-dispatcher tree)) (define ((make-semicolon-fixer-dispatcher orig-dispatcher) conn orig-req) - (define orig-uri (request-uri orig-req)) - (define pps (url-path orig-uri)) ; list of path/param structs - (define new-path - (for/list ([pp pps]) - (if (null? (path/param-param pp)) - pp - ;; path/param does have params, which need to be fixed into a semicolon. 
- (path/param - (string-append (path/param-path pp) ";" (string-join (path/param-param pp) ";")) - null)))) - (define new-uri (struct-copy url orig-uri [path new-path])) - (define new-req (struct-copy request orig-req [uri new-uri])) + (define new-req (struct-copy request orig-req [uri (fix-semicolons-url (request-uri orig-req))])) (orig-dispatcher conn new-req)) From a9754463b6d9d95c63516e501f39dc83aa5ef2fa Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Sat, 27 May 2023 23:41:20 +1200 Subject: [PATCH 14/58] Fix (& x) sequences truncating the page --- lib/tree-updater.rkt | 10 ++++++++-- lib/xexpr-utils.rkt | 9 +++++---- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/lib/tree-updater.rkt b/lib/tree-updater.rkt index e7d48b4..098af3d 100644 --- a/lib/tree-updater.rkt +++ b/lib/tree-updater.rkt @@ -60,7 +60,9 @@ (figcaption "Test figure!")) (iframe (@ (src "https://example.com/iframe-src"))) (div (@ (class "reviews")) - (header "GameSpot Expert Reviews")))))) + (header "GameSpot Expert Reviews")) + (div (@ (data-test-ampersand) (class "mw-collapsible-content")) + (& ndash)))))) (define (updater wikiname #:strict-proxy? [strict-proxy? #f]) ;; precompute wikiurl regex for efficency @@ -159,7 +161,7 @@ (u (λ (v) (has-class? "mw-collapsible-content" attributes)) (λ (v) (for/list ([element v]) - (u (λ (element) (pair? element)) + (u (λ (element) (element-is-element? element)) (λ (element) `(,(car element) (@ ,@(attribute-maybe-update 'style (λ (a) (regexp-replace #rx"display: *none" a "display:inline")) (bits->attributes element))) @@ -304,6 +306,10 @@ (check-equal? ((query-selector (λ (t a c) (eq? t 'noscript)) transformed)) #f) ; check that gamespot reviews/ads are removed (check-equal? ((query-selector (λ (t a c) (has-class? "reviews" a)) transformed)) #f) + ; check that (& x) sequences are not broken + (check-equal? ((query-selector (λ (t a c) (dict-has-key? 
a 'data-test-ampersand)) transformed)) + '(div (@ (data-test-ampersand) (class "mw-collapsible-content")) + (& ndash))) ; benchmark (when (file-exists? "../storage/Frog.html") (with-input-from-file "../storage/Frog.html" diff --git a/lib/xexpr-utils.rkt b/lib/xexpr-utils.rkt index cb40510..e1ac957 100644 --- a/lib/xexpr-utils.rkt +++ b/lib/xexpr-utils.rkt @@ -86,15 +86,16 @@ ; "element" is a real element with a type and everything (non-string, non-attributes) (define (element-is-element? element) - (and (element-is-bits? element) (not (element-is-xattributes? element)))) + (and (element-is-bits? element) (not (eq? (car element) '&))(not (element-is-xattributes? element)))) (module+ test (check-true (element-is-element? '(span "hi"))) (check-false (element-is-element? '(@ (alt "Cute cat.")))) - (check-false (element-is-element? "hi"))) + (check-false (element-is-element? "hi")) + (check-false (element-is-element? '(& ndash)))) -; "element content" is a real element or a string +; "element content" is a real element or a string or a (& x) sequence (define (element-is-content? element) - (or (string? element) (element-is-element? element))) + (or (string? element) (element-is-element? element) (and (pair? element) (eq? (car element) '&)))) (module+ test (check-true (element-is-content? '(span "hi"))) (check-false (element-is-content? '(@ (alt "Cute cat.")))) From 4b039cca5e0ed30d1ec8c76e5ec7ff47acaef507 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Sat, 27 May 2023 23:48:08 +1200 Subject: [PATCH 15/58] Explain how page-search works in the code --- src/page-search.rkt | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/src/page-search.rkt b/src/page-search.rkt index ce527c0..e4960d8 100644 --- a/src/page-search.rkt +++ b/src/page-search.rkt @@ -28,28 +28,35 @@ (define search-json-data '#hasheq((batchcomplete . #t) (query . #hasheq((search . (#hasheq((ns . 0) (pageid . 219) (size . 1482) (snippet . "") (timestamp . 
"2022-08-21T08:54:23Z") (title . "Gacha Capsule") (wordcount . 214)) #hasheq((ns . 0) (pageid . 201) (size . 1198) (snippet . "") (timestamp . "2022-07-11T17:52:47Z") (title . "Badges") (wordcount . 181))))))))) +;; this takes the info we gathered from fandom and makes the big fat x-expression page (define (generate-results-page req dest-url wikiname query data #:siteinfo [siteinfo #f]) (define search-results (jp "/query/search" data)) + ;; this is *another* helper that builds the wiki page UI and lets me put the search results (or whatever else) in the middle (generate-wiki-page + ;; so I provide my helper function with the necessary context... #:req req #:source-url dest-url #:wikiname wikiname #:title query #:siteinfo siteinfo + ;; and here's the actual results to display in the wiki page layout `(div (@ (class "mw-parser-output")) + ;; header before the search results showing how many we found (p ,(format "~a results found for " (length search-results)) (strong ,query)) + ;; *u*nordered *l*ist of matching search results (ul ,@(map - (λ (result) + (λ (result) ;; for each result, run this code... (let* ([title (jp "/title" result)] [page-path (page-title->path title)] [timestamp (jp "/timestamp" result)] [wordcount (jp "/wordcount" result)] [size (jp "/size" result)]) + ;; and make this x-expression... 
`(li (@ (class "my-result")) - (a (@ (class "my-result__link") (href ,(format "/~a/wiki/~a" wikiname page-path))) - ,title) - (div (@ (class "my-result__info")) + (a (@ (class "my-result__link") (href ,(format "/~a/wiki/~a" wikiname page-path))) ; using unquote to insert the result page URL + ,title) ; using unquote to insert the result page title + (div (@ (class "my-result__info")) ; constructing the line under the search result "last edited " (time (@ (datetime ,timestamp)) ,(list-ref (string-split timestamp "T") 0)) ,(format ", ~a words, ~a kb" @@ -57,13 +64,18 @@ (exact->inexact (/ (round (/ size 100)) 10))))))) search-results))))) +;; will be called when the web browser asks to load the page (define (page-search req) + ;; this just means, catch any errors and display them in the browser. it's a function somewhere else (response-handler + ;; the URL will look like "/minecraft/wiki/Special:Search?q=Spawner" + ;; grab the first part to use as the wikiname, in this case, "minecraft" (define wikiname (path/param-path (first (url-path (request-uri req))))) + ;; grab the part after ?q= which is the search terms (define query (dict-ref (url-query (request-uri req)) 'q #f)) + ;; constructing the URL where I want to get fandom data from... (define origin (format "https://~a.fandom.com" wikiname)) - (when (config-true? 'feature_offline::only) - (raise-user-error "Full search is currently not available on breezewiki.com - for now, please use the pop-up search suggestions or wait for me to fix it! Thanks <3")) + ;; the dest-URL will look something like https://minecraft.fandom.com/api.php?action=query&list=search&srsearch=Spawner&formatversion=2&format=json (define dest-url (format "~a/api.php?~a" origin @@ -73,21 +85,26 @@ ("formatversion" . "2") ("format" . 
"json"))))) + ;; simultaneously get the search results from the fandom API, as well as information about the wiki as a whole (its license, icon, name) (define-values (dest-res siteinfo) (thread-values (λ () (log-outgoing dest-url) - (easy:get dest-url #:timeouts timeouts)) + (easy:get dest-url #:timeouts timeouts)) ;; HTTP request to dest-url for search results (λ () - (siteinfo-fetch wikiname)))) + (siteinfo-fetch wikiname)))) ;; helper function in another file to get information about the wiki + ;; search results are a JSON string. parse JSON into racket data structures (define data (easy:response-json dest-res)) - + ;; calling my generate-results-page function with the information so far in order to get a big fat x-expression + ;; big fat x-expression goes into the body variable (define body (generate-results-page req dest-url wikiname query data #:siteinfo siteinfo)) + ;; error checking (when (config-true? 'debug) ; used for its side effects ; convert to string with error checking, error will be raised if xexp is invalid (xexp->html body)) + ;; convert body to HTML and send to browser (response/output #:code 200 #:headers (build-headers always-headers) From f6933e9e504898764fc18623a51d7a766498d5ab Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Fri, 16 Jun 2023 23:05:30 +1200 Subject: [PATCH 16/58] expand all Warframe ability collapsibles --- lib/tree-updater.rkt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lib/tree-updater.rkt b/lib/tree-updater.rkt index 098af3d..9e30f32 100644 --- a/lib/tree-updater.rkt +++ b/lib/tree-updater.rkt @@ -110,6 +110,11 @@ #;(curry attribute-maybe-update 'class (λ (class) (string-join (classlist-updater (string-split class " ")) " "))) (curry attribute-maybe-update 'class class-updater) + ; yet more uncollapsing - sample: warframe/wiki/Amp_(Ability) + (curry u + (λ (v) (and (dict-has-key? v 'id) + (string-prefix? 
(car (dict-ref v 'id)) "mw-customcollapsible"))) + (λ (v) (dict-set v 'style "display:block"))) ; change links to stay on the same wiki (curry attribute-maybe-update 'href (λ (href) @@ -245,6 +250,9 @@ ; remove gamespot reviews/ads [(has-class? "reviews" attributes) return-no-element] + ; remove customcollapsible customtoggle buttons - sample: warframe/wiki/Amp_(Ability) + [(and (dict-has-key? attributes 'class) (regexp-match? #rx"^mw-customtoggle-[^ ]* button-c$" (car (dict-ref attributes 'class)))) + return-no-element] [#t (list element-type ;; attributes From 4336e4ab4323446e2e1e29e55f684bca3742f674 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Sat, 17 Jun 2023 00:09:59 +1200 Subject: [PATCH 17/58] fix javascript audio buttons on hearthstone wiki --- lib/tree-updater.rkt | 4 ++++ static/main.css | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/lib/tree-updater.rkt b/lib/tree-updater.rkt index 9e30f32..109c875 100644 --- a/lib/tree-updater.rkt +++ b/lib/tree-updater.rkt @@ -110,6 +110,10 @@ #;(curry attribute-maybe-update 'class (λ (class) (string-join (classlist-updater (string-split class " ")) " "))) (curry attribute-maybe-update 'class class-updater) + ; audio buttons - sample: hearthstone/wiki/Diablo_(Duels_hero)#Sounds + (curry u + (λ (v) (has-class? "ext-audiobutton" v)) + (λ (v) (dict-set (dict-remove v 'hidden) 'controls '("")))) ; yet more uncollapsing - sample: warframe/wiki/Amp_(Ability) (curry u (λ (v) (and (dict-has-key? 
v 'id) diff --git a/static/main.css b/static/main.css index 3d3ed5a..9414470 100644 --- a/static/main.css +++ b/static/main.css @@ -210,6 +210,11 @@ figcaption, .lightbox-caption, .thumbcaption { display: block; } +/* javascript audio play buttons, see hearthstone/wiki/Diablo_(Duels_hero) */ +a.ext-audiobutton { + display: none; +} + /* animated slots */ #mw-content-text .animated > :not(.animated-active), #mw-content-text .animated > .animated-subframe > :not(.animated-active) { display: inline-block; From 61d28203420120aa0df009212bae6614b66c2ad8 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Sat, 17 Jun 2023 00:16:11 +1200 Subject: [PATCH 18/58] fix some more play buttons on minecraft wiki --- static/main.css | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/static/main.css b/static/main.css index 9414470..1641886 100644 --- a/static/main.css +++ b/static/main.css @@ -210,10 +210,13 @@ figcaption, .lightbox-caption, .thumbcaption { display: block; } -/* javascript audio play buttons, see hearthstone/wiki/Diablo_(Duels_hero) */ -a.ext-audiobutton { +/* javascript audio play buttons */ +a.ext-audiobutton { /* see hearthstone/wiki/Diablo_(Duels_hero) */ display: none; } +.sound > [style="display:none"] { /* see minecraft/wiki/villager#Sounds */ + display: inline !important; +} /* animated slots */ #mw-content-text .animated > :not(.animated-active), #mw-content-text .animated > .animated-subframe > :not(.animated-active) { From 33591ce0a3019fdf3a4c8634a993854b5d80db5e Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Tue, 26 Sep 2023 00:51:55 +1300 Subject: [PATCH 19/58] Add Minecraft to independent wikis --- .gitignore | 2 ++ src/extwiki-data.rkt | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/.gitignore b/.gitignore index 17b94c5..1faa8f2 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,5 @@ compiled # Personal /config.ini +misc +storage diff --git a/src/extwiki-data.rkt b/src/extwiki-data.rkt index 
8878845..ead1696 100644 --- a/src/extwiki-data.rkt +++ b/src/extwiki-data.rkt @@ -61,6 +61,16 @@ ("Browser Extension" . "https://runescape.wiki/w/RuneScape:Finding_the_wikis_with_ease#Extensions")) (λ (props) '())) + 'Minecraft + (extwiki-group^ + "Minecraft" + '(("Post-Move FAQ" . "https://minecraft.wiki/w/Minecraft_Wiki:Moving_from_Fandom") + ("Pre-Move Discussion" . "https://minecraft.fandom.com/wiki/Minecraft_Wiki:Moving_from_Fandom") + ("In the media: PCGamer" . "https://www.pcgamer.com/official-minecraft-wiki-editors-so-furious-at-fandoms-degraded-functionality-and-popups-theyre-overwhelmingly-voting-to-leave-the-site/") + ("In the media: PCGamesN" . "https://www.pcgamesn.com/minecraft/wiki-fandom")) + (λ (props) + '(p "The wiki was founded by Citricsquid on July 16th, 2009 as a way to document information from Minecraft. Since November 15th, 2010, it has been hosted by Curse Media. On December 12th, 2018, it moved to Fandom as it purchased Curse Media. Since September 24, 2023, it forked from Fandom and has been hosted by Weird Gloop."))) + 'empty (extwiki-group^ "Misc" @@ -400,6 +410,15 @@ `((p "“Fandom bought Gamepedia and forced a migration, with their restricted, ad-heavy appearance, and other annoying features that we could not remove, the wiki grew slow and annoying to use, especially for logged out users.") (p "“We decided to move away from Fandom to Wiki.gg, which returns the wiki to how it used to be on gamepedia, without the ads spamming and forced videos.”")))) + (extwiki^ + '("minecraft") 'default + 'Minecraft + "The Minecraft Wiki" + "https://minecraft.wiki/w/Minecraft_Wiki" + "https://minecraft.wiki/images/Wiki.png" + (λ (props) + `())) + ;; fandom wikinames * empty * empty * Name * Home Page (extwiki^ '("aether") 'empty 'empty "Aether Wiki" "https://aether.wiki.gg/wiki/Aether_Wiki" #f #f) (extwiki^ '("before-darkness-falls") 'empty 'empty "Before Darkness Falls Wiki" "https://beforedarknessfalls.wiki.gg/wiki/Before_Darkness_Falls_Wiki" #f #f) 
From b8ccd6cc3e691b6b58a2156c2b210ab5bb433423 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Tue, 3 Oct 2023 18:54:07 +1300 Subject: [PATCH 20/58] Improve theme layout on non-flex browsers --- src/application-globals.rkt | 4 ++-- static/main.css | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/application-globals.rkt b/src/application-globals.rkt index 10b4530..de60820 100644 --- a/src/application-globals.rkt +++ b/src/application-globals.rkt @@ -240,7 +240,7 @@ (div (@ (class "bw-ss__container") (id "bw-pr-search-suggestions")))) (div (@ (class "bw-theme__select")) (span (@ (class "bw-theme__main-label")) "Page theme") - (div (@ (class "bw-theme__items")) + (span (@ (class "bw-theme__items")) ,@(for/list ([theme '(default light dark)]) (define class (if (equal? theme (user-cookies^-theme user-cookies)) @@ -251,7 +251,7 @@ req (struct-copy user-cookies^ user-cookies [theme theme]))) (class ,class)) - (div (@ (class "bw-theme__icon-container")) + (span (@ (class "bw-theme__icon-container")) ,(hash-ref theme-icons theme)) ,(format "~a" theme))))))) (div (@ (id "content") #;(class "page-content")) diff --git a/static/main.css b/static/main.css index 1641886..cb32df5 100644 --- a/static/main.css +++ b/static/main.css @@ -332,6 +332,7 @@ a.ext-audiobutton { /* see hearthstone/wiki/Diablo_(Duels_hero) */ display: flex; } .bw-theme__item { + display: inline-block; display: flex; align-items: baseline; padding: 2px; From 9773e62c46b3faebb8e621718f3e9870fbd24d10 Mon Sep 17 00:00:00 2001 From: blankie Date: Mon, 29 May 2023 23:02:28 +0700 Subject: [PATCH 21/58] Add better support for tabs Some pages break without actual tab support, such as https://breezewiki.com/ben10/wiki/Ultimatrix_(Original)#Modes This change aims to work with old browsers (such as Firefox for Android 68) and browsers with Javascript disabled (by showing all tab contents and hiding the tab bar, i.e. how tabs work before this change). 
--- src/application-globals.rkt | 3 +- static/main.css | 6 ++-- static/tabs.js | 63 +++++++++++++++++++++++++++++++++++++ 3 files changed, 68 insertions(+), 4 deletions(-) create mode 100644 static/tabs.js diff --git a/src/application-globals.rkt b/src/application-globals.rkt index de60820..a413785 100644 --- a/src/application-globals.rkt +++ b/src/application-globals.rkt @@ -203,10 +203,11 @@ `(script (@ (type "module") (src ,(get-static-url "search-suggestions.js")))) "") (script (@ (type "module") (src ,(get-static-url "countdown.js")))) + (script (@ (defer) (src ,(get-static-url "tabs.js")))) (link (@ (rel "icon") (href ,(u (λ (v) (config-true? 'strict_proxy)) (λ (v) (u-proxy-url v)) (head-data^-icon-url head-data)))))) - (body (@ (class ,(head-data^-body-class head-data))) + (body (@ (class ,(head-data^-body-class head-data) " bw-tabs-nojs")) ,(let ([extension-eligible? (cond/var [(not req) #f] diff --git a/static/main.css b/static/main.css index cb32df5..6b07d2b 100644 --- a/static/main.css +++ b/static/main.css @@ -202,11 +202,11 @@ figcaption, .lightbox-caption, .thumbcaption { padding: 0; } -/* show tabs always */ -.wds-tabs__wrapper { +/* show tabs if tabs.js isn't loaded */ +.bw-tabs-nojs .wds-tabs__wrapper { display: none; } -.wds-tab__content { +.bw-tabs-nojs .wds-tab__content { display: block; } diff --git a/static/tabs.js b/static/tabs.js new file mode 100644 index 0000000..589261d --- /dev/null +++ b/static/tabs.js @@ -0,0 +1,63 @@ +"use strict"; + +function handleTabber(tabber) { + let [tabs, contents] = getTabs(tabber); + + for (let i in tabs) { + let tab = tabs[i]; + let content = contents[i]; + tab.addEventListener("click", function(e) { + let [currentTab, currentContent] = getCurrentTab(tabber); + if (currentTab) { + currentTab.classList.remove("wds-is-current"); + } + if (currentContent) { + currentContent.classList.remove("wds-is-current"); + } + + tab.classList.add("wds-is-current"); + content.classList.add("wds-is-current"); + 
e.preventDefault(); + }); + } +} + +for (let tabber of document.body.querySelectorAll(".wds-tabber")) { + handleTabber(tabber); +} +document.body.classList.remove("bw-tabs-nojs"); + + + +function getTabs(tabber) { + let tabs = []; + let contents = []; + + for (let i of tabber.querySelector(".wds-tabs__wrapper").querySelectorAll(".wds-tabs__tab")) { + tabs.push(i); + } + for (let i of tabber.children) { + if (!i.matches(".wds-tab__content")) { + continue; + } + contents.push(i); + } + + return [tabs, contents]; +} + +function getCurrentTab(tabber) { + let tab = null; + let content = null; + + tab = tabber.querySelector(".wds-tabs__wrapper").querySelector(".wds-tabs__tab.wds-is-current"); + for (let i of tabber.children) { + if (!i.matches(".wds-tab__content.wds-is-current")) { + continue; + } + content = i; + break; + } + + return [tab, content]; +} From ead6896818be96048138d0d3ecd12438f8d475b8 Mon Sep 17 00:00:00 2001 From: blankie Date: Tue, 30 May 2023 13:41:49 +0700 Subject: [PATCH 22/58] Add the ability to specify/open the last open tab in the URL --- static/tabs.js | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/static/tabs.js b/static/tabs.js index 589261d..916cb4d 100644 --- a/static/tabs.js +++ b/static/tabs.js @@ -1,30 +1,23 @@ "use strict"; -function handleTabber(tabber) { +let tabToFind = location.hash.length > 1 ? 
location.hash.substring(1) : null; +for (let tabber of document.body.querySelectorAll(".wds-tabber")) { let [tabs, contents] = getTabs(tabber); for (let i in tabs) { let tab = tabs[i]; let content = contents[i]; - tab.addEventListener("click", function(e) { - let [currentTab, currentContent] = getCurrentTab(tabber); - if (currentTab) { - currentTab.classList.remove("wds-is-current"); - } - if (currentContent) { - currentContent.classList.remove("wds-is-current"); - } - tab.classList.add("wds-is-current"); - content.classList.add("wds-is-current"); + tab.addEventListener("click", function(e) { + setCurrentTab(tabber, tab, content); e.preventDefault(); }); + if (tab.dataset.hash === tabToFind) { + setCurrentTab(tabber, tab, content); + tabToFind = null; + } } } - -for (let tabber of document.body.querySelectorAll(".wds-tabber")) { - handleTabber(tabber); -} document.body.classList.remove("bw-tabs-nojs"); @@ -61,3 +54,18 @@ function getCurrentTab(tabber) { return [tab, content]; } + +function setCurrentTab(tabber, tab, content) { + let [currentTab, currentContent] = getCurrentTab(tabber); + if (currentTab) { + currentTab.classList.remove("wds-is-current"); + } + if (currentContent) { + currentContent.classList.remove("wds-is-current"); + } + + tab.classList.add("wds-is-current"); + content.classList.add("wds-is-current"); + location.hash = "#" + tab.dataset.hash; + history.pushState(null, "", "#" + tab.dataset.hash); +} From f5399524b135eedf346ad758a88f37cba57be700 Mon Sep 17 00:00:00 2001 From: blankie Date: Tue, 30 May 2023 14:01:14 +0700 Subject: [PATCH 23/58] Prevent linking to tabs with no IDs --- static/tabs.js | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/static/tabs.js b/static/tabs.js index 916cb4d..6f582e8 100644 --- a/static/tabs.js +++ b/static/tabs.js @@ -14,7 +14,6 @@ for (let tabber of document.body.querySelectorAll(".wds-tabber")) { }); if (tab.dataset.hash === tabToFind) { setCurrentTab(tabber, tab, content); - tabToFind = 
null; } } } @@ -66,6 +65,8 @@ function setCurrentTab(tabber, tab, content) { tab.classList.add("wds-is-current"); content.classList.add("wds-is-current"); - location.hash = "#" + tab.dataset.hash; - history.pushState(null, "", "#" + tab.dataset.hash); + if (tab.dataset.hash) { + location.hash = "#" + tab.dataset.hash; + history.pushState(null, "", "#" + tab.dataset.hash); + } } From dcb8a8a590df66023cc355266cc1fc0c4f278438 Mon Sep 17 00:00:00 2001 From: blankie Date: Mon, 6 Nov 2023 20:31:20 +1100 Subject: [PATCH 24/58] Prevent making duplicate history entries --- static/tabs.js | 1 - 1 file changed, 1 deletion(-) diff --git a/static/tabs.js b/static/tabs.js index 6f582e8..4c4cdda 100644 --- a/static/tabs.js +++ b/static/tabs.js @@ -67,6 +67,5 @@ function setCurrentTab(tabber, tab, content) { content.classList.add("wds-is-current"); if (tab.dataset.hash) { location.hash = "#" + tab.dataset.hash; - history.pushState(null, "", "#" + tab.dataset.hash); } } From 2b3a8fe1084abab995f6d0937f673dcb4eac78b6 Mon Sep 17 00:00:00 2001 From: blankie Date: Mon, 13 Nov 2023 14:35:07 +1100 Subject: [PATCH 25/58] Fix scrolling to sections if a tab's hash coincides with one ben10/wiki/Alien_X_(Classic)#Appearances --- static/tabs.js | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/static/tabs.js b/static/tabs.js index 4c4cdda..a077efe 100644 --- a/static/tabs.js +++ b/static/tabs.js @@ -66,6 +66,9 @@ function setCurrentTab(tabber, tab, content) { tab.classList.add("wds-is-current"); content.classList.add("wds-is-current"); if (tab.dataset.hash) { - location.hash = "#" + tab.dataset.hash; + let fragment = "#" + tab.dataset.hash; + if (location.hash !== fragment) { + history.pushState(null, "", fragment); + } } } From 43c3f70736d7aa394eda004672beedb166dd6baf Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Fri, 10 Nov 2023 17:09:18 +1300 Subject: [PATCH 26/58] Add experimental FTS indexer --- archiver/fts.rkt | 89 ++++++++++++++++++++++++++++++++++++++++++++++++ 
1 file changed, 89 insertions(+) create mode 100644 archiver/fts.rkt diff --git a/archiver/fts.rkt b/archiver/fts.rkt new file mode 100644 index 0000000..20b21cc --- /dev/null +++ b/archiver/fts.rkt @@ -0,0 +1,89 @@ +#lang racket/base +(require racket/function + racket/future + racket/match + racket/path + racket/promise + racket/port + racket/string + file/gunzip + db + db/unsafe/sqlite3 + json + json-pointer + "../lib/html-parsing/main.rkt" + "../lib/xexpr-utils.rkt" + "../lib/tree-updater.rkt") + +(define (class-has? attributes substrs) + (define cl (or (get-attribute 'class attributes) "")) + (ormap (λ (substr) (string-contains? cl substr)) substrs)) + +(define (updater element element-type attributes children) + (cond + [(class-has? attributes '("collapsed" "selflink" "label" "toc" "editsection" "reviews")) + (list 'div '() '())] + [#t + (list element-type attributes children)])) + +(define slc (sqlite3-connect #:database "../storage/fts-separate.db")) +(sqlite3-load-extension slc "fts5") + +(define (writer page) + (for ([bit page]) + (cond + [(memq bit '(div p li td)) (displayln "")] + [(symbol? bit) (void)] + [(and (pair? bit) (eq? (car bit) '*COMMENT*)) (void)] + [(and (pair? bit) (eq? (car bit) '@)) (void)] + [(pair? bit) (writer bit)] + [(string? bit) (display bit)]))) + +(define wikiname "sto") +(define tablename (format "page_~a" wikiname)) + +(define ((extract f)) ; f - filename + (with-handlers + ([exn:fail? 
(λ (err) (println f) (raise err))]) + (define j + (case (path-get-extension f) + [(#".json") + (with-input-from-file f (λ () (read-json)))] + [(#".gz") + (define-values (in out) (make-pipe)) + (with-input-from-file f (λ () (gunzip-through-ports (current-input-port) out))) + (read-json in)] + [else #f])) + (define title (json-pointer-value "/parse/title" j)) + (define page-html (preprocess-html-wiki (json-pointer-value "/parse/text" j))) + (define page (update-tree updater (html->xexp page-html))) + (define text (with-output-to-string (λ () (writer page)))) + (define shrink-text (regexp-replace* #px"([ \t]*\r?\n+)+" text "\n")) + (values title shrink-text))) + +(println "extracting text...") +(define results + (time + (for/list ([f (directory-list (format "../storage/archive/~a" wikiname) #:build? #t)] + #:when (member (path-get-extension f) '(#".json" #".gz"))) + (extract f)))) + +(println "inserting...") +(query-exec slc "begin transaction") +#;(query-exec slc (format "create virtual table \"~a\" using fts5 (title, body, tokenize='porter unicode61')" wikiname)) +(time + (for ([fut results] + [i (in-naturals 1)]) + (display "-") + (when (and (> i 0) (= (modulo i 100) 0)) + (println i)) + (define-values (title shrink-text) (fut)) + (query-exec slc (format "insert into \"~a\" (title, body) values (?, ?)" tablename) title shrink-text))) + +(println "running optimize...") +(query-exec slc (format "insert into \"~a\" (\"~a\") values ('optimize')" tablename tablename)) + +(println "committing...") +(query-exec slc "commit") + +(disconnect slc) From 57e0d20657e441e06470ad658b1dd071b389d7ed Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Wed, 29 Nov 2023 16:57:50 +1300 Subject: [PATCH 27/58] Add more indexers and Solr configuration --- archiver/fts.rkt | 179 +- .../solr-config-dir/lang/contractions_ca.txt | 8 + .../solr-config-dir/lang/contractions_fr.txt | 15 + .../solr-config-dir/lang/contractions_ga.txt | 5 + .../solr-config-dir/lang/contractions_it.txt | 23 + 
.../solr-config-dir/lang/hyphenations_ga.txt | 5 + archiver/solr-config-dir/lang/stemdict_nl.txt | 6 + archiver/solr-config-dir/lang/stoptags_ja.txt | 420 +++++ .../solr-config-dir/lang/stopwords_ar.txt | 125 ++ .../solr-config-dir/lang/stopwords_bg.txt | 193 ++ .../solr-config-dir/lang/stopwords_ca.txt | 220 +++ .../solr-config-dir/lang/stopwords_cz.txt | 172 ++ .../solr-config-dir/lang/stopwords_da.txt | 110 ++ .../solr-config-dir/lang/stopwords_de.txt | 294 +++ .../solr-config-dir/lang/stopwords_el.txt | 78 + .../solr-config-dir/lang/stopwords_en.txt | 54 + .../solr-config-dir/lang/stopwords_es.txt | 356 ++++ .../solr-config-dir/lang/stopwords_et.txt | 1603 +++++++++++++++++ .../solr-config-dir/lang/stopwords_eu.txt | 99 + .../solr-config-dir/lang/stopwords_fa.txt | 313 ++++ .../solr-config-dir/lang/stopwords_fi.txt | 97 + .../solr-config-dir/lang/stopwords_fr.txt | 186 ++ .../solr-config-dir/lang/stopwords_ga.txt | 110 ++ .../solr-config-dir/lang/stopwords_gl.txt | 161 ++ .../solr-config-dir/lang/stopwords_hi.txt | 235 +++ .../solr-config-dir/lang/stopwords_hu.txt | 211 +++ .../solr-config-dir/lang/stopwords_hy.txt | 46 + .../solr-config-dir/lang/stopwords_id.txt | 359 ++++ .../solr-config-dir/lang/stopwords_it.txt | 303 ++++ .../solr-config-dir/lang/stopwords_ja.txt | 127 ++ .../solr-config-dir/lang/stopwords_lv.txt | 172 ++ .../solr-config-dir/lang/stopwords_nl.txt | 119 ++ .../solr-config-dir/lang/stopwords_no.txt | 194 ++ .../solr-config-dir/lang/stopwords_pt.txt | 253 +++ .../solr-config-dir/lang/stopwords_ro.txt | 233 +++ .../solr-config-dir/lang/stopwords_ru.txt | 243 +++ .../solr-config-dir/lang/stopwords_sv.txt | 133 ++ .../solr-config-dir/lang/stopwords_th.txt | 119 ++ .../solr-config-dir/lang/stopwords_tr.txt | 212 +++ archiver/solr-config-dir/lang/userdict_ja.txt | 29 + archiver/solr-config-dir/schema.xml | 153 ++ archiver/solr-config-dir/solrconfig.xml | 1076 +++++++++++ 42 files changed, 9016 insertions(+), 33 deletions(-) create mode 100644 
archiver/solr-config-dir/lang/contractions_ca.txt create mode 100644 archiver/solr-config-dir/lang/contractions_fr.txt create mode 100644 archiver/solr-config-dir/lang/contractions_ga.txt create mode 100644 archiver/solr-config-dir/lang/contractions_it.txt create mode 100644 archiver/solr-config-dir/lang/hyphenations_ga.txt create mode 100644 archiver/solr-config-dir/lang/stemdict_nl.txt create mode 100644 archiver/solr-config-dir/lang/stoptags_ja.txt create mode 100644 archiver/solr-config-dir/lang/stopwords_ar.txt create mode 100644 archiver/solr-config-dir/lang/stopwords_bg.txt create mode 100644 archiver/solr-config-dir/lang/stopwords_ca.txt create mode 100644 archiver/solr-config-dir/lang/stopwords_cz.txt create mode 100644 archiver/solr-config-dir/lang/stopwords_da.txt create mode 100644 archiver/solr-config-dir/lang/stopwords_de.txt create mode 100644 archiver/solr-config-dir/lang/stopwords_el.txt create mode 100644 archiver/solr-config-dir/lang/stopwords_en.txt create mode 100644 archiver/solr-config-dir/lang/stopwords_es.txt create mode 100644 archiver/solr-config-dir/lang/stopwords_et.txt create mode 100644 archiver/solr-config-dir/lang/stopwords_eu.txt create mode 100644 archiver/solr-config-dir/lang/stopwords_fa.txt create mode 100644 archiver/solr-config-dir/lang/stopwords_fi.txt create mode 100644 archiver/solr-config-dir/lang/stopwords_fr.txt create mode 100644 archiver/solr-config-dir/lang/stopwords_ga.txt create mode 100644 archiver/solr-config-dir/lang/stopwords_gl.txt create mode 100644 archiver/solr-config-dir/lang/stopwords_hi.txt create mode 100644 archiver/solr-config-dir/lang/stopwords_hu.txt create mode 100644 archiver/solr-config-dir/lang/stopwords_hy.txt create mode 100644 archiver/solr-config-dir/lang/stopwords_id.txt create mode 100644 archiver/solr-config-dir/lang/stopwords_it.txt create mode 100644 archiver/solr-config-dir/lang/stopwords_ja.txt create mode 100644 archiver/solr-config-dir/lang/stopwords_lv.txt create mode 100644 
archiver/solr-config-dir/lang/stopwords_nl.txt create mode 100644 archiver/solr-config-dir/lang/stopwords_no.txt create mode 100644 archiver/solr-config-dir/lang/stopwords_pt.txt create mode 100644 archiver/solr-config-dir/lang/stopwords_ro.txt create mode 100644 archiver/solr-config-dir/lang/stopwords_ru.txt create mode 100644 archiver/solr-config-dir/lang/stopwords_sv.txt create mode 100644 archiver/solr-config-dir/lang/stopwords_th.txt create mode 100644 archiver/solr-config-dir/lang/stopwords_tr.txt create mode 100644 archiver/solr-config-dir/lang/userdict_ja.txt create mode 100644 archiver/solr-config-dir/schema.xml create mode 100644 archiver/solr-config-dir/solrconfig.xml diff --git a/archiver/fts.rkt b/archiver/fts.rkt index 20b21cc..6e2266e 100644 --- a/archiver/fts.rkt +++ b/archiver/fts.rkt @@ -1,4 +1,4 @@ -#lang racket/base +#lang racket (require racket/function racket/future racket/match @@ -9,12 +9,35 @@ file/gunzip db db/unsafe/sqlite3 + net/http-easy json json-pointer "../lib/html-parsing/main.rkt" "../lib/xexpr-utils.rkt" "../lib/tree-updater.rkt") +(define-syntax (seq stx) + (syntax-case stx () + [(_ body ...) + #`(for ([op (list (lambda () body) ...)] + [i (in-naturals)]) + (define res (op)) + (when (>= (response-status-code res) 400) + (error 'seq "op #~a: status code was ~a: ~v" i (response-status-code res) (response-json res))) + (define taskuid (json-pointer-value "/taskUid" (response-json res))) + (for/or ([ticks (in-naturals)] + [res2 (in-producer (lambda () (get (format "http://localhost:7700/tasks/~a" taskuid))))]) + (define status (json-pointer-value "/status" (response-json res2))) + (case status + [("enqueued" "processing") + (sleep 1) + #f] + [("succeeded") + (printf "op #~a: ~a (~a ticks)~n" i status ticks) + #t] + [else + (error 'seq "op #~a: task status was ~a: ~v" i status res2)])))])) + (define (class-has? attributes substrs) (define cl (or (get-attribute 'class attributes) "")) (ormap (λ (substr) (string-contains? 
cl substr)) substrs)) @@ -29,17 +52,27 @@ (define slc (sqlite3-connect #:database "../storage/fts-separate.db")) (sqlite3-load-extension slc "fts5") -(define (writer page) - (for ([bit page]) - (cond - [(memq bit '(div p li td)) (displayln "")] - [(symbol? bit) (void)] - [(and (pair? bit) (eq? (car bit) '*COMMENT*)) (void)] - [(and (pair? bit) (eq? (car bit) '@)) (void)] - [(pair? bit) (writer bit)] - [(string? bit) (display bit)]))) +(define (writer tables-mode? page) + (define (writer-inner page) + (for ([bit page]) + (cond + [(and tables-mode? (pair? bit) (memq (car bit) '(h1 h2 h3 p blockquote q))) (void)] + [(and (not tables-mode?) (pair? bit) (memq (car bit) '(ul ol dl table))) (void)] + [(memq bit '(div p li td dd dt br)) (displayln "")] + [(symbol? bit) (void)] + [(and (pair? bit) (eq? (car bit) '*COMMENT*)) (void)] + [(and (pair? bit) (eq? (car bit) '@)) (void)] + [(pair? bit) (writer-inner bit)] + [(string? bit) (display bit)]))) + (writer-inner page)) -(define wikiname "sto") +(define (write-and-post-process tables-mode? page) + (define text (with-output-to-string (λ () (writer tables-mode? 
page)))) + ;; (define text-no-numbers (regexp-replace* #px"(?:-|[+$£€¥] *)?[0-9,.]{2,}%?\\s*" text "")) + (define shrink-text (regexp-replace* #px"([ \t]*\r?\n+)+" text "\n")) + shrink-text) + +(define wikiname "bloons") (define tablename (format "page_~a" wikiname)) (define ((extract f)) ; f - filename @@ -55,35 +88,115 @@ (read-json in)] [else #f])) (define title (json-pointer-value "/parse/title" j)) + (define pageid (json-pointer-value "/parse/pageid" j)) (define page-html (preprocess-html-wiki (json-pointer-value "/parse/text" j))) (define page (update-tree updater (html->xexp page-html))) - (define text (with-output-to-string (λ () (writer page)))) - (define shrink-text (regexp-replace* #px"([ \t]*\r?\n+)+" text "\n")) - (values title shrink-text))) + (define body (write-and-post-process #f page)) + (define table (write-and-post-process #t page)) + (values title body table pageid))) -(println "extracting text...") (define results - (time - (for/list ([f (directory-list (format "../storage/archive/~a" wikiname) #:build? #t)] - #:when (member (path-get-extension f) '(#".json" #".gz"))) - (extract f)))) + (for/list ([f (directory-list (format "../storage/archive/~a" wikiname) #:build? #t)] + #:when (member (path-get-extension f) '(#".json" #".gz"))) + (extract f))) + +;; *************************************************************************************************** +;; TESTING WRITER +;; *************************************************************************************************** +#;(for/first ([fut results] + [i (in-naturals 1)] + #:when (i . >= . 
4859)) + (define-values (title body table pageid) (fut)) + (println title) + (println body) + (println table)) (println "inserting...") -(query-exec slc "begin transaction") -#;(query-exec slc (format "create virtual table \"~a\" using fts5 (title, body, tokenize='porter unicode61')" wikiname)) -(time - (for ([fut results] - [i (in-naturals 1)]) - (display "-") - (when (and (> i 0) (= (modulo i 100) 0)) - (println i)) - (define-values (title shrink-text) (fut)) - (query-exec slc (format "insert into \"~a\" (title, body) values (?, ?)" tablename) title shrink-text))) -(println "running optimize...") -(query-exec slc (format "insert into \"~a\" (\"~a\") values ('optimize')" tablename tablename)) +;; *************************************************************************************************** +;; SQLite FTS5 +;; *************************************************************************************************** +#;(begin + (query-exec slc "begin transaction") + #;(query-exec slc (format "create virtual table \"~a\" using fts5 (title, body, tokenize='porter unicode61')" wikiname)) + (time + (for ([fut results] + [i (in-naturals 1)]) + (display "-") + (when (and (> i 0) (= (modulo i 100) 0)) + (println i)) + (define-values (title shrink-text) (fut)) + (query-exec slc (format "insert into \"~a\" (title, body) values (?, ?)" tablename) title shrink-text))) -(println "committing...") -(query-exec slc "commit") + (println "running optimize...") + (query-exec slc (format "insert into \"~a\" (\"~a\") values ('optimize')" tablename tablename)) + + (println "committing...") + (query-exec slc "commit")) + +;; *************************************************************************************************** +;; Solr +;; *************************************************************************************************** +(begin + (define data + (cond + #;[(file-exists? 
"cache.rkt") + (println "reading in...") + (with-input-from-file "cache.rkt" (λ () (read)))] + [else + (define data + (for/list ([fut results] + [i (in-naturals 1)]) + (display "-") + (when (and (> i 0) (= (modulo i 100) 0)) + (println i)) + (define-values (title body table pageid) (fut)) + (define len (string-length body)) + `#hasheq((id . ,(number->string pageid)) + (title . ,title) + (body . ,body) + (table . ,table) + (len . ,len)))) + + (println "writing out...") + (with-output-to-file "cache.rkt" (λ () (write data)) #:exists 'truncate/replace) + data])) + + (println "posting...") + (define res + (post (format "http://localhost:8983/solr/~a/update?commit=true" wikiname) + #:json data))) + +;; *************************************************************************************************** +;; Meilisearch +;; *************************************************************************************************** +#;(begin + (seq + (put (format "http://localhost:7700/indexes/~a/settings/searchable-attributes" wikiname) + #:json '("title" "body")) + (put (format "http://localhost:7700/indexes/~a/settings/ranking-rules" wikiname) + #:json '("words" "typo" #;"proximity" "attribute" "sort" "exactness" #;"len:desc")) + (call-with-input-file "stop-words.json" + (λ (in) + (put (format "http://localhost:7700/indexes/~a/settings/stop-words" wikiname) + #:headers '#hasheq((Content-Type . "application/json")) + #:data in)))) + (define data + (for/list ([fut results] + [i (in-naturals 1)]) + (display "-") + (when (and (> i 0) (= (modulo i 100) 0)) + (println i)) + (define-values (title body pageid) (fut)) + (define len (string-length body)) + `#hasheq((id . ,pageid) + (title . ,title) + (body . ,body) + (len . 
,len)))) + (define res + (post (format "http://localhost:7700/indexes/~a/documents" wikiname) + #:json data)) + (seq res) + (println (response-json res))) (disconnect slc) diff --git a/archiver/solr-config-dir/lang/contractions_ca.txt b/archiver/solr-config-dir/lang/contractions_ca.txt new file mode 100644 index 0000000..307a85f --- /dev/null +++ b/archiver/solr-config-dir/lang/contractions_ca.txt @@ -0,0 +1,8 @@ +# Set of Catalan contractions for ElisionFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +d +l +m +n +s +t diff --git a/archiver/solr-config-dir/lang/contractions_fr.txt b/archiver/solr-config-dir/lang/contractions_fr.txt new file mode 100644 index 0000000..f1bba51 --- /dev/null +++ b/archiver/solr-config-dir/lang/contractions_fr.txt @@ -0,0 +1,15 @@ +# Set of French contractions for ElisionFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +l +m +t +qu +n +s +j +d +c +jusqu +quoiqu +lorsqu +puisqu diff --git a/archiver/solr-config-dir/lang/contractions_ga.txt b/archiver/solr-config-dir/lang/contractions_ga.txt new file mode 100644 index 0000000..9ebe7fa --- /dev/null +++ b/archiver/solr-config-dir/lang/contractions_ga.txt @@ -0,0 +1,5 @@ +# Set of Irish contractions for ElisionFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +d +m +b diff --git a/archiver/solr-config-dir/lang/contractions_it.txt b/archiver/solr-config-dir/lang/contractions_it.txt new file mode 100644 index 0000000..cac0409 --- /dev/null +++ b/archiver/solr-config-dir/lang/contractions_it.txt @@ -0,0 +1,23 @@ +# Set of Italian contractions for ElisionFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +c +l +all +dall +dell +nell +sull +coll +pell +gl +agl +dagl +degl +negl +sugl +un +m +t +s +v +d diff --git a/archiver/solr-config-dir/lang/hyphenations_ga.txt b/archiver/solr-config-dir/lang/hyphenations_ga.txt new file mode 100644 index 0000000..4d2642c --- 
/dev/null +++ b/archiver/solr-config-dir/lang/hyphenations_ga.txt @@ -0,0 +1,5 @@ +# Set of Irish hyphenations for StopFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +h +n +t diff --git a/archiver/solr-config-dir/lang/stemdict_nl.txt b/archiver/solr-config-dir/lang/stemdict_nl.txt new file mode 100644 index 0000000..4410729 --- /dev/null +++ b/archiver/solr-config-dir/lang/stemdict_nl.txt @@ -0,0 +1,6 @@ +# Set of overrides for the dutch stemmer +# TODO: load this as a resource from the analyzer and sync it in build.xml +fiets fiets +bromfiets bromfiets +ei eier +kind kinder diff --git a/archiver/solr-config-dir/lang/stoptags_ja.txt b/archiver/solr-config-dir/lang/stoptags_ja.txt new file mode 100644 index 0000000..71b7508 --- /dev/null +++ b/archiver/solr-config-dir/lang/stoptags_ja.txt @@ -0,0 +1,420 @@ +# +# This file defines a Japanese stoptag set for JapanesePartOfSpeechStopFilter. +# +# Any token with a part-of-speech tag that exactly matches those defined in this +# file are removed from the token stream. +# +# Set your own stoptags by uncommenting the lines below. Note that comments are +# not allowed on the same line as a stoptag. See LUCENE-3745 for frequency lists, +# etc. that can be useful for building you own stoptag set. +# +# The entire possible tagset is provided below for convenience. +# +##### +# noun: unclassified nouns +#名詞 +# +# noun-common: Common nouns or nouns where the sub-classification is undefined +#名詞-一般 +# +# noun-proper: Proper nouns where the sub-classification is undefined +#名詞-固有名詞 +# +# noun-proper-misc: miscellaneous proper nouns +#名詞-固有名詞-一般 +# +# noun-proper-person: Personal names where the sub-classification is undefined +#名詞-固有名詞-人名 +# +# noun-proper-person-misc: names that cannot be divided into surname and +# given name; foreign names; names where the surname or given name is unknown. +# e.g. お市の方 +#名詞-固有名詞-人名-一般 +# +# noun-proper-person-surname: Mainly Japanese surnames. +# e.g. 
山田 +#名詞-固有名詞-人名-姓 +# +# noun-proper-person-given_name: Mainly Japanese given names. +# e.g. 太郎 +#名詞-固有名詞-人名-名 +# +# noun-proper-organization: Names representing organizations. +# e.g. 通産省, NHK +#名詞-固有名詞-組織 +# +# noun-proper-place: Place names where the sub-classification is undefined +#名詞-固有名詞-地域 +# +# noun-proper-place-misc: Place names excluding countries. +# e.g. アジア, バルセロナ, 京都 +#名詞-固有名詞-地域-一般 +# +# noun-proper-place-country: Country names. +# e.g. 日本, オーストラリア +#名詞-固有名詞-地域-国 +# +# noun-pronoun: Pronouns where the sub-classification is undefined +#名詞-代名詞 +# +# noun-pronoun-misc: miscellaneous pronouns: +# e.g. それ, ここ, あいつ, あなた, あちこち, いくつ, どこか, なに, みなさん, みんな, わたくし, われわれ +#名詞-代名詞-一般 +# +# noun-pronoun-contraction: Spoken language contraction made by combining a +# pronoun and the particle 'wa'. +# e.g. ありゃ, こりゃ, こりゃあ, そりゃ, そりゃあ +#名詞-代名詞-縮約 +# +# noun-adverbial: Temporal nouns such as names of days or months that behave +# like adverbs. Nouns that represent amount or ratios and can be used adverbially, +# e.g. 金曜, 一月, 午後, 少量 +#名詞-副詞可能 +# +# noun-verbal: Nouns that take arguments with case and can appear followed by +# 'suru' and related verbs (する, できる, なさる, くださる) +# e.g. インプット, 愛着, 悪化, 悪戦苦闘, 一安心, 下取り +#名詞-サ変接続 +# +# noun-adjective-base: The base form of adjectives, words that appear before な ("na") +# e.g. 健康, 安易, 駄目, だめ +#名詞-形容動詞語幹 +# +# noun-numeric: Arabic numbers, Chinese numerals, and counters like 何 (回), 数. +# e.g. 0, 1, 2, 何, 数, 幾 +#名詞-数 +# +# noun-affix: noun affixes where the sub-classification is undefined +#名詞-非自立 +# +# noun-affix-misc: Of adnominalizers, the case-marker の ("no"), and words that +# attach to the base form of inflectional words, words that cannot be classified +# into any of the other categories below. This category includes indefinite nouns. +# e.g. 
あかつき, 暁, かい, 甲斐, 気, きらい, 嫌い, くせ, 癖, こと, 事, ごと, 毎, しだい, 次第, +# 順, せい, 所為, ついで, 序で, つもり, 積もり, 点, どころ, の, はず, 筈, はずみ, 弾み, +# 拍子, ふう, ふり, 振り, ほう, 方, 旨, もの, 物, 者, ゆえ, 故, ゆえん, 所以, わけ, 訳, +# わり, 割り, 割, ん-口語/, もん-口語/ +#名詞-非自立-一般 +# +# noun-affix-adverbial: noun affixes that that can behave as adverbs. +# e.g. あいだ, 間, あげく, 挙げ句, あと, 後, 余り, 以外, 以降, 以後, 以上, 以前, 一方, うえ, +# 上, うち, 内, おり, 折り, かぎり, 限り, きり, っきり, 結果, ころ, 頃, さい, 際, 最中, さなか, +# 最中, じたい, 自体, たび, 度, ため, 為, つど, 都度, とおり, 通り, とき, 時, ところ, 所, +# とたん, 途端, なか, 中, のち, 後, ばあい, 場合, 日, ぶん, 分, ほか, 他, まえ, 前, まま, +# 儘, 侭, みぎり, 矢先 +#名詞-非自立-副詞可能 +# +# noun-affix-aux: noun affixes treated as 助動詞 ("auxiliary verb") in school grammars +# with the stem よう(だ) ("you(da)"). +# e.g. よう, やう, 様 (よう) +#名詞-非自立-助動詞語幹 +# +# noun-affix-adjective-base: noun affixes that can connect to the indeclinable +# connection form な (aux "da"). +# e.g. みたい, ふう +#名詞-非自立-形容動詞語幹 +# +# noun-special: special nouns where the sub-classification is undefined. +#名詞-特殊 +# +# noun-special-aux: The そうだ ("souda") stem form that is used for reporting news, is +# treated as 助動詞 ("auxiliary verb") in school grammars, and attach to the base +# form of inflectional words. +# e.g. そう +#名詞-特殊-助動詞語幹 +# +# noun-suffix: noun suffixes where the sub-classification is undefined. +#名詞-接尾 +# +# noun-suffix-misc: Of the nouns or stem forms of other parts of speech that connect +# to ガル or タイ and can combine into compound nouns, words that cannot be classified into +# any of the other categories below. In general, this category is more inclusive than +# 接尾語 ("suffix") and is usually the last element in a compound noun. +# e.g. おき, かた, 方, 甲斐 (がい), がかり, ぎみ, 気味, ぐるみ, (~した) さ, 次第, 済 (ず) み, +# よう, (でき)っこ, 感, 観, 性, 学, 類, 面, 用 +#名詞-接尾-一般 +# +# noun-suffix-person: Suffixes that form nouns and attach to person names more often +# than other nouns. +# e.g. 君, 様, 著 +#名詞-接尾-人名 +# +# noun-suffix-place: Suffixes that form nouns and attach to place names more often +# than other nouns. +# e.g. 
町, 市, 県 +#名詞-接尾-地域 +# +# noun-suffix-verbal: Of the suffixes that attach to nouns and form nouns, those that +# can appear before スル ("suru"). +# e.g. 化, 視, 分け, 入り, 落ち, 買い +#名詞-接尾-サ変接続 +# +# noun-suffix-aux: The stem form of そうだ (様態) that is used to indicate conditions, +# is treated as 助動詞 ("auxiliary verb") in school grammars, and attach to the +# conjunctive form of inflectional words. +# e.g. そう +#名詞-接尾-助動詞語幹 +# +# noun-suffix-adjective-base: Suffixes that attach to other nouns or the conjunctive +# form of inflectional words and appear before the copula だ ("da"). +# e.g. 的, げ, がち +#名詞-接尾-形容動詞語幹 +# +# noun-suffix-adverbial: Suffixes that attach to other nouns and can behave as adverbs. +# e.g. 後 (ご), 以後, 以降, 以前, 前後, 中, 末, 上, 時 (じ) +#名詞-接尾-副詞可能 +# +# noun-suffix-classifier: Suffixes that attach to numbers and form nouns. This category +# is more inclusive than 助数詞 ("classifier") and includes common nouns that attach +# to numbers. +# e.g. 個, つ, 本, 冊, パーセント, cm, kg, カ月, か国, 区画, 時間, 時半 +#名詞-接尾-助数詞 +# +# noun-suffix-special: Special suffixes that mainly attach to inflecting words. +# e.g. (楽し) さ, (考え) 方 +#名詞-接尾-特殊 +# +# noun-suffix-conjunctive: Nouns that behave like conjunctions and join two words +# together. +# e.g. (日本) 対 (アメリカ), 対 (アメリカ), (3) 対 (5), (女優) 兼 (主婦) +#名詞-接続詞的 +# +# noun-verbal_aux: Nouns that attach to the conjunctive particle て ("te") and are +# semantically verb-like. +# e.g. ごらん, ご覧, 御覧, 頂戴 +#名詞-動詞非自立的 +# +# noun-quotation: text that cannot be segmented into words, proverbs, Chinese poetry, +# dialects, English, etc. Currently, the only entry for 名詞 引用文字列 ("noun quotation") +# is いわく ("iwaku"). +#名詞-引用文字列 +# +# noun-nai_adjective: Words that appear before the auxiliary verb ない ("nai") and +# behave like an adjective. +# e.g. 申し訳, 仕方, とんでも, 違い +#名詞-ナイ形容詞語幹 +# +##### +# prefix: unclassified prefixes +#接頭詞 +# +# prefix-nominal: Prefixes that attach to nouns (including adjective stem forms) +# excluding numerical expressions. +# e.g. 
お (水), 某 (氏), 同 (社), 故 (~氏), 高 (品質), お (見事), ご (立派) +#接頭詞-名詞接続 +# +# prefix-verbal: Prefixes that attach to the imperative form of a verb or a verb +# in conjunctive form followed by なる/なさる/くださる. +# e.g. お (読みなさい), お (座り) +#接頭詞-動詞接続 +# +# prefix-adjectival: Prefixes that attach to adjectives. +# e.g. お (寒いですねえ), バカ (でかい) +#接頭詞-形容詞接続 +# +# prefix-numerical: Prefixes that attach to numerical expressions. +# e.g. 約, およそ, 毎時 +#接頭詞-数接続 +# +##### +# verb: unclassified verbs +#動詞 +# +# verb-main: +#動詞-自立 +# +# verb-auxiliary: +#動詞-非自立 +# +# verb-suffix: +#動詞-接尾 +# +##### +# adjective: unclassified adjectives +#形容詞 +# +# adjective-main: +#形容詞-自立 +# +# adjective-auxiliary: +#形容詞-非自立 +# +# adjective-suffix: +#形容詞-接尾 +# +##### +# adverb: unclassified adverbs +#副詞 +# +# adverb-misc: Words that can be segmented into one unit and where adnominal +# modification is not possible. +# e.g. あいかわらず, 多分 +#副詞-一般 +# +# adverb-particle_conjunction: Adverbs that can be followed by の, は, に, +# な, する, だ, etc. +# e.g. こんなに, そんなに, あんなに, なにか, なんでも +#副詞-助詞類接続 +# +##### +# adnominal: Words that only have noun-modifying forms. +# e.g. この, その, あの, どの, いわゆる, なんらかの, 何らかの, いろんな, こういう, そういう, ああいう, +# どういう, こんな, そんな, あんな, どんな, 大きな, 小さな, おかしな, ほんの, たいした, +# 「(, も) さる (ことながら)」, 微々たる, 堂々たる, 単なる, いかなる, 我が」「同じ, 亡き +#連体詞 +# +##### +# conjunction: Conjunctions that can occur independently. +# e.g. が, けれども, そして, じゃあ, それどころか +接続詞 +# +##### +# particle: unclassified particles. +助詞 +# +# particle-case: case particles where the subclassification is undefined. +助詞-格助詞 +# +# particle-case-misc: Case particles. +# e.g. から, が, で, と, に, へ, より, を, の, にて +助詞-格助詞-一般 +# +# particle-case-quote: the "to" that appears after nouns, a person’s speech, +# quotation marks, expressions of decisions from a meeting, reasons, judgements, +# conjectures, etc. +# e.g. ( だ) と (述べた.), ( である) と (して執行猶予...) +助詞-格助詞-引用 +# +# particle-case-compound: Compounds of particles and verbs that mainly behave +# like case particles. +# e.g. 
という, といった, とかいう, として, とともに, と共に, でもって, にあたって, に当たって, に当って, +# にあたり, に当たり, に当り, に当たる, にあたる, において, に於いて,に於て, における, に於ける, +# にかけ, にかけて, にかんし, に関し, にかんして, に関して, にかんする, に関する, に際し, +# に際して, にしたがい, に従い, に従う, にしたがって, に従って, にたいし, に対し, にたいして, +# に対して, にたいする, に対する, について, につき, につけ, につけて, につれ, につれて, にとって, +# にとり, にまつわる, によって, に依って, に因って, により, に依り, に因り, による, に依る, に因る, +# にわたって, にわたる, をもって, を以って, を通じ, を通じて, を通して, をめぐって, をめぐり, をめぐる, +# って-口語/, ちゅう-関西弁「という」/, (何) ていう (人)-口語/, っていう-口語/, といふ, とかいふ +助詞-格助詞-連語 +# +# particle-conjunctive: +# e.g. から, からには, が, けれど, けれども, けど, し, つつ, て, で, と, ところが, どころか, とも, ども, +# ながら, なり, ので, のに, ば, ものの, や ( した), やいなや, (ころん) じゃ(いけない)-口語/, +# (行っ) ちゃ(いけない)-口語/, (言っ) たって (しかたがない)-口語/, (それがなく)ったって (平気)-口語/ +助詞-接続助詞 +# +# particle-dependency: +# e.g. こそ, さえ, しか, すら, は, も, ぞ +助詞-係助詞 +# +# particle-adverbial: +# e.g. がてら, かも, くらい, 位, ぐらい, しも, (学校) じゃ(これが流行っている)-口語/, +# (それ)じゃあ (よくない)-口語/, ずつ, (私) なぞ, など, (私) なり (に), (先生) なんか (大嫌い)-口語/, +# (私) なんぞ, (先生) なんて (大嫌い)-口語/, のみ, だけ, (私) だって-口語/, だに, +# (彼)ったら-口語/, (お茶) でも (いかが), 等 (とう), (今後) とも, ばかり, ばっか-口語/, ばっかり-口語/, +# ほど, 程, まで, 迄, (誰) も (が)([助詞-格助詞] および [助詞-係助詞] の前に位置する「も」) +助詞-副助詞 +# +# particle-interjective: particles with interjective grammatical roles. +# e.g. (松島) や +助詞-間投助詞 +# +# particle-coordinate: +# e.g. と, たり, だの, だり, とか, なり, や, やら +助詞-並立助詞 +# +# particle-final: +# e.g. かい, かしら, さ, ぜ, (だ)っけ-口語/, (とまってる) で-方言/, な, ナ, なあ-口語/, ぞ, ね, ネ, +# ねぇ-口語/, ねえ-口語/, ねん-方言/, の, のう-口語/, や, よ, ヨ, よぉ-口語/, わ, わい-口語/ +助詞-終助詞 +# +# particle-adverbial/conjunctive/final: The particle "ka" when unknown whether it is +# adverbial, conjunctive, or sentence final. For example: +# (a) 「A か B か」. Ex:「(国内で運用する) か,(海外で運用する) か (.)」 +# (b) Inside an adverb phrase. Ex:「(幸いという) か (, 死者はいなかった.)」 +# 「(祈りが届いたせい) か (, 試験に合格した.)」 +# (c) 「かのように」. Ex:「(何もなかった) か (のように振る舞った.)」 +# e.g. か +助詞-副助詞/並立助詞/終助詞 +# +# particle-adnominalizer: The "no" that attaches to nouns and modifies +# non-inflectional words. 
+助詞-連体化 +# +# particle-adnominalizer: The "ni" and "to" that appear following nouns and adverbs +# that are giongo, giseigo, or gitaigo. +# e.g. に, と +助詞-副詞化 +# +# particle-special: A particle that does not fit into one of the above classifications. +# This includes particles that are used in Tanka, Haiku, and other poetry. +# e.g. かな, けむ, ( しただろう) に, (あんた) にゃ(わからん), (俺) ん (家) +助詞-特殊 +# +##### +# auxiliary-verb: +助動詞 +# +##### +# interjection: Greetings and other exclamations. +# e.g. おはよう, おはようございます, こんにちは, こんばんは, ありがとう, どうもありがとう, ありがとうございます, +# いただきます, ごちそうさま, さよなら, さようなら, はい, いいえ, ごめん, ごめんなさい +#感動詞 +# +##### +# symbol: unclassified Symbols. +記号 +# +# symbol-misc: A general symbol not in one of the categories below. +# e.g. [○◎@$〒→+] +記号-一般 +# +# symbol-comma: Commas +# e.g. [,、] +記号-読点 +# +# symbol-period: Periods and full stops. +# e.g. [..。] +記号-句点 +# +# symbol-space: Full-width whitespace. +記号-空白 +# +# symbol-open_bracket: +# e.g. [({‘“『【] +記号-括弧開 +# +# symbol-close_bracket: +# e.g. [)}’”』」】] +記号-括弧閉 +# +# symbol-alphabetic: +#記号-アルファベット +# +##### +# other: unclassified other +#その他 +# +# other-interjection: Words that are hard to classify as noun-suffixes or +# sentence-final particles. +# e.g. (だ)ァ +その他-間投 +# +##### +# filler: Aizuchi that occurs during a conversation or sounds inserted as filler. +# e.g. あの, うんと, えと +フィラー +# +##### +# non-verbal: non-verbal sound. +非言語音 +# +##### +# fragment: +#語断片 +# +##### +# unknown: unknown part of speech. +#未知語 +# +##### End of file diff --git a/archiver/solr-config-dir/lang/stopwords_ar.txt b/archiver/solr-config-dir/lang/stopwords_ar.txt new file mode 100644 index 0000000..046829d --- /dev/null +++ b/archiver/solr-config-dir/lang/stopwords_ar.txt @@ -0,0 +1,125 @@ +# This file was created by Jacques Savoy and is distributed under the BSD license. +# See http://members.unine.ch/jacques.savoy/clef/index.html. 
+# Also see http://www.opensource.org/licenses/bsd-license.html +# Cleaned on October 11, 2009 (not normalized, so use before normalization) +# This means that when modifying this list, you might need to add some +# redundant entries, for example containing forms with both أ and ا +من +ومن +منها +منه +في +وفي +فيها +فيه +و +ف +ثم +او +أو +ب +بها +به +ا +أ +اى +اي +أي +أى +لا +ولا +الا +ألا +إلا +لكن +ما +وما +كما +فما +عن +مع +اذا +إذا +ان +أن +إن +انها +أنها +إنها +انه +أنه +إنه +بان +بأن +فان +فأن +وان +وأن +وإن +التى +التي +الذى +الذي +الذين +الى +الي +إلى +إلي +على +عليها +عليه +اما +أما +إما +ايضا +أيضا +كل +وكل +لم +ولم +لن +ولن +هى +هي +هو +وهى +وهي +وهو +فهى +فهي +فهو +انت +أنت +لك +لها +له +هذه +هذا +تلك +ذلك +هناك +كانت +كان +يكون +تكون +وكانت +وكان +غير +بعض +قد +نحو +بين +بينما +منذ +ضمن +حيث +الان +الآن +خلال +بعد +قبل +حتى +عند +عندما +لدى +جميع diff --git a/archiver/solr-config-dir/lang/stopwords_bg.txt b/archiver/solr-config-dir/lang/stopwords_bg.txt new file mode 100644 index 0000000..1ae4ba2 --- /dev/null +++ b/archiver/solr-config-dir/lang/stopwords_bg.txt @@ -0,0 +1,193 @@ +# This file was created by Jacques Savoy and is distributed under the BSD license. +# See http://members.unine.ch/jacques.savoy/clef/index.html. 
+# Also see http://www.opensource.org/licenses/bsd-license.html +а +аз +ако +ала +бе +без +беше +би +бил +била +били +било +близо +бъдат +бъде +бяха +в +вас +ваш +ваша +вероятно +вече +взема +ви +вие +винаги +все +всеки +всички +всичко +всяка +във +въпреки +върху +г +ги +главно +го +д +да +дали +до +докато +докога +дори +досега +доста +е +едва +един +ето +за +зад +заедно +заради +засега +затова +защо +защото +и +из +или +им +има +имат +иска +й +каза +как +каква +какво +както +какъв +като +кога +когато +което +които +кой +който +колко +която +къде +където +към +ли +м +ме +между +мен +ми +мнозина +мога +могат +може +моля +момента +му +н +на +над +назад +най +направи +напред +например +нас +не +него +нея +ни +ние +никой +нито +но +някои +някой +няма +обаче +около +освен +особено +от +отгоре +отново +още +пак +по +повече +повечето +под +поне +поради +после +почти +прави +пред +преди +през +при +пък +първо +с +са +само +се +сега +си +скоро +след +сме +според +сред +срещу +сте +съм +със +също +т +тази +така +такива +такъв +там +твой +те +тези +ти +тн +то +това +тогава +този +той +толкова +точно +трябва +тук +тъй +тя +тях +у +харесва +ч +че +често +чрез +ще +щом +я diff --git a/archiver/solr-config-dir/lang/stopwords_ca.txt b/archiver/solr-config-dir/lang/stopwords_ca.txt new file mode 100644 index 0000000..3da65de --- /dev/null +++ b/archiver/solr-config-dir/lang/stopwords_ca.txt @@ -0,0 +1,220 @@ +# Catalan stopwords from http://github.com/vcl/cue.language (Apache 2 Licensed) +a +abans +ací +ah +així +això +al +als +aleshores +algun +alguna +algunes +alguns +alhora +allà +allí +allò +altra +altre +altres +amb +ambdós +ambdues +apa +aquell +aquella +aquelles +aquells +aquest +aquesta +aquestes +aquests +aquí +baix +cada +cadascú +cadascuna +cadascunes +cadascuns +com +contra +d'un +d'una +d'unes +d'uns +dalt +de +del +dels +des +després +dins +dintre +donat +doncs +durant +e +eh +el +els +em +en +encara +ens +entre +érem +eren +éreu +es +és +esta +està +estàvem +estaven 
+estàveu +esteu +et +etc +ets +fins +fora +gairebé +ha +han +has +havia +he +hem +heu +hi +ho +i +igual +iguals +ja +l'hi +la +les +li +li'n +llavors +m'he +ma +mal +malgrat +mateix +mateixa +mateixes +mateixos +me +mentre +més +meu +meus +meva +meves +molt +molta +moltes +molts +mon +mons +n'he +n'hi +ne +ni +no +nogensmenys +només +nosaltres +nostra +nostre +nostres +o +oh +oi +on +pas +pel +pels +per +però +perquè +poc +poca +pocs +poques +potser +propi +qual +quals +quan +quant +que +què +quelcom +qui +quin +quina +quines +quins +s'ha +s'han +sa +semblant +semblants +ses +seu +seus +seva +seva +seves +si +sobre +sobretot +sóc +solament +sols +son +són +sons +sota +sou +t'ha +t'han +t'he +ta +tal +també +tampoc +tan +tant +tanta +tantes +teu +teus +teva +teves +ton +tons +tot +tota +totes +tots +un +una +unes +uns +us +va +vaig +vam +van +vas +veu +vosaltres +vostra +vostre +vostres diff --git a/archiver/solr-config-dir/lang/stopwords_cz.txt b/archiver/solr-config-dir/lang/stopwords_cz.txt new file mode 100644 index 0000000..53c6097 --- /dev/null +++ b/archiver/solr-config-dir/lang/stopwords_cz.txt @@ -0,0 +1,172 @@ +a +s +k +o +i +u +v +z +dnes +cz +tímto +budeš +budem +byli +jseš +můj +svým +ta +tomto +tohle +tuto +tyto +jej +zda +proč +máte +tato +kam +tohoto +kdo +kteří +mi +nám +tom +tomuto +mít +nic +proto +kterou +byla +toho +protože +asi +ho +naši +napište +re +což +tím +takže +svých +její +svými +jste +aj +tu +tedy +teto +bylo +kde +ke +pravé +ji +nad +nejsou +či +pod +téma +mezi +přes +ty +pak +vám +ani +když +však +neg +jsem +tento +článku +články +aby +jsme +před +pta +jejich +byl +ještě +až +bez +také +pouze +první +vaše +která +nás +nový +tipy +pokud +může +strana +jeho +své +jiné +zprávy +nové +není +vás +jen +podle +zde +už +být +více +bude +již +než +který +by +které +co +nebo +ten +tak +má +při +od +po +jsou +jak +další +ale +si +se +ve +to +jako +za +zpět +ze +do +pro +je +na +atd +atp +jakmile +přičemž +já +on +ona +ono +oni +ony +my +vy +jí 
+ji +mě +mne +jemu +tomu +těm +těmu +němu +němuž +jehož +jíž +jelikož +jež +jakož +načež diff --git a/archiver/solr-config-dir/lang/stopwords_da.txt b/archiver/solr-config-dir/lang/stopwords_da.txt new file mode 100644 index 0000000..42e6145 --- /dev/null +++ b/archiver/solr-config-dir/lang/stopwords_da.txt @@ -0,0 +1,110 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/danish/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Danish stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This is a ranked list (commonest to rarest) of stopwords derived from + | a large text sample. + + +og | and +i | in +jeg | I +det | that (dem. pronoun)/it (pers. pronoun) +at | that (in front of a sentence)/to (with infinitive) +en | a/an +den | it (pers. pronoun)/that (dem. 
pronoun) +til | to/at/for/until/against/by/of/into, more +er | present tense of "to be" +som | who, as +på | on/upon/in/on/at/to/after/of/with/for, on +de | they +med | with/by/in, along +han | he +af | of/by/from/off/for/in/with/on, off +for | at/for/to/from/by/of/ago, in front/before, because +ikke | not +der | who/which, there/those +var | past tense of "to be" +mig | me/myself +sig | oneself/himself/herself/itself/themselves +men | but +et | a/an/one, one (number), someone/somebody/one +har | present tense of "to have" +om | round/about/for/in/a, about/around/down, if +vi | we +min | my +havde | past tense of "to have" +ham | him +hun | she +nu | now +over | over/above/across/by/beyond/past/on/about, over/past +da | then, when/as/since +fra | from/off/since, off, since +du | you +ud | out +sin | his/her/its/one's +dem | them +os | us/ourselves +op | up +man | you/one +hans | his +hvor | where +eller | or +hvad | what +skal | must/shall etc. +selv | myself/youself/herself/ourselves etc., even +her | here +alle | all/everyone/everybody etc. 
+vil | will (verb) +blev | past tense of "to stay/to remain/to get/to become" +kunne | could +ind | in +når | when +være | present tense of "to be" +dog | however/yet/after all +noget | something +ville | would +jo | you know/you see (adv), yes +deres | their/theirs +efter | after/behind/according to/for/by/from, later/afterwards +ned | down +skulle | should +denne | this +end | than +dette | this +mit | my/mine +også | also +under | under/beneath/below/during, below/underneath +have | have +dig | you +anden | other +hende | her +mine | my +alt | everything +meget | much/very, plenty of +sit | his, her, its, one's +sine | his, her, its, one's +vor | our +mod | against +disse | these +hvis | if +din | your/yours +nogle | some +hos | by/at +blive | be/become +mange | many +ad | by/through +bliver | present tense of "to be/to become" +hendes | her/hers +været | be +thi | for (conj) +jer | you +sådan | such, like this/like that diff --git a/archiver/solr-config-dir/lang/stopwords_de.txt b/archiver/solr-config-dir/lang/stopwords_de.txt new file mode 100644 index 0000000..86525e7 --- /dev/null +++ b/archiver/solr-config-dir/lang/stopwords_de.txt @@ -0,0 +1,294 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/german/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A German stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | The number of forms in this list is reduced significantly by passing it + | through the German stemmer. 
+ + +aber | but + +alle | all +allem +allen +aller +alles + +als | than, as +also | so +am | an + dem +an | at + +ander | other +andere +anderem +anderen +anderer +anderes +anderm +andern +anderr +anders + +auch | also +auf | on +aus | out of +bei | by +bin | am +bis | until +bist | art +da | there +damit | with it +dann | then + +der | the +den +des +dem +die +das + +daß | that + +derselbe | the same +derselben +denselben +desselben +demselben +dieselbe +dieselben +dasselbe + +dazu | to that + +dein | thy +deine +deinem +deinen +deiner +deines + +denn | because + +derer | of those +dessen | of him + +dich | thee +dir | to thee +du | thou + +dies | this +diese +diesem +diesen +dieser +dieses + + +doch | (several meanings) +dort | (over) there + + +durch | through + +ein | a +eine +einem +einen +einer +eines + +einig | some +einige +einigem +einigen +einiger +einiges + +einmal | once + +er | he +ihn | him +ihm | to him + +es | it +etwas | something + +euer | your +eure +eurem +euren +eurer +eures + +für | for +gegen | towards +gewesen | p.p. 
of sein +hab | have +habe | have +haben | have +hat | has +hatte | had +hatten | had +hier | here +hin | there +hinter | behind + +ich | I +mich | me +mir | to me + + +ihr | you, to her +ihre +ihrem +ihren +ihrer +ihres +euch | to you + +im | in + dem +in | in +indem | while +ins | in + das +ist | is + +jede | each, every +jedem +jeden +jeder +jedes + +jene | that +jenem +jenen +jener +jenes + +jetzt | now +kann | can + +kein | no +keine +keinem +keinen +keiner +keines + +können | can +könnte | could +machen | do +man | one + +manche | some, many a +manchem +manchen +mancher +manches + +mein | my +meine +meinem +meinen +meiner +meines + +mit | with +muss | must +musste | had to +nach | to(wards) +nicht | not +nichts | nothing +noch | still, yet +nun | now +nur | only +ob | whether +oder | or +ohne | without +sehr | very + +sein | his +seine +seinem +seinen +seiner +seines + +selbst | self +sich | herself + +sie | they, she +ihnen | to them + +sind | are +so | so + +solche | such +solchem +solchen +solcher +solches + +soll | shall +sollte | should +sondern | but +sonst | else +über | over +um | about, around +und | and + +uns | us +unse +unsem +unsen +unser +unses + +unter | under +viel | much +vom | von + dem +von | from +vor | before +während | while +war | was +waren | were +warst | wast +was | what +weg | away, off +weil | because +weiter | further + +welche | which +welchem +welchen +welcher +welches + +wenn | when +werde | will +werden | will +wie | how +wieder | again +will | want +wir | we +wird | will +wirst | willst +wo | where +wollen | want +wollte | wanted +würde | would +würden | would +zu | to +zum | zu + dem +zur | zu + der +zwar | indeed +zwischen | between + diff --git a/archiver/solr-config-dir/lang/stopwords_el.txt b/archiver/solr-config-dir/lang/stopwords_el.txt new file mode 100644 index 0000000..232681f --- /dev/null +++ b/archiver/solr-config-dir/lang/stopwords_el.txt @@ -0,0 +1,78 @@ +# Lucene Greek Stopwords list +# Note: by default this 
file is used after GreekLowerCaseFilter, +# so when modifying this file use 'σ' instead of 'ς' +ο +η +το +οι +τα +του +τησ +των +τον +την +και +κι +κ +ειμαι +εισαι +ειναι +ειμαστε +ειστε +στο +στον +στη +στην +μα +αλλα +απο +για +προσ +με +σε +ωσ +παρα +αντι +κατα +μετα +θα +να +δε +δεν +μη +μην +επι +ενω +εαν +αν +τοτε +που +πωσ +ποιοσ +ποια +ποιο +ποιοι +ποιεσ +ποιων +ποιουσ +αυτοσ +αυτη +αυτο +αυτοι +αυτων +αυτουσ +αυτεσ +αυτα +εκεινοσ +εκεινη +εκεινο +εκεινοι +εκεινεσ +εκεινα +εκεινων +εκεινουσ +οπωσ +ομωσ +ισωσ +οσο +οτι diff --git a/archiver/solr-config-dir/lang/stopwords_en.txt b/archiver/solr-config-dir/lang/stopwords_en.txt new file mode 100644 index 0000000..2c164c0 --- /dev/null +++ b/archiver/solr-config-dir/lang/stopwords_en.txt @@ -0,0 +1,54 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# a couple of test stopwords to test that the words are really being +# configured from this file: +stopworda +stopwordb + +# Standard english stop words taken from Lucene's StopAnalyzer +a +an +and +are +as +at +be +but +by +for +if +in +into +is +it +no +not +of +on +or +such +that +the +their +then +there +these +they +this +to +was +will +with diff --git a/archiver/solr-config-dir/lang/stopwords_es.txt b/archiver/solr-config-dir/lang/stopwords_es.txt new file mode 100644 index 0000000..487d78c --- /dev/null +++ b/archiver/solr-config-dir/lang/stopwords_es.txt @@ -0,0 +1,356 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/spanish/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Spanish stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + + | The following is a ranked list (commonest to rarest) of stopwords + | deriving from a large sample of text. + + | Extra words have been added at the end. 
+ +de | from, of +la | the, her +que | who, that +el | the +en | in +y | and +a | to +los | the, them +del | de + el +se | himself, from him etc +las | the, them +por | for, by, etc +un | a +para | for +con | with +no | no +una | a +su | his, her +al | a + el + | es from SER +lo | him +como | how +más | more +pero | pero +sus | su plural +le | to him, her +ya | already +o | or + | fue from SER +este | this + | ha from HABER +sí | himself etc +porque | because +esta | this + | son from SER +entre | between + | está from ESTAR +cuando | when +muy | very +sin | without +sobre | on + | ser from SER + | tiene from TENER +también | also +me | me +hasta | until +hay | there is/are +donde | where + | han from HABER +quien | whom, that + | están from ESTAR + | estado from ESTAR +desde | from +todo | all +nos | us +durante | during + | estados from ESTAR +todos | all +uno | a +les | to them +ni | nor +contra | against +otros | other + | fueron from SER +ese | that +eso | that + | había from HABER +ante | before +ellos | they +e | and (variant of y) +esto | this +mí | me +antes | before +algunos | some +qué | what? 
+unos | a +yo | I +otro | other +otras | other +otra | other +él | he +tanto | so much, many +esa | that +estos | these +mucho | much, many +quienes | who +nada | nothing +muchos | many +cual | who + | sea from SER +poco | few +ella | she +estar | to be + | haber from HABER +estas | these + | estaba from ESTAR + | estamos from ESTAR +algunas | some +algo | something +nosotros | we + + | other forms + +mi | me +mis | mi plural +tú | thou +te | thee +ti | thee +tu | thy +tus | tu plural +ellas | they +nosotras | we +vosotros | you +vosotras | you +os | you +mío | mine +mía | +míos | +mías | +tuyo | thine +tuya | +tuyos | +tuyas | +suyo | his, hers, theirs +suya | +suyos | +suyas | +nuestro | ours +nuestra | +nuestros | +nuestras | +vuestro | yours +vuestra | +vuestros | +vuestras | +esos | those +esas | those + + | forms of estar, to be (not including the infinitive): +estoy +estás +está +estamos +estáis +están +esté +estés +estemos +estéis +estén +estaré +estarás +estará +estaremos +estaréis +estarán +estaría +estarías +estaríamos +estaríais +estarían +estaba +estabas +estábamos +estabais +estaban +estuve +estuviste +estuvo +estuvimos +estuvisteis +estuvieron +estuviera +estuvieras +estuviéramos +estuvierais +estuvieran +estuviese +estuvieses +estuviésemos +estuvieseis +estuviesen +estando +estado +estada +estados +estadas +estad + + | forms of haber, to have (not including the infinitive): +he +has +ha +hemos +habéis +han +haya +hayas +hayamos +hayáis +hayan +habré +habrás +habrá +habremos +habréis +habrán +habría +habrías +habríamos +habríais +habrían +había +habías +habíamos +habíais +habían +hube +hubiste +hubo +hubimos +hubisteis +hubieron +hubiera +hubieras +hubiéramos +hubierais +hubieran +hubiese +hubieses +hubiésemos +hubieseis +hubiesen +habiendo +habido +habida +habidos +habidas + + | forms of ser, to be (not including the infinitive): +soy +eres +es +somos +sois +son +sea +seas +seamos +seáis +sean +seré +serás +será +seremos +seréis +serán +sería 
+serías +seríamos +seríais +serían +era +eras +éramos +erais +eran +fui +fuiste +fue +fuimos +fuisteis +fueron +fuera +fueras +fuéramos +fuerais +fueran +fuese +fueses +fuésemos +fueseis +fuesen +siendo +sido + | sed also means 'thirst' + + | forms of tener, to have (not including the infinitive): +tengo +tienes +tiene +tenemos +tenéis +tienen +tenga +tengas +tengamos +tengáis +tengan +tendré +tendrás +tendrá +tendremos +tendréis +tendrán +tendría +tendrías +tendríamos +tendríais +tendrían +tenía +tenías +teníamos +teníais +tenían +tuve +tuviste +tuvo +tuvimos +tuvisteis +tuvieron +tuviera +tuvieras +tuviéramos +tuvierais +tuvieran +tuviese +tuvieses +tuviésemos +tuvieseis +tuviesen +teniendo +tenido +tenida +tenidos +tenidas +tened + diff --git a/archiver/solr-config-dir/lang/stopwords_et.txt b/archiver/solr-config-dir/lang/stopwords_et.txt new file mode 100644 index 0000000..1b06a13 --- /dev/null +++ b/archiver/solr-config-dir/lang/stopwords_et.txt @@ -0,0 +1,1603 @@ +# Estonian stopwords list +all +alla +allapoole +allpool +alt +altpoolt +eel +eespool +enne +hommikupoole +hoolimata +ilma +kaudu +keset +kesk +kohe +koos +kuhupoole +kuni +kuspool +kustpoolt +kõige +käsikäes +lappi +ligi +läbi +mööda +paitsi +peale +pealepoole +pealpool +pealt +pealtpoolt +piki +pikku +piku +pikuti +põiki +pärast +päri +risti +sealpool +sealtpoolt +seespool +seltsis +siiapoole +siinpool +siitpoolt +sinnapoole +sissepoole +taga +tagantpoolt +tagapidi +tagapool +taha +tahapoole +teispool +teispoole +tänu +tükkis +vaatamata +vastu +väljapoole +väljaspool +väljastpoolt +õhtupoole +ühes +ühestükis +ühestükkis +ülalpool +ülaltpoolt +üle +ülespoole +ülevalpool +ülevaltpoolt +ümber +ümbert +aegu +aegus +alguks +algul +algule +algult +alguni +all +alla +alt +alul +alutsi +arvel +asemel +asemele +eel +eeli +ees +eesotsas +eest +eestotsast +esitsi +ette +etteotsa +haaval +heaks +hoolimata +hulgas +hulgast +hulka +jalgu +jalus +jalust +jaoks +jooksul +juurde +juures +juurest +jälil +jälile 
+järel +järele +järelt +järgi +kaasas +kallal +kallale +kallalt +kamul +kannul +kannule +kannult +kaudu +kaupa +keskel +keskele +keskelt +keskis +keskpaiku +kestel +kestes +kilda +killas +killast +kimpu +kimpus +kiuste +kohal +kohale +kohalt +kohaselt +kohe +kohta +koos +korral +kukil +kukile +kukilt +kulul +kõrva +kõrval +kõrvale +kõrvalt +kõrvas +kõrvast +käekõrval +käekõrvale +käekõrvalt +käes +käest +kätte +külge +küljes +küljest +küüsi +küüsis +küüsist +ligi +ligidal +ligidale +ligidalt +aegu +aegus +alguks +algul +algule +algult +alguni +all +alla +alt +alul +alutsi +arvel +asemel +asemele +eel +eeli +ees +eesotsas +eest +eestotsast +esitsi +ette +etteotsa +haaval +heaks +hoolimata +hulgas +hulgast +hulka +jalgu +jalus +jalust +jaoks +jooksul +juurde +juures +juurest +jälil +jälile +järel +järele +järelt +järgi +kaasas +kallal +kallale +kallalt +kamul +kannul +kannule +kannult +kaudu +kaupa +keskel +keskele +keskelt +keskis +keskpaiku +kestel +kestes +kilda +killas +killast +kimpu +kimpus +kiuste +kohal +kohale +kohalt +kohaselt +kohe +kohta +koos +korral +kukil +kukile +kukilt +kulul +kõrva +kõrval +kõrvale +kõrvalt +kõrvas +kõrvast +käekõrval +käekõrvale +käekõrvalt +käes +käest +kätte +külge +küljes +küljest +küüsi +küüsis +küüsist +ligi +ligidal +ligidale +ligidalt +lool +läbi +lähedal +lähedale +lähedalt +man +mant +manu +meelest +mööda +nahas +nahka +nahkas +najal +najale +najalt +nõjal +nõjale +otsa +otsas +otsast +paigale +paigu +paiku +peal +peale +pealt +perra +perrä +pidi +pihta +piki +pikku +pool +poole +poolest +poolt +puhul +puksiiris +pähe +päralt +päras +pärast +päri +ringi +ringis +risust +saadetusel +saadik +saatel +saati +seas +seast +sees +seest +sekka +seljataga +seltsi +seltsis +seltsist +sisse +slepis +suhtes +šlepis +taga +tagant +tagantotsast +tagaotsas +tagaselja +tagasi +tagast +tagutsi +taha +tahaotsa +takka +tarvis +tasa +tuuri +tuuris +tõttu +tükkis +uhal +vaatamata +vahel +vahele +vahelt +vahepeal +vahepeale +vahepealt +vahetsi 
+varal +varale +varul +vastas +vastast +vastu +veerde +veeres +viisi +võidu +võrd +võrdki +võrra +võrragi +väel +väele +vältel +väärt +väärtki +äärde +ääre +ääres +äärest +ühes +üle +ümber +ümbert +a +abil +aina +ainult +alalt +alates +alati +alles +b +c +d +e +eales +ealeski +edasi +edaspidi +eelkõige +eemal +ei +eks +end +enda +enese +ennem +esialgu +f +g +h +hoopis +i +iganes +igatahes +igati +iial +iialgi +ikka +ikkagi +ilmaski +iseenda +iseenese +iseenesest +isegi +j +jah +ju +juba +juhul +just +järelikult +k +ka +kah +kas +kasvõi +keda +kestahes +kogu +koguni +kohati +kokku +kuhu +kuhugi +kuidagi +kuidas +kunagi +kus +kusagil +kusjuures +kuskil +kust +kõigepealt +küll +l +liiga +lisaks +m +miks +mil +millal +millalgi +mispärast +mistahes +mistõttu +mitte +muide +muidu +muidugi +muist +mujal +mujale +mujalt +mõlemad +mõnda +mõne +mõnikord +n +nii +niikaua +niimoodi +niipaljuke +niisama +niisiis +niivõrd +nõnda +nüüd +o +omaette +omakorda +omavahel +ometi +p +palju +paljuke +palju-palju +peaaegu +peagi +peamiselt +pigem +pisut +praegu +päris +r +rohkem +s +samas +samuti +seal +sealt +sedakorda +sedapuhku +seega +seejuures +seejärel +seekord +seepärast +seetõttu +sellepärast +seni +sestap +siia +siiani +siin +siinkohal +siis +siiski +siit +sinna +suht +š +z +ž +t +teel +teineteise +tõesti +täiesti +u +umbes +v +w +veel +veelgi +vist +võibolla +võib-olla +väga +vähemalt +välja +väljas +väljast +õ +ä +ära +ö +ü +ühtlasi +üksi +ükskõik +ülal +ülale +ülalt +üles +ülesse +üleval +ülevalt +ülimalt +üsna +x +y +aga +ega +ehk +ehkki +elik +ellik +enge +ennegu +ent +et +ja +justkui +kui +kuid +kuigi +kuivõrd +kuna +kuni +kut +mistab +muudkui +nagu +nigu +ning +olgugi +otsekui +otsenagu +selmet +sest +sestab +vaid +või +aa +adaa +adjöö +ae +ah +ahaa +ahah +ah-ah-ah +ah-haa +ahoi +ai +aidaa +aidu-raidu +aih +aijeh +aituma +aitäh +aitüma +ammuu +amps +ampsti +aptsih +ass +at +ata +at-at-at +atsih +atsihh +auh +bai-bai +bingo +braavo +brr +ee +eeh +eh +ehee +eheh +eh-eh-hee 
+eh-eh-ee +ehei +ehh +ehhee +einoh +ena +ennäe +ennäh +fuh +fui +fuih +haa +hah +hahaa +hah-hah-hah +halleluuja +hallo +halloo +hass +hee +heh +he-he-hee +hei +heldeke(ne) +heureka +hihii +hip-hip-hurraa +hmh +hmjah +hoh-hoh-hoo +hohoo +hoi +hollallaa +hoo +hoplaa +hopp +hops +hopsassaa +hopsti +hosianna +huh +huidii +huist +hurjah +hurjeh +hurjoh +hurjuh +hurraa +huu +hõhõh +hõi +hõissa +hõissassa +hõk +hõkk +häh +hä-hä-hää +hüvasti +ih-ah-haa +ih-ih-hii +ii-ha-ha +issake +issakene +isver +jaa-ah +ja-ah +jaah +janäe +jeeh +jeerum +jeever +jessas +jestas +juhhei +jumalaga +jumalime +jumaluke +jumalukene +jutas +kaaps +kaapsti +kaasike +kae +kalps +kalpsti +kannäe +kanäe +kappadi +kaps +kapsti +karkõmm +karkäuh +karkääks +karkääksti +karmauh +karmauhti +karnaps +karnapsti +karniuhti +karpartsaki +karpauh +karpauhti +karplauh +karplauhti +karprauh +karprauhti +karsumdi +karsumm +kartsumdi +kartsumm +karviuh +karviuhti +kaske +kassa +kauh +kauhti +keh +keksti +kepsti +khe +khm +kih +kiiks +kiiksti +kiis +kiiss +kikerii +kikerikii +kili +kilk +kilk-kõlk +kilks +kilks-kolks +kilks-kõlks +kill +killadi +killadi|-kolladi +killadi-kõlladi +killa-kolla +killa-kõlla +kill-kõll +kimps-komps +kipp +kips-kõps +kiriküüt +kirra-kõrra +kirr-kõrr +kirts +klaps +klapsti +klirdi +klirr +klonks +klops +klopsti +kluk +klu-kluu +klõks +klõksti +klõmdi +klõmm +klõmpsti +klõnks +klõnksti +klõps +klõpsti +kläu +kohva-kohva +kok +koks +koksti +kolaki +kolk +kolks +kolksti +koll +kolladi +komp +komps +kompsti +kop +kopp +koppadi +kops +kopsti +kossu +kotsu +kraa +kraak +kraaks +kraaps +kraapsti +krahh +kraks +kraksti +kraps +krapsti +krauh +krauhti +kriiks +kriiksti +kriips +kriips-kraaps +kripa-krõpa +krips-kraps +kriuh +kriuks +kriuksti +kromps +kronk +kronks +krooks +kruu +krõks +krõksti +krõpa +krõps +krõpsti +krõuh +kräu +kräuh +kräuhti +kräuks +kss +kukeleegu +kukku +kuku +kulu +kurluu +kurnäu +kuss +kussu +kõks +kõksti +kõldi +kõlks +kõlksti +kõll +kõmaki +kõmdi +kõmm +kõmps +kõpp 
+kõps +kõpsadi +kõpsat +kõpsti +kõrr +kõrra-kõrra +kõss +kõtt +kõõksti +kärr +kärts +kärtsti +käuks +käuksti +kääga +kääks +kääksti +köh +köki-möki +köksti +laks +laksti +lampsti +larts +lartsti +lats +latsti +leelo +legoo +lehva +liiri-lõõri +lika-lõka +likat-lõkat +limpsti +lips +lipsti +lirts +lirtsaki +lirtsti +lonksti +lops +lopsti +lorts +lortsti +luks +lups +lupsti +lurts +lurtsti +lõks +lõksti +lõmps +lõmpsti +lõnks +lõnksti +lärts +lärtsti +läts +lätsti +lörts +lörtsti +lötsti +lööps +lööpsti +marss +mats +matsti +mauh +mauhti +mh +mhh +mhmh +miau +mjaa +mkm +m-mh +mnjaa +mnjah +moens +mulks +mulksti +mull-mull +mull-mull-mull +muu +muuh +mõh +mõmm +mäh +mäts +mäu +mää +möh +möh-öh-ää +möö +müh-müh +mühüh +müks +müksti +müraki +mürr +mürts +mürtsaki +mürtsti +mütaku +müta-mäta +müta-müta +müt-müt +müt-müt-müt +müts +mütsti +mütt +naa +naah +nah +naks +naksti +nanuu +naps +napsti +nilpsti +nipsti +nirr +niuh +niuh-näuh +niuhti +noh +noksti +nolpsti +nonoh +nonoo +nonäh +noo +nooh +nooks +norr +nurr +nuuts +nõh +nõhh +nõka-nõka +nõks +nõksat-nõksat +nõks-nõks +nõksti +nõõ +nõõh +näeh +näh +nälpsti +nämm-nämm +näpsti +näts +nätsti +näu +näuh +näuhti +näuks +näuksti +nääh +nääks +nühkat-nühkat +oeh +oh +ohh +ohhh +oh-hoi +oh-hoo +ohoh +oh-oh-oo +oh-oh-hoo +ohoi +ohoo +oi +oih +oijee +oijeh +oo +ooh +oo-oh +oo-ohh +oot +ossa +ot +paa +pah +pahh +pakaa +pamm +pantsti +pardon +pardonks +parlartsti +parts +partsti +partsumdi +partsumm +pastoi +pats +patst +patsti +pau +pauh +pauhti +pele +pfui +phuh +phuuh +phäh +phähh +piiks +piip +piiri-pääri +pimm +pimm-pamm +pimm-pomm +pimm-põmm +piraki +piuks +piu-pau +plaks +plaksti +plarts +plartsti +plats +platsti +plauh +plauhh +plauhti +pliks +pliks-plaks +plinn +pliraki +plirts +plirtsti +pliu +pliuh +ploks +plotsti +plumps +plumpsti +plõks +plõksti +plõmdi +plõmm +plõnn +plärr +plärts +plärtsat +plärtsti +pläu +pläuh +plää +plörtsat +pomm +popp +pops +popsti +ports +pot +pots +potsti +pott +praks +praksti +prants 
+prantsaki +prantsti +prassai +prauh +prauhh +prauhti +priks +priuh +priuhh +priuh-prauh +proosit +proost +prr +prrr +prõks +prõksti +prõmdi +prõmm +prõntsti +prääk +prääks +pst +psst +ptrr +ptruu +ptüi +puh +puhh +puksti +pumm +pumps +pup-pup-pup +purts +puuh +põks +põksti +põmdi +põmm +põmmadi +põnks +põnn +põnnadi +põnt +põnts +põntsti +põraki +põrr +põrra-põrra +päh +pähh +päntsti +pää +pöörd +püh +raks +raksti +raps +rapsti +ratataa +rauh +riips +riipsti +riks +riks-raks +rips-raps +rivitult +robaki +rops +ropsaki +ropsti +ruik +räntsti +räts +röh +röhh +sah +sahh +sahkat +saps +sapsti +sauh +sauhti +servus +sihkadi-sahkadi +sihka-sahka +sihkat-sahkat +silks +silk-solk +sips +sipsti +sirr +sirr-sorr +sirts +sirtsti +siu +siuh +siuh-sauh +siuh-säuh +siuhti +siuks +siuts +skool +so +soh +solks +solksti +solpsti +soo +sooh +so-oh +soo-oh +sopp +sops +sopsti +sorr +sorts +sortsti +so-soo +soss +soss-soss +ss +sss +sst +stopp +suhkat-sahkat +sulk +sulks +sulksti +sull +sulla-sulla +sulpa-sulpa +sulps +sulpsti +sumaki +sumdi +summ +summat-summat +sups +supsaku +supsti +surts +surtsti +suss +susti +suts +sutsti +säh +sähke +särts +särtsti +säu +säuh +säuhti +taevake +taevakene +takk +tere +terekest +tibi-tibi +tikk-takk +tiks +tilk +tilks +till +tilla-talla +till-tall +tilulii +tinn +tip +tip-tap +tirr +tirtsti +tiu +tjaa +tjah +tohhoh +tohhoo +tohoh +tohoo +tok +tokk +toks +toksti +tonks +tonksti +tota +totsti +tot-tot +tprr +tpruu +trah +trahh +trallallaa +trill +trillallaa +trr +trrr +tsah +tsahh +tsilk +tsilk-tsolk +tsirr +tsiuh +tskae +tsolk +tss +tst +tsst +tsuhh +tsuk +tsumm +tsurr +tsäuh +tšao +tšš +tššš +tuk +tuks +turts +turtsti +tutki +tutkit +tutu-lutu +tutulutu +tuut +tuutu-luutu +tõks +tötsti +tümps +uh +uhh +uh-huu +uhtsa +uhtsaa +uhuh +uhuu +ui +uih +uih-aih +uijah +uijeh +uist +uit +uka +upsti +uraa +urjah +urjeh +urjoh +urjuh +urr +urraa +ust +utu +uu +uuh +vaak +vaat +vae +vaeh +vai +vat +vau +vhüüt +vidiit +viiks +vilks +vilksti +vinki-vinki 
+virdi +virr +viu +viudi +viuh +viuhti +voeh +voh +vohh +volks +volksti +vooh +vops +vopsti +vot +vuh +vuhti +vuih +vulks +vulksti +vull +vulpsti +vups +vupsaki +vupsaku +vupsti +vurdi +vurr +vurra-vurra +vurts +vurtsti +vutt +võe +võeh +või +võih +võrr +võts +võtt +vääks +õe +õits +õk +õkk +õrr +õss +õuh +äh +ähh +ähhähhää +äh-hää +äh-äh-hää +äiu +äiu-ää +äss +ää +ääh +äähh +öh +öhh +ök +üh +eelmine +eikeegi +eimiski +emb-kumb +enam +enim +iga +igasugune +igaüks +ise +isesugune +järgmine +keegi +kes +kumb +kumbki +kõik +meiesugune +meietaoline +midagi +mihuke +mihukene +milletaoline +milline +mina +minake +mingi +mingisugune +minusugune +minutaoline +mis +miski +miskisugune +missugune +misuke +mitmes +mitmesugune +mitu +mitu-mitu +mitu-setu +muu +mõlema +mõnesugune +mõni +mõningane +mõningas +mäherdune +määrane +naasugune +need +nemad +nendesugune +nendetaoline +nihuke +nihukene +niimitu +niisamasugune +niisugune +nisuke +nisukene +oma +omaenese +omasugune +omataoline +pool +praegune +sama +samasugune +samataoline +see +seesama +seesamane +seesamune +seesinane +seesugune +selline +sihuke +sihukene +sina +sinusugune +sinutaoline +siuke +siukene +säherdune +säärane +taoline +teiesugune +teine +teistsugune +tema +temake +temakene +temasugune +temataoline +too +toosama +toosamane +üks +üksteise +hakkama +minema +olema +pidama +saama +tegema +tulema +võima diff --git a/archiver/solr-config-dir/lang/stopwords_eu.txt b/archiver/solr-config-dir/lang/stopwords_eu.txt new file mode 100644 index 0000000..25f1db9 --- /dev/null +++ b/archiver/solr-config-dir/lang/stopwords_eu.txt @@ -0,0 +1,99 @@ +# example set of basque stopwords +al +anitz +arabera +asko +baina +bat +batean +batek +bati +batzuei +batzuek +batzuetan +batzuk +bera +beraiek +berau +berauek +bere +berori +beroriek +beste +bezala +da +dago +dira +ditu +du +dute +edo +egin +ere +eta +eurak +ez +gainera +gu +gutxi +guzti +haiei +haiek +haietan +hainbeste +hala +han +handik +hango +hara +hari +hark +hartan +hau 
+hauei +hauek +hauetan +hemen +hemendik +hemengo +hi +hona +honek +honela +honetan +honi +hor +hori +horiei +horiek +horietan +horko +horra +horrek +horrela +horretan +horri +hortik +hura +izan +ni +noiz +nola +non +nondik +nongo +nor +nora +ze +zein +zen +zenbait +zenbat +zer +zergatik +ziren +zituen +zu +zuek +zuen +zuten diff --git a/archiver/solr-config-dir/lang/stopwords_fa.txt b/archiver/solr-config-dir/lang/stopwords_fa.txt new file mode 100644 index 0000000..723641c --- /dev/null +++ b/archiver/solr-config-dir/lang/stopwords_fa.txt @@ -0,0 +1,313 @@ +# This file was created by Jacques Savoy and is distributed under the BSD license. +# See http://members.unine.ch/jacques.savoy/clef/index.html. +# Also see http://www.opensource.org/licenses/bsd-license.html +# Note: by default this file is used after normalization, so when adding entries +# to this file, use the arabic 'ي' instead of 'ی' +انان +نداشته +سراسر +خياه +ايشان +وي +تاكنون +بيشتري +دوم +پس +ناشي +وگو +يا +داشتند +سپس +هنگام +هرگز +پنج +نشان +امسال +ديگر +گروهي +شدند +چطور +ده +و +دو +نخستين +ولي +چرا +چه +وسط +ه +كدام +قابل +يك +رفت +هفت +همچنين +در +هزار +بله +بلي +شايد +اما +شناسي +گرفته +دهد +داشته +دانست +داشتن +خواهيم +ميليارد +وقتيكه +امد +خواهد +جز +اورده +شده +بلكه +خدمات +شدن +برخي +نبود +بسياري +جلوگيري +حق +كردند +نوعي +بعري +نكرده +نظير +نبايد +بوده +بودن +داد +اورد +هست +جايي +شود +دنبال +داده +بايد +سابق +هيچ +همان +انجا +كمتر +كجاست +گردد +كسي +تر +مردم +تان +دادن +بودند +سري +جدا +ندارند +مگر +يكديگر +دارد +دهند +بنابراين +هنگامي +سمت +جا +انچه +خود +دادند +زياد +دارند +اثر +بدون +بهترين +بيشتر +البته +به +براساس +بيرون +كرد +بعضي +گرفت +توي +اي +ميليون +او +جريان +تول +بر +مانند +برابر +باشيم +مدتي +گويند +اكنون +تا +تنها +جديد +چند +بي +نشده +كردن +كردم +گويد +كرده +كنيم +نمي +نزد +روي +قصد +فقط +بالاي +ديگران +اين +ديروز +توسط +سوم +ايم +دانند +سوي +استفاده +شما +كنار +داريم +ساخته +طور +امده +رفته +نخست +بيست +نزديك +طي +كنيد +از +انها +تمامي +داشت +يكي +طريق +اش +چيست +روب 
+نمايد +گفت +چندين +چيزي +تواند +ام +ايا +با +ان +ايد +ترين +اينكه +ديگري +راه +هايي +بروز +همچنان +پاعين +كس +حدود +مختلف +مقابل +چيز +گيرد +ندارد +ضد +همچون +سازي +شان +مورد +باره +مرسي +خويش +برخوردار +چون +خارج +شش +هنوز +تحت +ضمن +هستيم +گفته +فكر +بسيار +پيش +براي +روزهاي +انكه +نخواهد +بالا +كل +وقتي +كي +چنين +كه +گيري +نيست +است +كجا +كند +نيز +يابد +بندي +حتي +توانند +عقب +خواست +كنند +بين +تمام +همه +ما +باشند +مثل +شد +اري +باشد +اره +طبق +بعد +اگر +صورت +غير +جاي +بيش +ريزي +اند +زيرا +چگونه +بار +لطفا +مي +درباره +من +ديده +همين +گذاري +برداري +علت +گذاشته +هم +فوق +نه +ها +شوند +اباد +همواره +هر +اول +خواهند +چهار +نام +امروز +مان +هاي +قبل +كنم +سعي +تازه +را +هستند +زير +جلوي +عنوان +بود diff --git a/archiver/solr-config-dir/lang/stopwords_fi.txt b/archiver/solr-config-dir/lang/stopwords_fi.txt new file mode 100644 index 0000000..4372c9a --- /dev/null +++ b/archiver/solr-config-dir/lang/stopwords_fi.txt @@ -0,0 +1,97 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/finnish/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. 
+ | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + +| forms of BE + +olla +olen +olet +on +olemme +olette +ovat +ole | negative form + +oli +olisi +olisit +olisin +olisimme +olisitte +olisivat +olit +olin +olimme +olitte +olivat +ollut +olleet + +en | negation +et +ei +emme +ette +eivät + +|Nom Gen Acc Part Iness Elat Illat Adess Ablat Allat Ess Trans +minä minun minut minua minussa minusta minuun minulla minulta minulle | I +sinä sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle | you +hän hänen hänet häntä hänessä hänestä häneen hänellä häneltä hänelle | he she +me meidän meidät meitä meissä meistä meihin meillä meiltä meille | we +te teidän teidät teitä teissä teistä teihin teillä teiltä teille | you +he heidän heidät heitä heissä heistä heihin heillä heiltä heille | they + +tämä tämän tätä tässä tästä tähän tallä tältä tälle tänä täksi | this +tuo tuon tuotä tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi | that +se sen sitä siinä siitä siihen sillä siltä sille sinä siksi | it +nämä näiden näitä näissä näistä näihin näillä näiltä näille näinä näiksi | these +nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi | those +ne niiden niitä niissä niistä niihin niillä niiltä niille niinä niiksi | they + +kuka kenen kenet ketä kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi| who +ketkä keiden ketkä keitä keissä keistä keihin keillä keiltä keille keinä keiksi | (pl) +mikä minkä minkä mitä missä mistä mihin millä miltä mille minä miksi | which what +mitkä | (pl) + +joka jonka jota jossa josta johon jolla jolta jolle jona joksi | who which +jotka joiden joita joissa joista joihin joilla joilta joille joina joiksi | (pl) + +| conjunctions + +että | that +ja | and +jos | if +koska | because +kuin | than +mutta | but +niin | so +sekä | and +sillä | for +tai | or +vaan | but +vai | or +vaikka | although + + +| prepositions + +kanssa | with +mukaan | according to +noin | about 
+poikki | across +yli | over, across + +| other + +kun | when +niin | so +nyt | now +itse | self + diff --git a/archiver/solr-config-dir/lang/stopwords_fr.txt b/archiver/solr-config-dir/lang/stopwords_fr.txt new file mode 100644 index 0000000..749abae --- /dev/null +++ b/archiver/solr-config-dir/lang/stopwords_fr.txt @@ -0,0 +1,186 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/french/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A French stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + +au | a + le +aux | a + les +avec | with +ce | this +ces | these +dans | with +de | of +des | de + les +du | de + le +elle | she +en | `of them' etc +et | and +eux | them +il | he +je | I +la | the +le | the +leur | their +lui | him +ma | my (fem) +mais | but +me | me +même | same; as in moi-même (myself) etc +mes | me (pl) +moi | me +mon | my (masc) +ne | not +nos | our (pl) +notre | our +nous | we +on | one +ou | where +par | by +pas | not +pour | for +qu | que before vowel +que | that +qui | who +sa | his, her (fem) +se | oneself +ses | his (pl) +son | his, her (masc) +sur | on +ta | thy (fem) +te | thee +tes | thy (pl) +toi | thee +ton | thy (masc) +tu | thou +un | a +une | a +vos | your (pl) +votre | your +vous | you + + | single letter forms + +c | c' +d | d' +j | j' +l | l' +à | to, at +m | m' +n | n' +s | s' +t | t' +y | there + + | forms of être (not including the infinitive): +été +étée +étées +étés +étant +suis +es +est +sommes +êtes +sont +serai +seras +sera +serons +serez +seront +serais +serait +serions +seriez +seraient +étais +était +étions +étiez +étaient +fus +fut +fûmes +fûtes +furent +sois +soit +soyons +soyez 
+soient +fusse +fusses +fût +fussions +fussiez +fussent + + | forms of avoir (not including the infinitive): +ayant +eu +eue +eues +eus +ai +as +avons +avez +ont +aurai +auras +aura +aurons +aurez +auront +aurais +aurait +aurions +auriez +auraient +avais +avait +avions +aviez +avaient +eut +eûmes +eûtes +eurent +aie +aies +ait +ayons +ayez +aient +eusse +eusses +eût +eussions +eussiez +eussent + + | Later additions (from Jean-Christophe Deschamps) +ceci | this +cela | that +celà | that +cet | this +cette | this +ici | here +ils | they +les | the (pl) +leurs | their (pl) +quel | which +quels | which +quelle | which +quelles | which +sans | without +soi | oneself + diff --git a/archiver/solr-config-dir/lang/stopwords_ga.txt b/archiver/solr-config-dir/lang/stopwords_ga.txt new file mode 100644 index 0000000..9ff88d7 --- /dev/null +++ b/archiver/solr-config-dir/lang/stopwords_ga.txt @@ -0,0 +1,110 @@ + +a +ach +ag +agus +an +aon +ar +arna +as +b' +ba +beirt +bhúr +caoga +ceathair +ceathrar +chomh +chtó +chuig +chun +cois +céad +cúig +cúigear +d' +daichead +dar +de +deich +deichniúr +den +dhá +do +don +dtí +dá +dár +dó +faoi +faoin +faoina +faoinár +fara +fiche +gach +gan +go +gur +haon +hocht +i +iad +idir +in +ina +ins +inár +is +le +leis +lena +lenár +m' +mar +mo +mé +na +nach +naoi +naonúr +ná +ní +níor +nó +nócha +ocht +ochtar +os +roimh +sa +seacht +seachtar +seachtó +seasca +seisear +siad +sibh +sinn +sna +sé +sí +tar +thar +thú +triúr +trí +trína +trínár +tríocha +tú +um +ár +é +éis +í +ó +ón +óna +ónár diff --git a/archiver/solr-config-dir/lang/stopwords_gl.txt b/archiver/solr-config-dir/lang/stopwords_gl.txt new file mode 100644 index 0000000..d8760b1 --- /dev/null +++ b/archiver/solr-config-dir/lang/stopwords_gl.txt @@ -0,0 +1,161 @@ +# galican stopwords +a +aínda +alí +aquel +aquela +aquelas +aqueles +aquilo +aquí +ao +aos +as +así +á +ben +cando +che +co +coa +comigo +con +connosco +contigo +convosco +coas +cos +cun +cuns +cunha +cunhas +da +dalgunha 
+dalgunhas +dalgún +dalgúns +das +de +del +dela +delas +deles +desde +deste +do +dos +dun +duns +dunha +dunhas +e +el +ela +elas +eles +en +era +eran +esa +esas +ese +eses +esta +estar +estaba +está +están +este +estes +estiven +estou +eu +é +facer +foi +foron +fun +había +hai +iso +isto +la +las +lle +lles +lo +los +mais +me +meu +meus +min +miña +miñas +moi +na +nas +neste +nin +no +non +nos +nosa +nosas +noso +nosos +nós +nun +nunha +nuns +nunhas +o +os +ou +ó +ós +para +pero +pode +pois +pola +polas +polo +polos +por +que +se +senón +ser +seu +seus +sexa +sido +sobre +súa +súas +tamén +tan +te +ten +teñen +teño +ter +teu +teus +ti +tido +tiña +tiven +túa +túas +un +unha +unhas +uns +vos +vosa +vosas +voso +vosos +vós diff --git a/archiver/solr-config-dir/lang/stopwords_hi.txt b/archiver/solr-config-dir/lang/stopwords_hi.txt new file mode 100644 index 0000000..86286bb --- /dev/null +++ b/archiver/solr-config-dir/lang/stopwords_hi.txt @@ -0,0 +1,235 @@ +# Also see http://www.opensource.org/licenses/bsd-license.html +# See http://members.unine.ch/jacques.savoy/clef/index.html. +# This file was created by Jacques Savoy and is distributed under the BSD license. +# Note: by default this file also contains forms normalized by HindiNormalizer +# for spelling variation (see section below), such that it can be used whether or +# not you enable that feature. When adding additional entries to this list, +# please add the normalized form as well. 
+अंदर +अत +अपना +अपनी +अपने +अभी +आदि +आप +इत्यादि +इन +इनका +इन्हीं +इन्हें +इन्हों +इस +इसका +इसकी +इसके +इसमें +इसी +इसे +उन +उनका +उनकी +उनके +उनको +उन्हीं +उन्हें +उन्हों +उस +उसके +उसी +उसे +एक +एवं +एस +ऐसे +और +कई +कर +करता +करते +करना +करने +करें +कहते +कहा +का +काफ़ी +कि +कितना +किन्हें +किन्हों +किया +किर +किस +किसी +किसे +की +कुछ +कुल +के +को +कोई +कौन +कौनसा +गया +घर +जब +जहाँ +जा +जितना +जिन +जिन्हें +जिन्हों +जिस +जिसे +जीधर +जैसा +जैसे +जो +तक +तब +तरह +तिन +तिन्हें +तिन्हों +तिस +तिसे +तो +था +थी +थे +दबारा +दिया +दुसरा +दूसरे +दो +द्वारा +न +नहीं +ना +निहायत +नीचे +ने +पर +पर +पहले +पूरा +पे +फिर +बनी +बही +बहुत +बाद +बाला +बिलकुल +भी +भीतर +मगर +मानो +मे +में +यदि +यह +यहाँ +यही +या +यिह +ये +रखें +रहा +रहे +ऱ्वासा +लिए +लिये +लेकिन +व +वर्ग +वह +वह +वहाँ +वहीं +वाले +वुह +वे +वग़ैरह +संग +सकता +सकते +सबसे +सभी +साथ +साबुत +साभ +सारा +से +सो +ही +हुआ +हुई +हुए +है +हैं +हो +होता +होती +होते +होना +होने +# additional normalized forms of the above +अपनि +जेसे +होति +सभि +तिंहों +इंहों +दवारा +इसि +किंहें +थि +उंहों +ओर +जिंहें +वहिं +अभि +बनि +हि +उंहिं +उंहें +हें +वगेरह +एसे +रवासा +कोन +निचे +काफि +उसि +पुरा +भितर +हे +बहि +वहां +कोइ +यहां +जिंहों +तिंहें +किसि +कइ +यहि +इंहिं +जिधर +इंहें +अदि +इतयादि +हुइ +कोनसा +इसकि +दुसरे +जहां +अप +किंहों +उनकि +भि +वरग +हुअ +जेसा +नहिं diff --git a/archiver/solr-config-dir/lang/stopwords_hu.txt b/archiver/solr-config-dir/lang/stopwords_hu.txt new file mode 100644 index 0000000..37526da --- /dev/null +++ b/archiver/solr-config-dir/lang/stopwords_hu.txt @@ -0,0 +1,211 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/hungarian/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. 
+ | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + +| Hungarian stop word list +| prepared by Anna Tordai + +a +ahogy +ahol +aki +akik +akkor +alatt +által +általában +amely +amelyek +amelyekben +amelyeket +amelyet +amelynek +ami +amit +amolyan +amíg +amikor +át +abban +ahhoz +annak +arra +arról +az +azok +azon +azt +azzal +azért +aztán +azután +azonban +bár +be +belül +benne +cikk +cikkek +cikkeket +csak +de +e +eddig +egész +egy +egyes +egyetlen +egyéb +egyik +egyre +ekkor +el +elég +ellen +elő +először +előtt +első +én +éppen +ebben +ehhez +emilyen +ennek +erre +ez +ezt +ezek +ezen +ezzel +ezért +és +fel +felé +hanem +hiszen +hogy +hogyan +igen +így +illetve +ill. +ill +ilyen +ilyenkor +ison +ismét +itt +jó +jól +jobban +kell +kellett +keresztül +keressünk +ki +kívül +között +közül +legalább +lehet +lehetett +legyen +lenne +lenni +lesz +lett +maga +magát +majd +majd +már +más +másik +meg +még +mellett +mert +mely +melyek +mi +mit +míg +miért +milyen +mikor +minden +mindent +mindenki +mindig +mint +mintha +mivel +most +nagy +nagyobb +nagyon +ne +néha +nekem +neki +nem +néhány +nélkül +nincs +olyan +ott +össze +ő +ők +őket +pedig +persze +rá +s +saját +sem +semmi +sok +sokat +sokkal +számára +szemben +szerint +szinte +talán +tehát +teljes +tovább +továbbá +több +úgy +ugyanis +új +újabb +újra +után +utána +utolsó +vagy +vagyis +valaki +valami +valamint +való +vagyok +van +vannak +volt +voltam +voltak +voltunk +vissza +vele +viszont +volna diff --git a/archiver/solr-config-dir/lang/stopwords_hy.txt b/archiver/solr-config-dir/lang/stopwords_hy.txt new file mode 100644 index 0000000..60c1c50 --- /dev/null +++ b/archiver/solr-config-dir/lang/stopwords_hy.txt @@ -0,0 +1,46 @@ +# example set of Armenian stopwords. 
+այդ +այլ +այն +այս +դու +դուք +եմ +են +ենք +ես +եք +է +էի +էին +էինք +էիր +էիք +էր +ըստ +թ +ի +ին +իսկ +իր +կամ +համար +հետ +հետո +մենք +մեջ +մի +ն +նա +նաև +նրա +նրանք +որ +որը +որոնք +որպես +ու +ում +պիտի +վրա +և diff --git a/archiver/solr-config-dir/lang/stopwords_id.txt b/archiver/solr-config-dir/lang/stopwords_id.txt new file mode 100644 index 0000000..4617f83 --- /dev/null +++ b/archiver/solr-config-dir/lang/stopwords_id.txt @@ -0,0 +1,359 @@ +# from appendix D of: A Study of Stemming Effects on Information +# Retrieval in Bahasa Indonesia +ada +adanya +adalah +adapun +agak +agaknya +agar +akan +akankah +akhirnya +aku +akulah +amat +amatlah +anda +andalah +antar +diantaranya +antara +antaranya +diantara +apa +apaan +mengapa +apabila +apakah +apalagi +apatah +atau +ataukah +ataupun +bagai +bagaikan +sebagai +sebagainya +bagaimana +bagaimanapun +sebagaimana +bagaimanakah +bagi +bahkan +bahwa +bahwasanya +sebaliknya +banyak +sebanyak +beberapa +seberapa +begini +beginian +beginikah +beginilah +sebegini +begitu +begitukah +begitulah +begitupun +sebegitu +belum +belumlah +sebelum +sebelumnya +sebenarnya +berapa +berapakah +berapalah +berapapun +betulkah +sebetulnya +biasa +biasanya +bila +bilakah +bisa +bisakah +sebisanya +boleh +bolehkah +bolehlah +buat +bukan +bukankah +bukanlah +bukannya +cuma +percuma +dahulu +dalam +dan +dapat +dari +daripada +dekat +demi +demikian +demikianlah +sedemikian +dengan +depan +di +dia +dialah +dini +diri +dirinya +terdiri +dong +dulu +enggak +enggaknya +entah +entahlah +terhadap +terhadapnya +hal +hampir +hanya +hanyalah +harus +haruslah +harusnya +seharusnya +hendak +hendaklah +hendaknya +hingga +sehingga +ia +ialah +ibarat +ingin +inginkah +inginkan +ini +inikah +inilah +itu +itukah +itulah +jangan +jangankan +janganlah +jika +jikalau +juga +justru +kala +kalau +kalaulah +kalaupun +kalian +kami +kamilah +kamu +kamulah +kan +kapan +kapankah +kapanpun +dikarenakan +karena +karenanya +ke +kecil +kemudian +kenapa +kepada +kepadanya 
+ketika +seketika +khususnya +kini +kinilah +kiranya +sekiranya +kita +kitalah +kok +lagi +lagian +selagi +lah +lain +lainnya +melainkan +selaku +lalu +melalui +terlalu +lama +lamanya +selama +selama +selamanya +lebih +terlebih +bermacam +macam +semacam +maka +makanya +makin +malah +malahan +mampu +mampukah +mana +manakala +manalagi +masih +masihkah +semasih +masing +mau +maupun +semaunya +memang +mereka +merekalah +meski +meskipun +semula +mungkin +mungkinkah +nah +namun +nanti +nantinya +nyaris +oleh +olehnya +seorang +seseorang +pada +padanya +padahal +paling +sepanjang +pantas +sepantasnya +sepantasnyalah +para +pasti +pastilah +per +pernah +pula +pun +merupakan +rupanya +serupa +saat +saatnya +sesaat +saja +sajalah +saling +bersama +sama +sesama +sambil +sampai +sana +sangat +sangatlah +saya +sayalah +se +sebab +sebabnya +sebuah +tersebut +tersebutlah +sedang +sedangkan +sedikit +sedikitnya +segala +segalanya +segera +sesegera +sejak +sejenak +sekali +sekalian +sekalipun +sesekali +sekaligus +sekarang +sekarang +sekitar +sekitarnya +sela +selain +selalu +seluruh +seluruhnya +semakin +sementara +sempat +semua +semuanya +sendiri +sendirinya +seolah +seperti +sepertinya +sering +seringnya +serta +siapa +siapakah +siapapun +disini +disinilah +sini +sinilah +sesuatu +sesuatunya +suatu +sesudah +sesudahnya +sudah +sudahkah +sudahlah +supaya +tadi +tadinya +tak +tanpa +setelah +telah +tentang +tentu +tentulah +tentunya +tertentu +seterusnya +tapi +tetapi +setiap +tiap +setidaknya +tidak +tidakkah +tidaklah +toh +waduh +wah +wahai +sewaktu +walau +walaupun +wong +yaitu +yakni +yang diff --git a/archiver/solr-config-dir/lang/stopwords_it.txt b/archiver/solr-config-dir/lang/stopwords_it.txt new file mode 100644 index 0000000..1219cc7 --- /dev/null +++ b/archiver/solr-config-dir/lang/stopwords_it.txt @@ -0,0 +1,303 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/italian/stop.txt + | This file is distributed under the BSD License. 
+ | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | An Italian stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + +ad | a (to) before vowel +al | a + il +allo | a + lo +ai | a + i +agli | a + gli +all | a + l' +agl | a + gl' +alla | a + la +alle | a + le +con | with +col | con + il +coi | con + i (forms collo, cogli etc are now very rare) +da | from +dal | da + il +dallo | da + lo +dai | da + i +dagli | da + gli +dall | da + l' +dagl | da + gll' +dalla | da + la +dalle | da + le +di | of +del | di + il +dello | di + lo +dei | di + i +degli | di + gli +dell | di + l' +degl | di + gl' +della | di + la +delle | di + le +in | in +nel | in + el +nello | in + lo +nei | in + i +negli | in + gli +nell | in + l' +negl | in + gl' +nella | in + la +nelle | in + le +su | on +sul | su + il +sullo | su + lo +sui | su + i +sugli | su + gli +sull | su + l' +sugl | su + gl' +sulla | su + la +sulle | su + le +per | through, by +tra | among +contro | against +io | I +tu | thou +lui | he +lei | she +noi | we +voi | you +loro | they +mio | my +mia | +miei | +mie | +tuo | +tua | +tuoi | thy +tue | +suo | +sua | +suoi | his, her +sue | +nostro | our +nostra | +nostri | +nostre | +vostro | your +vostra | +vostri | +vostre | +mi | me +ti | thee +ci | us, there +vi | you, there +lo | him, the +la | her, the +li | them +le | them, the +gli | to him, the +ne | from there etc +il | the +un | a +uno | a +una | a +ma | but +ed | and +se | if +perché | why, because +anche | also +come | how +dov | where (as dov') +dove | where +che | who, that +chi | who +cui | whom +non | not +più | more +quale | who, that +quanto | how much +quanti | +quanta | +quante | +quello | that +quelli | +quella | +quelle | +questo | this +questi | 
+questa | +queste | +si | yes +tutto | all +tutti | all + + | single letter forms: + +a | at +c | as c' for ce or ci +e | and +i | the +l | as l' +o | or + + | forms of avere, to have (not including the infinitive): + +ho +hai +ha +abbiamo +avete +hanno +abbia +abbiate +abbiano +avrò +avrai +avrà +avremo +avrete +avranno +avrei +avresti +avrebbe +avremmo +avreste +avrebbero +avevo +avevi +aveva +avevamo +avevate +avevano +ebbi +avesti +ebbe +avemmo +aveste +ebbero +avessi +avesse +avessimo +avessero +avendo +avuto +avuta +avuti +avute + + | forms of essere, to be (not including the infinitive): +sono +sei +è +siamo +siete +sia +siate +siano +sarò +sarai +sarà +saremo +sarete +saranno +sarei +saresti +sarebbe +saremmo +sareste +sarebbero +ero +eri +era +eravamo +eravate +erano +fui +fosti +fu +fummo +foste +furono +fossi +fosse +fossimo +fossero +essendo + + | forms of fare, to do (not including the infinitive, fa, fat-): +faccio +fai +facciamo +fanno +faccia +facciate +facciano +farò +farai +farà +faremo +farete +faranno +farei +faresti +farebbe +faremmo +fareste +farebbero +facevo +facevi +faceva +facevamo +facevate +facevano +feci +facesti +fece +facemmo +faceste +fecero +facessi +facesse +facessimo +facessero +facendo + + | forms of stare, to be (not including the infinitive): +sto +stai +sta +stiamo +stanno +stia +stiate +stiano +starò +starai +starà +staremo +starete +staranno +starei +staresti +starebbe +staremmo +stareste +starebbero +stavo +stavi +stava +stavamo +stavate +stavano +stetti +stesti +stette +stemmo +steste +stettero +stessi +stesse +stessimo +stessero +stando diff --git a/archiver/solr-config-dir/lang/stopwords_ja.txt b/archiver/solr-config-dir/lang/stopwords_ja.txt new file mode 100644 index 0000000..d4321be --- /dev/null +++ b/archiver/solr-config-dir/lang/stopwords_ja.txt @@ -0,0 +1,127 @@ +# +# This file defines a stopword set for Japanese. +# +# This set is made up of hand-picked frequent terms from segmented Japanese Wikipedia. 
+# Punctuation characters and frequent kanji have mostly been left out. See LUCENE-3745 +# for frequency lists, etc. that can be useful for making your own set (if desired) +# +# Note that there is an overlap between these stopwords and the terms stopped when used +# in combination with the JapanesePartOfSpeechStopFilter. When editing this file, note +# that comments are not allowed on the same line as stopwords. +# +# Also note that stopping is done in a case-insensitive manner. Change your StopFilter +# configuration if you need case-sensitive stopping. Lastly, note that stopping is done +# using the same character width as the entries in this file. Since this StopFilter is +# normally done after a CJKWidthFilter in your chain, you would usually want your romaji +# entries to be in half-width and your kana entries to be in full-width. +# +の +に +は +を +た +が +で +て +と +し +れ +さ +ある +いる +も +する +から +な +こと +として +い +や +れる +など +なっ +ない +この +ため +その +あっ +よう +また +もの +という +あり +まで +られ +なる +へ +か +だ +これ +によって +により +おり +より +による +ず +なり +られる +において +ば +なかっ +なく +しかし +について +せ +だっ +その後 +できる +それ +う +ので +なお +のみ +でき +き +つ +における +および +いう +さらに +でも +ら +たり +その他 +に関する +たち +ます +ん +なら +に対して +特に +せる +及び +これら +とき +では +にて +ほか +ながら +うち +そして +とともに +ただし +かつて +それぞれ +または +お +ほど +ものの +に対する +ほとんど +と共に +といった +です +とも +ところ +ここ +##### End of file diff --git a/archiver/solr-config-dir/lang/stopwords_lv.txt b/archiver/solr-config-dir/lang/stopwords_lv.txt new file mode 100644 index 0000000..e21a23c --- /dev/null +++ b/archiver/solr-config-dir/lang/stopwords_lv.txt @@ -0,0 +1,172 @@ +# Set of Latvian stopwords from A Stemming Algorithm for Latvian, Karlis Kreslins +# the original list of over 800 forms was refined: +# pronouns, adverbs, interjections were removed +# +# prepositions +aiz +ap +ar +apakš +ārpus +augšpus +bez +caur +dēļ +gar +iekš +iz +kopš +labad +lejpus +līdz +no +otrpus +pa +par +pār +pēc +pie +pirms +pret +priekš +starp +šaipus +uz +viņpus +virs +virspus +zem +apakšpus +# Conjunctions +un +bet +jo 
+ja +ka +lai +tomēr +tikko +turpretī +arī +kaut +gan +tādēļ +tā +ne +tikvien +vien +kā +ir +te +vai +kamēr +# Particles +ar +diezin +droši +diemžēl +nebūt +ik +it +taču +nu +pat +tiklab +iekšpus +nedz +tik +nevis +turpretim +jeb +iekam +iekām +iekāms +kolīdz +līdzko +tiklīdz +jebšu +tālab +tāpēc +nekā +itin +jā +jau +jel +nē +nezin +tad +tikai +vis +tak +iekams +vien +# modal verbs +būt +biju +biji +bija +bijām +bijāt +esmu +esi +esam +esat +būšu +būsi +būs +būsim +būsiet +tikt +tiku +tiki +tika +tikām +tikāt +tieku +tiec +tiek +tiekam +tiekat +tikšu +tiks +tiksim +tiksiet +tapt +tapi +tapāt +topat +tapšu +tapsi +taps +tapsim +tapsiet +kļūt +kļuvu +kļuvi +kļuva +kļuvām +kļuvāt +kļūstu +kļūsti +kļūst +kļūstam +kļūstat +kļūšu +kļūsi +kļūs +kļūsim +kļūsiet +# verbs +varēt +varēju +varējām +varēšu +varēsim +var +varēji +varējāt +varēsi +varēsiet +varat +varēja +varēs diff --git a/archiver/solr-config-dir/lang/stopwords_nl.txt b/archiver/solr-config-dir/lang/stopwords_nl.txt new file mode 100644 index 0000000..47a2aea --- /dev/null +++ b/archiver/solr-config-dir/lang/stopwords_nl.txt @@ -0,0 +1,119 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/dutch/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Dutch stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This is a ranked list (commonest to rarest) of stopwords derived from + | a large sample of Dutch text. + + | Dutch stop words frequently exhibit homonym clashes. These are indicated + | clearly below. 
+ +de | the +en | and +van | of, from +ik | I, the ego +te | (1) chez, at etc, (2) to, (3) too +dat | that, which +die | that, those, who, which +in | in, inside +een | a, an, one +hij | he +het | the, it +niet | not, nothing, naught +zijn | (1) to be, being, (2) his, one's, its +is | is +was | (1) was, past tense of all persons sing. of 'zijn' (to be) (2) wax, (3) the washing, (4) rise of river +op | on, upon, at, in, up, used up +aan | on, upon, to (as dative) +met | with, by +als | like, such as, when +voor | (1) before, in front of, (2) furrow +had | had, past tense all persons sing. of 'hebben' (have) +er | there +maar | but, only +om | round, about, for etc +hem | him +dan | then +zou | should/would, past tense all persons sing. of 'zullen' +of | or, whether, if +wat | what, something, anything +mijn | possessive and noun 'mine' +men | people, 'one' +dit | this +zo | so, thus, in this way +door | through by +over | over, across +ze | she, her, they, them +zich | oneself +bij | (1) a bee, (2) by, near, at +ook | also, too +tot | till, until +je | you +mij | me +uit | out of, from +der | Old Dutch form of 'van der' still found in surnames +daar | (1) there, (2) because +haar | (1) her, their, them, (2) hair +naar | (1) unpleasant, unwell etc, (2) towards, (3) as +heb | present first person sing. of 'to have' +hoe | how, why +heeft | present third person sing. of 'to have' +hebben | 'to have' and various parts thereof +deze | this +u | you +want | (1) for, (2) mitten, (3) rigging +nog | yet, still +zal | 'shall', first and third person sing. 
of verb 'zullen' (will) +me | me +zij | she, they +nu | now +ge | 'thou', still used in Belgium and south Netherlands +geen | none +omdat | because +iets | something, somewhat +worden | to become, grow, get +toch | yet, still +al | all, every, each +waren | (1) 'were' (2) to wander, (3) wares, (3) +veel | much, many +meer | (1) more, (2) lake +doen | to do, to make +toen | then, when +moet | noun 'spot/mote' and present form of 'to must' +ben | (1) am, (2) 'are' in interrogative second person singular of 'to be' +zonder | without +kan | noun 'can' and present form of 'to be able' +hun | their, them +dus | so, consequently +alles | all, everything, anything +onder | under, beneath +ja | yes, of course +eens | once, one day +hier | here +wie | who +werd | imperfect third person sing. of 'become' +altijd | always +doch | yet, but etc +wordt | present third person sing. of 'become' +wezen | (1) to be, (2) 'been' as in 'been fishing', (3) orphans +kunnen | to be able +ons | us/our +zelf | self +tegen | against, towards, at +na | after, near +reeds | already +wil | (1) present tense of 'want', (2) 'will', noun, (3) fender +kon | could; past tense of 'to be able' +niets | nothing +uw | your +iemand | somebody +geweest | been; past participle of 'be' +andere | other diff --git a/archiver/solr-config-dir/lang/stopwords_no.txt b/archiver/solr-config-dir/lang/stopwords_no.txt new file mode 100644 index 0000000..a7a2c28 --- /dev/null +++ b/archiver/solr-config-dir/lang/stopwords_no.txt @@ -0,0 +1,194 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/norwegian/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Norwegian stop word list. Comments begin with vertical bar. 
Each stop + | word is at the start of a line. + + | This stop word list is for the dominant bokmål dialect. Words unique + | to nynorsk are marked *. + + | Revised by Jan Bruusgaard , Jan 2005 + +og | and +i | in +jeg | I +det | it/this/that +at | to (w. inf.) +en | a/an +et | a/an +den | it/this/that +til | to +er | is/am/are +som | who/that +på | on +de | they / you(formal) +med | with +han | he +av | of +ikke | not +ikkje | not * +der | there +så | so +var | was/were +meg | me +seg | you +men | but +ett | one +har | have +om | about +vi | we +min | my +mitt | my +ha | have +hadde | had +hun | she +nå | now +over | over +da | when/as +ved | by/know +fra | from +du | you +ut | out +sin | your +dem | them +oss | us +opp | up +man | you/one +kan | can +hans | his +hvor | where +eller | or +hva | what +skal | shall/must +selv | self (reflective) +sjøl | self (reflective) +her | here +alle | all +vil | will +bli | become +ble | became +blei | became * +blitt | have become +kunne | could +inn | in +når | when +være | be +kom | come +noen | some +noe | some +ville | would +dere | you +som | who/which/that +deres | their/theirs +kun | only/just +ja | yes +etter | after +ned | down +skulle | should +denne | this +for | for/because +deg | you +si | hers/his +sine | hers/his +sitt | hers/his +mot | against +å | to +meget | much +hvorfor | why +dette | this +disse | these/those +uten | without +hvordan | how +ingen | none +din | your +ditt | your +blir | become +samme | same +hvilken | which +hvilke | which (plural) +sånn | such a +inni | inside/within +mellom | between +vår | our +hver | each +hvem | who +vors | us/ours +hvis | whose +både | both +bare | only/just +enn | than +fordi | as/because +før | before +mange | many +også | also +slik | just +vært | been +være | to be +båe | both * +begge | both +siden | since +dykk | your * +dykkar | yours * +dei | they * +deira | them * +deires | theirs * +deim | them * +di | your (fem.) 
* +då | as/when * +eg | I * +ein | a/an * +eit | a/an * +eitt | a/an * +elles | or * +honom | he * +hjå | at * +ho | she * +hoe | she * +henne | her +hennar | her/hers +hennes | hers +hoss | how * +hossen | how * +ikkje | not * +ingi | noone * +inkje | noone * +korleis | how * +korso | how * +kva | what/which * +kvar | where * +kvarhelst | where * +kven | who/whom * +kvi | why * +kvifor | why * +me | we * +medan | while * +mi | my * +mine | my * +mykje | much * +no | now * +nokon | some (masc./neut.) * +noka | some (fem.) * +nokor | some * +noko | some * +nokre | some * +si | his/hers * +sia | since * +sidan | since * +so | so * +somt | some * +somme | some * +um | about* +upp | up * +vere | be * +vore | was * +verte | become * +vort | become * +varte | became * +vart | became * + diff --git a/archiver/solr-config-dir/lang/stopwords_pt.txt b/archiver/solr-config-dir/lang/stopwords_pt.txt new file mode 100644 index 0000000..acfeb01 --- /dev/null +++ b/archiver/solr-config-dir/lang/stopwords_pt.txt @@ -0,0 +1,253 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/portuguese/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Portuguese stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + + | The following is a ranked list (commonest to rarest) of stopwords + | deriving from a large sample of text. + + | Extra words have been added at the end. 
+ +de | of, from +a | the; to, at; her +o | the; him +que | who, that +e | and +do | de + o +da | de + a +em | in +um | a +para | for + | é from SER +com | with +não | not, no +uma | a +os | the; them +no | em + o +se | himself etc +na | em + a +por | for +mais | more +as | the; them +dos | de + os +como | as, like +mas | but + | foi from SER +ao | a + o +ele | he +das | de + as + | tem from TER +à | a + a +seu | his +sua | her +ou | or + | ser from SER +quando | when +muito | much + | há from HAV +nos | em + os; us +já | already, now + | está from EST +eu | I +também | also +só | only, just +pelo | per + o +pela | per + a +até | up to +isso | that +ela | he +entre | between + | era from SER +depois | after +sem | without +mesmo | same +aos | a + os + | ter from TER +seus | his +quem | whom +nas | em + as +me | me +esse | that +eles | they + | estão from EST +você | you + | tinha from TER + | foram from SER +essa | that +num | em + um +nem | nor +suas | her +meu | my +às | a + as +minha | my + | têm from TER +numa | em + uma +pelos | per + os +elas | they + | havia from HAV + | seja from SER +qual | which + | será from SER +nós | we + | tenho from TER +lhe | to him, her +deles | of them +essas | those +esses | those +pelas | per + as +este | this + | fosse from SER +dele | of him + + | other words. There are many contractions such as naquele = em+aquele, + | mo = me+o, but they are rare. + | Indefinite article plural forms are also rare. 
+ +tu | thou +te | thee +vocês | you (plural) +vos | you +lhes | to them +meus | my +minhas +teu | thy +tua +teus +tuas +nosso | our +nossa +nossos +nossas + +dela | of her +delas | of them + +esta | this +estes | these +estas | these +aquele | that +aquela | that +aqueles | those +aquelas | those +isto | this +aquilo | that + + | forms of estar, to be (not including the infinitive): +estou +está +estamos +estão +estive +esteve +estivemos +estiveram +estava +estávamos +estavam +estivera +estivéramos +esteja +estejamos +estejam +estivesse +estivéssemos +estivessem +estiver +estivermos +estiverem + + | forms of haver, to have (not including the infinitive): +hei +há +havemos +hão +houve +houvemos +houveram +houvera +houvéramos +haja +hajamos +hajam +houvesse +houvéssemos +houvessem +houver +houvermos +houverem +houverei +houverá +houveremos +houverão +houveria +houveríamos +houveriam + + | forms of ser, to be (not including the infinitive): +sou +somos +são +era +éramos +eram +fui +foi +fomos +foram +fora +fôramos +seja +sejamos +sejam +fosse +fôssemos +fossem +for +formos +forem +serei +será +seremos +serão +seria +seríamos +seriam + + | forms of ter, to have (not including the infinitive): +tenho +tem +temos +tém +tinha +tínhamos +tinham +tive +teve +tivemos +tiveram +tivera +tivéramos +tenha +tenhamos +tenham +tivesse +tivéssemos +tivessem +tiver +tivermos +tiverem +terei +terá +teremos +terão +teria +teríamos +teriam diff --git a/archiver/solr-config-dir/lang/stopwords_ro.txt b/archiver/solr-config-dir/lang/stopwords_ro.txt new file mode 100644 index 0000000..4fdee90 --- /dev/null +++ b/archiver/solr-config-dir/lang/stopwords_ro.txt @@ -0,0 +1,233 @@ +# This file was created by Jacques Savoy and is distributed under the BSD license. +# See http://members.unine.ch/jacques.savoy/clef/index.html. 
+# Also see http://www.opensource.org/licenses/bsd-license.html +acea +aceasta +această +aceea +acei +aceia +acel +acela +acele +acelea +acest +acesta +aceste +acestea +aceşti +aceştia +acolo +acum +ai +aia +aibă +aici +al +ăla +ale +alea +ălea +altceva +altcineva +am +ar +are +aş +aşadar +asemenea +asta +ăsta +astăzi +astea +ăstea +ăştia +asupra +aţi +au +avea +avem +aveţi +azi +bine +bucur +bună +ca +că +căci +când +care +cărei +căror +cărui +cât +câte +câţi +către +câtva +ce +cel +ceva +chiar +cînd +cine +cineva +cît +cîte +cîţi +cîtva +contra +cu +cum +cumva +curând +curînd +da +dă +dacă +dar +datorită +de +deci +deja +deoarece +departe +deşi +din +dinaintea +dintr +dintre +drept +după +ea +ei +el +ele +eram +este +eşti +eu +face +fără +fi +fie +fiecare +fii +fim +fiţi +iar +ieri +îi +îl +îmi +împotriva +în +înainte +înaintea +încât +încît +încotro +între +întrucât +întrucît +îţi +la +lângă +le +li +lîngă +lor +lui +mă +mâine +mea +mei +mele +mereu +meu +mi +mine +mult +multă +mulţi +ne +nicăieri +nici +nimeni +nişte +noastră +noastre +noi +noştri +nostru +nu +ori +oricând +oricare +oricât +orice +oricînd +oricine +oricît +oricum +oriunde +până +pe +pentru +peste +pînă +poate +pot +prea +prima +primul +prin +printr +sa +să +săi +sale +sau +său +se +şi +sînt +sîntem +sînteţi +spre +sub +sunt +suntem +sunteţi +ta +tăi +tale +tău +te +ţi +ţie +tine +toată +toate +tot +toţi +totuşi +tu +un +una +unde +undeva +unei +unele +uneori +unor +vă +vi +voastră +voastre +voi +voştri +vostru +vouă +vreo +vreun diff --git a/archiver/solr-config-dir/lang/stopwords_ru.txt b/archiver/solr-config-dir/lang/stopwords_ru.txt new file mode 100644 index 0000000..5527140 --- /dev/null +++ b/archiver/solr-config-dir/lang/stopwords_ru.txt @@ -0,0 +1,243 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/russian/stop.txt + | This file is distributed under the BSD License. 
+ | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | a russian stop word list. comments begin with vertical bar. each stop + | word is at the start of a line. + + | this is a ranked list (commonest to rarest) of stopwords derived from + | a large text sample. + + | letter `ё' is translated to `е'. + +и | and +в | in/into +во | alternative form +не | not +что | what/that +он | he +на | on/onto +я | i +с | from +со | alternative form +как | how +а | milder form of `no' (but) +то | conjunction and form of `that' +все | all +она | she +так | so, thus +его | him +но | but +да | yes/and +ты | thou +к | towards, by +у | around, chez +же | intensifier particle +вы | you +за | beyond, behind +бы | conditional/subj. particle +по | up to, along +только | only +ее | her +мне | to me +было | it was +вот | here is/are, particle +от | away from +меня | me +еще | still, yet, more +нет | no, there isnt/arent +о | about +из | out of +ему | to him +теперь | now +когда | when +даже | even +ну | so, well +вдруг | suddenly +ли | interrogative particle +если | if +уже | already, but homonym of `narrower' +или | or +ни | neither +быть | to be +был | he was +него | prepositional form of его +до | up to +вас | you accusative +нибудь | indef. 
suffix preceded by hyphen +опять | again +уж | already, but homonym of `adder' +вам | to you +сказал | he said +ведь | particle `after all' +там | there +потом | then +себя | oneself +ничего | nothing +ей | to her +может | usually with `быть' as `maybe' +они | they +тут | here +где | where +есть | there is/are +надо | got to, must +ней | prepositional form of ей +для | for +мы | we +тебя | thee +их | them, their +чем | than +была | she was +сам | self +чтоб | in order to +без | without +будто | as if +человек | man, person, one +чего | genitive form of `what' +раз | once +тоже | also +себе | to oneself +под | beneath +жизнь | life +будет | will be +ж | short form of intensifer particle `же' +тогда | then +кто | who +этот | this +говорил | was saying +того | genitive form of `that' +потому | for that reason +этого | genitive form of `this' +какой | which +совсем | altogether +ним | prepositional form of `его', `они' +здесь | here +этом | prepositional form of `этот' +один | one +почти | almost +мой | my +тем | instrumental/dative plural of `тот', `то' +чтобы | full form of `in order that' +нее | her (acc.) +кажется | it seems +сейчас | now +были | they were +куда | where to +зачем | why +сказать | to say +всех | all (acc., gen. preposn. plural) +никогда | never +сегодня | today +можно | possible, one can +при | by +наконец | finally +два | two +об | alternative form of `о', about +другой | another +хоть | even +после | after +над | above +больше | more +тот | that one (masc.) +через | across, in +эти | these +нас | us +про | about +всего | in all, only, of all +них | prepositional form of `они' (they) +какая | which, feminine +много | lots +разве | interrogative particle +сказала | she said +три | three +эту | this, acc. fem. sing. +моя | my, feminine +впрочем | moreover, besides +хорошо | good +свою | ones own, acc. fem. sing. +этой | oblique form of `эта', fem. `this' +перед | in front of +иногда | sometimes +лучше | better +чуть | a little +том | preposn. 
form of `that one' +нельзя | one must not +такой | such a one +им | to them +более | more +всегда | always +конечно | of course +всю | acc. fem. sing of `all' +между | between + + + | b: some paradigms + | + | personal pronouns + | + | я меня мне мной [мною] + | ты тебя тебе тобой [тобою] + | он его ему им [него, нему, ним] + | она ее эи ею [нее, нэи, нею] + | оно его ему им [него, нему, ним] + | + | мы нас нам нами + | вы вас вам вами + | они их им ими [них, ним, ними] + | + | себя себе собой [собою] + | + | demonstrative pronouns: этот (this), тот (that) + | + | этот эта это эти + | этого эты это эти + | этого этой этого этих + | этому этой этому этим + | этим этой этим [этою] этими + | этом этой этом этих + | + | тот та то те + | того ту то те + | того той того тех + | тому той тому тем + | тем той тем [тою] теми + | том той том тех + | + | determinative pronouns + | + | (a) весь (all) + | + | весь вся все все + | всего всю все все + | всего всей всего всех + | всему всей всему всем + | всем всей всем [всею] всеми + | всем всей всем всех + | + | (b) сам (himself etc) + | + | сам сама само сами + | самого саму само самих + | самого самой самого самих + | самому самой самому самим + | самим самой самим [самою] самими + | самом самой самом самих + | + | stems of verbs `to be', `to have', `to do' and modal + | + | быть бы буд быв есть суть + | име + | дел + | мог мож мочь + | уме + | хоч хот + | долж + | можн + | нужн + | нельзя + diff --git a/archiver/solr-config-dir/lang/stopwords_sv.txt b/archiver/solr-config-dir/lang/stopwords_sv.txt new file mode 100644 index 0000000..096f87f --- /dev/null +++ b/archiver/solr-config-dir/lang/stopwords_sv.txt @@ -0,0 +1,133 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/swedish/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. 
+ | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Swedish stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This is a ranked list (commonest to rarest) of stopwords derived from + | a large text sample. + + | Swedish stop words occasionally exhibit homonym clashes. For example + | så = so, but also seed. These are indicated clearly below. + +och | and +det | it, this/that +att | to (with infinitive) +i | in, at +en | a +jag | I +hon | she +som | who, that +han | he +på | on +den | it, this/that +med | with +var | where, each +sig | him(self) etc +för | for +så | so (also: seed) +till | to +är | is +men | but +ett | a +om | if; around, about +hade | had +de | they, these/those +av | of +icke | not, no +mig | me +du | you +henne | her +då | then, when +sin | his +nu | now +har | have +inte | inte någon = no one +hans | his +honom | him +skulle | 'sake' +hennes | her +där | there +min | my +man | one (pronoun) +ej | nor +vid | at, by, on (also: vast) +kunde | could +något | some etc +från | from, off +ut | out +när | when +efter | after, behind +upp | up +vi | we +dem | them +vara | be +vad | what +över | over +än | than +dig | you +kan | can +sina | his +här | here +ha | have +mot | towards +alla | all +under | under (also: wonder) +någon | some etc +eller | or (else) +allt | all +mycket | much +sedan | since +ju | why +denna | this/that +själv | myself, yourself etc +detta | this/that +åt | to +utan | without +varit | was +hur | how +ingen | no +mitt | my +ni | you +bli | to be, become +blev | from bli +oss | us +din | thy +dessa | these/those +några | some etc +deras | their +blir | from bli +mina | my +samma | (the) same +vilken | who, that +er | you, your +sådan | such a +vår | our +blivit | from bli +dess | its +inom | within +mellan | between +sådant | such a +varför | why +varje | each +vilka | who, that +ditt | thy +vem | who 
+vilket | who, that +sitta | his +sådana | such a +vart | each +dina | thy +vars | whose +vårt | our +våra | our +ert | your +era | your +vilkas | whose + diff --git a/archiver/solr-config-dir/lang/stopwords_th.txt b/archiver/solr-config-dir/lang/stopwords_th.txt new file mode 100644 index 0000000..07f0fab --- /dev/null +++ b/archiver/solr-config-dir/lang/stopwords_th.txt @@ -0,0 +1,119 @@ +# Thai stopwords from: +# "Opinion Detection in Thai Political News Columns +# Based on Subjectivity Analysis" +# Khampol Sukhum, Supot Nitsuwat, and Choochart Haruechaiyasak +ไว้ +ไม่ +ไป +ได้ +ให้ +ใน +โดย +แห่ง +แล้ว +และ +แรก +แบบ +แต่ +เอง +เห็น +เลย +เริ่ม +เรา +เมื่อ +เพื่อ +เพราะ +เป็นการ +เป็น +เปิดเผย +เปิด +เนื่องจาก +เดียวกัน +เดียว +เช่น +เฉพาะ +เคย +เข้า +เขา +อีก +อาจ +อะไร +ออก +อย่าง +อยู่ +อยาก +หาก +หลาย +หลังจาก +หลัง +หรือ +หนึ่ง +ส่วน +ส่ง +สุด +สําหรับ +ว่า +วัน +ลง +ร่วม +ราย +รับ +ระหว่าง +รวม +ยัง +มี +มาก +มา +พร้อม +พบ +ผ่าน +ผล +บาง +น่า +นี้ +นํา +นั้น +นัก +นอกจาก +ทุก +ที่สุด +ที่ +ทําให้ +ทํา +ทาง +ทั้งนี้ +ทั้ง +ถ้า +ถูก +ถึง +ต้อง +ต่างๆ +ต่าง +ต่อ +ตาม +ตั้งแต่ +ตั้ง +ด้าน +ด้วย +ดัง +ซึ่ง +ช่วง +จึง +จาก +จัด +จะ +คือ +ความ +ครั้ง +คง +ขึ้น +ของ +ขอ +ขณะ +ก่อน +ก็ +การ +กับ +กัน +กว่า +กล่าว diff --git a/archiver/solr-config-dir/lang/stopwords_tr.txt b/archiver/solr-config-dir/lang/stopwords_tr.txt new file mode 100644 index 0000000..84d9408 --- /dev/null +++ b/archiver/solr-config-dir/lang/stopwords_tr.txt @@ -0,0 +1,212 @@ +# Turkish stopwords from LUCENE-559 +# merged with the list from "Information Retrieval on Turkish Texts" +# (http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf) +acaba +altmış +altı +ama +ancak +arada +aslında +ayrıca +bana +bazı +belki +ben +benden +beni +benim +beri +beş +bile +bin +bir +birçok +biri +birkaç +birkez +birşey +birşeyi +biz +bize +bizden +bizi +bizim +böyle +böylece +bu +buna +bunda +bundan +bunlar +bunları +bunların +bunu +bunun +burada +çok +çünkü +da +daha +dahi +de +defa +değil +diğer 
+diye +doksan +dokuz +dolayı +dolayısıyla +dört +edecek +eden +ederek +edilecek +ediliyor +edilmesi +ediyor +eğer +elli +en +etmesi +etti +ettiği +ettiğini +gibi +göre +halen +hangi +hatta +hem +henüz +hep +hepsi +her +herhangi +herkesin +hiç +hiçbir +için +iki +ile +ilgili +ise +işte +itibaren +itibariyle +kadar +karşın +katrilyon +kendi +kendilerine +kendini +kendisi +kendisine +kendisini +kez +ki +kim +kimden +kime +kimi +kimse +kırk +milyar +milyon +mu +mü +mı +nasıl +ne +neden +nedenle +nerde +nerede +nereye +niye +niçin +o +olan +olarak +oldu +olduğu +olduğunu +olduklarını +olmadı +olmadığı +olmak +olması +olmayan +olmaz +olsa +olsun +olup +olur +olursa +oluyor +on +ona +ondan +onlar +onlardan +onları +onların +onu +onun +otuz +oysa +öyle +pek +rağmen +sadece +sanki +sekiz +seksen +sen +senden +seni +senin +siz +sizden +sizi +sizin +şey +şeyden +şeyi +şeyler +şöyle +şu +şuna +şunda +şundan +şunları +şunu +tarafından +trilyon +tüm +üç +üzere +var +vardı +ve +veya +ya +yani +yapacak +yapılan +yapılması +yapıyor +yapmak +yaptı +yaptığı +yaptığını +yaptıkları +yedi +yerine +yetmiş +yine +yirmi +yoksa +yüz +zaten diff --git a/archiver/solr-config-dir/lang/userdict_ja.txt b/archiver/solr-config-dir/lang/userdict_ja.txt new file mode 100644 index 0000000..6f0368e --- /dev/null +++ b/archiver/solr-config-dir/lang/userdict_ja.txt @@ -0,0 +1,29 @@ +# +# This is a sample user dictionary for Kuromoji (JapaneseTokenizer) +# +# Add entries to this file in order to override the statistical model in terms +# of segmentation, readings and part-of-speech tags. Notice that entries do +# not have weights since they are always used when found. This is by-design +# in order to maximize ease-of-use. +# +# Entries are defined using the following CSV format: +# , ... , ... , +# +# Notice that a single half-width space separates tokens and readings, and +# that the number tokens and readings must match exactly. +# +# Also notice that multiple entries with the same is undefined. 
+# +# Whitespace only lines are ignored. Comments are not allowed on entry lines. +# + +# Custom segmentation for kanji compounds +日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞 +関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム名詞 + +# Custom segmentation for compound katakana +トートバッグ,トート バッグ,トート バッグ,かずカナ名詞 +ショルダーバッグ,ショルダー バッグ,ショルダー バッグ,かずカナ名詞 + +# Custom reading for former sumo wrestler +朝青龍,朝青龍,アサショウリュウ,カスタム人名 diff --git a/archiver/solr-config-dir/schema.xml b/archiver/solr-config-dir/schema.xml new file mode 100644 index 0000000..692dfe1 --- /dev/null +++ b/archiver/solr-config-dir/schema.xml @@ -0,0 +1,153 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + id + diff --git a/archiver/solr-config-dir/solrconfig.xml b/archiver/solr-config-dir/solrconfig.xml new file mode 100644 index 0000000..3331803 --- /dev/null +++ b/archiver/solr-config-dir/solrconfig.xml @@ -0,0 +1,1076 @@ + + + + + + + + + 9.8 + + + + + + + + + + + ${solr.data.dir:} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ${solr.lock.type:native} + + + + + + + + + + + + + + + + + + + + + ${solr.ulog.dir:} + ${solr.ulog.numVersionBuckets:65536} + + + + + ${solr.autoCommit.maxTime:15000} + false + + + + + + ${solr.autoSoftCommit.maxTime:3000} + + + + + + + + + + + + + + ${solr.max.booleanClauses:1024} + + + + + + + + + + + + + + + + + + + + + + + + true + + + + + + 20 + + + 200 + + + + + + + + + + + + + + + + + + + false + + + + + + + + + + + + + + + + + + + + + explicit + 10 + + + + + + + explicit + json + true + + + + + + + + + + text_general + + + + + + default + body + solr.DirectSolrSpellChecker + + internal + + 0.5 + + 2 + + 1 + + 5 + + 4 + + 0.01 + + + + + + + + + + + + default + on + true + 10 + 5 + 5 + true + true + 10 + 5 + + + spellcheck + + + + + + + + + + + + 100 + + + + + + + + 70 + + 0.5 + + [-\w ,/\n\"']{20,200} + + + + + + + ]]> + ]]> + + + + + 
+ + + + + + + + + + + + + + + + + + + ,, + ,, + ,, + ,, + ,]]> + ]]> + + + + + + 10 + .,!? + + + + + + + WORD + + + en + US + + + + + + + + + + + + + [^\w-\.] + _ + + + + + + + yyyy-MM-dd['T'[HH:mm[:ss[.SSS]][z + yyyy-MM-dd['T'[HH:mm[:ss[,SSS]][z + yyyy-MM-dd HH:mm[:ss[.SSS]][z + yyyy-MM-dd HH:mm[:ss[,SSS]][z + [EEE, ]dd MMM yyyy HH:mm[:ss] z + EEEE, dd-MMM-yy HH:mm:ss z + EEE MMM ppd HH:mm:ss [z ]yyyy + + + + + java.lang.String + text_general + + *_str + 256 + + + true + + + java.lang.Boolean + booleans + + + java.util.Date + pdates + + + java.lang.Long + java.lang.Integer + plongs + + + java.lang.Number + pdoubles + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From 4f4fe9146612e7a12350611de7b525a66b7673ea Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Tue, 28 Nov 2023 18:41:07 +1300 Subject: [PATCH 28/58] Update URL of CLOCKUP WIKI --- src/extwiki-data.rkt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/extwiki-data.rkt b/src/extwiki-data.rkt index ead1696..38e8a29 100644 --- a/src/extwiki-data.rkt +++ b/src/extwiki-data.rkt @@ -423,7 +423,7 @@ (extwiki^ '("aether") 'empty 'empty "Aether Wiki" "https://aether.wiki.gg/wiki/Aether_Wiki" #f #f) (extwiki^ '("before-darkness-falls") 'empty 'empty "Before Darkness Falls Wiki" "https://beforedarknessfalls.wiki.gg/wiki/Before_Darkness_Falls_Wiki" #f #f) (extwiki^ '("chivalry" "chivalry2") 'empty 'empty "Official Chivalry Wiki" "https://chivalry.wiki.gg/wiki/Chivalry_Wiki" #f #f) - (extwiki^ '("clockup") 'empty 'empty "CLOCKUP WIKI" "https://en.clockupwiki.org/wiki/CLOCKUP_WIKI:Plan" #f #f) + (extwiki^ '("clockup") 'empty 'empty "CLOCKUP WIKI" "https://en.clockup.wiki/wiki/Main_Page" #f #f) (extwiki^ '("half-life") 'empty 'empty "Combine OverWiki" "https://combineoverwiki.net/wiki/Main_Page" #f #f) (extwiki^ '("coromon") 'empty 'empty "Coromon Wiki" "https://coromon.wiki.gg/wiki/Coromon_Wiki" #f #f) (extwiki^ '("cosmoteer") 'empty 'empty "Cosmoteer Wiki" 
"https://cosmoteer.wiki.gg/wiki/Cosmoteer_Wiki" #f #f) From 4bf756bc9c3a855e7a48754dcd92066cd86c990c Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Tue, 5 Dec 2023 09:58:07 +1300 Subject: [PATCH 29/58] Solr schema title can store numbers --- archiver/solr-config-dir/schema.xml | 39 +++++++++++++---------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/archiver/solr-config-dir/schema.xml b/archiver/solr-config-dir/schema.xml index 692dfe1..a7bfa9e 100644 --- a/archiver/solr-config-dir/schema.xml +++ b/archiver/solr-config-dir/schema.xml @@ -83,40 +83,38 @@ https://solr.apache.org/guide/solr/latest/indexing-guide/document-analysis.html# - + + - + + - + - - - + + + + + - - + + + + - - + @@ -125,10 +123,7 @@ https://solr.apache.org/guide/solr/latest/indexing-guide/document-analysis.html# - + @@ -137,7 +132,7 @@ https://solr.apache.org/guide/solr/latest/indexing-guide/document-analysis.html# - + From 76eaaa0a4793678428d9d24d49e7efb932cb2901 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Tue, 5 Dec 2023 10:12:59 +1300 Subject: [PATCH 30/58] Solr schema add back text_general --- archiver/solr-config-dir/schema.xml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/archiver/solr-config-dir/schema.xml b/archiver/solr-config-dir/schema.xml index a7bfa9e..d82a24c 100644 --- a/archiver/solr-config-dir/schema.xml +++ b/archiver/solr-config-dir/schema.xml @@ -93,6 +93,19 @@ https://solr.apache.org/guide/solr/latest/indexing-guide/document-analysis.html# + + + + + + + + + + + + + From 61c304cf41880cc50cf0d69e754389ac0b17d972 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Wed, 6 Dec 2023 13:08:20 +1300 Subject: [PATCH 31/58] Implement Solr search provider --- src/config.rkt | 1 + src/page-search.rkt | 79 ++++++++++------------------- src/search-provider-fandom.rkt | 63 ++++++++++++++++++++++++ src/search-provider-solr.rkt | 90 ++++++++++++++++++++++++++++++++++ static/main.css | 13 +++++ 5 files changed, 192 insertions(+), 54 
deletions(-) create mode 100644 src/search-provider-fandom.rkt create mode 100644 src/search-provider-solr.rkt diff --git a/src/config.rkt b/src/config.rkt index 2158e77..fec546b 100644 --- a/src/config.rkt +++ b/src/config.rkt @@ -47,6 +47,7 @@ (feature_offline::enabled . "false") (feature_offline::format . "json.gz") (feature_offline::only . "false") + (feature_offline::search . "fandom") (access_log::enabled . "false") diff --git a/src/page-search.rkt b/src/page-search.rkt index e4960d8..4fb76f5 100644 --- a/src/page-search.rkt +++ b/src/page-search.rkt @@ -13,6 +13,8 @@ "application-globals.rkt" "config.rkt" "data.rkt" + "search-provider-fandom.rkt" + "search-provider-solr.rkt" "../lib/syntax.rkt" "../lib/thread-utils.rkt" "../lib/url-utils.rkt" @@ -22,47 +24,22 @@ (provide page-search) -(module+ test - (require rackunit - "test-utils.rkt") - (define search-json-data - '#hasheq((batchcomplete . #t) (query . #hasheq((search . (#hasheq((ns . 0) (pageid . 219) (size . 1482) (snippet . "") (timestamp . "2022-08-21T08:54:23Z") (title . "Gacha Capsule") (wordcount . 214)) #hasheq((ns . 0) (pageid . 201) (size . 1198) (snippet . "") (timestamp . "2022-07-11T17:52:47Z") (title . "Badges") (wordcount . 181))))))))) +(define search-providers + (hash "fandom" generate-results-content-fandom + "solr" generate-results-content-solr)) ;; this takes the info we gathered from fandom and makes the big fat x-expression page -(define (generate-results-page req dest-url wikiname query data #:siteinfo [siteinfo #f]) - (define search-results (jp "/query/search" data)) +(define (generate-results-page req source-url wikiname query results-content #:siteinfo [siteinfo #f]) ;; this is *another* helper that builds the wiki page UI and lets me put the search results (or whatever else) in the middle (generate-wiki-page ;; so I provide my helper function with the necessary context... 
#:req req - #:source-url dest-url + #:source-url source-url #:wikiname wikiname #:title query #:siteinfo siteinfo ;; and here's the actual results to display in the wiki page layout - `(div (@ (class "mw-parser-output")) - ;; header before the search results showing how many we found - (p ,(format "~a results found for " (length search-results)) - (strong ,query)) - ;; *u*nordered *l*ist of matching search results - (ul ,@(map - (λ (result) ;; for each result, run this code... - (let* ([title (jp "/title" result)] - [page-path (page-title->path title)] - [timestamp (jp "/timestamp" result)] - [wordcount (jp "/wordcount" result)] - [size (jp "/size" result)]) - ;; and make this x-expression... - `(li (@ (class "my-result")) - (a (@ (class "my-result__link") (href ,(format "/~a/wiki/~a" wikiname page-path))) ; using unquote to insert the result page URL - ,title) ; using unquote to insert the result page title - (div (@ (class "my-result__info")) ; constructing the line under the search result - "last edited " - (time (@ (datetime ,timestamp)) ,(list-ref (string-split timestamp "T") 0)) - ,(format ", ~a words, ~a kb" - wordcount - (exact->inexact (/ (round (/ size 100)) 10))))))) - search-results))))) + results-content)) ;; will be called when the web browser asks to load the page (define (page-search req) @@ -71,34 +48,31 @@ ;; the URL will look like "/minecraft/wiki/Special:Search?q=Spawner" ;; grab the first part to use as the wikiname, in this case, "minecraft" (define wikiname (path/param-path (first (url-path (request-uri req))))) + ;; grab a dict of url search params + (define params (url-query (request-uri req))) ;; grab the part after ?q= which is the search terms - (define query (dict-ref (url-query (request-uri req)) 'q #f)) - ;; constructing the URL where I want to get fandom data from... 
- (define origin (format "https://~a.fandom.com" wikiname)) - ;; the dest-URL will look something like https://minecraft.fandom.com/api.php?action=query&list=search&srsearch=Spawner&formatversion=2&format=json - (define dest-url - (format "~a/api.php?~a" - origin - (params->query `(("action" . "query") - ("list" . "search") - ("srsearch" . ,query) - ("formatversion" . "2") - ("format" . "json"))))) + (define query (dict-ref params 'q #f)) + ;; figure out which search provider we're going to use + (define search-provider (hash-ref search-providers (config-get 'feature_offline::search))) - ;; simultaneously get the search results from the fandom API, as well as information about the wiki as a whole (its license, icon, name) - (define-values (dest-res siteinfo) + ;; external special:search url to link at the bottom of the page as the upstream source + (define external-search-url + (format "https://~a.fandom.com/wiki/Special:Search?~a" + wikiname + (params->query `(("query" . ,query) + ("search" . "internal"))))) + + ;; simultaneously get the search results, as well as information about the wiki as a whole (its license, icon, name) + (define-values (results-content siteinfo) (thread-values (λ () - (log-outgoing dest-url) - (easy:get dest-url #:timeouts timeouts)) ;; HTTP request to dest-url for search results + (search-provider wikiname query params)) ;; call the search provider (see file "search-provider-fandom.rkt") (λ () (siteinfo-fetch wikiname)))) ;; helper function in another file to get information about the wiki - ;; search results are a JSON string. 
parse JSON into racket data structures - (define data (easy:response-json dest-res)) ;; calling my generate-results-page function with the information so far in order to get a big fat x-expression ;; big fat x-expression goes into the body variable - (define body (generate-results-page req dest-url wikiname query data #:siteinfo siteinfo)) + (define body (generate-results-page req external-search-url wikiname query results-content #:siteinfo siteinfo)) ;; error checking (when (config-true? 'debug) ; used for its side effects @@ -110,7 +84,4 @@ #:headers (build-headers always-headers) (λ (out) (write-html body out))))) -(module+ test - (parameterize ([(config-parameter 'feature_offline::only) "false"]) - (check-not-false ((query-selector (attribute-selector 'href "/test/wiki/Gacha_Capsule") - (generate-results-page test-req "" "test" "Gacha" search-json-data)))))) + diff --git a/src/search-provider-fandom.rkt b/src/search-provider-fandom.rkt new file mode 100644 index 0000000..945b111 --- /dev/null +++ b/src/search-provider-fandom.rkt @@ -0,0 +1,63 @@ +#lang racket/base +(require racket/string + (prefix-in easy: net/http-easy) + "application-globals.rkt" + "config.rkt" + "../lib/url-utils.rkt" + "whole-utils.rkt" + "../lib/xexpr-utils.rkt") + +(provide + generate-results-content-fandom) + +(module+ test + (require rackunit + "test-utils.rkt") + (define search-json-data + '#hasheq((batchcomplete . #t) (query . #hasheq((search . (#hasheq((ns . 0) (pageid . 219) (size . 1482) (snippet . "") (timestamp . "2022-08-21T08:54:23Z") (title . "Gacha Capsule") (wordcount . 214)) #hasheq((ns . 0) (pageid . 201) (size . 1198) (snippet . "") (timestamp . "2022-07-11T17:52:47Z") (title . "Badges") (wordcount . 181))))))))) + +(define (generate-results-content-fandom wikiname query params) + ;; constructing the URL where I want to get fandom data from... 
+ (define origin (format "https://~a.fandom.com" wikiname)) + ;; the dest-URL will look something like https://minecraft.fandom.com/api.php?action=query&list=search&srsearch=Spawner&formatversion=2&format=json + (define dest-url + (format "~a/api.php?~a" + origin + (params->query `(("action" . "query") + ("list" . "search") + ("srsearch" . ,query) + ("formatversion" . "2") + ("format" . "json"))))) + ;; HTTP request to dest-url for search results + (log-outgoing dest-url) + (define res (easy:get dest-url #:timeouts timeouts)) + (define json (easy:response-json res)) + (define search-results (jp "/query/search" json)) + + ;; generate content for display in the wiki page layout + `(div (@ (class "mw-parser-output")) + ;; header before the search results showing how many we found + (p ,(format "~a results found for " (length search-results)) + (strong ,query)) + ;; *u*nordered *l*ist of matching search results + (ul ,@(for/list ([result search-results]) + (let* ([title (jp "/title" result)] + [page-path (page-title->path title)] + [timestamp (jp "/timestamp" result)] + [wordcount (jp "/wordcount" result)] + [size (jp "/size" result)]) + ;; and make this x-expression... 
+ `(li (@ (class "my-result")) + (a (@ (class "my-result__link") (href ,(format "/~a/wiki/~a" wikiname page-path))) ; using unquote to insert the result page URL + ,title) ; using unquote to insert the result page title + (div (@ (class "my-result__info")) ; constructing the line under the search result + "last edited " + (time (@ (datetime ,timestamp)) ,(list-ref (string-split timestamp "T") 0)) + ,(format ", ~a words, ~a kb" + wordcount + (exact->inexact (/ (round (/ size 100)) 10)))))))))) + +(module+ test + (parameterize ([(config-parameter 'feature_offline::only) "false"]) + (check-not-false ((query-selector (attribute-selector 'href "/test/wiki/Gacha_Capsule") + (generate-results-content-fandom test-req "" "test" "Gacha" search-json-data)))))) diff --git a/src/search-provider-solr.rkt b/src/search-provider-solr.rkt new file mode 100644 index 0000000..fbd77de --- /dev/null +++ b/src/search-provider-solr.rkt @@ -0,0 +1,90 @@ +#lang racket/base +(require racket/dict + racket/string + (prefix-in easy: net/http-easy) + "application-globals.rkt" + "../lib/html-parsing/main.rkt" + "../lib/url-utils.rkt" + "whole-utils.rkt" + "../lib/xexpr-utils.rkt") + +(provide + generate-results-content-solr) + +(struct result^ (hl-title hl-body kb words page-path) #:transparent) + +(define (generate-results-content-solr wikiname query params) + ;; grab things from params that would modify the search + (define op (if (equal? (dict-ref params 'op #f) "or") '("or" . "OR") '("and" . "AND"))) + (define sort (if (equal? (dict-ref params 'sort #f) "len") '("len" . "len desc") '("relevance" . 
"score desc"))) + + ;; the dest-URL will look something like http://localhost:8983/solr/bloons/select?defType=edismax&fl=id%2Clen&hl.defaultSummary=true&hl.encoder=html&hl.fl=title%2Cbody&hl.method=unified&hl.tag.post=%3C%2Fmark%3E&hl.tag.pre=%3Cmark%3E&hl=true&indent=true&q.op=AND&q=blo&qf=title_prefix%20title%5E2.0%20body%20table%5E0.3&useParams= + (define dest-url + (format "http://localhost:8983/solr/~a/select?~a" + wikiname + (params->query `(("defType" . "edismax") + ("q" . ,query) + ("q.op" . ,(cdr op)) + ("qf" . "title_prefix title^2.0 body table^0.3") + ("hl" . "true") + ("hl.method" . "unified") + ("hl.defaultSummary" . "true") + ("hl.fl" . "title,body") + ("fl" . "id,len,title") + ("hl.encoder" . "html") + ("hl.tag.pre" . "") + ("hl.tag.post" . "") + ("sort" . ,(cdr sort)))))) + ;; HTTP request to dest-url for search results + (log-outgoing dest-url) + (define res (easy:get dest-url #:timeouts timeouts)) + (define json (easy:response-json res)) + + ;; build result objects + (define highlighting (jp "/highlighting" json)) + (define results + (for/list ([doc (jp "/response/docs" json)]) + (define id (jp "/id" doc)) + (define len (jp "/len" doc)) + (define title (jp "/title" doc)) + (define page-path (page-title->path title)) + (define kb (exact->inexact (/ (round (/ len 100)) 10))) ; divide by 1000 and round to nearest 0.1 + (define words (* (round (/ len 60)) 10)) ; divide by 6 and round to nearest 10 + (define hl (hash-ref highlighting (string->symbol id))) + (define hl-title (cdr (html->xexp (jp "/title/0" hl)))) + (define hl-body (cdr (html->xexp (string-trim (jp "/body/0" hl))))) + (result^ hl-title hl-body kb words page-path))) + + (define qtime (exact->inexact (/ (round (/ (jp "/responseHeader/QTime" json) 10)) 100))) + + (define (value-selected? value current-value) + (append + `((value ,value)) + (if (equal? 
value current-value) + `((selected)) + `()))) + + ;; generate content for display in the wiki page layout + `(div (@ (class "mw-parser-output")) + (form (@ (class "my-result__filter")) + (input (@ (type "hidden") (name "q") (value ,query))) + (select (@ (name "op")) + (option (@ ,@(value-selected? "and" (car op))) "All words must match") + (option (@ ,@(value-selected? "or" (car op))) "Some words must match")) + (select (@ (name "sort")) + (option (@ ,@(value-selected? "relevance" (car sort))) "Relevant articles") + (option (@ ,@(value-selected? "len" (car sort))) "Wordiest articles")) + (button "Filter results")) + ;; header before the search results showing how many we found + (p ,(format "~a results (~a seconds) found for " (jp "/response/numFound" json) qtime) + (strong ,query)) + ;; *u*nordered *l*ist of matching search results + (ul ,@(for/list ([result results]) + `(li (@ (class "my-result")) + (a (@ (class "my-result__link") (href ,(format "/~a/wiki/~a" wikiname (result^-page-path result)))) ; url + ,@(result^-hl-title result)) ; title + (p (@ (class "my-result__description")) ,@(result^-hl-body result)) ; result preview + (div (@ (class "my-result__info")) ; line under the search result + ,(format "~a words, ~a kb of readable stuff" + (result^-words result) + (result^-kb result)))))))) diff --git a/static/main.css b/static/main.css index cb32df5..e3c0070 100644 --- a/static/main.css +++ b/static/main.css @@ -249,11 +249,24 @@ a.ext-audiobutton { /* see hearthstone/wiki/Diablo_(Duels_hero) */ .my-result__link { font-size: 1.2em; } +.my-result__description { + font-size: 0.8em; + white-space: pre-line; + margin-left: 1.2em; +} +.my-result mark { + background: rgba(255, 255, 0, 0.4); +} .my-result__info { font-size: 0.8em; color: var(--theme-page-text-color--hover); margin-left: 1.2em; } +.my-result__filter { + display: grid; + grid-template-columns: auto auto auto 1fr; + grid-gap: 8px; +} /* (breezewiki) search suggestions */ .bw-search-form { From 
e5e38762547a500a7bc547cc1721eb17dc88f3bb Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Wed, 6 Dec 2023 16:02:43 +1300 Subject: [PATCH 32/58] Solr managed schema sucks --- src/search-provider-solr.rkt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/search-provider-solr.rkt b/src/search-provider-solr.rkt index fbd77de..ed9fb7b 100644 --- a/src/search-provider-solr.rkt +++ b/src/search-provider-solr.rkt @@ -46,7 +46,8 @@ (for/list ([doc (jp "/response/docs" json)]) (define id (jp "/id" doc)) (define len (jp "/len" doc)) - (define title (jp "/title" doc)) + (define title (let ([t (jp "/title" doc)]) + (if (list? t) (car t) t))) (define page-path (page-title->path title)) (define kb (exact->inexact (/ (round (/ len 100)) 10))) ; divide by 1000 and round to nearest 0.1 (define words (* (round (/ len 60)) 10)) ; divide by 6 and round to nearest 10 From aea627b27f794c3a219d335f363130f16cdb0791 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Wed, 6 Dec 2023 16:35:07 +1300 Subject: [PATCH 33/58] Solr fix multivalued in schema --- archiver/solr-config-dir/schema.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/archiver/solr-config-dir/schema.xml b/archiver/solr-config-dir/schema.xml index d82a24c..c1eae9a 100644 --- a/archiver/solr-config-dir/schema.xml +++ b/archiver/solr-config-dir/schema.xml @@ -106,7 +106,7 @@ https://solr.apache.org/guide/solr/latest/indexing-guide/document-analysis.html# - + @@ -145,7 +145,7 @@ https://solr.apache.org/guide/solr/latest/indexing-guide/document-analysis.html# - + From 27c9680f5b5cd78fbec8e9f88200f39b5faf9a17 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Tue, 12 Dec 2023 10:35:49 +1300 Subject: [PATCH 34/58] Solr indexer code cleanup --- archiver/fts.rkt | 402 +++++++++++++++++++++++------------------------ 1 file changed, 200 insertions(+), 202 deletions(-) diff --git a/archiver/fts.rkt b/archiver/fts.rkt index 6e2266e..c2f597b 100644 --- a/archiver/fts.rkt +++ 
b/archiver/fts.rkt @@ -1,202 +1,200 @@ -#lang racket -(require racket/function - racket/future - racket/match - racket/path - racket/promise - racket/port - racket/string - file/gunzip - db - db/unsafe/sqlite3 - net/http-easy - json - json-pointer - "../lib/html-parsing/main.rkt" - "../lib/xexpr-utils.rkt" - "../lib/tree-updater.rkt") - -(define-syntax (seq stx) - (syntax-case stx () - [(_ body ...) - #`(for ([op (list (lambda () body) ...)] - [i (in-naturals)]) - (define res (op)) - (when (>= (response-status-code res) 400) - (error 'seq "op #~a: status code was ~a: ~v" i (response-status-code res) (response-json res))) - (define taskuid (json-pointer-value "/taskUid" (response-json res))) - (for/or ([ticks (in-naturals)] - [res2 (in-producer (lambda () (get (format "http://localhost:7700/tasks/~a" taskuid))))]) - (define status (json-pointer-value "/status" (response-json res2))) - (case status - [("enqueued" "processing") - (sleep 1) - #f] - [("succeeded") - (printf "op #~a: ~a (~a ticks)~n" i status ticks) - #t] - [else - (error 'seq "op #~a: task status was ~a: ~v" i status res2)])))])) - -(define (class-has? attributes substrs) - (define cl (or (get-attribute 'class attributes) "")) - (ormap (λ (substr) (string-contains? cl substr)) substrs)) - -(define (updater element element-type attributes children) - (cond - [(class-has? attributes '("collapsed" "selflink" "label" "toc" "editsection" "reviews")) - (list 'div '() '())] - [#t - (list element-type attributes children)])) - -(define slc (sqlite3-connect #:database "../storage/fts-separate.db")) -(sqlite3-load-extension slc "fts5") - -(define (writer tables-mode? page) - (define (writer-inner page) - (for ([bit page]) - (cond - [(and tables-mode? (pair? bit) (memq (car bit) '(h1 h2 h3 p blockquote q))) (void)] - [(and (not tables-mode?) (pair? bit) (memq (car bit) '(ul ol dl table))) (void)] - [(memq bit '(div p li td dd dt br)) (displayln "")] - [(symbol? bit) (void)] - [(and (pair? bit) (eq? 
(car bit) '*COMMENT*)) (void)] - [(and (pair? bit) (eq? (car bit) '@)) (void)] - [(pair? bit) (writer-inner bit)] - [(string? bit) (display bit)]))) - (writer-inner page)) - -(define (write-and-post-process tables-mode? page) - (define text (with-output-to-string (λ () (writer tables-mode? page)))) - ;; (define text-no-numbers (regexp-replace* #px"(?:-|[+$£€¥] *)?[0-9,.]{2,}%?\\s*" text "")) - (define shrink-text (regexp-replace* #px"([ \t]*\r?\n+)+" text "\n")) - shrink-text) - -(define wikiname "bloons") -(define tablename (format "page_~a" wikiname)) - -(define ((extract f)) ; f - filename - (with-handlers - ([exn:fail? (λ (err) (println f) (raise err))]) - (define j - (case (path-get-extension f) - [(#".json") - (with-input-from-file f (λ () (read-json)))] - [(#".gz") - (define-values (in out) (make-pipe)) - (with-input-from-file f (λ () (gunzip-through-ports (current-input-port) out))) - (read-json in)] - [else #f])) - (define title (json-pointer-value "/parse/title" j)) - (define pageid (json-pointer-value "/parse/pageid" j)) - (define page-html (preprocess-html-wiki (json-pointer-value "/parse/text" j))) - (define page (update-tree updater (html->xexp page-html))) - (define body (write-and-post-process #f page)) - (define table (write-and-post-process #t page)) - (values title body table pageid))) - -(define results - (for/list ([f (directory-list (format "../storage/archive/~a" wikiname) #:build? #t)] - #:when (member (path-get-extension f) '(#".json" #".gz"))) - (extract f))) - -;; *************************************************************************************************** -;; TESTING WRITER -;; *************************************************************************************************** -#;(for/first ([fut results] - [i (in-naturals 1)] - #:when (i . >= . 
4859)) - (define-values (title body table pageid) (fut)) - (println title) - (println body) - (println table)) - -(println "inserting...") - -;; *************************************************************************************************** -;; SQLite FTS5 -;; *************************************************************************************************** -#;(begin - (query-exec slc "begin transaction") - #;(query-exec slc (format "create virtual table \"~a\" using fts5 (title, body, tokenize='porter unicode61')" wikiname)) - (time - (for ([fut results] - [i (in-naturals 1)]) - (display "-") - (when (and (> i 0) (= (modulo i 100) 0)) - (println i)) - (define-values (title shrink-text) (fut)) - (query-exec slc (format "insert into \"~a\" (title, body) values (?, ?)" tablename) title shrink-text))) - - (println "running optimize...") - (query-exec slc (format "insert into \"~a\" (\"~a\") values ('optimize')" tablename tablename)) - - (println "committing...") - (query-exec slc "commit")) - -;; *************************************************************************************************** -;; Solr -;; *************************************************************************************************** -(begin - (define data - (cond - #;[(file-exists? "cache.rkt") - (println "reading in...") - (with-input-from-file "cache.rkt" (λ () (read)))] - [else - (define data - (for/list ([fut results] - [i (in-naturals 1)]) - (display "-") - (when (and (> i 0) (= (modulo i 100) 0)) - (println i)) - (define-values (title body table pageid) (fut)) - (define len (string-length body)) - `#hasheq((id . ,(number->string pageid)) - (title . ,title) - (body . ,body) - (table . ,table) - (len . 
,len)))) - - (println "writing out...") - (with-output-to-file "cache.rkt" (λ () (write data)) #:exists 'truncate/replace) - data])) - - (println "posting...") - (define res - (post (format "http://localhost:8983/solr/~a/update?commit=true" wikiname) - #:json data))) - -;; *************************************************************************************************** -;; Meilisearch -;; *************************************************************************************************** -#;(begin - (seq - (put (format "http://localhost:7700/indexes/~a/settings/searchable-attributes" wikiname) - #:json '("title" "body")) - (put (format "http://localhost:7700/indexes/~a/settings/ranking-rules" wikiname) - #:json '("words" "typo" #;"proximity" "attribute" "sort" "exactness" #;"len:desc")) - (call-with-input-file "stop-words.json" - (λ (in) - (put (format "http://localhost:7700/indexes/~a/settings/stop-words" wikiname) - #:headers '#hasheq((Content-Type . "application/json")) - #:data in)))) - (define data - (for/list ([fut results] - [i (in-naturals 1)]) - (display "-") - (when (and (> i 0) (= (modulo i 100) 0)) - (println i)) - (define-values (title body pageid) (fut)) - (define len (string-length body)) - `#hasheq((id . ,pageid) - (title . ,title) - (body . ,body) - (len . ,len)))) - (define res - (post (format "http://localhost:7700/indexes/~a/documents" wikiname) - #:json data)) - (seq res) - (println (response-json res))) - -(disconnect slc) +#lang cli +(require (for-syntax racket/base)) +(require racket/format + racket/function + racket/future + racket/match + racket/path + racket/promise + racket/port + racket/runtime-path + racket/string + file/gunzip + db + db/unsafe/sqlite3 + net/http-easy + json + json-pointer + "../lib/html-parsing/main.rkt" + "../lib/xexpr-utils.rkt" + "../lib/tree-updater.rkt") + +(flag (read-from-cache?) + ("-c" "--read-from-cache" "read from last run cache instead of rebuilding documents") + (read-from-cache? 
#t)) + +(define-runtime-path storage-path "../storage/archive") + +;; *************************************************************************************************** +;; Progress bar display +;; *************************************************************************************************** + +(struct progress^ (n max title) #:transparent) + +(define (make-m-s seconds) + (define-values (eta-m eta-s) (quotient/remainder seconds 60)) + (format "~a:~a" eta-m (~a eta-s #:width 2 #:align 'right #:pad-string "0"))) + +(define (make-progress get-p [history-size 20]) + (define update-sleep 1) + (define name-width 30) + (define max-width 105) + (define history (make-vector history-size 0)) + (define history-pointer 0) + (define elapsed 0) + (define (report-progress) + (define p (get-p)) + (define history-cycle (vector-ref history history-pointer)) + (vector-set! history history-pointer (progress^-n p)) + (set! history-pointer (modulo (add1 history-pointer) history-size)) + (set! elapsed (add1 elapsed)) + (define-values (eta-display diff-per-second) + (cond + [((progress^-n p) . >= . (progress^-max p)) (values (format "~a **" (make-m-s elapsed)) (format "** ~a" (quotient (progress^-max p) (max elapsed 1))))] + [(= history-cycle 0) (values "-:--" "--")] + [else (define diff-per-second (/ (- (progress^-n p) history-cycle) (* history-size update-sleep))) + (define eta-total + (if (diff-per-second . > . 
0) + (floor (round (/ (- (progress^-max p) (progress^-n p)) diff-per-second))) + 0)) + (values (make-m-s eta-total) + (round diff-per-second))])) + (define left (format "~a/~a ~a/s ~a ~a%" + (~a (progress^-n p) #:width (string-length (~a (progress^-max p))) #:align 'right #:pad-string " ") + (progress^-max p) + diff-per-second + eta-display + (floor (* 100 (/ (progress^-n p) (progress^-max p)))))) + (define name-display (~a (progress^-title p) #:max-width name-width #:limit-marker "...")) + (define remaining-space (- max-width name-width (string-length left) 2)) + (define bar-width + (floor (* (sub1 remaining-space) + (/ (progress^-n p) (progress^-max p))))) + (define bar (string-append (make-string bar-width #\=) + ">" + (make-string (- remaining-space bar-width) #\ ))) + (printf "\e[2K\r~a~a~a" left bar name-display) + (flush-output)) + (define (report-progress-loop) + (sleep update-sleep) + (report-progress) + (report-progress-loop)) + (define t (thread report-progress-loop)) + (define (quit) + (kill-thread t) + (report-progress) + (displayln "")) + quit) + +;; *************************************************************************************************** +;; Page text extractor +;; *************************************************************************************************** + +(define (class-has? attributes substrs) + (define cl (or (get-attribute 'class attributes) "")) + (ormap (λ (substr) (string-contains? cl substr)) substrs)) + +(define (updater element element-type attributes children) + (cond + [(class-has? attributes '("collapsed" "selflink" "label" "toc" "editsection" "reviews")) + (list 'div '() '())] + [#t + (list element-type attributes children)])) + +(define (writer tables-mode? page) + (define (writer-inner page) + (for ([bit page]) + (cond + [(and tables-mode? (pair? bit) (memq (car bit) '(h1 h2 h3 p blockquote q))) (void)] + [(and (not tables-mode?) (pair? 
bit) (memq (car bit) '(ul ol dl table))) (void)] + [(memq bit '(div p li td dd dt br)) (displayln "")] + [(symbol? bit) (void)] + [(and (pair? bit) (eq? (car bit) '*COMMENT*)) (void)] + [(and (pair? bit) (eq? (car bit) '@)) (void)] + [(pair? bit) (writer-inner bit)] + [(string? bit) (display bit)]))) + (writer-inner page)) + +(define (write-and-post-process tables-mode? page) + (define text (with-output-to-string (λ () (writer tables-mode? page)))) + ;; (define text-no-numbers (regexp-replace* #px"(?:-|[+$£€¥] *)?[0-9,.]{2,}%?\\s*" text "")) + (define shrink-text (regexp-replace* #px"([ \t]*\r?\n+)+" text "\n")) + shrink-text) + +(define ((extract f)) ; f - filename + (with-handlers + ([exn:fail? (λ (err) (printf "extract: ~a: ~v~n" f err))]) + (define j + (case (path-get-extension f) + [(#".json") + (with-input-from-file f (λ () (read-json)))] + [(#".gz") + (define-values (in out) (make-pipe)) + (with-input-from-file f (λ () (gunzip-through-ports (current-input-port) out))) + (read-json in)] + [else #f])) + (define title (json-pointer-value "/parse/title" j)) + (define pageid (json-pointer-value "/parse/pageid" j)) + (define page-html (preprocess-html-wiki (json-pointer-value "/parse/text" j))) + (define page (update-tree updater (html->xexp page-html))) + (define body (write-and-post-process #f page)) + (define table (write-and-post-process #t page)) + (list title body table pageid))) + +;; *************************************************************************************************** +;; Program, loop, Solr APIs +;; *************************************************************************************************** + +(program + (start [wikiname "wikiname to download"]) + + (define results + (for/list ([f (directory-list (build-path storage-path wikiname) #:build? #t)] + #:when (member (path-get-extension f) '(#".gz"))) + (extract f))) + + (define data + (cond + [(and (read-from-cache?) (file-exists? 
"cache.rkt")) + (displayln "Reading in...") + (with-input-from-file "cache.rkt" (λ () (read)))] + [else + (define x (box (progress^ 0 1 "..."))) + (define quit (make-progress (λ () (unbox x)))) + (define data + (for/list ([fut results] + [i (in-naturals 1)] + #:do [(define page (fut))] + #:when (not (void? page))) + (match-define (list title body table pageid) page) + (define len (string-length body)) + (set-box! x (progress^ i (length results) title)) + `#hasheq((id . ,(number->string pageid)) + (title . ,title) + (body . ,body) + (table . ,table) + (len . ,len)))) + (quit) + + (display "Writing out... ") + (flush-output) + (with-output-to-file "cache.rkt" (λ () (write data)) #:exists 'truncate/replace) + data])) + + (display "Converting... ") + (flush-output) + (define ser (jsexpr->bytes data)) + (define ser-port (open-input-bytes ser)) + (define quit (make-progress (λ () (progress^ (ceiling (/ (file-position ser-port) 64 1024)) + (ceiling (/ (bytes-length ser) 64 1024)) + "Posting...")) + 2)) + (define res + (post (format "http://localhost:8983/solr/~a/update?commit=true" wikiname) + #:data ser-port + #:headers '#hasheq((Content-Type . 
"application/json")) + #:timeouts (make-timeout-config #:lease 5 #:connect 5 #:request 300))) + (quit) + (displayln (response-status-line res))) + +(run start) From a57445abcbda2df7d030587111b7eb8155f769e9 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Tue, 12 Dec 2023 11:10:47 +1300 Subject: [PATCH 35/58] Archiver now saves redirects --- archiver/archiver-cli.rkt | 13 +-- archiver/archiver-database.rkt | 3 +- archiver/archiver.rkt | 138 +++++++++++++++++++++---------- lib/archive-file-mappings.rkt | 7 ++ src/extwiki-data.rkt | 6 +- src/page-global-search.rkt | 2 +- src/page-search.rkt | 3 +- src/page-wiki-offline.rkt | 146 ++++++++++++++++++--------------- 8 files changed, 197 insertions(+), 121 deletions(-) diff --git a/archiver/archiver-cli.rkt b/archiver/archiver-cli.rkt index 11f25d0..0f349d6 100644 --- a/archiver/archiver-cli.rkt +++ b/archiver/archiver-cli.rkt @@ -35,9 +35,12 @@ (output-lines? #t)])) (define (update-width) (when (output-progress?) - (with-charterm - (call-with-values (λ () (charterm-screen-size)) - (λ (cols rows) (set! width cols)))))) + (case (system-type 'os) + [(linux) + (with-charterm + (call-with-values (λ () (charterm-screen-size)) + (λ (cols rows) (set! width cols))))] + [else 100]))) (update-width) ;; check (when (or (not wikiname) (equal? 
wikiname "")) @@ -56,8 +59,8 @@ (define real-width (min (string-length basename) rest)) (define spare-width (- rest real-width)) (define name-display (substring basename 0 real-width)) - (define whitespace (make-string spare-width #\ )) - (printf "~a~a~a\r" prefix name-display whitespace)])) + (printf "\e[2K\r~a~a" prefix name-display) + (flush-output)])) ;; download all stages (for ([stage all-stages] [i (in-naturals 1)]) diff --git a/archiver/archiver-database.rkt b/archiver/archiver-database.rkt index b81ad6c..91a4070 100644 --- a/archiver/archiver-database.rkt +++ b/archiver/archiver-database.rkt @@ -48,7 +48,8 @@ ((query-exec slc* "alter table wiki add column sitename TEXT") (query-exec slc* "alter table wiki add column basepage TEXT") (query-exec slc* "alter table wiki add column license_text TEXT") - (query-exec slc* "alter table wiki add column license_url TEXT")))) + (query-exec slc* "alter table wiki add column license_url TEXT")) + ((query-exec slc* "alter table page add column redirect")))) (let do-migrate-step () (when (database-version . < . (length migrations)) diff --git a/archiver/archiver.rkt b/archiver/archiver.rkt index edd0d2b..84fd16f 100644 --- a/archiver/archiver.rkt +++ b/archiver/archiver.rkt @@ -1,5 +1,6 @@ #lang racket/base (require racket/file + racket/format racket/function racket/list racket/path @@ -43,10 +44,18 @@ wikiname (params->query '(("action" . "query") ("meta" . "siteinfo") - ("siprop" . "general|rightsinfo|statistics") + ("siprop" . "general|rightsinfo|statistics|namespaces") ("format" . "json") ("formatversion" . "2"))))) (define data (response-json (get dest-url))) + (define content-nss + (sort + (for/list ([(k v) (in-hash (jp "/query/namespaces" data))] + #:do [(define id (hash-ref v 'id))] + #:when (and (id . < . 2900) ; exclude maps namespace + (hash-ref v 'content))) ; exclude non-content and talk namespaces + id) + <)) (define exists? (query-maybe-value* "select progress from wiki where wikiname = ?" 
wikiname)) (if (and exists? (not (sql-null? exists?))) (query-exec* "update wiki set sitename = ?, basepage = ?, license_text = ?, license_url = ? where wikiname = ?" @@ -61,7 +70,8 @@ (second (regexp-match #rx"/wiki/(.*)" (jp "/query/general/base" data))) (jp "/query/rightsinfo/text" data) (jp "/query/rightsinfo/url" data))) - (jp "/query/statistics/articles" data)) + (values (jp "/query/statistics/articles" data) + content-nss)) (define (check-style-for-images wikiname path) @@ -131,48 +141,57 @@ ;; done yet? (unless (and (real? wiki-progress) (wiki-progress . >= . 1)) ;; Count total pages - (define num-pages (insert-wiki-entry wikiname)) + (define-values (num-pages namespaces) (insert-wiki-entry wikiname)) ;; Download the entire index of pages - (define basenames - (let loop ([path-with-namefrom "/wiki/Local_Sitemap"] - [basenames-previous-pages null]) - ;; Download the current index page - (define url (format "https://~a.fandom.com~a" wikiname path-with-namefrom)) - (define r (get url)) - ;; Metadata from this page (the link to the next page) - (define page (html->xexp (bytes->string/utf-8 (response-body r)))) - (define link-namefrom - ((query-selector (λ (t a c x) (and (eq? t 'a) - (pair? x) - (string-contains? (car x) "Next page") - (let ([href (get-attribute 'href a)] ) - (and href (string-contains? href "/wiki/Local_Sitemap"))))) - page #:include-text? #t))) - ;; Content from this page - (define basenames-this-page - (for/list ([link (in-producer - (query-selector - (λ (t a c) (eq? t 'a)) - ((query-selector (λ (t a c) (has-class? 
"mw-allpages-chunk" a)) page))) - #f)]) - (local-encoded-url->basename (get-attribute 'href (bits->attributes link))))) - ;; Call the progress callback - (define all-basenames (append basenames-previous-pages basenames-this-page)) - (callback (length all-basenames) num-pages (last all-basenames)) - ;; Recurse to download from the next page - (if link-namefrom - (loop (get-attribute 'href (bits->attributes link-namefrom)) all-basenames) - all-basenames))) - ;; Save those pages into the database - ;; SQLite can have a maximum of 32766 parameters in a single query - (for ([slice (in-slice 32760 basenames)]) - (define query-template (string-join (make-list (length slice) "(?1, ?, 0)") ", " #:before-first "insert or ignore into page (wikiname, basename, progress) values ")) - (call-with-transaction - (get-slc) - (λ () - (apply query-exec* query-template wikiname slice) - ;; Record that we have the complete list of pages - (query-exec* "update wiki set progress = 1 where wikiname = ?" wikiname)))))) + (for*/fold ([total 0]) + ([namespace namespaces] + [redir-filter '("nonredirects" "redirects")]) + (let loop ([apcontinue ""] + [basenames null]) + (cond + [apcontinue + (define url (format "https://~a.fandom.com/api.php?~a" + wikiname + (params->query `(("action" . "query") + ("list" . "allpages") + ("apnamespace" . ,(~a namespace)) + ("apfilterredir" . ,redir-filter) + ("aplimit" . "500") + ("apcontinue" . ,apcontinue) + ("format" . "json") + ("formatversion" . "2"))))) + ;; Download the current listing page + (define res (get url)) + (define json (response-json res)) + ;; Content from this page + (define current-basenames + (for/list ([page (jp "/query/allpages" json)]) + (title->basename (jp "/title" page)))) + (when ((length current-basenames) . > . 0) + ;; Report + (if (equal? 
redir-filter "nonredirects") + (callback (+ (length basenames) (length current-basenames) total) num-pages (last current-basenames)) + (callback total num-pages (last current-basenames)))) + ;; Loop + (loop (jp "/continue/apcontinue" json #f) (append basenames current-basenames))] + [else + ;; All done with this (loop)! Save those pages into the database + ;; SQLite can have a maximum of 32766 parameters in a single query + (begin0 + ;; next for*/fold + (if (equal? redir-filter "nonredirects") + (+ (length basenames) total) + total) ; redirects don't count for the site statistics total + (call-with-transaction + (get-slc) + (λ () + (for ([slice (in-slice 32760 basenames)]) + (define query-template + (string-join #:before-first "insert or ignore into page (wikiname, redirect, basename, progress) values " + (make-list (length slice) "(?1, ?2, ?, 0)") ", ")) + (apply query-exec* query-template wikiname (if (equal? redir-filter "redirects") 1 sql-null) slice)))))]))) + ;; Record that we have the complete list of pages + (query-exec* "update wiki set progress = 1 where wikiname = ?" wikiname))) ;; 2. Download each page via API and: @@ -183,7 +202,7 @@ (define save-dir (build-path archive-root wikiname)) (make-directory* save-dir) ;; gather list of basenames to download (that aren't yet complete) - (define basenames (query-list* "select basename from page where wikiname = ? and progress < ?" + (define basenames (query-list* "select basename from page where wikiname = ? and progress < ? and redirect is null" wikiname max-page-progress)) ;; counter of complete/incomplete basenames (define already-done-count @@ -222,10 +241,41 @@ (query-exec* "update page set progress = 1 where wikiname = ? and basename = ?" 
wikiname basename) (callback i total-count basename)) + ;; save redirects as well + (save-redirects wikiname callback (+ already-done-count (length basenames)) total-count) ;; saved all pages, register that fact in the database (query-exec* "update wiki set progress = 2 where wikiname = ?" wikiname)) +;; 2.5. Download each redirect-target via API and save mapping in database +(define (save-redirects wikiname callback already-done-count total-count) + (define basenames (query-list* "select basename from page where wikiname = ? and progress < ? and redirect = 1" + wikiname max-page-progress)) + ;; loop through basenames, in slices of 50 (MediaWiki API max per request), and download + (for ([basename basenames] + [i (in-naturals (add1 already-done-count))]) + (define dest-url + (format "https://~a.fandom.com/api.php?~a" + wikiname + (params->query `(("action" . "query") + ("prop" . "links") + ("titles" . ,(basename->name-for-query basename)) + ("format" . "json") + ("formatversion" . "2"))))) + (define res (get dest-url)) + (define json (response-json res)) + (define dest-title (jp "/query/pages/0/links/0/title" json #f)) + (callback i total-count basename) + (cond + [dest-title + ;; store it + (define dest-basename (title->basename dest-title)) + (query-exec* "update page set progress = 1, redirect = ? where wikiname = ? and basename = ?" dest-basename wikiname basename)] + [else + ;; the page just doesn't exist + (query-exec* "delete from page where wikiname = ? and basename = ?" wikiname basename)]))) + + ;; 3. 
Download CSS and: ;; * Save CSS to file ;; * Record style images to database diff --git a/lib/archive-file-mappings.rkt b/lib/archive-file-mappings.rkt index ba013ab..03f97f5 100644 --- a/lib/archive-file-mappings.rkt +++ b/lib/archive-file-mappings.rkt @@ -7,6 +7,7 @@ local-encoded-url->segments url-segments->basename local-encoded-url->basename + title->basename basename->name-for-query url-segments->guess-title) @@ -21,6 +22,12 @@ (define (local-encoded-url->basename str) ; '("wiki" "Page_title"), no extension or dir prefix (url-segments->basename (local-encoded-url->segments str))) +(define (title->basename title) ; "Page title/Strategies" -> "Page_title#Strategies" filename encoded, no extension or dir prefi + (define elements (string-split (string-replace title " " "_") "/")) + (define extra-encoded (map (λ (s) (bytes->string/latin-1 (percent-encode s filename-set #f))) elements)) + (define basic-filename (string-join extra-encoded "#")) + basic-filename) + (define (basename->name-for-query str) (uri-decode (regexp-replace* #rx"#" str "/"))) diff --git a/src/extwiki-data.rkt b/src/extwiki-data.rkt index 38e8a29..5628078 100644 --- a/src/extwiki-data.rkt +++ b/src/extwiki-data.rkt @@ -377,7 +377,7 @@ '("runescape") 'default 'RuneScape "RuneScape Wiki" - "https://runescape.wiki/" + "https://runescape.wiki/w/Main_Page" "https://runescape.wiki/images/Wiki.png" (λ (props) `((p "The RuneScape Wiki was founded on April 8, 2005. In October 2018, the wiki left Fandom (then Wikia), citing their apathy towards the wiki and excessive advertisements.")))) @@ -386,7 +386,7 @@ '("oldschoolrunescape") 'default 'RuneScape "Old School RuneScape Wiki" - "https://oldschool.runescape.wiki/" + "https://oldschool.runescape.wiki/w/Main_Page" "https://oldschool.runescape.wiki/images/Wiki.png" (λ (props) `((p "The Old School RuneScape Wiki was founded on February 14, 2013. 
In October 2018, the RuneScape Wiki left Fandom (then Wikia), citing their apathy towards the wiki and excessive advertisements, with the Old School RuneScape Wiki following suit.")))) @@ -395,7 +395,7 @@ '("runescapeclassic") 'default 'RuneScape "RuneScape Classic Wiki" - "https://classic.runescape.wiki/" + "https://classic.runescape.wiki/w/Main_Page" "https://classic.runescape.wiki/images/Wiki.png" (λ (props) `((p "The Old School RuneScape Wiki was founded on April 19, 2009. In October 2018, the RuneScape Wiki left Fandom (then Wikia), citing their apathy towards the wiki and excessive advertisements, with the RuneScape Classic Wiki following suit.")))) diff --git a/src/page-global-search.rkt b/src/page-global-search.rkt index a7748d5..08bfd13 100644 --- a/src/page-global-search.rkt +++ b/src/page-global-search.rkt @@ -19,7 +19,7 @@ [(not wikiname) (response/output #:code 400 - #:mime-type "text/plain" + #:mime-type #"text/plain" (λ (out) (displayln "Requires wikiname and q parameters." out)))] [(or (not q) (equal? 
q "")) diff --git a/src/page-search.rkt b/src/page-search.rkt index 4fb76f5..193dbd6 100644 --- a/src/page-search.rkt +++ b/src/page-search.rkt @@ -53,7 +53,8 @@ ;; grab the part after ?q= which is the search terms (define query (dict-ref params 'q #f)) ;; figure out which search provider we're going to use - (define search-provider (hash-ref search-providers (config-get 'feature_offline::search))) + (define search-provider (hash-ref search-providers (config-get 'feature_offline::search) + (λ () (error 'search-provider "unknown search provider configured")))) ;; external special:search url to link at the bottom of the page as the upstream source (define external-search-url diff --git a/src/page-wiki-offline.rkt b/src/page-wiki-offline.rkt index 3783271..1d0323b 100644 --- a/src/page-wiki-offline.rkt +++ b/src/page-wiki-offline.rkt @@ -20,6 +20,7 @@ web-server/dispatchers/dispatch ; my libs "application-globals.rkt" + "../archiver/archiver-database.rkt" "config.rkt" "data.rkt" "log.rkt" @@ -40,6 +41,9 @@ (define path-archive (anytime-path ".." "storage/archive")) +(when (config-true? 'feature_offline::only) + (void (get-slc))) + (define (page-wiki-offline req) (response-handler (define wikiname (path/param-path (first (url-path (request-uri req))))) @@ -64,84 +68,94 @@ [else (error 'archive-format "unknown archive format configured")])) (define fs-path (build-path path-archive wikiname (format (car archive-format) maybe-hashed-basename))) (define source-url (format "https://~a.fandom.com/wiki/~a" wikiname (basename->name-for-query basename))) - (cond - [(not (file-exists? fs-path)) - (unless (config-true? 'feature_offline::only) - (next-dispatcher)) - (define mirror-path (url->string (request-uri req))) + (cond/var + + [(file-exists? fs-path) + (when (config-true? 
'debug) + (printf "using offline mode for ~v~n" fs-path)) + (response-handler + (define data (with-input-from-file fs-path (cdr archive-format))) + (define article-title (jp "/parse/title" data)) + (define original-page (html->xexp (preprocess-html-wiki (jp "/parse/text" data)))) + (define page ((query-selector (λ (t a c) (has-class? "mw-parser-output" a)) original-page))) + (define initial-head-data ((head-data-getter wikiname) data)) + (define head-data + (case theme + [(light dark) + (struct-copy head-data^ initial-head-data + [body-class (regexp-replace #rx"(theme-fandomdesktop-)(light|dark)" + (head-data^-body-class initial-head-data) + (format "\\1~a" theme))])] + [else initial-head-data])) (define body (generate-wiki-page - `(div (@ (class "unsaved-page")) - (style ".unsaved-page a { text-decoration: underline !important }") - (p "breezewiki.com doesn't have this page saved.") - (p "You can see this page by visiting a BreezeWiki mirror:") - (ul - (li (a (@ (href ,(format "https://antifandom.com~a" mirror-path))) "View on antifandom.com")) - (li (a (@ (href ,(format "https://bw.artemislena.eu~a" mirror-path))) "View on artemislena.eu")) - (li (a (@ (href ,source-url)) "or, you can see the original page on Fandom (ugh)"))) - (p "If you'd like " ,wikiname ".fandom.com to be added to breezewiki.com, " (a (@ (href "https://lists.sr.ht/~cadence/breezewiki-requests")) "let me know about it!"))) + (update-tree-wiki page wikiname) #:req req #:source-url source-url #:wikiname wikiname - #:title (url-segments->guess-title segments) + #:title article-title #:online-styles #f + #:head-data head-data #:siteinfo (siteinfo-fetch wikiname) )) + (define redirect-msg ((query-selector (attribute-selector 'class "redirectMsg") body))) + (define redirect-query-parameter (dict-ref (url-query (request-uri req)) 'redirect "yes")) + (define headers + (build-headers + always-headers + ; redirect-query-parameter: only the string "no" is significant: + ; 
https://github.com/Wikia/app/blob/fe60579a53f16816d65dad1644363160a63206a6/includes/Wiki.php#L367 + (when (and redirect-msg + (not (equal? redirect-query-parameter "no"))) + (let* ([dest (get-attribute 'href (bits->attributes ((query-selector (λ (t a c) (eq? t 'a)) redirect-msg))))] + [value (bytes-append #"0;url=" (string->bytes/utf-8 dest))]) + (header #"Refresh" value))))) (when (config-true? 'debug) ; used for its side effects ; convert to string with error checking, error will be raised if xexp is invalid (xexp->html body)) (response/output #:code 200 - #:headers always-headers + #:headers headers (λ (out) - (write-html body out)))] - [#t - (when (config-true? 'debug) - (printf "using offline mode for ~v~n" fs-path)) - (response-handler - (define data (with-input-from-file fs-path (cdr archive-format))) - (define article-title (jp "/parse/title" data)) - (define original-page (html->xexp (preprocess-html-wiki (jp "/parse/text" data)))) - (define page ((query-selector (λ (t a c) (has-class? 
"mw-parser-output" a)) original-page))) - (define initial-head-data ((head-data-getter wikiname) data)) - (define head-data - (case theme - [(light dark) - (struct-copy head-data^ initial-head-data - [body-class (regexp-replace #rx"(theme-fandomdesktop-)(light|dark)" - (head-data^-body-class initial-head-data) - (format "\\1~a" theme))])] - [else initial-head-data])) - (define body - (generate-wiki-page - (update-tree-wiki page wikiname) - #:req req - #:source-url source-url - #:wikiname wikiname - #:title article-title - #:online-styles #f - #:head-data head-data - #:siteinfo (siteinfo-fetch wikiname) - )) - (define redirect-msg ((query-selector (attribute-selector 'class "redirectMsg") body))) - (define redirect-query-parameter (dict-ref (url-query (request-uri req)) 'redirect "yes")) - (define headers - (build-headers - always-headers - ; redirect-query-parameter: only the string "no" is significant: - ; https://github.com/Wikia/app/blob/fe60579a53f16816d65dad1644363160a63206a6/includes/Wiki.php#L367 - (when (and redirect-msg - (not (equal? redirect-query-parameter "no"))) - (let* ([dest (get-attribute 'href (bits->attributes ((query-selector (λ (t a c) (eq? t 'a)) redirect-msg))))] - [value (bytes-append #"0;url=" (string->bytes/utf-8 dest))]) - (header #"Refresh" value))))) - (when (config-true? 'debug) - ; used for its side effects - ; convert to string with error checking, error will be raised if xexp is invalid - (xexp->html body)) - (response/output - #:code 200 - #:headers headers - (λ (out) - (write-html body out))))]))) + (write-html body out))))] + + ;; page not found on disk, perhaps it's a redirect? redirects are stored in the database + (var target (query-maybe-value* "select redirect from page where wikiname = ? and basename = ?" wikiname basename)) + [target + (generate-redirect (basename->name-for-query target))] + + ;; breezewiki doesn't have the page archived, see if we can make a network request for it + [(not (config-true? 
'feature_offline::only)) + (next-dispatcher)] + + ;; no possible way to provide the page + [else + (define mirror-path (url->string (request-uri req))) + (define body + (generate-wiki-page + `(div (@ (class "unsaved-page")) + (style ".unsaved-page a { text-decoration: underline !important }") + (p "breezewiki.com doesn't have this page saved.") + (p "You can see this page by visiting a BreezeWiki mirror:") + (ul + (li (a (@ (href ,(format "https://antifandom.com~a" mirror-path))) "View on antifandom.com")) + (li (a (@ (href ,(format "https://bw.artemislena.eu~a" mirror-path))) "View on artemislena.eu")) + (li (a (@ (href ,source-url)) "or, you can see the original page on Fandom (ugh)"))) + (p "If you'd like " ,wikiname ".fandom.com to be added to breezewiki.com, " (a (@ (href "https://lists.sr.ht/~cadence/breezewiki-requests")) "let me know about it!"))) + #:req req + #:source-url source-url + #:wikiname wikiname + #:title (url-segments->guess-title segments) + #:online-styles #f + #:siteinfo (siteinfo-fetch wikiname) + )) + (when (config-true? 
'debug) + ; used for its side effects + ; convert to string with error checking, error will be raised if xexp is invalid + (xexp->html body)) + (response/output + #:code 200 + #:headers always-headers + (λ (out) + (write-html body out)))]))) From 9c3125d6bec1de1b012d083c56e97607b5796072 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Wed, 13 Dec 2023 23:09:00 +1300 Subject: [PATCH 36/58] Minor improvements to archiver --- archiver/archiver-gui.rkt | 36 ++++++++++++++---------------------- archiver/archiver.rkt | 11 +++++++---- lib/mime.types | 2 ++ src/page-static-archive.rkt | 5 ++++- 4 files changed, 27 insertions(+), 27 deletions(-) diff --git a/archiver/archiver-gui.rkt b/archiver/archiver-gui.rkt index 6f09cb8..a5facae 100644 --- a/archiver/archiver-gui.rkt +++ b/archiver/archiver-gui.rkt @@ -90,27 +90,21 @@ (define/obs @input "") -(splicing-let ([frame-count 30]) +(splicing-let ([frame-count 20]) (define stickman-frames (for/vector ([s (in-range 0 1 (/ 1 frame-count))]) (running-stickman-icon s #:height status-icon-size - #:material (default-icon-material)))) + #:material (default-icon-material))))) - (define/obs @stick-frame-no 0) - (define stick-timer - (new timer% - [notify-callback (λ () (@stick-frame-no . <~ . add1))] - [interval (truncate (/ 1000 frame-count))])) - (define/obs @stick - (@stick-frame-no . ~> . (λ (n) (vector-ref stickman-frames - (modulo n (vector-length stickman-frames))))))) +(define (stick n) + (vector-ref stickman-frames (modulo n (vector-length stickman-frames)))) (define status-icons (hasheq 'queued (stop-icon #:color syntax-icon-color #:height status-icon-size) 'paused (continue-forward-icon #:color syntax-icon-color #:height status-icon-size) - 'running @stick + 'running (stick 0) 'error (x-icon #:height status-icon-size) 'complete (check-icon #:color color-green #:height status-icon-size))) @@ -155,7 +149,7 @@ #:mixin (λ (%) (class % (super-new) (obs-observe! @visible? (λ (visible?) 
(send this show visible?))))) (vpanel #:margin '(15 15) - (text "Encountered this error while downloading:") + (text (format "Encountered this error while downloading ~a:" (qi^-wikiname (obs-peek @qi)))) (input #:style '(multiple hscroll) #:min-size '(#f 200) (exn->string e)) @@ -259,15 +253,16 @@ (update-qi @qi [th #f] [st 'paused])) (define (do-reset-qi @qi) + (define reset-progress-to 0) (define th (qi^-th (obs-peek @qi))) (when th (kill-thread th)) - (update-qi @qi [th #f] [st 'queued] [stage 0] [progress 0] [max-progress 0]) - (query-exec* "update wiki set progress = 0 where wikiname = ?" (qi^-wikiname (obs-peek @qi)))) + (update-qi @qi [th #f] [st 'queued] [stage reset-progress-to] [progress 0] [max-progress 0]) + (query-exec* "update wiki set progress = ? where wikiname = ?" reset-progress-to (qi^-wikiname (obs-peek @qi)))) (define (do-try-unpause-next-entry) (define queue (obs-peek @queue)) - (define next-qi (for/first ([qi queue] - #:when (memq (qi^-st qi) '(paused queued error))) + (define next-qi (for/last ([qi queue] + #:when (memq (qi^-st qi) '(paused queued))) qi)) (when next-qi (define @qi (@queue . ~> . (λ (queue) (findf (λ (qi) (equal? (qi^-wikiname qi) (qi^-wikiname next-qi))) queue)))) @@ -281,7 +276,6 @@ #:mixin (λ (%) (class % (super-new) (define/augment (on-close) - (send stick-timer stop) (for ([qi (obs-peek @queue)]) (when (qi^-th qi) (kill-thread (qi^-th qi)))) @@ -307,7 +301,7 @@ (λ (k @qi) (define @status-icons (@> (case (qi^-st @qi) - [(running) @stick] + [(running) (stick (qi^-progress @qi))] [else (hash-ref status-icons (qi^-st @qi))]))) (define @is-running? (@> (memq (qi^-st @qi) '(running)))) @@ -325,10 +319,8 @@ (spacer) (hpanel #:stretch '(#f #f) - (if-view @is-complete? - (button (hash-ref action-icons 'reset) - (λ () (do-reset-qi @qi))) - (spacer)) + (button (hash-ref action-icons 'reset) + (λ () (do-reset-qi @qi))) (if-view @is-running? 
(button (hash-ref action-icons 'pause) (λ () (do-stop-qi @qi))) diff --git a/archiver/archiver.rkt b/archiver/archiver.rkt index 84fd16f..01f03ad 100644 --- a/archiver/archiver.rkt +++ b/archiver/archiver.rkt @@ -86,14 +86,17 @@ (string-contains? url "/drm_fonts/") (string-contains? url "//db.onlinewebfonts.com/") (string-contains? url "//bits.wikimedia.org/") + (string-contains? url "mygamercard.net/") (string-contains? url "dropbox") (string-contains? url "only=styles") (string-contains? url "https://https://") (regexp-match? #rx"^%20" url) - (regexp-match? #rx"^data:" url)))) + (regexp-match? #rx"^data:" url) + (regexp-match? #rx"^file:" url)))) (cond [(string-prefix? url "https://") url] [(string-prefix? url "http://") (regexp-replace #rx"http:" url "https:")] + [(string-prefix? url "httpshttps://") (regexp-replace #rx"httpshttps://" url "https://")] [(string-prefix? url "//") (string-append "https:" url)] [(string-prefix? url "/") (format "https://~a.fandom.com~a" wikiname url)] [else (raise-user-error "While calling check-style-for-images, this URL had an unknown format and couldn't be saved:" url path)]))) @@ -244,7 +247,7 @@ ;; save redirects as well (save-redirects wikiname callback (+ already-done-count (length basenames)) total-count) ;; saved all pages, register that fact in the database - (query-exec* "update wiki set progress = 2 where wikiname = ?" wikiname)) + (query-exec* "update wiki set progress = 2 where wikiname = ? and progress <= 2" wikiname)) ;; 2.5. Download each redirect-target via API and save mapping in database @@ -334,8 +337,8 @@ (define url (vector-ref row 0)) (define hash (vector-ref row 1)) ;; check - #; (printf "~a -> ~a~n" url hash) - (define r (get url)) + #;(printf "~a -> ~a~n" url hash) + (define r (get url #:timeouts (make-timeout-config #:connect 15))) (define declared-type (response-headers-ref r 'content-type)) (define final-type (if (equal? 
declared-type #"application/octet-stream") (let ([sniff-entity (message-entity (mime-analyze (response-body r)))]) diff --git a/lib/mime.types b/lib/mime.types index c06a1e9..4ae48d8 100644 --- a/lib/mime.types +++ b/lib/mime.types @@ -22,6 +22,7 @@ image/x-jng jng image/x-ms-bmp bmp image/svg+xml svg image/webp webp +image/avif avif application/font-woff2 woff2 application/acad woff2 @@ -31,6 +32,7 @@ font/woff woff application/x-font-ttf ttf application/x-font-truetype ttf application/x-truetype-font ttf +font/ttf ttf application/font-sfnt ttf font/sfnt ttf application/vnd.oasis.opendocument.formula-template otf diff --git a/src/page-static-archive.rkt b/src/page-static-archive.rkt index c0c2e09..501bda7 100644 --- a/src/page-static-archive.rkt +++ b/src/page-static-archive.rkt @@ -30,16 +30,19 @@ (string-contains? url "/drm_fonts/") (string-contains? url "//db.onlinewebfonts.com/") (string-contains? url "//bits.wikimedia.org/") + (string-contains? url "mygamercard.net/") (string-contains? url "dropbox") (string-contains? url "only=styles") (string-contains? url "https://https://") (regexp-match? #rx"^%20|^'" url) - (regexp-match? #rx"^\"?data:" url)) + (regexp-match? #rx"^\"?data:" url) + (regexp-match? #rx"^file:" url)) url (let* ([norm-url (cond [(string-prefix? url "https://") url] [(string-prefix? url "http://") (regexp-replace #rx"http:" url "https:")] + [(string-prefix? url "httpshttps://") (regexp-replace #rx"httpshttps://" url "https://")] [(string-prefix? url "//") (string-append "https:" url)] [(string-prefix? 
url "/") (format "https://~a.fandom.com~a" wikiname url)] [else (error 'replace-style-for-images "unknown URL format: ~a" url)])]) From 723bb92b0ae7a9ba3865b49828df0d48cea1dff4 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Fri, 15 Dec 2023 21:23:12 +1300 Subject: [PATCH 37/58] Archiver GUI super-reset button --- archiver/archiver-gui.rkt | 71 +++++++++++++++++++++++++++++++++------ 1 file changed, 61 insertions(+), 10 deletions(-) diff --git a/archiver/archiver-gui.rkt b/archiver/archiver-gui.rkt index a5facae..25f447f 100644 --- a/archiver/archiver-gui.rkt +++ b/archiver/archiver-gui.rkt @@ -2,11 +2,14 @@ (require racket/class racket/draw racket/format + racket/function racket/list + racket/math racket/port racket/set racket/splicing racket/string + (except-in pict text table) db net/http-easy memo @@ -65,7 +68,7 @@ (define/obs @auto-retry #f) -(define-struct qi^ (wikiname st stage progress max-progress eta th) #:transparent) ;; queue item +(define-struct qi^ (wikiname st stage progress max-progress ticks eta th) #:transparent) ;; queue item (define rows (query-rows* "select wikiname, progress from wiki where progress < 4")) (define/obs @queue null) @@ -74,7 +77,7 @@ (define already-exists? (findf (λ (qi) (equal? (qi^-wikiname qi) wikiname)) queue)) (if already-exists? queue - (append queue (list (qi^ wikiname st stage 0 1 "..." #f))))))) + (append queue (list (qi^ wikiname st stage 0 1 0 "..." #f))))))) (for ([row rows]) (add-wikiname-to-queue (vector-ref row 0) (if (= (vector-ref row 1) 4) @@ -88,7 +91,43 @@ (define color-green (make-color 90 212 68)) -(define/obs @input "") +(define (resize coords fraction) + (for/list ([coord (in-list coords)]) + (cons (* fraction (car coord)) + (* fraction (cdr coord))))) + +(define (flat-right-arrow #:height [height 32] #:color [color #f]) + ((if color + (curryr colorize color) + values) + (dc (λ (dc dx dy) + (send dc draw-polygon (resize + (list '(0 . 9) '(15 . 9) '(14 . 0) + '(31 . 15.5) + '(14 . 31) '(15 . 22) '(0 . 
22)) + (/ height 32)))) + height height))) + +(define (double-left-arrow-icon #:height [height 32]) + (define shift (/ height 48)) + (pict->bitmap + (scale-to-fit + (panorama + (pin-under + (bitmap + (left-over-arrow-icon #:color halt-icon-color #:height height + #:material rubber-icon-material)) + (- (* -20 shift) 2) (+ (* 6 shift) 1) + (bitmap + (bitmap-render-icon + (pict->bitmap + (rotate + (flat-right-arrow #:color (make-object color% 255 64 64) #:height (/ height 1.26)) + (* pi 1.23))))) + #;(rotate + (flat-right-arrow #:color (make-object color% 255 64 64) #:height (/ height 1.26)) + (* pi 1.23)))) + height height #:mode 'preserve/max))) (splicing-let ([frame-count 20]) (define stickman-frames @@ -111,7 +150,8 @@ (define action-icons (hasheq 'pause (pause-icon #:color syntax-icon-color #:height button-icon-size) 'resume (play-icon #:color color-green #:height button-icon-size) - 'reset (left-over-arrow-icon #:color halt-icon-color #:height button-icon-size))) + 'reset (left-over-arrow-icon #:color halt-icon-color #:height button-icon-size) + 'reseter (double-left-arrow-icon #:height button-icon-size))) (define (bitmap-view @the-bitmap [min-width 1]) (pict-canvas #:min-size (@> (list (max min-width (send @the-bitmap get-width)) (send @the-bitmap get-height))) #;(if min-size (list min-size min-size) #f) @@ -217,7 +257,9 @@ (define ((make-progress-updater @qi) a b c) ;; (printf "~a: ~a/~a ~a~n" (qi^-wikiname (obs-peek @qi)) a b c) - (update-qi @qi [progress a] [max-progress b])) + (update-qi @qi [progress a] [max-progress b] [ticks (add1 (qi^-ticks (obs-peek @qi)))])) + +(define/obs @input "") (define (do-add-to-queue) (define wikiname (string-trim (obs-peek @input))) @@ -259,6 +301,10 @@ (update-qi @qi [th #f] [st 'queued] [stage reset-progress-to] [progress 0] [max-progress 0]) (query-exec* "update wiki set progress = ? where wikiname = ?" 
reset-progress-to (qi^-wikiname (obs-peek @qi)))) +(define (do-reseter-qi @qi) + (do-reset-qi @qi) + (query-exec* "delete from page where wikiname = ?" (qi^-wikiname (obs-peek @qi)))) + (define (do-try-unpause-next-entry) (define queue (obs-peek @queue)) (define next-qi (for/last ([qi queue] @@ -301,7 +347,7 @@ (λ (k @qi) (define @status-icons (@> (case (qi^-st @qi) - [(running) (stick (qi^-progress @qi))] + [(running) (stick (qi^-ticks @qi))] [else (hash-ref status-icons (qi^-st @qi))]))) (define @is-running? (@> (memq (qi^-st @qi) '(running)))) @@ -319,13 +365,18 @@ (spacer) (hpanel #:stretch '(#f #f) - (button (hash-ref action-icons 'reset) - (λ () (do-reset-qi @qi))) + (if-view @is-running? (button (hash-ref action-icons 'pause) (λ () (do-stop-qi @qi))) - (button (hash-ref action-icons 'resume) - (λ () (do-start-qi @qi)))))) + (hpanel + #:stretch '(#f #f) + (button (hash-ref action-icons 'reseter) + (λ () (do-reseter-qi @qi))) + (button (hash-ref action-icons 'reset) + (λ () (do-reset-qi @qi))) + (button (hash-ref action-icons 'resume) + (λ () (do-start-qi @qi))))))) ;; progress bar (bottom half) (hpanel (canvas From 39423504689a0f7a0dfb8da22258e03072d1552d Mon Sep 17 00:00:00 2001 From: blankie Date: Sun, 31 Dec 2023 21:55:56 +1100 Subject: [PATCH 38/58] Fix error on a redirect page with no link https://lists.sr.ht/~cadence/breezewiki-discuss/%3CCY2G0E3G55N3.ANW2QREUS5SO%40nixnetmail.com%3E --- src/page-wiki-offline.rkt | 9 ++++++--- src/page-wiki.rkt | 9 ++++++--- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/page-wiki-offline.rkt b/src/page-wiki-offline.rkt index 1d0323b..3f818ee 100644 --- a/src/page-wiki-offline.rkt +++ b/src/page-wiki-offline.rkt @@ -98,16 +98,19 @@ #:head-data head-data #:siteinfo (siteinfo-fetch wikiname) )) - (define redirect-msg ((query-selector (attribute-selector 'class "redirectMsg") body))) (define redirect-query-parameter (dict-ref (url-query (request-uri req)) 'redirect "yes")) + (define redirect-msg 
((query-selector (attribute-selector 'class "redirectMsg") body))) + (define redirect-msg-a (if redirect-msg + ((query-selector (λ (t a c) (eq? t 'a)) redirect-msg)) + #f)) (define headers (build-headers always-headers ; redirect-query-parameter: only the string "no" is significant: ; https://github.com/Wikia/app/blob/fe60579a53f16816d65dad1644363160a63206a6/includes/Wiki.php#L367 - (when (and redirect-msg + (when (and redirect-msg-a (not (equal? redirect-query-parameter "no"))) - (let* ([dest (get-attribute 'href (bits->attributes ((query-selector (λ (t a c) (eq? t 'a)) redirect-msg))))] + (let* ([dest (get-attribute 'href (bits->attributes redirect-msg-a))] [value (bytes-append #"0;url=" (string->bytes/utf-8 dest))]) (header #"Refresh" value))))) (when (config-true? 'debug) diff --git a/src/page-wiki.rkt b/src/page-wiki.rkt index 8df701c..ae060d4 100644 --- a/src/page-wiki.rkt +++ b/src/page-wiki.rkt @@ -80,16 +80,19 @@ #:title title #:head-data head-data #:siteinfo siteinfo)) - (define redirect-msg ((query-selector (attribute-selector 'class "redirectMsg") body))) (define redirect-query-parameter (dict-ref (url-query (request-uri req)) 'redirect "yes")) + (define redirect-msg ((query-selector (attribute-selector 'class "redirectMsg") body))) + (define redirect-msg-a (if redirect-msg + ((query-selector (λ (t a c) (eq? t 'a)) redirect-msg)) + #f)) (define headers (build-headers always-headers ; redirect-query-parameter: only the string "no" is significant: ; https://github.com/Wikia/app/blob/fe60579a53f16816d65dad1644363160a63206a6/includes/Wiki.php#L367 - (when (and redirect-msg + (when (and redirect-msg-a (not (equal? redirect-query-parameter "no"))) - (let* ([dest (get-attribute 'href (bits->attributes ((query-selector (λ (t a c) (eq? t 'a)) redirect-msg))))] + (let* ([dest (get-attribute 'href (bits->attributes redirect-msg-a))] [value (bytes-append #"0;url=" (string->bytes/utf-8 dest))]) (header #"Refresh" value))))) (when (config-true? 
'debug) From 0fed724604b95cdf2e0227f3e788aa341b30b387 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Tue, 9 Jan 2024 17:13:04 +1300 Subject: [PATCH 39/58] Fix redirects to category pages in offline mode --- src/page-wiki-offline.rkt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/page-wiki-offline.rkt b/src/page-wiki-offline.rkt index 3f818ee..2b52373 100644 --- a/src/page-wiki-offline.rkt +++ b/src/page-wiki-offline.rkt @@ -126,7 +126,8 @@ ;; page not found on disk, perhaps it's a redirect? redirects are stored in the database (var target (query-maybe-value* "select redirect from page where wikiname = ? and basename = ?" wikiname basename)) [target - (generate-redirect (basename->name-for-query target))] + ; don't url decode the target, or Category: pages will be interpreted as a protocol + (generate-redirect (regexp-replace* #rx"#" target "/"))] ;; breezewiki doesn't have the page archived, see if we can make a network request for it [(not (config-true? 'feature_offline::only)) From b8a6c5198d1a4d6e4907e1346033a054912fcfcf Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Tue, 9 Jan 2024 17:35:26 +1300 Subject: [PATCH 40/58] Fix more redirects in offline mode --- src/page-wiki-offline.rkt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/page-wiki-offline.rkt b/src/page-wiki-offline.rkt index 2b52373..906718d 100644 --- a/src/page-wiki-offline.rkt +++ b/src/page-wiki-offline.rkt @@ -127,7 +127,7 @@ (var target (query-maybe-value* "select redirect from page where wikiname = ? and basename = ?" wikiname basename)) [target ; don't url decode the target, or Category: pages will be interpreted as a protocol - (generate-redirect (regexp-replace* #rx"#" target "/"))] + (generate-redirect (format "/~a/wiki/~a" wikiname (regexp-replace* #rx"#" target "/")))] ;; breezewiki doesn't have the page archived, see if we can make a network request for it [(not (config-true? 
'feature_offline::only)) From b02e2a405329ec56c5ce8fd357fd60ff11332033 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Wed, 10 Jan 2024 11:21:42 +1300 Subject: [PATCH 41/58] Fix failing test after search was refactored --- src/page-search.rkt | 4 ++-- src/search-provider-fandom.rkt | 14 ++++++++------ src/search-provider-solr.rkt | 4 ++-- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/page-search.rkt b/src/page-search.rkt index 193dbd6..019ebfe 100644 --- a/src/page-search.rkt +++ b/src/page-search.rkt @@ -25,8 +25,8 @@ page-search) (define search-providers - (hash "fandom" generate-results-content-fandom - "solr" generate-results-content-solr)) + (hash "fandom" search-fandom + "solr" search-solr)) ;; this takes the info we gathered from fandom and makes the big fat x-expression page (define (generate-results-page req source-url wikiname query results-content #:siteinfo [siteinfo #f]) diff --git a/src/search-provider-fandom.rkt b/src/search-provider-fandom.rkt index 945b111..2338c13 100644 --- a/src/search-provider-fandom.rkt +++ b/src/search-provider-fandom.rkt @@ -8,15 +8,15 @@ "../lib/xexpr-utils.rkt") (provide - generate-results-content-fandom) + search-fandom) (module+ test (require rackunit "test-utils.rkt") - (define search-json-data - '#hasheq((batchcomplete . #t) (query . #hasheq((search . (#hasheq((ns . 0) (pageid . 219) (size . 1482) (snippet . "") (timestamp . "2022-08-21T08:54:23Z") (title . "Gacha Capsule") (wordcount . 214)) #hasheq((ns . 0) (pageid . 201) (size . 1198) (snippet . "") (timestamp . "2022-07-11T17:52:47Z") (title . "Badges") (wordcount . 181))))))))) + (define search-results-data + '(#hasheq((ns . 0) (pageid . 219) (size . 1482) (snippet . "") (timestamp . "2022-08-21T08:54:23Z") (title . "Gacha Capsule") (wordcount . 214)) #hasheq((ns . 0) (pageid . 201) (size . 1198) (snippet . "") (timestamp . "2022-07-11T17:52:47Z") (title . "Badges") (wordcount . 
181))))) -(define (generate-results-content-fandom wikiname query params) +(define (search-fandom wikiname query params) ;; constructing the URL where I want to get fandom data from... (define origin (format "https://~a.fandom.com" wikiname)) ;; the dest-URL will look something like https://minecraft.fandom.com/api.php?action=query&list=search&srsearch=Spawner&formatversion=2&format=json @@ -33,8 +33,10 @@ (define res (easy:get dest-url #:timeouts timeouts)) (define json (easy:response-json res)) (define search-results (jp "/query/search" json)) + (generate-results-content-fandom wikiname query search-results)) - ;; generate content for display in the wiki page layout +;;; generate content for display in the wiki page layout +(define (generate-results-content-fandom wikiname query search-results) `(div (@ (class "mw-parser-output")) ;; header before the search results showing how many we found (p ,(format "~a results found for " (length search-results)) @@ -60,4 +62,4 @@ (module+ test (parameterize ([(config-parameter 'feature_offline::only) "false"]) (check-not-false ((query-selector (attribute-selector 'href "/test/wiki/Gacha_Capsule") - (generate-results-content-fandom test-req "" "test" "Gacha" search-json-data)))))) + (generate-results-content-fandom "test" "Gacha" search-results-data)))))) diff --git a/src/search-provider-solr.rkt b/src/search-provider-solr.rkt index ed9fb7b..1ec48e2 100644 --- a/src/search-provider-solr.rkt +++ b/src/search-provider-solr.rkt @@ -9,11 +9,11 @@ "../lib/xexpr-utils.rkt") (provide - generate-results-content-solr) + search-solr) (struct result^ (hl-title hl-body kb words page-path) #:transparent) -(define (generate-results-content-solr wikiname query params) +(define (search-solr wikiname query params) ;; grab things from params that would modify the search (define op (if (equal? (dict-ref params 'op #f) "or") '("or" . "OR") '("and" . "AND"))) (define sort (if (equal? (dict-ref params 'sort #f) "len") '("len" . 
"len desc") '("relevance" . "score desc"))) From a52d131b936037fdd7847bf8d58c1662227101ad Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Thu, 11 Jan 2024 22:33:59 +1300 Subject: [PATCH 42/58] Split massive uploads in Solr indexer --- archiver/fts.rkt | 43 ++++++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/archiver/fts.rkt b/archiver/fts.rkt index c2f597b..6a00041 100644 --- a/archiver/fts.rkt +++ b/archiver/fts.rkt @@ -8,6 +8,7 @@ racket/promise racket/port racket/runtime-path + racket/sequence racket/string file/gunzip db @@ -156,8 +157,16 @@ (define data (cond [(and (read-from-cache?) (file-exists? "cache.rkt")) - (displayln "Reading in...") - (with-input-from-file "cache.rkt" (λ () (read)))] + (define size (file-size "cache.rkt")) + (call-with-input-file "cache.rkt" + (λ (in) + (define quit (make-progress (λ () (progress^ (ceiling (/ (file-position in) 64 1024)) + (ceiling (/ size 64 1024)) + "Reading in...")) + 2)) + (begin0 + (read in) + (quit))))] [else (define x (box (progress^ 0 1 "..."))) (define quit (make-progress (λ () (unbox x)))) @@ -183,18 +192,22 @@ (display "Converting... ") (flush-output) - (define ser (jsexpr->bytes data)) - (define ser-port (open-input-bytes ser)) - (define quit (make-progress (λ () (progress^ (ceiling (/ (file-position ser-port) 64 1024)) - (ceiling (/ (bytes-length ser) 64 1024)) - "Posting...")) - 2)) - (define res - (post (format "http://localhost:8983/solr/~a/update?commit=true" wikiname) - #:data ser-port - #:headers '#hasheq((Content-Type . 
"application/json")) - #:timeouts (make-timeout-config #:lease 5 #:connect 5 #:request 300))) - (quit) - (displayln (response-status-line res))) + (define slice-size 30000) + (define slices (ceiling (/ (length data) slice-size))) + (for ([slice (in-slice slice-size data)] + [i (in-naturals 1)]) + (define ser (jsexpr->bytes slice)) + (define ser-port (open-input-bytes ser)) + (define quit (make-progress (λ () (progress^ (ceiling (/ (file-position ser-port) 64 1024)) + (ceiling (/ (bytes-length ser) 64 1024)) + (format "Posting... (~a/~a)" i slices))) + 2)) + (define res + (post (format "http://localhost:8983/solr/~a/update?commit=true" wikiname) + #:data ser-port + #:headers '#hasheq((Content-Type . "application/json")) + #:timeouts (make-timeout-config #:lease 5 #:connect 5 #:request 300))) + (quit) + (displayln (response-status-line res)))) (run start) From 6260ba809bdd2321fa59c6133f749f2db0d23e17 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Wed, 1 May 2024 00:53:09 +1200 Subject: [PATCH 43/58] Fix running out of file descriptors --- lib/thread-utils.rkt | 3 +-- src/application-globals.rkt | 3 --- src/data.rkt | 20 +++++++------- src/dispatcher-tree.rkt | 21 +++++++++++++-- src/fandom-request.rkt | 48 ++++++++++++++++++++++++++++++++++ src/page-category.rkt | 42 +++++++++++++---------------- src/page-file.rkt | 23 +++++++--------- src/page-search.rkt | 2 -- src/page-wiki.rkt | 34 +++++++++++++----------- src/search-provider-fandom.rkt | 24 +++++++---------- src/search-provider-solr.rkt | 3 +-- src/whole-utils.rkt | 11 -------- 12 files changed, 134 insertions(+), 100 deletions(-) create mode 100644 src/fandom-request.rkt delete mode 100644 src/whole-utils.rkt diff --git a/lib/thread-utils.rkt b/lib/thread-utils.rkt index f907dac..66e2b4c 100644 --- a/lib/thread-utils.rkt +++ b/lib/thread-utils.rkt @@ -1,6 +1,5 @@ #lang racket/base -(require (prefix-in easy: net/http-easy) - "../src/data.rkt" +(require "../src/data.rkt" "xexpr-utils.rkt") (provide diff --git 
a/src/application-globals.rkt b/src/application-globals.rkt index de60820..fb8f118 100644 --- a/src/application-globals.rkt +++ b/src/application-globals.rkt @@ -22,8 +22,6 @@ (provide ; headers to always send on all http responses always-headers - ; timeout durations for http-easy requests - timeouts ; generates a consistent footer application-footer ; generates a consistent template for wiki page content to sit in @@ -39,7 +37,6 @@ (define always-headers (list (header #"Referrer-Policy" #"same-origin") ; header to not send referers to fandom (header #"Link" (string->bytes/latin-1 link-header)))) -(define timeouts (easy:make-timeout-config #:lease 5 #:connect 5)) (define-runtime-path path-static "../static") (define theme-icons diff --git a/src/data.rkt b/src/data.rkt index b22e8a0..6975b37 100644 --- a/src/data.rkt +++ b/src/data.rkt @@ -7,8 +7,8 @@ (prefix-in easy: net/http-easy) db memo + "fandom-request.rkt" "static-data.rkt" - "whole-utils.rkt" "../lib/url-utils.rkt" "../lib/xexpr-utils.rkt" "../archiver/archiver-database.rkt" @@ -54,16 +54,14 @@ (vector-ref row 3))) siteinfo-default)] [else - (define dest-url - (format "https://~a.fandom.com/api.php?~a" - wikiname - (params->query '(("action" . "query") - ("meta" . "siteinfo") - ("siprop" . "general|rightsinfo") - ("format" . "json") - ("formatversion" . "2"))))) - (log-outgoing dest-url) - (define res (easy:get dest-url)) + (define res + (fandom-get-api + wikiname + (params->query '(("action" . "query") + ("meta" . "siteinfo") + ("siprop" . "general|rightsinfo") + ("format" . "json") + ("formatversion" . 
"2"))))) (define data (easy:response-json res)) (siteinfo^ (jp "/query/general/sitename" data) (second (regexp-match #rx"/wiki/(.*)" (jp "/query/general/base" data))) diff --git a/src/dispatcher-tree.rkt b/src/dispatcher-tree.rkt index 48e8ebb..0212242 100644 --- a/src/dispatcher-tree.rkt +++ b/src/dispatcher-tree.rkt @@ -33,12 +33,29 @@ ; don't forget that I'm returning *code* - return a call to the function (datum->syntax stx `(make-dispatcher-tree ,ds))) +; guard that the page returned a response, otherwise print more detailed debugging information +(define-syntax-rule (page ds name) + (λ (req) + (define dispatcher (hash-ref ds (quote name))) + (define page-response (dispatcher req)) + (if (response? page-response) + page-response + (response/output + #:code 500 + #:mime-type #"text/plain" + (λ (out) + (for ([port (list (current-error-port) out)]) + (parameterize ([current-output-port port]) + (printf "error in ~a:~n expected page to return a response~n actually returned: ~v~n" + (quote name) + page-response)))))))) + (define (make-dispatcher-tree ds) (define subdomain-dispatcher (hash-ref ds 'subdomain-dispatcher)) (define tree (sequencer:make subdomain-dispatcher - (pathprocedure:make "/" (hash-ref ds 'page-home)) + (pathprocedure:make "/" (page ds page-home)) (pathprocedure:make "/proxy" (hash-ref ds 'page-proxy)) (pathprocedure:make "/search" (hash-ref ds 'page-global-search)) (pathprocedure:make "/set-user-settings" (hash-ref ds 'page-set-user-settings)) @@ -48,7 +65,7 @@ (if (config-true? 
'feature_offline::enabled) (filter:make (pregexp (format "^/~a/wiki/.+$" px-wikiname)) (lift:make (hash-ref ds 'page-wiki-offline))) (λ (_conn _req) (next-dispatcher))) - (filter:make (pregexp (format "^/~a/wiki/.+$" px-wikiname)) (lift:make (hash-ref ds 'page-wiki))) + (filter:make (pregexp (format "^/~a/wiki/.+$" px-wikiname)) (lift:make (page ds page-wiki))) (filter:make (pregexp (format "^/~a/search$" px-wikiname)) (lift:make (hash-ref ds 'page-search))) (filter:make (pregexp (format "^/~a(/(wiki(/)?)?)?$" px-wikiname)) (lift:make (hash-ref ds 'redirect-wiki-home))) (if (config-true? 'feature_offline::enabled) diff --git a/src/fandom-request.rkt b/src/fandom-request.rkt new file mode 100644 index 0000000..966eeee --- /dev/null +++ b/src/fandom-request.rkt @@ -0,0 +1,48 @@ +#lang typed/racket/base +(require "config.rkt" + "../lib/url-utils.rkt") +(define-type Headers (HashTable Symbol (U Bytes String))) +(require/typed net/http-easy + [#:opaque Timeout-Config timeout-config?] + [#:opaque Response response?] + [#:opaque Session session?] + [current-session (Parameter Session)] + [make-timeout-config ([#:lease Positive-Real] [#:connect Positive-Real] -> Timeout-Config)] + [get ((U Bytes String) + [#:close? 
Boolean] + [#:headers Headers] + [#:timeouts Timeout-Config] + [#:max-attempts Exact-Positive-Integer] + [#:max-redirects Exact-Nonnegative-Integer] + [#:user-agent (U Bytes String)] + -> Response)]) + +(provide + fandom-get + fandom-get-api + timeouts) + +(define timeouts (make-timeout-config #:lease 5 #:connect 5)) + +(: no-headers Headers) +(define no-headers '#hasheq()) + +(: fandom-get (String String [#:headers (Option Headers)] -> Response)) +(define (fandom-get wikiname path #:headers [headers #f]) + (define dest-url (string-append "https://www.fandom.com" path)) + (define host (string-append wikiname ".fandom.com")) + (log-outgoing wikiname path) + (get dest-url + #:timeouts timeouts + #:headers (hash-set (or headers no-headers) 'Host host))) + +(: fandom-get-api (String (Listof (Pair String String)) [#:headers (Option Headers)] -> Response)) +(define (fandom-get-api wikiname params #:headers [headers #f]) + (fandom-get wikiname + (string-append "/api.php?" (params->query params)) + #:headers headers)) + +(: log-outgoing (String String -> Void)) +(define (log-outgoing wikiname path) + (when (config-true? 'log_outgoing) + (printf "out: ~a ~a~n" wikiname path))) diff --git a/src/page-category.rkt b/src/page-category.rkt index 213d423..e1fe659 100644 --- a/src/page-category.rkt +++ b/src/page-category.rkt @@ -15,11 +15,11 @@ "application-globals.rkt" "config.rkt" "data.rkt" + "fandom-request.rkt" "page-wiki.rkt" "../lib/syntax.rkt" "../lib/thread-utils.rkt" "../lib/url-utils.rkt" - "whole-utils.rkt" "../lib/xexpr-utils.rkt") (provide @@ -73,30 +73,24 @@ (define-values (members-data page-data siteinfo) (thread-values (λ () - (define dest-url - (format "~a/api.php?~a" - origin - (params->query `(("action" . "query") - ("list" . "categorymembers") - ("cmtitle" . ,prefixed-category) - ("cmlimit" . "max") - ("formatversion" . "2") - ("format" . 
"json"))))) - (log-outgoing dest-url) - (define dest-res (easy:get dest-url #:timeouts timeouts)) - (easy:response-json dest-res)) + (easy:response-json + (fandom-get-api + wikiname + `(("action" . "query") + ("list" . "categorymembers") + ("cmtitle" . ,prefixed-category) + ("cmlimit" . "max") + ("formatversion" . "2") + ("format" . "json"))))) (λ () - (define dest-url - (format "~a/api.php?~a" - origin - (params->query `(("action" . "parse") - ("page" . ,prefixed-category) - ("prop" . "text|headhtml|langlinks") - ("formatversion" . "2") - ("format" . "json"))))) - (log-outgoing dest-url) - (define dest-res (easy:get dest-url #:timeouts timeouts)) - (easy:response-json dest-res)) + (easy:response-json + (fandom-get-api + wikiname + `(("action" . "parse") + ("page" . ,prefixed-category) + ("prop" . "text|headhtml|langlinks") + ("formatversion" . "2") + ("format" . "json"))))) (λ () (siteinfo-fetch wikiname)))) diff --git a/src/page-file.rkt b/src/page-file.rkt index 2a7332c..5151f1d 100644 --- a/src/page-file.rkt +++ b/src/page-file.rkt @@ -15,11 +15,11 @@ "application-globals.rkt" "config.rkt" "data.rkt" + "fandom-request.rkt" "page-wiki.rkt" "../lib/syntax.rkt" "../lib/thread-utils.rkt" "../lib/url-utils.rkt" - "whole-utils.rkt" "../lib/xexpr-utils.rkt") (provide page-file) @@ -40,8 +40,7 @@ (imageDescription . 
#f)))) (define (url-content-type url) - (log-outgoing url) - (define dest-res (easy:head url #:timeouts timeouts)) + (define dest-res (easy:head url)) (easy:response-headers-ref dest-res 'content-type)) (define (get-media-html url content-type) @@ -106,20 +105,18 @@ (response-handler (define wikiname (path/param-path (first (url-path (request-uri req))))) (define prefixed-title (path/param-path (caddr (url-path (request-uri req))))) - (define origin (format "https://~a.fandom.com" wikiname)) - (define source-url (format "~a/wiki/~a" origin prefixed-title)) + (define source-url (format "https://~a.fandom.com/wiki/~a" wikiname prefixed-title)) (define-values (media-detail siteinfo) (thread-values (λ () - (define dest-url - (format "~a/wikia.php?~a" - origin - (params->query `(("format" . "json") ("controller" . "Lightbox") - ("method" . "getMediaDetail") - ("fileTitle" . ,prefixed-title))))) - (log-outgoing dest-url) - (define dest-res (easy:get dest-url #:timeouts timeouts)) + (define dest-res + (fandom-get + wikiname + (format "/wikia.php?~a" + (params->query `(("format" . "json") ("controller" . "Lightbox") + ("method" . "getMediaDetail") + ("fileTitle" . 
,prefixed-title)))))) (easy:response-json dest-res)) (λ () (siteinfo-fetch wikiname)))) diff --git a/src/page-search.rkt b/src/page-search.rkt index 019ebfe..39f361a 100644 --- a/src/page-search.rkt +++ b/src/page-search.rkt @@ -2,7 +2,6 @@ (require racket/dict racket/list racket/string - (prefix-in easy: net/http-easy) ; html libs html-writing ; web server libs @@ -18,7 +17,6 @@ "../lib/syntax.rkt" "../lib/thread-utils.rkt" "../lib/url-utils.rkt" - "whole-utils.rkt" "../lib/xexpr-utils.rkt") (provide diff --git a/src/page-wiki.rkt b/src/page-wiki.rkt index ae060d4..f16792c 100644 --- a/src/page-wiki.rkt +++ b/src/page-wiki.rkt @@ -17,12 +17,12 @@ "application-globals.rkt" "config.rkt" "data.rkt" + "fandom-request.rkt" "../lib/pure-utils.rkt" "../lib/syntax.rkt" "../lib/thread-utils.rkt" "../lib/tree-updater.rkt" "../lib/url-utils.rkt" - "whole-utils.rkt" "../lib/xexpr-utils.rkt") (provide @@ -38,25 +38,20 @@ (define (page-wiki req) (define wikiname (path/param-path (first (url-path (request-uri req))))) (define user-cookies (user-cookies-getter req)) - (define origin (format "https://~a.fandom.com" wikiname)) (define path (string-join (map path/param-path (cddr (url-path (request-uri req)))) "/")) (define source-url (format "https://~a.fandom.com/wiki/~a" wikiname path)) (define-values (dest-res siteinfo) (thread-values (λ () - (define dest-url - (format "~a/api.php?~a" - origin - (params->query `(("action" . "parse") - ("page" . ,path) - ("prop" . "text|headhtml|langlinks") - ("formatversion" . "2") - ("format" . "json"))))) - (log-outgoing dest-url) - (easy:get dest-url - #:timeouts timeouts - #:headers `#hasheq((cookie . ,(format "theme=~a" (user-cookies^-theme user-cookies)))))) + (fandom-get-api + wikiname + `(("action" . "parse") + ("page" . ,path) + ("prop" . "text|headhtml|langlinks") + ("formatversion" . "2") + ("format" . "json")) + #:headers `#hasheq((cookie . 
,(format "theme=~a" (user-cookies^-theme user-cookies)))))) (λ () (siteinfo-fetch wikiname)))) @@ -103,4 +98,13 @@ #:code 200 #:headers headers (λ (out) - (write-html body out))))))])) + (write-html body out))))))] + [(eq? 404 (easy:response-status-code dest-res)) + (next-dispatcher)] + [else + (response-handler + (error 'page-wiki "Tried to load page ~a/~v~nSadly, the page didn't load because Fandom returned status code ~a with response:~n~a" + wikiname + path + (easy:response-status-code dest-res) + (easy:response-body dest-res)))])) diff --git a/src/search-provider-fandom.rkt b/src/search-provider-fandom.rkt index 2338c13..b8dd48f 100644 --- a/src/search-provider-fandom.rkt +++ b/src/search-provider-fandom.rkt @@ -3,8 +3,8 @@ (prefix-in easy: net/http-easy) "application-globals.rkt" "config.rkt" + "fandom-request.rkt" "../lib/url-utils.rkt" - "whole-utils.rkt" "../lib/xexpr-utils.rkt") (provide @@ -17,20 +17,14 @@ '(#hasheq((ns . 0) (pageid . 219) (size . 1482) (snippet . "") (timestamp . "2022-08-21T08:54:23Z") (title . "Gacha Capsule") (wordcount . 214)) #hasheq((ns . 0) (pageid . 201) (size . 1198) (snippet . "") (timestamp . "2022-07-11T17:52:47Z") (title . "Badges") (wordcount . 181))))) (define (search-fandom wikiname query params) - ;; constructing the URL where I want to get fandom data from... - (define origin (format "https://~a.fandom.com" wikiname)) - ;; the dest-URL will look something like https://minecraft.fandom.com/api.php?action=query&list=search&srsearch=Spawner&formatversion=2&format=json - (define dest-url - (format "~a/api.php?~a" - origin - (params->query `(("action" . "query") - ("list" . "search") - ("srsearch" . ,query) - ("formatversion" . "2") - ("format" . "json"))))) - ;; HTTP request to dest-url for search results - (log-outgoing dest-url) - (define res (easy:get dest-url #:timeouts timeouts)) + (define res + (fandom-get-api + wikiname + `(("action" . "query") + ("list" . "search") + ("srsearch" . ,query) + ("formatversion" . 
"2") + ("format" . "json")))) (define json (easy:response-json res)) (define search-results (jp "/query/search" json)) (generate-results-content-fandom wikiname query search-results)) diff --git a/src/search-provider-solr.rkt b/src/search-provider-solr.rkt index 1ec48e2..31813da 100644 --- a/src/search-provider-solr.rkt +++ b/src/search-provider-solr.rkt @@ -5,7 +5,6 @@ "application-globals.rkt" "../lib/html-parsing/main.rkt" "../lib/url-utils.rkt" - "whole-utils.rkt" "../lib/xexpr-utils.rkt") (provide @@ -37,7 +36,7 @@ ("sort" . ,(cdr sort)))))) ;; HTTP request to dest-url for search results (log-outgoing dest-url) - (define res (easy:get dest-url #:timeouts timeouts)) + (define res (easy:get dest-url #:timeouts (easy:make-timeout-config #:lease 5 #:connect 5))) (define json (easy:response-json res)) ;; build result objects diff --git a/src/whole-utils.rkt b/src/whole-utils.rkt deleted file mode 100644 index 7118866..0000000 --- a/src/whole-utils.rkt +++ /dev/null @@ -1,11 +0,0 @@ -#lang typed/racket/base -(require "config.rkt") - -(provide - ; prints "out: " - log-outgoing) - -(: log-outgoing (String -> Void)) -(define (log-outgoing url-string) - (when (config-true? 
'log_outgoing) - (printf "out: ~a~n" url-string))) From 7dff049ece876d1d31d4537c2176629cabdd7af0 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Wed, 1 May 2024 00:57:13 +1200 Subject: [PATCH 44/58] Wrap all pages in response safety checker --- src/dispatcher-tree.rkt | 20 ++++++++++---------- src/search-provider-solr.rkt | 1 - 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/dispatcher-tree.rkt b/src/dispatcher-tree.rkt index 0212242..a967095 100644 --- a/src/dispatcher-tree.rkt +++ b/src/dispatcher-tree.rkt @@ -56,20 +56,20 @@ (sequencer:make subdomain-dispatcher (pathprocedure:make "/" (page ds page-home)) - (pathprocedure:make "/proxy" (hash-ref ds 'page-proxy)) - (pathprocedure:make "/search" (hash-ref ds 'page-global-search)) - (pathprocedure:make "/set-user-settings" (hash-ref ds 'page-set-user-settings)) - (pathprocedure:make "/buddyfight/wiki/It_Doesn't_Work!!" (hash-ref ds 'page-it-works)) - (filter:make (pregexp (format "^/~a/wiki/Category:.+$" px-wikiname)) (lift:make (hash-ref ds 'page-category))) - (filter:make (pregexp (format "^/~a/wiki/File:.+$" px-wikiname)) (lift:make (hash-ref ds 'page-file))) + (pathprocedure:make "/proxy" (page ds page-proxy)) + (pathprocedure:make "/search" (page ds page-global-search)) + (pathprocedure:make "/set-user-settings" (page ds page-set-user-settings)) + (pathprocedure:make "/buddyfight/wiki/It_Doesn't_Work!!" (page ds page-it-works)) + (filter:make (pregexp (format "^/~a/wiki/Category:.+$" px-wikiname)) (lift:make (page ds page-category))) + (filter:make (pregexp (format "^/~a/wiki/File:.+$" px-wikiname)) (lift:make (page ds page-file))) (if (config-true? 
'feature_offline::enabled) - (filter:make (pregexp (format "^/~a/wiki/.+$" px-wikiname)) (lift:make (hash-ref ds 'page-wiki-offline))) + (filter:make (pregexp (format "^/~a/wiki/.+$" px-wikiname)) (lift:make (page ds page-wiki-offline))) (λ (_conn _req) (next-dispatcher))) (filter:make (pregexp (format "^/~a/wiki/.+$" px-wikiname)) (lift:make (page ds page-wiki))) - (filter:make (pregexp (format "^/~a/search$" px-wikiname)) (lift:make (hash-ref ds 'page-search))) - (filter:make (pregexp (format "^/~a(/(wiki(/)?)?)?$" px-wikiname)) (lift:make (hash-ref ds 'redirect-wiki-home))) + (filter:make (pregexp (format "^/~a/search$" px-wikiname)) (lift:make (page ds page-search))) + (filter:make (pregexp (format "^/~a(/(wiki(/)?)?)?$" px-wikiname)) (lift:make (page ds redirect-wiki-home))) (if (config-true? 'feature_offline::enabled) - (filter:make (pregexp (format "^/archive/~a/(styles|images)/.+$" px-wikiname)) (lift:make (hash-ref ds 'page-static-archive))) + (filter:make (pregexp (format "^/archive/~a/(styles|images)/.+$" px-wikiname)) (lift:make (page ds page-static-archive))) (λ (_conn _req) (next-dispatcher))) (hash-ref ds 'static-dispatcher) (lift:make (hash-ref ds 'page-not-found)))) diff --git a/src/search-provider-solr.rkt b/src/search-provider-solr.rkt index 31813da..c15e31f 100644 --- a/src/search-provider-solr.rkt +++ b/src/search-provider-solr.rkt @@ -35,7 +35,6 @@ ("hl.tag.post" . "") ("sort" . 
,(cdr sort)))))) ;; HTTP request to dest-url for search results - (log-outgoing dest-url) (define res (easy:get dest-url #:timeouts (easy:make-timeout-config #:lease 5 #:connect 5))) (define json (easy:response-json res)) From d2765c2a78a5413d4ab9e7f3e7676521f0372fa6 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Thu, 2 May 2024 00:01:32 +1200 Subject: [PATCH 45/58] Fix duplicate params->query --- src/data.rkt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/data.rkt b/src/data.rkt index 6975b37..9fd0774 100644 --- a/src/data.rkt +++ b/src/data.rkt @@ -57,11 +57,11 @@ (define res (fandom-get-api wikiname - (params->query '(("action" . "query") - ("meta" . "siteinfo") - ("siprop" . "general|rightsinfo") - ("format" . "json") - ("formatversion" . "2"))))) + '(("action" . "query") + ("meta" . "siteinfo") + ("siprop" . "general|rightsinfo") + ("format" . "json") + ("formatversion" . "2")))) (define data (easy:response-json res)) (siteinfo^ (jp "/query/general/sitename" data) (second (regexp-match #rx"/wiki/(.*)" (jp "/query/general/base" data))) From 0fd0efc3f2371e25ba8ff5e8725b179428ed7c29 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Sat, 4 May 2024 18:01:50 +1200 Subject: [PATCH 46/58] Use default siteinfo when online wiki not found --- src/data.rkt | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/data.rkt b/src/data.rkt index 9fd0774..5aba2c2 100644 --- a/src/data.rkt +++ b/src/data.rkt @@ -62,11 +62,13 @@ ("siprop" . "general|rightsinfo") ("format" . "json") ("formatversion" . 
"2")))) - (define data (easy:response-json res)) - (siteinfo^ (jp "/query/general/sitename" data) - (second (regexp-match #rx"/wiki/(.*)" (jp "/query/general/base" data))) - (license^ (jp "/query/rightsinfo/text" data) - (jp "/query/rightsinfo/url" data)))])) + (cond [(= (easy:response-status-code res) 200) + (define data (easy:response-json res)) + (siteinfo^ (jp "/query/general/sitename" data) + (second (regexp-match #rx"/wiki/(.*)" (jp "/query/general/base" data))) + (license^ (jp "/query/rightsinfo/text" data) + (jp "/query/rightsinfo/url" data)))] + [else siteinfo-default])])) (define/memoize (head-data-getter wikiname) #:hash hash ;; data will be stored here, can be referenced by the memoized closure From 2e0bd786ec60591382fbb9f31b9689a03c30b84f Mon Sep 17 00:00:00 2001 From: Evalprime Date: Sun, 25 Feb 2024 20:45:52 +0000 Subject: [PATCH 47/58] add tardis --- src/application-globals.rkt | 2 +- src/extwiki-data.rkt | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/src/application-globals.rkt b/src/application-globals.rkt index fb8f118..4d8e27e 100644 --- a/src/application-globals.rkt +++ b/src/application-globals.rkt @@ -110,7 +110,7 @@ (div (@ (class "niwa__left")) (p ,((extwiki-group^-description group) props)) (p ,((extwiki^-description xt) props)) - (p "This wiki's core community has wholly migrated away from Fandom. You should " + (p "This wiki's core community has largely migrated away from Fandom. You should " (a (@ (href ,go)) "go to " ,(extwiki^-name xt) " now!")) (p (@ (class "niwa__feedback")) ,@(add-between diff --git a/src/extwiki-data.rkt b/src/extwiki-data.rkt index 5628078..77aa27a 100644 --- a/src/extwiki-data.rkt +++ b/src/extwiki-data.rkt @@ -71,6 +71,13 @@ (λ (props) '(p "The wiki was founded by Citricsquid on July 16th, 2009 as a way to document information from Minecraft. Since November 15th, 2010, it has been hosted by Curse Media. On December 12th, 2018, it moved to Fandom as it purchased Curse Media. 
Since September 24, 2023, it forked from Fandom and has been hosted by Weird Gloop."))) + 'Tardis + (extwiki-group^ + "Tardis" + '(("Forking announcement" . "https://tardis.wiki/wiki/Tardis:Forking_announcement") + ("Discussion on Reddit" . "https://old.reddit.com/r/doctorwho/comments/1azxmrl/tardis_wiki_has_regenerated/")) + (λ (props) '())) + 'empty (extwiki-group^ "Misc" @@ -419,6 +426,15 @@ (λ (props) `())) + (extwiki^ + '("tardis") 'default + 'Tardis + "Tardis Data Core" + "https://tardis.wiki/wiki/Doctor_Who_Wiki" + "https://tardis.wiki/images/Tardis_images/e/e6/Site-logo.png" + (λ (props) + `())) + ;; fandom wikinames * empty * empty * Name * Home Page (extwiki^ '("aether") 'empty 'empty "Aether Wiki" "https://aether.wiki.gg/wiki/Aether_Wiki" #f #f) (extwiki^ '("before-darkness-falls") 'empty 'empty "Before Darkness Falls Wiki" "https://beforedarknessfalls.wiki.gg/wiki/Before_Darkness_Falls_Wiki" #f #f) From 755efe3cd65968b626c3d661d83c44e468428322 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Wed, 5 Jun 2024 23:07:05 +1200 Subject: [PATCH 48/58] Tabber code size and quality --- src/application-globals.rkt | 2 +- static/tabs.js | 92 ++++++++++++------------------------- 2 files changed, 30 insertions(+), 64 deletions(-) diff --git a/src/application-globals.rkt b/src/application-globals.rkt index ff0e81f..e574707 100644 --- a/src/application-globals.rkt +++ b/src/application-globals.rkt @@ -172,7 +172,7 @@ (define styles (list (format "~a/wikia.php?controller=ThemeApi&method=themeVariables&variant=~a" origin (user-cookies^-theme user-cookies)) - (format "~a/load.php?lang=en&modules=site.styles%7Cskin.fandomdesktop.styles%7Cext.fandom.PortableInfoboxFandomDesktop.css%7Cext.fandom.GlobalComponents.CommunityHeaderBackground.css%7Cext.gadget.site-styles%2Csound-styles&only=styles&skin=fandomdesktop" origin))) + (format 
"~a/load.php?lang=en&modules=site.styles%7Cskin.fandomdesktop.styles%7Cext.fandom.PortableInfoboxFandomDesktop.css%7Cext.fandom.GlobalComponents.CommunityHeaderBackground.css%7Cext.fandom.photoGallery.gallery.css%7Cext.gadget.site-styles%2Csound-styles&only=styles&skin=fandomdesktop" origin))) (if (config-true? 'strict_proxy) (map u-proxy-url styles) styles)] diff --git a/static/tabs.js b/static/tabs.js index a077efe..718b48e 100644 --- a/static/tabs.js +++ b/static/tabs.js @@ -1,74 +1,40 @@ "use strict"; -let tabToFind = location.hash.length > 1 ? location.hash.substring(1) : null; -for (let tabber of document.body.querySelectorAll(".wds-tabber")) { - let [tabs, contents] = getTabs(tabber); +const tabFromHash = location.hash.length > 1 ? location.hash.substring(1) : null - for (let i in tabs) { - let tab = tabs[i]; - let content = contents[i]; +for (const tabber of document.body.querySelectorAll(".wds-tabber")) { + for (const [tab, content] of getTabberTabs(tabber)) { + // set up click listener on every tab + tab.addEventListener("click", e => { + setCurrentTab(tabber, tab, content) + e.preventDefault() + }) - tab.addEventListener("click", function(e) { - setCurrentTab(tabber, tab, content); - e.preventDefault(); - }); - if (tab.dataset.hash === tabToFind) { - setCurrentTab(tabber, tab, content); - } - } -} -document.body.classList.remove("bw-tabs-nojs"); - - - -function getTabs(tabber) { - let tabs = []; - let contents = []; - - for (let i of tabber.querySelector(".wds-tabs__wrapper").querySelectorAll(".wds-tabs__tab")) { - tabs.push(i); - } - for (let i of tabber.children) { - if (!i.matches(".wds-tab__content")) { - continue; - } - contents.push(i); - } - - return [tabs, contents]; + // re-open a specific tab on page load based on the URL hash + if (tab.dataset.hash === tabFromHash) { + setCurrentTab(tabber, tab, content) + tab.scrollIntoView() + } + } } -function getCurrentTab(tabber) { - let tab = null; - let content = null; - - tab = 
tabber.querySelector(".wds-tabs__wrapper").querySelector(".wds-tabs__tab.wds-is-current"); - for (let i of tabber.children) { - if (!i.matches(".wds-tab__content.wds-is-current")) { - continue; - } - content = i; - break; - } - - return [tab, content]; +function getTabberTabs(tabber) { + // need to scope the selector to handle nested tabs. see /unturned/wiki/Crate for an example + const tabs = [...tabber.querySelectorAll(":scope > .wds-tabs__wrapper .wds-tabs__tab")] + const contents = [...tabber.querySelectorAll(":scope > .wds-tab__content")] + return tabs.map((_, index) => [tabs[index], contents[index]]) // transpose arrays into [[tab, content], ...] } function setCurrentTab(tabber, tab, content) { - let [currentTab, currentContent] = getCurrentTab(tabber); - if (currentTab) { - currentTab.classList.remove("wds-is-current"); - } - if (currentContent) { - currentContent.classList.remove("wds-is-current"); - } + // clear currently selected tab + getTabberTabs(tabber).flat().forEach(e => e.classList.remove("wds-is-current")) - tab.classList.add("wds-is-current"); - content.classList.add("wds-is-current"); - if (tab.dataset.hash) { - let fragment = "#" + tab.dataset.hash; - if (location.hash !== fragment) { - history.pushState(null, "", fragment); - } - } + // select new tab + tab.classList.add("wds-is-current") + content.classList.add("wds-is-current") + if (tab.dataset.hash) { + history.replaceState(null, "", `#${tab.dataset.hash}`) + } } + +document.body.classList.remove("bw-tabs-nojs") From 5672f468862951837f54c792bddd9758f64a4738 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Mon, 1 Jul 2024 01:28:27 +1200 Subject: [PATCH 49/58] Add new independent wikis --- src/application-globals.rkt | 2 +- src/extwiki-data.rkt | 68 +++++++++++++++++++++++++++++++++++-- 2 files changed, 66 insertions(+), 4 deletions(-) diff --git a/src/application-globals.rkt b/src/application-globals.rkt index e574707..6d940ac 100644 --- a/src/application-globals.rkt +++ 
b/src/application-globals.rkt @@ -108,8 +108,8 @@ (a (@ (class "niwa__go") (href ,go)) "Read " ,title " on " ,(extwiki^-name xt) " →") (div (@ (class "niwa__cols")) (div (@ (class "niwa__left")) - (p ,((extwiki-group^-description group) props)) (p ,((extwiki^-description xt) props)) + (p ,((extwiki-group^-description group) props)) (p "This wiki's core community has largely migrated away from Fandom. You should " (a (@ (href ,go)) "go to " ,(extwiki^-name xt) " now!")) (p (@ (class "niwa__feedback")) diff --git a/src/extwiki-data.rkt b/src/extwiki-data.rkt index 77aa27a..29997c4 100644 --- a/src/extwiki-data.rkt +++ b/src/extwiki-data.rkt @@ -27,6 +27,13 @@ (λ (props) `(p "The Square Enix Indpendent Wiki Alliance, or SEIWA, is a network of independent wikis established in 2011 and focused on providing high-quality coverage of Square Enix and its content. We work together, along with our affiliates and others, to co-operate and support one another while providing the best-quality content on the various Square Enix video games and media."))) + 'GWN + (extwiki-group^ + "GWN" + '(("Gaming Wiki Network" . "https://gamingwikinetwork.org/")) + (λ (props) + `(p "This wiki is part of the Gaming Wiki Network, a network of independently-hosted wikis about video game franchises. The GWN was founded on October 21, 2022. It aims to support all gaming communities in building independently-hosted wikis."))) + 'Terraria (extwiki-group^ "Terraria" @@ -316,11 +323,11 @@ (extwiki^ '("zelda" "zelda-archive") 'default 'NIWA - "Zeldapedia" - "https://zeldapedia.wiki/wiki/Main_Page" + "Zelda Wiki" + "https://zeldawiki.wiki/wiki/Main_Page" "https://niwanetwork.org/images/logos/zeldapedia.png" (λ (props) - `((p "Founded on April 23, 2005 as Zelda Wiki, today's Zeldapedia is your definitive source for encyclopedic information on The Legend of Zelda series, as well as all of the latest Zelda news. 
Zeldapedia went independent from Fandom in October 2022, citing Fandom's recent buyouts and staffing decisions among their reasons.")))) + `((p "Founded on April 23, 2005, Zelda Wiki is your definitive source for encyclopedic information on The Legend of Zelda series, as well as all of the latest Zelda news. Zelda Wiki went independent from Fandom in October 2022, citing Fandom's recent buyouts and staffing decisions among their reasons.")))) (extwiki^ '("chrono") 'default @@ -435,6 +442,61 @@ (λ (props) `())) + (extwiki^ + '("wizardry") 'default + 'GWN + "Wizardry Wiki" + "https://wizardry.wiki.gg/wiki/Wizardry_Wiki" + "https://wizardry.wiki.gg/images/e/e6/Site-logo.png" + (λ (props) + `((p "On March 21, 2023, the wiki has decided to leave and abandoning from Fandom due to numerous of issues such as intrusive advertising, long-lasting bugs, restrictions on customization, etcetera. Wizardry Wiki was officially inducted into the wiki.gg wikifarm, with all contents forked over.") + (p "The wiki has partnered with " (a (@ (href "https://fallout.wiki/")) "Independent Fallout Wiki") " as of June 14, 2024.")))) + + (extwiki^ + '("jackryan") 'default + 'GWN + "Tom Clancy Wiki" + "https://tomclancy.wiki.gg/wiki/Tom_Clancy_Wiki" + "https://tomclancy.wiki.gg/images/thumb/c/c5/Jack_Ryan_Logo_Dark.png/600px-Jack_Ryan_Logo_Dark.png" + (λ (props) + `((p "The Tom Clancy Wiki is a collaborative encyclopedia dedicated to Tom Clancy’s franchises. The Tom Clancy franchise is a 40-year old expansive franchise founded by Tom Clancy, telling several unique sagas through books, video games, and films, as well as a TV show.")))) + + (extwiki^ + '("hollowknight") 'default + 'GWN + "Hollow Knight Wiki" + "https://hollowknight.wiki/wiki/Main_Page" + "https://gamingwikinetwork.org/images/logos/hollowknight.png" + (λ (props) + `((p "We are an independently hosted wiki for the games Hollow Knight and Hollow Knight: Silksong, created by fans, for fans. 
The wiki is a fork of the FANDOM Hollow Knight Wiki and was officially unveiled on October 31, 2023.")))) + + (extwiki^ + '("hellokitty" "sanrio") 'default + 'GWN + "Sanrio Wiki" + "https://sanriowiki.com/wiki/Sanrio_Wiki" + "https://cdn.sanriowiki.com/wiki.png" + (λ (props) + `((p "Sanrio Wiki is a project that was started on April 14, 2015 by EvieMelody. It was hosted on the wiki-farm ShoutWiki and has since become independent.")))) + + (extwiki^ + '("sto") 'default + 'GWN + "Star Trek Online Wiki" + "https://stowiki.net/wiki/Main_Page" + "https://gamingwikinetwork.org/images/logos/stowiki.png" + (λ (props) + `())) + + (extwiki^ + '("rayman-game" "ubisoftrayman") 'default + 'GWN + "Rayman Wiki" + "https://raymanpc.com/wiki/en/Main_Page" + "https://raymanpc.com/wiki/script-en/resources/assets/logo-en.png?5c608" + (λ (props) + `())) + ;; fandom wikinames * empty * empty * Name * Home Page (extwiki^ '("aether") 'empty 'empty "Aether Wiki" "https://aether.wiki.gg/wiki/Aether_Wiki" #f #f) (extwiki^ '("before-darkness-falls") 'empty 'empty "Before Darkness Falls Wiki" "https://beforedarknessfalls.wiki.gg/wiki/Before_Darkness_Falls_Wiki" #f #f) From 1ef184547b2be6715318c3d749d3114fe61866ec Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Mon, 1 Jul 2024 02:28:17 +1200 Subject: [PATCH 50/58] Allow minimising independent wiki notice --- src/application-globals.rkt | 18 ++++++++++-------- src/data.rkt | 25 ++++++++++++++++++++----- static/main.css | 17 +++++++++++++++++ 3 files changed, 47 insertions(+), 13 deletions(-) diff --git a/src/application-globals.rkt b/src/application-globals.rkt index 6d940ac..26cd6ae 100644 --- a/src/application-globals.rkt +++ b/src/application-globals.rkt @@ -68,9 +68,6 @@ `(p ,(format "This instance is run by the ~a developer, " (config-get 'application_name)) (a (@ (href "https://cadence.moe/contact")) "Cadence") - ". 
Proudly hosted by " - (a (@ (href "http://alphamethyl.barr0w.net")) - "Barrow Network Solutions" (sup "XD")) ".") `(p ,(format "This unofficial instance is based off the ~a source code, but is not controlled by the code developer." (config-get 'application_name))))) @@ -88,11 +85,13 @@ ;; generate a notice with a link if a fandom wiki has a replacement as part of NIWA or similar ;; if the wiki has no replacement, display nothing -(define (extwiki-notice wikiname title) +(define (extwiki-notice wikiname title req user-cookies) (define xt (findf (λ (item) (member wikiname (extwiki^-wikinames item))) extwikis)) (cond/var [xt - (let* ([group (hash-ref extwiki-groups (extwiki^-group xt))] + (let* ([seen? (member wikiname (user-cookies^-notices user-cookies))] + [aside-class (if seen? "niwa__notice niwa--seen" "niwa__notice")] + [group (hash-ref extwiki-groups (extwiki^-group xt))] [search-page (format "/Special:Search?~a" (params->query `(("search" . ,title) ("go" . "Go"))))] @@ -103,7 +102,7 @@ [props (extwiki-props^ go)]) (cond [(eq? 
(extwiki^-banner xt) 'default) - `(aside (@ (class "niwa__notice")) + `(aside (@ (class ,aside-class)) (h1 (@ (class "niwa__header")) ,(extwiki^-name xt) " has its own website separate from Fandom.") (a (@ (class "niwa__go") (href ,go)) "Read " ,title " on " ,(extwiki^-name xt) " →") (div (@ (class "niwa__cols")) @@ -117,7 +116,10 @@ `(,@(for/list ([link (extwiki-group^-links group)]) `(a (@ (href ,(cdr link))) ,(car link))) "This notice is from BreezeWiki" - (a (@ (href "https://docs.breezewiki.com/Reporting_Bugs.html")) "Feedback?")) + (a (@ (rel "nofollow") + (class "niwa__got-it") + (href ,(user-cookies-setter-url/add-notice req user-cookies wikiname))) + "OK, got it")) " / "))) (div (@ (class "niwa__right")) (img (@ (class "niwa__logo") (src ,(extwiki^-logo xt)))))))] @@ -225,7 +227,7 @@ (div (@ (class "fandom-community-header__background tileHorizontally header"))) (div (@ (class "page")) (main (@ (class "page__main")) - ,(extwiki-notice wikiname title) + ,(extwiki-notice wikiname title req user-cookies) (div (@ (class "custom-top")) (h1 (@ (class "page-title")) ,title) (nav (@ (class "sitesearch")) diff --git a/src/data.rkt b/src/data.rkt index 5aba2c2..63c7f03 100644 --- a/src/data.rkt +++ b/src/data.rkt @@ -1,6 +1,7 @@ #lang racket/base (require racket/list racket/match + racket/string web-server/http/request-structs net/url-string (only-in net/cookies/server cookie-header->alist cookie->set-cookie-header make-cookie) @@ -27,7 +28,8 @@ user-cookies-getter user-cookies-default user-cookies-setter - user-cookies-setter-url) + user-cookies-setter-url + user-cookies-setter-url/add-notice) (struct siteinfo^ (sitename basepage license) #:transparent) (struct license^ (text url) #:transparent) @@ -90,8 +92,8 @@ ;; then no matter what, return the best information we have so far this-data)) -(struct user-cookies^ (theme) #:prefab) -(define user-cookies-default (user-cookies^ 'default)) +(struct user-cookies^ (theme notices) #:prefab) +(define user-cookies-default 
(user-cookies^ 'default '())) (define (user-cookies-getter req) (define cookie-header (headers-assq* #"cookie" (request-headers/raw req))) (define cookies-alist (if cookie-header (cookie-header->alist (header-value cookie-header) bytes->string/utf-8) null)) @@ -100,16 +102,29 @@ (match pair [(cons "theme" (and theme (or "light" "dark" "default"))) (values 'theme (string->symbol theme))] + [(cons "notices" notices) + (values 'notices (string-split notices "|"))] [_ (values #f #f)]))) (user-cookies^ - (hash-ref cookies-hash 'theme (user-cookies^-theme user-cookies-default)))) + (hash-ref cookies-hash 'theme (user-cookies^-theme user-cookies-default)) + (hash-ref cookies-hash 'notices (user-cookies^-notices user-cookies-default)))) (define (user-cookies-setter user-cookies) (map (λ (c) (header #"Set-Cookie" (cookie->set-cookie-header c))) (list (make-cookie "theme" (symbol->string (user-cookies^-theme user-cookies)) + #:path "/" + #:max-age (* 60 60 24 365 10)) + (make-cookie "notices" (string-join (user-cookies^-notices user-cookies) "|") + #:path "/" #:max-age (* 60 60 24 365 10))))) (define (user-cookies-setter-url req new-settings) (format "/set-user-settings?~a" (params->query `(("next_location" . ,(url->string (request-uri req))) - ("new_settings" . ,(format "~a" new-settings)))))) + ("new_settings" . 
,(format "~s" new-settings)))))) + +(define (user-cookies-setter-url/add-notice req user-cookies notice-name) + (user-cookies-setter-url + req + (struct-copy user-cookies^ user-cookies + [notices (cons notice-name (user-cookies^-notices user-cookies))]))) diff --git a/static/main.css b/static/main.css index bcd612b..a8e9703 100644 --- a/static/main.css +++ b/static/main.css @@ -431,6 +431,23 @@ a.ext-audiobutton { /* see hearthstone/wiki/Diablo_(Duels_hero) */ font-size: 14px; text-align: right; } +/* more compact notice after it's been seen the first time */ +.niwa--seen { + padding: 1.5vw 2vw 2vw; + overflow-y: auto; + max-height: min(280px, 33vh); + font-size: 17px; +} +.niwa--seen .niwa__header { + font-size: 26px; +} +.niwa--seen .niwa__go { + padding: 10px 18px; + font-size: 20px; +} +.niwa--seen .niwa__got-it { + display: none; +} /* media queries */ From 14930f18dc73457f1cb6ef677285333a112a048e Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Mon, 1 Jul 2024 02:32:39 +1200 Subject: [PATCH 51/58] Save even more vertical space --- static/main.css | 2 ++ 1 file changed, 2 insertions(+) diff --git a/static/main.css b/static/main.css index a8e9703..5b3e7e2 100644 --- a/static/main.css +++ b/static/main.css @@ -437,6 +437,8 @@ a.ext-audiobutton { /* see hearthstone/wiki/Diablo_(Duels_hero) */ overflow-y: auto; max-height: min(280px, 33vh); font-size: 17px; + margin-top: -2vw; + margin-bottom: 12px; } .niwa--seen .niwa__header { font-size: 26px; From 8f0caa913240df0e797c587d460ab460f75528f3 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Mon, 1 Jul 2024 02:45:51 +1200 Subject: [PATCH 52/58] Add Enter the Gungeon wiki.gg redirect --- src/extwiki-data.rkt | 1 + 1 file changed, 1 insertion(+) diff --git a/src/extwiki-data.rkt b/src/extwiki-data.rkt index 29997c4..f190e5f 100644 --- a/src/extwiki-data.rkt +++ b/src/extwiki-data.rkt @@ -512,6 +512,7 @@ (extwiki^ '("doom") 'empty 'empty "DoomWiki.org" "https://doomwiki.org/wiki/Entryway" #f #f) (extwiki^ 
'("dreamscaper") 'empty 'empty "Official Dreamscaper Wiki" "https://dreamscaper.wiki.gg/wiki/Dreamscaper_Wiki" #f #f) (extwiki^ '("elderscrolls") 'empty 'empty "UESP" "https://en.uesp.net/wiki/Main_Page" #f #f) + (extwiki^ '("enterthegungeon" "exit-the-gungeon" "enter-the-gungeon-archive") 'empty 'empty "Official Enter The Gungeon Wiki" "https://enterthegungeon.wiki.gg/wiki/Enter_the_Gungeon_Wiki" "https://enterthegungeon.wiki.gg/images/e/e6/Site-logo.png" #f) (extwiki^ '("fiend-folio") 'empty 'empty "Official Fiend Folio Wiki" "https://fiendfolio.wiki.gg/wiki/Fiend_Folio_Wiki" #f #f) (extwiki^ '("foxhole") 'empty 'empty "Foxhole Wiki" "https://foxhole.wiki.gg/wiki/Foxhole_Wiki" #f #f) (extwiki^ '("have-a-nice-death") 'empty 'empty "Have a Nice Death Wiki" "https://haveanicedeath.wiki.gg/wiki/Have_a_Nice_Death_Wiki" #f #f) From 49682b23207443a7766853f4afd90e0dbd4520c8 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Wed, 3 Jul 2024 23:37:44 +1200 Subject: [PATCH 53/58] Fix for Racket 8.13 --- src/config.rkt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/config.rkt b/src/config.rkt index fec546b..b1afe0a 100644 --- a/src/config.rkt +++ b/src/config.rkt @@ -104,7 +104,7 @@ ; all values here are optimised for maximum prettiness (parameterize ([pretty-print-columns 80]) (display "config: ") - (pretty-write ((inst sort (Pairof Symbol String)) + (pretty-write ((inst sort (Pairof Symbol String) Symbol) (hash->list (make-immutable-hasheq combined-alist)) symbol Date: Thu, 4 Jul 2024 19:21:17 +1200 Subject: [PATCH 54/58] Add Granblue redirect --- src/extwiki-data.rkt | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/extwiki-data.rkt b/src/extwiki-data.rkt index f190e5f..8971640 100644 --- a/src/extwiki-data.rkt +++ b/src/extwiki-data.rkt @@ -89,7 +89,7 @@ (extwiki-group^ "Misc" '(("This wiki doesn't have a description yet. Add one?" . 
"https://docs.breezewiki.com/Reporting_Bugs.html")) - #f))) + (λ (props) '())))) ;; wikiname, niwa-name, url, logo-url (struct extwiki^ (wikinames banner group name home logo description) #:transparent) @@ -497,6 +497,15 @@ (λ (props) `())) + (extwiki^ + '("granblue") 'empty + 'empty + "Granblue Fantasy Wiki" + "https://gbf.wiki/" + "https://gbf.wiki/images/1/18/Vyrnball.png?0704c" + (λ (props) + `())) + ;; fandom wikinames * empty * empty * Name * Home Page (extwiki^ '("aether") 'empty 'empty "Aether Wiki" "https://aether.wiki.gg/wiki/Aether_Wiki" #f #f) (extwiki^ '("before-darkness-falls") 'empty 'empty "Before Darkness Falls Wiki" "https://beforedarknessfalls.wiki.gg/wiki/Before_Darkness_Falls_Wiki" #f #f) From 1e3451a990e64fc75b76ee51bd626ccd57e28127 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Sun, 14 Jul 2024 23:25:52 +1200 Subject: [PATCH 55/58] Add HELLMET wiki --- src/extwiki-data.rkt | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/extwiki-data.rkt b/src/extwiki-data.rkt index 8971640..4a2cca2 100644 --- a/src/extwiki-data.rkt +++ b/src/extwiki-data.rkt @@ -506,6 +506,15 @@ (λ (props) `())) + (extwiki^ + '("hellmet-roblox") 'empty + 'empty + "HELLMET Wiki" + "https://hellmet.miraheze.org/wiki/Main_Page" + "https://static.miraheze.org/hellmetwiki/thumb/c/ce/Hellmet_Wiki_Logo.png/135px-Hellmet_Wiki_Logo.png" + (λ (props) + `())) + ;; fandom wikinames * empty * empty * Name * Home Page (extwiki^ '("aether") 'empty 'empty "Aether Wiki" "https://aether.wiki.gg/wiki/Aether_Wiki" #f #f) (extwiki^ '("before-darkness-falls") 'empty 'empty "Before Darkness Falls Wiki" "https://beforedarknessfalls.wiki.gg/wiki/Before_Darkness_Falls_Wiki" #f #f) From 8db91d5e32c1a8671d4d7537e381611e5732e8f9 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Sun, 14 Jul 2024 23:38:30 +1200 Subject: [PATCH 56/58] Add Rainverse wiki redirect --- src/extwiki-data.rkt | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/extwiki-data.rkt 
b/src/extwiki-data.rkt index 4a2cca2..e283dfb 100644 --- a/src/extwiki-data.rkt +++ b/src/extwiki-data.rkt @@ -85,6 +85,13 @@ ("Discussion on Reddit" . "https://old.reddit.com/r/doctorwho/comments/1azxmrl/tardis_wiki_has_regenerated/")) (λ (props) '())) + 'Rainverse + (extwiki-group^ + "Rainverse" + '(("Forking announcement" . "https://transfem.social/notes/9qsqdkmqi78e01bh")) + (λ (props) + '())) + 'empty (extwiki-group^ "Misc" @@ -515,6 +522,17 @@ (λ (props) `())) + (extwiki^ + '("rain-web-comic") 'default + 'empty + "Rainverse Wiki" + "https://rainverse.wiki/wiki/Main_Page" + "https://static.miraheze.org/rainversewiki/2/2c/Rain_comic_cover.png" + (λ (props) + `((p "We have a newly-migrated Rainverse Wiki which escaped from Fandom! Rain is the comic that helped me figure out my gender, so I am really glad to have a wiki on a non-evil host.") + (p "Please stop using the abandoned copy of Rain Wiki on Fandom. Fandom is still \"training\" a generator which adds procedurally-generated bullshit to articles, with no way for users to remove or correct it, and they're demanding volunteer wiki admins waste time \"vetting\" the procedurally-generated BS for accuracy. 
As Jocelyn herself said, \"fuck Fandom forever.\"") + (p "If you are interested, please add more articles related to other Rainverse stories.")))) + ;; fandom wikinames * empty * empty * Name * Home Page (extwiki^ '("aether") 'empty 'empty "Aether Wiki" "https://aether.wiki.gg/wiki/Aether_Wiki" #f #f) (extwiki^ '("before-darkness-falls") 'empty 'empty "Before Darkness Falls Wiki" "https://beforedarknessfalls.wiki.gg/wiki/Before_Darkness_Falls_Wiki" #f #f) From 97c4e54f38558f67abb1a1cc5baec23fa9cc0b21 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Sat, 10 Aug 2024 15:04:13 +1200 Subject: [PATCH 57/58] Fix Tardis Wiki metadata --- src/extwiki-data.rkt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/extwiki-data.rkt b/src/extwiki-data.rkt index e283dfb..a8ee159 100644 --- a/src/extwiki-data.rkt +++ b/src/extwiki-data.rkt @@ -443,9 +443,9 @@ (extwiki^ '("tardis") 'default 'Tardis - "Tardis Data Core" + "TARDIS Wiki" "https://tardis.wiki/wiki/Doctor_Who_Wiki" - "https://tardis.wiki/images/Tardis_images/e/e6/Site-logo.png" + "https://tardis.wiki/w/images/Tardis_Images/e/e6/Site-logo.png" (λ (props) `())) From 443f1eecbc4c15c8038920027e62fccbdcb0bbe7 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Wed, 23 Oct 2024 22:52:00 +1300 Subject: [PATCH 58/58] Add user agent and detect blocked pages --- src/fandom-request.rkt | 40 +++++++++++++++++++++++++++++++++------- src/page-wiki.rkt | 28 ++++++++++++++++++++++++++-- 2 files changed, 59 insertions(+), 9 deletions(-) diff --git a/src/fandom-request.rkt b/src/fandom-request.rkt index 966eeee..c306b04 100644 --- a/src/fandom-request.rkt +++ b/src/fandom-request.rkt @@ -1,12 +1,16 @@ #lang typed/racket/base -(require "config.rkt" +(require racket/format + racket/string + "config.rkt" "../lib/url-utils.rkt") (define-type Headers (HashTable Symbol (U Bytes String))) (require/typed net/http-easy [#:opaque Timeout-Config timeout-config?] [#:opaque Response response?] [#:opaque Session session?] 
+ [response-status-code (Response -> Natural)] [current-session (Parameter Session)] + [current-user-agent (Parameter (U Bytes String))] [make-timeout-config ([#:lease Positive-Real] [#:connect Positive-Real] -> Timeout-Config)] [get ((U Bytes String) [#:close? Boolean] @@ -22,19 +26,41 @@ fandom-get-api timeouts) +(unless (string-contains? (~a (current-user-agent)) "BreezeWiki") + (current-user-agent + (format "BreezeWiki/1.0 (~a) ~a" + (if (config-true? 'canonical_origin) + (config-get 'canonical_origin) + "local") + (current-user-agent)))) + (define timeouts (make-timeout-config #:lease 5 #:connect 5)) +(: last-failure Flonum) +(define last-failure 0.0) +(: stored-failure (Option Response)) +(define stored-failure #f) +(define failure-persist-time 30000) + (: no-headers Headers) (define no-headers '#hasheq()) (: fandom-get (String String [#:headers (Option Headers)] -> Response)) (define (fandom-get wikiname path #:headers [headers #f]) - (define dest-url (string-append "https://www.fandom.com" path)) - (define host (string-append wikiname ".fandom.com")) - (log-outgoing wikiname path) - (get dest-url - #:timeouts timeouts - #:headers (hash-set (or headers no-headers) 'Host host))) + (or + (and ((current-inexact-milliseconds) . < . (+ last-failure failure-persist-time)) stored-failure) + (let () + (define dest-url (string-append "https://www.fandom.com" path)) + (define host (string-append wikiname ".fandom.com")) + (log-outgoing wikiname path) + (define res + (get dest-url + #:timeouts timeouts + #:headers (hash-set (or headers no-headers) 'Host host))) + (when (memq (response-status-code res) '(403 406)) + (set! last-failure (current-inexact-milliseconds)) + (set! 
stored-failure res)) + res))) (: fandom-get-api (String (Listof (Pair String String)) [#:headers (Option Headers)] -> Response)) (define (fandom-get-api wikiname params #:headers [headers #f]) diff --git a/src/page-wiki.rkt b/src/page-wiki.rkt index f16792c..da63617 100644 --- a/src/page-wiki.rkt +++ b/src/page-wiki.rkt @@ -18,6 +18,7 @@ "config.rkt" "data.rkt" "fandom-request.rkt" + "../lib/archive-file-mappings.rkt" "../lib/pure-utils.rkt" "../lib/syntax.rkt" "../lib/thread-utils.rkt" @@ -37,8 +38,9 @@ (define (page-wiki req) (define wikiname (path/param-path (first (url-path (request-uri req))))) + (define segments (map path/param-path (cdr (url-path (request-uri req))))) (define user-cookies (user-cookies-getter req)) - (define path (string-join (map path/param-path (cddr (url-path (request-uri req)))) "/")) + (define path (string-join (cdr segments) "/")) (define source-url (format "https://~a.fandom.com/wiki/~a" wikiname path)) (define-values (dest-res siteinfo) @@ -101,9 +103,31 @@ (write-html body out))))))] [(eq? 404 (easy:response-status-code dest-res)) (next-dispatcher)] + [(memq (easy:response-status-code dest-res) '(403 406)) + (response-handler + (define body + (generate-wiki-page + `(div + (p "Sorry! Fandom isn't allowing BreezeWiki to show pages right now.") + (p "We'll automatically try again in 30 seconds, so please stay on this page and be patient.") + (p (small "In a hurry? 
" (a (@ (href ,source-url)) "Click here to read the page on Fandom.")))) + #:req req + #:source-url source-url + #:wikiname wikiname + #:title (url-segments->guess-title segments) + #:siteinfo siteinfo)) + (response/output + #:code 503 + #:headers (build-headers + always-headers + (header #"Retry-After" #"30") + (header #"Cache-Control" #"max-age=30, public") + (header #"Refresh" #"35")) + (λ (out) + (write-html body out))))] [else (response-handler - (error 'page-wiki "Tried to load page ~a/~v~nSadly, the page didn't load because Fandom returned status code ~a with response:~n~a" + (error 'page-wiki "Tried to load page ~a/~a~nSadly, the page didn't load because Fandom returned status code ~a with response:~n~a" wikiname path (easy:response-status-code dest-res)