diff --git a/.gitignore b/.gitignore index 1faa8f2..17b94c5 100644 --- a/.gitignore +++ b/.gitignore @@ -16,5 +16,3 @@ compiled # Personal /config.ini -misc -storage diff --git a/archiver/archiver-cli.rkt b/archiver/archiver-cli.rkt deleted file mode 100644 index 0f349d6..0000000 --- a/archiver/archiver-cli.rkt +++ /dev/null @@ -1,71 +0,0 @@ -#lang cli -(require charterm - "archiver.rkt") - -(help (usage "Downloads a single Fandom wiki in BreezeWiki offline format." - "" - "Downloaded pages go into `archive/` next to the executable." - "Database goes into `archiver.db*` next to the executable." - "The database is necessary to store your download progress and resume where you left off if the process is interrupted.")) - -(flag (output-quiet?) - ("-q" "--output-quiet" "disable progress output") - (output-quiet? #t)) - -(flag (output-progress?) - ("-p" "--output-progress" "progress output for terminals (default in a tty)") - (output-progress? #t)) - -(flag (output-lines?) - ("-l" "--output-lines" "output the name of each file downloaded (default outside of a tty)") - (output-lines? #t)) - -(constraint (one-of output-quiet? output-lines? output-progress?)) - - - -(program - (start [wikiname "wikiname to download"]) - ;; set up arguments - (define width 80) - (when (not (or (output-quiet?) (output-lines?) (output-progress?))) - (cond [(terminal-port? current-input-port) - (output-progress? #t)] - [else - (output-lines? #t)])) - (define (update-width) - (when (output-progress?) - (case (system-type 'os) - [(linux) - (with-charterm - (call-with-values (λ () (charterm-screen-size)) - (λ (cols rows) (set! width cols))))] - [else 100]))) - (update-width) - ;; check - (when (or (not wikiname) (equal? wikiname "")) - (raise-user-error "Please specify the wikiname to download on the command line.")) - ;; progress reporting based on selected mode - (define (report-progress a b c) - (define basename (basename->name-for-query c)) - (cond - [(output-lines?) - (displayln basename)] - [(output-progress?) - (when (eq? (modulo a 20) 0) - (thread (λ () (update-width)))) - (define prefix (format "[~a] [~a/~a] " wikiname a b)) - (define rest (- width (string-length prefix))) - (define real-width (min (string-length basename) rest)) - (define spare-width (- rest real-width)) - (define name-display (substring basename 0 real-width)) - (printf "\e[2K\r~a~a" prefix name-display) - (flush-output)])) - ;; download all stages - (for ([stage all-stages] - [i (in-naturals 1)]) - (printf "> Stage ~a/~a~n" i (length all-stages)) - (stage wikiname report-progress) - (displayln ""))) - -(run start) diff --git a/archiver/archiver-database.rkt b/archiver/archiver-database.rkt deleted file mode 100644 index 91a4070..0000000 --- a/archiver/archiver-database.rkt +++ /dev/null @@ -1,82 +0,0 @@ -#lang racket/base -(require racket/file - racket/list - racket/path - racket/string - json - json-pointer - db - "../lib/syntax.rkt") - -(provide - get-slc - query-exec* - query-rows* - query-list* - query-value* - query-maybe-value* - query-maybe-row*) - -(define storage-path (anytime-path ".." "storage")) -(define database-file (build-path storage-path "archiver.db")) - -(define slc (box #f)) -(define (get-slc) - (define slc* (unbox slc)) - (cond - [slc* slc*] - [else - (make-directory* storage-path) - (define slc* (sqlite3-connect #:database database-file #:mode 'create)) - (query-exec slc* "PRAGMA journal_mode=WAL") - (define database-version - (with-handlers ([exn:fail:sql? - (λ (exn) - ; need to set up the database - (query-exec slc* "create table database_version (version integer, primary key (version))") - (query-exec slc* "insert into database_version values (0)") - 0)]) - (query-value slc* "select version from database_version"))) - - (define migrations - (wrap-sql - ((query-exec slc* "create table page (wikiname TEXT NOT NULL, basename TEXT NOT NULL, progress INTEGER NOT NULL, PRIMARY KEY (wikiname, basename))") - (query-exec slc* "create table wiki (wikiname TEXT NOT NULL, progress INTEGER, PRIMARY KEY (wikiname))")) - ((query-exec slc* "create table special_page (wikiname TEXT NOT NULL, key TEXT NOT NULL, basename TEXT NOT NULL, PRIMARY KEY (wikiname, key))")) - ((query-exec slc* "update wiki set progress = 2 where wikiname in (select wikiname from wiki inner join page using (wikiname) group by wikiname having min(page.progress) = 1)")) - ((query-exec slc* "create table image (wikiname TEXT NOT NULL, hash TEXT NTO NULL, url TEXT NOT NULL, ext TEXT, source INTEGER NOT NULL, progress INTEGER NOT NULL, PRIMARY KEY (wikiname, hash))")) - ((query-exec slc* "alter table wiki add column sitename TEXT") - (query-exec slc* "alter table wiki add column basepage TEXT") - (query-exec slc* "alter table wiki add column license_text TEXT") - (query-exec slc* "alter table wiki add column license_url TEXT")) - ((query-exec slc* "alter table page add column redirect")))) - - (let do-migrate-step () - (when (database-version . < . (length migrations)) - (call-with-transaction - slc* - (list-ref migrations database-version)) - (set! database-version (add1 database-version)) - (query-exec slc* "update database_version set version = $1" database-version) - (do-migrate-step))) - - (set-box! slc slc*) - slc*])) - -(define (query-exec* . args) - (apply query-exec (get-slc) args)) - -(define (query-rows* . args) - (apply query-rows (get-slc) args)) - -(define (query-list* . args) - (apply query-list (get-slc) args)) - -(define (query-value* . args) - (apply query-value (get-slc) args)) - -(define (query-maybe-value* . args) - (apply query-maybe-value (get-slc) args)) - -(define (query-maybe-row* . args) - (apply query-maybe-row (get-slc) args)) diff --git a/archiver/archiver-gui.rkt b/archiver/archiver-gui.rkt deleted file mode 100644 index 25f447f..0000000 --- a/archiver/archiver-gui.rkt +++ /dev/null @@ -1,390 +0,0 @@ -#lang racket/base -(require racket/class - racket/draw - racket/format - racket/function - racket/list - racket/math - racket/port - racket/set - racket/splicing - racket/string - (except-in pict text table) - db - net/http-easy - memo - (only-in racket/gui timer%) - racket/gui/easy - racket/gui/easy/operator - (only-in pict bitmap) - images/icons/arrow - images/icons/control - images/icons/stickman - images/icons/style - images/icons/symbol - "archiver-database.rkt" - "archiver.rkt" - "../lib/url-utils.rkt" - "../lib/xexpr-utils.rkt") - -(default-icon-material rubber-icon-material) - -(require (for-syntax racket/base racket/match racket/set racket/string)) - -(define-syntax (@> stx) - (define form (cdr (syntax->datum stx))) - (match form - [(list form) ; (@> (fn @obs)) - ;; identify the observables and replace with non-@ symbols - (define collection (mutable-set)) - (define updated - (let loop ([sexp form]) - (cond [(symbol? sexp) - (let ([as-s (symbol->string sexp)]) - (if (string-prefix? as-s "@") - (let ([without-@ (string->symbol (substring as-s 1))]) - (set-add! collection (cons sexp without-@)) - without-@) - sexp))] - [(pair? sexp) (cons (loop (car sexp)) (loop (cdr sexp)))] - [#t sexp]))) - (define collection-l (set->list collection)) - ;; return obs-combine -> updated-form - (datum->syntax stx `(obs-combine (λ (,@(map cdr collection-l)) ,updated) ,@(map car collection-l)))] - [(list (? string? str) args ...) ; (@> "Blah: ~a/~a" @arg1 arg2) - ;; identify the observables and replace with non-@ symbols - (define collection-l - (for/list ([arg args]) - (if (symbol? arg) - (let ([as-s (symbol->string arg)]) - (if (string-prefix? as-s "@") - (let ([without-@ (string->symbol (substring as-s 1))]) - (cons arg without-@)) - (cons #f arg))) - (cons #f arg)))) - (define collection-lo (filter car collection-l)) - ;; return obs-combine -> format - (datum->syntax stx `(obs-combine (λ (,@(map cdr collection-lo)) (format ,str ,@(map cdr collection-l))) ,@(map car collection-lo)))])) - -(define/obs @auto-retry #f) - -(define-struct qi^ (wikiname st stage progress max-progress ticks eta th) #:transparent) ;; queue item - -(define rows (query-rows* "select wikiname, progress from wiki where progress < 4")) -(define/obs @queue null) -(define (add-wikiname-to-queue wikiname st stage) - (@queue . <~ . (λ (queue) - (define already-exists? (findf (λ (qi) (equal? (qi^-wikiname qi) wikiname)) queue)) - (if already-exists? - queue - (append queue (list (qi^ wikiname st stage 0 1 0 "..." #f))))))) -(for ([row rows]) - (add-wikiname-to-queue (vector-ref row 0) - (if (= (vector-ref row 1) 4) - 'complete - 'queued) - (vector-ref row 1))) - -(define status-icon-size 32) -(define status-icon-min-width 36) -(define button-icon-size 12) - -(define color-green (make-color 90 212 68)) - -(define (resize coords fraction) - (for/list ([coord (in-list coords)]) - (cons (* fraction (car coord)) - (* fraction (cdr coord))))) - -(define (flat-right-arrow #:height [height 32] #:color [color #f]) - ((if color - (curryr colorize color) - values) - (dc (λ (dc dx dy) - (send dc draw-polygon (resize - (list '(0 . 9) '(15 . 9) '(14 . 0) - '(31 . 15.5) - '(14 . 31) '(15 . 22) '(0 . 22)) - (/ height 32)))) - height height))) - -(define (double-left-arrow-icon #:height [height 32]) - (define shift (/ height 48)) - (pict->bitmap - (scale-to-fit - (panorama - (pin-under - (bitmap - (left-over-arrow-icon #:color halt-icon-color #:height height - #:material rubber-icon-material)) - (- (* -20 shift) 2) (+ (* 6 shift) 1) - (bitmap - (bitmap-render-icon - (pict->bitmap - (rotate - (flat-right-arrow #:color (make-object color% 255 64 64) #:height (/ height 1.26)) - (* pi 1.23))))) - #;(rotate - (flat-right-arrow #:color (make-object color% 255 64 64) #:height (/ height 1.26)) - (* pi 1.23)))) - height height #:mode 'preserve/max))) - -(splicing-let ([frame-count 20]) - (define stickman-frames - (for/vector ([s (in-range 0 1 (/ 1 frame-count))]) - (running-stickman-icon - s - #:height status-icon-size - #:material (default-icon-material))))) - -(define (stick n) - (vector-ref stickman-frames (modulo n (vector-length stickman-frames)))) - -(define status-icons - (hasheq 'queued (stop-icon #:color syntax-icon-color #:height status-icon-size) - 'paused (continue-forward-icon #:color syntax-icon-color #:height status-icon-size) - 'running (stick 0) - 'error (x-icon #:height status-icon-size) - 'complete (check-icon #:color color-green #:height status-icon-size))) - -(define action-icons - (hasheq 'pause (pause-icon #:color syntax-icon-color #:height button-icon-size) - 'resume (play-icon #:color color-green #:height button-icon-size) - 'reset (left-over-arrow-icon #:color halt-icon-color #:height button-icon-size) - 'reseter (double-left-arrow-icon #:height button-icon-size))) - -(define (bitmap-view @the-bitmap [min-width 1]) - (pict-canvas #:min-size (@> (list (max min-width (send @the-bitmap get-width)) (send @the-bitmap get-height))) #;(if min-size (list min-size min-size) #f) - #:stretch '(#f #f) - #:style '(transparent) - @the-bitmap - bitmap)) - -(define (exn->string e) - (with-output-to-string - (λ () - (displayln (exn-message e)) - (displayln "context:") - (for ([item (continuation-mark-set->context (exn-continuation-marks e))]) - (printf " ~a" (srcloc->string (cdr item))) - (when (car item) - (printf ": ~a" (car item))) - (displayln ""))))) - -(define ((handle-graphical-exn @qi) e) - (displayln (exn->string e) (current-error-port)) - (cond - [(obs-peek @auto-retry) - (void) ;; TODO - #;(do-retry-end wikiname)] - [#t - (update-qi @qi [st 'error]) - (do-try-unpause-next-entry) - (thread - (λ () - (define/obs @visible? #t) - (render - (dialog #:title "Download Error" - #:style '(resize-border) - #:mixin (λ (%) (class % (super-new) - (obs-observe! @visible? (λ (visible?) (send this show visible?))))) - (vpanel #:margin '(15 15) - (text (format "Encountered this error while downloading ~a:" (qi^-wikiname (obs-peek @qi)))) - (input #:style '(multiple hscroll) - #:min-size '(#f 200) - (exn->string e)) - ;; TODO - #;(button "Retry Now" (λ () (:= @visible? #f) (do-retry-now wikiname))) - #;(button "Retry Round-Robin" (λ () (:= @visible? #f) (do-retry-end wikiname))) - #;(button "Skip Wiki" (λ () (:= @visible? #f) (do-continue))) - #;(button "Use Auto-Retry" (λ () - (:= @auto-retry #t) - (:= @visible? #f) - (do-retry-end wikiname))) - #;(text "Be careful not to auto-retry an infinite loop!"))) - main-window))) - (sleep) - ; make sure the broken thread is gone - (define th (qi^-th (obs-peek @qi))) - (when th (kill-thread th))])) - -(define segments - (list - (list 5/100 (make-color 0 223 217)) - (list 88/100 color-green) - (list 2/100 (make-color 0 223 217)) - (list 5/100 color-green))) -(define segment-spacing 2) -(unless (= (apply + (map car segments)) 1) - (error 'segments "segments add up to ~a, not 1" (apply + (map car segments)))) - -;; return the new bitmap, which can be drawn on a dc<%> -(define/memoize (ray-trace width height stage progress max-progress) - ;; (printf "rendering ~a ~a/~a at ~a~n" stage progress max-progress (current-inexact-milliseconds)) - (define bm (make-object bitmap% width height #f #t)) - (define dc (make-object bitmap-dc% bm)) - (define width-available (- width (* (length segments) segment-spacing))) - (send dc set-smoothing 'unsmoothed) - (send dc set-pen "black" 0 'transparent) - (for/fold ([offset 0]) - ([segment segments] - [i (in-naturals 0)]) ;; zero indexed stages? - ;; calculate start and end locations of grey bar - (define-values (segment-proportion segment-color) (apply values segment)) - (define segment-start (if (= offset 0) 0 (+ offset segment-spacing))) - (define segment-width (* width-available segment-proportion)) - ;; draw grey bar - (send dc set-brush (make-color 180 180 180 0.4) 'solid) - (send dc draw-rectangle segment-start 0 segment-width height) - ;; draw solid bar according to the current item's progress - (define proportion - (cond [(stage . < . i) 0] - [(stage . > . i) 1] - [(max-progress . <= . 0) 0] - [(progress . < . 0) 0] - [(progress . >= . max-progress) 1] - [else (progress . / . max-progress)])) - (send dc set-brush segment-color 'solid) - (send dc draw-rectangle segment-start 0 (* proportion segment-width) height) - (+ segment-start segment-width)) - (bitmap-render-icon bm 6/8)) - -;; get ray traced bitmap (possibly from cache) and draw on dc<%> -(define (draw-bar orig-dc qi) - ;; (println ray-traced) - (define-values (width height) (send orig-dc get-size)) - (send orig-dc draw-bitmap (ray-trace width height (qi^-stage qi) (qi^-progress qi) (qi^-max-progress qi)) 0 0)) - -(define ((make-progress-updater @qi) a b c) - ;; (printf "~a: ~a/~a ~a~n" (qi^-wikiname (obs-peek @qi)) a b c) - (update-qi @qi [progress a] [max-progress b] [ticks (add1 (qi^-ticks (obs-peek @qi)))])) - -(define/obs @input "") - -(define (do-add-to-queue) - (define wikiname (string-trim (obs-peek @input))) - (when ((string-length wikiname) . > . 0) - (add-wikiname-to-queue wikiname 'queued 0)) ;; TODO: automatically start? - (:= @input "")) - -(define-syntax-rule (update-qi @qi args ...) - (let ([wikiname (qi^-wikiname (obs-peek @qi))]) - (@queue . <~ . (λ (queue) - (for/list ([qi queue]) - (if (equal? (qi^-wikiname qi) wikiname) - (struct-copy qi^ qi args ...) - qi)))))) - -(define (do-start-qi @qi) - (define th - (thread (λ () - (with-handlers ([exn? (handle-graphical-exn @qi)]) - (define last-stage - (for/last ([stage all-stages] - [i (in-naturals)]) - (update-qi @qi [stage i]) - (stage (qi^-wikiname (obs-peek @qi)) (make-progress-updater @qi)) - i)) - (update-qi @qi [st 'complete] [stage (add1 last-stage)]) - (do-try-unpause-next-entry))))) - (update-qi @qi [st 'running] [th th])) - -(define (do-stop-qi @qi) - (define th (qi^-th (obs-peek @qi))) - (when th (kill-thread th)) - (update-qi @qi [th #f] [st 'paused])) - -(define (do-reset-qi @qi) - (define reset-progress-to 0) - (define th (qi^-th (obs-peek @qi))) - (when th (kill-thread th)) - (update-qi @qi [th #f] [st 'queued] [stage reset-progress-to] [progress 0] [max-progress 0]) - (query-exec* "update wiki set progress = ? where wikiname = ?" reset-progress-to (qi^-wikiname (obs-peek @qi)))) - -(define (do-reseter-qi @qi) - (do-reset-qi @qi) - (query-exec* "delete from page where wikiname = ?" (qi^-wikiname (obs-peek @qi)))) - -(define (do-try-unpause-next-entry) - (define queue (obs-peek @queue)) - (define next-qi (for/last ([qi queue] - #:when (memq (qi^-st qi) '(paused queued))) - qi)) - (when next-qi - (define @qi (@queue . ~> . (λ (queue) (findf (λ (qi) (equal? (qi^-wikiname qi) (qi^-wikiname next-qi))) queue)))) - (do-start-qi @qi))) - -(define main-window - (render - (window - #:title "Fandom Archiver" - #:size '(400 300) - #:mixin (λ (%) (class % - (super-new) - (define/augment (on-close) - (for ([qi (obs-peek @queue)]) - (when (qi^-th qi) - (kill-thread (qi^-th qi)))) - #;(disconnect*)))) - (vpanel - #:spacing 10 - #:margin '(5 5) - (hpanel - #:stretch '(#t #f) - #:spacing 10 - (hpanel - (text "https://") - (input @input - (λ (event data) (cond - [(eq? event 'input) (:= @input data)] - [(eq? event 'return) (do-add-to-queue)]))) - (text ".fandom.com")) - (button "Download Wiki" do-add-to-queue)) - (list-view - #:style '(vertical) - @queue - #:key qi^-wikiname - (λ (k @qi) - (define @status-icons - (@> (case (qi^-st @qi) - [(running) (stick (qi^-ticks @qi))] - [else (hash-ref status-icons (qi^-st @qi))]))) - (define @is-running? - (@> (memq (qi^-st @qi) '(running)))) - (define @is-complete? - (@> (eq? (qi^-st @qi) 'complete))) - ;; state icon at the left side - (hpanel #:stretch '(#t #f) - #:alignment '(left center) - #:spacing 8 - (bitmap-view @status-icons status-icon-min-width) - (vpanel - ;; name and buttons (top half) - (hpanel #:alignment '(left bottom) - (text (@> (qi^-wikiname @qi))) - (spacer) - (hpanel - #:stretch '(#f #f) - - (if-view @is-running? - (button (hash-ref action-icons 'pause) - (λ () (do-stop-qi @qi))) - (hpanel - #:stretch '(#f #f) - (button (hash-ref action-icons 'reseter) - (λ () (do-reseter-qi @qi))) - (button (hash-ref action-icons 'reset) - (λ () (do-reset-qi @qi))) - (button (hash-ref action-icons 'resume) - (λ () (do-start-qi @qi))))))) - ;; progress bar (bottom half) - (hpanel - (canvas - @qi - #:style '(transparent) - #:margin '(3 3) - draw-bar) - (hpanel #:min-size '(68 #f) - #:stretch '(#f #f) - #:alignment '(right center) - (text (@> (format "eta ~a" (qi^-eta @qi)))))))))))))) diff --git a/archiver/archiver.rkt b/archiver/archiver.rkt deleted file mode 100644 index 01f03ad..0000000 --- a/archiver/archiver.rkt +++ /dev/null @@ -1,388 +0,0 @@ -#lang racket/base -(require racket/file - racket/format - racket/function - racket/list - racket/path - racket/sequence - racket/string - net/url - net/mime - file/sha1 - net/http-easy - db - json - "archiver-database.rkt" - "../lib/html-parsing/main.rkt" - "../lib/mime-types.rkt" - "../lib/syntax.rkt" - "../lib/tree-updater.rkt" - "../lib/url-utils.rkt" - "../lib/xexpr-utils.rkt" - "../lib/archive-file-mappings.rkt") - -(provide - basename->name-for-query - image-url->values - hash->save-dir - all-stages) - -(module+ test - (require rackunit)) - -(define archive-root (anytime-path ".." "storage/archive")) -(make-directory* archive-root) - -(define sources '#hasheq((style . 1) (page . 2))) - -(define (get-origin wikiname) - (format "https://~a.fandom.com" wikiname)) - -(define (insert-wiki-entry wikiname) - (define dest-url - (format "https://~a.fandom.com/api.php?~a" - wikiname - (params->query '(("action" . "query") - ("meta" . "siteinfo") - ("siprop" . "general|rightsinfo|statistics|namespaces") - ("format" . "json") - ("formatversion" . "2"))))) - (define data (response-json (get dest-url))) - (define content-nss - (sort - (for/list ([(k v) (in-hash (jp "/query/namespaces" data))] - #:do [(define id (hash-ref v 'id))] - #:when (and (id . < . 2900) ; exclude maps namespace - (hash-ref v 'content))) ; exclude non-content and talk namespaces - id) - <)) - (define exists? (query-maybe-value* "select progress from wiki where wikiname = ?" wikiname)) - (if (and exists? (not (sql-null? exists?))) - (query-exec* "update wiki set sitename = ?, basepage = ?, license_text = ?, license_url = ? where wikiname = ?" - (jp "/query/general/sitename" data) - (second (regexp-match #rx"/wiki/(.*)" (jp "/query/general/base" data))) - (jp "/query/rightsinfo/text" data) - (jp "/query/rightsinfo/url" data) - wikiname) - (query-exec* "insert into wiki (wikiname, progress, sitename, basepage, license_text, license_url) values (?, 0, ?, ?, ?, ?)" - wikiname - (jp "/query/general/sitename" data) - (second (regexp-match #rx"/wiki/(.*)" (jp "/query/general/base" data))) - (jp "/query/rightsinfo/text" data) - (jp "/query/rightsinfo/url" data))) - (values (jp "/query/statistics/articles" data) - content-nss)) - - -(define (check-style-for-images wikiname path) - (define content (file->string path)) - (define urls (regexp-match* #rx"url\\(\"?'?([^)]*)'?\"?\\)" content #:match-select cadr)) - (for/list ([url urls] - #:when (not (or (equal? url "") - (equal? url "'") - (string-suffix? url "\"") - (string-contains? url "/resources-ucp/") - (string-contains? url "/fonts/") - (string-contains? url "/drm_fonts/") - (string-contains? url "//db.onlinewebfonts.com/") - (string-contains? url "//bits.wikimedia.org/") - (string-contains? url "mygamercard.net/") - (string-contains? url "dropbox") - (string-contains? url "only=styles") - (string-contains? url "https://https://") - (regexp-match? #rx"^%20" url) - (regexp-match? #rx"^data:" url) - (regexp-match? #rx"^file:" url)))) - (cond - [(string-prefix? url "https://") url] - [(string-prefix? url "http://") (regexp-replace #rx"http:" url "https:")] - [(string-prefix? url "httpshttps://") (regexp-replace #rx"httpshttps://" url "https://")] - [(string-prefix? url "//") (string-append "https:" url)] - [(string-prefix? url "/") (format "https://~a.fandom.com~a" wikiname url)] - [else (raise-user-error "While calling check-style-for-images, this URL had an unknown format and couldn't be saved:" url path)]))) - -(define (download-styles-for-wiki wikiname callback) - (define save-dir (build-path archive-root wikiname "styles")) - (make-directory* save-dir) - (define theme (λ (theme-name) - (cons (format "https://~a.fandom.com/wikia.php?controller=ThemeApi&method=themeVariables&variant=~a" wikiname theme-name) - (build-path save-dir (format "themeVariables-~a.css" theme-name))))) - ;; (Listof (Pair url save-path)) - (define styles - (list - (theme "default") - (theme "light") - (theme "dark") - (cons (format "https://~a.fandom.com/load.php?lang=en&modules=site.styles%7Cskin.fandomdesktop.styles%7Cext.fandom.PortableInfoboxFandomDesktop.css%7Cext.fandom.GlobalComponents.CommunityHeaderBackground.css%7Cext.gadget.site-styles%2Csound-styles&only=styles&skin=fandomdesktop" wikiname) - (build-path save-dir "site.css")))) - (for ([style styles] - [i (in-naturals)]) - (callback i (length styles) "styles...") - (define r (get (car style))) - (define body (response-body r)) - (display-to-file body (cdr style) #:exists 'replace) - ;; XXX: how the HELL do I deal with @import?? would need some kind of recursion here. how will the page server know where to look up the style file to be able to serve them again? do I add another link-stylesheet tag to the main page? what about the remaining stuck @import url? - ) - (callback (length styles) (length styles) "styles...") - styles) - -(define (hash->save-dir wikiname hash) - (build-path archive-root wikiname "images" (substring hash 0 1) (substring hash 0 2))) - -(define (image-url->values i) - ;; TODO: handle case where there is multiple broken cb parameter on minecraft wiki - ;; TODO: ensure it still "works" with broken & on minecraft wiki - (define no-cb (regexp-replace #rx"\\cb=[0-9]+&?" i "")) ; remove cb url parameter which does nothing - (define key (regexp-replace #rx"[&?]$" no-cb "")) ; remove extra separator if necessary - (define hash (sha1 (string->bytes/utf-8 key))) - (cons key hash)) - - -;; 1. Download list of wiki pages and store in database, if not done yet for that wiki -(define (if-necessary-download-list-of-pages wikiname callback) - (define wiki-progress (query-maybe-value* "select progress from wiki where wikiname = ?" wikiname)) - ;; done yet? - (unless (and (real? wiki-progress) (wiki-progress . >= . 1)) - ;; Count total pages - (define-values (num-pages namespaces) (insert-wiki-entry wikiname)) - ;; Download the entire index of pages - (for*/fold ([total 0]) - ([namespace namespaces] - [redir-filter '("nonredirects" "redirects")]) - (let loop ([apcontinue ""] - [basenames null]) - (cond - [apcontinue - (define url (format "https://~a.fandom.com/api.php?~a" - wikiname - (params->query `(("action" . "query") - ("list" . "allpages") - ("apnamespace" . ,(~a namespace)) - ("apfilterredir" . ,redir-filter) - ("aplimit" . "500") - ("apcontinue" . ,apcontinue) - ("format" . "json") - ("formatversion" . "2"))))) - ;; Download the current listing page - (define res (get url)) - (define json (response-json res)) - ;; Content from this page - (define current-basenames - (for/list ([page (jp "/query/allpages" json)]) - (title->basename (jp "/title" page)))) - (when ((length current-basenames) . > . 0) - ;; Report - (if (equal? redir-filter "nonredirects") - (callback (+ (length basenames) (length current-basenames) total) num-pages (last current-basenames)) - (callback total num-pages (last current-basenames)))) - ;; Loop - (loop (jp "/continue/apcontinue" json #f) (append basenames current-basenames))] - [else - ;; All done with this (loop)! Save those pages into the database - ;; SQLite can have a maximum of 32766 parameters in a single query - (begin0 - ;; next for*/fold - (if (equal? redir-filter "nonredirects") - (+ (length basenames) total) - total) ; redirects don't count for the site statistics total - (call-with-transaction - (get-slc) - (λ () - (for ([slice (in-slice 32760 basenames)]) - (define query-template - (string-join #:before-first "insert or ignore into page (wikiname, redirect, basename, progress) values " - (make-list (length slice) "(?1, ?2, ?, 0)") ", ")) - (apply query-exec* query-template wikiname (if (equal? redir-filter "redirects") 1 sql-null) slice)))))]))) - ;; Record that we have the complete list of pages - (query-exec* "update wiki set progress = 1 where wikiname = ?" wikiname))) - - -;; 2. Download each page via API and: -;; * Save API response to file -(define max-page-progress 1) -(define (save-each-page wikiname callback) - ;; prepare destination folder - (define save-dir (build-path archive-root wikiname)) - (make-directory* save-dir) - ;; gather list of basenames to download (that aren't yet complete) - (define basenames (query-list* "select basename from page where wikiname = ? and progress < ? and redirect is null" - wikiname max-page-progress)) - ;; counter of complete/incomplete basenames - (define already-done-count - (query-value* "select count(*) from page where wikiname = ? and progress = ?" - wikiname max-page-progress)) - (define not-done-count - (query-value* "select count(*) from page where wikiname = ? and progress < ?" - wikiname max-page-progress)) - (define total-count (+ already-done-count not-done-count)) - ;; set initial progress - (callback already-done-count total-count "") - ;; loop through basenames and download - (for ([basename basenames] - [i (in-naturals (add1 already-done-count))]) - (define name-for-query (basename->name-for-query basename)) - (define dest-url - (format "https://~a.fandom.com/api.php?~a" - wikiname - (params->query `(("action" . "parse") - ("page" . ,name-for-query) - ("prop" . "text|headhtml|langlinks") - ("formatversion" . "2") - ("format" . "json"))))) - (define r (get dest-url)) - (define body (response-body r)) - (define filename (string-append basename ".json")) - (define save-path - (cond [((string-length basename) . > . 240) - (define key (sha1 (string->bytes/latin-1 basename))) - (query-exec* "insert into special_page (wikiname, key, basename) values (?, ?, ?)" - wikiname key basename) - (build-path save-dir (string-append key ".json"))] - [#t - (build-path save-dir (string-append basename ".json"))])) - (display-to-file body save-path #:exists 'replace) - (query-exec* "update page set progress = 1 where wikiname = ? and basename = ?" - wikiname basename) - (callback i total-count basename)) - ;; save redirects as well - (save-redirects wikiname callback (+ already-done-count (length basenames)) total-count) - ;; saved all pages, register that fact in the database - (query-exec* "update wiki set progress = 2 where wikiname = ? and progress <= 2" wikiname)) - - -;; 2.5. Download each redirect-target via API and save mapping in database -(define (save-redirects wikiname callback already-done-count total-count) - (define basenames (query-list* "select basename from page where wikiname = ? and progress < ? and redirect = 1" - wikiname max-page-progress)) - ;; loop through basenames, in slices of 50 (MediaWiki API max per request), and download - (for ([basename basenames] - [i (in-naturals (add1 already-done-count))]) - (define dest-url - (format "https://~a.fandom.com/api.php?~a" - wikiname - (params->query `(("action" . "query") - ("prop" . "links") - ("titles" . ,(basename->name-for-query basename)) - ("format" . "json") - ("formatversion" . "2"))))) - (define res (get dest-url)) - (define json (response-json res)) - (define dest-title (jp "/query/pages/0/links/0/title" json #f)) - (callback i total-count basename) - (cond - [dest-title - ;; store it - (define dest-basename (title->basename dest-title)) - (query-exec* "update page set progress = 1, redirect = ? where wikiname = ? and basename = ?" dest-basename wikiname basename)] - [else - ;; the page just doesn't exist - (query-exec* "delete from page where wikiname = ? and basename = ?" wikiname basename)]))) - - -;; 3. Download CSS and: -;; * Save CSS to file -;; * Record style images to database -(define (if-necessary-download-and-check-styles wikiname callback) - (define wiki-progress (query-maybe-value* "select progress from wiki where wikiname = ?" wikiname)) - (unless (and (number? wiki-progress) (wiki-progress . >= . 3)) - (define styles (download-styles-for-wiki wikiname callback)) - (define unique-image-urls - (remove-duplicates - (map image-url->values - (flatten - (for/list ([style styles]) - (check-style-for-images wikiname (cdr style))))) - #:key cdr)) - (for ([pair unique-image-urls]) - (query-exec* "insert or ignore into image (wikiname, url, hash, ext, source, progress) values (?, ?, ?, NULL, 1, 0)" wikiname (car pair) (cdr pair))) - (query-exec* "update wiki set progress = 3 where wikiname = ?" wikiname))) - - -;; 4: From downloaded pages, record URLs of image sources and inline style images to database -(define (check-json-for-images wikiname path) - (define data (with-input-from-file path (λ () (read-json)))) - (define page (html->xexp (preprocess-html-wiki (jp "/parse/text" data)))) - (define tree (update-tree-wiki page wikiname)) - null - #;(remove-duplicates - (for/list ([element (in-producer - (query-selector - (λ (t a c) - (and (eq? t 'img) - (get-attribute 'src a))) - tree) - #f)]) - (image-url->values (get-attribute 'src (bits->attributes element)))))) - - -;; 5. Download image sources and style images according to database -(define (save-each-image wikiname callback) - (define source (hash-ref sources 'style)) ;; TODO: download entire wiki images instead? - ;; gather list of basenames to download (that aren't yet complete) - (define rows (query-rows* "select url, hash from image where wikiname = ? and source <= ? and progress < 1" - wikiname source)) - ;; counter of complete/incomplete basenames - (define already-done-count - (query-value* "select count(*) from image where wikiname = ? and source <= ? and progress = 1" - wikiname source)) - (define not-done-count - (query-value* "select count(*) from image where wikiname = ? and source <= ? and progress < 1" - wikiname source)) - ;; set initial progress - (callback already-done-count (+ already-done-count not-done-count) "") - ;; loop through urls and download - (for ([row rows] - [i (in-naturals 1)]) - ;; row fragments - (define url (vector-ref row 0)) - (define hash (vector-ref row 1)) - ;; check - #;(printf "~a -> ~a~n" url hash) - (define r (get url #:timeouts (make-timeout-config #:connect 15))) - (define declared-type (response-headers-ref r 'content-type)) - (define final-type (if (equal? declared-type #"application/octet-stream") - (let ([sniff-entity (message-entity (mime-analyze (response-body r)))]) - (string->bytes/latin-1 (format "~a/~a" (entity-type sniff-entity) (entity-subtype sniff-entity)))) - declared-type)) - (define ext - (with-handlers ([exn:fail:contract? (λ _ (error 'save-each-image "no ext found for mime type `~a` in file ~a" final-type url))]) - (bytes->string/latin-1 (mime-type->ext final-type)))) - ;; save - (define save-dir (hash->save-dir wikiname hash)) - (make-directory* save-dir) - (define save-path (build-path save-dir (string-append hash "." ext))) - (define body (response-body r)) - (display-to-file body save-path #:exists 'replace) - (query-exec* "update image set progress = 1, ext = ? where wikiname = ? and hash = ?" - ext wikiname hash) - (callback (+ already-done-count i) (+ already-done-count not-done-count) (string-append (substring hash 0 6) "..." ext))) - ;; saved all images, register that fact in the database - (query-exec* "update wiki set progress = 4 where wikiname = ?" wikiname)) - -(define all-stages - (list - if-necessary-download-list-of-pages - save-each-page - if-necessary-download-and-check-styles - ;; check-json-for-images - save-each-image)) - -(module+ test - (check-equal? (html->xexp "") - '(*TOP* (img (@ (src "https://example.com/images?src=Blah.jpg&width=150"))))) - #;(download-list-of-pages "minecraft" values) - #;(save-each-page "minecraft" values) - #;(check-json-for-images "chiki" (build-path archive-root "chiki" "Fiona.json")) - #;(do-step-3 "gallowmere") - #;(save-each-image "gallowmere" (hash-ref sources 'style) (λ (a b c) (printf "~a/~a ~a~n" a b c))) - - #;(for ([wikiname (query-list* "select wikiname from wiki")]) - (println wikiname) - (insert-wiki-entry wikiname)) - - #;(for ([wikiname (query-list* "select wikiname from wiki")]) - (println wikiname) - (do-step-3 wikiname) - (save-each-image wikiname (hash-ref sources 'style) (λ (a b c) (printf "~a/~a ~a~n" a b c))))) - -; (for ([stage all-stages]) (stage "create" (λ (a b c) (printf "~a/~a ~a~n" a b c)))) diff --git a/archiver/fts.rkt b/archiver/fts.rkt deleted file mode 100644 index 6a00041..0000000 --- a/archiver/fts.rkt +++ /dev/null @@ -1,213 +0,0 @@ -#lang cli -(require (for-syntax racket/base)) -(require racket/format - racket/function - racket/future - racket/match - racket/path - racket/promise - racket/port - racket/runtime-path - racket/sequence - racket/string - file/gunzip - db - db/unsafe/sqlite3 - net/http-easy - json - json-pointer - "../lib/html-parsing/main.rkt" - "../lib/xexpr-utils.rkt" - "../lib/tree-updater.rkt") - -(flag (read-from-cache?) - ("-c" "--read-from-cache" "read from last run cache instead of rebuilding documents") - (read-from-cache? #t)) - -(define-runtime-path storage-path "../storage/archive") - -;; *************************************************************************************************** -;; Progress bar display -;; *************************************************************************************************** - -(struct progress^ (n max title) #:transparent) - -(define (make-m-s seconds) - (define-values (eta-m eta-s) (quotient/remainder seconds 60)) - (format "~a:~a" eta-m (~a eta-s #:width 2 #:align 'right #:pad-string "0"))) - -(define (make-progress get-p [history-size 20]) - (define update-sleep 1) - (define name-width 30) - (define max-width 105) - (define history (make-vector history-size 0)) - (define history-pointer 0) - (define elapsed 0) - (define (report-progress) - (define p (get-p)) - (define history-cycle (vector-ref history history-pointer)) - (vector-set! history history-pointer (progress^-n p)) - (set! history-pointer (modulo (add1 history-pointer) history-size)) - (set! elapsed (add1 elapsed)) - (define-values (eta-display diff-per-second) - (cond - [((progress^-n p) . >= . (progress^-max p)) (values (format "~a **" (make-m-s elapsed)) (format "** ~a" (quotient (progress^-max p) (max elapsed 1))))] - [(= history-cycle 0) (values "-:--" "--")] - [else (define diff-per-second (/ (- (progress^-n p) history-cycle) (* history-size update-sleep))) - (define eta-total - (if (diff-per-second . > . 0) - (floor (round (/ (- (progress^-max p) (progress^-n p)) diff-per-second))) - 0)) - (values (make-m-s eta-total) - (round diff-per-second))])) - (define left (format "~a/~a ~a/s ~a ~a%" - (~a (progress^-n p) #:width (string-length (~a (progress^-max p))) #:align 'right #:pad-string " ") - (progress^-max p) - diff-per-second - eta-display - (floor (* 100 (/ (progress^-n p) (progress^-max p)))))) - (define name-display (~a (progress^-title p) #:max-width name-width #:limit-marker "...")) - (define remaining-space (- max-width name-width (string-length left) 2)) - (define bar-width - (floor (* (sub1 remaining-space) - (/ (progress^-n p) (progress^-max p))))) - (define bar (string-append (make-string bar-width #\=) - ">" - (make-string (- remaining-space bar-width) #\ ))) - (printf "\e[2K\r~a~a~a" left bar name-display) - (flush-output)) - (define (report-progress-loop) - (sleep update-sleep) - (report-progress) - (report-progress-loop)) - (define t (thread report-progress-loop)) - (define (quit) - (kill-thread t) - (report-progress) - (displayln "")) - quit) - -;; *************************************************************************************************** -;; Page text extractor -;; *************************************************************************************************** - -(define (class-has? attributes substrs) - (define cl (or (get-attribute 'class attributes) "")) - (ormap (λ (substr) (string-contains? cl substr)) substrs)) - -(define (updater element element-type attributes children) - (cond - [(class-has? attributes '("collapsed" "selflink" "label" "toc" "editsection" "reviews")) - (list 'div '() '())] - [#t - (list element-type attributes children)])) - -(define (writer tables-mode? page) - (define (writer-inner page) - (for ([bit page]) - (cond - [(and tables-mode? (pair? bit) (memq (car bit) '(h1 h2 h3 p blockquote q))) (void)] - [(and (not tables-mode?) (pair? bit) (memq (car bit) '(ul ol dl table))) (void)] - [(memq bit '(div p li td dd dt br)) (displayln "")] - [(symbol? bit) (void)] - [(and (pair? bit) (eq? (car bit) '*COMMENT*)) (void)] - [(and (pair? bit) (eq? (car bit) '@)) (void)] - [(pair? bit) (writer-inner bit)] - [(string? bit) (display bit)]))) - (writer-inner page)) - -(define (write-and-post-process tables-mode? page) - (define text (with-output-to-string (λ () (writer tables-mode? page)))) - ;; (define text-no-numbers (regexp-replace* #px"(?:-|[+$£€¥] *)?[0-9,.]{2,}%?\\s*" text "")) - (define shrink-text (regexp-replace* #px"([ \t]*\r?\n+)+" text "\n")) - shrink-text) - -(define ((extract f)) ; f - filename - (with-handlers - ([exn:fail? (λ (err) (printf "extract: ~a: ~v~n" f err))]) - (define j - (case (path-get-extension f) - [(#".json") - (with-input-from-file f (λ () (read-json)))] - [(#".gz") - (define-values (in out) (make-pipe)) - (with-input-from-file f (λ () (gunzip-through-ports (current-input-port) out))) - (read-json in)] - [else #f])) - (define title (json-pointer-value "/parse/title" j)) - (define pageid (json-pointer-value "/parse/pageid" j)) - (define page-html (preprocess-html-wiki (json-pointer-value "/parse/text" j))) - (define page (update-tree updater (html->xexp page-html))) - (define body (write-and-post-process #f page)) - (define table (write-and-post-process #t page)) - (list title body table pageid))) - -;; *************************************************************************************************** -;; Program, loop, Solr APIs -;; *************************************************************************************************** - -(program - (start [wikiname "wikiname to download"]) - - (define results - (for/list ([f (directory-list (build-path storage-path wikiname) #:build? #t)] - #:when (member (path-get-extension f) '(#".gz"))) - (extract f))) - - (define data - (cond - [(and (read-from-cache?) (file-exists? "cache.rkt")) - (define size (file-size "cache.rkt")) - (call-with-input-file "cache.rkt" - (λ (in) - (define quit (make-progress (λ () (progress^ (ceiling (/ (file-position in) 64 1024)) - (ceiling (/ size 64 1024)) - "Reading in...")) - 2)) - (begin0 - (read in) - (quit))))] - [else - (define x (box (progress^ 0 1 "..."))) - (define quit (make-progress (λ () (unbox x)))) - (define data - (for/list ([fut results] - [i (in-naturals 1)] - #:do [(define page (fut))] - #:when (not (void? page))) - (match-define (list title body table pageid) page) - (define len (string-length body)) - (set-box! x (progress^ i (length results) title)) - `#hasheq((id . ,(number->string pageid)) - (title . ,title) - (body . ,body) - (table . ,table) - (len . ,len)))) - (quit) - - (display "Writing out... ") - (flush-output) - (with-output-to-file "cache.rkt" (λ () (write data)) #:exists 'truncate/replace) - data])) - - (display "Converting... ") - (flush-output) - (define slice-size 30000) - (define slices (ceiling (/ (length data) slice-size))) - (for ([slice (in-slice slice-size data)] - [i (in-naturals 1)]) - (define ser (jsexpr->bytes slice)) - (define ser-port (open-input-bytes ser)) - (define quit (make-progress (λ () (progress^ (ceiling (/ (file-position ser-port) 64 1024)) - (ceiling (/ (bytes-length ser) 64 1024)) - (format "Posting... (~a/~a)" i slices))) - 2)) - (define res - (post (format "http://localhost:8983/solr/~a/update?commit=true" wikiname) - #:data ser-port - #:headers '#hasheq((Content-Type . "application/json")) - #:timeouts (make-timeout-config #:lease 5 #:connect 5 #:request 300))) - (quit) - (displayln (response-status-line res)))) - -(run start) diff --git a/archiver/info.rkt b/archiver/info.rkt deleted file mode 100644 index 17bb747..0000000 --- a/archiver/info.rkt +++ /dev/null @@ -1,3 +0,0 @@ -#lang info - -(define build-deps '("rackunit-lib" "web-server-lib" "http-easy-lib" "html-parsing" "html-writing" "json-pointer" "ini-lib" "memo" "net-cookies-lib" "gui-easy-lib" "sql" "charterm" "cli")) diff --git a/archiver/req.rktd b/archiver/req.rktd deleted file mode 100644 index e2d2fc2..0000000 --- a/archiver/req.rktd +++ /dev/null @@ -1 +0,0 @@ -((local ("."))) diff --git a/archiver/solr-config-dir/lang/contractions_ca.txt b/archiver/solr-config-dir/lang/contractions_ca.txt deleted file mode 100644 index 307a85f..0000000 --- a/archiver/solr-config-dir/lang/contractions_ca.txt +++ /dev/null @@ -1,8 +0,0 @@ -# Set of Catalan contractions for ElisionFilter -# TODO: load this as a resource from the analyzer and sync it in build.xml -d -l -m -n -s -t diff --git a/archiver/solr-config-dir/lang/contractions_fr.txt b/archiver/solr-config-dir/lang/contractions_fr.txt deleted file mode 100644 index f1bba51..0000000 --- a/archiver/solr-config-dir/lang/contractions_fr.txt +++ /dev/null @@ -1,15 +0,0 @@ -# Set of French contractions for ElisionFilter -# TODO: load this as a resource from the analyzer and sync it in build.xml -l -m -t -qu -n -s -j -d -c -jusqu -quoiqu -lorsqu -puisqu diff --git a/archiver/solr-config-dir/lang/contractions_ga.txt b/archiver/solr-config-dir/lang/contractions_ga.txt deleted file mode 100644 index 9ebe7fa..0000000 --- a/archiver/solr-config-dir/lang/contractions_ga.txt +++ /dev/null @@ -1,5 +0,0 @@ -# Set of Irish contractions for ElisionFilter -# TODO: load this as a resource from the analyzer and sync it in build.xml -d -m -b diff --git a/archiver/solr-config-dir/lang/contractions_it.txt b/archiver/solr-config-dir/lang/contractions_it.txt deleted file mode 100644 index cac0409..0000000 --- a/archiver/solr-config-dir/lang/contractions_it.txt +++ /dev/null @@ -1,23 +0,0 @@ -# Set of Italian contractions for ElisionFilter -# TODO: load this as a resource from the analyzer and sync it in build.xml -c -l -all -dall -dell -nell -sull -coll -pell -gl -agl -dagl -degl -negl -sugl -un -m -t -s -v -d diff --git a/archiver/solr-config-dir/lang/hyphenations_ga.txt b/archiver/solr-config-dir/lang/hyphenations_ga.txt deleted file mode 100644 index 4d2642c..0000000 --- a/archiver/solr-config-dir/lang/hyphenations_ga.txt +++ /dev/null @@ -1,5 +0,0 @@ -# Set of Irish hyphenations for StopFilter -# TODO: load this as a resource from the analyzer and sync it in build.xml -h -n -t diff --git a/archiver/solr-config-dir/lang/stemdict_nl.txt b/archiver/solr-config-dir/lang/stemdict_nl.txt deleted file mode 100644 index 4410729..0000000 --- a/archiver/solr-config-dir/lang/stemdict_nl.txt +++ /dev/null @@ -1,6 +0,0 @@ -# Set of overrides for the dutch stemmer -# TODO: load this as a resource from the analyzer and sync it in build.xml -fiets fiets -bromfiets bromfiets -ei eier -kind kinder diff --git a/archiver/solr-config-dir/lang/stoptags_ja.txt b/archiver/solr-config-dir/lang/stoptags_ja.txt deleted file mode 100644 index 71b7508..0000000 --- a/archiver/solr-config-dir/lang/stoptags_ja.txt +++ /dev/null @@ -1,420 +0,0 @@ -# -# This file defines a Japanese stoptag set for JapanesePartOfSpeechStopFilter. -# -# Any token with a part-of-speech tag that exactly matches those defined in this -# file are removed from the token stream. -# -# Set your own stoptags by uncommenting the lines below. Note that comments are -# not allowed on the same line as a stoptag. See LUCENE-3745 for frequency lists, -# etc. that can be useful for building you own stoptag set. -# -# The entire possible tagset is provided below for convenience. -# -##### -# noun: unclassified nouns -#名詞 -# -# noun-common: Common nouns or nouns where the sub-classification is undefined -#名詞-一般 -# -# noun-proper: Proper nouns where the sub-classification is undefined -#名詞-固有名詞 -# -# noun-proper-misc: miscellaneous proper nouns -#名詞-固有名詞-一般 -# -# noun-proper-person: Personal names where the sub-classification is undefined -#名詞-固有名詞-人名 -# -# noun-proper-person-misc: names that cannot be divided into surname and -# given name; foreign names; names where the surname or given name is unknown. -# e.g. お市の方 -#名詞-固有名詞-人名-一般 -# -# noun-proper-person-surname: Mainly Japanese surnames. -# e.g. 山田 -#名詞-固有名詞-人名-姓 -# -# noun-proper-person-given_name: Mainly Japanese given names. -# e.g. 太郎 -#名詞-固有名詞-人名-名 -# -# noun-proper-organization: Names representing organizations. -# e.g. 通産省, NHK -#名詞-固有名詞-組織 -# -# noun-proper-place: Place names where the sub-classification is undefined -#名詞-固有名詞-地域 -# -# noun-proper-place-misc: Place names excluding countries. -# e.g. アジア, バルセロナ, 京都 -#名詞-固有名詞-地域-一般 -# -# noun-proper-place-country: Country names. -# e.g. 日本, オーストラリア -#名詞-固有名詞-地域-国 -# -# noun-pronoun: Pronouns where the sub-classification is undefined -#名詞-代名詞 -# -# noun-pronoun-misc: miscellaneous pronouns: -# e.g. それ, ここ, あいつ, あなた, あちこち, いくつ, どこか, なに, みなさん, みんな, わたくし, われわれ -#名詞-代名詞-一般 -# -# noun-pronoun-contraction: Spoken language contraction made by combining a -# pronoun and the particle 'wa'. -# e.g. ありゃ, こりゃ, こりゃあ, そりゃ, そりゃあ -#名詞-代名詞-縮約 -# -# noun-adverbial: Temporal nouns such as names of days or months that behave -# like adverbs. Nouns that represent amount or ratios and can be used adverbially, -# e.g. 金曜, 一月, 午後, 少量 -#名詞-副詞可能 -# -# noun-verbal: Nouns that take arguments with case and can appear followed by -# 'suru' and related verbs (する, できる, なさる, くださる) -# e.g. インプット, 愛着, 悪化, 悪戦苦闘, 一安心, 下取り -#名詞-サ変接続 -# -# noun-adjective-base: The base form of adjectives, words that appear before な ("na") -# e.g. 健康, 安易, 駄目, だめ -#名詞-形容動詞語幹 -# -# noun-numeric: Arabic numbers, Chinese numerals, and counters like 何 (回), 数. -# e.g. 0, 1, 2, 何, 数, 幾 -#名詞-数 -# -# noun-affix: noun affixes where the sub-classification is undefined -#名詞-非自立 -# -# noun-affix-misc: Of adnominalizers, the case-marker の ("no"), and words that -# attach to the base form of inflectional words, words that cannot be classified -# into any of the other categories below. This category includes indefinite nouns. -# e.g. あかつき, 暁, かい, 甲斐, 気, きらい, 嫌い, くせ, 癖, こと, 事, ごと, 毎, しだい, 次第, -# 順, せい, 所為, ついで, 序で, つもり, 積もり, 点, どころ, の, はず, 筈, はずみ, 弾み, -# 拍子, ふう, ふり, 振り, ほう, 方, 旨, もの, 物, 者, ゆえ, 故, ゆえん, 所以, わけ, 訳, -# わり, 割り, 割, ん-口語/, もん-口語/ -#名詞-非自立-一般 -# -# noun-affix-adverbial: noun affixes that that can behave as adverbs. -# e.g. あいだ, 間, あげく, 挙げ句, あと, 後, 余り, 以外, 以降, 以後, 以上, 以前, 一方, うえ, -# 上, うち, 内, おり, 折り, かぎり, 限り, きり, っきり, 結果, ころ, 頃, さい, 際, 最中, さなか, -# 最中, じたい, 自体, たび, 度, ため, 為, つど, 都度, とおり, 通り, とき, 時, ところ, 所, -# とたん, 途端, なか, 中, のち, 後, ばあい, 場合, 日, ぶん, 分, ほか, 他, まえ, 前, まま, -# 儘, 侭, みぎり, 矢先 -#名詞-非自立-副詞可能 -# -# noun-affix-aux: noun affixes treated as 助動詞 ("auxiliary verb") in school grammars -# with the stem よう(だ) ("you(da)"). -# e.g. よう, やう, 様 (よう) -#名詞-非自立-助動詞語幹 -# -# noun-affix-adjective-base: noun affixes that can connect to the indeclinable -# connection form な (aux "da"). -# e.g. みたい, ふう -#名詞-非自立-形容動詞語幹 -# -# noun-special: special nouns where the sub-classification is undefined. -#名詞-特殊 -# -# noun-special-aux: The そうだ ("souda") stem form that is used for reporting news, is -# treated as 助動詞 ("auxiliary verb") in school grammars, and attach to the base -# form of inflectional words. -# e.g. そう -#名詞-特殊-助動詞語幹 -# -# noun-suffix: noun suffixes where the sub-classification is undefined. -#名詞-接尾 -# -# noun-suffix-misc: Of the nouns or stem forms of other parts of speech that connect -# to ガル or タイ and can combine into compound nouns, words that cannot be classified into -# any of the other categories below. In general, this category is more inclusive than -# 接尾語 ("suffix") and is usually the last element in a compound noun. -# e.g. おき, かた, 方, 甲斐 (がい), がかり, ぎみ, 気味, ぐるみ, (~した) さ, 次第, 済 (ず) み, -# よう, (でき)っこ, 感, 観, 性, 学, 類, 面, 用 -#名詞-接尾-一般 -# -# noun-suffix-person: Suffixes that form nouns and attach to person names more often -# than other nouns. -# e.g. 君, 様, 著 -#名詞-接尾-人名 -# -# noun-suffix-place: Suffixes that form nouns and attach to place names more often -# than other nouns. -# e.g. 町, 市, 県 -#名詞-接尾-地域 -# -# noun-suffix-verbal: Of the suffixes that attach to nouns and form nouns, those that -# can appear before スル ("suru"). -# e.g. 化, 視, 分け, 入り, 落ち, 買い -#名詞-接尾-サ変接続 -# -# noun-suffix-aux: The stem form of そうだ (様態) that is used to indicate conditions, -# is treated as 助動詞 ("auxiliary verb") in school grammars, and attach to the -# conjunctive form of inflectional words. -# e.g. そう -#名詞-接尾-助動詞語幹 -# -# noun-suffix-adjective-base: Suffixes that attach to other nouns or the conjunctive -# form of inflectional words and appear before the copula だ ("da"). -# e.g. 的, げ, がち -#名詞-接尾-形容動詞語幹 -# -# noun-suffix-adverbial: Suffixes that attach to other nouns and can behave as adverbs. -# e.g. 後 (ご), 以後, 以降, 以前, 前後, 中, 末, 上, 時 (じ) -#名詞-接尾-副詞可能 -# -# noun-suffix-classifier: Suffixes that attach to numbers and form nouns. This category -# is more inclusive than 助数詞 ("classifier") and includes common nouns that attach -# to numbers. -# e.g. 個, つ, 本, 冊, パーセント, cm, kg, カ月, か国, 区画, 時間, 時半 -#名詞-接尾-助数詞 -# -# noun-suffix-special: Special suffixes that mainly attach to inflecting words. -# e.g. (楽し) さ, (考え) 方 -#名詞-接尾-特殊 -# -# noun-suffix-conjunctive: Nouns that behave like conjunctions and join two words -# together. -# e.g. (日本) 対 (アメリカ), 対 (アメリカ), (3) 対 (5), (女優) 兼 (主婦) -#名詞-接続詞的 -# -# noun-verbal_aux: Nouns that attach to the conjunctive particle て ("te") and are -# semantically verb-like. -# e.g. ごらん, ご覧, 御覧, 頂戴 -#名詞-動詞非自立的 -# -# noun-quotation: text that cannot be segmented into words, proverbs, Chinese poetry, -# dialects, English, etc. Currently, the only entry for 名詞 引用文字列 ("noun quotation") -# is いわく ("iwaku"). -#名詞-引用文字列 -# -# noun-nai_adjective: Words that appear before the auxiliary verb ない ("nai") and -# behave like an adjective. -# e.g. 申し訳, 仕方, とんでも, 違い -#名詞-ナイ形容詞語幹 -# -##### -# prefix: unclassified prefixes -#接頭詞 -# -# prefix-nominal: Prefixes that attach to nouns (including adjective stem forms) -# excluding numerical expressions. -# e.g. お (水), 某 (氏), 同 (社), 故 (~氏), 高 (品質), お (見事), ご (立派) -#接頭詞-名詞接続 -# -# prefix-verbal: Prefixes that attach to the imperative form of a verb or a verb -# in conjunctive form followed by なる/なさる/くださる. -# e.g. お (読みなさい), お (座り) -#接頭詞-動詞接続 -# -# prefix-adjectival: Prefixes that attach to adjectives. -# e.g. お (寒いですねえ), バカ (でかい) -#接頭詞-形容詞接続 -# -# prefix-numerical: Prefixes that attach to numerical expressions. -# e.g. 約, およそ, 毎時 -#接頭詞-数接続 -# -##### -# verb: unclassified verbs -#動詞 -# -# verb-main: -#動詞-自立 -# -# verb-auxiliary: -#動詞-非自立 -# -# verb-suffix: -#動詞-接尾 -# -##### -# adjective: unclassified adjectives -#形容詞 -# -# adjective-main: -#形容詞-自立 -# -# adjective-auxiliary: -#形容詞-非自立 -# -# adjective-suffix: -#形容詞-接尾 -# -##### -# adverb: unclassified adverbs -#副詞 -# -# adverb-misc: Words that can be segmented into one unit and where adnominal -# modification is not possible. -# e.g. あいかわらず, 多分 -#副詞-一般 -# -# adverb-particle_conjunction: Adverbs that can be followed by の, は, に, -# な, する, だ, etc. -# e.g. こんなに, そんなに, あんなに, なにか, なんでも -#副詞-助詞類接続 -# -##### -# adnominal: Words that only have noun-modifying forms. -# e.g. この, その, あの, どの, いわゆる, なんらかの, 何らかの, いろんな, こういう, そういう, ああいう, -# どういう, こんな, そんな, あんな, どんな, 大きな, 小さな, おかしな, ほんの, たいした, -# 「(, も) さる (ことながら)」, 微々たる, 堂々たる, 単なる, いかなる, 我が」「同じ, 亡き -#連体詞 -# -##### -# conjunction: Conjunctions that can occur independently. -# e.g. が, けれども, そして, じゃあ, それどころか -接続詞 -# -##### -# particle: unclassified particles. -助詞 -# -# particle-case: case particles where the subclassification is undefined. -助詞-格助詞 -# -# particle-case-misc: Case particles. -# e.g. から, が, で, と, に, へ, より, を, の, にて -助詞-格助詞-一般 -# -# particle-case-quote: the "to" that appears after nouns, a person’s speech, -# quotation marks, expressions of decisions from a meeting, reasons, judgements, -# conjectures, etc. -# e.g. ( だ) と (述べた.), ( である) と (して執行猶予...) -助詞-格助詞-引用 -# -# particle-case-compound: Compounds of particles and verbs that mainly behave -# like case particles. -# e.g. という, といった, とかいう, として, とともに, と共に, でもって, にあたって, に当たって, に当って, -# にあたり, に当たり, に当り, に当たる, にあたる, において, に於いて,に於て, における, に於ける, -# にかけ, にかけて, にかんし, に関し, にかんして, に関して, にかんする, に関する, に際し, -# に際して, にしたがい, に従い, に従う, にしたがって, に従って, にたいし, に対し, にたいして, -# に対して, にたいする, に対する, について, につき, につけ, につけて, につれ, につれて, にとって, -# にとり, にまつわる, によって, に依って, に因って, により, に依り, に因り, による, に依る, に因る, -# にわたって, にわたる, をもって, を以って, を通じ, を通じて, を通して, をめぐって, をめぐり, をめぐる, -# って-口語/, ちゅう-関西弁「という」/, (何) ていう (人)-口語/, っていう-口語/, といふ, とかいふ -助詞-格助詞-連語 -# -# particle-conjunctive: -# e.g. から, からには, が, けれど, けれども, けど, し, つつ, て, で, と, ところが, どころか, とも, ども, -# ながら, なり, ので, のに, ば, ものの, や ( した), やいなや, (ころん) じゃ(いけない)-口語/, -# (行っ) ちゃ(いけない)-口語/, (言っ) たって (しかたがない)-口語/, (それがなく)ったって (平気)-口語/ -助詞-接続助詞 -# -# particle-dependency: -# e.g. こそ, さえ, しか, すら, は, も, ぞ -助詞-係助詞 -# -# particle-adverbial: -# e.g. がてら, かも, くらい, 位, ぐらい, しも, (学校) じゃ(これが流行っている)-口語/, -# (それ)じゃあ (よくない)-口語/, ずつ, (私) なぞ, など, (私) なり (に), (先生) なんか (大嫌い)-口語/, -# (私) なんぞ, (先生) なんて (大嫌い)-口語/, のみ, だけ, (私) だって-口語/, だに, -# (彼)ったら-口語/, (お茶) でも (いかが), 等 (とう), (今後) とも, ばかり, ばっか-口語/, ばっかり-口語/, -# ほど, 程, まで, 迄, (誰) も (が)([助詞-格助詞] および [助詞-係助詞] の前に位置する「も」) -助詞-副助詞 -# -# particle-interjective: particles with interjective grammatical roles. -# e.g. (松島) や -助詞-間投助詞 -# -# particle-coordinate: -# e.g. と, たり, だの, だり, とか, なり, や, やら -助詞-並立助詞 -# -# particle-final: -# e.g. かい, かしら, さ, ぜ, (だ)っけ-口語/, (とまってる) で-方言/, な, ナ, なあ-口語/, ぞ, ね, ネ, -# ねぇ-口語/, ねえ-口語/, ねん-方言/, の, のう-口語/, や, よ, ヨ, よぉ-口語/, わ, わい-口語/ -助詞-終助詞 -# -# particle-adverbial/conjunctive/final: The particle "ka" when unknown whether it is -# adverbial, conjunctive, or sentence final. For example: -# (a) 「A か B か」. Ex:「(国内で運用する) か,(海外で運用する) か (.)」 -# (b) Inside an adverb phrase. Ex:「(幸いという) か (, 死者はいなかった.)」 -# 「(祈りが届いたせい) か (, 試験に合格した.)」 -# (c) 「かのように」. Ex:「(何もなかった) か (のように振る舞った.)」 -# e.g. か -助詞-副助詞/並立助詞/終助詞 -# -# particle-adnominalizer: The "no" that attaches to nouns and modifies -# non-inflectional words. -助詞-連体化 -# -# particle-adnominalizer: The "ni" and "to" that appear following nouns and adverbs -# that are giongo, giseigo, or gitaigo. -# e.g. に, と -助詞-副詞化 -# -# particle-special: A particle that does not fit into one of the above classifications. -# This includes particles that are used in Tanka, Haiku, and other poetry. -# e.g. かな, けむ, ( しただろう) に, (あんた) にゃ(わからん), (俺) ん (家) -助詞-特殊 -# -##### -# auxiliary-verb: -助動詞 -# -##### -# interjection: Greetings and other exclamations. -# e.g. おはよう, おはようございます, こんにちは, こんばんは, ありがとう, どうもありがとう, ありがとうございます, -# いただきます, ごちそうさま, さよなら, さようなら, はい, いいえ, ごめん, ごめんなさい -#感動詞 -# -##### -# symbol: unclassified Symbols. -記号 -# -# symbol-misc: A general symbol not in one of the categories below. -# e.g. [○◎@$〒→+] -記号-一般 -# -# symbol-comma: Commas -# e.g. [,、] -記号-読点 -# -# symbol-period: Periods and full stops. -# e.g. [..。] -記号-句点 -# -# symbol-space: Full-width whitespace. -記号-空白 -# -# symbol-open_bracket: -# e.g. [({‘“『【] -記号-括弧開 -# -# symbol-close_bracket: -# e.g. [)}’”』」】] -記号-括弧閉 -# -# symbol-alphabetic: -#記号-アルファベット -# -##### -# other: unclassified other -#その他 -# -# other-interjection: Words that are hard to classify as noun-suffixes or -# sentence-final particles. -# e.g. (だ)ァ -その他-間投 -# -##### -# filler: Aizuchi that occurs during a conversation or sounds inserted as filler. -# e.g. あの, うんと, えと -フィラー -# -##### -# non-verbal: non-verbal sound. -非言語音 -# -##### -# fragment: -#語断片 -# -##### -# unknown: unknown part of speech. -#未知語 -# -##### End of file diff --git a/archiver/solr-config-dir/lang/stopwords_ar.txt b/archiver/solr-config-dir/lang/stopwords_ar.txt deleted file mode 100644 index 046829d..0000000 --- a/archiver/solr-config-dir/lang/stopwords_ar.txt +++ /dev/null @@ -1,125 +0,0 @@ -# This file was created by Jacques Savoy and is distributed under the BSD license. -# See http://members.unine.ch/jacques.savoy/clef/index.html. -# Also see http://www.opensource.org/licenses/bsd-license.html -# Cleaned on October 11, 2009 (not normalized, so use before normalization) -# This means that when modifying this list, you might need to add some -# redundant entries, for example containing forms with both أ and ا -من -ومن -منها -منه -في -وفي -فيها -فيه -و -ف -ثم -او -أو -ب -بها -به -ا -أ -اى -اي -أي -أى -لا -ولا -الا -ألا -إلا -لكن -ما -وما -كما -فما -عن -مع -اذا -إذا -ان -أن -إن -انها -أنها -إنها -انه -أنه -إنه -بان -بأن -فان -فأن -وان -وأن -وإن -التى -التي -الذى -الذي -الذين -الى -الي -إلى -إلي -على -عليها -عليه -اما -أما -إما -ايضا -أيضا -كل -وكل -لم -ولم -لن -ولن -هى -هي -هو -وهى -وهي -وهو -فهى -فهي -فهو -انت -أنت -لك -لها -له -هذه -هذا -تلك -ذلك -هناك -كانت -كان -يكون -تكون -وكانت -وكان -غير -بعض -قد -نحو -بين -بينما -منذ -ضمن -حيث -الان -الآن -خلال -بعد -قبل -حتى -عند -عندما -لدى -جميع diff --git a/archiver/solr-config-dir/lang/stopwords_bg.txt b/archiver/solr-config-dir/lang/stopwords_bg.txt deleted file mode 100644 index 1ae4ba2..0000000 --- a/archiver/solr-config-dir/lang/stopwords_bg.txt +++ /dev/null @@ -1,193 +0,0 @@ -# This file was created by Jacques Savoy and is distributed under the BSD license. -# See http://members.unine.ch/jacques.savoy/clef/index.html. -# Also see http://www.opensource.org/licenses/bsd-license.html -а -аз -ако -ала -бе -без -беше -би -бил -била -били -било -близо -бъдат -бъде -бяха -в -вас -ваш -ваша -вероятно -вече -взема -ви -вие -винаги -все -всеки -всички -всичко -всяка -във -въпреки -върху -г -ги -главно -го -д -да -дали -до -докато -докога -дори -досега -доста -е -едва -един -ето -за -зад -заедно -заради -засега -затова -защо -защото -и -из -или -им -има -имат -иска -й -каза -как -каква -какво -както -какъв -като -кога -когато -което -които -кой -който -колко -която -къде -където -към -ли -м -ме -между -мен -ми -мнозина -мога -могат -може -моля -момента -му -н -на -над -назад -най -направи -напред -например -нас -не -него -нея -ни -ние -никой -нито -но -някои -някой -няма -обаче -около -освен -особено -от -отгоре -отново -още -пак -по -повече -повечето -под -поне -поради -после -почти -прави -пред -преди -през -при -пък -първо -с -са -само -се -сега -си -скоро -след -сме -според -сред -срещу -сте -съм -със -също -т -тази -така -такива -такъв -там -твой -те -тези -ти -тн -то -това -тогава -този -той -толкова -точно -трябва -тук -тъй -тя -тях -у -харесва -ч -че -често -чрез -ще -щом -я diff --git a/archiver/solr-config-dir/lang/stopwords_ca.txt b/archiver/solr-config-dir/lang/stopwords_ca.txt deleted file mode 100644 index 3da65de..0000000 --- a/archiver/solr-config-dir/lang/stopwords_ca.txt +++ /dev/null @@ -1,220 +0,0 @@ -# Catalan stopwords from http://github.com/vcl/cue.language (Apache 2 Licensed) -a -abans -ací -ah -així -això -al -als -aleshores -algun -alguna -algunes -alguns -alhora -allà -allí -allò -altra -altre -altres -amb -ambdós -ambdues -apa -aquell -aquella -aquelles -aquells -aquest -aquesta -aquestes -aquests -aquí -baix -cada -cadascú -cadascuna -cadascunes -cadascuns -com -contra -d'un -d'una -d'unes -d'uns -dalt -de -del -dels -des -després -dins -dintre -donat -doncs -durant -e -eh -el -els -em -en -encara -ens -entre -érem -eren -éreu -es -és -esta -està -estàvem -estaven -estàveu -esteu -et -etc -ets -fins -fora -gairebé -ha -han -has -havia -he -hem -heu -hi -ho -i -igual -iguals -ja -l'hi -la -les -li -li'n -llavors -m'he -ma -mal -malgrat -mateix -mateixa -mateixes -mateixos -me -mentre -més -meu -meus -meva -meves -molt -molta -moltes -molts -mon -mons -n'he -n'hi -ne -ni -no -nogensmenys -només -nosaltres -nostra -nostre -nostres -o -oh -oi -on -pas -pel -pels -per -però -perquè -poc -poca -pocs -poques -potser -propi -qual -quals -quan -quant -que -què -quelcom -qui -quin -quina -quines -quins -s'ha -s'han -sa -semblant -semblants -ses -seu -seus -seva -seva -seves -si -sobre -sobretot -sóc -solament -sols -son -són -sons -sota -sou -t'ha -t'han -t'he -ta -tal -també -tampoc -tan -tant -tanta -tantes -teu -teus -teva -teves -ton -tons -tot -tota -totes -tots -un -una -unes -uns -us -va -vaig -vam -van -vas -veu -vosaltres -vostra -vostre -vostres diff --git a/archiver/solr-config-dir/lang/stopwords_cz.txt b/archiver/solr-config-dir/lang/stopwords_cz.txt deleted file mode 100644 index 53c6097..0000000 --- a/archiver/solr-config-dir/lang/stopwords_cz.txt +++ /dev/null @@ -1,172 +0,0 @@ -a -s -k -o -i -u -v -z -dnes -cz -tímto -budeš -budem -byli -jseš -můj -svým -ta -tomto -tohle -tuto -tyto -jej -zda -proč -máte -tato -kam -tohoto -kdo -kteří -mi -nám -tom -tomuto -mít -nic -proto -kterou -byla -toho -protože -asi -ho -naši -napište -re -což -tím -takže -svých -její -svými -jste -aj -tu -tedy -teto -bylo -kde -ke -pravé -ji -nad -nejsou -či -pod -téma -mezi -přes -ty -pak -vám -ani -když -však -neg -jsem -tento -článku -články -aby -jsme -před -pta -jejich -byl -ještě -až -bez -také -pouze -první -vaše -která -nás -nový -tipy -pokud -může -strana -jeho -své -jiné -zprávy -nové -není -vás -jen -podle -zde -už -být -více -bude -již -než -který -by -které -co -nebo -ten -tak -má -při -od -po -jsou -jak -další -ale -si -se -ve -to -jako -za -zpět -ze -do -pro -je -na -atd -atp -jakmile -přičemž -já -on -ona -ono -oni -ony -my -vy -jí -ji -mě -mne -jemu -tomu -těm -těmu -němu -němuž -jehož -jíž -jelikož -jež -jakož -načež diff --git a/archiver/solr-config-dir/lang/stopwords_da.txt b/archiver/solr-config-dir/lang/stopwords_da.txt deleted file mode 100644 index 42e6145..0000000 --- a/archiver/solr-config-dir/lang/stopwords_da.txt +++ /dev/null @@ -1,110 +0,0 @@ - | From svn.tartarus.org/snowball/trunk/website/algorithms/danish/stop.txt - | This file is distributed under the BSD License. - | See http://snowball.tartarus.org/license.php - | Also see http://www.opensource.org/licenses/bsd-license.html - | - Encoding was converted to UTF-8. - | - This notice was added. - | - | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" - - | A Danish stop word list. Comments begin with vertical bar. Each stop - | word is at the start of a line. - - | This is a ranked list (commonest to rarest) of stopwords derived from - | a large text sample. - - -og | and -i | in -jeg | I -det | that (dem. pronoun)/it (pers. pronoun) -at | that (in front of a sentence)/to (with infinitive) -en | a/an -den | it (pers. pronoun)/that (dem. pronoun) -til | to/at/for/until/against/by/of/into, more -er | present tense of "to be" -som | who, as -på | on/upon/in/on/at/to/after/of/with/for, on -de | they -med | with/by/in, along -han | he -af | of/by/from/off/for/in/with/on, off -for | at/for/to/from/by/of/ago, in front/before, because -ikke | not -der | who/which, there/those -var | past tense of "to be" -mig | me/myself -sig | oneself/himself/herself/itself/themselves -men | but -et | a/an/one, one (number), someone/somebody/one -har | present tense of "to have" -om | round/about/for/in/a, about/around/down, if -vi | we -min | my -havde | past tense of "to have" -ham | him -hun | she -nu | now -over | over/above/across/by/beyond/past/on/about, over/past -da | then, when/as/since -fra | from/off/since, off, since -du | you -ud | out -sin | his/her/its/one's -dem | them -os | us/ourselves -op | up -man | you/one -hans | his -hvor | where -eller | or -hvad | what -skal | must/shall etc. -selv | myself/youself/herself/ourselves etc., even -her | here -alle | all/everyone/everybody etc. -vil | will (verb) -blev | past tense of "to stay/to remain/to get/to become" -kunne | could -ind | in -når | when -være | present tense of "to be" -dog | however/yet/after all -noget | something -ville | would -jo | you know/you see (adv), yes -deres | their/theirs -efter | after/behind/according to/for/by/from, later/afterwards -ned | down -skulle | should -denne | this -end | than -dette | this -mit | my/mine -også | also -under | under/beneath/below/during, below/underneath -have | have -dig | you -anden | other -hende | her -mine | my -alt | everything -meget | much/very, plenty of -sit | his, her, its, one's -sine | his, her, its, one's -vor | our -mod | against -disse | these -hvis | if -din | your/yours -nogle | some -hos | by/at -blive | be/become -mange | many -ad | by/through -bliver | present tense of "to be/to become" -hendes | her/hers -været | be -thi | for (conj) -jer | you -sådan | such, like this/like that diff --git a/archiver/solr-config-dir/lang/stopwords_de.txt b/archiver/solr-config-dir/lang/stopwords_de.txt deleted file mode 100644 index 86525e7..0000000 --- a/archiver/solr-config-dir/lang/stopwords_de.txt +++ /dev/null @@ -1,294 +0,0 @@ - | From svn.tartarus.org/snowball/trunk/website/algorithms/german/stop.txt - | This file is distributed under the BSD License. - | See http://snowball.tartarus.org/license.php - | Also see http://www.opensource.org/licenses/bsd-license.html - | - Encoding was converted to UTF-8. - | - This notice was added. - | - | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" - - | A German stop word list. Comments begin with vertical bar. Each stop - | word is at the start of a line. - - | The number of forms in this list is reduced significantly by passing it - | through the German stemmer. - - -aber | but - -alle | all -allem -allen -aller -alles - -als | than, as -also | so -am | an + dem -an | at - -ander | other -andere -anderem -anderen -anderer -anderes -anderm -andern -anderr -anders - -auch | also -auf | on -aus | out of -bei | by -bin | am -bis | until -bist | art -da | there -damit | with it -dann | then - -der | the -den -des -dem -die -das - -daß | that - -derselbe | the same -derselben -denselben -desselben -demselben -dieselbe -dieselben -dasselbe - -dazu | to that - -dein | thy -deine -deinem -deinen -deiner -deines - -denn | because - -derer | of those -dessen | of him - -dich | thee -dir | to thee -du | thou - -dies | this -diese -diesem -diesen -dieser -dieses - - -doch | (several meanings) -dort | (over) there - - -durch | through - -ein | a -eine -einem -einen -einer -eines - -einig | some -einige -einigem -einigen -einiger -einiges - -einmal | once - -er | he -ihn | him -ihm | to him - -es | it -etwas | something - -euer | your -eure -eurem -euren -eurer -eures - -für | for -gegen | towards -gewesen | p.p. of sein -hab | have -habe | have -haben | have -hat | has -hatte | had -hatten | had -hier | here -hin | there -hinter | behind - -ich | I -mich | me -mir | to me - - -ihr | you, to her -ihre -ihrem -ihren -ihrer -ihres -euch | to you - -im | in + dem -in | in -indem | while -ins | in + das -ist | is - -jede | each, every -jedem -jeden -jeder -jedes - -jene | that -jenem -jenen -jener -jenes - -jetzt | now -kann | can - -kein | no -keine -keinem -keinen -keiner -keines - -können | can -könnte | could -machen | do -man | one - -manche | some, many a -manchem -manchen -mancher -manches - -mein | my -meine -meinem -meinen -meiner -meines - -mit | with -muss | must -musste | had to -nach | to(wards) -nicht | not -nichts | nothing -noch | still, yet -nun | now -nur | only -ob | whether -oder | or -ohne | without -sehr | very - -sein | his -seine -seinem -seinen -seiner -seines - -selbst | self -sich | herself - -sie | they, she -ihnen | to them - -sind | are -so | so - -solche | such -solchem -solchen -solcher -solches - -soll | shall -sollte | should -sondern | but -sonst | else -über | over -um | about, around -und | and - -uns | us -unse -unsem -unsen -unser -unses - -unter | under -viel | much -vom | von + dem -von | from -vor | before -während | while -war | was -waren | were -warst | wast -was | what -weg | away, off -weil | because -weiter | further - -welche | which -welchem -welchen -welcher -welches - -wenn | when -werde | will -werden | will -wie | how -wieder | again -will | want -wir | we -wird | will -wirst | willst -wo | where -wollen | want -wollte | wanted -würde | would -würden | would -zu | to -zum | zu + dem -zur | zu + der -zwar | indeed -zwischen | between - diff --git a/archiver/solr-config-dir/lang/stopwords_el.txt b/archiver/solr-config-dir/lang/stopwords_el.txt deleted file mode 100644 index 232681f..0000000 --- a/archiver/solr-config-dir/lang/stopwords_el.txt +++ /dev/null @@ -1,78 +0,0 @@ -# Lucene Greek Stopwords list -# Note: by default this file is used after GreekLowerCaseFilter, -# so when modifying this file use 'σ' instead of 'ς' -ο -η -το -οι -τα -του -τησ -των -τον -την -και -κι -κ -ειμαι -εισαι -ειναι -ειμαστε -ειστε -στο -στον -στη -στην -μα -αλλα -απο -για -προσ -με -σε -ωσ -παρα -αντι -κατα -μετα -θα -να -δε -δεν -μη -μην -επι -ενω -εαν -αν -τοτε -που -πωσ -ποιοσ -ποια -ποιο -ποιοι -ποιεσ -ποιων -ποιουσ -αυτοσ -αυτη -αυτο -αυτοι -αυτων -αυτουσ -αυτεσ -αυτα -εκεινοσ -εκεινη -εκεινο -εκεινοι -εκεινεσ -εκεινα -εκεινων -εκεινουσ -οπωσ -ομωσ -ισωσ -οσο -οτι diff --git a/archiver/solr-config-dir/lang/stopwords_en.txt b/archiver/solr-config-dir/lang/stopwords_en.txt deleted file mode 100644 index 2c164c0..0000000 --- a/archiver/solr-config-dir/lang/stopwords_en.txt +++ /dev/null @@ -1,54 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# a couple of test stopwords to test that the words are really being -# configured from this file: -stopworda -stopwordb - -# Standard english stop words taken from Lucene's StopAnalyzer -a -an -and -are -as -at -be -but -by -for -if -in -into -is -it -no -not -of -on -or -such -that -the -their -then -there -these -they -this -to -was -will -with diff --git a/archiver/solr-config-dir/lang/stopwords_es.txt b/archiver/solr-config-dir/lang/stopwords_es.txt deleted file mode 100644 index 487d78c..0000000 --- a/archiver/solr-config-dir/lang/stopwords_es.txt +++ /dev/null @@ -1,356 +0,0 @@ - | From svn.tartarus.org/snowball/trunk/website/algorithms/spanish/stop.txt - | This file is distributed under the BSD License. - | See http://snowball.tartarus.org/license.php - | Also see http://www.opensource.org/licenses/bsd-license.html - | - Encoding was converted to UTF-8. - | - This notice was added. - | - | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" - - | A Spanish stop word list. Comments begin with vertical bar. Each stop - | word is at the start of a line. - - - | The following is a ranked list (commonest to rarest) of stopwords - | deriving from a large sample of text. - - | Extra words have been added at the end. - -de | from, of -la | the, her -que | who, that -el | the -en | in -y | and -a | to -los | the, them -del | de + el -se | himself, from him etc -las | the, them -por | for, by, etc -un | a -para | for -con | with -no | no -una | a -su | his, her -al | a + el - | es from SER -lo | him -como | how -más | more -pero | pero -sus | su plural -le | to him, her -ya | already -o | or - | fue from SER -este | this - | ha from HABER -sí | himself etc -porque | because -esta | this - | son from SER -entre | between - | está from ESTAR -cuando | when -muy | very -sin | without -sobre | on - | ser from SER - | tiene from TENER -también | also -me | me -hasta | until -hay | there is/are -donde | where - | han from HABER -quien | whom, that - | están from ESTAR - | estado from ESTAR -desde | from -todo | all -nos | us -durante | during - | estados from ESTAR -todos | all -uno | a -les | to them -ni | nor -contra | against -otros | other - | fueron from SER -ese | that -eso | that - | había from HABER -ante | before -ellos | they -e | and (variant of y) -esto | this -mí | me -antes | before -algunos | some -qué | what? -unos | a -yo | I -otro | other -otras | other -otra | other -él | he -tanto | so much, many -esa | that -estos | these -mucho | much, many -quienes | who -nada | nothing -muchos | many -cual | who - | sea from SER -poco | few -ella | she -estar | to be - | haber from HABER -estas | these - | estaba from ESTAR - | estamos from ESTAR -algunas | some -algo | something -nosotros | we - - | other forms - -mi | me -mis | mi plural -tú | thou -te | thee -ti | thee -tu | thy -tus | tu plural -ellas | they -nosotras | we -vosotros | you -vosotras | you -os | you -mío | mine -mía | -míos | -mías | -tuyo | thine -tuya | -tuyos | -tuyas | -suyo | his, hers, theirs -suya | -suyos | -suyas | -nuestro | ours -nuestra | -nuestros | -nuestras | -vuestro | yours -vuestra | -vuestros | -vuestras | -esos | those -esas | those - - | forms of estar, to be (not including the infinitive): -estoy -estás -está -estamos -estáis -están -esté -estés -estemos -estéis -estén -estaré -estarás -estará -estaremos -estaréis -estarán -estaría -estarías -estaríamos -estaríais -estarían -estaba -estabas -estábamos -estabais -estaban -estuve -estuviste -estuvo -estuvimos -estuvisteis -estuvieron -estuviera -estuvieras -estuviéramos -estuvierais -estuvieran -estuviese -estuvieses -estuviésemos -estuvieseis -estuviesen -estando -estado -estada -estados -estadas -estad - - | forms of haber, to have (not including the infinitive): -he -has -ha -hemos -habéis -han -haya -hayas -hayamos -hayáis -hayan -habré -habrás -habrá -habremos -habréis -habrán -habría -habrías -habríamos -habríais -habrían -había -habías -habíamos -habíais -habían -hube -hubiste -hubo -hubimos -hubisteis -hubieron -hubiera -hubieras -hubiéramos -hubierais -hubieran -hubiese -hubieses -hubiésemos -hubieseis -hubiesen -habiendo -habido -habida -habidos -habidas - - | forms of ser, to be (not including the infinitive): -soy -eres -es -somos -sois -son -sea -seas -seamos -seáis -sean -seré -serás -será -seremos -seréis -serán -sería -serías -seríamos -seríais -serían -era -eras -éramos -erais -eran -fui -fuiste -fue -fuimos -fuisteis -fueron -fuera -fueras -fuéramos -fuerais -fueran -fuese -fueses -fuésemos -fueseis -fuesen -siendo -sido - | sed also means 'thirst' - - | forms of tener, to have (not including the infinitive): -tengo -tienes -tiene -tenemos -tenéis -tienen -tenga -tengas -tengamos -tengáis -tengan -tendré -tendrás -tendrá -tendremos -tendréis -tendrán -tendría -tendrías -tendríamos -tendríais -tendrían -tenía -tenías -teníamos -teníais -tenían -tuve -tuviste -tuvo -tuvimos -tuvisteis -tuvieron -tuviera -tuvieras -tuviéramos -tuvierais -tuvieran -tuviese -tuvieses -tuviésemos -tuvieseis -tuviesen -teniendo -tenido -tenida -tenidos -tenidas -tened - diff --git a/archiver/solr-config-dir/lang/stopwords_et.txt b/archiver/solr-config-dir/lang/stopwords_et.txt deleted file mode 100644 index 1b06a13..0000000 --- a/archiver/solr-config-dir/lang/stopwords_et.txt +++ /dev/null @@ -1,1603 +0,0 @@ -# Estonian stopwords list -all -alla -allapoole -allpool -alt -altpoolt -eel -eespool -enne -hommikupoole -hoolimata -ilma -kaudu -keset -kesk -kohe -koos -kuhupoole -kuni -kuspool -kustpoolt -kõige -käsikäes -lappi -ligi -läbi -mööda -paitsi -peale -pealepoole -pealpool -pealt -pealtpoolt -piki -pikku -piku -pikuti -põiki -pärast -päri -risti -sealpool -sealtpoolt -seespool -seltsis -siiapoole -siinpool -siitpoolt -sinnapoole -sissepoole -taga -tagantpoolt -tagapidi -tagapool -taha -tahapoole -teispool -teispoole -tänu -tükkis -vaatamata -vastu -väljapoole -väljaspool -väljastpoolt -õhtupoole -ühes -ühestükis -ühestükkis -ülalpool -ülaltpoolt -üle -ülespoole -ülevalpool -ülevaltpoolt -ümber -ümbert -aegu -aegus -alguks -algul -algule -algult -alguni -all -alla -alt -alul -alutsi -arvel -asemel -asemele -eel -eeli -ees -eesotsas -eest -eestotsast -esitsi -ette -etteotsa -haaval -heaks -hoolimata -hulgas -hulgast -hulka -jalgu -jalus -jalust -jaoks -jooksul -juurde -juures -juurest -jälil -jälile -järel -järele -järelt -järgi -kaasas -kallal -kallale -kallalt -kamul -kannul -kannule -kannult -kaudu -kaupa -keskel -keskele -keskelt -keskis -keskpaiku -kestel -kestes -kilda -killas -killast -kimpu -kimpus -kiuste -kohal -kohale -kohalt -kohaselt -kohe -kohta -koos -korral -kukil -kukile -kukilt -kulul -kõrva -kõrval -kõrvale -kõrvalt -kõrvas -kõrvast -käekõrval -käekõrvale -käekõrvalt -käes -käest -kätte -külge -küljes -küljest -küüsi -küüsis -küüsist -ligi -ligidal -ligidale -ligidalt -aegu -aegus -alguks -algul -algule -algult -alguni -all -alla -alt -alul -alutsi -arvel -asemel -asemele -eel -eeli -ees -eesotsas -eest -eestotsast -esitsi -ette -etteotsa -haaval -heaks -hoolimata -hulgas -hulgast -hulka -jalgu -jalus -jalust -jaoks -jooksul -juurde -juures -juurest -jälil -jälile -järel -järele -järelt -järgi -kaasas -kallal -kallale -kallalt -kamul -kannul -kannule -kannult -kaudu -kaupa -keskel -keskele -keskelt -keskis -keskpaiku -kestel -kestes -kilda -killas -killast -kimpu -kimpus -kiuste -kohal -kohale -kohalt -kohaselt -kohe -kohta -koos -korral -kukil -kukile -kukilt -kulul -kõrva -kõrval -kõrvale -kõrvalt -kõrvas -kõrvast -käekõrval -käekõrvale -käekõrvalt -käes -käest -kätte -külge -küljes -küljest -küüsi -küüsis -küüsist -ligi -ligidal -ligidale -ligidalt -lool -läbi -lähedal -lähedale -lähedalt -man -mant -manu -meelest -mööda -nahas -nahka -nahkas -najal -najale -najalt -nõjal -nõjale -otsa -otsas -otsast -paigale -paigu -paiku -peal -peale -pealt -perra -perrä -pidi -pihta -piki -pikku -pool -poole -poolest -poolt -puhul -puksiiris -pähe -päralt -päras -pärast -päri -ringi -ringis -risust -saadetusel -saadik -saatel -saati -seas -seast -sees -seest -sekka -seljataga -seltsi -seltsis -seltsist -sisse -slepis -suhtes -šlepis -taga -tagant -tagantotsast -tagaotsas -tagaselja -tagasi -tagast -tagutsi -taha -tahaotsa -takka -tarvis -tasa -tuuri -tuuris -tõttu -tükkis -uhal -vaatamata -vahel -vahele -vahelt -vahepeal -vahepeale -vahepealt -vahetsi -varal -varale -varul -vastas -vastast -vastu -veerde -veeres -viisi -võidu -võrd -võrdki -võrra -võrragi -väel -väele -vältel -väärt -väärtki -äärde -ääre -ääres -äärest -ühes -üle -ümber -ümbert -a -abil -aina -ainult -alalt -alates -alati -alles -b -c -d -e -eales -ealeski -edasi -edaspidi -eelkõige -eemal -ei -eks -end -enda -enese -ennem -esialgu -f -g -h -hoopis -i -iganes -igatahes -igati -iial -iialgi -ikka -ikkagi -ilmaski -iseenda -iseenese -iseenesest -isegi -j -jah -ju -juba -juhul -just -järelikult -k -ka -kah -kas -kasvõi -keda -kestahes -kogu -koguni -kohati -kokku -kuhu -kuhugi -kuidagi -kuidas -kunagi -kus -kusagil -kusjuures -kuskil -kust -kõigepealt -küll -l -liiga -lisaks -m -miks -mil -millal -millalgi -mispärast -mistahes -mistõttu -mitte -muide -muidu -muidugi -muist -mujal -mujale -mujalt -mõlemad -mõnda -mõne -mõnikord -n -nii -niikaua -niimoodi -niipaljuke -niisama -niisiis -niivõrd -nõnda -nüüd -o -omaette -omakorda -omavahel -ometi -p -palju -paljuke -palju-palju -peaaegu -peagi -peamiselt -pigem -pisut -praegu -päris -r -rohkem -s -samas -samuti -seal -sealt -sedakorda -sedapuhku -seega -seejuures -seejärel -seekord -seepärast -seetõttu -sellepärast -seni -sestap -siia -siiani -siin -siinkohal -siis -siiski -siit -sinna -suht -š -z -ž -t -teel -teineteise -tõesti -täiesti -u -umbes -v -w -veel -veelgi -vist -võibolla -võib-olla -väga -vähemalt -välja -väljas -väljast -õ -ä -ära -ö -ü -ühtlasi -üksi -ükskõik -ülal -ülale -ülalt -üles -ülesse -üleval -ülevalt -ülimalt -üsna -x -y -aga -ega -ehk -ehkki -elik -ellik -enge -ennegu -ent -et -ja -justkui -kui -kuid -kuigi -kuivõrd -kuna -kuni -kut -mistab -muudkui -nagu -nigu -ning -olgugi -otsekui -otsenagu -selmet -sest -sestab -vaid -või -aa -adaa -adjöö -ae -ah -ahaa -ahah -ah-ah-ah -ah-haa -ahoi -ai -aidaa -aidu-raidu -aih -aijeh -aituma -aitäh -aitüma -ammuu -amps -ampsti -aptsih -ass -at -ata -at-at-at -atsih -atsihh -auh -bai-bai -bingo -braavo -brr -ee -eeh -eh -ehee -eheh -eh-eh-hee -eh-eh-ee -ehei -ehh -ehhee -einoh -ena -ennäe -ennäh -fuh -fui -fuih -haa -hah -hahaa -hah-hah-hah -halleluuja -hallo -halloo -hass -hee -heh -he-he-hee -hei -heldeke(ne) -heureka -hihii -hip-hip-hurraa -hmh -hmjah -hoh-hoh-hoo -hohoo -hoi -hollallaa -hoo -hoplaa -hopp -hops -hopsassaa -hopsti -hosianna -huh -huidii -huist -hurjah -hurjeh -hurjoh -hurjuh -hurraa -huu -hõhõh -hõi -hõissa -hõissassa -hõk -hõkk -häh -hä-hä-hää -hüvasti -ih-ah-haa -ih-ih-hii -ii-ha-ha -issake -issakene -isver -jaa-ah -ja-ah -jaah -janäe -jeeh -jeerum -jeever -jessas -jestas -juhhei -jumalaga -jumalime -jumaluke -jumalukene -jutas -kaaps -kaapsti -kaasike -kae -kalps -kalpsti -kannäe -kanäe -kappadi -kaps -kapsti -karkõmm -karkäuh -karkääks -karkääksti -karmauh -karmauhti -karnaps -karnapsti -karniuhti -karpartsaki -karpauh -karpauhti -karplauh -karplauhti -karprauh -karprauhti -karsumdi -karsumm -kartsumdi -kartsumm -karviuh -karviuhti -kaske -kassa -kauh -kauhti -keh -keksti -kepsti -khe -khm -kih -kiiks -kiiksti -kiis -kiiss -kikerii -kikerikii -kili -kilk -kilk-kõlk -kilks -kilks-kolks -kilks-kõlks -kill -killadi -killadi|-kolladi -killadi-kõlladi -killa-kolla -killa-kõlla -kill-kõll -kimps-komps -kipp -kips-kõps -kiriküüt -kirra-kõrra -kirr-kõrr -kirts -klaps -klapsti -klirdi -klirr -klonks -klops -klopsti -kluk -klu-kluu -klõks -klõksti -klõmdi -klõmm -klõmpsti -klõnks -klõnksti -klõps -klõpsti -kläu -kohva-kohva -kok -koks -koksti -kolaki -kolk -kolks -kolksti -koll -kolladi -komp -komps -kompsti -kop -kopp -koppadi -kops -kopsti -kossu -kotsu -kraa -kraak -kraaks -kraaps -kraapsti -krahh -kraks -kraksti -kraps -krapsti -krauh -krauhti -kriiks -kriiksti -kriips -kriips-kraaps -kripa-krõpa -krips-kraps -kriuh -kriuks -kriuksti -kromps -kronk -kronks -krooks -kruu -krõks -krõksti -krõpa -krõps -krõpsti -krõuh -kräu -kräuh -kräuhti -kräuks -kss -kukeleegu -kukku -kuku -kulu -kurluu -kurnäu -kuss -kussu -kõks -kõksti -kõldi -kõlks -kõlksti -kõll -kõmaki -kõmdi -kõmm -kõmps -kõpp -kõps -kõpsadi -kõpsat -kõpsti -kõrr -kõrra-kõrra -kõss -kõtt -kõõksti -kärr -kärts -kärtsti -käuks -käuksti -kääga -kääks -kääksti -köh -köki-möki -köksti -laks -laksti -lampsti -larts -lartsti -lats -latsti -leelo -legoo -lehva -liiri-lõõri -lika-lõka -likat-lõkat -limpsti -lips -lipsti -lirts -lirtsaki -lirtsti -lonksti -lops -lopsti -lorts -lortsti -luks -lups -lupsti -lurts -lurtsti -lõks -lõksti -lõmps -lõmpsti -lõnks -lõnksti -lärts -lärtsti -läts -lätsti -lörts -lörtsti -lötsti -lööps -lööpsti -marss -mats -matsti -mauh -mauhti -mh -mhh -mhmh -miau -mjaa -mkm -m-mh -mnjaa -mnjah -moens -mulks -mulksti -mull-mull -mull-mull-mull -muu -muuh -mõh -mõmm -mäh -mäts -mäu -mää -möh -möh-öh-ää -möö -müh-müh -mühüh -müks -müksti -müraki -mürr -mürts -mürtsaki -mürtsti -mütaku -müta-mäta -müta-müta -müt-müt -müt-müt-müt -müts -mütsti -mütt -naa -naah -nah -naks -naksti -nanuu -naps -napsti -nilpsti -nipsti -nirr -niuh -niuh-näuh -niuhti -noh -noksti -nolpsti -nonoh -nonoo -nonäh -noo -nooh -nooks -norr -nurr -nuuts -nõh -nõhh -nõka-nõka -nõks -nõksat-nõksat -nõks-nõks -nõksti -nõõ -nõõh -näeh -näh -nälpsti -nämm-nämm -näpsti -näts -nätsti -näu -näuh -näuhti -näuks -näuksti -nääh -nääks -nühkat-nühkat -oeh -oh -ohh -ohhh -oh-hoi -oh-hoo -ohoh -oh-oh-oo -oh-oh-hoo -ohoi -ohoo -oi -oih -oijee -oijeh -oo -ooh -oo-oh -oo-ohh -oot -ossa -ot -paa -pah -pahh -pakaa -pamm -pantsti -pardon -pardonks -parlartsti -parts -partsti -partsumdi -partsumm -pastoi -pats -patst -patsti -pau -pauh -pauhti -pele -pfui -phuh -phuuh -phäh -phähh -piiks -piip -piiri-pääri -pimm -pimm-pamm -pimm-pomm -pimm-põmm -piraki -piuks -piu-pau -plaks -plaksti -plarts -plartsti -plats -platsti -plauh -plauhh -plauhti -pliks -pliks-plaks -plinn -pliraki -plirts -plirtsti -pliu -pliuh -ploks -plotsti -plumps -plumpsti -plõks -plõksti -plõmdi -plõmm -plõnn -plärr -plärts -plärtsat -plärtsti -pläu -pläuh -plää -plörtsat -pomm -popp -pops -popsti -ports -pot -pots -potsti -pott -praks -praksti -prants -prantsaki -prantsti -prassai -prauh -prauhh -prauhti -priks -priuh -priuhh -priuh-prauh -proosit -proost -prr -prrr -prõks -prõksti -prõmdi -prõmm -prõntsti -prääk -prääks -pst -psst -ptrr -ptruu -ptüi -puh -puhh -puksti -pumm -pumps -pup-pup-pup -purts -puuh -põks -põksti -põmdi -põmm -põmmadi -põnks -põnn -põnnadi -põnt -põnts -põntsti -põraki -põrr -põrra-põrra -päh -pähh -päntsti -pää -pöörd -püh -raks -raksti -raps -rapsti -ratataa -rauh -riips -riipsti -riks -riks-raks -rips-raps -rivitult -robaki -rops -ropsaki -ropsti -ruik -räntsti -räts -röh -röhh -sah -sahh -sahkat -saps -sapsti -sauh -sauhti -servus -sihkadi-sahkadi -sihka-sahka -sihkat-sahkat -silks -silk-solk -sips -sipsti -sirr -sirr-sorr -sirts -sirtsti -siu -siuh -siuh-sauh -siuh-säuh -siuhti -siuks -siuts -skool -so -soh -solks -solksti -solpsti -soo -sooh -so-oh -soo-oh -sopp -sops -sopsti -sorr -sorts -sortsti -so-soo -soss -soss-soss -ss -sss -sst -stopp -suhkat-sahkat -sulk -sulks -sulksti -sull -sulla-sulla -sulpa-sulpa -sulps -sulpsti -sumaki -sumdi -summ -summat-summat -sups -supsaku -supsti -surts -surtsti -suss -susti -suts -sutsti -säh -sähke -särts -särtsti -säu -säuh -säuhti -taevake -taevakene -takk -tere -terekest -tibi-tibi -tikk-takk -tiks -tilk -tilks -till -tilla-talla -till-tall -tilulii -tinn -tip -tip-tap -tirr -tirtsti -tiu -tjaa -tjah -tohhoh -tohhoo -tohoh -tohoo -tok -tokk -toks -toksti -tonks -tonksti -tota -totsti -tot-tot -tprr -tpruu -trah -trahh -trallallaa -trill -trillallaa -trr -trrr -tsah -tsahh -tsilk -tsilk-tsolk -tsirr -tsiuh -tskae -tsolk -tss -tst -tsst -tsuhh -tsuk -tsumm -tsurr -tsäuh -tšao -tšš -tššš -tuk -tuks -turts -turtsti -tutki -tutkit -tutu-lutu -tutulutu -tuut -tuutu-luutu -tõks -tötsti -tümps -uh -uhh -uh-huu -uhtsa -uhtsaa -uhuh -uhuu -ui -uih -uih-aih -uijah -uijeh -uist -uit -uka -upsti -uraa -urjah -urjeh -urjoh -urjuh -urr -urraa -ust -utu -uu -uuh -vaak -vaat -vae -vaeh -vai -vat -vau -vhüüt -vidiit -viiks -vilks -vilksti -vinki-vinki -virdi -virr -viu -viudi -viuh -viuhti -voeh -voh -vohh -volks -volksti -vooh -vops -vopsti -vot -vuh -vuhti -vuih -vulks -vulksti -vull -vulpsti -vups -vupsaki -vupsaku -vupsti -vurdi -vurr -vurra-vurra -vurts -vurtsti -vutt -võe -võeh -või -võih -võrr -võts -võtt -vääks -õe -õits -õk -õkk -õrr -õss -õuh -äh -ähh -ähhähhää -äh-hää -äh-äh-hää -äiu -äiu-ää -äss -ää -ääh -äähh -öh -öhh -ök -üh -eelmine -eikeegi -eimiski -emb-kumb -enam -enim -iga -igasugune -igaüks -ise -isesugune -järgmine -keegi -kes -kumb -kumbki -kõik -meiesugune -meietaoline -midagi -mihuke -mihukene -milletaoline -milline -mina -minake -mingi -mingisugune -minusugune -minutaoline -mis -miski -miskisugune -missugune -misuke -mitmes -mitmesugune -mitu -mitu-mitu -mitu-setu -muu -mõlema -mõnesugune -mõni -mõningane -mõningas -mäherdune -määrane -naasugune -need -nemad -nendesugune -nendetaoline -nihuke -nihukene -niimitu -niisamasugune -niisugune -nisuke -nisukene -oma -omaenese -omasugune -omataoline -pool -praegune -sama -samasugune -samataoline -see -seesama -seesamane -seesamune -seesinane -seesugune -selline -sihuke -sihukene -sina -sinusugune -sinutaoline -siuke -siukene -säherdune -säärane -taoline -teiesugune -teine -teistsugune -tema -temake -temakene -temasugune -temataoline -too -toosama -toosamane -üks -üksteise -hakkama -minema -olema -pidama -saama -tegema -tulema -võima diff --git a/archiver/solr-config-dir/lang/stopwords_eu.txt b/archiver/solr-config-dir/lang/stopwords_eu.txt deleted file mode 100644 index 25f1db9..0000000 --- a/archiver/solr-config-dir/lang/stopwords_eu.txt +++ /dev/null @@ -1,99 +0,0 @@ -# example set of basque stopwords -al -anitz -arabera -asko -baina -bat -batean -batek -bati -batzuei -batzuek -batzuetan -batzuk -bera -beraiek -berau -berauek -bere -berori -beroriek -beste -bezala -da -dago -dira -ditu -du -dute -edo -egin -ere -eta -eurak -ez -gainera -gu -gutxi -guzti -haiei -haiek -haietan -hainbeste -hala -han -handik -hango -hara -hari -hark -hartan -hau -hauei -hauek -hauetan -hemen -hemendik -hemengo -hi -hona -honek -honela -honetan -honi -hor -hori -horiei -horiek -horietan -horko -horra -horrek -horrela -horretan -horri -hortik -hura -izan -ni -noiz -nola -non -nondik -nongo -nor -nora -ze -zein -zen -zenbait -zenbat -zer -zergatik -ziren -zituen -zu -zuek -zuen -zuten diff --git a/archiver/solr-config-dir/lang/stopwords_fa.txt b/archiver/solr-config-dir/lang/stopwords_fa.txt deleted file mode 100644 index 723641c..0000000 --- a/archiver/solr-config-dir/lang/stopwords_fa.txt +++ /dev/null @@ -1,313 +0,0 @@ -# This file was created by Jacques Savoy and is distributed under the BSD license. -# See http://members.unine.ch/jacques.savoy/clef/index.html. -# Also see http://www.opensource.org/licenses/bsd-license.html -# Note: by default this file is used after normalization, so when adding entries -# to this file, use the arabic 'ي' instead of 'ی' -انان -نداشته -سراسر -خياه -ايشان -وي -تاكنون -بيشتري -دوم -پس -ناشي -وگو -يا -داشتند -سپس -هنگام -هرگز -پنج -نشان -امسال -ديگر -گروهي -شدند -چطور -ده -و -دو -نخستين -ولي -چرا -چه -وسط -ه -كدام -قابل -يك -رفت -هفت -همچنين -در -هزار -بله -بلي -شايد -اما -شناسي -گرفته -دهد -داشته -دانست -داشتن -خواهيم -ميليارد -وقتيكه -امد -خواهد -جز -اورده -شده -بلكه -خدمات -شدن -برخي -نبود -بسياري -جلوگيري -حق -كردند -نوعي -بعري -نكرده -نظير -نبايد -بوده -بودن -داد -اورد -هست -جايي -شود -دنبال -داده -بايد -سابق -هيچ -همان -انجا -كمتر -كجاست -گردد -كسي -تر -مردم -تان -دادن -بودند -سري -جدا -ندارند -مگر -يكديگر -دارد -دهند -بنابراين -هنگامي -سمت -جا -انچه -خود -دادند -زياد -دارند -اثر -بدون -بهترين -بيشتر -البته -به -براساس -بيرون -كرد -بعضي -گرفت -توي -اي -ميليون -او -جريان -تول -بر -مانند -برابر -باشيم -مدتي -گويند -اكنون -تا -تنها -جديد -چند -بي -نشده -كردن -كردم -گويد -كرده -كنيم -نمي -نزد -روي -قصد -فقط -بالاي -ديگران -اين -ديروز -توسط -سوم -ايم -دانند -سوي -استفاده -شما -كنار -داريم -ساخته -طور -امده -رفته -نخست -بيست -نزديك -طي -كنيد -از -انها -تمامي -داشت -يكي -طريق -اش -چيست -روب -نمايد -گفت -چندين -چيزي -تواند -ام -ايا -با -ان -ايد -ترين -اينكه -ديگري -راه -هايي -بروز -همچنان -پاعين -كس -حدود -مختلف -مقابل -چيز -گيرد -ندارد -ضد -همچون -سازي -شان -مورد -باره -مرسي -خويش -برخوردار -چون -خارج -شش -هنوز -تحت -ضمن -هستيم -گفته -فكر -بسيار -پيش -براي -روزهاي -انكه -نخواهد -بالا -كل -وقتي -كي -چنين -كه -گيري -نيست -است -كجا -كند -نيز -يابد -بندي -حتي -توانند -عقب -خواست -كنند -بين -تمام -همه -ما -باشند -مثل -شد -اري -باشد -اره -طبق -بعد -اگر -صورت -غير -جاي -بيش -ريزي -اند -زيرا -چگونه -بار -لطفا -مي -درباره -من -ديده -همين -گذاري -برداري -علت -گذاشته -هم -فوق -نه -ها -شوند -اباد -همواره -هر -اول -خواهند -چهار -نام -امروز -مان -هاي -قبل -كنم -سعي -تازه -را -هستند -زير -جلوي -عنوان -بود diff --git a/archiver/solr-config-dir/lang/stopwords_fi.txt b/archiver/solr-config-dir/lang/stopwords_fi.txt deleted file mode 100644 index 4372c9a..0000000 --- a/archiver/solr-config-dir/lang/stopwords_fi.txt +++ /dev/null @@ -1,97 +0,0 @@ - | From svn.tartarus.org/snowball/trunk/website/algorithms/finnish/stop.txt - | This file is distributed under the BSD License. - | See http://snowball.tartarus.org/license.php - | Also see http://www.opensource.org/licenses/bsd-license.html - | - Encoding was converted to UTF-8. - | - This notice was added. - | - | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" - -| forms of BE - -olla -olen -olet -on -olemme -olette -ovat -ole | negative form - -oli -olisi -olisit -olisin -olisimme -olisitte -olisivat -olit -olin -olimme -olitte -olivat -ollut -olleet - -en | negation -et -ei -emme -ette -eivät - -|Nom Gen Acc Part Iness Elat Illat Adess Ablat Allat Ess Trans -minä minun minut minua minussa minusta minuun minulla minulta minulle | I -sinä sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle | you -hän hänen hänet häntä hänessä hänestä häneen hänellä häneltä hänelle | he she -me meidän meidät meitä meissä meistä meihin meillä meiltä meille | we -te teidän teidät teitä teissä teistä teihin teillä teiltä teille | you -he heidän heidät heitä heissä heistä heihin heillä heiltä heille | they - -tämä tämän tätä tässä tästä tähän tallä tältä tälle tänä täksi | this -tuo tuon tuotä tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi | that -se sen sitä siinä siitä siihen sillä siltä sille sinä siksi | it -nämä näiden näitä näissä näistä näihin näillä näiltä näille näinä näiksi | these -nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi | those -ne niiden niitä niissä niistä niihin niillä niiltä niille niinä niiksi | they - -kuka kenen kenet ketä kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi| who -ketkä keiden ketkä keitä keissä keistä keihin keillä keiltä keille keinä keiksi | (pl) -mikä minkä minkä mitä missä mistä mihin millä miltä mille minä miksi | which what -mitkä | (pl) - -joka jonka jota jossa josta johon jolla jolta jolle jona joksi | who which -jotka joiden joita joissa joista joihin joilla joilta joille joina joiksi | (pl) - -| conjunctions - -että | that -ja | and -jos | if -koska | because -kuin | than -mutta | but -niin | so -sekä | and -sillä | for -tai | or -vaan | but -vai | or -vaikka | although - - -| prepositions - -kanssa | with -mukaan | according to -noin | about -poikki | across -yli | over, across - -| other - -kun | when -niin | so -nyt | now -itse | self - diff --git a/archiver/solr-config-dir/lang/stopwords_fr.txt b/archiver/solr-config-dir/lang/stopwords_fr.txt deleted file mode 100644 index 749abae..0000000 --- a/archiver/solr-config-dir/lang/stopwords_fr.txt +++ /dev/null @@ -1,186 +0,0 @@ - | From svn.tartarus.org/snowball/trunk/website/algorithms/french/stop.txt - | This file is distributed under the BSD License. - | See http://snowball.tartarus.org/license.php - | Also see http://www.opensource.org/licenses/bsd-license.html - | - Encoding was converted to UTF-8. - | - This notice was added. - | - | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" - - | A French stop word list. Comments begin with vertical bar. Each stop - | word is at the start of a line. - -au | a + le -aux | a + les -avec | with -ce | this -ces | these -dans | with -de | of -des | de + les -du | de + le -elle | she -en | `of them' etc -et | and -eux | them -il | he -je | I -la | the -le | the -leur | their -lui | him -ma | my (fem) -mais | but -me | me -même | same; as in moi-même (myself) etc -mes | me (pl) -moi | me -mon | my (masc) -ne | not -nos | our (pl) -notre | our -nous | we -on | one -ou | where -par | by -pas | not -pour | for -qu | que before vowel -que | that -qui | who -sa | his, her (fem) -se | oneself -ses | his (pl) -son | his, her (masc) -sur | on -ta | thy (fem) -te | thee -tes | thy (pl) -toi | thee -ton | thy (masc) -tu | thou -un | a -une | a -vos | your (pl) -votre | your -vous | you - - | single letter forms - -c | c' -d | d' -j | j' -l | l' -à | to, at -m | m' -n | n' -s | s' -t | t' -y | there - - | forms of être (not including the infinitive): -été -étée -étées -étés -étant -suis -es -est -sommes -êtes -sont -serai -seras -sera -serons -serez -seront -serais -serait -serions -seriez -seraient -étais -était -étions -étiez -étaient -fus -fut -fûmes -fûtes -furent -sois -soit -soyons -soyez -soient -fusse -fusses -fût -fussions -fussiez -fussent - - | forms of avoir (not including the infinitive): -ayant -eu -eue -eues -eus -ai -as -avons -avez -ont -aurai -auras -aura -aurons -aurez -auront -aurais -aurait -aurions -auriez -auraient -avais -avait -avions -aviez -avaient -eut -eûmes -eûtes -eurent -aie -aies -ait -ayons -ayez -aient -eusse -eusses -eût -eussions -eussiez -eussent - - | Later additions (from Jean-Christophe Deschamps) -ceci | this -cela | that -celà | that -cet | this -cette | this -ici | here -ils | they -les | the (pl) -leurs | their (pl) -quel | which -quels | which -quelle | which -quelles | which -sans | without -soi | oneself - diff --git a/archiver/solr-config-dir/lang/stopwords_ga.txt b/archiver/solr-config-dir/lang/stopwords_ga.txt deleted file mode 100644 index 9ff88d7..0000000 --- a/archiver/solr-config-dir/lang/stopwords_ga.txt +++ /dev/null @@ -1,110 +0,0 @@ - -a -ach -ag -agus -an -aon -ar -arna -as -b' -ba -beirt -bhúr -caoga -ceathair -ceathrar -chomh -chtó -chuig -chun -cois -céad -cúig -cúigear -d' -daichead -dar -de -deich -deichniúr -den -dhá -do -don -dtí -dá -dár -dó -faoi -faoin -faoina -faoinár -fara -fiche -gach -gan -go -gur -haon -hocht -i -iad -idir -in -ina -ins -inár -is -le -leis -lena -lenár -m' -mar -mo -mé -na -nach -naoi -naonúr -ná -ní -níor -nó -nócha -ocht -ochtar -os -roimh -sa -seacht -seachtar -seachtó -seasca -seisear -siad -sibh -sinn -sna -sé -sí -tar -thar -thú -triúr -trí -trína -trínár -tríocha -tú -um -ár -é -éis -í -ó -ón -óna -ónár diff --git a/archiver/solr-config-dir/lang/stopwords_gl.txt b/archiver/solr-config-dir/lang/stopwords_gl.txt deleted file mode 100644 index d8760b1..0000000 --- a/archiver/solr-config-dir/lang/stopwords_gl.txt +++ /dev/null @@ -1,161 +0,0 @@ -# galican stopwords -a -aínda -alí -aquel -aquela -aquelas -aqueles -aquilo -aquí -ao -aos -as -así -á -ben -cando -che -co -coa -comigo -con -connosco -contigo -convosco -coas -cos -cun -cuns -cunha -cunhas -da -dalgunha -dalgunhas -dalgún -dalgúns -das -de -del -dela -delas -deles -desde -deste -do -dos -dun -duns -dunha -dunhas -e -el -ela -elas -eles -en -era -eran -esa -esas -ese -eses -esta -estar -estaba -está -están -este -estes -estiven -estou -eu -é -facer -foi -foron -fun -había -hai -iso -isto -la -las -lle -lles -lo -los -mais -me -meu -meus -min -miña -miñas -moi -na -nas -neste -nin -no -non -nos -nosa -nosas -noso -nosos -nós -nun -nunha -nuns -nunhas -o -os -ou -ó -ós -para -pero -pode -pois -pola -polas -polo -polos -por -que -se -senón -ser -seu -seus -sexa -sido -sobre -súa -súas -tamén -tan -te -ten -teñen -teño -ter -teu -teus -ti -tido -tiña -tiven -túa -túas -un -unha -unhas -uns -vos -vosa -vosas -voso -vosos -vós diff --git a/archiver/solr-config-dir/lang/stopwords_hi.txt b/archiver/solr-config-dir/lang/stopwords_hi.txt deleted file mode 100644 index 86286bb..0000000 --- a/archiver/solr-config-dir/lang/stopwords_hi.txt +++ /dev/null @@ -1,235 +0,0 @@ -# Also see http://www.opensource.org/licenses/bsd-license.html -# See http://members.unine.ch/jacques.savoy/clef/index.html. -# This file was created by Jacques Savoy and is distributed under the BSD license. -# Note: by default this file also contains forms normalized by HindiNormalizer -# for spelling variation (see section below), such that it can be used whether or -# not you enable that feature. When adding additional entries to this list, -# please add the normalized form as well. -अंदर -अत -अपना -अपनी -अपने -अभी -आदि -आप -इत्यादि -इन -इनका -इन्हीं -इन्हें -इन्हों -इस -इसका -इसकी -इसके -इसमें -इसी -इसे -उन -उनका -उनकी -उनके -उनको -उन्हीं -उन्हें -उन्हों -उस -उसके -उसी -उसे -एक -एवं -एस -ऐसे -और -कई -कर -करता -करते -करना -करने -करें -कहते -कहा -का -काफ़ी -कि -कितना -किन्हें -किन्हों -किया -किर -किस -किसी -किसे -की -कुछ -कुल -के -को -कोई -कौन -कौनसा -गया -घर -जब -जहाँ -जा -जितना -जिन -जिन्हें -जिन्हों -जिस -जिसे -जीधर -जैसा -जैसे -जो -तक -तब -तरह -तिन -तिन्हें -तिन्हों -तिस -तिसे -तो -था -थी -थे -दबारा -दिया -दुसरा -दूसरे -दो -द्वारा -न -नहीं -ना -निहायत -नीचे -ने -पर -पर -पहले -पूरा -पे -फिर -बनी -बही -बहुत -बाद -बाला -बिलकुल -भी -भीतर -मगर -मानो -मे -में -यदि -यह -यहाँ -यही -या -यिह -ये -रखें -रहा -रहे -ऱ्वासा -लिए -लिये -लेकिन -व -वर्ग -वह -वह -वहाँ -वहीं -वाले -वुह -वे -वग़ैरह -संग -सकता -सकते -सबसे -सभी -साथ -साबुत -साभ -सारा -से -सो -ही -हुआ -हुई -हुए -है -हैं -हो -होता -होती -होते -होना -होने -# additional normalized forms of the above -अपनि -जेसे -होति -सभि -तिंहों -इंहों -दवारा -इसि -किंहें -थि -उंहों -ओर -जिंहें -वहिं -अभि -बनि -हि -उंहिं -उंहें -हें -वगेरह -एसे -रवासा -कोन -निचे -काफि -उसि -पुरा -भितर -हे -बहि -वहां -कोइ -यहां -जिंहों -तिंहें -किसि -कइ -यहि -इंहिं -जिधर -इंहें -अदि -इतयादि -हुइ -कोनसा -इसकि -दुसरे -जहां -अप -किंहों -उनकि -भि -वरग -हुअ -जेसा -नहिं diff --git a/archiver/solr-config-dir/lang/stopwords_hu.txt b/archiver/solr-config-dir/lang/stopwords_hu.txt deleted file mode 100644 index 37526da..0000000 --- a/archiver/solr-config-dir/lang/stopwords_hu.txt +++ /dev/null @@ -1,211 +0,0 @@ - | From svn.tartarus.org/snowball/trunk/website/algorithms/hungarian/stop.txt - | This file is distributed under the BSD License. - | See http://snowball.tartarus.org/license.php - | Also see http://www.opensource.org/licenses/bsd-license.html - | - Encoding was converted to UTF-8. - | - This notice was added. - | - | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" - -| Hungarian stop word list -| prepared by Anna Tordai - -a -ahogy -ahol -aki -akik -akkor -alatt -által -általában -amely -amelyek -amelyekben -amelyeket -amelyet -amelynek -ami -amit -amolyan -amíg -amikor -át -abban -ahhoz -annak -arra -arról -az -azok -azon -azt -azzal -azért -aztán -azután -azonban -bár -be -belül -benne -cikk -cikkek -cikkeket -csak -de -e -eddig -egész -egy -egyes -egyetlen -egyéb -egyik -egyre -ekkor -el -elég -ellen -elő -először -előtt -első -én -éppen -ebben -ehhez -emilyen -ennek -erre -ez -ezt -ezek -ezen -ezzel -ezért -és -fel -felé -hanem -hiszen -hogy -hogyan -igen -így -illetve -ill. -ill -ilyen -ilyenkor -ison -ismét -itt -jó -jól -jobban -kell -kellett -keresztül -keressünk -ki -kívül -között -közül -legalább -lehet -lehetett -legyen -lenne -lenni -lesz -lett -maga -magát -majd -majd -már -más -másik -meg -még -mellett -mert -mely -melyek -mi -mit -míg -miért -milyen -mikor -minden -mindent -mindenki -mindig -mint -mintha -mivel -most -nagy -nagyobb -nagyon -ne -néha -nekem -neki -nem -néhány -nélkül -nincs -olyan -ott -össze -ő -ők -őket -pedig -persze -rá -s -saját -sem -semmi -sok -sokat -sokkal -számára -szemben -szerint -szinte -talán -tehát -teljes -tovább -továbbá -több -úgy -ugyanis -új -újabb -újra -után -utána -utolsó -vagy -vagyis -valaki -valami -valamint -való -vagyok -van -vannak -volt -voltam -voltak -voltunk -vissza -vele -viszont -volna diff --git a/archiver/solr-config-dir/lang/stopwords_hy.txt b/archiver/solr-config-dir/lang/stopwords_hy.txt deleted file mode 100644 index 60c1c50..0000000 --- a/archiver/solr-config-dir/lang/stopwords_hy.txt +++ /dev/null @@ -1,46 +0,0 @@ -# example set of Armenian stopwords. -այդ -այլ -այն -այս -դու -դուք -եմ -են -ենք -ես -եք -է -էի -էին -էինք -էիր -էիք -էր -ըստ -թ -ի -ին -իսկ -իր -կամ -համար -հետ -հետո -մենք -մեջ -մի -ն -նա -նաև -նրա -նրանք -որ -որը -որոնք -որպես -ու -ում -պիտի -վրա -և diff --git a/archiver/solr-config-dir/lang/stopwords_id.txt b/archiver/solr-config-dir/lang/stopwords_id.txt deleted file mode 100644 index 4617f83..0000000 --- a/archiver/solr-config-dir/lang/stopwords_id.txt +++ /dev/null @@ -1,359 +0,0 @@ -# from appendix D of: A Study of Stemming Effects on Information -# Retrieval in Bahasa Indonesia -ada -adanya -adalah -adapun -agak -agaknya -agar -akan -akankah -akhirnya -aku -akulah -amat -amatlah -anda -andalah -antar -diantaranya -antara -antaranya -diantara -apa -apaan -mengapa -apabila -apakah -apalagi -apatah -atau -ataukah -ataupun -bagai -bagaikan -sebagai -sebagainya -bagaimana -bagaimanapun -sebagaimana -bagaimanakah -bagi -bahkan -bahwa -bahwasanya -sebaliknya -banyak -sebanyak -beberapa -seberapa -begini -beginian -beginikah -beginilah -sebegini -begitu -begitukah -begitulah -begitupun -sebegitu -belum -belumlah -sebelum -sebelumnya -sebenarnya -berapa -berapakah -berapalah -berapapun -betulkah -sebetulnya -biasa -biasanya -bila -bilakah -bisa -bisakah -sebisanya -boleh -bolehkah -bolehlah -buat -bukan -bukankah -bukanlah -bukannya -cuma -percuma -dahulu -dalam -dan -dapat -dari -daripada -dekat -demi -demikian -demikianlah -sedemikian -dengan -depan -di -dia -dialah -dini -diri -dirinya -terdiri -dong -dulu -enggak -enggaknya -entah -entahlah -terhadap -terhadapnya -hal -hampir -hanya -hanyalah -harus -haruslah -harusnya -seharusnya -hendak -hendaklah -hendaknya -hingga -sehingga -ia -ialah -ibarat -ingin -inginkah -inginkan -ini -inikah -inilah -itu -itukah -itulah -jangan -jangankan -janganlah -jika -jikalau -juga -justru -kala -kalau -kalaulah -kalaupun -kalian -kami -kamilah -kamu -kamulah -kan -kapan -kapankah -kapanpun -dikarenakan -karena -karenanya -ke -kecil -kemudian -kenapa -kepada -kepadanya -ketika -seketika -khususnya -kini -kinilah -kiranya -sekiranya -kita -kitalah -kok -lagi -lagian -selagi -lah -lain -lainnya -melainkan -selaku -lalu -melalui -terlalu -lama -lamanya -selama -selama -selamanya -lebih -terlebih -bermacam -macam -semacam -maka -makanya -makin -malah -malahan -mampu -mampukah -mana -manakala -manalagi -masih -masihkah -semasih -masing -mau -maupun -semaunya -memang -mereka -merekalah -meski -meskipun -semula -mungkin -mungkinkah -nah -namun -nanti -nantinya -nyaris -oleh -olehnya -seorang -seseorang -pada -padanya -padahal -paling -sepanjang -pantas -sepantasnya -sepantasnyalah -para -pasti -pastilah -per -pernah -pula -pun -merupakan -rupanya -serupa -saat -saatnya -sesaat -saja -sajalah -saling -bersama -sama -sesama -sambil -sampai -sana -sangat -sangatlah -saya -sayalah -se -sebab -sebabnya -sebuah -tersebut -tersebutlah -sedang -sedangkan -sedikit -sedikitnya -segala -segalanya -segera -sesegera -sejak -sejenak -sekali -sekalian -sekalipun -sesekali -sekaligus -sekarang -sekarang -sekitar -sekitarnya -sela -selain -selalu -seluruh -seluruhnya -semakin -sementara -sempat -semua -semuanya -sendiri -sendirinya -seolah -seperti -sepertinya -sering -seringnya -serta -siapa -siapakah -siapapun -disini -disinilah -sini -sinilah -sesuatu -sesuatunya -suatu -sesudah -sesudahnya -sudah -sudahkah -sudahlah -supaya -tadi -tadinya -tak -tanpa -setelah -telah -tentang -tentu -tentulah -tentunya -tertentu -seterusnya -tapi -tetapi -setiap -tiap -setidaknya -tidak -tidakkah -tidaklah -toh -waduh -wah -wahai -sewaktu -walau -walaupun -wong -yaitu -yakni -yang diff --git a/archiver/solr-config-dir/lang/stopwords_it.txt b/archiver/solr-config-dir/lang/stopwords_it.txt deleted file mode 100644 index 1219cc7..0000000 --- a/archiver/solr-config-dir/lang/stopwords_it.txt +++ /dev/null @@ -1,303 +0,0 @@ - | From svn.tartarus.org/snowball/trunk/website/algorithms/italian/stop.txt - | This file is distributed under the BSD License. - | See http://snowball.tartarus.org/license.php - | Also see http://www.opensource.org/licenses/bsd-license.html - | - Encoding was converted to UTF-8. - | - This notice was added. - | - | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" - - | An Italian stop word list. Comments begin with vertical bar. Each stop - | word is at the start of a line. - -ad | a (to) before vowel -al | a + il -allo | a + lo -ai | a + i -agli | a + gli -all | a + l' -agl | a + gl' -alla | a + la -alle | a + le -con | with -col | con + il -coi | con + i (forms collo, cogli etc are now very rare) -da | from -dal | da + il -dallo | da + lo -dai | da + i -dagli | da + gli -dall | da + l' -dagl | da + gll' -dalla | da + la -dalle | da + le -di | of -del | di + il -dello | di + lo -dei | di + i -degli | di + gli -dell | di + l' -degl | di + gl' -della | di + la -delle | di + le -in | in -nel | in + el -nello | in + lo -nei | in + i -negli | in + gli -nell | in + l' -negl | in + gl' -nella | in + la -nelle | in + le -su | on -sul | su + il -sullo | su + lo -sui | su + i -sugli | su + gli -sull | su + l' -sugl | su + gl' -sulla | su + la -sulle | su + le -per | through, by -tra | among -contro | against -io | I -tu | thou -lui | he -lei | she -noi | we -voi | you -loro | they -mio | my -mia | -miei | -mie | -tuo | -tua | -tuoi | thy -tue | -suo | -sua | -suoi | his, her -sue | -nostro | our -nostra | -nostri | -nostre | -vostro | your -vostra | -vostri | -vostre | -mi | me -ti | thee -ci | us, there -vi | you, there -lo | him, the -la | her, the -li | them -le | them, the -gli | to him, the -ne | from there etc -il | the -un | a -uno | a -una | a -ma | but -ed | and -se | if -perché | why, because -anche | also -come | how -dov | where (as dov') -dove | where -che | who, that -chi | who -cui | whom -non | not -più | more -quale | who, that -quanto | how much -quanti | -quanta | -quante | -quello | that -quelli | -quella | -quelle | -questo | this -questi | -questa | -queste | -si | yes -tutto | all -tutti | all - - | single letter forms: - -a | at -c | as c' for ce or ci -e | and -i | the -l | as l' -o | or - - | forms of avere, to have (not including the infinitive): - -ho -hai -ha -abbiamo -avete -hanno -abbia -abbiate -abbiano -avrò -avrai -avrà -avremo -avrete -avranno -avrei -avresti -avrebbe -avremmo -avreste -avrebbero -avevo -avevi -aveva -avevamo -avevate -avevano -ebbi -avesti -ebbe -avemmo -aveste -ebbero -avessi -avesse -avessimo -avessero -avendo -avuto -avuta -avuti -avute - - | forms of essere, to be (not including the infinitive): -sono -sei -è -siamo -siete -sia -siate -siano -sarò -sarai -sarà -saremo -sarete -saranno -sarei -saresti -sarebbe -saremmo -sareste -sarebbero -ero -eri -era -eravamo -eravate -erano -fui -fosti -fu -fummo -foste -furono -fossi -fosse -fossimo -fossero -essendo - - | forms of fare, to do (not including the infinitive, fa, fat-): -faccio -fai -facciamo -fanno -faccia -facciate -facciano -farò -farai -farà -faremo -farete -faranno -farei -faresti -farebbe -faremmo -fareste -farebbero -facevo -facevi -faceva -facevamo -facevate -facevano -feci -facesti -fece -facemmo -faceste -fecero -facessi -facesse -facessimo -facessero -facendo - - | forms of stare, to be (not including the infinitive): -sto -stai -sta -stiamo -stanno -stia -stiate -stiano -starò -starai -starà -staremo -starete -staranno -starei -staresti -starebbe -staremmo -stareste -starebbero -stavo -stavi -stava -stavamo -stavate -stavano -stetti -stesti -stette -stemmo -steste -stettero -stessi -stesse -stessimo -stessero -stando diff --git a/archiver/solr-config-dir/lang/stopwords_ja.txt b/archiver/solr-config-dir/lang/stopwords_ja.txt deleted file mode 100644 index d4321be..0000000 --- a/archiver/solr-config-dir/lang/stopwords_ja.txt +++ /dev/null @@ -1,127 +0,0 @@ -# -# This file defines a stopword set for Japanese. -# -# This set is made up of hand-picked frequent terms from segmented Japanese Wikipedia. -# Punctuation characters and frequent kanji have mostly been left out. See LUCENE-3745 -# for frequency lists, etc. that can be useful for making your own set (if desired) -# -# Note that there is an overlap between these stopwords and the terms stopped when used -# in combination with the JapanesePartOfSpeechStopFilter. When editing this file, note -# that comments are not allowed on the same line as stopwords. -# -# Also note that stopping is done in a case-insensitive manner. Change your StopFilter -# configuration if you need case-sensitive stopping. Lastly, note that stopping is done -# using the same character width as the entries in this file. Since this StopFilter is -# normally done after a CJKWidthFilter in your chain, you would usually want your romaji -# entries to be in half-width and your kana entries to be in full-width. -# -の -に -は -を -た -が -で -て -と -し -れ -さ -ある -いる -も -する -から -な -こと -として -い -や -れる -など -なっ -ない -この -ため -その -あっ -よう -また -もの -という -あり -まで -られ -なる -へ -か -だ -これ -によって -により -おり -より -による -ず -なり -られる -において -ば -なかっ -なく -しかし -について -せ -だっ -その後 -できる -それ -う -ので -なお -のみ -でき -き -つ -における -および -いう -さらに -でも -ら -たり -その他 -に関する -たち -ます -ん -なら -に対して -特に -せる -及び -これら -とき -では -にて -ほか -ながら -うち -そして -とともに -ただし -かつて -それぞれ -または -お -ほど -ものの -に対する -ほとんど -と共に -といった -です -とも -ところ -ここ -##### End of file diff --git a/archiver/solr-config-dir/lang/stopwords_lv.txt b/archiver/solr-config-dir/lang/stopwords_lv.txt deleted file mode 100644 index e21a23c..0000000 --- a/archiver/solr-config-dir/lang/stopwords_lv.txt +++ /dev/null @@ -1,172 +0,0 @@ -# Set of Latvian stopwords from A Stemming Algorithm for Latvian, Karlis Kreslins -# the original list of over 800 forms was refined: -# pronouns, adverbs, interjections were removed -# -# prepositions -aiz -ap -ar -apakš -ārpus -augšpus -bez -caur -dēļ -gar -iekš -iz -kopš -labad -lejpus -līdz -no -otrpus -pa -par -pār -pēc -pie -pirms -pret -priekš -starp -šaipus -uz -viņpus -virs -virspus -zem -apakšpus -# Conjunctions -un -bet -jo -ja -ka -lai -tomēr -tikko -turpretī -arī -kaut -gan -tādēļ -tā -ne -tikvien -vien -kā -ir -te -vai -kamēr -# Particles -ar -diezin -droši -diemžēl -nebūt -ik -it -taču -nu -pat -tiklab -iekšpus -nedz -tik -nevis -turpretim -jeb -iekam -iekām -iekāms -kolīdz -līdzko -tiklīdz -jebšu -tālab -tāpēc -nekā -itin -jā -jau -jel -nē -nezin -tad -tikai -vis -tak -iekams -vien -# modal verbs -būt -biju -biji -bija -bijām -bijāt -esmu -esi -esam -esat -būšu -būsi -būs -būsim -būsiet -tikt -tiku -tiki -tika -tikām -tikāt -tieku -tiec -tiek -tiekam -tiekat -tikšu -tiks -tiksim -tiksiet -tapt -tapi -tapāt -topat -tapšu -tapsi -taps -tapsim -tapsiet -kļūt -kļuvu -kļuvi -kļuva -kļuvām -kļuvāt -kļūstu -kļūsti -kļūst -kļūstam -kļūstat -kļūšu -kļūsi -kļūs -kļūsim -kļūsiet -# verbs -varēt -varēju -varējām -varēšu -varēsim -var -varēji -varējāt -varēsi -varēsiet -varat -varēja -varēs diff --git a/archiver/solr-config-dir/lang/stopwords_nl.txt b/archiver/solr-config-dir/lang/stopwords_nl.txt deleted file mode 100644 index 47a2aea..0000000 --- a/archiver/solr-config-dir/lang/stopwords_nl.txt +++ /dev/null @@ -1,119 +0,0 @@ - | From svn.tartarus.org/snowball/trunk/website/algorithms/dutch/stop.txt - | This file is distributed under the BSD License. - | See http://snowball.tartarus.org/license.php - | Also see http://www.opensource.org/licenses/bsd-license.html - | - Encoding was converted to UTF-8. - | - This notice was added. - | - | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" - - | A Dutch stop word list. Comments begin with vertical bar. Each stop - | word is at the start of a line. - - | This is a ranked list (commonest to rarest) of stopwords derived from - | a large sample of Dutch text. - - | Dutch stop words frequently exhibit homonym clashes. These are indicated - | clearly below. - -de | the -en | and -van | of, from -ik | I, the ego -te | (1) chez, at etc, (2) to, (3) too -dat | that, which -die | that, those, who, which -in | in, inside -een | a, an, one -hij | he -het | the, it -niet | not, nothing, naught -zijn | (1) to be, being, (2) his, one's, its -is | is -was | (1) was, past tense of all persons sing. of 'zijn' (to be) (2) wax, (3) the washing, (4) rise of river -op | on, upon, at, in, up, used up -aan | on, upon, to (as dative) -met | with, by -als | like, such as, when -voor | (1) before, in front of, (2) furrow -had | had, past tense all persons sing. of 'hebben' (have) -er | there -maar | but, only -om | round, about, for etc -hem | him -dan | then -zou | should/would, past tense all persons sing. of 'zullen' -of | or, whether, if -wat | what, something, anything -mijn | possessive and noun 'mine' -men | people, 'one' -dit | this -zo | so, thus, in this way -door | through by -over | over, across -ze | she, her, they, them -zich | oneself -bij | (1) a bee, (2) by, near, at -ook | also, too -tot | till, until -je | you -mij | me -uit | out of, from -der | Old Dutch form of 'van der' still found in surnames -daar | (1) there, (2) because -haar | (1) her, their, them, (2) hair -naar | (1) unpleasant, unwell etc, (2) towards, (3) as -heb | present first person sing. of 'to have' -hoe | how, why -heeft | present third person sing. of 'to have' -hebben | 'to have' and various parts thereof -deze | this -u | you -want | (1) for, (2) mitten, (3) rigging -nog | yet, still -zal | 'shall', first and third person sing. of verb 'zullen' (will) -me | me -zij | she, they -nu | now -ge | 'thou', still used in Belgium and south Netherlands -geen | none -omdat | because -iets | something, somewhat -worden | to become, grow, get -toch | yet, still -al | all, every, each -waren | (1) 'were' (2) to wander, (3) wares, (3) -veel | much, many -meer | (1) more, (2) lake -doen | to do, to make -toen | then, when -moet | noun 'spot/mote' and present form of 'to must' -ben | (1) am, (2) 'are' in interrogative second person singular of 'to be' -zonder | without -kan | noun 'can' and present form of 'to be able' -hun | their, them -dus | so, consequently -alles | all, everything, anything -onder | under, beneath -ja | yes, of course -eens | once, one day -hier | here -wie | who -werd | imperfect third person sing. of 'become' -altijd | always -doch | yet, but etc -wordt | present third person sing. of 'become' -wezen | (1) to be, (2) 'been' as in 'been fishing', (3) orphans -kunnen | to be able -ons | us/our -zelf | self -tegen | against, towards, at -na | after, near -reeds | already -wil | (1) present tense of 'want', (2) 'will', noun, (3) fender -kon | could; past tense of 'to be able' -niets | nothing -uw | your -iemand | somebody -geweest | been; past participle of 'be' -andere | other diff --git a/archiver/solr-config-dir/lang/stopwords_no.txt b/archiver/solr-config-dir/lang/stopwords_no.txt deleted file mode 100644 index a7a2c28..0000000 --- a/archiver/solr-config-dir/lang/stopwords_no.txt +++ /dev/null @@ -1,194 +0,0 @@ - | From svn.tartarus.org/snowball/trunk/website/algorithms/norwegian/stop.txt - | This file is distributed under the BSD License. - | See http://snowball.tartarus.org/license.php - | Also see http://www.opensource.org/licenses/bsd-license.html - | - Encoding was converted to UTF-8. - | - This notice was added. - | - | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" - - | A Norwegian stop word list. Comments begin with vertical bar. Each stop - | word is at the start of a line. - - | This stop word list is for the dominant bokmål dialect. Words unique - | to nynorsk are marked *. - - | Revised by Jan Bruusgaard , Jan 2005 - -og | and -i | in -jeg | I -det | it/this/that -at | to (w. inf.) -en | a/an -et | a/an -den | it/this/that -til | to -er | is/am/are -som | who/that -på | on -de | they / you(formal) -med | with -han | he -av | of -ikke | not -ikkje | not * -der | there -så | so -var | was/were -meg | me -seg | you -men | but -ett | one -har | have -om | about -vi | we -min | my -mitt | my -ha | have -hadde | had -hun | she -nå | now -over | over -da | when/as -ved | by/know -fra | from -du | you -ut | out -sin | your -dem | them -oss | us -opp | up -man | you/one -kan | can -hans | his -hvor | where -eller | or -hva | what -skal | shall/must -selv | self (reflective) -sjøl | self (reflective) -her | here -alle | all -vil | will -bli | become -ble | became -blei | became * -blitt | have become -kunne | could -inn | in -når | when -være | be -kom | come -noen | some -noe | some -ville | would -dere | you -som | who/which/that -deres | their/theirs -kun | only/just -ja | yes -etter | after -ned | down -skulle | should -denne | this -for | for/because -deg | you -si | hers/his -sine | hers/his -sitt | hers/his -mot | against -å | to -meget | much -hvorfor | why -dette | this -disse | these/those -uten | without -hvordan | how -ingen | none -din | your -ditt | your -blir | become -samme | same -hvilken | which -hvilke | which (plural) -sånn | such a -inni | inside/within -mellom | between -vår | our -hver | each -hvem | who -vors | us/ours -hvis | whose -både | both -bare | only/just -enn | than -fordi | as/because -før | before -mange | many -også | also -slik | just -vært | been -være | to be -båe | both * -begge | both -siden | since -dykk | your * -dykkar | yours * -dei | they * -deira | them * -deires | theirs * -deim | them * -di | your (fem.) * -då | as/when * -eg | I * -ein | a/an * -eit | a/an * -eitt | a/an * -elles | or * -honom | he * -hjå | at * -ho | she * -hoe | she * -henne | her -hennar | her/hers -hennes | hers -hoss | how * -hossen | how * -ikkje | not * -ingi | noone * -inkje | noone * -korleis | how * -korso | how * -kva | what/which * -kvar | where * -kvarhelst | where * -kven | who/whom * -kvi | why * -kvifor | why * -me | we * -medan | while * -mi | my * -mine | my * -mykje | much * -no | now * -nokon | some (masc./neut.) * -noka | some (fem.) * -nokor | some * -noko | some * -nokre | some * -si | his/hers * -sia | since * -sidan | since * -so | so * -somt | some * -somme | some * -um | about* -upp | up * -vere | be * -vore | was * -verte | become * -vort | become * -varte | became * -vart | became * - diff --git a/archiver/solr-config-dir/lang/stopwords_pt.txt b/archiver/solr-config-dir/lang/stopwords_pt.txt deleted file mode 100644 index acfeb01..0000000 --- a/archiver/solr-config-dir/lang/stopwords_pt.txt +++ /dev/null @@ -1,253 +0,0 @@ - | From svn.tartarus.org/snowball/trunk/website/algorithms/portuguese/stop.txt - | This file is distributed under the BSD License. - | See http://snowball.tartarus.org/license.php - | Also see http://www.opensource.org/licenses/bsd-license.html - | - Encoding was converted to UTF-8. - | - This notice was added. - | - | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" - - | A Portuguese stop word list. Comments begin with vertical bar. Each stop - | word is at the start of a line. - - - | The following is a ranked list (commonest to rarest) of stopwords - | deriving from a large sample of text. - - | Extra words have been added at the end. - -de | of, from -a | the; to, at; her -o | the; him -que | who, that -e | and -do | de + o -da | de + a -em | in -um | a -para | for - | é from SER -com | with -não | not, no -uma | a -os | the; them -no | em + o -se | himself etc -na | em + a -por | for -mais | more -as | the; them -dos | de + os -como | as, like -mas | but - | foi from SER -ao | a + o -ele | he -das | de + as - | tem from TER -à | a + a -seu | his -sua | her -ou | or - | ser from SER -quando | when -muito | much - | há from HAV -nos | em + os; us -já | already, now - | está from EST -eu | I -também | also -só | only, just -pelo | per + o -pela | per + a -até | up to -isso | that -ela | he -entre | between - | era from SER -depois | after -sem | without -mesmo | same -aos | a + os - | ter from TER -seus | his -quem | whom -nas | em + as -me | me -esse | that -eles | they - | estão from EST -você | you - | tinha from TER - | foram from SER -essa | that -num | em + um -nem | nor -suas | her -meu | my -às | a + as -minha | my - | têm from TER -numa | em + uma -pelos | per + os -elas | they - | havia from HAV - | seja from SER -qual | which - | será from SER -nós | we - | tenho from TER -lhe | to him, her -deles | of them -essas | those -esses | those -pelas | per + as -este | this - | fosse from SER -dele | of him - - | other words. There are many contractions such as naquele = em+aquele, - | mo = me+o, but they are rare. - | Indefinite article plural forms are also rare. - -tu | thou -te | thee -vocês | you (plural) -vos | you -lhes | to them -meus | my -minhas -teu | thy -tua -teus -tuas -nosso | our -nossa -nossos -nossas - -dela | of her -delas | of them - -esta | this -estes | these -estas | these -aquele | that -aquela | that -aqueles | those -aquelas | those -isto | this -aquilo | that - - | forms of estar, to be (not including the infinitive): -estou -está -estamos -estão -estive -esteve -estivemos -estiveram -estava -estávamos -estavam -estivera -estivéramos -esteja -estejamos -estejam -estivesse -estivéssemos -estivessem -estiver -estivermos -estiverem - - | forms of haver, to have (not including the infinitive): -hei -há -havemos -hão -houve -houvemos -houveram -houvera -houvéramos -haja -hajamos -hajam -houvesse -houvéssemos -houvessem -houver -houvermos -houverem -houverei -houverá -houveremos -houverão -houveria -houveríamos -houveriam - - | forms of ser, to be (not including the infinitive): -sou -somos -são -era -éramos -eram -fui -foi -fomos -foram -fora -fôramos -seja -sejamos -sejam -fosse -fôssemos -fossem -for -formos -forem -serei -será -seremos -serão -seria -seríamos -seriam - - | forms of ter, to have (not including the infinitive): -tenho -tem -temos -tém -tinha -tínhamos -tinham -tive -teve -tivemos -tiveram -tivera -tivéramos -tenha -tenhamos -tenham -tivesse -tivéssemos -tivessem -tiver -tivermos -tiverem -terei -terá -teremos -terão -teria -teríamos -teriam diff --git a/archiver/solr-config-dir/lang/stopwords_ro.txt b/archiver/solr-config-dir/lang/stopwords_ro.txt deleted file mode 100644 index 4fdee90..0000000 --- a/archiver/solr-config-dir/lang/stopwords_ro.txt +++ /dev/null @@ -1,233 +0,0 @@ -# This file was created by Jacques Savoy and is distributed under the BSD license. -# See http://members.unine.ch/jacques.savoy/clef/index.html. -# Also see http://www.opensource.org/licenses/bsd-license.html -acea -aceasta -această -aceea -acei -aceia -acel -acela -acele -acelea -acest -acesta -aceste -acestea -aceşti -aceştia -acolo -acum -ai -aia -aibă -aici -al -ăla -ale -alea -ălea -altceva -altcineva -am -ar -are -aş -aşadar -asemenea -asta -ăsta -astăzi -astea -ăstea -ăştia -asupra -aţi -au -avea -avem -aveţi -azi -bine -bucur -bună -ca -că -căci -când -care -cărei -căror -cărui -cât -câte -câţi -către -câtva -ce -cel -ceva -chiar -cînd -cine -cineva -cît -cîte -cîţi -cîtva -contra -cu -cum -cumva -curând -curînd -da -dă -dacă -dar -datorită -de -deci -deja -deoarece -departe -deşi -din -dinaintea -dintr -dintre -drept -după -ea -ei -el -ele -eram -este -eşti -eu -face -fără -fi -fie -fiecare -fii -fim -fiţi -iar -ieri -îi -îl -îmi -împotriva -în -înainte -înaintea -încât -încît -încotro -între -întrucât -întrucît -îţi -la -lângă -le -li -lîngă -lor -lui -mă -mâine -mea -mei -mele -mereu -meu -mi -mine -mult -multă -mulţi -ne -nicăieri -nici -nimeni -nişte -noastră -noastre -noi -noştri -nostru -nu -ori -oricând -oricare -oricât -orice -oricînd -oricine -oricît -oricum -oriunde -până -pe -pentru -peste -pînă -poate -pot -prea -prima -primul -prin -printr -sa -să -săi -sale -sau -său -se -şi -sînt -sîntem -sînteţi -spre -sub -sunt -suntem -sunteţi -ta -tăi -tale -tău -te -ţi -ţie -tine -toată -toate -tot -toţi -totuşi -tu -un -una -unde -undeva -unei -unele -uneori -unor -vă -vi -voastră -voastre -voi -voştri -vostru -vouă -vreo -vreun diff --git a/archiver/solr-config-dir/lang/stopwords_ru.txt b/archiver/solr-config-dir/lang/stopwords_ru.txt deleted file mode 100644 index 5527140..0000000 --- a/archiver/solr-config-dir/lang/stopwords_ru.txt +++ /dev/null @@ -1,243 +0,0 @@ - | From svn.tartarus.org/snowball/trunk/website/algorithms/russian/stop.txt - | This file is distributed under the BSD License. - | See http://snowball.tartarus.org/license.php - | Also see http://www.opensource.org/licenses/bsd-license.html - | - Encoding was converted to UTF-8. - | - This notice was added. - | - | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" - - | a russian stop word list. comments begin with vertical bar. each stop - | word is at the start of a line. - - | this is a ranked list (commonest to rarest) of stopwords derived from - | a large text sample. - - | letter `ё' is translated to `е'. - -и | and -в | in/into -во | alternative form -не | not -что | what/that -он | he -на | on/onto -я | i -с | from -со | alternative form -как | how -а | milder form of `no' (but) -то | conjunction and form of `that' -все | all -она | she -так | so, thus -его | him -но | but -да | yes/and -ты | thou -к | towards, by -у | around, chez -же | intensifier particle -вы | you -за | beyond, behind -бы | conditional/subj. particle -по | up to, along -только | only -ее | her -мне | to me -было | it was -вот | here is/are, particle -от | away from -меня | me -еще | still, yet, more -нет | no, there isnt/arent -о | about -из | out of -ему | to him -теперь | now -когда | when -даже | even -ну | so, well -вдруг | suddenly -ли | interrogative particle -если | if -уже | already, but homonym of `narrower' -или | or -ни | neither -быть | to be -был | he was -него | prepositional form of его -до | up to -вас | you accusative -нибудь | indef. suffix preceded by hyphen -опять | again -уж | already, but homonym of `adder' -вам | to you -сказал | he said -ведь | particle `after all' -там | there -потом | then -себя | oneself -ничего | nothing -ей | to her -может | usually with `быть' as `maybe' -они | they -тут | here -где | where -есть | there is/are -надо | got to, must -ней | prepositional form of ей -для | for -мы | we -тебя | thee -их | them, their -чем | than -была | she was -сам | self -чтоб | in order to -без | without -будто | as if -человек | man, person, one -чего | genitive form of `what' -раз | once -тоже | also -себе | to oneself -под | beneath -жизнь | life -будет | will be -ж | short form of intensifer particle `же' -тогда | then -кто | who -этот | this -говорил | was saying -того | genitive form of `that' -потому | for that reason -этого | genitive form of `this' -какой | which -совсем | altogether -ним | prepositional form of `его', `они' -здесь | here -этом | prepositional form of `этот' -один | one -почти | almost -мой | my -тем | instrumental/dative plural of `тот', `то' -чтобы | full form of `in order that' -нее | her (acc.) -кажется | it seems -сейчас | now -были | they were -куда | where to -зачем | why -сказать | to say -всех | all (acc., gen. preposn. plural) -никогда | never -сегодня | today -можно | possible, one can -при | by -наконец | finally -два | two -об | alternative form of `о', about -другой | another -хоть | even -после | after -над | above -больше | more -тот | that one (masc.) -через | across, in -эти | these -нас | us -про | about -всего | in all, only, of all -них | prepositional form of `они' (they) -какая | which, feminine -много | lots -разве | interrogative particle -сказала | she said -три | three -эту | this, acc. fem. sing. -моя | my, feminine -впрочем | moreover, besides -хорошо | good -свою | ones own, acc. fem. sing. -этой | oblique form of `эта', fem. `this' -перед | in front of -иногда | sometimes -лучше | better -чуть | a little -том | preposn. form of `that one' -нельзя | one must not -такой | such a one -им | to them -более | more -всегда | always -конечно | of course -всю | acc. fem. sing of `all' -между | between - - - | b: some paradigms - | - | personal pronouns - | - | я меня мне мной [мною] - | ты тебя тебе тобой [тобою] - | он его ему им [него, нему, ним] - | она ее эи ею [нее, нэи, нею] - | оно его ему им [него, нему, ним] - | - | мы нас нам нами - | вы вас вам вами - | они их им ими [них, ним, ними] - | - | себя себе собой [собою] - | - | demonstrative pronouns: этот (this), тот (that) - | - | этот эта это эти - | этого эты это эти - | этого этой этого этих - | этому этой этому этим - | этим этой этим [этою] этими - | этом этой этом этих - | - | тот та то те - | того ту то те - | того той того тех - | тому той тому тем - | тем той тем [тою] теми - | том той том тех - | - | determinative pronouns - | - | (a) весь (all) - | - | весь вся все все - | всего всю все все - | всего всей всего всех - | всему всей всему всем - | всем всей всем [всею] всеми - | всем всей всем всех - | - | (b) сам (himself etc) - | - | сам сама само сами - | самого саму само самих - | самого самой самого самих - | самому самой самому самим - | самим самой самим [самою] самими - | самом самой самом самих - | - | stems of verbs `to be', `to have', `to do' and modal - | - | быть бы буд быв есть суть - | име - | дел - | мог мож мочь - | уме - | хоч хот - | долж - | можн - | нужн - | нельзя - diff --git a/archiver/solr-config-dir/lang/stopwords_sv.txt b/archiver/solr-config-dir/lang/stopwords_sv.txt deleted file mode 100644 index 096f87f..0000000 --- a/archiver/solr-config-dir/lang/stopwords_sv.txt +++ /dev/null @@ -1,133 +0,0 @@ - | From svn.tartarus.org/snowball/trunk/website/algorithms/swedish/stop.txt - | This file is distributed under the BSD License. - | See http://snowball.tartarus.org/license.php - | Also see http://www.opensource.org/licenses/bsd-license.html - | - Encoding was converted to UTF-8. - | - This notice was added. - | - | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" - - | A Swedish stop word list. Comments begin with vertical bar. Each stop - | word is at the start of a line. - - | This is a ranked list (commonest to rarest) of stopwords derived from - | a large text sample. - - | Swedish stop words occasionally exhibit homonym clashes. For example - | så = so, but also seed. These are indicated clearly below. - -och | and -det | it, this/that -att | to (with infinitive) -i | in, at -en | a -jag | I -hon | she -som | who, that -han | he -på | on -den | it, this/that -med | with -var | where, each -sig | him(self) etc -för | for -så | so (also: seed) -till | to -är | is -men | but -ett | a -om | if; around, about -hade | had -de | they, these/those -av | of -icke | not, no -mig | me -du | you -henne | her -då | then, when -sin | his -nu | now -har | have -inte | inte någon = no one -hans | his -honom | him -skulle | 'sake' -hennes | her -där | there -min | my -man | one (pronoun) -ej | nor -vid | at, by, on (also: vast) -kunde | could -något | some etc -från | from, off -ut | out -när | when -efter | after, behind -upp | up -vi | we -dem | them -vara | be -vad | what -över | over -än | than -dig | you -kan | can -sina | his -här | here -ha | have -mot | towards -alla | all -under | under (also: wonder) -någon | some etc -eller | or (else) -allt | all -mycket | much -sedan | since -ju | why -denna | this/that -själv | myself, yourself etc -detta | this/that -åt | to -utan | without -varit | was -hur | how -ingen | no -mitt | my -ni | you -bli | to be, become -blev | from bli -oss | us -din | thy -dessa | these/those -några | some etc -deras | their -blir | from bli -mina | my -samma | (the) same -vilken | who, that -er | you, your -sådan | such a -vår | our -blivit | from bli -dess | its -inom | within -mellan | between -sådant | such a -varför | why -varje | each -vilka | who, that -ditt | thy -vem | who -vilket | who, that -sitta | his -sådana | such a -vart | each -dina | thy -vars | whose -vårt | our -våra | our -ert | your -era | your -vilkas | whose - diff --git a/archiver/solr-config-dir/lang/stopwords_th.txt b/archiver/solr-config-dir/lang/stopwords_th.txt deleted file mode 100644 index 07f0fab..0000000 --- a/archiver/solr-config-dir/lang/stopwords_th.txt +++ /dev/null @@ -1,119 +0,0 @@ -# Thai stopwords from: -# "Opinion Detection in Thai Political News Columns -# Based on Subjectivity Analysis" -# Khampol Sukhum, Supot Nitsuwat, and Choochart Haruechaiyasak -ไว้ -ไม่ -ไป -ได้ -ให้ -ใน -โดย -แห่ง -แล้ว -และ -แรก -แบบ -แต่ -เอง -เห็น -เลย -เริ่ม -เรา -เมื่อ -เพื่อ -เพราะ -เป็นการ -เป็น -เปิดเผย -เปิด -เนื่องจาก -เดียวกัน -เดียว -เช่น -เฉพาะ -เคย -เข้า -เขา -อีก -อาจ -อะไร -ออก -อย่าง -อยู่ -อยาก -หาก -หลาย -หลังจาก -หลัง -หรือ -หนึ่ง -ส่วน -ส่ง -สุด -สําหรับ -ว่า -วัน -ลง -ร่วม -ราย -รับ -ระหว่าง -รวม -ยัง -มี -มาก -มา -พร้อม -พบ -ผ่าน -ผล -บาง -น่า -นี้ -นํา -นั้น -นัก -นอกจาก -ทุก -ที่สุด -ที่ -ทําให้ -ทํา -ทาง -ทั้งนี้ -ทั้ง -ถ้า -ถูก -ถึง -ต้อง -ต่างๆ -ต่าง -ต่อ -ตาม -ตั้งแต่ -ตั้ง -ด้าน -ด้วย -ดัง -ซึ่ง -ช่วง -จึง -จาก -จัด -จะ -คือ -ความ -ครั้ง -คง -ขึ้น -ของ -ขอ -ขณะ -ก่อน -ก็ -การ -กับ -กัน -กว่า -กล่าว diff --git a/archiver/solr-config-dir/lang/stopwords_tr.txt b/archiver/solr-config-dir/lang/stopwords_tr.txt deleted file mode 100644 index 84d9408..0000000 --- a/archiver/solr-config-dir/lang/stopwords_tr.txt +++ /dev/null @@ -1,212 +0,0 @@ -# Turkish stopwords from LUCENE-559 -# merged with the list from "Information Retrieval on Turkish Texts" -# (http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf) -acaba -altmış -altı -ama -ancak -arada -aslında -ayrıca -bana -bazı -belki -ben -benden -beni -benim -beri -beş -bile -bin -bir -birçok -biri -birkaç -birkez -birşey -birşeyi -biz -bize -bizden -bizi -bizim -böyle -böylece -bu -buna -bunda -bundan -bunlar -bunları -bunların -bunu -bunun -burada -çok -çünkü -da -daha -dahi -de -defa -değil -diğer -diye -doksan -dokuz -dolayı -dolayısıyla -dört -edecek -eden -ederek -edilecek -ediliyor -edilmesi -ediyor -eğer -elli -en -etmesi -etti -ettiği -ettiğini -gibi -göre -halen -hangi -hatta -hem -henüz -hep -hepsi -her -herhangi -herkesin -hiç -hiçbir -için -iki -ile -ilgili -ise -işte -itibaren -itibariyle -kadar -karşın -katrilyon -kendi -kendilerine -kendini -kendisi -kendisine -kendisini -kez -ki -kim -kimden -kime -kimi -kimse -kırk -milyar -milyon -mu -mü -mı -nasıl -ne -neden -nedenle -nerde -nerede -nereye -niye -niçin -o -olan -olarak -oldu -olduğu -olduğunu -olduklarını -olmadı -olmadığı -olmak -olması -olmayan -olmaz -olsa -olsun -olup -olur -olursa -oluyor -on -ona -ondan -onlar -onlardan -onları -onların -onu -onun -otuz -oysa -öyle -pek -rağmen -sadece -sanki -sekiz -seksen -sen -senden -seni -senin -siz -sizden -sizi -sizin -şey -şeyden -şeyi -şeyler -şöyle -şu -şuna -şunda -şundan -şunları -şunu -tarafından -trilyon -tüm -üç -üzere -var -vardı -ve -veya -ya -yani -yapacak -yapılan -yapılması -yapıyor -yapmak -yaptı -yaptığı -yaptığını -yaptıkları -yedi -yerine -yetmiş -yine -yirmi -yoksa -yüz -zaten diff --git a/archiver/solr-config-dir/lang/userdict_ja.txt b/archiver/solr-config-dir/lang/userdict_ja.txt deleted file mode 100644 index 6f0368e..0000000 --- a/archiver/solr-config-dir/lang/userdict_ja.txt +++ /dev/null @@ -1,29 +0,0 @@ -# -# This is a sample user dictionary for Kuromoji (JapaneseTokenizer) -# -# Add entries to this file in order to override the statistical model in terms -# of segmentation, readings and part-of-speech tags. Notice that entries do -# not have weights since they are always used when found. This is by-design -# in order to maximize ease-of-use. -# -# Entries are defined using the following CSV format: -# , ... , ... , -# -# Notice that a single half-width space separates tokens and readings, and -# that the number tokens and readings must match exactly. -# -# Also notice that multiple entries with the same is undefined. -# -# Whitespace only lines are ignored. Comments are not allowed on entry lines. -# - -# Custom segmentation for kanji compounds -日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞 -関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム名詞 - -# Custom segmentation for compound katakana -トートバッグ,トート バッグ,トート バッグ,かずカナ名詞 -ショルダーバッグ,ショルダー バッグ,ショルダー バッグ,かずカナ名詞 - -# Custom reading for former sumo wrestler -朝青龍,朝青龍,アサショウリュウ,カスタム人名 diff --git a/archiver/solr-config-dir/schema.xml b/archiver/solr-config-dir/schema.xml deleted file mode 100644 index c1eae9a..0000000 --- a/archiver/solr-config-dir/schema.xml +++ /dev/null @@ -1,161 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - id - diff --git a/archiver/solr-config-dir/solrconfig.xml b/archiver/solr-config-dir/solrconfig.xml deleted file mode 100644 index 3331803..0000000 --- a/archiver/solr-config-dir/solrconfig.xml +++ /dev/null @@ -1,1076 +0,0 @@ - - - - - - - - - 9.8 - - - - - - - - - - - ${solr.data.dir:} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ${solr.lock.type:native} - - - - - - - - - - - - - - - - - - - - - ${solr.ulog.dir:} - ${solr.ulog.numVersionBuckets:65536} - - - - - ${solr.autoCommit.maxTime:15000} - false - - - - - - ${solr.autoSoftCommit.maxTime:3000} - - - - - - - - - - - - - - ${solr.max.booleanClauses:1024} - - - - - - - - - - - - - - - - - - - - - - - - true - - - - - - 20 - - - 200 - - - - - - - - - - - - - - - - - - - false - - - - - - - - - - - - - - - - - - - - - explicit - 10 - - - - - - - explicit - json - true - - - - - - - - - - text_general - - - - - - default - body - solr.DirectSolrSpellChecker - - internal - - 0.5 - - 2 - - 1 - - 5 - - 4 - - 0.01 - - - - - - - - - - - - default - on - true - 10 - 5 - 5 - true - true - 10 - 5 - - - spellcheck - - - - - - - - - - - - 100 - - - - - - - - 70 - - 0.5 - - [-\w ,/\n\"']{20,200} - - - - - - - ]]> - ]]> - - - - - - - - - - - - - - - - - - - - - - - - ,, - ,, - ,, - ,, - ,]]> - ]]> - - - - - - 10 - .,!? - - - - - - - WORD - - - en - US - - - - - - - - - - - - - [^\w-\.] - _ - - - - - - - yyyy-MM-dd['T'[HH:mm[:ss[.SSS]][z - yyyy-MM-dd['T'[HH:mm[:ss[,SSS]][z - yyyy-MM-dd HH:mm[:ss[.SSS]][z - yyyy-MM-dd HH:mm[:ss[,SSS]][z - [EEE, ]dd MMM yyyy HH:mm[:ss] z - EEEE, dd-MMM-yy HH:mm:ss z - EEE MMM ppd HH:mm:ss [z ]yyyy - - - - - java.lang.String - text_general - - *_str - 256 - - - true - - - java.lang.Boolean - booleans - - - java.util.Date - pdates - - - java.lang.Long - java.lang.Integer - plongs - - - java.lang.Number - pdoubles - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/breezewiki.rkt b/breezewiki.rkt index 5fd34b2..a8b8c28 100644 --- a/breezewiki.rkt +++ b/breezewiki.rkt @@ -17,12 +17,9 @@ (require-reloadable "src/page-proxy.rkt" page-proxy) (require-reloadable "src/page-redirect-wiki-home.rkt" redirect-wiki-home) (require-reloadable "src/page-search.rkt" page-search) -(require-reloadable "src/page-set-user-settings.rkt" page-set-user-settings) (require-reloadable "src/page-static.rkt" static-dispatcher) -(require-reloadable "src/page-static-archive.rkt" page-static-archive) (require-reloadable "src/page-subdomain.rkt" subdomain-dispatcher) (require-reloadable "src/page-wiki.rkt" page-wiki) -(require-reloadable "src/page-wiki-offline.rkt" page-wiki-offline) (require-reloadable "src/page-file.rkt" page-file) (reload!) @@ -30,9 +27,7 @@ (define ch (make-channel)) (define (start) (serve/launch/wait - #:listen-ip (if (equal? (config-get 'bind_host) "auto") - (if (config-true? 'debug) "127.0.0.1" #f) - (config-get 'bind_host)) + #:listen-ip (if (config-true? 'debug) "127.0.0.1" #f) #:port (string->number (config-get 'port)) (λ (quit) (channel-put ch (lambda () (semaphore-post quit))) @@ -45,10 +40,7 @@ page-not-found page-proxy page-search - page-set-user-settings - page-static-archive page-wiki - page-wiki-offline page-file redirect-wiki-home static-dispatcher diff --git a/dist.rkt b/dist.rkt index 2e46f8c..777e81a 100644 --- a/dist.rkt +++ b/dist.rkt @@ -11,18 +11,13 @@ (require (only-in "src/page-proxy.rkt" page-proxy)) (require (only-in "src/page-redirect-wiki-home.rkt" redirect-wiki-home)) (require (only-in "src/page-search.rkt" page-search)) -(require (only-in "src/page-set-user-settings.rkt" page-set-user-settings)) (require (only-in "src/page-static.rkt" static-dispatcher)) -(require (only-in "src/page-static-archive.rkt" page-static-archive)) (require (only-in "src/page-subdomain.rkt" subdomain-dispatcher)) (require (only-in "src/page-wiki.rkt" page-wiki)) -(require (only-in "src/page-wiki-offline.rkt" page-wiki-offline)) (require (only-in "src/page-file.rkt" page-file)) (serve/launch/wait - #:listen-ip (if (equal? (config-get 'bind_host) "auto") - (if (config-true? 'debug) "127.0.0.1" #f) - (config-get 'bind_host)) + #:listen-ip (if (config-true? 'debug) "127.0.0.1" #f) #:port (string->number (config-get 'port)) (λ (quit) (dispatcher-tree @@ -34,10 +29,7 @@ page-not-found page-proxy page-search - page-set-user-settings - page-static-archive page-wiki - page-wiki-offline page-file redirect-wiki-home static-dispatcher diff --git a/info.rkt b/info.rkt index c290d5b..46512df 100644 --- a/info.rkt +++ b/info.rkt @@ -1,3 +1,3 @@ #lang info -(define build-deps '("rackunit-lib" "web-server-lib" "http-easy-lib" "html-parsing" "html-writing" "json-pointer" "typed-ini-lib" "memo" "net-cookies-lib" "db")) +(define build-deps '("rackunit-lib" "web-server-lib" "http-easy-lib" "html-parsing" "html-writing" "json-pointer" "ini-lib" "memo")) diff --git a/lib/archive-file-mappings.rkt b/lib/archive-file-mappings.rkt deleted file mode 100644 index 03f97f5..0000000 --- a/lib/archive-file-mappings.rkt +++ /dev/null @@ -1,35 +0,0 @@ -#lang racket/base -(require racket/string - net/url - (only-in net/uri-codec uri-decode) - "url-utils.rkt") -(provide - local-encoded-url->segments - url-segments->basename - local-encoded-url->basename - title->basename - basename->name-for-query - url-segments->guess-title) - -(define (local-encoded-url->segments str) ; '("wiki" "Page_title") - (map path/param-path (fix-semicolons-url-path (url-path (string->url str))))) - -(define (url-segments->basename segments) ; "Page_title" filename encoded, no extension or dir prefix - (define extra-encoded (map (λ (s) (bytes->string/latin-1 (percent-encode s filename-set #f))) (cdr segments))) - (define basic-filename (string-join extra-encoded "#")) - basic-filename) - -(define (local-encoded-url->basename str) ; '("wiki" "Page_title"), no extension or dir prefix - (url-segments->basename (local-encoded-url->segments str))) - -(define (title->basename title) ; "Page title/Strategies" -> "Page_title#Strategies" filename encoded, no extension or dir prefi - (define elements (string-split (string-replace title " " "_") "/")) - (define extra-encoded (map (λ (s) (bytes->string/latin-1 (percent-encode s filename-set #f))) elements)) - (define basic-filename (string-join extra-encoded "#")) - basic-filename) - -(define (basename->name-for-query str) - (uri-decode (regexp-replace* #rx"#" str "/"))) - -(define (url-segments->guess-title segments) - (regexp-replace* #rx"_" (cadr segments) " ")) diff --git a/lib/html-parsing/main.rkt b/lib/html-parsing/main.rkt deleted file mode 100644 index bdc09b1..0000000 --- a/lib/html-parsing/main.rkt +++ /dev/null @@ -1,1887 +0,0 @@ -#lang racket/base -;; Copyright Neil Van Dyke. For legal info, see file "info.rkt". - -(require mcfly) - -(module+ test - (require overeasy)) - -(doc (section "Introduction") - - (para "The " - (code "html-parsing") - " library provides a permissive HTML parser. The parser is useful -for software agent extraction of information from Web pages, for -programmatically transforming HTML files, and for implementing interactive Web -browsers. " - (code "html-parsing") - " emits " - ;; TODO: 2016-02-21 Once create sxml-doc package, reference that. - (seclink "top" - #:doc '(lib "sxml-intro/sxml-intro.scrbl") - #:indirect? #true - "SXML/xexp") - ", so that conventional HTML may be processed with XML tools such as -SXPath. Like Oleg Kiselyov's " - (hyperlink "http://pobox.com/~oleg/ftp/Scheme/xml.html#HTML-parser" - "SSAX-based HTML parser") - ", " - (code "html-parsing") - " provides a permissive tokenizer, but " - (code "html-parsing") - " extends this by attempting to recover syntactic structure.") - - (para "The " - (code "html-parsing") - " parsing behavior is permissive in that it accepts erroneous HTML, -handling several classes of HTML syntax errors gracefully, without yielding a -parse error. This is crucial for parsing arbitrary real-world Web pages, since -many pages actually contain syntax errors that would defeat a strict or -validating parser. " - (code "html-parsing") - "'s handling of errors is intended to generally emulate popular Web -browsers' interpretation of the structure of erroneous HTML.") - (para (code "html-parsing") - " also has some support for XHTML, although XML namespace qualifiers -are accepted but stripped from the resulting SXML/xexp. Note that " - (italic "valid") - " XHTML input might be better handled by a validating XML parser -like Kiselyov's SSAX.")) - -;; BEGIN COPIED FROM XEXP PACKAGE - -(define (%html-parsing:make-xexp-char-ref val) - (if (or (symbol? val) (integer? val)) - `(& ,val) - (error 'make-xexp-char-ref - "invalid xexp reference value: ~S" - val))) - -(define %html-parsing:always-empty-html-elements - '(area base br frame hr img input isindex keygen link meta param - spacer wbr)) - -;; END COPIED FROM XEXP PACKAGE - -(define %html-parsing:empty-token-symbol '*empty*) -(define %html-parsing:end-token-symbol '*end*) -(define %html-parsing:start-token-symbol '*start*) -(define %html-parsing:entity-token-symbol '*entity*) -(define %html-parsing:text-string-token-symbol '*text-string*) -(define %html-parsing:text-char-token-symbol '*text-char*) - -(define %html-parsing:make-html-tokenizer - ;; TODO: Have the tokenizer replace contiguous whitespace within individual - ;; text tokens with single space characters (except for when in `pre' and - ;; verbatim elements). The parser will introduce new contiguous whitespace - ;; (e.g., when text tokens are concatenated, invalid end tags are removed, - ;; whitespace is irrelevant between certain elements), but then the parser - ;; only has to worry about the first and last character of each string. - ;; Perhaps the text tokens should have both leading and trailing whitespace - ;; stripped, and contain flags for whether or not leading and trailing - ;; whitespace occurred. - (letrec ((no-token '()) - - ;; TODO: Maybe make these three variables options. - - (verbatim-to-eof-elems '(plaintext)) - - (verbatim-pair-elems '(script server style xmp)) - - (ws-chars (list #\space - (integer->char 9) - (integer->char 10) - (integer->char 11) - (integer->char 12) - (integer->char 13))) - - (gosc/string-or-false - (lambda (os) - (let ((s (get-output-string os))) - (if (string=? s "") #f s)))) - - (gosc/symbol-or-false - (lambda (os) - (let ((s (gosc/string-or-false os))) - (if s (string->symbol s) #f)))) - ) - (lambda (in normalized?) - ;; TODO: Make a tokenizer option that causes XML namespace qualifiers to - ;; be ignored. - (letrec - ( - ;; Port buffer with inexpensive unread of one character and slightly - ;; more expensive pushback of second character to unread. The - ;; procedures themselves do no consing. The tokenizer currently - ;; needs two-symbol lookahead, due to ambiguous "/" while parsing - ;; element and attribute names, which could be either empty-tag - ;; syntax or XML qualified names. - (c #f) - (next-c #f) - (c-consumed? #t) - (read-c (lambda () - (if c-consumed? - (if next-c - (begin (set! c next-c) - (set! next-c #f)) - (set! c (read-char in))) - (set! c-consumed? #t)))) - (unread-c (lambda () - (if c-consumed? - (set! c-consumed? #f) - ;; TODO: Procedure name in error message really - ;; isn't "%html-parsing:make-html-tokenizer"... - (error '%html-parsing:make-html-tokenizer - "already unread: ~S" - c)))) - (push-c (lambda (new-c) - (if c-consumed? - (begin (set! c new-c) - (set! c-consumed? #f)) - (if next-c - (error '%html-parsing:make-html-tokenizer - "pushback full: ~S" - c) - (begin (set! next-c c) - (set! c new-c) - (set! c-consumed? #f)))))) - - ;; TODO: These procedures are a temporary convenience for - ;; enumerating the pertinent character classes, with an eye towards - ;; removing redundant tests of character class. These procedures - ;; should be eliminated in a future version. - (c-eof? (lambda () (eof-object? c))) - (c-amp? (lambda () (eqv? c #\&))) - (c-apos? (lambda () (eqv? c #\'))) - (c-bang? (lambda () (eqv? c #\!))) - (c-colon? (lambda () (eqv? c #\:))) - (c-quot? (lambda () (eqv? c #\"))) - (c-equals? (lambda () (eqv? c #\=))) - (c-gt? (lambda () (eqv? c #\>))) - (c-lsquare? (lambda () (eqv? c #\[))) - (c-lt? (lambda () (eqv? c #\<))) - (c-minus? (lambda () (eqv? c #\-))) - (c-pound? (lambda () (eqv? c #\#))) - (c-ques? (lambda () (eqv? c #\?))) - (c-semi? (lambda () (eqv? c #\;))) - (c-slash? (lambda () (eqv? c #\/))) - (c-splat? (lambda () (eqv? c #\*))) - (c-lf? (lambda () (eqv? c #\newline))) - (c-angle? (lambda () (memv c '(#\< #\>)))) - (c-ws? (lambda () (memv c ws-chars))) - (c-alpha? (lambda () (char-alphabetic? c))) - (c-digit? (lambda () (char-numeric? c))) - (c-alphanum? (lambda () (or (c-alpha?) (c-digit?)))) - (c-hexlet? (lambda () (memv c '(#\a #\b #\c #\d #\e #\f - #\A #\B #\C #\D #\E #\F)))) - - (skip-ws (lambda () (read-c) (if (c-ws?) (skip-ws) (unread-c)))) - - (if-read-chars - (lambda (match-chars yes-thunk no-proc) - (let loop ((chars match-chars) - (match-count 0)) - (if (null? chars) - (yes-thunk) - (begin (read-c) - (if (eqv? c (car chars)) - (begin (loop (cdr chars) (+ 1 match-count))) - (begin (unread-c) - (no-proc match-chars match-count)))))))) - - (write-chars-count - (lambda (chars count port) - (let loop ((chars chars) - (count count)) - (or (zero? count) - (begin (write-char (car chars) port) - (loop (cdr chars) - (- count 1))))))) - - (make-start-token - (if normalized? - (lambda (name ns attrs) - (list name (cons '@ attrs))) - (lambda (name ns attrs) - (if (null? attrs) - (list name) - (list name (cons '@ attrs)))))) - - (make-empty-token - (lambda (name ns attrs) - (cons %html-parsing:empty-token-symbol - (make-start-token name ns attrs)))) - - (make-end-token - (if normalized? - (lambda (name ns attrs) - (list %html-parsing:end-token-symbol - name - (cons '@ attrs))) - (lambda (name ns attrs) - (if (null? attrs) - (list %html-parsing:end-token-symbol name) - (list %html-parsing:end-token-symbol - name - (cons '@ attrs)))))) - - (make-comment-token - (lambda (str) (list '*COMMENT* str))) - - (make-decl-token - (lambda (parts) (cons '*DECL* parts))) - - (scan-qname - ;; TODO: Make sure we don't accept local names that have "*", since - ;; this can break SXML tools. Have to validate this afterwards if - ;; "verbatim-safe?". Also check for "@" and maybe "@@". Check - ;; qname parsing code, especially for verbatim mode. This is - ;; important! - (lambda (verbatim-safe?) - ;; Note: If we accept some invalid local names, we only need two - ;; symbols of lookahead to determine the end of a qname. - (letrec ((os #f) - (ns '()) - (vcolons 0) - (good-os (lambda () - (or os - (begin (set! os (open-output-string)) - os))))) - (let loop () - (read-c) - (cond ((c-eof?) #f) - ((or (c-ws?) (c-splat?)) - (if verbatim-safe? - (unread-c) - #f)) - ((or (c-angle?) (c-equals?) (c-quot?) (c-apos?)) - (unread-c)) - ((c-colon?) - (or (null? ns) - (set! ns (cons ":" ns))) - (if os - (begin - (set! ns (cons (get-output-string os) - ns)) - (set! os #f)) - #f) - (loop)) - ((c-slash?) - (read-c) - (cond ((or (c-eof?) - (c-ws?) - (c-equals?) - (c-apos?) - (c-quot?) - (c-angle?) - (c-splat?)) - (unread-c) - (push-c #\/)) - (else (write-char #\/ (good-os)) - (write-char c os) - (loop)))) - (else (write-char c (good-os)) - (loop)))) - (let ((ns (if (null? ns) - #f - (apply string-append - (reverse ns)))) - (localname (if os (get-output-string os) #f))) - (if verbatim-safe? - ;; TODO: Make sure we don't have ambiguous ":" or drop - ;; any characters! - (cons ns localname) - ;; Note: We represent "xml:" and "xmlns:" syntax as - ;; normal qnames, for lack of something better to do with - ;; them when we don't support XML namespaces. - ;; - ;; TODO: Local names are currently forced to lowercase, - ;; since HTML is usually case-insensitive. If XML - ;; namespaces are used, we might wish to keep local names - ;; case-sensitive. - (if localname - (if ns - (if (or (string=? ns "xml") - (string=? ns "xmlns")) - (string->symbol (string-append ns - ":" - localname)) - (cons ns - (string->symbol (string-downcase - localname)))) - (string->symbol (string-downcase localname))) - (if ns - (string->symbol (string-downcase ns)) - ;; TODO: Ensure in rest of code that returning #f - ;; as a name here is OK. - #f))))))) - - (scan-tag - (lambda (start?) - (skip-ws) - (let ((tag-name (scan-qname #f)) - (tag-ns #f) - (tag-attrs #f) - (tag-empty? #f)) - ;; Scan element name. - (if (pair? tag-name) - (begin (set! tag-ns (car tag-name)) - (set! tag-name (cdr tag-name))) - #f) - ;; TODO: Ensure there's no case in which a #f tag-name isn't - ;; compensated for later. - ;; - ;; Scan element attributes. - (set! tag-attrs - (let scan-attr-list () - (read-c) - (cond ((c-eof?) '()) - ((c-angle?) (unread-c) '()) - ((c-slash?) - (set! tag-empty? #t) - (scan-attr-list)) - ((c-alpha?) - (unread-c) - (let ((attr (scan-attr))) - (cons attr (scan-attr-list)))) - (else (scan-attr-list))))) - ;; Find ">" or unnatural end. - (let loop () - (read-c) - (cond ((c-eof?) no-token) - ((c-slash?) (set! tag-empty? #t) (loop)) - ((c-gt?) #f) - ((c-ws?) (loop)) - (else (unread-c)))) - ;; Change the tokenizer mode if necessary. - (cond ((not start?) #f) - (tag-empty? #f) - ;; TODO: Maybe make one alist lookup here, instead of - ;; two. - ((memq tag-name verbatim-to-eof-elems) - (set! nexttok verbeof-nexttok)) - ((memq tag-name verbatim-pair-elems) - (set! nexttok (make-verbpair-nexttok tag-name)))) - ;; Return a token object. - (if start? - (if tag-empty? - (make-empty-token tag-name tag-ns tag-attrs) - (make-start-token tag-name tag-ns tag-attrs)) - (make-end-token tag-name tag-ns tag-attrs))))) - - (scan-attr - (lambda () - (let ((name (scan-qname #f)) - (val #f)) - (if (pair? name) - (set! name (cdr name)) - #f) - (let loop-equals-or-end () - (read-c) - (cond ((c-eof?) no-token) - ((c-ws?) (loop-equals-or-end)) - ((c-equals?) - (let loop-quote-or-unquoted () - (read-c) - (cond ((c-eof?) no-token) - ((c-ws?) (loop-quote-or-unquoted)) - ((or (c-apos?) (c-quot?)) - (let ((term c)) - (set! val (open-output-string)) - (let loop-quoted-val () - (read-c) - (cond ((c-eof?) #f) - ((eqv? c term) #f) - ((c-amp?) (let ((entity (scan-entity))) - (display entity val) - (loop-quoted-val))) - (else (write-char c val) - (loop-quoted-val)))))) - ((c-angle?) (unread-c)) - (else - (set! val (open-output-string)) - (write-char c val) - (let loop-unquoted-val () - (read-c) - (cond ((c-eof?) no-token) - ((c-apos?) #f) - ((c-quot?) #f) - ((or (c-ws?) (c-angle?) - ;;(c-slash?) - ) - (unread-c)) - ;; Note: We can treat a slash in an - ;; unquoted attribute value as a - ;; value constituent because the - ;; slash is specially-handled only - ;; for XHTML, and XHTML attribute - ;; values must always be quoted. We - ;; could do lookahead for "/>", but - ;; that wouldn't let us parse HTML - ;; "" correctly, so this is - ;; an easier and more correct way to - ;; do things. - (else (write-char c val) - (loop-unquoted-val)))))))) - (else (unread-c)))) - (if normalized? - (list name (if val - (get-output-string val) - (symbol->string name))) - (if val - (list name (get-output-string val)) - (list name)))))) - - (scan-comment - ;; TODO: Rewrite this to use tail recursion rather than a state - ;; variable. - (lambda () - (let ((os (open-output-string)) - (state 'start-minus)) - (let loop () - (read-c) - (cond ((c-eof?) #f) - ((c-minus?) - (set! state - (case state - ((start-minus) 'start-minus-minus) - ((start-minus-minus body) 'end-minus) - ((end-minus) 'end-minus-minus) - ((end-minus-minus) (write-char #\- os) state) - (else (error '<%html-parsing:make-html-tokenizer> - "invalid state: ~S" - state)))) - (loop)) - ((and (c-gt?) (eq? state 'end-minus-minus)) #f) - (else (case state - ((end-minus) (write-char #\- os)) - ((end-minus-minus) (display "--" os))) - (set! state 'body) - (write-char c os) - (loop)))) - (make-comment-token (get-output-string os))))) - - (scan-possible-cdata - (lambda () - ;; Read ") - (lambda () (get-output-string os)) - (lambda (chars count) - (if (zero? count) - (if (eof-object? c) - (get-output-string os) - (begin (write-char c os) - (read-c) - (loop))) - (begin (write-char #\] os) - (if (= count 2) - (push-c #\]) - #f) - (loop))))))))) - - (scan-pi - (lambda () - (skip-ws) - (let ((name (open-output-string)) - (val (open-output-string))) - (let scan-name () - (read-c) - (cond ((c-eof?) #f) - ((c-ws?) #f) - ((c-alpha?) (write-char c name) (scan-name)) - (else (unread-c)))) - ;; TODO: Do we really want to emit #f for PI name? - (set! name (gosc/symbol-or-false name)) - (let scan-val () - (read-c) - (cond ((c-eof?) #f) - ;; ((c-amp?) (display (scan-entity) val) - ;; (scan-val)) - ((c-ques?) - (read-c) - (cond ((c-eof?) (write-char #\? val)) - ((c-gt?) #f) - (else (write-char #\? val) - (unread-c) - (scan-val)))) - (else (write-char c val) (scan-val)))) - (list '*PI* - name - (get-output-string val))))) - - (scan-decl - ;; TODO: Find if SXML includes declaration forms, and if so, use - ;; whatever format SXML wants. - ;; - ;; TODO: Rewrite to eliminate state variables. - (letrec - ((scan-parts - (lambda () - (let ((part (open-output-string)) - (nonsymbol? #f) - (state 'before) - (last? #f)) - (let loop () - (read-c) - (cond ((c-eof?) #f) - ((c-ws?) - (case state - ((before) (loop)) - ((quoted) (write-char c part) (loop)))) - ((and (c-gt?) (not (eq? state 'quoted))) - (set! last? #t)) - ((and (c-lt?) (not (eq? state 'quoted))) - (unread-c)) - ((c-quot?) - (case state - ((before) (set! state 'quoted) (loop)) - ((unquoted) (unread-c)) - ((quoted) #f))) - (else - (if (eq? state 'before) - (set! state 'unquoted) - #f) - (set! nonsymbol? (or nonsymbol? - (not (c-alphanum?)))) - (write-char c part) - (loop)))) - (set! part (get-output-string part)) - (if (string=? part "") - '() - (cons (if (or (eq? state 'quoted) nonsymbol?) - part - ;; TODO: Normalize case of things we make - ;; into symbols here. - (string->symbol part)) - (if last? - '() - (scan-parts)))))))) - (lambda () (make-decl-token (scan-parts))))) - - (special-entity-reverse-chars-to-string-alist - '(((#\p #\m #\a) . "&") - ((#\s #\o #\p #\a) . "'") - ((#\t #\g) . ">") - ((#\t #\l) . "<") - ((#\t #\o #\u #\q) . "\""))) - - (finish-terminated-named-entity - (lambda (reverse-name-chars) - (cond ((equal? '() reverse-name-chars) - "&") - ((assoc reverse-name-chars - special-entity-reverse-chars-to-string-alist) - => (lambda (p) - (cdr p))) - (else (%html-parsing:make-xexp-char-ref - (string->symbol (apply string (reverse reverse-name-chars)))))))) - - (finish-unterminated-named-entity - (lambda (reverse-name-chars) - (apply string (cons #\& (reverse reverse-name-chars))))) - - (scan-entity - (lambda () - (read-c) - (cond ((c-eof?) "&") - ((c-alpha?) - ;; TODO: Do entity names have a maximum length? - (let loop ((reverse-name-chars (cons c '()))) - (read-c) - (cond ((c-eof?) (finish-unterminated-named-entity - reverse-name-chars)) - ((c-alpha?) (let ((reverse-name-chars (cons c reverse-name-chars))) - (cond ((assoc reverse-name-chars - special-entity-reverse-chars-to-string-alist) - => (lambda (p) - (read-c) - (or (c-semi?) - (unread-c)) - (cdr p))) - (else (loop reverse-name-chars))))) - ((c-semi?) (finish-terminated-named-entity - reverse-name-chars)) - (else (unread-c) - (finish-unterminated-named-entity - reverse-name-chars))))) - ((c-pound?) - (let ((num (open-output-string)) - (hex? #f)) - (read-c) - (cond ((c-eof?) #f) - ((memv c '(#\x #\X)) (set! hex? #t) (read-c))) - (let loop () - (cond ((c-eof?) #f) - ((c-semi?) #f) - ((or (c-digit?) (and hex? (c-hexlet?))) - (write-char c num) - (read-c) - (loop)) - (else (unread-c)))) - (set! num (get-output-string num)) - (if (string=? num "") - "&#;" - (let ((n (string->number num (if hex? 16 10)))) - (if (<= 32 n 126) - (string (integer->char n)) - (string (integer->char n))))))) - (else (unread-c) "&")))) - - (normal-nexttok - (lambda () - (read-c) - (cond ((c-eof?) no-token) - ((c-lt?) - (let loop () - (read-c) - (cond ((c-eof?) "<") - ;; ((c-ws?) (loop)) - ((c-slash?) (scan-tag #f)) - ((c-ques?) (scan-pi)) - ((c-alpha?) (unread-c) (scan-tag #t)) - ((c-bang?) - (read-c) - (if (c-lsquare?) - (scan-possible-cdata) - (let loop () - (cond ((c-eof?) no-token) - ((c-ws?) (read-c) (loop)) - ((c-minus?) (scan-comment)) - (else (unread-c) - (scan-decl)))))) - (else (unread-c) "<")))) - ((c-gt?) ">") - (else (let ((os (open-output-string))) - (let loop () - (cond ((c-eof?) #f) - ((c-angle?) (unread-c)) - ((c-amp?) - (let ((entity (scan-entity))) - (if (string? entity) - (begin (display entity os) - (read-c) - (loop)) - (let ((saved-nexttok nexttok)) - (set! nexttok - (lambda () - (set! nexttok - saved-nexttok) - entity)))))) - (else (write-char c os) - (or (c-lf?) - (begin (read-c) (loop)))))) - (let ((text (get-output-string os))) - (if (equal? text "") - (nexttok) - text))))))) - - (verbeof-nexttok - (lambda () - (read-c) - (if (c-eof?) - no-token - (let ((os (open-output-string))) - (let loop () - (or (c-eof?) - (begin (write-char c os) - (or (c-lf?) - (begin (read-c) (loop)))))) - (get-output-string os))))) - - (make-verbpair-nexttok - (lambda (elem-name) - (lambda () - (let ((os (open-output-string))) - ;; Accumulate up to a newline-terminated line. - (let loop () - (read-c) - (cond ((c-eof?) - ;; Got EOF in verbatim context, so set the normal - ;; nextok procedure, then fall out of loop. - (set! nexttok normal-nexttok)) - ((c-lt?) - ;; Got "<" in verbatim context, so get next - ;; character. - (read-c) - (cond ((c-eof?) - ;; Got "<" then EOF, so set to the normal - ;; nexttok procedure, add the "<" to the - ;; verbatim string, and fall out of loop. - (set! nexttok normal-nexttok) - (write-char #\< os)) - ((c-slash?) - ;; Got "symbol - (string-downcase local)) - elem-name)) - ;; This is the terminator tag, so - ;; scan to the end of it, set the - ;; nexttok, and fall out of the loop. - (begin - (let scan-to-end () - (read-c) - (cond ((c-eof?) #f) - ((c-gt?) #f) - ((c-lt?) (unread-c)) - ((c-alpha?) - (unread-c) - ;; Note: This is an - ;; expensive way to skip - ;; over an attribute, but - ;; in practice more - ;; verbatim end tags will - ;; not have attributes. - (scan-attr) - (scan-to-end)) - (else (scan-to-end)))) - (set! nexttok - (lambda () - (set! nexttok - normal-nexttok) - (make-end-token - elem-name #f '())))) - ;; This isn't the terminator tag, so - ;; add to the verbatim string the - ;; "xexp} and related procedures, except -;; using @var{tokenizer} as a source of tokens, rather than tokenizing from an -;; input port. This procedure is used internally, and generally should not be -;; called directly. - -(define %html-parsing:parse-html/tokenizer - ;; Note: This algorithm was originally written in 2001 (as part of the first - ;; Scheme library the author ever wrote), and then on 2009-08-16 was revamped - ;; to not use mutable pairs, for PLT 4 compatibility. It could still use - ;; some work to be more FP, but it works for now. - (letrec ((empty-elements - ;; TODO: Maybe make this an option. - %html-parsing:empty-elements) - (h-elem-parents - ;; Am doing this kludge mainly for mid-1990s HTML that uses the `p` - ;; element wrong. Trying to get all appropriate parents other than - ;; `p` that I can, to reduce breaking other code. - '(a article aside blink blockquote body center footer form header html li main nav section slot template)) - (parent-constraints - ;; TODO: Maybe make this an option. - `((area . (map span)) - (body . (html)) - (caption . (table)) - (colgroup . (table)) - (dd . (dl)) - (dt . (dl)) - (frame . (frameset)) - (head . (html)) - (h1 . ,h-elem-parents) - (h2 . ,h-elem-parents) - (h3 . ,h-elem-parents) - (h4 . ,h-elem-parents) - (h5 . ,h-elem-parents) - (h6 . ,h-elem-parents) - (isindex . (head)) - (li . (dir menu ol ul)) - (meta . (head)) - (noframes . (frameset)) - (option . (select)) - (p . (blockquote body details figcaption html li td th)) - (param . (applet)) - (tbody . (table)) - (td . (tr)) - (th . (tr)) - (thead . (table)) - (title . (head)) - (tr . (table tbody thead)))) - (token-kinds-that-always-get-added - `(*COMMENT* - *DECL* - *PI* - ,%html-parsing:entity-token-symbol - ,%html-parsing:text-string-token-symbol - ,%html-parsing:text-char-token-symbol)) - (start-tag-name (lambda (tag-token) (car tag-token))) - (end-tag-name (lambda (tag-token) (list-ref tag-token 1)))) - (lambda (tokenizer normalized?) - ;;(log-html-parsing-debug "(%html-parsing:parse-html/tokenizer ~S ~S)" tokenizer normalized?) - (let ((begs (list (vector #f '())))) - (letrec ((add-thing-as-child-of-current-beg - (lambda (tok) - (let ((beg (car begs))) - (vector-set! beg 1 (cons tok (vector-ref beg 1)))))) - - (beg->elem - (lambda (beg) - (let ((elem-name (vector-ref beg 0)) - (attrs-and-contents (reverse (vector-ref beg 1)))) - (cons elem-name attrs-and-contents)))) - - (finish-current-beg-and-return-elem - (lambda () - (let ((elem (beg->elem (car begs)))) - (set! begs (cdr begs)) - (or (null? begs) - (add-thing-as-child-of-current-beg elem)) - elem))) - - (finish-current-beg - (lambda () - (finish-current-beg-and-return-elem))) - - (finish-all-begs-and-return-top - (lambda () - (let loop () - (let ((elem (finish-current-beg-and-return-elem))) - (if (car elem) - (loop) - (cdr elem)))))) - - (finish-begs-up-to-and-including-name - (lambda (name) - ;; (log-html-parsing-debug "(finish-begs-up-to-and-including-name ~S)" name) - (let loop-find-name ((find-begs begs) - (depth 1)) - (let ((beg-name (vector-ref (car find-begs) 0))) - (cond ((not beg-name) - ;; We reached the root without finding a - ;; matching beg, so don't finish anything. - ;; - ;; TODO: 2022-04-02: Consider having a `*TOP*` - ;; kludge in `parent-constraints` that's checked - ;; here, especially for handling mid-1990s HTML - ;; `p` element (so that we can keep `p` from - ;; being a child of `p` even when there's no - ;; parent `body` or `html` element). - (void)) - ((eqv? name beg-name) - ;; We found a match, so finish the begs up to - ;; depth. - (let loop-finish ((depth depth)) - (or (zero? depth) - (begin - (finish-current-beg) - (loop-finish (- depth 1)))))) - (else - ;; Didn't find a match yet, and there's still at - ;; least one more beg to look at, so recur. - (loop-find-name (cdr find-begs) - (+ depth 1)))))))) - - (finish-begs-upto-but-not-including-names - (lambda (names) - ;; (log-html-parsing-debug "(finish-begs-upto-but-not-including-names ~S)" names) - ;; (log-html-parsing-debug "begs = ~S" begs) - (let loop-find-name ((find-begs begs) - (depth 0)) - (let ((beg-name (vector-ref (car find-begs) 0))) - (cond ((not beg-name) - ;; We reached the root without finding a - ;; matching beg, so simply discard it. - (void)) - ((memq beg-name names) - ;; We found a match, so finish the begs up to - ;; depth. - (let loop-finish ((depth depth)) - (or (zero? depth) - (begin - (finish-current-beg) - (loop-finish (- depth 1)))))) - (else - ;; Didn't find a match yet, and there's still at - ;; least one more beg to look at, so recur. - (loop-find-name (cdr find-begs) - (+ depth 1))))))))) - - (let loop () - (let ((tok (tokenizer))) - (if (null? tok) - (finish-all-begs-and-return-top) - (let ((kind (%html-parsing:xexp-token-kind tok))) - ;; (log-html-parsing-debug "kind = ~S" kind) - (cond ((memv kind token-kinds-that-always-get-added) - (add-thing-as-child-of-current-beg tok)) - ((eqv? kind %html-parsing:start-token-symbol) - ;; (log-html-parsing-debug "is-start-token-symbol") - (let* ((name (start-tag-name tok)) - (cell (assq name parent-constraints))) - ;; (log-html-parsing-debug "name = ~S cell = ~S" name cell) - (and cell - (finish-begs-upto-but-not-including-names - (cons 'div (cdr cell)))) - (if (memq name empty-elements) - (add-thing-as-child-of-current-beg tok) - (set! begs (cons (vector (car tok) - (cdr tok)) - begs))))) - ((eqv? kind %html-parsing:empty-token-symbol) - ;; Empty tag token, so just add it to current - ;; beginning while stripping off leading `*EMPTY*' - ;; symbol so that the token becomes normal SXML - ;; element syntax. - (add-thing-as-child-of-current-beg (cdr tok))) - ((eqv? kind %html-parsing:end-token-symbol) - (let ((name (end-tag-name tok))) - (if name - ;; Try to finish to a start tag matching this - ;; end tag. If none, just drop the token, - ;; though we used to add it to the current - ;; beginning. - (finish-begs-up-to-and-including-name - name) - ;; We have an anonymous end tag, so match it - ;; with the most recent beginning. If no - ;; beginning to match, then just drop the - ;; token, though we used to add it to the - ;; current beginning. - (and (vector-ref (car begs) 0) - (finish-current-beg))))) - (else (error 'parse-html/tokenizer - "unknown tag kind: ~S" - kind))) - (loop)))))))))) - -;; TODO: Quote of message to a user: -;; -;; >I think this behavior is due to HtmlPrag's use in "parse-html/tokenizer" -;; >of its local "parent-constraints" variable. -;; > -;; >The following line of code from the variable binding expresses the -;; >constraint that any "p" element can have as immediate parent element -;; >only "body", "td", or "th": -;; > -;; > (p . (body td th)) -;; > -;; >I think I know a good heuristic for dealing with unfamiliar but -;; >seemingly well-formed elements, like "page" in this case, but I'm afraid -;; >I don't have time to implement it right now. (I am job-hunting right -;; >now, and there are many other coding things I need to do first.) -;; > -;; >Would adding "page" to the above line of the HtmlPrag source code work -;; >around the current problem, or do you need a better solution right now? - -;; @defproc %parse-html input normalized? top? -;; -;; This procedure is now used internally by @code{html->xexp} and its -;; variants, and should not be used directly by programs. The interface is -;; likely to change in future versions of HtmlPrag. - -(define (%html-parsing:parse-html input normalized? top?) - (let ((parse - (lambda () - (%html-parsing:parse-html/tokenizer - (%html-parsing:make-html-tokenizer - (cond ((input-port? input) input) - ((string? input) (open-input-string input)) - (else (error - '%html-parsing:parse-html - "invalid input type: ~E" - input))) - normalized?) - normalized?)))) - (if top? - (cons '*TOP* (parse)) - (parse)))) - -;; @defproc html->sxml-0nf input -;; @defprocx html->sxml-1nf input -;; @defprocx html->sxml-2nf input -;; @defprocx html->sxml input -(doc (defproc (html->xexp (input (or/c input-port? string?))) - xexp - - (para "Parse HTML permissively from " - (racket input) - ", which is either an input port or a string, and emit an -SXML/xexp equivalent or approximation. To borrow and slightly modify an -example from Kiselyov's discussion of his HTML parser:") - - (racketinput - (html->xexp - (string-append - "whatever" - " link

" - "