From 83e78be0dcd0bf7d5fdd0ac545f5310587673998 Mon Sep 17 00:00:00 2001 From: Cadence Ember Date: Fri, 10 Nov 2023 17:09:18 +1300 Subject: [PATCH] Add experimental FTS indexer --- archiver/fts.rkt | 89 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 archiver/fts.rkt diff --git a/archiver/fts.rkt b/archiver/fts.rkt new file mode 100644 index 0000000..20b21cc --- /dev/null +++ b/archiver/fts.rkt @@ -0,0 +1,89 @@ +#lang racket/base +(require racket/function + racket/future + racket/match + racket/path + racket/promise + racket/port + racket/string + file/gunzip + db + db/unsafe/sqlite3 + json + json-pointer + "../lib/html-parsing/main.rkt" + "../lib/xexpr-utils.rkt" + "../lib/tree-updater.rkt") + +(define (class-has? attributes substrs) + (define cl (or (get-attribute 'class attributes) "")) + (ormap (λ (substr) (string-contains? cl substr)) substrs)) + +(define (updater element element-type attributes children) + (cond + [(class-has? attributes '("collapsed" "selflink" "label" "toc" "editsection" "reviews")) + (list 'div '() '())] + [#t + (list element-type attributes children)])) + +(define slc (sqlite3-connect #:database "../storage/fts-separate.db")) +(sqlite3-load-extension slc "fts5") + +(define (writer page) + (for ([bit page]) + (cond + [(memq bit '(div p li td)) (displayln "")] + [(symbol? bit) (void)] + [(and (pair? bit) (eq? (car bit) '*COMMENT*)) (void)] + [(and (pair? bit) (eq? (car bit) '@)) (void)] + [(pair? bit) (writer bit)] + [(string? bit) (display bit)]))) + +(define wikiname "sto") +(define tablename (format "page_~a" wikiname)) + +(define ((extract f)) ; f - filename + (with-handlers + ([exn:fail? (λ (err) (println f) (raise err))]) + (define j + (case (path-get-extension f) + [(#".json") + (with-input-from-file f (λ () (read-json)))] + [(#".gz") + (define-values (in out) (make-pipe)) + (with-input-from-file f (λ () (gunzip-through-ports (current-input-port) out))) + (read-json in)] + [else #f])) + (define title (json-pointer-value "/parse/title" j)) + (define page-html (preprocess-html-wiki (json-pointer-value "/parse/text" j))) + (define page (update-tree updater (html->xexp page-html))) + (define text (with-output-to-string (λ () (writer page)))) + (define shrink-text (regexp-replace* #px"([ \t]*\r?\n+)+" text "\n")) + (values title shrink-text))) + +(println "extracting text...") +(define results + (time + (for/list ([f (directory-list (format "../storage/archive/~a" wikiname) #:build? #t)] + #:when (member (path-get-extension f) '(#".json" #".gz"))) + (extract f)))) + +(println "inserting...") +(query-exec slc "begin transaction") +#;(query-exec slc (format "create virtual table \"~a\" using fts5 (title, body, tokenize='porter unicode61')" wikiname)) +(time + (for ([fut results] + [i (in-naturals 1)]) + (display "-") + (when (and (> i 0) (= (modulo i 100) 0)) + (println i)) + (define-values (title shrink-text) (fut)) + (query-exec slc (format "insert into \"~a\" (title, body) values (?, ?)" tablename) title shrink-text))) + +(println "running optimize...") +(query-exec slc (format "insert into \"~a\" (\"~a\") values ('optimize')" tablename tablename)) + +(println "committing...") +(query-exec slc "commit") + +(disconnect slc)