forked from cadence/breezewiki
Add experimental FTS indexer
This commit is contained in:
parent
b8ccd6cc3e
commit
83e78be0dc
1 changed files with 89 additions and 0 deletions
89
archiver/fts.rkt
Normal file
89
archiver/fts.rkt
Normal file
|
@ -0,0 +1,89 @@
|
|||
#lang racket/base
|
||||
(require racket/function
|
||||
racket/future
|
||||
racket/match
|
||||
racket/path
|
||||
racket/promise
|
||||
racket/port
|
||||
racket/string
|
||||
file/gunzip
|
||||
db
|
||||
db/unsafe/sqlite3
|
||||
json
|
||||
json-pointer
|
||||
"../lib/html-parsing/main.rkt"
|
||||
"../lib/xexpr-utils.rkt"
|
||||
"../lib/tree-updater.rkt")
|
||||
|
||||
(define (class-has? attributes substrs)
|
||||
(define cl (or (get-attribute 'class attributes) ""))
|
||||
(ormap (λ (substr) (string-contains? cl substr)) substrs))
|
||||
|
||||
(define (updater element element-type attributes children)
|
||||
(cond
|
||||
[(class-has? attributes '("collapsed" "selflink" "label" "toc" "editsection" "reviews"))
|
||||
(list 'div '() '())]
|
||||
[#t
|
||||
(list element-type attributes children)]))
|
||||
|
||||
(define slc (sqlite3-connect #:database "../storage/fts-separate.db"))
|
||||
(sqlite3-load-extension slc "fts5")
|
||||
|
||||
(define (writer page)
|
||||
(for ([bit page])
|
||||
(cond
|
||||
[(memq bit '(div p li td)) (displayln "")]
|
||||
[(symbol? bit) (void)]
|
||||
[(and (pair? bit) (eq? (car bit) '*COMMENT*)) (void)]
|
||||
[(and (pair? bit) (eq? (car bit) '@)) (void)]
|
||||
[(pair? bit) (writer bit)]
|
||||
[(string? bit) (display bit)])))
|
||||
|
||||
(define wikiname "sto")
|
||||
(define tablename (format "page_~a" wikiname))
|
||||
|
||||
(define ((extract f)) ; f - filename
|
||||
(with-handlers
|
||||
([exn:fail? (λ (err) (println f) (raise err))])
|
||||
(define j
|
||||
(case (path-get-extension f)
|
||||
[(#".json")
|
||||
(with-input-from-file f (λ () (read-json)))]
|
||||
[(#".gz")
|
||||
(define-values (in out) (make-pipe))
|
||||
(with-input-from-file f (λ () (gunzip-through-ports (current-input-port) out)))
|
||||
(read-json in)]
|
||||
[else #f]))
|
||||
(define title (json-pointer-value "/parse/title" j))
|
||||
(define page-html (preprocess-html-wiki (json-pointer-value "/parse/text" j)))
|
||||
(define page (update-tree updater (html->xexp page-html)))
|
||||
(define text (with-output-to-string (λ () (writer page))))
|
||||
(define shrink-text (regexp-replace* #px"([ \t]*\r?\n+)+" text "\n"))
|
||||
(values title shrink-text)))
|
||||
|
||||
(println "extracting text...")
|
||||
(define results
|
||||
(time
|
||||
(for/list ([f (directory-list (format "../storage/archive/~a" wikiname) #:build? #t)]
|
||||
#:when (member (path-get-extension f) '(#".json" #".gz")))
|
||||
(extract f))))
|
||||
|
||||
(println "inserting...")
|
||||
(query-exec slc "begin transaction")
|
||||
#;(query-exec slc (format "create virtual table \"~a\" using fts5 (title, body, tokenize='porter unicode61')" wikiname))
|
||||
(time
|
||||
(for ([fut results]
|
||||
[i (in-naturals 1)])
|
||||
(display "-")
|
||||
(when (and (> i 0) (= (modulo i 100) 0))
|
||||
(println i))
|
||||
(define-values (title shrink-text) (fut))
|
||||
(query-exec slc (format "insert into \"~a\" (title, body) values (?, ?)" tablename) title shrink-text)))
|
||||
|
||||
(println "running optimize...")
|
||||
(query-exec slc (format "insert into \"~a\" (\"~a\") values ('optimize')" tablename tablename))
|
||||
|
||||
(println "committing...")
|
||||
(query-exec slc "commit")
|
||||
|
||||
(disconnect slc)
|
Loading…
Reference in a new issue