Add experimental FTS indexer
This commit is contained in:
		
							parent
							
								
									b8ccd6cc3e
								
							
						
					
					
						commit
						43c3f70736
					
				
					 1 changed files with 89 additions and 0 deletions
				
			
		
							
								
								
									
										89
									
								
								archiver/fts.rkt
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										89
									
								
								archiver/fts.rkt
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,89 @@ | ||||||
|  | #lang racket/base | ||||||
|  | (require racket/function | ||||||
|  |          racket/future | ||||||
|  |          racket/match | ||||||
|  |          racket/path | ||||||
|  |          racket/promise | ||||||
|  |          racket/port | ||||||
|  |          racket/string | ||||||
|  |          file/gunzip | ||||||
|  |          db | ||||||
|  |          db/unsafe/sqlite3 | ||||||
|  |          json | ||||||
|  |          json-pointer | ||||||
|  |          "../lib/html-parsing/main.rkt" | ||||||
|  |          "../lib/xexpr-utils.rkt" | ||||||
|  |          "../lib/tree-updater.rkt") | ||||||
|  | 
 | ||||||
|  | (define (class-has? attributes substrs) | ||||||
|  |   (define cl (or (get-attribute 'class attributes) "")) | ||||||
|  |   (ormap (λ (substr) (string-contains? cl substr)) substrs)) | ||||||
|  | 
 | ||||||
|  | (define (updater element element-type attributes children) | ||||||
|  |   (cond | ||||||
|  |     [(class-has? attributes '("collapsed" "selflink" "label" "toc" "editsection" "reviews")) | ||||||
|  |      (list 'div '() '())] | ||||||
|  |     [#t | ||||||
|  |      (list element-type attributes children)])) | ||||||
|  | 
 | ||||||
|  | (define slc (sqlite3-connect #:database "../storage/fts-separate.db")) | ||||||
|  | (sqlite3-load-extension slc "fts5") | ||||||
|  | 
 | ||||||
|  | (define (writer page) | ||||||
|  |   (for ([bit page]) | ||||||
|  |     (cond | ||||||
|  |       [(memq bit '(div p li td)) (displayln "")] | ||||||
|  |       [(symbol? bit) (void)] | ||||||
|  |       [(and (pair? bit) (eq? (car bit) '*COMMENT*)) (void)] | ||||||
|  |       [(and (pair? bit) (eq? (car bit) '@)) (void)] | ||||||
|  |       [(pair? bit) (writer bit)] | ||||||
|  |       [(string? bit) (display bit)]))) | ||||||
|  | 
 | ||||||
|  | (define wikiname "sto") | ||||||
|  | (define tablename (format "page_~a" wikiname)) | ||||||
|  | 
 | ||||||
|  | (define ((extract f)) ; f - filename | ||||||
|  |   (with-handlers | ||||||
|  |     ([exn:fail? (λ (err) (println f) (raise err))]) | ||||||
|  |     (define j | ||||||
|  |       (case (path-get-extension f) | ||||||
|  |         [(#".json") | ||||||
|  |          (with-input-from-file f (λ () (read-json)))] | ||||||
|  |         [(#".gz") | ||||||
|  |          (define-values (in out) (make-pipe)) | ||||||
|  |          (with-input-from-file f (λ () (gunzip-through-ports (current-input-port) out))) | ||||||
|  |          (read-json in)] | ||||||
|  |         [else #f])) | ||||||
|  |     (define title (json-pointer-value "/parse/title" j)) | ||||||
|  |     (define page-html (preprocess-html-wiki (json-pointer-value "/parse/text" j))) | ||||||
|  |     (define page (update-tree updater (html->xexp page-html))) | ||||||
|  |     (define text (with-output-to-string (λ () (writer page)))) | ||||||
|  |     (define shrink-text (regexp-replace* #px"([ \t]*\r?\n+)+" text "\n")) | ||||||
|  |     (values title shrink-text))) | ||||||
|  | 
 | ||||||
|  | (println "extracting text...") | ||||||
|  | (define results | ||||||
|  |   (time | ||||||
|  |    (for/list ([f (directory-list (format "../storage/archive/~a" wikiname) #:build? #t)] | ||||||
|  |               #:when (member (path-get-extension f) '(#".json" #".gz"))) | ||||||
|  |      (extract f)))) | ||||||
|  | 
 | ||||||
|  | (println "inserting...") | ||||||
|  | (query-exec slc "begin transaction") | ||||||
|  | #;(query-exec slc (format "create virtual table \"~a\" using fts5 (title, body, tokenize='porter unicode61')" wikiname)) | ||||||
|  | (time | ||||||
|  |  (for ([fut results] | ||||||
|  |        [i (in-naturals 1)]) | ||||||
|  |    (display "-") | ||||||
|  |    (when (and (> i 0) (= (modulo i 100) 0)) | ||||||
|  |      (println i)) | ||||||
|  |    (define-values (title shrink-text) (fut)) | ||||||
|  |    (query-exec slc (format "insert into \"~a\" (title, body) values (?, ?)" tablename) title shrink-text))) | ||||||
|  | 
 | ||||||
|  | (println "running optimize...") | ||||||
|  | (query-exec slc (format "insert into \"~a\" (\"~a\") values ('optimize')" tablename tablename)) | ||||||
|  | 
 | ||||||
|  | (println "committing...") | ||||||
|  | (query-exec slc "commit") | ||||||
|  | 
 | ||||||
|  | (disconnect slc) | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue