Add experimental FTS indexer
This commit is contained in:
		
							parent
							
								
									b8ccd6cc3e
								
							
						
					
					
						commit
						43c3f70736
					
				
					 1 changed files with 89 additions and 0 deletions
				
			
		
							
								
								
									
										89
									
								
								archiver/fts.rkt
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										89
									
								
								archiver/fts.rkt
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,89 @@ | |||
| #lang racket/base | ||||
| (require racket/function | ||||
|          racket/future | ||||
|          racket/match | ||||
|          racket/path | ||||
|          racket/promise | ||||
|          racket/port | ||||
|          racket/string | ||||
|          file/gunzip | ||||
|          db | ||||
|          db/unsafe/sqlite3 | ||||
|          json | ||||
|          json-pointer | ||||
|          "../lib/html-parsing/main.rkt" | ||||
|          "../lib/xexpr-utils.rkt" | ||||
|          "../lib/tree-updater.rkt") | ||||
| 
 | ||||
| (define (class-has? attributes substrs) | ||||
|   (define cl (or (get-attribute 'class attributes) "")) | ||||
|   (ormap (λ (substr) (string-contains? cl substr)) substrs)) | ||||
| 
 | ||||
| (define (updater element element-type attributes children) | ||||
|   (cond | ||||
|     [(class-has? attributes '("collapsed" "selflink" "label" "toc" "editsection" "reviews")) | ||||
|      (list 'div '() '())] | ||||
|     [#t | ||||
|      (list element-type attributes children)])) | ||||
| 
 | ||||
| (define slc (sqlite3-connect #:database "../storage/fts-separate.db")) | ||||
| (sqlite3-load-extension slc "fts5") | ||||
| 
 | ||||
| (define (writer page) | ||||
|   (for ([bit page]) | ||||
|     (cond | ||||
|       [(memq bit '(div p li td)) (displayln "")] | ||||
|       [(symbol? bit) (void)] | ||||
|       [(and (pair? bit) (eq? (car bit) '*COMMENT*)) (void)] | ||||
|       [(and (pair? bit) (eq? (car bit) '@)) (void)] | ||||
|       [(pair? bit) (writer bit)] | ||||
|       [(string? bit) (display bit)]))) | ||||
| 
 | ||||
| (define wikiname "sto") | ||||
| (define tablename (format "page_~a" wikiname)) | ||||
| 
 | ||||
| (define ((extract f)) ; f - filename | ||||
|   (with-handlers | ||||
|     ([exn:fail? (λ (err) (println f) (raise err))]) | ||||
|     (define j | ||||
|       (case (path-get-extension f) | ||||
|         [(#".json") | ||||
|          (with-input-from-file f (λ () (read-json)))] | ||||
|         [(#".gz") | ||||
|          (define-values (in out) (make-pipe)) | ||||
|          (with-input-from-file f (λ () (gunzip-through-ports (current-input-port) out))) | ||||
|          (read-json in)] | ||||
|         [else #f])) | ||||
|     (define title (json-pointer-value "/parse/title" j)) | ||||
|     (define page-html (preprocess-html-wiki (json-pointer-value "/parse/text" j))) | ||||
|     (define page (update-tree updater (html->xexp page-html))) | ||||
|     (define text (with-output-to-string (λ () (writer page)))) | ||||
|     (define shrink-text (regexp-replace* #px"([ \t]*\r?\n+)+" text "\n")) | ||||
|     (values title shrink-text))) | ||||
| 
 | ||||
| (println "extracting text...") | ||||
| (define results | ||||
|   (time | ||||
|    (for/list ([f (directory-list (format "../storage/archive/~a" wikiname) #:build? #t)] | ||||
|               #:when (member (path-get-extension f) '(#".json" #".gz"))) | ||||
|      (extract f)))) | ||||
| 
 | ||||
| (println "inserting...") | ||||
| (query-exec slc "begin transaction") | ||||
| #;(query-exec slc (format "create virtual table \"~a\" using fts5 (title, body, tokenize='porter unicode61')" wikiname)) | ||||
| (time | ||||
|  (for ([fut results] | ||||
|        [i (in-naturals 1)]) | ||||
|    (display "-") | ||||
|    (when (and (> i 0) (= (modulo i 100) 0)) | ||||
|      (println i)) | ||||
|    (define-values (title shrink-text) (fut)) | ||||
|    (query-exec slc (format "insert into \"~a\" (title, body) values (?, ?)" tablename) title shrink-text))) | ||||
| 
 | ||||
| (println "running optimize...") | ||||
| (query-exec slc (format "insert into \"~a\" (\"~a\") values ('optimize')" tablename tablename)) | ||||
| 
 | ||||
| (println "committing...") | ||||
| (query-exec slc "commit") | ||||
| 
 | ||||
| (disconnect slc) | ||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue