forked from cadence/breezewiki
		
	Add experimental FTS indexer
This commit is contained in:
		
							parent
							
								
									b8ccd6cc3e
								
							
						
					
					
						commit
						83e78be0dc
					
				
					 1 changed files with 89 additions and 0 deletions
				
			
		
							
								
								
									
										89
									
								
								archiver/fts.rkt
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										89
									
								
								archiver/fts.rkt
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,89 @@
 | 
				
			||||||
 | 
					#lang racket/base
 | 
				
			||||||
 | 
					(require racket/function
 | 
				
			||||||
 | 
					         racket/future
 | 
				
			||||||
 | 
					         racket/match
 | 
				
			||||||
 | 
					         racket/path
 | 
				
			||||||
 | 
					         racket/promise
 | 
				
			||||||
 | 
					         racket/port
 | 
				
			||||||
 | 
					         racket/string
 | 
				
			||||||
 | 
					         file/gunzip
 | 
				
			||||||
 | 
					         db
 | 
				
			||||||
 | 
					         db/unsafe/sqlite3
 | 
				
			||||||
 | 
					         json
 | 
				
			||||||
 | 
					         json-pointer
 | 
				
			||||||
 | 
					         "../lib/html-parsing/main.rkt"
 | 
				
			||||||
 | 
					         "../lib/xexpr-utils.rkt"
 | 
				
			||||||
 | 
					         "../lib/tree-updater.rkt")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					(define (class-has? attributes substrs)
 | 
				
			||||||
 | 
					  (define cl (or (get-attribute 'class attributes) ""))
 | 
				
			||||||
 | 
					  (ormap (λ (substr) (string-contains? cl substr)) substrs))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					(define (updater element element-type attributes children)
 | 
				
			||||||
 | 
					  (cond
 | 
				
			||||||
 | 
					    [(class-has? attributes '("collapsed" "selflink" "label" "toc" "editsection" "reviews"))
 | 
				
			||||||
 | 
					     (list 'div '() '())]
 | 
				
			||||||
 | 
					    [#t
 | 
				
			||||||
 | 
					     (list element-type attributes children)]))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					(define slc (sqlite3-connect #:database "../storage/fts-separate.db"))
 | 
				
			||||||
 | 
					(sqlite3-load-extension slc "fts5")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					(define (writer page)
 | 
				
			||||||
 | 
					  (for ([bit page])
 | 
				
			||||||
 | 
					    (cond
 | 
				
			||||||
 | 
					      [(memq bit '(div p li td)) (displayln "")]
 | 
				
			||||||
 | 
					      [(symbol? bit) (void)]
 | 
				
			||||||
 | 
					      [(and (pair? bit) (eq? (car bit) '*COMMENT*)) (void)]
 | 
				
			||||||
 | 
					      [(and (pair? bit) (eq? (car bit) '@)) (void)]
 | 
				
			||||||
 | 
					      [(pair? bit) (writer bit)]
 | 
				
			||||||
 | 
					      [(string? bit) (display bit)])))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					(define wikiname "sto")
 | 
				
			||||||
 | 
					(define tablename (format "page_~a" wikiname))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					(define ((extract f)) ; f - filename
 | 
				
			||||||
 | 
					  (with-handlers
 | 
				
			||||||
 | 
					    ([exn:fail? (λ (err) (println f) (raise err))])
 | 
				
			||||||
 | 
					    (define j
 | 
				
			||||||
 | 
					      (case (path-get-extension f)
 | 
				
			||||||
 | 
					        [(#".json")
 | 
				
			||||||
 | 
					         (with-input-from-file f (λ () (read-json)))]
 | 
				
			||||||
 | 
					        [(#".gz")
 | 
				
			||||||
 | 
					         (define-values (in out) (make-pipe))
 | 
				
			||||||
 | 
					         (with-input-from-file f (λ () (gunzip-through-ports (current-input-port) out)))
 | 
				
			||||||
 | 
					         (read-json in)]
 | 
				
			||||||
 | 
					        [else #f]))
 | 
				
			||||||
 | 
					    (define title (json-pointer-value "/parse/title" j))
 | 
				
			||||||
 | 
					    (define page-html (preprocess-html-wiki (json-pointer-value "/parse/text" j)))
 | 
				
			||||||
 | 
					    (define page (update-tree updater (html->xexp page-html)))
 | 
				
			||||||
 | 
					    (define text (with-output-to-string (λ () (writer page))))
 | 
				
			||||||
 | 
					    (define shrink-text (regexp-replace* #px"([ \t]*\r?\n+)+" text "\n"))
 | 
				
			||||||
 | 
					    (values title shrink-text)))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					(println "extracting text...")
 | 
				
			||||||
 | 
					(define results
 | 
				
			||||||
 | 
					  (time
 | 
				
			||||||
 | 
					   (for/list ([f (directory-list (format "../storage/archive/~a" wikiname) #:build? #t)]
 | 
				
			||||||
 | 
					              #:when (member (path-get-extension f) '(#".json" #".gz")))
 | 
				
			||||||
 | 
					     (extract f))))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					(println "inserting...")
 | 
				
			||||||
 | 
					(query-exec slc "begin transaction")
 | 
				
			||||||
 | 
					#;(query-exec slc (format "create virtual table \"~a\" using fts5 (title, body, tokenize='porter unicode61')" wikiname))
 | 
				
			||||||
 | 
					(time
 | 
				
			||||||
 | 
					 (for ([fut results]
 | 
				
			||||||
 | 
					       [i (in-naturals 1)])
 | 
				
			||||||
 | 
					   (display "-")
 | 
				
			||||||
 | 
					   (when (and (> i 0) (= (modulo i 100) 0))
 | 
				
			||||||
 | 
					     (println i))
 | 
				
			||||||
 | 
					   (define-values (title shrink-text) (fut))
 | 
				
			||||||
 | 
					   (query-exec slc (format "insert into \"~a\" (title, body) values (?, ?)" tablename) title shrink-text)))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					(println "running optimize...")
 | 
				
			||||||
 | 
					(query-exec slc (format "insert into \"~a\" (\"~a\") values ('optimize')" tablename tablename))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					(println "committing...")
 | 
				
			||||||
 | 
					(query-exec slc "commit")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					(disconnect slc)
 | 
				
			||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue