forked from cadence/breezewiki
		
	Add experimental FTS indexer
This commit is contained in:
		
							parent
							
								
									b8ccd6cc3e
								
							
						
					
					
						commit
						83e78be0dc
					
				
					 1 changed files with 89 additions and 0 deletions
				
			
		
							
								
								
									
										89
									
								
								archiver/fts.rkt
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										89
									
								
								archiver/fts.rkt
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,89 @@
 | 
			
		|||
#lang racket/base
 | 
			
		||||
(require racket/function
 | 
			
		||||
         racket/future
 | 
			
		||||
         racket/match
 | 
			
		||||
         racket/path
 | 
			
		||||
         racket/promise
 | 
			
		||||
         racket/port
 | 
			
		||||
         racket/string
 | 
			
		||||
         file/gunzip
 | 
			
		||||
         db
 | 
			
		||||
         db/unsafe/sqlite3
 | 
			
		||||
         json
 | 
			
		||||
         json-pointer
 | 
			
		||||
         "../lib/html-parsing/main.rkt"
 | 
			
		||||
         "../lib/xexpr-utils.rkt"
 | 
			
		||||
         "../lib/tree-updater.rkt")
 | 
			
		||||
 | 
			
		||||
(define (class-has? attributes substrs)
 | 
			
		||||
  (define cl (or (get-attribute 'class attributes) ""))
 | 
			
		||||
  (ormap (λ (substr) (string-contains? cl substr)) substrs))
 | 
			
		||||
 | 
			
		||||
(define (updater element element-type attributes children)
 | 
			
		||||
  (cond
 | 
			
		||||
    [(class-has? attributes '("collapsed" "selflink" "label" "toc" "editsection" "reviews"))
 | 
			
		||||
     (list 'div '() '())]
 | 
			
		||||
    [#t
 | 
			
		||||
     (list element-type attributes children)]))
 | 
			
		||||
 | 
			
		||||
(define slc (sqlite3-connect #:database "../storage/fts-separate.db"))
 | 
			
		||||
(sqlite3-load-extension slc "fts5")
 | 
			
		||||
 | 
			
		||||
(define (writer page)
 | 
			
		||||
  (for ([bit page])
 | 
			
		||||
    (cond
 | 
			
		||||
      [(memq bit '(div p li td)) (displayln "")]
 | 
			
		||||
      [(symbol? bit) (void)]
 | 
			
		||||
      [(and (pair? bit) (eq? (car bit) '*COMMENT*)) (void)]
 | 
			
		||||
      [(and (pair? bit) (eq? (car bit) '@)) (void)]
 | 
			
		||||
      [(pair? bit) (writer bit)]
 | 
			
		||||
      [(string? bit) (display bit)])))
 | 
			
		||||
 | 
			
		||||
(define wikiname "sto")
 | 
			
		||||
(define tablename (format "page_~a" wikiname))
 | 
			
		||||
 | 
			
		||||
(define ((extract f)) ; f - filename
 | 
			
		||||
  (with-handlers
 | 
			
		||||
    ([exn:fail? (λ (err) (println f) (raise err))])
 | 
			
		||||
    (define j
 | 
			
		||||
      (case (path-get-extension f)
 | 
			
		||||
        [(#".json")
 | 
			
		||||
         (with-input-from-file f (λ () (read-json)))]
 | 
			
		||||
        [(#".gz")
 | 
			
		||||
         (define-values (in out) (make-pipe))
 | 
			
		||||
         (with-input-from-file f (λ () (gunzip-through-ports (current-input-port) out)))
 | 
			
		||||
         (read-json in)]
 | 
			
		||||
        [else #f]))
 | 
			
		||||
    (define title (json-pointer-value "/parse/title" j))
 | 
			
		||||
    (define page-html (preprocess-html-wiki (json-pointer-value "/parse/text" j)))
 | 
			
		||||
    (define page (update-tree updater (html->xexp page-html)))
 | 
			
		||||
    (define text (with-output-to-string (λ () (writer page))))
 | 
			
		||||
    (define shrink-text (regexp-replace* #px"([ \t]*\r?\n+)+" text "\n"))
 | 
			
		||||
    (values title shrink-text)))
 | 
			
		||||
 | 
			
		||||
(println "extracting text...")
 | 
			
		||||
(define results
 | 
			
		||||
  (time
 | 
			
		||||
   (for/list ([f (directory-list (format "../storage/archive/~a" wikiname) #:build? #t)]
 | 
			
		||||
              #:when (member (path-get-extension f) '(#".json" #".gz")))
 | 
			
		||||
     (extract f))))
 | 
			
		||||
 | 
			
		||||
(println "inserting...")
 | 
			
		||||
(query-exec slc "begin transaction")
 | 
			
		||||
#;(query-exec slc (format "create virtual table \"~a\" using fts5 (title, body, tokenize='porter unicode61')" wikiname))
 | 
			
		||||
(time
 | 
			
		||||
 (for ([fut results]
 | 
			
		||||
       [i (in-naturals 1)])
 | 
			
		||||
   (display "-")
 | 
			
		||||
   (when (and (> i 0) (= (modulo i 100) 0))
 | 
			
		||||
     (println i))
 | 
			
		||||
   (define-values (title shrink-text) (fut))
 | 
			
		||||
   (query-exec slc (format "insert into \"~a\" (title, body) values (?, ?)" tablename) title shrink-text)))
 | 
			
		||||
 | 
			
		||||
(println "running optimize...")
 | 
			
		||||
(query-exec slc (format "insert into \"~a\" (\"~a\") values ('optimize')" tablename tablename))
 | 
			
		||||
 | 
			
		||||
(println "committing...")
 | 
			
		||||
(query-exec slc "commit")
 | 
			
		||||
 | 
			
		||||
(disconnect slc)
 | 
			
		||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue