wip
This commit is contained in:
		
							parent
							
								
									f33571f2f4
								
							
						
					
					
						commit
						cf7b1c0c5d
					
				
					 5 changed files with 334 additions and 23 deletions
				
			
		
							
								
								
									
										1
									
								
								.gitignore
									
										
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								.gitignore
									
										
									
									
										vendored
									
									
								
							|  | @ -3,6 +3,7 @@ | ||||||
| /node_modules | /node_modules | ||||||
| /built | /built | ||||||
| /uploads | /uploads | ||||||
|  | /data | ||||||
| npm-debug.log | npm-debug.log | ||||||
| *.pem | *.pem | ||||||
| run.bat | run.bat | ||||||
|  |  | ||||||
|  | @ -64,6 +64,7 @@ | ||||||
|     "@types/webpack": "3.0.10", |     "@types/webpack": "3.0.10", | ||||||
|     "@types/webpack-stream": "3.2.7", |     "@types/webpack-stream": "3.2.7", | ||||||
|     "@types/websocket": "0.0.34", |     "@types/websocket": "0.0.34", | ||||||
|  |     "@types/msgpack-lite": "^0.1.5", | ||||||
|     "chai": "4.1.2", |     "chai": "4.1.2", | ||||||
|     "chai-http": "3.0.0", |     "chai-http": "3.0.0", | ||||||
|     "css-loader": "0.28.7", |     "css-loader": "0.28.7", | ||||||
|  | @ -97,7 +98,6 @@ | ||||||
|     "accesses": "2.5.0", |     "accesses": "2.5.0", | ||||||
|     "animejs": "2.0.2", |     "animejs": "2.0.2", | ||||||
|     "autwh": "0.0.1", |     "autwh": "0.0.1", | ||||||
|     "bayes": "0.0.7", |  | ||||||
|     "bcryptjs": "2.4.3", |     "bcryptjs": "2.4.3", | ||||||
|     "body-parser": "1.17.2", |     "body-parser": "1.17.2", | ||||||
|     "cafy": "2.4.0", |     "cafy": "2.4.0", | ||||||
|  | @ -126,6 +126,7 @@ | ||||||
|     "monk": "6.0.3", |     "monk": "6.0.3", | ||||||
|     "morgan": "1.8.2", |     "morgan": "1.8.2", | ||||||
|     "ms": "2.0.0", |     "ms": "2.0.0", | ||||||
|  |     "msgpack-lite": "^0.1.26", | ||||||
|     "multer": "1.3.0", |     "multer": "1.3.0", | ||||||
|     "nprogress": "0.2.0", |     "nprogress": "0.2.0", | ||||||
|     "os-utils": "0.0.14", |     "os-utils": "0.0.14", | ||||||
|  |  | ||||||
|  | @ -68,6 +68,9 @@ type Source = { | ||||||
| 		hook_secret: string; | 		hook_secret: string; | ||||||
| 		username: string; | 		username: string; | ||||||
| 	}; | 	}; | ||||||
|  | 	categorizer?: { | ||||||
|  | 		mecab_command?: string; | ||||||
|  | 	}; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| /** | /** | ||||||
|  |  | ||||||
|  | @ -1,36 +1,42 @@ | ||||||
| import * as fs from 'fs'; | import * as fs from 'fs'; | ||||||
| const bayes = require('bayes'); | 
 | ||||||
|  | const bayes = require('./naive-bayes.js'); | ||||||
| const MeCab = require('mecab-async'); | const MeCab = require('mecab-async'); | ||||||
|  | import * as msgpack from 'msgpack-lite'; | ||||||
|  | 
 | ||||||
| import Post from '../../api/models/post'; | import Post from '../../api/models/post'; | ||||||
|  | import config from '../../conf'; | ||||||
| 
 | 
 | ||||||
|  | /** | ||||||
|  |  * 投稿を学習したり与えられた投稿のカテゴリを予測します | ||||||
|  |  */ | ||||||
| export default class Categorizer { | export default class Categorizer { | ||||||
| 	classifier: any; | 	private classifier: any; | ||||||
| 	categorizerDbFilePath: string; | 	private categorizerDbFilePath: string; | ||||||
| 	mecab: any; | 	private mecab: any; | ||||||
| 
 | 
 | ||||||
| 	constructor(categorizerDbFilePath: string, mecabCommand: string = 'mecab -d /usr/share/mecab/dic/mecab-ipadic-neologd') { | 	constructor() { | ||||||
| 		this.categorizerDbFilePath = categorizerDbFilePath; | 		this.categorizerDbFilePath = `${__dirname}/../../../data/category`; | ||||||
| 
 | 
 | ||||||
| 		this.mecab = new MeCab(); | 		this.mecab = new MeCab(); | ||||||
| 		this.mecab.command = mecabCommand; | 		if (config.categorizer.mecab_command) this.mecab.command = config.categorizer.mecab_command; | ||||||
| 
 | 
 | ||||||
| 		// BIND -----------------------------------
 | 		// BIND -----------------------------------
 | ||||||
| 		this.tokenizer = this.tokenizer.bind(this); | 		this.tokenizer = this.tokenizer.bind(this); | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	tokenizer(text: string) { | 	private tokenizer(text: string) { | ||||||
| 		return this.mecab.wakachiSync(text); | 		return this.mecab.wakachiSync(text); | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	async init() { | 	public async init() { | ||||||
| 		try { | 		try { | ||||||
| 			const db = fs.readFileSync(this.categorizerDbFilePath, { | 			const buffer = fs.readFileSync(this.categorizerDbFilePath); | ||||||
| 				encoding: 'utf8' | 			const db = msgpack.decode(buffer); | ||||||
| 			}); |  | ||||||
| 
 | 
 | ||||||
| 			this.classifier = bayes.fromJson(db); | 			this.classifier = bayes.import(db); | ||||||
| 			this.classifier.tokenizer = this.tokenizer; | 			this.classifier.tokenizer = this.tokenizer; | ||||||
| 		} catch(e) { | 		} catch (e) { | ||||||
| 			this.classifier = bayes({ | 			this.classifier = bayes({ | ||||||
| 				tokenizer: this.tokenizer | 				tokenizer: this.tokenizer | ||||||
| 			}); | 			}); | ||||||
|  | @ -49,7 +55,7 @@ export default class Categorizer { | ||||||
| 		} | 		} | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	async learn(id, category) { | 	public async learn(id, category) { | ||||||
| 		const post = await Post.findOne({ _id: id }); | 		const post = await Post.findOne({ _id: id }); | ||||||
| 
 | 
 | ||||||
| 		Post.update({ _id: id }, { | 		Post.update({ _id: id }, { | ||||||
|  | @ -64,7 +70,7 @@ export default class Categorizer { | ||||||
| 		this.save(); | 		this.save(); | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	async categorize(id) { | 	public async categorize(id) { | ||||||
| 		const post = await Post.findOne({ _id: id }); | 		const post = await Post.findOne({ _id: id }); | ||||||
| 
 | 
 | ||||||
| 		const category = this.classifier.categorize(post.text); | 		const category = this.classifier.categorize(post.text); | ||||||
|  | @ -76,14 +82,12 @@ export default class Categorizer { | ||||||
| 		}); | 		}); | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	async test(text) { | 	public async test(text) { | ||||||
| 		return this.classifier.categorize(text); | 		return this.classifier.categorize(text); | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	save() { | 	private save() { | ||||||
| 		fs.writeFileSync(this.categorizerDbFilePath, this.classifier.toJson(), { | 		const buffer = msgpack.encode(this.classifier.export()); | ||||||
| 			encoding: 'utf8' | 		fs.writeFileSync(this.categorizerDbFilePath, buffer); | ||||||
| 		}); |  | ||||||
| 	} | 	} | ||||||
| } | } | ||||||
| 
 |  | ||||||
|  |  | ||||||
							
								
								
									
										302
									
								
								src/tools/ai/naive-bayes.js
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										302
									
								
								src/tools/ai/naive-bayes.js
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,302 @@ | ||||||
|  | // Original source code: https://github.com/ttezel/bayes/blob/master/lib/naive_bayes.js (commit: 2c20d3066e4fc786400aaedcf3e42987e52abe3c)
 | ||||||
|  | // CUSTOMIZED BY SYUILO
 | ||||||
|  | 
 | ||||||
|  | /* | ||||||
|  | 		Expose our naive-bayes generator function | ||||||
|  | */ | ||||||
|  | module.exports = function (options) { | ||||||
|  | 	return new Naivebayes(options) | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | // keys we use to serialize a classifier's state
 | ||||||
|  | var STATE_KEYS = module.exports.STATE_KEYS = [ | ||||||
|  | 	'categories', 'docCount', 'totalDocuments', 'vocabulary', 'vocabularySize', | ||||||
|  | 	'wordCount', 'wordFrequencyCount', 'options' | ||||||
|  | ]; | ||||||
|  | 
 | ||||||
|  | /** | ||||||
|  |  * Initializes a NaiveBayes instance from a JSON state representation. | ||||||
|  |  * Use this with classifier.toJson(). | ||||||
|  |  * | ||||||
|  |  * @param  {String} jsonStr   state representation obtained by classifier.toJson() | ||||||
|  |  * @return {NaiveBayes}       Classifier | ||||||
|  |  */ | ||||||
|  | module.exports.fromJson = function (jsonStr) { | ||||||
|  | 	var parsed; | ||||||
|  | 	try { | ||||||
|  | 		parsed = JSON.parse(jsonStr) | ||||||
|  | 	} catch (e) { | ||||||
|  | 		throw new Error('Naivebayes.fromJson expects a valid JSON string.') | ||||||
|  | 	} | ||||||
|  | 	// init a new classifier
 | ||||||
|  | 	var classifier = new Naivebayes(parsed.options) | ||||||
|  | 
 | ||||||
|  | 	// override the classifier's state
 | ||||||
|  | 	STATE_KEYS.forEach(function (k) { | ||||||
|  | 		if (!parsed[k]) { | ||||||
|  | 			throw new Error('Naivebayes.fromJson: JSON string is missing an expected property: `'+k+'`.') | ||||||
|  | 		} | ||||||
|  | 		classifier[k] = parsed[k] | ||||||
|  | 	}) | ||||||
|  | 
 | ||||||
|  | 	return classifier | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /** | ||||||
|  |  * Given an input string, tokenize it into an array of word tokens. | ||||||
|  |  * This is the default tokenization function used if user does not provide one in `options`. | ||||||
|  |  * | ||||||
|  |  * @param  {String} text | ||||||
|  |  * @return {Array} | ||||||
|  |  */ | ||||||
|  | var defaultTokenizer = function (text) { | ||||||
|  | 	//remove punctuation from text - remove anything that isn't a word char or a space
 | ||||||
|  | 	var rgxPunctuation = /[^(a-zA-ZA-Яa-я0-9_)+\s]/g | ||||||
|  | 
 | ||||||
|  | 	var sanitized = text.replace(rgxPunctuation, ' ') | ||||||
|  | 
 | ||||||
|  | 	return sanitized.split(/\s+/) | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /** | ||||||
|  |  * Naive-Bayes Classifier | ||||||
|  |  * | ||||||
|  |  * This is a naive-bayes classifier that uses Laplace Smoothing. | ||||||
|  |  * | ||||||
|  |  * Takes an (optional) options object containing: | ||||||
|  |  *   - `tokenizer`  => custom tokenization function | ||||||
|  |  * | ||||||
|  |  */ | ||||||
|  | function Naivebayes (options) { | ||||||
|  | 	// set options object
 | ||||||
|  | 	this.options = {} | ||||||
|  | 	if (typeof options !== 'undefined') { | ||||||
|  | 		if (!options || typeof options !== 'object' || Array.isArray(options)) { | ||||||
|  | 			throw TypeError('NaiveBayes got invalid `options`: `' + options + '`. Pass in an object.') | ||||||
|  | 		} | ||||||
|  | 		this.options = options | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	this.tokenizer = this.options.tokenizer || defaultTokenizer | ||||||
|  | 
 | ||||||
|  | 	//initialize our vocabulary and its size
 | ||||||
|  | 	this.vocabulary = {} | ||||||
|  | 	this.vocabularySize = 0 | ||||||
|  | 
 | ||||||
|  | 	//number of documents we have learned from
 | ||||||
|  | 	this.totalDocuments = 0 | ||||||
|  | 
 | ||||||
|  | 	//document frequency table for each of our categories
 | ||||||
|  | 	//=> for each category, how often were documents mapped to it
 | ||||||
|  | 	this.docCount = {} | ||||||
|  | 
 | ||||||
|  | 	//for each category, how many words total were mapped to it
 | ||||||
|  | 	this.wordCount = {} | ||||||
|  | 
 | ||||||
|  | 	//word frequency table for each category
 | ||||||
|  | 	//=> for each category, how frequent was a given word mapped to it
 | ||||||
|  | 	this.wordFrequencyCount = {} | ||||||
|  | 
 | ||||||
|  | 	//hashmap of our category names
 | ||||||
|  | 	this.categories = {} | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /** | ||||||
|  |  * Initialize each of our data structure entries for this new category | ||||||
|  |  * | ||||||
|  |  * @param  {String} categoryName | ||||||
|  |  */ | ||||||
|  | Naivebayes.prototype.initializeCategory = function (categoryName) { | ||||||
|  | 	if (!this.categories[categoryName]) { | ||||||
|  | 		this.docCount[categoryName] = 0 | ||||||
|  | 		this.wordCount[categoryName] = 0 | ||||||
|  | 		this.wordFrequencyCount[categoryName] = {} | ||||||
|  | 		this.categories[categoryName] = true | ||||||
|  | 	} | ||||||
|  | 	return this | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /** | ||||||
|  |  * train our naive-bayes classifier by telling it what `category` | ||||||
|  |  * the `text` corresponds to. | ||||||
|  |  * | ||||||
|  |  * @param  {String} text | ||||||
|  |  * @param  {String} class | ||||||
|  |  */ | ||||||
|  | Naivebayes.prototype.learn = function (text, category) { | ||||||
|  | 	var self = this | ||||||
|  | 
 | ||||||
|  | 	//initialize category data structures if we've never seen this category
 | ||||||
|  | 	self.initializeCategory(category) | ||||||
|  | 
 | ||||||
|  | 	//update our count of how many documents mapped to this category
 | ||||||
|  | 	self.docCount[category]++ | ||||||
|  | 
 | ||||||
|  | 	//update the total number of documents we have learned from
 | ||||||
|  | 	self.totalDocuments++ | ||||||
|  | 
 | ||||||
|  | 	//normalize the text into a word array
 | ||||||
|  | 	var tokens = self.tokenizer(text) | ||||||
|  | 
 | ||||||
|  | 	//get a frequency count for each token in the text
 | ||||||
|  | 	var frequencyTable = self.frequencyTable(tokens) | ||||||
|  | 
 | ||||||
|  | 	/* | ||||||
|  | 			Update our vocabulary and our word frequency count for this category | ||||||
|  | 	*/ | ||||||
|  | 
 | ||||||
|  | 	Object | ||||||
|  | 	.keys(frequencyTable) | ||||||
|  | 	.forEach(function (token) { | ||||||
|  | 		//add this word to our vocabulary if not already existing
 | ||||||
|  | 		if (!self.vocabulary[token]) { | ||||||
|  | 			self.vocabulary[token] = true | ||||||
|  | 			self.vocabularySize++ | ||||||
|  | 		} | ||||||
|  | 
 | ||||||
|  | 		var frequencyInText = frequencyTable[token] | ||||||
|  | 
 | ||||||
|  | 		//update the frequency information for this word in this category
 | ||||||
|  | 		if (!self.wordFrequencyCount[category][token]) | ||||||
|  | 			self.wordFrequencyCount[category][token] = frequencyInText | ||||||
|  | 		else | ||||||
|  | 			self.wordFrequencyCount[category][token] += frequencyInText | ||||||
|  | 
 | ||||||
|  | 		//update the count of all words we have seen mapped to this category
 | ||||||
|  | 		self.wordCount[category] += frequencyInText | ||||||
|  | 	}) | ||||||
|  | 
 | ||||||
|  | 	return self | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /** | ||||||
|  |  * Determine what category `text` belongs to. | ||||||
|  |  * | ||||||
|  |  * @param  {String} text | ||||||
|  |  * @return {String} category | ||||||
|  |  */ | ||||||
|  | Naivebayes.prototype.categorize = function (text) { | ||||||
|  | 	var self = this | ||||||
|  | 		, maxProbability = -Infinity | ||||||
|  | 		, chosenCategory = null | ||||||
|  | 
 | ||||||
|  | 	var tokens = self.tokenizer(text) | ||||||
|  | 	var frequencyTable = self.frequencyTable(tokens) | ||||||
|  | 
 | ||||||
|  | 	//iterate thru our categories to find the one with max probability for this text
 | ||||||
|  | 	Object | ||||||
|  | 	.keys(self.categories) | ||||||
|  | 	.forEach(function (category) { | ||||||
|  | 
 | ||||||
|  | 		//start by calculating the overall probability of this category
 | ||||||
|  | 		//=>  out of all documents we've ever looked at, how many were
 | ||||||
|  | 		//    mapped to this category
 | ||||||
|  | 		var categoryProbability = self.docCount[category] / self.totalDocuments | ||||||
|  | 
 | ||||||
|  | 		//take the log to avoid underflow
 | ||||||
|  | 		var logProbability = Math.log(categoryProbability) | ||||||
|  | 
 | ||||||
|  | 		//now determine P( w | c ) for each word `w` in the text
 | ||||||
|  | 		Object | ||||||
|  | 		.keys(frequencyTable) | ||||||
|  | 		.forEach(function (token) { | ||||||
|  | 			var frequencyInText = frequencyTable[token] | ||||||
|  | 			var tokenProbability = self.tokenProbability(token, category) | ||||||
|  | 
 | ||||||
|  | 			// console.log('token: %s category: `%s` tokenProbability: %d', token, category, tokenProbability)
 | ||||||
|  | 
 | ||||||
|  | 			//determine the log of the P( w | c ) for this word
 | ||||||
|  | 			logProbability += frequencyInText * Math.log(tokenProbability) | ||||||
|  | 		}) | ||||||
|  | 
 | ||||||
|  | 		if (logProbability > maxProbability) { | ||||||
|  | 			maxProbability = logProbability | ||||||
|  | 			chosenCategory = category | ||||||
|  | 		} | ||||||
|  | 	}) | ||||||
|  | 
 | ||||||
|  | 	return chosenCategory | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /** | ||||||
|  |  * Calculate probability that a `token` belongs to a `category` | ||||||
|  |  * | ||||||
|  |  * @param  {String} token | ||||||
|  |  * @param  {String} category | ||||||
|  |  * @return {Number} probability | ||||||
|  |  */ | ||||||
|  | Naivebayes.prototype.tokenProbability = function (token, category) { | ||||||
|  | 	//how many times this word has occurred in documents mapped to this category
 | ||||||
|  | 	var wordFrequencyCount = this.wordFrequencyCount[category][token] || 0 | ||||||
|  | 
 | ||||||
|  | 	//what is the count of all words that have ever been mapped to this category
 | ||||||
|  | 	var wordCount = this.wordCount[category] | ||||||
|  | 
 | ||||||
|  | 	//use laplace Add-1 Smoothing equation
 | ||||||
|  | 	return ( wordFrequencyCount + 1 ) / ( wordCount + this.vocabularySize ) | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /** | ||||||
|  |  * Build a frequency hashmap where | ||||||
|  |  * - the keys are the entries in `tokens` | ||||||
|  |  * - the values are the frequency of each entry in `tokens` | ||||||
|  |  * | ||||||
|  |  * @param  {Array} tokens  Normalized word array | ||||||
|  |  * @return {Object} | ||||||
|  |  */ | ||||||
|  | Naivebayes.prototype.frequencyTable = function (tokens) { | ||||||
|  | 	var frequencyTable = Object.create(null) | ||||||
|  | 
 | ||||||
|  | 	tokens.forEach(function (token) { | ||||||
|  | 		if (!frequencyTable[token]) | ||||||
|  | 			frequencyTable[token] = 1 | ||||||
|  | 		else | ||||||
|  | 			frequencyTable[token]++ | ||||||
|  | 	}) | ||||||
|  | 
 | ||||||
|  | 	return frequencyTable | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /** | ||||||
|  |  * Dump the classifier's state as a JSON string. | ||||||
|  |  * @return {String} Representation of the classifier. | ||||||
|  |  */ | ||||||
|  | Naivebayes.prototype.toJson = function () { | ||||||
|  | 	var state = {} | ||||||
|  | 	var self = this | ||||||
|  | 	STATE_KEYS.forEach(function (k) { | ||||||
|  | 		state[k] = self[k] | ||||||
|  | 	}) | ||||||
|  | 
 | ||||||
|  | 	var jsonStr = JSON.stringify(state) | ||||||
|  | 
 | ||||||
|  | 	return jsonStr | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | // (original method)
 | ||||||
|  | Naivebayes.prototype.export = function () { | ||||||
|  | 	var state = {} | ||||||
|  | 	var self = this | ||||||
|  | 	STATE_KEYS.forEach(function (k) { | ||||||
|  | 		state[k] = self[k] | ||||||
|  | 	}) | ||||||
|  | 
 | ||||||
|  | 	return state | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | module.exports.import = function (data) { | ||||||
|  | 	var parsed = data | ||||||
|  | 
 | ||||||
|  | 	// init a new classifier
 | ||||||
|  | 	var classifier = new Naivebayes() | ||||||
|  | 
 | ||||||
|  | 	// override the classifier's state
 | ||||||
|  | 	STATE_KEYS.forEach(function (k) { | ||||||
|  | 		if (!parsed[k]) { | ||||||
|  | 			throw new Error('Naivebayes.import: data is missing an expected property: `'+k+'`.') | ||||||
|  | 		} | ||||||
|  | 		classifier[k] = parsed[k] | ||||||
|  | 	}) | ||||||
|  | 
 | ||||||
|  | 	return classifier | ||||||
|  | } | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue