This commit is contained in:
syuilo 2017-09-06 23:19:58 +09:00
parent cf7b1c0c5d
commit c6b0bf42a1
8 changed files with 205 additions and 93 deletions

View file

@ -1,93 +0,0 @@
import * as fs from 'fs';
const bayes = require('./naive-bayes.js');
const MeCab = require('mecab-async');
import * as msgpack from 'msgpack-lite';
import Post from '../../api/models/post';
import config from '../../conf';
/**
 * Naive-Bayes post categorizer.
 *
 * Learns from posts whose category has been verified and predicts the
 * category of other posts. Text is tokenized with MeCab (`wakachiSync`) and
 * the trained classifier is persisted to disk as a MessagePack blob.
 */
export default class Categorizer {
	private classifier: any;
	private categorizerDbFilePath: string;
	private mecab: any;

	constructor() {
		this.categorizerDbFilePath = `${__dirname}/../../../data/category`;

		this.mecab = new MeCab();
		if (config.categorizer.mecab_command) this.mecab.command = config.categorizer.mecab_command;

		// The tokenizer is handed to the classifier as a bare function, so it
		// has to stay bound to this instance in order to reach `this.mecab`.
		this.tokenizer = this.tokenizer.bind(this);
	}

	/**
	 * Splits text into tokens using MeCab's synchronous wakachi mode.
	 */
	private tokenizer(text: string) {
		return this.mecab.wakachiSync(text);
	}

	/**
	 * Prepares the classifier.
	 *
	 * First tries to restore a previously saved classifier from disk. If that
	 * fails for any reason, a fresh classifier is trained from every post
	 * flagged `is_category_verified` and then saved.
	 */
	public async init() {
		try {
			const buffer = fs.readFileSync(this.categorizerDbFilePath);
			const db = msgpack.decode(buffer);

			this.classifier = bayes.import(db);
			// The tokenizer function is assigned again after the import.
			this.classifier.tokenizer = this.tokenizer;
		} catch (e) {
			// Best-effort fallback: any failure while loading the saved state
			// (e.g. the file does not exist yet) triggers a rebuild from scratch.
			this.classifier = bayes({
				tokenizer: this.tokenizer
			});

			// Training data
			const verifiedPosts = await Post.find({
				is_category_verified: true
			});

			// Learn
			for (const post of verifiedPosts) {
				this.classifier.learn(post.text, post.category);
			}

			this.save();
		}
	}

	/**
	 * Marks a post as belonging to `category` (verified), feeds its text to
	 * the classifier and persists the updated classifier.
	 *
	 * @param id       `_id` of the post to learn from
	 * @param category category to associate with the post
	 * @throws Error when no post with the given id exists
	 */
	public async learn(id, category) {
		const post = await Post.findOne({ _id: id });

		if (post == null) {
			throw new Error(`post not found: ${id}`);
		}

		// Await the write so that failures reach the caller instead of being
		// dropped as an unhandled rejection.
		await Post.update({ _id: id }, {
			$set: {
				category: category,
				is_category_verified: true
			}
		});

		this.classifier.learn(post.text, category);

		this.save();
	}

	/**
	 * Predicts the category of a post and stores the prediction on the post.
	 *
	 * @param id `_id` of the post to categorize
	 * @throws Error when no post with the given id exists
	 */
	public async categorize(id) {
		const post = await Post.findOne({ _id: id });

		if (post == null) {
			throw new Error(`post not found: ${id}`);
		}

		const category = this.classifier.categorize(post.text);

		// Await the write so that the returned promise only resolves once the
		// prediction has been stored (or rejects if storing failed).
		await Post.update({ _id: id }, {
			$set: {
				category: category
			}
		});
	}

	/**
	 * Predicts the category of an arbitrary text without touching the database.
	 *
	 * @param text text to classify
	 * @returns the category chosen by the classifier
	 */
	public async test(text) {
		return this.classifier.categorize(text);
	}

	/**
	 * Serializes the classifier with MessagePack and writes it to disk
	 * synchronously.
	 */
	private save() {
		const buffer = msgpack.encode(this.classifier.export());
		fs.writeFileSync(this.categorizerDbFilePath, buffer);
	}
}

View file

@ -0,0 +1,57 @@
const bayes = require('./naive-bayes.js');
const MeCab = require('mecab-async');
import Post from '../../api/models/post';
import config from '../../conf';
// One-shot script: train a naive-Bayes classifier on the posts whose category
// has been verified, then predict and store a category for every other post
// that has text.

const mecab = new MeCab();
if (config.categorizer.mecab_command) mecab.command = config.categorizer.mecab_command;

// The tokenizer must be an explicit function closing over `mecab`. At module
// top level `this` has no `tokenizer` property, so passing `this.tokenizer`
// handed `undefined` to the classifier and MeCab was never actually used.
const classifier = bayes({
	tokenizer: (text: string) => mecab.wakachiSync(text)
});

async function categorizeUnverifiedPosts() {
	// Fetch training data
	const verifiedPosts = await Post.find({
		is_category_verified: true
	}, {
		fields: {
			_id: false,
			text: true,
			category: true
		}
	});

	// Learn
	for (const post of verifiedPosts) {
		classifier.learn(post.text, post.category);
	}

	// Fetch every post that has text and has not been verified by a human
	const posts = await Post.find({
		text: {
			$exists: true
		},
		is_category_verified: {
			$ne: true
		}
	}, {
		sort: {
			_id: -1
		},
		fields: {
			_id: true,
			text: true
		}
	});

	for (const post of posts) {
		console.log(`predicting... ${post._id}`);

		const category = classifier.categorize(post.text);

		// Await each write so a failing update is reported rather than
		// silently dropped; this also avoids issuing one concurrent update
		// per post all at once.
		await Post.update({ _id: post._id }, {
			$set: {
				category: category
			}
		});
	}
}

categorizeUnverifiedPosts().catch(err => {
	// Surface query/update failures instead of leaving an unhandled rejection.
	console.error(err);
});

View file

@ -0,0 +1,45 @@
import Post from '../../api/models/post';
import User from '../../api/models/user';
/**
 * Tallies, per category, a user's categorized posts.
 *
 * Looks at up to 1000 of the user's posts that have a `category` field,
 * ordered by descending `_id`, and counts how many fall into each category.
 * The tally is currently neither stored nor returned.
 *
 * @param id `_id` of the user whose posts are inspected
 */
export async function predictOne(id) {
	console.log(`predict interest of ${id} ...`);

	const selector = {
		user_id: id,
		category: {
			$exists: true
		}
	};

	const options = {
		sort: {
			_id: -1
		},
		limit: 1000,
		fields: {
			_id: false,
			category: true
		}
	};

	// TODO: include reposts etc. as well
	const recentPosts = await Post.find(selector, options);

	// category -> number of posts seen with that category
	const tally = {};

	for (const { category } of recentPosts) {
		if (!tally[category]) {
			tally[category] = 1;
			continue;
		}

		tally[category]++;
	}
}
/**
 * Runs `predictOne` for every user.
 *
 * The predictions are started concurrently; the returned promise resolves
 * once all of them have finished and rejects if any of them fails.
 */
export async function predictAll() {
	const allUsers = await User.find({}, {
		fields: {
			_id: true
		}
	});

	// `forEach` would discard the promises returned by `predictOne`, so this
	// function would resolve before any prediction completed and failures
	// would become unhandled rejections. Collect and await them instead.
	await Promise.all(allUsers.map(user => predictOne(user._id)));
}