安装依赖工具包:
pip install -U gensim
如果安装过程中与已经安装的工具包发生冲突,可以先用 pip uninstall 卸载冲突的包(或删除 Lib 目录下的相关包),也可以使用下面的命令忽略已安装的包:
pip install -U gensim --ignore-installed scipy
Word2Vec、Doc2Vec原理可以参考链接:
https://blog.csdn.net/mpk_no1/article/details/72458003
简单操作代码如下:
# -*- coding:utf-8 -*-
import os
import sys
import jieba
import logging
import pymongo
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from gensim.models.word2vec import Text8Corpus, LineSentence, Word2Vec
# Python 2 only: force the interpreter-wide default encoding to UTF-8 so that
# implicit str()/unicode conversions further down do not raise on Chinese text.
# reload(sys) is needed because site.py removes sys.setdefaultencoding at startup.
default_encoding = 'utf-8'
if default_encoding != sys.getdefaultencoding():
    reload(sys)
    sys.setdefaultencoding(default_encoding)

# Emit INFO-level log records as "<time>:<level>:<message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s:%(levelname)s:%(message)s')
# News articles crawled from the web are stored in this MongoDB instance.
client = pymongo.MongoClient(host='192.168.0.1', port=27017)
db = client['news']

# Stop words: ../static/dic/chinese_stop_words.txt relative to the current
# working directory, one entry per line (surrounding whitespace stripped).
chinese_stop_words_file = os.path.abspath(
    os.path.join(os.getcwd(), '..', 'static', 'dic', 'chinese_stop_words.txt'))
# Read inside a context manager so the file handle is closed deterministically.
# NOTE(review): kept as a list on purpose. Under Python 2 the entries are byte
# strings while the tokens compared against them may be unicode; list
# membership compares by equality, whereas a set would compare by hash and
# silently stop matching non-ASCII words.
with open(chinese_stop_words_file, 'r') as stop_words_fp:
    chinese_stop_words = [line.strip() for line in stop_words_fp]

# Running counter that sentence_segment() increments as it emits tokens.
total_cut_word_count = 0
# 句子分割
def sentence_segment(sentence):
    """Split a sentence into words with jieba and drop stop words.

    Args:
        sentence: text to segment.

    Returns:
        list of the tokens produced by ``jieba.cut`` that are not present in
        ``chinese_stop_words``, in their original order.

    Side effects:
        Adds the number of returned tokens to the module-level
        ``total_cut_word_count``.
    """
    global total_cut_word_count
    # NOTE(review): the original indentation was lost; the counter is assumed
    # to count only the tokens that survive stop-word filtering -- confirm.
    kept_words = [word for word in jieba.cut(sentence) if word not in chinese_stop_words]
    total_cut_word_count += len(kept_words)
    return kept_words
# 准备语料库
def prepare_word_corpus():
    """Write the segmented news titles to ``word_corpus.txt``.

    Reads every document of ``netease_ent_news_detail`` whose ``create_time``
    is not null, ordered by ``create_time`` ascending, and prints the cursor's
    document count. For each document that has both a title and a content
    value, the title is segmented with ``sentence_segment`` and written as one
    line of space-separated words. Only the title is written; the content is
    merely required to be present.
    """
    datas = db['netease_ent_news_detail'].find({"create_time": {"$ne": None}}).sort('create_time', pymongo.ASCENDING)
    print(datas.count())
    # Collect the lines and join once: growing one big string with += inside
    # the loop is quadratic in the size of the corpus.
    lines = []
    for data in datas:
        if data['title'] is not None and data['content'] is not None:
            sentence = str(data['title']).strip()
            lines.append(' '.join(sentence_segment(sentence)) + '\n')
    with open('word_corpus.txt', 'wb') as f:
        f.write(''.join(lines))
# 准备语料库
def prepare_doc_corpus():
    """Yield one ``TaggedDocument`` per news article for Doc2Vec training.

    Reads every document of ``netease_ent_news_detail`` whose ``create_time``
    is not null, ordered by ``create_time`` ascending, and prints the cursor's
    document count when iteration starts.

    Yields:
        TaggedDocument whose words are the segmented title and whose only tag
        is the MongoDB ``_id`` of the article. Articles with a null title or
        null content are skipped; the content itself is not used.

    Note:
        This is a generator and can be iterated only once. Callers that need
        several passes over the corpus must materialise it first.
    """
    datas = db['netease_ent_news_detail'].find({"create_time": {"$ne": None}}).sort('create_time', pymongo.ASCENDING)
    print(datas.count())
    for data in datas:
        if data['title'] is not None and data['content'] is not None:
            title = str(data['title']).strip()
            yield TaggedDocument(sentence_segment(title), [data['_id']])
# 训练模型
def train_word_model():
    """Rebuild the word corpus file and train a Word2Vec model on it.

    Calls ``prepare_word_corpus()`` to write ``word_corpus.txt``, trains a
    200-dimensional Word2Vec model over that file and saves it to the relative
    path ``word2vec.model``.
    """
    prepare_word_corpus()
    # Word2Vec parameter notes (gensim 3.x parameter names):
    # sentences: the corpus to train on; either a list or an iterable streamed
    #     from a file. For large corpora use BrownCorpus, Text8Corpus or
    #     LineSentence.
    # size: dimensionality of the word vectors, default 100. Usually chosen
    #     according to the size of the corpus.
    # alpha: initial learning rate; it decays linearly to min_alpha during
    #     training.
    # window: maximum context distance used by the skip-gram / CBOW sliding
    #     window, default 5. Tune as needed; [5, 10] is a common range for
    #     ordinary corpora.
    # min_count: vocabulary cut-off; words occurring fewer than min_count
    #     times are dropped, default 5.
    # max_vocab_size: RAM limit while building the vocabulary; None means no
    #     limit.
    # sample: threshold for randomly down-sampling high-frequency words,
    #     default 1e-3, useful range (0, 1e-5).
    # seed: seed of the random number generator used to initialise vectors.
    # workers: number of parallel training threads.
    # min_alpha: the learning rate shrinks during training; this is its
    #     minimum value. The per-round step size follows from iter, alpha and
    #     min_alpha, so tune the three together on large corpora.
    # sg: training algorithm; 0 (default) is CBOW, 1 is skip-gram.
    # hs: 1 uses hierarchical softmax; 0 (default) uses negative sampling when
    #     negative is greater than 0.
    # negative: if > 0, negative sampling is used and this is the number of
    #     noise words drawn (usually 5-20).
    # cbow_mean: CBOW only; 0 sums the context word vectors, 1 (default)
    #     averages them. Changing the default is not recommended.
    # hashfxn: hash function used to initialise the weights; defaults to
    #     Python's built-in hash.
    # iter: number of passes over the corpus, default 5. Increase it for
    #     large corpora.
    # trim_rule: vocabulary trimming rule deciding which words are kept or
    #     discarded; None means min_count is used.
    # sorted_vocab: if 1 (default), words are sorted by descending frequency
    #     before word indexes are assigned.
    # batch_words: number of words per batch handed to the worker threads,
    #     default 10000.
    #
    # word_corpus.txt holds one segmented title per line, so it is read with
    # LineSentence (one line = one sentence). Text8Corpus ignores line breaks
    # and cuts the word stream into fixed-length pseudo-sentences, which would
    # let context windows span unrelated titles.
    word2vec = Word2Vec(LineSentence('word_corpus.txt'), size=200, window=5, min_count=2, workers=4, iter=10)
    word2vec.save('word2vec.model')
    # The plain word2vec format (save_word2vec_format('word2vec.model',
    # binary=False)) is deliberately not used: a model saved that way cannot
    # be trained further.
# 训练模型
def train_doc_model():
    """Train a Doc2Vec model on the tagged news titles and save it.

    Builds the corpus from ``prepare_doc_corpus()``, trains a 300-dimensional
    Doc2Vec model for 20 epochs and saves it to the relative path
    ``doc2vec.model``.
    """
    # prepare_doc_corpus() is a one-shot generator. build_vocab() would
    # exhaust it and train() would then iterate over nothing, so materialise
    # it into a list that can be re-iterated for the vocabulary scan and for
    # every training epoch. Only segmented titles are held in memory.
    corpus = list(prepare_doc_corpus())
    # Doc2Vec parameter notes:
    # vector_size: int, optional. Dimensionality of the feature vectors.
    # window: int, optional. The maximum distance between the current and
    #     predicted word within a sentence.
    # min_count: int, optional. Ignores all words with total frequency lower
    #     than this.
    # max_vocab_size: int, optional. Limits the RAM during vocabulary
    #     building; if there are more unique words than this, then prune the
    #     infrequent ones. Every 10 million word types need about 1GB of RAM.
    #     Set to `None` for no limit.
    # sample: float, optional. The threshold for configuring which
    #     higher-frequency words are randomly downsampled, useful range is
    #     (0, 1e-5).
    # negative: int, optional. If > 0, negative sampling will be used, the int
    #     for negative specifies how many "noise words" should be drawn
    #     (usually between 5-20). If set to 0, no negative sampling is used.
    # workers: int, optional. Use these many worker threads to train the model
    #     (=faster training with multicore machines).
    # epochs: int, optional. Number of iterations (epochs) over the corpus.
    doc2vec = Doc2Vec(vector_size=300, min_count=2, window=10, workers=4, epochs=20)
    doc2vec.build_vocab(corpus)
    doc2vec.train(corpus, total_examples=doc2vec.corpus_count, epochs=doc2vec.epochs)
    doc2vec.save('doc2vec.model')
# 测试模型
def test_word_model_1(sentence_1, sentence_2):
    """Print the similarity score of two words.

    Loads the model saved at ``word2vec.model`` and prints the value returned
    by ``wv.similarity`` for the two arguments.

    Args:
        sentence_1: first word to compare.
        sentence_2: second word to compare.
    """
    model = Word2Vec.load('word2vec.model')
    score = model.wv.similarity(sentence_1, sentence_2)
    print(score)
# 测试模型
def test_word_model_2(sentence, topn=10):
    """Print the entries returned by ``wv.most_similar`` for a word.

    Loads the model saved at ``word2vec.model`` and prints one
    "<word> <score>" line per result.

    Args:
        sentence: the query passed to ``most_similar``.
        topn: number of results to request (default 10).
    """
    model = Word2Vec.load('word2vec.model')
    for word, score in model.wv.most_similar(sentence, topn=topn):
        print('{} {}'.format(word, score))
# 测试模型
def test_word_model_3(sentence, topn=10):
    """Print the entries returned by ``wv.similar_by_word`` for a word.

    Loads the model saved at ``word2vec.model`` and prints one
    "<word> <score>" line per result.

    Args:
        sentence: the word passed to ``similar_by_word``.
        topn: number of results to request (default 10).
    """
    model = Word2Vec.load('word2vec.model')
    neighbours = model.wv.similar_by_word(sentence, topn=topn)
    for word, score in neighbours:
        print('{} {}'.format(word, score))
# 测试模型
def test_word_model_4(words):
    """Print the result of ``wv.doesnt_match`` for a group of words.

    Loads the model saved at ``word2vec.model``.

    Args:
        words: the collection of words passed to ``doesnt_match``.
    """
    model = Word2Vec.load('word2vec.model')
    odd_one_out = model.wv.doesnt_match(words)
    print(odd_one_out)
# 测试模型
def test_doc_model(sentence):
    """Print the ten stored articles most similar to a sentence.

    Loads the model saved at ``doc2vec.model``, segments ``sentence`` with
    ``sentence_segment``, infers a vector for it and queries
    ``docvecs.most_similar``. Each hit is looked up in
    ``netease_ent_news_detail`` by ``_id`` and printed as
    "<score> <title> <create_time>".

    Args:
        sentence: the raw text to compare against the trained documents.
    """
    model = Doc2Vec.load('doc2vec.model')
    tokens = sentence_segment(sentence)
    inferred_vector = model.infer_vector(tokens)
    collection = db['netease_ent_news_detail']
    # Each hit is a (tag, score) pair; the tag is the article's MongoDB _id.
    for doc_id, score in model.docvecs.most_similar([inferred_vector], topn=10):
        data = collection.find_one({'_id': doc_id})
        print('{} {} {}'.format(score, str(data['title']).strip(), data['create_time']))