Python学习笔记-gensim初识Word2Vec、Doc2Vec

安装依赖工具包：

pip install -U gensim

安装过程中如果与已经安装的工具包发生冲突，可以先 uninstall 冲突的包、删除 Lib 目录下的相关包，或者使用：

pip install -U gensim --ignore-installed scipy

Word2Vec、Doc2Vec原理可以参考链接：

https://blog.csdn.net/mpk_no1/article/details/72458003

简单操作代码如下（代码基于 Python 2 与 gensim 3.x 的接口编写）：

# -*- coding:utf-8 -*-

import os
import sys
import jieba
import logging
import pymongo
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from gensim.models.word2vec import Text8Corpus, LineSentence, Word2Vec


# Python 2 only: switch the interpreter-wide default encoding to UTF-8
# (presumably so implicit str/unicode conversions of Chinese text succeed).
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)


logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)


# News articles crawled from the web are stored in this MongoDB database.
client = pymongo.MongoClient(host='192.168.0.1', port=27017)
db = client['news']

# Stop words: one entry per line, located at ../static/dic relative to the
# current working directory.
chinese_stop_words_file = os.path.abspath(
    os.path.join(os.getcwd(), '..', 'static', 'dic', 'chinese_stop_words.txt'))
# Read through a context manager so the file handle is always closed.
# NOTE(review): deliberately kept as a list. Under Python 2 these entries are
# byte strings while the tokenizer presumably yields unicode; a set lookup
# would not match non-ASCII text across the two types. Confirm before
# converting to a set.
with open(chinese_stop_words_file, 'r') as stop_words_fh:
    chinese_stop_words = [line.strip() for line in stop_words_fh]

# Running total of the tokens kept by sentence_segment().
total_cut_word_count = 0


# Sentence segmentation
def sentence_segment(sentence):
    """Tokenize ``sentence`` with jieba and drop stop words.

    Tokens produced by ``jieba.cut`` that are not listed in the module-level
    ``chinese_stop_words`` are kept in their original order. The module-level
    ``total_cut_word_count`` is increased by the number of tokens kept.

    :param sentence: text to segment.
    :return: list of the kept tokens.
    """
    global total_cut_word_count
    kept_words = [
        token for token in jieba.cut(sentence)
        if token not in chinese_stop_words
    ]
    total_cut_word_count += len(kept_words)
    return kept_words


# Prepare the Word2Vec corpus
def prepare_word_corpus():
    """Write the segmented news titles to ``word_corpus.txt``.

    Reads every document of ``netease_ent_news_detail`` whose ``create_time``
    is not null (oldest first), prints the cursor's document count, and for
    each document with a non-null ``title`` and ``content`` writes one line:
    the title's tokens (see ``sentence_segment``) joined by single spaces.
    """
    datas = db['netease_ent_news_detail'].find({"create_time": {"$ne": None}}).sort('create_time', pymongo.ASCENDING)
    print(datas.count())
    # Collect the lines and join once; repeated ``+=`` on a growing string
    # copies the whole corpus on every iteration.
    lines = []
    for data in datas:
        if data['title'] is None or data['content'] is None:
            continue
        sentence = str(data['title']).strip()
        lines.append(' '.join(sentence_segment(sentence)) + '\n')
    with open('word_corpus.txt', 'wb') as f:
        f.write(''.join(lines))


# Prepare the Doc2Vec corpus
def prepare_doc_corpus():
    """Yield one ``TaggedDocument`` per usable news document.

    Reads every document of ``netease_ent_news_detail`` whose ``create_time``
    is not null (oldest first) and prints the cursor's document count when
    iteration starts. Documents with a null ``title`` or ``content`` are
    skipped. Each yielded item holds the segmented title as its words and the
    document's ``_id`` as its only tag.

    This is a generator: it can be iterated only once per call.
    """
    datas = db['netease_ent_news_detail'].find({"create_time": {"$ne": None}}).sort('create_time', pymongo.ASCENDING)
    print(datas.count())
    for data in datas:
        if data['title'] is None or data['content'] is None:
            continue
        # Only the title is used for training; the content is merely required
        # to be present.
        title = str(data['title']).strip()
        yield TaggedDocument(sentence_segment(title), [data['_id']])


# Train the Word2Vec model
def train_word_model():
    """Build ``word_corpus.txt``, train Word2Vec on it and save the model.

    The model is saved to ``word2vec.model`` in gensim's native format.
    """
    prepare_word_corpus()
    # Word2Vec parameters (summary of the gensim documentation):
    # sentences: the corpus; a list or an iterable streamed from disk. For
    #   large corpora use BrownCorpus, Text8Corpus or LineSentence.
    # size: dimensionality of the word vectors, default 100; usually chosen
    #   according to the size of the corpus.
    # alpha: initial learning rate; decays linearly to min_alpha in training.
    # window: maximum context distance used by the skip-gram / CBOW sliding
    #   window, default 5. Tune as needed; [5, 10] is recommended for
    #   ordinary corpora.
    # min_count: vocabulary cut-off; words occurring fewer than min_count
    #   times are dropped, default 5.
    # max_vocab_size: RAM limit while building the vocabulary; None = no limit.
    # sample: threshold for random down-sampling of high-frequency words,
    #   default 1e-3, useful range (0, 1e-5).
    # seed: seed of the random number generator; affects vector initialisation.
    # workers: number of parallel training threads.
    # min_alpha: lower bound the learning rate decays to. The per-iteration
    #   step follows from iter, alpha and min_alpha together; for large
    #   corpora tune the three jointly.
    # sg: training algorithm; 0 = CBOW (default), 1 = skip-gram.
    # hs: 1 = hierarchical softmax; 0 (default) with negative > 0 = negative
    #   sampling.
    # negative: if > 0, negative sampling is used with this many noise words
    #   (usually 5-20).
    # cbow_mean: CBOW only; 0 = sum of the context vectors, 1 = their mean
    #   (default, changing it is not recommended).
    # hashfxn: hash function used to initialise weights; defaults to Python's
    #   built-in hash.
    # iter: number of training iterations over the corpus, default 5; may be
    #   increased for large corpora.
    # trim_rule: vocabulary trimming rule deciding which words are kept or
    #   discarded; None means min_count is used.
    # sorted_vocab: if 1 (default), sort words by descending frequency before
    #   assigning word indexes.
    # batch_words: number of words per batch handed to worker threads,
    #   default 10000.
    #
    # The corpus file holds one segmented title per line, so it is read with
    # LineSentence (one sentence per line). Text8Corpus ignores line breaks
    # and re-chunks the word stream, which would let context windows span
    # unrelated titles.
    word2vec = Word2Vec(LineSentence('word_corpus.txt'), size=200, window=5, min_count=2, workers=4, iter=10)
    word2vec.save('word2vec.model')
    # Original author's note: saving with save_word2vec_format instead, e.g.
    # save_word2vec_format('word2vec.model', binary=False), yields a model
    # that cannot be trained further.


# Train the Doc2Vec model
def train_doc_model():
    """Train a Doc2Vec model on the tagged news titles and save it.

    The model is saved to ``doc2vec.model``.
    """
    # prepare_doc_corpus() is a one-shot generator, but the corpus is iterated
    # by build_vocab() and then again by train(). Materialise it so the
    # training passes do not receive an already exhausted iterator.
    corpus = list(prepare_doc_corpus())
    # vector_size: int, optional Dimensionality of the feature vectors.
    # window: int, optional The maximum distance between the current and predicted word within a sentence.
    # min_count: int, optional Ignores all words with total frequency lower than this.
    # max_vocab_size: int, optional Limits the RAM during vocabulary building; if there are more unique
    # words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM.
    # Set to `None` for no limit.
    # sample: float, optional The threshold for configuring which higher-frequency words are randomly downsampled,
    # useful range is (0, 1e-5).
    # negative: int, optional
    # If > 0, negative sampling will be used, the int for negative specifies how many "noise words"
    # should be drawn (usually between 5-20). If set to 0, no negative sampling is used.
    # workers: int, optional Use these many worker threads to train the model(=faster training with multicore machines).
    # epochs: int, optional Number of iterations (epochs) over the corpus.
    doc2vec = Doc2Vec(vector_size=300, min_count=2, window=10, workers=4, epochs=20)
    doc2vec.build_vocab(corpus)
    doc2vec.train(corpus, total_examples=doc2vec.corpus_count, epochs=doc2vec.epochs)
    doc2vec.save('doc2vec.model')


# Try the model: similarity of two words
def test_word_model_1(sentence_1, sentence_2):
    """Load ``word2vec.model`` and print the similarity of the two arguments.

    :param sentence_1: first key passed to ``wv.similarity``.
    :param sentence_2: second key passed to ``wv.similarity``.
    """
    model = Word2Vec.load('word2vec.model')
    score = model.wv.similarity(sentence_1, sentence_2)
    print(score)


# Try the model: most similar words
def test_word_model_2(sentence, topn=10):
    """Load ``word2vec.model`` and print the ``wv.most_similar`` results.

    Each result is printed on its own line as its first element followed by
    its second element, separated by a space.

    :param sentence: query passed to ``wv.most_similar``.
    :param topn: number of results to request.
    """
    model = Word2Vec.load('word2vec.model')
    for word, score in model.wv.most_similar(sentence, topn=topn):
        print('{} {}'.format(word, score))


# Try the model: similar words by word
def test_word_model_3(sentence, topn=10):
    """Load ``word2vec.model`` and print the ``wv.similar_by_word`` results.

    Each result is printed on its own line as its first element followed by
    its second element, separated by a space.

    :param sentence: query passed to ``wv.similar_by_word``.
    :param topn: number of results to request.
    """
    model = Word2Vec.load('word2vec.model')
    for word, score in model.wv.similar_by_word(sentence, topn=topn):
        print('{} {}'.format(word, score))


# Try the model: odd one out
def test_word_model_4(words):
    """Load ``word2vec.model`` and print the result of ``wv.doesnt_match``.

    :param words: collection of words passed to ``wv.doesnt_match``.
    """
    model = Word2Vec.load('word2vec.model')
    odd_one_out = model.wv.doesnt_match(words)
    print(odd_one_out)


# Try the model: most similar documents
def test_doc_model(sentence, topn=10):
    """Print the stored news documents most similar to ``sentence``.

    Loads ``doc2vec.model``, infers a vector for the segmented sentence and
    looks up the nearest document vectors. For each hit, the matching
    document is fetched from ``netease_ent_news_detail`` by ``_id`` and one
    line is printed: similarity, stripped title and ``create_time``.

    :param sentence: text to compare against the trained documents.
    :param topn: number of similar documents to request (default 10).
    """
    doc2vec = Doc2Vec.load('doc2vec.model')
    inferred = doc2vec.infer_vector(sentence_segment(sentence))
    for doc_id, similarity in doc2vec.docvecs.most_similar([inferred], topn=topn):
        data = db['netease_ent_news_detail'].find_one({'_id': doc_id})
        if data is None:
            # The document is no longer in the database (find_one returned
            # nothing); skip it instead of failing on data['title'].
            continue
        print('{} {} {}'.format(similarity, str(data['title']).strip(), data['create_time']))

Python学习笔记-gensim初识Word2Vec、Doc2Vec

开源高性能结构化日志模块NanoLog

杭州的 IT 崩盘了么？

【简写Mybatis-02】注册机的实现以及SqlSession处理

手绘二维码

.NET借助虚拟网卡实现一个简单异地组网工具

Python學習筆記-WXPY語音信息識別

推薦系統筆記-01-文章畫像

Python學習筆記-WXPY初識

Activiti學習筆記-整合SpringBoot與簡單使用

ElasticSearch學習筆記-ngram、中文拼音、簡繁體搜索記錄

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結