# fasttext和gensim訓練詞向量

"""
https://github.com/facebookresearch/fastText
python版本
https://github.com/salestock/fastText.py
這個是非官方的版本,現在已經不再使用了
官方提供了Python版本
https://github.com/facebookresearch/fastText/tree/master/python
現在用的都是官方的版本
"""
import jieba
import pandas as pd
import random
 
# Integer label assigned to each news category.
cate_dic = {'technology':1, 'car':2, 'entertainment':3, 'military':4, 'sports':5}


def _read_news(csv_path):
    """Read one news CSV as UTF-8 and drop every row that has a missing value."""
    return pd.read_csv(csv_path, encoding='utf-8').dropna()


# One cleaned DataFrame per category.
df_technology = _read_news("./data/technology_news.csv")
df_car = _read_news("./data/car_news.csv")
df_entertainment = _read_news("./data/entertainment_news.csv")
df_military = _read_news("./data/military_news.csv")
df_sports = _read_news("./data/sports_news.csv")

# Take at most 20000 documents from the `content` column of each category.
# Technology and car skip their first 1000 rows; the others start at row 0.
technology = df_technology.content.values.tolist()[1000:21000]
car = df_car.content.values.tolist()[1000:21000]
entertainment = df_entertainment.content.values.tolist()[:20000]
military = df_military.content.values.tolist()[:20000]
sports = df_sports.content.values.tolist()[:20000]

# Stopword list: one entry per line, read as a single `stopword` column with
# quote handling disabled (quoting=3), then reduced to a plain array of values.
stopwords = pd.read_csv(
    "data/stopwords.txt",
    index_col=False,
    quoting=3,
    sep="\t",
    names=['stopword'],
    encoding='utf-8',
)['stopword'].values
"""
fasttext的無監督的詞向量訓練
https://github.com/facebookresearch/fastText/tree/master/python
"""
import fasttext

# Category name -> integer label. Repeats the identical mapping assigned
# earlier in this file.
cate_dic = {'technology':1, 'car':2, 'entertainment':3, 'military':4, 'sports':5}

def preprocess_text_unsupervised(content_lines, sentences, category):
    """Segment documents with jieba and append the cleaned text to ``sentences``.

    Each line is cut into tokens; tokens of length 1 and tokens found in the
    module-level ``stopwords`` collection are dropped, and the remaining
    tokens are joined with single spaces.

    Args:
        content_lines: Iterable of raw document strings.
        sentences: List that is extended in place with one space-joined
            token string per successfully segmented line.
        category: Unused by this function; kept so existing callers that
            pass a label keep working.

    Returns:
        None. Results are delivered through ``sentences``.
    """
    # Membership tests against the stopword array are a linear scan per
    # token; build a hash set once per call instead.
    stopword_set = set(stopwords)
    for line in content_lines:
        try:
            tokens = jieba.lcut(line)
        except Exception:
            # Best-effort: show the line that could not be segmented and
            # move on to the next document.
            print(line)
            continue
        sentences.append(
            " ".join(
                tok for tok in tokens
                if len(tok) > 1 and tok not in stopword_set
            )
        )
# Build the unsupervised training data: one preprocessed document per entry.
sentences = []

preprocess_text_unsupervised(technology, sentences, cate_dic['technology'])
preprocess_text_unsupervised(car, sentences, cate_dic['car'])
preprocess_text_unsupervised(entertainment, sentences, cate_dic['entertainment'])
preprocess_text_unsupervised(military, sentences, cate_dic['military'])
preprocess_text_unsupervised(sports, sentences, cate_dic['sports'])

print ("writing data to fasttext unsupervised learning format...")
# The context manager flushes and closes the file before it is read again
# for training; the bare open() previously left buffered data unwritten.
with open('unsupervised_train_data.txt', 'wb') as out:
    # One UTF-8 encoded document per line.
    out.writelines(sentence.encode('utf8') + b"\n" for sentence in sentences)
print("done!")


import fasttext

# Train a skip-gram word-vector model on the corpus file written above.
skmodel = fasttext.train_unsupervised(
    input='unsupervised_train_data.txt',
    model='skipgram',
)

# Train a CBOW model on the same corpus for comparison.
cbowmodel = fasttext.train_unsupervised(
    input='unsupervised_train_data.txt',
    model='cbow',
)

 

 

# gensim訓練詞向量

import gensim
"""
對比gensim的word2vec
"""


# Word2Vec expects an iterable of token lists. `sentences` holds space-joined
# strings, so passing it directly would make every single character a "word".
# Split each document back into its tokens first.
tokenized_sentences = [sentence.split() for sentence in sentences]

# NOTE(review): gensim >= 4.0 renamed `size` to `vector_size`; confirm the
# installed gensim version before changing this keyword.
model = gensim.models.Word2Vec(tokenized_sentences, size=100, window=5, min_count=5, workers=4)
model.save("gensim_word2vec.model")

 

# 發表評論
# 所有評論
# 還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
# 相關文章