"""
https://github.com/facebookresearch/fastText
python版本
https://github.com/salestock/fastText.py
這個是非官方的版本 現在已經不在使用了
官方提供了Python版本
https://github.com/facebookresearch/fastText/tree/master/python
現在用的都是官方的版本
"""
import jieba
import pandas as pd
import random
# Numeric label assigned to each news category.
cate_dic = {
    'technology': 1,
    'car': 2,
    'entertainment': 3,
    'military': 4,
    'sports': 5,
}

# Load one CSV per category and discard rows that contain missing values.
df_technology = pd.read_csv("./data/technology_news.csv", encoding='utf-8').dropna()
df_car = pd.read_csv("./data/car_news.csv", encoding='utf-8').dropna()
df_entertainment = pd.read_csv("./data/entertainment_news.csv", encoding='utf-8').dropna()
df_military = pd.read_csv("./data/military_news.csv", encoding='utf-8').dropna()
df_sports = pd.read_csv("./data/sports_news.csv", encoding='utf-8').dropna()

# Keep at most 20,000 entries of the "content" column per category.
# Technology and car take rows 1000..20999; the others take rows 0..19999.
technology = df_technology["content"].values.tolist()[1000:21000]
car = df_car["content"].values.tolist()[1000:21000]
entertainment = df_entertainment["content"].values.tolist()[:20000]
military = df_military["content"].values.tolist()[:20000]
sports = df_sports["content"].values.tolist()[:20000]

# Stopword list: one entry per line, read into a single "stopword" column and
# exposed as a plain array of values.
stopwords = pd.read_csv(
    "data/stopwords.txt",
    index_col=False,
    quoting=3,  # 3 == csv.QUOTE_NONE: quote characters are treated as ordinary text
    sep="\t",
    names=['stopword'],
    encoding='utf-8',
)['stopword'].values
"""
fasttext的無監督的詞向量訓練
https://github.com/facebookresearch/fastText/tree/master/python
"""
import fasttext
# Category name -> numeric label (1-based, in the order listed here).
cate_dic = {
    name: label
    for label, name in enumerate(
        ('technology', 'car', 'entertainment', 'military', 'sports'),
        start=1,
    )
}
def preprocess_text_unsupervised(content_lines, sentences, category):
    """Segment documents and append them to ``sentences`` as space-joined tokens.

    Each document is segmented with ``jieba.lcut``. Tokens of length 1 and
    tokens present in the module-level ``stopwords`` collection are dropped;
    the remaining tokens are joined with single spaces and appended to
    ``sentences``.

    Args:
        content_lines: Iterable of raw text documents.
        sentences: List that is extended in place, one string per document
            that was processed successfully.
        category: Not used by this function (the unsupervised corpus carries
            no label); kept so existing callers keep working.

    Returns:
        None. Results are delivered through ``sentences``.
    """
    # ``stopwords`` is an array, so ``in`` on it scans every entry for every
    # token. Build a set once per call to get constant-time membership tests.
    stopword_set = set(stopwords)
    for line in content_lines:
        try:
            tokens = [
                token
                for token in jieba.lcut(line)
                if len(token) > 1 and token not in stopword_set
            ]
        except Exception:
            # Best-effort processing: show the offending document and move on
            # rather than aborting the whole corpus build.
            print (line)
            continue
        sentences.append(" ".join(tokens))
# Build the unsupervised training corpus: every category contributes its
# segmented documents to one shared list of space-joined sentences.
sentences = []
preprocess_text_unsupervised(technology, sentences, cate_dic['technology'])
preprocess_text_unsupervised(car, sentences, cate_dic['car'])
preprocess_text_unsupervised(entertainment, sentences, cate_dic['entertainment'])
preprocess_text_unsupervised(military, sentences, cate_dic['military'])
preprocess_text_unsupervised(sports, sentences, cate_dic['sports'])

print ("writing data to fasttext unsupervised learning format...")
# The ``with`` block flushes and closes the file before anything else reads
# it; leaving it open can leave the tail of the corpus stuck in the write
# buffer. ``newline='\n'`` keeps the output byte-identical on every platform
# (one UTF-8 sentence per line, LF-terminated).
with open('unsupervised_train_data.txt', 'w', encoding='utf-8', newline='\n') as out:
    out.writelines(sentence + "\n" for sentence in sentences)
print("done!")
import fasttext
# Both models are trained on the same corpus file.
_unsupervised_corpus = 'unsupervised_train_data.txt'

# Skip-gram word vectors.
skmodel = fasttext.train_unsupervised(_unsupervised_corpus, model='skipgram')

# CBOW word vectors, as the alternative to skip-gram.
cbowmodel = fasttext.train_unsupervised(_unsupervised_corpus, model='cbow')
# Train word vectors with gensim.
# (This heading used to be a bare identifier and raised NameError when run.)
import gensim

# For comparison with fastText: gensim's word2vec implementation.
# Word2Vec expects every sentence to be a list of tokens. ``sentences`` holds
# space-joined strings, which would otherwise be iterated character by
# character, so split them back into tokens first.
tokenized_sentences = [sentence.split() for sentence in sentences]

# NOTE(review): ``size`` is the gensim < 4.0 keyword; gensim >= 4.0 names it
# ``vector_size`` — confirm against the installed gensim version.
model = gensim.models.Word2Vec(tokenized_sentences, size=100, window=5, min_count=5, workers=4)
model.save("gensim_word2vec.model")