分詞–list of list格式
(df2的來源請看上一篇博客:https://blog.csdn.net/dongzixian/article/details/103474094 )
# Segment every review with jieba; each cell of df2['cut'] then holds the
# token list of one review.
import jieba
df2['cut'] = df2[0].map(jieba.lcut)
df2.head()
# Hold out 30% of the tokenised reviews (and their labels) as a test set.
from sklearn.model_selection import train_test_split
split_parts = train_test_split(df2['cut'], df2['class'], test_size=0.3)
x_train, x_test, y_train, y_test = split_parts
x_train
from gensim.models.word2vec import Word2Vec
# Create an untrained model: 400-dimensional vectors, dropping words that
# occur fewer than 5 times.
# NOTE(review): `size` is the gensim < 4.0 keyword; gensim >= 4.0 renamed it
# to `vector_size` -- confirm against the installed version.
model = Word2Vec(size=400,min_count=5)
# Scan the training corpus once to collect the vocabulary.
model.build_vocab(x_train)
# Fit the embeddings. The original repeated build_vocab here and never
# called train, which leaves the vectors at their initial values, so the
# similarity queries below and every sentence vector built from this model
# would carry no learned signal.
model.train(x_train,total_examples=model.corpus_count,epochs=model.epochs)
# Inspect the 100 nearest neighbours of two sentiment words and one raw vector.
model.wv.most_similar("好",topn=100)
model.wv.most_similar("不好",topn=100)
model.wv["不好"]
生成句向量
# Show the token list of the first review and how many tokens it contains.
first_tokens = df2['cut'][0]
print(first_tokens)
print("第一條評論分詞後詞個數爲:",len(first_tokens))
將第一條評論轉成word2vec矩陣
# One row per token of the first review that the word2vec model knows;
# tokens missing from the vocabulary are skipped.
pd.DataFrame([model.wv[token] for token in df2['cut'][0] if token in model.wv])
將第一條評論轉成向量
# Average the word vectors of the first review column-wise to obtain a
# single vector for the whole review.
first_matrix = pd.DataFrame([model.wv[token] for token in df2['cut'][0] if token in model.wv])
vec1 = first_matrix.mean()
print(vec1)
生成訓練集word2vec句向量
def sen2vec(words):
    """Return the sentence vector for one tokenised review.

    The vector is the element-wise mean of the word2vec vectors of those
    tokens in ``words`` that the module-level ``model`` knows; tokens
    missing from the vocabulary are skipped.

    Args:
        words: Iterable of tokens (one segmented review).

    Returns:
        A pandas Series of length ``model.wv.vector_size``. When no token
        is in the vocabulary an all-zero Series is returned, because the
        mean of an empty frame is an empty Series, which becomes an all-NaN
        row once the per-review results are stacked into a DataFrame.
    """
    # The original averaged df2.cut[0] and ignored `words`, so every review
    # received the vector of the first review.
    known = [model.wv[w] for w in words if w in model.wv]
    if not known:
        return pd.Series([0.0] * model.wv.vector_size)
    return pd.DataFrame(known).mean()
# Stack the sentence vectors of all training reviews, one row per review.
train_rows = list(map(sen2vec, x_train))
train_vec = pd.DataFrame(train_rows)
train_vec
分類模型–支持向量機
from sklearn.svm import SVC
# Fit a support vector classifier with default hyper-parameters on the
# training sentence vectors (fit returns the estimator itself).
clf2 = SVC().fit(train_vec, y_train)
# Mean accuracy on the data the classifier was trained on.
clf2.score(train_vec, y_train)
SVM在訓練集上的分類報告
from sklearn.metrics import classification_report
# Classification report of the SVM on the training set.
train_pred = clf2.predict(train_vec)
print(classification_report(y_train, train_pred))
生成測試集word2vec句向量
def sen2vec(words):
    """Return the sentence vector for one tokenised review.

    The vector is the element-wise mean of the word2vec vectors of those
    tokens in ``words`` that the module-level ``model`` knows; tokens
    missing from the vocabulary are skipped.

    Args:
        words: Iterable of tokens (one segmented review).

    Returns:
        A pandas Series of length ``model.wv.vector_size``. When no token
        is in the vocabulary an all-zero Series is returned, because the
        mean of an empty frame is an empty Series, which becomes an all-NaN
        row once the per-review results are stacked into a DataFrame.
    """
    known = [model.wv[w] for w in words if w in model.wv]
    if not known:
        return pd.Series([0.0] * model.wv.vector_size)
    return pd.DataFrame(known).mean()
# Sentence vectors of the held-out reviews, one row per review.
test_rows = list(map(sen2vec, x_test))
test_vec = pd.DataFrame(test_rows)
test_vec
# Mean accuracy of the SVM on the held-out set.
clf2.score(test_vec, y_test)
from sklearn.metrics import classification_report
# Classification report of the SVM on the held-out set.
test_pred = clf2.predict(test_vec)
print(classification_report(y_test, test_pred))
預測新評論
# Classify one new review with the trained word2vec model and the SVM.
s = """...新評論..."""
s_seg1 = jieba.lcut(s)
# Word-vector matrix: one row per token that the word2vec model knows.
s_word2vec = pd.DataFrame([model.wv[i] for i in s_seg1 if i in model.wv])
s_word2vec
# Sentence vector = column-wise mean of the word vectors.
s_vec = s_word2vec.mean()
s_vec
import numpy as np
# predict expects a 2-D array, so turn the vector into a single-row matrix.
s_vec = np.array(s_vec)
s_vec = s_vec.reshape(1,-1)
result_svm = clf2.predict(s_vec)
# The original displayed `result_sum`, a name that is never defined
# (NameError); the prediction is stored in `result_svm`.
result_svm