目錄
1、基於TF的關鍵詞提取
使用TF詞頻對訓練集clean_data_train進行關鍵詞提取,選取topK個關鍵詞作爲特徵詞,即topK=10000。
# 訓練集中詞頻統計,並計算TF值
def words_tf(path='data/clean_data_train.csv', top_k=10000):
    """Rank the words of a corpus file by term frequency (TF).

    The CSV at ``path`` is read without a header row as two columns,
    ``contents`` and ``labels``. Every ``contents`` cell is split on
    whitespace into words. A word's TF is its number of occurrences divided
    by the total number of word occurrences in the whole file.

    Args:
        path: CSV file to read. Defaults to the training set.
        top_k: Maximum number of ``(word, tf)`` pairs to return.

    Returns:
        A list of ``(word, tf)`` tuples sorted by descending TF, at most
        ``top_k`` entries long. Words with equal TF keep first-seen order.
    """
    train_data = pd.read_csv(path, sep=',', names=['contents', 'labels']).astype(str)

    # Count every occurrence of every word across all documents.
    word_counts = defaultdict(int)
    for text in train_data['contents']:
        for word in str(text).split():
            word_counts[word] += 1

    # Sum once up front: recomputing the total for each word made this step
    # quadratic in the vocabulary size.
    total = sum(word_counts.values())
    word_tf = {word: count / total for word, count in word_counts.items()}

    ranked = sorted(word_tf.items(), key=lambda item: item[1], reverse=True)
    return ranked[:top_k]
2、根據詞頻將文本轉化爲向量
根據所提取的關鍵詞及其詞頻權重值,將訓練集clean_data_train和測試集clean_data_test轉換爲向量,作爲模型的輸入。
# 根據詞頻,將文本轉換爲向量
def word2vec(keywords_tf, doc_sentence):
    """Turn tokenized documents into fixed-length TF-weighted vectors.

    Each keyword owns one column of the vector. A column holds the keyword's
    TF weight when the keyword occurs in the document (however many times)
    and 0 otherwise.

    Args:
        keywords_tf: Sequence of ``(word, tf)`` pairs; its order defines the
            column order of the vectors.
        doc_sentence: Iterable of documents, each an iterable of words.

    Returns:
        A list with one vector (a list of length ``len(keywords_tf)``) per
        document, in input order.
    """
    weight_of = dict(keywords_tf)
    # Map each keyword to its column once, so every word lookup is a constant
    # time dict access instead of repeated linear scans of the keyword list.
    column_of = {word: col for col, word in enumerate(weight_of)}
    width = len(keywords_tf)

    docvec_list = []
    for sentence in doc_sentence:
        docvec = [0] * width
        for word in sentence:
            col = column_of.get(word)
            if col is not None:
                docvec[col] = weight_of[word]
        docvec_list.append(docvec)
    return docvec_list
# 將訓練集和測試集換爲文本向量
def doc_vec(x_train, x_test):
    """Vectorize the training and test documents with one shared keyword list.

    The keyword list comes from ``words_tf()``. Each document is converted to
    ``str``, split on whitespace, and passed to ``word2vec``.

    Args:
        x_train: Iterable of training documents.
        x_test: Iterable of test documents.

    Returns:
        A ``(train_vectors, test_vectors)`` tuple holding the ``word2vec``
        result for each set.
    """
    keywords_tf = words_tf()

    # Iterate instead of indexing with range(len(...)): positional x[i]
    # lookups are label-based on a pandas Series and break once its index is
    # no longer 0..n-1 (e.g. after a shuffle or split).
    train_tokens = [str(doc).split() for doc in x_train]
    test_tokens = [str(doc).split() for doc in x_test]

    train_docvec_list = word2vec(keywords_tf, train_tokens)
    test_docvec_list = word2vec(keywords_tf, test_tokens)
    return train_docvec_list, test_docvec_list
3、基於樹模型的重要特徵選擇
使用樹模型(ExtraTreesClassifier)對樣本進行訓練,從10000個特徵中選擇重要特徵(保留特徵重要性不低於1.5倍均值的特徵),本次實驗共選出986個特徵,即features=986。
# Use SelectFromModel together with ExtraTreesClassifier: fit the trees to get
# per-feature importances, then keep only the features above a threshold.
clf_model = ExtraTreesClassifier(n_estimators=250, random_state=0)
# Alternative estimator:
# clf_model=RandomForestClassifier(n_estimators=250,random_state=0)
clf_model.fit(x_train, y_train)
# Importance of each feature (one per keyword column); higher means more important.
importances = clf_model.feature_importances_
# NOTE: the triple-quoted block below is a bare string literal, not executed
# code. It sketches how to dump the 2000 most important words to a file and
# refers to a `words_list` that is not defined in this snippet.
'''
# 將詞和詞的權重存入字典並寫入文件
feature_words_dic = {}
for i in range(len(words_list)):
feature_words_dic[words_list[i][0]] = importances[i]
# 對字典按權重由大到小進行排序
words_info_dic_sort = sorted(feature_words_dic.items(), key=lambda x: x[1], reverse=True)
#將前2000個詞的權重字典寫入文件
key_words_importance=dict(words_info_dic_sort[:2000])
with open('data/key_words_importance','w') as f:
f.write(str(key_words_importance))
'''
# Keep the features whose importance is at least 1.5 times the mean importance.
# prefit=True reuses the classifier fitted above instead of refitting it.
model = SelectFromModel(clf_model, threshold='1.5*mean', prefit=True)
x_train_new = model.transform(x_train) # training set reduced to the selected features
x_test_new = model.transform(x_test) # test set reduced to the same selected features
4、lightGBM模型構建
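lightGBM參數設置:num_boost_round=2000,max_depth=6,num_leaves=35,min_data_in_leaf=20,learning_rate=0.1,lambda_l1=0.1,lambda_l2=0.2,objective='multiclass',num_class=3,其餘採用模型默認參數。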
# Wrap the selected features in LightGBM's Dataset format. The validation set
# references the training set so both share the same feature binning.
lgb_train = lgb.Dataset(x_train_new, y_train)
lgb_val = lgb.Dataset(x_test_new, y_test, reference=lgb_train)
# LightGBM parameters for a 3-class problem.
params = {'max_depth': 6, 'min_data_in_leaf': 20, 'num_leaves': 35, 'learning_rate': 0.1, 'lambda_l1': 0.1,
          'lambda_l2': 0.2, 'objective': 'multiclass', 'num_class': 3, 'verbose': -1}
# Number of boosting rounds (the library default is 100).
num_boost_round = 2000
# Train the model. The `verbose_eval` argument was removed from lgb.train in
# LightGBM 4.0; the log_evaluation callback reports the validation metric
# every 100 rounds instead.
gbm = lgb.train(params, lgb_train, num_boost_round, valid_sets=[lgb_val],
                callbacks=[lgb.log_evaluation(period=100)])
5、完整代碼實現
# coding=utf-8
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
import lightgbm as lgb
from sklearn import metrics
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
# 訓練集中詞頻統計,並計算TF值
def words_tf(path='data/clean_data_train.csv', top_k=10000):
    """Rank the words of a corpus file by term frequency (TF).

    The CSV at ``path`` is read without a header row as two columns,
    ``contents`` and ``labels``. Every ``contents`` cell is split on
    whitespace into words. A word's TF is its number of occurrences divided
    by the total number of word occurrences in the whole file.

    Args:
        path: CSV file to read. Defaults to the training set.
        top_k: Maximum number of ``(word, tf)`` pairs to return.

    Returns:
        A list of ``(word, tf)`` tuples sorted by descending TF, at most
        ``top_k`` entries long. Words with equal TF keep first-seen order.
    """
    train_data = pd.read_csv(path, sep=',', names=['contents', 'labels']).astype(str)

    # Count every occurrence of every word across all documents.
    word_counts = defaultdict(int)
    for text in train_data['contents']:
        for word in str(text).split():
            word_counts[word] += 1

    # Sum once up front: recomputing the total for each word made this step
    # quadratic in the vocabulary size.
    total = sum(word_counts.values())
    word_tf = {word: count / total for word, count in word_counts.items()}

    ranked = sorted(word_tf.items(), key=lambda item: item[1], reverse=True)
    return ranked[:top_k]
# 根據詞頻,將文本轉換爲向量
def word2vec(keywords_tf, doc_sentence):
    """Turn tokenized documents into fixed-length TF-weighted vectors.

    Each keyword owns one column of the vector. A column holds the keyword's
    TF weight when the keyword occurs in the document (however many times)
    and 0 otherwise.

    Args:
        keywords_tf: Sequence of ``(word, tf)`` pairs; its order defines the
            column order of the vectors.
        doc_sentence: Iterable of documents, each an iterable of words.

    Returns:
        A list with one vector (a list of length ``len(keywords_tf)``) per
        document, in input order.
    """
    weight_of = dict(keywords_tf)
    # Map each keyword to its column once, so every word lookup is a constant
    # time dict access instead of repeated linear scans of the keyword list.
    column_of = {word: col for col, word in enumerate(weight_of)}
    width = len(keywords_tf)

    docvec_list = []
    for sentence in doc_sentence:
        docvec = [0] * width
        for word in sentence:
            col = column_of.get(word)
            if col is not None:
                docvec[col] = weight_of[word]
        docvec_list.append(docvec)
    return docvec_list
# 將訓練集和測試集換爲文本向量
def doc_vec(x_train, x_test):
    """Vectorize the training and test documents with one shared keyword list.

    The keyword list comes from ``words_tf()``. Each document is converted to
    ``str``, split on whitespace, and passed to ``word2vec``.

    Args:
        x_train: Iterable of training documents.
        x_test: Iterable of test documents.

    Returns:
        A ``(train_vectors, test_vectors)`` tuple holding the ``word2vec``
        result for each set.
    """
    keywords_tf = words_tf()

    # Iterate instead of indexing with range(len(...)): positional x[i]
    # lookups are label-based on a pandas Series and break once its index is
    # no longer 0..n-1 (e.g. after a shuffle or split).
    train_tokens = [str(doc).split() for doc in x_train]
    test_tokens = [str(doc).split() for doc in x_test]

    train_docvec_list = word2vec(keywords_tf, train_tokens)
    test_docvec_list = word2vec(keywords_tf, test_tokens)
    return train_docvec_list, test_docvec_list
if __name__ == '__main__':
    # Load the cleaned corpus (no header row) and hold out 5% of it for evaluation.
    train_data = pd.read_csv('data/clean_data_train.csv', sep=',', names=['contents', 'labels']).astype(str)
    x_train, x_test, y_train, y_test = train_test_split(train_data['contents'], train_data['labels'], test_size=0.05)
    x_train = np.array(x_train)
    x_test = np.array(x_test)
    # Labels were read as strings; convert them back to integers.
    y_train = np.array(y_train.apply(int))
    y_test = np.array(y_test.apply(int))

    x_train, x_test = doc_vec(x_train, x_test)  # vectorize training and test documents
    x_train, y_train = shuffle(x_train, y_train, random_state=0)  # shuffle the training rows

    # Fit ExtraTreesClassifier to get per-feature importances, then let
    # SelectFromModel keep only the features above a threshold.
    clf_model = ExtraTreesClassifier(n_estimators=250, random_state=0)
    # Alternative estimator:
    # clf_model = RandomForestClassifier(n_estimators=250, random_state=0)
    clf_model.fit(x_train, y_train)
    # Importance of each feature (one per keyword column); higher means more important.
    importances = clf_model.feature_importances_
    # Optional: to inspect the most important words, pair each keyword with
    # importances[i], sort by importance in descending order and write the top
    # 2000 to 'data/key_words_importance'. This needs the keyword list that
    # defines the feature columns (presumably the result of words_tf()).

    # Keep the features whose importance is at least 1.5 times the mean
    # importance; prefit=True reuses the classifier fitted above.
    model = SelectFromModel(clf_model, threshold='1.5*mean', prefit=True)
    x_train_new = model.transform(x_train)  # training set reduced to the selected features
    x_test_new = model.transform(x_test)  # test set reduced to the same selected features
    print(x_train_new.shape)
    print(x_test_new.shape)

    # Wrap the selected features in LightGBM's Dataset format. The validation
    # set references the training set so both share the same feature binning.
    lgb_train = lgb.Dataset(x_train_new, y_train)
    lgb_val = lgb.Dataset(x_test_new, y_test, reference=lgb_train)
    # LightGBM parameters for a 3-class problem.
    params = {'max_depth': 6, 'min_data_in_leaf': 20, 'num_leaves': 35, 'learning_rate': 0.1, 'lambda_l1': 0.1,
              'lambda_l2': 0.2, 'objective': 'multiclass', 'num_class': 3, 'verbose': -1}
    # Number of boosting rounds (the library default is 100).
    num_boost_round = 2000
    # Train the model. The `verbose_eval` argument was removed from lgb.train
    # in LightGBM 4.0; the log_evaluation callback reports the validation
    # metric every 100 rounds instead.
    gbm = lgb.train(params, lgb_train, num_boost_round, valid_sets=[lgb_val],
                    callbacks=[lgb.log_evaluation(period=100)])
    # Save the model to a file if needed:
    # gbm.save_model('data/lightGBM_model')

    # Predict class probabilities for the held-out set and take the most likely class.
    result = gbm.predict(x_test_new, num_iteration=gbm.best_iteration)
    y_predict = np.argmax(result, axis=1)
    label_all = ['負面', '中性', '正面']
    # Pin the label order so the matrix is always 3x3 and lines up with
    # label_all, even when a class is missing from the small hold-out set.
    # The multiclass objective with num_class=3 requires labels 0, 1 and 2.
    confusion_mat = metrics.confusion_matrix(y_test, y_predict, labels=[0, 1, 2])
    df = pd.DataFrame(confusion_mat, columns=label_all)
    df.index = label_all
    print('準確率:', metrics.accuracy_score(y_test, y_predict))
    print('confusion_matrix:', df)
    print('分類報告:', metrics.classification_report(y_test, y_predict))
6、分類結果