LightGBM Text Classification with Tree-Based Feature Selection

Contents

1. Keyword extraction based on TF

2. Converting text to vectors by word frequency

3. Selecting important features with a tree model

4. Building the LightGBM model

5. Full implementation

6. Classification results


1. Keyword extraction based on TF

Use term frequency (TF) to extract keywords from the training set clean_data_train, keeping the topK highest-frequency words as feature words, with topK = 10000. Here a word's TF is its count divided by the total number of tokens in the corpus.

# Count word frequencies over the training set and compute each word's TF value
def words_tf():
    train_data = pd.read_csv('data/clean_data_train.csv', sep=',', names=['contents', 'labels']).astype(str)
    sentence_list = [str(content).split() for content in train_data['contents']]

    # Overall word-frequency count
    doc_frequency = defaultdict(int)
    for word_list in sentence_list:
        for word in word_list:
            doc_frequency[word] += 1

    # Compute each word's TF value; sum the total count once, rather than
    # inside the loop, which was quadratic in the vocabulary size
    total_count = sum(doc_frequency.values())
    word_tf = {word: count / total_count for word, count in doc_frequency.items()}

    # Return the 10000 highest-TF words as (word, tf) pairs
    words_tf = sorted(word_tf.items(), key=lambda x: x[1], reverse=True)
    return words_tf[:10000]
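
As a quick sanity check (assuming data/clean_data_train.csv is in place), the function returns at most 10000 (word, tf) pairs sorted by descending TF:

top_words = words_tf()
print(len(top_words))  # at most 10000
print(top_words[0])    # the highest-TF word and its weight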

2. Converting text to vectors by word frequency

Using the extracted keywords and their TF weights, convert the training set clean_data_train and the test set clean_data_test into vectors to serve as model input.

# Convert texts to vectors using the keyword TF weights
def word2vec(keywords_tf, doc_sentence):
    # Map each keyword to its vector position and TF weight up front,
    # avoiding repeated O(n) list.index() lookups per word
    keyword_pos = {word: (i, tf) for i, (word, tf) in enumerate(keywords_tf)}

    docvec_list = []
    for sentence in doc_sentence:
        docvec = [0] * len(keywords_tf)
        for word in sentence:
            if word in keyword_pos:
                i, tf = keyword_pos[word]
                docvec[i] = tf
        docvec_list.append(docvec)
    return docvec_list
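
A toy run (with made-up keywords and TF values) illustrates the encoding: each document becomes a vector over the keyword vocabulary, holding the keyword's TF weight wherever that keyword occurs:

toy_keywords = [('good', 0.3), ('bad', 0.2), ('ok', 0.1)]  # hypothetical (word, tf) pairs
toy_docs = [['good', 'movie'], ['bad', 'ok', 'bad']]
print(word2vec(toy_keywords, toy_docs))
# -> [[0.3, 0, 0], [0, 0.2, 0.1]]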

# Convert the training and test sets into document vectors
def doc_vec(x_train, x_test):
    keywords_tf = words_tf()  # get the TF keywords

    # Vectorize the training set
    train_data_list = [str(x).split() for x in x_train]
    train_docvec_list = word2vec(keywords_tf, train_data_list)

    # Vectorize the test set
    test_data_list = [str(x).split() for x in x_test]
    test_docvec_list = word2vec(keywords_tf, test_data_list)

    return train_docvec_list, test_docvec_list

3. Selecting important features with a tree model

Train a tree model on the samples and select the important features from the 10000 candidates (keeping features whose importance is at least 1.5 times the mean importance), which leaves features = 986.

    # Use ExtraTreesClassifier with SelectFromModel to compute feature
    # importances and select features above an importance threshold
    clf_model = ExtraTreesClassifier(n_estimators=250, random_state=0)
    # clf_model = RandomForestClassifier(n_estimators=250, random_state=0)
    clf_model.fit(x_train, y_train)
    # Per-feature importance weights; the higher the value, the more important the feature
    importances = clf_model.feature_importances_

    '''
    # Store each word and its importance weight in a dictionary and write it to a file
    feature_words_dic = {}
    for i in range(len(words_list)):
        feature_words_dic[words_list[i][0]] = importances[i]
    # Sort the dictionary by weight in descending order
    words_info_dic_sort = sorted(feature_words_dic.items(), key=lambda x: x[1], reverse=True)
    # Write the weights of the top 2000 words to a file
    key_words_importance = dict(words_info_dic_sort[:2000])
    with open('data/key_words_importance', 'w') as f:
        f.write(str(key_words_importance))
    '''

    # Keep features whose importance is at least 1.5 times the mean
    model = SelectFromModel(clf_model, threshold='1.5*mean', prefit=True)
    x_train_new = model.transform(x_train)  # selected features of the training set
    x_test_new = model.transform(x_test)    # selected features of the test set
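
For intuition, the '1.5*mean' threshold is equivalent to the following manual masking (a sketch; x_train is the list of document vectors built above, and numpy is imported as np in the full code below):

    # Equivalent manual selection: keep columns whose importance
    # is at least 1.5 times the mean importance
    mask = importances >= 1.5 * importances.mean()
    x_train_manual = np.asarray(x_train)[:, mask]
    assert x_train_manual.shape == x_train_new.shape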

4. Building the LightGBM model

LightGBM settings: num_boost_round = 2000, max_depth = 6, learning_rate = 0.1; the remaining parameters keep their defaults.

    # Wrap the data in LightGBM's Dataset format
    lgb_train = lgb.Dataset(x_train_new, y_train)
    lgb_val = lgb.Dataset(x_test_new, y_test, reference=lgb_train)

    # Build the LightGBM model
    params = {'max_depth': 6, 'min_data_in_leaf': 20, 'num_leaves': 35, 'learning_rate': 0.1, 'lambda_l1': 0.1,
              'lambda_l2': 0.2, 'objective': 'multiclass', 'num_class': 3, 'verbose': -1}
    # Number of boosting rounds; the default is 100, usually set higher
    num_boost_round = 2000
    # Train the LightGBM model. The verbose_eval argument was removed in
    # LightGBM 4.0; a log_evaluation callback prints metrics every 100 rounds instead
    gbm = lgb.train(params, lgb_train, num_boost_round, valid_sets=[lgb_val],
                    callbacks=[lgb.log_evaluation(period=100)])
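
Optionally, adding an early-stopping callback (a sketch; stopping_rounds=50 is an arbitrary choice, not from the original post) makes gbm.best_iteration meaningful, so prediction later uses the best round on the validation set rather than all 2000:

    gbm = lgb.train(params, lgb_train, num_boost_round, valid_sets=[lgb_val],
                    callbacks=[lgb.log_evaluation(period=100),
                               lgb.early_stopping(stopping_rounds=50)])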

5. Full implementation

# coding=utf-8
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
import lightgbm as lgb
from sklearn import metrics
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split


# Count word frequencies over the training set and compute each word's TF value
def words_tf():
    train_data = pd.read_csv('data/clean_data_train.csv', sep=',', names=['contents', 'labels']).astype(str)
    sentence_list = [str(content).split() for content in train_data['contents']]

    # Overall word-frequency count
    doc_frequency = defaultdict(int)
    for word_list in sentence_list:
        for word in word_list:
            doc_frequency[word] += 1

    # Compute each word's TF value; sum the total count once, rather than
    # inside the loop, which was quadratic in the vocabulary size
    total_count = sum(doc_frequency.values())
    word_tf = {word: count / total_count for word, count in doc_frequency.items()}

    # Return the 10000 highest-TF words as (word, tf) pairs
    words_tf = sorted(word_tf.items(), key=lambda x: x[1], reverse=True)
    return words_tf[:10000]


# Convert texts to vectors using the keyword TF weights
def word2vec(keywords_tf, doc_sentence):
    # Map each keyword to its vector position and TF weight up front,
    # avoiding repeated O(n) list.index() lookups per word
    keyword_pos = {word: (i, tf) for i, (word, tf) in enumerate(keywords_tf)}

    docvec_list = []
    for sentence in doc_sentence:
        docvec = [0] * len(keywords_tf)
        for word in sentence:
            if word in keyword_pos:
                i, tf = keyword_pos[word]
                docvec[i] = tf
        docvec_list.append(docvec)
    return docvec_list


# Convert the training and test sets into document vectors
def doc_vec(x_train, x_test):
    keywords_tf = words_tf()  # get the TF keywords

    # Vectorize the training set
    train_data_list = [str(x).split() for x in x_train]
    train_docvec_list = word2vec(keywords_tf, train_data_list)

    # Vectorize the test set
    test_data_list = [str(x).split() for x in x_test]
    test_docvec_list = word2vec(keywords_tf, test_data_list)

    return train_docvec_list, test_docvec_list


if __name__ == '__main__':
    train_data = pd.read_csv('data/clean_data_train.csv', sep=',', names=['contents', 'labels']).astype(str)
    x_train, x_test, y_train, y_test = train_test_split(train_data['contents'], train_data['labels'], test_size=0.05)

    x_train = np.array(x_train)
    x_test = np.array(x_test)
    y_train = np.array(y_train.apply(int))
    y_test = np.array(y_test.apply(int))

    x_train, x_test = doc_vec(x_train, x_test)  # vectorize the training and test sets
    x_train, y_train = shuffle(x_train, y_train, random_state=0)  # shuffle the training samples

    # Use ExtraTreesClassifier with SelectFromModel to compute feature
    # importances and select features above an importance threshold
    clf_model = ExtraTreesClassifier(n_estimators=250, random_state=0)
    # clf_model = RandomForestClassifier(n_estimators=250, random_state=0)
    clf_model.fit(x_train, y_train)
    # Per-feature importance weights; the higher the value, the more important the feature
    importances = clf_model.feature_importances_

    '''
    # Store each word and its importance weight in a dictionary and write it to a file
    feature_words_dic = {}
    for i in range(len(words_list)):
        feature_words_dic[words_list[i][0]] = importances[i]
    # Sort the dictionary by weight in descending order
    words_info_dic_sort = sorted(feature_words_dic.items(), key=lambda x: x[1], reverse=True)
    # Write the weights of the top 2000 words to a file
    key_words_importance = dict(words_info_dic_sort[:2000])
    with open('data/key_words_importance', 'w') as f:
        f.write(str(key_words_importance))
    '''

    # Keep features whose importance is at least 1.5 times the mean
    model = SelectFromModel(clf_model, threshold='1.5*mean', prefit=True)
    x_train_new = model.transform(x_train)  # selected features of the training set
    x_test_new = model.transform(x_test)    # selected features of the test set

    print(x_train_new.shape)
    print(x_test_new.shape)

    # Wrap the data in LightGBM's Dataset format
    lgb_train = lgb.Dataset(x_train_new, y_train)
    lgb_val = lgb.Dataset(x_test_new, y_test, reference=lgb_train)

    # Build the LightGBM model
    params = {'max_depth': 6, 'min_data_in_leaf': 20, 'num_leaves': 35, 'learning_rate': 0.1, 'lambda_l1': 0.1,
              'lambda_l2': 0.2, 'objective': 'multiclass', 'num_class': 3, 'verbose': -1}
    # Number of boosting rounds; the default is 100, usually set higher
    num_boost_round = 2000
    # Train the LightGBM model (verbose_eval was removed in LightGBM 4.0;
    # log evaluation metrics every 100 rounds via a callback instead)
    gbm = lgb.train(params, lgb_train, num_boost_round, valid_sets=[lgb_val],
                    callbacks=[lgb.log_evaluation(period=100)])

    # Save the model to a file
    # gbm.save_model('data/lightGBM_model')

    # Predict on the test set; best_iteration is only set when early stopping
    # is used, otherwise all boosting rounds are used
    result = gbm.predict(x_test_new, num_iteration=gbm.best_iteration)
    y_predict = np.argmax(result, axis=1)  # take the label with the highest probability

    label_all = ['負面', '中性', '正面']  # class names: negative, neutral, positive
    confusion_mat = metrics.confusion_matrix(y_test, y_predict)
    df = pd.DataFrame(confusion_mat, columns=label_all)
    df.index = label_all

    print('Accuracy:', metrics.accuracy_score(y_test, y_predict))
    print('confusion_matrix:', df)
    print('Classification report:', metrics.classification_report(y_test, y_predict))

6. Classification results

