"Machine Learning in Action" Personal Study Notes (9): Naive Bayes for Sina News Classification (Sklearn)

Chapter 4 Naive Bayes

PS: Personal notes, compiled while studying the book "Machine Learning in Action", Jack-Cui's blog, and 深度眸's videos.

1 Chinese Text Segmentation

import os
import jieba

def TextProcessing(folder_path):
    folder_list = os.listdir(folder_path)                         #list the sub-folders under folder_path
    data_list = []                                                #training data
    class_list = []                                               #class labels

    for folder in folder_list:                                    #iterate over each sub-folder
        new_folder_path = os.path.join(folder_path, folder)       #build the sub-folder path from its name
        files = os.listdir(new_folder_path)                       #list of the txt files inside the sub-folder
        j = 1
        for file in files:                                        #iterate over each txt file
            if j > 100:                                           #use at most 100 txt samples per class
                break
            with open(os.path.join(new_folder_path, file), 'r', encoding = 'utf-8') as f:    #open the txt file
                raw = f.read()                                    #read its content
            word_cut = jieba.cut(raw, cut_all = False)            #precise mode, returns an iterable generator
            word_list = list(word_cut)                            #convert the generator to a list
            data_list.append(word_list)                           #append to the data list
            class_list.append(folder)                             #append the folder name as the class label
            j += 1
    print(data_list)
    print(class_list)

if __name__ == '__main__':
    folder_path = './SogouC/Sample'                               #path where the training corpus is stored
    TextProcessing(folder_path)
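
jieba's precise mode (cut_all = False) segments a sentence into its most natural words and returns a generator. A minimal sketch of the call on a made-up sentence (the sentence is purely illustrative; the exact segmentation depends on jieba's dictionary and version):

import jieba

sentence = '我爱自然语言处理'                               #hypothetical example sentence
print(list(jieba.cut(sentence, cut_all = False)))           #precise mode: the fewest, most natural words
print(list(jieba.cut(sentence, cut_all = True)))            #full mode: every possible word, shown for comparison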

2 Text Feature Selection

We split all the documents into a training set and a test set, count the frequency of every word in the training set, and sort the words by frequency in descending order, i.e. the more frequent a word, the earlier it appears in the list. The code is as follows:

import os
import random
import jieba

"""
函數說明:中文文本處理
Parameters:
    folder_path - 文本存放的路徑
    test_size - 測試集佔比,默認佔所有數據集的百分之20
Returns:
    all_words_list - 按詞頻降序排序的訓練集列表
    train_data_list - 訓練集列表
    test_data_list - 測試集列表
    train_class_list - 訓練集標籤列表
    test_class_list - 測試集標籤列表
"""
def TextProcessing(folder_path, test_size = 0.2):
    folder_list = os.listdir(folder_path)                   
    data_list = []                                           
    class_list = []                                          

    #iterate over each sub-folder
    for folder in folder_list:
        new_folder_path = os.path.join(folder_path, folder)        
        files = os.listdir(new_folder_path)                        

        j = 1
        for file in files:
            if j > 100:                                          
                break
            with open(os.path.join(new_folder_path, file), 'r', encoding = 'utf-8') as f:    
                raw = f.read()

            word_cut = jieba.cut(raw, cut_all = False)         
            word_list = list(word_cut)                          

            data_list.append(word_list)                        
            class_list.append(folder)                        
            j += 1

    data_class_list = list(zip(data_list, class_list))             #zip the data and the labels together so they stay paired
    random.shuffle(data_class_list)                                #shuffle data_class_list, which was originally ordered by class
    index = int(len(data_class_list) * test_size) + 1              #index that separates the test set from the training set
    train_list = data_class_list[index:]                           #training set
    test_list = data_class_list[:index]                            #test set
    train_data_list, train_class_list = zip(*train_list)           #unzip the training set; zip(*) unzips
    test_data_list, test_class_list = zip(*test_list)              #unzip the test set

    all_words_dict = {}                                            #word-frequency statistics of the training set
    for word_list in train_data_list:
        for word in word_list:
            if word in all_words_dict.keys():
                all_words_dict[word] += 1
            else:
                all_words_dict[word] = 1

    #sort by the dictionary values (the counts) in descending order
    all_words_tuple_list = sorted(all_words_dict.items(), key = lambda f:f[1], reverse = True) #sort by count, descending
    all_words_list, all_words_nums = zip(*all_words_tuple_list)    #unzip
    all_words_list = list(all_words_list)                          #convert to a list
    return all_words_list, train_data_list, test_data_list, train_class_list, test_class_list

if __name__ == '__main__':
    folder_path = './SogouC/Sample'             
    all_words_list, train_data_list, test_data_list, train_class_list, test_class_list = TextProcessing(folder_path, test_size=0.2)
    print(all_words_list)
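
The manual dictionary counting above can also be written with collections.Counter from the standard library. A short sketch under the same assumptions (train_data_list is a list of word lists; count_words is a hypothetical helper name, not part of the original code):

from collections import Counter

def count_words(train_data_list):
    all_words_dict = Counter()                                  #maps word -> frequency
    for word_list in train_data_list:
        all_words_dict.update(word_list)                        #add the words of one document
    #most_common() returns (word, count) pairs already sorted by count in descending order
    return [word for word, count in all_words_dict.most_common()]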

First we remove the highest-frequency words. How many to remove can be decided by looking at how the number of removed high-frequency words relates to the final classification accuracy. We also drop digits, so numbers are not used as classification features, and we drop words such as "的", "一", "在", "不", "當然", "怎麼", i.e. prepositions, pronouns and conjunctions that have no bearing on the news category.

import os
import random
import jieba

def TextProcessing(folder_path, test_size = 0.2):
    folder_list = os.listdir(folder_path)                     
    data_list = []                                           
    class_list = []                                     

    #iterate over each sub-folder
    for folder in folder_list:
        new_folder_path = os.path.join(folder_path, folder)  
        files = os.listdir(new_folder_path)                     

        j = 1
        #iterate over each txt file
        for file in files:
            if j > 100:                                        
                break
            with open(os.path.join(new_folder_path, file), 'r', encoding = 'utf-8') as f:    
                raw = f.read()

            word_cut = jieba.cut(raw, cut_all = False)         
            word_list = list(word_cut)                       

            data_list.append(word_list) 
            class_list.append(folder)                       
            j += 1

    data_class_list = list(zip(data_list, class_list))       
    random.shuffle(data_class_list)                              
    index = int(len(data_class_list) * test_size) + 1        
    train_list = data_class_list[index:]                    
    test_list = data_class_list[:index]                    
    train_data_list, train_class_list = zip(*train_list)      
    test_data_list, test_class_list = zip(*test_list)        

    all_words_dict = {}                                      
    for word_list in train_data_list:
        for word in word_list:
            if word in all_words_dict.keys():
                all_words_dict[word] += 1
            else:
                all_words_dict[word] = 1
    all_words_tuple_list = sorted(all_words_dict.items(), key = lambda f:f[1], reverse = True)
    all_words_list, all_words_nums = zip(*all_words_tuple_list)   
    all_words_list = list(all_words_list)                   
    return all_words_list, train_data_list, test_data_list, train_class_list, test_class_list

"""
函數說明:讀取文件裏的內容,並去重,去停詞表

Parameters:
    words_file - 文件路徑
Returns:
    words_set - 讀取的內容的set集合
"""
def MakeWordsSet(words_file):
    words_set = set()                                            #創建set集合
    with open(words_file, 'r', encoding = 'utf-8') as f:         #打開文件,只讀
        for line in f.readlines():                               #一行一行讀取
            word = line.strip()                                  #除去空白符
            if len(word) > 0:                                    #有文本,則添加到words_set中
                words_set.add(word)                               
    return words_set                                             #返回處理結果

"""
函數說明:文本特徵選取
Parameters:
    all_words_list - 訓練集所有文本列表
    deleteN - 刪除詞頻最高的deleteN個詞
    stopwords_set - 指定的結束語
Returns:
    feature_words - 特徵集
"""
def words_dict(all_words_list, deleteN, stopwords_set = set()):
    feature_words = []                            #特徵列表
    n = 1
    for t in range(deleteN, len(all_words_list), 1):      #deleteN之前的相當於刪除了,之後的保存,按步長1一個個遍歷
        if n > 1000:                                      #feature_words的維度爲1000,最多也只去1000個
            break                               
        #如果這個詞不是數字,並且不是指定的結束語,並且單詞長度大於1小於5,那麼這個詞就可以作爲特徵詞
        if not all_words_list[t].isdigit() and all_words_list[t] not in stopwords_set and 1 < len(all_words_list[t]) < 5:
            feature_words.append(all_words_list[t])
        n += 1
    return feature_words

if __name__ == '__main__':
    folder_path = './SogouC/Sample'           
    all_words_list, train_data_list, test_data_list, train_class_list, test_class_list = TextProcessing(folder_path, test_size=0.2)
    stopwords_file = './stopwords_cn.txt'
    stopwords_set = MakeWordsSet(stopwords_file)
    feature_words = words_dict(all_words_list, 100, stopwords_set)
    print(feature_words)

This feature_words list is the final set of features we use for news classification. Next we use feature_words to vectorize the texts and then train the naive Bayes classifier.
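
Vectorizing simply means mapping each segmented document to a 0/1 vector over feature_words, which is what the TextFeatures function in the next section does. A minimal sketch with made-up words (both lists are illustrative only):

feature_words = ['比賽', '球隊', '電影', '導演']             #hypothetical feature word list
document = ['這', '支', '球隊', '贏', '了', '比賽']           #one segmented document
text_words = set(document)
vector = [1 if word in text_words else 0 for word in feature_words]
print(vector)                                                 #prints [1, 1, 0, 0]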

3 Building a Naive Bayes Classifier with Sklearn

We use MultinomialNB from sklearn.naive_bayes.

News classification is a multi-class problem, and MultinomialNB() handles it directly.

MultinomialNB assumes that the conditional probabilities of the features follow a multinomial distribution, i.e.:

P(Xj = xjl | Y = Ck) = (mkjl + λ) / (mk + nλ)

where P(Xj = xjl | Y = Ck) is the conditional probability of the l-th value of the j-th feature in the k-th class, mkjl is the number of class-k training samples whose j-th feature takes the value xjl, mk is the number of training samples whose output is the k-th class, and n is the number of distinct values the feature can take. λ is a constant greater than 0, most often set to 1, which gives Laplace smoothing; other values can also be used.
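
A toy illustration of this formula (all numbers made up): suppose one feature has three possible values, and among the mk = 4 training samples of class k they occur 3, 0 and 1 times. With λ = 1, even the value that never occurs gets a small non-zero probability:

m_kjl = [3, 0, 1]                                 #hypothetical counts of the three feature values among class-k samples
lam = 1                                           #the smoothing constant λ (Laplace smoothing)
n = len(m_kjl)                                    #number of possible values of the feature
m_k = sum(m_kjl)                                  #number of class-k samples, 4 here
probs = [(m + lam) / (m_k + n * lam) for m in m_kjl]
print(probs)                                      #approximately [0.571, 0.143, 0.286]; the second value is nonzero thanks to λ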


The parameters of MultinomialNB are explained below:

  • alpha: optional float parameter, default 1.0. It is the Laplace smoothing term, i.e. the λ in the formula above; setting it to 0 means no smoothing is applied.
  • fit_prior: optional boolean parameter, default True. It controls whether class prior probabilities are taken into account. If it is False, every class gets the same prior probability. Otherwise you can either supply the priors yourself through the third parameter class_prior, or leave class_prior unset and let MultinomialNB estimate the priors from the training data, in which case P(Y=Ck) = mk/m, where m is the total number of training samples and mk is the number of training samples belonging to class k.
  • class_prior: optional parameter, default None.
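
A minimal, self-contained usage sketch of MultinomialNB on made-up 0/1 feature vectors (the data and labels below are illustrative only, not taken from the news corpus):

from sklearn.naive_bayes import MultinomialNB

X_train = [[1, 0, 1, 0],                                    #hypothetical vectorized documents
           [1, 1, 0, 0],
           [0, 0, 1, 1],
           [0, 1, 1, 1]]
y_train = ['sports', 'sports', 'movie', 'movie']            #hypothetical class labels

clf = MultinomialNB(alpha = 1.0, fit_prior = True, class_prior = None)   #the default parameter values, written out
clf.fit(X_train, y_train)                                   #train on the toy data
print(clf.predict([[1, 1, 0, 0]]))                          #predict the most probable class for one vector
print(clf.score([[0, 0, 1, 1]], ['movie']))                 #score() returns the mean accuracy on the given data

The full program below applies the same API to the vectorized news data:
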
from sklearn.naive_bayes import MultinomialNB
import matplotlib.pyplot as plt
import os
import random
import jieba

def TextProcessing(folder_path, test_size = 0.2):
    folder_list = os.listdir(folder_path)                    
    data_list = []                                         
    class_list = []                                        
    for folder in folder_list:
        new_folder_path = os.path.join(folder_path, folder)    
        files = os.listdir(new_folder_path)                      
        j = 1
        for file in files:
            if j > 100:                                     
                break
            with open(os.path.join(new_folder_path, file), 'r', encoding = 'utf-8') as f:  
                raw = f.read()

            word_cut = jieba.cut(raw, cut_all = False)          
            word_list = list(word_cut)                   
            data_list.append(word_list)                       
            class_list.append(folder)                     
            j += 1
    data_class_list = list(zip(data_list, class_list))          
    random.shuffle(data_class_list)                           
    index = int(len(data_class_list) * test_size) + 1        
    train_list = data_class_list[index:]                
    test_list = data_class_list[:index]                 
    train_data_list, train_class_list = zip(*train_list)      
    test_data_list, test_class_list = zip(*test_list)           
    all_words_dict = {}                                   
    for word_list in train_data_list:
        for word in word_list:
            if word in all_words_dict.keys():
                all_words_dict[word] += 1
            else:
                all_words_dict[word] = 1
    all_words_tuple_list = sorted(all_words_dict.items(), key = lambda f:f[1], reverse = True)
    all_words_list, all_words_nums = zip(*all_words_tuple_list) 
    all_words_list = list(all_words_list)              
    return all_words_list, train_data_list, test_data_list, train_class_list, test_class_list

def MakeWordsSet(words_file):
    words_set = set()                                        
    with open(words_file, 'r', encoding = 'utf-8') as f:  
        for line in f.readlines():                       
            word = line.strip()                            
            if len(word) > 0:                 
                words_set.add(word)                               
    return words_set                                      

"""
函數說明:根據feature_words將文本向量化

Parameters:
    train_data_list - 訓練集
    test_data_list - 測試集
    feature_words - 特徵集
Returns:
    train_feature_list - 訓練集向量化列表
    test_feature_list - 測試集向量化列表
"""
def TextFeatures(train_data_list, test_data_list, feature_words):
    def text_features(text, feature_words):                        #出現在特徵集中,則置1                                               
        text_words = set(text)
        features = [1 if word in text_words else 0 for word in feature_words]
        return features
    train_feature_list = [text_features(text, feature_words) for text in train_data_list]
    test_feature_list = [text_features(text, feature_words) for text in test_data_list]
    return train_feature_list, test_feature_list         

def words_dict(all_words_list, deleteN, stopwords_set = set()):
    feature_words = []                      
    n = 1
    for t in range(deleteN, len(all_words_list), 1):
        if n > 1000:                   
            break                               
        if not all_words_list[t].isdigit() and all_words_list[t] not in stopwords_set and 1 < len(all_words_list[t]) < 5:
            feature_words.append(all_words_list[t])
        n += 1
    return feature_words

"""
函數說明:新聞分類器

Parameters:
    train_feature_list - 訓練集向量化的特徵文本
    test_feature_list - 測試集向量化的特徵文本
    train_class_list - 訓練集分類標籤
    test_class_list - 測試集分類標籤
Returns:
    test_accuracy - 分類器精度
"""
def TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list):
    classifier = MultinomialNB().fit(train_feature_list, train_class_list)
    test_accuracy = classifier.score(test_feature_list, test_class_list)
    return test_accuracy

if __name__ == '__main__':
    folder_path = './SogouC/Sample'             
    all_words_list, train_data_list, test_data_list, train_class_list, test_class_list = TextProcessing(folder_path, test_size=0.2)
    stopwords_file = './stopwords_cn.txt'
    stopwords_set = MakeWordsSet(stopwords_file)


    test_accuracy_list = []
    deleteNs = range(0, 1000, 20)                #some high-frequency words have to be removed, but exactly how many must be found by experiment; this loop tries a range of values so the relationship can be visualized
    for deleteN in deleteNs:
        feature_words = words_dict(all_words_list, deleteN, stopwords_set)
        train_feature_list, test_feature_list = TextFeatures(train_data_list, test_data_list, feature_words)
        test_accuracy = TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list)
        test_accuracy_list.append(test_accuracy)

    plt.figure()
    plt.plot(deleteNs, test_accuracy_list)
    plt.title('Relationship of deleteNs and test_accuracy')
    plt.xlabel('deleteNs')
    plt.ylabel('test_accuracy')
    plt.show()

This plots the relationship between deleteNs and test_accuracy, from which we can roughly decide how many of the most frequent words to drop. Because the data are shuffled, the curve may differ from run to run, so we can repeat the test several times before settling on a value of deleteN (see the sketch below); once that parameter is fixed, the naive Bayes classifier for news classification is ready.
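
Because TextProcessing shuffles the data, the accuracy for a fixed deleteN varies between runs. A minimal sketch, assuming folder_path, stopwords_set and the functions defined above are already available, of averaging the accuracy over several random splits for deleteN = 450:

test_accuracy_list = []
for i in range(10):                                         #repeat with 10 different random train/test splits
    all_words_list, train_data_list, test_data_list, train_class_list, test_class_list = TextProcessing(folder_path, test_size=0.2)
    feature_words = words_dict(all_words_list, 450, stopwords_set)
    train_feature_list, test_feature_list = TextFeatures(train_data_list, test_data_list, feature_words)
    test_accuracy_list.append(TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list))
print(sum(test_accuracy_list) / len(test_accuracy_list))    #average accuracy over the 10 runs

The program below runs a single pass with deleteN = 450: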

if __name__ == '__main__':
    folder_path = './SogouC/Sample'             
    all_words_list, train_data_list, test_data_list, train_class_list, test_class_list = TextProcessing(folder_path, test_size=0.2)

    stopwords_file = './stopwords_cn.txt'
    stopwords_set = MakeWordsSet(stopwords_file)


    test_accuracy_list = []
    feature_words = words_dict(all_words_list, 450, stopwords_set)
    train_feature_list, test_feature_list = TextFeatures(train_data_list, test_data_list, feature_words)
    test_accuracy = TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list)
    test_accuracy_list.append(test_accuracy)
    ave = lambda c: sum(c) / len(c)
    print(ave(test_accuracy_list))


