Chapter 4: Naive Bayes
P.S. Personal notes, based on the book Machine Learning in Action (《機器學習實戰》), Jack-Cui's blog, and 深度眸's videos.
1 Chinese Text Segmentation
import os
import jieba

def TextProcessing(folder_path):
    folder_list = os.listdir(folder_path)                    # list the subfolders under folder_path
    data_list = []                                           # training data
    class_list = []                                          # class labels
    for folder in folder_list:                               # iterate over the subfolders
        new_folder_path = os.path.join(folder_path, folder)  # build each subfolder's path from its name
        files = os.listdir(new_folder_path)                  # list the txt files in the subfolder
        j = 1
        for file in files:                                   # iterate over the txt files
            if j > 100:                                      # cap each class at 100 samples
                break
            with open(os.path.join(new_folder_path, file), 'r', encoding='utf-8') as f:  # open the txt file
                raw = f.read()                               # read the file
                word_cut = jieba.cut(raw, cut_all=False)     # precise mode; returns an iterable generator
                word_list = list(word_cut)                   # convert the generator to a list
                data_list.append(word_list)                  # append the word list to the data list
                class_list.append(folder)                    # the folder name serves as the class label
                j += 1
    print(data_list)
    print(class_list)

if __name__ == '__main__':
    folder_path = './SogouC/Sample'                          # path to the training corpus
    TextProcessing(folder_path)
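For reference, jieba's precise mode (cut_all=False, used above) and full mode (cut_all=True) segment the same sentence differently. A small sketch (the sentence is a stock example; exact output depends on jieba's dictionary):

import jieba

sentence = '我来到北京清华大学'  # illustrative sentence
print('/'.join(jieba.cut(sentence, cut_all=False)))  # precise mode: one best segmentation
print('/'.join(jieba.cut(sentence, cut_all=True)))   # full mode: all dictionary words, with overlaps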
2 Text Feature Selection
We split all the texts into a training set and a test set, then count word frequencies over the whole training set and sort them in descending order, so that frequent words come first and rare words come last. The code is as follows:
import os
import random
import jieba

"""
Function: Chinese text processing
Parameters:
    folder_path - path to the corpus
    test_size - fraction of the data used as the test set (default 20%)
Returns:
    all_words_list - training-set vocabulary sorted by descending frequency
    train_data_list - training set
    test_data_list - test set
    train_class_list - training-set labels
    test_class_list - test-set labels
"""
def TextProcessing(folder_path, test_size=0.2):
    folder_list = os.listdir(folder_path)
    data_list = []
    class_list = []
    # iterate over the subfolders
    for folder in folder_list:
        new_folder_path = os.path.join(folder_path, folder)
        files = os.listdir(new_folder_path)
        j = 1
        for file in files:
            if j > 100:
                break
            with open(os.path.join(new_folder_path, file), 'r', encoding='utf-8') as f:
                raw = f.read()
                word_cut = jieba.cut(raw, cut_all=False)
                word_list = list(word_cut)
                data_list.append(word_list)
                class_list.append(folder)
                j += 1
    data_class_list = list(zip(data_list, class_list))       # zip pairs each sample with its label
    random.shuffle(data_class_list)                          # shuffle; the list was originally ordered by class
    index = int(len(data_class_list) * test_size) + 1        # index splitting the training and test sets
    train_list = data_class_list[index:]                     # training set
    test_list = data_class_list[:index]                      # test set
    train_data_list, train_class_list = zip(*train_list)     # unzip the training set; zip(*) unpacks the pairs
    test_data_list, test_class_list = zip(*test_list)        # unzip the test set
    all_words_dict = {}                                      # word-frequency counts over the training set
    for word_list in train_data_list:
        for word in word_list:
            if word in all_words_dict.keys():
                all_words_dict[word] += 1
            else:
                all_words_dict[word] = 1
    # sort by value (count) in descending order
    all_words_tuple_list = sorted(all_words_dict.items(), key=lambda f: f[1], reverse=True)
    all_words_list, all_words_nums = zip(*all_words_tuple_list)  # unzip words and counts
    all_words_list = list(all_words_list)                    # convert to a list
    return all_words_list, train_data_list, test_data_list, train_class_list, test_class_list

if __name__ == '__main__':
    folder_path = './SogouC/Sample'
    all_words_list, train_data_list, test_data_list, train_class_list, test_class_list = TextProcessing(folder_path, test_size=0.2)
    print(all_words_list)
First we remove the highest-frequency words; how many to remove can be determined by observing the relationship between the number of removed high-frequency words and the final classification accuracy. Beyond that, we remove digits, which are not used as classification features, and we remove certain words such as 的, 一, 在, 不, 當然, 怎麼: prepositions, pronouns, and conjunctions that have no bearing on news classification.
import os
import random
import jieba

def TextProcessing(folder_path, test_size=0.2):
    folder_list = os.listdir(folder_path)
    data_list = []
    class_list = []
    # iterate over the subfolders
    for folder in folder_list:
        new_folder_path = os.path.join(folder_path, folder)
        files = os.listdir(new_folder_path)
        j = 1
        # iterate over the txt files
        for file in files:
            if j > 100:
                break
            with open(os.path.join(new_folder_path, file), 'r', encoding='utf-8') as f:
                raw = f.read()
                word_cut = jieba.cut(raw, cut_all=False)
                word_list = list(word_cut)
                data_list.append(word_list)
                class_list.append(folder)
                j += 1
    data_class_list = list(zip(data_list, class_list))
    random.shuffle(data_class_list)
    index = int(len(data_class_list) * test_size) + 1
    train_list = data_class_list[index:]
    test_list = data_class_list[:index]
    train_data_list, train_class_list = zip(*train_list)
    test_data_list, test_class_list = zip(*test_list)
    all_words_dict = {}
    for word_list in train_data_list:
        for word in word_list:
            if word in all_words_dict.keys():
                all_words_dict[word] += 1
            else:
                all_words_dict[word] = 1
    all_words_tuple_list = sorted(all_words_dict.items(), key=lambda f: f[1], reverse=True)
    all_words_list, all_words_nums = zip(*all_words_tuple_list)
    all_words_list = list(all_words_list)
    return all_words_list, train_data_list, test_data_list, train_class_list, test_class_list

"""
Function: read a file's contents into a deduplicated set (builds the stopword set)
Parameters:
    words_file - path to the file
Returns:
    words_set - set of the words read from the file
"""
def MakeWordsSet(words_file):
    words_set = set()                                        # create the set
    with open(words_file, 'r', encoding='utf-8') as f:       # open the file read-only
        for line in f.readlines():                           # read line by line
            word = line.strip()                              # strip whitespace
            if len(word) > 0:                                # only non-empty lines are added to words_set
                words_set.add(word)
    return words_set                                         # return the result

"""
Function: text feature selection
Parameters:
    all_words_list - all training-set words, sorted by frequency
    deleteN - number of top-frequency words to drop
    stopwords_set - the stopword set
Returns:
    feature_words - the feature set
"""
def words_dict(all_words_list, deleteN, stopwords_set=set()):
    feature_words = []                                       # feature list
    n = 1
    for t in range(deleteN, len(all_words_list), 1):         # words before index deleteN are effectively dropped; scan the rest one by one
        if n > 1000:                                         # cap feature_words at 1000 dimensions; take at most 1000 words
            break
        # a word qualifies as a feature if it is not a digit, not a stopword, and its length is greater than 1 and less than 5
        if not all_words_list[t].isdigit() and all_words_list[t] not in stopwords_set and 1 < len(all_words_list[t]) < 5:
            feature_words.append(all_words_list[t])
        n += 1
    return feature_words

if __name__ == '__main__':
    folder_path = './SogouC/Sample'
    all_words_list, train_data_list, test_data_list, train_class_list, test_class_list = TextProcessing(folder_path, test_size=0.2)
    stopwords_file = './stopwords_cn.txt'
    stopwords_set = MakeWordsSet(stopwords_file)
    feature_words = words_dict(all_words_list, 100, stopwords_set)
    print(feature_words)
This feature_words is the feature set we finally select for news classification. Based on feature_words we can vectorize each text and then train the naive Bayes classifier.
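As a quick illustration of the vectorization used below (a toy sketch; the feature list and document are invented, not taken from the corpus), each document becomes a 0/1 vector marking which feature words it contains:

feature_words = ['經濟', '比賽', '電影', '大學']    # hypothetical feature set
document = ['昨天', '的', '比賽', '非常', '精彩']   # hypothetical segmented document

text_words = set(document)
features = [1 if word in text_words else 0 for word in feature_words]
print(features)  # [0, 1, 0, 0]: only '比賽' occurs in the document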
3 Building a Naive Bayes Classifier with Sklearn
MultinomialNB from sklearn.naive_bayes
News classification is a multiclass problem, and we can use MultinomialNB() to solve it.
MultinomialNB assumes the features follow a multinomial distribution, with the smoothed conditional probabilities
P(Xj = xjl | Y = Ck) = (mkjl + λ) / (mk + nλ)
where P(Xj = xjl | Y = Ck) is the conditional probability that the j-th feature takes its l-th value xjl in the k-th class, mkjl is the number of class-k training samples in which it does, mk is the number of training samples of class k, and n is the number of values the feature can take. λ is a constant greater than 0, usually set to 1, which gives Laplace smoothing; other values are also possible.
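A quick worked example of the smoothing (the counts are invented): take λ = 1 and a feature with n = 2 possible values; if class Ck has mk = 40 training samples and xjl appears in mkjl = 12 of them, then P(Xj = xjl | Y = Ck) = (12 + 1) / (40 + 2) ≈ 0.31, while a value never seen in class Ck still receives 1/42 ≈ 0.024 rather than 0.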
The parameters are as follows (a toy usage sketch follows the list):
- alpha: optional float, default 1.0. This is the λ in the formula above, i.e. Laplace smoothing; setting it to 0 disables smoothing.
- fit_prior: optional boolean, default True. It controls whether class prior probabilities are learned. If False, every class gets the same prior. Otherwise you can supply the priors yourself through the third parameter class_prior, or omit class_prior and let MultinomialNB estimate the priors from the training data, in which case P(Y = Ck) = mk / m, where m is the total number of training samples and mk is the number of training samples of class k.
- class_prior: optional parameter, default None. The prior probabilities of the classes.
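Before the full program, a minimal usage sketch of these parameters (the tiny 0/1 matrix and labels are invented for illustration):

from sklearn.naive_bayes import MultinomialNB

# Four tiny 0/1 documents over a three-word vocabulary (hypothetical data).
X = [[1, 0, 1],
     [1, 1, 0],
     [0, 1, 1],
     [0, 0, 1]]
y = ['sports', 'sports', 'finance', 'finance']

clf = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)  # alpha is the λ above
clf.fit(X, y)
print(clf.predict([[1, 0, 0]]))  # predicted class for a new document
print(clf.score(X, y))           # mean accuracy on the given data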
from sklearn.naive_bayes import MultinomialNB
import matplotlib.pyplot as plt
import os
import random
import jieba

def TextProcessing(folder_path, test_size=0.2):
    folder_list = os.listdir(folder_path)
    data_list = []
    class_list = []
    for folder in folder_list:
        new_folder_path = os.path.join(folder_path, folder)
        files = os.listdir(new_folder_path)
        j = 1
        for file in files:
            if j > 100:
                break
            with open(os.path.join(new_folder_path, file), 'r', encoding='utf-8') as f:
                raw = f.read()
                word_cut = jieba.cut(raw, cut_all=False)
                word_list = list(word_cut)
                data_list.append(word_list)
                class_list.append(folder)
                j += 1
    data_class_list = list(zip(data_list, class_list))
    random.shuffle(data_class_list)
    index = int(len(data_class_list) * test_size) + 1
    train_list = data_class_list[index:]
    test_list = data_class_list[:index]
    train_data_list, train_class_list = zip(*train_list)
    test_data_list, test_class_list = zip(*test_list)
    all_words_dict = {}
    for word_list in train_data_list:
        for word in word_list:
            if word in all_words_dict.keys():
                all_words_dict[word] += 1
            else:
                all_words_dict[word] = 1
    all_words_tuple_list = sorted(all_words_dict.items(), key=lambda f: f[1], reverse=True)
    all_words_list, all_words_nums = zip(*all_words_tuple_list)
    all_words_list = list(all_words_list)
    return all_words_list, train_data_list, test_data_list, train_class_list, test_class_list

def MakeWordsSet(words_file):
    words_set = set()
    with open(words_file, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            word = line.strip()
            if len(word) > 0:
                words_set.add(word)
    return words_set
"""
函數說明:根據feature_words將文本向量化
Parameters:
train_data_list - 訓練集
test_data_list - 測試集
feature_words - 特徵集
Returns:
train_feature_list - 訓練集向量化列表
test_feature_list - 測試集向量化列表
"""
def TextFeatures(train_data_list, test_data_list, feature_words):
def text_features(text, feature_words): #出現在特徵集中,則置1
text_words = set(text)
features = [1 if word in text_words else 0 for word in feature_words]
return features
train_feature_list = [text_features(text, feature_words) for text in train_data_list]
test_feature_list = [text_features(text, feature_words) for text in test_data_list]
return train_feature_list, test_feature_list
def words_dict(all_words_list, deleteN, stopwords_set = set()):
feature_words = []
n = 1
for t in range(deleteN, len(all_words_list), 1):
if n > 1000:
break
if not all_words_list[t].isdigit() and all_words_list[t] not in stopwords_set and 1 < len(all_words_list[t]) < 5:
feature_words.append(all_words_list[t])
n += 1
return feature_words
"""
函數說明:新聞分類器
Parameters:
train_feature_list - 訓練集向量化的特徵文本
test_feature_list - 測試集向量化的特徵文本
train_class_list - 訓練集分類標籤
test_class_list - 測試集分類標籤
Returns:
test_accuracy - 分類器精度
"""
def TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list):
classifier = MultinomialNB().fit(train_feature_list, train_class_list)
test_accuracy = classifier.score(test_feature_list, test_class_list)
return test_accuracy
if __name__ == '__main__':
folder_path = './SogouC/Sample'
all_words_list, train_data_list, test_data_list, train_class_list, test_class_list = TextProcessing(folder_path, test_size=0.2)
stopwords_file = './stopwords_cn.txt'
stopwords_set = MakeWordsSet(stopwords_file)
test_accuracy_list = []
deleteNs = range(0, 1000, 20) #因爲要去掉一些高頻詞,但是具體多少堯通過測試,可以通過這個函數測試,然後可視化表示出來
for deleteN in deleteNs:
feature_words = words_dict(all_words_list, deleteN, stopwords_set)
train_feature_list, test_feature_list = TextFeatures(train_data_list, test_data_list, feature_words)
test_accuracy = TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list)
test_accuracy_list.append(test_accuracy)
plt.figure()
plt.plot(deleteNs, test_accuracy_list)
plt.title('Relationship of deleteNs and test_accuracy')
plt.xlabel('deleteNs')
plt.ylabel('test_accuracy')
plt.show()
This plots the relationship between deleteNs and test_accuracy, from which we can roughly decide how many of the top high-frequency words to drop. Because the data are shuffled, the plot may differ from run to run; by testing several times we can settle on a value for deleteN, fix that parameter, and so complete the naive Bayes classifier for news classification.
if __name__ == '__main__':
    folder_path = './SogouC/Sample'
    all_words_list, train_data_list, test_data_list, train_class_list, test_class_list = TextProcessing(folder_path, test_size=0.2)
    stopwords_file = './stopwords_cn.txt'
    stopwords_set = MakeWordsSet(stopwords_file)
    test_accuracy_list = []
    feature_words = words_dict(all_words_list, 450, stopwords_set)
    train_feature_list, test_feature_list = TextFeatures(train_data_list, test_data_list, feature_words)
    test_accuracy = TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list)
    test_accuracy_list.append(test_accuracy)
    ave = lambda c: sum(c) / len(c)
    print(ave(test_accuracy_list))