實例:新聞分類器:
參考:這篇
- 首先將文件當中的文字取出,分別存到列表當中,並且返回存放字出現頻率從高到低排列的列表:
import os
import random
from collections import Counter

import jieba
from matplotlib import pyplot as plt
from sklearn.naive_bayes import MultinomialNB
def TextProcess(folder_path, test_size):
    """Walk the news corpus, segment every document, and build the vocabulary.

    Params:
        folder_path: directory whose sub-folders are the class labels; each
            sub-folder holds the raw text files of that class.
        test_size: fraction of documents reserved for the test split.
    Returns:
        all_word_list: vocabulary words sorted by frequency, high to low
        train_data, train_class: training documents (word lists) and labels
        test_data, test_class: test documents (word lists) and labels
    """
    data_list = []   # segmented documents
    class_list = []  # label (sub-folder name) for each document
    for label in os.listdir(folder_path):  # each sub-folder is one class
        new_folder = os.path.join(folder_path, label)
        # Cap at 100 files per class (same limit as the original j-counter).
        for file in os.listdir(new_folder)[:100]:
            with open(os.path.join(new_folder, file), 'r', encoding='utf-8') as f:
                raw = f.read()
            # Segment the document with jieba (precise mode).
            word_list = list(jieba.cut(raw, cut_all=False))
            data_list.append(word_list)
            class_list.append(label)

    all_data = list(zip(data_list, class_list))  # pair documents with labels
    random.shuffle(all_data)                     # randomize before splitting
    index = int(len(all_data) * test_size) + 1   # split point (>= 1 test doc)
    train_set = all_data[index:]
    test_set = all_data[:index]
    train_data, train_class = zip(*train_set)
    test_data, test_class = zip(*test_set)

    # Count word frequencies over the training documents only.
    all_word_dict = Counter()
    for word_list in train_data:
        all_word_dict.update(word_list)
    # Vocabulary sorted by frequency, descending.
    all_word_list = [word for word, _ in all_word_dict.most_common()]
    return all_word_list, train_data, train_class, test_data, test_class
if __name__ == '__main__':
    # Demo: build the frequency-sorted vocabulary from the sample corpus.
    # Bug fix: folder_path/test_size were previously undefined (NameError).
    folder_path = 'E:/Data/Sample'
    test_size = 0.2
    all_word_list, train_data, train_class, test_data, test_class = TextProcess(folder_path, test_size)
    print(all_word_list)
結果如下:
由上圖可以清楚地看到裏面有很多無關緊要的詞出現的頻率非常高,所以要寫個函數把這些詞去掉
- 找出特徵詞
def WordsMaker(file_name):
    """Read a stop-word file (one word per line) and return the words as a set.

    Blank lines (or whitespace-only lines) are discarded.
    """
    with open(file_name, 'r', encoding='utf-8') as f:
        stripped = (line.strip() for line in f)
        return {word for word in stripped if word}
def words_dict(all_word_list, deleteN, stop_set=None):
    """Select feature words from the frequency-sorted vocabulary.

    Params:
        all_word_list: words sorted by frequency, high to low
        deleteN: number of top (highest-frequency) words to skip
        stop_set: optional set of stop words to exclude
    Returns:
        list of at most 1000 feature words
    """
    # Bug fix: avoid a mutable default argument (was stop_set=set()).
    if stop_set is None:
        stop_set = set()
    features = []
    # Skip the deleteN most frequent words, then scan the remainder in order.
    for word in all_word_list[deleteN:]:
        if len(features) >= 1000:  # 1000 feature words are enough
            break
        # Keep non-numeric words of length 2..4 that are not stop words.
        if not word.isdigit() and word not in stop_set and 1 < len(word) < 5:
            features.append(word)
    return features
if __name__ == '__main__':  # Bug fix: was misspelled '__mian__', so this never ran
    # Build the stop-word set and extract feature words, dropping the
    # 100 most frequent vocabulary entries.
    file_name = 'E:/Data/stopwords_cn.txt'
    Word_set = WordsMaker(file_name)
    feature_list = words_dict(all_word_list, 100, Word_set)
    print(feature_list)
結果:
- 根據feature_list來向量化:
def TextFeatures(train_data_list, test_data_list, feature_words):
    """Turn each document into a 0/1 vector over the feature words.

    A position is 1 when the corresponding feature word occurs in the
    document, 0 otherwise.
    """
    def vectorize(document):
        present = set(document)  # set membership is O(1) per feature word
        return [int(word in present) for word in feature_words]

    train_feature_list = [vectorize(doc) for doc in train_data_list]
    test_feature_list = [vectorize(doc) for doc in test_data_list]
    return train_feature_list, test_feature_list
- 建立貝葉斯分類器:
def Classify(train_feature_list, test_feature_list, train_class_list, test_class_list):
    """Train a multinomial naive Bayes model and return its test-set accuracy."""
    model = MultinomialNB()
    model.fit(train_feature_list, train_class_list)
    return model.score(test_feature_list, test_class_list)
- 找出刪除高頻詞的最佳數量:
if __name__ == '__main__':
    # Sweep deleteN to see how many top-frequency words should be dropped.
    folder_path = 'E:/Data/Sample'
    all_word_list, train_data, train_class, test_data, test_class = TextProcess(folder_path, 0.2)
    stopset = WordsMaker('E:/Data/stopwords_cn.txt')
    deleteNs = range(0, 1000, 20)
    accuracies = []
    for deleteN in deleteNs:
        feature_list = words_dict(all_word_list, deleteN, stopset)
        train_features, test_features = TextFeatures(train_data, test_data, feature_list)
        accuracies.append(Classify(train_features, test_features, train_class, test_class))
    # Plot accuracy against deleteN to eyeball the best cut-off.
    plt.plot(deleteNs, accuracies)
    plt.show()
選450比較好
if __name__ == '__main__':
    # Final evaluation with the chosen number of dropped high-frequency words.
    folder_path = 'E:/Data/Sample'
    all_word_list, train_data, train_class, test_data, test_class = TextProcess(folder_path, 0.2)
    stopset = WordsMaker('E:/Data/stopwords_cn.txt')
    feature_list = words_dict(all_word_list, 400, stopset)
    train_features, test_features = TextFeatures(train_data, test_data, feature_list)
    scores = [Classify(train_features, test_features, train_class, test_class)]
    ave = lambda c: sum(c) / len(c)
    print(ave(scores))
結果: