下面使用樸素貝葉斯模型,對郵件進行分類,識別郵件是不是垃圾郵件。
數據下載地址:
鏈接:https://pan.baidu.com/s/1er-AjWm-inaWPf-r0qxnLA
提取碼:ohsc
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
# 預處理數據
def text_parse(big_string):
    """Tokenize raw email text: whitespace-split, keep tokens longer than
    two characters, and lowercase them."""
    tokens = []
    for raw_token in big_string.split():
        if len(raw_token) > 2:
            tokens.append(raw_token.lower())
    return tokens
# 去除列表中重複元素,並以列表形式返回
def create_vocab_list(data_set):
    """Collect the unique tokens across all documents and return them
    as a list (the vocabulary)."""
    unique_tokens = set()
    for document in data_set:
        unique_tokens.update(document)
    return list(unique_tokens)
# 統計每一文檔(或郵件)在單詞表中出現的次數,並以列表形式返回
def words_to_vec(vocab_list, input_set):
    """Return a bag-of-words count vector for one document.

    Args:
        vocab_list: list of vocabulary words; defines vector positions.
        input_set: iterable of tokens from a single document.

    Returns:
        A list of ints, same length as vocab_list, where entry i counts
        how often vocab_list[i] occurs in input_set. Tokens not in the
        vocabulary are ignored.
    """
    # Build the word -> position map once: list.index() inside the loop
    # would make this O(len(input_set) * len(vocab_list)).
    position = {word: i for i, word in enumerate(vocab_list)}
    return_vec = [0] * len(vocab_list)
    for word in input_set:
        idx = position.get(word)
        if idx is not None:
            return_vec[idx] += 1
    return return_vec
# 樸素貝葉斯主程序
# Naive Bayes main program: load 25 spam + 25 ham emails, vectorize,
# train a multinomial NB classifier, and report test accuracy.
doc_list, class_list, x = [], [], []
for i in range(1, 26):
    # Read the i-th spam email (label 1); use `with` so the file handle
    # is closed even if parsing raises.
    with open('email/spam/{0}.txt'.format(i), encoding='ISO-8859-1') as f:
        doc_list.append(text_parse(f.read()))
    class_list.append(1)
    # Read the i-th ham email (label 0).
    with open('email/ham/{0}.txt'.format(i), encoding='ISO-8859-1') as f:
        doc_list.append(text_parse(f.read()))
    class_list.append(0)
# Vectorize every document against the shared vocabulary.
vocab_list = create_vocab_list(doc_list)
for word_list in doc_list:
    x.append(words_to_vec(vocab_list, word_list))
# Split into training and test sets (75% / 25%).
x_train, x_test, y_train, y_test = train_test_split(x, class_list, test_size=0.25)
x_train, x_test, y_train, y_test = np.array(x_train), np.array(x_test),\
                                   np.array(y_train), np.array(y_test)
print("x_train: ")
print(x_train[:5])
print("\n")
print("y_train: ")
print(y_train[:5])
print("\n")
# Train the model.
nb_model = MultinomialNB()
nb_model.fit(x_train, y_train)
# Evaluate on the held-out test set.
y_pred = nb_model.predict(x_test)
# Report the results.
print("正確值:{0}".format(y_test))
print("預測值:{0}".format(y_pred))
print("準確率:%f%%" % (accuracy_score(y_test, y_pred)*100))
結果如下所示,識別正確率爲92.3%,效果還算可以哦。
x_train:
[[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]]
y_train:
[1 0 0 0 1]
正確值:[1 1 1 1 1 1 1 1 1 1 1 0 0]
預測值:[1 1 1 0 1 1 1 1 1 1 1 0 0]
準確率:92.307692%