第四章 樸素貝葉斯
PS:個人筆記 根據《機器學習實戰》這本書,Jack-Cui的博客,以及深度眸的視頻進行學習
1 兩個改進
①拉普拉斯平滑(Laplace Smoothing)又被稱爲加1平滑,是比較常用的平滑方法,它就是爲了解決0概率問題。
②下溢出:這是由於太多很小的數相乘造成的。爲了解決這個問題,對乘積結果取自然對數。通過求對數可以避免下溢出或者浮點數舍入導致的錯誤。同時,採用自然對數進行處理不會有任何損失。
def trainNB0(trainMatrix, trainCategory):
    """Train a naive Bayes classifier.

    Args:
        trainMatrix: sequence of document word-count/word-set vectors.
        trainCategory: per-document labels (1 = abusive/spam, 0 = normal).

    Returns:
        (p0Vect, p1Vect, pAbusive): log word-probability vectors for class 0
        and class 1, and the prior probability of class 1.
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    # Prior: fraction of training documents labelled 1.
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # Laplace smoothing: word counts start at 1 and denominators at 2
    # so no word ever gets a zero probability.
    p0Num, p1Num = np.ones(numWords), np.ones(numWords)
    p0Denom, p1Denom = 2.0, 2.0
    for docVec, label in zip(trainMatrix, trainCategory):
        if label == 1:
            p1Num += docVec
            p1Denom += sum(docVec)
        else:
            p0Num += docVec
            p0Denom += sum(docVec)
    # Log probabilities avoid underflow when many small factors multiply.
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive
2 過濾垃圾郵件
import re
"""
Parameters:
無
Returns:
無
"""
def textParse(bigString):
    """Parse raw text into a list of lowercase tokens longer than two chars.

    Splits on runs of non-word characters with r'\W+'. The previous pattern
    r'\W.*?' is equivalent to splitting on every single non-word character
    (the lazy .*? matches nothing), which yields spurious empty strings
    between consecutive delimiters; r'\W+' collapses each delimiter run.

    Args:
        bigString: the raw text to tokenize.

    Returns:
        List of lowercase tokens with length > 2 (drops short noise words).
    """
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
"""
Parameters:
dataSet - 整理的樣本數據集
Returns:
vocabSet - 返回不重複的詞條列表,也就是詞彙表
"""
def createVocabList(dataSet):
    """Build the deduplicated vocabulary list for a dataset.

    Args:
        dataSet: iterable of documents, each a list of words.

    Returns:
        List of the unique words seen across all documents.
    """
    vocab = set()  # set membership removes duplicates automatically
    for doc in dataSet:
        vocab.update(doc)
    return list(vocab)
if __name__ == '__main__':
    docList = []; classList = []
    # Files are named 1.txt .. 25.txt in each folder (25 spam + 25 ham).
    for i in range(1, 26):
        # gb18030 + errors='ignore' sidesteps malformed bytes in the corpus;
        # 'with' guarantees each file handle is closed (the original leaked them).
        with open('spam/%d.txt' % i, 'r', encoding='gb18030', errors='ignore') as fp:
            wordList = textParse(fp.read())
        docList.append(wordList)
        classList.append(1)   # label 1 = spam
        with open('ham/%d.txt' % i, 'r', encoding='gb18030', errors='ignore') as fp:
            wordList = textParse(fp.read())
        docList.append(wordList)
        classList.append(0)   # label 0 = ham (not spam)
    vocabList = createVocabList(docList)   # deduplicated vocabulary
    print(vocabList)
文本向量化,我們將數據集分爲訓練集和測試集,使用交叉驗證的方式測試樸素貝葉斯分類器的準確性。
import numpy as np
import random
import re
def createVocabList(dataSet):
    """Return the list of unique words occurring in any document of dataSet."""
    # Union of all per-document word sets; set() handles an empty dataset.
    return list(set().union(*[set(document) for document in dataSet]))
def setOfWords2Vec(vocabList, inputSet):
    """Convert a document into a binary set-of-words vector over vocabList.

    Args:
        vocabList: vocabulary (list of unique words).
        inputSet: the document's words.

    Returns:
        List of 0/1 flags, one per vocabulary word (1 = word present).
    """
    returnVec = [0] * len(vocabList)
    # Precompute word -> position to avoid repeated linear list.index scans.
    position = {word: idx for idx, word in enumerate(vocabList)}
    for word in inputSet:
        if word in position:
            returnVec[position[word]] = 1
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec
def bagOfWords2VecMN(vocabList, inputSet):
    """Convert a document into a bag-of-words count vector over vocabList.

    Unlike setOfWords2Vec, each slot counts how many times the word occurs
    (multinomial model); out-of-vocabulary words are silently skipped.
    """
    counts = [0] * len(vocabList)
    slot = {word: idx for idx, word in enumerate(vocabList)}
    for token in inputSet:
        idx = slot.get(token)
        if idx is not None:
            counts[idx] += 1
    return counts
def trainNB0(trainMatrix, trainCategory):
    """Train a naive Bayes model from vectorized documents.

    Args:
        trainMatrix: document vectors (one row per training document).
        trainCategory: labels, 1 for the positive class and 0 otherwise.

    Returns:
        (p0Vect, p1Vect, pAbusive): per-class log word probabilities and
        the class-1 prior.
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # Index 0 / 1 = per-class accumulators; ones/2.0 apply Laplace smoothing.
    numerators = [np.ones(numWords), np.ones(numWords)]
    denominators = [2.0, 2.0]
    for idx in range(numTrainDocs):
        cls = 1 if trainCategory[idx] == 1 else 0
        numerators[cls] += trainMatrix[idx]
        denominators[cls] += sum(trainMatrix[idx])
    # Logs prevent floating-point underflow in later products.
    p1Vect = np.log(numerators[1] / denominators[1])
    p0Vect = np.log(numerators[0] / denominators[0])
    return p0Vect, p1Vect, pAbusive
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify a document vector by comparing per-class log posteriors.

    Element-wise product selects the log probabilities of present words;
    summing them is the log of the product (log(A*B) = log A + log B),
    and adding log(prior) completes the unnormalized log posterior.

    Returns:
        1 if class 1 scores higher, else 0.
    """
    logPosterior1 = np.log(pClass1) + sum(vec2Classify * p1Vec)
    logPosterior0 = np.log(1.0 - pClass1) + sum(vec2Classify * p0Vec)
    return 1 if logPosterior1 > logPosterior0 else 0
def textParse(bigString):
    """Tokenize raw text into lowercase words longer than two characters.

    r'\W+' splits on runs of non-word characters. The previous r'\W.*?'
    degenerates to splitting on each single non-word character (lazy .*?
    matches the empty string), leaving empty tokens between adjacent
    delimiters; r'\W+' avoids that.
    """
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
def spamTest():
    """Cross-validate the naive Bayes spam classifier.

    Loads 25 spam and 25 ham emails, randomly holds out 10 of the 50
    documents as a test set, trains on the remaining 40, and prints the
    misclassified documents and the error rate.
    """
    docList = []; classList = []; fullText = []
    for i in range(1, 26):   # files are named 1.txt .. 25.txt
        # gb18030 + errors='ignore' tolerates malformed bytes in the corpus;
        # 'with' closes each file handle (the original leaked them).
        with open('spam/%d.txt' % i, 'r', encoding='gb18030', errors='ignore') as fp:
            wordList = textParse(fp.read())
        docList.append(wordList)
        fullText.append(wordList)
        classList.append(1)   # 1 = spam
        with open('ham/%d.txt' % i, 'r', encoding='gb18030', errors='ignore') as fp:
            wordList = textParse(fp.read())
        docList.append(wordList)
        fullText.append(wordList)
        classList.append(0)   # 0 = ham
    vocabList = createVocabList(docList)   # deduplicated vocabulary
    # Randomly move 10 of the 50 document indices into the test set.
    trainingSet = list(range(50)); testSet = []
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    # Vectorize the training documents and train the model.
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))
    # Evaluate on the held-out documents.
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print("分類錯誤的測試集:", docList[docIndex])
    print('錯誤率:%.2f%%' % (float(errorCount) / len(testSet) * 100))
if __name__ == '__main__':
    # Run the spam-filter cross-validation only when executed as a script.
    spamTest()