Machine Learning in Action — Personal Study Notes (8): Naive Bayes in Practice, Spam Email Classification

Chapter 4: Naive Bayes

PS: These are personal notes, based on the book Machine Learning in Action, Jack-Cui's blog, and the videos by 深度眸.

1 Two Improvements

Laplace smoothing, also called add-one smoothing, is a commonly used smoothing method. It solves the zero-probability problem: if a word never appears in the training documents of a class, its conditional probability would be 0 and would wipe out the entire product of probabilities.
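A minimal numeric sketch of the idea (the counts below are made up for illustration): with add-one smoothing, a word that never occurs in a class still gets a small non-zero probability, so it no longer zeroes out the whole product. The +1 numerator and +2 denominator mirror the initialization used in trainNB0 below.

import numpy as np

# hypothetical counts: a 3-word vocabulary and how often each word appears in one class
wordCounts = np.array([0, 2, 5])                  # the first word never appears in this class
total = wordCounts.sum()

pNoSmooth = wordCounts / float(total)             # roughly [0, 0.29, 0.71] -- the zero kills any product it enters
pLaplace = (wordCounts + 1) / float(total + 2)    # numerator +1, denominator +2, mirroring trainNB0 below
print(pNoSmooth)
print(pLaplace)                                   # every entry is now non-zero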

Underflow: this is caused by multiplying many very small numbers together. To solve it, we take the natural logarithm of the product. Taking logarithms avoids underflow and floating-point rounding errors, and because the logarithm is monotonically increasing, comparing log-probabilities gives the same classification result, so nothing is lost.
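A quick sketch of the problem (the numbers are made up): the straight product of many small conditional probabilities underflows to 0.0, while the sum of their logarithms stays perfectly representable.

import numpy as np

probs = np.full(500, 1e-3)               # 500 made-up small conditional probabilities
print(np.prod(probs))                    # 0.0 -- the straight product underflows
print(np.sum(np.log(probs)))             # roughly -3453.9 -- the log-sum stays representable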

import numpy as np

def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)                          # number of training documents
    numWords = len(trainMatrix[0])                           # number of words in the vocabulary
    pAbusive = sum(trainCategory)/float(numTrainDocs)        # prior probability of class 1
    p0Num = np.ones(numWords); p1Num = np.ones(numWords)     # ⭐ numerators initialized to 1, Laplace smoothing
    p0Denom = 2.0; p1Denom = 2.0                             # ⭐ denominators initialized to 2, Laplace smoothing
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:                            # accumulate word counts for class 1
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:                                                # accumulate word counts for class 0
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = np.log(p1Num/p1Denom)                           # ⭐ take the logarithm to prevent underflow
    p0Vect = np.log(p0Num/p0Denom)
    return p0Vect, p1Vect, pAbusive                          # log-probability vectors and the class-1 prior
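A toy call, just to show the shapes involved (the 2×3 matrix and labels below are made up): each row of trainMatrix is one document's word-count vector over a 3-word vocabulary, and trainNB0 returns two log-probability vectors plus the prior of class 1.

trainMat = np.array([[1, 0, 1],
                     [0, 2, 1]])                      # two hypothetical documents, 3-word vocabulary
labels = np.array([0, 1])                             # document 0 is class 0, document 1 is class 1
p0V, p1V, pAb = trainNB0(trainMat, labels)
print(p0V)                                            # log-probability of each word given class 0
print(p1V)                                            # log-probability of each word given class 1
print(pAb)                                            # prior of class 1, here 0.5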

2 Filtering Spam Email

import re

"""
Parameters:
    bigString - the raw email text as a single string
Returns:
    listOfTokens - a list of lowercase tokens longer than 2 characters
"""
def textParse(bigString):                                                   # parse the text into a list of word strings
    listOfTokens = re.split(r'\W+', bigString)                              # ⭐ split on runs of non-word characters; the book's r'\W*' can match an empty string and misbehaves in Python 3
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]            # keep only tokens longer than 2 characters, lowercased

"""
Parameters:
    dataSet - 整理的樣本數據集
Returns:
    vocabSet - 返回不重複的詞條列表,也就是詞彙表
"""
def createVocabList(dataSet):
    vocabSet = set([])                      #創建一個空的不重複列表,利用set()的不重複功能
    for document in dataSet:               
        vocabSet = vocabSet | set(document) #刪除重複的單詞,組成詞彙表
    return list(vocabSet)

if __name__ == '__main__':
    docList = []; classList = []
    for i in range(1, 26):                                                  # iterate over the 25 files in each folder
        wordList = textParse(open('spam/%d.txt' % i, 'r', encoding='gb18030', errors='ignore').read())  # ⭐ open a spam email with the function defined above; the encoding arguments work around a decoding problem
        docList.append(wordList)
        classList.append(1)                                                 # label spam email as 1
        wordList = textParse(open('ham/%d.txt' % i, 'r', encoding='gb18030', errors='ignore').read())   # open a ham email and turn the string into a list of tokens
        docList.append(wordList)
        classList.append(0)                                                 # label ham email as 0
    vocabList = createVocabList(docList)                                    # build the vocabulary, without duplicates
    print(vocabList)
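To see what textParse actually produces, here is a quick check on a made-up sentence: punctuation is split away, every token is lowercased, and anything of length 2 or less is dropped.

sample = 'This book is the BEST book on Python or M.L. I have ever laid eyes upon.'
print(textParse(sample))
# ['this', 'book', 'the', 'best', 'book', 'python', 'have', 'ever', 'laid', 'eyes', 'upon']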

Next we vectorize the text, split the data set into a training set and a test set, and use hold-out cross-validation (a randomly held-out test set) to measure the accuracy of the naive Bayes classifier.

import numpy as np
import random
import re

def createVocabList(dataSet):
    vocabSet = set([])                   
    for document in dataSet:               
        vocabSet = vocabSet | set(document) 
    return list(vocabSet)

def setOfWords2Vec(vocabList, inputSet):                              # set-of-words model: record presence/absence of each vocabulary word
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else: print("the word: %s is not in my Vocabulary!" % word)
    return returnVec

def bagOfWords2VecMN(vocabList, inputSet):                            # bag-of-words model: count how many times each vocabulary word occurs
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec

def trainNB0(trainMatrix,trainCategory):
    numTrainDocs = len(trainMatrix)                          
    numWords = len(trainMatrix[0])                         
    pAbusive = sum(trainCategory)/float(numTrainDocs)      
    p0Num = np.ones(numWords); p1Num = np.ones(numWords)    
    p0Denom = 2.0; p1Denom = 2.0                            
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:                           
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:                                              
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = np.log(p1Num/p1Denom)                        
    p0Vect = np.log(p0Num/p0Denom)         
    return p0Vect,p1Vect,pAbusive                           

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2Classify * p1Vec) + np.log(pClass1)        # element-wise multiply then sum; log(A*B) = logA + logB, so add log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

def textParse(bigString):                                              # parse the text into a list of word strings
    listOfTokens = re.split(r'\W+', bigString)                         # ⭐ same fix as above: split on runs of non-word characters
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]       # keep tokens longer than 2 characters, lowercased


def spamTest():                                                             # train and test on the email samples
    docList = []; classList = []; fullText = []
    for i in range(1, 26):                                                  # iterate over the 25 files in each folder
        wordList = textParse(open('spam/%d.txt' % i, 'r', encoding='gb18030', errors='ignore').read())   # ⭐ read each spam email and turn the string into a token list; the encoding arguments work around a decoding problem
        docList.append(wordList)
        fullText.append(wordList)
        classList.append(1)                                                 # label spam email as 1
        wordList = textParse(open('ham/%d.txt' % i, 'r', encoding='gb18030', errors='ignore').read())
        docList.append(wordList)
        fullText.append(wordList)
        classList.append(0)                                                 # label ham email as 0
    vocabList = createVocabList(docList)                                    # build the vocabulary, without duplicates
    trainingSet = list(range(50)); testSet = []                             # lists holding the index values of the training set and the test set
    for i in range(10):                                                     # out of the 50 emails, randomly pick 40 for training and 10 for testing
        randIndex = int(random.uniform(0, len(trainingSet)))                # pick a random index with the uniform random function
        testSet.append(trainingSet[randIndex])                              # add that index to the test set
        del(trainingSet[randIndex])                                         # and remove it from the training set list
    trainMat = []; trainClasses = []                                        # training matrix and training class label vector
    for docIndex in trainingSet:                                            # iterate over the training set
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))       # add the set-of-words vector of each document to the training matrix
        trainClasses.append(classList[docIndex])                            # add its class label to the label vector
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))  # train the naive Bayes model; convert to NumPy arrays for the matrix operations
    errorCount = 0                                                          # misclassification counter
    for docIndex in testSet:                                                # iterate over the test set
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])           # set-of-words vector of the test document
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:    # if the classification is wrong
            errorCount += 1                                                 # increase the error count by 1
            print("misclassified test document:", docList[docIndex])
    print('error rate: %.2f%%' % (float(errorCount) / len(testSet) * 100))


if __name__ == '__main__':
    spamTest()
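As a follow-up, here is a hedged sketch of how one might classify a single new email once training has been done on all 50 sample messages. It reuses only the functions defined above; 'newMail.txt' is a made-up file name standing in for the message you want to test.

# Hypothetical example: train on all 50 sample emails, then classify one new message.
docList = []; classList = []
for i in range(1, 26):
    docList.append(textParse(open('spam/%d.txt' % i, 'r', encoding='gb18030', errors='ignore').read()))
    classList.append(1)
    docList.append(textParse(open('ham/%d.txt' % i, 'r', encoding='gb18030', errors='ignore').read()))
    classList.append(0)
vocabList = createVocabList(docList)
trainMat = [setOfWords2Vec(vocabList, doc) for doc in docList]
p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(classList))

newWords = textParse(open('newMail.txt', 'r', encoding='gb18030', errors='ignore').read())  # 'newMail.txt' is a made-up file name
newVec = np.array(setOfWords2Vec(vocabList, newWords))
print('spam' if classifyNB(newVec, p0V, p1V, pSpam) == 1 else 'ham')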





