(3)機器學習實戰筆記:樸素貝葉斯

優點:數據比較少的時候仍然有效,可以處理多類別問題
缺點:對於輸入數據的準備方式比較敏感
適用的數據類型:標稱型數據
 
 
將一組單詞轉換爲一組數字
使用數字計算概率
 
著名應用:使用樸素貝葉斯過濾垃圾郵件
分類思路:
(1)收集數據:提供文本文件
(2)準備數據:將文本文件解析成詞條向量
(3)分析數據:檢查詞條確保解析的正確性
(4)訓練算法:使用我們之前建立的trainNB0()函數
(5)測試算法:使用classifyNB(),構建一個新的測試函數來計算文檔的錯誤率
(6)使用算法:構建一個完整的程序對一組文檔進行分類,將錯分的文檔輸出到屏幕上
 
切分文本:使用Python的str.split()方法切分
爲了更加精確地估計分類器錯誤率,需要進行多次迭代後求出平均錯誤率

 

——————————————————————————————-

簡單實例:通過樸素貝葉斯分類實現垃圾郵件分類

通過對一郵件文本數據集進行處理(轉化爲向量)

經過樸素貝葉斯分類器進行分類可以判定是否爲垃圾郵件

代碼實現了簡單的樸素貝葉斯分類器、文本向量轉換器

詳細備註見解釋,下載數據集點這裏

import numpy as np
from functools import reduce

# Prepare data: the toy corpus used to build word vectors.
def loadDataSet():
    """Return the toy training corpus.

    Returns:
        postingList: six tokenised forum posts (lists of word strings).
        classVec: parallel labels, 1 = abusive post, 0 = benign.
    """
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec

def createVocabList(dataSet):
    """Build a duplicate-free vocabulary list from tokenised documents.

    dataSet: list of documents, each a list of word strings.
    Returns a list of every distinct word (order follows set iteration
    order, i.e. unspecified but stable within one process).
    """
    vocabSet = set()  # idiom fix: set() instead of set([])
    for document in dataSet:
        # accumulate the union of all words seen so far
        vocabSet |= set(document)
    return list(vocabSet)

# Set-of-words model: mark which vocabulary words occur in a document.
def setOfWords2Vec(vocabList, inputSet):
    """Convert a document into a binary set-of-words vector.

    vocabList: vocabulary list with no duplicates.
    inputSet: list of word strings from one document.
    Returns a list the same length as vocabList where position i is 1 if
    vocabList[i] occurs in the document at all, else 0.

    Bug fix: the original body used `+= 1`, which is the bag-of-words
    model (duplicating bagOfWords2VecMN below); the set-of-words model
    this function is named for is binary.
    """
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        # words outside the vocabulary are silently ignored
    return returnVec

# Module-level demo: load the toy corpus and build its vocabulary.
listOPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)

# Training: estimate naive Bayes probabilities from word vectors.
def trainNB0(trainMatrix, trainCategory):
    """Train a naive Bayes classifier from a document-word matrix.

    trainMatrix: 2-D array-like, one row of per-word counts per document.
    trainCategory: label per document (1 = abusive/spam, 0 = normal).

    Returns (p0Vect, p1Vect, pAbusive):
        p0Vect, p1Vect -- LOG conditional word probabilities per class
        pAbusive       -- prior probability of class 1

    Bug fix: the probability vectors are now returned as np.log(...).
    classifyNB() SUMS these vectors and adds np.log(prior), which is only
    correct in log space; the original returned raw probabilities, mixing
    linear and log terms.  Log space also avoids float underflow when many
    small probabilities are combined.
    """
    trainMatrix = np.asarray(trainMatrix)      # accept lists or arrays
    trainCategory = np.asarray(trainCategory)
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])             # vocabulary size

    # Prior: fraction of training documents labelled 1.
    pAbusive = sum(trainCategory) / float(numTrainDocs)

    # Laplace smoothing: counts start at 1 and denominators at 2 so no
    # word ever gets probability 0 (which would zero the whole product).
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0

    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]            # per-word counts, class 1
            p1Denom += sum(trainMatrix[i])     # total words in class 1
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])

    # Log-probabilities: matches classifyNB's additive scoring.
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)

    return p0Vect, p1Vect, pAbusive

# Module-level demo: one set-of-words vector per toy post, then train.
trainMat = [setOfWords2Vec(myVocabList, postinDoc) for postinDoc in listOPosts]
p0V, p1V, pAb = trainNB0(trainMat, listClasses)

# Naive Bayes classifier: scores a word vector against the trained model.
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify one word vector with log naive-Bayes scores.

    vec2Classify: word-count vector of the document to classify.
    p0Vec, p1Vec: per-word LOG probabilities for class 0 / class 1
                  (as returned by trainNB0).
    pClass1: prior probability of class 1.
    Returns 1 when the class-1 log score wins, else 0.
    """
    logScore1 = np.log(pClass1) + sum(vec2Classify * p1Vec)
    logScore0 = np.log(1.0 - pClass1) + sum(vec2Classify * p0Vec)
    return 1 if logScore1 > logScore0 else 0
# Simple smoke test of the full pipeline on the toy corpus.
def testingNB():
    """Train on the toy posts and print predictions for two sample docs."""
    posts, labels = loadDataSet()
    vocab = createVocabList(posts)

    matrix = [setOfWords2Vec(vocab, post) for post in posts]
    p0V, p1V, pAb = trainNB0(np.array(matrix), np.array(labels))

    for entry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        vec = np.array(setOfWords2Vec(vocab, entry))
        print(entry, 'classified as:', classifyNB(vec, p0V, p1V, pAb))



testingNB()

# Bag-of-words model: counts every occurrence of each vocabulary word
# (the set-of-words model only records presence/absence).
def bagOfWords2VecMN(vocabList, inputSet):
    """Convert a document into a bag-of-words count vector.

    Returns a list the same length as vocabList where position i holds
    the number of times vocabList[i] occurs in the document.
    """
    counts = [0] * len(vocabList)
    for token in inputSet:
        if token in vocabList:
            counts[vocabList.index(token)] += 1
    return counts

#應用:進行垃圾郵件的過濾

#切分文本

#test!
# mySent = 'This book is the best book on python or M.L. I have laid eyes upon.'
#
# import re
# regEx = re.compile('\\W*')
# listOfTokens = regEx.split(mySent)

#測試:使用樸素貝葉斯進行交叉驗證
def textParse(bigString):
    """Tokenise raw e-mail text into lowercase words longer than 2 chars.

    Splits on runs of non-word characters and drops short tokens.

    Bug fix: the original pattern r'\W*' matches the empty string, so on
    Python 3.7+ re.split cuts the text between every pair of characters;
    the len(tok) > 2 filter then discarded every token.  r'\W+' (one or
    more non-word characters) is the intended separator.
    """
    import re
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]



#該函數對貝葉斯垃圾郵件分類進行自動化處理,導入spam與ham下的文本文件,併爲他們解析詞列表。(*1)
#分離器所需要的概率計算只利用訓練集中的文檔來完成
#python變量trainingSet是一個整數列表,數值範圍是0到49;(*2)



def spamTest():
    """Train and test the naive Bayes spam filter with a random hold-out.

    Loads 25 spam and 25 ham e-mails from email/spam and email/ham (*1),
    randomly holds out 10 of the 50 documents as a test set (*2), trains
    on the remaining 40, prints each misclassified mail, and prints the
    hold-out error rate.  Reading a file fails with the usual OSError if
    the data set is missing.
    """
    docList = []     # tokenised documents, spam/ham interleaved
    classList = []   # labels parallel to docList: 1 = spam, 0 = ham
    fullText = []    # all tokens concatenated (kept for parity; unused)
    mainEmail = []   # raw e-mail text, for printing misclassified mails

    for i in range(1, 26):
        # spam mail i -- read once, with the file properly closed
        with open('email/spam/%d.txt' % i) as fh:
            raw = fh.read()
        docList.append(textParse(raw))
        mainEmail.append(raw)
        fullText.extend(docList[-1])
        classList.append(1)

        # ham mail i.  Bug fix: the original re-read email/spam/%d.txt
        # here, so a misclassified ham printed the wrong message body.
        with open('email/ham/%d.txt' % i) as fh:
            raw = fh.read()
        docList.append(textParse(raw))
        mainEmail.append(raw)
        fullText.extend(docList[-1])
        classList.append(0)

    # Vocabulary over the whole corpus.
    vocabList = createVocabList(docList)

    # Randomly move 10 document indices from the training set to the
    # test set (hold-out cross validation).
    trainingSet = list(range(50))
    testSet = []
    for _ in range(10):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]

    # Build the training matrix from the remaining 40 documents only.
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])

    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))

    # Score the held-out documents and report misclassifications.
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        predicted = classifyNB(np.array(wordVector), p0V, p1V, pSpam)
        if predicted != classList[docIndex]:
            errorCount += 1
            print(mainEmail[docIndex])
            print(predicted)
            print(classList[docIndex])

    print('the error rate is :', float(errorCount) / len(testSet))

# spamTest()

#尋找最優參數


def findthebest_Data_test():
    """One random hold-out trial of the spam filter.

    Same procedure as spamTest(), but instead of printing results it
    returns the trained model with its hold-out error rate as
    (p0Vect, p1Vect, pSpam, error_rate), so callers (find_the_data)
    can keep the best of many random trials.
    """
    docList = []     # tokenised documents, spam/ham interleaved
    classList = []   # labels parallel to docList: 1 = spam, 0 = ham

    for i in range(1, 26):
        # spam mail i -- file properly closed via the context manager
        with open('email/spam/%d.txt' % i) as fh:
            docList.append(textParse(fh.read()))
        classList.append(1)

        # ham mail i.  Bug fix: the original re-opened email/spam here.
        with open('email/ham/%d.txt' % i) as fh:
            docList.append(textParse(fh.read()))
        classList.append(0)

    vocabList = createVocabList(docList)

    # Randomly hold out 10 of the 50 documents for testing.
    trainingSet = list(range(50))
    testSet = []
    for _ in range(10):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]

    # Train on the remaining 40 documents.
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])

    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))

    # Measure the error rate on the held-out documents.
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1

    error_rate = float(errorCount) / len(testSet)
    return p0V, p1V, pSpam, error_rate

def find_the_data():
    """Run the random hold-out trial 50 times and keep the best model.

    Returns (p0Vect, p1Vect, pSpam) of the run with the lowest observed
    hold-out error rate.
    """
    bestP0 = np.ones(10)   # placeholders, overwritten by the first run
    bestP1 = np.ones(10)   # with error rate below the initial bound
    bestPrior = 0.0
    bestErr = 1
    for _ in range(50):
        p0, p1, prior, err = findthebest_Data_test()
        if err < bestErr:
            bestErr, bestP0, bestP1, bestPrior = err, p0, p1, prior
    return bestP0, bestP1, bestPrior



def final_test():
    """Evaluate the model selected by find_the_data() on all 50 e-mails.

    NOTE(review): the evaluation set includes the documents the selected
    model was trained on, so the printed error rate is optimistically
    biased; kept as the original author designed it.
    """
    p0, p1, pA = find_the_data()

    docList = []     # tokenised documents, spam/ham interleaved
    classList = []   # labels parallel to docList: 1 = spam, 0 = ham

    for i in range(1, 26):
        # spam mail i -- file properly closed via the context manager
        with open('email/spam/%d.txt' % i) as fh:
            docList.append(textParse(fh.read()))
        classList.append(1)

        # ham mail i.  Bug fix: the original re-opened email/spam here.
        with open('email/ham/%d.txt' % i) as fh:
            docList.append(textParse(fh.read()))
        classList.append(0)

    vocabList = createVocabList(docList)

    # Classify every document and count disagreements with its label.
    errorCount = 0
    for i in range(len(docList)):
        wordVector = setOfWords2Vec(vocabList, docList[i])
        if classifyNB(np.array(wordVector), p0, p1, pA) != classList[i]:
            errorCount += 1

    print('the error rate is :', float(errorCount) / len(docList))



final_test()

 

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章