# ----------------------------------------------------------------------
# Simple example: spam filtering with a naive Bayes classifier.
# An email text dataset is preprocessed (each document is converted
# into a word vector) and a naive Bayes classifier then decides
# whether a given mail is spam.
# The code implements a simple naive Bayes classifier and a
# text-to-vector converter. See the inline comments for details;
# the dataset (email/spam/*.txt, email/ham/*.txt) must be available
# locally for the spam-filtering part to run.
# ----------------------------------------------------------------------
import numpy as np
from functools import reduce  # kept: imported by the original file (unused in this chunk)


def loadDataSet():
    """Return a toy corpus: tokenized posts and their class labels.

    Returns:
        postingList: list of token lists (one list per document).
        classVec: parallel labels; 1 = abusive word content, 0 = normal.
    """
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec


def createVocabList(dataSet):
    """Return the vocabulary: a list of the unique words over all documents."""
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # union with this document's words
    return list(vocabSet)


def setOfWords2Vec(vocabList, inputSet):
    """Convert a document into a set-of-words (0/1 presence) vector.

    BUGFIX: the original used `+= 1`, which made this function behave
    identically to the bag-of-words model (bagOfWords2VecMN below).
    A set-of-words vector only records presence, so the entry is set to 1.
    """
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        # else: print("the word: %s is not in my Vocabulary!" % word)
    return returnVec


listOPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)
# print(myVocabList)
# print(setOfWords2Vec(myVocabList, listOPosts[0]))


def trainNB0(trainMatrix, trainCategory):
    """Train a naive Bayes classifier from document word vectors.

    Args:
        trainMatrix: document word vectors (one row per document).
        trainCategory: label per document (1 = abusive, 0 = normal).

    Returns:
        p0Vect: log P(word | class 0) for each vocabulary word.
        p1Vect: log P(word | class 1) for each vocabulary word.
        pAbusive: prior probability P(class 1).

    BUGFIX: the conditional-probability vectors are now returned as
    logarithms. classifyNB sums them and adds log priors, i.e. it
    expects log-probabilities; the original returned raw ratios, which
    both disagrees with classifyNB and underflows on long documents.
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])  # size of the deduplicated vocabulary
    pAbusive = sum(trainCategory) / float(numTrainDocs)  # fraction of abusive docs
    # Laplace smoothing: word counts start at 1 and denominators at 2 so
    # no conditional probability is exactly zero (which would zero out
    # the whole product / send the log to -inf).
    p0Num = np.ones(numWords)  # word counts for the normal class
    p1Num = np.ones(numWords)  # word counts for the abusive class
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        # Accumulate per-word counts and the total word count per class.
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive


trainMat = []
for postinDoc in listOPosts:
    # One presence vector (length = vocabulary size) per toy document.
    trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
# print(trainMat)
p0V, p1V, pAb = trainNB0(trainMat, listClasses)
# print(p0V); print(p1V); print(pAb); print(myVocabList)


def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify a word vector: return 1 (abusive/spam) or 0 (normal).

    Compares the two log-posteriors sum(log P(w|c)) + log P(c); p0Vec and
    p1Vec must be the log-probability vectors produced by trainNB0.
    """
    p1 = sum(vec2Classify * p1Vec) + np.log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0


def testingNB():
    """Train on the toy corpus and classify two sample documents."""
    listOposts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOposts)
    trainMat = []
    for postinDoc in listOposts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))


testingNB()


def bagOfWords2VecMN(vocabList, inputSet):
    """Bag-of-words model: count every occurrence instead of mere presence."""
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec
# Tokenization scratch test (kept from the original):
# mySent = 'This book is the best book on python or M.L. I have laid eyes upon.'
# import re
# regEx = re.compile('\\W*')
# listOfTokens = regEx.split(mySent)


def textParse(bigString):
    r"""Split raw email text into lowercase tokens longer than 2 characters.

    BUGFIX: the original pattern r'\W*' matches the empty string, so on
    Python >= 3.7 re.split() cuts between every character and every token
    is then filtered out by the length check; r'\W+' splits on runs of
    non-word characters as intended.
    """
    import re
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]


def _loadEmailCorpus():
    """Load email/spam/1..25.txt and email/ham/1..25.txt.

    Returns (docList, classList, mainEmail): parsed token lists, labels
    (1 = spam, 0 = ham) and the raw email bodies, with spam i preceding
    ham i, exactly as the original loaders ordered them.

    BUGFIXES vs. the three duplicated loops this replaces: the raw body
    of each ham email was (re-)read from the *spam* folder; files were
    opened twice per email and never closed (now a single `with` read).
    """
    docList = []
    classList = []
    mainEmail = []
    for i in range(1, 26):
        for folder, label in (('spam', 1), ('ham', 0)):
            # NOTE(review): the classic dataset contains a non-ASCII byte
            # (ham/23.txt); errors='ignore' keeps the read from crashing
            # without altering well-formed files — confirm acceptable.
            with open('email/%s/%d.txt' % (folder, i), errors='ignore') as fh:
                raw = fh.read()
            mainEmail.append(raw)
            docList.append(textParse(raw))
            classList.append(label)
    return docList, classList, mainEmail


def _splitTrainTest(nDocs=50, nTest=10):
    """Randomly hold out nTest of range(nDocs); return (trainingSet, testSet)."""
    trainingSet = list(range(nDocs))
    testSet = []
    for _ in range(nTest):
        # Pick a random index into the remaining candidates and move it
        # to the test set.
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    return trainingSet, testSet


def _trainOnSubset(vocabList, docList, classList, trainingSet):
    """Build word vectors for the training indices and run trainNB0."""
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    return trainNB0(np.array(trainMat), np.array(trainClasses))


def spamTest():
    """Hold-out cross validation: train on 40 random emails, test on 10.

    Prints each misclassified email (body, prediction, truth) and the
    overall error rate on the held-out set.
    """
    docList, classList, main_email = _loadEmailCorpus()
    vocabList = createVocabList(docList)
    trainingSet, testSet = _splitTrainTest()
    p0V, p1V, pSpam = _trainOnSubset(vocabList, docList, classList, trainingSet)
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print(main_email[docIndex])
            print(classifyNB(np.array(wordVector), p0V, p1V, pSpam))
            print(classList[docIndex])
    print('the error rate is :', float(errorCount) / len(testSet))


# spamTest()


def findthebest_Data_test():
    """One random train/test split; return (p0V, p1V, pSpam, error_rate)."""
    docList, classList, _ = _loadEmailCorpus()
    vocabList = createVocabList(docList)
    trainingSet, testSet = _splitTrainTest()
    p0V, p1V, pSpam = _trainOnSubset(vocabList, docList, classList, trainingSet)
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    error_rate = float(errorCount) / len(testSet)
    return p0V, p1V, pSpam, error_rate


def find_the_data():
    """Run 50 random splits and keep the model with the lowest error rate."""
    # NOTE(review): these ones(10) placeholders are only returned if every
    # run scores an error rate of exactly 1.0 (practically impossible);
    # kept as in the original for behavioral parity.
    p0Num = np.ones(10)
    p1Num = np.ones(10)
    PA = 0.0
    err = 1
    for _ in range(50):
        a, b, c, d = findthebest_Data_test()
        if d < err:
            err = d
            p0Num = a
            p1Num = b
            PA = c
    return p0Num, p1Num, PA


def final_test():
    """Evaluate the best-of-50 model on the full 50-email corpus and
    print its error rate."""
    p0, p1, pA = find_the_data()
    docList, classList, _ = _loadEmailCorpus()
    vocabList = createVocabList(docList)
    errorCount = 0
    for i in range(len(docList)):
        wordVector = setOfWords2Vec(vocabList, docList[i])
        if classifyNB(np.array(wordVector), p0, p1, pA) != classList[i]:
            errorCount += 1
    print('the error rate is :', float(errorCount) / len(docList))


final_test()