# 機器學習實戰：樸素貝葉斯篇

import numpy as np
from math import *
import random

def loadDataSet():
    """Return the toy message-board corpus and its class labels.

    Returns:
        A pair (postingList, classVec): postingList is a list of six
        tokenized posts (lists of words) and classVec is the parallel list
        of labels, where 1 marks an abusive post and 0 a normal one.
    """
    sentences = (
        'my dog has flea problems help please',
        'maybe not take him to dog park stupid',
        'my dalmation is so cute I love him',
        'stop posting stupid worthless garbage',
        'mr licks ate my steak how to stop him',
        'quit buying worthless dog food stupid',
    )
    labels = [0, 1, 0, 1, 0, 1]  # 1 = abusive post, 0 = normal post
    tokenizedPosts = [sentence.split() for sentence in sentences]
    return tokenizedPosts, labels

#構建詞彙表生成函數createVocabList
def createVocabList(dataSet):
    """Collect every distinct word that occurs in any document.

    Args:
        dataSet: iterable of documents, each an iterable of words.

    Returns:
        A list holding each distinct word once. The order is whatever the
        underlying set yields, so it is not guaranteed to be stable.
    """
    uniqueWords = set()
    for doc in dataSet:
        # Growing the set in place is the running union of all documents.
        uniqueWords.update(doc)
    return list(uniqueWords)

#對輸入的詞彙表構建詞向量
#詞集模型
def setOfWords2Vec(vocabList, inputSet):
    """Build a set-of-words (presence/absence) vector for one document.

    Args:
        vocabList: list of vocabulary words; slot i of the result
            corresponds to vocabList[i].
        inputSet: iterable of words taken from the document.

    Returns:
        A numpy float array of length len(vocabList) holding 1 where the
        vocabulary word occurs in inputSet and 0 elsewhere.

    Words that are not in the vocabulary are reported on stdout and skipped.
    """
    # Map each word to its FIRST position once, so every lookup is O(1)
    # instead of a linear `in` scan plus a linear `.index()` scan per word.
    wordIndex = {}
    for position, vocabWord in enumerate(vocabList):
        wordIndex.setdefault(vocabWord, position)

    returnVec = np.zeros(len(vocabList))
    for word in inputSet:
        position = wordIndex.get(word)
        if position is None:
            print('the word:%s is not in my Vocabulary!'% word)
        else:
            returnVec[position] = 1  # word is present: set its slot to 1
    return returnVec

#這種構建詞向量的方法只記錄了每個詞是否出現，而沒有記錄詞出現的次數，這樣的模型
#叫做詞集模型。如果在詞向量中記錄詞出現的次數，每出現一次就多記錄一次，這樣的詞向
#量構建方法被稱爲詞袋模型。下面構建一個詞袋模型的詞向量生成函數bagOfWords2VecMN：
#詞袋模型
def bagOfWords2VecMN(vocabList, inputSet):
    """Build a bag-of-words (occurrence count) vector for one document.

    Args:
        vocabList: list of vocabulary words; slot i of the result
            corresponds to vocabList[i].
        inputSet: iterable of words taken from the document.

    Returns:
        A list of non-negative ints of length len(vocabList); each entry is
        the number of times that vocabulary word occurs in inputSet. Words
        that are not in the vocabulary are silently ignored.
    """
    # Map each word to its FIRST position once, so every lookup is O(1)
    # instead of a linear `in` scan plus a linear `.index()` scan per word.
    wordIndex = {}
    for position, vocabWord in enumerate(vocabList):
        wordIndex.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = wordIndex.get(word)
        if position is not None:
            returnVec[position] += 1
    return returnVec

def trainNB0(trainMatrix, trainCategory):
    """Estimate unsmoothed naive Bayes word probabilities for two classes.

    Args:
        trainMatrix: sequence of word vectors, one per document, all of the
            same length (the vocabulary size).
        trainCategory: sequence of labels parallel to trainMatrix; 0 is the
            normal class and any other value is counted as the abusive class.

    Returns:
        A triple (p0Vec, p1Vec, pAbusive): p0Vec and p1Vec are numpy arrays
        giving each vocabulary word's share of all word occurrences in the
        normal and abusive class respectively, and pAbusive is
        sum(trainCategory) / len(trainCategory).

    No smoothing is applied, so a word never seen in a class gets
    probability 0 for that class.
    """
    docCount = len(trainMatrix)
    vocabSize = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / len(trainCategory)

    # Slot 0 accumulates the normal class, slot 1 the abusive class.
    wordCounts = [np.zeros(vocabSize), np.zeros(vocabSize)]
    classTotals = [0, 0]
    for docIdx in range(docCount):
        docVec = trainMatrix[docIdx]
        slot = 0 if trainCategory[docIdx] == 0 else 1
        wordCounts[slot] += docVec          # per-word counts for the class
        classTotals[slot] += sum(docVec)    # total word occurrences in the class

    normalProbs = wordCounts[0] / classTotals[0]
    abusiveProbs = wordCounts[1] / classTotals[1]
    return normalProbs, abusiveProbs, pAbusive

##算法漏洞：
##1.乘積爲0
##當某分類下某詞項出現頻次爲0時，其概率也是0，因此在計算p(w0|ci)p(w1|ci)p(w2|ci)......p(wN|ci)時，
##只要其中一個概率爲0，整個乘積就是0。爲了避免這樣的情況發生，我們將所有詞項出現的頻次都初始化爲1，
##並將每一類的詞項總數（分母）初始化爲2。
##2.因子太小導致結果下溢問題
##由於p(w0|ci)p(w1|ci)p(w2|ci)......p(wN|ci)中每個因子都很小，所有因子相乘，特別是因子數量多的時候，
##會導致結果下溢（被舍入爲0），從而得到錯誤的結果。爲了避免下溢問題的發生，可以使用求自然對數的方法：
##自然對數是單調遞增函數，與原本的數值同增同減，因此不會影響概率大小的比較結果。

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify one word vector with a trained two-class naive Bayes model.

    Args:
        vec2Classify: word vector of the document to classify.
        p0Vec: per-word log-probabilities for class 0.
        p1Vec: per-word log-probabilities for class 1.
        pClass1: prior probability of class 1, in [0, 1].

    Returns:
        1 if the class-1 log-score is strictly greater than the class-0
        log-score, otherwise 0.

    Raises:
        ValueError: if pClass1 is below 0 or above 1.
    """
    if pClass1 < 0 or pClass1 > 1:
        raise ValueError('pClass1 must lie in [0, 1], got %r' % (pClass1,))

    # A prior of exactly 0 has log-probability -inf. math.log would raise on
    # it, which made a model trained on a single class unusable.
    negInf = float('-inf')
    logPrior1 = log(pClass1) if pClass1 > 0 else negInf
    logPrior0 = log(1 - pClass1) if pClass1 < 1 else negInf

    # ln(a*b) = ln(a) + ln(b): the product of probabilities becomes a sum of
    # log-probabilities, each weighted by how often the word occurs.
    p1 = sum(vec2Classify * p1Vec) + logPrior1
    p0 = sum(vec2Classify * p0Vec) + logPrior0
    return 1 if p1 > p0 else 0

def trainNB1(trainMatrix, trainCategory):
    """Train two-class naive Bayes with smoothing, returning log-probabilities.

    Args:
        trainMatrix: sequence of word vectors, one per document, all of the
            same length (the vocabulary size).
        trainCategory: sequence of labels parallel to trainMatrix; 0 is the
            normal class and any other value is counted as the abusive class.

    Returns:
        A triple (p0Vec, p1Vec, pAbusive): p0Vec and p1Vec are numpy arrays
        of natural-log word probabilities for the normal and abusive class,
        and pAbusive is sum(trainCategory) / len(trainCategory).
    """
    numTrainDocs = len(trainMatrix)
    numWord = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / len(trainCategory)

    # Start every word count at 1 and every class total at 2, so that a word
    # unseen in a class never yields probability 0 (and hence log(0)).
    p0Num = np.ones(numWord)
    p1Num = np.ones(numWord)
    p0Demon = 2
    p1Demon = 2
    for i in range(numTrainDocs):
        if trainCategory[i] == 0:
            p0Num += trainMatrix[i]
            p0Demon += sum(trainMatrix[i])
        else:
            p1Num += trainMatrix[i]
            p1Demon += sum(trainMatrix[i])

    # np.log works element-wise; math.log only accepts scalars and raised
    # TypeError on these arrays.
    p0Vec = np.log(p0Num / p0Demon)
    p1Vec = np.log(p1Num / p1Demon)
    return p0Vec, p1Vec, pAbusive

#獲取頻率最高的詞項
def calcMostFreq(vocabList, fullText, topN=30):
    """Return the most frequent vocabulary words with their counts.

    Args:
        vocabList: list of vocabulary words to rank.
        fullText: list of all word tokens in the corpus (with repeats).
        topN: maximum number of (word, count) pairs to return; defaults to 30.

    Returns:
        A list of at most topN (word, count) tuples sorted by count in
        descending order. Words with equal counts keep their vocabList order.
    """
    from collections import Counter
    from operator import itemgetter

    # Count the corpus once instead of rescanning it for every vocabulary word.
    occurrences = Counter(fullText)
    freqDict = {word: occurrences[word] for word in vocabList}
    # itemgetter(1) selects the count from each (word, count) pair.
    sortedFreq = sorted(freqDict.items(), key=itemgetter(1), reverse=True)
    return sortedFreq[:topN]

def localWords(feed1, feed0):
    """Train and evaluate a naive Bayes classifier on two RSS feeds.

    Entries from feed1 are labelled 1 and entries from feed0 are labelled 0.
    The most frequent words are dropped from the vocabulary, up to 20
    documents are held out at random as a test set, the classifier is
    trained on the rest, and the hold-out error rate is printed.

    Args:
        feed1: mapping with an 'entries' list whose items have a 'summary'
            text (presumably a feedparser result -- confirm with the caller).
        feed0: same structure as feed1.

    Returns:
        A triple (vocabList, p0V, p1V): the pruned vocabulary and the
        per-word log-probability vectors for class 0 and class 1.

    Raises:
        ValueError: if the feeds yield fewer than two documents in total, so
            no train/test split is possible.

    Relies on textParser, which is expected to be defined elsewhere in this
    module.
    """
    docList = []
    classList = []
    fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    print(minLen)
    for i in range(minLen):
        # Interleave the feeds: a class-1 document, then a class-0 document.
        for feed, label in ((feed1, 1), (feed0, 0)):
            wordList = textParser(feed['entries'][i]['summary'])
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(label)

    vocabList = createVocabList(docList)
    top30words = calcMostFreq(vocabList, fullText)  # the 30 most frequent words
    # Drop the most frequent words from the vocabulary.
    frequentWords = {pairW[0] for pairW in top30words}
    vocabList = [word for word in vocabList if word not in frequentWords]

    # list(range(...)), not list[range(...)]: the subscript form builds a
    # generic alias, which has no len() and cannot be indexed.
    trainSet = list(range(2 * minLen))
    testSet = []
    print(trainSet)

    # Hold out up to 20 documents, but always leave at least one for training.
    numTest = min(20, len(trainSet) - 1)
    if numTest < 1:
        raise ValueError('need at least 2 documents to build a train/test split')
    for _ in range(numTest):
        # randrange never returns len(trainSet); int(random.uniform(0, n))
        # could, because uniform() may include its upper end-point.
        randIndex = random.randrange(len(trainSet))
        testSet.append(trainSet[randIndex])
        del trainSet[randIndex]

    trainMat = []
    trainClasses = []
    for docIndex in trainSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB1(trainMat, trainClasses)

    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(wordVector, p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is:', float(errorCount) / len(testSet))
    return vocabList, p0V, p1V

#最具表徵性的詞彙表顯示函數
def getTopWords(ny, sf, t=-6.0):
    """Print the most characteristic words of each of the two feeds.

    Trains a classifier through localWords(ny, sf), then lists every
    vocabulary word whose log-probability exceeds the threshold, most
    probable first. Words from the class-0 vector are printed under the SF
    banner and words from the class-1 vector under the NY banner.

    Args:
        ny: feed passed to localWords as its first argument.
        sf: feed passed to localWords as its second argument.
        t: log-probability threshold; only words scoring above it are shown.
    """
    vocabList, p0V, p1V = localWords(ny, sf)

    # Keep (word, log-probability) pairs that clear the threshold t.
    positions = range(len(p0V))
    sfPairs = [(vocabList[k], p0V[k]) for k in positions if p0V[k] > t]
    nyPairs = [(vocabList[k], p1V[k]) for k in positions if p1V[k] > t]

    def scoreOf(pair):
        return pair[1]

    sfPairs.sort(key=scoreOf, reverse=True)
    print('SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF')
    for word, _score in sfPairs:
        print(word)

    nyPairs.sort(key=scoreOf, reverse=True)
    print('NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY')
    for word, _score in nyPairs:
        print(word)

# 機器學習實戰樸素貝葉斯篇

# Anaconda中torch模塊的安裝問題

# PAT甲級C語言.1004. 成績排名

# Pytorch入門與實踐——Tensor和autograd

# Pytorch入門與實踐——神經網絡工具箱

# Pytorch入門與實踐——常用的工具

# Mac下配置sublime實現LaTeX

# https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

# linux以太網驅動總結

# 機器學習實戰 樸素貝葉斯篇

# 機器學習實戰樸素貝葉斯篇