Machine Learning in Action: Naive Bayes Classification

Preparing the data: building word vectors from text

Sample data for the test functions that follow

def loadDataSet():
    '''Create some experimental samples'''
    postingList = [['my','dog','has','flea','problems','help','please'],
                  ['maybe','not','take','him','to','dog','park','stupid'],
                  ['my','dalmation','is','so','cute','I','love','him'],
                  ['stop','posting','stupid','worthless','garbage'],
                  ['mr','licks','ate','my','steak','how','to','stop','him'],
                  ['quit','buying','worthless','dog','food','stupid']]
    classVec = [0,1,0,1,0,1]  # 0 = normal post, 1 = abusive post
    return postingList,classVec

def createVocabList(dataSet):
    '''Return a list of every distinct token appearing across the documents'''
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document)   # union of the two sets
    return list(vocabSet)

Word-list-to-vector conversion function

def setOfWords2Vec(vocabList,inputSet):
    '''Take the vocabulary list and a document; return the document's 0/1 vector'''
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word: {} is not in my vocabulary".format(word))
    return returnVec

Testing the above

listOPosts,listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)
myVocabList
['so', 'buying', 'please', 'has', 'dalmation', 'my', 'cute', 'quit', 'love',
 'stupid', 'park', 'not', 'how', 'flea', 'problems', 'licks', 'food', 'stop',
 'help', 'him', 'ate', 'maybe', 'take', 'I', 'worthless', 'to', 'steak', 'mr',
 'is', 'garbage', 'posting', 'dog']
setOfWords2Vec(myVocabList,listOPosts[0])
[0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 1]

Training the algorithm: computing probabilities from word vectors

Training function for the naive Bayes classifier
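The quantities this trainer estimates come straight from Bayes' theorem combined with the "naive" assumption that tokens are conditionally independent given the class (a standard statement of the model, added here for reference):

$$p(c_i \mid \mathbf{w}) = \frac{p(\mathbf{w} \mid c_i)\,p(c_i)}{p(\mathbf{w})} \approx \frac{p(c_i)\prod_j p(w_j \mid c_i)}{p(\mathbf{w})}$$

Since $p(\mathbf{w})$ is shared by both classes, only the numerators need comparing, and in log space the product becomes the sum that classifyNB evaluates later:

$$\log p(c_i \mid \mathbf{w}) = \log p(c_i) + \sum_j \log p(w_j \mid c_i) - \log p(\mathbf{w})$$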

from numpy import *
def trainNB0(trainMatrix,trainCategory):
    '''Take the document matrix and the vector of per-document class labels.
    Return two vectors of token conditional probabilities P(w_i | c), i = 1, 2, ..., number of tokens,
    plus the prior probability of the abusive class.'''
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)   # prior of the abusive class (label 1); change this for non-binary problems
    #p0Num = zeros(numWords);p1Num = zeros(numWords)
    #p0Denom = 0.0;p1Denom = 0.0           # naive initialization (kept for comparison)
    p0Num = ones(numWords);p1Num = ones(numWords)
    p0Denom = 2.0;p1Denom = 2.0            # Laplace smoothing, so no conditional probability is ever zero
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    #p1Vect = p1Num/p1Denom
    #p0Vect = p0Num/p0Denom
    p1Vect = log(p1Num/p1Denom)        # natural log, so that products of tiny probabilities do not underflow
    p0Vect = log(p0Num/p0Denom)
    return p0Vect,p1Vect,pAbusive
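Why return logs? A minimal sketch (mine, not from the book) showing that a product of many small conditional probabilities underflows to zero in floating point, while the equivalent sum of logs stays finite and comparable:

from numpy import array, log, prod

probs = array([1e-5] * 80)   # pretend each of 80 tokens has P(w_i | c) = 1e-5
print(prod(probs))           # 0.0 -- the product underflows
print(sum(log(probs)))       # about -921.03 -- still usable for comparisons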

Testing the above

trainMat = []
for postinDoc in listOPosts:
    trainMat.append(setOfWords2Vec(myVocabList,postinDoc))
p0v,p1v,pAb = trainNB0(trainMat,listClasses)
pAb
0.5
p0v
array([-2.56494936, -3.25809654, -2.56494936, -2.56494936, -2.56494936,
       -1.87180218, -2.56494936, -3.25809654, -2.56494936, -3.25809654,
       -3.25809654, -3.25809654, -2.56494936, -2.56494936, -2.56494936,
       -2.56494936, -3.25809654, -2.56494936, -2.56494936, -2.15948425,
       -2.56494936, -3.25809654, -3.25809654, -2.56494936, -3.25809654,
       -2.56494936, -2.56494936, -2.56494936, -2.56494936, -3.25809654,
       -3.25809654, -2.56494936])
p1v
array([-3.04452244, -2.35137526, -3.04452244, -3.04452244, -3.04452244,
       -3.04452244, -3.04452244, -2.35137526, -3.04452244, -1.65822808,
       -2.35137526, -2.35137526, -3.04452244, -3.04452244, -3.04452244,
       -3.04452244, -2.35137526, -2.35137526, -3.04452244, -2.35137526,
       -3.04452244, -2.35137526, -2.35137526, -3.04452244, -1.94591015,
       -2.35137526, -3.04452244, -3.04452244, -3.04452244, -2.35137526,
       -2.35137526, -1.94591015])

Testing the algorithm:

Naive Bayes classification function

def classifyNB(vec2Classify,p0Vec,p1Vec,pClass1):
    '''Take the vector to classify plus the trained parameters; return the predicted class'''
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)         # element-wise product; sums replace products in log space, and the 0/1
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)   # entries of vec2Classify keep only the probabilities of tokens that appear
    if p1 > p0:
        return 1
    else:
        return 0

Testing

testEntry = ['love','my','dalmation']
thisDoc = array(setOfWords2Vec(myVocabList,testEntry))
print("{0} classified as: {1}".format(testEntry,classifyNB(thisDoc,p0v,p1v,pAb)))
testEntry = ['stupid','garbage']
thisDoc = array(setOfWords2Vec(myVocabList,testEntry))
print("{0} classified as: {1}".format(testEntry,classifyNB(thisDoc,p0v,p1v,pAb)))
['love', 'my', 'dalmation'] classified as: 0
['stupid', 'garbage'] classified as: 1

Using the bag-of-words model

The set-of-words model is what setOfWords2Vec() above implements; in the bag-of-words model each word can appear multiple times, so the vector records occurrence counts instead of 0/1 presence flags (a quick comparison follows the function).

def bagOfWords2VecMN(vocabList,inputSet):
    '''Take the vocabulary list and a document; return the document's term-count vector'''
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1   # count occurrences instead of setting a 0/1 flag
    return returnVec
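A quick comparison of the two encodings on a made-up document in which 'my' and 'dog' each appear twice:

doc = ['my','dog','ate','my','dog','food']   # hypothetical document, for illustration only
print(setOfWords2Vec(myVocabList, doc))      # 0/1 entries: presence only
print(bagOfWords2VecMN(myVocabList, doc))    # the 'my' and 'dog' entries are 2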

Example 1: filtering spam email with naive Bayes

Preparing the data: parsing files and building a token list from the text documents

def textParse(bigString):
    '''Parse a string into a list of lowercase tokens'''
    import re
    listOfTokens = re.split(r'\W+',bigString)   # split on runs of non-word characters
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]   # drop tokens shorter than 3 characters
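A quick check on a made-up string (the sample text is mine, not from the email corpus):

print(textParse('Hi there! Visit http://example.com -- it is 100% GREAT, ok?'))
# ['there', 'visit', 'http', 'example', 'com', '100', 'great']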

Automated processing of the spam email corpus

def spamTest():
    docList = [];classList = [];fullText = []
    for i in range(1,26):
        # load and parse each file
        wordList = textParse(open(r'E:\DataMining\Project\MLBook\機器學習實戰源代碼\machinelearninginaction\Ch04\email\spam\{}.txt'.
                                  format(i)).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open(r'E:\DataMining\Project\MLBook\機器學習實戰源代碼\machinelearninginaction\Ch04\email\ham\{}.txt'.
                                  format(i),encoding='gb18030',errors='ignore').read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)  # distinct tokens across all documents
    trainingSet = list(range(50));testSet = []
    for i in range(10):
        # hold out a randomly chosen test set
        randomIndex = int(random.uniform(0,len(trainingSet)))
        testSet.append(trainingSet[randomIndex])
        del(trainingSet[randomIndex])
    trainMat = [];trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList,docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0v,p1v,pSpam = trainNB0(array(trainMat),array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        # classify the held-out documents
        wordVector = setOfWords2Vec(vocabList,docList[docIndex])
        if classifyNB(array(wordVector),p0v,p1v,pSpam) != classList[docIndex]:
            errorCount += 1
    print("the error rate is {}".format(float(errorCount)/len(testSet)))
    return float(errorCount)/len(testSet)

Repeat 10 times and average the error rate

errorPercent = 0.0
for i in range(10):
    errorPercent += spamTest()
print("the average error persent is : {}%".format(errorPercent/10 * 100))
the error rate is 0.1
the error rate is 0.0
the error rate is 0.0
the error rate is 0.2
the error rate is 0.0
the error rate is 0.0
the error rate is 0.1
the error rate is 0.1
the error rate is 0.0
the error rate is 0.0
the average error percent is : 5.0%

Example 2: using a naive Bayes classifier to infer which region local news comes from

This example just exercises the classifier and looks at high-frequency words; a serious analysis would also apply stop-word removal, part-of-speech analysis, and so on (see the stop-word sketch after calcMostFreq below).

Built on feedparser, a library for reading RSS feeds

def calcMostFreq(vocabList,fullText):
    '''Count how often each vocabulary word occurs in the full text;
       return the 30 most frequent words'''
    import operator
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)
    sortedFreq = sorted(freqDict.items(),key=operator.itemgetter(1),reverse=True)
    return sortedFreq[:30]
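As noted above, a real analysis would remove standard stop words rather than just the thirty most frequent tokens. A minimal sketch of that alternative, with a tiny hand-picked stopWords set standing in for a real list such as nltk.corpus.stopwords.words('english'); both the set and the helper are mine, not from the book:

# Hypothetical stop-word filter; the stopWords set below is a stand-in for a real list.
stopWords = {'the','and','for','you','that','with','this','have','are','was'}

def removeStopWords(vocabList):
    '''Return the vocabulary with stop words filtered out (hypothetical helper)'''
    return [tok for tok in vocabList if tok not in stopWords]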

RSS feed classifier function

def localWords(feed1,feed0):
    '''Take two parsed RSS feeds'''
    import feedparser   # RSS feed reader library
    docList = [];classList = [];fullText = []
    minLen = min(len(feed1['entries']),len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    top30Words = calcMostFreq(vocabList,fullText)
    for pairW in top30Words:
        # remove the most frequent words
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])
    trainingSet = list(range(2*minLen));testSet = []
    #print("minLen is : {}".format(minLen))
    for i in range(20):
        randIndex = int(random.uniform(0,len(trainingSet)))
        #print("randIndex is : {}".format(randIndex))
        testSet.append(trainingSet[randIndex])
        del (trainingSet[randIndex])
    trainMat = [];trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList,docList[docIndex]))
        trainClasses.append(classList[docIndex])   # bag-of-words model
    p0v,p1v,pSpam = trainNB0(array(trainMat),array(trainClasses))    # train
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList,docList[docIndex])
        if classifyNB(array(wordVector),p0v,p1v,pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is : {}'.format(float(errorCount)/len(testSet)))
    return vocabList,p0v,p1v

Importing RSS feeds for testing

import feedparser
ny = feedparser.parse('https://newyork.craigslist.org/d/activity-partners/search/act?format=rss')
sf = feedparser.parse('https://sfbay.craigslist.org/d/activity-partners/search/act?format=rss')    # feed contents change over time
vocabList,pSF,pNY = localWords(ny,sf)
the error rate is : 0.35

Analyzing the data: the words most characteristic of each region's feed (at the time I tested)

def getTopWords(ny,sf):
    '''Print the words whose log conditional probability exceeds -5.0 for each feed'''
    vocabList,p0v,p1v = localWords(ny,sf)
    topNY = [];topSF = []
    for i in range(len(p0v)):
        if p0v[i] > -5.0:
            topSF.append((vocabList[i],p0v[i]))
        if p1v[i] > -5.0:
            topNY.append((vocabList[i],p1v[i]))
    sortedSF = sorted(topSF,key = lambda pair: pair[1],reverse=True)
    print("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**")
    for item in sortedSF:
        print(item[0])
    sortedNY = sorted(topNY,key = lambda pair: pair[1],reverse=True)
    print("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**")
    for item in sortedNY:
        print(item[0])
getTopWords(ny,sf)
the error rate is : 0.3
SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**
really
abou
join
years
maybe
whom
one
wood
games
working
hang
fitness
early
two
also
know
june
past
level
could
but
NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**
channel
need
lady
great
our
shorter
make
little
attire
call
attend
youtube
things
participate
area
has
hair
help
got
funds
butterfly
social
vide
extra
submit
shiny
outgoing
brooklyn
there
how
long
etc
new
afternoon
noon
conversation
watching
hurry
walks
29th
youtu
back
does
dinner
moments
seeking
paddy
around
people
number
restaurant
put
couple
singers
weekends
maybe
share
when
must
love
full
name
live
then
5twfhtidasa
videos
humor
crowded
friend
articulate
info
pastime
working
starter
black
sports
show
those
considered