Reposted from: https://blog.csdn.net/GrinAndBearIt/article/details/79045143 (will delete if infringing)
I have been self-studying Li Hang's Statistical Learning Methods. The concepts in the Naive Bayes chapter are easy to understand, but when I tried to turn the textbook example into working code I got stuck and did not quite know where to start, mainly over how to organize the code so that it is adaptable and reusable.
I looked through quite a few blog posts online; most of them deal with sentiment-word classification, but one of them dissected exactly this example with very well-written code. I studied it carefully and then wrote my own version. Overall the script is much like the sentiment one and adapts well to other data: every feature value is merged into one big vocabulary of tags, and each sample is then turned into a 1/0 array, where 1 means the sample has that attribute and 0 means it does not. It is well worth learning from. The full code is below, after a quick recap of the formulas it computes.
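As a reminder (my addition, using the book's notation), the script estimates the Laplace-smoothed prior and conditional probabilities, where λ is the smoothing constant (the alpha argument in the code), N the number of training samples, K the number of classes, and S_j the number of distinct values the j-th feature can take:

P(Y=c_k) = \frac{\sum_{i=1}^{N} I(y_i=c_k) + \lambda}{N + K\lambda}

P(X^{(j)}=a_{jl} \mid Y=c_k) = \frac{\sum_{i=1}^{N} I(x_i^{(j)}=a_{jl},\, y_i=c_k) + \lambda}{\sum_{i=1}^{N} I(y_i=c_k) + S_j\lambda}

y = \arg\max_{c_k} P(Y=c_k) \prod_{j} P(X^{(j)}=x^{(j)} \mid Y=c_k)

Setting λ = 0 reduces these to the plain maximum-likelihood estimates, which is what main() below uses.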
# -*- coding:utf-8 -*-
import numpy as np


def loadDataSet():
    # the two rows are the two features x1 and x2 of the textbook example; transpose to get 15 samples
    dataSet = [[1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3],
               ['S', 'M', 'M', 'S', 'S', 'S', 'M', 'M', 'L', 'L', 'L', 'M', 'M', 'L', 'L']]
    labels = [-1, -1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, -1]
    # np.array promotes the mixed int/str values to strings, e.g. 1 -> '1'
    return np.array(dataSet).transpose().tolist(), labels
def calc_label(labels, alpha):  # count the classes and their (smoothed) proportions
    m = len(labels)
    uniqueLabel = set(labels)
    diffLabelNum = len(uniqueLabel)
    labelRate = {}
    for label in uniqueLabel:
        labelRate[label] = (labels.count(label) + alpha) / (m + diffLabelNum * alpha)
    return labelRate, list(uniqueLabel)
# labelRate = {1: 0.5882352941176471, -1: 0.4117647058823529}  (with alpha = 1; with alpha = 0 it is {1: 0.6, -1: 0.4})
# list(uniqueLabel) = [1, -1]
def calcVocabulary(dataset):  # collect every feature value into one vocabulary
    voca = set()
    for content in dataset:
        voca = voca | set(content)  # | is set union
    return list(voca)
# voca = ['L', '3', 'M', 'S', '2', '1']  (order may differ between runs)
def calcVector(voca, vector):  # turn one sample (vector) into a 0/1 count array over the vocabulary
    n = len(voca)
    originVector = np.zeros(n)
    for word in vector:
        if word in voca:
            originVector[voca.index(word)] += 1
    return np.array(originVector)
# originVector = array([1., 0., 0., 1., 0., 0.])
def calcUniqueValueNum(dataset, labels, label, voca):  # compute S_j, the term added to the denominator
    labelDataSet = []
    for i in range(len(labels)):
        if labels[i] == label:
            labelDataSet.append(dataset[i])
    m, n = np.shape(labelDataSet)
    uniqueValueDict = {}
    for i in range(n):
        uniqueValue = {content[i] for content in labelDataSet}
        for value in uniqueValue:
            uniqueValueDict[value] = len(uniqueValue)  # number of possible values of feature x_i (under this label)
    a = len(voca)
    returnArray = np.zeros(a)
    for key in uniqueValueDict:
        returnArray[voca.index(key)] = float(uniqueValueDict[key])
    return returnArray
# array([3., 3., 3., 3., 3., 3.])
def Bayes(dataset, labels, uniqueLabel, voca, alpha):  # estimate the conditional probabilities for each label
    n = len(uniqueLabel)
    m = len(dataset)
    trainVecDict = {}
    for i in range(n):
        labelVector = np.ones(len(voca)) * alpha  # alpha in the numerator
        for j in range(m):
            if labels[j] == uniqueLabel[i]:
                labelVector += calcVector(voca, dataset[j])  # numerator: count of each value under this label
        labelVector /= (labels.count(uniqueLabel[i])
                        + calcUniqueValueNum(dataset, labels, uniqueLabel[i], voca) * alpha)  # denominator: N_c + S_j * alpha
        trainVecDict[uniqueLabel[i]] = labelVector
    return trainVecDict
# {-1: array([0.16666667, 0...33333333]), 1: array([0.44444444, 0...33333333])}
def testFunction(testArray, voca, trainVecDict, labelRate):  # score every label for the test sample and return the most likely one
    result = -1
    maxRate = -np.inf  # negative infinity
    for key in trainVecDict:
        singleLabelRate = 1.0
        for word in testArray:
            singleLabelRate *= trainVecDict[key][voca.index(word)]
        if singleLabelRate * labelRate[key] > maxRate:
            result = key
            maxRate = singleLabelRate * labelRate[key]
    return result
def main():
    dataSet, labels = loadDataSet()
    labelRate, uniqueLabel = calc_label(labels, 0)  # alpha = 0: maximum-likelihood estimates
    voca = calcVocabulary(dataSet)
    trainVecDict = Bayes(dataSet, labels, uniqueLabel, voca, 0)
    testArray = np.array([2, 'S'])  # the test point x = (2, S); np.array turns both entries into strings
    print(testFunction(testArray, voca, trainVecDict, labelRate))


if __name__ == '__main__':
    main()
The output is -1, i.e. the test point x = (2, S) is classified as y = -1, which agrees with the answer in the book.
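Not part of the original post, but as a quick sanity check one can rerun the same functions with alpha = 1, i.e. with Laplace smoothing; a minimal sketch (the helper name main_laplace is mine):

def main_laplace():  # hypothetical variant of main() with Laplace smoothing (alpha = 1)
    dataSet, labels = loadDataSet()
    labelRate, uniqueLabel = calc_label(labels, 1)  # priors become 10/17 and 7/17
    voca = calcVocabulary(dataSet)
    trainVecDict = Bayes(dataSet, labels, uniqueLabel, voca, 1)
    testArray = np.array([2, 'S'])
    print(testFunction(testArray, voca, trainVecDict, labelRate))

On this data the smoothed estimates still favour y = -1 for x = (2, S) (roughly 0.061 versus 0.033 before normalization), so the printed result is again -1.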