我自己對樸素貝葉斯算法的理解就是:根據訓練集,
1.對連續型隨機變量算出每個特徵的正態分佈函數,從而當需要進行預測的樣本過來的時候,根據數值直接計算可能性。
2.若爲離散型,則直接計算相應的概率(西瓜書上寫的很詳細)
理論知識其實很好懂,主要是編程的實現。
數據集用的是pima-indians-diabetes.data.csv
步驟:
1.加載文件
2.分出訓練集和測試集
3.分出類別
4.計算平均數
5.算方差
6.計算每列(即每個特徵)的方差與平均數(記住刪除最後一列,因爲最後一列是類別標籤)
7.計算可能性的函數
8.計算總的可能性
import csv
import math
import random
def load_csv(filename):
    """Load a CSV file and return its rows as lists of floats.

    Every field must be numeric; by convention the last column of each
    row is the class label.

    :param filename: path to the CSV file
    :return: list of rows, each a list of floats
    """
    # 'with' guarantees the handle is closed even on error
    # (the original opened the file and never closed it).
    with open(filename, 'r') as handle:
        reader = csv.reader(handle)
        return [[float(value) for value in row] for row in reader]
def split_data(dataset, splitRatio):
    """Randomly split *dataset* into a training set and a test set.

    :param dataset: list of rows
    :param splitRatio: fraction (0..1) of rows that go to the training set
    :return: [train_set, test_set]

    The input list is left untouched: the original assigned
    ``copy = dataset`` (an alias, not a copy) and then popped rows out
    of it, destructively mutating the caller's list.
    """
    train_size = int(len(dataset) * splitRatio)
    pool = list(dataset)  # real copy, so the caller's list survives
    train_set = []
    while len(train_set) < train_size:
        # Move a random row from the pool into the training set;
        # whatever remains in the pool becomes the test set.
        index = random.randrange(len(pool))
        train_set.append(pool.pop(index))
    return [train_set, pool]
def separateByClass(dataset):
    """Group rows by class label.

    The label is the last element of each row; returns a dict mapping
    label -> list of rows carrying that label.
    """
    grouped = {}
    for row in dataset:
        grouped.setdefault(row[-1], []).append(row)
    return grouped
def mean(numbers):
    """Return the arithmetic mean of a non-empty sequence of numbers."""
    total = float(sum(numbers))
    return total / len(numbers)
def stdev(numbers):
    """Return the sample standard deviation (n-1 denominator).

    Note: despite the original "variance" comment, this returns the
    square root of the variance, i.e. the standard deviation.
    Requires at least two values.
    """
    avg = sum(numbers) / float(len(numbers))
    squared_diffs = (math.pow(value - avg, 2) for value in numbers)
    variance = sum(squared_diffs) / float(len(numbers) - 1)
    return math.sqrt(variance)
def summarize(dataset):
    """Compute a (mean, stdev) pair for each attribute column.

    Columns are obtained by transposing the rows with ``zip(*dataset)``.
    Result looks like ``[(5.04, 3.83), ..., (36.84, 10.48)]`` — one
    tuple per feature, precomputed once so later probability lookups
    stay cheap and readable.
    """
    stats = [(mean(column), stdev(column)) for column in zip(*dataset)]
    # The final column is the class label, meaningless as a feature
    # statistic, so its summary is discarded.
    del stats[-1]
    return stats
def summarizeByClass(dataset):
    """Build per-class feature summaries.

    Returns ``{class_label: [(mean, stdev), ...]}`` — the Gaussian
    parameters of every feature, computed separately per class.
    """
    return {label: summarize(rows)
            for label, rows in separateByClass(dataset).items()}
def calculateProbability(x, mean, stdev):
    """Gaussian (normal) probability density of *x* given mean and stdev."""
    variance = math.pow(stdev, 2)
    exponent = math.exp(-math.pow(x - mean, 2) / (2 * variance))
    coefficient = 1 / (math.sqrt(2 * math.pi) * stdev)
    return coefficient * exponent
def calculateClassProbabilities(summaries, inputVectors):
    """Likelihood of *inputVectors* under each class's Gaussian model.

    Multiplies the per-feature densities together — the naive
    independence assumption.  Returns ``{class_label: likelihood}``.
    """
    probabilities = {}
    for label, featureStats in summaries.items():
        likelihood = 1
        for index, (mu, sigma) in enumerate(featureStats):
            likelihood *= calculateProbability(inputVectors[index], mu, sigma)
        probabilities[label] = likelihood
    return probabilities
def predict(summaries, inputVector):
    """Return the class label with the highest likelihood for *inputVector*.

    Returns ``None`` when *summaries* is empty, matching the original's
    behaviour of never entering the selection loop.
    """
    probabilities = calculateClassProbabilities(summaries, inputVector)
    if not probabilities:
        return None
    return max(probabilities, key=probabilities.get)
def getPrediction(summaries, testSet):
    """Predict a class label for every row of *testSet*, in order."""
    return [predict(summaries, row) for row in testSet]
def getAccuracy(testSet, predicitions):
    """Percentage of test rows whose true label matches the prediction.

    :param testSet: rows whose last element is the true class label
    :param predicitions: predicted labels, parallel to *testSet*
    :return: accuracy in percent (0.0-100.0)

    Changes from the original: the stray debug ``print(predicitions)``
    is removed, and an empty test set returns 0.0 instead of raising
    ZeroDivisionError.
    """
    if not testSet:
        return 0.0
    correct = sum(1 for row, predicted in zip(testSet, predicitions)
                  if row[-1] == predicted)
    return correct / float(len(testSet)) * 100
def main():
    """Train a naive-Bayes classifier on the Pima diabetes CSV and print accuracy."""
    filename = 'pima-indians-diabetes.data.csv'
    splitRatio = 0.67  # 67% of rows for training, the rest for testing
    dataset = load_csv(filename)
    train_set, test_set = split_data(dataset, splitRatio)
    summaries = summarizeByClass(train_set)
    predicitions = getPrediction(summaries, test_set)
    accuracy = getAccuracy(test_set, predicitions)
    print('accuracy: {0} %'.format(accuracy))


# Guard so importing this module does not immediately read the CSV and
# train — the original called main() unconditionally at import time.
if __name__ == '__main__':
    main()