樸素貝葉斯算法
貝葉斯定理想必大家很早就已經瞭解,樸素貝葉斯算法就是基於貝葉斯定理提出的一種監督機器學習算法。爲什麼叫“樸素”呢?那是因爲樸素貝葉斯分類器基於一個簡單的假定:給定目標值時屬性之間相互條件獨立。給定類變量 $y$ (這裏一個樣本僅屬於一類) 和一個相互獨立的特徵向量 $x_1, \dots, x_n$,貝葉斯定理給出了如下關係:$P(y \mid x_1, \dots, x_n) = \frac{P(y)\, P(x_1, \dots, x_n \mid y)}{P(x_1, \dots, x_n)}$
使用樸素(naive)的假設:每個特徵之間相互獨立:$P(x_i \mid y, x_1, \dots, x_{i-1}, x_{i+1}, \dots, x_n) = P(x_i \mid y)$
對於所有的 $i$,這一關係可以化簡爲 $P(y \mid x_1, \dots, x_n) \propto P(y) \prod_{i=1}^{n} P(x_i \mid y)$
由於 $P(x_1, \dots, x_n)$ 對於給定的輸入是常數,我們可以使用如下的分類規則:
並且我們可以使用最大後驗概率(MAP)估計來估計 $P(y)$ 和 $P(x_i \mid y)$:$\hat{y} = \arg\max_{y} P(y) \prod_{i=1}^{n} P(x_i \mid y)$
不同的樸素貝葉斯分類器的不同之處在於:它們對 $P(x_i \mid y)$ 分佈所做的假設不同。
樸素貝葉斯算法 高斯模型
高斯模型假設每個特徵的所有屬於某個類別的觀測值符合高斯分佈:$P(x_i \mid y) = \frac{1}{\sqrt{2\pi\sigma_y^2}} \exp\!\left(-\frac{(x_i - \mu_y)^2}{2\sigma_y^2}\right)$
python代碼實現
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import random
import math
def load_data(file_name):
    """Load a dataset from *file_name*.

    Each line is a space-separated row of numbers; every field is parsed
    as ``float``. Returns a list of rows (each row a list of floats).
    """
    # 'with' guarantees the handle is closed; the original leaked it.
    with open(file_name) as source:
        return [[float(field) for field in line.split(' ')] for line in source]
def split_data(data, split_ratio):
    """Randomly partition *data* into [train_set, test_set].

    The training set gets ``int(len(data) * split_ratio)`` rows drawn at
    random (without replacement); the rest stay in the test set.
    """
    target = int(len(data) * split_ratio)
    remaining = list(data)  # shallow copy so the caller's list is untouched
    train = []
    for _ in range(target):
        pick = random.randrange(len(remaining))
        train.append(remaining.pop(pick))
    return [train, remaining]
def separate_by_class(data):
    """Group rows by class label (the last element of each row).

    Returns a dict mapping label -> list of rows carrying that label.
    """
    grouped = {}
    for row in data:
        grouped.setdefault(row[-1], []).append(row)
    return grouped
def average(numbers):
    """Return the arithmetic mean of *numbers* as a float."""
    return sum(numbers) / len(numbers)
def stdev(numbers):
    """Sample standard deviation of *numbers* (Bessel-corrected, n-1)."""
    mean = sum(numbers) / float(len(numbers))
    squared_error = sum((value - mean) ** 2 for value in numbers)
    return math.sqrt(squared_error / float(len(numbers) - 1))
def summarize(data):
    """Per-column (mean, sample-stdev) pairs for *data*.

    The last column is the class label, so its summary is dropped.
    """
    stats = [(average(column), stdev(column)) for column in zip(*data)]
    return stats[:-1]
def summaries_by_class(data):
    """Map each class label to the (mean, stdev) summaries of its rows."""
    return {label: summarize(rows)
            for label, rows in separate_by_class(data).items()}
def calculate_Probability(x, avg, stdev):
    """Gaussian probability density of *x* under N(avg, stdev**2)."""
    variance = stdev ** 2
    coefficient = 1 / (math.sqrt(2 * math.pi) * stdev)
    return coefficient * math.exp(-((x - avg) ** 2) / (2 * variance))
def calculateClassProbabilities(summaries, inputVector):
    """Naive-Bayes likelihood of *inputVector* for every class.

    For each class, multiplies the per-attribute Gaussian densities
    (independence assumption). Returns {label: likelihood}.
    """
    likelihoods = {}
    for label, attribute_stats in summaries.items():
        score = 1
        for index, (mean, spread) in enumerate(attribute_stats):
            score *= calculate_Probability(inputVector[index], mean, spread)
        likelihoods[label] = score
    return likelihoods
def predict(summaries, inputVector):
    """Return the class label with the highest likelihood for *inputVector*."""
    scores = calculateClassProbabilities(summaries, inputVector)
    best = None
    for label in scores:
        # strict '>' keeps the first label seen on ties, like an argmax
        if best is None or scores[label] > scores[best]:
            best = label
    return best
def getPredictions(summaries, testSet):
    """Predict a class label for every row of *testSet*."""
    return [predict(summaries, row) for row in testSet]
def getAccuracy(testSet, predictions):
    """Percentage of rows whose true label (last column) was predicted."""
    hits = sum(1 for row, guess in zip(testSet, predictions) if row[-1] == guess)
    return (hits / float(len(testSet))) * 100.0
if __name__ == '__main__':
    # Train a Gaussian naive Bayes model on 80% of the rows, score the rest.
    dataset = load_data("/tmp/pima-indians-diabetes.txt")
    training_rows, testing_rows = split_data(dataset, 0.8)
    model = summaries_by_class(training_rows)
    print(
        'split {0} rows data into {1} rows trainData and {2} rows testData'.format(
            len(dataset), len(training_rows), len(testing_rows)))
    predicted_labels = getPredictions(model, testing_rows)
    score = getAccuracy(testing_rows, predicted_labels)
    print('Accuracy:{0}%'.format(score))
本實驗進行了10次測試,得到的平均正確率爲74.42%。
- 參考資料:
- 數據集