我自己对朴素贝叶斯算法的理解就是:根据训练集,
1.对连续型随机变量算出每个特征的正态分布函数,从而当需要进行预测的样本过来的时候,根据数值直接计算可能性。
2.若为离散型,则直接计算相应的概率(西瓜书上写的很详细)
理论知识其实很好懂,主要是编程的实现。
数据集用的是pima-indians-diabetes.data.csv
步骤:
1.加载文件
2.分出训练集和测试集
3.分出类别
4.计算平均数
5.算方差
6.计算每列(即每个特征)的方差与平均数(记住删除最后一列,即类别列)
7.计算可能性的函数
8.计算总的可能性
import csv
import math
import random
def load_csv(filename):
    """Load a CSV file and convert every field to float.

    Args:
        filename: path to a CSV file whose fields are all numeric.

    Returns:
        A list of rows, each row a list of floats.

    Fix: the original opened the file without ever closing it (resource
    leak); a ``with`` block now guarantees the handle is released.
    """
    with open(filename, 'r') as handle:
        dataset = [row for row in csv.reader(handle)]
    # Convert every textual field to float in place.
    for i in range(len(dataset)):
        dataset[i] = [float(x) for x in dataset[i]]
    return dataset
def split_data(dataset, splitRatio):
    """Randomly partition dataset into [train_set, test_set].

    Args:
        dataset: list of rows.
        splitRatio: fraction (0..1) of rows that go into the training set.

    Returns:
        [train_set, test_set] — rows popped at random indices form the
        training set; whatever remains is the test set.

    Fix: the original did ``copy = dataset`` (an alias, not a copy), so
    the repeated ``pop`` calls destroyed the caller's dataset. We now
    work on a shallow copy and leave the input untouched.
    """
    train_size = int(len(dataset) * splitRatio)
    copy = list(dataset)  # shallow copy so the caller's list survives
    train_set = []
    while len(train_set) < train_size:
        index = random.randrange(len(copy))
        # Each popped row joins the training set; the rows never picked
        # naturally remain in ``copy`` and become the test set.
        train_set.append(copy.pop(index))
    return [train_set, copy]
def separateByClass(dataset):
    """Group the rows of dataset by class label.

    The class label is assumed to be the last element of each row.
    Returns a dict mapping label -> list of rows with that label.
    """
    separated = {}
    for row in dataset:
        separated.setdefault(row[-1], []).append(row)
    return separated
def mean(numbers):
    """Return the arithmetic mean of a non-empty sequence of numbers."""
    total = 0.0
    for value in numbers:
        total += value
    return total / len(numbers)
def stdev(numbers):
    """Return the sample standard deviation (n-1 denominator) of numbers.

    Note: despite the original Chinese comment saying "variance", the
    function has always returned the square root, i.e. the standard
    deviation. Requires at least two elements.
    """
    avg = sum(numbers) / float(len(numbers))
    squared_deviation = sum([math.pow(v - avg, 2) for v in numbers])
    variance = squared_deviation / float(len(numbers) - 1)
    return math.sqrt(variance)
def summarize(dataset):
    """Summarize each feature column as a (mean, stdev) tuple.

    Transposes the dataset so each feature column can be reduced, e.g.
    [(5.04, 3.83), ..., (36.84, 10.48)]. Pre-computing these once keeps
    the prediction code clean. The last column is the class label, so
    its summary is meaningless and is dropped.
    """
    summaries = []
    for column in zip(*dataset):
        summaries.append((mean(column), stdev(column)))
    summaries.pop()  # discard the class-label column
    return summaries
def summarizeByClass(dataset):
    """Map each class label to its per-feature (mean, stdev) summaries."""
    grouped = separateByClass(dataset)
    return {label: summarize(rows) for label, rows in grouped.items()}
def calculateProbability(x, mean, stdev):
    """Probability density of x under a normal distribution N(mean, stdev**2)."""
    variance = math.pow(stdev, 2)
    exponent = math.exp(-math.pow(x - mean, 2) / (2 * variance))
    coefficient = 1 / (math.sqrt(2 * math.pi) * stdev)
    return coefficient * exponent
def calculateClassProbabilities(summaries, inputVectors):
    """Compute the naive-Bayes likelihood of inputVectors for every class.

    For each class, multiplies the per-feature Gaussian densities taken
    from that class's (mean, stdev) summaries. Returns a dict mapping
    class label -> likelihood.
    """
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        likelihood = 1
        # zip stops at the shorter sequence; classSummaries excludes the
        # class column, so only feature values of the input are consumed.
        for (mu, sigma), x in zip(classSummaries, inputVectors):
            likelihood *= calculateProbability(x, mu, sigma)
        probabilities[classValue] = likelihood
    return probabilities
def predict(summaries, inputVector):
    """Return the class label with the highest likelihood for inputVector.

    Returns None when summaries is empty.
    """
    probabilities = calculateClassProbabilities(summaries, inputVector)
    best_label = None
    best_prob = -1
    for label, prob in probabilities.items():
        # Strict '>' keeps the first label encountered on ties,
        # matching the original behavior.
        if best_label is None or prob > best_prob:
            best_label, best_prob = label, prob
    return best_label
def getPrediction(summaries, testSet):
    """Predict a class label for every row in testSet."""
    return [predict(summaries, row) for row in testSet]
def getAccuracy(testSet, predicitions):
    """Percentage (0..100) of rows whose true label matches the prediction.

    The true label is the last element of each test row. The (misspelled)
    parameter name is kept for backward compatibility with existing callers.

    Fixes: removed a leftover debug ``print`` of the predictions list, and
    an empty test set now returns 0.0 instead of raising ZeroDivisionError.
    """
    if not testSet:
        return 0.0
    correct = sum(
        1 for row, label in zip(testSet, predicitions) if row[-1] == label
    )
    return (correct / float(len(testSet))) * 100
def main():
    """Train a naive Bayes classifier on the Pima Indians diabetes CSV
    and print the accuracy on a held-out test split."""
    filename = 'pima-indians-diabetes.data.csv'
    splitRatio = 0.67
    dataset = load_csv(filename)
    train_set, test_set = split_data(dataset, splitRatio)
    summaries = summarizeByClass(train_set)
    predicitions = getPrediction(summaries, test_set)
    accuracy = getAccuracy(test_set, predicitions)
    print('accuracy: {0} %'.format(accuracy))


# Fix: guard the entry point so importing this module no longer
# triggers a full training run (the original called main() unconditionally).
if __name__ == '__main__':
    main()