朴素贝叶斯算法--python实现

我自己对朴素贝叶斯算法的理解就是:根据训练集,
1.对连续型随机变量,按类别算出每个特征的正态分布函数(均值与标准差),从而当需要进行预测的样本过来的时候,根据数值直接计算可能性。
2.若为离散型,则直接计算相应的概率(西瓜书上写的很详细)

理论知识其实很好懂,主要是编程的实现。
数据集用的是pima-indians-diabetes.data.csv
步骤:
1.加载文件
2.分出训练集和测试集
3.分出类别
4.计算平均数
5.计算标准差
6.计算每列(即每个特征)的平均数与标准差(记住删除最后一列,即类别列)
7.计算可能性的函数
8.计算总的可能性

import csv
import math
import random

def load_csv(filename):
    """Load a purely numeric CSV file as a list of rows of floats.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per non-blank CSV record; each entry is a list
        holding that record's fields converted with ``float``.

    Raises:
        ValueError: If a field cannot be converted to ``float``.
    """
    # The context manager closes the file even when a conversion fails;
    # newline='' is the mode the csv module asks for when reading files.
    with open(filename, 'r', newline='') as handle:
        # csv.reader yields [] for blank lines; skip them so that later code
        # indexing row[-1] does not fail on an empty row.
        return [[float(x) for x in row] for row in csv.reader(handle) if row]

def split_data(dataset, splitRatio):
    """Randomly split ``dataset`` into a training set and a test set.

    Args:
        dataset: List of rows. The list itself is left unmodified.
        splitRatio: Fraction of the rows to move into the training set; the
            training size is ``int(len(dataset) * splitRatio)``.

    Returns:
        ``[train_set, test_set]`` where ``train_set`` holds the randomly drawn
        rows and ``test_set`` holds the remaining rows in their original order.

    Raises:
        ValueError: From ``random.randrange`` if ``splitRatio`` asks for more
            rows than ``dataset`` contains.
    """
    dataSize = int(len(dataset) * splitRatio)
    # Pop from a shallow copy: assigning the list directly would only alias
    # it, and every pop would then also remove the row from the caller's data.
    remaining = list(dataset)
    train_set = []
    while len(train_set) < dataSize:
        index = random.randrange(len(remaining))
        # Each row moved into the training set is removed from `remaining`,
        # so whatever is left at the end is the test set.
        train_set.append(remaining.pop(index))
    return [train_set, remaining]

def separateByClass(dataset):
    """Group the rows of ``dataset`` by class label.

    The label of a row is its last element.

    Args:
        dataset: Sequence of rows (each row a non-empty sequence).

    Returns:
        A dict mapping each label to the list of rows carrying that label,
        with rows kept in their original order.
    """
    groups = {}
    for row in dataset:
        label = row[-1]
        # setdefault creates the bucket the first time a label is seen.
        groups.setdefault(label, []).append(row)
    return groups

def mean(numbers):
    """Return the arithmetic mean of ``numbers``.

    Raises:
        ZeroDivisionError: If ``numbers`` is empty.
    """
    count = float(len(numbers))
    return sum(numbers) / count

def stdev(numbers):
    """Return the sample standard deviation of ``numbers``.

    The squared deviations from the mean are summed and divided by
    ``len(numbers) - 1`` before taking the square root.

    Raises:
        ZeroDivisionError: If ``numbers`` has fewer than two elements.
    """
    center = mean(numbers)
    squared_deviations = [math.pow(value - center, 2) for value in numbers]
    sample_variance = sum(squared_deviations) / float(len(numbers) - 1)
    return math.sqrt(sample_variance)

def summarize(dataset):
    """Compute ``(mean, stdev)`` for every feature column of ``dataset``.

    Args:
        dataset: List of equally long rows whose last element is the class
            label.

    Returns:
        A list of ``(mean, stdev)`` tuples, one per column except the last,
        e.g. ``[(5.04, 3.83), ..., (36.84, 10.48)]``. Keeping the statistics
        in this form means they are computed once and simply looked up later.
    """
    stats = []
    for column in zip(*dataset):
        stats.append((mean(column), stdev(column)))
    # The last column is the class label; its statistics carry no meaning
    # for the model, so discard them.
    stats.pop()
    return stats

def summarizeByClass(dataset):
    """Compute per-class feature statistics.

    Args:
        dataset: List of rows whose last element is the class label.

    Returns:
        A dict mapping each class label to the list returned by
        ``summarize`` for the rows of that class.
    """
    return {
        label: summarize(rows)
        for label, rows in separateByClass(dataset).items()
    }

def calculateProbability(x, mean, stdev):
    """Evaluate the normal (Gaussian) probability density at ``x``.

    Args:
        x: Value at which to evaluate the density.
        mean: Mean of the distribution.
        stdev: Standard deviation of the distribution.

    Returns:
        The density ``exp(-(x - mean)^2 / (2 * stdev^2)) / (sqrt(2*pi) * stdev)``.

    Raises:
        ZeroDivisionError: If ``stdev`` is zero.
    """
    squared_distance = math.pow(x - mean, 2)
    twice_variance = 2 * math.pow(stdev, 2)
    exponent = math.exp(-(squared_distance / twice_variance))
    normalizer = math.sqrt(2 * math.pi) * stdev
    return (1 / normalizer) * exponent

def calculateClassProbabilities(summaries, inputVectors):
    """Compute the combined feature likelihood of one sample for each class.

    Args:
        summaries: Dict mapping class label to a list of ``(mean, stdev)``
            tuples, one per feature.
        inputVectors: A single sample; element ``i`` is matched with the
            ``i``-th ``(mean, stdev)`` tuple. Elements beyond the number of
            feature tuples are ignored.

    Returns:
        A dict mapping each class label to the product of the per-feature
        Gaussian densities of the sample under that class.
    """
    likelihoods = {}
    for label, featureStats in summaries.items():
        likelihood = 1
        for position, (mu, sigma) in enumerate(featureStats):
            likelihood *= calculateProbability(inputVectors[position], mu, sigma)
        likelihoods[label] = likelihood
    return likelihoods

def predict(summaries, inputVector):
    """Classify one sample by picking the class with the highest likelihood.

    Args:
        summaries: Per-class feature statistics as produced by
            ``summarizeByClass``.
        inputVector: The sample to classify.

    Returns:
        The label whose likelihood is largest (the first one encountered on
        ties), or ``None`` when ``summaries`` contains no classes.
    """
    scores = calculateClassProbabilities(summaries, inputVector)
    if not scores:
        return None
    # max() keeps the first maximal key, matching a strict "greater than" scan.
    return max(scores, key=scores.get)

def getPrediction(summaries, testSet):
    """Predict a class label for every row of ``testSet``.

    Args:
        summaries: Per-class feature statistics as produced by
            ``summarizeByClass``.
        testSet: List of samples to classify.

    Returns:
        A list of predicted labels, in the same order as ``testSet``.
    """
    return [predict(summaries, sample) for sample in testSet]

def getAccuracy(testSet, predicitions):
    """Return the percentage of rows whose label was predicted correctly.

    The predictions are printed before the accuracy is computed.

    Args:
        testSet: List of rows whose last element is the true class label.
        predicitions: Predicted labels, one per row of ``testSet`` and in the
            same order.

    Returns:
        The accuracy as a float between 0 and 100. An empty ``testSet``
        yields ``0.0`` instead of raising ``ZeroDivisionError``.

    Raises:
        IndexError: If ``predicitions`` is shorter than ``testSet``.
    """
    print(predicitions)
    if not testSet:
        # Nothing to score (e.g. a split ratio of 1.0 leaves no test rows).
        return 0.0
    correct = sum(
        1 for i, row in enumerate(testSet) if row[-1] == predicitions[i]
    )
    return (correct / float(len(testSet))) * 100

def main():
    """Train on a random split of the Pima Indians diabetes CSV and print the accuracy.

    Reads 'pima-indians-diabetes.data.csv' from the current working
    directory, uses 67% of the rows for training and the rest for testing.
    """
    filename = 'pima-indians-diabetes.data.csv'
    splitRatio = 0.67
    dataset = load_csv(filename)
    train_set, test_set = split_data(dataset, splitRatio)
    summaries = summarizeByClass(train_set)
    predicitions = getPrediction(summaries, test_set)
    accuracy = getAccuracy(test_set, predicitions)
    print('accuracy: {0} %'.format(accuracy))


# Run the pipeline only when executed as a script, so that importing this
# module for its functions does not try to read the data file.
if __name__ == '__main__':
    main()


發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章