Naive Bayes Algorithm -- A Python Implementation

My own understanding of the Naive Bayes algorithm is: given a training set,
1. For continuous features, fit a normal (Gaussian) distribution to each feature, so that when a new sample arrives its likelihood can be computed directly from the feature values.
2. For discrete features, compute the corresponding conditional probabilities directly from frequency counts (the "watermelon book" covers this in detail); see the short sketch after this list.

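The code in this post only handles the continuous (Gaussian) case. For discrete features the class-conditional probabilities are just frequency counts; a minimal sketch of that idea is given below. The count_discrete_probs helper and the add-one (Laplace) smoothing term are my own illustration and are not part of the original code:

from collections import Counter, defaultdict

def count_discrete_probs(values, labels):
    # estimate P(feature value | class) by counting frequencies,
    # with add-one (Laplace) smoothing so no probability becomes zero
    by_class = defaultdict(list)
    for v, c in zip(values, labels):
        by_class[c].append(v)
    all_values = set(values)
    probs = {}
    for c, vs in by_class.items():
        counts = Counter(vs)
        probs[c] = {v: (counts[v] + 1) / (len(vs) + len(all_values)) for v in all_values}
    return probs
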
The theory itself is easy to understand; the main effort is in the programming.
The dataset used is pima-indians-diabetes.data.csv
Steps:
1. Load the file
2. Split the data into a training set and a test set
3. Separate the samples by class
4. Compute the mean
5. Compute the standard deviation
6. Compute the mean and standard deviation of each column (i.e. each feature), remembering to drop the last column, which is the class label
7. A function that computes the likelihood of a single value (the formula is given after this list)
8. Compute the overall likelihood for each class
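
Step 7 relies on the Gaussian (normal) probability density, which is what calculateProbability below implements:

P(x | mu, sigma) = 1 / (sqrt(2 * pi) * sigma) * exp(-(x - mu)^2 / (2 * sigma^2))

where mu and sigma are the mean and standard deviation of that feature, computed from the training samples of one class.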

import csv
import math
import random

def load_csv(filename):  # load the dataset from the CSV file
    lines = csv.reader(open(filename, 'r'))
    dataset = list(lines)
    for i in range(len(dataset)):
        dataset[i] = [float(x) for x in dataset[i]]  # convert every field to float
    return dataset

def split_data(dataset, splitRatio):  # split the data into a training set and a test set
    trainSize = int(len(dataset) * splitRatio)
    copy = list(dataset)  # work on a copy so the caller's dataset is left untouched
    train_set = []
    while len(train_set) < trainSize:
        index = random.randrange(len(copy))
        train_set.append(copy.pop(index))  # each sample moved into the training set is removed from copy, so whatever remains is the test set
    return [train_set, copy]

def separateByClass(dataset):  # group the samples by their class label (the last column)
    separated = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        if vector[-1] not in separated:
            separated[vector[-1]] = []
        separated[vector[-1]].append(dataset[i])
    return separated

def mean(numbers):  # compute the mean
    return sum(numbers) / float(len(numbers))

def stdev(numbers):  # compute the sample standard deviation
    avg = mean(numbers)
    variance = sum([math.pow(x - avg, 2) for x in numbers]) / float(len(numbers) - 1)
    return math.sqrt(variance)

def summarize(dataset):  # store the statistics as a list of (mean, stdev) tuples for easy lookup
    # the result looks like [(5.040609137055838, 3.8317118829042394), ..., (36.84263959390863, 10.48325239379926)]
    # this way nothing has to be recomputed and the code stays clearer
    summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
    del summaries[-1]  # the last column is the class label, which is meaningless here, so drop it
    return summaries

def summarizeByClass(dataset):  # compute the (mean, stdev) summaries separately for each class
    separated = separateByClass(dataset)
    summaries = {}
    for classValue, instances in separated.items():
        summaries[classValue] = summarize(instances)
    return summaries

def calculateProbability(x, mean, stdev):  # Gaussian (normal) probability density
    exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent

def calculateClassProbabilities(summaries, inputVector):  # overall likelihood of the sample under each class
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = inputVector[i]
            probabilities[classValue] *= calculateProbability(x, mean, stdev)
    return probabilities

def predict(summaries, inputVector):  # pick the class with the highest probability
    probabilities = calculateClassProbabilities(summaries, inputVector)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or bestProb < probability:
            bestLabel = classValue
            bestProb = probability
    return bestLabel

def getPrediction(summaries, testSet):  # predict a class for every sample in the test set
    predictions = []
    for i in range(len(testSet)):
        result = predict(summaries, testSet[i])
        predictions.append(result)
    return predictions

def getAccuracy(testSet, predictions):  # percentage of test samples whose predicted class matches the true label
    correct = 0
    for i in range(len(testSet)):
        if testSet[i][-1] == predictions[i]:
            correct += 1
    return (correct / float(len(testSet))) * 100

def main():
    filename = 'pima-indians-diabetes.data.csv'
    splitRatio = 0.67
    dataset = load_csv(filename)
    train_set, test_set = split_data(dataset, splitRatio)
    summaries = summarizeByClass(train_set)
    predictions = getPrediction(summaries, test_set)
    accuracy = getAccuracy(test_set, predictions)
    print('accuracy: {0}%'.format(accuracy))


main()
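
Once the per-class summaries exist, predict can also score a single sample. Below is a minimal sketch, assuming the file has the usual layout of 8 feature columns followed by a 0/1 class column; the feature vector is made up purely for illustration:

summaries = summarizeByClass(load_csv('pima-indians-diabetes.data.csv'))
sample = [2, 120, 70, 30, 80, 32.0, 0.5, 33]  # hypothetical values, same column order as the dataset
print(predict(summaries, sample))  # prints the most likely class label, 0.0 or 1.0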

