手寫樸素貝葉斯

from scipy.io import arff
import numpy as np
import pandas as pd

# NOTE(review): nothing in the visible part of this file reads this constant;
# confirm whether it is still needed before relying on it.
radius = 4  # search radius


def distance(point1, point2, dims=4):
    """Return the Euclidean distance between two points.

    Only the coordinates at positions ``0 .. dims - 1`` take part in the
    computation; anything stored after them (for example a trailing label)
    is ignored.

    Args:
        point1: Indexable sequence of numbers.
        point2: Indexable sequence of numbers, at least as long as the
            number of coordinates being compared.
        dims: Number of leading coordinates to compare. Defaults to 4, the
            value that used to be hard-coded. Pass ``None`` to compare every
            coordinate of ``point1``.

    Returns:
        The square root of the summed squared coordinate differences.
    """
    if dims is None:
        dims = len(point1)
    squared_diffs = [(point1[i] - point2[i]) ** 2 for i in range(dims)]
    return np.sqrt(np.sum(squared_diffs))


# Read the nominal weather dataset. Only element 0 of the loadarff result
# (the records) is used below.
weather = arff.loadarff('../dataset/weather.nominal.arff')

# Build the frame and shuffle every row in one step (frac=1 keeps all rows).
# No random_state is given, so the order differs from run to run.
df = pd.DataFrame(weather[0]).sample(frac=1)
attribute_columns = df.columns

# The first 10 shuffled rows are the training set; whatever remains is the
# test set.
train_data, test_data = df.iloc[:10], df.iloc[10:]
train_length, test_length = len(train_data), len(test_data)


def predict(x, data):
    """Classify one sample with a categorical naive Bayes model.

    The class prior and the per-attribute likelihoods are estimated from
    ``data`` on every call. Likelihoods use Laplace (add-one) smoothing:
    ``(count + 1) / (class_total + K)``, where ``K`` is the number of
    distinct values the attribute takes in ``data``.

    Args:
        x: The sample to classify. Every label in ``x.index`` must be an
            attribute column of ``data``; ``x[label]`` is the sample's value
            for that attribute.
        data: Training table with one column per attribute plus a ``'play'``
            column holding the byte-string labels ``b'yes'`` / ``b'no'``.

    Returns:
        ``b'yes'`` or ``b'no'``. A tie goes to ``b'yes'``.

    Raises:
        ValueError: If ``data`` has no rows.
    """
    whole_length = data.shape[0]
    if whole_length == 0:
        raise ValueError('data must contain at least one training row')

    # Split the rows by class once instead of re-filtering per attribute.
    labels = data['play']
    yes_rows = data[labels == b'yes']
    no_rows = data[labels == b'no']

    # Class priors. A class that never occurs in ``data`` simply gets prior 0
    # rather than raising KeyError.
    yes_p = yes_rows.shape[0] / whole_length
    no_p = no_rows.shape[0] / whole_length

    for attribute in x.index:
        value = x[attribute]
        yes_count = yes_rows[attribute].value_counts()
        no_count = no_rows[attribute].value_counts()

        # Look the value up in each class independently: a value that is
        # missing from one class must not wipe out its count in the other.
        yes_attribute_count = yes_count.get(value, 0)
        no_attribute_count = no_count.get(value, 0)

        # Laplace smoothing adds one pseudo-observation per category, so the
        # denominator grows by the number of categories, not by 1.
        category_count = data[attribute].nunique()
        yes_p *= (yes_attribute_count + 1) / (yes_count.sum() + category_count)
        no_p *= (no_attribute_count + 1) / (no_count.sum() + category_count)

    return b'yes' if yes_p >= no_p else b'no'


# Score the classifier on the held-out rows: predict each test row from its
# first four columns, then compare against the label in the fifth column.
y_predict_list = []
for i in range(test_length):
    x = test_data.iloc[i, :4]
    y_predict = predict(x, train_data)
    y_predict_list += [y_predict]

y = test_data.iloc[:, 4]
# Element-wise comparison yields booleans; their mean is the accuracy.
print('test acc: %.2f' % np.mean(y_predict_list == y))

 

發表評論
所有評論
還沒有人評論，想成為第一個評論的人麼？請在上方評論欄輸入並且點擊發布。
相關文章