貝葉斯分類是數學性較強的分類方法,在處理多屬性問題的分類時,主要用到下面兩個公式:$$P(C_i\mid X)=\frac{P(X\mid C_i)\,P(C_i)}{P(X)},\qquad P(X\mid C_i)=\prod_{k=1}^{n}P(x_k\mid C_i)$$
第二個公式之所以成立,是因爲樸素貝葉斯分類中假設了各屬性在給定分類標籤 $C_i$ 的條件下相互獨立(條件獨立假設),即對 $X=(x_1,\dots,x_n)$ 有:
$$P(X\mid C_i)=\prod_{k=1}^{n}P(x_k\mid C_i)$$
import random
from math import e
from math import exp
from math import pi
from math import pow
from math import sqrt

import numpy as np
from sklearn.datasets import load_iris
# --- Train/test split -------------------------------------------------
# From every consecutive group of five iris samples, pick one at random
# as a test sample; the remaining four go into the training set
# (150 samples -> 120 train / 30 test).
iris = load_iris()
n_tot, n_attr = iris.data.shape
n_train = 120; n_test = 30; n_target = 3; mm = 5
book = np.zeros(150, dtype=int)  # book[i] == 1 marks sample i as a test sample
for grp in range(n_test):
    book[5 * grp + random.randint(0, 4)] = 1
# Each row holds the n_attr feature values followed by the class label
# in the last column.
data_train = np.zeros((n_tot, 5))
data_test = np.zeros((n_tot, 5))
cnt1 = 0
cnt2 = 0
for idx in range(n_tot):
    if book[idx]:
        data_test[cnt2, :n_attr] = iris.data[idx]
        data_test[cnt2, n_attr] = iris.target[idx]
        cnt2 += 1
    else:
        data_train[cnt1, :n_attr] = iris.data[idx]
        data_train[cnt1, n_attr] = iris.target[idx]
        cnt1 += 1
# --- Per-class Gaussian parameters ------------------------------------
# average[c][a]  : mean of attribute a over training samples of class c.
# deviation[c][a]: (biased, MLE) variance of attribute a for class c —
#                  despite the name it holds the variance, not the stddev.
# pro_attr[c]    : prior probability of class c in the training set.
# cnt[c][a]      : number of class-c training samples (same for every a).
cnt = np.zeros((5, 5))
average = np.zeros((5, 5))
deviation = np.zeros((5, 5))
pro_attr = np.zeros(5)
for c in range(n_target):
    for a in range(n_attr):
        for r in range(n_train):
            if data_train[r][n_attr] == c:
                average[c][a] += data_train[r][a]
                cnt[c][a] += 1.0
        average[c][a] /= cnt[c][a]
for c in range(n_target):
    for a in range(n_attr):
        for r in range(n_train):
            if data_train[r][n_attr] == c:
                diff = data_train[r][a] - average[c][a]
                deviation[c][a] += diff * diff
        deviation[c][a] /= cnt[c][a]
for r in range(n_train):
    pro_attr[int(data_train[r][n_attr])] += 1.0
for c in range(n_target):
    pro_attr[c] /= n_train
# --- Classification ---------------------------------------------------
# For each test sample X, pick the class j maximizing the naive-Bayes
# score P(C_j) * prod_k N(x_k; average[j][k], deviation[j][k]), where
# deviation holds the per-class per-attribute variance.
cnt_correct = 0
for i in range(n_test):
    best_score = -1.0  # valid scores are > 0, so any class beats this
    best_class = 0
    for j in range(n_target):  # compute score proportional to P(Cj|X)
        score = pro_attr[j]
        for k in range(n_attr):  # multiply in P(xk|Cj)
            var = deviation[j][k]
            diff = data_test[i][k] - average[j][k]
            # Gaussian pdf: exp(-diff^2 / (2*var)) / sqrt(2*pi*var).
            # BUG FIX: the original divided by 2*var*var (i.e. 2*sigma^4,
            # since `deviation` stores the variance) and omitted the
            # 1/sqrt(2*pi*var) normalization, skewing class comparisons.
            score *= exp(-(diff * diff) / (2.0 * var)) / sqrt(2.0 * pi * var)
        if score > best_score:
            best_score = score
            best_class = j
    if best_class == data_test[i][n_attr]:
        cnt_correct += 1
print(cnt_correct, n_test)