# Borrow a small dataset from a Python machine-learning example
import numpy as np
from collections import Counter
from math import log
def CreateDataSet():
    # NumPy coerces the mixed columns to a common dtype, so every entry
    # is stored as a string: '1', '0', 'yes', 'no'.
    dataset = np.array([[1, 1, 'yes'],
                        [1, 1, 'yes'],
                        [1, 0, 'no'],
                        [0, 1, 'no'],
                        [0, 1, 'no']])
    return dataset
'''Entropy: H(X) = sum(-p(x) * log p(x)); math.log is the natural log, so all results are in nats'''
def cal_entropy(dataset, m):  # m is the index of the column whose entropy we compute
    feature_cnt = Counter(dataset[:, m])  # frequency counting with Counter; a plain dict accumulator would work too
    length = len(dataset)
    res = sum([-(v / length) * log(v / length) for v in feature_cnt.values()])
    return res
datasets = CreateDataSet()  # the label column isn't special here; it is treated as just another column
print(cal_entropy(datasets, 2))
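'''Sanity check, a sketch assuming scipy is installed: scipy.stats.entropy normalizes raw counts and uses the natural log by default, so it should reproduce the value above'''
from scipy.stats import entropy as scipy_entropy
assert abs(scipy_entropy([2, 3]) - cal_entropy(datasets, 2)) < 1e-12  # [2, 3] = counts of 'yes'/'no' in column 2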
'''Joint entropy: H(X, Y) = sum(-p(x, y) * log p(x, y)), computed by joining two feature columns into pair keys'''
def cal_union_entropy(dataset, m, n):  # joint entropy of columns m and n
    col1 = dataset[:, m]
    col2 = dataset[:, n]
    # concatenate the two string entries so each (x, y) pair becomes a single key
    feature_cnt = Counter([str(col1[i]) + col2[i] for i in range(len(col1))])
    length = len(dataset)
    res = sum([-(v / length) * log(v / length) for v in feature_cnt.values()])
    return res
print(cal_union_entropy(datasets, 1, 2))
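'''Aside, a sketch: zipping the columns into tuple keys is a more robust pairing than string concatenation (which can collide, e.g. '1' + '1yes' vs '11' + 'yes') and gives the same joint entropy here'''
pair_cnt = Counter(zip(datasets[:, 1], datasets[:, 2]))  # counts: ('1','yes'): 2, ('1','no'): 2, ('0','no'): 1
h_joint = sum(-(v / len(datasets)) * log(v / len(datasets)) for v in pair_cnt.values())
assert abs(h_joint - cal_union_entropy(datasets, 1, 2)) < 1e-12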
'''Conditional entropy: H(X|Y) = H(X, Y) - H(Y), the uncertainty remaining in X once Y is known'''
def cal_condition_entropy(dataset, m, n):  # entropy of column m conditioned on column n
    return cal_union_entropy(dataset, m, n) - cal_entropy(dataset, n)
print(cal_condition_entropy(datasets, 1, 2))
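'''Verification sketch with a hypothetical helper (not in the original code): compute the conditional entropy directly as sum over y of p(y) * H(X | Y=y); it matches the chain-rule value above'''
def cal_condition_entropy_direct(dataset, m, n):
    length = len(dataset)
    res = 0.0
    for y, cnt in Counter(dataset[:, n]).items():
        subset = dataset[dataset[:, n] == y]  # rows where column n takes value y
        res += (cnt / length) * cal_entropy(subset, m)
    return res
assert abs(cal_condition_entropy_direct(datasets, 1, 2) - cal_condition_entropy(datasets, 1, 2)) < 1e-12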
'''Cross entropy: H(p, q) = sum(-p * log q)'''
def cal_cross_entropy(dataset, m):
    feature_cnt = Counter(dataset[:, m])  # frequency counting with Counter; a plain dict accumulator would work too
    length = len(dataset)
    prob = [0.5, 0.5]  # a simulated q distribution
    value = list(feature_cnt.values())
    # NOTE: pairing Counter's value order with prob's index order is fragile;
    # it works here only because q is uniform, so the pairing doesn't matter.
    res = sum([-(value[i] / length) * log(prob[i]) for i in range(len(value))])
    return res
print(cal_cross_entropy(datasets, 2))
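'''Aside: with a uniform q over two outcomes, H(p, q) = -sum(p_i * log(0.5)) = log(2) for any p, which is why the value above is exactly ln 2 ≈ 0.6931'''
assert abs(cal_cross_entropy(datasets, 2) - log(2)) < 1e-12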
'''Relative entropy (KL divergence): KL(p || q) = sum(-p * log(q / p)) = sum(p * log(p / q))'''
def cal_relative_entropy(dataset, m):  # KL divergence of column m's empirical distribution p from the simulated q
    feature_cnt = Counter(dataset[:, m])  # frequency counting with Counter; a plain dict accumulator would work too
    length = len(dataset)
    prob = [0.5, 0.5]  # the same simulated q distribution
    value = list(feature_cnt.values())
    res = sum([-(value[i] / length) * log(prob[i] * length / value[i]) for i in range(len(value))])
    return res
print(cal_relative_entropy(datasets, 2))
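'''Identity check, a sketch: cross entropy decomposes as H(p, q) = H(p) + KL(p || q), i.e. 0.6931... = 0.6730... + 0.0201...'''
assert abs(cal_cross_entropy(datasets, 2) - (cal_entropy(datasets, 2) + cal_relative_entropy(datasets, 2))) < 1e-12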
The print statements output, respectively:
0.6730116670092565
1.0549201679861442
0.38190850097688767
0.6931471805599453
0.020135513550688836