二、案例
| 日期 | 天氣 | 氣溫 | 溼度 | 是否颳風 | 是否外出散步 |
| --- | --- | --- | --- | --- | --- |
| 1 | 下雨 | 68 | 80 | 否 | 是 |
| 2 | 下雨 | 65 | 70 | 是 | 否 |
| 3 | 多雲 | 64 | 65 | 是 | 是 |
| 4 | 晴天 | 72 | 95 | 否 | 否 |
| 5 | 晴天 | 69 | 70 | 否 | 是 |
| 6 | 下雨 | 75 | 80 | 否 | 是 |
| 7 | 晴天 | 75 | 70 | 是 | 是 |
| 8 | 多雲 | 72 | 90 | 是 | 是 |
| 9 | 多雲 | 81 | 75 | 否 | 是 |
| 10 | 下雨 | 71 | 80 | 是 | 否 |
對於屬性“天氣”和“是否颳風”,將其看作離散型數據;對於屬性“氣溫”和“溼度”,將其看作連續型數據(以該屬性的平均值為分割點,把樣本分成兩部分)。
採用前述的公式即可計算出各自屬性的信息增益。Python程序見下面代碼。
三、基於Python的實現
import numpy as np
import math
###########################################
'''在數組array中定位第一個大於data的元素位置'''
###########################################
def findLocLT(array, data):
    """Return the index of the first element of ``array`` greater than ``data``.

    The scan runs left to right, so ``array`` does not have to be sorted.

    Args:
        array: Sequence whose elements can be compared with ``data``.
        data: Threshold value.

    Returns:
        The index of the first element strictly greater than ``data``, or
        ``len(array)`` when no element qualifies (including an empty
        ``array``).
    """
    for i, value in enumerate(array):
        if value > data:
            return i
    # Nothing exceeds the threshold: report the one-past-the-end position
    # rather than failing on an unassigned local, so a caller that uses the
    # result as a split point simply gets an empty upper part.
    return len(array)
###########################################
'''對數組進行排序,輸出排序結果及在原數組的索引'''
#array: the array to be sorted
#data: the sorted array
#ind : the index of each sorted element in the initial array
###########################################
def sortWithInd(array):
    """Sort a one-dimensional sequence ascending and report the original positions.

    Args:
        array: One-dimensional sequence (list or NumPy array) of mutually
            comparable values. It is not modified.

    Returns:
        A ``(data, ind)`` pair of NumPy arrays. ``data`` holds the values in
        ascending order and ``ind[i]`` is the position in ``array`` that
        ``data[i]`` came from. Equal values keep their original relative
        order.
    """
    values = np.array(array)
    # A stable sort keeps ties in input order, which is what an adjacent-swap
    # sort that only swaps on a strict ">" produces, while running in
    # O(n log n) instead of O(n^2).
    ind = np.argsort(values, kind="stable")
    return values[ind], ind
###########################################
'''統計數組中的不同數據及其出現次數'''
'''返回值:diffData,存儲data中出現的不同數據'''
''' diffDataNum, 存儲不同數據的出現次數'''
###########################################
def StatDataInf(data):
    """Collect the distinct values of ``data`` and how often each occurs.

    Values are matched with ``==``, so elements do not need to be hashable.

    Args:
        data: Iterable of values to tally.

    Returns:
        A ``(diffData, diffDataNum)`` pair of lists: ``diffData`` holds each
        distinct value in order of first appearance and ``diffDataNum[i]`` is
        the number of occurrences of ``diffData[i]``.
    """
    diffData = []
    diffDataNum = []
    for item in data:
        for pos, seen in enumerate(diffData):
            if seen == item:
                diffDataNum[pos] += 1
                break
        else:
            # First occurrence. Tracking seen values in a separate list (not
            # by overwriting entries with a marker such as '/') means any
            # value, including '/', is counted correctly.
            diffData.append(item)
            diffDataNum.append(1)
    return diffData, diffDataNum
###########################################
'''計算已知數據的熵'''
###########################################
def DataEntropy(data):
    """Return the base-2 entropy of the value distribution in ``data``.

    Args:
        data: Sized collection of values; the distinct values and their
            counts are obtained from ``StatDataInf``.

    Returns:
        The sum of ``-p * log2(p)`` over the distinct values, where ``p`` is
        the fraction of ``data`` taken by that value; 0 when ``data`` is
        empty.
    """
    _, occurrences = StatDataInf(data)
    total = len(data)
    entropy = 0
    for occurrence in occurrences:
        share = occurrence / total
        if share == 0:
            # log2(0) is undefined; a zero share contributes nothing.
            continue
        entropy = entropy - share * math.log2(share)
    return entropy
############################################
'''離散屬性的信息量'''
#attArray: values of a discrete attribute, one per sample in the data set
#classArray: class label of each sample in the data set
#infoA: the entropy of classArray after splitting the samples by attArray
############################################
def AttributionInfoDise(attArray, classArray):
    """Return the weighted class entropy after splitting on a discrete attribute.

    For each distinct attribute value, the base-2 entropy of the class labels
    among the samples carrying that value is computed; these entropies are
    summed, each weighted by the fraction of samples with that value.

    Args:
        attArray: Attribute value of each sample.
        classArray: Class label of each sample, indexed in step with
            ``attArray``.

    Returns:
        The weighted entropy; 0 when ``attArray`` is empty.
    """
    att_values, att_counts = StatDataInf(attArray)
    class_values, _ = StatDataInf(classArray)
    sample_total = len(attArray)

    weighted_entropy = 0
    for att_value, att_count in zip(att_values, att_counts):
        branch_entropy = 0
        for class_value in class_values:
            # Number of samples that have this attribute value AND this class.
            joint = sum(
                1
                for k in range(sample_total)
                if attArray[k] == att_value and classArray[k] == class_value
            )
            ratio = joint / att_count
            if ratio != 0:
                branch_entropy = branch_entropy - ratio * math.log2(ratio)
        weight = att_count / sample_total
        weighted_entropy = weighted_entropy + weight * branch_entropy
    return weighted_entropy
############################################
'''連續屬性的信息量'''
#the attArray is to be divided into two parts by the average value of attArray
############################################
def AttributionInfoCont(attArray, classArray):
    """Return the weighted class entropy after a mean split of a numeric attribute.

    The attribute values are sorted, the first sorted position whose value is
    greater than the average of ``attArray`` becomes the split point, and the
    samples before / from that point are labelled 1 / 2. The result is
    ``AttributionInfoDise`` applied to those labels and the class labels
    rearranged into the same sorted order.

    Args:
        attArray: Numeric attribute value of each sample.
        classArray: Class label of each sample, indexed in step with
            ``attArray``.

    Returns:
        The value returned by ``AttributionInfoDise`` for the two-way split.
    """
    sorted_values, order = sortWithInd(attArray)
    total = len(attArray)
    mean_value = np.average(attArray)
    split_at = findLocLT(sorted_values, mean_value)

    # Label 1 for the part before the split point, label 2 for the rest.
    bins = [1] * split_at + [2] * (total - split_at)
    # Bring the class labels into the same (sorted-by-attribute) order.
    reordered_classes = [classArray[k] for k in order]
    return AttributionInfoDise(bins, reordered_classes)
############################################
'''計算離散屬性的信息增益'''
############################################
def GainInfoDise(attArray, classArray):
    """Return the information gain of a discrete attribute.

    Args:
        attArray: Attribute value of each sample.
        classArray: Class label of each sample.

    Returns:
        ``DataEntropy(classArray)`` minus
        ``AttributionInfoDise(attArray, classArray)``.
    """
    conditional = AttributionInfoDise(attArray, classArray)
    return DataEntropy(classArray) - conditional
############################################
'''計算連續屬性的信息增益'''
############################################
def GainInfoCont(attArray, classArray):
    """Return the information gain of a continuous attribute.

    Args:
        attArray: Numeric attribute value of each sample.
        classArray: Class label of each sample.

    Returns:
        ``DataEntropy(classArray)`` minus
        ``AttributionInfoCont(attArray, classArray)``.
    """
    conditional = AttributionInfoCont(attArray, classArray)
    return DataEntropy(classArray) - conditional
############################################
def main():
    """Print the entropy and per-attribute information gain of the sample data.

    The ten-sample data set is hard-coded. The class entropy is printed
    first, then for each attribute the value from the matching
    ``AttributionInfo*`` function followed by the matching ``GainInfo*``
    value, all shown with three significant digits.
    """
    def three_sig(value):
        # Format to three significant digits, then parse back so that
        # print() shows an ordinary float.
        return float("{0:.3}".format(value))

    outlook = ['下雨', '下雨', '多雲', '晴天', '晴天', '下雨', '晴天', '多雲', '多雲', '下雨']
    temprature = [68, 65, 64, 72, 69, 75, 75, 72, 81, 71]
    humidity = [80, 70, 65, 95, 70, 80, 70, 90, 75, 80]
    windy = ['否', '是', '是', '否', '否', '否', '是', '是', '否', '是']
    playGolf = ['是', '否', '是', '否', '是', '是', '是', '是', '是', '否']

    print("infoC = ", three_sig(DataEntropy(playGolf)))

    # (info label, gain label, info function, gain function, attribute values)
    report = (
        ("infoA_outlook = ", "gainInfo_outlook = ",
         AttributionInfoDise, GainInfoDise, outlook),
        ("infoA_windy = ", "gainInfo_windy = ",
         AttributionInfoDise, GainInfoDise, windy),
        ("infoA_temprature = ", "gainInfo_temprature = ",
         AttributionInfoCont, GainInfoCont, temprature),
        ("infoA_humidity = ", "gainInfo_humidity = ",
         AttributionInfoCont, GainInfoCont, humidity),
    )
    for info_label, gain_label, info_fn, gain_fn, values in report:
        print(info_label, three_sig(info_fn(values, playGolf)))
        print(gain_label, three_sig(gain_fn(values, playGolf)))


if __name__ == '__main__':
    main()
作者:YangYF