多維數據的信息增益及Python實現

 

二、案例

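| 日期 | 天氣 | 氣溫 | 溼度 | 是否颳風 | 是否外出散步 |
| --- | --- | --- | --- | --- | --- |
| 1 | 下雨 | 68 | 80 | 否 | 是 |
| 2 | 下雨 | 65 | 70 | 是 | 否 |
| 3 | 多雲 | 64 | 65 | 是 | 是 |
| 4 | 晴天 | 72 | 95 | 否 | 否 |
| 5 | 晴天 | 69 | 70 | 否 | 是 |
| 6 | 下雨 | 75 | 80 | 否 | 是 |
| 7 | 晴天 | 75 | 70 | 是 | 是 |
| 8 | 多雲 | 72 | 90 | 是 | 是 |
| 9 | 多雲 | 81 | 75 | 否 | 是 |
| 10 | 下雨 | 71 | 80 | 是 | 否 |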

對於屬性“天氣”和“是否颳風”,將其看作離散型數據;對於屬性“氣溫”和“溼度”,將其看作連續型數據。

採用前述的公式即可計算出各自屬性的信息增益。Python程序見下面代碼。

三、基於Python的實現

import numpy as np
import math
###########################################
'''在數組array中定位第一個大於data的元素位置'''
###########################################
def findLocLT( array, data ):
    """Return the index of the first element of ``array`` greater than ``data``.

    Args:
        array: Sequence scanned from the front.
        data: Threshold each element is compared against with ``>``.

    Returns:
        The index of the first element strictly greater than ``data``.
        When no element exceeds ``data`` (including an empty ``array``),
        ``len(array)`` is returned, i.e. the position one past the end.
    """
    for i, value in enumerate( array ):
        if value > data:
            return i
    # Nothing exceeds the threshold: report "one past the end" instead of
    # falling through with an unassigned result.
    return len( array )
###########################################
'''對數組進行排序,輸出排序結果及在原數組的索引'''
#array: the array to be sorted
#data:  the sorted array
#ind :  the index of each element in the initial array
###########################################
def sortWithInd( array ):
    """Sort ``array`` ascending and report where each element came from.

    Args:
        array: One-dimensional sequence of mutually comparable values.
            It is not modified.

    Returns:
        A tuple ``(data, ind)`` of NumPy arrays: ``data`` holds the values
        in ascending order and ``ind[i]`` is the position in ``array`` of
        the value now at ``data[i]``.
    """
    values = np.asarray( array )
    # A stable sort keeps equal values in their original relative order,
    # matching an adjacent-swap sort that only swaps on strict ">".
    ind = np.argsort( values, kind="stable" )
    # Fancy indexing builds a new array, so the caller's input is untouched.
    data = values[ ind ]
    return data, ind

###########################################
'''統計數組中的不同數據及其出現次數'''
'''返回值:diffData,存儲data中出現的不同數據'''
'''       diffDataNum, 存儲不同數據的出現次數'''
###########################################
def StatDataInf( data ):
    """Collect the distinct values of ``data`` and how often each occurs.

    Args:
        data: Iterable of hashable values (labels or numbers).

    Returns:
        A tuple ``(diffData, diffDataNum)`` of two lists of equal length:
        ``diffData`` lists the distinct values in order of first
        appearance and ``diffDataNum[i]`` is the number of occurrences of
        ``diffData[i]``.
    """
    # A dict keeps insertion order, so keys come out in first-seen order.
    # Counting this way needs no in-band "already counted" marker, so every
    # value (including the string '/') is tallied like any other.
    counts = {}
    for item in data:
        counts[ item ] = counts.get( item, 0 ) + 1
    diffData = list( counts )
    diffDataNum = list( counts.values() )
    return diffData, diffDataNum

###########################################
'''計算已知數據的熵'''
###########################################
def DataEntropy( data ):
    """Return the base-2 entropy of the value distribution in ``data``.

    Args:
        data: Sequence of values; occurrence counts come from
            ``StatDataInf``.

    Returns:
        The sum of ``-p * log2(p)`` over the relative frequency ``p`` of
        each distinct value; ``0`` when ``data`` is empty.
    """
    _, occurrences = StatDataInf( data )
    total = len( data )
    entropyVal = 0
    for occurrence in occurrences:
        share = occurrence / total
        # log2(0) is undefined, so zero-probability terms are skipped.
        if share == 0:
            continue
        entropyVal = entropyVal - share * math.log2( share )
    return entropyVal

############################################
'''離散屬性的信息量'''
#attArray:   discrete attribute in the data set
#classArray: classify information of data set
#infoA:      the attArray's entropy according to classArray
############################################
def AttributionInfoDise( attArray, classArray ):
    """Return the class entropy remaining after splitting on a discrete attribute.

    Args:
        attArray: Attribute value of each sample.
        classArray: Class label of each sample, aligned with ``attArray``
            by index.

    Returns:
        For each distinct attribute value, the entropy of the class labels
        among the samples carrying that value, weighted by the fraction of
        samples carrying it, summed over all attribute values. ``0`` when
        ``attArray`` is empty.
    """
    attValues, attCounts = StatDataInf( attArray )
    classValues, _ = StatDataInf( classArray )
    sampleCount = len( attArray )

    infoA = 0
    for attValue, attCount in zip( attValues, attCounts ):
        branchEntropy = 0
        for classValue in classValues:
            # Number of samples with this attribute value AND this class.
            hits = sum(
                1
                for k in range( sampleCount )
                if attArray[k] == attValue and classArray[k] == classValue
            )
            ratio = hits / attCount
            # Empty attribute/class combinations contribute nothing
            # (and log2(0) is undefined).
            if ratio != 0:
                branchEntropy = branchEntropy - ratio * math.log2( ratio )
        infoA = infoA + ( attCount / sampleCount ) * branchEntropy
    return infoA

############################################
'''連續屬性的信息量'''
#the attArray is divided into two parts by the average value of attArray
############################################
def AttributionInfoCont( attArray, classArray ):
    """Return the class entropy remaining after a mean split of a continuous attribute.

    The samples are sorted by attribute value and cut into two groups:
    values less than or equal to the mean of ``attArray`` and values
    greater than it. The two groups are then scored as a two-valued
    discrete attribute with ``AttributionInfoDise``.

    Args:
        attArray: Numeric attribute value of each sample.
        classArray: Class label of each sample, aligned with ``attArray``
            by index.

    Returns:
        The weighted class entropy of the two groups; ``0`` when
        ``attArray`` is empty.
    """
    length = len( attArray )
    if length == 0:
        # No samples: nothing to average or split.
        return 0

    sortedAtt, indArray = sortWithInd( attArray )
    ave = np.average( attArray )

    # Number of sorted values <= mean, i.e. the index of the first value
    # strictly above the mean. When no value lies above the mean (for
    # example all values are equal) this is ``length``, so every sample
    # lands in the first group instead of the split failing.
    ind = int( np.searchsorted( sortedAtt, ave, side="right" ) )

    # Group label per sorted sample: 1 = at or below the mean, 2 = above it.
    newAtt = [ 1 ] * ind + [ 2 ] * ( length - ind )
    # Reorder the class labels the same way the attribute values were sorted.
    newClassArray = [ classArray[ k ] for k in indArray ]

    return AttributionInfoDise( newAtt, newClassArray )

############################################
'''計算離散屬性的信息增益'''
############################################
def GainInfoDise( attArray, classArray ):
    """Return the information gain of a discrete attribute.

    Args:
        attArray: Attribute value of each sample.
        classArray: Class label of each sample.

    Returns:
        ``DataEntropy(classArray)`` minus
        ``AttributionInfoDise(attArray, classArray)``.
    """
    conditional = AttributionInfoDise( attArray, classArray )
    baseline = DataEntropy( classArray )
    return baseline - conditional

############################################
'''計算連續屬性的信息增益'''
############################################
def GainInfoCont( attArray, classArray ):
    """Return the information gain of a continuous attribute.

    Args:
        attArray: Numeric attribute value of each sample.
        classArray: Class label of each sample.

    Returns:
        ``DataEntropy(classArray)`` minus
        ``AttributionInfoCont(attArray, classArray)``.
    """
    conditional = AttributionInfoCont( attArray, classArray )
    baseline = DataEntropy( classArray )
    return baseline - conditional

############################################
def main():
    """Print entropy figures for the ten-sample example data set.

    Prints the entropy of the class labels, then for each of the four
    attributes its remaining class entropy and its information gain.
    Every figure is rounded to three significant digits before printing.
    """
    outlook = [ '下雨', '下雨', '多雲', '晴天', '晴天', '下雨', '晴天', '多雲', '多雲', '下雨' ]
    temprature = [68, 65, 64, 72, 69, 75, 75, 72, 81, 71 ]
    humidity = [ 80, 70, 65, 95, 70, 80, 70, 90, 75, 80 ]
    windy = [ '否', '是', '是', '否', '否', '否', '是', '是', '否', '是' ]
    playGolf = [ '是', '否', '是', '否', '是', '是', '是', '是', '是', '否']

    def rounded( value ):
        # Format to three significant digits, then back to a float for printing.
        return float( "{0:.3}".format( value ) )

    print( "infoC = ", rounded( DataEntropy( playGolf ) ) )

    # One row per attribute: the two labels to print, the attribute column,
    # and the matching (discrete or continuous) pair of scoring functions.
    reports = (
        ( "infoA_outlook = ", "gainInfo_outlook = ", outlook, AttributionInfoDise, GainInfoDise ),
        ( "infoA_windy = ", "gainInfo_windy = ", windy, AttributionInfoDise, GainInfoDise ),
        ( "infoA_temprature = ", "gainInfo_temprature = ", temprature, AttributionInfoCont, GainInfoCont ),
        ( "infoA_humidity = ", "gainInfo_humidity = ", humidity, AttributionInfoCont, GainInfoCont ),
    )
    for infoLabel, gainLabel, column, infoFunc, gainFunc in reports:
        print( infoLabel, rounded( infoFunc( column, playGolf ) ) )
        print( gainLabel, rounded( gainFunc( column, playGolf ) ) )


# Run the demo only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

作者:YangYF

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章