多維數據的信息增益及Python實現

二、案例

日期	天氣	氣溫	溼度	是否颳風	是否外出散步
1	下雨	68	80	否	是
2	下雨	65	70	是	否
3	多雲	64	65	是	是
4	晴天	72	95	否	否
5	晴天	69	70	否	是
6	下雨	75	80	否	是
7	晴天	75	70	是	是
8	多雲	72	90	是	是
9	多雲	81	75	否	是
10	下雨	71	80	是	否

對於屬性“天氣”和“是否颳風”，將其看做離散型的數據；對於屬性“氣溫”和“溼度”，則將其看成連續型的數據。

採用前述的公式即可計算出各自屬性的信息增益。Python程序見下面代碼。

三、基於Python的實現

import numpy as np
import math
###########################################
'''在數組array中定位第一個大於data的元素位置'''
###########################################
def findLocLT( array, data ):
    """Return the index of the first element of ``array`` strictly greater than ``data``.

    Args:
        array: An indexable sequence of values comparable with ``data``.
        data: The threshold value.

    Returns:
        The position of the first element for which ``element > data`` holds.
        When no element is greater than ``data`` (including an empty
        ``array``), ``len(array)`` is returned, so that a caller splitting
        the sequence at this index gets an empty upper part instead of an
        ``UnboundLocalError``.
    """
    for position, value in enumerate( array ):
        if value > data:
            return position
    # No element exceeds the threshold: report the one-past-the-end index.
    return len( array )
###########################################
'''對數組進行排序，輸出排序結果及在原數組的索引'''
#array: the array to be sorted
#data:  the sorted array
#ind :  the index of each sorted element in the initial array
###########################################
def sortWithInd( array ):
    """Sort ``array`` in ascending order and report where each value came from.

    The input is not modified. A stable sort is used, so equal values keep
    their original relative order, matching the ordering produced by the
    previous hand-written bubble sort while running in O(n log n).

    Args:
        array: A one-dimensional sequence of mutually comparable values.

    Returns:
        A tuple ``(data, ind)`` of NumPy arrays, where ``data`` holds the
        sorted values and ``ind[i]`` is the index in the original ``array``
        of the value now stored at ``data[i]``.
    """
    data = np.copy( array )
    # kind='stable' preserves the original order of ties.
    ind = np.argsort( data, kind='stable' )
    return data[ind], ind

###########################################
'''統計數組中的不同數據及其出現次數'''
'''返回值：diffData，存儲data中出現的不同數據'''
'''       diffDataNum, 存儲不同數據的出現次數'''
###########################################
def StatDataInf( data ):
    """Collect the distinct values of ``data`` and how often each occurs.

    Values are reported in order of first appearance. The previous
    implementation marked already-counted entries with the sentinel ``'/'``,
    which silently dropped any genuine ``'/'`` value from the statistics;
    this version counts every value and makes a single pass over the data.

    Args:
        data: An iterable of hashable values.

    Returns:
        A tuple ``(diffData, diffDataNum)`` of two lists of equal length:
        ``diffData`` holds the distinct values and ``diffDataNum[i]`` is the
        number of occurrences of ``diffData[i]``.
    """
    positions = {}      # value -> its index in diffData / diffDataNum
    diffData = []
    diffDataNum = []
    for item in data:
        slot = positions.get( item )
        if slot is None:
            positions[item] = len( diffData )
            diffData.append( item )
            diffDataNum.append( 1 )
        else:
            diffDataNum[slot] += 1
    return diffData, diffDataNum

###########################################
'''計算已知數據的熵'''
###########################################
def DataEntropy( data ):
    """Return the base-2 entropy of the value distribution in ``data``.

    Each distinct value's relative frequency ``p`` contributes
    ``-p * log2(p)`` to the result.

    Args:
        data: A sequence of values; its distinct values and their counts
            are obtained from ``StatDataInf``.

    Returns:
        The accumulated entropy, or ``0`` when ``data`` has no elements.
    """
    _, occurrences = StatDataInf( data )
    total = len( data )
    entropy = 0
    for occurrence in occurrences:
        probability = occurrence / total
        if probability == 0:
            # log2(0) is undefined; a zero-probability term adds nothing.
            continue
        entropy = entropy - probability * math.log2( probability )
    return entropy

############################################
'''離散屬性的信息量'''
#attArray:   values of a discrete attribute, one per sample in the data set
#classArray: class label of each sample in the data set
#infoA:      the entropy of classArray after splitting the samples by attArray
############################################
def AttributionInfoDise( attArray, classArray ):
    """Return the class entropy that remains after splitting on a discrete attribute.

    For every distinct attribute value, the entropy of the class labels
    among the samples carrying that value is computed, and these entropies
    are summed weighted by the share of samples having that value.

    Args:
        attArray: Attribute value of each sample.
        classArray: Class label of each sample, indexed in parallel with
            ``attArray``.

    Returns:
        The weighted sum of the per-value class entropies.
    """
    attValues, attCounts = StatDataInf( attArray )
    classValues, _ = StatDataInf( classArray )
    total = len( attArray )

    weightedEntropy = 0
    for value, valueCount in zip( attValues, attCounts ):
        weight = valueCount / total
        branchEntropy = 0
        for label in classValues:
            # Number of samples with this attribute value and this class label.
            hits = sum(
                1
                for k in range( total )
                if attArray[k] == value and classArray[k] == label
            )
            ratio = hits / valueCount
            if ratio != 0:
                branchEntropy = branchEntropy - ratio * math.log2( ratio )
        weightedEntropy = weightedEntropy + weight * branchEntropy
    return weightedEntropy

############################################
'''連續屬性的信息量'''
#the attArray is divided into two parts at the average value of attArray
############################################
def AttributionInfoCont( attArray, classArray ):
    """Return the class entropy that remains after a mean split of a continuous attribute.

    The attribute values are sorted, the sorted sequence is cut at the
    index returned by ``findLocLT`` for the average of ``attArray``, and the
    two parts are labelled ``1`` and ``2``. The class labels are put in the
    same sorted order, and the resulting two-valued attribute is evaluated
    with ``AttributionInfoDise``.

    Args:
        attArray: Numeric attribute value of each sample.
        classArray: Class label of each sample, indexed in parallel with
            ``attArray``.

    Returns:
        The value computed by ``AttributionInfoDise`` for the two-part split.
    """
    sortedValues, order = sortWithInd( attArray )

    total = len( attArray )
    mean = np.average( attArray )
    split = findLocLT( sortedValues, mean )

    # Part labels in sorted order: 1 before the cut, 2 from the cut onwards.
    partLabels = [1] * split + [2] * ( total - split )
    # Class labels rearranged to follow the sorted attribute values.
    sortedClasses = [classArray[k] for k in order]

    return AttributionInfoDise( partLabels, sortedClasses )

############################################
'''計算離散屬性的信息增益'''
############################################
def GainInfoDise( attArray, classArray ):
    """Return the information gain of a discrete attribute.

    The gain is the entropy of ``classArray`` minus the value that
    ``AttributionInfoDise`` computes for the attribute.

    Args:
        attArray: Attribute value of each sample.
        classArray: Class label of each sample.

    Returns:
        ``DataEntropy(classArray) - AttributionInfoDise(attArray, classArray)``.
    """
    remaining = AttributionInfoDise( attArray, classArray )
    return DataEntropy( classArray ) - remaining

############################################
'''計算連續屬性的信息增益'''
############################################
def GainInfoCont( attArray, classArray ):
    """Return the information gain of a continuous attribute.

    The gain is the entropy of ``classArray`` minus the value that
    ``AttributionInfoCont`` computes for the attribute.

    Args:
        attArray: Numeric attribute value of each sample.
        classArray: Class label of each sample.

    Returns:
        ``DataEntropy(classArray) - AttributionInfoCont(attArray, classArray)``.
    """
    remaining = AttributionInfoCont( attArray, classArray )
    return DataEntropy( classArray ) - remaining

############################################
def main():
    """Print entropy and information-gain figures for the ten-sample example.

    The class entropy is printed first, followed by the attribute
    information and information gain of the two discrete attributes
    (outlook, windy) and the two continuous attributes (temperature,
    humidity). Every figure is shown with three significant digits.
    """
    def rounded( value ):
        # Format to three significant digits, then convert back to float for printing.
        return float( "{0:.3}".format( value ) )

    outlook = [ '下雨', '下雨', '多雲', '晴天', '晴天', '下雨', '晴天', '多雲', '多雲', '下雨' ]
    temperature = [ 68, 65, 64, 72, 69, 75, 75, 72, 81, 71 ]
    humidity = [ 80, 70, 65, 95, 70, 80, 70, 90, 75, 80 ]
    windy = [ '否', '是', '是', '否', '否', '否', '是', '是', '否', '是' ]
    playGolf = [ '是', '否', '是', '否', '是', '是', '是', '是', '是', '否' ]

    print( "infoC = ", rounded( DataEntropy( playGolf ) ) )

    # Discrete attributes.
    print( "infoA_outlook = ", rounded( AttributionInfoDise( outlook, playGolf ) ) )
    print( "gainInfo_outlook = ", rounded( GainInfoDise( outlook, playGolf ) ) )

    print( "infoA_windy = ", rounded( AttributionInfoDise( windy, playGolf ) ) )
    print( "gainInfo_windy = ", rounded( GainInfoDise( windy, playGolf ) ) )

    # Continuous attributes.
    print( "infoA_temprature = ", rounded( AttributionInfoCont( temperature, playGolf ) ) )
    print( "gainInfo_temprature = ", rounded( GainInfoCont( temperature, playGolf ) ) )

    print( "infoA_humidity = ", rounded( AttributionInfoCont( humidity, playGolf ) ) )
    print( "gainInfo_humidity = ", rounded( GainInfoCont( humidity, playGolf ) ) )
      
# Run the example only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()

作者：YangYF

多維數據的信息增益及Python實現

pyhton_基尼指數計算

ANSYS MESHING網格種類及區別

Python繪圖佈局、圖例說明、邊界

fluent動網格profile文件的編寫

BP神經網絡及其Python實現

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結