ALEC——《Active learning through density clustering》[Wang et al. 2017] Python實現代碼

文獻:Wang M, Min F, Zhang Z H, et al. Active learning through density clustering[J]. Expert Systems with Applications, 2017: S095741741730369X.

原文給出了java版本代碼。

以下是本人實現的、尚不完善的Python版本代碼,請大家指正。

import numpy as np
from scipy.spatial.distance import pdist,squareform
from collections import OrderedDict


def DPCA(ro, gamma, distMat, currentSet):
    """Split ``currentSet`` into two clusters with a density-peak style rule.

    The two members of ``currentSet`` with the largest ``gamma`` become the
    cluster centres (cluster 0 and cluster 1).  Every other member inherits
    the cluster of its "master": the nearest member of ``currentSet`` that
    comes earlier in the descending-density ordering.

    :param ro: per-sample density, indexable by global sample index.
    :param gamma: per-sample centre score, indexable by global sample index.
        It is used as given; it is not recomputed for the subset.
    :param distMat: square distance matrix over all samples.
    :param currentSet: global indices of the samples to split.
    :return: ``OrderedDict`` mapping ``0`` and ``1`` to integer arrays of
        global indices.  If ``currentSet`` has fewer than two members the
        first block receives the member (if any) and the rest are empty.
    """
    k = 2
    currentSet = np.array(currentSet, dtype=int)
    n = len(currentSet)
    cluster_block = OrderedDict()

    # Not enough points to pick k distinct centres: hand back what we have
    # instead of failing with an IndexError further down.
    if n < k:
        for label in range(k):
            cluster_block[label] = currentSet[label:label + 1]
        return cluster_block

    # Work in local coordinates (0..n-1); map back to global indices at the end.
    local_dist = np.asarray(distMat)[np.ix_(currentSet, currentSet)]
    by_gamma = np.flipud(np.argsort(np.asarray(gamma)[currentSet]))
    by_density = np.flipud(np.argsort(np.asarray(ro)[currentSet]))

    # master[p] = nearest point among those ranked denser than p.
    # The densest point has no master and keeps -1.
    master = np.full(n, -1, dtype=int)
    for rank in range(1, n):
        point = by_density[rank]
        denser = by_density[:rank]
        # argmin returns the first minimum, i.e. the densest of the nearest.
        master[point] = denser[np.argmin(local_dist[point, denser])]

    centers = by_gamma[:k]
    cluster_index = np.full(n, -1, dtype=int)
    cluster_index[centers] = np.arange(k)

    # Propagate labels in descending density so a master is always labelled
    # before the points that depend on it.
    for point in by_density:
        if cluster_index[point] != -1:
            continue
        if master[point] == -1:
            # The densest point is not necessarily a centre because gamma is
            # supplied by the caller.  Indexing with its master (-1) would
            # silently read the last element, so attach it to the nearest
            # centre instead.
            cluster_index[point] = np.argmin(local_dist[point, centers])
        else:
            cluster_index[point] = cluster_index[master[point]]

    for label in range(k):
        cluster_block[label] = currentSet[cluster_index == label]
    return cluster_block

def _alec_make_block(members, y_obtain, y_is_labeled):
    """Wrap ``members`` as a ``[members, is_done, pure_label]`` block record."""
    members = np.asarray(members, dtype=int)
    labeled_mask = y_is_labeled[members] == 1
    seen = set(y_obtain[members[labeled_mask]].tolist())
    pure_label = seen.pop() if len(seen) == 1 else -1
    # A block is finished only when none of its members is still unlabelled.
    return [members, bool(labeled_mask.all()), pure_label]


def _alec_split(members, ro, gamma, distMatrix, y_obtain, y_is_labeled):
    """Split a block in two with DPCA and wrap both halves as block records."""
    halves = DPCA(ro, gamma, distMatrix, members)
    return [_alec_make_block(halves[i], y_obtain, y_is_labeled)
            for i in range(2)]


def ALEC(X, y, disPercent=20, N=50, return_labels=False):
    """Active learning through density clustering (after Wang et al., 2017).

    Samples are grouped into blocks by ``DPCA``.  For each unfinished block,
    the highest-gamma unlabelled samples are queried from the oracle ``y``.
    A block whose queried labels disagree is split in two.  A block that is
    still pure after at least ``ceil(sqrt(N))`` labels has its remaining
    members classified with that label.  When the loop ends, every
    unfinished block that owns at least one label classifies its remaining
    members by majority vote.

    :param X: 2-D numpy array of samples, one row per sample.
    :param y: true labels, consulted only for queried samples.  Labels are
        stored in an integer array and ``-1`` is used as "no label".
    :param disPercent: percentile (0-100) of the sorted pairwise distances
        used as the density cut-off distance.
    :param N: maximum number of oracle queries.
    :param return_labels: when true, the obtained label vector is appended
        to the returned tuple.
    :return: ``(blocks, n_queried, n_classified, n_unlabeled)`` where
        ``blocks`` is an ``OrderedDict`` of ``[members, is_done, pure_label]``
        records; with ``return_labels=True`` a fifth element, the label
        array (``-1`` for samples that never received a label), is added.
    :raises ValueError: if ``X`` has fewer than two rows.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n = X.shape[0]
    if n < 2:
        raise ValueError("ALEC needs at least two samples")

    U_I = []                      # samples labelled by querying the oracle
    U_II = []                     # samples labelled by classification
    U_III = set(range(n))         # samples without any label yet
    y_obtain = np.full(n, -1)
    y_is_labeled = np.full(n, -1)
    sqrtN = int(np.ceil(np.sqrt(max(N, 0))))

    # ---- pairwise distances and cut-off distance -------------------------
    distList = pdist(X, metric='euclidean')
    distMatrix = squareform(distList)
    sortedDist = np.sort(distList)
    position = int(round(len(sortedDist) * disPercent / 100))
    # Clamp so that disPercent=100 (or a negative value) cannot overrun.
    position = min(max(position, 0), len(sortedDist) - 1)
    distCut = sortedDist[position]

    # ---- local density: number of other samples closer than distCut ------
    within = distMatrix < distCut
    np.fill_diagonal(within, False)
    ro = within.sum(axis=1).astype(float)

    # ---- deta: distance to the nearest sample ranked denser --------------
    ord_ro_index = np.flipud(np.argsort(ro))
    deta = np.zeros(n)
    # The densest sample has no denser neighbour; it gets its largest distance.
    deta[ord_ro_index[0]] = distMatrix[ord_ro_index[0]].max()
    for rank in range(1, n):
        point = ord_ro_index[rank]
        deta[point] = distMatrix[point, ord_ro_index[:rank]].min()
    gamma = deta * ro

    # ---- initial two blocks ----------------------------------------------
    ClusterDict = DPCA(ro, gamma, distMatrix, np.arange(n))
    SuperClusterDict = OrderedDict()
    cluster_Norm = 2
    for i in range(cluster_Norm):
        SuperClusterDict[i] = _alec_make_block(
            ClusterDict[i], y_obtain, y_is_labeled)

    # ---- main loop ---------------------------------------------------------
    while len(U_I) < N and U_III:
        progressed = False
        # Splits are collected per pass and applied afterwards, because keys
        # cannot be added to the dict while it is being iterated.  A fresh
        # container per pass prevents old splits from being applied again.
        pending_splits = OrderedDict()

        for key, block in SuperClusterDict.items():
            if block[1]:
                continue
            members = block[0]
            labeled = [int(e) for e in members if y_is_labeled[e] == 1]
            unlabeled = [int(e) for e in members if y_is_labeled[e] != 1]
            seen = set(y_obtain[np.asarray(labeled, dtype=int)].tolist())

            if len(seen) > 1:
                # Block is impure: split it.
                pending_splits[key] = _alec_split(
                    members, ro, gamma, distMatrix, y_obtain, y_is_labeled)
                progressed = True
                continue

            block[2] = next(iter(seen)) if seen else -1

            if not unlabeled:
                # Nothing left to label here.
                block[1] = True
                progressed = True
                continue

            if len(labeled) >= sqrtN:
                # Pure and well supported: classify the rest with the pure
                # label (not with the oracle's answers).
                y_obtain[unlabeled] = block[2]
                y_is_labeled[unlabeled] = 1
                U_II.extend(unlabeled)
                U_III.difference_update(unlabeled)
                block[1] = True
                progressed = True
                continue

            # Query the oracle, highest gamma first, within the budget.
            budget = min(N - len(U_I), sqrtN)
            has_label = bool(seen)
            for idx in np.flipud(np.argsort(gamma[members])):
                if budget <= 0:
                    break
                point = int(members[idx])
                if y_is_labeled[point] == 1:
                    continue
                tag = y[point]
                y_obtain[point] = tag
                y_is_labeled[point] = 1
                U_I.append(point)
                U_III.discard(point)
                budget -= 1
                progressed = True
                if not has_label:
                    block[2] = tag
                    has_label = True
                elif tag != block[2]:
                    # First disagreement: split and stop spending budget on
                    # a block that is about to be replaced.
                    pending_splits[key] = _alec_split(
                        members, ro, gamma, distMatrix,
                        y_obtain, y_is_labeled)
                    break

        for key, (first, second) in pending_splits.items():
            SuperClusterDict[key] = first
            SuperClusterDict[cluster_Norm] = second
            cluster_Norm += 1

        if not progressed:
            # No block could query, classify, finish or split; another pass
            # would be identical, so stop instead of looping forever.
            break

    # ---- majority vote for blocks that are still unfinished ---------------
    for block in SuperClusterDict.values():
        if block[1]:
            continue
        members = block[0]
        labeled = [int(e) for e in members if y_is_labeled[e] == 1]
        unlabeled = [int(e) for e in members if y_is_labeled[e] != 1]
        if not labeled:
            continue
        votes = y_obtain[labeled].tolist()
        majority_label = max(votes, key=votes.count)
        for point in unlabeled:
            y_obtain[point] = majority_label
            U_II.append(point)
            U_III.discard(point)

    result = (SuperClusterDict, len(U_I), len(U_II), len(U_III))
    if return_labels:
        return result + (y_obtain,)
    return result


if __name__ == "__main__":
    # Small hand-made demo: twenty 2-D points with binary labels.
    # Replace with real data and labels for actual use.
    demo_points = [
        [1, 2], [5, 6], [2, 8], [3, 5], [9, 4],
        [7, 8], [4, 8], [8, 25], [5, 14], [20, 8],
        [5, 2], [6, 7], [20, 1], [10, 9], [6, 9],
        [3, 9], [7, 10], [6, 14], [10, 4], [4, 10],
    ]
    demo_labels = [
        0, 0, 0, 0, 1,
        1, 1, 1, 1, 1,
        0, 0, 0, 0, 1,
        1, 1, 1, 1, 1,
    ]
    outcome = ALEC(np.array(demo_points), np.array(demo_labels),
                   disPercent=20, N=5)
    # Report the block dictionary, then the queried / classified / unlabelled
    # counts, in the order ALEC returns them.
    for caption, value in zip(("CD:", "L1:", "L2:", "L3:"), outcome):
        print(caption, value)

 

提示:此處的DPCA不是標準的密度峯值聚類算法,請勿照搬!

X,y是測試用,隨手編的,需要換成實際數據及其標籤。

也可加本人QQ:516524185討論,不限於該篇文獻~~

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章