# 文獻:Wang M, Min F, Zhang Z H, et al. Active learning through density clustering[J]. Expert Systems with Applications, 2017: S095741741730369X.
# 原文給出了 Java 版本的代碼。
# 以下是本人實現的、尚不完善的 Python 版本代碼,請大家指正。
import numpy as np
from scipy.spatial.distance import pdist,squareform
from collections import OrderedDict
def DPCA(ro, gamma, distMat, currentSet):
    '''
    Bisect the samples listed in ``currentSet`` into two clusters.

    The two members with the largest ``gamma`` become the cluster centers
    (cluster 0 and cluster 1).  Every other member inherits the cluster of its
    "master": the nearest member that ranks higher in density (``ro``).
    ``gamma`` is used exactly as passed in; it is not recomputed for the
    subset.

    :param ro: local density of every sample, indexed by global sample index
    :param gamma: center score of every sample, indexed by global sample index
    :param distMat: square pairwise distance matrix over all samples
    :param currentSet: global indices of the samples to split
    :return: OrderedDict with keys 0 and 1, each mapping to an int array of
             the global indices assigned to that cluster (ascending position
             within ``currentSet``); a cluster may be empty when
             ``currentSet`` has fewer than two members
    '''
    k = 2  # one call always produces (at most) two clusters
    currentSet = np.array(currentSet, dtype=int)
    distMat = np.asarray(distMat)
    n = len(currentSet)

    # Everything below works with *local* positions 0..n-1 into currentSet;
    # currentSet[...] maps them back to global indices for distMat lookups.
    by_gamma = np.flipud(np.argsort(np.asarray(gamma)[currentSet]))
    by_density = np.flipud(np.argsort(np.asarray(ro)[currentSet]))

    # master[p]: nearest member among those ranked denser than p (-1 for the
    # densest member, which has nobody above it).  argmin returns the first
    # minimum, i.e. the densest candidate wins ties.
    master = np.full(n, -1, dtype=int)
    for rank in range(1, n):
        point = by_density[rank]
        denser = by_density[:rank]
        gaps = distMat[currentSet[point], currentSet[denser]]
        master[point] = denser[np.argmin(gaps)]

    # The top-gamma members are the centers; slicing keeps this safe for n < k.
    cluster_index = np.full(n, -1, dtype=int)
    centers = by_gamma[:k]
    for label, center in enumerate(centers):
        cluster_index[center] = label

    # The densest member has no master.  If it is not itself a center, attach
    # it to the nearest center; otherwise it (and everything that inherits
    # from it) would never receive a cluster and would be silently dropped.
    if n > 0:
        densest = by_density[0]
        if cluster_index[densest] == -1:
            to_centers = distMat[currentSet[densest], currentSet[centers]]
            cluster_index[densest] = int(np.argmin(to_centers))

    # Propagate labels in decreasing density so that a member's master is
    # always resolved before the member itself.
    for point in by_density:
        if cluster_index[point] == -1:
            cluster_index[point] = cluster_index[master[point]]

    # Map local positions back to global sample indices.
    cluster_block = OrderedDict()
    for label in range(k):
        cluster_block[label] = currentSet[np.flatnonzero(cluster_index == label)]
    return cluster_block
def _split_block(ro, gamma, distMatrix, members, y_is_labeled):
    '''
    Bisect ``members`` with DPCA and wrap both halves as block entries.

    :return: list of two entries ``[member_indices, finished, pure_label]``;
             ``pure_label`` starts as -1 (unknown)
    '''
    halves = DPCA(ro, gamma, distMatrix, members)
    entries = []
    for half in (halves[0], halves[1]):
        # A half is finished only when none of its samples is still unlabeled.
        finished = not np.any(y_is_labeled[half] == -1)
        entries.append([half, finished, -1])
    return entries


def ALEC(X, y, disPercent=20, N=50):
    '''
    Active learning through density clustering.

    Samples are bisected into blocks with DPCA.  For each unfinished block the
    highest-gamma unlabeled samples are queried from the oracle; a block whose
    queried labels disagree is split again, and a block that stays pure after
    at least ceil(sqrt(N)) labels has its remaining samples classified with
    that pure label.  When the budget is spent, the rest of every unfinished
    block is labeled by majority vote of its known labels.

    :param X: 2-D numpy array of samples, one row per sample
    :param y: true labels, used as the oracle answering queries; -1 is
              reserved internally for "unknown", so labels are assumed to be
              non-negative integers -- TODO confirm against callers
    :param disPercent: percentile (0-100) of the sorted pairwise distances
                       used as the density cut-off distance
    :param N: labeling budget, i.e. the maximum number of oracle queries
    :return: tuple ``(SuperClusterDict, len(U_I), len(U_II), len(U_III))``:
             the block dictionary (values are
             ``[member_indices, finished, pure_label]``), the number of
             queried samples, the number of classified samples and the number
             of samples left unlabeled
    :raises ValueError: if X holds fewer than two samples
    '''
    n, _ = X.shape
    if n < 2:
        raise ValueError("ALEC needs at least two samples")

    U_I = []                  # indices labeled by querying the oracle
    U_II = []                 # indices labeled by classification / voting
    U_III = list(range(n))    # indices that are still unlabeled
    y_obtain = np.full(n, -1)       # labels obtained so far (-1 = none)
    y_is_labeled = np.full(n, -1)   # 1 once a sample has a label, else -1
    sqrtN = np.ceil(np.sqrt(N))     # per-block query quota / purity threshold

    # ---- pairwise distances and cut-off distance ----
    distList = pdist(X, metric='euclidean')
    distMatrix = squareform(distList)
    distList = np.sort(distList)
    position = int(round(len(distList) * disPercent / 100))
    # Clamp so that disPercent == 100 (or a negative value) stays in range.
    position = min(max(position, 0), len(distList) - 1)
    distCut = distList[position]

    # ---- local density: number of other samples closer than distCut ----
    within_cut = distMatrix < distCut
    np.fill_diagonal(within_cut, False)  # a sample is not its own neighbor
    ro = within_cut.sum(axis=1).astype(float)
    ord_ro_index = np.flipud(np.argsort(ro))

    # ---- deta: distance to the nearest sample ranked denser ----
    deta = np.zeros(n)
    densest = ord_ro_index[0]
    # The densest sample has nobody above it; it gets its largest distance.
    deta[densest] = distMatrix[densest].max()
    for rank in range(1, n):
        sample = ord_ro_index[rank]
        deta[sample] = distMatrix[sample, ord_ro_index[:rank]].min()
    gamma = deta * ro

    # ---- initial bisection of the whole data set ----
    ClusterDict = DPCA(ro, gamma, distMatrix, np.arange(n))
    SuperClusterDict = OrderedDict()
    cluster_Norm = 2  # next unused block key
    for i in range(cluster_Norm):
        # entry layout: [member indices, finished flag, pure label (-1 = unknown)]
        SuperClusterDict[i] = [ClusterDict[i], False, -1]

    # ---- main loop: query, classify or split until the budget is spent ----
    while len(U_I) < N and U_III:
        pending_splits = OrderedDict()  # rebuilt every pass: no stale splits
        progressed = False

        for key, subblock in SuperClusterDict.items():
            if subblock[1]:
                continue  # block already fully labeled or classified

            members = subblock[0]
            block_Labeled = [e for e in members if y_is_labeled[e] == 1]
            block_Unlabeled = [e for e in members if y_is_labeled[e] != 1]
            if not block_Unlabeled:
                subblock[1] = True  # nothing left to do in this block
                continue

            labels = set(y_obtain[block_Labeled])
            if len(labels) > 1:
                # Impure block: split it (applied after this pass).
                pending_splits[key] = _split_block(
                    ro, gamma, distMatrix, members, y_is_labeled)
                progressed = True
                continue

            # Pure (or still empty) block.
            subblock[2] = next(iter(labels)) if labels else -1

            if len(block_Labeled) >= sqrtN:
                # Enough consistent evidence: classify the rest with the
                # block's pure label (NOT with the oracle's true labels).
                y_obtain[block_Unlabeled] = subblock[2]
                U_II.extend(block_Unlabeled)
                for e in block_Unlabeled:
                    U_III.remove(e)
                    y_is_labeled[e] = 1
                subblock[1] = True
                progressed = True
                continue

            # Otherwise keep querying, most representative (highest gamma)
            # samples first, within this block's share of the budget.
            budget = min(N - len(U_I), sqrtN)
            for pos in np.flipud(np.argsort(gamma[members])):
                if budget <= 0:
                    break
                sample = members[pos]
                if y_is_labeled[sample] == 1:
                    continue
                budget -= 1
                tag = y[sample]
                y_obtain[sample] = tag
                y_is_labeled[sample] = 1
                U_I.append(sample)
                U_III.remove(sample)
                progressed = True
                if subblock[2] == -1:
                    subblock[2] = tag  # first label seen in this block
                elif tag != subblock[2]:
                    # Conflicting label: the block is impure, split it and
                    # stop spending budget on it.
                    pending_splits[key] = _split_block(
                        ro, gamma, distMatrix, members, y_is_labeled)
                    break

        # Apply the splits only after the pass so the dict is not modified
        # while it is being iterated.
        for key, (first_half, second_half) in pending_splits.items():
            SuperClusterDict[key] = first_half
            SuperClusterDict[cluster_Norm] = second_half
            cluster_Norm += 1

        if not progressed:
            break  # nothing changed in a full pass; avoid spinning forever

    # ---- budget spent: majority vote inside every unfinished block ----
    for entry in SuperClusterDict.values():
        if entry[1]:
            continue
        block_Labeled = [e for e in entry[0] if y_is_labeled[e] == 1]
        block_Unlabeled = [e for e in entry[0] if y_is_labeled[e] != 1]
        yList = list(y_obtain[block_Labeled])
        if yList:
            majority_label = max(yList, key=yList.count)
            for ele in block_Unlabeled:
                y_obtain[ele] = majority_label
                U_II.append(ele)
                U_III.remove(ele)

    return SuperClusterDict, len(U_I), len(U_II), len(U_III)
if __name__ == "__main__":
    # Small hand-made 2-D data set with binary labels, for a smoke run only.
    X = np.array([
        [1, 2], [5, 6], [2, 8], [3, 5], [9, 4],
        [7, 8], [4, 8], [8, 25], [5, 14], [20, 8],
        [5, 2], [6, 7], [20, 1], [10, 9], [6, 9],
        [3, 9], [7, 10], [6, 14], [10, 4], [4, 10],
    ])
    y = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])

    # Run ALEC with a budget of 5 queries and report the block dictionary
    # plus the sizes of the queried / classified / unlabeled sets.
    CD, L1, L2, L3 = ALEC(X, y, disPercent=20, N=5)
    for caption, value in (("CD:", CD), ("L1:", L1), ("L2:", L2), ("L3:", L3)):
        print(caption, value)
# 提示:此處的 DPCA 並非標準的密度峯值聚類算法,請勿照搬!
# X、y 僅供測試,是隨手編的,使用時需要換成實際數據及其標籤。
# 也可加本人 QQ:516524185 討論,不限於該篇文獻~~