Background
有監督的多模態檢索(supervised multi-modal retrieval)中,常用 label 構造相似矩陣 S。
樣本集 $X=\{x_i\}_{i=1}^n$,對應標籤集 $L=\{l_i\}_{i=1}^n$。對任意兩個樣本 $x_i$ 和 $x_j$,若 $l_i$ 和 $l_j$ 至少有一個共同標籤(即 $l_i \cap l_j \neq \emptyset$),則認爲它們相似。相似矩陣 $S$ 定義爲:$S_{ij} = \begin{cases} 1, & l_i \cap l_j \neq \emptyset \\ 0, & \text{otherwise} \end{cases}$
Problem
在 supervised 設置下,一般基於 label 信息計算相似矩陣。想做的是在 UNsupervised 設置下,用一些奇技淫巧構造相似矩陣 S 之後,反求樣本 label(使求得的 label 能算出同樣的 S),後續用 label 搞事。
Idea 1: Learn-to-Label
像學 hash code 一樣,將 label 當成參數,用網絡學出來,約束就用 DLFH[1] 那條,或見 DCMH[2]。
code
from tensorflow import ConfigProto, Session
from keras.backend.tensorflow_backend import set_session
# Configure TensorFlow to allocate GPU memory on demand rather than
# grabbing all of it up front, then install this session into Keras.
config = ConfigProto()
config.gpu_options.allow_growth = True
set_session(Session(config=config))
import os
import argparse
from time import time
import numpy as np
import keras
import keras.backend as K
from keras.callbacks import Callback
from keras.models import Model
from keras.layers import Dense, Dropout, Activation, Input, Lambda
# Seed from wall-clock time: runs are deliberately non-reproducible.
np.random.seed(int(time()))
parser = argparse.ArgumentParser()
parser.add_argument('--EPOCH', type=int, default=100)  # number of training epochs
parser.add_argument('--BATCH', type=int, default=64)   # batch size (also the similarity-input width)
parser.add_argument('--SRC', type=str, default='data')  # directory holding the .npy data files
opt = parser.parse_args()
print(opt)
def load_data(fname):
    """Load the array stored as ``<opt.SRC>/<fname>.npy``."""
    path = os.path.join(opt.SRC, f'{fname}.npy')
    return np.load(path)
# Features and labels for the train / validation splits.
# Assumes each array is 2-D: (num_samples, feature_or_label_dim) — TODO confirm.
I_train = load_data('I_train')  # image features, training split
I_val = load_data('I_val')      # image features, validation split
L_train = load_data('L_train')  # multi-hot labels, training split
L_val = load_data('L_val')      # multi-hot labels, validation split
EPOCH = opt.EPOCH
BATCH = opt.BATCH
N_CLASS = L_train.shape[-1]  # number of label classes = learned "label" width
DIM_IMG = I_train.shape[-1]  # image feature dimensionality
@K.tf.custom_gradient
def Htanh(x):  # hard tanh
    """Sign activation with a hard-tanh (straight-through) surrogate gradient.

    Forward: K.sign(x), i.e. values in {-1, 0, +1}.
    Backward: the incoming gradient passes through unchanged where
    -1 <= x <= 1 and is zeroed elsewhere (the hard-tanh linear region).
    """
    def grad(dy):
        # Only propagate gradient inside the linear region of hard tanh.
        cond = (x >= -1) & (x <= 1)
        zeros = K.zeros_like(dy)
        return K.tf.where(cond, dy, zeros)
    return K.sign(x), grad
def gen_data(which='train', bat_sz=None):
    """Infinite batch generator over a data split.

    Args:
        which: 'train' or 'test' split selector.
        bat_sz: batch size; defaults to the global BATCH.
            (Original code used ``bat_sz=BATCH``, evaluated once at def time;
            the None sentinel is behaviorally identical here.)

    Yields:
        ([images, similarity_submatrix], labels) where the similarity
        submatrix is the batch-vs-batch slice of the full label-derived
        similarity matrix S.

    Raises:
        ValueError: if `which` is neither 'train' nor 'test'
            (original code fell through with Img/Lab unbound).
    """
    if which == "train":
        Img, Lab = I_train, L_train
    elif which == "test":
        Img, Lab = I_val, L_val
    else:
        raise ValueError(f"which must be 'train' or 'test', got {which!r}")
    if bat_sz is None:
        bat_sz = BATCH
    num = Lab.shape[0]
    # Two samples are similar iff they share at least one label.
    S = (np.dot(Lab, Lab.T) > 0).astype(np.float32)
    while True:
        # NOTE: sampling with replacement — a batch may contain duplicates.
        idx = np.random.choice(num, bat_sz)
        # FIX: the loss pairs batch samples against each other, so the
        # similarity input must be the (bat_sz, bat_sz) submatrix;
        # the original S[idx] produced a (bat_sz, num) slice, which does
        # not match Input([BATCH]) — TODO confirm against training run.
        im, sim, lb = Img[idx], S[idx][:, idx], Lab[idx]
        yield [im, sim], lb
# network: a single Dense layer followed by a sign activation (Htanh),
# producing a binary(+/-1) "label" vector of length N_CLASS per image.
in_img = Input([DIM_IMG], name='image')
# One row of the batch similarity matrix per sample; consumed by the loss.
in_sim = Input([BATCH], name='similarity_matrix')
x = in_img
x = Dense(N_CLASS)(x)
x = Lambda(Htanh)(x)  # outputs in {-1, +1} (0 only where the pre-activation is exactly 0)
l = x
# Training model takes the similarity input so the custom loss can close over it.
m_train = Model([in_img, in_sim], l, name='train')
# Inference model: image features -> predicted label vector.
clf = Model(in_img, l, name='classifier')
def struct(y_true, y_pred):
    """Negative log-likelihood of pairwise similarity (DLFH-style).

    `y_true` is ignored; supervision comes from the module-level `in_sim`
    input tensor (the similarity matrix slice for the current batch).
    """
    s_hat = 2. * in_sim - 1.  # map {0, 1} similarities to {-1, +1}
    theta = 0.5 * K.dot(y_pred, K.transpose(y_pred))
    # Likelihood of each pair: sigmoid(theta) when similar,
    # 1 - sigmoid(theta) when dissimilar; 1e-9 guards log(0).
    likelihood = 0.5 * (1. - s_hat) + s_hat * K.sigmoid(theta) + 1e-9
    return K.sum(-K.log(likelihood))
# Ground-truth similarity over the training set: 1 iff two samples share a label.
# np.int was deprecated in NumPy 1.20 and removed in 1.24 — use the builtin int.
S_val = (np.dot(L_train, L_train.T) > 0).astype(int)
tot = L_train.shape[0] ** 2  # total number of ordered sample pairs
def test():
    """Report how well the predicted labels reproduce the true similarity matrix.

    Prints the count of truly-similar pairs, predicted-similar pairs, the
    total pair count, and the mean per-sample difference in label counts.
    Uses the module-level `clf`, `I_train`, `L_train`, `S_val`, `tot`.
    """
    l = clf.predict(I_train)
    # np.int was removed in modern NumPy; builtin int is equivalent here.
    sv = (np.dot(l, l.T) > 0).astype(int)
    print('S_val:', np.sum(S_val))  # number of truly similar pairs
    print('sv:', np.sum(sv))        # number of predicted similar pairs
    print('total:', tot)            # total number of pairs
    # NOTE(review): l holds +/-1 values (sign outputs), so this sum is
    # (#+1) - (#-1), not a plain count of ones — TODO confirm intent.
    l_sum = np.sum(l, axis=-1)
    L_sum = np.sum(L_train, axis=-1)  # number of 1s per true label vector
    print('mean discrepancy:', np.mean(np.abs(l_sum - L_sum)))
class moniter(Callback):
    """Keras callback: report reconstruction quality periodically.

    Calls the module-level ``test()`` on the first epoch and then on
    every tenth epoch thereafter.
    """

    def on_epoch_end(self, epoch, logs=None):
        is_report_epoch = epoch == 0 or epoch % 10 == 9
        if is_report_epoch:
            print(epoch, "> "*10)
            test()
# Train with the structural (similarity) loss; the true labels are only
# consumed by the monitor callback, never by the loss itself.
m_train.compile('adam', loss=struct)
gen_train = gen_data('train')
gen_test = gen_data('test')
# NOTE(review): fit_generator is the legacy Keras API; newer Keras uses fit().
m_train.fit_generator(gen_train,
                      steps_per_epoch=I_train.shape[0]//BATCH,
                      epochs=EPOCH,
                      callbacks=[moniter()],
                      validation_data=gen_test,
                      verbose=0,
                      validation_steps=I_val.shape[0]//BATCH)
print('--- after ---')
# test()
# Compare the learned label vectors against the true labels directly.
# NOTE(review): clf outputs +/-1 while L_train is 0/1, so binary_crossentropy
# and binary_accuracy are questionable here — confirm this is intended.
clf.compile('adam', loss='binary_crossentropy', metrics=['binary_accuracy'])
print(clf.evaluate(I_train, L_train))
Idea 2
通過一個確定的過程構造 label。將相似矩陣 S 看成一幅無向圖,貪心地染色(打標籤)。
sample
code
from os.path import join
from scipy.spatial.distance import cdist
import numpy as np
def load_data(fname):
    """Read the array stored as ``data/<fname>.npy``."""
    npy_path = join('data', f'{fname}.npy')
    return np.load(npy_path)
# L_train = load_data('L_train')
# L_val = load_data('L_val')
# L_ret = load_data('L_ret')
def construct_label(S):
    """Greedily construct multi-hot labels that reproduce a similarity matrix.

    Treats S as the adjacency matrix of an undirected graph and greedily
    covers its edges with cliques; each clique becomes one label class.
    The returned labels L satisfy (L @ L.T > 0) == S.

    Args:
        S: (N, N) symmetric 0/1 similarity matrix with S[i][i] == 1.

    Returns:
        (N, class_id) 0/1 array; label[x][c] == 1 iff sample x is in class c.
    """
    N = S.shape[0]  # number of samples
    # Edges (similarity relations) already covered by some class.
    # np.int was removed in modern NumPy; builtin int is equivalent.
    vis_edge = np.identity(N, dtype=int)
    class_set = {}  # class_set[c]: set of sample ids belonging to class c
    class_id = 0    # number of classes created so far
    # Isolated points (similar only to themselves) must still receive a
    # label: clear their diagonal so the main loop creates a singleton class.
    s_sum = np.sum(S, axis=-1)
    for i in range(N):
        if s_sum[i] == 1:
            vis_edge[i][i] = 0
    for now in range(N):
        # Neighbours of `now` reachable through not-yet-covered edges.
        neighbour = set()
        for v in range(N):
            if S[now][v] == 1 and vis_edge[now][v] == 0:
                neighbour.add(v)
        while len(neighbour) > 0:
            v = neighbour.pop()
            if vis_edge[now][v] == 1:  # edge already covered
                continue
            vis_edge[now][v] = vis_edge[v][now] = 1
            elem = {now, v}  # members of the class being built
            for u in neighbour:
                # A further neighbour of `now` may join the class only if
                # it is adjacent to every current member (clique condition).
                can_in = True
                for e in elem:
                    if S[u][e] != 1:
                        can_in = False
                        break
                if can_in:
                    elem.add(u)
                    for e in elem:
                        vis_edge[u][e] = vis_edge[e][u] = 1
            class_set[class_id] = elem
            class_id += 1
            neighbour = neighbour - elem
    label = np.zeros((N, class_id))
    for lb in class_set:
        for x in class_set[lb]:
            label[x][lb] = 1
    # Sanity check: every similarity relation should now be covered.
    for i in range(S.shape[0]):
        for j in range(S.shape[1]):
            if S[i][j] != vis_edge[i][j]:
                print(f's & vis_edge diff: ({i}, {j})')
    return label
if __name__ == '__main__':
    # Corresponds to the sample figure above.
    # Hand-written labels for the sample: column k encodes label k+1.
    lab = np.array([
        [1, 1, 0, 0, 0, 0, 0, 0, 0],  # (1, 2)
        [1, 0, 1, 1, 0, 0, 0, 0, 0],  # (1, 3, 4)
        [0, 0, 1, 0, 1, 0, 0, 0, 0],  # (3, 5)
        [0, 0, 0, 0, 1, 0, 0, 0, 0],  # (5)
        [0, 0, 0, 0, 1, 1, 0, 0, 0],  # (5, 6)
        [0, 0, 0, 0, 0, 0, 1, 0, 0],  # (7)
        [0, 0, 0, 1, 0, 1, 1, 1, 0],  # (4, 6, 7, 8)  FIX: entry was a stray 6
        [0, 0, 0, 0, 0, 0, 0, 1, 0],  # (8)
        [0, 1, 0, 0, 0, 0, 0, 1, 0],  # (2, 8)  FIX: comment said (2, 4), data encodes labels 2 and 8
        [0, 0, 0, 0, 0, 0, 0, 0, 1]   # (9), isolated point
    ])
    # S computed from the hand-written labels.
    # np.int was removed in modern NumPy; builtin int is equivalent.
    sim = (np.dot(lab, lab.T) > 0).astype(int)
    # Hand-written S, for cross-checking against the computed one.
    sim_my = np.array([
        [1, 1, 0, 0, 0, 0, 0, 0, 1, 0],
        [1, 1, 1, 0, 0, 0, 1, 0, 0, 0],
        [0, 1, 1, 1, 1, 0, 0, 0, 0, 0],
        [0, 0, 1, 1, 1, 0, 0, 0, 0, 0],
        [0, 0, 1, 1, 1, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 1, 1, 0, 0, 0],
        [0, 1, 0, 0, 1, 1, 1, 1, 1, 0],
        [0, 0, 0, 0, 0, 0, 1, 1, 1, 0],
        [1, 0, 0, 0, 0, 0, 1, 1, 1, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
    ])
    ## Verify the hand-written S matches the computed S.
    # for r in range(sim_my.shape[0]):
    #     for c in range(sim_my.shape[1]):
    #         if sim_my[r][c] != sim[r][c]:
    #             print(f'my sim diff: ({r}, {c})')
    # print('my sim finish')
    # print(sim - np.identity(sim.shape[0]))
    lab_hat = construct_label(sim)
    sim_hat = (np.dot(lab_hat, lab_hat.T) > 0).astype(int)
    print(lab.shape, lab_hat.shape, '\n', lab_hat)
    print(sim.shape, sim_hat.shape, '\n', sim_hat)
    # Differences between the original S and the reconstructed S'.
    for r in range(sim_hat.shape[0]):
        for c in range(sim_hat.shape[1]):
            if sim_hat[r][c] != sim[r][c]:
                print(f'sim diff: ({r}, {c})')
    print('sim finish')
    # Differences between the original L and the constructed L'.
    # NOTE(review): if lab_hat has more columns than lab, lab[r][c] raises
    # IndexError here — this only works when the class counts match.
    for r in range(lab_hat.shape[0]):
        for c in range(lab_hat.shape[1]):
            if lab_hat[r][c] != lab[r][c]:
                print(f'lab diff: ({r}, {c})')
    print('lab finish')
Discussion
第一種效果好像不太好。
第二種好像可以,在我構造的 sample 上雖然構造出的 label 同原來的標號有點對不上,但交換下順序可以同原來的一樣(同構)。
但當用 flickr-25k test set(2k 個 sample)的 label 測試時,發現構造的 label 雖然可以算出原來的 S,但 label 長了很多,即構造了更多的類別:原本只有 24 個,而構造的有 3412 個。而且用 training set 同用 test set 構造的 label 維度不同…顯然這種構造法只是個弟弟。
而且第二種是在無向圖下測試的,即 S 對稱,在 supervised 下用 label 算出來的 S 可以滿足,但 unsupervised 下構造的 S 可能是非對稱的,如 [3],這時不知道還能不能用。
Future Work
- 更優的構造算法,使得構造的 class 不會多得那麼誇張(最好可以同真 label 一樣長,甚至可以惟一對應回原來的標籤,就像在我上文構造的那個簡單 sample 那樣)
- 用於非對稱 S 的構造法