論文名稱:Watch Your Step: Learning Graph Embeddings Through Attention
這篇文章的出發點是自動化選擇網絡表示學習的參數從而適應不同網絡的需求。同時文章也證明了DeepWalk的工作其實等同於矩陣分解。
論文簡介
文章分析了之前一些工作的一個不可避免的問題,那就是模型參數的選擇問題。對於不同的網絡,模型能達到最好效果的參數是不同的,如果對於每個新出現的問題都要反覆試驗找到合適的參數,這顯然是低效的。這些參數包括DeepWalk中窗口大小w的選取,node2vec中定義的兩個跟隨機遊走有關的參數p、q。參數的選取完全影響着模型的好壞。
這些參數其實構成的是一個關於採樣節點的概率分佈。以DeepWalk中的w爲例,對於隨機遊走的每一個節點v,採樣的節點爲v的w鄰域的2w個節點,而採樣每個節點的概率就是 $\frac{1}{2w}$,這實際上就是一個採樣節點的均勻分佈。本文的思想就是將這些超參數作爲要學習的參數,這樣就能自動學習出最適合網絡的超參數。
具體要做的有以下幾點:
- 把隨機遊走中對節點的上下文採樣過程看成是對轉移概率矩陣的期望。
- 通過證明DeepWalk等同於矩陣分解,發現上述的採樣過程對應的目標是k階轉移概率矩陣。各個k階轉移矩陣前的係數就定義了一種對節點的採樣過程,不同的係數對應不同的採樣過程。引入Attention Model自動學習這些係數,從而達到最好的採樣效果。
- 用Attention Model學習出來的參數和人工調出來的參數效果差不多,進一步證明了方法的可行性。
本文的主要思想就是通過Attention Model自動化學習網絡中的參數,將網絡中的參數看成隨機遊走中對鄰居採樣的一種概率分佈,通過學習最適應網絡的這種分佈達到好的網絡表示結果。
論文思想
原文:【論文筆記】Watch Your Step
https://zhuanlan.zhihu.com/p/46935910
論文代碼
完整代碼可以查看google research:https://github.com/google-research/google-research/tree/master/graph_embedding/watch_your_step
def GetOrMakeAdjacencyMatrix():
    """Returns the dense adjacency matrix, caching it on disk as a.npy.

    If `a.npy` already exists under FLAGS.dataset_dir it is loaded and
    returned as-is. Otherwise a float32 matrix of shape (num_nodes, num_nodes)
    is built from the edge array stored in `train.txt.npy` (column 0 is used
    as the row index, column 1 as the column index), mirrored when
    IsDirected() is false, written to `a.npy`, and returned.

    Returns:
      A numpy array holding 1.0 at every training edge and 0.0 elsewhere
      (when freshly built), or whatever `a.npy` contains (when cached).
    """
    a_file = os.path.join(FLAGS.dataset_dir, 'a.npy')
    if os.path.exists(a_file):
        # `with` closes the handle deterministically instead of leaking it.
        with open(a_file, 'rb') as fin:
            return numpy.load(fin)

    num_nodes = GetNumNodes()
    a = numpy.zeros(shape=(num_nodes, num_nodes), dtype='float32')
    # NOTE(review): assumes train.txt.npy holds an integer array of shape
    # (num_edges, 2) -- confirm against the dataset writer.
    with open(os.path.join(FLAGS.dataset_dir, 'train.txt.npy'), 'rb') as fin:
        train_edges = numpy.load(fin)
    a[train_edges[:, 0], train_edges[:, 1]] = 1.0
    if not IsDirected():
        # Undirected graph: also mark the reverse direction of every edge.
        a[train_edges[:, 1], train_edges[:, 0]] = 1.0

    with open(a_file, 'wb') as fout:
        numpy.save(fout, a)
    return a
def GetPowerTransitionPairs(highest_power):
    """Collects every pair yielded by IterPowerTransitionPairs into a list.

    Args:
      highest_power: forwarded unchanged to IterPowerTransitionPairs.

    Returns:
      List of the yielded (placeholder, power_array) tuples, in yield order.
    """
    pairs = []
    pairs.extend(IterPowerTransitionPairs(highest_power))
    return pairs
def IterPowerTransitionPairs(highest_power):
    """Yields powers of transition matrix (T, T*T, T*T*T, ...).

    Each yield is a tuple (placeholder, power_array): a fresh float32
    tf.placeholder of shape (num_nodes, num_nodes) paired with the numpy array
    it should be fed with.

    Powers are cached on disk as t_<i>.npy under FLAGS.dataset_dir, where <i>
    is the power. The first power (i = 1) is not cached as it is trivially
    computed from the adjacency matrix.

    Args:
      highest_power: integer representing the highest power of the transition
        matrix. This will be the number of yields.
    """
    num_nodes = GetNumNodes()
    for i in range(highest_power):
        if i == 0:
            a = GetOrMakeAdjacencyMatrix()
            transition = a.T
            degree = transition.sum(axis=0)
            # In-place divide (a.T is a view, so `a` is rewritten too; it is
            # not used again). The small constant guards against a zero degree.
            transition /= degree + 0.0000001
            power_array = transition
        else:
            power_filename = os.path.join(
                FLAGS.dataset_dir, 't_%i.npy' % (i + 1))
            if os.path.exists(power_filename):
                # `with` closes the handle instead of leaking it.
                with open(power_filename, 'rb') as fin:
                    power_array = numpy.load(fin)
            else:
                # Announce the computation before it starts, so the message
                # is visible while the (potentially slow) product runs.
                print('Computing T^%i ...' % (i + 1))  # pylint: disable=superfluous-parens
                power_array = power_array.dot(transition)
                with open(power_filename, 'wb') as fout:
                    numpy.save(fout, power_array)
                print(' ... Saved T^%i' % (i + 1))  # pylint: disable=superfluous-parens

        placeholder = tf.placeholder(tf.float32, shape=(num_nodes, num_nodes))
        yield (placeholder, power_array)
def GetParametrizedExpectation(references):
    r"""Builds the parametrized (tensor) matrix E[D; q_1, q_2, ...].

    Which is defined as:
      E[D; q] = P_0 * (Q_1*T + Q_2*T^2 + Q_3*T^3 + ...)
    where Q_1, Q_2, ... = softmax(q_1, q_2, ...)
    and vector (q_1, q_2, ...) is created as a trainable variable initialized
    to ones, with FLAGS.transition_powers entries.

    In this implementation the weighted sum is multiplied by the row sums of
    the (unnormalized) adjacency matrix, transposed, and scaled by
    GetNumNodes() * 80.
    NOTE(review): the constant 80 is not explained in this file -- confirm
    its meaning before changing it.

    As a side effect, mean(q**2) * FLAGS.context_regularizer is registered
    through tf.losses.add_loss.

    Args:
      references: Dict that will be populated as key-value pairs:
        'combination': Python list of the individual terms Q_j * T^j, where
          T^j is the placeholder for the j-th transition power (the terms are
          stored unsummed).
        'normed': The vector Q_1, Q_2, ... (sums to 1).
        'mults': The vector q_1, q_2, ... (Before softmax, does not sum to 1).

    Returns:
      Tuple (E[D; q], feed_dict) where the first entry contains placeholders
      and feed_dict is a dictionary from those placeholders to the numpy
      arrays of the transition powers.
    """
    num_powers = FLAGS.transition_powers
    reg_coefficient = FLAGS.context_regularizer
    adjacency = GetOrMakeAdjacencyMatrix()
    # Row sums of the adjacency matrix (column sums of its transpose).
    degree = adjacency.T.sum(axis=0)

    # vector q
    mults = tf.Variable(numpy.ones(shape=(num_powers), dtype='float32'))
    # vector Q (output of softmax); softmax runs over a temporary leading axis.
    normed = tf.squeeze(tf.nn.softmax(tf.expand_dims(mults, 0)), 0)
    references['mults'] = mults
    references['normed'] = normed

    feed_dict = {}
    weighted_terms = []
    power_pairs = GetPowerTransitionPairs(num_powers)
    for k, (placeholder, power_array) in enumerate(power_pairs):
        feed_dict[placeholder] = power_array
        weighted_terms.append(normed[k] * placeholder)

    d_sum = tf.add_n(weighted_terms)
    d_sum = d_sum * degree

    tf.losses.add_loss(tf.reduce_mean(mults**2) * reg_coefficient)

    references['combination'] = weighted_terms
    return tf.transpose(d_sum) * GetNumNodes() * 80, feed_dict
# Helper function 1/3 for PercentDelta.
def GetPD(target_num_steps):
    """Returns a tensor that falls linearly with the global step.

    The value is 1 at global step 0 and 0.01 at global step
    `target_num_steps`. No clamping is applied, so it keeps decreasing past
    that point.

    Args:
      target_num_steps: number of steps at which the value reaches 0.01.
    """
    step = tf.cast(tf.train.get_or_create_global_step(), tf.float32)
    # Straight line y = m*x + c through (0, 1) and (target_num_steps, 0.01):
    #   c = 1
    #   m = (1 - 0.01) / (0 - target_num_steps) = -0.99 / target_num_steps
    scaled_step = -step * 0.99
    return scaled_step / target_num_steps + 1
# Helper function 2/3 for PercentDelta.
def PlusEpsilon(x, eps=1e-5):
    """Moves every element of x away from zero by eps, keeping its sign.

    Negative elements have eps subtracted; zero and positive elements have
    eps added.

    Args:
      x: tensor to shift.
      eps: magnitude of the shift.
    """
    is_negative = tf.cast(x < 0, tf.float32)
    shifted = x + (is_negative * -eps)
    is_non_negative = tf.cast(x >= 0, tf.float32)
    return shifted + (is_non_negative * eps)
# Helper function 3/3 for PercentDelta.
def CreateGradMultipliers(loss):
    """Returns a gradient multiplier so that SGD becomes PercentDelta.

    For every trainable variable that has a gradient with respect to `loss`,
    the multiplier is GetPD(FLAGS.max_number_of_steps) divided by the mean
    absolute value of gradient / variable (both divisions are protected by
    PlusEpsilon).

    Args:
      loss: tensor to differentiate.

    Returns:
      Dict mapping each such variable to its multiplier tensor. Variables
      whose gradient is None are left out.
    """
    trainables = tf.trainable_variables()  # tf.global_variables()
    grads = tf.gradients(loss, trainables)
    target_pd = GetPD(FLAGS.max_number_of_steps)

    multipliers = {}
    for variable, gradient in zip(trainables, grads):
        if gradient is not None:
            mean_relative_grad = tf.reduce_mean(
                tf.abs(gradient / PlusEpsilon(variable)))
            multipliers[variable] = target_pd / PlusEpsilon(mean_relative_grad)
    return multipliers
def CreateEmbeddingDictionary(side, size):
    """Creates an embedding variable with one row per node.

    The variable is initialized uniformly in [-0.1, 0.1) as float32, and
    mean(embeddings**2) * 1e-6 is registered through tf.losses.add_loss.

    Args:
      side: string prefix; the variable is named side + 'E'.
      size: number of columns (embedding dimensions).

    Returns:
      The tf.Variable of shape (GetNumNodes(), size).
    """
    initial_values = numpy.random.uniform(
        low=-0.1, high=0.1, size=(GetNumNodes(), size)).astype('float32')
    embeddings = tf.Variable(initial_values, name=side + 'E')
    l2_penalty = tf.reduce_mean(embeddings**2) * 1e-6
    tf.losses.add_loss(l2_penalty)
    return embeddings
def CreateObjective(g, target_matrix):
    """Returns the objective selected by FLAGS.objective ('nlgl' or 'rmse').

    'nlgl' (negative log likelihood): the mean over all entries of
      target_matrix * log(sigmoid(g)) + (1 - adjacency) * log(1 - sigmoid(g)),
    negated, with both log arguments passed through PlusEpsilon.
    'rmse': the mean of (g - target_matrix)**2. Note that no square root is
    taken in this branch.

    Any other value is reported through logging.fatal; if that call returns,
    the function returns None.

    Args:
      g: tensor of pairwise scores.
      target_matrix: tensor E[D; q] of the same shape as g.
    """
    objective = FLAGS.objective
    if objective == 'rmse':  # Root mean squared error
        squared_error = (g - target_matrix)**2
        return tf.reduce_mean(squared_error)

    if objective != 'nlgl':
        logging.fatal('unknown objective "%s".', FLAGS.objective)
        return None

    # Negative log likelihood.
    # target_matrix is E[D; q], which is used in the "positive part" of the
    # likelihood objective. The true adjacency is used for the "negative
    # part".
    true_adjacency = tf.Variable(
        GetOrMakeAdjacencyMatrix(), name='adjacency', trainable=False)
    logistic = tf.sigmoid(g)
    positive_part = tf.multiply(
        target_matrix, tf.log(PlusEpsilon(logistic)))
    negative_part = tf.multiply(
        1 - true_adjacency, tf.log(PlusEpsilon(1 - logistic)))
    return -tf.reduce_mean(positive_part + negative_part)
def CreateGFn(net_l, net_r):
    """Returns the matrix product of net_l with the transpose of net_r.

    Args:
      net_l: left operand tensor.
      net_r: right operand tensor; transposed before the product.
    """
    net_r_transposed = tf.transpose(net_r)
    return tf.matmul(net_l, net_r_transposed)