Graph Attention Network (GAT) Code Analysis (Keras Version)

This post walks through the Keras implementation of GAT.
GAT paper: https://arxiv.org/abs/1710.10903; reading it alongside superbrother's Zhihu write-up is recommended.
TensorFlow version: https://github.com/PetarV-/GAT
Source code on GitHub: https://github.com/danielegrattarola/keras-gat

1 utils.py

    utils.py defines data loading, preprocessing, and matrix normalization.

from __future__ import print_function

import os
import pickle as pkl
import sys

import networkx as nx
import numpy as np
import scipy.sparse as sp


def parse_index_file(filename):
    """Parse index file."""
    index = []
    for line in open(filename):
        index.append(int(line.strip()))
    return index


def sample_mask(idx, l):
    """Create mask."""
    mask = np.zeros(l)
    mask[idx] = 1
    return np.array(mask, dtype=bool)  # note: np.bool was removed in newer NumPy; plain bool is equivalent
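
# A quick illustration of sample_mask (hypothetical REPL session, not part of the original file):
# >>> sample_mask([0, 2], 5)
# array([ True, False,  True, False, False])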


def load_data(dataset_str):
    """Load data."""
    """
    Loads input data from gcn/data directory

    ind.dataset_str.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training instances
        (a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.y => the one-hot labels of the labeled training instances as numpy.ndarray object;
    ind.dataset_str.ty => the one-hot labels of the test instances as numpy.ndarray object;
    ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object;
    ind.dataset_str.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict
        object;
    ind.dataset_str.test.index => the indices of test instances in graph, for the inductive setting as list object.

    All objects above must be saved using python pickle module.

    :param dataset_str: Dataset name
    :return: All data input files loaded (as well the training/test data).
    """
    """
    Reads one of the three datasets shipped in the data directory ('cora', 'citeseer', 'pubmed');
    each dataset consists of 8 files.
    'x'    : feature vectors of the labeled training instances,
    'tx'   : feature vectors of the test instances,
    'allx' : feature vectors of both labeled and unlabeled training instances (a superset of 'x'),
    'y'    : labels of the training instances, 'ty' : labels of the test instances,
    'ally' : labels of the instances in 'allx',
    'test.index' : indices of the test instances,
    'graph': the graph as a dict of adjacency lists.
    """
    # Absolute path of this .py file
    FILE_PATH = os.path.abspath(__file__)
    # Directory containing this .py file
    DIR_PATH = os.path.dirname(FILE_PATH)
    # Absolute path of the data/ folder
    DATA_PATH = os.path.join(DIR_PATH, 'data/')

    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for i in range(len(names)):
        with open("{}ind.{}.{}".format(DATA_PATH, dataset_str, names[i]), 'rb') as f:
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))
    # x.shape: (140, 1433); y.shape: (140, 7); tx.shape: (1000, 1433); ty.shape: (1000, 7);
    # allx.shape: (1708, 1433); ally.shape: (1708, 7)
    x, y, tx, ty, allx, ally, graph = tuple(objects)
    # Training set
    # print(x[0][0], x.shape, type(x))  ## x is a sparse matrix: 140 instances, each with a 1433-dim feature vector -> (140, 1433)
    # print(y[0], y.shape)   ## y holds one-hot labels over 7 classes for the 140 training instances -> (140, 7)

    ## Test set
    # print(tx[0][0], tx.shape, type(tx))  ## tx is a sparse matrix: 1000 instances, each with a 1433-dim feature vector -> (1000, 1433)
    # print(ty[0], ty.shape)   ## ty holds one-hot labels over 7 classes for the 1000 test instances -> (1000, 7)

    ## allx and ally follow the same layout
    # print(allx[0][0], allx.shape, type(allx))  ## allx is a sparse matrix: 1708 instances, each with a 1433-dim feature vector -> (1708, 1433)
    # print(ally[0], ally.shape)   ## ally holds one-hot labels over 7 classes for the 1708 instances -> (1708, 7)

    ## graph is a dict; the full graph has 2708 nodes
    # for i in graph:
    #     print(i, graph[i])

    # Indices of the test instances, in shuffled order
    test_idx_reorder = parse_index_file("{}ind.{}.test.index".format(DATA_PATH, dataset_str))
    # print(test_idx_reorder)
    # [2488, 2644, 3261, 2804, 3176, 2432, 3310, 2410, 2812,...]
    
    # The same indices sorted ascending, e.g. [1708, 1709, 1710, ...]
    test_idx_range = np.sort(test_idx_reorder)

    if dataset_str == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(min(test_idx_reorder),
                                    max(test_idx_reorder) + 1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range - min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range - min(test_idx_range), :] = ty
        ty = ty_extended

    # Stack allx and tx into one feature matrix in LIL format, i.e. the whole graph is fed in at once
    features = sp.vstack((allx, tx)).tolil()
    # Restore the original row order so the features line up with the adjacency matrix;
    # the test rows were shuffled, and without this re-alignment nodes would be paired with the wrong features.
    features[test_idx_reorder, :] = features[test_idx_range, :]
    # print("features.shape:",features.shape)
    # features.shape: (2708, 1433)
    
    # The adjacency matrix is built from the graph dict as a SciPy sparse matrix with shape (2708, 2708)
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))
    
    # labels.shape:(2708, 7)
    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]
    
    # len(list(idx_train)) + len(list(idx_val)) + len(idx_test) = 140 + 500 + 1000 = 1640
    idx_test = test_idx_range.tolist()
    # print(idx_test)
    # [1708, 1709, 1710, 1711, 1712, 1713,...,2705, 2706, 2707]
    # print(len(idx_test))
    # 1000
    
    idx_train = range(len(y))
    # print(idx_train)
    # range(0, 140)
    
    idx_val = range(len(y), len(y) + 500)
    # print(idx_val,len(idx_val))
    # range(140, 640) 500
    
    # Train mask: True over idx_train = [0, 140), False elsewhere
    train_mask = sample_mask(idx_train, labels.shape[0])
    # print(train_mask, train_mask.shape)
    # [ True  True  True ... False False False]  # shape: (2708,)
    
    # Validation mask: True over idx_val = [140, 640), False elsewhere
    val_mask = sample_mask(idx_val, labels.shape[0])
    # Test mask: True over idx_test = [1708, 2707], False elsewhere
    test_mask = sample_mask(idx_test, labels.shape[0])

    y_train = np.zeros(labels.shape)
    y_val = np.zeros(labels.shape)
    y_test = np.zeros(labels.shape)
    # print(y_train.shape," ",y_test.shape," ",y_val.shape)
    # (2708, 7)(2708, 7)(2708, 7)
    
    # Copy the labels into the True positions; all other rows stay zero
    y_train[train_mask, :] = labels[train_mask, :]
    y_val[val_mask, :] = labels[val_mask, :]
    y_test[test_mask, :] = labels[test_mask, :]

    return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask

# Feature preprocessing: row-normalize the feature matrix, i.e. divide every element of a row
# by that row's sum so that each row sums to 1, and return the result as a dense matrix.
# This ties in with spectral graph convolution theory: the goal is to capture both a node's own
# features and its neighbors' features while avoiding imbalances caused by differing node degrees.
def preprocess_features(features):
    """Row-normalize feature matrix and convert to tuple representation"""
    print("preprocess_features")
    # >>> b = [[1.0, 3], [2, 4], [3, 5]]
    # >>> b = np.array(b)
    # >>> b
    # array([[1., 3.],
    #        [2., 4.],
    #        [3., 5.]])
    # >>> np.array(b.sum(1))
    # array([4., 6., 8.])
    # >>> c = np.array(b.sum(1))
    # >>> np.power(c, -1)
    # array([0.25, 0.16666667, 0.125])
    # >>> np.power(c, -1).flatten()
    # array([0.25, 0.16666667, 0.125])
    # >>> r_inv = np.power(c, -1).flatten()
    # >>> import scipy.sparse as sp
    # >>> r_mat_inv = sp.diags(r_inv)
    # >>> r_mat_inv
    # <3x3 sparse matrix of type '<class 'numpy.float64'>'
    #     with 3 stored elements (1 diagonals) in DIAgonal format>
    # >>> r_mat_inv.toarray()
    # array([[0.25, 0., 0.],
    #        [0., 0.16666667, 0.],
    #        [0., 0., 0.125]])
    # >>> f = r_mat_inv.dot(b)
    # >>> f
    # array([[0.25, 0.75],
    #        [0.33333333, 0.66666667],
    #        [0.375, 0.625]])

    # a.sum() sums all elements; a.sum(axis=0) sums each column; a.sum(axis=1) sums each row
    rowsum = np.array(features.sum(1))
    r_inv = np.power(rowsum, -1).flatten()  # flatten() collapses the result to 1-D; it works on numpy arrays/matrices, not on plain Python lists
    # print("r_inv:", r_inv)
    # r_inv: [0.11111111 0.04347826 0.05263158 ... 0.05555556 0.07142857 0.07692308]
    # np.isinf(ndarray) returns a boolean array marking the infinite entries
    r_inv[np.isinf(r_inv)] = 0.  # rows summing to 0 produce inf under power(-1); zero those entries out
    r_mat_inv = sp.diags(r_inv)  # diagonal matrix with r_inv on the diagonal
    features = r_mat_inv.dot(features)
    return features.todense()
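
To make the shapes concrete, here is a minimal usage sketch (a hypothetical smoke test, assuming the Cora files are present under data/; the shapes match the comments above):

if __name__ == '__main__':
    # Hypothetical smoke test, not part of the original file
    adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data('cora')
    features = preprocess_features(features)
    print(adj.shape)         # (2708, 2708)
    print(features.shape)    # (2708, 1433)
    print(y_train.shape)     # (2708, 7)
    print(train_mask.sum())  # 140 labeled training nodes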

2 graph_attention_layer.py

    graph_attention_layer.py defines the GraphAttention layer, i.e. the GAT model itself.
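
Before diving into the code, it helps to restate the equations the layer implements (Eqs. 1 and 3-6 of the paper, in the paper's own notation; this is just a restatement, not new material):

    e_{ij} = \mathrm{LeakyReLU}\big(\mathbf{a}^T [\mathbf{W}\vec h_i \,\|\, \mathbf{W}\vec h_j]\big)
    \alpha_{ij} = \frac{\exp(e_{ij})}{\sum_{k \in \mathcal{N}_i} \exp(e_{ik})}
    \vec h_i' = \sigma\Big(\textstyle\sum_{j \in \mathcal{N}_i} \alpha_{ij} \mathbf{W} \vec h_j\Big)  % single head, Eq. 4
    \vec h_i' = \big\Vert_{k=1}^{K} \sigma\Big(\textstyle\sum_{j \in \mathcal{N}_i} \alpha_{ij}^{k} \mathbf{W}^{k} \vec h_j\Big)  % 'concat' reduction, Eq. 5
    \vec h_i' = \sigma\Big(\tfrac{1}{K}\textstyle\sum_{k=1}^{K} \sum_{j \in \mathcal{N}_i} \alpha_{ij}^{k} \mathbf{W}^{k} \vec h_j\Big)  % 'average' reduction, Eq. 6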

from __future__ import absolute_import

from keras import activations, constraints, initializers, regularizers
from keras import backend as K
from keras.layers import Layer, Dropout, LeakyReLU


class GraphAttention(Layer):

    def __init__(self,
                 F_,
                 attn_heads=1,
                 attn_heads_reduction='concat',  # {'concat', 'average'}
                 dropout_rate=0.5,
                 activation='relu',
                 use_bias=True,
                 kernel_initializer='glorot_uniform',
                 bias_initializer='zeros',
                 attn_kernel_initializer='glorot_uniform',
                 kernel_regularizer=None,
                 bias_regularizer=None,
                 attn_kernel_regularizer=None,
                 activity_regularizer=None,
                 kernel_constraint=None,
                 bias_constraint=None,
                 attn_kernel_constraint=None,
                 **kwargs):
        if attn_heads_reduction not in {'concat', 'average'}:
            raise ValueError('Possible reduction methods: concat, average')

        self.F_ = F_  # Number of output features (F' in the paper)
        self.attn_heads = attn_heads  # Number of attention heads (K in the paper)
        self.attn_heads_reduction = attn_heads_reduction  # Eq. 5 and 6 in the paper
        self.dropout_rate = dropout_rate  # Internal dropout rate
        self.activation = activations.get(activation)  # Eq. 4 in the paper
        self.use_bias = use_bias

        self.kernel_initializer = initializers.get(kernel_initializer)
        self.bias_initializer = initializers.get(bias_initializer)
        self.attn_kernel_initializer = initializers.get(attn_kernel_initializer)

        self.kernel_regularizer = regularizers.get(kernel_regularizer)
        self.bias_regularizer = regularizers.get(bias_regularizer)
        self.attn_kernel_regularizer = regularizers.get(attn_kernel_regularizer)
        self.activity_regularizer = regularizers.get(activity_regularizer)

        self.kernel_constraint = constraints.get(kernel_constraint)
        self.bias_constraint = constraints.get(bias_constraint)
        self.attn_kernel_constraint = constraints.get(attn_kernel_constraint)
        self.supports_masking = False

        # Populated by build()
        self.kernels = []       # Layer kernels for attention heads
        self.biases = []        # Layer biases for attention heads
        self.attn_kernels = []  # Attention kernels for attention heads

        if attn_heads_reduction == 'concat':
            # Output will have shape (..., K * F')
            self.output_dim = self.F_ * self.attn_heads
        else:
            # Output will have shape (..., F')
            self.output_dim = self.F_

        super(GraphAttention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) >= 2
        F = input_shape[0][-1]

        # Initialize weights for each attention head
        for head in range(self.attn_heads):
            # Layer kernel
            kernel = self.add_weight(shape=(F, self.F_),
                                     initializer=self.kernel_initializer,
                                     regularizer=self.kernel_regularizer,
                                     constraint=self.kernel_constraint,
                                     name='kernel_{}'.format(head))
            self.kernels.append(kernel)

            # Layer bias
            if self.use_bias:
                bias = self.add_weight(shape=(self.F_, ),
                                       initializer=self.bias_initializer,
                                       regularizer=self.bias_regularizer,
                                       constraint=self.bias_constraint,
                                       name='bias_{}'.format(head))
                self.biases.append(bias)

            # Attention kernels
            attn_kernel_self = self.add_weight(shape=(self.F_, 1),
                                               initializer=self.attn_kernel_initializer,
                                               regularizer=self.attn_kernel_regularizer,
                                               constraint=self.attn_kernel_constraint,
                                               name='attn_kernel_self_{}'.format(head),)
            attn_kernel_neighs = self.add_weight(shape=(self.F_, 1),
                                                 initializer=self.attn_kernel_initializer,
                                                 regularizer=self.attn_kernel_regularizer,
                                                 constraint=self.attn_kernel_constraint,
                                                 name='attn_kernel_neigh_{}'.format(head))
            self.attn_kernels.append([attn_kernel_self, attn_kernel_neighs])
        self.built = True

    def call(self, inputs):
        X = inputs[0]  # Node features (N x F)
        A = inputs[1]  # Adjacency matrix (N x N)

        outputs = []
        for head in range(self.attn_heads):
            kernel = self.kernels[head]  # W in the paper (F x F')
            attention_kernel = self.attn_kernels[head]  # Attention kernel a in the paper (2F' x 1)

            # Compute inputs to attention network
            features = K.dot(X, kernel)  # (N x F')

            # Compute feature combinations
            # Note: [[a_1], [a_2]]^T [[Wh_i], [Wh_j]] = [a_1]^T [Wh_i] + [a_2]^T [Wh_j]
            attn_for_self = K.dot(features, attention_kernel[0])    # (N x 1), [a_1]^T [Wh_i]
            attn_for_neighs = K.dot(features, attention_kernel[1])  # (N x 1), [a_2]^T [Wh_j]

            # Attention head a(Wh_i, Wh_j) = a^T [[Wh_i], [Wh_j]]
            dense = attn_for_self + K.transpose(attn_for_neighs)  # (N x N) via broadcasting
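            # Concretely: attn_for_self is (N x 1) and K.transpose(attn_for_neighs) is (1 x N),
            # so their broadcast sum is an (N x N) matrix whose (i, j) entry equals
            # a_1^T Wh_i + a_2^T Wh_j, i.e. the raw attention score e_ij for every node pair at once.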

            # Add nonlinearity
            dense = LeakyReLU(alpha=0.2)(dense)

            # Mask values before activation (Vaswani et al., 2017)
            mask = -10e9 * (1.0 - A)
            dense += mask
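            # -10e9 is a very large negative number (-1e10): entries where A_ij = 0 are pushed
            # toward -inf, so the softmax below assigns non-neighbors a coefficient of ~0,
            # while entries with A_ij = 1 are left effectively unchanged.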

            # Apply softmax to get attention coefficients
            dense = K.softmax(dense)  # (N x N)

            # Apply dropout to features and attention coefficients
            dropout_attn = Dropout(self.dropout_rate)(dense)  # (N x N)
            dropout_feat = Dropout(self.dropout_rate)(features)  # (N x F')

            # Linear combination with neighbors' features
            node_features = K.dot(dropout_attn, dropout_feat)  # (N x F')

            if self.use_bias:
                node_features = K.bias_add(node_features, self.biases[head])

            # Add output of attention head to final output
            outputs.append(node_features)

        # Aggregate the heads' output according to the reduction method
        if self.attn_heads_reduction == 'concat':
            output = K.concatenate(outputs)  # (N x KF')
        else:
            output = K.mean(K.stack(outputs), axis=0)  # (N x F')

        output = self.activation(output)
        return output

    def compute_output_shape(self, input_shape):
        output_shape = input_shape[0][0], self.output_dim
        return output_shape
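
As a sanity check, the layer can be exercised on a tiny random graph (a hypothetical sketch with made-up shapes, not part of the original file):

import numpy as np
from keras.layers import Input
from keras.models import Model

N, F = 5, 3  # toy graph: 5 nodes with 3 input features each
X_in = Input(shape=(F,))
A_in = Input(shape=(N,))
out = GraphAttention(F_=2, attn_heads=2, attn_heads_reduction='concat')([X_in, A_in])
model = Model(inputs=[X_in, A_in], outputs=out)

X = np.random.rand(N, F)
A = np.eye(N)  # self-loops only, just enough to exercise the attention mask
print(model.predict([X, A], batch_size=N).shape)  # (5, 4), i.e. (N, K * F')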

3 gat.py

    gat.py loads the data, builds the two-layer GAT, and trains and evaluates it.

from __future__ import division

import numpy as np
from keras.callbacks import EarlyStopping, TensorBoard, ModelCheckpoint
from keras.layers import Input, Dropout
from keras.models import Model
from keras.optimizers import Adam
from keras.regularizers import l2

from graph_attention_layer import GraphAttention
from utils import load_data, preprocess_features

# Read data
A, X, Y_train, Y_val, Y_test, idx_train, idx_val, idx_test = load_data('citeseer')

# Parameters
N = X.shape[0]                # Number of nodes in the graph
F = X.shape[1]                # Original feature dimension
n_classes = Y_train.shape[1]  # Number of classes
F_ = 8                        # Output size of first GraphAttention layer
n_attn_heads = 8              # Number of attention heads in first GAT layer
"""
Transductive learning For the transductive learning tasks, we apply a two-layer GAT model. Its
architectural hyperparameters have been optimized on the Cora dataset and are then reused for Citeseer.
The first layer consists of K = 8 attention heads computing F0 = 8 features each (for a total
of 64 features), followed by an exponential linear unit (ELU) (Clevert et al., 2016) nonlinearity. The
second layer is used for classification: a single attention head that computes C features (where C
is the number of classes), followed by a softmax activation. For coping with the small training set
sizes, regularization is liberally applied within the model. During training, we apply L2 regularization
with  = 0:0005. Furthermore, dropout (Srivastava et al., 2014) with p = 0:6 is applied to
both layers’ inputs, as well as to the normalized attention coefficients (critically, this means that at
each training iteration, each node is exposed to a stochastically sampled neighborhood). Similarly
as observed by Monti et al. (2016), we found that Pubmed’s training set size (60 examples) required
slight changes to the GAT architecture: we have applied K = 8 output attention heads (instead of
one), and strengthened the L2 regularization to  = 0:001. Otherwise, the architecture matches the
one used for Cora and Citeseer.
"""
dropout_rate = 0.6            # Dropout rate (between and inside GAT layers)
l2_reg = 5e-4/2               # Factor for l2 regularization
learning_rate = 5e-3          # Learning rate for Adam
epochs = 10000                # Number of training epochs
es_patience = 100             # Patience for early stopping

# Preprocessing operations
X = preprocess_features(X)
A = A + np.eye(A.shape[0])  # Add self-loops
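# With self-loops on the diagonal, the mask inside GraphAttention keeps the a_ii entries,
# so every node also attends to its own features.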

# Model definition (as per Section 3.3 of the paper)
X_in = Input(shape=(F,))
A_in = Input(shape=(N,))

dropout1 = Dropout(dropout_rate)(X_in)
graph_attention_1 = GraphAttention(F_,
                                   attn_heads=n_attn_heads,
                                   attn_heads_reduction='concat',
                                   dropout_rate=dropout_rate,
                                   activation='elu',
                                   kernel_regularizer=l2(l2_reg),
                                   attn_kernel_regularizer=l2(l2_reg))([dropout1, A_in])
dropout2 = Dropout(dropout_rate)(graph_attention_1)
graph_attention_2 = GraphAttention(n_classes,
                                   attn_heads=1,
                                   attn_heads_reduction='average',
                                   dropout_rate=dropout_rate,
                                   activation='softmax',
                                   kernel_regularizer=l2(l2_reg),
                                   attn_kernel_regularizer=l2(l2_reg))([dropout2, A_in])

# Build model
model = Model(inputs=[X_in, A_in], outputs=graph_attention_2)
optimizer = Adam(lr=learning_rate)
model.compile(optimizer=optimizer,
              loss='categorical_crossentropy',
              weighted_metrics=['acc'])
model.summary()

# Callbacks
es_callback = EarlyStopping(monitor='val_weighted_acc', patience=es_patience)
tb_callback = TensorBoard(batch_size=N)
mc_callback = ModelCheckpoint('logs/best_model.h5',
                              monitor='val_weighted_acc',
                              save_best_only=True,
                              save_weights_only=True)

# Train model
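# The boolean masks returned by load_data serve as sample weights: only the masked
# training nodes contribute to the loss and metrics, while the whole graph is fed
# as a single batch (batch_size=N).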
validation_data = ([X, A], Y_val, idx_val)
model.fit([X, A],
          Y_train,
          sample_weight=idx_train,
          epochs=epochs,
          batch_size=N,
          validation_data=validation_data,
          shuffle=False,  # Shuffling data means shuffling the whole graph
          callbacks=[es_callback, tb_callback, mc_callback])

# Load best model
model.load_weights('logs/best_model.h5')

# Evaluate model
eval_results = model.evaluate([X, A],
                              Y_test,
                              sample_weight=idx_test,
                              batch_size=N,
                              verbose=0)
print('Done.\n'
      'Test loss: {}\n'
      'Test accuracy: {}'.format(*eval_results))

 
