Graph Attention Network (GAT) Code Analysis (Keras Version)

This post walks through the source code of the Keras implementation of GAT, file by file.
GAT paper: https://arxiv.org/abs/1710.10903. Reading it alongside superbrother's Zhihu write-up is recommended.
For the TensorFlow version, see: https://github.com/PetarV-/GAT
Source code on GitHub: https://github.com/danielegrattarola/keras-gat

1 utils.py

    utils.py handles data loading, preprocessing, and row-normalization of the feature matrix.

from __future__ import print_function

import os
import pickle as pkl
import sys

import networkx as nx
import numpy as np
import scipy.sparse as sp


def parse_index_file(filename):
    """Parse index file."""
    index = []
    for line in open(filename):
        index.append(int(line.strip()))
    return index


def sample_mask(idx, l):
    """Create mask."""
    mask = np.zeros(l)
    mask[idx] = 1
    return np.array(mask, dtype=bool)  # np.bool was removed in modern NumPy; use the builtin bool


def load_data(dataset_str):
    """Load data."""
    """
    Loads input data from gcn/data directory

    ind.dataset_str.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training instances
        (a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.y => the one-hot labels of the labeled training instances as numpy.ndarray object;
    ind.dataset_str.ty => the one-hot labels of the test instances as numpy.ndarray object;
    ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object;
    ind.dataset_str.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict
        object;
    ind.dataset_str.test.index => the indices of test instances in graph, for the inductive setting as list object.

    All objects above must be saved using python pickle module.

    :param dataset_str: Dataset name
    :return: All data input files loaded (as well the training/test data).
    """
    """
    Reads the data from the data folder. Three datasets ship with the repo:
    'cora', 'citeseer', and 'pubmed'; each consists of 8 files.
    'x'      feature vectors of the labeled training instances,
    'tx'     feature vectors of the test instances,
    'allx'   feature vectors of all (labeled and unlabeled) training instances,
    'y'/'ty' one-hot labels of the training/test instances,
    'ally'   labels for the instances in allx,
    'test.index' indices of the test instances,
    'graph'  the graph as a dict of neighbor lists.
    """
    # Absolute path of this .py file
    FILE_PATH = os.path.abspath(__file__)
    # Directory containing this .py file
    DIR_PATH = os.path.dirname(FILE_PATH)
    # Absolute path of the data folder
    DATA_PATH = os.path.join(DIR_PATH, 'data/')

    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for i in range(len(names)):
        with open("{}ind.{}.{}".format(DATA_PATH, dataset_str, names[i]), 'rb') as f:
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))
    # x.shape:(140, 1433); y.shape:(140, 7); tx.shape:(1000, 1433); ty.shape:(1000, 7);
    # allx.shape:(1708, 1433); ally.shape:(1708, 7)
    x, y, tx, ty, allx, ally, graph = tuple(objects)
    # Training set
    # print(x[0][0], x.shape, type(x))  # x is a sparse matrix: 140 instances, 1433-dim feature vectors (140, 1433)
    # print(y[0], y.shape)              # y holds one-hot labels, 7 classes, 140 instances (140, 7)

    ## Test set
    # print(tx[0][0], tx.shape, type(tx))  # tx is a sparse matrix: 1000 instances, 1433-dim feature vectors (1000, 1433)
    # print(ty[0], ty.shape)               # ty holds one-hot labels, 7 classes, 1000 instances (1000, 7)

    ## allx and ally follow the same layout
    # print(allx[0][0], allx.shape, type(allx))  # allx is a sparse matrix: 1708 instances, 1433-dim feature vectors (1708, 1433)
    # print(ally[0], ally.shape)                 # ally holds one-hot labels, 7 classes, 1708 instances (1708, 7)

    ## graph is a dict; the full graph has 2708 nodes
    # for i in graph:
    #     print(i, graph[i])

    # Test-set indices, in shuffled order
    test_idx_reorder = parse_index_file("{}ind.{}.test.index".format(DATA_PATH, dataset_str))
    # print(test_idx_reorder)
    # [2488, 2644, 3261, 2804, 3176, 2432, 3310, 2410, 2812,...]

    # Sorted ascending, e.g. [1708, 1709, 1710, ...]
    test_idx_range = np.sort(test_idx_reorder)

    if dataset_str == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(min(test_idx_reorder),
                                    max(test_idx_reorder) + 1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range - min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range - min(test_idx_range), :] = ty
        ty = ty_extended

    # Stack allx and tx into one LIL-format feature matrix: the input is the whole graph
    features = sp.vstack((allx, tx)).tolil()
    # Restore the original node order so the features line up with the adjacency
    # matrix: the test rows come shuffled, and without this re-alignment each
    # test node would end up with another node's features.
    features[test_idx_reorder, :] = features[test_idx_range, :]
    # print("features.shape:",features.shape)
    # features.shape: (2708, 1433)
    
    # The adjacency matrix is a SciPy sparse matrix as well, with shape (2708, 2708)
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))
    
    # labels.shape:(2708, 7)
    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]
    
    # len(list(idx_val)) + len(list(idx_train)) + len(idx_test) =  1640
    idx_test = test_idx_range.tolist()
    # print(idx_test)
    # [1708, 1709, 1710, 1711, 1712, 1713,...,2705, 2706, 2707]
    # print(len(idx_test))
    # 1000
    
    idx_train = range(len(y))
    # print(idx_train)
    # range(0, 140)
    
    idx_val = range(len(y), len(y) + 500)
    # print(idx_val,len(idx_val))
    # range(140, 640) 500
    
    # Train mask: True on idx_train = [0, 140), False elsewhere
    train_mask = sample_mask(idx_train, labels.shape[0])
    # print(train_mask,train_mask.shape)
    # [True  True  True... False False False]  # labels.shape[0]:(2708,)
    
    # Validation mask: True on idx_val = [140, 640), False elsewhere
    val_mask = sample_mask(idx_val, labels.shape[0])
    # Test mask: True on idx_test = [1708, 2707], False elsewhere
    test_mask = sample_mask(idx_test, labels.shape[0])

    y_train = np.zeros(labels.shape)
    y_val = np.zeros(labels.shape)
    y_test = np.zeros(labels.shape)
    # print(y_train.shape," ",y_test.shape," ",y_val.shape)
    # (2708, 7) (2708, 7) (2708, 7)
    
    # Copy labels into the rows where each mask is True
    y_train[train_mask, :] = labels[train_mask, :]
    y_val[val_mask, :] = labels[val_mask, :]
    y_test[test_mask, :] = labels[test_mask, :]

    return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask

# Feature preprocessing: row-normalize the feature matrix and return it as a dense matrix.
# Each element of a row is divided by the row sum, so every row sums to 1 afterwards.
# This ties back to spectral graph convolution: the model should capture both a node's
# own features and its neighbors', without the imbalance that differing node degrees
# would otherwise introduce.
def preprocess_features(features):
    """Row-normalize feature matrix and convert to tuple representation"""
    print("preprocess_features")
    # >>> b = [[1.0, 3], [2, 4], [3, 5]]
    # >>> b = np.array(b)
    # >>> b
    # array([[1., 3.],
    #        [2., 4.],
    #        [3., 5.]])
    # >>> c = np.array(b.sum(1))
    # >>> c
    # array([4., 6., 8.])
    # >>> r_inv = np.power(c, -1).flatten()
    # >>> r_inv
    # array([0.25      , 0.16666667, 0.125     ])
    # >>> import scipy.sparse as sp
    # >>> r_mat_inv = sp.diags(r_inv)
    # >>> r_mat_inv
    # <3x3 sparse matrix of type '<class 'numpy.float64'>'
    #     with 3 stored elements (1 diagonals) in DIAgonal format>
    # >>> r_mat_inv.toarray()
    # array([[0.25, 0.        , 0.   ],
    #        [0.  , 0.16666667, 0.   ],
    #        [0.  , 0.        , 0.125]])
    # >>> f = r_mat_inv.dot(b)
    # >>> f
    # array([[0.25      , 0.75      ],
    #        [0.33333333, 0.66666667],
    #        [0.375     , 0.625     ]])

    # a.sum() sums all elements; a.sum(axis=0) sums each column; a.sum(axis=1) sums each row
    rowsum = np.array(features.sum(1))
    # flatten() collapses to a 1-D array; it only works on numpy objects (array/mat), not plain lists
    r_inv = np.power(rowsum, -1).flatten()
    # print("r_inv:", r_inv)
    # r_inv: [0.11111111 0.04347826 0.05263158 ... 0.05555556 0.07142857 0.07692308]
    # np.isinf(ndarray) returns a boolean array marking infinite entries;
    # rows summing to 0 yield inf after np.power, so reset those to 0
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)  # diagonal matrix with r_inv on the diagonal
    features = r_mat_inv.dot(features)
    return features.todense()
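
For orientation, here is a minimal usage sketch of these two utilities together (my own snippet, not part of the repo, assuming the ind.cora.* pickle files sit under data/); the shapes in the comments are the Cora numbers discussed above:

from utils import load_data, preprocess_features

adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data('cora')
X = preprocess_features(features)  # dense, row-normalized, shape (2708, 1433)

print(adj.shape)                   # (2708, 2708) sparse adjacency
print(y_train.shape)               # (2708, 7); rows outside train_mask are all zeros
print(train_mask.sum(), val_mask.sum(), test_mask.sum())  # 140 500 1000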

2 graph_attention_layer.py

    Defines the GAT layer (a multi-head graph attention layer).

from __future__ import absolute_import

from keras import activations, constraints, initializers, regularizers
from keras import backend as K
from keras.layers import Layer, Dropout, LeakyReLU


class GraphAttention(Layer):

    def __init__(self,
                 F_,
                 attn_heads=1,
                 attn_heads_reduction='concat',  # {'concat', 'average'}
                 dropout_rate=0.5,
                 activation='relu',
                 use_bias=True,
                 kernel_initializer='glorot_uniform',
                 bias_initializer='zeros',
                 attn_kernel_initializer='glorot_uniform',
                 kernel_regularizer=None,
                 bias_regularizer=None,
                 attn_kernel_regularizer=None,
                 activity_regularizer=None,
                 kernel_constraint=None,
                 bias_constraint=None,
                 attn_kernel_constraint=None,
                 **kwargs):
        if attn_heads_reduction not in {'concat', 'average'}:
            raise ValueError('Possible reduction methods: concat, average')

        self.F_ = F_  # Number of output features (F' in the paper)
        self.attn_heads = attn_heads  # Number of attention heads (K in the paper)
        self.attn_heads_reduction = attn_heads_reduction  # Eq. 5 and 6 in the paper
        self.dropout_rate = dropout_rate  # Internal dropout rate
        self.activation = activations.get(activation)  # Eq. 4 in the paper
        self.use_bias = use_bias

        self.kernel_initializer = initializers.get(kernel_initializer)
        self.bias_initializer = initializers.get(bias_initializer)
        self.attn_kernel_initializer = initializers.get(attn_kernel_initializer)

        self.kernel_regularizer = regularizers.get(kernel_regularizer)
        self.bias_regularizer = regularizers.get(bias_regularizer)
        self.attn_kernel_regularizer = regularizers.get(attn_kernel_regularizer)
        self.activity_regularizer = regularizers.get(activity_regularizer)

        self.kernel_constraint = constraints.get(kernel_constraint)
        self.bias_constraint = constraints.get(bias_constraint)
        self.attn_kernel_constraint = constraints.get(attn_kernel_constraint)
        self.supports_masking = False

        # Populated by build()
        self.kernels = []       # Layer kernels for attention heads
        self.biases = []        # Layer biases for attention heads
        self.attn_kernels = []  # Attention kernels for attention heads

        if attn_heads_reduction == 'concat':
            # Output will have shape (..., K * F')
            self.output_dim = self.F_ * self.attn_heads
        else:
            # Output will have shape (..., F')
            self.output_dim = self.F_

        super(GraphAttention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) >= 2
        F = input_shape[0][-1]

        # Initialize weights for each attention head
        for head in range(self.attn_heads):
            # Layer kernel
            kernel = self.add_weight(shape=(F, self.F_),
                                     initializer=self.kernel_initializer,
                                     regularizer=self.kernel_regularizer,
                                     constraint=self.kernel_constraint,
                                     name='kernel_{}'.format(head))
            self.kernels.append(kernel)

            # # Layer bias
            if self.use_bias:
                bias = self.add_weight(shape=(self.F_, ),
                                       initializer=self.bias_initializer,
                                       regularizer=self.bias_regularizer,
                                       constraint=self.bias_constraint,
                                       name='bias_{}'.format(head))
                self.biases.append(bias)

            # Attention kernels
            attn_kernel_self = self.add_weight(shape=(self.F_, 1),
                                               initializer=self.attn_kernel_initializer,
                                               regularizer=self.attn_kernel_regularizer,
                                               constraint=self.attn_kernel_constraint,
                                               name='attn_kernel_self_{}'.format(head),)
            attn_kernel_neighs = self.add_weight(shape=(self.F_, 1),
                                                 initializer=self.attn_kernel_initializer,
                                                 regularizer=self.attn_kernel_regularizer,
                                                 constraint=self.attn_kernel_constraint,
                                                 name='attn_kernel_neigh_{}'.format(head))
            self.attn_kernels.append([attn_kernel_self, attn_kernel_neighs])
        self.built = True

    def call(self, inputs):
        X = inputs[0]  # Node features (N x F)
        A = inputs[1]  # Adjacency matrix (N x N)

        outputs = []
        for head in range(self.attn_heads):
            kernel = self.kernels[head]  # W in the paper (F x F')
            attention_kernel = self.attn_kernels[head]  # Attention kernel a in the paper (2F' x 1)

            # Compute inputs to attention network
            features = K.dot(X, kernel)  # (N x F')

            # Compute feature combinations
            # Note: [[a_1], [a_2]]^T [[Wh_i], [Wh_j]] = [a_1]^T [Wh_i] + [a_2]^T [Wh_j]
            attn_for_self = K.dot(features, attention_kernel[0])    # (N x 1), [a_1]^T [Wh_i]
            attn_for_neighs = K.dot(features, attention_kernel[1])  # (N x 1), [a_2]^T [Wh_j]

            # Attention head a(Wh_i, Wh_j) = a^T [[Wh_i], [Wh_j]]
            dense = attn_for_self + K.transpose(attn_for_neighs)  # (N x N) via broadcasting

            # Add nonlinearity
            dense = LeakyReLU(alpha=0.2)(dense)

            # Mask values before activation (Vaswani et al., 2017)
            mask = -10e9 * (1.0 - A)
            dense += mask
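            # With self-loops added upstream (A = A + I in gat.py), the diagonal
            # survives this mask; entries where A_ij = 0 receive ~-1e10 and vanish
            # after the softmax, so node i attends only to itself and its neighbors.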

            # Apply softmax to get attention coefficients
            dense = K.softmax(dense)  # (N x N)

            # Apply dropout to features and attention coefficients
            dropout_attn = Dropout(self.dropout_rate)(dense)  # (N x N)
            dropout_feat = Dropout(self.dropout_rate)(features)  # (N x F')

            # Linear combination with neighbors' features
            node_features = K.dot(dropout_attn, dropout_feat)  # (N x F')

            if self.use_bias:
                node_features = K.bias_add(node_features, self.biases[head])

            # Add output of attention head to final output
            outputs.append(node_features)

        # Aggregate the heads' output according to the reduction method
        if self.attn_heads_reduction == 'concat':
            output = K.concatenate(outputs)  # (N x KF')
        else:
            output = K.mean(K.stack(outputs), axis=0)  # (N x F')

        output = self.activation(output)
        return output

    def compute_output_shape(self, input_shape):
        output_shape = input_shape[0][0], self.output_dim
        return output_shape
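
The one subtle step in call() is forming all N x N attention logits with a single broadcast instead of looping over edges. Below is a small NumPy sketch of the same arithmetic on a toy 3-node graph (illustrative only; the array names are mine, not the layer's):

import numpy as np

# Wh plays the role of `features` (X dot kernel): shape (N, F') with F' = 2
Wh = np.array([[1.0, 0.0],
               [0.0, 1.0],
               [1.0, 1.0]])
a_self = np.array([[1.0], [2.0]])      # attention_kernel[0], shape (F', 1)
a_neigh = np.array([[-1.0], [0.5]])    # attention_kernel[1], shape (F', 1)

attn_for_self = Wh @ a_self            # (N, 1): a_1^T W h_i for every i
attn_for_neighs = Wh @ a_neigh         # (N, 1): a_2^T W h_j for every j
e = attn_for_self + attn_for_neighs.T  # (N, N): e[i, j] = a_1^T Wh_i + a_2^T Wh_j
e = np.where(e > 0, e, 0.2 * e)        # LeakyReLU with alpha = 0.2

A = np.array([[1., 1., 0.],
              [1., 1., 1.],
              [0., 1., 1.]])           # adjacency with self-loops already added
e += -10e9 * (1.0 - A)                 # mask out non-edges
alpha = np.exp(e) / np.exp(e).sum(axis=1, keepdims=True)  # row-wise softmax
print(alpha.round(3))                  # each row sums to 1 over its neighborhood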

3 gat.py

from __future__ import division

import numpy as np
from keras.callbacks import EarlyStopping, TensorBoard, ModelCheckpoint
from keras.layers import Input, Dropout
from keras.models import Model
from keras.optimizers import Adam
from keras.regularizers import l2

from graph_attention_layer import GraphAttention
from utils import load_data, preprocess_features

# Read data
A, X, Y_train, Y_val, Y_test, idx_train, idx_val, idx_test = load_data('citeseer')

# Parameters
N = X.shape[0]                # Number of nodes in the graph
F = X.shape[1]                # Original feature dimension
n_classes = Y_train.shape[1]  # Number of classes
F_ = 8                        # Output size of first GraphAttention layer
n_attn_heads = 8              # Number of attention heads in first GAT layer
"""
Transductive learning For the transductive learning tasks, we apply a two-layer GAT model. Its
architectural hyperparameters have been optimized on the Cora dataset and are then reused for Citeseer.
The first layer consists of K = 8 attention heads computing F0 = 8 features each (for a total
of 64 features), followed by an exponential linear unit (ELU) (Clevert et al., 2016) nonlinearity. The
second layer is used for classification: a single attention head that computes C features (where C
is the number of classes), followed by a softmax activation. For coping with the small training set
sizes, regularization is liberally applied within the model. During training, we apply L2 regularization
with  = 0:0005. Furthermore, dropout (Srivastava et al., 2014) with p = 0:6 is applied to
both layers’ inputs, as well as to the normalized attention coefficients (critically, this means that at
each training iteration, each node is exposed to a stochastically sampled neighborhood). Similarly
as observed by Monti et al. (2016), we found that Pubmed’s training set size (60 examples) required
slight changes to the GAT architecture: we have applied K = 8 output attention heads (instead of
one), and strengthened the L2 regularization to  = 0:001. Otherwise, the architecture matches the
one used for Cora and Citeseer.
"""
dropout_rate = 0.6            # Dropout rate (between and inside GAT layers)
l2_reg = 5e-4/2               # Factor for l2 regularization
learning_rate = 5e-3          # Learning rate for Adam
epochs = 10000                # Number of training epochs
es_patience = 100             # Patience for early stopping

# Preprocessing operations
X = preprocess_features(X)
A = A + np.eye(A.shape[0])  # Add self-loops
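# Self-loops matter: inside GraphAttention, the mask -10e9 * (1.0 - A) would
# otherwise wipe out the diagonal, and a node could never attend to itself.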

# Model definition (as per Section 3.3 of the paper)
X_in = Input(shape=(F,))
A_in = Input(shape=(N,))

dropout1 = Dropout(dropout_rate)(X_in)
graph_attention_1 = GraphAttention(F_,
                                   attn_heads=n_attn_heads,
                                   attn_heads_reduction='concat',
                                   dropout_rate=dropout_rate,
                                   activation='elu',
                                   kernel_regularizer=l2(l2_reg),
                                   attn_kernel_regularizer=l2(l2_reg))([dropout1, A_in])
dropout2 = Dropout(dropout_rate)(graph_attention_1)
graph_attention_2 = GraphAttention(n_classes,
                                   attn_heads=1,
                                   attn_heads_reduction='average',
                                   dropout_rate=dropout_rate,
                                   activation='softmax',
                                   kernel_regularizer=l2(l2_reg),
                                   attn_kernel_regularizer=l2(l2_reg))([dropout2, A_in])

# Build model
model = Model(inputs=[X_in, A_in], outputs=graph_attention_2)
optimizer = Adam(lr=learning_rate)
model.compile(optimizer=optimizer,
              loss='categorical_crossentropy',
              weighted_metrics=['acc'])
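# Note: weighted_metrics shares the sample_weight passed to fit/evaluate, so with
# the boolean node masks both loss and accuracy count only the masked nodes.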
model.summary()

# Callbacks
es_callback = EarlyStopping(monitor='val_weighted_acc', patience=es_patience)
tb_callback = TensorBoard(batch_size=N)
mc_callback = ModelCheckpoint('logs/best_model.h5',
                              monitor='val_weighted_acc',
                              save_best_only=True,
                              save_weights_only=True)

# Train model
validation_data = ([X, A], Y_val, idx_val)
model.fit([X, A],
          Y_train,
          sample_weight=idx_train,
          epochs=epochs,
          batch_size=N,
          validation_data=validation_data,
          shuffle=False,  # Shuffling data means shuffling the whole graph
          callbacks=[es_callback, tb_callback, mc_callback])

# Load best model
model.load_weights('logs/best_model.h5')

# Evaluate model
eval_results = model.evaluate([X, A],
                              Y_test,
                              sample_weight=idx_test,
                              batch_size=N,
                              verbose=0)
print('Done.\n'
      'Test loss: {}\n'
      'Test accuracy: {}'.format(*eval_results))
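
A detail worth spelling out: the boolean masks returned by load_data are passed to fit/evaluate as sample_weight, so Keras zeroes out the per-node loss for every node outside the split; that is the whole semi-supervised trick in this full-batch setup. A back-of-the-envelope NumPy version of that masked loss (my own sketch, not code from the repo):

import numpy as np

def masked_categorical_crossentropy(y_true, y_pred, mask):
    """Mean cross-entropy over the masked nodes only: roughly what
    sample_weight=train_mask achieves in model.fit above."""
    per_node = -np.sum(y_true * np.log(y_pred + 1e-9), axis=1)  # (N,)
    w = mask.astype(float)
    return np.sum(per_node * w) / np.sum(w)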

 
