【論文筆記】node2vec: Scalable Feature Learning for Networks



  • abstract

We define a flexible notion of a node’s network neighborhood and design a biased random walk procedure, which efficiently explores diverse neighborhoods. Our algorithm generalizes prior work which is based on rigid notions of network neighborhoods, and we argue that the added flexibility in exploring neighborhoods is the key to learning richer representations.


  • introduction

Overall our paper makes the following contributions:

  1. We propose node2vec, an efficient scalable algorithm for feature learning in networks that efficiently optimizes a novel network-aware, neighborhood preserving objective using SGD.

  2. We show how node2vec is in accordance with established principles in network science, providing flexibility in discovering representations conforming to different equivalences.

  3. We extend node2vec and other feature learning methods based on neighborhood preserving objectives, from nodes to pairs of nodes for edge-based prediction tasks.

  4. We empirically evaluate node2vec for multi-label classification and link prediction on several real-world datasets.





  • 條件獨立性假設


  • 特徵空間對稱性假設






  • node2vec

    • random walks



  • search bias


$d_{tx}$ denotes the shortest path distance between nodes t and x.

Return parameter, p. Parameter p controls the likelihood of immediately revisiting a node in the walk.

In-out parameter, q. Parameter q allows the search to differentiate between “inward” and “outward” nodes.


  • algorithm


  • learning edge features




  • 整體結構

  • node2vec.py

import os
from collections import defaultdict

import numpy as np
import networkx as nx
import gensim
from joblib import Parallel, delayed
from tqdm import tqdm

from .parallel import parallel_generate_walks

class Node2Vec:
    """
    Biased random-walk generator and skip-gram trainer for graph node embeddings.

    On construction the transition probabilities are precomputed into ``self.d_graph`` and the random walks
    are generated into ``self.walks``; ``fit`` then trains a gensim Word2Vec model on those walks.
    """

    # Keys used inside self.d_graph and inside the per-node sampling_strategy dicts.
    FIRST_TRAVEL_KEY = 'first_travel_key'
    PROBABILITIES_KEY = 'probabilities'
    NEIGHBORS_KEY = 'neighbors'
    WEIGHT_KEY = 'weight'
    NUM_WALKS_KEY = 'num_walks'
    WALK_LENGTH_KEY = 'walk_length'
    P_KEY = 'p'
    Q_KEY = 'q'

    def __init__(self, graph: nx.Graph, dimensions: int = 128, walk_length: int = 80, num_walks: int = 10, p: float = 1,
                 q: float = 1, weight_key: str = 'weight', workers: int = 1, sampling_strategy: dict = None,
                 quiet: bool = False, temp_folder: str = None):
        """
        Initiates the Node2Vec object, precomputes walking probabilities and generates the walks.

        :param graph: Input graph
        :param dimensions: Embedding dimensions (default: 128)
        :param walk_length: Number of nodes in each walk (default: 80)
        :param num_walks: Number of walks per node (default: 10)
        :param p: Return hyper parameter (default: 1)
        :param q: Inout parameter (default: 1)
        :param weight_key: On weighted graphs, this is the key for the weight attribute (default: 'weight')
        :param workers: Number of workers for parallel execution (default: 1)
        :param sampling_strategy: Node specific sampling strategies, supports setting node specific 'q', 'p', 'num_walks' and 'walk_length'.
        Use these keys exactly. If not set, will use the global ones which were passed on the object initialization
        :param quiet: If True, no progress bars are shown (default: False)
        :param temp_folder: Path to folder with enough space to hold the memory map of self.d_graph (for big graphs); to be passed joblib.Parallel.temp_folder
        :raises NotADirectoryError: If temp_folder is given but is not an existing directory
        """

        self.graph = graph
        self.dimensions = dimensions
        self.walk_length = walk_length
        self.num_walks = num_walks
        self.p = p
        self.q = q
        self.weight_key = weight_key
        self.workers = workers
        self.quiet = quiet
        self.d_graph = defaultdict(dict)

        # Per-node overrides for p, q, num_walks and walk_length; empty means "use the globals everywhere".
        self.sampling_strategy = {} if sampling_strategy is None else sampling_strategy

        self.temp_folder, self.require = None, None
        if temp_folder:
            if not os.path.isdir(temp_folder):
                raise NotADirectoryError("temp_folder does not exist or is not a directory. ({})".format(temp_folder))

            self.temp_folder = temp_folder
            self.require = "sharedmem"

        # The walks sample from self.d_graph, so the probabilities must be filled in first.
        self._precompute_probabilities()
        self.walks = self._generate_walks()

    def _precompute_probabilities(self):
        """
        Precomputes transition probabilities for each node.

        Fills ``self.d_graph`` so that, for every node ``v``:

        * ``d_graph[v][NEIGHBORS_KEY]`` is the list of neighbors of ``v`` (set only when ``v`` is itself a
          neighbor of some node),
        * ``d_graph[v][PROBABILITIES_KEY][t]`` is the normalized distribution over those neighbors given that
          the walk arrived at ``v`` from ``t``,
        * ``d_graph[v][FIRST_TRAVEL_KEY]`` is the normalized edge-weight distribution used for the first step.
        """

        d_graph = self.d_graph

        # Iterate over all nodes; the progress bar is skipped when quiet is True.
        nodes_generator = self.graph.nodes() if self.quiet \
            else tqdm(self.graph.nodes(), desc='Computing transition probabilities')

        for source in nodes_generator:

            # Init probabilities dict for first travel
            if self.PROBABILITIES_KEY not in d_graph[source]:
                d_graph[source][self.PROBABILITIES_KEY] = dict()

            # Visit every neighbor of the source: the walk is "at current_node, having come from source".
            for current_node in self.graph.neighbors(source):

                # Init probabilities dict
                if self.PROBABILITIES_KEY not in d_graph[current_node]:
                    d_graph[current_node][self.PROBABILITIES_KEY] = dict()

                # p and q depend only on current_node, so resolve them once per neighbor.
                node_strategy = self.sampling_strategy.get(current_node, {})
                p = node_strategy.get(self.P_KEY, self.p)
                q = node_strategy.get(self.Q_KEY, self.q)

                unnormalized_weights = list()
                d_neighbors = list()

                # Calculate unnormalized weights
                for destination in self.graph.neighbors(current_node):
                    edge_weight = self.graph[current_node][destination].get(self.weight_key, 1)

                    if destination == source:  # Backwards probability
                        ss_weight = edge_weight * 1 / p
                    elif destination in self.graph[source]:  # If the neighbor is connected to the source
                        ss_weight = edge_weight
                    else:  # Moving further away from the source
                        ss_weight = edge_weight * 1 / q

                    # Assign the unnormalized sampling strategy weight, normalize during random walk
                    unnormalized_weights.append(ss_weight)
                    d_neighbors.append(destination)

                # Normalize
                unnormalized_weights = np.array(unnormalized_weights)
                d_graph[current_node][self.PROBABILITIES_KEY][
                    source] = unnormalized_weights / unnormalized_weights.sum()

                # Save neighbors
                d_graph[current_node][self.NEIGHBORS_KEY] = d_neighbors

            # Calculate first_travel weights for source
            first_travel_weights = np.array(
                [self.graph[source][destination].get(self.weight_key, 1)
                 for destination in self.graph.neighbors(source)])
            d_graph[source][self.FIRST_TRAVEL_KEY] = first_travel_weights / first_travel_weights.sum()

    def _generate_walks(self) -> list:
        """
        Generates the random walks which will be used as the skip-gram input.

        :return: List of walks. Each walk is a list of nodes.
        """

        # Split num_walks for each worker; each chunk's length is the number of walks that worker performs.
        num_walks_lists = np.array_split(range(self.num_walks), self.workers)

        # Run the workers in parallel; idx (1-based) is forwarded as the worker's cpu_num argument.
        walk_results = Parallel(n_jobs=self.workers, temp_folder=self.temp_folder, require=self.require)(
            delayed(parallel_generate_walks)(self.d_graph,
                                             self.walk_length,
                                             len(num_walks),
                                             idx,
                                             self.sampling_strategy,
                                             self.NUM_WALKS_KEY,
                                             self.WALK_LENGTH_KEY,
                                             self.NEIGHBORS_KEY,
                                             self.PROBABILITIES_KEY,
                                             self.FIRST_TRAVEL_KEY,
                                             self.quiet) for
            idx, num_walks
            in enumerate(num_walks_lists, 1))

        # Flatten the per-worker lists of walks into a single list of walks.
        walks = [walk for worker_walks in walk_results for walk in worker_walks]

        return walks

    def fit(self, **skip_gram_params) -> gensim.models.Word2Vec:
        """
        Creates the embeddings using gensim's Word2Vec.

        :param skip_gram_params: Parameters for gensim.models.Word2Vec - do not supply 'size' it is taken from the Node2Vec 'dimensions' parameter
        :type skip_gram_params: dict
        :return: A gensim word2vec model
        """

        # Caller-supplied values win; otherwise fall back to the values given at construction time.
        # NOTE(review): 'size' looks like the pre-4.0 gensim keyword - confirm against the installed gensim version.
        skip_gram_params.setdefault('workers', self.workers)
        skip_gram_params.setdefault('size', self.dimensions)

        return gensim.models.Word2Vec(self.walks, **skip_gram_params)
  • parallel.py(主要實現並行運行)

import random

import numpy as np
from tqdm import tqdm

def parallel_generate_walks(d_graph: dict, global_walk_length: int, num_walks: int, cpu_num: int,
                            sampling_strategy: dict = None, num_walks_key: str = None, walk_length_key: str = None,
                            neighbors_key: str = None, probabilities_key: str = None, first_travel_key: str = None,
                            quiet: bool = False) -> list:
    """
    Generates the random walks which will be used as the skip-gram input.

    :param d_graph: Per-node dict holding, under the given keys, the neighbor list, the first-step
        probabilities and the probabilities conditioned on the previously visited node
    :param global_walk_length: Number of nodes in each walk unless overridden for the start node
    :param num_walks: Number of passes over all nodes performed by this worker
    :param cpu_num: Worker number, only shown in the progress bar description
    :param sampling_strategy: Optional per-node overrides; None is treated as "no overrides"
    :param num_walks_key: Key of the per-node walk-count override inside sampling_strategy
    :param walk_length_key: Key of the per-node walk-length override inside sampling_strategy
    :param neighbors_key: Key of the neighbor list inside d_graph[node]
    :param probabilities_key: Key of the previous-node-conditioned probabilities inside d_graph[node]
    :param first_travel_key: Key of the first-step probabilities inside d_graph[node]
    :param quiet: If True, no progress bar is shown
    :return: List of walks. Each walk is a list of nodes.
    """

    # The signature defaults to None, which cannot be used in the membership tests below.
    if sampling_strategy is None:
        sampling_strategy = {}

    walks = list()

    # The progress bar description shows which worker (CPU) this is.
    pbar = None
    if not quiet:
        pbar = tqdm(total=num_walks, desc='Generating walks (CPU: {})'.format(cpu_num))

    for n_walk in range(num_walks):

        # Update progress bar
        if pbar is not None:
            pbar.update(1)

        # Shuffle the nodes
        shuffled_nodes = list(d_graph.keys())
        random.shuffle(shuffled_nodes)

        # Start a random walk from every node
        for source in shuffled_nodes:
            node_strategy = sampling_strategy.get(source, {})

            # Skip nodes with specific num_walks
            if num_walks_key in node_strategy and node_strategy[num_walks_key] <= n_walk:
                continue

            # Start walk
            walk = [source]

            # Calculate walk length
            walk_length = node_strategy.get(walk_length_key, global_walk_length)

            # Perform walk
            while len(walk) < walk_length:

                walk_options = d_graph[walk[-1]].get(neighbors_key, None)

                # Skip dead end nodes
                if not walk_options:
                    break

                if len(walk) == 1:  # For the first step
                    probabilities = d_graph[walk[-1]][first_travel_key]
                else:  # Later steps are conditioned on the previously visited node
                    probabilities = d_graph[walk[-1]][probabilities_key][walk[-2]]

                walk_to = np.random.choice(walk_options, size=1, p=probabilities)[0]
                walk.append(walk_to)

            walk = list(map(str, walk))  # Convert all to strings

            walks.append(walk)

    if pbar is not None:
        pbar.close()

    return walks
  • edges.py(利用EdgeEmbedder實現了paper裏面table1總結的幾種操作,)

EdgeEmbedder is an abstract class from which all the concrete edge embedding classes inherit. The classes are AverageEmbedder, HadamardEmbedder, WeightedL1Embedder and WeightedL2Embedder, whose practical definitions can be found in Table 1 of the paper. Notice that edge embeddings are defined for any pair of nodes, connected or not, and even for a node with itself.

import numpy as np
from abc import ABC, abstractmethod
from functools import reduce
from itertools import combinations_with_replacement
from gensim.models import KeyedVectors
from tqdm import tqdm

class EdgeEmbedder(ABC):
    """
    Abstract base for edge embedders: combines the embeddings of two nodes into one edge embedding.

    Subclasses implement ``_embed``; lookup via ``embedder[(u, v)]`` validates the edge first.
    """

    def __init__(self, keyed_vectors: KeyedVectors, quiet: bool = False):
        """
        :param keyed_vectors: KeyedVectors containing nodes and embeddings to calculate edges for
        :param quiet: If True, no progress bar is shown by as_keyed_vectors
        """

        self.kv = keyed_vectors
        self.quiet = quiet

    @abstractmethod
    def _embed(self, edge: tuple) -> np.ndarray:
        """
        Abstract method for implementing the embedding method

        :param edge: tuple of two nodes
        :return: Edge embedding
        """

    def __getitem__(self, edge) -> np.ndarray:
        """
        Return the embedding of the given edge.

        :param edge: tuple of two nodes
        :return: Edge embedding
        :raises ValueError: If edge is not a tuple of exactly two items
        :raises KeyError: If either node is missing from the KeyedVectors
        """
        if not isinstance(edge, tuple) or not len(edge) == 2:
            raise ValueError('edge must be a tuple of two nodes')

        if edge[0] not in self.kv.index2word:
            raise KeyError('node {} does not exist in given KeyedVectors'.format(edge[0]))

        if edge[1] not in self.kv.index2word:
            raise KeyError('node {} does not exist in given KeyedVectors'.format(edge[1]))

        return self._embed(edge)

    def as_keyed_vectors(self) -> KeyedVectors:
        """
        Generates a KeyedVectors instance with all the possible edge embeddings

        :return: Edge embeddings
        """

        # Every unordered pair of nodes, including each node paired with itself.
        edge_generator = combinations_with_replacement(self.kv.index2word, r=2)

        if not self.quiet:
            vocab_size = len(self.kv.vocab)
            # Number of unordered pairs with replacement: n * (n + 1) / 2. Computed directly rather than
            # through factorials, which failed for a single-node vocabulary and was costly for large ones.
            total_size = vocab_size * (vocab_size + 1) // 2

            edge_generator = tqdm(edge_generator, desc='Generating edge features', total=total_size)

        # Generate features
        tokens = []
        features = []
        for edge in edge_generator:
            # The token is the string form of the sorted node pair, so (u, v) and (v, u) share one key.
            token = str(tuple(sorted(edge)))
            embedding = self._embed(edge)

            tokens.append(token)
            features.append(embedding)

        # Build KV instance
        edge_kv = KeyedVectors(vector_size=self.kv.vector_size)
        edge_kv.add(
            entities=tokens,
            weights=features)

        return edge_kv

class AverageEmbedder(EdgeEmbedder):
    """
    Average node features
    """

    def _embed(self, edge: tuple) -> np.ndarray:
        """
        :param edge: tuple of two nodes
        :return: Element-wise mean of the two node embeddings
        """
        first, second = self.kv[edge[0]], self.kv[edge[1]]
        return (first + second) / 2

class HadamardEmbedder(EdgeEmbedder):
    """
    Hadamard product node features
    """

    def _embed(self, edge: tuple) -> np.ndarray:
        """
        :param edge: tuple of two nodes
        :return: Element-wise product of the two node embeddings
        """
        first, second = self.kv[edge[0]], self.kv[edge[1]]
        return first * second

class WeightedL1Embedder(EdgeEmbedder):
    """
    Weighted L1 node features
    """

    def _embed(self, edge: tuple) -> np.ndarray:
        """
        :param edge: tuple of two nodes
        :return: Element-wise absolute difference of the two node embeddings
        """
        first, second = self.kv[edge[0]], self.kv[edge[1]]
        return np.abs(first - second)

class WeightedL2Embedder(EdgeEmbedder):
    """
    Weighted L2 node features
    """

    def _embed(self, edge: tuple) -> np.ndarray:
        """
        :param edge: tuple of two nodes
        :return: Element-wise squared difference of the two node embeddings
        """
        first, second = self.kv[edge[0]], self.kv[edge[1]]
        return (first - second) ** 2


pip install node2vec


