圖表示學習Graph Embedding:DeepWalk python實現

DeepWalk python 實現

代碼及測試數據集:https://github.com/AI-luyuan/graph-embedding

DeepWalk 原理

DeepWalk 基於 Word2vec 方法對圖頂點向量化表示,主要思路:

  • 在圖網絡中進行隨機遊走,生成圖頂點路徑,模仿文本生成的過程,提供一個圖頂點序列
  • 使用CBOW或Skip-gram模型對隨機遊走序列中的節點進行向量表示學習。
    (原文此處有示意插圖,圖片未能隨文字一同提取)

算法

參考:DeepWalk:Online Learning of Social Representations
(原文此處有算法偽代碼插圖,圖片未能隨文字一同提取)

(原文此處有插圖,圖片未能隨文字一同提取)

python實現

隨機遊走

import numpy as np
import networkx as nx

# Weighted random walks: generate vertex sequences that play the role of
# "sentences" for the downstream Word2Vec step.
def randomWalk(_g, _corpus_num, _deep_num, _current_word):
    """Generate ``_corpus_num`` weighted random walks starting at ``_current_word``.

    Parameters
    ----------
    _g : mapping
        Adjacency mapping (e.g. a networkx graph): ``_g[u]`` must yield
        ``{neighbor: {'weight': w}, ...}``.
    _corpus_num : int
        Number of walks to generate from the start vertex.
    _deep_num : int
        Maximum number of steps per walk, so each walk contains at most
        ``_deep_num + 1`` vertices including the start.
    _current_word :
        Start vertex (a key of ``_g``).

    Returns
    -------
    list[list]
        One list of visited vertices per walk. At each step the next
        vertex is drawn with probability proportional to the edge weight.
    """
    _corpus = []
    for _ in range(_corpus_num):
        sentence = [_current_word]
        current_word = _current_word
        for _step in range(_deep_num):
            _node_list = []
            _weight_list = []
            for _nbr, _data in _g[current_word].items():
                _node_list.append(_nbr)
                _weight_list.append(_data['weight'])
            # Fix: the original divided by sum(_weight_list) with no guard,
            # crashing on a dead-end vertex (no out-neighbors) or an
            # all-zero weight sum. Truncate the walk instead.
            total = float(sum(_weight_list))
            if not _node_list or total <= 0:
                break
            # Hoist the sum out of the per-element normalization.
            _ps = [w / total for w in _weight_list]
            sel_node = roulette(_node_list, _ps)
            sentence.append(sel_node)
            current_word = sel_node
        _corpus.append(sentence)
    return _corpus


def roulette(_datas, _ps):
    """Roulette-wheel selection: pick one element of ``_datas`` with probabilities ``_ps``."""
    return np.random.choice(_datas, p=_ps)


# Build a directed graph from the edge-list file. Each line of graph.txt is
# "src,dst,weight"; the edge is inserted in both directions so the random
# walk can traverse the graph as if it were undirected.
G = nx.DiGraph()
path = './graph.txt'
word_list = []
with open(path, 'r') as f:
	for line in f:
		cols = line.strip().split(',')
		G.add_weighted_edges_from([(cols[0], cols[1], float(cols[2]))])
		word_list.append(cols[0])
		G.add_weighted_edges_from([(cols[1], cols[0], float(cols[2]))])
		word_list.append(cols[1])

word_set = set(word_list)  # de-duplicated vertex set

num = 10       # walks started from each vertex
deep_num = 20  # maximum depth (steps) of each walk

# Fix: the original opened GraphSentence.txt without ever closing it; use a
# context manager so the file is flushed and closed even if a walk raises.
with open('./GraphSentence.txt', 'w') as sentence_file:
	for k, word in enumerate(word_set, start=1):
		print(k)  # progress counter, one line per start vertex
		corpus = randomWalk(G, num, deep_num, word)
		for cols in corpus:
			# One walk per line, vertices tab-separated (Word2Vec input).
			sentence_file.write('\t'.join(cols) + '\n')

word2vector

import gensim

# Load the random-walk corpus written by the walk-generation step: one walk
# per line, vertices separated by tabs.
with open('./GraphSentence.txt','r') as f:
    sentences = []
    for line in f:
        cols = line.strip().split('\t')
        sentences.append(cols)



# Train skip-gram (sg=1) with negative sampling (hs=0, negative=20) on the
# walk corpus; 300-dimensional vectors, window of 3, min_count=1 so every
# vertex gets an embedding.
# NOTE(review): these keyword names (size, iter) belong to gensim < 4.0; in
# gensim 4.x they were renamed to vector_size and epochs — confirm the
# installed gensim version before running.
model = gensim.models.Word2Vec(sentences, sg=1, size=300, alpha=0.025, window=3, min_count=1, max_vocab_size=None, sample=1e-3, seed=1, workers=45, min_alpha=0.0001, hs=0, negative=20, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=1e4)


outfile = './test'
fname = './testmodel'
# save: full model (resumable training) plus the word vectors alone in both
# binary and text word2vec formats.
model.save(fname) 
model.wv.save_word2vec_format(outfile + '.model.bin', binary=True)  
model.wv.save_word2vec_format(outfile + '.model.txt', binary=False) 


# Reload the saved model and query the 10 nearest vertices to '子'.
# NOTE(review): most_similar on the model object is deprecated in newer
# gensim; model.wv.most_similar(...) is the supported call — verify.
fname = './testmodel'
model = gensim.models.Word2Vec.load(fname)  
nearest10 = model.most_similar('子')
print(nearest10 )
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章