Implementing short-text similarity in Python: word2vec for text encoding, LSTM for distance computation

import numpy as np
import gensim
import jieba
from keras import backend as K
from keras.layers import Input, Embedding, LSTM, Lambda
from keras.models import Model
from keras.optimizers import Adadelta
from keras.preprocessing.sequence import pad_sequences

path='./data/qa_test.txt'  # path to the data file
path_word2vec='/home/ruben/data/nlp/word2vec_wx'  # path to the pretrained word2vec model
# Fabricate training data: pair every line with every other line
with open(path, 'r', encoding='utf-8') as f:
    fake_data = f.readlines()
train_data_l = []
train_data_r = []
for line in fake_data:
    for line2 in fake_data:
        if line is not line2:  # skip pairing a line with itself
            print(line.replace('\n', ''), line2.replace('\n', ''))
            train_data_l.append(line.replace('\n', ''))
            train_data_r.append(line2.replace('\n', ''))
print('left length:', len(train_data_l))
print('right length:', len(train_data_r))
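# Sanity check: the double loop above yields one ordered pair per distinct
# (line, line2) combination, i.e. n*(n-1) pairs for n lines. A minimal sketch
# with hypothetical toy data:
toy = ['a', 'b', 'c']
pairs = [(x, y) for x in toy for y in toy if x is not y]
assert len(pairs) == len(toy) * (len(toy) - 1)  # 3 lines -> 6 ordered pairs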
# Build the vocabulary and the embedding weight matrix
dict_word = {}
train_data_l_n = []  # input sequences for the left LSTM
train_data_r_n = []  # input sequences for the right LSTM

for data in [train_data_l, train_data_r]:
    for line in data:
        words = list(jieba.cut(line))
        for word in words:
            if word not in dict_word:
                dict_word[word] = len(dict_word) + 1  # reserve index 0 for padding
print(dict_word)  # vocabulary built
id2w = {dict_word[w]: w for w in dict_word}  # index -> word lookup
embedding_size = 256  # must match the dimensionality of the pretrained vectors
embedding_array = np.random.randn(len(dict_word) + 1, embedding_size)  # embedding matrix (+1 row for padding)
embedding_array[0] = 0  # row 0 is the all-zero padding vector
word2vector = gensim.models.Word2Vec.load(path_word2vec)
for word in dict_word:
    if word in word2vector.wv.vocab:  # gensim < 4.0 API
        embedding_array[dict_word[word]] = word2vector.wv.word_vec(word)
print('embedding_array shape:', embedding_array.shape)
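# Diagnostic sketch (an addition here, not part of the original flow): words absent
# from the pretrained model keep their random initialisation, so it can be worth
# measuring coverage while word2vector is still loaded (gensim < 4.0 API).
covered = sum(1 for w in dict_word if w in word2vector.wv.vocab)
print('word2vec coverage: %d / %d' % (covered, len(dict_word)))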
del word2vector
# Replace tokens with their vocabulary indices
for line in train_data_l:
    words = list(jieba.cut(line))
    for i, word in enumerate(words):
        words[i] = dict_word[word]
    train_data_l_n.append(words)
print('train_data_l_n length:', len(train_data_l_n))
y_train = np.ones((len(train_data_l_n),))  # fake labels: every pair marked similar
for line in train_data_r:
    words = list(jieba.cut(line))
    for i, word in enumerate(words):
        words[i] = dict_word[word]
    train_data_r_n.append(words)
print('train_data_r_n length:', len(train_data_r_n))
# Find the longest sentence in the corpus (checking both sides)
max_length = 0
for line in train_data_l_n + train_data_r_n:
    if max_length < len(line):
        max_length = len(line)
print('max length:', max_length)

# Pad all sentences to the same length
train_data_l_n = pad_sequences(train_data_l_n, maxlen=max_length)
train_data_r_n = pad_sequences(train_data_r_n, maxlen=max_length)
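# Note: pad_sequences left-pads with 0 by default, which is why index 0 was
# reserved for the all-zero padding row above. Toy illustration:
print(pad_sequences([[3, 1], [5, 2, 9, 4]], maxlen=4))
# [[0 0 3 1]
#  [5 2 9 4]]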

# Model hyperparameters
n_hidden = 50
gradient_clipping_norm = 1.25
batch_size = 5
n_epoch = 15

# Similarity score: exp(-L1 distance) between the two encodings, in (0, 1]
def exponent_neg_manhattan_distance(left, right):
    return K.exp(-K.sum(K.abs(left - right), axis=1, keepdims=True))
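# For intuition, the same formula in plain NumPy: identical encodings score 1.0,
# and the score decays toward 0 as the Manhattan distance grows. Toy check:
a = np.array([0.5, -0.2, 0.1])
b = np.array([0.4, 0.0, 0.3])
print(np.exp(-np.sum(np.abs(a - a))))  # 1.0
print(np.exp(-np.sum(np.abs(a - b))))  # exp(-0.5) ~= 0.61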


# Input layers
left_input = Input(shape=(max_length,), dtype='int32')
right_input = Input(shape=(max_length,), dtype='int32')
embedding_layer = Embedding(len(embedding_array), embedding_size, weights=[embedding_array],
                            input_length=max_length, trainable=False)
# Embed both sentences with the same (frozen) embedding layer
encoded_left = embedding_layer(left_input)
encoded_right = embedding_layer(right_input)
# One LSTM shared by both sides (Siamese architecture)
shared_lstm = LSTM(n_hidden)
left_output = shared_lstm(encoded_left)
right_output = shared_lstm(encoded_right)
# The Keras 1 Merge layer was removed in Keras 2; a Lambda layer computes the same distance
malstm_distance = Lambda(lambda x: exponent_neg_manhattan_distance(x[0], x[1]),
                         output_shape=lambda x: (x[0][0], 1))([left_output, right_output])

# model
malstm = Model([left_input, right_input], [malstm_distance])

optimizer = Adadelta(clipnorm=gradient_clipping_norm)

malstm.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])
# Train; the validation split reuses the training pairs, since this is fake demo data
malstm.fit(x=[np.asarray(train_data_l_n), np.asarray(train_data_r_n)], y=y_train,
           batch_size=batch_size, epochs=n_epoch,
           validation_data=([np.asarray(train_data_l_n), np.asarray(train_data_r_n)], y_train))
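# Inference sketch (the sentences are hypothetical examples; mapping unseen words
# to the padding index 0 is a simplification made here, not something the training
# code above defines):
def encode(sentence):
    ids = [dict_word.get(w, 0) for w in jieba.cut(sentence)]
    return pad_sequences([ids], maxlen=max_length)

score = malstm.predict([encode('今天天氣很好'), encode('今天天氣不錯')])[0][0]
print('similarity:', score)  # in (0, 1]; higher means more similar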