TensorFlow Recurrent Neural Network: RNN Text Matching / Question-Answer Matching (accuracy 74%) (3)

This post records some of my experiments with text matching. I am still using the Taobao dialogue data from the previous posts, and the accuracy reaches 74%.

The previous post already explained how sentence sequences are represented, so here we recast the text classification problem as a question-answer matching problem. The input now consists of two sentences, while the output is still a class label. Both sentences are run through the same RNN to obtain their final sentence vectors. How do we match the two sentence vectors? I use an element-wise product (like a dot product, but without the final summation): the two vectors are multiplied position by position. The result has the same dimension as a single sentence vector, i.e. the hidden layer unit size, and is then fed into an ordinary fully connected layer with an output size of 2, since this is binary classification.
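To make the matching step concrete, here is a minimal NumPy sketch of the idea (the names vec_q, vec_a, W and b are illustrative only and do not come from the script below): two sentence vectors are multiplied position by position, and the result is fed into a small fully connected layer with two outputs.

import numpy as np

n_hidden_units = 128                     # size of each sentence vector
vec_q = np.random.rand(n_hidden_units)   # question sentence vector (illustrative)
vec_a = np.random.rand(n_hidden_units)   # answer sentence vector (illustrative)

# Element-wise product: same length as a single sentence vector,
# unlike a full dot product, which would sum everything down to one number.
product = vec_q * vec_a                  # shape: (n_hidden_units,)

# Ordinary fully connected layer mapping the product to 2 logits (no-match / match).
W = np.random.rand(n_hidden_units, 2)
b = np.zeros(2)
logits = product.dot(W) + b              # shape: (2,)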

 

import tensorflow as tf
import nltk
import pandas as pd
from collections import Counter
import numpy as np
import time

max_pair = 200000

def get_pair(number, dialogue):
    """Split each conversation into (question, answer) utterance pairs."""
    pairs = []
    for conversation in dialogue:
        utterances = conversation[2:].strip('\n').split('\t')
        # Even-indexed utterances are questions; the following utterance is the answer.
        for i in range(0, len(utterances) - 1, 2):
            pairs.append([utterances[i], utterances[i + 1]])
            if len(pairs) >= number:
                return pairs
    return pairs
def convert_dialogue_to_pair():
    """Read the Alibaba dialogue file and build positive / negative QA pairs."""
    dialogue = open('dialogue_alibaba2.txt', encoding='utf-8', mode='r')
    dialogue = dialogue.readlines()
    dialogue = [p for p in dialogue if p.startswith('1')]
    print(len(dialogue))
    pairs = get_pair(max_pair, dialogue)
    data = []
    # Positive samples: the real (question, answer) pairs.
    for p in pairs:
        data.append([p[0], p[1], 1])
    # Negative samples: pair each question with the answer 8 positions further on.
    for i, p in enumerate(pairs):
        data.append([p[0], pairs[(i + 8) % len(pairs)][1], 0])
    df = pd.DataFrame(data, columns=['sentence_q', 'sentence_a', 'label'])
    print(len(data))
    return df


MAX_FEATURES = 150
MAX_SENTENCE_LENGTH = 100

# hyperparameters
lr = 0.001
training_iters = 100000          # overwritten later with len(data)
batch_size = 700
vocab_size = 200                 # must be >= MAX_FEATURES + 2 (PAD and UNK)
embedding_size = 300
n_inputs = embedding_size        # input size per time step = word embedding size
n_steps = MAX_SENTENCE_LENGTH    # time steps = padded sentence length
n_hidden_units = 128             # neurons in the LSTM hidden layer
n_classes = 2                    # match / no-match


def get_sentiment_data():
    df_sentiment = convert_dialogue_to_pair()
    print('=========finish convert ========')
    # Shuffle and keep 90% of the pairs.
    df_sentiment = df_sentiment.sample(frac=0.9)
    sentences_q = df_sentiment['sentence_q'].values
    sentences_a = df_sentiment['sentence_a'].values
    sentences = [s.lower() for s in list(sentences_q) + list(sentences_a)]
    wordlist_sentence = [nltk.word_tokenize(s) for s in sentences]
    ws = []
    for wordlist in wordlist_sentence:
        ws.extend(wordlist)
    word_counter = Counter(ws)
    # Note: this assignment is local; the embedding matrix uses the global vocab_size,
    # which is large enough since word indices stay below MAX_FEATURES + 2.
    vocab_size = min(MAX_FEATURES, len(word_counter)) + 2
    # Keep the MAX_FEATURES most frequent words; reserve 0 for PAD and 1 for UNK.
    word2index = {x[0]: i + 2 for i, x in
                  enumerate(word_counter.most_common(MAX_FEATURES))}
    word2index["PAD"] = 0
    word2index["UNK"] = 1
    index2word = {v: k for k, v in word2index.items()}
    res = []
    print('=========finish index word ========')
    print('iterrows')
    def encode(sentence):
        # Map words to indices (UNK for out-of-vocabulary words),
        # then left-pad with PAD (0) up to MAX_SENTENCE_LENGTH.
        words = sentence.split(' ')
        seqs = []
        for word in words:
            if word in word2index:
                seqs.append(word2index[word])
            else:
                seqs.append(word2index["UNK"])
        if len(seqs) > MAX_SENTENCE_LENGTH:
            return None
        return [0] * (MAX_SENTENCE_LENGTH - len(seqs)) + seqs

    for line in df_sentiment.iterrows():
        label = str(line[1]['label'])
        question = encode(line[1]['sentence_q'])
        answer = encode(line[1]['sentence_a'])
        # Skip rows where either sentence is longer than MAX_SENTENCE_LENGTH.
        if question is None or answer is None:
            continue
        if label == '0':
            res.append([np.array([1, 0]), question, answer])
        if label == '1':
            res.append([np.array([0, 1]), question, answer])
    return res


# set random seed for reproducible results
tf.set_random_seed(1)

# Two input placeholders: question and answer, each a batch of padded word-index sequences.
x_q = tf.placeholder(tf.int32, [None, n_steps])
x_a = tf.placeholder(tf.int32, [None, n_steps])

# One embedding matrix shared by both inputs.
W = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), name="W")
embedded_chars_q = tf.nn.embedding_lookup(W, x_q)
embedded_chars_a = tf.nn.embedding_lookup(W, x_a)
# embedded_chars_expanded = tf.expand_dims(embedded_chars, -1)


y = tf.placeholder(tf.float32, [None, n_classes])

# Define weights (note: these are only used by the commented-out projection code in RNN)
weights = {
    # (n_inputs, n_hidden_units) = (300, 128)
    'in': tf.Variable(tf.random_normal([n_inputs, n_hidden_units])),
    # (n_hidden_units, n_classes) = (128, 2)
    'out': tf.Variable(tf.random_normal([n_hidden_units, n_classes]))
}
biases = {
    # (n_hidden_units,) = (128,)
    'in': tf.Variable(tf.constant(0.1, shape=[n_hidden_units, ])),
    # (n_classes,) = (2,)
    'out': tf.Variable(tf.constant(0.1, shape=[n_classes, ]))
}


def RNN(X1, X2, name, weights, biases):
    # Encode both sentences with the same LSTM cell (shared weights) and use the
    # output at the last time step as each sentence's vector.
    # (weights and biases are only needed by the commented-out projection at the end.)

    # basic LSTM Cell.
    with tf.variable_scope('RNN' + name):
        if int((tf.__version__).split('.')[1]) < 12 and int((tf.__version__).split('.')[0]) < 1:
            print('<12')
            cell = tf.nn.rnn_cell.LSTMCell(n_hidden_units, forget_bias=1.0, state_is_tuple=True)
        else:
            cell = tf.contrib.rnn.LSTMCell(n_hidden_units)
    # The LSTM state is a tuple (c_state, h_state).
    init_state = cell.zero_state(batch_size, dtype=tf.float32)

    # dynamic_rnn receives a Tensor of shape (batch, steps, inputs) when time_major=False;
    # the static tf.nn.rnn alternative needs a different input shape, see:
    # https://github.com/aymericdamien/TensorFlow-Examples/blob/master/examples/3_NeuralNetworks/recurrent_network.py
    outputs, final_state = tf.nn.dynamic_rnn(cell, X1, initial_state=init_state, time_major=False)
    # Unpack to a list [(batch, outputs)] * steps; the last element is the question vector.
    if int((tf.__version__).split('.')[1]) < 12 and int((tf.__version__).split('.')[0]) < 1:
        outputs1 = tf.unpack(tf.transpose(outputs, [1, 0, 2]))
    else:
        outputs1 = tf.unstack(tf.transpose(outputs, [1, 0, 2]))

    # Run the answer through the same cell to get the answer vector.
    outputs, final_state = tf.nn.dynamic_rnn(cell, X2, initial_state=init_state, time_major=False)
    if int((tf.__version__).split('.')[1]) < 12 and int((tf.__version__).split('.')[0]) < 1:
        outputs2 = tf.unpack(tf.transpose(outputs, [1, 0, 2]))
    else:
        outputs2 = tf.unstack(tf.transpose(outputs, [1, 0, 2]))

    return outputs1[-1], outputs2[-1]
    # Alternative: project through the 'out' weights instead of returning the raw vectors.
    # results = tf.matmul(outputs[-1], weights['out']) + biases['out']
    # return results


# Note: this synthetic number-classification generator is not used anywhere below.
def generate_number_classification():
    import numpy as np
    import random
    number = training_iters
    data = []
    for i in range(number):
        number_list = []
        for j in range(MAX_SENTENCE_LENGTH):
            number_list.append(random.randint(0, MAX_FEATURES))
        # number_list.sort()
        # number_list = [str(n) for n in number_list]
        data.append(number_list)
    res = []
    for i, number in enumerate(data):
        if i % 2 == 0:
            question = [str(n) for n in number]
            res.append([[1, 0], question])
        if i % 2 == 1:
            question = [str(n + 30) for n in number]
            res.append([[0, 1], question])
    # training_data = pd.DataFrame(res, columns=['label', 'sentence_q', 'sentence_a'])
    return res


data = get_sentiment_data()
training_iters = len(data)
print('{} pairs of dialogue'.format(training_iters))
RNN_state_q, RNN_state_a = RNN(embedded_chars_q, embedded_chars_a, 'q', weights, biases)

# match_W  = tf.Variable(tf.random_normal([n_hidden_units, n_hidden_units]))
# match_b = tf.Variable(tf.constant(0.1, shape=[n_hidden_units, ]))
# RNN_state_q = tf.matmul(RNN_state_q, match_W) + match_b
# product = tf.reduce_sum(tf.multiply(RNN_state_q, RNN_state_a), keepdims=True)
# Element-wise product of the two sentence vectors, shape (batch_size, n_hidden_units).
product = tf.multiply(RNN_state_q, RNN_state_a)
# Fully connected layer mapping the product to 2 logits (no-match / match).
two_class_W = tf.Variable(tf.random_normal([n_hidden_units, 2]))
two_class_b = tf.Variable(tf.constant(0.1, shape=[2, ]))
pred = tf.matmul(product, two_class_W) + two_class_b


# concat is of no use
# concat_layer=tf.concat([RNN_state_q,RNN_state_a],axis=1)
# concat_W = tf.Variable(tf.random_normal([n_hidden_units*2, 2]))
# concat_b = tf.Variable(tf.constant(0.1, shape = [2, ]))
# pred = tf.matmul(concat_layer, concat_W) + concat_b



# dot product mapping
# two_class_W = tf.Variable(tf.random_normal([1, 2]))
# two_class_b = tf.Variable(tf.constant(0.1, shape = [2, ]))
# pred = tf.matmul(product, two_class_W)+ two_class_b

# pred = tf.matmul(RNN_state, weights['out']) + biases['out']
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
train_op = tf.train.AdamOptimizer(lr).minimize(cost)
correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))


def get_batch(data, step, batch_size):
    # Slice one batch and return (questions, answers, labels).
    data = data[step * batch_size:(step + 1) * batch_size]
    return [u[1] for u in data], [u[2] for u in data], [u[0] for u in data]


start = time.time()

with tf.Session() as sess:
    # tf.initialize_all_variables() is no longer valid as of
    # 2017-03-02 if using tensorflow >= 0.12
    if int((tf.__version__).split('.')[1]) < 12 and int((tf.__version__).split('.')[0]) < 1:
        init = tf.initialize_all_variables()
    else:
        init = tf.global_variables_initializer()
    sess.run(init)
    step = 0
    while ((step + 2) * batch_size) < training_iters:

        # print('{},{},{},{}'.format(step,batch_size,training_iters,(step+1) * batch_size ))
        batch_xs_q, batch_xs_a, batch_ys = get_batch(data, step, batch_size)
        batch_xs2_q, batch_xs2_a, batch_ys2 = get_batch(data, step + 1, batch_size)
        # mnist.train.next_batch(batch_size)
        # batch_xs = batch_xs.reshape([batch_size, n_steps, n_inputs])
        # batch_xs = batch_xs.reshape([batch_size, n_steps])
        sess.run([train_op], feed_dict={
            x_q: batch_xs_q,
            x_a: batch_xs_a,
            y: batch_ys,
        })
        # Every 20 steps, report accuracy on the next batch, which has not been trained on yet.
        if step % 20 == 0:
            print((step) * batch_size, sess.run(accuracy, feed_dict={
                x_q: batch_xs2_q,
                x_a: batch_xs2_a,
                y: batch_ys2,
            }))
        step += 1
end = time.time()
print('finished training, took {} sec'.format(int(end - start)))

 

The key network-structure code is as follows:

RNN_state_q, RNN_state_a = RNN(embedded_chars_q, embedded_chars_a, 'q', weights, biases)
product = tf.multiply(RNN_state_q, RNN_state_a)
two_class_W = tf.Variable(tf.random_normal([n_hidden_units, 2]))
two_class_b = tf.Variable(tf.constant(0.1, shape = [2, ]))
pred = tf.matmul(product, two_class_W) + two_class_b
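
At inference time, the two logits in pred can be turned into a match probability with a softmax. A minimal sketch (the probs and match_prob names are illustrative and not part of the original script):

probs = tf.nn.softmax(pred)   # shape: (batch_size, 2)
match_prob = probs[:, 1]      # probability that the answer matches the question (label 1)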

 
