textCNN multi-class classification

Reference: https://github.com/dennybritz/cnn-text-classification-tf

Text classification based on the TextCNN model.

Known issues: the randomly generated word-vector table is not saved, and the model's test.py has not been written; a note in the code and a sketch after the training script address both.

1. The data come from a diagnosis database. The Excel file contains __ diagnosis records in __ classes; the first 90% of the data are used as the training set and the last 10% as the test set.

2. Text preprocessing: load the custom dictionaries, remove stop words, and segment each diagnosis with jieba.

3. Word vectorization: over all diagnosis records, the largest number of words in a single diagnosis is 42, so each diagnosis is encoded as a 42-dimensional vector in which each entry is a word id; if a diagnosis contains fewer than 42 words, the remaining positions are padded with 0. In addition, a random matrix with one row per vocabulary word and 42 columns is built, each row being a word vector. A diagnosis, originally a 42×1 id vector, is therefore represented through this matrix as a 42×42 matrix (see the shape sketch after this list).

4. Neural-network classification: the architecture is shown in the figure. The convolution layer applies 64 filters each of sizes 3, 4, and 5 to the 42×42 matrix; after max pooling this yields 3×64 features, which are fully connected to the (number of classes) output units. The network is trained with softmax cross-entropy loss and gradient descent.
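A minimal NumPy sketch of the shapes in steps 3 and 4; the vocabulary size here is an assumed value for illustration, everything else follows the description above:

import numpy as np

vocab_size = 1000            # assumed vocabulary size, for illustration only
sequence_length = 42         # longest diagnosis after segmentation
embedding_size = 42          # each word id maps to a 42-dim random vector

word_ids = np.zeros(sequence_length, dtype=np.int64)              # one diagnosis: 42 word ids, 0-padded
embedding_table = np.random.uniform(-1.0, 1.0, (vocab_size, embedding_size))
diagnosis_matrix = embedding_table[word_ids]                      # shape (42, 42): the CNN input
num_pooled_features = 3 * 64                                      # 3 filter sizes x 64 filters = 192 features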

import pandas as pd
import jieba
import jieba.analyse
from tensorflow.contrib import learn
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt


# Load the diagnosis text and class labels from the Excel file.
excel = pd.read_excel('H:/own_textcnn_1/data/dict/診斷.xls')
excel_diag = list([str(a).strip() for a in list(excel['DESC'])])
excel_class = list([str(a).strip() for a in list(excel['CODE'])])
# Custom dictionaries so domain terms are segmented as single words.
jieba.load_userdict("H:/own_textcnn_1/data/dict/器名.txt")
jieba.load_userdict("H:/own_textcnn_1/data/dict/實習.txt")
# Note: set_stop_words only affects jieba.analyse keyword extraction,
# not jieba.cut below, so stop words are not actually filtered here.
jieba.analyse.set_stop_words("H:/own_textcnn_1/data/dict/停用詞表.txt")
# Segment each diagnosis and join the words with spaces.
diag_seg_list = [jieba.cut(a) for a in excel_diag]
diag_seg_list = [" ".join(a) for a in diag_seg_list]
# Write "<label> <segmented text>" lines for inspection / later reuse.
with open("H:/own_textcnn_1/data/dict/result.txt", 'w', encoding='utf-8') as f:
    for i in range(len(diag_seg_list)):
        f.write(excel_class[i])
        f.write(" ")
        f.write(diag_seg_list[i].strip())
        f.write('\n')
# Map each diagnosis to a fixed-length vector of word ids (0-padded).
max_document_length = max([len(x.split(" ")) for x in diag_seg_list]) - 1
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
vocab_processor.vocabulary_.freeze(False)  # keep the vocabulary extendable during fit_transform
x = np.array(list(vocab_processor.fit_transform(diag_seg_list)))
# One-hot encode the class labels.
y = np.array(pd.get_dummies(excel_class))
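# Note: the word-to-id vocabulary built here is not persisted anywhere, so a separate
# test.py could not reproduce the same ids (the issue mentioned at the top). One minimal
# fix, with an assumed save path, is to save the VocabularyProcessor at this point:
# vocab_processor.save("H:/own_textcnn_1/data/dict/vocab")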


def get_batch(x, y, batch_size):
    # Shuffle, split 90%/10% into train/dev, and yield training batches.
    # Note: the split is redone on every call, so the dev set changes each epoch.
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]


    # Split train/test set: 90% train, 10% dev
    # TODO: This is very crude, should use cross-validation
    dev_sample_index = -1 * int(0.1 * float(len(y)))
    x_train, x_test = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    y_train, y_test = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

    batches = np.array(list(zip(x_train, y_train)))
    batches_test = np.array(list(zip(x_test, y_test)))
    num_batches_per_epoch = int((len(batches) - 1) / batch_size) + 1
    for i in range(num_batches_per_epoch):
        start_index = i * batch_size
        end_index = min(batch_size * (i + 1), len(batches))
        # Each yield pairs one training batch with the full dev set.
        yield (batches[start_index:end_index], batches_test)



def train(sequence_length, num_classes, vocab_size, embedding_size, num_filters, dropout_keep_prob):
    X = tf.placeholder(dtype=tf.int32, shape=[None, sequence_length], name="input_x")
    Y = tf.placeholder(dtype=tf.float32, shape=[None, num_classes], name="input_y")
    # Dropout keep probability; defaults to 1.0 so evaluation runs without dropout.
    keep_prob = tf.placeholder_with_default(1.0, shape=[], name="keep_prob")

    l2_loss = tf.Variable(0.0)
    # Randomly initialized embedding table: one row per vocabulary word.
    W1 = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0))
    embedding_chars = tf.nn.embedding_lookup(W1, X)
    embedded_chars_expanded = tf.expand_dims(embedding_chars, -1)  # add a channel dimension for conv2d
    pooled_outputs = []

    # Filter size 3
    filter_shape = [3, embedding_size, 1, num_filters]
    W3 = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1))
    b3 = tf.Variable(tf.constant(0.1, shape=[num_filters]))
    conv_3 = tf.nn.conv2d(
        embedded_chars_expanded,
        W3,
        strides=[1, 1, 1, 1],
        padding="VALID")
    # Apply nonlinearity
    h_3 = tf.nn.relu(tf.nn.bias_add(conv_3, b3))
    # Maxpooling over the outputs
    pooled_3 = tf.nn.max_pool(
        h_3,
        ksize=[1, sequence_length - 3 + 1, 1, 1],
        strides=[1, 1, 1, 1],
        padding='VALID',
        name="pool")
    pooled_outputs.append(pooled_3)

    # Filter size 4
    filter_shape = [4, embedding_size, 1, num_filters]
    W4 = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1))
    b4 = tf.Variable(tf.constant(0.1, shape=[num_filters]))
    conv_4 = tf.nn.conv2d(
        embedded_chars_expanded,
        W4,
        strides=[1, 1, 1, 1],
        padding="VALID")
    # Apply nonlinearity
    h_4 = tf.nn.relu(tf.nn.bias_add(conv_4, b4))
    # Maxpooling over the outputs
    pooled_4 = tf.nn.max_pool(
        h_4,
        ksize=[1, sequence_length - 4 + 1, 1, 1],
        strides=[1, 1, 1, 1],
        padding='VALID')
    pooled_outputs.append(pooled_4)

    # Filter size 5
    filter_shape = [5, embedding_size, 1, num_filters]
    W5 = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1))
    b5 = tf.Variable(tf.constant(0.1, shape=[num_filters]))
    conv_5 = tf.nn.conv2d(
        embedded_chars_expanded,
        W5,
        strides=[1, 1, 1, 1],
        padding="VALID")
    # Apply nonlinearity
    h_5 = tf.nn.relu(tf.nn.bias_add(conv_5, b5))
    # Maxpooling over the outputs
    pooled_5 = tf.nn.max_pool(
        h_5,
        ksize=[1, sequence_length - 5 + 1, 1, 1],
        strides=[1, 1, 1, 1],
        padding='VALID',)
    pooled_outputs.append(pooled_5)

    # Concatenate the three pooled feature maps: 3 * num_filters features per example.
    num_filters_total = num_filters * 3
    h_pool = tf.concat(pooled_outputs, 3)
    h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])

    # Dropout uses the keep_prob placeholder, so it is disabled at evaluation time.
    h_drop = tf.nn.dropout(h_pool_flat, keep_prob)

    # Final (unnormalized) scores and predictions
    W_6 = tf.Variable(tf.truncated_normal(shape=[num_filters_total, num_classes], stddev=0.1))
    b_6 = tf.Variable(tf.constant(0.1, shape=[num_classes]))
    # l2_loss is tracked for logging only; it is not added to the training loss.
    l2_loss += tf.nn.l2_loss(W_6)
    l2_loss += tf.nn.l2_loss(b_6)
    scores = tf.nn.xw_plus_b(h_drop, W_6, b_6)
    predictions = tf.argmax(scores, 1, name="predictions")

    # Mean softmax cross-entropy loss, minimized with plain gradient descent.
    losses = tf.nn.softmax_cross_entropy_with_logits(logits=scores, labels=Y)
    loss = tf.reduce_mean(losses)
    train_op = tf.train.GradientDescentOptimizer(0.01).minimize(loss)

    # Accuracy
    correct_predictions = tf.equal(predictions, tf.argmax(Y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"))
    all_loss = []
    all_acc = []
    max_acc = 0
    # Create the saver once, outside the training loop.
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=2)
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        for i in range(1500):
            np.random.seed(i)
            batches = list(get_batch(x, y, 64))
            for batch in batches:
                train_batch = batch[0]
                test_batch = batch[1]
                x_batch, y_batch = zip(*train_batch)
                # Dropout is only active during the training step.
                sess.run(train_op, feed_dict={X: x_batch, Y: y_batch, keep_prob: dropout_keep_prob})
                print(i, sess.run([l2_loss, loss], feed_dict={X: x_batch, Y: y_batch}))
                all_loss.append(sess.run(loss, feed_dict={X: x_batch, Y: y_batch}))

                # Evaluate on the dev split after every training batch.
                xtest_batch, ytest_batch = zip(*test_batch)
                test_acc = sess.run(accuracy, feed_dict={X: xtest_batch, Y: ytest_batch})
                all_acc.append(test_acc)
                # Checkpoint whenever dev accuracy improves (checked every 20 epochs).
                if i % 20 == 0 and test_acc > max_acc:
                    max_acc = test_acc
                    print("accuracy:", test_acc)
                    saver.save(sess, "H://own_textcnn//model//textcnn_model", global_step=i)
    plt.figure(1)
    plt.plot(all_acc)
    plt.figure(2)
    plt.plot(all_loss)
    plt.show()





if __name__ == '__main__':
    sequence_length = x.shape[1]  # padded length produced by the VocabularyProcessor
    num_classes = y.shape[1]
    vocab_size = len(vocab_processor.vocabulary_)
    embedding_size = 42
    num_filters = 64
    dropout_keep_prob = 0.5
    train(sequence_length, num_classes, vocab_size, embedding_size, num_filters, dropout_keep_prob)




 
