參考https://github.com/dennybritz/cnn-text-classification-tf
基於TextCNN模型的文本分類
已知問題:隨機初始化的詞向量表尚未保存;模型的 test.py 尚未編寫。
1、數據來源於診斷庫,該excel文件總共包含條診斷數據,個類別,前90%數據當作訓練集,後10%數據當作測試集。
2、文本預處理:載入詞典,去停用詞,分詞。
3、詞向量化:統計所有的診斷數據,一條診斷中詞語最多的出現個數爲42個,所以將每條診斷用一個42維的向量構成,向量中每個數字代表一個詞語,若某條診斷包含的詞語小於42個,則多出來的用0表示。另外構造一個行數爲所有詞語的個數,列數指定爲42的隨機矩陣,每一行表示一個詞語向量,則一條診斷信息原爲42×1的向量,通過該矩陣表示後,一個診斷信息可表示爲42×42的矩陣。
4、神經網絡分類:神經網絡架構如圖所示,卷積層使用高度分別爲3、4、5的三種卷積核(每種64個)對42×42矩陣做卷積,經過最大池化層後得到3×64個特徵,這些特徵通過全連接層與輸出節點(節點數等於類別數目)相連得到輸出,採用softmax交叉熵損失函數+梯度下降法訓練網絡模型。
import pandas as pd
import jieba
import jieba.analyse
from tensorflow.contrib import learn
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
# --- Data loading -----------------------------------------------------------
# Read the diagnosis spreadsheet and keep the text column ('DESC') and the
# label column ('CODE') as stripped strings.
excel = pd.read_excel('H:/own_textcnn_1/data/dict/診斷.xls')
excel_diag = [str(entry).strip() for entry in excel['DESC']]
excel_class = [str(entry).strip() for entry in excel['CODE']]

# --- Word segmentation ------------------------------------------------------
# Register the custom dictionaries before cutting.
jieba.load_userdict("H:/own_textcnn_1/data/dict/器名.txt")
jieba.load_userdict("H:/own_textcnn_1/data/dict/實習.txt")
# NOTE(review): this configures jieba.analyse's keyword extractor; the plain
# jieba.cut call below does not appear to consult it -- confirm that stop
# words are really being removed.
jieba.analyse.set_stop_words("H:/own_textcnn_1/data/dict/停用詞表.txt")
# One space-separated token string per diagnosis.
diag_seg_list = [" ".join(jieba.cut(text)) for text in excel_diag]

# Dump "<label> <segmented text>" lines for inspection.
with open("H:/own_textcnn_1/data/dict/result.txt", 'w') as f:
    for label, segmented in zip(excel_class, diag_seg_list):
        f.write(label + " " + segmented.strip() + '\n')

# --- Vectorisation ----------------------------------------------------------
# Every document is mapped to a fixed-length row of word ids; the length is
# the largest space-split token count minus one.
max_document_length = max(len(doc.split(" ")) for doc in diag_seg_list) - 1
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
# Presumably keeps the vocabulary open so fit_transform can add words.
vocab_processor.vocabulary_.freeze(False)
x = np.array(list(vocab_processor.fit_transform(diag_seg_list)))
# One-hot encode the class labels (one column per distinct label).
y = np.array(pd.get_dummies(excel_class))
def _pair_rows(features, labels):
    """Pack matching rows of *features* and *labels* into an (n, 2) object array."""
    # Built explicitly with dtype=object: letting np.array() infer the shape
    # from (feature_row, label_row) tuples is a ragged construction whenever
    # the two rows differ in length, which recent NumPy releases reject.
    pairs = np.empty((len(features), 2), dtype=object)
    for row, (feature_row, label_row) in enumerate(zip(features, labels)):
        pairs[row, 0] = feature_row
        pairs[row, 1] = label_row
    return pairs


def get_batch(x, y, batch_size, split_seed=0):
    """Yield shuffled training mini-batches together with the held-out set.

    The rows are split 90% / 10% into a training part and a held-out part.
    The training part is reshuffled with the global NumPy RNG on every call
    and cut into chunks of at most ``batch_size`` rows.

    Args:
        x: NumPy array of inputs, indexed along its first axis.
        y: NumPy array of labels with the same number of rows as ``x``.
        batch_size: maximum number of rows per training mini-batch.
        split_seed: seed of the private RNG that decides which rows are held
            out, so the held-out set is identical on every call and never
            leaks into training on a later epoch. Pass ``None`` to get the
            legacy behaviour, where the split itself is re-drawn from the
            global RNG on each call.

    Yields:
        ``(train_pairs, test_pairs)`` tuples. Both are object arrays of shape
        (n, 2) whose rows are ``(x_row, y_row)``, so ``zip(*pairs)`` recovers
        the x rows and the y rows. ``test_pairs`` is the same full held-out
        set on every iteration. Nothing is yielded when there are no training
        rows.
    """
    n_rows = len(y)
    # Size the held-out part explicitly. The previous negative-index form
    # (x[:-k], x[-k:]) degenerates for k == 0 (fewer than 10 rows): the
    # training part became empty and every row landed in the test part.
    n_test = int(0.1 * float(n_rows))
    split_at = n_rows - n_test

    if split_seed is None:
        order = np.random.permutation(np.arange(n_rows))
    else:
        fixed = np.random.RandomState(split_seed).permutation(np.arange(n_rows))
        # Only the training indices are reshuffled with the global RNG.
        order = np.concatenate(
            (np.random.permutation(fixed[:split_at]), fixed[split_at:]))
    train_rows, test_rows = order[:split_at], order[split_at:]

    batches = _pair_rows(x[train_rows], y[train_rows])
    batches_test = _pair_rows(x[test_rows], y[test_rows])

    for start_index in range(0, len(batches), batch_size):
        yield (batches[start_index:start_index + batch_size], batches_test)
# x_train, x_test = x_shuffled[:16], x_shuffled[-16:]
# y_train, y_test = y_shuffled[:,:16], y_shuffled[:,-16:]
#
# return x_train, x_test, y_train, y_test, vocab_size
def train(sequence_length, num_classes, vocab_size, embedding_size, num_filters, dropout_keep_prob):
    """Build a TextCNN graph, train it for 1500 epochs and plot the metrics.

    The network is: embedding lookup -> three parallel convolutions with
    filter heights 3, 4 and 5 -> ReLU -> max-pooling over the whole sequence
    -> concatenation -> dropout -> linear layer producing class scores.
    It is trained with plain gradient descent (learning rate 0.01) on the
    mean softmax cross-entropy.

    Training data comes from the module-level ``x`` and ``y`` arrays through
    ``get_batch(x, y, 64)``.

    Args:
        sequence_length: number of word ids per input row.
        num_classes: number of columns of the one-hot label rows.
        vocab_size: number of rows of the embedding table.
        embedding_size: width of each embedding vector.
        num_filters: number of filters per filter height.
        dropout_keep_prob: keep probability applied while training. Loss
            reporting and accuracy evaluation run with dropout disabled.

    Side effects:
        Prints the L2 term and loss once per epoch, writes checkpoints under
        ``H://own_textcnn//model//`` and opens two matplotlib figures
        (accuracy and loss history).
    """
    X = tf.placeholder(dtype=tf.int32, shape=[None, sequence_length])
    Y = tf.placeholder(dtype=tf.int32, shape=[None, num_classes])
    # Fed with dropout_keep_prob for training steps only. Everything else
    # falls back to 1.0, so evaluation is not perturbed by dropout.
    keep_prob = tf.placeholder_with_default(1.0, shape=[])

    # L2 term of the output layer. It is only reported, not added to the loss.
    l2_loss = tf.Variable(0.0)

    # Randomly initialised embedding table, one row per vocabulary entry.
    W1 = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0))
    embedding_chars = tf.nn.embedding_lookup(W1, X)
    # conv2d expects a trailing channel axis.
    embedded_chars_expanded = tf.expand_dims(embedding_chars, -1)

    # One convolution + max-pool branch per filter height.
    filter_sizes = (3, 4, 5)
    pooled_outputs = []
    for filter_size in filter_sizes:
        filter_shape = [filter_size, embedding_size, 1, num_filters]
        conv_w = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1))
        conv_b = tf.Variable(tf.constant(0.1, shape=[num_filters]))
        conv = tf.nn.conv2d(
            embedded_chars_expanded,
            conv_w,
            strides=[1, 1, 1, 1],
            padding="VALID")
        activated = tf.nn.relu(tf.nn.bias_add(conv, conv_b))
        # Pool over every valid filter position, leaving one value per filter.
        pooled = tf.nn.max_pool(
            activated,
            ksize=[1, sequence_length - filter_size + 1, 1, 1],
            strides=[1, 1, 1, 1],
            padding='VALID')
        pooled_outputs.append(pooled)

    num_filters_total = num_filters * len(filter_sizes)
    h_pool = tf.concat(pooled_outputs, 3)
    h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])
    h_drop = tf.nn.dropout(h_pool_flat, keep_prob)

    # Final (unnormalized) scores and predictions.
    W_6 = tf.Variable(tf.truncated_normal(shape=[num_filters_total, num_classes], stddev=0.1))
    b_6 = tf.Variable(tf.constant(0.1, shape=[num_classes]))
    l2_loss += tf.nn.l2_loss(W_6)
    l2_loss += tf.nn.l2_loss(b_6)
    scores = tf.nn.xw_plus_b(h_drop, W_6, b_6)
    predictions = tf.argmax(scores, 1)

    # Mean cross-entropy loss.
    losses = tf.nn.softmax_cross_entropy_with_logits(logits=scores, labels=Y)
    loss = tf.reduce_mean(losses)
    train_op = tf.train.GradientDescentOptimizer(0.01).minimize(loss)

    # Accuracy.
    correct_predictions = tf.equal(predictions, tf.argmax(Y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"))

    all_loss = []
    all_acc = []
    max_acc = 0

    # Created once: a Saver built inside the loop adds new ops to the graph on
    # every checkpoint and never sees earlier saves, so max_to_keep could not
    # prune anything.
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=2)
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        for i in range(1500):
            # A different seed per epoch gives a different batch order.
            np.random.seed(i)
            for train_batch, test_batch in get_batch(x, y, 64):
                x_batch, y_batch = zip(*train_batch)
                sess.run(train_op, feed_dict={X: x_batch, Y: y_batch,
                                              keep_prob: dropout_keep_prob})

            # NOTE(review): metrics are taken once per epoch, on the last
            # mini-batch of that epoch -- confirm this matches the intended
            # loop nesting.
            l2_value, loss_value = sess.run([l2_loss, loss],
                                            feed_dict={X: x_batch, Y: y_batch})
            print(i, [l2_value, loss_value])
            all_loss.append(loss_value)

            xtest_batch, ytest_batch = zip(*test_batch)
            test_acc = sess.run(accuracy, feed_dict={X: xtest_batch, Y: ytest_batch})
            all_acc.append(test_acc)

            # Checkpoint only on every 20th epoch, and only on a new best.
            if i % 20 == 0 and test_acc > max_acc:
                max_acc = test_acc
                print("accuracy:", test_acc)
                saver.save(sess, "H://own_textcnn//model//textcnn_model", global_step=i)

    plt.figure(1)
    plt.plot(all_acc)
    plt.figure(2)
    plt.plot(all_loss)
    plt.show()
if __name__ == '__main__':
    # Hyper-parameters derived from the preprocessed data.
    num_classes = y.shape[1]
    vocab_size = len(vocab_processor.vocabulary_)
    # Previously undefined (NameError). The rows of x are what train() feeds
    # into its [None, sequence_length] input placeholder, so the width of x is
    # the sequence length.
    sequence_length = x.shape[1]
    embedding_size = 42
    num_filters = 64
    dropout_keep_prob = 0.5
    train(sequence_length, num_classes, vocab_size, embedding_size, num_filters, dropout_keep_prob)
    print("aa")