Sohu News Text Classification with a CNN (TensorFlow, Version 1)

Note: data extraction code
Training data required by the code

1. Environment Setup

Python 3.6
TensorFlow-GPU 1.12
Windows 10
PyCharm

2. Code Background

**1.** This code follows the paper Character-level Convolutional Networks for Text Classification and uses the code on GitHub as a learning reference (GitHub link).
**2.** This version does not use a word-embedding model yet. It simply takes the 5,000 highest-frequency characters across all texts as the vocabulary, maps each character of an article to its id in that vocabulary, and feeds the resulting id sequence to the model as the input vector, as illustrated below.
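
For example, with a tiny made-up vocabulary (not the real 5,000-character one built from the training data), the id encoding works like this:

# Toy sketch of the character-to-id encoding; the vocabulary here is invented for illustration.
word_to_id = {'<PAD>': 0, '体': 1, '育': 2, '新': 3, '闻': 4}
text = '体育新闻'
ids = [word_to_id[ch] for ch in text if ch in word_to_id]
print(ids)  # [1, 2, 3, 4]; pad_sequences later left-pads such lists to a fixed length (600)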

3. Code

**Note:** run the scripts from a terminal:
(1) First run python run_cnn.py train
(2) Then run python run_cnn.py test
(3) The project consists of three files: cnn_model.py (builds the CNN model), cnews_loader.py (data loading), and run_cnn.py (the run script)
(4) Create the data, Checkpoint, and tensorboard folders in the same directory, as shown in the layout below
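Based on the paths used in run_cnn.py, the project layout should roughly look like this (cnews.vocab.txt is generated automatically on the first training run):

./
├── cnn_model.py
├── cnews_loader.py
├── run_cnn.py
├── data/
│   ├── cnews.train.txt
│   ├── cnews.val.txt
│   └── cnews.test.txt
├── Checkpoint/
└── tensorboard/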

1. cnn_model.py
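
run_cnn.py (listed in section 3 below) expects cnn_model.py to provide a TCNNConfig class with the hyperparameters it reads (seq_length, vocab_size, batch_size, num_epochs, dropout_keep_prob, save_per_batch, print_per_batch) and a TextCNN model exposing input_x, input_y, keep_prob, loss, acc, optim and y_pred_cls. A minimal sketch of such a file is given below; the architecture (embedding, one 1-D convolution with max-over-time pooling, a fully connected layer with dropout, and a softmax output) follows the referenced GitHub project, and every hyperparameter value is an assumption rather than the original setting.

import tensorflow as tf


class TCNNConfig(object):
    # CNN hyperparameters (all values are assumptions)
    embedding_dim = 64        # dimension of the character embedding
    seq_length = 600          # padded sequence length
    num_classes = 10          # number of news categories
    num_filters = 256         # number of convolution filters
    kernel_size = 5           # convolution kernel width
    vocab_size = 5000         # vocabulary size (overwritten from the vocab file in run_cnn.py)
    hidden_dim = 128          # size of the fully connected layer
    dropout_keep_prob = 0.5   # dropout keep probability
    learning_rate = 1e-3      # Adam learning rate
    batch_size = 64           # training batch size
    num_epochs = 10           # number of training epochs
    print_per_batch = 100     # report metrics every N batches
    save_per_batch = 10       # write TensorBoard summaries every N batches


class TextCNN(object):
    # Character-level CNN: embedding -> 1-D convolution -> max-over-time pooling
    # -> fully connected layer with dropout -> softmax classifier
    def __init__(self, config):
        self.config = config
        self.input_x = tf.placeholder(tf.int32, [None, config.seq_length], name='input_x')
        self.input_y = tf.placeholder(tf.float32, [None, config.num_classes], name='input_y')
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
        self.cnn()

    def cnn(self):
        with tf.name_scope('embedding'):
            embedding = tf.get_variable('embedding', [self.config.vocab_size, self.config.embedding_dim])
            embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x)

        with tf.name_scope('cnn'):
            conv = tf.layers.conv1d(embedding_inputs, self.config.num_filters, self.config.kernel_size, name='conv')
            gmp = tf.reduce_max(conv, axis=1, name='gmp')  # global max pooling over time

        with tf.name_scope('score'):
            fc = tf.layers.dense(gmp, self.config.hidden_dim, name='fc1')
            fc = tf.nn.relu(tf.nn.dropout(fc, self.keep_prob))
            self.logits = tf.layers.dense(fc, self.config.num_classes, name='fc2')
            self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1)  # predicted class ids

        with tf.name_scope('optimize'):
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.logits, labels=self.input_y)
            self.loss = tf.reduce_mean(cross_entropy)
            self.optim = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(self.loss)

        with tf.name_scope('accuracy'):
            correct_pred = tf.equal(tf.argmax(self.input_y, 1), self.y_pred_cls)
            self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))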


2. cnews_loader.py

import sys
import numpy as np
import tensorflow as tf
from collections import Counter


def read_file(filename):
    # Read the data file and split every text into individual characters
    contents, labels = [], []
    with open(filename, mode='r', encoding='utf-8', errors='ignore') as fp:
        for line in fp.readlines():
            try:
                label, content = line.strip().split('\t')
                if content:
                    contents.append(list(content))
                    # store each character of the text as a separate string in the list
                    labels.append(label)
            except:
                pass
    return contents, labels

def build_vocab(train_dir, vocab_dir, vocab_size=5000):
    # Build the vocabulary from the training set and save it to disk
    data_train, _ = read_file(train_dir)
    all_data = []
    for content in data_train:
        all_data.extend(content)
        # collect the characters of every text into one list so their frequencies can be counted

    counter = Counter(all_data)
    # count how often each character occurs
    count_pairs = counter.most_common(vocab_size - 1)
    # the (character, frequency) pairs of the top (vocab_size - 1) characters
    words, _ = list(zip(*count_pairs))
    # unzip the list of tuples and keep only the characters
    words = ['<PAD>'] + list(words)
    # add a <PAD> token used to pad all texts to the same length
    open(vocab_dir, mode='w', encoding='utf-8').write('\n'.join(words) + '\n')

def read_vocab(vocab_dir):
    # Read the vocabulary file
    with open(vocab_dir, mode='r', encoding='utf-8', errors='ignore') as fp:
        words = [word.strip() for word in fp.readlines()]
    word_to_id = dict(zip(words, range(len(words))))
    return words, word_to_id

def read_category():
    # The fixed list of news categories
    categories = ['体育', '财经', '房产', '家居', '教育', '科技', '时尚', '时政', '游戏', '娱乐']

    cat_to_id = dict(zip(categories, range(len(categories))))
    return categories, cat_to_id

def to_words(content, words):
    # Convert a sequence of ids back into text
    return ''.join(words[x] for x in content)

def process_file(filename, word_to_id, cat_to_id, max_length=600):
    # Convert a data file into its id representation
    contents, labels = read_file(filename)
    data_id, label_id = [], []
    for i in range(len(contents)):
        data_id.append([word_to_id[x] for x in contents[i] if x in word_to_id])
        label_id.append(cat_to_id[labels[i]])

    x_pad = tf.keras.preprocessing.sequence.pad_sequences(data_id, max_length)
    # sequences shorter than max_length are zero-padded at the beginning by default
    y_pad = tf.keras.utils.to_categorical(label_id, num_classes=len(cat_to_id))
    # the character ids form the input vector; the labels become one-hot vectors
    return x_pad, y_pad

def batch_iter(x, y, batch_size=64):
    # Generate shuffled batches of data
    data_len = len(x)
    num_batch = int((data_len - 1) / batch_size) + 1

    indices = np.random.permutation(np.arange(data_len))
    # a random permutation of length data_len, used to reshuffle x and y together
    x_shuffle = x[indices]
    y_shuffle = y[indices]

    for i in range(num_batch):
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)

        yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]
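
A quick way to sanity-check the loader (this assumes the data layout shown earlier and that the vocabulary file already exists; run_cnn.py builds it on the first run):

# Load the data and inspect the shapes produced by the functions above.
categories, cat_to_id = read_category()
words, word_to_id = read_vocab('./data/cnews.vocab.txt')
x_train, y_train = process_file('./data/cnews.train.txt', word_to_id, cat_to_id, 600)
print(x_train.shape, y_train.shape)        # (num_samples, 600) and (num_samples, 10)
for x_batch, y_batch in batch_iter(x_train, y_train, 64):
    print(x_batch.shape, y_batch.shape)    # (64, 600) and (64, 10), except for the last, smaller batch
    break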

3. run_cnn.py

from __future__ import print_function

import os
import sys
import time
import warnings
warnings.filterwarnings('ignore')
from datetime import timedelta

import numpy as np
import tensorflow as tf
from sklearn import metrics

from cnn_model import TCNNConfig,TextCNN
from cnews_loader import read_vocab,read_category,batch_iter,process_file,build_vocab

train_dir = './data/cnews.train.txt'
test_dir = './data/cnews.test.txt'
val_dir = './data/cnews.val.txt'
vocab_dir = './data/cnews.vocab.txt'

save_dir = './Checkpoint'
save_path = './Checkpoint/best_validation'

def get_time_dif(start_time):
    # elapsed time since start_time
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds = int(round(time_dif)))

def feed_data(x_batch,y_batch,keep_prob):
    feed_dict ={
        model.input_x:x_batch,
        model.input_y:y_batch,
        model.keep_prob:keep_prob
    }
    return feed_dict

def evaluate(sess,x,y):
    # Evaluate the loss and accuracy on a given dataset
    data_len = len(x)
    batch_eval = batch_iter(x,y,128)
    total_loss = 0.0
    total_acc = 0.0

    for x_batch,y_batch in batch_eval:
        batch_len = len(x_batch)
        feed_dict = feed_data(x_batch,y_batch,1.0)
        loss,acc = sess.run([model.loss,model.acc],feed_dict=feed_dict)
        total_loss += loss*batch_len
        total_acc += acc*batch_len

    return total_loss/data_len,total_acc/data_len

def train():
    print('Configuring TensorBoard and Saver')
    # Empty the tensorboard folder before each training run, otherwise old summaries get mixed in
    tensorboard_dir = './tensorboard'
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)

    tf.summary.scalar("loss",model.loss)
    tf.summary.scalar("accuracy",model.acc)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)

    # Configure the Saver
    saver = tf.train.Saver()
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    print('Loading training and validation data....')
    # Load the training and validation sets
    start_time = time.time()
    x_train,y_train = process_file(train_dir,word_to_id,cat_to_id,config.seq_length)
    x_val,y_val = process_file(val_dir,word_to_id,cat_to_id,config.seq_length)
    time_dif = get_time_dif(start_time)
    print('Time usage:',time_dif)

    # Create the session
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    writer.add_graph(session.graph)

    print('training and evaluating....')
    start_time = time.time()
    total_batch = 0  # total number of batches processed
    best_acc_val = 0.0  # best validation accuracy so far
    last_improved = 0  # batch at which the last improvement happened
    require_improvement = 1000  # stop early if there is no improvement for 1000 batches

    flag = False
    for epoch in range(config.num_epochs):
        print('Epoch',epoch+1)
        batch_train = batch_iter(x_train,y_train,config.batch_size)
        for x_batch,y_batch in batch_train:
            feed_dict = feed_data(x_batch,y_batch,config.dropout_keep_prob)

            if total_batch % config.save_per_batch == 0:
                # write the training summaries to TensorBoard every save_per_batch batches
                s = session.run(merged_summary, feed_dict=feed_dict)
                writer.add_summary(s, total_batch)

            if total_batch % config.print_per_batch == 0:
                # report performance on the current training batch and on the validation set
                feed_dict[model.keep_prob] = 1.0
                loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=feed_dict)
                loss_val, acc_val = evaluate(session, x_val, y_val)
                if acc_val > best_acc_val:
                    # keep the best model seen so far
                    best_acc_val = acc_val
                    last_improved = total_batch
                    saver.save(sess=session, save_path=save_path)
                    improved_str = '*'
                else:
                    improved_str = ''
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str))

            feed_dict[model.keep_prob] = config.dropout_keep_prob
            session.run(model.optim, feed_dict=feed_dict)  # run one optimization step
            total_batch += 1
            if total_batch - last_improved > require_improvement:
                # validation accuracy has not improved for a long time, stop training early
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break  # leave the batch loop
        if flag:  # early stopping: leave the epoch loop as well
            break

def test():
    print("Loading test data...")
    start_time = time.time()
    x_test, y_test = process_file(test_dir, word_to_id, cat_to_id, config.seq_length)
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess=session, save_path=save_path)  # restore the saved model
    print('Testing...')
    loss_test, acc_test = evaluate(session, x_test, y_test)
    msg = 'Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}'
    print(msg.format(loss_test, acc_test))
    batch_size = 128
    data_len = len(x_test)
    num_batch = int((data_len - 1) / batch_size) + 1
    y_test_cls = np.argmax(y_test, 1)
    y_pred_cls = np.zeros(shape=len(x_test), dtype=np.int32)  # holds the predicted class ids
    for i in range(num_batch):  # predict batch by batch
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        feed_dict = {
            model.input_x: x_test[start_id:end_id],
            model.keep_prob: 1.0
        }
        y_pred_cls[start_id:end_id] = session.run(model.y_pred_cls, feed_dict=feed_dict)
    # evaluation report
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories))
    # confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

if __name__ == '__main__':
    if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
        raise ValueError("""usage: python run_cnn.py [train / test]""")
    print('Configuring CNN model...')
    config = TCNNConfig()
    if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist yet
        build_vocab(train_dir, vocab_dir, config.vocab_size)
    categories, cat_to_id = read_category()
    words, word_to_id = read_vocab(vocab_dir)
    config.vocab_size = len(words)
    model = TextCNN(config)

    if sys.argv[1] == 'train':
        train()
    else:
        test()
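
After training, the loss and accuracy curves written to ./tensorboard can be viewed by running tensorboard --logdir=./tensorboard and opening the printed URL in a browser; as noted in train(), empty that folder before starting a fresh run so the old curves do not get mixed with the new ones.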