NLP實踐-Task5

任務鏈接:https://wx.zsxq.com/dweb/#/index/222248424811
深度學習視頻推薦1:https://www.icourse163.org/learn/PKU-1002536002?tid=1003797005#/learn/content
深度學習視頻推薦2:https://mooc.study.163.com/course/2001281002#/info
github:https://github.com/jiayinZH (textCNN代碼及測試數據稍後會上傳)

1.激活函數種類

神經網絡中激活函數的主要作用是提供網絡的非線性建模能力。假設一個神經網絡中僅包含線性卷積和全連接運算,那麼該網絡僅能夠表達線性映射,即便增加網絡的深度也依舊還是線性映射,難以有效建模實際環境中非線性分佈的數據。加入(非線性)激活函數之後,深度神經網絡才具備了分層的非線性映射學習能力。因此,激活函數是深度神經網絡中不可或缺的部分。
常見的激活函數有sigmoid、tanh、ReLU、softmax等。
參考文章1:http://blog.csdn.net/u014595019/article/details/52562159
參考文章2:https://zhuanlan.zhihu.com/p/22142013

2.深度學習正則化種類

正則化的作用是在最小化經驗風險的同時約束模型複雜度,從而選出經驗風險與模型複雜度同時較小的模型,降低過擬合的風險。
參考文章:https://blog.csdn.net/qq_16137569/article/details/81584165

3.深度學習優化方法

參考文章:https://blog.csdn.net/qq_21460525/article/details/70146665

4.代碼展示

使用THUCNews數據集實現textCNN
Text類的實現參考了 https://github.com/gaussic/text-classification-cnn-rnn/blob/master/data/cnews_loader.py

import os
import numpy as np
import tensorflow as tf
from collections import Counter
import tensorflow.contrib.keras as kr


class Text(object):
    """Read tab-separated ``label<TAB>text`` files and convert them to id arrays.

    Texts are handled at character level: every character of a text is one
    token, and the vocabulary maps characters to integer ids.
    """

    def open_file(self, filename, mode='r'):
        """Open ``filename`` as UTF-8 text, silently dropping undecodable bytes."""
        return open(filename, mode, encoding='utf-8', errors='ignore')

    def read_file(self, filename):
        """Read a data file and return ``(contents, labels)``.

        Each line is expected to be ``label<TAB>text``. Lines that do not
        split into exactly two tab-separated fields, or whose text is empty,
        are skipped.

        Returns:
            contents: list of texts, each as a list of single characters.
            labels: list of label strings, parallel to ``contents``.
        """
        contents, labels = [], []
        with self.open_file(filename) as f:
            for line in f:
                try:
                    label, content = line.strip().split('\t')
                except ValueError:
                    # Not exactly two fields: malformed line, skip it.
                    continue
                if content:
                    contents.append(list(content))
                    labels.append(label)
        return contents, labels

    def read_vocab(self, vocab_dir):
        """Read the vocabulary file (one token per line).

        Returns:
            words: list of tokens in file order.
            word_to_id: dict mapping each token to its line index.
        """
        with self.open_file(vocab_dir) as fp:
            words = [line.strip() for line in fp]
        word_to_id = dict(zip(words, range(len(words))))
        return words, word_to_id

    def read_category(self):
        """Return the fixed category list and a ``category -> id`` mapping."""
        # NOTE(review): these strings must match the label column of the data
        # files exactly (including Simplified vs. Traditional characters);
        # process_file raises KeyError on any label not listed here. Verify
        # against the actual dataset.
        categories = ['體育', '財經', '房產', '家居', '教育', '科技', '時尚', '時政', '遊戲', '娛樂']
        cat_to_id = dict(zip(categories, range(len(categories))))
        return categories, cat_to_id

    def build_vocab(self, train_dir, vocab_dir, vocab_size=5000):
        """Build a character vocabulary from the training file and write it out.

        The ``vocab_size - 1`` most frequent characters are kept and
        ``<PAD>`` is put first, so that it gets id 0 when the file is read
        back with ``read_vocab``. One token is written per line.

        Args:
            train_dir: path of the training data file.
            vocab_dir: path of the vocabulary file to write.
            vocab_size: maximum vocabulary size including ``<PAD>``.
        """
        data_train, _ = self.read_file(train_dir)

        counter = Counter()
        for content in data_train:
            counter.update(content)

        # A comprehension (rather than unpacking zip(*pairs)) also works when
        # the training data is empty.
        frequent = [word for word, _ in counter.most_common(vocab_size - 1)]
        # <PAD> is used to pad all texts to the same length.
        words = ['<PAD>'] + frequent
        with self.open_file(vocab_dir, mode='w') as fp:
            fp.write('\n'.join(words) + '\n')

    def process_file(self, filename, word_to_id, cat_to_id, max_length=600):
        """Convert a data file into padded id sequences and one-hot labels.

        Characters missing from ``word_to_id`` are dropped. Sequences are
        brought to ``max_length`` with Keras ``pad_sequences`` (default
        padding/truncating settings, pad value 0).

        Returns:
            x_pad: padded id sequences, one row per text.
            y_pad: one-hot labels with ``len(cat_to_id)`` columns.

        Raises:
            KeyError: if a label in the file is not a key of ``cat_to_id``.
        """
        contents, labels = self.read_file(filename)

        data_id = [[word_to_id[ch] for ch in content if ch in word_to_id]
                   for content in contents]
        label_id = [cat_to_id[label] for label in labels]

        x_pad = kr.preprocessing.sequence.pad_sequences(data_id, max_length)
        y_pad = kr.utils.to_categorical(label_id, num_classes=len(cat_to_id))

        return x_pad, y_pad

    def get_data(self, filenname, text_length, vocab_dir='./data/cnews/cnews.vocab.txt'):
        """Load ``filenname`` and return ``(x, y)`` ready for the model.

        Args:
            filenname: path of the data file to load.
            text_length: length every text is padded/truncated to.
            vocab_dir: path of the vocabulary file (must already exist).
        """
        # Use this instance rather than a module-level global, so the method
        # works no matter what name the caller bound the object to.
        categories, cat_to_id = self.read_category()
        words, word_to_id = self.read_vocab(vocab_dir)
        x, y = self.process_file(filenname, word_to_id, cat_to_id, text_length)
        return x, y


class TextCNN(object):
    """Hyper-parameters and graph builders for a TextCNN classifier.

    Both models look up embeddings, apply 1-D convolution(s) followed by a
    max over the sequence axis, and finish with three fully connected layers
    that output one logit per class.
    """

    def __init__(self):
        self.text_length = 600  # length of every input text (in token ids)
        self.num_classer = 10  # number of classes

        self.vocab_size = 5000  # vocabulary size
        self.word_vec_dim = 64  # embedding dimension

        self.filter_width = 2  # convolution width used by model_2
        self.filter_width_list = [2, 3, 4]  # convolution widths used by model_1
        self.num_filters = 5  # number of filters per convolution

        # Passed to tf.nn.dropout as its second positional argument.
        self.dropout_prob = 0.5
        self.learning_rate = 0.005  # learning rate
        self.iter_num = 10  # number of passes over the training data
        self.batch_size = 64  # number of samples per batch
        self.model_save_path = './model/'  # checkpoint directory
        self.model_name = 'mnist_model'  # checkpoint file name prefix
        self.embedding = tf.get_variable('embedding', [self.vocab_size, self.word_vec_dim])

        self.fc1_size = 32  # units in the first fully connected layer
        self.fc2_size = 64  # units in the second fully connected layer
        self.fc3_size = 10  # units in the third (output) fully connected layer

    def model_1(self, x, is_train):
        """Build the multi-width model (one convolution per ``filter_width_list`` entry).

        Args:
            x: integer tensor of token ids fed to the embedding lookup.
            is_train: Python bool; dropout is only added when it is true.

        Returns:
            The output of the last fully connected layer (unnormalized logits).
        """
        embedding_res = tf.nn.embedding_lookup(self.embedding, x)

        pool_list = [self._conv_max_pool(embedding_res, filter_width)
                     for filter_width in self.filter_width_list]
        pool_res = tf.concat(pool_list, 1)

        in_dim = self.num_filters * len(self.filter_width_list)
        return self._dense_head(pool_res, in_dim, is_train)

    def model_2(self, x, is_train):
        """Build the single-width model (one convolution of ``filter_width``).

        Args:
            x: integer tensor of token ids fed to the embedding lookup.
            is_train: Python bool; dropout is only added when it is true.

        Returns:
            The output of the last fully connected layer (unnormalized logits).
        """
        embedding_res = tf.nn.embedding_lookup(self.embedding, x)
        pool_res = self._conv_max_pool(embedding_res, self.filter_width)
        return self._dense_head(pool_res, self.num_filters, is_train)

    def _conv_max_pool(self, embedding_res, filter_width):
        """Apply one conv1d + bias + ReLU and take the max over axis 1."""
        # Variables are created in the order weight, bias so that the
        # auto-generated variable names stay stable across runs.
        conv_w = self.get_weight([filter_width, self.word_vec_dim, self.num_filters], 0.01)
        conv_b = self.get_bias([self.num_filters])
        conv = tf.nn.conv1d(embedding_res, conv_w, stride=1, padding='VALID')
        conv_res = tf.nn.relu(tf.nn.bias_add(conv, conv_b))
        return tf.reduce_max(conv_res, axis=1)

    def _dense_head(self, features, in_dim, is_train):
        """Stack the three fully connected layers on ``features`` of width ``in_dim``."""
        # First fully connected layer.
        fc1_w = self.get_weight([in_dim, self.fc1_size], 0.01)
        fc1_b = self.get_bias([self.fc1_size])
        fc1_res = tf.nn.relu(tf.matmul(features, fc1_w) + fc1_b)
        if is_train:
            fc1_res = tf.nn.dropout(fc1_res, self.dropout_prob)

        # Second fully connected layer.
        fc2_w = self.get_weight([self.fc1_size, self.fc2_size], 0.01)
        fc2_b = self.get_bias([self.fc2_size])
        fc2_res = tf.nn.relu(tf.matmul(fc1_res, fc2_w) + fc2_b)
        if is_train:
            fc2_res = tf.nn.dropout(fc2_res, self.dropout_prob)

        # Third fully connected layer: no activation, returns raw logits.
        fc3_w = self.get_weight([self.fc2_size, self.fc3_size], 0.01)
        fc3_b = self.get_bias([self.fc3_size])
        return tf.matmul(fc2_res, fc3_w) + fc3_b

    def get_weight(self, shape, regularizer):
        """Create a weight variable and register its L2 penalty.

        The variable is initialized from a truncated normal (stddev 0.1). Its
        L2 regularization term, scaled by ``regularizer``, is added to the
        graph collection ``'losses'``.
        """
        w = tf.Variable(tf.truncated_normal(shape, stddev=0.1))
        tf.add_to_collection('losses', tf.contrib.layers.l2_regularizer(regularizer)(w))
        return w

    def get_bias(self, shape):
        """Create a bias variable of the given shape, initialized to ones."""
        b = tf.Variable(tf.ones(shape))
        return b

    def batch_iter(self, x, y):
        """Yield shuffled ``(x_batch, y_batch)`` pairs of at most ``batch_size`` rows.

        ``x`` and ``y`` are indexed with the same random permutation, so rows
        stay aligned. The last batch may be smaller. Nothing is yielded for
        empty input.
        """
        data_len = len(x)
        # Integer ceiling division; gives 0 batches for empty input instead
        # of a single empty batch.
        num_batch = (data_len + self.batch_size - 1) // self.batch_size
        indices = np.random.permutation(np.arange(data_len))
        x_shuffle = x[indices]
        y_shuffle = y[indices]
        for i in range(num_batch):
            start = i * self.batch_size
            end = min(start + self.batch_size, data_len)
            yield x_shuffle[start:end], y_shuffle[start:end]


# 訓練
def train(cnn, X_train, y_train):
    """Train ``cnn.model_1`` on the given data and checkpoint after every batch.

    Args:
        cnn: model object providing the hyper-parameters (``text_length``,
            ``num_classer``, ``learning_rate``, ``iter_num``,
            ``model_save_path``, ``model_name``), ``model_1`` and
            ``batch_iter``.
        X_train: training inputs, indexable by an integer array (see
            ``cnn.batch_iter``).
        y_train: training labels, parallel to ``X_train``.

    If a checkpoint already exists in ``cnn.model_save_path``, training
    resumes from it.
    """
    x = tf.placeholder(tf.int32, [None, cnn.text_length])
    y = tf.placeholder(tf.float32, [None, cnn.num_classer])
    y_pred = cnn.model_1(x, True)

    # Global counter of optimization steps, starting at 0; incremented by the
    # optimizer on every training step.
    global_step = tf.Variable(0, trainable=False)

    # Loss: mean softmax cross entropy ...
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=y_pred, labels=y)
    loss = tf.reduce_mean(cross_entropy)
    # ... plus whatever regularization terms were registered in the 'losses'
    # collection while the model was built. Without this the collected L2
    # penalties would have no effect on training.
    reg_losses = tf.get_collection('losses')
    if reg_losses:
        loss = loss + tf.add_n(reg_losses)

    # Optimizer.
    train_step = tf.train.AdamOptimizer(learning_rate=cnn.learning_rate).minimize(loss, global_step=global_step)

    saver = tf.train.Saver()  # saves and restores the graph variables

    # Make sure the checkpoint directory exists before the first save.
    os.makedirs(cnn.model_save_path, exist_ok=True)
    save_prefix = os.path.join(cnn.model_save_path, cnn.model_name)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # Resume from the most recent checkpoint, if there is one.
        ckpt = tf.train.get_checkpoint_state(cnn.model_save_path)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)

        # One pass over the (reshuffled) training data per iteration.
        for _ in range(cnn.iter_num):
            for x_batch, y_batch in cnn.batch_iter(X_train, y_train):
                # Fetch the step counter as well, so the log line reports the
                # actual number of training steps rather than the epoch index.
                _, loss_value, step = sess.run([train_step, loss, global_step],
                                               feed_dict={x: x_batch, y: y_batch})
                print('After %d training step(s), loss on training batch is %g.' % (step, loss_value))
                saver.save(sess, save_prefix, global_step=global_step)


# 預測
def predict(cnn, X_test, y_test):
    """Restore the latest checkpoint and print the accuracy on the test data.

    Args:
        cnn: model object providing ``text_length``, ``num_classer``,
            ``model_save_path`` and ``model_1``.
        X_test: test inputs, fed to the model in a single batch.
        y_test: one-hot test labels, parallel to ``X_test``.

    Prints ``'No checkpoint file found'`` and returns if there is no
    checkpoint in ``cnn.model_save_path``.
    """
    x = tf.placeholder(tf.int32, [None, cnn.text_length])
    y = tf.placeholder(tf.float32, [None, cnn.num_classer])
    y_pred = cnn.model_1(x, False)

    saver = tf.train.Saver()  # saves and restores the graph variables

    # Fraction of samples whose predicted class equals the labelled class.
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_pred, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    with tf.Session() as sess:
        ckpt = tf.train.get_checkpoint_state(cnn.model_save_path)
        if not (ckpt and ckpt.model_checkpoint_path):
            print('No checkpoint file found')
            return

        saver.restore(sess, ckpt.model_checkpoint_path)

        # Checkpoints saved with a global_step are named '<prefix>-<step>',
        # so the step count is the part after the last '-'.
        global_step = os.path.basename(ckpt.model_checkpoint_path).split('-')[-1]

        accuracy_score = sess.run(accuracy, feed_dict={x: X_test, y: y_test})
        print('After %s training step(s), test accuracy = %g' % (global_step, accuracy_score))


if __name__ == '__main__':
    # Script entry point: build the vocabulary if needed, then either train
    # the model or evaluate it on the test set.
    train_path = './data/cnews/cnews.train.txt'
    test_path = './data/cnews/cnews.test.txt'
    vocab_path = './data/cnews/cnews.vocab.txt'  # the file Text.get_data reads by default

    text = Text()
    cnn = TextCNN()
    text_length = cnn.text_length  # single source of truth for the input length

    # The vocabulary file has to exist before any data can be converted to
    # ids; build it from the training set on the first run.
    if not os.path.exists(vocab_path):
        text.build_vocab(train_path, vocab_path, cnn.vocab_size)

    is_train = True
    if is_train:
        # X_train has one row per text and text_length columns.
        X_train, y_train = text.get_data(train_path, text_length)
        train(cnn, X_train, y_train)
    else:
        # X_test has one row per text and text_length columns.
        X_test, y_test = text.get_data(test_path, text_length)
        predict(cnn, X_test, y_test)

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章