TensorFlow Image Captioning (4): Computation Graph Design and the Training Pipeline

Prerequisites

  1. The dataset. See: TensorFlow Image Captioning (1): Introduction to the flickr30k Dataset
  2. The vocabulary / word-frequency file. See: TensorFlow Image Captioning (2): Building the Vocabulary and Word Frequencies
  3. The feature file for all images. See: TensorFlow Image Captioning (3): Extracting Image Features

Training Pipeline

Following the order in which the code is written, the steps are:

  1. Load the vocabulary and build two mappings (see the small sketch after this list):
    word_to_id converts a word into its id
    id_to_word converts an id back into the word
  2. Convert the token file into the form:
    dict {'1234.jpg': ['4, 556, 44, 6, 57', '2223, 4, 54, 221']}
    i.e. a dictionary whose keys are image names and whose values are lists holding every description of that image.
  3. Load the image features
    Build batches
    For each image, randomly pick one of its descriptions
  4. Build the computation graph
    Four placeholders: the image features, the corresponding description, the mask (explained in the code), and the dropout keep probability.
    Word embedding and image embedding
    Feed the embeddings into the LSTM, then a fully-connected layer
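
To make the first two steps concrete, here is a minimal sketch of the data structures involved; the ids and file name below are made up for illustration only:

word_to_id = {'<UNK>': 0, '.': 1, 'a': 2, 'dog': 3}           # word -> id
id_to_word = {idx: word for word, idx in word_to_id.items()}  # id -> word

img_name_to_tokens = {'1234.jpg': ['a dog .', 'a dog dog .']}   # parsed token file
# encoding every description with the vocabulary then gives:
img_name_to_token_ids = {'1234.jpg': [[2, 3, 1], [2, 3, 3, 1]]}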

Code Implementation

# -*- coding:utf-8 -*-

import os
import sys
import tensorflow as tf
from tensorflow import gfile
from tensorflow import logging
import pprint
import pickle
import numpy as np
import math
import random

# print INFO-level logs
tf.logging.set_verbosity(tf.logging.INFO)


input_description_file = "./data/results_20130124.token"
input_img_feature_dir = './data/download_inception_v3_features'
input_vocab_file = './data/vocab.txt'
output_dir = './data/local_run'

if not gfile.Exists(output_dir):
    gfile.MakeDirs(output_dir)


def get_default_params():
    return tf.contrib.training.HParams(
        num_vocab_word_threshold=3,
        num_embedding_nodes=32,
        num_timesteps=10,
        num_lstm_nodes=[64, 64],
        num_lstm_layers=2,
        num_fc_nodes=32,
        batch_size=100,
        cell_type='lstm',
        clip_lstm_grads=1.0,
        learning_rate=0.001,
        keep_prob=0.8,
        log_frequent=500,
        save_frequent=5000,
    )

training_steps = 1000000

hps = get_default_params()


class Vocab(object):
    '''
    Builds the vocabulary from the word-frequency file.
    '''
    def __init__(self, filename, word_num_threshold):
        self._id_to_word = {} # mapping from word id to word
        self._word_to_id = {} # mapping from word to word id
        self._unk = -1
        self._eos = -1
        self._word_num_threshold = word_num_threshold
        self._read_dict(filename) # read the vocabulary file into the two dicts

    def _read_dict(self, filename):
        '''
        Reads the vocabulary file into dict form.
        :param filename: vocabulary file
        :return:
        '''
        with gfile.GFile(filename, 'r') as f:
            lines = f.readlines()
        for line in lines:
            # occurrence is the word frequency
            word, occurrence = line.strip('\r\n').split('\t')
            occurrence = int(occurrence)
            if word != '<UNK>' and occurrence < self._word_num_threshold:
                continue
            # ids are assigned in the order the words enter the dict
            idx = len(self._id_to_word)
            if word == '<UNK>':
                self._unk = idx
            elif word == '.':
                self._eos = idx
            if idx in self._id_to_word or word in self._word_to_id:
                raise Exception('duplicate words in vocab file')
            # build the two mappings
            self._word_to_id[word] = idx
            self._id_to_word[idx] = word

    @property
    def unk(self):
        return self._unk

    @property
    def eos(self):
        return self._eos

    def word_to_id(self, word):
        '''
        Converts a single word into its id.
        :param word: the word
        :return: word id
        '''
        return self._word_to_id.get(word, self.unk)

    def id_to_word(self, cur_id):
        '''
        Converts a word id back into the word.
        :param cur_id: word id
        :return: the word
        '''
        return self._id_to_word.get(cur_id, '<UNK>')

    def size(self):
        # vocabulary size
        return len(self._word_to_id)

    def encode(self, sentence):
        '''
        Converts every word in a description into its id.
        :param sentence: the description sentence
        :return: the sentence as a list of word ids
        '''
        word_ids = [self.word_to_id(cur_word) for cur_word in sentence.split(' ')]
        return word_ids

    def decode(self, sentence_id):
        '''
        Converts a sentence of word ids back into a sentence of words.
        :param sentence_id: list of word ids
        :return: the sentence as a string
        '''
        words = [self.id_to_word(word_id) for word_id in sentence_id]
        return ' '.join(words)
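
# A quick illustration of how Vocab is used (hypothetical ids, assuming the vocab
# file maps 'a' -> 2, 'dog' -> 3, '.' -> 1):
#   vocab.encode('a dog .')  ->  [2, 3, 1]
#   vocab.decode([2, 3, 1])  ->  'a dog .'
# Words missing from the vocab map to vocab.unk; unknown ids decode to '<UNK>'.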


def parse_token_file(token_file):
    '''
    Parses the token file.
    :param token_file: path to the token file
    :return: dict of the form {'1234.jpg': ['this is a people', 'the people is happy']}
    '''
    img_name_to_tokens = {}
    with gfile.GFile(token_file, 'r') as f:
        lines = f.readlines()
    for line in lines:
        img_id, description = line.strip('\r\n').split('\t')
        img_name, _ = img_id.split('#')
        img_name_to_tokens.setdefault(img_name, [])
        img_name_to_tokens[img_name].append(description)
    return img_name_to_tokens
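
# Each line of the flickr30k token file looks roughly like (tab-separated):
#   1234.jpg#0<TAB>A dog runs across the yard .
# where '#0' is the index of that description for the image, so parse_token_file yields e.g.:
#   {'1234.jpg': ['A dog runs across the yard .', 'A dog is outside .', ...]}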


def convert_token_to_id(img_name_to_tokens, vocab):
    '''
    In short: takes the result of the previous function and replaces each description text with its id representation.
    :param img_name_to_tokens: dict from image name to its descriptions
    :param vocab: the vocabulary
    :return: dict of the form {'1234.jpg': [[4, 556, 44, 6757], [2223, 4354, 22, 1]]}
    '''
    img_name_to_token_ids = {}
    for img_name in img_name_to_tokens:
        img_name_to_token_ids.setdefault(img_name, [])
        descriptions = img_name_to_tokens[img_name]
        for description in descriptions:
            token_ids = vocab.encode(description)
            img_name_to_token_ids[img_name].append(token_ids)
    return img_name_to_token_ids


vocab = Vocab(input_vocab_file, hps.num_vocab_word_threshold)
vocab_size = vocab.size() # vocabulary size
logging.info("vocab_size: %d" % vocab_size)


img_name_to_tokens = parse_token_file(input_description_file)
# the descriptions corresponding to each image, converted to id form
img_name_to_token_ids = convert_token_to_id(img_name_to_tokens, vocab)


class ImageCaptionData(object):
    '''
    Data provider: supplies training batches.
    '''
    def __init__(self,
                 img_name_to_token_ids,
                 img_feature_dir,
                 num_timesteps,
                 vocab,
                 deterministic=False):
        '''
        :param img_name_to_token_ids: dict from image name to its descriptions (as ids)
        :param img_feature_dir: directory holding the image feature files
        :param num_timesteps: number of timesteps
        :param vocab: the vocabulary
        :param deterministic: if False, the data is shuffled
        '''
        self._vocab = vocab
        self._all_img_feature_filepaths = [] # full paths of the image feature files
        for filename in gfile.ListDirectory(img_feature_dir):
            self._all_img_feature_filepaths.append(os.path.join(img_feature_dir, filename))

        self._img_name_to_token_ids = img_name_to_token_ids
        self._num_timesteps = num_timesteps
        self._indicator = 0 # start position of the next batch
        self._deterministic = deterministic
        self._img_feature_filenames = [] # image file names, aligned with the feature rows
        self._img_feature_data = [] # all image features
        self._load_img_feature_pickle()
        if not self._deterministic:
            self._random_shuffle()

    def _load_img_feature_pickle(self):
        '''
        Loads the image features from the pickle files.
        :return:
        '''
        for filepath in self._all_img_feature_filepaths:
            with gfile.GFile(filepath, 'rb') as f:
                filenames, features = pickle.load(f, encoding='iso-8859-1')
                self._img_feature_filenames += filenames # concatenate the filename lists
                self._img_feature_data.append(features) # collect the feature arrays
        # e.g. two arrays of shape (1000, 1, 1, 2048) become (2000, 1, 1, 2048) after stacking
        self._img_feature_data = np.vstack(self._img_feature_data)
        origin_shape = self._img_feature_data.shape
        # origin_shape is now (31783, 1, 1, 2048)
        self._img_feature_data = np.reshape( # squeeze out the two singleton dimensions
            self._img_feature_data, (origin_shape[0], origin_shape[3]))
        self._img_feature_filenames = np.asarray(self._img_feature_filenames)
        print(self._img_feature_data.shape) # (31783, 2048)
        print(self._img_feature_filenames.shape) # (31783,)
        if not self._deterministic:
            self._random_shuffle()

    def size(self):
        # number of image files
        return len(self._img_feature_filenames)

    def img_feature_size(self):
        # dimensionality of the image features
        return self._img_feature_data.shape[1]

    def _random_shuffle(self):
        p = np.random.permutation(self.size())
        self._img_feature_filenames = self._img_feature_filenames[p]
        self._img_feature_data = self._img_feature_data[p]

    def _img_desc(self, filenames):
        '''
        For each image, randomly picks one of its descriptions.
        :param filenames: image names in the batch
        :return: (batch_sentence_ids, batch_weights)
        '''
        batch_sentence_ids = []
        batch_weights = [] # used later to mask out the loss on padded positions
        for filename in filenames:
            token_ids_set = self._img_name_to_token_ids[filename]
            chosen_token_ids = random.choice(token_ids_set) # pick one description at random
            #chosen_token_ids = token_ids_set[0]
            chosen_token_length = len(chosen_token_ids)

            weight = [1 for i in range(chosen_token_length)]
            if chosen_token_length >= self._num_timesteps:
                chosen_token_ids = chosen_token_ids[0:self._num_timesteps]
                weight = weight[0:self._num_timesteps]
            else: # otherwise pad up to num_timesteps
                # number of padding positions needed
                remaining_length = self._num_timesteps - chosen_token_length
                # use + rather than += so the list stored in the dict is not mutated in place
                chosen_token_ids = chosen_token_ids + [self._vocab.eos for i in range(remaining_length)]
                weight += [0 for i in range(remaining_length)]
            batch_sentence_ids.append(chosen_token_ids)
            batch_weights.append(weight)
        batch_sentence_ids = np.asarray(batch_sentence_ids)
        batch_weights = np.asarray(batch_weights)
        # return the batch of sentence ids and their weights
        return batch_sentence_ids, batch_weights
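
    # Worked example with num_timesteps = 10 (made-up ids):
    #   chosen_token_ids = [12, 7, 33]                                     (length 3 < 10)
    #   padded ids       = [12, 7, 33, eos, eos, eos, eos, eos, eos, eos]
    #   weight           = [1, 1, 1, 0, 0, 0, 0, 0, 0, 0]
    # A description longer than 10 ids is simply truncated and its weight is all ones.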

    def next(self, batch_size):
        '''
        Returns batch_size examples.
        Steps:
        1. get the image names
        2. get the image features
        3. get the image descriptions
        :param batch_size:
        :return:
        '''
        end_indicator = self._indicator + batch_size
        if end_indicator > self.size():
            if not self._deterministic:
                self._random_shuffle()
            self._indicator = 0
            end_indicator = self._indicator + batch_size
        assert end_indicator <= self.size()

        batch_img_features = self._img_feature_data[self._indicator: end_indicator]
        batch_img_names = self._img_feature_filenames[self._indicator: end_indicator]

        # batch_sentence_ids holds the descriptions in id form;
        # batch_weights is a mask: sentence_ids [100, 101, 102, 0, 0, 0] -> weights [1, 1, 1, 0, 0, 0].
        #   It is multiplied with the per-timestep loss so the padded positions contribute nothing to the loss.
        batch_sentence_ids, batch_weights = self._img_desc(batch_img_names)

        self._indicator = end_indicator
        return batch_img_features, batch_sentence_ids, batch_weights, batch_img_names


caption_data = ImageCaptionData(img_name_to_token_ids, input_img_feature_dir, hps.num_timesteps, vocab)
img_feature_dim = caption_data.img_feature_size()

def create_rnn_cell(hidden_dim, cell_type):
    '''
    Returns an RNN cell of the requested type.
    :param hidden_dim: number of hidden units
    :param cell_type: 'lstm' or 'gru'
    :return:
    '''
    if cell_type == 'lstm':
        return tf.contrib.rnn.BasicLSTMCell(hidden_dim, state_is_tuple=True)
    elif cell_type == 'gru':
        return tf.contrib.rnn.GRUCell(hidden_dim)
    else:
        raise Exception("%s has not been supported" % cell_type)


def dropout(cell, keep_prob):
    return tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)


def get_train_model(hps, vocab_size, img_feature_dim):
    num_timesteps = hps.num_timesteps
    batch_size = hps.batch_size

    img_feature = tf.placeholder(tf.float32, (batch_size, img_feature_dim))
    sentence = tf.placeholder(tf.int32, (batch_size, num_timesteps))
    mask = tf.placeholder(tf.float32, (batch_size, num_timesteps))
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

    global_step = tf.Variable(tf.zeros([], tf.int64), name='global_step', trainable=False)

    '''
        Training process:
        sentence: [a, b, c, d, e, f]

        actual input: [img, a, b, c, d, e]
        image feature, e.g. [0.3, 0.5, 0.2, 0.9]
        predict #1: img_feature -> embedding_img -> (a)
        predict #2: a -> embedding_word -> lstm -> b
        predict #3: b ->                        -> c
    '''
    # Sets up the embedding layer.
    embedding_initializer = tf.random_uniform_initializer(-1.0, 1.0)
    # tf.random_uniform_initializer() creates an initializer that samples from a uniform distribution
    # Reference: https://www.w3cschool.cn/tensorflow_python/tensorflow_python-f1np2gyt.html
    with tf.variable_scope('embedding', initializer=embedding_initializer):
        embeddings = tf.get_variable(
            'embeddings',
            [vocab_size, hps.num_embedding_nodes],
            tf.float32)
        embed_token_ids = tf.nn.embedding_lookup(embeddings, sentence[:, 0:num_timesteps - 1])
        # embed_token_ids shape: [batch_size, num_timesteps - 1, num_embedding_nodes]

    # Embed the image feature.
    # The image feature is a 2048-d vector; a fully-connected layer projects it to the same length as a word
    # embedding, so the image embedding can be concatenated with the word embeddings and used for prediction.
    img_feature_embed_init = tf.uniform_unit_scaling_initializer(factor=1.0)
    # Reference: https://www.w3cschool.cn/tensorflow_python/tensorflow_python-fy6t2o0o.html
    with tf.variable_scope('image_feature_embed', initializer=img_feature_embed_init):
        # img_feature:[batch_size, img_feature_dim]
        # embed_img: [batch_size, num_embedding_nodes]
        embed_img = tf.layers.dense(img_feature, hps.num_embedding_nodes)
        embed_img = tf.expand_dims(embed_img, 1)
        # embed_inputs shape: [batch_size, num_timesteps, num_embedding_nodes]
        embed_inputs = tf.concat([embed_img, embed_token_ids], axis=1)

    # Sets up LSTM network.
    scale = 1.0 / math.sqrt(hps.num_embedding_nodes + hps.num_lstm_nodes[-1]) / 3.0
    lstm_init = tf.random_uniform_initializer(-scale, scale)
    with tf.variable_scope('lstm_nn', initializer=lstm_init):
        cells = []
        for i in range(hps.num_lstm_layers):
            cell = create_rnn_cell(hps.num_lstm_nodes[i], hps.cell_type)
            cell = dropout(cell, keep_prob)
            cells.append(cell)
        cell = tf.contrib.rnn.MultiRNNCell(cells)

        initial_state = cell.zero_state(hps.batch_size, tf.float32)
        # rnn_outputs: [batch_size, num_timesteps, hps.num_lstm_nodes[-1]]
        rnn_outputs, _ = tf.nn.dynamic_rnn(cell,
                                           embed_inputs,
                                           initial_state=initial_state)

    # Sets up the fully-connected layer.
    fc_init = tf.uniform_unit_scaling_initializer(factor=1.0)
    with tf.variable_scope('fc', initializer=fc_init):
        # To feed rnn_outputs into a fully-connected layer, merge the first two dimensions and keep the last one.
        rnn_outputs_2d = tf.reshape(rnn_outputs, [-1, hps.num_lstm_nodes[-1]])
        fc1 = tf.layers.dense(rnn_outputs_2d, hps.num_fc_nodes, name='fc1')
        fc1_dropout = tf.nn.dropout(fc1, keep_prob)
        fc1_dropout = tf.nn.relu(fc1_dropout)
        logits = tf.layers.dense(fc1_dropout, vocab_size, name='logits')
        # logits holds the (unnormalized) scores over the whole vocabulary
        # logits shape: [batch_size * num_timesteps, vocab_size]
        # Note that dropout in the fully-connected layer is different from dropout in the LSTM,
        # where tf.contrib.rnn.DropoutWrapper() is used instead


    with tf.variable_scope('loss'):
        # The first two dimensions were flattened before the fully-connected layer, so the ground truth must be flattened the same way.
        '''
        A longer note here so I don't forget later:
        Before the fully-connected layer the data was reshaped to 2-D, i.e.
            [
                [1.jpg timestep 1, size of the last LSTM layer],
                [1.jpg timestep 2, size of the last LSTM layer],
                ...
                [2.jpg timestep 1, size of the last LSTM layer],
                [2.jpg timestep 2, size of the last LSTM layer]
            ]
        so the final logits are
            [the distribution predicted at 1.jpg timestep 1,
             the distribution predicted at 1.jpg timestep 2,
             ...
             the distribution predicted at 2.jpg timestep 1,
            ]
        Likewise, after reshaping, sentence becomes
            [
                1.jpg timestep 1 ground truth,
                1.jpg timestep 2 ground truth,
                ...
                2.jpg timestep 1 ground truth,
                2.jpg timestep 2 ground truth
            ]
        so the predictions and the ground truth line up exactly.
        '''
        sentence_flatten = tf.reshape(sentence, [-1])
        mask_flatten = tf.reshape(mask, [-1])
        mask_sum = tf.reduce_sum(mask_flatten)
        # sparse_softmax_cross_entropy_with_logits does three things:
        # 1. softmax over logits, 2. treats labels as class indices (no explicit one-hot needed), 3. computes the cross-entropy
        softmax_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=sentence_flatten)
        weighted_softmax_loss = tf.multiply(softmax_loss,
                                            tf.cast(mask_flatten, tf.float32))

        prediction = tf.argmax(logits, 1) # predicted word id at each position
        # compare predictions with the ground truth
        correct_prediction = tf.equal(tf.cast(prediction, tf.int32), sentence_flatten)
        # use the mask to ignore the padded positions
        correct_prediction_with_mask = tf.multiply(
            tf.cast(correct_prediction, tf.float32),
            mask_flatten)
        accuracy = tf.reduce_sum(correct_prediction_with_mask) / mask_sum
        loss = tf.reduce_sum(weighted_softmax_loss) / mask_sum
        tf.summary.scalar('loss', loss)
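
        # Tiny numeric illustration of the masking (made-up numbers):
        #   per-position loss = [2.0, 1.5, 1.0, 3.0]
        #   mask              = [1.0, 1.0, 1.0, 0.0]
        #   weighted sum      = 2.0 + 1.5 + 1.0 = 4.5, mask_sum = 3
        #   loss              = 4.5 / 3 = 1.5   (the padded position is ignored)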

    with tf.variable_scope('train_op'):
        tvars = tf.trainable_variables()
        for var in tvars:
            logging.info("variable name: %s" % (var.name))
        grads, _ = tf.clip_by_global_norm( # clip the gradients
            tf.gradients(loss, tvars), hps.clip_lstm_grads)
        for grad, var in zip(grads, tvars):
            tf.summary.histogram('%s_grad' % (var.name), grad)
        optimizer = tf.train.AdamOptimizer(hps.learning_rate)
        train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step)

    return ((img_feature, sentence, mask, keep_prob),
            (loss, accuracy, train_op),
            global_step)


placeholders, metrics, global_step = get_train_model(hps, vocab_size, img_feature_dim)
img_feature, sentence, mask, keep_prob = placeholders
loss, accuracy, train_op = metrics

summary_op = tf.summary.merge_all()

init_op = tf.global_variables_initializer()
saver = tf.train.Saver(max_to_keep=10)

with tf.Session() as sess:
    sess.run(init_op)
    writer = tf.summary.FileWriter(output_dir, sess.graph)
    for i in range(training_steps):
        batch_img_features, batch_sentence_ids, batch_weights, _ = caption_data.next(hps.batch_size)
        input_vals = (batch_img_features, batch_sentence_ids, batch_weights, hps.keep_prob)

        feed_dict = dict(zip(placeholders, input_vals))
        fetches = [global_step, loss, accuracy, train_op]

        should_log = (i + 1) % hps.log_frequent == 0
        should_save = (i + 1) % hps.save_frequent == 0
        if should_log:
            fetches += [summary_op]
        outputs = sess.run(fetches, feed_dict)
        global_step_val, loss_val, accuracy_val = outputs[0:3]
        if should_log:
            summary_str = outputs[4]
            writer.add_summary(summary_str, global_step_val)
            logging.info('Step: %5d, loss: %3.3f, accuracy: %3.3f'
                         % (global_step_val, loss_val, accuracy_val))
        if should_save:
            logging.info("Step: %d, image caption model saved" % (global_step_val))
            saver.save(sess, os.path.join(output_dir, "image_caption"), global_step=global_step_val)
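
The saver above writes a checkpoint to output_dir every save_frequent steps. As a side note, a minimal sketch for loading the most recent checkpoint again (assuming the same graph has been rebuilt with get_train_model exactly as above) might look like this:

# Hypothetical restore sketch; not part of the original training script.
ckpt_path = tf.train.latest_checkpoint(output_dir)  # e.g. './data/local_run/image_caption-5000'
if ckpt_path is not None:
    restore_saver = tf.train.Saver()
    with tf.Session() as restore_sess:
        restore_saver.restore(restore_sess, ckpt_path)
        # the session now holds the trained weights and can resume training or run evaluation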

After a million training steps I somehow forgot to save the final results. As far as I remember the accuracy was only about 0.38, getting close to 0.40. That is actually not bad; when training started it was somewhere around 0.1x.
And with that, image captioning with TensorFlow comes to an end.
