深度學習四:tensorflow-使用卷積神經網絡識別手寫數字

當你安裝了tensorflow後,tensorflow自帶的教程演示了如何使用卷積神經網絡來識別手寫數字。代碼路徑爲tensorflow-master\tensorflow\examples\tutorials\mnist\mnist_deep.py。
爲了快速測試該程序,我提前將需要的mnist手寫數字庫下載到了工程目錄(我在pycharm中新建了工程,並把mnist_deep.py中的代碼拷貝過去)下的input_data目錄下:
這裏寫圖片描述
然後,需要修改程序中指定mnist圖片庫路徑的代碼,

if __name__ == '__main__':
  print("main run")
  parser = argparse.ArgumentParser()
  # The `default` below is the one value edited relative to the stock
  # tutorial: it points at the local input_data directory that already
  # holds the downloaded MNIST archives.
  parser.add_argument('--data_dir', type=str,
                      default='input_data',
                      help='Directory for storing input data')
  # Arguments argparse does not recognise are passed on to tf.app.run.
  FLAGS, unparsed = parser.parse_known_args()
  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)

將這裏的default的值改爲input_data即可。然後程序就可以運行了。默認會訓練20000批次,每個批次50個數據。運行完成後準確率達到99.2%。
接下來,這裏會對該程序做一點分析,並且做一些修改,來驗證一些咱們的猜想,並且加深對代碼的理解。

# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""A deep MNIST classifier using convolutional layers.

See extensive documentation at
https://www.tensorflow.org/get_started/mnist/pros
"""
# Disable linter warnings to maintain consistency with tutorial.
# pylint: disable=invalid-name
# pylint: disable=g-bad-import-order

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import sys

from tensorflow.examples.tutorials.mnist import input_data

import tensorflow as tf

FLAGS = None


def deepnn(x):
  """Assemble the convolutional digit classifier on top of `x`.

  The graph consists of two conv/ReLU/max-pool stages, a 1024-unit fully
  connected layer with dropout, and a final 10-way linear readout.

  Args:
    x: an input tensor with the dimensions (N_examples, 784), where each row
      is one flattened 28x28 MNIST image.

  Returns:
    A tuple (y, keep_prob). y is a tensor of shape (N_examples, 10) holding
    the logits for the digits 0-9. keep_prob is a scalar placeholder that
    feeds the dropout keep probability.
  """
  def conv_relu_pool(inputs, in_channels, out_channels):
    # 5x5 convolution plus bias, ReLU, then 2x downsampling.
    kernel = weight_variable([5, 5, in_channels, out_channels])
    offset = bias_variable([out_channels])
    activated = tf.nn.relu(conv2d(inputs, kernel) + offset)
    return max_pool_2x2(activated)

  # Restore the 2-D layout; the trailing 1 is the single grayscale feature
  # plane (it would be 3 for RGB, 4 for RGBA).
  images = tf.reshape(x, [-1, 28, 28, 1])

  # Stage 1: 1 -> 32 feature maps. Stage 2: 32 -> 64 feature maps.
  stage1 = conv_relu_pool(images, 1, 32)
  stage2 = conv_relu_pool(stage1, 32, 64)

  # Two rounds of 2x downsampling leave 7x7 maps, 64 of them per image.
  flat_size = 7 * 7 * 64
  fc1_weights = weight_variable([flat_size, 1024])
  fc1_bias = bias_variable([1024])

  flattened = tf.reshape(stage2, [-1, flat_size])
  hidden = tf.nn.relu(tf.matmul(flattened, fc1_weights) + fc1_bias)

  # Dropout on the hidden layer; the caller feeds keep_prob at run time.
  keep_prob = tf.placeholder(tf.float32)
  hidden_dropped = tf.nn.dropout(hidden, keep_prob)

  # Linear readout: 1024 features -> one logit per digit.
  fc2_weights = weight_variable([1024, 10])
  fc2_bias = bias_variable([10])

  logits = tf.matmul(hidden_dropped, fc2_weights) + fc2_bias
  return logits, keep_prob


def conv2d(x, W):
  """Convolve `x` with filters `W` using stride 1 and 'SAME' padding."""
  unit_strides = [1, 1, 1, 1]
  return tf.nn.conv2d(x, W, strides=unit_strides, padding='SAME')


def max_pool_2x2(x):
  """Max-pool `x` over 2x2 windows moved in steps of 2 ('SAME' padding)."""
  window = [1, 2, 2, 1]
  step = [1, 2, 2, 1]
  return tf.nn.max_pool(x, ksize=window, strides=step, padding='SAME')


def weight_variable(shape):
  """Create a weight Variable of `shape`, truncated-normal with stddev 0.1."""
  return tf.Variable(tf.truncated_normal(shape, stddev=0.1))


def bias_variable(shape):
  """Create a bias Variable of `shape` with every entry set to 0.1."""
  return tf.Variable(tf.constant(0.1, shape=shape))


def main(_):
  """Train the MNIST convnet for 1000 steps and print its test accuracy.

  The dataset directory is read from the module-level FLAGS.data_dir; the
  positional argument is ignored.
  """
  print("inport data")
  mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)

  # Graph inputs: flattened images and their one-hot labels.
  x = tf.placeholder(tf.float32, [None, 784])
  y_ = tf.placeholder(tf.float32, [None, 10])

  # Logits of the deep net plus its dropout keep-probability placeholder.
  y_conv, keep_prob = deepnn(x)

  # Loss: mean softmax cross-entropy over the batch, minimised with Adam.
  per_example_loss = tf.nn.softmax_cross_entropy_with_logits(
      labels=y_, logits=y_conv)
  cross_entropy = tf.reduce_mean(per_example_loss)
  train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

  # Accuracy: fraction of rows whose arg-max logit matches the label.
  predicted_digit = tf.argmax(y_conv, 1)
  true_digit = tf.argmax(y_, 1)
  hits = tf.cast(tf.equal(predicted_digit, true_digit), tf.float32)
  accuracy = tf.reduce_mean(hits)

  print("start train")
  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(1000):
      batch = mnist.train.next_batch(50)
      images, labels = batch[0], batch[1]
      if step % 100 == 0:
        # Progress report on the current batch, evaluated with keep_prob 1.0.
        report_feed = {x: images, y_: labels, keep_prob: 1.0}
        train_accuracy = accuracy.eval(feed_dict=report_feed)
        print('step %d, training accuracy %g' % (step, train_accuracy))
      # One optimisation step, keeping half of the hidden activations.
      train_step.run(feed_dict={x: images, y_: labels, keep_prob: 0.5})

    test_feed = {
        x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0}
    print('test accuracy %g' % accuracy.eval(feed_dict=test_feed))


if __name__ == '__main__':
  print("main run")
  # Single option: where the MNIST archives live (defaults to ./input_data).
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--data_dir',
      type=str,
      default='input_data',
      help='Directory for storing input data')
  # Known flags land in FLAGS; everything else is forwarded to tf.app.run
  # together with the program name.
  FLAGS, unparsed = parser.parse_known_args()
  forwarded_argv = [sys.argv[0]] + unparsed
  tf.app.run(main=main, argv=forwarded_argv)

這個程序做了如下幾點:

第一、裝載數據

一開始使用mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)來導入mnist圖片庫。

第二、使用deepnn函數來構造神經網絡

deepnn構造神經網絡的過程如下:

2-1 調整輸入的圖片

 x_image = tf.reshape(x, [-1, 28, 28, 1])

輸入的圖片本來是一維的,需要把它調整爲四維的。-1表示任意多個,兩個28指的是圖片的長和寬,1是說圖片是灰度圖,只有一個通道。

2-2構建卷積層

  W_conv1 = weight_variable([5, 5, 1, 32])
  b_conv1 = bias_variable([32])
  h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)

weight_variable如下:

def weight_variable(shape):
  """Create a weight Variable of `shape`, truncated-normal with stddev 0.1."""
  return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

其實tensorflow中沒有層的概念。tensorflow的整體用一個圖表示,可類比爲一個神經網絡。圖則由操作構成。因此這裏聲明了兩個變量,一個保存權重,一個保存偏置,每一個變量的添加也是一個操作。然後使用tf.nn.relu創建另一個操作。這個操作是一個運算:首先計算輸入圖像與權重的卷積,然後加上偏置,最後由激活函數relu對「卷積加偏置」的結果做激活運算。這個過程便是一個神經網絡的卷積層所做的事,因此可以看做是一個卷積層。weight_variable的兩個5表明卷積核大小爲5x5,1是輸入圖像的通道數,32表明該卷積層會提取32個特徵,也就是會輸出32個maps。
relu是神經元的激活函數,它類似於sigmoid,之後我們會對他們做一個對比,看看哪個激活函數有更好的表現。
tf.truncated_normal用來給權重做隨機的初始化。

2-3構建池化層

  h_pool1 = max_pool_2x2(h_conv1)

max_pool_2x2函數如下:

def max_pool_2x2(x):
  """Max-pool `x` over 2x2 windows moved in steps of 2 ('SAME' padding)."""
  window = [1, 2, 2, 1]
  step = [1, 2, 2, 1]
  return tf.nn.max_pool(x, ksize=window, strides=step, padding='SAME')

ksize是一個四維的結構,四個值依次對應輸入張量的[batch, height, width, channels]四個維度:第一個1表示不跨樣本做池化,兩個2表明池化核的大小爲2x2,最後的1表示不跨通道做池化(每個通道各自池化)。strides指的是池化核移動的步幅,格式與ksize相同,這裏表示在高和寬兩個方向上每次移動2格。

2-4全連接層

deepnn構建了兩個卷積層,每個卷積層都跟着一個池化層。之後是兩個全連接層:

  W_fc1 = weight_variable([7 * 7 * 64, 1024])
  b_fc1 = bias_variable([1024])

  h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64])
  h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)

  # Dropout - controls the complexity of the model, prevents co-adaptation of
  # features.
  keep_prob = tf.placeholder(tf.float32)
  h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

  # Map the 1024 features to 10 classes, one for each digit
  W_fc2 = weight_variable([1024, 10])
  b_fc2 = bias_variable([10])

  y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2

全連接層的構建只需要指明輸入的大小和輸出的大小。輸入的大小爲第二個池化層的所有的輸出,輸出的大小爲1024,也就是這一層有1024個神經元。緊接着又跟了一個大小爲10個神經元的全連接層,用來表示10個手寫數字。

第三、訓練神經網絡

  cross_entropy = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv))
  train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
  correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
  accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

這裏使用交叉熵作爲誤差函數。tf.argmax求一個tensor的最大值的標號,tf.equal用來比對兩個標號是否相等。也就是計算出的數字和應該的數字比對是否相等。
tf.train.AdamOptimizer是使用Adam算法的優化器。minimize方法是結合交叉熵代價函數與adam算法,讓神經網絡的代價不斷減少的方法。這個方法其實是compute_gradients()方法和apply_gradients()方法的組合。Adam的收斂速度會比較快,1e-4是學習速率。
除了Adam算法的優化器外,tensorflow還提供了一些優化器,比如:
class tf.train.GradientDescentOptimizer–梯度下降算法的優化器
class tf.train.AdadeltaOptimizer – 使用adadelta算法的優化器
class tf.train.AdagradOptimizer – 使用Adagrad算法的優化器
class tf.train.MomentumOptimizer – 使用Momentum算法的優化器
等等。大家可以自行嘗試哪種優化器有更好的效果。
tf.reduce_mean用來求平均值:這裏先用它計算一個批次的平均交叉熵,再用它把「預測是否正確」取平均,得到準確率。

修改激活函數

關於代碼的講解暫時就說這麼多。下面我們做一點嘗試,將激活函數改爲sigmoid:
爲了能快速看到結果,以下,我均將訓練次數改爲了1000次。

inport data
Extracting input_data\train-images-idx3-ubyte.gz
Extracting input_data\train-labels-idx1-ubyte.gz
Extracting input_data\t10k-images-idx3-ubyte.gz
Extracting input_data\t10k-labels-idx1-ubyte.gz
start train
step 0, training accuracy 0.04
step 100, training accuracy 0.08
step 200, training accuracy 0.16
step 300, training accuracy 0.14
step 400, training accuracy 0.34
step 500, training accuracy 0.44
step 600, training accuracy 0.68
step 700, training accuracy 0.7
step 800, training accuracy 0.7
step 900, training accuracy 0.7
test accuracy 0.7732

relu作爲激活函數:

main run
inport data
Extracting input_data\train-images-idx3-ubyte.gz
Extracting input_data\train-labels-idx1-ubyte.gz
Extracting input_data\t10k-images-idx3-ubyte.gz
Extracting input_data\t10k-labels-idx1-ubyte.gz
start train
step 0, training accuracy 0.04
step 100, training accuracy 0.86
step 200, training accuracy 0.92
step 300, training accuracy 0.86
step 400, training accuracy 0.96
step 500, training accuracy 0.92
step 600, training accuracy 0.98
step 700, training accuracy 0.96
step 800, training accuracy 0.9
step 900, training accuracy 1
test accuracy 0.9607

可見relu似乎是更好的激活函數

增加卷積層

再增加一個卷積層和一個池化層後,deepnn方法如下:

def deepnn(x):
  """deepnn builds the graph for a three-conv-layer net classifying digits.

  Compared with the two-layer tutorial network, a third convolution/pooling
  stage (64 -> 128 feature maps) is inserted before the fully connected
  layers.

  Args:
    x: an input tensor with the dimensions (N_examples, 784), where 784 is the
    number of pixels in a standard MNIST image.

  Returns:
    A tuple (y, keep_prob). y is a tensor of shape (N_examples, 10), with values
    equal to the logits of classifying the digit into one of 10 classes (the
    digits 0-9). keep_prob is a scalar placeholder for the probability of
    dropout.
  """
  # Reshape to use within a convolutional neural net.
  # Last dimension is for "features" - there is only one here, since images are
  # grayscale -- it would be 3 for an RGB image, 4 for RGBA, etc.
  x_image = tf.reshape(x, [-1, 28, 28, 1])

  # First convolutional layer - maps one grayscale image to 32 feature maps.
  W_conv1 = weight_variable([5, 5, 1, 32])
  b_conv1 = bias_variable([32])
  h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)

  # First pooling layer - downsamples by 2X (28x28 -> 14x14).
  h_pool1 = max_pool_2x2(h_conv1)

  # Second convolutional layer -- maps 32 feature maps to 64.
  W_conv2 = weight_variable([5, 5, 32, 64])
  b_conv2 = bias_variable([64])
  h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)

  # Second pooling layer (14x14 -> 7x7).
  h_pool2 = max_pool_2x2(h_conv2)

  # Third convolutional layer -- maps 64 feature maps to 128.
  W_conv3 = weight_variable([5, 5, 64, 128])
  b_conv3 = bias_variable([128])
  h_conv3 = tf.nn.relu(conv2d(h_pool2, W_conv3) + b_conv3)

  # Third pooling layer. With 'SAME' padding the odd 7x7 input rounds up,
  # giving 4x4 maps.
  h_pool3 = max_pool_2x2(h_conv3)

  # Fully connected layer 1 -- after 3 rounds of downsampling, our 28x28 image
  # is down to 4x4x128 feature maps -- maps this to 1024 features.
  W_fc1 = weight_variable([4 * 4 * 128, 1024])
  b_fc1 = bias_variable([1024])

  h_pool3_flat = tf.reshape(h_pool3, [-1, 4*4*128])
  h_fc1 = tf.nn.relu(tf.matmul(h_pool3_flat, W_fc1) + b_fc1)

  # Dropout - controls the complexity of the model, prevents co-adaptation of
  # features.
  keep_prob = tf.placeholder(tf.float32)
  h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

  # Map the 1024 features to 10 classes, one for each digit
  W_fc2 = weight_variable([1024, 10])
  b_fc2 = bias_variable([10])

  y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2
  return y_conv, keep_prob

訓練1000次後:

main run
inport data
Extracting input_data\train-images-idx3-ubyte.gz
Extracting input_data\train-labels-idx1-ubyte.gz
Extracting input_data\t10k-images-idx3-ubyte.gz
Extracting input_data\t10k-labels-idx1-ubyte.gz
start train
step 0, training accuracy 0.06
step 100, training accuracy 0.8
step 200, training accuracy 0.92
step 300, training accuracy 0.84
step 400, training accuracy 0.94
step 500, training accuracy 0.92
step 600, training accuracy 0.98
step 700, training accuracy 0.92
step 800, training accuracy 0.92
step 900, training accuracy 1
test accuracy 0.9624

增加一層卷積層後,效果並沒有顯著提升。

保存與恢復參數

接下來,保存我們辛辛苦苦訓練出來的神經網絡的參數:
保存和加載參數主要由類:class tf.train.Saver完成,涉及的代碼如下:
saver = tf.train.Saver()
# save() returns the path prefix of the checkpoint it wrote.
save_path = saver.save(sess, model_path)
# restore() returns nothing, so its result is not assigned.
saver.restore(sess, model_path)

因此,我們的代碼在訓練完成後便可將參數保存起來,代碼如下:

  # Added: create the Saver once the whole graph has been built.
  saver = tf.train.Saver()
  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    #batch = mnist.train.next_batch(1)
    #print(y_conv.eval(feed_dict={x: batch[0], y_: batch[1], keep_prob: 1.0}))
    for i in range(1000):
      batch = mnist.train.next_batch(50)
      if i % 100 == 0:
        train_accuracy = accuracy.eval(feed_dict={
            x: batch[0], y_: batch[1], keep_prob: 1.0})
        print('step %d, training accuracy %g' % (i, train_accuracy))
      train_step.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5})
    #saver.restore(sess,"./model/me")
    # Added: write the trained parameters under the prefix ./model/me.
    saver.save(sess,"./model/me")
    print('test accuracy %g' % accuracy.eval(feed_dict={
        x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0}))

下次,我們就不需要再去訓練了,我們可以直接加載參數,然後做測試:

  # Evaluation-only variant: the training loop is commented out and the
  # parameters are loaded from the "./model/me" checkpoint instead.
  saver = tf.train.Saver()

  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    #batch = mnist.train.next_batch(1)
    #print(y_conv.eval(feed_dict={x: batch[0], y_: batch[1], keep_prob: 1.0}))
    # for i in range(1000):
    #   batch = mnist.train.next_batch(50)
    #   if i % 100 == 0:
    #     train_accuracy = accuracy.eval(feed_dict={
    #         x: batch[0], y_: batch[1], keep_prob: 1.0})
    #     print('step %d, training accuracy %g' % (i, train_accuracy))
    #   train_step.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5})
    # Load the saved parameters into this session. NOTE(review): per the
    # Saver documentation restore() itself initialises the variables it
    # loads, so the initializer run above is presumably redundant -- confirm.
    saver.restore(sess,"./model/me")
    #saver.save(sess,"./model/me")
    # Accuracy on the full test set with keep_prob fed as 1.0.
    print('test accuracy %g' % accuracy.eval(feed_dict={
        x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0}))
main run
inport data
Extracting input_data\train-images-idx3-ubyte.gz
Extracting input_data\train-labels-idx1-ubyte.gz
Extracting input_data\t10k-images-idx3-ubyte.gz
Extracting input_data\t10k-labels-idx1-ubyte.gz
test accuracy 0.965

直接加載參數後,測試的結果也爲0.965

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章