cs231n:assignment2——python文件:fc_net.py

視頻裏Andrej Karpathy上課的時候說,這次的作業「meaty but educational」,確實很meaty。作業一般由.ipynb文件和.py文件組成;這次因爲每個.ipynb文件涉及到的.py文件較多,且互相之間有交叉,所以每篇博客只貼出一個.ipynb文件或者一個.py文件。(之前的作業是一個.ipynb文件對應一個.py文件,所以整合到了一篇博客裏。)
還是那句話,有錯誤希望幫我指出來,多多指教,謝謝

第二部分編寫任意層數的全連接網絡類(FullyConnectedNet)的時候,我的前向計算和反向計算都寫得太繁瑣。這個問題是後來寫任意層數的conv_net的時候才發現的,所以這裏就沒有再回頭修改。

fc_net.py內容:

import numpy as np

from cs231n.layers import *
from cs231n.layer_utils import *


class TwoLayerNet(object):
  """
  A two-layer fully-connected neural network with ReLU nonlinearity and
  softmax loss that uses a modular layer design. We assume an input dimension
  of D, a hidden dimension of H, and perform classification over C classes.

  The architecture is affine - relu - affine - softmax.

  Note that this class does not implement gradient descent; instead, it
  will interact with a separate Solver object that is responsible for running
  optimization.

  The learnable parameters of the model are stored in the dictionary
  self.params that maps parameter names to numpy arrays.
  """

  def __init__(self, input_dim=3*32*32, hidden_dim=100, num_classes=10,
               weight_scale=1e-3, reg=0.0):
    """
    Initialize a new network.

    Inputs:
    - input_dim: An integer giving the size of the input
    - hidden_dim: An integer giving the size of the hidden layer
    - num_classes: An integer giving the number of classes to classify
    - weight_scale: Scalar giving the standard deviation for random
      initialization of the weights.
    - reg: Scalar giving L2 regularization strength.
    """
    self.params = {}
    self.reg = reg

    # Weights are drawn from a zero-mean Gaussian scaled by weight_scale;
    # biases start at zero. W1 is drawn before W2, so results are reproducible
    # under a fixed numpy random seed.
    self.params['W1'] = weight_scale * np.random.randn(input_dim, hidden_dim)
    self.params['b1'] = np.zeros(hidden_dim)
    self.params['W2'] = weight_scale * np.random.randn(hidden_dim, num_classes)
    self.params['b2'] = np.zeros(num_classes)

  def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    Inputs:
    - X: Array of input data of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.

    If y is not None, then run a training-time forward and backward pass and
    return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping parameter
      names to gradients of the loss with respect to those parameters.
    """
    W1, b1 = self.params['W1'], self.params['b1']
    W2, b2 = self.params['W2'], self.params['b2']

    # Forward pass: affine - relu - affine.
    hidden, hidden_cache = affine_relu_forward(X, W1, b1)
    scores, scores_cache = affine_forward(hidden, W2, b2)

    # If y is None then we are in test mode so just return scores
    if y is None:
      return scores

    # Data loss plus L2 penalty; the 0.5 factor makes the penalty's gradient
    # simply reg * W.
    data_loss, dscores = softmax_loss(scores, y)
    loss = data_loss + 0.5 * self.reg * (np.sum(W1**2) + np.sum(W2**2))

    # Backward pass in reverse layer order.
    dhidden, dW2, db2 = affine_backward(dscores, scores_cache)
    _, dW1, db1 = affine_relu_backward(dhidden, hidden_cache)

    # The regularization term uses the parameter arrays directly instead of
    # indexing into the layers' cache tuples, whose layout is defined in
    # cs231n.layers and is not visible from this file.
    grads = {
      'W1': dW1 + self.reg * W1,
      'b1': db1,
      'W2': dW2 + self.reg * W2,
      'b2': db2,
    }

    return loss, grads


class FullyConnectedNet(object):
  """
  A fully-connected neural network with an arbitrary number of hidden layers,
  ReLU nonlinearities, and a softmax loss function. This will also implement
  dropout and batch normalization as options. For a network with L layers,
  the architecture will be

  {affine - [batch norm] - relu - [dropout]} x (L - 1) - affine - softmax

  where batch normalization and dropout are optional, and the {...} block is
  repeated L - 1 times.

  Similar to the TwoLayerNet above, learnable parameters are stored in the
  self.params dictionary and will be learned using the Solver class.
  """

  def __init__(self, hidden_dims, input_dim=3*32*32, num_classes=10,
               dropout=0, use_batchnorm=False, reg=0.0,
               weight_scale=1e-2, dtype=np.float32, seed=None):
    """
    Initialize a new FullyConnectedNet.

    Inputs:
    - hidden_dims: A list of integers giving the size of each hidden layer.
      An empty list yields a single affine layer followed by softmax.
    - input_dim: An integer giving the size of the input.
    - num_classes: An integer giving the number of classes to classify.
    - dropout: Scalar between 0 and 1 giving dropout strength. If dropout=0 then
      the network should not use dropout at all.
    - use_batchnorm: Whether or not the network should use batch normalization.
    - reg: Scalar giving L2 regularization strength.
    - weight_scale: Scalar giving the standard deviation for random
      initialization of the weights.
    - dtype: A numpy datatype object; all computations will be performed using
      this datatype. float32 is faster but less accurate, so you should use
      float64 for numeric gradient checking.
    - seed: If not None, then pass this random seed to the dropout layers. This
      will make the dropout layers deterministic so we can gradient check the
      model.
    """
    self.use_batchnorm = use_batchnorm
    self.use_dropout = dropout > 0
    self.reg = reg
    self.num_layers = 1 + len(hidden_dims)
    self.dtype = dtype
    self.params = {}

    # Layer i (0-based) maps layer_dims[i] -> layer_dims[i + 1] and is stored
    # under the 1-based keys W{i+1} / b{i+1}. Weights are drawn in layer order
    # from a Gaussian scaled by weight_scale; biases start at zero.
    layer_dims = [input_dim] + list(hidden_dims) + [num_classes]
    for i in range(self.num_layers):
      name = str(i + 1)
      fan_in, fan_out = layer_dims[i], layer_dims[i + 1]
      self.params['W' + name] = weight_scale * np.random.randn(fan_in, fan_out)
      self.params['b' + name] = np.zeros(fan_out)
      # Every layer except the final affine one gets batchnorm scale (ones)
      # and shift (zeros) parameters.
      if self.use_batchnorm and i < self.num_layers - 1:
        self.params['gamma' + name] = np.ones(fan_out)
        self.params['beta' + name] = np.zeros(fan_out)

    # When using dropout we need to pass a dropout_param dictionary to each
    # dropout layer so that the layer knows the dropout probability and the mode
    # (train / test). The same dropout_param is passed to each dropout layer.
    self.dropout_param = {}
    if self.use_dropout:
      self.dropout_param = {'mode': 'train', 'p': dropout}
      if seed is not None:
        self.dropout_param['seed'] = seed

    # With batch normalization we need to keep track of running means and
    # variances, so each batch normalization layer gets its own bn_param
    # dictionary: self.bn_params[0] for the first hidden layer,
    # self.bn_params[1] for the second, etc.
    self.bn_params = []
    if self.use_batchnorm:
      self.bn_params = [{'mode': 'train'} for _ in range(self.num_layers - 1)]

    # Cast all parameters to the correct datatype. Iterate over a snapshot so
    # that reassigning entries never interferes with the iteration.
    for k, v in list(self.params.items()):
      self.params[k] = v.astype(dtype)

  def _hidden_forward(self, x, i):
    """
    Forward pass for hidden layer i (0-based):
    affine - [batch norm] - relu - [dropout].

    Returns a tuple (out, cache); cache is whatever _hidden_backward needs for
    the same layer.
    """
    name = str(i + 1)
    w, b = self.params['W' + name], self.params['b' + name]

    if self.use_batchnorm:
      gamma, beta = self.params['gamma' + name], self.params['beta' + name]
      if self.use_dropout:
        return affine_bn_relu_dp_forward(x, w, b, gamma, beta,
                                         self.bn_params[i], self.dropout_param)
      return affine_bn_relu_forward(x, w, b, gamma, beta, self.bn_params[i])

    out, cache = affine_relu_forward(x, w, b)
    if self.use_dropout:
      # There is no combined affine-relu-dropout helper in use here, so the
      # dropout layer is chained by hand and both caches are kept as a pair.
      out, dp_cache = dropout_forward(out, self.dropout_param)
      cache = (cache, dp_cache)
    return out, cache

  def _hidden_backward(self, dout, cache, i, grads):
    """
    Backward pass for hidden layer i (0-based).

    Stores the layer's parameter gradients (including the L2 term for the
    weights) in grads and returns the gradient with respect to the layer input.
    """
    name = str(i + 1)

    if self.use_batchnorm:
      if self.use_dropout:
        dx, dw, db, dgamma, dbeta = affine_bn_relu_dp_backward(dout, cache)
      else:
        dx, dw, db, dgamma, dbeta = affine_bn_relu_backward(dout, cache)
      # Scale and shift parameters are not regularized.
      grads['gamma' + name] = dgamma
      grads['beta' + name] = dbeta
    else:
      if self.use_dropout:
        # Undo the manual chaining done in _hidden_forward.
        cache, dp_cache = cache
        dout = dropout_backward(dout, dp_cache)
      dx, dw, db = affine_relu_backward(dout, cache)

    # The L2 term uses the parameter array directly rather than indexing into
    # the cache, whose layout is defined outside this file.
    grads['W' + name] = dw + self.reg * self.params['W' + name]
    grads['b' + name] = db
    return dx

  def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.

    Input / output: Same as TwoLayerNet above.
    """
    X = X.astype(self.dtype)
    mode = 'test' if y is None else 'train'

    # Set train/test mode for batchnorm params and dropout param since they
    # behave differently during training and testing. dropout_param is always
    # a dict (never None), so the dropout flag is what gets tested here.
    if self.use_dropout:
      self.dropout_param['mode'] = mode
    if self.use_batchnorm:
      for bn_param in self.bn_params:
        # The key must be the string 'mode'; the starter code's unquoted
        # bn_param[mode] looks like a typo.
        bn_param['mode'] = mode

    # Forward pass: L - 1 hidden blocks followed by the final affine layer.
    num_hidden = self.num_layers - 1
    last = str(self.num_layers)
    out = X
    caches = []
    for i in range(num_hidden):
      out, cache = self._hidden_forward(out, i)
      caches.append(cache)
    scores, scores_cache = affine_forward(out, self.params['W' + last],
                                          self.params['b' + last])

    # If test mode return early
    if mode == 'test':
      return scores

    grads = {}

    # Softmax data loss plus 0.5 * reg * ||W||^2 for every weight matrix.
    loss, dscores = softmax_loss(scores, y)
    for i in range(self.num_layers):
      loss += 0.5 * self.reg * np.sum(self.params['W' + str(i + 1)]**2)

    # Backward pass: final affine layer first, then the hidden blocks in
    # reverse order.
    dout, dw, db = affine_backward(dscores, scores_cache)
    grads['W' + last] = dw + self.reg * self.params['W' + last]
    grads['b' + last] = db
    for i in reversed(range(num_hidden)):
      dout = self._hidden_backward(dout, caches[i], i, grads)

    return loss, grads
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章