[Keras 源碼學習] MobileNetV2

最近在閱讀 keras.backend.tensorflow_backend.py,緣由是學習 MobileNetV2 的 Keras 實現。閱讀的版本是 https://github.com/JonathanCMitchell/mobilenet_v2_keras 的實現,也就是官方指定的另一個版本。

MobileNetV2 的實現基本上是平鋪直敘的,這一篇記錄的是實現過程中各個網絡層的調用細節。

  • Input
    Input 起始於 from keras.layers import Input ,該方法的具體實現在 keras.engine.input_layer.py 中。

    1. mobilenetv2.py
      from keras.layers import Input
      
    2. keras.layers.__init__.py
       from ..engine import InputLayer
      
    3. keras.engine.base.py
       from .input_layer import Input
      
    4. keras.engine.input_layer.py
      def Input(*):

    詳細內容,下回分解。

  • Conv2D
    Conv2D 起始於 from keras.layers import Conv2D

    1. mobilenetv2.py

         x = Conv2D(first_block_filters,
                    kernel_size=3,
                    strides=(2, 2), padding='same',
                    use_bias=False, name='Conv1')(img_input)
      
    2. keras.layers.__init__.py

         from .convolutional import *
      
    3. keras.layers.convolutional.py

      	class Conv2D(_Conv):class _Conv(Layer):
          	def call(self, inputs):
      	        if self.rank == 2:
      	            outputs = K.conv2d(
      	                inputs,
      	                self.kernel,
      	                strides=self.strides,
      	                padding=self.padding,
      	                data_format=self.data_format,
      	                dilation_rate=self.dilation_rate)
      
    4. keras.backend.__init__.py

      	from .tensorflow_backend import *
      
    5. keras.backend.tensorflow_backend.py

      	def conv2d(x, kernel, strides=(1, 1), padding='valid',
      	           data_format=None, dilation_rate=(1, 1)):
      	    """Apply a 2D convolution by delegating to `tf.nn.convolution`.

      	    # Arguments
      	        x: Input tensor; converted by `_preprocess_conv2d_input`
      	            before the TensorFlow op is called.
      	        kernel: Kernel tensor, forwarded as the `filter` argument.
      	        strides: Strides forwarded to the op. Defaults to `(1, 1)`.
      	        padding: Padding mode, converted by `_preprocess_padding`.
      	            Defaults to `'valid'`.
      	        data_format: Data format, normalized by
      	            `normalize_data_format` (so `None` is accepted).
      	        dilation_rate: Dilation rate forwarded to the op.
      	            Defaults to `(1, 1)`.

      	    # Returns
      	        The output tensor of the convolution, in the layout requested
      	        by `data_format`.
      	    """
      	    data_format = normalize_data_format(data_format)
      	    # The preprocessing step also reports which TensorFlow layout
      	    # ('NHWC' or 'NCHW') the op will actually run in.
      	    x, tf_data_format = _preprocess_conv2d_input(x, data_format)
      	    padding = _preprocess_padding(padding)
      	    x = tf.nn.convolution(
      	        input=x,
      	        filter=kernel,
      	        dilation_rate=dilation_rate,
      	        strides=strides,
      	        padding=padding,
      	        data_format=tf_data_format)
      	    # The caller asked for channels_first but the op ran in NHWC,
      	    # so move the channel axis back to position 1.
      	    if data_format == 'channels_first' and tf_data_format == 'NHWC':
      	        x = tf.transpose(x, (0, 3, 1, 2))  # NHWC -> NCHW
      	    return x
      
    6. tensorflow.python.ops.nn_ops.py

      	@tf_export("nn.convolution")
      	def convolution(
      	    input,  # pylint: disable=redefined-builtin
      	    filter,  # pylint: disable=redefined-builtin
      	    padding,
      	    strides=None,
      	    dilation_rate=None,
      	    name=None,
      	    data_format=None):
      	  # pylint: disable=line-too-long
      	  """Computes sums of N-D convolutions (actually cross-correlation).
      	
      	  This also supports either output striding via the optional `strides` parameter
      	  or atrous convolution (also known as convolution with holes or dilated
      	  convolution, based on the French word "trous" meaning holes in English) via
      	  the optional `dilation_rate` parameter.  Currently, however, output striding
      	  is not supported for atrous convolutions.
      	
      	  Specifically, in the case that `data_format` does not start with "NC", given
      	  a rank (N+2) `input` Tensor of shape
      	
      	    [num_batches,
      	     input_spatial_shape[0],
      	     ...,
      	     input_spatial_shape[N-1],
      	     num_input_channels],
      	
      	  a rank (N+2) `filter` Tensor of shape
      	
      	    [spatial_filter_shape[0],
      	     ...,
      	     spatial_filter_shape[N-1],
      	     num_input_channels,
      	     num_output_channels],
      	
      	  an optional `dilation_rate` tensor of shape [N] (defaulting to [1]*N)
      	  specifying the filter upsampling/input downsampling rate, and an optional list
      	  of N `strides` (defaulting [1]*N), this computes for each N-D spatial output
      	  position (x[0], ..., x[N-1]):
      	
      	  ```
      	    output[b, x[0], ..., x[N-1], k] =
      	        sum_{z[0], ..., z[N-1], q}
      	            filter[z[0], ..., z[N-1], q, k] *
      	            padded_input[b,
      	                         x[0]*strides[0] + dilation_rate[0]*z[0],
      	                         ...,
      	                         x[N-1]*strides[N-1] + dilation_rate[N-1]*z[N-1],
      	                         q]
      	  ```
      	  where b is the index into the batch, k is the output channel number, q is the
      	  input channel number, and z is the N-D spatial offset within the filter. Here,
      	  `padded_input` is obtained by zero padding the input using an effective
      	  spatial filter shape of `(spatial_filter_shape-1) * dilation_rate + 1` and
      	  output striding `strides` as described in the
      	  @{$python/nn#Convolution$comment here}.
      	
      	  In the case that `data_format` does start with `"NC"`, the `input` and output
      	  (but not the `filter`) are simply transposed as follows:
      	
      	    convolution(input, data_format, **kwargs) =
      	      tf.transpose(convolution(tf.transpose(input, [0] + range(2,N+2) + [1]),
      	                               **kwargs),
      	                   [0, N+1] + range(1, N+1))
      	
      	  It is required that 1 <= N <= 3.
      	
      	  Args:
      	    input: An N-D `Tensor` of type `T`, of shape
      	      `[batch_size] + input_spatial_shape + [in_channels]` if data_format does
      	      not start with "NC" (default), or
      	      `[batch_size, in_channels] + input_spatial_shape` if data_format starts
      	      with "NC".
      	    filter: An N-D `Tensor` with the same type as `input` and shape
      	      `spatial_filter_shape + [in_channels, out_channels]`.
      	    padding: A string, either `"VALID"` or `"SAME"`. The padding algorithm.
      	    strides: Optional.  Sequence of N ints >= 1.  Specifies the output stride.
      	      Defaults to [1]*N.  If any value of strides is > 1, then all values of
      	      dilation_rate must be 1.
      	    dilation_rate: Optional.  Sequence of N ints >= 1.  Specifies the filter
      	      upsampling/input downsampling rate.  In the literature, the same parameter
      	      is sometimes called `input stride` or `dilation`.  The effective filter
      	      size used for the convolution will be `spatial_filter_shape +
      	      (spatial_filter_shape - 1) * (rate - 1)`, obtained by inserting
      	      (dilation_rate[i]-1) zeros between consecutive elements of the original
      	      filter in each spatial dimension i.  If any value of dilation_rate is > 1,
      	      then all values of strides must be 1.
      	    name: Optional name for the returned tensor.
      	    data_format: A string or None.  Specifies whether the channel dimension of
      	      the `input` and output is the last dimension (default, or if `data_format`
      	      does not start with "NC"), or the second dimension (if `data_format`
      	      starts with "NC").  For N=1, the valid values are "NWC" (default) and
      	      "NCW".  For N=2, the valid values are "NHWC" (default) and "NCHW".
      	      For N=3, the valid values are "NDHWC" (default) and "NCDHW".
      	
      	  Returns:
      	    A `Tensor` with the same type as `input` of shape
      	
      	        `[batch_size] + output_spatial_shape + [out_channels]`
      	
      	    if data_format is None or does not start with "NC", or
      	
      	        `[batch_size, out_channels] + output_spatial_shape`
      	
      	    if data_format starts with "NC",
      	    where `output_spatial_shape` depends on the value of `padding`.
      	
      	    If padding == "SAME":
      	      output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides[i])
      	
      	    If padding == "VALID":
      	      output_spatial_shape[i] =
      	        ceil((input_spatial_shape[i] -
      	              (spatial_filter_shape[i]-1) * dilation_rate[i])
      	             / strides[i]).
      	
      	  Raises:
      	    ValueError: If input/output depth does not match `filter` shape, if padding
      	      is other than `"VALID"` or `"SAME"`, or if data_format is invalid.
      	
      	  """
      	  # pylint: enable=line-too-long
      	  with ops.name_scope(name, "convolution", [input, filter]) as name:
      	    # Convert both operands to tensors so their static shapes can be read.
      	    input = ops.convert_to_tensor(input, name="input")  # pylint: disable=redefined-builtin
      	    input_shape = input.get_shape()
      	    filter = ops.convert_to_tensor(filter, name="filter")  # pylint: disable=redefined-builtin
      	    filter_shape = filter.get_shape()
      	    # The Convolution helper is built from the static shapes and then
      	    # applied to the actual tensors.
      	    op = Convolution(
      	        input_shape,
      	        filter_shape,
      	        padding,
      	        strides=strides,
      	        dilation_rate=dilation_rate,
      	        name=name,
      	        data_format=data_format)
      	    return op(input, filter)
      

      tensorflow.python.ops.nn_ops.py 中 def convolution() 的註釋,是學習 convolution 原理的一份很好的材料。下面這段說明:如果輸入的 Tensor 形如 [batch, D_1, D_2, …, D_N, C_input](N 為空間維度的個數),那麼 filter 的形狀就是 [K_1, K_2, …, K_N, C_input, C_output],其中 K_i 是卷積核在第 i 個空間維度上的大小,C_output 與 filter 的個數一致。

        """a rank (N+2) `input` Tensor of shape
      	
      	    [num_batches,
      	     input_spatial_shape[0],
      	     ...,
      	     input_spatial_shape[N-1],
      	     num_input_channels],
      	
      	  a rank (N+2) `filter` Tensor of shape
      	
      	    [spatial_filter_shape[0],
      	     ...,
      	     spatial_filter_shape[N-1],
      	     num_input_channels,
      	     num_output_channels],
      	 """
      
    7. tensorflow.python.ops.nn_ops.py

      	class Convolution(object):
      	  """Helper class for convolution.
      	
      	  Note that this class assumes that shapes of input and filter passed to
      	  __call__ are compatible with input_shape and filter_shape passed to the
      	  constructor.
      	
      	  Arguments
      	    input_shape: static shape of input. i.e. input.get_shape().
      	    filter_shape: static shape of the filter. i.e. filter.get_shape().
      	    padding:  see convolution.
      	    strides: see convolution.
      	    dilation_rate: see convolution.
      	    name: see convolution.
      	    data_format: see convolution.
      	
      	  Raises:
      	    ValueError: If the rank of both shapes is unknown, if either shape is
      	      incompatible with the inferred rank, or if the input channel dimension
      	      does not match the corresponding dimension of the filter.
      	  """
      	
      	  def __init__(self,
      	               input_shape,
      	               filter_shape,
      	               padding,
      	               strides=None,
      	               dilation_rate=None,
      	               name=None,
      	               data_format=None):
      	    """Helper function for convolution."""
      	    # For a 2-D convolution, filter_shape is
      	    # (kernel_size, kernel_size, input_channels, num_filters).
      	    num_total_dims = filter_shape.ndims
      	    # Fall back to the rank of input_shape (e.g. an NHWC or NCHW tensor)
      	    # when the filter rank is unknown.
      	    if num_total_dims is None:
      	      num_total_dims = input_shape.ndims
      	    if num_total_dims is None:
      	      raise ValueError("rank of input or filter must be known")
      	
      	    # Spatial dims exclude the batch and channel dims; e.g. a 4-D (NHWC)
      	    # input has num_total_dims = 4 and therefore 2 spatial dims.
      	    num_spatial_dims = num_total_dims - 2
      	
      	    # Re-raise with a clearer message. Previously the ValueError was
      	    # constructed but never raised, so rank mismatches went undetected here.
      	    try:
      	      input_shape.with_rank(num_spatial_dims + 2)
      	    except ValueError:
      	      raise ValueError(
      	          "input tensor must have rank %d" % (num_spatial_dims + 2))
      	
      	    try:
      	      filter_shape.with_rank(num_spatial_dims + 2)
      	    except ValueError:
      	      raise ValueError(
      	          "filter tensor must have rank %d" % (num_spatial_dims + 2))
      	
      	    # Channels-last unless data_format starts with "NC" (channels-first).
      	    if data_format is None or not data_format.startswith("NC"):
      	      input_channels_dim = input_shape[num_spatial_dims + 1]
      	      spatial_dims = range(1, num_spatial_dims + 1)
      	    else:
      	      input_channels_dim = input_shape[1]
      	      spatial_dims = range(2, num_spatial_dims + 2)
      	
      	    # The filter's input-channel axis sits right after its spatial axes.
      	    if not input_channels_dim.is_compatible_with(
      	        filter_shape[num_spatial_dims]):
      	      raise ValueError(
      	          "number of input channels does not match corresponding dimension of "
      	          "filter, {} != {}".format(input_channels_dim,
      	                                    filter_shape[num_spatial_dims]))
      	
      	    strides, dilation_rate = _get_strides_and_dilation_rate(
      	        num_spatial_dims, strides, dilation_rate)
      	
      	    self.input_shape = input_shape
      	    self.filter_shape = filter_shape
      	    self.data_format = data_format
      	    self.strides = strides
      	    self.name = name
      	    # _WithSpaceToBatch receives the dilation rate and calls back into
      	    # _build_op to obtain the underlying convolution op.
      	    self.conv_op = _WithSpaceToBatch(
      	        input_shape,
      	        dilation_rate=dilation_rate,
      	        padding=padding,
      	        build_op=self._build_op,
      	        filter_shape=filter_shape,
      	        spatial_dims=spatial_dims,
      	        data_format=data_format)
      	
      	  def _build_op(self, _, padding):
      	    """Build the non-atrous convolution op for the given padding."""
      	    return _NonAtrousConvolution(
      	        self.input_shape,
      	        filter_shape=self.filter_shape,
      	        padding=padding,
      	        data_format=self.data_format,
      	        strides=self.strides,
      	        name=self.name)
      	
      	  def __call__(self, inp, filter):  # pylint: disable=redefined-builtin
      	    """Apply the prepared convolution op to `inp` and `filter`."""
      	    return self.conv_op(inp, filter)
      

      所以 keras.layers.Conv2D 實際用的是 tensorflow.python.ops.nn_ops.Convolution。

      待續

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章