TF Faster R-CNN: Reading Notes on resnet_v1.py

Personal code-reading notes.

Second update: 2019-04-03

# --------------------------------------------------------
# Tensorflow Faster R-CNN
# Licensed under The MIT License [see LICENSE for details]
# Written by Zheqi He and Xinlei Chen
# --------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
import tensorflow.contrib.slim as slim
from tensorflow.contrib.slim import losses
from tensorflow.contrib.slim import arg_scope
from tensorflow.contrib.slim.python.slim.nets import resnet_utils
from tensorflow.contrib.slim.python.slim.nets import resnet_v1
from tensorflow.contrib.slim.python.slim.nets.resnet_v1 import resnet_v1_block
import numpy as np

from nets.network import Network
from model.config import cfg
# Builds the arg_scope for the ResNet backbone. The keyword arguments feed the
# batch-norm parameter dict below (e.g. batch_norm_decay -> 'decay'). Note that
# 'is_training' and 'trainable' are hard-coded to False there: the batch-norm
# statistics and parameters stay frozen in this Faster R-CNN setup.
def resnet_arg_scope(is_training=True,
                     batch_norm_decay=0.997,
                     batch_norm_epsilon=1e-5,
                     batch_norm_scale=True):
  batch_norm_params = {
    'is_training': False,
    'decay': batch_norm_decay,
    'epsilon': batch_norm_epsilon,
    'scale': batch_norm_scale,
    'trainable': False,
    'updates_collections': tf.GraphKeys.UPDATE_OPS
  }
# arg_scope is a utility from TF's slim module: it opens a scope in which the
# listed ops receive these arguments by default, so they need not be repeated
# at every call site.
  with arg_scope(
      [slim.conv2d],
      weights_regularizer=slim.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY),
      weights_initializer=slim.variance_scaling_initializer(),
      trainable=is_training,
      activation_fn=tf.nn.relu,
      normalizer_fn=slim.batch_norm,
      normalizer_params=batch_norm_params):
    with arg_scope([slim.batch_norm], **batch_norm_params) as arg_sc:
      return arg_sc
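
# A minimal usage sketch (my own, not from the repo): any slim.conv2d created
# inside this scope silently inherits the regularizer / initializer / frozen-BN
# defaults declared above, keeping each layer call short.
import tensorflow as tf
import tensorflow.contrib.slim as slim

images = tf.placeholder(tf.float32, [None, 224, 224, 3])
with slim.arg_scope(resnet_arg_scope(is_training=True)):
  net = slim.conv2d(images, 64, [3, 3], scope='demo_conv')  # defaults applied here
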
# resnetv1 is a subclass of Network. Some parent-class methods do not fit a
# ResNet backbone, so they are overridden here.
class resnetv1(Network):
  def __init__(self, num_layers=50):
    Network.__init__(self)
    self._feat_stride = [16, ]  # downsampling ratio from the input image to the head's feature map
    self._feat_compress = [1. / float(self._feat_stride[0]), ]  # its reciprocal
    self._num_layers = num_layers  # network depth (50 / 101 / 152)
    self._scope = 'resnet_v1_%d' % num_layers  # variable-scope name; I use resnet_v1_101, so that scope is opened
    self._decide_blocks()

  def _crop_pool_layer(self, bottom, rois, name):  # RoI step: crop each RoI's feature region and pool it down to POOLING_SIZE x POOLING_SIZE (7x7)
    with tf.variable_scope(name) as scope:
      batch_ids = tf.squeeze(tf.slice(rois, [0, 0], [-1, 1], name="batch_id"), [1])
      # Get the normalized coordinates of bboxes
      bottom_shape = tf.shape(bottom)
      height = (tf.to_float(bottom_shape[1]) - 1.) * np.float32(self._feat_stride[0])
      width = (tf.to_float(bottom_shape[2]) - 1.) * np.float32(self._feat_stride[0])
      x1 = tf.slice(rois, [0, 1], [-1, 1], name="x1") / width
      y1 = tf.slice(rois, [0, 2], [-1, 1], name="y1") / height
      x2 = tf.slice(rois, [0, 3], [-1, 1], name="x2") / width
      y2 = tf.slice(rois, [0, 4], [-1, 1], name="y2") / height
      # Won't be back-propagated to rois anyway, but to save time
      bboxes = tf.stop_gradient(tf.concat([y1, x1, y2, x2], 1))
      if cfg.RESNET.MAX_POOL:
        pre_pool_size = cfg.POOLING_SIZE * 2
        crops = tf.image.crop_and_resize(bottom, bboxes, tf.to_int32(batch_ids), [pre_pool_size, pre_pool_size],
                                         name="crops")
        crops = slim.max_pool2d(crops, [2, 2], padding='SAME')
      else:
        crops = tf.image.crop_and_resize(bottom, bboxes, tf.to_int32(batch_ids), [cfg.POOLING_SIZE, cfg.POOLING_SIZE],
                                         name="crops")
    return crops
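
# A toy sketch (mine) of the crop_and_resize mechanics used above: boxes are given
# as normalized [y1, x1, y2, x2] rows plus a per-box batch index, and all crops are
# resampled to one fixed size in a single batched op -- no Python loop over RoIs.
import tensorflow as tf

feat = tf.random_normal([1, 38, 50, 256])      # [N, H, W, C] feature map
boxes = tf.constant([[0.0, 0.0, 0.5, 0.5],     # top-left quarter of the image
                     [0.25, 0.25, 1.0, 1.0]])  # a bottom-right region
box_ind = tf.constant([0, 0])                  # both boxes come from batch image 0
crops = tf.image.crop_and_resize(feat, boxes, box_ind, [7, 7])
print(crops.shape)                             # (2, 7, 7, 256)
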

  # Do the first few layers manually, because 'SAME' padding can behave inconsistently
  # for images of different sizes: sometimes 0, sometimes 1
  # Hence the stem below uses explicit tf.pad plus VALID pooling instead of relying
  # on 'SAME', so the result is deterministic across input sizes.
  def _build_base(self):
    with tf.variable_scope(self._scope, self._scope):
      # first conv layer: 64 kernels of size 7x7, stride 2
      net = resnet_utils.conv2d_same(self._image, 64, 7, stride=2, scope='conv1')
      # explicitly zero-pad the conv output by one pixel on each spatial side via tf.pad
      net = tf.pad(net, [[0, 0], [1, 1], [1, 1], [0, 0]])
      # then 3x3 max pooling with stride 2 and VALID padding
      net = slim.max_pool2d(net, [3, 3], stride=2, padding='VALID', scope='pool1')
    # return the output of the hand-built stem
    return net
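
# Shape walk-through (my sketch) of the stem for a 224x224 input:
# 224 -> conv2d_same stride 2 -> 112 -> tf.pad -> 114 -> 3x3/2 VALID pool -> 56.
import tensorflow as tf
import tensorflow.contrib.slim as slim
from tensorflow.contrib.slim.python.slim.nets import resnet_utils

x = tf.placeholder(tf.float32, [1, 224, 224, 3])
net = resnet_utils.conv2d_same(x, 64, 7, stride=2, scope='demo_conv1')  # (1, 112, 112, 64)
net = tf.pad(net, [[0, 0], [1, 1], [1, 1], [0, 0]])                     # (1, 114, 114, 64)
net = slim.max_pool2d(net, [3, 3], stride=2, padding='VALID')           # (1, 56, 56, 64)
print(net.shape)
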

  def _image_to_head(self, is_training, reuse=None):
    # Sanity check: RESNET.FIXED_BLOCKS must lie in [0, 3]. ResNet-101 has 4 blocks
    # (indices 0-3); with the default FIXED_BLOCKS=1, the stem and block1 stay frozen
    # during training while the later blocks are trained.
    assert (0 <= cfg.RESNET.FIXED_BLOCKS <= 3)
    # Now the base is always fixed during training
    with slim.arg_scope(resnet_arg_scope(is_training=False)):
      net_conv = self._build_base()  # call the hand-built stem via self._build_base(); the stem is always frozen, hence is_training=False
    # Freezing strategy: run the first FIXED_BLOCKS blocks under is_training=False,
    # then run the remaining blocks under the real training flag.
    if cfg.RESNET.FIXED_BLOCKS > 0:
      with slim.arg_scope(resnet_arg_scope(is_training=False)):
        # note: net_conv is re-assigned here; it becomes the output after the frozen blocks
        net_conv, _ = resnet_v1.resnet_v1(net_conv,
                                           self._blocks[0:cfg.RESNET.FIXED_BLOCKS],  # the frozen prefix of blocks
                                           global_pool=False,
                                           include_root_block=False,
                                           reuse=reuse,
                                           scope=self._scope)
    if cfg.RESNET.FIXED_BLOCKS < 3:
      # slim.arg_scope(resnet_arg_scope(is_training=...)) is the standard slim idiom
      # for switching a scope between training and inference behavior.
      with slim.arg_scope(resnet_arg_scope(is_training=is_training)):  # is_training is True during training and False at test time
        net_conv, _ = resnet_v1.resnet_v1(net_conv,
                                           self._blocks[cfg.RESNET.FIXED_BLOCKS:-1],  # the remaining trainable blocks, excluding block4 (applied later in _head_to_tail)
                                           global_pool=False,
                                           include_root_block=False,
                                           reuse=reuse,
                                           scope=self._scope)

    self._act_summaries.append(net_conv)  # record this activation for TensorBoard summaries
    self._layers['head'] = net_conv  # also cache the result under key 'head' in the layers dict

    return net_conv  # return the conv feature map
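
# A tiny illustration (mine) of how FIXED_BLOCKS partitions self._blocks:
blocks = ['block1', 'block2', 'block3', 'block4']
FIXED_BLOCKS = 1                # the repo's default
print(blocks[0:FIXED_BLOCKS])   # ['block1']            -> frozen
print(blocks[FIXED_BLOCKS:-1])  # ['block2', 'block3']  -> trained in _image_to_head
print(blocks[-1:])              # ['block4']            -> applied later in _head_to_tail
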
  # resnetv1 is a subclass of Network; Network._build_network calls _crop_pool_layer
  # (the RoI pooling step above), whose output is pool5.
  # A natural question: the feature map contains many RoIs, yet there is no for-loop.
  # How are all the RoIs cropped and resized at once? The rois tensor is a matrix with
  # one RoI per row; slicing it by column (batch id, x1, y1, x2, y2) processes every
  # row in parallel, and crop_and_resize then handles the whole batch of boxes in one op.
  def _head_to_tail(self, pool5, is_training, reuse=None):
    with slim.arg_scope(resnet_arg_scope(is_training=is_training)):
      # Open the scope with the real training flag: block4's weights are learnable.
      # Despite the historical name fc7 (a VGG-era convention), this computes
      # ResNet's final conv block, not a fully connected layer.
      fc7, _ = resnet_v1.resnet_v1(pool5,
                                   self._blocks[-1:],  # only the last block (block4)
                                   global_pool=False,
                                   include_root_block=False,
                                   reuse=reuse,  # reuse variables
                                   scope=self._scope)
      # average pooling done by reduce_mean
      # (global average pooling over the spatial axes 1 and 2, not mean-subtraction)
      fc7 = tf.reduce_mean(fc7, axis=[1, 2])
    return fc7  # return the pooled feature vectors
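
# Quick sketch (mine): reduce_mean over axes [1, 2] is global average pooling,
# turning per-RoI activations such as [num_rois, 7, 7, 2048] into [num_rois, 2048]
# vectors for the classification and bbox-regression heads.
import tensorflow as tf

fc7_in = tf.random_normal([300, 7, 7, 2048])  # e.g. 300 RoIs after block4
fc7 = tf.reduce_mean(fc7_in, axis=[1, 2])     # shape (300, 2048)
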

  def _decide_blocks(self):
    # choose different blocks for different number of layers
    if self._num_layers == 50:
      self._blocks = [resnet_v1_block('block1', base_depth=64, num_units=3, stride=2),
                      resnet_v1_block('block2', base_depth=128, num_units=4, stride=2),
                      # use stride 1 for the last conv4 layer
                      resnet_v1_block('block3', base_depth=256, num_units=6, stride=1),
                      resnet_v1_block('block4', base_depth=512, num_units=3, stride=1)]
    # These are just slim block specifications: for num_layers=101 the list below is
    # handed to slim's resnet_v1, which expands it into the full ResNet-101 graph.
    elif self._num_layers == 101:
      # Example, the first block:
      #   name: 'block1'
      #   base_depth=64: bottleneck width (the unit outputs 4x that, i.e. 256 channels)
      #   num_units=3: the unit is repeated 3 times
      #   stride=2: the block's stride
      self._blocks = [resnet_v1_block('block1', base_depth=64, num_units=3, stride=2),
                      resnet_v1_block('block2', base_depth=128, num_units=4, stride=2),
                      # use stride 1 for the last conv4 layer
                      resnet_v1_block('block3', base_depth=256, num_units=23, stride=1),
                      resnet_v1_block('block4', base_depth=512, num_units=3, stride=1)]

    elif self._num_layers == 152:
      self._blocks = [resnet_v1_block('block1', base_depth=64, num_units=3, stride=2),
                      resnet_v1_block('block2', base_depth=128, num_units=8, stride=2),
                      # use stride 1 for the last conv4 layer
                      resnet_v1_block('block3', base_depth=256, num_units=36, stride=1),
                      resnet_v1_block('block4', base_depth=512, num_units=3, stride=1)]

    else:
      # other numbers are not supported
      raise NotImplementedError
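
# Depth check (my note): each bottleneck unit holds 3 conv layers; adding the stem
# conv and the final classifier layer of the original ImageNet net gives the name:
units = {50: [3, 4, 6, 3], 101: [3, 4, 23, 3], 152: [3, 8, 36, 3]}
for depth, u in units.items():
  assert sum(u) * 3 + 2 == depth  # 50, 101, 152
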

  def get_variables_to_restore(self, variables, var_keep_dic):
    variables_to_restore = []
    # Select which variables to restore from the pretrained checkpoint:
    # 'variables' are the graph's variables, 'var_keep_dic' holds the variable
    # names present in the checkpoint file.
    for v in variables:
      # exclude the first conv layer: it is set aside and handled in fix_variables,
      # where the channel order is swapped from RGB to BGR
      if v.name == (self._scope + '/conv1/weights:0'):
        self._variables_to_fix[v.name] = v  # _variables_to_fix is a dict created in the parent Network class; stash the variable there
        continue
      # any other variable whose name (minus the ':0' suffix) appears in the checkpoint is restored directly
      if v.name.split(':')[0] in var_keep_dic:
        print('Variables restored: %s' % v.name)
        variables_to_restore.append(v)

    return variables_to_restore  # return the variables to restore
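
# Sketch of where var_keep_dic comes from (assuming the usual pattern in
# lib/model/train_val.py): the variable names stored in the pretrained checkpoint.
from tensorflow.python import pywrap_tensorflow

def get_variables_in_checkpoint_file(file_name):
  reader = pywrap_tensorflow.NewCheckpointReader(file_name)
  return reader.get_variable_to_shape_map()  # {variable_name: shape}
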
  # Called from lib/model/train_val.py as self.net.fix_variables(sess, self.pretrained_model).
  def fix_variables(self, sess, pretrained_model):
    # Patch variables before training: the pretrained conv1 weights expect RGB input,
    # but images are read with cv2 in BGR order, so conv1 is restored from the
    # checkpoint and then flipped along its input-channel axis.
    print('Fix Resnet V1 layers..')
    with tf.variable_scope('Fix_Resnet_V1') as scope:  # open a variable scope named Fix_Resnet_V1
      with tf.device("/cpu:0"):  # place these ops on the CPU
        # fix RGB to BGR
        # tf.get_variable fetches the variable, creating it if absent: "conv1_rgb",
        # shape 7x7x3x64 (7x7 kernels, 3 input channels, 64 filters). It is only a
        # staging buffer for the RGB-ordered checkpoint weights, hence trainable=False.
        conv1_rgb = tf.get_variable("conv1_rgb", [7, 7, 3, 64], trainable=False)
        restorer_fc = tf.train.Saver({self._scope + "/conv1/weights": conv1_rgb})  # a Saver restricted to one mapping: checkpoint name -> conv1_rgb
        restorer_fc.restore(sess, pretrained_model)  # restore only this variable, not the whole model
        # tf.assign copies tf.reverse(conv1_rgb, [2]) into the real conv1 weights;
        # tf.reverse flips dimension 2, the input-channel axis, turning RGB filters
        # into BGR filters (cv2 loads images in BGR order).
        sess.run(tf.assign(self._variables_to_fix[self._scope + '/conv1/weights:0'],
                           tf.reverse(conv1_rgb, [2])))
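
# Tiny check (mine) that reversing axis 2 of a [kh, kw, in_c, out_c] kernel swaps
# the input-channel order, i.e. RGB filters become BGR filters:
import numpy as np

kernel = np.zeros((7, 7, 3, 64), dtype=np.float32)
kernel[:, :, 0, :] = 1.0           # mark the R input channel
flipped = kernel[:, :, ::-1, :]    # numpy equivalent of tf.reverse(kernel, [2])
print(flipped[0, 0, :, 0])         # [0. 0. 1.] -> R now sits in the B slot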

 
