基於TF 1.5版本的 YOLOV3
其中向下的箭頭很多,實在是懶得打,基本上對齊的模塊之間都有向下的箭頭。代碼中的SPP池化層跟我瞭解到的標準SPP的定義不太一樣,這裏我按照代碼來寫:那個SPP最後是concat,不是簡單的加法!!!(一個疑問:SPP之後的結果又跟inputs做了concat,那不就又不是定長了嗎……)
絕大部分卷積層的padding爲same,只有darknet53中有三個部分用的是valid padding(用紅塊標出來了)
整個結構都是殘差的結構 其中 SPP和detect_layer 的部分有些複雜
這版本的源碼 我分析了一下這裏還是有很多可以簡化的地方 大家也可以幫忙指出錯誤:
# -*- coding: utf-8 -*-
import numpy as np
import tensorflow as tf
# Short alias for the TF-Slim layers library used throughout this file.
slim = tf.contrib.slim
# Used as the 'decay' entry of the batch-norm parameters built in yolo_v3.
_BATCH_NORM_DECAY = 0.9
# Used as the 'epsilon' entry of the batch-norm parameters built in yolo_v3.
_BATCH_NORM_EPSILON = 1e-05
# alpha passed to tf.nn.leaky_relu, the default conv activation set in yolo_v3.
_LEAKY_RELU = 0.1
# Nine anchor boxes; yolo_v3 hands the slices [6:9], [3:6] and [0:3] to its
# three detection layers, in that order.
# NOTE(review): presumably (width, height) in input-image pixels - confirm.
_ANCHORS = [(10, 13), (16, 30), (33, 23),
(30, 61), (62, 45), (59, 119),
(116, 90), (156, 198), (373, 326)]
# Darknet-53 backbone.
def darknet53(inputs):
    """
    Builds Darknet-53 model.

    The network is one 3x3 convolution with 32 filters followed by five
    stages. Each stage is a stride-2 3x3 convolution followed by a number of
    residual blocks (see _darknet53_block).

    :param inputs: input tensor handed to the first convolution.
    :return: tuple (route_1, route_2, inputs): the outputs of the third and
        fourth stages (used later as skip connections) and the final output.
    """
    # (filters of the stride-2 conv, filters of the residual blocks, repeats)
    stages = (
        (64, 32, 1),
        (128, 64, 2),
        (256, 128, 8),
        (512, 256, 8),
        (1024, 512, 4),
    )

    net = _conv2d_fixed_padding(inputs, 32, 3)
    stage_outputs = []
    for down_filters, block_filters, repeats in stages:
        net = _conv2d_fixed_padding(net, down_filters, 3, strides=2)
        for _ in range(repeats):
            net = _darknet53_block(net, block_filters)
        stage_outputs.append(net)

    # Stage 3 (256 channels) and stage 4 (512 channels) feed the detection head.
    route_1 = stage_outputs[2]
    route_2 = stage_outputs[3]
    return route_1, route_2, net
# Most convolutions use 'SAME' padding; only the strided ones use 'VALID'.
def _conv2d_fixed_padding(inputs, filters, kernel_size, strides=1):
    """
    Applies slim.conv2d with a padding scheme chosen from the stride.

    With strides == 1 the convolution uses 'SAME' padding. With strides > 1
    the input is first padded explicitly by _fixed_padding and the
    convolution then runs with 'VALID' padding.

    :param inputs: input tensor.
    :param filters: number of output filters.
    :param kernel_size: kernel size passed to slim.conv2d.
    :param strides: stride passed to slim.conv2d.
    :return: the convolution output.
    """
    conv_padding = 'SAME' if strides == 1 else 'VALID'
    # Explicit padding is only applied in front of strided convolutions.
    net = _fixed_padding(inputs, kernel_size) if strides > 1 else inputs
    return slim.conv2d(net, filters, kernel_size, stride=strides,
                       padding=conv_padding)
# Fixed residual pattern: 1x1 conv -> 3x3 conv -> add the block input.
def _darknet53_block(inputs, filters):
    """
    Builds one Darknet-53 residual block.

    :param inputs: input tensor; it is also added back to the block output.
    :param filters: filters of the 1x1 convolution; the 3x3 convolution uses
        twice as many.
    :return: the sum of the convolution branch and the block input.
    """
    branch = _conv2d_fixed_padding(inputs, filters, 1)
    branch = _conv2d_fixed_padding(branch, filters * 2, 3)
    return branch + inputs
# SPP-style block: stride-1 max pools of several kernel sizes, concatenated
# with the input along the channel axis.
def _spp_block(inputs, data_format='NCHW'):
    """
    Concatenates three stride-1 max-pooled copies of the input with the input.

    The pools use kernel sizes 13, 9 and 5 with stride 1 and 'SAME' padding,
    so each pooled tensor keeps the spatial size of the input. The four
    tensors are joined on the channel axis, so the output has four times the
    channels of the input.

    :param inputs: input feature map laid out according to data_format.
    :param data_format: 'NCHW' or 'NHWC'.
    :return: the concatenated tensor.
    """
    # slim.max_pool2d is not part of the arg_scope that carries data_format in
    # yolo_v3, so the layout must be passed explicitly; otherwise an NCHW
    # tensor would be pooled with the NHWC default, i.e. over the wrong axes.
    pooled = [
        slim.max_pool2d(inputs, kernel, 1, 'SAME', data_format=data_format)
        for kernel in (13, 9, 5)
    ]
    channel_axis = 1 if data_format == 'NCHW' else 3
    return tf.concat(pooled + [inputs], axis=channel_axis)
@tf.contrib.framework.add_arg_scope  # lets `with slim.arg_scope(...)` supply shared keyword arguments (here data_format), much like overriding this function's defaults
def _fixed_padding(inputs, kernel_size, *args, mode='CONSTANT', **kwargs):
    """
    Pads the input along the spatial dimensions independently of input size.

    Args:
      inputs: A tensor of size [batch, channels, height_in, width_in] or
        [batch, height_in, width_in, channels] depending on data_format.
      kernel_size: The kernel to be used in the conv2d or max_pool2d operation.
        Should be a positive integer.
      mode: The mode for tf.pad.
      **kwargs: Must contain 'data_format' ('NHWC' or 'NCHW'); in this file it
        is supplied through slim.arg_scope. Other entries are ignored.

    Returns:
      A tensor with the same format as the input with the data either intact
      (if kernel_size == 1) or padded (if kernel_size > 1).

    Raises:
      KeyError: if 'data_format' is not present in kwargs.
    """
    # kernel_size - 1 rows/columns in total; the extra one (if any) goes last.
    before = (kernel_size - 1) // 2
    after = (kernel_size - 1) - before

    if kwargs['data_format'] == 'NCHW':
        paddings = [[0, 0], [0, 0], [before, after], [before, after]]
    else:
        paddings = [[0, 0], [before, after], [before, after], [0, 0]]
    return tf.pad(inputs, paddings, mode=mode)
# Convolution stack of the detection head, with an optional SPP block.
def _yolo_block(inputs, filters, data_format='NCHW', with_spp=False):
    """
    Builds the stack of alternating 1x1 / 3x3 convolutions before a detection layer.

    :param inputs: input feature map.
    :param filters: filters of the 1x1 convolutions; 3x3 ones use twice as many.
    :param data_format: 'NCHW' or 'NHWC'; only forwarded to _spp_block.
    :param with_spp: when True, insert _spp_block plus one extra 1x1
        convolution after the third convolution.
    :return: tuple (route, inputs): the output of the last 1x1 convolution and
        the output of the final 3x3 convolution applied on top of it.
    """
    net = inputs
    for width, kernel in ((filters, 1), (filters * 2, 3), (filters, 1)):
        net = _conv2d_fixed_padding(net, width, kernel)

    # NOTE(review): the source lost its indentation; the SPP branch is taken
    # to be the block call plus the following 1x1 convolution - confirm.
    if with_spp:
        net = _spp_block(net, data_format)
        net = _conv2d_fixed_padding(net, filters, 1)

    net = _conv2d_fixed_padding(net, filters * 2, 3)
    route = _conv2d_fixed_padding(net, filters, 1)
    head = _conv2d_fixed_padding(route, filters * 2, 3)
    return route, head
#返回 channel H W 這三個值
def _get_size(shape, data_format):
if len(shape) == 4:
shape = shape[1:]
return shape[1:3] if data_format == 'NCHW' else shape[0:2]
# Turns a feature map into box predictions expressed in input-image coordinates.
def _detection_layer(inputs, num_classes, anchors, img_size, data_format):
    """
    Applies the final 1x1 convolution and decodes its output into boxes.

    :param inputs: feature map laid out according to data_format.
    :param num_classes: number of predicted classes.
    :param anchors: list of anchor pairs for this layer.
        NOTE(review): treated as (width, height), matching the (x, y) order of
        the box centers - confirm.
    :param img_size: (height, width) of the network input image.
    :param data_format: 'NCHW' or 'NHWC'.
    :return: tensor of shape [batch, num_anchors * grid_h * grid_w,
        5 + num_classes]; the last axis holds box center (2), box size (2),
        confidence (1) and per-class scores, with centers and sizes scaled
        back to the input image.
    """
    num_anchors = len(anchors)
    # Raw predictions: no batch norm, no activation, zero-initialised biases.
    predictions = slim.conv2d(inputs, num_anchors * (5 + num_classes), 1,
                              stride=1, normalizer_fn=None,
                              activation_fn=None,
                              biases_initializer=tf.zeros_initializer())

    shape = predictions.get_shape().as_list()
    # _get_size returns the spatial sizes in (height, width) order.
    grid_size = _get_size(shape, data_format)
    grid_h, grid_w = grid_size[0], grid_size[1]
    dim = grid_h * grid_w
    # Per-box vector: 2 center values, 2 size values, 1 confidence, class scores.
    bbox_attrs = 5 + num_classes

    if data_format == 'NCHW':
        # Move the channel axis last so both layouts share the reshape below.
        predictions = tf.reshape(
            predictions, [-1, num_anchors * bbox_attrs, dim])
        predictions = tf.transpose(predictions, [0, 2, 1])

    # One row per (grid cell, anchor); cells are ordered row by row.
    predictions = tf.reshape(
        predictions, [-1, num_anchors * dim, bbox_attrs])

    # Downscale factor of this layer, kept in (x, y) order so that it lines up
    # with the (x, y) centers and the anchor pairs it is multiplied with.
    stride = (img_size[1] // grid_w, img_size[0] // grid_h)

    # Express the anchors in grid-cell units.
    anchors = [(anchor[0] / stride[0], anchor[1] / stride[1])
               for anchor in anchors]

    box_centers, box_sizes, confidence, classes = tf.split(
        predictions, [2, 2, 1, num_classes], axis=-1)

    box_centers = tf.nn.sigmoid(box_centers)
    confidence = tf.nn.sigmoid(confidence)

    # x runs over the grid width and y over the grid height. meshgrid then
    # yields (grid_h, grid_w) arrays whose row-major flattening matches the
    # cell order of `predictions`, including for non-square grids.
    grid_x = tf.range(grid_w, dtype=tf.float32)
    grid_y = tf.range(grid_h, dtype=tf.float32)
    x_grid, y_grid = tf.meshgrid(grid_x, grid_y)

    x_offset = tf.reshape(x_grid, (-1, 1))
    y_offset = tf.reshape(y_grid, (-1, 1))

    # Repeat each cell offset once per anchor: result shape is [1, dim * num_anchors, 2].
    x_y_offset = tf.concat([x_offset, y_offset], axis=-1)
    x_y_offset = tf.reshape(
        tf.tile(x_y_offset, [1, num_anchors]), [1, -1, 2])

    # Cell-relative centers -> grid coordinates -> input-image coordinates.
    box_centers = box_centers + x_y_offset
    box_centers = box_centers * stride

    # Repeat the anchor list once per grid cell, then scale sizes back up.
    anchors = tf.tile(anchors, [dim, 1])
    box_sizes = tf.exp(box_sizes) * anchors
    box_sizes = box_sizes * stride

    detections = tf.concat([box_centers, box_sizes, confidence], axis=-1)

    classes = tf.nn.sigmoid(classes)
    predictions = tf.concat([detections, classes], axis=-1)
    return predictions
# Upsampling with tf.image.resize_nearest_neighbor.
def _upsample(inputs, out_shape, data_format='NCHW'):
    """
    Resizes a feature map to the spatial size described by out_shape.

    :param inputs: feature map laid out according to data_format.
    :param out_shape: full 4-element shape of the target tensor, in the same
        layout as data_format; only its height and width entries are used.
    :param data_format: 'NCHW' or 'NHWC'.
    :return: the resized tensor in the original layout, named 'upsampled'.
    """
    is_nchw = data_format == 'NCHW'

    # Height/width positions depend on the layout:
    # NCHW -> [N, C, H, W], NHWC -> [N, H, W, C].
    if is_nchw:
        new_height, new_width = out_shape[2], out_shape[3]
    else:
        new_height, new_width = out_shape[1], out_shape[2]

    # tf.image.resize_nearest_neighbor accepts input in format NHWC
    if is_nchw:
        inputs = tf.transpose(inputs, [0, 2, 3, 1])

    inputs = tf.image.resize_nearest_neighbor(inputs, (new_height, new_width))

    # back to NCHW if needed
    if is_nchw:
        inputs = tf.transpose(inputs, [0, 3, 1, 2])

    inputs = tf.identity(inputs, name='upsampled')
    return inputs
# Entry point that assembles the whole network.
def yolo_v3(inputs, num_classes, is_training=False, data_format='NCHW', reuse=False, with_spp=False):
    """
    Creates YOLO v3 model.

    :param inputs: a 4-D tensor of size [batch_size, height, width, channels].
        Dimension batch_size may be undefined. The channel order is RGB.
    :param num_classes: number of predicted classes.
    :param is_training: whether is training or not.
    :param data_format: data format NCHW or NHWC used inside the network; the
        input itself is always expected as NHWC and is transposed when needed.
    :param reuse: whether or not the network and its variables should be reused.
    :param with_spp: whether or not is using spp layer.
    :return: tensor named 'detections' that concatenates, along axis 1, the
        outputs of the three detection layers.
    """
    # The input arrives as NHWC, so (height, width) must be read BEFORE the
    # optional transpose; after it, entries 1:3 would be (channels, height)
    # and the detection layers would compute a zero stride.
    img_size = inputs.get_shape().as_list()[1:3]

    # transpose the inputs to NCHW
    if data_format == 'NCHW':
        inputs = tf.transpose(inputs, [0, 3, 1, 2])

    # normalize values to range [0..1]
    inputs = inputs / 255

    # set batch norm params
    batch_norm_params = {
        'decay': _BATCH_NORM_DECAY,
        'epsilon': _BATCH_NORM_EPSILON,
        'scale': True,
        'is_training': is_training,
        'fused': None,  # Use fused batch norm if possible.
    }

    channel_axis = 1 if data_format == 'NCHW' else 3

    # Set activation_fn and parameters for conv2d, batch_norm.
    with slim.arg_scope([slim.conv2d, slim.batch_norm, _fixed_padding], data_format=data_format, reuse=reuse):
        # Defaults for every slim.conv2d: batch norm, no bias, leaky ReLU.
        with slim.arg_scope([slim.conv2d], normalizer_fn=slim.batch_norm,
                            normalizer_params=batch_norm_params,
                            biases_initializer=None,
                            activation_fn=lambda x: tf.nn.leaky_relu(x, alpha=_LEAKY_RELU)):
            with tf.variable_scope('darknet-53'):
                route_1, route_2, inputs = darknet53(inputs)

            with tf.variable_scope('yolo-v3'):
                # First detection layer, on the final backbone output.
                route, inputs = _yolo_block(inputs, 512, data_format, with_spp)
                detect_1 = _detection_layer(
                    inputs, num_classes, _ANCHORS[6:9], img_size, data_format)
                detect_1 = tf.identity(detect_1, name='detect_1')

                # Second detection layer: upsample and merge with route_2.
                inputs = _conv2d_fixed_padding(route, 256, 1)
                upsample_size = route_2.get_shape().as_list()
                inputs = _upsample(inputs, upsample_size, data_format)
                inputs = tf.concat([inputs, route_2], axis=channel_axis)

                route, inputs = _yolo_block(inputs, 256, data_format)
                detect_2 = _detection_layer(
                    inputs, num_classes, _ANCHORS[3:6], img_size, data_format)
                detect_2 = tf.identity(detect_2, name='detect_2')

                # Third detection layer: upsample and merge with route_1.
                inputs = _conv2d_fixed_padding(route, 128, 1)
                upsample_size = route_1.get_shape().as_list()
                inputs = _upsample(inputs, upsample_size, data_format)
                inputs = tf.concat([inputs, route_1], axis=channel_axis)

                _, inputs = _yolo_block(inputs, 128, data_format)
                detect_3 = _detection_layer(
                    inputs, num_classes, _ANCHORS[0:3], img_size, data_format)
                detect_3 = tf.identity(detect_3, name='detect_3')

                detections = tf.concat([detect_1, detect_2, detect_3], axis=1)
                detections = tf.identity(detections, name='detections')
                return detections
def yolo_v3_spp(inputs, num_classes, is_training=False, data_format='NCHW', reuse=False):
    """
    Creates YOLO v3 with SPP model.

    Thin wrapper that calls yolo_v3 with with_spp=True and forwards every
    other argument unchanged.

    :param inputs: a 4-D tensor of size [batch_size, height, width, channels].
        Dimension batch_size may be undefined. The channel order is RGB.
    :param num_classes: number of predicted classes.
    :param is_training: whether is training or not.
    :param data_format: data format NCHW or NHWC.
    :param reuse: whether or not the network and its variables should be reused.
    :return: whatever yolo_v3 returns for the same arguments with SPP enabled.
    """
    forwarded = dict(is_training=is_training,
                     data_format=data_format,
                     reuse=reuse,
                     with_spp=True)
    return yolo_v3(inputs, num_classes, **forwarded)