SSD在vgg-16的基础上,去掉了vgg-16的三个全连接层,保留了第1-5的卷积层,又新增加了第6-11的卷积层。下一卷积层的输入,都是上一卷积层提取完特征、池化后的结果。最后选取第4、7、8、9、10、11这六个特征层,对这六个特征层各做两次卷积:一次卷积通过线性回归预测位置,另一次卷积通过softmax归一化来做分类。
SSD的网络架构:
def ssd_net(inputs,
            num_classes=SSDNet.default_params.num_classes,
            feat_layers=SSDNet.default_params.feat_layers,
            anchor_sizes=SSDNet.default_params.anchor_sizes,
            anchor_ratios=SSDNet.default_params.anchor_ratios,
            normalizations=SSDNet.default_params.normalizations,
            is_training=True,
            dropout_keep_prob=0.5,
            prediction_fn=slim.softmax,
            reuse=None,
            scope='ssd_300_vgg'):
    """SSD-300 network definition (VGG-16 backbone + extra feature blocks).

    Args:
        inputs: input image batch tensor (NHWC assumed — confirm caller).
        num_classes: number of object classes predicted per anchor.
        feat_layers: names of the end_points entries used as prediction
            feature maps.
        anchor_sizes: per-feature-layer anchor scales.
        anchor_ratios: per-feature-layer anchor aspect ratios.
        normalizations: per-feature-layer L2-normalization flags.
        is_training: enables dropout when True.
        dropout_keep_prob: probability of KEEPING a unit in dropout.
        prediction_fn: maps class logits to probabilities (default softmax).
        reuse: variable-scope reuse flag.
        scope: outer variable-scope name.

    Returns:
        Tuple (predictions, localisations, logits, end_points):
        per-feature-layer class probabilities, box regressions, raw class
        logits, and the dict of intermediate block activations.
    """
    # End_points collects each block's activation for external use
    # (the prediction heads below index into it by feat_layers name).
    end_points = {}
    with tf.variable_scope(scope, 'ssd_300_vgg', [inputs], reuse=reuse):
        # --- Original VGG-16 blocks (conv1..conv5, FC layers dropped). ---
        # Block 1: 2 x conv3x3, 64 channels.
        net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1')
        end_points['block1'] = net
        net = slim.max_pool2d(net, [2, 2], scope='pool1')
        # Block 2: 2 x conv3x3, 128 channels.
        net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2')
        end_points['block2'] = net
        net = slim.max_pool2d(net, [2, 2], scope='pool2')
        # Block 3: 3 x conv3x3, 256 channels.
        net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')
        end_points['block3'] = net
        net = slim.max_pool2d(net, [2, 2], scope='pool3')
        # Block 4: 3 x conv3x3, 512 channels. Stored BEFORE pooling —
        # conv4_3 is the first SSD prediction feature map.
        net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4')
        end_points['block4'] = net
        net = slim.max_pool2d(net, [2, 2], scope='pool4')
        # Block 5: 3 x conv3x3, 512 channels; pool5 uses stride 1 so the
        # spatial resolution is preserved for the dilated conv6.
        net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv5')
        end_points['block5'] = net
        net = slim.max_pool2d(net, [3, 3], stride=1, scope='pool5')

        # --- Additional SSD blocks (replace the VGG FC layers). ---
        # Block 6: dilated (atrous) 3x3 conv, rate 6, 1024 channels.
        net = slim.conv2d(net, 1024, [3, 3], rate=6, scope='conv6')
        end_points['block6'] = net
        # FIX: tf.layers.dropout's `rate` is the fraction of units to DROP,
        # while dropout_keep_prob is a KEEP probability — pass the
        # complement (the original code only worked because 0.5 == 1 - 0.5).
        net = tf.layers.dropout(net, rate=1.0 - dropout_keep_prob,
                                training=is_training)
        # Block 7: 1x1 conv, 1024 channels.
        net = slim.conv2d(net, 1024, [1, 1], scope='conv7')
        end_points['block7'] = net
        net = tf.layers.dropout(net, rate=1.0 - dropout_keep_prob,
                                training=is_training)

        # Blocks 8-11: 1x1 conv to shrink channels, then 3x3 conv.
        # Blocks 8/9 pad explicitly and stride 2 (downsample);
        # blocks 10/11 use VALID padding at stride 1.
        end_point = 'block8'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 256, [1, 1], scope='conv1x1')
            net = custom_layers.pad2d(net, pad=(1, 1))
            net = slim.conv2d(net, 512, [3, 3], stride=2, scope='conv3x3',
                              padding='VALID')
        end_points[end_point] = net
        end_point = 'block9'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')
            net = custom_layers.pad2d(net, pad=(1, 1))
            net = slim.conv2d(net, 256, [3, 3], stride=2, scope='conv3x3',
                              padding='VALID')
        end_points[end_point] = net
        end_point = 'block10'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')
            net = slim.conv2d(net, 256, [3, 3], scope='conv3x3',
                              padding='VALID')
        end_points[end_point] = net
        end_point = 'block11'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')
            net = slim.conv2d(net, 256, [3, 3], scope='conv3x3',
                              padding='VALID')
        end_points[end_point] = net

        # --- Prediction and localisation heads. ---
        predictions = []
        logits = []
        localisations = []
        for i, layer in enumerate(feat_layers):
            with tf.variable_scope(layer + '_box'):
                # ssd_multibox_layer returns per-anchor class logits `p`
                # and box regressions `l` for this feature map.
                p, l = ssd_multibox_layer(end_points[layer],
                                          num_classes,
                                          anchor_sizes[i],
                                          anchor_ratios[i],
                                          normalizations[i])
            predictions.append(prediction_fn(p))  # softmax class probabilities
            logits.append(p)                      # raw class logits
            localisations.append(l)               # box location predictions
        return predictions, localisations, logits, end_points
ssd_net.default_image_size = 300
预测位置和分类:
def ssd_multibox_layer(inputs,
                       num_classes,
                       sizes,
                       ratios=[1],
                       normalization=-1,
                       bn_normalization=False):
    """Construct a multibox layer, return a class and localization predictions.

    Args:
        inputs: feature map tensor to attach the prediction heads to.
        num_classes: number of object classes predicted per anchor.
        sizes: reference anchor scales for this feature layer.
        ratios: anchor aspect ratios (default: only ratio 1).
        normalization: if > 0, apply L2 normalization to the input first
            (in practice only the first feature map, conv4_3, uses this).
        bn_normalization: unused here — kept for interface compatibility.

    Returns:
        Tuple (cls_pred, loc_pred): class predictions reshaped to
        [..., num_anchors, num_classes] and box locations reshaped to
        [..., num_anchors, 4].
    """
    net = inputs
    if normalization > 0:
        # L2-normalize across the channel dimension with a learned
        # per-channel scaling factor (gamma).
        net = custom_layers.l2_normalization(net, scaling=True)
    # Number of anchors per cell on this feature map
    # (per-layer counts are [4, 6, 6, 6, 4, 4] in the default config).
    num_anchors = len(sizes) + len(ratios)

    # Location head: each anchor needs 4 coordinate offsets, so the 3x3
    # conv outputs num_anchors * 4 channels (no activation — raw regression).
    num_loc_pred = num_anchors * 4
    loc_pred = slim.conv2d(net, num_loc_pred, [3, 3], activation_fn=None,
                           scope='conv_loc')
    # Move channels to the last dimension (no-op for NHWC inputs).
    loc_pred = custom_layers.channel_to_last(loc_pred)
    # Split the channel dim into [num_anchors, 4] per spatial cell.
    loc_pred = tf.reshape(loc_pred,
                          tensor_shape(loc_pred, 4)[:-1] + [num_anchors, 4])

    # Class head: each anchor needs a score per class, so the 3x3 conv
    # outputs num_anchors * num_classes channels (raw logits).
    num_cls_pred = num_anchors * num_classes
    cls_pred = slim.conv2d(net, num_cls_pred, [3, 3], activation_fn=None,
                           scope='conv_cls')
    cls_pred = custom_layers.channel_to_last(cls_pred)
    # Split the channel dim into [num_anchors, num_classes] per cell.
    cls_pred = tf.reshape(cls_pred,
                          tensor_shape(cls_pred, 4)[:-1] + [num_anchors,
                                                            num_classes])
    return cls_pred, loc_pred
参考博客:https://blog.csdn.net/c20081052/article/details/80391627