TensorFlow-SSD

SSD builds on VGG-16: the three fully connected layers of VGG-16 are removed, convolution blocks 1-5 are kept, and new convolution blocks 6-11 are appended. Each block takes as input the feature maps produced (and, where applicable, max-pooled) by the previous block. Six feature layers are then selected, namely blocks 4, 7, 8, 9, 10 and 11, and each is fed through two parallel convolutions: one regresses the box locations, the other produces class scores that softmax turns into class probabilities.
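
The feature layers and anchor settings are pulled in through SSDNet.default_params. For orientation, here is a minimal sketch of what those defaults look like, assuming the snippets follow the common SSD-Tensorflow (ssd_vgg_300) implementation; treat the exact numbers as illustrative:

# Illustrative defaults for the 300x300 SSD variant (assumed, per SSD-Tensorflow).
num_classes = 21  # 20 VOC classes + 1 background class
feat_layers = ['block4', 'block7', 'block8', 'block9', 'block10', 'block11']
feat_shapes = [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)]
anchor_sizes = [(21., 45.), (45., 99.), (99., 153.),
                (153., 207.), (207., 261.), (261., 315.)]
anchor_ratios = [[2, .5],
                 [2, .5, 3, 1. / 3],
                 [2, .5, 3, 1. / 3],
                 [2, .5, 3, 1. / 3],
                 [2, .5],
                 [2, .5]]
normalizations = [20, -1, -1, -1, -1, -1]  # only block4 (conv4_3) gets L2 normalization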

The SSD network architecture:

import tensorflow as tf

slim = tf.contrib.slim
# SSDNet, custom_layers, ssd_multibox_layer and tensor_shape come from the
# surrounding SSD-Tensorflow code base.


def ssd_net(inputs,  # defines the SSD network structure
            num_classes=SSDNet.default_params.num_classes,  # number of classes
            feat_layers=SSDNet.default_params.feat_layers,  # feature layers used for prediction
            anchor_sizes=SSDNet.default_params.anchor_sizes,
            anchor_ratios=SSDNet.default_params.anchor_ratios,
            normalizations=SSDNet.default_params.normalizations,  # per-layer normalization flags
            is_training=True,
            dropout_keep_prob=0.5,
            prediction_fn=slim.softmax,
            reuse=None,
            scope='ssd_300_vgg'):
    """SSD net definition.
    """
    # if data_format == 'NCHW':
    #     inputs = tf.transpose(inputs, perm=(0, 3, 1, 2))

    # End_points collect relevant activations for external use.
    end_points = {}  # collects each block's output for external use
    with tf.variable_scope(scope, 'ssd_300_vgg', [inputs], reuse=reuse):
        # Original VGG-16 blocks.
        net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1')  # VGG-16 block 1: two 3x3 convs with 64 channels
        end_points['block1'] = net  # store the conv1_2 output as 'block1'
        net = slim.max_pool2d(net, [2, 2], scope='pool1')
        # Block 2.
        net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2')  # two 3x3 convs with 128 channels
        end_points['block2'] = net  # store the conv2_2 output as 'block2'
        net = slim.max_pool2d(net, [2, 2], scope='pool2')
        # Block 3.
        net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')  # three 3x3 convs with 256 channels
        end_points['block3'] = net  # store the conv3_3 output as 'block3'
        net = slim.max_pool2d(net, [2, 2], scope='pool3')
        # Block 4.
        net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4')  # three 3x3 convs with 512 channels
        end_points['block4'] = net  # store the conv4_3 output (before pooling) as 'block4'; this is the first prediction layer
        net = slim.max_pool2d(net, [2, 2], scope='pool4')
        # Block 5.
        net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv5')  # three 3x3 convs with 512 channels
        end_points['block5'] = net  # store the conv5_3 output as 'block5'
        net = slim.max_pool2d(net, [3, 3], stride=1, scope='pool5')  # 3x3 pooling with stride 1, so the spatial size is kept

        # Additional SSD blocks (VGG's fully connected layers are dropped;
        # conv6/conv7 take their place).
        # Block 6: dilated 3x3 convolution.
        net = slim.conv2d(net, 1024, [3, 3], rate=6, scope='conv6')  # dilated (atrous) conv, rate 6, on the last VGG pooling output
        end_points['block6'] = net  # store the conv6 output as 'block6'
        # tf.layers.dropout expects the DROP probability, hence 1 - keep_prob.
        net = tf.layers.dropout(net, rate=1. - dropout_keep_prob, training=is_training)
        # Block 7: 1x1 convolution.
        net = slim.conv2d(net, 1024, [1, 1], scope='conv7')  # 1x1 conv on the dropout output, 1024 channels ('block7')
        end_points['block7'] = net
        net = tf.layers.dropout(net, rate=1. - dropout_keep_prob, training=is_training)  # dropout again after the conv

        # Block 8/9/10/11: 1x1 and 3x3 convolutions, stride 2 (except the last two blocks).
        end_point = 'block8'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 256, [1, 1], scope='conv1x1')  # 1x1 conv, then a stride-2 3x3 conv down to 512 channels ('block8')
            net = custom_layers.pad2d(net, pad=(1, 1))  # explicit 1-pixel zero padding before the VALID conv
            net = slim.conv2d(net, 512, [3, 3], stride=2, scope='conv3x3', padding='VALID')
        end_points[end_point] = net
        end_point = 'block9'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')  # 1x1 conv, then a stride-2 3x3 conv down to 256 channels ('block9')
            net = custom_layers.pad2d(net, pad=(1, 1))
            net = slim.conv2d(net, 256, [3, 3], stride=2, scope='conv3x3', padding='VALID')
        end_points[end_point] = net
        end_point = 'block10'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')  # 1x1 conv, then a 3x3 VALID conv, 256 channels ('block10')
            net = slim.conv2d(net, 256, [3, 3], scope='conv3x3', padding='VALID')
        end_points[end_point] = net
        end_point = 'block11'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')  # 1x1 conv, then a 3x3 VALID conv, 256 channels ('block11')
            net = slim.conv2d(net, 256, [3, 3], scope='conv3x3', padding='VALID')
        end_points[end_point] = net

        # Prediction and localisation layers.
        predictions = []
        logits = []
        localisations = []
        for i, layer in enumerate(feat_layers):  # loop over the prediction feature layers
            with tf.variable_scope(layer + '_box'):  # one name scope per feature layer
                p, l = ssd_multibox_layer(end_points[layer],  # per-anchor class scores p and box offsets l for this layer
                                          num_classes,  # number of classes
                                          anchor_sizes[i],  # anchor scales (shared across this feature map)
                                          anchor_ratios[i],  # anchor aspect ratios
                                          normalizations[i])  # normalization flag; only the first feature layer is L2-normalized
            # Collect this layer's outputs.
            predictions.append(prediction_fn(p))  # prediction_fn is softmax: class scores -> class probabilities
            logits.append(p)  # raw class scores for every anchor in every cell
            localisations.append(l)  # predicted box offsets

    return predictions, localisations, logits, end_points  # class probabilities, box offsets, raw scores, and the feature layers


ssd_net.default_image_size = 300
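
A minimal usage sketch (hypothetical, assuming the surrounding SSD-Tensorflow code base is importable so that SSDNet, custom_layers and tensor_shape resolve):

import tensorflow as tf

# Build the network on a batch of 300x300 RGB images.
images = tf.placeholder(tf.float32, shape=(None, 300, 300, 3))
predictions, localisations, logits, end_points = ssd_net(images, is_training=False)

# For the 300x300 variant: end_points['block4'] is (None, 38, 38, 512),
# predictions[0] is (None, 38, 38, 4, num_classes) and
# localisations[0] is (None, 38, 38, 4, 4).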

Predicting locations and classes:

def ssd_multibox_layer(inputs,  # input feature layer
                       num_classes,  # number of classes
                       sizes,  # reference anchor scales
                       ratios=[1],  # default anchor aspect ratio is 1
                       normalization=-1,  # no normalization by default
                       bn_normalization=False):
    """Construct a multibox layer, return a class and localization predictions.
    """
    net = inputs
    if normalization > 0:  # a positive flag means this layer is L2-normalized
        net = custom_layers.l2_normalization(net, scaling=True)  # L2-normalize along the channel axis, then scale by a learned gamma
    # Number of anchors.
    num_anchors = len(sizes) + len(ratios)  # anchors per cell; across the six layers this gives [4, 6, 6, 6, 4, 4]

    # Location: each anchor needs 4 coordinates.
    num_loc_pred = num_anchors * 4  # localization channels per cell = anchors * 4
    loc_pred = slim.conv2d(net, num_loc_pred, [3, 3], activation_fn=None,  # a 3x3 conv produces the box regression output
                           scope='conv_loc')  # shape [batch, h, w, anchors * 4]
    loc_pred = custom_layers.channel_to_last(loc_pred)  # ensure channels sit on the last axis
    loc_pred = tf.reshape(loc_pred,  # reshape to [batch, h, w, num_anchors, 4]
                          tensor_shape(loc_pred, 4)[:-1] + [num_anchors, 4])
    # Class prediction.
    num_cls_pred = num_anchors * num_classes  # classification channels per cell = anchors * classes
    cls_pred = slim.conv2d(net, num_cls_pred, [3, 3], activation_fn=None,  # a 3x3 conv produces the class scores
                           scope='conv_cls')  # shape [batch, h, w, anchors * classes]
    cls_pred = custom_layers.channel_to_last(cls_pred)
    cls_pred = tf.reshape(cls_pred,
                          tensor_shape(cls_pred, 4)[:-1] + [num_anchors,
                                                            num_classes])  # reshape to [batch, h, w, num_anchors, num_classes]
    return cls_pred, loc_pred  # class score tensor and box location tensor
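
As a quick check of the arithmetic, take 'block4' with the defaults assumed earlier (sizes=(21., 45.), ratios=[2, .5], num_classes=21):

# num_anchors  = len(sizes) + len(ratios) = 2 + 2 = 4 anchors per cell
# num_loc_pred = 4 * 4  = 16 output channels for conv_loc
# num_cls_pred = 4 * 21 = 84 output channels for conv_cls
# After the reshapes, on block4's 38x38 map:
#   loc_pred: (batch, 38, 38, 4, 4)
#   cls_pred: (batch, 38, 38, 4, 21)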

Reference blog: https://blog.csdn.net/c20081052/article/details/80391627
