【darknet源码解析-11】batchnorm_layer.h 和 batchnorm_layer.c解析

本系列为darknet源码解析，本次解析src/batchnorm_layer.h 与 src/batchnorm_layer.c两个。batchnorm主要完成批归一化操作。

论文名字：Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift

论文地址：https://arxiv.org/pdf/1502.03167.pdf

BatchNorm，BN的基本动机与原理是什么？在CNN中如何使用？

神经网络的训练过程的本质是学习数据分布，如果训练数据与测试数据的分布不同将大大降低网络的泛化能力，所以我们需要在训练开始前对所有输入数据进行归一化操作。然后随着网络的训练，每个隐藏层的参数变化使得后一层的输入发生变化，从而每一个batchsize的训练数据的分布也随之变化，使得网络在每次迭代中都需要去拟合不同的数据分布，增大训练的复杂度以及过拟合的风险。BN是在网络的每一层输入之前增加归一化处理（均值为0，标准差为1）将所有批数据强制在统一的数据分布下。

BN层实现：

batchnorm_layer.h 的解析如下：

#ifndef BATCHNORM_LAYER_H
#define BATCHNORM_LAYER_H

#include "image.h"
#include "layer.h"
#include "network.h"

// Constructor for a batch-normalization (BN) layer: `batch` images of w x h x c each.
layer make_batchnorm_layer(int batch, int w, int h, int c);

// Forward pass of the BN layer (CPU).
void forward_batchnorm_layer(layer l, network net);

// Backward pass of the BN layer (CPU).
void backward_batchnorm_layer(layer l, network net);

#ifdef GPU
// GPU counterparts of the forward/backward passes, plus helpers that copy
// the layer parameters between host arrays and their *_gpu buffers.
void forward_batchnorm_layer_gpu(layer l, network net);
void backward_batchnorm_layer_gpu(layer l, network net);
void pull_batchnorm_layer(layer l);
void push_batchnorm_layer(layer l);
#endif

#endif

求导：

$\frac{\partial L}{\partial y_i}=l.delta$ 【公式1-0】

$\frac{\partial L}{\partial \gamma}=\sum_{i=1}^{m}\frac{\partial L}{\partial y_i}*\frac{\partial y_i}{\partial \gamma}=\sum_{i=1}^{m}l.delta_i*\hat {x}_i$ 【公式1-1】

$\frac{\partial L}{\partial \beta }=\sum_{i=1}^{m}\frac{\partial L}{\partial y_i}*\frac{\partial y_i}{\partial \beta }=\sum_{i=1}^{m}l.delta_i$ 【公式1-2】

$\frac{\partial L}{\partial \hat{x}_i}=\frac{\partial L}{ \partial y_i}*\frac{\partial y_i}{\partial \hat{x}_i}=l.delta*\gamma$ 【公式1-3】

$\frac{\partial L}{\partial \mu_B}=\sum_{i=1}^{m}\frac{\partial L}{ \partial y_i}*\frac{\partial y_i}{\partial \hat{x}_i}*\frac{\partial \hat{x}_i}{\partial \mu_B}=\sum_{i=1}^{m}l.delta_i*\gamma*(\frac{-1}{\sqrt{\sigma _{B}^{2}+\epsilon }})$ 【公式1-4】

$\frac{\partial L}{\partial\sigma _{B}^{2}}=\sum_{i=1}^{m}\frac{\partial L}{ \partial y_i}*\frac{\partial y_i}{\partial \hat{x}_i}*\frac{\partial \hat{x}_i}{\partial \sigma _{B}^{2}}=\sum_{i=1}^{m}l.delta_i*\gamma*(x_i-\mu_B)*\frac{-1}{2}*(\sigma _{B}^{2}+\epsilon)^{\frac{-3}{2}}$ 【公式1-5】

$\frac{\partial L}{\partial x_i}=\frac{\partial L}{ \partial y_i}*\frac{\partial y_i}{\partial \hat{x}_i}*\frac{\partial \hat{x}_i}{\partial x_i}+\frac{\partial L}{\partial \mu_B}*\frac{\partial \mu_B}{\partial x_i}+\frac{\partial L}{\partial \sigma _{B}^{2}}*\frac{\partial \sigma _{B}^{2}}{\partial x_i}=$ 【公式1-6】

$l.delta_i*\gamma*\frac{1}{\sqrt{\sigma _{B}^{2}+\epsilon }}+\frac{1}{m}*\sum_{j=1}^{m}l.delta_j*\gamma*(\frac{-1}{\sqrt{\sigma _{B}^{2}+\epsilon }})+\frac{2}{m}*(x_i-\mu_B)*\sum_{j=1}^{m}l.delta_j*\gamma*(x_j-\mu_B)*\frac{-1}{2}*(\sigma _{B}^{2}+\epsilon)^{\frac{-3}{2}}$

batchnorm_layer.c 的解析如下：

#include "convolutional_layer.h"
#include "batchnorm_layer.h"
#include "blas.h"
#include <stdio.h>


// 构造归一化层
/**
 * 构造归一化层
 * @param batch 一个batch包含图片的张数
 * @param w 输入图片的高度
 * @param h 输入图片的宽度
 * @param c 输入图片的通道数
 * @return
 */
layer make_batchnorm_layer(int batch, int w, int h, int c)
{
    fprintf(stderr, "Batch Normalization Layer: %d x %d x %d image\n", w,h,c);
    layer l = {0};
    l.type = BATCHNORM;

    l.batch = batch; // 一个batch中图片的张数
    l.h = l.out_h = h; // 输入图片的高度
    l.w = l.out_w = w; // 输入图片的宽度
    l.c = l.out_c = c; // 输入图片的通道数
    // calloc 传入两个参数，分别为元素的数目和每个元素的大小
    // calloc 会将所有分配的内存空间中的每一位都初始化为零
    l.output = calloc(h * w * c * batch, sizeof(float)); // BN层的所有输出（包含整个batch的）
    l.delta  = calloc(h * w * c * batch, sizeof(float)); // BN层的误差损失项（包含整个batch的）
    l.inputs = w*h*c; // BN层一张输入图片中所有元素的个数
    l.outputs = l.inputs; // BN层对应一张输入图片的输出元素个数, BN层不会改变输入输出的个数，通道数也不发生变化
    //
    l.scales = calloc(c, sizeof(float)); // BN层的gamma参数项
    l.scale_updates = calloc(c, sizeof(float)); // gamma更新值

    l.biases = calloc(c, sizeof(float)); // BN层的beta参数项
    l.bias_updates = calloc(c, sizeof(float)); // beta更新值
    int i;
    for(i = 0; i < c; ++i){ //gamma初始化为1
        l.scales[i] = 1;
    }

    l.mean = calloc(c, sizeof(float)); // 用于保存每个通道元素的平均值
    l.variance = calloc(c, sizeof(float)); // 用于保存每个通道的方差

    l.rolling_mean = calloc(c, sizeof(float)); // 保存每个通道均值的滚动平均
    l.rolling_variance = calloc(c, sizeof(float)); // 保存每个通道的方差的滚动平均

    // BN层的前向, 反向传播函数
    l.forward = forward_batchnorm_layer;
    l.backward = backward_batchnorm_layer;
#ifdef GPU
    l.forward_gpu = forward_batchnorm_layer_gpu;
    l.backward_gpu = backward_batchnorm_layer_gpu;

    l.output_gpu =  cuda_make_array(l.output, h * w * c * batch);
    l.delta_gpu =   cuda_make_array(l.delta, h * w * c * batch);

    l.biases_gpu = cuda_make_array(l.biases, c);
    l.bias_updates_gpu = cuda_make_array(l.bias_updates, c);

    l.scales_gpu = cuda_make_array(l.scales, c);
    l.scale_updates_gpu = cuda_make_array(l.scale_updates, c);

    l.mean_gpu = cuda_make_array(l.mean, c);
    l.variance_gpu = cuda_make_array(l.variance, c);

    l.rolling_mean_gpu = cuda_make_array(l.mean, c);
    l.rolling_variance_gpu = cuda_make_array(l.variance, c);

    l.mean_delta_gpu = cuda_make_array(l.mean, c);
    l.variance_delta_gpu = cuda_make_array(l.variance, c);

    l.x_gpu = cuda_make_array(l.output, l.batch*l.outputs);
    l.x_norm_gpu = cuda_make_array(l.output, l.batch*l.outputs);
    #ifdef CUDNN
    cudnnCreateTensorDescriptor(&l.normTensorDesc);
    cudnnCreateTensorDescriptor(&l.dstTensorDesc);
    cudnnSetTensor4dDescriptor(l.dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w); 
    cudnnSetTensor4dDescriptor(l.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l.out_c, 1, 1); 

    #endif
#endif
    return l;
}

// Gradient of the loss with respect to gamma (the per-channel scale).
// For every channel, sums delta * x_norm over all batch items and spatial
// positions and ADDS the result to scale_updates[channel].
// Typical call:
//   backward_scale_cpu(l.x_norm, l.delta, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates);
void backward_scale_cpu(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates)
{
    for (int ch = 0; ch < n; ++ch) {
        float acc = 0;
        for (int img = 0; img < batch; ++img) {
            // First element of channel `ch` inside image `img`.
            const int base = size * (ch + n * img);
            for (int k = 0; k < size; ++k) {
                acc += delta[k + base] * x_norm[k + base];
            }
        }
        scale_updates[ch] += acc;
    }
}

// 求y对均值的导数,对应公式 BN 2-2
void mean_delta_cpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta)
{

    int i,j,k;
    for(i = 0; i < filters; ++i){
        mean_delta[i] = 0;
        for (j = 0; j < batch; ++j) {
            for (k = 0; k < spatial; ++k) {
                int index = j*filters*spatial + i*spatial + k;
                mean_delta[i] += delta[index];
            }
        }
        mean_delta[i] *= (-1./sqrt(variance[i] + .00001f));
    }
}

// 求y对方差的导数,对应公式 BN 2-1
void  variance_delta_cpu(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial, float *variance_delta)
{

    int i,j,k;
    for(i = 0; i < filters; ++i){
        variance_delta[i] = 0;
        for(j = 0; j < batch; ++j){
            for(k = 0; k < spatial; ++k){
                int index = j*filters*spatial + i*spatial + k;
                variance_delta[i] += delta[index]*(x[index] - mean[i]);
            }
        }
        variance_delta[i] *= -.5 * pow(variance[i] + .00001f, (float)(-3./2.));
    }
}

// 归一化,对应公式 BN 2-3
void normalize_delta_cpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta)
{
    int f, j, k;
    for(j = 0; j < batch; ++j){
        for(f = 0; f < filters; ++f){
            for(k = 0; k < spatial; ++k){
                int index = j*filters*spatial + f*spatial + k;
                delta[index] = delta[index] * 1./(sqrt(variance[f] + .00001f)) + variance_delta[f] * 2. * (x[index] - mean[f]) / (spatial * batch) + mean_delta[f]/(spatial*batch);
            }
        }
    }
}

// Resizing a batch-normalization layer is not supported; this only reports
// that fact on stderr and leaves the layer untouched.
void resize_batchnorm_layer(layer *layer, int w, int h)
{
    fputs("Not implemented\n", stderr);
}

// Typical call: mean_cpu(l.output, l.batch, l.out_c, l.out_h*l.out_w, l.mean);
/**
 * Per-channel mean over all batch items and spatial positions.
 * @param x       input laid out as [batch][filters][spatial]
 * @param batch   number of images in x
 * @param filters number of channels
 * @param spatial number of elements per channel of one image
 * @param mean    output, one value per channel (overwritten)
 */
void mean_cpu(float *x, int batch, int filters, int spatial, float *mean)
{
    const float inv_count = 1./(batch * spatial);
    for (int f = 0; f < filters; ++f) {
        mean[f] = 0;
        for (int b = 0; b < batch; ++b) {
            // Offset of channel `f` inside image `b`.
            const int base = b*filters*spatial + f*spatial;
            for (int s = 0; s < spatial; ++s) {
                mean[f] += x[base + s];
            }
        }
        mean[f] *= inv_count;
    }
}

/**
 * Per-channel sample variance over all batch items and spatial positions,
 * using the unbiased divisor (batch*spatial - 1).
 *
 * When a channel holds at most one sample (batch*spatial <= 1) the divisor
 * would be zero or negative; the variance is then reported as 0 instead of
 * the NaN that 0 * (1/0) would produce.
 *
 * @param x        input laid out as [batch][filters][spatial]
 * @param mean     per-channel mean of x
 * @param batch    number of images in x
 * @param filters  number of channels
 * @param spatial  number of elements per channel of one image
 * @param variance output, one value per channel (overwritten)
 */
void variance_cpu(float *x, float *mean, int batch, int filters, int spatial, float *variance)
{
    const int count = batch * spatial;
    const float scale = (count > 1) ? 1./(count - 1) : 0;
    int i,j,k;
    for(i = 0; i < filters; ++i){
        variance[i] = 0;
        for(j = 0; j < batch; ++j){
            for(k = 0; k < spatial; ++k){
                int index = j*filters*spatial + i*spatial + k;
                // Square in double precision: exact for float inputs and
                // needs no libm call.
                const float diff = x[index] - mean[i];
                variance[i] += (double)diff * diff;
            }
        }
        variance[i] *= scale;
    }
}

// In-place scaling: X[k*INCX] *= ALPHA for k = 0 .. N-1.
// Typical call: scal_cpu(l.out_c, .99, l.rolling_mean, 1);
void scal_cpu(int N, float ALPHA, float *X, int INCX)
{
    int k = 0;
    while (k < N) {
        X[k*INCX] *= ALPHA;
        ++k;
    }
}
// Scaled accumulate: Y[k*INCY] += ALPHA * X[k*INCX] for k = 0 .. N-1.
// Typical call: axpy_cpu(l.out_c, .01, l.mean, 1, l.rolling_mean, 1);
void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY)
{
    int k = 0;
    while (k < N) {
        Y[k*INCY] += ALPHA*X[k*INCX];
        ++k;
    }
}





// Forward pass of batch normalization (CPU).
/**
 * Normalizes l.output per channel, then applies gamma (l.scales) and beta (l.biases).
 * @param l   the current BN layer
 * @param net the whole network (provides the input and the train/test flag)
 */
void forward_batchnorm_layer(layer l, network net)
{
    const int channels = l.out_c;
    const int spatial = l.out_h*l.out_w;

    // A standalone BN layer works in place on a copy of its input held in l.output.
    if(l.type == BATCHNORM) copy_cpu(l.outputs*l.batch, net.input, 1, l.output, 1);
    // Keep the un-normalized values in l.x; the backward pass reads them.
    copy_cpu(l.outputs*l.batch, l.output, 1, l.x, 1);

    if(!net.train){
        // Inference: normalize with the running statistics.
        normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, channels, spatial);
    } else {
        // Training: statistics of the current mini-batch.
        mean_cpu(l.output, l.batch, channels, spatial, l.mean);
        variance_cpu(l.output, l.mean, l.batch, channels, spatial, l.variance);

        // rolling_mean = 0.99 * rolling_mean + 0.01 * mean
        scal_cpu(channels, .99, l.rolling_mean, 1);
        axpy_cpu(channels, .01, l.mean, 1, l.rolling_mean, 1);

        // rolling_variance = 0.99 * rolling_variance + 0.01 * variance
        scal_cpu(channels, .99, l.rolling_variance, 1);
        axpy_cpu(channels, .01, l.variance, 1, l.rolling_variance, 1);

        // Normalize with the batch statistics ...
        normalize_cpu(l.output, l.mean, l.variance, l.batch, channels, spatial);

        // ... and keep the normalized values in l.x_norm for the gamma gradient.
        copy_cpu(l.outputs*l.batch, l.output, 1, l.x_norm, 1);
    }

    // Scale and shift: output = gamma * x_norm + beta.
    scale_bias(l.output, l.scales, l.batch, channels, spatial);
    add_bias(l.output, l.biases, l.batch, channels, spatial);
}

// Shift step: adds the per-channel bias (beta) to every element of the batch.
// `output` is laid out as [batch][n channels][size elements].
void add_bias(float *output, float *biases, int batch, int n, int size)
{
    for (int img = 0; img < batch; ++img) {
        for (int ch = 0; ch < n; ++ch) {
            // First element of channel `ch` inside image `img`.
            const int base = (img*n + ch)*size;
            for (int k = 0; k < size; ++k) {
                output[base + k] += biases[ch];
            }
        }
    }
}

// Scale step: multiplies every element of the batch by its per-channel scale.
// `output` is laid out as [batch][n channels][size elements].
// Typical call: scale_bias(l.delta, l.scales, l.batch, l.out_c, l.out_h*l.out_w);
void scale_bias(float *output, float *scales, int batch, int n, int size)
{
    for (int img = 0; img < batch; ++img) {
        for (int ch = 0; ch < n; ++ch) {
            // First element of channel `ch` inside image `img`.
            const int base = (img*n + ch)*size;
            for (int k = 0; k < size; ++k) {
                output[base + k] *= scales[ch];
            }
        }
    }
}

// Backward pass of batch normalization (CPU).
/**
 * Accumulates the gradients of beta and gamma and turns l.delta into the
 * gradient with respect to the layer input.
 * @param l   the current BN layer
 * @param net the whole network (provides the train/test flag and net.delta)
 */
void backward_batchnorm_layer(layer l, network net)
{
    const int channels = l.out_c;
    const int spatial = l.out_w*l.out_h;

    if(!net.train) {
        // Not training: use the running statistics. `l` is a by-value copy,
        // so this only redirects the pointers for this call.
        l.mean = l.rolling_mean;
        l.variance = l.rolling_variance;
    }

    // Gradient of beta: sum of delta per channel.
    backward_bias(l.bias_updates, l.delta, l.batch, channels, spatial);
    // Gradient of gamma: sum of delta * x_norm per channel.
    backward_scale_cpu(l.x_norm, l.delta, l.batch, channels, spatial, l.scale_updates);

    // Gradient with respect to x_norm: delta * gamma (in place).
    scale_bias(l.delta, l.scales, l.batch, channels, spatial);

    // Gradients with respect to the batch mean and the batch variance.
    mean_delta_cpu(l.delta, l.variance, l.batch, channels, spatial, l.mean_delta);
    variance_delta_cpu(l.x, l.delta, l.mean, l.variance, l.batch, channels, spatial, l.variance_delta);
    // Gradient with respect to each input x_i, i.e. the error term for the previous layer.
    normalize_delta_cpu(l.x, l.mean, l.variance, l.mean_delta, l.variance_delta, l.batch, channels, spatial, l.delta);
    // A standalone BN layer hands its delta to the network buffer.
    if(l.type == BATCHNORM) copy_cpu(l.outputs*l.batch, l.delta, 1, net.delta, 1);
}

// Gradient of beta (the per-channel bias): for every image and channel, adds
// sum_array() over that channel's `size` delta values to bias_updates[channel].
// Typical call: backward_bias(l.bias_updates, l.delta, l.batch, l.out_c, l.out_w*l.out_h);
void backward_bias(float *bias_updates, float *delta, int batch, int n , int size) {
    for (int img = 0; img < batch; ++img) {
        for (int ch = 0; ch < n; ++ch) {
            // Start of channel `ch` inside image `img`.
            float *channel_delta = delta + size*(ch + img*n);
            bias_updates[ch] += sum_array(channel_delta, size);
        }
    }
}

#ifdef GPU

// Calls cuda_pull_array for gamma (scales) and the running mean/variance,
// pairing each *_gpu buffer with its host array, l.c elements each
// (presumably a device-to-host copy — confirm against cuda_pull_array).
// NOTE(review): l.biases / l.biases_gpu are not transferred here — confirm that is intended.
void pull_batchnorm_layer(layer l)
{
    cuda_pull_array(l.scales_gpu, l.scales, l.c);
    cuda_pull_array(l.rolling_mean_gpu, l.rolling_mean, l.c);
    cuda_pull_array(l.rolling_variance_gpu, l.rolling_variance, l.c);
}
// Calls cuda_push_array for gamma (scales) and the running mean/variance,
// pairing each *_gpu buffer with its host array, l.c elements each
// (presumably a host-to-device copy — confirm against cuda_push_array).
// NOTE(review): l.biases / l.biases_gpu are not transferred here — confirm that is intended.
void push_batchnorm_layer(layer l)
{
    cuda_push_array(l.scales_gpu, l.scales, l.c);
    cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, l.c);
    cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, l.c);
}

// GPU forward pass of batch normalization. Mirrors forward_batchnorm_layer()
// on the *_gpu buffers; in training mode a single cuDNN call replaces the
// individual steps when CUDNN is defined.
void forward_batchnorm_layer_gpu(layer l, network net)
{
    // A standalone BN layer first copies the network input into l.output_gpu.
    if(l.type == BATCHNORM) copy_gpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1);
    // Keep the un-normalized values in l.x_gpu for the backward pass.
    copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
    if (net.train) {
#ifdef CUDNN
        // Reads l.x_gpu and writes l.output_gpu. Per the cuDNN API the .01 is
        // the exponential-average factor for the running statistics, .00001
        // is epsilon, and the last two arguments receive the saved batch mean
        // and saved inverse variance (verify against the cuDNN reference).
        float one = 1;
        float zero = 0;
        cudnnBatchNormalizationForwardTraining(cudnn_handle(),
                CUDNN_BATCHNORM_SPATIAL,
                &one,
                &zero,
                l.dstTensorDesc,
                l.x_gpu,
                l.dstTensorDesc,
                l.output_gpu,
                l.normTensorDesc,
                l.scales_gpu,
                l.biases_gpu,
                .01,
                l.rolling_mean_gpu,
                l.rolling_variance_gpu,
                .00001,
                l.mean_gpu,
                l.variance_gpu);
#else
        // Same sequence as the CPU path (helper semantics assumed from their
        // names — confirm): batch statistics, running-average update,
        // normalize, then scale and shift.
        fast_mean_gpu(l.output_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.mean_gpu);
        fast_variance_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.variance_gpu);

        // rolling = 0.99 * rolling + 0.01 * batch statistic
        scal_gpu(l.out_c, .99, l.rolling_mean_gpu, 1);
        axpy_gpu(l.out_c, .01, l.mean_gpu, 1, l.rolling_mean_gpu, 1);
        scal_gpu(l.out_c, .99, l.rolling_variance_gpu, 1);
        axpy_gpu(l.out_c, .01, l.variance_gpu, 1, l.rolling_variance_gpu, 1);

        // NOTE(review): l.x_gpu was already filled from l.output_gpu at the top
        // of this function; this second copy looks redundant — confirm.
        copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
        normalize_gpu(l.output_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
        // Keep the normalized values for the gamma gradient in the backward pass.
        copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.x_norm_gpu, 1);

        scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
#endif
    } else {
        // Inference: normalize with the running statistics, then scale and shift.
        normalize_gpu(l.output_gpu, l.rolling_mean_gpu, l.rolling_variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
        scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
    }

}

// GPU backward pass of batch normalization. Mirrors backward_batchnorm_layer()
// on the *_gpu buffers; a single cuDNN call replaces the individual steps
// when CUDNN is defined.
void backward_batchnorm_layer_gpu(layer l, network net)
{
    if(!net.train){
        // Not training: use the running statistics. `l` is a by-value copy,
        // so this only redirects the pointers for this call.
        // NOTE(review): in the CUDNN path the last argument is, per the cuDNN
        // API, a saved *inverse* variance, while rolling_variance_gpu holds a
        // variance — confirm this combination is ever exercised.
        l.mean_gpu = l.rolling_mean_gpu;
        l.variance_gpu = l.rolling_variance_gpu;
    }
#ifdef CUDNN
    // Inputs: l.x_gpu (forward input) and l.delta_gpu (incoming gradient).
    // The input gradient is written to l.x_norm_gpu, used here as scratch,
    // and copied back into l.delta_gpu afterwards. Per the cuDNN API the four
    // scalars are alphaDataDiff, betaDataDiff, alphaParamDiff, betaParamDiff,
    // so with betaParamDiff = 1 the gamma/beta gradients are accumulated into
    // scale_updates_gpu / bias_updates_gpu (verify against the cuDNN reference).
    float one = 1;
    float zero = 0;
    cudnnBatchNormalizationBackward(cudnn_handle(),
            CUDNN_BATCHNORM_SPATIAL,
            &one,
            &zero,
            &one,
            &one,
            l.dstTensorDesc,
            l.x_gpu,
            l.dstTensorDesc,
            l.delta_gpu,
            l.dstTensorDesc,
            l.x_norm_gpu,
            l.normTensorDesc,
            l.scales_gpu,
            l.scale_updates_gpu,
            l.bias_updates_gpu,
            .00001,
            l.mean_gpu,
            l.variance_gpu);
    copy_gpu(l.outputs*l.batch, l.x_norm_gpu, 1, l.delta_gpu, 1);
#else
    // Same sequence as the CPU path (helper semantics assumed from their
    // names — confirm): gradients of beta and gamma first ...
    backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h);
    backward_scale_gpu(l.x_norm_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates_gpu);

    // ... then delta * gamma in place ...
    scale_bias_gpu(l.delta_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);

    // ... then gradients w.r.t. mean and variance, and finally w.r.t. each input element.
    fast_mean_delta_gpu(l.delta_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.mean_delta_gpu);
    fast_variance_delta_gpu(l.x_gpu, l.delta_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.variance_delta_gpu);
    normalize_delta_gpu(l.x_gpu, l.mean_gpu, l.variance_gpu, l.mean_delta_gpu, l.variance_delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.delta_gpu);
#endif
    // A standalone BN layer hands its delta to the network buffer.
    if(l.type == BATCHNORM) copy_gpu(l.outputs*l.batch, l.delta_gpu, 1, net.delta_gpu, 1);
}
#endif

【darknet源码解析-11】batchnorm_layer.h 和 batchnorm_layer.c解析

lightdb hash index的性能和限制

[LeetCode 解題報告]026. Remove Duplicates from Sorted Array

[LeetCode 解題報告]206. Reverse Linked List

[LeetCode 解題報告]031. Next Permutation

[LeetCode 解題報告]028. Implement strStr()

[LeetCode 解題報告]032. Longest Valid Parentheses

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結