【darknet源碼解析-11】batchnorm_layer.h 和 batchnorm_layer.c解析

本系列爲darknet源碼解析,本次解析src/batchnorm_layer.h 與 src/batchnorm_layer.c兩個文件。batchnorm主要完成批歸一化操作。

論文名字:Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift

論文地址:https://arxiv.org/pdf/1502.03167.pdf

BatchNorm,BN的基本動機與原理是什麼?在CNN中如何使用?

神經網絡的訓練過程的本質是學習數據分佈,如果訓練數據與測試數據的分佈不同將大大降低網絡的泛化能力,所以我們需要在訓練開始前對所有輸入數據進行歸一化操作。然後隨着網絡的訓練,每個隱藏層的參數變化使得後一層的輸入發生變化,從而每一個batchsize的訓練數據的分佈也隨之變化,使得網絡在每次迭代中都需要去擬合不同的數據分佈,增大訓練的複雜度以及過擬合的風險。BN是在網絡的每一層輸入之前增加歸一化處理(均值爲0,標準差爲1)將所有批數據強制在統一的數據分佈下。

BN層實現:

batchnorm_layer.h 的解析如下:

#ifndef BATCHNORM_LAYER_H
#define BATCHNORM_LAYER_H

#include "image.h"
#include "layer.h"
#include "network.h"

// Construct a batch-normalization layer for `batch` images of size w x h with c channels.
layer make_batchnorm_layer(int batch, int w, int h, int c);

// Forward pass of the batch-normalization layer (CPU).
void forward_batchnorm_layer(layer l, network net);

// Backward pass of the batch-normalization layer (CPU).
void backward_batchnorm_layer(layer l, network net);

#ifdef GPU
// GPU counterparts of the forward and backward passes.
void forward_batchnorm_layer_gpu(layer l, network net);
void backward_batchnorm_layer_gpu(layer l, network net);
// Transfer scales, rolling mean and rolling variance between the *_gpu arrays
// and their host counterparts (pull: via cuda_pull_array, push: via cuda_push_array).
void pull_batchnorm_layer(layer l);
void push_batchnorm_layer(layer l);
#endif

#endif

求導:

\frac{\partial L}{\partial y_i}=l.delta  【公式1-0】

\frac{\partial L}{\partial \gamma}=\sum_{i=1}^{m}\frac{\partial L}{\partial y_i}*\frac{\partial y_i}{\partial \gamma}=\sum_{i=1}^{m}l.delta_i*\hat {x}_i【公式1-1】

\frac{\partial L}{\partial \beta }=\sum_{i=1}^{m}\frac{\partial L}{\partial y_i}*\frac{\partial y_i}{\partial \beta }=\sum_{i=1}^{m}l.delta_i【公式1-2】

\frac{\partial L}{\partial \hat{x}_i}=\frac{\partial L}{ \partial y_i}*\frac{\partial y_i}{\partial \hat{x}_i}=l.delta*\gamma【公式1-3】

\frac{\partial L}{\partial \mu_B}=\sum_{i=1}^{m}\frac{\partial L}{ \partial y_i}*\frac{\partial y_i}{\partial \hat{x}_i}*\frac{\partial \hat{x}_i}{\partial \mu_B}=\sum_{i=1}^{m}l.delta_i*\gamma*(\frac{-1}{\sqrt{\sigma _{B}^{2}+\epsilon }})【公式1-4】

\frac{\partial L}{\partial\sigma _{B}^{2}}=\sum_{i=1}^{m}\frac{\partial L}{ \partial y_i}*\frac{\partial y_i}{\partial \hat{x}_i}*\frac{\partial \hat{x}_i}{\partial \sigma _{B}^{2}}=\sum_{i=1}^{m}l.delta_i*\gamma*(x_i-\mu_B)*\frac{-1}{2}*(\sigma _{B}^{2}+\epsilon)^{\frac{-3}{2}}【公式1-5】

 \frac{\partial L}{\partial x_i}=\frac{\partial L}{ \partial y_i}*\frac{\partial y_i}{\partial \hat{x}_i}*\frac{\partial \hat{x}_i}{\partial x_i}+\sum_{j=1}^{m}\frac{\partial L}{ \partial y_j}*\frac{\partial y_j}{\partial \hat{x}_j}*\frac{\partial \hat{x}_j}{\partial \mu_B}*\frac{\partial \mu_B}{\partial x_i}+\sum_{j=1}^{m}\frac{\partial L}{ \partial y_j}*\frac{\partial y_j}{\partial \hat{x}_j}*\frac{\partial \hat{x}_j}{\partial \sigma _{B}^{2}}*\frac{\partial \sigma _{B}^{2}}{\partial x_i}=【公式1-6】

l.delta_i*\gamma*(\frac{1}{\sqrt{\sigma _{B}^{2}+\epsilon }})+\sum_{j=1}^{m}l.delta_j*\gamma*(\frac{-1}{\sqrt{\sigma _{B}^{2}+\epsilon }})*\frac{1}{m}+\sum_{j=1}^{m}l.delta_j*\gamma*(x_j-\mu_B)*\frac{-1}{2}*(\sigma _{B}^{2}+\epsilon)^{\frac{-3}{2}}*\frac{2}{m}*(x_i-\mu_B)

batchnorm_layer.c 的解析如下:

#include "convolutional_layer.h"
#include "batchnorm_layer.h"
#include "blas.h"
#include <stdio.h>


// 構造歸一化層
/**
 * 構造歸一化層
 * @param batch 一個batch包含圖片的張數
 * @param w 輸入圖片的高度
 * @param h 輸入圖片的寬度
 * @param c 輸入圖片的通道數
 * @return
 */
layer make_batchnorm_layer(int batch, int w, int h, int c)
{
    fprintf(stderr, "Batch Normalization Layer: %d x %d x %d image\n", w,h,c);
    layer l = {0};
    l.type = BATCHNORM;

    l.batch = batch; // 一個batch中圖片的張數
    l.h = l.out_h = h; // 輸入圖片的高度
    l.w = l.out_w = w; // 輸入圖片的寬度
    l.c = l.out_c = c; // 輸入圖片的通道數
    // calloc 傳入兩個參數,分別爲元素的數目和每個元素的大小
    // calloc 會將所有分配的內存空間中的每一位都初始化爲零
    l.output = calloc(h * w * c * batch, sizeof(float)); // BN層的所有輸出(包含整個batch的)
    l.delta  = calloc(h * w * c * batch, sizeof(float)); // BN層的誤差損失項(包含整個batch的)
    l.inputs = w*h*c; // BN層一張輸入圖片中所有元素的個數
    l.outputs = l.inputs; // BN層對應一張輸入圖片的輸出元素個數, BN層不會改變輸入輸出的個數,通道數也不發生變化
    //
    l.scales = calloc(c, sizeof(float)); // BN層的gamma參數項
    l.scale_updates = calloc(c, sizeof(float)); // gamma更新值

    l.biases = calloc(c, sizeof(float)); // BN層的beta參數項
    l.bias_updates = calloc(c, sizeof(float)); // beta更新值
    int i;
    for(i = 0; i < c; ++i){ //gamma初始化爲1
        l.scales[i] = 1;
    }

    l.mean = calloc(c, sizeof(float)); // 用於保存每個通道元素的平均值
    l.variance = calloc(c, sizeof(float)); // 用於保存每個通道的方差

    l.rolling_mean = calloc(c, sizeof(float)); // 保存每個通道均值的滾動平均
    l.rolling_variance = calloc(c, sizeof(float)); // 保存每個通道的方差的滾動平均

    // BN層的前向, 反向傳播函數
    l.forward = forward_batchnorm_layer;
    l.backward = backward_batchnorm_layer;
#ifdef GPU
    l.forward_gpu = forward_batchnorm_layer_gpu;
    l.backward_gpu = backward_batchnorm_layer_gpu;

    l.output_gpu =  cuda_make_array(l.output, h * w * c * batch);
    l.delta_gpu =   cuda_make_array(l.delta, h * w * c * batch);

    l.biases_gpu = cuda_make_array(l.biases, c);
    l.bias_updates_gpu = cuda_make_array(l.bias_updates, c);

    l.scales_gpu = cuda_make_array(l.scales, c);
    l.scale_updates_gpu = cuda_make_array(l.scale_updates, c);

    l.mean_gpu = cuda_make_array(l.mean, c);
    l.variance_gpu = cuda_make_array(l.variance, c);

    l.rolling_mean_gpu = cuda_make_array(l.mean, c);
    l.rolling_variance_gpu = cuda_make_array(l.variance, c);

    l.mean_delta_gpu = cuda_make_array(l.mean, c);
    l.variance_delta_gpu = cuda_make_array(l.variance, c);

    l.x_gpu = cuda_make_array(l.output, l.batch*l.outputs);
    l.x_norm_gpu = cuda_make_array(l.output, l.batch*l.outputs);
    #ifdef CUDNN
    cudnnCreateTensorDescriptor(&l.normTensorDesc);
    cudnnCreateTensorDescriptor(&l.dstTensorDesc);
    cudnnSetTensor4dDescriptor(l.dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w); 
    cudnnSetTensor4dDescriptor(l.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l.out_c, 1, 1); 

    #endif
#endif
    return l;
}

/*
 * Accumulate the gradient of the scale (gamma) for each of the n channels:
 * scale_updates[f] += sum over batch and spatial positions of delta * x_norm.
 * Both arrays are indexed as [batch][channel][position] with `size` positions
 * per channel.
 * Called as: backward_scale_cpu(l.x_norm, l.delta, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates);
 */
void backward_scale_cpu(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates)
{
    for(int ch = 0; ch < n; ++ch){
        float acc = 0;
        for(int img = 0; img < batch; ++img){
            const int base = size*(ch + n*img); // first element of this image's channel plane
            for(int pos = 0; pos < size; ++pos){
                acc += delta[base + pos] * x_norm[base + pos];
            }
        }
        scale_updates[ch] += acc;
    }
}

// 求y對均值的導數,對應公式 BN 2-2
void mean_delta_cpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta)
{

    int i,j,k;
    for(i = 0; i < filters; ++i){
        mean_delta[i] = 0;
        for (j = 0; j < batch; ++j) {
            for (k = 0; k < spatial; ++k) {
                int index = j*filters*spatial + i*spatial + k;
                mean_delta[i] += delta[index];
            }
        }
        mean_delta[i] *= (-1./sqrt(variance[i] + .00001f));
    }
}

// 求y對方差的導數,對應公式 BN 2-1
void  variance_delta_cpu(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial, float *variance_delta)
{

    int i,j,k;
    for(i = 0; i < filters; ++i){
        variance_delta[i] = 0;
        for(j = 0; j < batch; ++j){
            for(k = 0; k < spatial; ++k){
                int index = j*filters*spatial + i*spatial + k;
                variance_delta[i] += delta[index]*(x[index] - mean[i]);
            }
        }
        variance_delta[i] *= -.5 * pow(variance[i] + .00001f, (float)(-3./2.));
    }
}

// 歸一化,對應公式 BN 2-3
void normalize_delta_cpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta)
{
    int f, j, k;
    for(j = 0; j < batch; ++j){
        for(f = 0; f < filters; ++f){
            for(k = 0; k < spatial; ++k){
                int index = j*filters*spatial + f*spatial + k;
                delta[index] = delta[index] * 1./(sqrt(variance[f] + .00001f)) + variance_delta[f] * 2. * (x[index] - mean[f]) / (spatial * batch) + mean_delta[f]/(spatial*batch);
            }
        }
    }
}

/*
 * Placeholder: resizing a batch-normalization layer is not supported.
 * Only reports that fact on stderr; the arguments are ignored.
 */
void resize_batchnorm_layer(layer *layer, int w, int h)
{
    fputs("Not implemented\n", stderr);
}

/**
 * Compute the mean of every channel over all images and spatial positions.
 * Called as: mean_cpu(l.output, l.batch, l.out_c, l.out_h*l.out_w, l.mean);
 *
 * @param x       input values, indexed as [batch][filters][spatial]
 * @param batch   number of images
 * @param filters number of channels
 * @param spatial number of positions per channel
 * @param mean    output array of `filters` means (overwritten)
 */
void mean_cpu(float *x, int batch, int filters, int spatial, float *mean)
{
    const float scale = 1./(batch * spatial);
    for(int ch = 0; ch < filters; ++ch){
        mean[ch] = 0;
        for(int img = 0; img < batch; ++img){
            const int base = img*filters*spatial + ch*spatial;
            for(int pos = 0; pos < spatial; ++pos){
                mean[ch] += x[base + pos];
            }
        }
        mean[ch] *= scale;
    }
}

void variance_cpu(float *x, float *mean, int batch, int filters, int spatial, float *variance)
{
    float scale = 1./(batch * spatial - 1);
    int i,j,k;
    for(i = 0; i < filters; ++i){
        variance[i] = 0;
        for(j = 0; j < batch; ++j){
            for(k = 0; k < spatial; ++k){
                int index = j*filters*spatial + i*spatial + k;
                variance[i] += pow((x[index] - mean[i]), 2);
            }
        }
        variance[i] *= scale;
    }
}

/*
 * Scale N elements of X, taken INCX apart, by ALPHA in place.
 * Example call: scal_cpu(l.out_c, .99, l.rolling_mean, 1);
 */
void scal_cpu(int N, float ALPHA, float *X, int INCX)
{
    for(int k = 0; k < N; ++k){
        float *elem = &X[k*INCX];
        *elem *= ALPHA;
    }
}
/*
 * Y += ALPHA * X over N elements, reading X with stride INCX and updating Y
 * with stride INCY.
 * Example call: axpy_cpu(l.out_c, .01, l.mean, 1, l.rolling_mean, 1);
 */
void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY)
{
    for(int k = 0; k < N; ++k){
        const int src = k*INCX;
        const int dst = k*INCY;
        Y[dst] += ALPHA*X[src];
    }
}





/**
 * Forward pass of batch normalization on the CPU.
 *
 * Training: normalises with the statistics of the current batch and folds
 * them into the rolling statistics with a fixed 0.99 / 0.01 blend.
 * Inference: normalises with the rolling statistics.
 * In both modes the result is then scaled by gamma (l.scales) and shifted by
 * beta (l.biases).
 *
 * @param l   the layer being evaluated
 * @param net the enclosing network (supplies the input and the train flag)
 */
void forward_batchnorm_layer(layer l, network net)
{
    const int total = l.outputs*l.batch;   // elements in the whole batch
    const int channels = l.out_c;
    const int spatial = l.out_h*l.out_w;   // positions per channel

    // Only a layer of type BATCHNORM copies the network input into l.output;
    // for any other type l.output is used as it already is.
    if(l.type == BATCHNORM){
        copy_cpu(total, net.input, 1, l.output, 1);
    }
    // Keep the un-normalised values; the backward pass reads them from l.x.
    copy_cpu(total, l.output, 1, l.x, 1);

    if(!net.train){
        // Inference: use the statistics accumulated during training.
        normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, channels, spatial);
    } else {
        // Per-channel mean and variance of the current mini-batch.
        mean_cpu(l.output, l.batch, channels, spatial, l.mean);
        variance_cpu(l.output, l.mean, l.batch, channels, spatial, l.variance);

        // rolling_mean = 0.99 * rolling_mean + 0.01 * mean
        scal_cpu(channels, .99, l.rolling_mean, 1);
        axpy_cpu(channels, .01, l.mean, 1, l.rolling_mean, 1);

        // rolling_variance = 0.99 * rolling_variance + 0.01 * variance
        scal_cpu(channels, .99, l.rolling_variance, 1);
        axpy_cpu(channels, .01, l.variance, 1, l.rolling_variance, 1);

        // Normalise with the batch statistics and remember the result in
        // l.x_norm, which the backward pass needs for the gamma gradient.
        normalize_cpu(l.output, l.mean, l.variance, l.batch, channels, spatial);
        copy_cpu(total, l.output, 1, l.x_norm, 1);
    }

    // Scale by gamma, then shift by beta.
    scale_bias(l.output, l.scales, l.batch, channels, spatial);
    add_bias(l.output, l.biases, l.batch, channels, spatial);
}

/*
 * Shift step: add the per-channel bias (beta) to every element of `output`.
 * `output` is indexed as [batch][n channels][size positions].
 */
void add_bias(float *output, float *biases, int batch, int n, int size)
{
    for(int img = 0; img < batch; ++img){
        for(int ch = 0; ch < n; ++ch){
            const int base = (img*n + ch)*size; // start of this channel plane
            for(int pos = 0; pos < size; ++pos){
                output[base + pos] += biases[ch];
            }
        }
    }
}

/*
 * Scale step: multiply every element of `output` by its channel's scale.
 * `output` is indexed as [batch][n channels][size positions].
 * Example call: scale_bias(l.delta, l.scales, l.batch, l.out_c, l.out_h*l.out_w);
 */
void scale_bias(float *output, float *scales, int batch, int n, int size)
{
    int img, ch, pos;
    for(img = 0; img < batch; ++img){
        for(ch = 0; ch < n; ++ch){
            float *plane = &output[(img*n + ch)*size];
            for(pos = 0; pos < size; ++pos){
                plane[pos] *= scales[ch];
            }
        }
    }
}

/**
 * Backward pass of batch normalization on the CPU.
 *
 * Accumulates the beta and gamma gradients, then rewrites l.delta in place so
 * that it holds the gradient with respect to the layer input.
 *
 * @param l   the layer being processed (passed by value; the mean/variance
 *            pointers may be redirected locally)
 * @param net the enclosing network (supplies the train flag and net.delta)
 */
void backward_batchnorm_layer(layer l, network net)
{
    const int channels = l.out_c;
    const int spatial = l.out_w*l.out_h;   // positions per channel

    if(!net.train){
        // Not training: differentiate against the rolling statistics instead
        // of the per-batch ones.
        l.mean = l.rolling_mean;
        l.variance = l.rolling_variance;
    }

    // Beta gradient: sum of delta per channel.
    backward_bias(l.bias_updates, l.delta, l.batch, channels, spatial);
    // Gamma gradient: sum of delta * x_norm per channel.
    backward_scale_cpu(l.x_norm, l.delta, l.batch, channels, spatial, l.scale_updates);

    // Gradient with respect to the normalised values: delta * gamma.
    scale_bias(l.delta, l.scales, l.batch, channels, spatial);

    // Gradients with respect to the batch mean and the batch variance.
    mean_delta_cpu(l.delta, l.variance, l.batch, channels, spatial, l.mean_delta);
    variance_delta_cpu(l.x, l.delta, l.mean, l.variance, l.batch, channels, spatial, l.variance_delta);
    // Combine the three paths into the gradient with respect to the input.
    normalize_delta_cpu(l.x, l.mean, l.variance, l.mean_delta, l.variance_delta, l.batch, channels, spatial, l.delta);

    // Only a layer of type BATCHNORM hands its delta on through net.delta.
    if(l.type == BATCHNORM){
        copy_cpu(l.outputs*l.batch, l.delta, 1, net.delta, 1);
    }
}

/*
 * Accumulate the bias (beta) gradient: for every image and channel, add the
 * sum of that channel plane of `delta` (as returned by sum_array) to
 * bias_updates[channel].
 * Example call: backward_bias(l.bias_updates, l.delta, l.batch, l.out_c, l.out_w*l.out_h);
 */
void backward_bias(float *bias_updates, float *delta, int batch, int n , int size) {
    for (int img = 0; img < batch; ++img) {
        for (int ch = 0; ch < n; ++ch) {
            float *plane = delta + size*(ch + img*n); // this image's channel plane
            bias_updates[ch] += sum_array(plane, size);
        }
    }
}

#ifdef GPU

/*
 * Pull the scales and the rolling mean / variance (l.c values each) from the
 * *_gpu arrays into their host counterparts via cuda_pull_array.
 */
void pull_batchnorm_layer(layer l)
{
    const int channels = l.c;
    cuda_pull_array(l.scales_gpu, l.scales, channels);
    cuda_pull_array(l.rolling_mean_gpu, l.rolling_mean, channels);
    cuda_pull_array(l.rolling_variance_gpu, l.rolling_variance, channels);
}
/*
 * Push the scales and the rolling mean / variance (l.c values each) from the
 * host arrays to their *_gpu counterparts via cuda_push_array.
 */
void push_batchnorm_layer(layer l)
{
    const int channels = l.c;
    cuda_push_array(l.scales_gpu, l.scales, channels);
    cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, channels);
    cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, channels);
}

/*
 * GPU forward pass of batch normalization.
 * Training uses either a single cuDNN call (when CUDNN is defined) or the
 * hand-written kernels; inference always uses the hand-written kernels with
 * the rolling statistics.
 */
void forward_batchnorm_layer_gpu(layer l, network net)
{
    // Only a layer of type BATCHNORM copies the network input into its output buffer.
    if(l.type == BATCHNORM) copy_gpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1);
    // Keep the un-normalised values in l.x_gpu for the backward pass.
    copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
    if (net.train) {
#ifdef CUDNN
        float one = 1;
        float zero = 0;
        // NOTE(review): argument roles below follow the cuDNN API reference
        // for cudnnBatchNormalizationForwardTraining -- confirm against the
        // cuDNN version in use.
        cudnnBatchNormalizationForwardTraining(cudnn_handle(),
                CUDNN_BATCHNORM_SPATIAL,
                &one,                   // presumably alpha: weight of the new result
                &zero,                  // presumably beta: weight of the previous output contents
                l.dstTensorDesc,
                l.x_gpu,                // input: the copy taken above
                l.dstTensorDesc,
                l.output_gpu,           // output
                l.normTensorDesc,       // 1 x C x 1 x 1 descriptor for the per-channel arrays
                l.scales_gpu,
                l.biases_gpu,
                .01,                    // same 0.01 blend factor as the non-cuDNN path
                l.rolling_mean_gpu,
                l.rolling_variance_gpu,
                .00001,                 // epsilon
                l.mean_gpu,
                l.variance_gpu);        // NOTE(review): cuDNN documents this slot as the saved
                                        // *inverse* variance, not the variance -- confirm
        // Note: this path does not write l.x_norm_gpu.
#else
        // Per-channel statistics of the current mini-batch.
        fast_mean_gpu(l.output_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.mean_gpu);
        fast_variance_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.variance_gpu);

        // rolling = 0.99 * rolling + 0.01 * current, for mean and variance.
        scal_gpu(l.out_c, .99, l.rolling_mean_gpu, 1);
        axpy_gpu(l.out_c, .01, l.mean_gpu, 1, l.rolling_mean_gpu, 1);
        scal_gpu(l.out_c, .99, l.rolling_variance_gpu, 1);
        axpy_gpu(l.out_c, .01, l.variance_gpu, 1, l.rolling_variance_gpu, 1);

        // Same copy as the one at the top of the function (looks redundant,
        // assuming the statistics kernels above leave l.output_gpu untouched).
        copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
        normalize_gpu(l.output_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
        // Keep the normalised values for the gamma gradient in the backward pass.
        copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.x_norm_gpu, 1);

        // Scale by gamma, then shift by beta.
        scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
#endif
    } else {
        // Inference: normalise with the rolling statistics, then scale and shift.
        normalize_gpu(l.output_gpu, l.rolling_mean_gpu, l.rolling_variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
        scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
    }

}

/*
 * GPU backward pass of batch normalization.
 * Accumulates the gamma / beta gradients and leaves the gradient with respect
 * to the layer input in l.delta_gpu, using either one cuDNN call (when CUDNN
 * is defined) or the hand-written kernels.
 */
void backward_batchnorm_layer_gpu(layer l, network net)
{
    if(!net.train){
        // Not training: use the rolling statistics (l is a by-value copy, so
        // the redirection is local to this call).
        l.mean_gpu = l.rolling_mean_gpu;
        l.variance_gpu = l.rolling_variance_gpu;
    }
#ifdef CUDNN
    float one = 1;
    float zero = 0;
    // NOTE(review): argument roles below follow the cuDNN API reference for
    // cudnnBatchNormalizationBackward -- confirm against the cuDNN version in use.
    cudnnBatchNormalizationBackward(cudnn_handle(),
            CUDNN_BATCHNORM_SPATIAL,
            &one,                   // presumably alphaDataDiff
            &zero,                  // presumably betaDataDiff (previous dx contents discarded)
            &one,                   // presumably alphaParamDiff
            &one,                   // presumably betaParamDiff (parameter gradients accumulate)
            l.dstTensorDesc,
            l.x_gpu,                // un-normalised input saved by the forward pass
            l.dstTensorDesc,
            l.delta_gpu,            // incoming gradient
            l.dstTensorDesc,
            l.x_norm_gpu,           // appears to receive the input gradient; copied to l.delta_gpu below
            l.normTensorDesc,
            l.scales_gpu,
            l.scale_updates_gpu,
            l.bias_updates_gpu,
            .00001,                 // epsilon, same value as in the forward pass
            l.mean_gpu,
            l.variance_gpu);
    // Move the result from l.x_norm_gpu into l.delta_gpu.
    copy_gpu(l.outputs*l.batch, l.x_norm_gpu, 1, l.delta_gpu, 1);
#else
    // Beta and gamma gradients.
    backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h);
    backward_scale_gpu(l.x_norm_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates_gpu);

    // Gradient with respect to the normalised values: delta * gamma.
    scale_bias_gpu(l.delta_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);

    // Gradients with respect to the batch mean and variance, then the input.
    fast_mean_delta_gpu(l.delta_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.mean_delta_gpu);
    fast_variance_delta_gpu(l.x_gpu, l.delta_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.variance_delta_gpu);
    normalize_delta_gpu(l.x_gpu, l.mean_gpu, l.variance_gpu, l.mean_delta_gpu, l.variance_delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.delta_gpu);
#endif
    // Only a layer of type BATCHNORM hands its delta on through net.delta_gpu.
    if(l.type == BATCHNORM) copy_gpu(l.outputs*l.batch, l.delta_gpu, 1, net.delta_gpu, 1);
}
#endif

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章