本系列爲darknet源碼解析,本次解析 src/batchnorm_layer.h 與 src/batchnorm_layer.c 兩個文件。batchnorm主要完成批歸一化操作。
論文名字:Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift
論文地址:https://arxiv.org/pdf/1502.03167.pdf
BatchNorm,BN的基本動機與原理是什麼?在CNN中如何使用?
神經網絡的訓練過程的本質是學習數據分佈,如果訓練數據與測試數據的分佈不同將大大降低網絡的泛化能力,所以我們需要在訓練開始前對所有輸入數據進行歸一化操作。然後隨着網絡的訓練,每個隱藏層的參數變化使得後一層的輸入發生變化,從而每一個batchsize的訓練數據的分佈也隨之變化,使得網絡在每次迭代中都需要去擬合不同的數據分佈,增大訓練的複雜度以及過擬合的風險。BN是在網絡的每一層輸入之前增加歸一化處理(均值爲0,標準差爲1)將所有批數據強制在統一的數據分佈下。
BN層實現:
batchnorm_layer.h 的解析如下:
#ifndef BATCHNORM_LAYER_H
#define BATCHNORM_LAYER_H
#include "image.h"
#include "layer.h"
#include "network.h"
// Constructs a batch-normalization layer for `batch` images of size w x h with c channels.
layer make_batchnorm_layer(int batch, int w, int h, int c);
// CPU forward pass of the batch-normalization layer.
void forward_batchnorm_layer(layer l, network net);
// CPU backward pass of the batch-normalization layer.
void backward_batchnorm_layer(layer l, network net);
#ifdef GPU
// GPU forward / backward passes (only built when GPU is defined).
void forward_batchnorm_layer_gpu(layer l, network net);
void backward_batchnorm_layer_gpu(layer l, network net);
// Copy scales and rolling statistics device -> host (pull) or host -> device (push).
void pull_batchnorm_layer(layer l);
void push_batchnorm_layer(layer l);
#endif
#endif
求導:
【公式1-1】
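【公式1-1】 $$\frac{\partial L}{\partial \gamma} = \sum_{i=1}^{m} \frac{\partial L}{\partial y_i}\,\hat{x}_i$$
【公式1-2】 $$\frac{\partial L}{\partial \beta} = \sum_{i=1}^{m} \frac{\partial L}{\partial y_i}$$
【公式1-3】 $$\frac{\partial L}{\partial \hat{x}_i} = \frac{\partial L}{\partial y_i}\,\gamma$$
【公式1-4】 $$\frac{\partial L}{\partial \mu} = \sum_{i=1}^{m} \frac{\partial L}{\partial \hat{x}_i}\cdot\frac{-1}{\sqrt{\sigma^2+\epsilon}}$$
【公式1-5】 $$\frac{\partial L}{\partial \sigma^2} = \sum_{i=1}^{m} \frac{\partial L}{\partial \hat{x}_i}\,(x_i-\mu)\cdot\left(-\frac{1}{2}\right)(\sigma^2+\epsilon)^{-3/2}$$
【公式1-6】 $$\frac{\partial L}{\partial x_i} = \frac{\partial L}{\partial \hat{x}_i}\cdot\frac{1}{\sqrt{\sigma^2+\epsilon}} + \frac{\partial L}{\partial \sigma^2}\cdot\frac{2\,(x_i-\mu)}{m} + \frac{\partial L}{\partial \mu}\cdot\frac{1}{m}$$ (其中 m 爲每個通道參與統計的元素個數,即 batch × 空間尺寸;代碼中 ε = 0.00001)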
batchnorm_layer.c 的解析如下:
#include "convolutional_layer.h"
#include "batchnorm_layer.h"
#include "blas.h"
#include <stdio.h>
// 構造歸一化層
/**
* 構造歸一化層
* @param batch 一個batch包含圖片的張數
* @param w 輸入圖片的高度
* @param h 輸入圖片的寬度
* @param c 輸入圖片的通道數
* @return
*/
layer make_batchnorm_layer(int batch, int w, int h, int c)
{
fprintf(stderr, "Batch Normalization Layer: %d x %d x %d image\n", w,h,c);
layer l = {0};
l.type = BATCHNORM;
l.batch = batch; // 一個batch中圖片的張數
l.h = l.out_h = h; // 輸入圖片的高度
l.w = l.out_w = w; // 輸入圖片的寬度
l.c = l.out_c = c; // 輸入圖片的通道數
// calloc 傳入兩個參數,分別爲元素的數目和每個元素的大小
// calloc 會將所有分配的內存空間中的每一位都初始化爲零
l.output = calloc(h * w * c * batch, sizeof(float)); // BN層的所有輸出(包含整個batch的)
l.delta = calloc(h * w * c * batch, sizeof(float)); // BN層的誤差損失項(包含整個batch的)
l.inputs = w*h*c; // BN層一張輸入圖片中所有元素的個數
l.outputs = l.inputs; // BN層對應一張輸入圖片的輸出元素個數, BN層不會改變輸入輸出的個數,通道數也不發生變化
//
l.scales = calloc(c, sizeof(float)); // BN層的gamma參數項
l.scale_updates = calloc(c, sizeof(float)); // gamma更新值
l.biases = calloc(c, sizeof(float)); // BN層的beta參數項
l.bias_updates = calloc(c, sizeof(float)); // beta更新值
int i;
for(i = 0; i < c; ++i){ //gamma初始化爲1
l.scales[i] = 1;
}
l.mean = calloc(c, sizeof(float)); // 用於保存每個通道元素的平均值
l.variance = calloc(c, sizeof(float)); // 用於保存每個通道的方差
l.rolling_mean = calloc(c, sizeof(float)); // 保存每個通道均值的滾動平均
l.rolling_variance = calloc(c, sizeof(float)); // 保存每個通道的方差的滾動平均
// BN層的前向, 反向傳播函數
l.forward = forward_batchnorm_layer;
l.backward = backward_batchnorm_layer;
#ifdef GPU
l.forward_gpu = forward_batchnorm_layer_gpu;
l.backward_gpu = backward_batchnorm_layer_gpu;
l.output_gpu = cuda_make_array(l.output, h * w * c * batch);
l.delta_gpu = cuda_make_array(l.delta, h * w * c * batch);
l.biases_gpu = cuda_make_array(l.biases, c);
l.bias_updates_gpu = cuda_make_array(l.bias_updates, c);
l.scales_gpu = cuda_make_array(l.scales, c);
l.scale_updates_gpu = cuda_make_array(l.scale_updates, c);
l.mean_gpu = cuda_make_array(l.mean, c);
l.variance_gpu = cuda_make_array(l.variance, c);
l.rolling_mean_gpu = cuda_make_array(l.mean, c);
l.rolling_variance_gpu = cuda_make_array(l.variance, c);
l.mean_delta_gpu = cuda_make_array(l.mean, c);
l.variance_delta_gpu = cuda_make_array(l.variance, c);
l.x_gpu = cuda_make_array(l.output, l.batch*l.outputs);
l.x_norm_gpu = cuda_make_array(l.output, l.batch*l.outputs);
#ifdef CUDNN
cudnnCreateTensorDescriptor(&l.normTensorDesc);
cudnnCreateTensorDescriptor(&l.dstTensorDesc);
cudnnSetTensor4dDescriptor(l.dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w);
cudnnSetTensor4dDescriptor(l.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l.out_c, 1, 1);
#endif
#endif
return l;
}
/*
 * Accumulates the gradient with respect to each channel's scale (gamma):
 * for every channel, the sum over the batch and all spatial positions of
 * delta * x_norm is added to scale_updates[channel].
 *
 * Buffers are indexed as [batch][channel][position], `n` channels of `size`
 * positions each.
 * Example call: backward_scale_cpu(l.x_norm, l.delta, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates);
 */
void backward_scale_cpu(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates)
{
    for (int ch = 0; ch < n; ++ch) {
        float acc = 0;
        for (int img = 0; img < batch; ++img) {
            /* Start of this (image, channel) plane in both buffers. */
            const int base = size * (ch + n * img);
            for (int pos = 0; pos < size; ++pos) {
                acc += delta[base + pos] * x_norm[base + pos];
            }
        }
        scale_updates[ch] += acc;
    }
}
// 求y對均值的導數,對應公式 BN 2-2
void mean_delta_cpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta)
{
int i,j,k;
for(i = 0; i < filters; ++i){
mean_delta[i] = 0;
for (j = 0; j < batch; ++j) {
for (k = 0; k < spatial; ++k) {
int index = j*filters*spatial + i*spatial + k;
mean_delta[i] += delta[index];
}
}
mean_delta[i] *= (-1./sqrt(variance[i] + .00001f));
}
}
// 求y對方差的導數,對應公式 BN 2-1
void variance_delta_cpu(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial, float *variance_delta)
{
int i,j,k;
for(i = 0; i < filters; ++i){
variance_delta[i] = 0;
for(j = 0; j < batch; ++j){
for(k = 0; k < spatial; ++k){
int index = j*filters*spatial + i*spatial + k;
variance_delta[i] += delta[index]*(x[index] - mean[i]);
}
}
variance_delta[i] *= -.5 * pow(variance[i] + .00001f, (float)(-3./2.));
}
}
// 歸一化,對應公式 BN 2-3
void normalize_delta_cpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta)
{
int f, j, k;
for(j = 0; j < batch; ++j){
for(f = 0; f < filters; ++f){
for(k = 0; k < spatial; ++k){
int index = j*filters*spatial + f*spatial + k;
delta[index] = delta[index] * 1./(sqrt(variance[f] + .00001f)) + variance_delta[f] * 2. * (x[index] - mean[f]) / (spatial * batch) + mean_delta[f]/(spatial*batch);
}
}
}
}
/*
 * Placeholder: resizing a batch-normalization layer is not supported.
 * Writes "Not implemented" to stderr and leaves the layer untouched.
 */
void resize_batchnorm_layer(layer *layer, int w, int h)
{
    fputs("Not implemented\n", stderr);
}
/**
 * Computes the mean of every channel over the batch and all spatial positions.
 * Example call: mean_cpu(l.output, l.batch, l.out_c, l.out_h*l.out_w, l.mean);
 *
 * @param x       input values, indexed as [batch][filter][position]
 * @param batch   number of images in the batch
 * @param filters number of channels
 * @param spatial number of positions per channel
 * @param mean    output array of `filters` elements; overwritten
 */
void mean_cpu(float *x, int batch, int filters, int spatial, float *mean)
{
    float scale = 1./(batch * spatial);
    for (int f = 0; f < filters; ++f) {
        mean[f] = 0;
        for (int b = 0; b < batch; ++b) {
            const int base = b*filters*spatial + f*spatial;
            for (int s = 0; s < spatial; ++s) {
                mean[f] += x[base + s];
            }
        }
        mean[f] *= scale;
    }
}
/**
 * Computes the unbiased (divide by N-1) variance of every channel over the
 * batch and all spatial positions, where N = batch * spatial.
 *
 * When N <= 1 the N-1 divisor is zero or negative; the previous code then
 * multiplied a zero sum by infinity and produced NaN. In that case the
 * variance is now reported as 0.
 *
 * @param x        input values, indexed as [batch][filter][position]
 * @param mean     per-channel means (`filters` elements)
 * @param batch    number of images in the batch
 * @param filters  number of channels
 * @param spatial  number of positions per channel
 * @param variance output array of `filters` elements; overwritten
 */
void variance_cpu(float *x, float *mean, int batch, int filters, int spatial, float *variance)
{
    const int count = batch * spatial;
    /* Only meaningful when count > 1; never used otherwise. */
    const float scale = (count > 1) ? 1./(count - 1) : 0.f;

    for (int f = 0; f < filters; ++f) {
        variance[f] = 0;
        if (count <= 1) {
            continue;   /* fewer than two samples: define the variance as 0 */
        }
        for (int b = 0; b < batch; ++b) {
            const int base = b*filters*spatial + f*spatial;
            for (int s = 0; s < spatial; ++s) {
                /* Squared in double precision, matching the former pow(d, 2). */
                const double diff = x[base + s] - mean[f];
                variance[f] += diff * diff;
            }
        }
        variance[f] *= scale;
    }
}
/*
 * Scales N elements of X in place by ALPHA, visiting X[0], X[INCX], X[2*INCX], ...
 * Example call: scal_cpu(l.out_c, .99, l.rolling_mean, 1);
 */
void scal_cpu(int N, float ALPHA, float *X, int INCX)
{
    int k = 0;
    while (k < N) {
        float *elem = &X[k*INCX];
        *elem = *elem * ALPHA;
        ++k;
    }
}
/*
 * Computes Y += ALPHA * X over N elements, reading X with stride INCX and
 * updating Y with stride INCY.
 * Example call: axpy_cpu(l.out_c, .01, l.mean, 1, l.rolling_mean, 1);
 */
void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY)
{
    int k = 0;
    while (k < N) {
        Y[k*INCY] += ALPHA*X[k*INCX];
        ++k;
    }
}
/**
 * CPU forward pass of batch normalization.
 *
 * In training mode the batch mean/variance are computed, blended into the
 * rolling statistics (rolling = 0.99 * rolling + 0.01 * batch value), and used
 * to normalise l.output; the normalised values are kept in l.x_norm for the
 * backward pass. Otherwise the rolling statistics are used for normalisation.
 * In both modes the result is then scaled by l.scales (gamma) and shifted by
 * l.biases (beta).
 *
 * @param l   the batch-norm layer (or another layer type that reuses this
 *            routine on its own l.output)
 * @param net the network; net.input is the layer input, net.train selects the mode
 */
void forward_batchnorm_layer(layer l, network net)
{
    const int total = l.outputs*l.batch;      /* elements in the whole batch */
    const int spatial = l.out_h*l.out_w;      /* positions per channel       */

    /* A stand-alone BATCHNORM layer works on a copy of the network input;
     * for any other layer type l.output is used as it already is. */
    if(l.type == BATCHNORM) {
        copy_cpu(total, net.input, 1, l.output, 1);
    }
    /* Keep the un-normalised values in l.x (read by the backward pass). */
    copy_cpu(total, l.output, 1, l.x, 1);

    if(!net.train){
        /* Not training: normalise with the rolling statistics. */
        normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, l.out_c, spatial);
    } else {
        /* Mini-batch mean and variance per channel. */
        mean_cpu(l.output, l.batch, l.out_c, spatial, l.mean);
        variance_cpu(l.output, l.mean, l.batch, l.out_c, spatial, l.variance);

        /* rolling_mean = .99 * rolling_mean + .01 * mean */
        scal_cpu(l.out_c, .99, l.rolling_mean, 1);
        axpy_cpu(l.out_c, .01, l.mean, 1, l.rolling_mean, 1);

        /* rolling_variance = .99 * rolling_variance + .01 * variance */
        scal_cpu(l.out_c, .99, l.rolling_variance, 1);
        axpy_cpu(l.out_c, .01, l.variance, 1, l.rolling_variance, 1);

        /* Normalise with the batch statistics and remember the result. */
        normalize_cpu(l.output, l.mean, l.variance, l.batch, l.out_c, spatial);
        copy_cpu(total, l.output, 1, l.x_norm, 1);
    }

    /* Scale by gamma, then shift by beta. */
    scale_bias(l.output, l.scales, l.batch, l.out_c, spatial);
    add_bias(l.output, l.biases, l.batch, l.out_c, spatial);
}
/*
 * Adds the per-channel bias (beta) to every element of the batch:
 * output[b][ch][k] += biases[ch], for `n` channels of `size` elements each.
 */
void add_bias(float *output, float *biases, int batch, int n, int size)
{
    for (int img = 0; img < batch; ++img) {
        for (int ch = 0; ch < n; ++ch) {
            float *plane = output + (img*n + ch)*size;
            for (int k = 0; k < size; ++k) {
                plane[k] += biases[ch];
            }
        }
    }
}
/*
 * Multiplies every element of the batch by its channel's scale:
 * output[b][ch][k] *= scales[ch], for `n` channels of `size` elements each.
 * Example call: scale_bias(l.delta, l.scales, l.batch, l.out_c, l.out_h*l.out_w);
 */
void scale_bias(float *output, float *scales, int batch, int n, int size)
{
    for (int img = 0; img < batch; ++img) {
        for (int ch = 0; ch < n; ++ch) {
            float *plane = output + (img*n + ch)*size;
            for (int k = 0; k < size; ++k) {
                plane[k] *= scales[ch];
            }
        }
    }
}
/**
 * CPU backward pass of batch normalization.
 *
 * Accumulates the beta and gamma gradients, then rewrites l.delta in place
 * from "gradient w.r.t. the layer output" to "gradient w.r.t. the layer input".
 *
 * @param l   the batch-norm layer; passed by value, so the l.mean / l.variance
 *            reassignment below only affects this call
 * @param net the network; net.delta receives the result for BATCHNORM layers
 */
void backward_batchnorm_layer(layer l, network net)
{
    const int spatial = l.out_w*l.out_h;   /* positions per channel */

    /* Outside training, differentiate against the rolling statistics. */
    if(!net.train) {
        l.mean = l.rolling_mean;
        l.variance = l.rolling_variance;
    }

    /* Gradient of beta. */
    backward_bias(l.bias_updates, l.delta, l.batch, l.out_c, spatial);
    /* Gradient of gamma: sum of delta * x_norm per channel. */
    backward_scale_cpu(l.x_norm, l.delta, l.batch, l.out_c, spatial, l.scale_updates);
    /* Gradient w.r.t. the normalised activations: delta *= gamma. */
    scale_bias(l.delta, l.scales, l.batch, l.out_c, spatial);
    /* Gradients w.r.t. the per-channel mean and variance. */
    mean_delta_cpu(l.delta, l.variance, l.batch, l.out_c, spatial, l.mean_delta);
    variance_delta_cpu(l.x, l.delta, l.mean, l.variance, l.batch, l.out_c, spatial, l.variance_delta);
    /* Gradient w.r.t. the layer input, written back into l.delta. */
    normalize_delta_cpu(l.x, l.mean, l.variance, l.mean_delta, l.variance_delta, l.batch, l.out_c, spatial, l.delta);

    /* A stand-alone BATCHNORM layer hands its delta to net.delta. */
    if(l.type == BATCHNORM) {
        copy_cpu(l.outputs*l.batch, l.delta, 1, net.delta, 1);
    }
}
/*
 * Accumulates the gradient of each channel's bias (beta): for every image and
 * every channel, sum_array() over that channel's `size` deltas is added to
 * bias_updates[channel].
 * Example call: backward_bias(l.bias_updates, l.delta, l.batch, l.out_c, l.out_w*l.out_h);
 */
void backward_bias(float *bias_updates, float *delta, int batch, int n , int size) {
    for (int img = 0; img < batch; ++img) {
        for (int ch = 0; ch < n; ++ch) {
            /* Start of this (image, channel) plane. */
            float *plane = delta + size*(ch + img*n);
            bias_updates[ch] += sum_array(plane, size);
        }
    }
}
#ifdef GPU
/*
 * Copies the scales and the rolling mean/variance (l.c elements each) from
 * their device arrays into the corresponding host arrays.
 */
void pull_batchnorm_layer(layer l)
{
    const int channels = l.c;
    cuda_pull_array(l.scales_gpu, l.scales, channels);
    cuda_pull_array(l.rolling_mean_gpu, l.rolling_mean, channels);
    cuda_pull_array(l.rolling_variance_gpu, l.rolling_variance, channels);
}
/*
 * Copies the scales and the rolling mean/variance (l.c elements each) from
 * the host arrays into their device arrays.
 */
void push_batchnorm_layer(layer l)
{
    const int channels = l.c;
    cuda_push_array(l.scales_gpu, l.scales, channels);
    cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, channels);
    cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, channels);
}
/*
 * GPU forward pass of batch normalization; mirrors forward_batchnorm_layer.
 *
 * Training: with CUDNN a single cuDNN call does the whole step; without it the
 * step is built from the mean/variance, rolling-average, normalise, scale and
 * bias kernels. Not training: normalises with the rolling statistics, then
 * applies scale and bias.
 */
void forward_batchnorm_layer_gpu(layer l, network net)
{
// A stand-alone BATCHNORM layer starts from a copy of the network input.
if(l.type == BATCHNORM) copy_gpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1);
// Keep the un-normalised values in l.x_gpu for the backward pass.
copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
if (net.train) {
#ifdef CUDNN
float one = 1;
float zero = 0;
// Arguments after the handle and mode, in cuDNN's documented order (verify
// against the cuDNN reference): alpha, beta, x descriptor + x, y descriptor + y,
// scale/bias descriptor, scale, bias, averaging factor (.01), running mean,
// running variance, epsilon (.00001), saved mean, saved inverse variance.
// NOTE(review): l.variance_gpu therefore presumably holds an inverse
// variance on this path, not the plain variance -- confirm.
cudnnBatchNormalizationForwardTraining(cudnn_handle(),
CUDNN_BATCHNORM_SPATIAL,
&one,
&zero,
l.dstTensorDesc,
l.x_gpu,
l.dstTensorDesc,
l.output_gpu,
l.normTensorDesc,
l.scales_gpu,
l.biases_gpu,
.01,
l.rolling_mean_gpu,
l.rolling_variance_gpu,
.00001,
l.mean_gpu,
l.variance_gpu);
#else
// Batch statistics per channel.
fast_mean_gpu(l.output_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.mean_gpu);
fast_variance_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.variance_gpu);
// rolling = .99 * rolling + .01 * batch value, for mean and variance.
scal_gpu(l.out_c, .99, l.rolling_mean_gpu, 1);
axpy_gpu(l.out_c, .01, l.mean_gpu, 1, l.rolling_mean_gpu, 1);
scal_gpu(l.out_c, .99, l.rolling_variance_gpu, 1);
axpy_gpu(l.out_c, .01, l.variance_gpu, 1, l.rolling_variance_gpu, 1);
// Repeats the copy made before the branch; l.output_gpu is passed only to the
// mean/variance kernels in between.
copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
// Normalise, remember the normalised values, then apply gamma and beta.
normalize_gpu(l.output_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.x_norm_gpu, 1);
scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
#endif
} else {
// Not training: normalise with the rolling statistics, then apply gamma and beta.
normalize_gpu(l.output_gpu, l.rolling_mean_gpu, l.rolling_variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
}
}
/*
 * GPU backward pass of batch normalization; mirrors backward_batchnorm_layer.
 * With CUDNN a single cuDNN call is used; otherwise the same sequence of steps
 * as the CPU version runs through the *_gpu kernels.
 */
void backward_batchnorm_layer_gpu(layer l, network net)
{
// Outside training, use the rolling statistics. `l` is a by-value copy, so
// these pointer reassignments only affect this call.
if(!net.train){
l.mean_gpu = l.rolling_mean_gpu;
l.variance_gpu = l.rolling_variance_gpu;
}
#ifdef CUDNN
float one = 1;
float zero = 0;
// Arguments after the handle and mode, in cuDNN's documented order (verify
// against the cuDNN reference): alpha/beta for the data gradient (one, zero),
// alpha/beta for the parameter gradients (one, one), x descriptor + x,
// dy descriptor + dy, dx descriptor + dx, scale/bias descriptor, scale,
// scale gradient, bias gradient, epsilon, saved mean, saved inverse variance.
// The input gradient is written to l.x_norm_gpu and copied into l.delta_gpu below.
cudnnBatchNormalizationBackward(cudnn_handle(),
CUDNN_BATCHNORM_SPATIAL,
&one,
&zero,
&one,
&one,
l.dstTensorDesc,
l.x_gpu,
l.dstTensorDesc,
l.delta_gpu,
l.dstTensorDesc,
l.x_norm_gpu,
l.normTensorDesc,
l.scales_gpu,
l.scale_updates_gpu,
l.bias_updates_gpu,
.00001,
l.mean_gpu,
l.variance_gpu);
copy_gpu(l.outputs*l.batch, l.x_norm_gpu, 1, l.delta_gpu, 1);
#else
// Gradients of beta and gamma.
backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h);
backward_scale_gpu(l.x_norm_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates_gpu);
// delta *= gamma: gradient w.r.t. the normalised activations.
scale_bias_gpu(l.delta_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
// Gradients w.r.t. the per-channel mean and variance, then w.r.t. the input.
fast_mean_delta_gpu(l.delta_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.mean_delta_gpu);
fast_variance_delta_gpu(l.x_gpu, l.delta_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.variance_delta_gpu);
normalize_delta_gpu(l.x_gpu, l.mean_gpu, l.variance_gpu, l.mean_delta_gpu, l.variance_delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.delta_gpu);
#endif
// A stand-alone BATCHNORM layer hands its delta to net.delta_gpu.
if(l.type == BATCHNORM) copy_gpu(l.outputs*l.batch, l.delta_gpu, 1, net.delta_gpu, 1);
}
#endif