入口信息

通過如下的調用堆棧信息可以定位到函數ForwardFromTo(其他函數中無重要信息)

caffe::Net<float>::ForwardFromTo() at net.cpp:574
caffe::Net<float>::ForwardPrefilled() at net.cpp:596
caffe::Net<float>::Forward() at net.cpp:610

對於ForwardFromTo有,對每層網絡前向計算（start=0,end=11共12層網絡）。

// Runs the forward pass of layers start..end (both bounds inclusive), in
// order, and returns the sum of the values returned by each layer's Forward.
//
// start: index of the first layer to run.
// end:   index of the last layer to run (inclusive).
template <typename Dtype>
Dtype Net<Dtype>::ForwardFromTo(int start, int end) {
  // Accumulator for the per-layer losses. It must be declared and start at
  // zero, otherwise the += below reads an undeclared/indeterminate value.
  Dtype loss = 0;
  for (int i = start; i <= end; ++i) {
    // Each layer reads its bottom blobs and writes its top blobs.
    Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], top_vecs_[i]);
    loss += layer_loss;
  }
  return loss;
}

在ForwardFromTo中，對網絡的每層調用Forward函數，Forward中根據配置情況選擇調用Forward_gpu還是Forward_cpu。

第一層 DataLayer

DataLayer未實現Forward_cpu或Forward_gpu，其父類BasePrefetchingDataLayer實現了。
內容爲從BasePrefetchingDataLayer的數據緩存隊列BlockingQueue<Batch*>取出一個Batch的數據放入DataLayer的Top Blob中，其中Top[0]存放數據，Top[1]存放標籤。

第二層 SplitLayer

SplitLayer有兩個Top Blob：label_mnist_1_split_0和label_mnist_1_split_1。在其Forward_g(c)pu中，把它的Bottom Blob（也就是DataLayer的第二個Top Blob，label）中指向數據的指針複製到label_mnist_1_split_0和label_mnist_1_split_1中（即共享了數據，而不是拷貝數據本身）。

代碼如下，將bottom[0]複製成多個top blob

// GPU forward pass of the split layer: calls ShareData on every top blob
// with bottom[0] as the source, so each top is fed from the one bottom blob.
// (Per the surrounding article this shares the data pointer instead of
// copying the data.)
void SplitLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  int top_id = 0;
  while (top_id < top.size()) {
    Blob<Dtype>* const alias = top[top_id];
    alias->ShareData(*bottom[0]);
    ++top_id;
  }
}

第三層 ConvolutionLayer

以GPU爲例，展開代碼如下：

// GPU forward pass of the convolution layer.
//
// NOTE: for explanation purposes the definition of
// BaseConvolutionLayer::forward_gpu_gemm is pasted inline between the
// "expansion" markers below; in real code it is a separate member function
// and this listing is not meant to compile as-is.
template <typename Dtype>
void ConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  const Dtype* weight = this->blobs_[0]->gpu_data();

  // Loop over every bottom blob (for LeNet there is only one here).
  for (int i = 0; i < bottom.size(); ++i) {
    const Dtype* bottom_data = bottom[i]->gpu_data();
    // Output buffer of the top blob paired with this bottom blob; it is
    // written by both the gemm and the bias step below.
    Dtype* top_data = top[i]->mutable_gpu_data();

    // Loop over every sample of the batch; for LeNet num_ is 64 (train) or
    // 100 (test).
    for (int n = 0; n < this->num_; ++n) {

      // Convolve sample n of bottom_data with weight and store the result in
      // the matching slice of top_data.
      this->forward_gpu_gemm(bottom_data + n * this->bottom_dim_, weight,
          top_data + n * this->top_dim_);

          //***** begin expansion of forward_gpu_gemm *****
          template <typename Dtype>
          void BaseConvolutionLayer<Dtype>::forward_gpu_gemm(const Dtype* input,
          const Dtype* weights, Dtype* output, bool skip_im2col) {
            const Dtype* col_buff = input;
            // A 1x1 convolution skips im2col: col_buff keeps pointing at input.
            if (!is_1x1_) {
              if (!skip_im2col) {
                // Unfold the image into columns so the convolution can be
                // written as a matrix product; the result is stored in
                // col_buffer_.
                conv_im2col_gpu(input, col_buffer_.mutable_gpu_data());
              }
              col_buff = col_buffer_.gpu_data();
            }

            // One gemm per group: each group uses its own slice of the
            // weights, the column buffer and the output.
            for (int g = 0; g < group_; ++g) {
              // Multiply weights by col_buff and write the product to output.
              caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, conv_out_channels_ /
                  group_, conv_out_spatial_dim_, kernel_dim_,
                  (Dtype)1., weights + weight_offset_ * g, col_buff + col_offset_ * g,
                  (Dtype)0., output + output_offset_ * g);
            }
          }
          //***** end expansion of forward_gpu_gemm *****

      // Add the bias term, if the layer has one.
      if (this->bias_term_) {
        const Dtype* bias = this->blobs_[1]->gpu_data();
        this->forward_gpu_bias(top_data + n * this->top_dim_, bias);
      }
    }
  }
}

第四層 PoolingLayer

Caffe中實現了Max Pooling和Average Pooling兩種方法，cuda代碼在理解算法上會更易讀些：

// CUDA kernel for max pooling: each loop iteration handles one element of
// the top blob, scanning the corresponding window of the bottom blob for its
// maximum. The parameter list is elided ("...") in this excerpt.
template <typename Dtype>
__global__ void MaxPoolForward(...) {
  // CUDA_KERNEL_LOOP is a Caffe macro; index is the thread index. According
  // to the article, Caffe launches one-dimensional grids and blocks (512).
  // The macro is covered in the author's other article on Caffe programming
  // tricks.
  // nthreads is the total number of threads, equal to the number of output
  // elements of this pooling layer's top blob: one thread per output element.
  CUDA_KERNEL_LOOP(index, nthreads) {
    // N coordinate of this element in the top blob (N,C,H,W): sample index.
    const int n = index / pooled_width / pooled_height / channels;
    // C coordinate of this element in the top blob (N,C,H,W): channel
    // (feature map) index.
    const int c = (index / pooled_width / pooled_height) % channels;
    // H coordinate of this element in the top blob (N,C,H,W): row within the
    // output feature map.
    const int ph = (index / pooled_width) % pooled_height;
    // W coordinate of this element in the top blob (N,C,H,W): column within
    // the output feature map.
    const int pw = index % pooled_width;

    // hstart, wstart, hend, wend delimit the window in the bottom blob (the
    // previous layer's feature map) from which this output element is
    // computed. The window is clipped to the feature map bounds.
    int hstart = ph * stride_h - pad_h;
    int wstart = pw * stride_w - pad_w;
    const int hend = min(hstart + kernel_h, height);
    const int wend = min(wstart + kernel_w, width);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    // Running maximum and its flat position; -1 means "nothing found yet".
    Dtype maxval = -FLT_MAX;
    int maxidx = -1;

    // bottom_slice is a view of the single bottom-blob feature map
    // (sample n, channel c) that this output element depends on.
    const Dtype* const bottom_slice =
        bottom_data + (n * channels + c) * height * width;
    for (int h = hstart; h < hend; ++h) {
      for (int w = wstart; w < wend; ++w) {
        if (bottom_slice[h * width + w] > maxval) {
          maxidx = h * width + w;
          maxval = bottom_slice[maxidx];
        }
      }
    }
    // index is exactly the flat index of the output element in the top blob,
    // which is why a one-dimensional thread layout suffices: blob data is
    // stored as a flat one-dimensional array.
    top_data[index] = maxval;
    // Record where the maximum came from, in whichever mask buffer is set.
    if (mask) {
      mask[index] = maxidx;
    } else {
      top_mask[index] = maxidx;
    }
  }
}

第七層 InnerProductLayer

在InnerProductLayer的Forward_gpu中，實際上只調用了以下的核心代碼來求兩個矩陣的積。其中bottom_data爲 M×K 的矩陣，weight爲 N×K 的矩陣，top_data爲 M×N 的矩陣；M_爲樣本個數，K_爲bottom中每個樣本的維度，N_爲top中每個樣本的維度。

caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype)1.,
                          bottom_data, weight, (Dtype)0., top_data);

caffe_gpu_gemm中實際上調用了cublas的矩陣計算。

// float version of caffe_gpu_gemm: forwards a CBLAS-style gemm request
// (C = alpha * op(A) * op(B) + beta * C, with C being M x N) to cublasSgemm.
void caffe_gpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
    const float alpha, const float* A, const float* B, const float beta,
    float* C) {
  // Note that cublas follows fortran order.
  // Leading dimensions of A and B as stored by the caller.
  int lda = (TransA == CblasNoTrans) ? K : M;
  int ldb = (TransB == CblasNoTrans) ? N : K;
  // In the LeNet InnerProduct call shown in the article, A is not transposed
  // (CblasNoTrans).
  cublasOperation_t cuTransA =
      (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  // In the LeNet InnerProduct call shown in the article, B is transposed
  // (CblasTrans).
  cublasOperation_t cuTransB =
      (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  // The operands are handed over in swapped order (B before A, N before M):
  // because of the fortran (column-major) order noted above, asking cuBLAS
  // for op(B) * op(A) produces the row-major product op(A) * op(B) in C.
  CUBLAS_CHECK(cublasSgemm(Caffe::cublas_handle(), cuTransB, cuTransA,
      N, M, K, &alpha, B, ldb, A, lda, &beta, C, N));
}

第八層 ReLUForward

ReLu層很簡單，就是實現了公式

$$ out = \max(0, in) $$

核心代碼如下，其中negative_slope一般取0，即爲以上公式。

out[index] = in[index] > 0 ? in[index] : in[index] * negative_slope;

第十一層 AccuracyLayer

// CPU forward pass of the accuracy layer: for every (sample, inner position)
// pair it checks whether the true label is among the top_k_ highest-scoring
// classes and writes the resulting accuracy to top[0].
// This excerpt is abridged: bottom_label, bottom_data, dim, num_labels and
// count are used below but their declarations are not shown.
void AccuracyLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  Dtype accuracy = 0;

  // Number of samples in a batch; for LeNet outer_num_ = 100.
  for (int i = 0; i < outer_num_; ++i) {
    // Number of labels per sample; LeNet has one label per sample, so
    // inner_num_ = 1.
    for (int j = 0; j < inner_num_; ++j) {
      // True label of this sample.
      const int label_value = static_cast<int>(bottom_label[i * inner_num_ + j]);

      // top_k_ is how many of the highest-scoring predicted labels are
      // considered.
      // Top-k accuracy
      std::vector<std::pair<Dtype, int> > bottom_data_vector;
      // num_labels is the number of classes, 10 for LeNet.
      // The next two steps pair every score with its class id and then
      // order the pairs so that the top_k_ highest scores come first.
      for (int k = 0; k < num_labels; ++k) {
        bottom_data_vector.push_back(std::make_pair(
            bottom_data[i * dim + k * inner_num_ + j], k));
      }
      std::partial_sort(
          bottom_data_vector.begin(), bottom_data_vector.begin() + top_k_,
          bottom_data_vector.end(), std::greater<std::pair<Dtype, int> >());

      // See whether any of the top_k_ predicted labels equals the true label.
      // check if true label is in top k predictions
      for (int k = 0; k < top_k_; k++) {
        if (bottom_data_vector[k].second == label_value) {
          ++accuracy;
          // When a second top blob exists, also count the hit for this label.
          if (top.size() > 1) ++top[1]->mutable_cpu_data()[label_value];
          break;
        }
      }
      // The final accuracy is written to the single-element top blob.
      // NOTE(review): in this excerpt the store sits inside the inner loop,
      // so it runs once per (i, j) — confirm against the full source whether
      // it belongs after both loops.
      top[0]->mutable_cpu_data()[0] = accuracy / count;
    }
  }
}

說明：

outer_num_與inner_num_之積（outer_num_ * inner_num_）爲樣本總數量
lenet中outer_num_爲一個Batch中包含的樣本數量
lenet中inner_num_爲1
top_k爲取前k個最高評分（的預測標籤）

第十二層 SoftmaxWithLossLayer

(1). SoftmaxLayer

在其Forward_gpu函數中把100*10的bottom blob,計算得到100*10的top blob，可以理解爲100個樣本，每個樣本特徵數量爲10，計算這100個樣本分別在10個類別上的概率。計算公式如下：

$$ f(z_k) = \frac{e^{z_k - m}}{\sum_{i}^{n} e^{z_i - m}} $$

$$ m = \max_i (z_i) $$

對應的說明圖，針對一個樣本而言，y爲樣本的標籤：

// GPU forward pass of the softmax layer. Works in place on top_data, using
// scale_data as scratch space, in five steps: channel max, subtract, exp,
// channel sum, divide.
void SoftmaxLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const Dtype* bottom_data = bottom[0]->gpu_data();
  // top shape: 100*10
  Dtype* top_data = top[0]->mutable_gpu_data();
  // scale shape: 100*1
  Dtype* scale_data = scale_.mutable_gpu_data();
  int count = bottom[0]->count();
  int channels = top[0]->shape(softmax_axis_);
  // Copy bottom into top; every following step operates on top.
  caffe_copy(count, bottom_data, top_data);

  // Compute the maximum m = max(z_i) (stored in scale_data).
  kernel_channel_max<Dtype><<<CAFFE_GET_BLOCKS(outer_num_ * inner_num_),
      CAFFE_CUDA_NUM_THREADS>>>(outer_num_, channels, inner_num_, top_data,
      scale_data);
  // Subtract: z_k - m (stored in top_data).
  kernel_channel_subtract<Dtype><<<CAFFE_GET_BLOCKS(count),
      CAFFE_CUDA_NUM_THREADS>>>(count, outer_num_, channels, inner_num_,
      scale_data, top_data);
  // Exponentiate: e^{z_k-m} (stored in top_data).
  kernel_exp<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
      count, top_data, top_data);
  // Sum: \sum_i^n{e^{z_i-m}} (stored in scale_data).
  kernel_channel_sum<Dtype><<<CAFFE_GET_BLOCKS(outer_num_ * inner_num_),
      CAFFE_CUDA_NUM_THREADS>>>(outer_num_, channels, inner_num_, top_data,
      scale_data);
  // Divide to get the result \frac{e^{z_k-m}}{\sum_i^n{e^{z_i-m}}}
  // (stored in top_data).
  kernel_channel_div<Dtype><<<CAFFE_GET_BLOCKS(count),
      CAFFE_CUDA_NUM_THREADS>>>(count, outer_num_, channels, inner_num_,
      scale_data, top_data);
}

(2). SoftmaxWithLossLayer

對於樣本(x,y),z爲x經過網絡處理後在ip2層的輸出，也就是SoftmaxWithLossLayer的輸入，同時也是Softmax的輸入。注意n爲n個樣本，y爲樣本對應的類別（標籤），y=0,1,…,N，損失如下公式計算：

$$ loss = \sum_{n} -\log f(z_y) $$

Forward_gpu函數代碼：

// GPU forward pass of the softmax-with-loss layer: runs the internal softmax
// layer, computes one loss value per sample on the GPU, sums them and writes
// the normalized total to top[0].
// This excerpt is abridged: loss_data, counts and valid_count are used below
// but their declarations are not shown.
void SoftmaxWithLossLayer<Dtype>::Forward_gpu(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  // Compute prob_ (100*10 for LeNet), i.e. the probability of every sample
  // belonging to each class.
  softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_);
  const Dtype* prob_data = prob_.gpu_data();
  const Dtype* label = bottom[1]->gpu_data();
  const int dim = prob_.count() / outer_num_;
  const int nthreads = outer_num_ * inner_num_;

  // Compute the loss, see the formula in the article.
  // prob_data is 100*10; label holds one entry per sample (the kernel reads
  // it as label[n] for n < nthreads), i.e. 100*1;
  // the resulting loss_data is 100*1.
  SoftmaxLossForwardGPU<Dtype><<<CAFFE_GET_BLOCKS(nthreads),
      CAFFE_CUDA_NUM_THREADS>>>(nthreads, prob_data, label, loss_data,
      outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts);
  Dtype loss;

  // Sum the nthreads per-sample losses in loss_data into the scalar loss.
  caffe_gpu_asum(nthreads, loss_data, &loss);

  // Normalize by the value returned from get_normalizer and store the result
  // in the top blob, which holds a single element used for the loss.
  top[0]->mutable_cpu_data()[0] = loss / get_normalizer(normalization_, valid_count);

}

SoftmaxLossForwardGPU代碼：

// The code has been modified from the original to improve readability.
// CUDA kernel computing, for every index below nthreads, the negative log of
// the probability assigned to the true label.
__global__ void SoftmaxLossForwardGPU(const int nthreads,
          const Dtype* prob_data, const Dtype* label, Dtype* loss,
          const int num, const int dim, const int spatial_dim,
          const bool has_ignore_label_, const int ignore_label_,
          Dtype* counts) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    const int n = index;
    // label_value is the ground-truth label.
    const int label_value = static_cast<int>(label[n]);

    // Clamp the probability to at least FLT_MIN before taking the log, so a
    // zero probability does not yield log(0).
    loss[index] = -log(max(prob_data[n * dim + label_value], Dtype(FLT_MIN)));
    // In this simplified version every index is marked as counted.
    counts[index] = 1;
  }
}

【caffe源碼研究】第四章：完整案例源碼篇(4) ：LeNet前向過程

入口信息

第一層 DataLayer

第二層 SplitLayer

第三層 ConvolutionLayer

第四層 PoolingLayer

第七層 InnerProductLayer

第八層 ReLUForward

第十一層 AccuracyLayer

第十二層 SoftmaxWithLossLayer

(1). SoftmaxLayer

(2). SoftmaxWithLossLayer

測試人員都是畫畫大神，讓我看看誰還不會用代碼圖？

網絡現代化通向雲原生應用的高速公路

面試官：說說你對序列化的理解

Object.values()對象遍歷

【caffe源碼研究】第三章：源碼篇(5) ：Net

【caffe源碼研究】第四章：完整案例源碼篇(5) ：LeNet反向過程

【caffe源碼研究】第四章：完整案例源碼篇(2) ：LeNet初始化訓練網絡

【caffe源碼研究】第四章：完整案例源碼篇(1) ：LeNetSolver初始化

【caffe源碼研究】第三章：源碼篇(12) ：激活函數層

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結