NNPACK詳解
一:NNPACK簡介
NNPACK由facebook開發,是一個加速神經網絡計算的加速包,NNPACK可以在多核CPU平臺上提高卷積層計算性能。NNPACK採用的快速卷積算法是基於Fourier transform 和 Winograd transform算法。
二:前向計算的性能
如下圖(Intel Core i7 6700K vs BVLC Caffe master branch)
Library |
Caffe |
NNPACK |
NNPACK |
NNPACK |
Algorithm |
im2col + sgemm |
FFT-8x8 |
FFT-16x16 |
Winograd F(6x6, 3x3) |
AlexNet:conv2 |
315 ms |
129 ms |
86 ms |
N/A |
AlexNet:conv3 |
182 ms |
87 ms |
44 ms |
70 ms |
AlexNet:conv4 |
264 ms |
109 ms |
56 ms |
89 ms |
AlexNet:conv5 |
177 ms |
77 ms |
40 ms |
64 ms |
VGG-A:conv1 |
255 ms |
303 ms |
260 ms |
404 ms |
VGG-A:conv2 |
902 ms |
369 ms |
267 ms |
372 ms |
VGG-A:conv3.1 |
566 ms |
308 ms |
185 ms |
279 ms |
VGG-A:conv3.2 |
1091 ms |
517 ms |
309 ms |
463 ms |
VGG-A:conv4.1 |
432 ms |
228 ms |
149 ms |
188 ms |
VGG-A:conv4.2 |
842 ms |
402 ms |
264 ms |
329 ms |
VGG-A:conv5 |
292 ms |
141 ms |
83 ms |
114 ms |
OverFeat:conv2 |
424 ms |
158 ms |
73 ms |
N/A |
OverFeat:conv3 |
250 ms |
69 ms |
74 ms |
54 ms |
OverFeat:conv4 |
927 ms |
256 ms |
272 ms |
173 ms |
OverFeat:conv5 |
1832 ms |
466 ms |
524 ms |
315 ms |
三:NNPACK支持的層
- 卷積層 (Convolutional layer)
- 全連接層 (Fully-connected layer)
- 池化層 (Max pooling layer)
- ReLU layer
- Softmax layer
四:NNPACK 編譯 (Linux OS)
1、編譯安裝 PeachPy
git clone https://github.com/Maratyszcza/PeachPy.git
cd PeachPy
[sudo] pip install --upgrade -r requirements.txt
python setup.py generate
[sudo] pip install --upgrade .
|
2、安裝ninja 和 ninja-syntax python 模塊
sudo apt-get install ninja-build || brew install ninja
[sudo] pip install ninja-syntax
|
3、下載編譯NNPACK
git clone --recursive https://github.com/Maratyszcza/NNPACK.git
cd NNPACK
python ./configure.py
ninja
|
注意:
編譯nnpack過程如果出現一些找不到頭文件等情況,一般是需要下載第三方庫。在nnpack的包中有對應的目錄third-party,分別下載,放到對應的目錄中,並分別編譯。
四:測試
NNPACK編譯完成之後,在NNPACK-master/bin目錄下有測試卷積、全連接等可執行程序。
例如,測試卷積輸入通道 16 ,輸出通道 16 , 輸入圖像180*180,kernel 3*3 ,迭代100次,
執行結果:
五、使用NNPACK 實現卷積
- input channels: 1
- output channels:1
- input size:4*4
- kernel size:3*3
1、代碼(conv_nnpack.c):
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/time.h>
#include <nnpack.h>
int main(int argc , char** argv)
{
//init nnpack
enum nnp_status init_status = nnp_initialize();
if (init_status != nnp_status_success) {
fprintf(stderr, "NNPACK initialization failed: error code %d\n", init_status);
exit(EXIT_FAILURE);
}
enum nnp_convolution_algorithm algorithm = nnp_convolution_algorithm_auto;
const size_t batch_size = 1;
const size_t input_channels = 1;
const size_t output_channels = 1;
const struct nnp_padding input_padding = { 0, 0, 0, 0 };
const struct nnp_size input_size ={ 4, 4 };
const struct nnp_size kernel_size = { 3, 3 },;
const struct nnp_size output_size = {
.width = (input_padding.left + input_size.width + input_padding.right - kernel_size.width) / output_subsampling.width + 1,
.height = (input_padding.top + input_size.height + input_padding.bottom - kernel_size.height) / output_subsampling.height + 1
};
int iter=1;
//malloc memory for input, kernel, output, bias
float* input = (float*)malloc(batch_size * input_channels *input_size.height *input_size.width * sizeof(float));
float* kernel = (float*)malloc(input_channels * output_channels * kernel_size.height * kernel_size.width * sizeof(float));
float* output = (float*)malloc(batch_size* output_channels * output_size.height * output_size.width * sizeof(float));
float* bias = (float*)malloc(output_channels * sizeof(float));
pthreadpool_t threadpool=NULL;
//flase:only one thread, true: mutiple threads
if (false) {
threadpool = pthreadpool_create(options.threads);
printf("Threads: %zu\n", pthreadpool_get_threads_count(threadpool));
}
struct nnp_profile computation_profile;//use for compute time;
//init input data
int i,j;
for(int c=0; c<input_channels;c++ ){
for(i=0; i<input_size.height; i++){
for(j=0; j<input_size.width; j++){
input[c*input_size.height*input_size.width+i*input_size.width+j] = (i*input_size.width+j)*1.0;
}
}
}
//init kernel data
for(int i=0; i<output_channels;i++ ){
for(j=0; j<input_channels*kernel_size.height*kernel_size.width; j++){
kernel[i*input_channels*kernel_size.height*kernel_size.width+j] = 1.0;
}
}
//init bias data
for(int i=0; i<output_channels;i++ ){
bias[i] = 1.0;
}
//execute conv
struct timeval conv_start;
struct timeval conv_end;
gettimeofday(&conv_start,NULL);
for(int i=0;i<iter;i++){
nnp_convolution_output(algorithm,
batch_size,
input_channels,
output_channels,
input_size,
input_padding,
kernel_size,
input,
kernel,
bias,
output,
threadpool,
&computation_profile);
}
gettimeofday(&conv_end,NULL);
//printf ouput data
for(i=0;i<output_channels; i++){
for(j=0;j<output_size.height*output_size.width; j++){
printf("%f\t",output[i*output_size.height*output_size.width+j]);
}
printf("\n");
}
float conv_time_use = 1000.0*(float)(conv_end.tv_sec-conv_start.tv_sec)+(float)(conv_end.tv_usec-conv_start.tv_usec)/1000.0;
printf("conv Time use = %f(ms)\n",conv_time_use);
printf("conv mean Time use = %f(ms) / iter\n",conv_time_use/iter);
return 0;
}
2、編譯
編譯,要鏈接 -lnnpack 和 -lpthread。-lpthread是因為第三方庫pthreadpool使用pthread創建線程,pthreadpool會根據平臺CPU的核數,創建對應數量的線程。此處代碼中為false,僅使用一個線程。
3、輸出結果:
六:NNPACK與 im2col+sgemm卷積性能對比
- im2col+sgemm使用openblas
- input channels : 16
- output channels :16
- input size : 360*360
- kernel size: 2*2 , 3*3, 5*5
如下圖:
圖 nnpack vs im2col_sgemm
|
2 |
3 |
5 |
10 |
nnpack |
6.69ms |
7.38ms |
9.71ms |
26.44ms |
im2col_sgemm |
37.83ms |
86.95ms |
236.91ms |
929.66ms |
表 nnpack vs im2col_sgemm