更多文章參考:自己動手實現darknet預測分類動態庫
1. convert_to_image 代碼如下:
/*
 * Build an image from an 8-bit pixel buffer.
 *
 * The input is read with the c channel values of each pixel stored next to
 * each other (pixel-major), and the output is written one full w*h channel
 * plane after another. Every byte is converted to float and divided by 255.
 *
 * The input buffer is only read; it is not freed here.
 */
image convert_to_image(unsigned char* data,int w,int h,int c)
{
    int row, col, ch;
    image im = make_image(w, h, c);

    for (row = 0; row < h; ++row) {
        for (col = 0; col < w; ++col) {
            /* Linear position of this pixel inside a single w*h plane. */
            int pixel = col + w * row;
            for (ch = 0; ch < c; ++ch) {
                int from = c * pixel + ch;      /* interleaved source slot */
                int to = pixel + w * h * ch;    /* planar destination slot */
                im.data[to] = (float)data[from] / 255.;
            }
        }
    }
    return im;
}
GPU加速代碼:
新建 loadimage.cu 文件,
並添加以下代碼:
#include "dark_cuda.h"
//image convert_to_image_gpu(unsigned char* data,int w,int h,int c);
//__global__ void MatConvertImg(float* dst, unsigned char* src, int w,int h,int c) ;
// Kernel定義
/*
 * Kernel: convert an interleaved 8-bit image to a planar float image.
 *
 * Each thread handles one (column, row, channel) element:
 *   x dimension -> column i in [0, w)
 *   y dimension -> row    j in [0, h)
 *   z dimension -> channel k in [0, c)
 * so the launch grid must cover at least w x h x c threads; threads that
 * fall outside the image do nothing.
 *
 * src is read at k + c*i + c*w*j (channels of a pixel are adjacent) and
 * dst is written at i + w*j + w*h*k (one w*h plane per channel). Both
 * pointers are dereferenced on the device and must hold w*h*c elements.
 */
__global__ void MatConvertImg(float* dst, unsigned char* src, int w,int h,int c)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    const int j = blockIdx.y * blockDim.y + threadIdx.y;
    const int k = blockIdx.z * blockDim.z + threadIdx.z;

    /* Grid tail guard: the grid is rounded up to whole blocks. */
    if (i >= w || j >= h || k >= c) return;

    const int dst_index = i + w*j + w*h*k;
    const int src_index = k + c*i + c*w*j;

    /* Float literal keeps the division in single precision; a bare 255.
       is a double and would force an FP64 divide in every thread. */
    dst[dst_index] = (float)src[src_index] / 255.0f;
}
/*
 * GPU version of convert_to_image: turns an interleaved 8-bit host buffer
 * of w*h*c bytes into a planar float image scaled by 1/255.
 *
 * data : host pointer to w*h*c bytes (only read, not freed here)
 * w,h,c: image width, height and channel count
 * Returns an image whose data lives in host memory (from make_image).
 *
 * Declared extern "C" so that C translation units can link against it.
 * CUDA failures are reported through CHECK_CUDA.
 */
extern "C" image convert_to_image_gpu(unsigned char* data,int w,int h,int c)
{
    /* Nothing to convert: avoid a zero-sized grid, which is an invalid launch. */
    if (w <= 0 || h <= 0 || c <= 0) {
        return make_image(w, h, c);
    }

    /* Do the size arithmetic in size_t so large images cannot overflow int. */
    const size_t count = (size_t)w * (size_t)h * (size_t)c;
    const size_t src_bytes = count * sizeof(unsigned char);
    const size_t dst_bytes = count * sizeof(float);

    unsigned char* src = NULL;
    float* dst = NULL;
    CHECK_CUDA(cudaMalloc((void**)&src, src_bytes));
    CHECK_CUDA(cudaMalloc((void**)&dst, dst_bytes));

    /* Copy the host pixels to the device. */
    CHECK_CUDA(cudaMemcpyAsync(src, data, src_bytes, cudaMemcpyHostToDevice, get_cuda_stream()));

    /* One thread per (column, row, channel) element. */
    dim3 blockSize(32, 32, 1);
    dim3 gridSize((w + blockSize.x - 1) / blockSize.x,
                  (h + blockSize.y - 1) / blockSize.y,
                  (c + blockSize.z - 1) / blockSize.z);

    /* Launch on the same stream as the upload so the kernel is ordered after
       the copy. NOTE(review): get_cuda_stream() may return a non-blocking
       stream, in which case a default-stream launch would not wait for it. */
    MatConvertImg<<<gridSize, blockSize, 0, get_cuda_stream()>>>(dst, src, w, h, c);
    CHECK_CUDA(cudaGetLastError());

    image im = make_image(w, h, c);

    /* cuda_pull_array expects a count of floats, not bytes: it applies
       sizeof(float) itself, so passing a byte count copies 4x too much and
       overruns both buffers. */
    cuda_pull_array(dst, im.data, count);

    CHECK_CUDA(cudaFree(src));
    CHECK_CUDA(cudaFree(dst));
    return im;
}
convert_to_image_gpu函數前一定要加extern "C"(注意是大寫的C)。.cu文件由nvcc按C++編譯,函數名會被C++名稱修飾(name mangling);不加extern "C"的話,在c文件中調用convert_to_image_gpu時會發生鏈接錯誤。
調用時報錯:
關於C語言中的Debug Assertion Failed:這類錯誤在編譯和鏈接時都不會出現,只有在程序運行時才會觸發。發生這種錯誤的原因可能是:
1、釋放了一個無效的指針(例如未初始化的野指針,或並非由malloc等函數分配的指針;注意free(NULL)本身是安全的);
2、一個指針被釋放了兩次(即第二次釋放的是已經失效的懸空指針);
3、數組越界:訪問了超過數組長度的內存。
跟蹤代碼發現,程序死在
cuda_pull_array(dst,im.data,w*h*c*sizeof(float));
改成
cuda_pull_array(dst,im.data,w*h*c);
問題解決。
參考「darknet源碼解析:cuda_push_array」可知,cuda_push_array(以及對應的cuda_pull_array)函數內部已經包含sizeof(float)的計算,傳參時只需傳入float元素的個數即可;如果傳入的是字節數,實際拷貝的數據量會變成預期的4倍,從而發生訪問越界錯誤。