#include "device_functions.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "stdlib.h"
#include <iostream>
//#include<stdio.h>
using namespace std;
//返回thread和block
// Query device 0 and print the GPU count, the maximum threads per block,
// and the maximum grid dimensions.
// Returns: maxThreadsPerBlock of device 0. Exits the process if no
// CUDA-capable device is present or the property query fails (the
// original read `prop` uninitialized in that case).
int getThreadNum()
{
int count = 0; // number of CUDA-capable GPUs
cudaError_t err = cudaGetDeviceCount(&count);
if (err != cudaSuccess || count == 0)
{
std::cerr << "cudaGetDeviceCount failed or no device: " << cudaGetErrorString(err) << '\n';
std::exit(EXIT_FAILURE);
}
std::cout << "gpu 的個數:" << count << '\n';
cudaDeviceProp prop; // properties of the selected device
err = cudaGetDeviceProperties(&prop, 0); // second argument selects which GPU
if (err != cudaSuccess)
{
std::cerr << "cudaGetDeviceProperties failed: " << cudaGetErrorString(err) << '\n';
std::exit(EXIT_FAILURE);
}
cout << "最大線程數:" << prop.maxThreadsPerBlock << endl;
cout << "最大網格類型:" << prop.maxGridSize[0] << '\t' << prop.maxGridSize[1] << '\t' << prop.maxGridSize[2] << endl;
return prop.maxThreadsPerBlock;
}
// Naive 2D convolution: one thread computes one output pixel.
// Expects a 1-D launch covering at least width*height threads; excess
// threads return immediately. Out-of-image taps are treated as zero
// (zero padding at the borders).
__global__ void conv(float *imgGpu, float*kernelGpu, float*resultGpu, int width, int height, int kernelSize)
{
int id = threadIdx.x + blockIdx.x*blockDim.x;
if (id >= width * height)
{
return;
}
int row = id / width; // pixel row of this thread's output
int col = id % width; // pixel column (modulo of total columns)
int half = kernelSize / 2;
// BUG FIX: the original did `resultGpu[id] += ...` on memory that
// cudaMalloc never zeroes, so the output contained garbage unless the
// caller memset it first. Accumulate in a register and store once.
float sum = 0.0f;
for (int i = 0; i < kernelSize; ++i)
{
for (int j = 0; j < kernelSize; ++j)
{
int curRow = row - half + i;
int curCol = col - half + j;
float imgValue = 0.0f; // zero outside the image bounds
if (curRow >= 0 && curCol >= 0 && curRow < height && curCol < width)
{
imgValue = imgGpu[curRow*width + curCol];
}
sum += kernelGpu[i*kernelSize + j] * imgValue;
}
}
resultGpu[id] = sum;
}
int main()
{
// Local error-check helper: abort with a readable message on any
// CUDA API failure instead of silently continuing on garbage state.
auto checkCuda = [](cudaError_t err, const char *what) {
if (err != cudaSuccess)
{
std::cerr << what << " failed: " << cudaGetErrorString(err) << '\n';
std::exit(EXIT_FAILURE);
}
};
// Build a 1080p test image with a simple repeating gradient.
const int width = 1920;
const int height = 1080;
float *img = new float[width*height];
for (int row = 0; row < height; ++row)
{
for (int col = 0; col < width; ++col)
{
img[col + row * width] = (col + row) % 256;
}
}
// 3x3 convolution kernel.
// NOTE(review): values are stored column-major (i + j*kernelSize) but
// printed row-major (i*kernelSize + j) below, so the printout is the
// transpose of what is stored — confirm which layout was intended.
const int kernelSize = 3;
float *kernel = new float[kernelSize*kernelSize];
for (int i = 0; i < kernelSize; ++i)
{
for (int j = 0; j < kernelSize; ++j)
{
kernel[i + j * kernelSize] = i - 1;
}
}
// Print the top-left 10x10 corner of the input image.
for (int row = 0; row < 10; ++row)
{
for (int col = 0; col < 10; ++col)
{
std::cout << img[col + row * width] << '\t';
}
std::cout << '\n';
}
cout << "kernel\n";
for (int i = 0; i < kernelSize; ++i)
{
for (int j = 0; j < kernelSize; ++j)
{
std::cout << kernel[i*kernelSize+j] << '\t';
}
cout << endl;
}
// Device buffers: image, kernel, and convolution result.
float *imgGpu = 0;
float *kernelGpu = 0;
float *resultGpu = 0;
checkCuda(cudaMalloc(&imgGpu, height*width * sizeof(float)), "cudaMalloc(imgGpu)");
checkCuda(cudaMalloc(&kernelGpu, kernelSize*kernelSize * sizeof(float)), "cudaMalloc(kernelGpu)");
checkCuda(cudaMalloc(&resultGpu, height*width * sizeof(float)), "cudaMalloc(resultGpu)");
// Zero the result buffer: cudaMalloc does not initialize memory, and a
// kernel that accumulates with += would otherwise read garbage.
checkCuda(cudaMemset(resultGpu, 0, height*width * sizeof(float)), "cudaMemset(resultGpu)");
checkCuda(cudaMemcpy(imgGpu, img, width*height * sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(img)");
checkCuda(cudaMemcpy(kernelGpu, kernel, kernelSize*kernelSize * sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(kernel)");
// One thread per pixel; 1-D grid sized by ceiling division.
const int threadNum = getThreadNum();
const int blockNum = (width*height + threadNum - 1) / threadNum;
conv << <blockNum, threadNum >> > (imgGpu, kernelGpu, resultGpu, width, height, kernelSize);
// Launch-configuration errors surface via cudaGetLastError; execution
// errors surface at the next synchronizing call.
checkCuda(cudaGetLastError(), "conv kernel launch");
checkCuda(cudaDeviceSynchronize(), "conv kernel execution");
// Copy the result back and print its top-left 10x10 corner.
float *showImg = new float[height*width];
checkCuda(cudaMemcpy(showImg, resultGpu, width*height * sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy(result)");
for (int row = 0; row < 10; ++row)
{
for (int col = 0; col < 10; ++col)
{
std::cout << showImg[col + row * width] << '\t';
}
std::cout << '\n';
}
// Release device and host memory.
cudaFree(imgGpu);
cudaFree(kernelGpu);
cudaFree(resultGpu);
delete[] img;
delete[] kernel;
delete[] showImg;
system("pause");
return 0;
}
/*
 * Retrospective notes (translated from Chinese; wrapped in a comment so
 * the file compiles — the original text sat outside any comment):
 *
 * Shortcomings of the code before revision:
 *   1. Forgot to free memory.
 *   2. When allocating device memory, one allocation was missing.
 *   3. When mapping the 1-D array to 2-D (logically), the column was
 *      computed wrong — it should be the remainder modulo the total
 *      number of columns.
 *   4. The 2-D layout was not entirely clear in my head.
 *
 * Shortcomings of the code after revision:
 *   1. Errors are not captured.
 *   2. There are surely many more mistakes, but with my current ability
 *      I have not spotted them yet.
 *
 * My own shortcomings:
 *   1. Not yet fluent with formatted output in C and C++.
 *   2. Not yet fluent with C++ new and delete either.
 */