使用 CUDA 實現卷積和均值池化

#include "device_functions.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "stdlib.h"
#include <string>
#include <cassert>
#include <iostream>
using namespace std;
#include <opencv2\opencv.hpp>
#include <opencv2/core.hpp>
#include <opencv2/highgui/highgui.hpp>
using namespace cv;
#include<stdlib.h>

// Checks the status code returned by a CUDA runtime call.
//
// err: status returned by the call being checked.
//
// On cudaSuccess this is a no-op. On any other status the failure message is
// written to stderr together with the runtime's own description of the error,
// and the process is terminated with EXIT_FAILURE. The message has always
// announced that the program ends; continuing after a failed call would only
// hand invalid device pointers to the following copies and kernel launches.
void GetCudaCalError(cudaError err)
{
	if (err == cudaSuccess)
	{
		return;
	}
	cerr << "分配內存失敗!程序結束!" << " (" << cudaGetErrorString(err) << ")" << endl;
	exit(EXIT_FAILURE);
}

// Prints basic information about the CUDA devices and returns the maximum
// number of threads per block supported by device 0.
//
// Returns: cudaDeviceProp::maxThreadsPerBlock of device 0.
//
// If the device count cannot be queried, no device is present, or the
// properties of device 0 cannot be read, an error is printed to stderr and the
// process exits: without a successful query the property struct would be
// uninitialised and the returned block size would be garbage.
int getThreadNumZC()
{
	int count = 0; // number of CUDA devices
	cudaError_t status = cudaGetDeviceCount(&count);
	if (status != cudaSuccess || count <= 0)
	{
		cerr << "未找到可用的 gpu:" << cudaGetErrorString(status) << endl;
		exit(EXIT_FAILURE);
	}
	std::cout << "gpu 的個數:" << count << '\n';

	cudaDeviceProp prop;
	status = cudaGetDeviceProperties(&prop, 0); // second argument selects the device
	if (status != cudaSuccess)
	{
		cerr << "獲取 gpu 屬性失敗:" << cudaGetErrorString(status) << endl;
		exit(EXIT_FAILURE);
	}
	cout << "最大線程數:" << prop.maxThreadsPerBlock << endl;
	cout << "最大網格類型:" << prop.maxGridSize[0] << '\t' << prop.maxGridSize[1] << '\t' << prop.maxGridSize[2] << endl;
	return prop.maxThreadsPerBlock;
}

// Applies a kernelSize x kernelSize filter to an image, one thread per pixel.
//
// d_image:    input pixels, row-major, imageRow * imageCol elements.
// d_kernel:   filter weights, row-major, kernelSize * kernelSize floats.
// d_result:   output pixels, same layout as d_image. The x, y and z channels
//             are overwritten; the w channel is not written by this kernel.
// imageRow:   number of image rows.
// imageCol:   number of image columns.
// kernelSize: side length of the square filter window.
//
// Launch layout: 1D grid of 1D blocks whose total thread count is at least
// imageRow * imageCol; surplus threads return immediately.
//
// Taps that fall outside the image contribute nothing (zero padding). The sum
// is accumulated in float registers and converted to a byte only once at the
// end, clamped to [0, 255] and rounded to nearest. Accumulating directly into
// the unsigned char output would truncate after every tap and wrap around on
// negative partial sums.
__global__ void convZC(uchar4*d_image, float*d_kernel, uchar4*d_result,int imageRow, int imageCol, int kernelSize)
{
	// Flat global thread index (1D launch).
	const int id = blockIdx.x*blockDim.x + threadIdx.x;
	// Threads past the last pixel have nothing to do.
	if (id >= imageRow*imageCol)
	{
		return;
	}

	const int row = id / imageCol;
	const int col = id % imageCol;
	const int radius = kernelSize / 2;

	float sumX = 0.0f;
	float sumY = 0.0f;
	float sumZ = 0.0f;
	for (int i = 0; i < kernelSize; ++i)
	{
		const int curRow = row - radius + i;
		if (curRow < 0 || curRow >= imageRow)
		{
			continue; // whole filter row lies outside the image
		}
		for (int j = 0; j < kernelSize; ++j)
		{
			const int curCol = col - radius + j;
			if (curCol < 0 || curCol >= imageCol)
			{
				continue; // tap lies outside the image
			}
			const float weight = d_kernel[i*kernelSize + j];
			const uchar4 pixel = d_image[curRow*imageCol + curCol];
			sumX += weight * pixel.x;
			sumY += weight * pixel.y;
			sumZ += weight * pixel.z;
		}
	}

	// Saturate to the byte range, then round to nearest.
	d_result[id].x = (unsigned char)(fminf(fmaxf(sumX, 0.0f), 255.0f) + 0.5f);
	d_result[id].y = (unsigned char)(fminf(fmaxf(sumY, 0.0f), 255.0f) + 0.5f);
	d_result[id].z = (unsigned char)(fminf(fmaxf(sumZ, 0.0f), 255.0f) + 0.5f);
}

// Blurs an image: for every pixel, computes the weighted sum of its
// kernelSize x kernelSize neighbourhood and divides it by kernelSize^2.
//
// d_result:       input pixels, row-major, imageRow * imageCol elements.
// d_kernel:       weights, row-major, kernelSize * kernelSize floats. With all
//                 weights equal to 1 the result is the plain neighbourhood mean.
// d_result_image: output pixels, same layout as the input. The x, y and z
//                 channels are overwritten; the w channel is not written.
// imageRow:       number of image rows.
// imageCol:       number of image columns.
// kernelSize:     side length of the square window.
//
// Launch layout: 1D grid of 1D blocks whose total thread count is at least
// imageRow * imageCol; surplus threads return immediately.
//
// Taps outside the image contribute nothing (zero padding), but the divisor is
// always kernelSize^2. The sum is kept in float registers and the output is
// written once, so the result does not depend on the previous contents of
// d_result_image and is not truncated after every tap. The final value is
// clamped to [0, 255] and rounded to nearest.
__global__ void avgImage(uchar4*d_result, float*d_kernel, uchar4*d_result_image,int imageRow, int imageCol, int kernelSize)
{
	// Flat global thread index (1D launch).
	const int id = blockIdx.x*blockDim.x + threadIdx.x;
	// Threads past the last pixel have nothing to do.
	if (id >= imageRow*imageCol)
	{
		return;
	}

	const int row = id / imageCol;
	const int col = id % imageCol;
	const int radius = kernelSize / 2;

	float sumX = 0.0f;
	float sumY = 0.0f;
	float sumZ = 0.0f;
	for (int i = 0; i < kernelSize; ++i)
	{
		const int curRow = row - radius + i;
		if (curRow < 0 || curRow >= imageRow)
		{
			continue; // whole window row lies outside the image
		}
		for (int j = 0; j < kernelSize; ++j)
		{
			const int curCol = col - radius + j;
			if (curCol < 0 || curCol >= imageCol)
			{
				continue; // tap lies outside the image
			}
			const float weight = d_kernel[i*kernelSize + j];
			const uchar4 pixel = d_result[curRow*imageCol + curCol];
			sumX += weight * pixel.x;
			sumY += weight * pixel.y;
			sumZ += weight * pixel.z;
		}
	}

	// Normalise by the window area in float, then saturate and round.
	const float area = (float)(kernelSize * kernelSize);
	d_result_image[id].x = (unsigned char)(fminf(fmaxf(sumX / area, 0.0f), 255.0f) + 0.5f);
	d_result_image[id].y = (unsigned char)(fminf(fmaxf(sumY / area, 0.0f), 255.0f) + 0.5f);
	d_result_image[id].z = (unsigned char)(fminf(fmaxf(sumZ / area, 0.0f), 255.0f) + 0.5f);
}
// Saves an RGBA pixel buffer as an image file and shows it in a window.
//
// filename: file name appended to the hard-coded output directory below.
// Image:    host buffer of imageRow * imageClo RGBA pixels, row-major. The
//           buffer is wrapped, not copied, so it must stay valid for the call.
// imageRow: number of rows.
// imageClo: number of columns.
//
// The buffer is converted from RGBA to BGR before being written and displayed.
// A null buffer or non-positive dimensions are reported and ignored. A failed
// write is reported on stderr, and the image is still displayed. The call
// blocks in waitKey(0) until a key is pressed.
void showImageZC(string filename,uchar4 *Image, int imageRow, int imageClo)
{
	if (Image == NULL || imageRow <= 0 || imageClo <= 0)
	{
		cerr << "無效的照片數據:" << filename << endl;
		return;
	}

	// Wrap the raw array in a Mat header (no copy).
	cv::Mat outImage(imageRow, imageClo, CV_8UC4, (void*)Image);
	cv::Mat outImageBGR;
	cv::cvtColor(outImage, outImageBGR, CV_RGBA2BGR);

	string file = "E:\\ZC\\procedure\\CUDA\\Images\\";
	file += filename;
	// imwrite returns false when the file could not be written.
	if (!cv::imwrite(file.c_str(), outImageBGR))
	{
		cerr << "保存照片失敗:" << file << endl;
	}

	// Show the processed image.
	imshow("convImage", outImageBGR);
	waitKey(0);
}
// Loads an image, runs convZC and then avgImage on the GPU, and saves/shows
// both results.
//
// Steps: read the input file, convert BGR to RGBA, upload the pixels, run a
// 3x3 filter (convZC), download the result, run the blur (avgImage) on the
// filtered image, download it, then display both and release all memory.
// Every CUDA runtime status is passed to GetCudaCalError.
int main()
{
	string input_file = "E:\\ZC\\procedure\\CUDA\\Images\\1.png";
	uchar4 *h_image, *d_image, *d_result, *h_result, *d_avgImage, *h_avgImage;
	float *d_kernel;
	int imageRow, imageCol, kernelSize = 3;

	// Read the input image as 3-channel BGR.
	Mat imageBGR = cv::imread(input_file.c_str(), CV_LOAD_IMAGE_COLOR);
	if (imageBGR.empty())
	{
		cerr << "讀取照片失敗:" << input_file << endl;
		exit(1);
	}

	// Convert to 4-channel RGBA so that each pixel maps onto one uchar4.
	Mat imageRGB;
	cv::cvtColor(imageBGR, imageRGB, CV_BGR2RGBA);

	// View the Mat data as an array of uchar4; imageRGB keeps owning the memory.
	h_image = (uchar4*)imageRGB.ptr<unsigned char>(0);
	imageRow = imageRGB.rows;
	imageCol = imageRGB.cols;
	const int size = imageCol * imageRow;
	const size_t imageBytes = size * sizeof(uchar4);
	const size_t kernelBytes = kernelSize * kernelSize * sizeof(float);

	// Upload the input image.
	GetCudaCalError(cudaMalloc(&d_image, imageBytes));
	GetCudaCalError(cudaMemcpy(d_image, h_image, imageBytes, cudaMemcpyHostToDevice));

	// Build the convolution weights on the host: column index minus one, which
	// for kernelSize 3 makes every row -1, 0, 1.
	float *h_kernel = new float[kernelSize*kernelSize];
	for (int i = 0; i < kernelSize*kernelSize; ++i)
	{
		h_kernel[i] = (float)(i % kernelSize - 1);
	}
	GetCudaCalError(cudaMalloc(&d_kernel, kernelBytes));
	GetCudaCalError(cudaMemcpy(d_kernel, h_kernel, kernelBytes, cudaMemcpyHostToDevice));

	// Device and host buffers for the convolution result, zero-initialised on
	// the device.
	GetCudaCalError(cudaMalloc(&d_result, imageBytes));
	GetCudaCalError(cudaMemset(d_result, 0, imageBytes));
	h_result = new uchar4[size];

	// Device and host buffers for the blur result. The device buffer is
	// zero-initialised too, so no channel is ever read back uninitialised.
	h_avgImage = new uchar4[size];
	GetCudaCalError(cudaMalloc(&d_avgImage, imageBytes));
	GetCudaCalError(cudaMemset(d_avgImage, 0, imageBytes));

	// One thread per pixel: 1D blocks of the reported size, ceil-div for the grid.
	const int threadNum = getThreadNumZC();
	if (threadNum <= 0)
	{
		cerr << "無效的線程數:" << threadNum << endl;
		exit(1);
	}
	const int blockNum = (size + threadNum - 1) / threadNum;

	convZC<<<blockNum, threadNum>>>(d_image, d_kernel, d_result, imageRow, imageCol, kernelSize);
	GetCudaCalError(cudaGetLastError());      // launch-configuration errors
	GetCudaCalError(cudaDeviceSynchronize()); // errors raised while the kernel ran
	GetCudaCalError(cudaMemcpy(h_result, d_result, imageBytes, cudaMemcpyDeviceToHost));

	// Weights for the blur pass.
	// NOTE(review): random weights in {0, 1, 2} do not give a true mean after
	// the division by kernelSize^2 in avgImage; all-ones weights would. Kept
	// as in the original pending confirmation of the intent.
	for (int i = 0; i < kernelSize*kernelSize; ++i)
	{
		h_kernel[i] = (float)(rand() % 3);
	}
	GetCudaCalError(cudaMemcpy(d_kernel, h_kernel, kernelBytes, cudaMemcpyHostToDevice));

	avgImage<<<blockNum, threadNum>>>(d_result, d_kernel, d_avgImage, imageRow, imageCol, kernelSize);
	GetCudaCalError(cudaGetLastError());
	GetCudaCalError(cudaDeviceSynchronize());
	GetCudaCalError(cudaMemcpy(h_avgImage, d_avgImage, imageBytes, cudaMemcpyDeviceToHost));

	string name1 = "convTest2.png";
	string name2 = "avgTest2.png";
	showImageZC(name1, h_result, imageRow, imageCol);
	showImageZC(name2, h_avgImage, imageRow, imageCol);

	// Release every device and host allocation made above.
	GetCudaCalError(cudaFree(d_image));
	GetCudaCalError(cudaFree(d_result));
	GetCudaCalError(cudaFree(d_avgImage));
	GetCudaCalError(cudaFree(d_kernel));
	delete[] h_kernel;
	delete[] h_result;
	delete[] h_avgImage;
	return 0;
}

卷積操作後
平均值池化處理後
不足之處:
1、卷積和均值池化都不能調整步長(stride)。
2、均值池化沒有改變像素的個數(輸出與輸入尺寸相同,沒有做降採樣)。
3、這個並行化程序的並行化程度還可以再提高。
4、代碼不方便遷移。

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章