CUDA 學習——並行化實現圖像的 RGB 轉灰度圖

#include <iostream>
#include <string>
#include <cassert>

#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/opencv.hpp>

#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>

// Wraps a CUDA runtime call: forwards its result, the stringified expression,
// and the current file/line to check(), which reports the failure and exits.
#define checkCudaErrors(val) check((val),#val,__FILE__,__LINE__)

// Host-side images shared by the helpers in this file:
// imageRGBA receives the RGBA conversion of the loaded picture in preProcess(),
// imageGrey is created there as the single-channel (CV_8UC1) output of the same size.
cv::Mat imageRGBA;
cv::Mat imageGrey;

// GPU memory handles: preProcess() stores copies of the device pointers it
// allocates here so that cleanup() can release them.
uchar4 *d_rgbaImage__;
uchar  *d_greyImage__;

// Returns the row count of the global imageRGBA matrix.
size_t numRows() {
	const size_t rowCount = static_cast<size_t>(imageRGBA.rows);
	return rowCount;
}
// Returns the column count of the global imageRGBA matrix.
size_t numCols() {
	const size_t colCount = static_cast<size_t>(imageRGBA.cols);
	return colCount;
}

// Terminates the program with a diagnostic when a CUDA call did not succeed.
//
//   err  - status returned by the CUDA call (compared against cudaSuccess)
//   func - source text of the call, as stringified by checkCudaErrors
//   file - source file of the call site
//   line - line number of the call site
//
// On success this is a no-op; on failure it prints the location and the CUDA
// error string to stderr and exits with status 1.
template<typename T>
void check(T err, const char* const func, const char* const file, const int line) {
	if (err == cudaSuccess) {
		return;
	}

	std::cerr << "CUDA error at:" << file << ":" << line << std::endl;
	std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
	exit(1);
}

// Image pre-processing: loads the picture, converts it to RGBA, prepares the
// host greyscale buffer, and stages both buffers on the GPU.
//
//   inputImage  - out: host pointer to the RGBA pixels (storage owned by imageRGBA)
//   greyImage   - out: host pointer to the greyscale pixels (storage owned by imageGrey)
//   d_rgbaImage - out: device buffer holding a copy of the RGBA pixels
//   d_greyImage - out: device buffer for the greyscale result, zero-filled
//   filename    - path of the image to load
//
// Exits with status 1 if the file cannot be read, if either host image is not
// stored contiguously, or if any CUDA call fails.
void preProcess(uchar4 **inputImage, unsigned char **greyImage, uchar4 **d_rgbaImage, 
	unsigned char **d_greyImage, const std::string &filename) {
	// NOTE(review): cudaFree(0) frees nothing; presumably it is here to force
	// CUDA context creation up front so device problems surface early.
	checkCudaErrors(cudaFree(0));

	// Read the picture from disk.
	cv::Mat bgr = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR);
	if (bgr.empty()) {
		std::cerr << "Couldn't open file:" << filename << std::endl;
		exit(1);
	}

	// Convert the BGR data read by OpenCV into RGBA.
	cv::cvtColor(bgr, imageRGBA, CV_BGR2RGBA);

	// Create the greyscale image with the same dimensions as the source.
	imageGrey.create(bgr.rows, bgr.cols, CV_8UC1);

	// Both images are later handled through flat pointers, so they must be
	// stored contiguously.
	const bool contiguous = imageRGBA.isContinuous() && imageGrey.isContinuous();
	if (!contiguous) {
		std::cerr << "Images aren't continuous!! Exiting." << std::endl;
		exit(1);
	}

	// Hand the host buffers back to the caller.
	*inputImage = reinterpret_cast<uchar4 *>(imageRGBA.ptr<unsigned char>(0));
	*greyImage = imageGrey.ptr<unsigned char>(0);

	// Allocate GPU memory for both images.
	// (The expressions inside checkCudaErrors are echoed verbatim in its
	// error message, so they are kept exactly as written.)
	const size_t numPixels = numRows()*numCols();
	checkCudaErrors(cudaMalloc(d_rgbaImage, sizeof(uchar4)*numPixels));
	checkCudaErrors(cudaMalloc(d_greyImage, sizeof(unsigned char)*numPixels));
	// Zero the greyscale buffer on the GPU.
	checkCudaErrors(cudaMemset(*d_greyImage, 0, numPixels * sizeof(unsigned char)));

	// Upload the RGBA pixels to the GPU.
	checkCudaErrors(cudaMemcpy(*d_rgbaImage, *inputImage, sizeof(uchar4)*numPixels, cudaMemcpyHostToDevice));

	// Remember the device pointers so cleanup() can free them.
	d_greyImage__ = *d_greyImage;
	d_rgbaImage__ = *d_rgbaImage;
}

// Converts an RGBA image to 8-bit greyscale, one pixel per thread, using
// grey = 0.299*R + 0.587*G + 0.114*B (fraction truncated on store).
//
//   rgbaImage - device pointer to numRows*numCols RGBA pixels
//   greyImage - device pointer to numRows*numCols output bytes
//   numRows   - image height in pixels
//   numCols   - image width in pixels
//
// Launch layout: only blockIdx.x is used, so the grid must be one-dimensional;
// blocks may be 1D or 2D (threadIdx.x / threadIdx.y). The launch must supply at
// least numRows*numCols threads; surplus threads return without touching memory.
__global__
void rgba_to_greyscale(const uchar4* const rgbaImage, unsigned char* const greyImage, int numRows, int numCols) {
	// Non-positive dimensions mean there is nothing to convert.
	if (numRows <= 0 || numCols <= 0) {
		return;
	}

	// Index arithmetic is done in size_t so that neither the pixel count nor
	// the flat thread index can overflow a 32-bit int on very large images.
	const size_t numPixels = static_cast<size_t>(numRows) * static_cast<size_t>(numCols);
	const size_t threadsPerBlock = static_cast<size_t>(blockDim.x) * blockDim.y;
	const size_t pixel = static_cast<size_t>(blockIdx.x) * threadsPerBlock
		+ static_cast<size_t>(threadIdx.y) * blockDim.x
		+ threadIdx.x;

	// The grid rarely divides the image evenly; skip the surplus threads.
	if (pixel >= numPixels) {
		return;
	}

	const uchar4 rgba = rgbaImage[pixel];
	// Alpha (rgba.w) is ignored.
	greyImage[pixel] = static_cast<unsigned char>(.299f*rgba.x + .587f*rgba.y + .114f*rgba.z);
}

// Writes the greyscale result to disk.
//
//   output_file - destination path passed to cv::imwrite
//   data_ptr    - host buffer of numRows()*numCols() greyscale bytes; it is
//                 wrapped in a cv::Mat header of type CV_8UC1 for writing
//
// Failures are reported on stderr instead of being silently ignored; the
// function still returns normally so the caller can release its resources.
void postProcess(const std::string& output_file, unsigned char* data_ptr) {
	if (data_ptr == NULL) {
		std::cerr << "No image data to write to:" << output_file << std::endl;
		return;
	}

	// cv::Mat takes int dimensions; make the narrowing from size_t explicit.
	const int rows = static_cast<int>(numRows());
	const int cols = static_cast<int>(numCols());
	cv::Mat output(rows, cols, CV_8UC1, (void*)data_ptr);

	// imwrite returns false when the image could not be saved.
	if (!cv::imwrite(output_file.c_str(), output)) {
		std::cerr << "Couldn't write file:" << output_file << std::endl;
	}
}

// Frees one device allocation, reports (without aborting) a failed cudaFree,
// and nulls the pointer so it cannot be freed a second time.
template<typename T>
static void releaseDeviceBuffer(T *&devPtr, const char *label) {
	const cudaError_t status = cudaFree(devPtr);
	if (status != cudaSuccess) {
		std::cerr << "cudaFree(" << label << ") failed: " << cudaGetErrorString(status) << std::endl;
	}
	devPtr = NULL;
}

// Releases the device buffers recorded in d_rgbaImage__ / d_greyImage__.
// The pointers are reset afterwards, so calling cleanup() again does not
// pass already-freed pointers to cudaFree.
void cleanup() {
	releaseDeviceBuffer(d_rgbaImage__, "d_rgbaImage__");
	releaseDeviceBuffer(d_greyImage__, "d_greyImage__");
}

// Entry point: loads an image, converts it to greyscale on the GPU, and writes
// the result back to disk.
//
// Usage: program [input_image [output_image]]
// When the arguments are omitted the built-in default paths below are used.
int main(int argc, char* argv[]) {
	// Default input path.
	std::string input_file = "E:/code/study_cuda/study_reduce/study_reduce/cinque_terre_small.jpg";
	// Default output path.
	std::string output_file = "E:/code/study_cuda/study_reduce/study_reduce/cinque_terre_small_togray.jpg";
	// Optional command-line overrides.
	if (argc > 1) {
		input_file = argv[1];
	}
	if (argc > 2) {
		output_file = argv[2];
	}

	// Host and device pointers to the RGBA input.
	uchar4 *h_rgbaImage, *d_rgbaImage;
	// Host and device pointers to the greyscale output.
	unsigned char *h_greyImage, *d_greyImage;

	// Pre-processing: load the image, allocate the device buffers, and upload
	// the RGBA pixels.
	preProcess(&h_rgbaImage, &h_greyImage, &d_rgbaImage, &d_greyImage, input_file);

	// Launch configuration: a 1D grid of 16x16 blocks, one thread per pixel.
	// The block count is a true ceiling division by the threads per block;
	// dividing (numPixels + 15) by 256 would drop the last partial block and
	// leave the tail pixels unconverted.
	const size_t numPixels = numRows()*numCols();
	const int thread = 16;
	const size_t threadsPerBlock = static_cast<size_t>(thread) * thread;
	const size_t blocks = (numPixels + threadsPerBlock - 1) / threadsPerBlock;
	const dim3 blockSize(thread, thread);
	const dim3 gridSize(static_cast<unsigned int>(blocks));
	rgba_to_greyscale <<<gridSize, blockSize >>> (d_rgbaImage, d_greyImage,
		static_cast<int>(numRows()), static_cast<int>(numCols()));
	// A kernel launch returns no status itself; a bad launch configuration
	// only shows up through cudaGetLastError().
	checkCudaErrors(cudaGetLastError());

	// Wait for the GPU to finish; errors raised while the kernel was running
	// are reported by this call.
	checkCudaErrors(cudaDeviceSynchronize());

	// Copy the GPU result back to the host.
	checkCudaErrors(cudaMemcpy(h_greyImage, d_greyImage, sizeof(unsigned char)*numPixels, cudaMemcpyDeviceToHost));

	// Write the image.
	postProcess(output_file, h_greyImage);
	
	// Release the device buffers.
	cleanup();
	return 0;
}

僅記錄學習過程

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章