#include <iostream>
#include <string>
#include <cassert>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/opencv.hpp>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
// Wraps a CUDA runtime call: passes its result to check() together with the
// stringified call expression and the file/line of the call site.
#define checkCudaErrors(val) check((val),#val,__FILE__,__LINE__)
// Host-side RGBA copy of the input image; filled in by preProcess and
// queried by numRows()/numCols().
cv::Mat imageRGBA;
// Host-side greyscale image; created in preProcess, which hands out a pointer
// to its pixel data as the host output buffer.
cv::Mat imageGrey;
// Device (GPU) buffers: assigned at the end of preProcess and released in cleanup.
uchar4 *d_rgbaImage__;
uchar *d_greyImage__;
// Returns the row count of the global imageRGBA matrix.
size_t numRows() {
    const size_t rowCount = imageRGBA.rows;
    return rowCount;
}
// Returns the column count of the global imageRGBA matrix.
size_t numCols() {
    const size_t colCount = imageRGBA.cols;
    return colCount;
}
// Terminates the program with a diagnostic when a CUDA call did not succeed.
//
// err  : status value returned by the call; compared against cudaSuccess.
// func : text of the call expression (the checkCudaErrors macro passes #val).
// file : source file of the call site.
// line : source line of the call site.
//
// Returns normally only when err compares equal to cudaSuccess; otherwise
// prints the location, the CUDA error string and the call text to stderr and
// exits with status 1.
template<typename T>
void check(T err, const char* const func, const char* const file, const int line) {
    const bool failed = (err != cudaSuccess);
    if (!failed) {
        return;
    }
    std::cerr << "CUDA error at:" << file << ":" << line << std::endl
              << cudaGetErrorString(err) << " " << func << std::endl;
    exit(1);
}
// Image pre-processing: loads the input file and stages host and device buffers.
//
// inputImage  : out - host pointer to the RGBA pixels held by the global imageRGBA.
// greyImage   : out - host pointer to the pixels of the global imageGrey.
// d_rgbaImage : out - device buffer that receives a copy of the RGBA pixels.
// d_greyImage : out - device buffer for the greyscale result, zero-filled.
// filename    : path of the image to read.
//
// Exits with status 1 if the file cannot be read or the images are not stored
// contiguously. Any failing CUDA call also exits via checkCudaErrors.
// NOTE: the expressions inside checkCudaErrors(...) are echoed verbatim in its
// error output, so they are kept exactly as written.
void preProcess(uchar4 **inputImage, unsigned char **greyImage, uchar4 **d_rgbaImage,
                unsigned char **d_greyImage, const std::string &filename) {
    // NOTE(review): presumably forces CUDA context creation up front - confirm.
    checkCudaErrors(cudaFree(0));

    // Read the image from disk.
    cv::Mat bgrImage = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR);
    if (bgrImage.empty()) {
        std::cerr << "Couldn't open file:" << filename << std::endl;
        exit(1);
    }

    // OpenCV loads pixels as BGR; convert to RGBA into the global image.
    cv::cvtColor(bgrImage, imageRGBA, CV_BGR2RGBA);
    // Create a single-channel image with the same dimensions as the input.
    imageGrey.create(bgrImage.rows, bgrImage.cols, CV_8UC1);

    // The raw pointers and flat copies below treat each image as one
    // contiguous buffer, so refuse anything else.
    const bool contiguous = imageRGBA.isContinuous() && imageGrey.isContinuous();
    if (!contiguous) {
        std::cerr << "Images aren't continuous!! Exiting." << std::endl;
        exit(1);
    }

    // Hand the host-side pixel buffers back to the caller.
    *greyImage = imageGrey.ptr<unsigned char>(0);
    *inputImage = reinterpret_cast<uchar4 *>(imageRGBA.ptr<unsigned char>(0));

    // Allocate the device buffers.
    const size_t numPixels = numRows()*numCols();
    checkCudaErrors(cudaMalloc(d_rgbaImage, sizeof(uchar4)*numPixels));
    checkCudaErrors(cudaMalloc(d_greyImage, sizeof(unsigned char)*numPixels));
    // Zero the greyscale output buffer on the device.
    checkCudaErrors(cudaMemset(*d_greyImage, 0, numPixels * sizeof(unsigned char)));
    // Upload the RGBA pixels to the device.
    checkCudaErrors(cudaMemcpy(*d_rgbaImage, *inputImage, sizeof(uchar4)*numPixels, cudaMemcpyHostToDevice));

    // Remember the device buffers so cleanup() can release them.
    d_rgbaImage__ = *d_rgbaImage;
    d_greyImage__ = *d_greyImage;
}
// Converts an RGBA image to 8-bit greyscale.
//
// rgbaImage : device pointer to numRows*numCols pixels (x = R, y = G, z = B;
//             the w component is not read).
// greyImage : device pointer to numRows*numCols output bytes.
// numRows   : image height in pixels.
// numCols   : image width in pixels.
//
// Each output byte is 0.299*R + 0.587*G + 0.114*B, truncated to unsigned char.
//
// Launch layout: any grid/block shape is accepted. Every thread derives a flat
// global id from its full 3D block and thread coordinates and then walks the
// image in steps of the total thread count, so all pixels are converted even
// when the grid holds fewer threads than the image has pixels. For a 1D grid
// of 2D blocks the starting index is
// blockIdx.x*blockDim.x*blockDim.y + threadIdx.y*blockDim.x + threadIdx.x.
// No shared memory is used.
__global__
void rgba_to_greyscale(const uchar4* const rgbaImage, unsigned char* const greyImage, int numRows, int numCols) {
    if (numRows <= 0 || numCols <= 0) {
        return;
    }
    // Index arithmetic is done in size_t so large images cannot overflow int.
    const size_t numPixels = static_cast<size_t>(numRows) * static_cast<size_t>(numCols);

    const size_t threadsPerBlock = static_cast<size_t>(blockDim.x) * blockDim.y * blockDim.z;
    const size_t blockId = blockIdx.x
        + static_cast<size_t>(gridDim.x) * (blockIdx.y + static_cast<size_t>(gridDim.y) * blockIdx.z);
    const size_t localId = threadIdx.x
        + static_cast<size_t>(blockDim.x) * (threadIdx.y + static_cast<size_t>(blockDim.y) * threadIdx.z);
    const size_t totalThreads = threadsPerBlock * gridDim.x * gridDim.y * gridDim.z;

    for (size_t pixel = blockId * threadsPerBlock + localId; pixel < numPixels; pixel += totalThreads) {
        // Fetch the whole pixel once instead of three separate component reads.
        const uchar4 rgba = rgbaImage[pixel];
        greyImage[pixel] = .299f * rgba.x + .587f * rgba.y + .114f * rgba.z;
    }
}
// Writes the greyscale result to disk.
//
// output_file : destination path handed to cv::imwrite.
// data_ptr    : host buffer of numRows()*numCols() single-channel 8-bit
//               pixels; a cv::Mat header is constructed over it.
//
// A failed write is reported on stderr instead of being silently ignored; the
// function still returns normally so the caller can release its resources.
void postProcess(const std::string& output_file, unsigned char* data_ptr) {
    cv::Mat output(numRows(), numCols(), CV_8UC1, (void*)data_ptr);
    const bool written = cv::imwrite(output_file.c_str(), output);
    if (!written) {
        std::cerr << "Couldn't write file:" << output_file << std::endl;
    }
}
// Releases the device buffers recorded in d_rgbaImage__ and d_greyImage__.
//
// Both frees are always attempted; a failure is reported on stderr rather than
// ignored. The globals are reset afterwards so a second call does not free the
// same device pointers again.
void cleanup() {
    const cudaError_t rgbaStatus = cudaFree(d_rgbaImage__);
    if (rgbaStatus != cudaSuccess) {
        std::cerr << "cudaFree(d_rgbaImage__) failed: " << cudaGetErrorString(rgbaStatus) << std::endl;
    }
    d_rgbaImage__ = NULL;

    const cudaError_t greyStatus = cudaFree(d_greyImage__);
    if (greyStatus != cudaSuccess) {
        std::cerr << "cudaFree(d_greyImage__) failed: " << cudaGetErrorString(greyStatus) << std::endl;
    }
    d_greyImage__ = NULL;
}
// Entry point: converts an image to greyscale on the GPU and writes the result.
//
// Usage: <program> [input_image [output_image]]
// Arguments that are not supplied fall back to the built-in default paths.
// Returns 0 on completion; CUDA or image-loading failures exit with status 1
// from inside the helpers.
int main(int argc, char* argv[]) {
    // Default input path, overridden by argv[1] when present.
    std::string input_file = "E:/code/study_cuda/study_reduce/study_reduce/cinque_terre_small.jpg";
    // Default output path, overridden by argv[2] when present.
    std::string output_file = "E:/code/study_cuda/study_reduce/study_reduce/cinque_terre_small_togray.jpg";
    if (argc > 1) {
        input_file = argv[1];
    }
    if (argc > 2) {
        output_file = argv[2];
    }

    // Host (h_) and device (d_) views of the RGBA input.
    uchar4 *h_rgbaImage, *d_rgbaImage;
    // Host (h_) and device (d_) views of the greyscale output.
    unsigned char *h_greyImage, *d_greyImage;
    // Load the image, allocate the device buffers and upload the RGBA pixels.
    preProcess(&h_rgbaImage, &h_greyImage, &d_rgbaImage, &d_greyImage, input_file);

    // Launch configuration: a 1D grid of 16x16 blocks, one thread per pixel.
    const int threadsPerSide = 16;
    const size_t threadsPerBlock = static_cast<size_t>(threadsPerSide) * threadsPerSide;
    const size_t numPixels = numRows()*numCols();
    // Ceil-divide by the full block size (16*16 threads) so that a trailing
    // partial block is still launched and no pixels are left unconverted.
    const unsigned int numBlocks =
        static_cast<unsigned int>((numPixels + threadsPerBlock - 1) / threadsPerBlock);
    const dim3 blockSize(threadsPerSide, threadsPerSide);
    const dim3 gridSize(numBlocks);
    rgba_to_greyscale<<<gridSize, blockSize>>>(d_rgbaImage, d_greyImage,
                                               static_cast<int>(numRows()),
                                               static_cast<int>(numCols()));
    // A kernel launch returns no status; pick up launch-configuration errors here.
    checkCudaErrors(cudaGetLastError());
    // Wait for the kernel to finish and surface any error raised while it ran.
    checkCudaErrors(cudaDeviceSynchronize());

    // Copy the greyscale result back to the host.
    checkCudaErrors(cudaMemcpy(h_greyImage, d_greyImage, sizeof(unsigned char)*numPixels, cudaMemcpyDeviceToHost));
    // Write the output image.
    postProcess(output_file, h_greyImage);
    // Release the device buffers.
    cleanup();
    return 0;
}
// Study notes only: this file just records a learning exercise.