#include "device_functions.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "stdlib.h"
#include <string>
#include <cassert>
#include <iostream>
using namespace std;
#include <opencv2\opencv.hpp>
#include <opencv2/core.hpp>
#include <opencv2/highgui/highgui.hpp>
using namespace cv;
#include<stdlib.h>
// Checks the status returned by a CUDA runtime call.
//
// err: status code of the call that was just made.
//
// On cudaSuccess this returns immediately. On any other status it prints the
// original failure message together with the runtime's description of the
// error and terminates the process, so callers never continue with an
// invalid device pointer.
void GetCudaCalError(cudaError err)
{
    if (err == cudaSuccess)
    {
        return;
    }
    // Diagnostics go to stderr; the CUDA error string tells which failure occurred.
    cerr << "分配內存失敗!程序結束!" << " (" << cudaGetErrorString(err) << ")" << endl;
    exit(EXIT_FAILURE);
}
// Prints basic information about the GPU and returns the block size to use.
//
// Queries the number of CUDA devices and the properties of device 0, prints
// the device count, the maximum number of threads per block and the maximum
// grid dimensions, and returns the maximum number of threads per block.
//
// The process is terminated with a message on stderr when no device is
// available or a query fails, because the properties structure would
// otherwise be read uninitialized and a garbage block size returned.
int getThreadNumZC()
{
    int count = 0; // number of CUDA devices
    cudaError_t status = cudaGetDeviceCount(&count);
    if (status != cudaSuccess)
    {
        cerr << "cudaGetDeviceCount failed: " << cudaGetErrorString(status) << endl;
        exit(EXIT_FAILURE);
    }
    std::cout << "gpu 的個數:" << count << '\n';
    if (count <= 0)
    {
        cerr << "No CUDA device available." << endl;
        exit(EXIT_FAILURE);
    }

    cudaDeviceProp prop;
    status = cudaGetDeviceProperties(&prop, 0); // second argument selects the device
    if (status != cudaSuccess)
    {
        cerr << "cudaGetDeviceProperties failed: " << cudaGetErrorString(status) << endl;
        exit(EXIT_FAILURE);
    }
    cout << "最大線程數:" << prop.maxThreadsPerBlock << endl;
    cout << "最大網格類型:" << prop.maxGridSize[0] << '\t' << prop.maxGridSize[1] << '\t' << prop.maxGridSize[2] << endl;
    return prop.maxThreadsPerBlock;
}
// Convolves an RGBA image with a square kernel, one thread per pixel.
//
// Launch layout: 1D grid of 1D blocks; the launch must provide at least
// imageRow * imageCol threads. Surplus threads return without doing work.
//
// d_image    : input pixels, row-major, imageRow * imageCol elements.
// d_kernel   : kernelSize * kernelSize weights, row-major.
// d_result   : output pixels, same layout as d_image. Must not be the same
//              buffer as d_image, since every thread reads neighbouring
//              input pixels that other threads would be overwriting.
// imageRow   : number of image rows.
// imageCol   : number of image columns.
// kernelSize : side length of the kernel window.
//
// Neighbours that fall outside the image contribute zero. The x/y/z channels
// are accumulated in float, rounded, saturated to [0, 255] and stored once;
// accumulating directly in the 8-bit output would truncate or wrap every
// partial sum and mishandle negative weights. The w channel is copied from
// the corresponding input pixel. The output is overwritten, so it does not
// need to be zeroed beforehand.
__global__ void convZC(uchar4*d_image, float*d_kernel, uchar4*d_result,int imageRow, int imageCol, int kernelSize)
{
    const int id = blockIdx.x*blockDim.x + threadIdx.x;
    if (id >= imageRow*imageCol)
    {
        return; // thread lies beyond the last pixel
    }

    const int row = id / imageCol;
    const int col = id % imageCol;
    const int radius = kernelSize / 2;

    float3 acc = { 0.0f, 0.0f, 0.0f };
    for (int i = 0; i < kernelSize; ++i)
    {
        const int curRow = row - radius + i;
        if (curRow < 0 || curRow >= imageRow)
        {
            continue; // whole kernel row is outside the image
        }
        for (int j = 0; j < kernelSize; ++j)
        {
            const int curCol = col - radius + j;
            if (curCol < 0 || curCol >= imageCol)
            {
                continue; // outside the image: contributes zero
            }
            const uchar4 px = d_image[curRow*imageCol + curCol];
            const float w = d_kernel[i*kernelSize + j];
            acc.x += w * px.x;
            acc.y += w * px.y;
            acc.z += w * px.z;
        }
    }

    uchar4 out;
    // Clamp first, then add 0.5 so the truncating cast rounds to nearest.
    out.x = static_cast<unsigned char>(fminf(fmaxf(acc.x, 0.0f), 255.0f) + 0.5f);
    out.y = static_cast<unsigned char>(fminf(fmaxf(acc.y, 0.0f), 255.0f) + 0.5f);
    out.z = static_cast<unsigned char>(fminf(fmaxf(acc.z, 0.0f), 255.0f) + 0.5f);
    out.w = d_image[id].w;
    d_result[id] = out;
}
// Blurs an RGBA image with a weighted window average, one thread per pixel.
//
// Launch layout: 1D grid of 1D blocks; the launch must provide at least
// imageRow * imageCol threads. Surplus threads return without doing work.
//
// d_result       : input pixels, row-major, imageRow * imageCol elements.
// d_kernel       : kernelSize * kernelSize weights, row-major.
// d_result_image : output pixels, same layout as the input. Must not be the
//                  same buffer as d_result, since every thread reads
//                  neighbouring input pixels.
// imageRow       : number of image rows.
// imageCol       : number of image columns.
// kernelSize     : side length of the window; must be positive.
//
// For each of x/y/z the weighted sum over the window is divided by
// kernelSize * kernelSize (not by the sum of the weights, so the result is a
// true mean only when every weight is 1). Neighbours outside the image
// contribute zero. The sum is kept in float, rounded and saturated to
// [0, 255], then stored once: accumulating in the 8-bit output would overflow
// before the division, and would also depend on whatever the output buffer
// contained beforehand. The w channel is copied from the input pixel.
__global__ void avgImage(uchar4*d_result, float*d_kernel, uchar4*d_result_image,int imageRow, int imageCol, int kernelSize)
{
    const int id = blockIdx.x*blockDim.x + threadIdx.x;
    if (id >= imageRow*imageCol)
    {
        return; // thread lies beyond the last pixel
    }

    const int row = id / imageCol;
    const int col = id % imageCol;
    const int radius = kernelSize / 2;

    float3 acc = { 0.0f, 0.0f, 0.0f };
    for (int i = 0; i < kernelSize; ++i)
    {
        const int curRow = row - radius + i;
        if (curRow < 0 || curRow >= imageRow)
        {
            continue; // whole kernel row is outside the image
        }
        for (int j = 0; j < kernelSize; ++j)
        {
            const int curCol = col - radius + j;
            if (curCol < 0 || curCol >= imageCol)
            {
                continue; // outside the image: contributes zero
            }
            const uchar4 px = d_result[curRow*imageCol + curCol];
            const float w = d_kernel[i*kernelSize + j];
            acc.x += w * px.x;
            acc.y += w * px.y;
            acc.z += w * px.z;
        }
    }

    // Normalise by the window area, as the original integer division did.
    const float area = static_cast<float>(kernelSize * kernelSize);
    uchar4 out;
    // Clamp first, then add 0.5 so the truncating cast rounds to nearest.
    out.x = static_cast<unsigned char>(fminf(fmaxf(acc.x / area, 0.0f), 255.0f) + 0.5f);
    out.y = static_cast<unsigned char>(fminf(fmaxf(acc.y / area, 0.0f), 255.0f) + 0.5f);
    out.z = static_cast<unsigned char>(fminf(fmaxf(acc.z / area, 0.0f), 255.0f) + 0.5f);
    out.w = d_result[id].w;
    d_result_image[id] = out;
}
// Saves an RGBA pixel buffer as an image file and displays it.
//
// filename : file name appended to the fixed output directory below.
// Image    : host buffer of imageRow * imageClo RGBA pixels; it is wrapped by
//            a Mat header without copying, so it must stay valid for the
//            duration of the call.
// imageRow : number of rows.
// imageClo : number of columns.
//
// The buffer is converted from RGBA to BGR, written to disk, shown in the
// "convImage" window, and the call blocks until a key is pressed. A null or
// empty image is reported and skipped; a failed write is reported on stderr
// but the image is still displayed.
void showImageZC(string filename,uchar4 *Image, int imageRow, int imageClo)
{
    if (Image == NULL || imageRow <= 0 || imageClo <= 0)
    {
        cerr << "showImageZC: no image data for " << filename << endl;
        return;
    }

    // Wrap the raw array in a Mat header (no copy) and convert for OpenCV output.
    cv::Mat outImage(imageRow, imageClo, CV_8UC4, (void*)Image);
    cv::Mat outImageBGR;
    cv::cvtColor(outImage, outImageBGR, CV_RGBA2BGR);

    string file = "E:\\ZC\\procedure\\CUDA\\Images\\";
    file += filename;
    if (!cv::imwrite(file.c_str(), outImageBGR))
    {
        cerr << "showImageZC: failed to write " << file << endl;
    }

    // Show the processed image and wait for a key press.
    imshow("convImage", outImageBGR);
    waitKey(0);
}
// Terminates the program with a readable message when a CUDA runtime call
// made by main() fails. `what` names the call for the diagnostic.
static void checkCudaZC(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        cerr << what << ": " << cudaGetErrorString(status) << endl;
        exit(1);
    }
}

// Loads an image, runs the convolution kernel and then the averaging kernel
// on the GPU, and saves/displays both results.
//
// Returns 0 on success; exits with status 1 when the image cannot be read or
// any CUDA call fails.
int main()
{
    const string input_file = "E:\\ZC\\procedure\\CUDA\\Images\\1.png";
    const int kernelSize = 3;
    const int kernelCount = kernelSize * kernelSize;

    // Read the image as BGR.
    Mat imageBGR = cv::imread(input_file.c_str(), CV_LOAD_IMAGE_COLOR);
    if (imageBGR.empty())
    {
        cerr << "讀取照片失敗:" << input_file << endl;
        exit(1);
    }

    // Convert to 4-channel RGBA so that each pixel maps onto one uchar4.
    Mat imageRGB;
    cv::cvtColor(imageBGR, imageRGB, CV_BGR2RGBA);
    if (!imageRGB.isContinuous())
    {
        // The flat copy below needs the rows to be stored back to back.
        imageRGB = imageRGB.clone();
    }
    uchar4 *h_image = (uchar4*)imageRGB.ptr<unsigned char>(0);

    const int imageRow = imageRGB.rows;
    const int imageCol = imageRGB.cols;
    const int size = imageCol * imageRow;
    const size_t imageBytes = static_cast<size_t>(size) * sizeof(uchar4);
    const size_t kernelBytes = static_cast<size_t>(kernelCount) * sizeof(float);

    // Host buffers: kernel weights and the two result images.
    float *h_kernel = new float[kernelCount];
    uchar4 *h_result = new uchar4[size];
    uchar4 *h_avgImage = new uchar4[size];

    // Device buffers.
    uchar4 *d_image = NULL;
    uchar4 *d_result = NULL;
    uchar4 *d_avgImage = NULL;
    float *d_kernel = NULL;
    checkCudaZC(cudaMalloc(&d_image, imageBytes), "cudaMalloc(d_image)");
    checkCudaZC(cudaMalloc(&d_kernel, kernelBytes), "cudaMalloc(d_kernel)");
    checkCudaZC(cudaMalloc(&d_result, imageBytes), "cudaMalloc(d_result)");
    checkCudaZC(cudaMalloc(&d_avgImage, imageBytes), "cudaMalloc(d_avgImage)");

    // Both kernels may accumulate into their output buffer, so start from zero.
    checkCudaZC(cudaMemset(d_result, 0, imageBytes), "cudaMemset(d_result)");
    checkCudaZC(cudaMemset(d_avgImage, 0, imageBytes), "cudaMemset(d_avgImage)");

    checkCudaZC(cudaMemcpy(d_image, h_image, imageBytes, cudaMemcpyHostToDevice),
                "cudaMemcpy(d_image)");

    // Convolution weights: each kernel row is -1, 0, 1.
    for (int i = 0; i < kernelCount; ++i)
    {
        h_kernel[i] = static_cast<float>(i % kernelSize - 1);
    }
    checkCudaZC(cudaMemcpy(d_kernel, h_kernel, kernelBytes, cudaMemcpyHostToDevice),
                "cudaMemcpy(d_kernel)");

    // One thread per pixel, rounded up to whole blocks.
    const int threadNum = getThreadNumZC();
    if (threadNum <= 0)
    {
        cerr << "Invalid thread count: " << threadNum << endl;
        exit(1);
    }
    const int blockNum = (size + threadNum - 1) / threadNum;

    convZC<<<blockNum, threadNum>>>(d_image, d_kernel, d_result, imageRow, imageCol, kernelSize);
    checkCudaZC(cudaGetLastError(), "convZC launch");
    checkCudaZC(cudaDeviceSynchronize(), "convZC execution");
    checkCudaZC(cudaMemcpy(h_result, d_result, imageBytes, cudaMemcpyDeviceToHost),
                "cudaMemcpy(h_result)");

    // Weights for the second pass.
    // NOTE(review): these are random values in {0, 1, 2}, while avgImage
    // divides by kernelSize * kernelSize; a true mean filter would use
    // all-ones weights. Confirm the intent.
    for (int i = 0; i < kernelCount; ++i)
    {
        h_kernel[i] = static_cast<float>(rand() % 3);
    }
    checkCudaZC(cudaMemcpy(d_kernel, h_kernel, kernelBytes, cudaMemcpyHostToDevice),
                "cudaMemcpy(d_kernel)");

    avgImage<<<blockNum, threadNum>>>(d_result, d_kernel, d_avgImage, imageRow, imageCol, kernelSize);
    checkCudaZC(cudaGetLastError(), "avgImage launch");
    checkCudaZC(cudaDeviceSynchronize(), "avgImage execution");
    checkCudaZC(cudaMemcpy(h_avgImage, d_avgImage, imageBytes, cudaMemcpyDeviceToHost),
                "cudaMemcpy(h_avgImage)");

    string name1 = "convTest2.png";
    string name2 = "avgTest2.png";
    showImageZC(name1, h_result, imageRow, imageCol);
    showImageZC(name2, h_avgImage, imageRow, imageCol);

    // Release every device and host buffer allocated above.
    cudaFree(d_image);
    cudaFree(d_result);
    cudaFree(d_avgImage);
    cudaFree(d_kernel);
    delete[] h_kernel;
    delete[] h_result;
    delete[] h_avgImage;
    return 0;
}
/*
不足之處:
1、卷積和均值池化都不支持設置步長(stride)。
2、均值池化沒有降採樣:輸出的像素個數與輸入相同。
3、這個並行化程序的並行化程度還可以再提高(例如各通道的計算也可以並行)。
4、代碼不方便遷移(例如文件路徑是硬編碼的)。
*/