// cudaimprocess.h
// CUDA header file
#include <cuda_runtime.h>
#include <cstdio>
#include <cstring>
#include <iostream>
// Point-cloud buffers mirrored between host and device.
// Renamed from CoreDataGPU: every other definition in this file uses the
// type name PointGpu and the field names host_*/device_*, so the original
// declaration could never have compiled against them.
struct PointGpu
{
	// host-side mirrors, allocated pinned via cudaMallocHost (see initPointGPU)
	float *host_X;
	float *host_Y;
	float *host_Z;
	// device-side working arrays, allocated via cudaMalloc
	float *device_X;
	float *device_Y;
	float *device_Z;
};
// Allocates the pinned PointGpu descriptor plus pinned-host and device
// arrays of `arraySize` floats for X/Y/Z; *_HostPointGPU receives the
// descriptor address (see the definition in the .cu section).
void initPointGPU(PointGpu **_HostPointGPU,int arraySize);
// Frees all host/device buffers owned by the PointGpu, then the pinned
// descriptor itself; the reference is dangling afterwards.
void freePointGPU(PointGpu &_HostPointGPU);
// Core kernel: writes constant values into the device X/Y/Z arrays
// for indices below arraySize.
__global__ void addPoint(PointGpu* _PointGPU,int arraySize);
// Test one: exercises the struct above end-to-end (alloc, kernel, download, free).
void PointGpuMethod_Test();
// Test two: a host-side array of PointGpu pointers, each element
// allocated and processed independently.
//
void PointGpuMethod_dim1();
// cudaimprocess.cu — implementation section
///////////////////////
// Allocates all storage for one PointGpu:
//  - the descriptor itself in pinned host memory,
//  - pinned host arrays of `arraySize` floats for X/Y/Z (pinned memory makes
//    later device->host copies faster and async-capable),
//  - matching device arrays of `arraySize` floats for X/Y/Z.
// _HostPointGPU receives the descriptor address. Every allocation's return
// code is now checked and reported instead of being silently discarded.
void initPointGPU(PointGpu **_HostPointGPU, int arraySize)
{
	// Report-and-continue check: log a failed allocation with its cause.
	auto check = [](cudaError_t e, const char *what) {
		if (e != cudaSuccess)
			fprintf(stderr, "initPointGPU: %s failed: %s\n", what, cudaGetErrorString(e));
	};
	const size_t bytes = sizeof(float) * (size_t)arraySize;
	// Pin the descriptor itself so the whole struct can take part in fast copies.
	check(cudaMallocHost((void**)(_HostPointGPU), sizeof(PointGpu)), "cudaMallocHost(descriptor)");
	// Pinned host mirrors: destinations for downloads from the device arrays.
	check(cudaMallocHost((void**)&(*_HostPointGPU)->host_X, bytes), "cudaMallocHost(host_X)");
	check(cudaMallocHost((void**)&(*_HostPointGPU)->host_Y, bytes), "cudaMallocHost(host_Y)");
	check(cudaMallocHost((void**)&(*_HostPointGPU)->host_Z, bytes), "cudaMallocHost(host_Z)");
	// Device-side working arrays written by the addPoint kernel.
	check(cudaMalloc((void**)&(*_HostPointGPU)->device_X, bytes), "cudaMalloc(device_X)");
	check(cudaMalloc((void**)&(*_HostPointGPU)->device_Y, bytes), "cudaMalloc(device_Y)");
	check(cudaMalloc((void**)&(*_HostPointGPU)->device_Z, bytes), "cudaMalloc(device_Z)");
}
// Releases everything initPointGPU allocated: the pinned host arrays, the
// device arrays, and finally the pinned descriptor itself. The reference is
// dangling after this call and must not be used again. Each free's return
// code is now checked so a sticky earlier CUDA error becomes visible here.
void freePointGPU(PointGpu &_HostPointGPU)
{
	// Report-and-continue check so one failed free does not hide the rest.
	auto check = [](cudaError_t e, const char *what) {
		if (e != cudaSuccess)
			fprintf(stderr, "freePointGPU: %s failed: %s\n", what, cudaGetErrorString(e));
	};
	check(cudaFreeHost(_HostPointGPU.host_X), "cudaFreeHost(host_X)");
	check(cudaFreeHost(_HostPointGPU.host_Y), "cudaFreeHost(host_Y)");
	check(cudaFreeHost(_HostPointGPU.host_Z), "cudaFreeHost(host_Z)");
	check(cudaFree(_HostPointGPU.device_X), "cudaFree(device_X)");
	check(cudaFree(_HostPointGPU.device_Y), "cudaFree(device_Y)");
	check(cudaFree(_HostPointGPU.device_Z), "cudaFree(device_Z)");
	// The descriptor was allocated with cudaMallocHost, so release it the same way.
	check(cudaFreeHost(&_HostPointGPU), "cudaFreeHost(descriptor)");
}
// Kernel: stamps constant values into the device X/Y/Z arrays (no real
// computation — this just demonstrates device-side access to the struct).
// Grid/block layout: any 1-D launch. The index now includes blockIdx, so the
// kernel is also correct for multi-block launches; with the single-block
// launches used in this file the behavior is unchanged (blockIdx.x == 0).
__global__ void addPoint(PointGpu *_PointGPU, int arraySize)
{
	int tid = blockIdx.x * blockDim.x + threadIdx.x;
	// Bounds guard: the grid rarely divides the data exactly.
	if (_PointGPU != nullptr && tid < arraySize)
	{
		_PointGPU->device_X[tid] = 9.0f;   // f-suffixed: avoid double->float conversion
		_PointGPU->device_Y[tid] = 8.0f;
		_PointGPU->device_Z[tid] = 10.0f;
	}
}
// Demo driver: allocate a PointGpu, fill its device arrays on the GPU,
// download the results through the pinned host mirrors, print them, and
// free everything. Adds the missing cudaGetLastError() check after the
// launch and drops the memsets that duplicated the zero-initializers.
void PointGpuMethod_Test()
{
	const int arraySize = 10;
	// Aggregate initializers already zero the whole arrays.
	float X[arraySize] = { 0 };
	float Y[arraySize] = { 0 };
	float Z[arraySize] = { 0 };
	PointGpu *source = nullptr;
	initPointGPU(&source, arraySize);            // allocate host + device buffers
	addPoint<<<1, arraySize>>>(source, arraySize);
	// A bad launch configuration only surfaces through cudaGetLastError().
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess)
		fprintf(stderr, "addPoint launch failed: %s\n", cudaGetErrorString(err));
	cudaDeviceSynchronize();                     // wait for the kernel to finish
	// Download device results into the pinned host mirrors.
	cudaMemcpy(source->host_Z, source->device_Z, arraySize * sizeof(float), cudaMemcpyDeviceToHost);
	cudaMemcpy(source->host_Y, source->device_Y, arraySize * sizeof(float), cudaMemcpyDeviceToHost);
	cudaMemcpy(source->host_X, source->device_X, arraySize * sizeof(float), cudaMemcpyDeviceToHost);
	// Copy from the pinned mirrors into plain stack arrays for printing.
	memcpy(Z, source->host_Z, sizeof(float) * arraySize);
	memcpy(Y, source->host_Y, sizeof(float) * arraySize);
	memcpy(X, source->host_X, sizeof(float) * arraySize);
	std::cout << "GPU下載的數據:" << std::endl;
	for (int i = 0; i < arraySize; i++)
	{
		std::cout << Z[i] << " " << X[i] << " " << Y[i] << " | ";
	}
	std::cout << std::endl;
	freePointGPU(*source);
}
// Test two: a host-side array of BUfferNumber independent PointGpu objects,
// each with its own pinned/device buffers. Writes element 0 of each on the
// GPU, downloads one float per axis, prints it, then frees everything.
// Removes the completely unused X/Y/Z arrays and adds a launch error check.
void PointGpuMethod_dim1()
{
	const int BUfferNumber = 10;
	// Array of pointers; each entry gets its own allocation below.
	PointGpu **source = new PointGpu *[BUfferNumber];
	for (int i = 0; i < BUfferNumber; i++)
	{
		// Large arrays (110000 floats each) to exercise repeated big allocations.
		initPointGPU(source + i, 110000);
	}
	for (int i = 0; i < BUfferNumber; i++)
	{
		// arraySize == 1: the kernel's bounds guard means only index 0 is written.
		addPoint<<<1, 10>>>(source[i], 1);
		cudaError_t err = cudaGetLastError();    // catch launch-config errors
		if (err != cudaSuccess)
			fprintf(stderr, "addPoint launch failed: %s\n", cudaGetErrorString(err));
		cudaDeviceSynchronize();
		// Download one float per axis (only element 0 was written).
		float buffer[3] = { 0 };
		cudaMemcpy(&buffer[0], source[i]->device_X, sizeof(float), cudaMemcpyDeviceToHost);
		cudaMemcpy(&buffer[1], source[i]->device_Y, sizeof(float), cudaMemcpyDeviceToHost);
		cudaMemcpy(&buffer[2], source[i]->device_Z, sizeof(float), cudaMemcpyDeviceToHost);
		std::cout << buffer[0] << " " << buffer[1] << " " << buffer[2] << " " << std::endl;
	}
	// Release every per-element allocation, then the pointer array itself.
	for (int i = 0; i < BUfferNumber; i++)
	{
		freePointGPU(*source[i]);
	}
	delete[] source;
}
/// Tested: contiguous GPU memory allocation and use as above are stable.
/// This confirms the basic custom-struct pattern on CUDA; further variants can build on it.