最近在寫一個多GPU編程的程序,按照傳統的寫法寫了之後,程序一直卡在覈函數運行之後,沒有任何輸出
// includes, project
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cufft.h>
#include <cuComplex.h>
//CUFFT Header file
#include <cufftXt.h>
#define NX 9
#define NY 10
#define NZ 10
#define NZ2 (NZ/2+1)
#define NN (NX*NY*NZ)
#define L (2*M_PI)
#define TX 8
#define TY 8
#define TZ 8
#define MAXGPU 16
__global__
void initialize(int NX_per_GPU, cufftComplex *f1,int Nelements,int offset)
{
int index = blockDim.x * blockIdx.x + threadIdx.x;
if(index<Nelements){
f1[index].x = index+offset;
f1[index].y = 0.0;
}
}
int main (void)
{
int i, j, k, idx, NX_per_GPU[MAXGPU];
// Set GPU's to use and list device properties
int nGPUs = 2, deviceNum[MAXGPU];
for(i = 0; i<nGPUs; ++i)
{
deviceNum[i] = i;
cudaSetDevice(deviceNum[i]);
printf("set id num : %d \n",deviceNum[i]);
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, deviceNum[i]);
printf(" Device name: %s\n", prop.name);
printf(" Memory Clock Rate (KHz): %d\n",
prop.memoryClockRate);
printf(" Memory Bus Width (bits): %d\n",
prop.memoryBusWidth);
printf(" Peak Memory Bandwidth (GB/s): %f\n\n",
2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
}
printf("Running Multi_GPU_FFT_check using %d GPUs on a %dx%dx%d grid.\n",nGPUs,NX,NY,NZ);
// divide data
for(i=0;i<nGPUs-1;i++)
NX_per_GPU[i]=NX/nGPUs;
NX_per_GPU[i]=NX - NX_per_GPU[0]*i;
printf(" divide data : %d %d \n",NX_per_GPU[0],NX_per_GPU[1]);
// Declare variables
cufftComplex *u;
// Allocate memory for arrays
cudaMalloc(&u, sizeof(cufftComplex)*NN );
cufftComplex *data_cpu = (cufftComplex *)malloc(sizeof(cufftComplex)*NN);
// Launch CUDA kernel to initialize velocity field
int offset=0;
//nGPUs=1;
for (i = 0; i<nGPUs; ++i){
cudaSetDevice(deviceNum[i]);
int threadsPerBlock = 256;
int blocksPerGrid =(NX_per_GPU[i]*NY*NZ + threadsPerBlock - 1) / threadsPerBlock;
initialize<<<blocksPerGrid, threadsPerBlock>>>(NX_per_GPU[i], &u[offset],NX_per_GPU[i]*NY*NZ,offset);
offset+=NX_per_GPU[i]*NY*NZ;
}
/*
for (i = 0; i<nGPUs; ++i){
cudaSetDevice(deviceNum[i]);
cudaDeviceSynchronize();
}*/
offset=0;
for(i = 0; i<nGPUs; ++i)
{
cudaSetDevice(deviceNum[i]);
cudaMemcpy(data_cpu,u,sizeof(cufftComplex)*NX*NY*NZ,cudaMemcpyDeviceToHost);
}
for(int i=80;i<80+10;i++)
printf("%f ",data_cpu[i].x);
printf("\n");
for(int i=800;i<800+10;i++)
printf("%f ",data_cpu[i].x);
printf("\n");
return 0;
}
通過分析發現,原因在於cudaMalloc聲明的內存是在一張卡上的,按照邏輯來說另外一張卡想要訪問這塊內存,就需要通過pcie進行數據傳輸可能由於自己服務器的配置問題,一直卡在覈函數的調用,就一直在運行了。
爲了讓程序正確有兩個方案可以修改這段代碼:
1.兩塊在不同卡上的數據分別在每個卡上進行malloc,這樣核函數調用時可以不受干擾
2.將cudaMalloc語句換成cudaMallocManaged, 即統一內存管理,這樣兩個卡也都能讀取到這個數字,使用offset的方式就可以訪問。