如果只是變量,如下即可實現全局變量:
兩種方法:
1.
// Method 1: move a single float between the host and a __device__ global,
// addressing the global by its symbol.
__device__ float devData;
float value = 3.14f;
// Host -> device: write `value` into the symbol, starting at byte offset 0.
cudaMemcpyToSymbol(devData, &value, sizeof(value), 0, cudaMemcpyHostToDevice);
// Device -> host: read the symbol's contents back into `value`.
cudaMemcpyFromSymbol(&value, devData, sizeof(value), 0, cudaMemcpyDeviceToHost);
// Note: the cudaError_t returned by both calls is discarded here.
2.
// Method 2: look up the device address behind the symbol, then write to it
// with an ordinary cudaMemcpy.
// `value` is not declared in this snippet; a host float with that name must
// already be in scope.
__device__ float myData;
float *dptr;
// Fill dptr with the device address of myData.
cudaGetSymbolAddress(reinterpret_cast<void **>(&dptr), myData);
// Host -> device: copy one float through the resolved address.
cudaMemcpy(dptr, &value, sizeof(float), cudaMemcpyHostToDevice);
// Device -> host: read the symbol back into `value`, starting at byte offset 0.
cudaMemcpyFromSymbol(&value, myData, sizeof(float), 0, cudaMemcpyDeviceToHost);
// Note: the cudaError_t returned by all three calls is discarded here.
如果是數組方式的話,
// Array case: Data2 is a __device__ pointer variable. It is pointed at a
// cudaMalloc'd buffer of 10 floats so that device code can index it as Data2[i].
__device__ float * Data2;
const size_t data2Bytes = sizeof(float)*10;
float * myptr = NULL,* h_ptr = NULL;
cudaError_t ret = cudaSuccess;
h_ptr = (float*)malloc(data2Bytes);
if(h_ptr == NULL){
    printf("host malloc of %zu bytes failed\n",data2Bytes);
}else{
    // Each step runs only if every previous step succeeded; the first
    // failure is kept in ret and reported below.
    ret = cudaMalloc((void **)&myptr,data2Bytes);
    // Zero the buffer so the values printed below are defined even though no
    // kernel has written to it yet.
    if(ret == cudaSuccess) ret = cudaMemset(myptr,0,data2Bytes);
    // Publish the buffer: only the pointer value (sizeof(float *)) is copied
    // into the device symbol, not the array contents.
    if(ret == cudaSuccess) ret = cudaMemcpyToSymbol(Data2,&myptr,sizeof(float *));
    // Kernels that read or write Data2[i] would be launched at this point.
    // Read the results back through myptr, which holds the buffer's device
    // address. Data2 must not be passed here: it is a __device__ symbol, and
    // naming it in host code does not yield the pointer stored on the device.
    if(ret == cudaSuccess) ret = cudaMemcpy(h_ptr,myptr,data2Bytes,cudaMemcpyDeviceToHost);
    if(ret != cudaSuccess){
        printf("CUDA error: %s\n",cudaGetErrorString(ret));
    }else{
        for(int i=0;i<10;i++){
            printf("%f --- \n",h_ptr[i]);
        }
    }
}
// Release both allocations; myptr is still NULL if cudaMalloc never ran or
// failed. After cudaFree the address stored in Data2 is stale and must not be
// dereferenced by device code.
cudaFree(myptr);
free(h_ptr);
也就是說,暫時看來,只能先申請一個設備內存塊,然後拷貝到全局內存;計算完成後,再把全局內存拷貝到設備內存塊,最後從設備內存塊拷貝到主機內存,非常麻煩……