第三章
我们来做个实验。
还是两个二维矩阵的相加,在CUDA上执行。
代码如下
// 这段代码只能在Windows下运行,Linux请自行修改计时的模块
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <device_launch_parameters.h>
#include <Windows.h>
/*
 * Absolute value of a.
 * Both the argument and the whole expansion are parenthesised so that the
 * macro keeps its meaning inside a larger expression such as
 * abs(x - y) > eps. The argument is evaluated twice, so it must not have
 * side effects.
 */
#define abs(a) ((a) > 0 ? (a) : -(a))
/*
 * Evaluates a CUDA runtime call once. If the call does not return
 * cudaSuccess, prints the source location, the numeric error code and the
 * runtime's description of the error, then terminates the process with
 * exit status -10 * error.
 *
 * The do { ... } while (0) wrapper makes "CHECK(x);" a single statement, so
 * it can be used safely as the body of an unbraced if/else.
 */
#define CHECK(call) \
do \
{ \
    const cudaError_t error = (call); \
    if (error != cudaSuccess) \
    { \
        printf("Error: %s:%d, ", __FILE__, __LINE__); \
        printf("code:%d, reason: %s\n", (int)error, cudaGetErrorString(error)); \
        exit(-10 * (int)error); \
    } \
} while (0)
/*
* This example implements matrix element-wise addition on the host and GPU.
* sumMatrixOnHost iterates over the rows and columns of each matrix, adding
* elements from A and B together and storing the results in C. The current
* offset in each matrix is stored using pointer arithmetic. sumMatrixOnGPU2D
* implements the same logic, but using CUDA threads to process each matrix.
*/
/*
 * Fills the first `size` elements of `ip` with pseudo-random values.
 * Each value is the low byte of rand() divided by 10, i.e. one of
 * 0.0, 0.1, ..., 25.5. Nothing is written when size <= 0.
 */
void initialData(float *ip, const int size)
{
    float *out = ip;
    int remaining = size;
    while (remaining > 0)
    {
        const int lowByte = rand() & 0xFF;
        *out = (float)lowByte / 10.0f;
        ++out;
        --remaining;
    }
}
/*
 * Host reference implementation of element-wise matrix addition.
 * A, B and C are treated as ny consecutive rows of nx floats each;
 * for every element, C = A + B.
 */
void sumMatrixOnHost(float *A, float *B, float *C, const int nx, const int ny)
{
    for (int row = 0; row < ny; ++row)
    {
        // Offset of the first element of this row in all three matrices.
        const long long rowStart = (long long)row * nx;
        const float *lhs = A + rowStart;
        const float *rhs = B + rowStart;
        float *dst = C + rowStart;
        for (int col = 0; col < nx; ++col)
        {
            dst[col] = lhs[col] + rhs[col];
        }
    }
}
/*
 * Compares the host reference result with the GPU result element by element.
 *
 * hostRef, gpuRef: arrays of at least N floats.
 * N:               number of elements to compare.
 *
 * On the first element whose absolute difference exceeds epsilon, prints
 * both values and a mismatch message, then stops. Prints nothing when all
 * N elements agree.
 */
void checkResult(float *hostRef, float *gpuRef, const int N)
{
    const double epsilon = 1.0E-8;
    for (int i = 0; i < N; i++)
    {
        // The magnitude is computed explicitly so the comparison is correct
        // for both signs of the difference and does not depend on how any
        // abs macro in the file happens to expand.
        double diff = (double)hostRef[i] - (double)gpuRef[i];
        if (diff < 0.0)
        {
            diff = -diff;
        }
        if (diff > epsilon)
        {
            printf("host %f gpu %f ", hostRef[i], gpuRef[i]);
            printf("Arrays do not match.\n\n");
            break;
        }
    }
}
// Element-wise matrix addition C = A + B on the device.
//
// Launch layout: 2D grid of 2D blocks, one thread per matrix element.
// The thread's global x index selects the column (0..NX-1) and its global
// y index selects the row (0..NY-1); the element lives at iy * NX + ix.
// Threads that fall outside the NX x NY matrix do nothing, so the grid may
// be rounded up past the matrix edge.
//
// Indices are held in 64-bit signed integers so that the bounds check
// against the signed NX/NY is a plain signed comparison (a non-positive
// dimension disables all threads) and the flat index cannot wrap around
// 32 bits for large matrices.
__global__ void sumMatrixOnGPU2D(float *A, float *B, float *C, int NX, int NY)
{
    const long long ix = (long long)blockIdx.x * blockDim.x + threadIdx.x;
    const long long iy = (long long)blockIdx.y * blockDim.y + threadIdx.y;
    if (ix < NX && iy < NY)
    {
        const size_t idx = (size_t)iy * (size_t)NX + (size_t)ix;
        C[idx] = A[idx] + B[idx];
    }
}
// Converts the interval between two performance-counter readings to seconds.
static double elapsedSeconds(LARGE_INTEGER start, LARGE_INTEGER stop, LARGE_INTEGER frequency)
{
    return (stop.QuadPart - start.QuadPart) * 1.0 / frequency.QuadPart;
}

/*
 * Adds two 16384 x 16384 float matrices on the host and on the GPU, times
 * each stage with the Windows performance counter, and compares the results.
 *
 * Usage: program [blockDimX blockDimY]
 * Both block dimensions default to 32. The defaults are replaced only when
 * both arguments are given.
 *
 * Returns EXIT_SUCCESS on completion, EXIT_FAILURE when the requested block
 * shape is not launchable on the device or host memory cannot be allocated.
 * CUDA runtime failures terminate the process through CHECK.
 */
int main(int argc, char **argv)
{
    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    CHECK(cudaSetDevice(dev));

    // Block shape. Validate it before doing any expensive work: a zero
    // dimension would divide by zero in the grid computation below, and an
    // oversized block would make the launch fail.
    int dimx = 32;
    int dimy = 32;
    if (argc > 2)
    {
        dimx = atoi(argv[1]);
        dimy = atoi(argv[2]);
    }
    if (dimx <= 0 || dimy <= 0 ||
        dimx > deviceProp.maxThreadsDim[0] ||
        dimy > deviceProp.maxThreadsDim[1] ||
        (long long)dimx * dimy > deviceProp.maxThreadsPerBlock)
    {
        fprintf(stderr,
                "Invalid block dimensions (%d,%d): each must be positive and "
                "their product must not exceed %d threads per block\n",
                dimx, dimy, deviceProp.maxThreadsPerBlock);
        return EXIT_FAILURE;
    }

    // set up data size of matrix
    int nx = 1 << 14;
    int ny = 1 << 14;
    int nxy = nx * ny;
    // Byte count is kept in size_t, the type every allocation/copy call takes.
    size_t nBytes = (size_t)nxy * sizeof(float);

    // malloc host memory
    float *h_A = (float *)malloc(nBytes);
    float *h_B = (float *)malloc(nBytes);
    float *hostRef = (float *)malloc(nBytes);
    float *gpuRef = (float *)malloc(nBytes);
    if (h_A == NULL || h_B == NULL || hostRef == NULL || gpuRef == NULL)
    {
        fprintf(stderr, "Failed to allocate %llu bytes of host memory per matrix\n",
                (unsigned long long)nBytes);
        // free(NULL) is a no-op, so releasing all four is safe.
        free(h_A);
        free(h_B);
        free(hostRef);
        free(gpuRef);
        return EXIT_FAILURE;
    }

    // initialize data at host side
    LARGE_INTEGER ta, tb, tc;
    QueryPerformanceFrequency(&tc);
    QueryPerformanceCounter(&ta);
    printf("Generating random array...\n");
    initialData(h_A, nxy);
    initialData(h_B, nxy);
    QueryPerformanceCounter(&tb);
    printf("Generated.\n");
    printf("initialData Time elapsed %f sec\n", elapsedSeconds(ta, tb, tc));

    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // add matrix at host side for result checks
    QueryPerformanceCounter(&ta);
    sumMatrixOnHost(h_A, h_B, hostRef, nx, ny);
    QueryPerformanceCounter(&tb);
    printf("sumMatrixOnHost Time elapsed %f sec\n", elapsedSeconds(ta, tb, tc));

    // malloc device global memory
    float *d_MatA, *d_MatB, *d_MatC;
    CHECK(cudaMalloc((void **)&d_MatA, nBytes));
    CHECK(cudaMalloc((void **)&d_MatB, nBytes));
    CHECK(cudaMalloc((void **)&d_MatC, nBytes));

    // transfer data from host to device
    CHECK(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice));

    // launch configuration: grid is rounded up so it covers the whole matrix
    dim3 block(dimx, dimy);
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);

    // execute the kernel; the synchronize before the first timestamp keeps
    // earlier device work out of the measured interval
    CHECK(cudaDeviceSynchronize());
    QueryPerformanceCounter(&ta);
    sumMatrixOnGPU2D<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny);
    // Check the launch itself before waiting, so that a rejected launch is
    // reported instead of a timing for a kernel that never ran.
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());
    QueryPerformanceCounter(&tb);
    printf("sumMatrixOnGPU2D <<<(%d,%d),(%d,%d)>>> Time elapsed %f sec\n", grid.x,grid.y,block.x,block.y, (tb.QuadPart - ta.QuadPart)*1.0 / tc.QuadPart);

    // copy kernel result back to host side
    CHECK(cudaMemcpy(gpuRef, d_MatC, nBytes, cudaMemcpyDeviceToHost));

    // check device results
    checkResult(hostRef, gpuRef, nxy);

    // free device global memory
    CHECK(cudaFree(d_MatA));
    CHECK(cudaFree(d_MatB));
    CHECK(cudaFree(d_MatC));

    // free host memory
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);

    // reset device
    CHECK(cudaDeviceReset());
    return EXIT_SUCCESS;
}
笔者的电脑配置前面也介绍过,是NVIDIA GTX 1050的显卡。
这个是书上给的结果,用于告诉我们更多的线程块有更好的并行性(512*1024的块数量性能优于512*512的)。在笔者的电脑运行的结果如下(Debug模式):
可以看到,块的尺寸为32*32的时候是最慢的,而16*32、16*16、32*16三种配置的效率大致相同。这与书上各配置之间差异较为显著的结果不同。而且在Release模式下,四种配置的速度几乎没有差别。
使用nvprof --metrics achieved_occupancy <program.exe> [param1] [param2] … 命令可以得到如下数据:
可以看到,第四种情况有着最高的线程束占用率,但速度却是第二慢的。所以我们仍然可以得出结论:占用率高并不一定带来最快的运行速度,性能还与其他因素有关。