CUDA Learning, Chapter 3 (6): Parallelism Experiment (1)

Chapter 3

Let's run an experiment.

Once again we add two 2D matrices together, executed on the GPU with CUDA.

The code is as follows:

// This code only runs on Windows; on Linux, replace the timing code with your own
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <device_launch_parameters.h>
#include <Windows.h>
// absolute value, fully parenthesized so the macro expands safely inside larger expressions
#define abs(a) ((a) > 0 ? (a) : -(a))
#define CHECK(call) \
{ \
    const cudaError_t error = call; \
    if (error != cudaSuccess) \
    { \
        printf("Error: %s:%d, ", __FILE__, __LINE__); \
        printf("code:%d, reason: %s\n", error, cudaGetErrorString(error)); \
        exit(-10 * error); \
    } \
}
/*
 * This example implements matrix element-wise addition on the host and GPU.
 * sumMatrixOnHost iterates over the rows and columns of each matrix, adding
 * elements from A and B together and storing the results in C. The current
 * offset in each matrix is stored using pointer arithmetic. sumMatrixOnGPU2D
 * implements the same logic, but using CUDA threads to process each matrix.
 */

void initialData(float *ip, const int size)
{
    int i;

    for(i = 0; i < size; i++)
    {
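        // take the low byte of rand() and scale it: values fall in [0.0, 25.5]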
        ip[i] = (float)( rand() & 0xFF ) / 10.0f;
    }
}

void sumMatrixOnHost(float *A, float *B, float *C, const int nx, const int ny)
{
    float *ia = A;
    float *ib = B;
    float *ic = C;

    for (int iy = 0; iy < ny; iy++)
    {
        for (int ix = 0; ix < nx; ix++)
        {
            ic[ix] = ia[ix] + ib[ix];
        }

        ia += nx;
        ib += nx;
        ic += nx;
    }

    return;
}

void checkResult(float *hostRef, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;

    for (int i = 0; i < N; i++)
    {
        if (abs(hostRef[i] - gpuRef[i]) > epsilon)
        {
            printf("host %f gpu %f ", hostRef[i], gpuRef[i]);
            printf("Arrays do not match.\n\n");
            break;
        }
    }
}

// grid 2D block 2D
__global__ void sumMatrixOnGPU2D(float *A, float *B, float *C, int NX, int NY)
{
    unsigned int ix = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int iy = blockIdx.y * blockDim.y + threadIdx.y;
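    // flatten the 2D thread coordinate into a row-major linear offset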
    unsigned int idx = iy * NX + ix;

    if (ix < NX && iy < NY)
    {
        C[idx] = A[idx] + B[idx];
    }
}

int main(int argc, char **argv)
{
    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    CHECK(cudaSetDevice(dev));

    // set up data size of matrix
    int nx = 1 << 14;
    int ny = 1 << 14;

    int nxy = nx * ny;
    int nBytes = nxy * sizeof(float);

    // malloc host memory
    float *h_A, *h_B, *hostRef, *gpuRef;
    h_A = (float *)malloc(nBytes);
    h_B = (float *)malloc(nBytes);
    hostRef = (float *)malloc(nBytes);
    gpuRef = (float *)malloc(nBytes);

    // initialize data at host side
    LARGE_INTEGER ta, tb, tc;
    QueryPerformanceFrequency(&tc);
    QueryPerformanceCounter(&ta);
	printf("Generating random array...\n");
    initialData(h_A, nxy);
    initialData(h_B, nxy);
    QueryPerformanceCounter(&tb);
	printf("Generated.\n");

    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // add matrix at host side for result checks
    QueryPerformanceCounter(&ta);
    sumMatrixOnHost (h_A, h_B, hostRef, nx, ny);
    QueryPerformanceCounter(&tb);

    // malloc device global memory
    float *d_MatA, *d_MatB, *d_MatC;
    CHECK(cudaMalloc((void **)&d_MatA, nBytes));
    CHECK(cudaMalloc((void **)&d_MatB, nBytes));
    CHECK(cudaMalloc((void **)&d_MatC, nBytes));

    // transfer data from host to device
    CHECK(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice));

    // invoke kernel at host side
    int dimx = 32;
    int dimy = 32;

    if(argc > 2)
    {
        dimx = atoi(argv[1]);
        dimy = atoi(argv[2]);
    }

    dim3 block(dimx, dimy);
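    // ceiling division: launch enough blocks to cover every element even when
    // nx and ny are not multiples of the block dimensions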
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);

    // execute the kernel
    CHECK(cudaDeviceSynchronize());
    QueryPerformanceCounter(&ta);
    sumMatrixOnGPU2D<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny);
    CHECK(cudaDeviceSynchronize());
    QueryPerformanceCounter(&tb);
    printf("sumMatrixOnGPU2D <<<(%d,%d),(%d,%d)>>> Time elapsed %f sec\n", grid.x,grid.y,block.x,block.y, (tb.QuadPart - ta.QuadPart)*1.0 / tc.QuadPart);
    CHECK(cudaGetLastError());

    // copy kernel result back to host side
    CHECK(cudaMemcpy(gpuRef, d_MatC, nBytes, cudaMemcpyDeviceToHost));

    // check device results
    checkResult(hostRef, gpuRef, nxy);

    // free device global memory
    CHECK(cudaFree(d_MatA));
    CHECK(cudaFree(d_MatB));
    CHECK(cudaFree(d_MatC));

    // free host memory
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);

    // reset device
    CHECK(cudaDeviceReset());

    return EXIT_SUCCESS;
}
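The QueryPerformanceCounter calls above are the only Windows-specific part. As a portable alternative, here is a minimal sketch of timing the same kernel launch with CUDA events, assuming the same grid, block, and device pointers as in the program above:

// Portable kernel timing with CUDA events
cudaEvent_t start, stop;
CHECK(cudaEventCreate(&start));
CHECK(cudaEventCreate(&stop));

CHECK(cudaEventRecord(start));
sumMatrixOnGPU2D<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny);
CHECK(cudaEventRecord(stop));
CHECK(cudaEventSynchronize(stop));   // block until the kernel has finished

float ms = 0.0f;
CHECK(cudaEventElapsedTime(&ms, start, stop));   // elapsed time in milliseconds
printf("sumMatrixOnGPU2D Time elapsed %f sec\n", ms / 1000.0f);

CHECK(cudaEventDestroy(start));
CHECK(cudaEventDestroy(stop));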

As mentioned earlier, my machine has an NVIDIA GTX 1050 graphics card.

[Figure: the timing results given in the book]
This is the result from the book, intended to show that more thread blocks expose more parallelism (the configuration with 512*1024 blocks outperforms the one with 512*512). The results on my machine (Debug build) are as follows:
[Figure: timing results on my machine, Debug build]

As you can see, the 32*32 block size is the slowest, while 16*32, 16*16, and 32*16 are all roughly equally efficient. This differs from the book, where the gaps between configurations are much more pronounced. Moreover, in a Release build all four configurations run at nearly the same speed.

Warp occupancy can be measured with the command nvprof --metrics achieved_occupancy <program.exe> [param1] [param2] ...
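For example, for the four block configurations tested above (sumMatrix.exe stands in for whatever your executable is called):

nvprof --metrics achieved_occupancy sumMatrix.exe 32 32
nvprof --metrics achieved_occupancy sumMatrix.exe 32 16
nvprof --metrics achieved_occupancy sumMatrix.exe 16 32
nvprof --metrics achieved_occupancy sumMatrix.exe 16 16

This produced the following data: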
[Figures: nvprof achieved_occupancy output for the four block configurations]

As the data show, the fourth configuration has the highest achieved warp occupancy, yet it is the second slowest. So we can still draw a conclusion: higher occupancy does not guarantee the fastest run time; performance also depends on other factors.
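Incidentally, achieved_occupancy is a run-time measurement: the average ratio of active warps per cycle to the maximum number of warps an SM supports. If you only want the theoretical upper bound for a given block size, a minimal sketch using the CUDA runtime occupancy API looks like this (the helper name reportOccupancy is mine; it assumes it lives in the same .cu file as the kernel and CHECK macro above):

// Theoretical occupancy for a dimx x dimy block of sumMatrixOnGPU2D
void reportOccupancy(int dimx, int dimy)
{
    int blockSize = dimx * dimy;
    int numBlocks = 0;   // max resident blocks per SM at this block size
    CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
              &numBlocks, sumMatrixOnGPU2D, blockSize, 0));

    cudaDeviceProp prop;
    CHECK(cudaGetDeviceProperties(&prop, 0));

    // active warps per SM divided by the maximum warps per SM
    int activeWarps = numBlocks * blockSize / prop.warpSize;
    int maxWarps = prop.maxThreadsPerMultiProcessor / prop.warpSize;
    printf("(%d,%d): theoretical occupancy %.2f\n",
           dimx, dimy, (double)activeWarps / maxWarps);
}

Calling reportOccupancy(32, 32), reportOccupancy(16, 16), and so on gives the upper bounds against which nvprof's measured values can be compared.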
