CUDA Learning Notes: Chapter 3 (6) Parallelism Experiment (1)


Let's run an experiment.

The task is again the element-wise addition of two 2D matrices, executed on the GPU with CUDA.

The code is as follows:

// This code as written runs only on Windows; on Linux, replace the timing
// module yourself (a portable CUDA-event sketch follows the listing)
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <device_launch_parameters.h>
#include <Windows.h>
#define abs(a) ((a) > 0 ? (a) : -(a)) // parenthesized so it expands safely on expressions
#define CHECK(call) \
{ \
    const cudaError_t error = call; \
    if (error != cudaSuccess) \
    { \
        printf("Error: %s:%d, ", __FILE__, __LINE__); \
        printf("code:%d, reason: %s\n", error, cudaGetErrorString(error)); \
        exit(-10 * error); \
    } \
}
/*
 * This example implements matrix element-wise addition on the host and GPU.
 * sumMatrixOnHost iterates over the rows and columns of each matrix, adding
 * elements from A and B together and storing the results in C. The current
 * offset in each matrix is stored using pointer arithmetic. sumMatrixOnGPU2D
 * implements the same logic, but using CUDA threads to process each matrix.
 */

void initialData(float *ip, const int size)
{
    int i;

    for(i = 0; i < size; i++)
    {
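        // random float in [0.0, 25.5]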
        ip[i] = (float)( rand() & 0xFF ) / 10.0f;
    }
}

void sumMatrixOnHost(float *A, float *B, float *C, const int nx, const int ny)
{
    float *ia = A;
    float *ib = B;
    float *ic = C;

    for (int iy = 0; iy < ny; iy++)
    {
        for (int ix = 0; ix < nx; ix++)
        {
            ic[ix] = ia[ix] + ib[ix];
        }

        ia += nx;
        ib += nx;
        ic += nx;
    }

    return;
}

void checkResult(float *hostRef, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;

    for (int i = 0; i < N; i++)
    {
        if (abs(hostRef[i] - gpuRef[i]) > epsilon)
        {
            printf("host %f gpu %f ", hostRef[i], gpuRef[i]);
            printf("Arrays do not match.\n\n");
            break;
        }
    }
}

// grid 2D block 2D
__global__ void sumMatrixOnGPU2D(float *A, float *B, float *C, int NX, int NY)
{
    unsigned int ix = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int iy = blockIdx.y * blockDim.y + threadIdx.y;
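    // linear, row-major offset of element (ix, iy)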
    unsigned int idx = iy * NX + ix;

    if (ix < NX && iy < NY)
    {
        C[idx] = A[idx] + B[idx];
    }
}

int main(int argc, char **argv)
{
    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    CHECK(cudaSetDevice(dev));

    // set up data size of matrix
    int nx = 1 << 14;
    int ny = 1 << 14;

    int nxy = nx * ny;
    int nBytes = nxy * sizeof(float);

    // malloc host memory
    float *h_A, *h_B, *hostRef, *gpuRef;
    h_A = (float *)malloc(nBytes);
    h_B = (float *)malloc(nBytes);
    hostRef = (float *)malloc(nBytes);
    gpuRef = (float *)malloc(nBytes);

    // initialize data at host side
    LARGE_INTEGER ta, tb, tc;
    QueryPerformanceFrequency(&tc);
    QueryPerformanceCounter(&ta);
    printf("Generating random array...\n");
    initialData(h_A, nxy);
    initialData(h_B, nxy);
    QueryPerformanceCounter(&tb);
    printf("Generated in %f sec.\n", (tb.QuadPart - ta.QuadPart) * 1.0 / tc.QuadPart);

    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // add matrix at host side for result checks
    QueryPerformanceCounter(&ta);
    sumMatrixOnHost(h_A, h_B, hostRef, nx, ny);
    QueryPerformanceCounter(&tb);
    printf("sumMatrixOnHost Time elapsed %f sec\n", (tb.QuadPart - ta.QuadPart) * 1.0 / tc.QuadPart);

    // malloc device global memory
    float *d_MatA, *d_MatB, *d_MatC;
    CHECK(cudaMalloc((void **)&d_MatA, nBytes));
    CHECK(cudaMalloc((void **)&d_MatB, nBytes));
    CHECK(cudaMalloc((void **)&d_MatC, nBytes));

    // transfer data from host to device
    CHECK(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice));

    // invoke kernel at host side
    int dimx = 32;
    int dimy = 32;

    if(argc > 2)
    {
        dimx = atoi(argv[1]);
        dimy = atoi(argv[2]);
    }

    dim3 block(dimx, dimy);
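    // round the grid size up so every matrix element is covered even when
    // nx or ny is not a multiple of the block dimensions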
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);

    // execute the kernel
    CHECK(cudaDeviceSynchronize());
    QueryPerformanceCounter(&ta);
    sumMatrixOnGPU2D<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny);
    CHECK(cudaDeviceSynchronize());
    QueryPerformanceCounter(&tb);
    printf("sumMatrixOnGPU2D <<<(%d,%d),(%d,%d)>>> Time elapsed %f sec\n", grid.x,grid.y,block.x,block.y, (tb.QuadPart - ta.QuadPart)*1.0 / tc.QuadPart);
    CHECK(cudaGetLastError());

    // copy kernel result back to host side
    CHECK(cudaMemcpy(gpuRef, d_MatC, nBytes, cudaMemcpyDeviceToHost));

    // check device results
    checkResult(hostRef, gpuRef, nxy);

    // free device global memory
    CHECK(cudaFree(d_MatA));
    CHECK(cudaFree(d_MatB));
    CHECK(cudaFree(d_MatC));

    // free host memory
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);

    // reset device
    CHECK(cudaDeviceReset());

    return EXIT_SUCCESS;
}
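A note on the timing code: QueryPerformanceCounter is Windows-only. A portable alternative is CUDA events, which measure GPU time on both Windows and Linux. Below is a minimal sketch (the helper name timeKernelWithEvents is mine, not from the book) of how the kernel launch above could be timed this way:

// Minimal sketch: time the kernel launch with CUDA events (portable).
// Assumes sumMatrixOnGPU2D and the CHECK macro from the listing above.
float timeKernelWithEvents(float *d_A, float *d_B, float *d_C,
                           int nx, int ny, dim3 grid, dim3 block)
{
    cudaEvent_t start, stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));

    CHECK(cudaEventRecord(start));                 // mark start in the default stream
    sumMatrixOnGPU2D<<<grid, block>>>(d_A, d_B, d_C, nx, ny);
    CHECK(cudaEventRecord(stop));                  // mark end after the launch
    CHECK(cudaEventSynchronize(stop));             // block until the kernel finishes

    float ms = 0.0f;
    CHECK(cudaEventElapsedTime(&ms, start, stop)); // elapsed time in milliseconds

    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));
    return ms;
}

In main, the QueryPerformanceCounter pair around the launch could then be replaced by a single call such as printf("kernel: %f ms\n", timeKernelWithEvents(d_MatA, d_MatB, d_MatC, nx, ny, grid, block));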

My machine's configuration was introduced earlier; the GPU is an NVIDIA GTX 1050.

[Figure: the book's timing results for the different execution configurations]
This is the result given in the book, intended to show that more thread blocks expose more parallelism (a 512x1024 grid of blocks outperforms a 512x512 one; with nx = ny = 16384, a 32x32 block yields a 512x512 grid, while a 32x16 block yields a 512x1024 grid). The results on my machine (in Debug mode) are as follows:
[Figure: timing results on my machine, Debug mode]

As you can see, the 32x32 block size is the slowest, while 16x32, 16x16, and 32x16 all perform roughly the same. This differs from the book, where the gaps between configurations are much more pronounced. Moreover, in Release mode all four configurations run at nearly the same speed.

We can measure each configuration's achieved occupancy with the command nvprof --metrics achieved_occupancy <program.exe> [param1] [param2] …
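For example, for the four configurations tested above (sumMatrix.exe is a placeholder; substitute the name of your own executable):

nvprof --metrics achieved_occupancy sumMatrix.exe 32 32
nvprof --metrics achieved_occupancy sumMatrix.exe 16 32
nvprof --metrics achieved_occupancy sumMatrix.exe 16 16
nvprof --metrics achieved_occupancy sumMatrix.exe 32 16

The measured data: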
[Figures: nvprof achieved_occupancy output for the four block configurations]

As you can see, the fourth configuration has the highest achieved occupancy, yet it is the second slowest. So we can conclude that higher occupancy does not necessarily yield the fastest runtime; other factors also come into play.
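As a side note, the theoretical occupancy for a given block size can also be computed programmatically. The following is a minimal sketch using the CUDA occupancy API (available since CUDA 6.5); it is not part of the book's code, and it reports theoretical rather than achieved occupancy, so it only complements the nvprof measurement:

// Minimal sketch: theoretical occupancy of sumMatrixOnGPU2D for a given
// block size, via the CUDA occupancy API. Assumes the kernel and the
// CHECK macro from the listing above.
void reportTheoreticalOccupancy(int threadsPerBlock)
{
    cudaDeviceProp prop;
    CHECK(cudaGetDeviceProperties(&prop, 0));

    int maxActiveBlocks = 0; // resident blocks per SM at this block size
    CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &maxActiveBlocks, sumMatrixOnGPU2D, threadsPerBlock, 0));

    int activeWarps = maxActiveBlocks * threadsPerBlock / prop.warpSize;
    int maxWarps = prop.maxThreadsPerMultiProcessor / prop.warpSize;
    printf("%4d threads/block: theoretical occupancy %.2f%%\n",
           threadsPerBlock, 100.0 * activeWarps / maxWarps);
}

Calling it with 32*32, 16*32, 16*16, and 32*16 threads per block gives the theoretical ceiling for each configuration to compare against nvprof's achieved numbers.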
