CUDAExample-0-cppIntegration

標籤： CUDAExample

例程說明
例程使用NVIDIA頭文件vector_types.h中的int2結構體，同時通過補齊數組來避免bank conflict。但在GTX 980系列中，一個warp有32個線程（共享內存也有32個bank），不再有half-warp（半個warp）的說法，因此補齊數組時應補齊到32的倍數，而不是16的倍數。此例程針對早期硬件，並不完全適用於現在的硬件，但思想是通用的。同時，在數據量小的情況下，bank conflict不是影響效率的關鍵；當然，避免bank conflict仍然很重要。

字符測試

字符型變量既可以用字符常量賦值，也可以用對應的ASCII碼賦值，下面的測試程序說明了這一點。

#include <iostream>
#include <stdio.h>
#include <stdlib.h>

// Prints the same character twice: once initialised from a character literal
// and once from its numeric ASCII code, to show that both forms are equivalent.
int main()
{
    const char fromLiteral = 'H';   // character literal
    const char fromAscii = 72;      // 72 is the ASCII code of 'H'

    std::cout << "a1 = " << fromLiteral;
    std::cout << "," << "a2 = " << fromAscii << std::endl;

    return 0;
}

字符初始化Hello world

  // "Hello World." with 10 added to every character code (82 - 10 = 72 = 'H',
  // 111 - 10 = 101 = 'e', ...); the four trailing 10s become 0 (NUL) once the
  // offset of 10 is subtracted again.
  char str[] = { 82, 111, 118, 118, 121, 42, 97, 121, 124, 118, 110, 56,10, 10, 10, 10};

利用庫vector_types.h初始化字符數組，x保存字符，y保存偏移量

   // Use int2 showing that CUDA vector types can be used in cpp code.
   // NOTE(review): i2 has 16 entries and `len` is defined outside this snippet;
   // presumably len equals the 16 elements of str - confirm it never exceeds 16.
    int2 i2[16];

    for (int i = 0; i < len; i++)
    {
        i2[i].x = str[i];   // x holds the (offset) character code
        i2[i].y = 10;       // y holds the offset that is later subtracted from x
    }

runTest函數

此函數完成GPU函數調用，C函數調用，結果對比。其中GPU函數採用兩種方式實現，C函數採用兩種方式實現，最後進行兩種實現函數結果的對比。

////////////////////////////////////////////////////////////////////////////////
//! Entry point for Cuda functionality on host side
//!
//! Copies both inputs to the device, runs kernel (one thread per four packed
//! chars) and kernel2 (one thread per int2) in a single block, computes the
//! host references with computeGold / computeGold2, copies the device results
//! back over \a data and \a data_int2, and compares them with the references.
//!
//! @param argc       command line argument count
//! @param argv       command line arguments
//! @param data       char data to process on the device (overwritten with the
//!                   device result)
//! @param data_int2  int2 data to process on the device (overwritten with the
//!                   device result)
//! @param len        number of elements in \a data and \a data_int2; must be a
//!                   multiple of 4. Only one block is launched, so \a len must
//!                   not exceed the device's maximum threads per block.
//! @return true if every device result matches its host reference (or if
//!         \a len is 0), false on mismatch or host allocation failure
////////////////////////////////////////////////////////////////////////////////
extern "C" bool
runTest(const int argc, const char **argv, char *data, int2 *data_int2, unsigned int len)
{
    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    findCudaDevice(argc, (const char **)argv);

    // kernel handles four chars per thread, so len must be divisible by 4
    assert(0 == (len % 4));
    const unsigned int num_threads = len / 4;

    const size_t mem_size = sizeof(char) * len;
    // sizeof yields size_t; cast so the argument really matches %d
    printf("sizeof(char) = %d\n", (int) sizeof(char));
    const size_t mem_size_int2 = sizeof(int2) * len;
    printf("sizeof(int2) = %d\n", (int) sizeof(int2));

    if (len == 0)
    {
        // nothing to process; launching zero threads would be an invalid
        // launch configuration
        return true;
    }

    // allocate device memory
    char *d_data;
    checkCudaErrors(cudaMalloc((void **) &d_data, mem_size));
    // copy host memory to device
    checkCudaErrors(cudaMemcpy(d_data, data, mem_size,
                               cudaMemcpyHostToDevice));
    // allocate device memory for int2 version
    int2 *d_data_int2;
    checkCudaErrors(cudaMalloc((void **) &d_data_int2, mem_size_int2));
    // copy host memory to device
    checkCudaErrors(cudaMemcpy(d_data_int2, data_int2, mem_size_int2,
                               cudaMemcpyHostToDevice));

    // setup execution parameters
    dim3 grid(1, 1, 1);
    dim3 threads(num_threads, 1, 1);
    dim3 threads2(len, 1, 1); // more threads needed for separate int2 version

    // execute the kernels; the char buffer is reinterpreted as packed ints
    kernel<<< grid, threads >>>((int *) d_data);
    // check if the kernel launch generated an error
    getLastCudaError("Kernel execution failed");
    kernel2<<< grid, threads2 >>>(d_data_int2);
    getLastCudaError("Kernel execution failed");

    // compute reference solutions; this must happen before the device results
    // are copied back, because the copy overwrites data / data_int2
    char *reference = (char *) malloc(mem_size);
    int2 *reference2 = (int2 *) malloc(mem_size_int2);

    if (reference == NULL || reference2 == NULL)
    {
        fprintf(stderr, "runTest: failed to allocate host reference buffers\n");
        free(reference);
        free(reference2);
        checkCudaErrors(cudaFree(d_data));
        checkCudaErrors(cudaFree(d_data_int2));
        return false;
    }

    computeGold(reference, data, len);
    computeGold2(reference2, data_int2, len);

    // copy results from device to host (blocking copies, so the kernels have
    // finished once these return)
    checkCudaErrors(cudaMemcpy(data, d_data, mem_size,
                               cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(data_int2, d_data_int2, mem_size_int2,
                               cudaMemcpyDeviceToHost));

    // check result
    bool success = true;

    for (unsigned int i = 0; i < len; i++)
    {
        if (reference[i] != data[i] ||
            reference2[i].x != data_int2[i].x ||
            reference2[i].y != data_int2[i].y)
        {
            success = false;
            break; // one mismatch is enough to fail the test
        }
    }

    // cleanup memory
    checkCudaErrors(cudaFree(d_data));
    checkCudaErrors(cudaFree(d_data_int2));
    free(reference);
    free(reference2);

    return success;
}

GPU 函數實現


///////////////////////////////////////////////////////////////////////////////
//! Simple test kernel for device functionality
//!
//! Each thread loads one int, treats it as four packed bytes, subtracts 10
//! from every byte (wrapping modulo 256) and stores the int back in place.
//! The element is selected by threadIdx.x only, so the kernel expects a
//! one-dimensional launch with a single block and one thread per int.
//!
//! @param g_data  memory to process (in and out)
///////////////////////////////////////////////////////////////////////////////
__global__ void kernel(int *g_data)
{
    // read this thread's packed word from global memory
    const unsigned int tid = threadIdx.x;

    // Work on the unsigned bit pattern: with signed arithmetic a byte smaller
    // than 10 turns negative after the subtraction and its sign extension
    // would be ORed into the neighbouring higher bytes.
    const unsigned int packed = (unsigned int) g_data[tid];

    // use integer arithmetic to process all four bytes with one thread
    unsigned int result = 0;

    for (unsigned int shift = 0; shift < 32; shift += 8)
    {
        const unsigned int byte = (packed >> shift) & 0xffu;
        // mask to 8 bits so a borrow never leaks into another byte lane
        result |= ((byte - 10u) & 0xffu) << shift;
    }

    g_data[tid] = (int) result;
}

///////////////////////////////////////////////////////////////////////////////
//! Demonstration that int2 data can be used in the cpp code
//!
//! Each thread handles the element selected by threadIdx.x: it subtracts the
//! element's y component from its x component in place and leaves y as it is.
//!
//! @param g_data  memory to process (in and out)
///////////////////////////////////////////////////////////////////////////////
__global__ void
kernel2(int2 *g_data)
{
    // one element per thread, addressed by the thread index within the block
    int2 *element = g_data + threadIdx.x;

    // x <- x - y; y is only read
    element->x -= element->y;
}

C函數實現

////////////////////////////////////////////////////////////////////////////////
//! Compute reference data set
//! Each output element is the matching input element minus 10
//! @param reference  reference data, computed but preallocated
//! @param idata      input data as provided to device
//! @param len        number of elements in reference / idata
////////////////////////////////////////////////////////////////////////////////
void
computeGold(char *reference, char *idata, const unsigned int len)
{
    const char *src = idata;
    const char *const srcEnd = idata + len;
    char *dst = reference;

    // walk both buffers front to back, one element at a time
    while (src != srcEnd)
    {
        *dst = *src - 10;
        ++dst;
        ++src;
    }
}

////////////////////////////////////////////////////////////////////////////////
//! Compute reference data set for int2 version
//! For each element, x becomes the input x minus the input y; y is copied
//! @param reference  reference data, computed but preallocated
//! @param idata      input data as provided to device
//! @param len        number of elements in reference / idata
////////////////////////////////////////////////////////////////////////////////
void
computeGold2(int2 *reference, int2 *idata, const unsigned int len)
{
    int2 *out = reference;
    const int2 *in = idata;

    // advance through both arrays in lockstep, front to back
    for (unsigned int remaining = len; remaining > 0; --remaining, ++out, ++in)
    {
        out->x = in->x - in->y;
        out->y = in->y;
    }
}

輸出結果

GPU Device 0: “GeForce GTX 980” with compute capability 5.2

sizeof(char) = 1
sizeof(int2) = 8
Hello World.
Hello World.
請按任意鍵繼續…

分析

在此輸入正文

CUDAExample-0-cppIntegration

字符測試

字符初始化Hello world

利用庫vector_types.h初始化字符數組，x保存字符，y保存偏移量

runTest函數

GPU 函數實現

C函數實現

輸出結果

分析

linux安裝cuda和cudnn

模擬手機設備：使用 Playwright 實現移動端自動化測試

Mellanox網卡開啓SR-IOV

全面系統的AI學習路徑，幫助普通人也能玩轉AI

HTML 00 Tutorial

uni-app實現上拉加載

vue3編譯優化之“靜態提升”

又是一個月-20240513

flask 如何保證返回json有序

linux服務器設置ssh免密

內存尋址優化

CUDAExample-0-clock

Linux系統動態鏈接庫和靜態鏈接庫CMake的使用方法

統計-均值，期望，方差，協方差，協方差矩陣

上海復旦大學吳立德教授深度學習課程五

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結