CUDAExample-0-cdpSimplePrint

標籤： CUDAExample

作用

>
* Key Concepts: CUDA Dynamic Parallelism *
It generates a unique identifier for each block. Prints the information about that block. Finally, if the ‘max_depth’ has not been reached, the block launches new blocks directly from the GPU.

例程中使用了遞歸思想：在核函數中，由每一個線程塊的 0 號線程打印出該線程塊的編號，以及它對應的 parent 線程塊，重點說明了 CUDA 架構支持動態並行（由 GPU 端直接啓動新的核函數）。

所使用的技巧

遞歸， device全局變量，命令行得到參數，字符串匹配，設備屬性計算能力，共享內存, 設備函數

代碼分析

main函數傳參機制

int main(int argc, char **argv)

main 前面的 int 說明 main 函數的返回值是整型，一般正常退出返回 0，異常則返回非零值（例如 -1）。
參數 argc 表示 argv 中字符串的個數。
argv 則是命令行參數。這些參數是通過命令提示符窗口（Linux 稱爲終端）運行程序時，以空格分隔的形式傳入的。
char **argv 就好理解了，它是一個指向字符串指針數組的指針，數組中的每個元素指向一個參數字符串。
argc 是字符串的個數，如命令行爲 device is ready，則 argc = 3，argv[0] 爲字符串 "device"，argv[0][n] 表示該字符串中的第 n 個字符。

設備端全局變量

// Device-global counter; cdp_kernel increments it with atomicAdd to give each block a unique ID.
__device__ int g_uids = 0;

checkCmdLineFlag()函數解析

// Reports whether the flag `string_ref` appears on the command line.
//
// Every argument after argv[0] is examined: its leading '-' delimiters are
// skipped (offset supplied by stringRemoveDelimiter) and everything from the
// first '=' onward is ignored. The remaining name matches when it has exactly
// the same length as `string_ref` and STRNCASECMP reports equality.
//
// Returns true if at least one argument matches, false otherwise.
inline bool checkCmdLineFlag(const int argc, const char **argv, const char *string_ref)
{
    bool flag_present = false;

    for (int arg = 1; arg < argc; ++arg)
    {
        // Start of the flag name: first character after the leading '-' delimiters.
        const int name_offset = stringRemoveDelimiter('-', argv[arg]);
        const char *name = &argv[arg][name_offset];

        // The name ends at the first '=' if there is one, else at the end of the argument.
        const char *eq = strchr(name, '=');
        int name_len;

        if (eq == 0)
        {
            name_len = (int)strlen(name);
        }
        else
        {
            name_len = (int)(eq - name);
        }

        const int ref_len = (int)strlen(string_ref);

        // Lengths must agree so that a longer argument sharing a prefix does not match.
        if (name_len == ref_len && STRNCASECMP(name, string_ref, ref_len) == 0)
        {
            flag_present = true;
        }
    }

    return flag_present;
}

函數的作用是在命令行參數中，尋找與傳入字符串 string_ref 相同的標誌，若找到則返回 true，否則返回 false。
其中涉及字符串操作函數 strchr()、strlen()，以及不區分大小寫的比較宏 STRNCASECMP（對應 strncasecmp()/strnicmp()）。

getCmdLineArgumentInt()函數解析

// Reads the integer value attached to the command-line option `string_ref`.
//
// Every argument after argv[0] is examined with its leading '-' delimiters
// skipped (offset supplied by stringRemoveDelimiter). An argument matches when
// its first strlen(string_ref) characters compare equal to `string_ref` under
// STRNCASECMP, i.e. this is a prefix match. The text after that prefix, minus
// one optional '=', is converted with atoi().
//
// Returns the value of the LAST matching argument; 0 if the matching argument
// carries no text after the option name, or if no argument matches at all.
inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref)
{
    int result = 0;   // stays 0 when nothing matches

    for (int arg = 1; arg < argc; ++arg)
    {
        const int name_offset = stringRemoveDelimiter('-', argv[arg]);
        const char *name = &argv[arg][name_offset];
        const int ref_len = (int)strlen(string_ref);

        if (STRNCASECMP(name, string_ref, ref_len) != 0)
        {
            continue;   // not this option
        }

        if ((int)strlen(name) > ref_len)
        {
            // Something follows the option name: skip a single '=' if present, then parse.
            const char *digits = &name[ref_len];

            if (*digits == '=')
            {
                ++digits;
            }

            result = atoi(digits);
        }
        else
        {
            // Bare option with no value.
            result = 0;
        }
    }

    return result;
}

字符串函數atoi()將字符串轉換爲數字，獲取初始值.

gpu設備屬性

struct cudaDeviceProp
 {
   char   name[256];                  /**< ASCII string identifying the device */
   size_t totalGlobalMem;             /**< Global memory available on the device, in bytes */
   size_t sharedMemPerBlock;          /**< Shared memory available per block, in bytes */
   int    regsPerBlock;               /**< 32-bit registers available per block */
   int    warpSize;                   /**< Warp size, in threads */
   size_t memPitch;                   /**< Maximum pitch allowed by memory copies, in bytes */
   int    maxThreadsPerBlock;         /**< Maximum number of threads per block */
   int    maxThreadsDim[3];           /**< Maximum size of each dimension of a block */
   int    maxGridSize[3];             /**< Maximum size of each dimension of a grid */
   int    clockRate;                  /**< Clock frequency, in kilohertz */
   size_t totalConstMem;              /**< Constant memory available on the device, in bytes */
   int    major;                      /**< Major compute capability */
   int    minor;                      /**< Minor compute capability */
   size_t textureAlignment;           /**< Alignment requirement for textures */
   int    deviceOverlap;              /**< Device can concurrently copy memory and execute a kernel. Deprecated; use asyncEngineCount instead */
   int    multiProcessorCount;        /**< Number of multiprocessors on the device */
   int    kernelExecTimeoutEnabled;   /**< Whether there is a run-time limit on kernels */
   int    integrated;                 /**< Device is integrated (as opposed to discrete) */
   int    canMapHostMemory;           /**< Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer */
   int    computeMode;                /**< Compute mode: default, exclusive, prohibited, or exclusive-process (See ::cudaComputeMode) */
   int    maxTexture1D;               /**< Maximum 1D texture size */
   int    maxTexture2D[2];            /**< Maximum 2D texture dimensions */
   int    maxTexture3D[3];            /**< Maximum 3D texture dimensions */
   int    maxTexture1DLayered[2];     /**< Maximum 1D layered texture dimensions */
   int    maxTexture2DLayered[3];     /**< Maximum 2D layered texture dimensions */
   size_t surfaceAlignment;           /**< Alignment requirement for surfaces */
   int    concurrentKernels;          /**< Device can execute multiple kernels concurrently */
   int    ECCEnabled;                 /**< Device has ECC support enabled */
   int    pciBusID;                   /**< PCI bus ID of the device */
   int    pciDeviceID;                /**< PCI device ID of the device */
   int    pciDomainID;                /**< PCI domain ID of the device */
   int    tccDriver;                  /**< 1 if the device is a Tesla device using the TCC driver, 0 otherwise */
   int    asyncEngineCount;           /**< Number of asynchronous engines */
   int    unifiedAddressing;          /**< Device shares a unified address space with the host */
   int    memoryClockRate;            /**< Peak memory clock frequency, in kilohertz */
   int    memoryBusWidth;             /**< Global memory bus width, in bits */
   int    l2CacheSize;                /**< Size of the L2 cache, in bytes */
   int    maxThreadsPerMultiProcessor;/**< Maximum resident threads per multiprocessor */
};

設置主設備端

cudaSetDevice(device);// Select `device` as the GPU used by the calling host thread

這個函數必須要在調用 `__global__` 函數或者 Runtime 的其他 API 之前調用才能生效。如果沒有調用 cudaSetDevice()，device 0 就會被設置爲默認的設備，接下來如果再調用 cudaSetDevice() 函數也不會有效果。

遞歸核函數

// Recursive kernel of the CUDA Dynamic Parallelism demo.
//
// Each block obtains a unique ID from the global counter g_uids, reports itself
// through print_info(), and then, unless the next depth would reach max_depth,
// every thread of the block launches a child grid with the same grid and block
// dimensions as the current launch.
//
//   max_depth  - recursion stops once depth + 1 >= max_depth
//   depth      - nesting level of this launch
//   thread     - threadIdx.x of the parent thread that launched this grid
//   parent_uid - unique ID of the block that launched this grid
__global__ void cdp_kernel(int max_depth, int depth, int thread, int parent_uid)
{
    // One ID per block, kept in shared memory so all threads of the block see it.
    __shared__ int s_uid;

    // Thread 0 alone draws the ID. The add is atomic because thread 0 of every
    // block updates the same global counter.
    if (threadIdx.x == 0)
    {
        s_uid = atomicAdd(&g_uids, 1);
    }

    // Barrier: s_uid must be written before the other threads read it.
    __syncthreads();

    // Report this block and its parent (device function).
    print_info(depth, thread, s_uid, parent_uid);

    // Recurse only while the child level is still below max_depth.
    const int child_depth = depth + 1;

    if (child_depth < max_depth)
    {
        cdp_kernel<<<gridDim.x, blockDim.x>>>(max_depth, child_depth, threadIdx.x, s_uid);
    }
}

函數功能：不斷啓動新的線程塊，遞歸調用同一個核函數，直到滿足結束條件；每一個線程都會調用打印函數，但只有線程號爲 0 的線程才會真正打印到命令行中顯示。

設備端函數

// Prints one line describing the calling block and who launched it.
//
// Only thread 0 of the block prints. At depth 0 the block is reported as
// launched by the host; otherwise the line is prefixed with one "|  " marker
// per nesting level and names the parent thread and parent block.
//
//   depth      - nesting level of the calling block (0 = launched by the host)
//   thread     - threadIdx.x of the parent thread that launched the block
//   uid        - unique ID of the calling block
//   parent_uid - unique ID of the parent block
//
// The function ends with __syncthreads(), so it must be called by every thread
// of the block.
__device__ void print_info(int depth, int thread, int uid, int parent_uid)
{
    if (threadIdx.x == 0) // print from thread 0 only
    {
        if (depth == 0)
            printf("BLOCK %d launched by the host\n", uid);
        else
        {
            char buffer[32];

            // Each level needs 3 characters and the string needs a terminating
            // NUL, so the buffer holds at most (32 - 1) / 3 = 10 levels. Clamp
            // the indentation so that a larger (or negative) depth can no
            // longer write outside the buffer.
            const int max_levels = ((int)sizeof(buffer) - 1) / 3;
            int levels = depth;

            if (levels > max_levels)
                levels = max_levels;

            if (levels < 0)
                levels = 0;

            for (int i = 0 ; i < levels ; ++i)    // one "|  " marker per nesting level
            {
                buffer[3*i+0] = '|';
                buffer[3*i+1] = ' ';
                buffer[3*i+2] = ' ';
            }

            buffer[3*levels] = '\0';
            printf("%sBLOCK %d launched by thread %d of block %d\n", buffer, uid, thread, parent_uid);
        }
    }

    // Reached by all threads of the block (it sits outside the thread-0 branch).
    __syncthreads();
}

運行結果

starting Simple Print (CUDA Dynamic Parallelism)
Running on GPU 0 (GeForce GTX 980)
***************************************************************************
The CPU launches 2 blocks of 2 threads each. On the device each thread will
launch 2 blocks of 2 threads each. The GPU we will do that recursively
until it reaches max_depth=2
In total 2+8=10 blocks are launched!!! (8 from the GPU)
***************************************************************************
Launching cdp_kernel() with CUDA Dynamic Parallelism:
BLOCK 1 launched by the host
BLOCK 0 launched by the host
| BLOCK 4 launched by thread 0 of block 1
| BLOCK 5 launched by thread 0 of block 1
| BLOCK 2 launched by thread 0 of block 0
| BLOCK 3 launched by thread 0 of block 0
| BLOCK 6 launched by thread 1 of block 1
| BLOCK 7 launched by thread 1 of block 1
| BLOCK 8 launched by thread 1 of block 0
| BLOCK 9 launched by thread 1 of block 0
請按任意鍵繼續…

由於不同線程運行結束時間不同，會導致其中打印順序可能會不一致。

結論

gpu程序支持遞歸，支持核函數打印，原子操作可避免衝突。

End

CUDAExample-0-cdpSimplePrint

作用

所使用的技巧

代碼分析

main函數傳參機制

設備端全局變量

checkCmdLineFlag()函數解析

getCmdLineArgumentInt()函數解析

gpu設備屬性

設置主設備端

遞歸核函數

設備端函數

運行結果

結論

內存尋址優化

CUDAExample-0-clock

Linux系統動態鏈接庫和靜態鏈接庫CMake的使用方法

統計-均值，期望，方差，協方差，協方差矩陣

上海復旦大學吳立德教授深度學習課程五

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結