OpenCL (Part 2): Understanding the OpenCL Heterogeneous Programming Process Through Vector Addition

The previous post introduced the concepts behind heterogeneous programming and the OpenCL framework, all of which are fairly abstract. This post starts from a vector-addition demo to explain those concepts further and give a more concrete, intuitive picture of how OpenCL heterogeneous programming works.

First, here is the complete source code that adds two vectors (128 elements each):

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

#include <CL/cl.h>


const char* programSource =
"__kernel void vecadd(__global int* A, __global int* B, __global int* C)\n"
"{                                                                        \n"
"    int idx=get_global_id(0);                                            \n"
"    C[idx]=A[idx]+B[idx];                                                \n"
"}                                                                        \n"
;

int main()
{
    int *A = NULL;
    int *B = NULL;
    int *C = NULL;
    int *D = NULL;               // host-side reference result
    const int elements = 128;



    size_t datasize = sizeof(int)*elements;

    A = (int*)malloc(datasize);
    B = (int*)malloc(datasize);
    C = (int*)malloc(datasize);
    D = (int*)malloc(datasize);


    for (int i = 0; i < elements; i++)
    { 
        A[i]=i;
        B[i]=i;
    }


    cl_int status;

    /*Discover and initialize the platforms*/

    cl_uint numPlatforms = 0;
    cl_platform_id* platforms = NULL;

    status = clGetPlatformIDs(0, NULL, &numPlatforms);  //retrieve number of platforms

    printf("# of platform:%d\n", numPlatforms);

    platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id)); // malloct memery for platform 

    status = clGetPlatformIDs(numPlatforms, platforms, NULL); // initialize platforms

    /*print  platform informations*/
    for (cl_uint i = 0; i < numPlatforms; i++)
    {
        size_t size=0;

        //name
        status = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 0, NULL, &size);
        char* name = (char*)malloc(size);
        status = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, size, name, NULL);
        printf("CL_PLATFORM_NAME:%s\n", name);

        //vendor
        status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 0, NULL, &size);
        char *vendor = (char *)malloc(size);
        status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, size, vendor, NULL);
        printf("CL_PLATFORM_VENDOR:%s\n", vendor);

        //version
        status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 0, NULL, &size);
        char *version = (char *)malloc(size);
        status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, size, version, NULL);
        printf("CL_PLATFORM_VERSION:%s\n", version);

        // profile
        status = clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 0, NULL, &size);
        char *profile = (char *)malloc(size);
        status = clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, size, profile, NULL);
        printf("CL_PLATFORM_PROFILE:%s\n", profile);

        // extensions
        status = clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 0, NULL, &size);
        char *extensions = (char *)malloc(size);
        status = clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, size, extensions, NULL);
        printf("CL_PLATFORM_EXTENSIONS:%s\n", extensions);

        // release 
        printf("\n\n");
        free(name);
        free(vendor);
        free(version);
        free(profile);
        free(extensions);

    }


    /*Discover and initialize devices*/

    cl_uint numDevices = 0;
    cl_device_id* devices = NULL;

    status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_GPU,0,NULL,&numDevices);  // retrieve Device number

    printf("# of device:%d\n", numDevices);
    devices = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id)); // malloct memery for device

    status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_GPU,numDevices,devices,NULL); // fill in device 


    /*print device informations*/
    for (cl_uint i = 0; i < numDevices; i++)
    {
        size_t value_size = 0;

        //name
        //name
        status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 0, NULL, &value_size);
        char* name1 = (char*)malloc(value_size);
        status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, value_size, name1, NULL);
        printf("CL_DEVICE_NAME:%s\n", name1);

        //parallel compute units (CUs)
        cl_uint maxComputeUnits = 0;
        status = clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(maxComputeUnits), &maxComputeUnits, NULL);
        printf("CL_DEVICE_MAX_COMPUTE_UNITS:%u\n", maxComputeUnits);

        //maxWorkItemPerGroup
        size_t maxWorkItemPerGroup = 0;
        status = clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(maxWorkItemPerGroup), &maxWorkItemPerGroup, NULL);
        printf("CL_DEVICE_MAX_WORK_GROUP_SIZE: %zu\n", maxWorkItemPerGroup);

        //maxGlobalMemSize
        cl_ulong maxGlobalMemSize = 0;
        status = clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(maxGlobalMemSize), &maxGlobalMemSize, NULL);
        printf("CL_DEVICE_GLOBAL_MEM_SIZE: %lu(MB)\n", (unsigned long)(maxGlobalMemSize / 1024 / 1024));

        //maxConstantBufferSize
        cl_ulong maxConstantBufferSize = 0;
        status = clGetDeviceInfo(devices[i], CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(maxConstantBufferSize), &maxConstantBufferSize, NULL);
        printf("CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE: %lu(KB)\n", (unsigned long)(maxConstantBufferSize / 1024));

        //maxLocalMemSize
        cl_ulong maxLocalMemSize = 0;
        status = clGetDeviceInfo(devices[i], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(maxLocalMemSize), &maxLocalMemSize, NULL);
        printf("CL_DEVICE_LOCAL_MEM_SIZE: %lu(KB)\n", (unsigned long)(maxLocalMemSize / 1024));


        // release
        printf("\n\n");
        free(name1);
    }



    /*Creat a context*/

    cl_context context = NULL;

    context = clCreateContext( NULL, numDevices,devices,NULL,NULL,&status);
//  context = clCreateContextFromType(NULL,CL_DEVICE_TYPE_ALL,NULL,NULL,&status);

    cl_uint numDevicesInContext = 0;
    clGetContextInfo(context, CL_CONTEXT_NUM_DEVICES, sizeof(numDevicesInContext), &numDevicesInContext, NULL);
    printf("Size of cl_device_id:%zu\n", sizeof(cl_device_id));
    printf("Num of devices in context:%u\n", numDevicesInContext);


    /*Create a command queue*/

    cl_command_queue cmdQueue;
    cmdQueue = clCreateCommandQueue(context,devices[0],0,&status);

        /*Create device buffers*/

        cl_mem bufferA;
        cl_mem bufferB;
        cl_mem bufferC;

        bufferA = clCreateBuffer(context,CL_MEM_READ_ONLY,datasize,NULL,&status);

        bufferB = clCreateBuffer(context,CL_MEM_READ_ONLY,datasize,NULL,&status);

        bufferC = clCreateBuffer(context,CL_MEM_WRITE_ONLY,datasize,NULL,&status);

        /*Write host data to device buffers*/

        status = clEnqueueWriteBuffer(cmdQueue,bufferA,CL_FALSE,0,datasize,A,0,NULL,NULL);

        status = clEnqueueWriteBuffer(cmdQueue,bufferB,CL_FALSE,0,datasize,B,0,NULL,NULL);

    //  status = clEnqueueWriteBuffer(cmdQueue,bufferC,CL_FALSE,0,datasize,C,0,NULL,NULL);


        /*Create and compile the program*/

        cl_program program = clCreateProgramWithSource(context,1,(const char **)&programSource,NULL,&status);

        status = clBuildProgram(program,numDevices,devices,NULL,NULL,NULL);


        if (status!= CL_SUCCESS)
        {
            size_t len;
            char buffer[8 * 1024];

            printf("Error: Failed to build program executable!\n");
            clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
            printf("%s\n", buffer);
        }
        /*Create the kernel*/

        cl_kernel kernel = NULL;

        kernel = clCreateKernel(program, "vecadd", &status);

            /*Set the kernel arguments*/

        status = clSetKernelArg(kernel,0,sizeof(cl_mem),&bufferA);
        status = clSetKernelArg(kernel,1,sizeof(cl_mem),&bufferB);
        status = clSetKernelArg(kernel,2,sizeof(cl_mem),&bufferC);

        /*CONFIGURE THE WORK-ITEM STRUCTURE*/

        size_t globalWorkSize[1];
        globalWorkSize[0] = elements;
    //  size_t globalSize[1] = { elements }, localSize[1] = { 256 };
        /*Enqueue the kernel for execution*/

        status = clEnqueueNDRangeKernel(cmdQueue,kernel,1,NULL,globalWorkSize,NULL,0,NULL,NULL);
        /*Read the buffer output back to host*/
        clFinish(cmdQueue);
        clEnqueueReadBuffer(cmdQueue,bufferC,CL_TRUE,0,datasize,C,0,NULL,NULL);


        printf("The calculated outcome:");
        for (int i = 0; i < elements; i++)
        {
            printf("%d", C[i]);
        }
        printf("\n");

        bool result = true;


        printf("The right outcome:");
        for (int i = 0; i < elements; i++)
        {
            D[i] = i + i;
            printf("%d", D[i]);

        }
        printf("\n");

        for (int i = 0; i < elements; i++)
        {
            if (C[i] != D[i])
            {

                result = false;
                break;
            }

        }


        if (result)
        {
            printf("Output is correct!\n");
        }
        else
        {
            printf("Output is incorrect!\n");
        }

        /*Release OpenCL resources*/

        clReleaseKernel(kernel);
        clReleaseProgram(program);
        clReleaseCommandQueue(cmdQueue);
        clReleaseMemObject(bufferA);
        clReleaseMemObject(bufferB);
        clReleaseMemObject(bufferC);
        clReleaseContext(context);

        free(A);
        free(B);
        free(C);
        free(D);
        free(platforms);
        free(devices);
        getchar();

        return 0;
}

Now let's go through the code piece by piece to see how it implements heterogeneous programming.
The header files need little explanation: programming with OpenCL requires the corresponding library, which is normally supplied by the device vendor. If you are only experimenting on a PC with Visual Studio, you can download the appropriate headers from the official OpenCL website and set the relevant library paths.
First, consider the following code:

const char* programSource =
"__kernel void vecadd(__global int* A, __global int* B, __global int* C)\n"
"{                                                                        \n"
"    int idx=get_global_id(0);                                            \n"
"    C[idx]=A[idx]+B[idx];                                                \n"
"}                                                                        \n"
;

This piece of code is called the kernel; it is the program that actually runs on the GPU. As mentioned in the previous post when discussing the execution model, an OpenCL program has two parts: one runs on the host (the CPU) and the other runs on the compute device (for GPU programming, the GPU). The code above is the latter. It can be embedded in the host program as a string, as done here, or it can live in a separate .cl file. Kernel syntax follows certain rules, for example the function must be declared with the __kernel qualifier; see the specification for your OpenCL version (downloadable from the official Khronos site) for details, but you can set that aside for now.
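If the kernel is kept in a separate .cl file instead of an embedded string, the host program has to read that file into a string before calling clCreateProgramWithSource(). Below is a minimal sketch of such a loader; the helper name loadKernelSource and the file name vecadd.cl are only illustrative and are not part of the demo above.

/* Illustrative helper: read a kernel source file into a NUL-terminated string.
   Error handling is minimal; the caller must free() the returned buffer. */
static char* loadKernelSource(const char* path, size_t* length)
{
    FILE* fp = fopen(path, "rb");
    if (fp == NULL) return NULL;

    fseek(fp, 0, SEEK_END);
    long size = ftell(fp);
    rewind(fp);

    char* source = (char*)malloc((size_t)size + 1);
    size_t n = fread(source, 1, (size_t)size, fp);
    source[n] = '\0';
    fclose(fp);

    if (length) *length = n;
    return source;
}

/* Usage (hypothetical):
   size_t srcLen = 0;
   char*  src = loadKernelSource("vecadd.cl", &srcLen);
   cl_program program = clCreateProgramWithSource(context, 1,
                            (const char**)&src, &srcLen, &status);
   free(src);
*/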

Next, we identify our platform and devices.

 status = clGetPlatformIDs(0, NULL, &numPlatforms);  //retrieve number of platforms

    printf("# of platform:%d\n", numPlatforms);

    platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id)); // malloct memery for platform 

    status = clGetPlatformIDs(numPlatforms, platforms, NULL); // initialize platform

The function clGetPlatformIDs() retrieves platform information. It is typically called twice: the first call queries how many platforms are available, and the second call fills in the platform IDs we will use. Device information is obtained and selected in the same way through clGetDeviceIDs(), whose usage mirrors clGetPlatformIDs(), as shown below:

cl_uint numDevices = 0;
    cl_device_id* devices = NULL;

    status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_GPU,0,NULL,&numDevices);  // retrieve Device number

    printf("# of device:%d\n", numDevices);
    devices = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id)); // malloct memery for device

    status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_GPU,numDevices,devices,NULL); // fill in device 
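Note that almost every call above returns (or writes out) a cl_int status code, which this demo largely ignores for brevity. When experimenting it can help to wrap calls in a small checking helper; the macro below is only an illustrative sketch, not part of the original program.

/* Illustrative helper: abort with a message if an OpenCL call failed. */
#define CL_CHECK(status, msg)                                         \
    do {                                                              \
        if ((status) != CL_SUCCESS) {                                 \
            fprintf(stderr, "%s failed with error %d\n", (msg),       \
                    (int)(status));                                   \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

/* Usage (hypothetical):
   status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);
   CL_CHECK(status, "clGetDeviceIDs");
*/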

The functions clGetPlatformInfo() and clGetDeviceInfo() query information about a platform or device; the parameter you pass in determines which piece of information is returned. The output of the code above looks like this:
[Figure: platform and device information printed by the demo]

Once the platform and device information has been obtained, we can create a context. By placing the chosen CPU and GPU under one context, we can operate on them through that context; the relationship is illustrated in the figure below:
[Figure: relationship between the host application, the context, and the devices it contains]
P.S.: the figure material comes from the AMD OpenCL university tutorial.
The corresponding API calls:
cl_context context = NULL;
context = clCreateContext( NULL, numDevices,devices,NULL,NULL,&status);

Information about the context can likewise be queried with clGetContextInfo().
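For example, a sketch of querying how many devices a context holds and then retrieving their IDs could look like this (the variable names are illustrative; the demo above only queries the device count):

/* Illustrative sketch: query the devices attached to a context. */
cl_uint ctxNumDevices = 0;
clGetContextInfo(context, CL_CONTEXT_NUM_DEVICES,
                 sizeof(ctxNumDevices), &ctxNumDevices, NULL);

cl_device_id* ctxDevices = (cl_device_id*)malloc(ctxNumDevices * sizeof(cl_device_id));
clGetContextInfo(context, CL_CONTEXT_DEVICES,
                 ctxNumDevices * sizeof(cl_device_id), ctxDevices, NULL);

printf("Devices in context: %u\n", ctxNumDevices);
free(ctxDevices);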

The context manages the objects, while operations on those objects are carried out through a command queue.
[Figure: commands issued to the device through the command queue]

cl_command_queue cmdQueue;
    cmdQueue = clCreateCommandQueue(context,devices[0],0,&status);

After the command queue is created, we need to create memory objects. Since our data initially resides on the host, we create buffers within the context to move data between the host and the compute device.
[Figure: buffer objects created in the context for host-device data transfer]

        cl_mem bufferA;
        cl_mem bufferB;
        cl_mem bufferC;

        bufferA = clCreateBuffer(context,CL_MEM_READ_ONLY,datasize,NULL,&status);

        bufferB = clCreateBuffer(context,CL_MEM_READ_ONLY,datasize,NULL,&status);

        bufferC = clCreateBuffer(context,CL_MEM_WRITE_ONLY,datasize,NULL,&status);

The memory objects created by the calls above are still empty; they contain no data yet. The following calls transfer the host data to the GPU:

        status = clEnqueueWriteBuffer(cmdQueue,bufferA,CL_FALSE,0,datasize,A,0,NULL,NULL);
        status = clEnqueueWriteBuffer(cmdQueue,bufferB,CL_FALSE,0,datasize,B,0,NULL,NULL);

In our application only vectors A and B have values at the start; C only receives its values once the computation finishes. So only A and B are written to the GPU up front, and C is read back to the host after the kernel has run.
[Figure: host arrays A and B written into the device buffers]
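As an aside, clCreateBuffer() can also copy the host data at creation time by passing the CL_MEM_COPY_HOST_PTR flag together with the host pointer, which removes the need for the separate clEnqueueWriteBuffer() calls. This is merely an alternative sketch, not what the demo does:

/* Alternative (not used in this demo): create the input buffers and
   copy the host arrays into them in one call. */
bufferA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                         datasize, A, &status);
bufferB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                         datasize, B, &status);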
Next we create the program object and then build it; building generates an executable for each associated device. When the program object is created, it is the kernel source string that gets loaded; if the kernel lives in a separate .cl file, simply load that file instead (see the loader sketch earlier).
[Figure: program object created from source and built for the device]

cl_program program = clCreateProgramWithSource(context,1,(const char**)&programSource,NULL,&status);
status = clBuildProgram(program,numDevices,devices,NULL,NULL,NULL);

After the program is created, we create the kernel object that will execute on the GPU and pass in its arguments.
[Figure: kernel object created from the built program]
cl_kernel kernel = NULL;
kernel = clCreateKernel(program, "vecadd", &status);
status = clSetKernelArg(kernel,0,sizeof(cl_mem),&bufferA);
status = clSetKernelArg(kernel,1,sizeof(cl_mem),&bufferB);
status = clSetKernelArg(kernel,2,sizeof(cl_mem),&bufferC);

The difference between a kernel and a program is that the program belongs to the host: it merely holds the loaded source string and is associated with the devices.
The kernel is the function entity that actually runs, and its argument values are set through the clSetKernelArg() API.
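Incidentally, clSetKernelArg() is also how scalar arguments are passed. For example, if the kernel signature were extended with an element count (hypothetical, not the kernel used in this demo), the host side would pass it by value:

/* Hypothetical: suppose the kernel were declared as
       __kernel void vecadd(__global int* A, __global int* B,
                            __global int* C, int n)
   then the scalar argument would be set like this: */
int n = elements;
status = clSetKernelArg(kernel, 3, sizeof(int), &n);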

Next, we execute the kernel.
[Figure: NDRange index space of work-items that execute the kernel]
When the kernel executes, the index space described by the execution model in the previous post is generated. The size of the index space must be specified in advance, and optionally the size of each work-group, as shown below.

size_t globalWorkSize[1];
globalWorkSize[0] = elements;

Then enqueue the kernel:

status = clEnqueueNDRangeKernel(cmdQueue,kernel,1,NULL,globalWorkSize,NULL,0,NULL,NULL);
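In this call the local work size argument is NULL, so the runtime chooses the work-group size itself. To control it explicitly, pass a localWorkSize array as well; the global size must then be a multiple of the local size. A minimal sketch, assuming a work-group size of 64 (just an example value):

/* Illustrative variant: launch with an explicit work-group size.
   globalWorkSize[0] must be a multiple of localWorkSize[0]. */
size_t localWorkSize[1];
localWorkSize[0] = 64;   /* must not exceed CL_DEVICE_MAX_WORK_GROUP_SIZE */

status = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL,
                                globalWorkSize, localWorkSize,
                                0, NULL, NULL);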

Note that at this point the kernel has not actually run yet; it has only been enqueued, and when it executes is determined by the queue. I am not entirely clear on this part myself; what I do know is that OpenCL has defined execution mechanisms, such as in-order and out-of-order execution. I will update this once I have a more complete understanding.
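One common way to find out when an enqueued command has completed is to attach a cl_event to it and wait on that event, instead of (or in addition to) calling clFinish() on the queue as the demo does. A minimal sketch, assuming the same queue and kernel as above:

/* Illustrative sketch: wait for the kernel through an event. */
cl_event kernelDone;

status = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL,
                                globalWorkSize, NULL,
                                0, NULL, &kernelDone);

/* Blocks until the kernel command has finished executing. */
clWaitForEvents(1, &kernelDone);
clReleaseEvent(kernelDone);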

After the GPU finishes the computation, we need to transfer the data back to the CPU.

clEnqueueReadBuffer(cmdQueue,bufferC,CL_TRUE,0,datasize,C,0,NULL,NULL);

[Figure: result buffer C read back from the device to the host]

Once everything is done, the related resources and memory must be released.

clReleaseKernel(kernel);
        clReleaseProgram(program);
        clReleaseCommandQueue(cmdQueue);
        clReleaseMemObject(bufferA);
        clReleaseMemObject(bufferB);
        clReleaseMemObject(bufferC);
        clReleaseContext(context);

At this point we have completed CPU/GPU heterogeneous programming with OpenCL. With the demo in hand, you should now have a reasonable grasp of the OpenCL concepts and the overall workflow.

There is still much more of OpenCL to learn, such as the changes between versions, the command-queue execution mechanism, events, and so on. I will share those topics as I learn them.

To be honest, I am already fairly familiar with this process and its concepts, yet writing it down still comes out a little muddled and the logic is not always clear, especially in the previous post. Writing these posts in my spare time is my way of recording what I learn and practising how to explain it. As a beginner there are surely places where my understanding falls short; please point them out and I will gladly take the feedback.
