在上一篇博客中介紹了異構編程相關概念以及OpenCL框架,都是比較抽象的概念。本文從矢量相加的demo出發,對相關概念做進一步說明,來更深入、直觀地理解OpenCL異構編程的過程。
首先,直接將實現矢量相加(每個矢量128個元素)的完整源碼貼出來
#include<stdio.h>
#include<stdlib.h>
#include<CL/cl.h>
/* OpenCL C kernel source, compiled at runtime via clCreateProgramWithSource().
 * One work-item handles one element: each reads its global id and computes
 * C[idx] = A[idx] + B[idx]. All three buffers live in __global device memory. */
const char* programSource =
"__kernel void vecadd(__global int* A, __global int* B, __global int* C)\n"
"{ \n"
" int idx=get_global_id(0); \n"
" C[idx]=A[idx]+B[idx]; \n"
"} \n"
;
int main()
{
int *A = NULL;
int *B = NULL;
int *C = NULL;
const int elements= 128;
size_t datasize = sizeof(int)*elements;
A = (int*)malloc(datasize);
B = (int*)malloc(datasize);
C = (int*)malloc(datasize);
for (int i = 0; i < elements; i++)
{
A[i]=i;
B[i]=i;
}
cl_int status;
/*Discover and initialize the platforms*/
cl_uint numPlatforms = 0;
cl_platform_id* platforms = NULL;
status = clGetPlatformIDs(0, NULL, &numPlatforms); //retrieve number of platforms
printf("# of platform:%d\n", numPlatforms);
platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id)); // malloct memery for platform
status = clGetPlatformIDs(numPlatforms, platforms, NULL); // initialize platforms
/*print platform informations*/
for (int i = 0; i < numPlatforms; i++)
{
size_t size=0;
//name
status = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 0, NULL, &size);
char* name = (char*)malloc(size);
status = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, size, name, NULL);
printf("CL_PLATFORM_NAME:%s\n", name);
//vendor
status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 0, NULL, &size);
char *vendor = (char *)malloc(size);
status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, size, vendor, NULL);
printf("CL_PLATFORM_VENDOR:%s\n", vendor);
//version
status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 0, NULL, &size);
char *version = (char *)malloc(size);
status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, size, version, NULL);
printf("CL_PLATFORM_VERSION:%s\n", version);
// profile
status = clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 0, NULL, &size);
char *profile = (char *)malloc(size);
status = clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, size, profile, NULL);
printf("CL_PLATFORM_PROFILE:%s\n", profile);
// extensions
status = clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 0, NULL, &size);
char *extensions = (char *)malloc(size);
status = clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, size, extensions, NULL);
printf("CL_PLATFORM_EXTENSIONS:%s\n", extensions);
// release
printf("\n\n");
free(name);
free(vendor);
free(version);
free(profile);
free(extensions);
}
/*Discover and initialize devices*/
cl_uint numDevices = 0;
cl_device_id* devices = NULL;
status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_GPU,0,NULL,&numDevices); // retrieve Device number
printf("# of device:%d\n", numDevices);
devices = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id)); // malloct memery for device
status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_GPU,numDevices,devices,NULL); // fill in device
/*print device informations*/
for (int i = 0; i < numDevices; i++)
{
size_t value_size = 0;
//name
status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 0, NULL, &value_size); char* name1 = (char*)malloc(value_size); status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, value_size, name1, NULL); printf("CL_DEVICE_NAME:%s\n", name1); //PARALLEL COMPUTE UNITS(CU) cl_uint maxComputeUnits = 0; status = clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(maxComputeUnits), &maxComputeUnits, NULL); printf("CL_DEVICE_MAX_COMPUTE_UNITS:%u\n", maxComputeUnits); //maxWorkItemPerGroup size_t maxWorkItemPerGroup = 0; status = clGetDeviceInfo(devices[0], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(maxWorkItemPerGroup), &maxWorkItemPerGroup, NULL); printf("CL_DEVICE_MAX_WORK_GROUP_SIZE: %d\n", maxWorkItemPerGroup); //maxGlobalMemSize cl_ulong maxGlobalMemSize = 0; status = clGetDeviceInfo(devices[0], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(maxGlobalMemSize), &maxGlobalMemSize, NULL); printf("CL_DEVICE_GLOBAL_MEM_SIZE: %lu(MB)\n", maxGlobalMemSize / 1024 / 1024); //maxConstantBufferSize cl_ulong maxConstantBufferSize = 0; clGetDeviceInfo(devices[0], CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(maxConstantBufferSize), &maxConstantBufferSize, NULL); printf("CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE: %lu(KB)\n", maxConstantBufferSize / 1024); //maxLocalMemSize cl_ulong maxLocalMemSize = 0; status = clGetDeviceInfo(devices[0], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(maxLocalMemSize), &maxLocalMemSize, NULL); printf("CL_DEVICE_LOCAL_MEM_SIZE: %lu(KB)\n", maxLocalMemSize / 1024);
// release printf("\n\n"); free(name1);
}
/*Creat a context*/
cl_context context = NULL;
context = clCreateContext( NULL, numDevices,devices,NULL,NULL,&status);
// context = clCreateContextFromType(NULL,CL_DEVICE_TYPE_ALL,NULL,NULL,&status);
// cl_device_id device_list;
size_t device_num;
clGetContextInfo(context, CL_CONTEXT_NUM_DEVICES, 0, NULL, &device_num);
printf("Size of cl_device_id:%d\n", sizeof(cl_device_id));
printf("Num of device in Context:%d\n", device_num);
// device_list = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &device_list);
/*Create a command queue*/
cl_command_queue cmdQueue;
cmdQueue = clCreateCommandQueue(context,devices[0],0,&status);
/*Create device buffers*/
cl_mem bufferA;
cl_mem bufferB;
cl_mem bufferC;
bufferA = clCreateBuffer(context,CL_MEM_READ_ONLY,datasize,NULL,&status);
bufferB = clCreateBuffer(context,CL_MEM_READ_ONLY,datasize,NULL,&status);
bufferC = clCreateBuffer(context,CL_MEM_WRITE_ONLY,datasize,NULL,&status);
/*Write host data to device buffers*/
status = clEnqueueWriteBuffer(cmdQueue,bufferA,CL_FALSE,0,datasize,A,0,NULL,NULL);
status = clEnqueueWriteBuffer(cmdQueue,bufferB,CL_FALSE,0,datasize,B,0,NULL,NULL);
// status = clEnqueueWriteBuffer(cmdQueue,bufferC,CL_FALSE,0,datasize,C,0,NULL,NULL);
/*Create and compile the program*/
cl_program program = clCreateProgramWithSource(context,1,(const char **)&programSource,NULL,&status);
status = clBuildProgram(program,numDevices,devices,NULL,NULL,NULL);
if (status!= CL_SUCCESS)
{
size_t len;
char buffer[8 * 1024];
printf("Error: Failed to build program executable!\n");
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
printf("%s\n", buffer);
}
/*Create the kernel*/
cl_kernel kernel = NULL;
kernel = clCreateKernel(program, "vecadd", &status);
/*Set the kernel arguments*/
status = clSetKernelArg(kernel,0,sizeof(cl_mem),&bufferA);
status = clSetKernelArg(kernel,1,sizeof(cl_mem),&bufferB);
status = clSetKernelArg(kernel,2,sizeof(cl_mem),&bufferC);
/*CONFIGURE THE WORK-ITEM STRUCTURE*/
size_t globalWorkSize[1];
globalWorkSize[0] = elements;
// size_t globalSize[1] = { elements }, localSize[1] = { 256 };
/*Enqueue the kernel for execution*/
status = clEnqueueNDRangeKernel(cmdQueue,kernel,1,NULL,globalWorkSize,NULL,0,NULL,NULL);
/*Read the buffer output back to host*/
clFinish(cmdQueue);
clEnqueueReadBuffer(cmdQueue,bufferC,CL_TRUE,0,datasize,C,0,NULL,NULL);
printf("The calculated outcome:");
for (int i = 0; i < elements; i++)
{
printf("%d", C[i]);
}
printf("\n");
bool result = true;
printf("The right outcome:");
for (int i = 0; i < elements; i++)
{
D[i] = i + i;
printf("%d", D[i]);
}
printf("\n");
for (int i = 0; i < elements; i++)
{
if (C[i] != D[i])
{
result = false;
break;
}
}
if (result)
{
printf("Output is correct!\n");
}
else
{
printf("Output is incorrect!\n");
}
/*Release OpenCL resources*/
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(cmdQueue);
clReleaseMemObject(bufferA);
clReleaseMemObject(bufferB);
clReleaseMemObject(bufferC);
clReleaseContext(context);
free(A);
free(B);
free(C);
free(platforms);
free(devices);
getchar();
}
下面,分別對代碼進行分析,瞭解其是怎麼實現異構編程的。
頭文件就不用說了,使用OpenCL進行編程的時候需要添加對應的庫,該庫一般設備商會提供。如果只是在PC端使用VS仿真編程,則去OpenCL官方網站下載對應的Header,設置相關庫的路徑即可。
首先,以下代碼:
const char* programSource =
"__kernel void vecadd(__global int* A, __global int* B, __global int* C)\n"
"{ \n"
" int idx=get_global_id(0); \n"
" C[idx]=A[idx]+B[idx]; \n"
"} \n"
;
該部分代碼稱爲Kernel,是真正運行在GPU上的程序。上一篇博客中講執行模型的時候提到,OpenCL程序分爲兩部分,一部分運行在宿主機(CPU),另一部分運行在計算設備上(針對GPU編程,指GPU),上述代碼就是指後者。這部分代碼可以像上面一樣,以字符的方式放在CPU程序中,也可以單獨寫一個.cl
文件放在外面。Kernel的語法有一定的規則,比如內核函數必須以__kernel 限定符開頭等,相關的可以查看具體版本OpenCL的Specification,在其官方可以下載,現在可以先不用管。
接下來確定我們的平臺和設備。
status = clGetPlatformIDs(0, NULL, &numPlatforms); //retrieve number of platforms
printf("# of platform:%d\n", numPlatforms);
platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id)); // malloct memery for platform
status = clGetPlatformIDs(numPlatforms, platforms, NULL); // initialize platform
函數clGetPlatformIDs()用於獲取平臺信息,該函數通常調用兩次,第一次用於查詢,第二次則指定我們所使用的平臺。設備信息則通過clGetDeviceIDs()獲得和指定,用法和clGetPlatformIDs()類似,如下所示:
cl_uint numDevices = 0;
cl_device_id* devices = NULL;
status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_GPU,0,NULL,&numDevices); // retrieve Device number
printf("# of device:%d\n", numDevices);
devices = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id)); // malloct memery for device
status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_GPU,numDevices,devices,NULL); // fill in device
通過clGetPlatformInfo()和clGetDeviceInfo()可以查詢平臺和設備的相關信息,根據輸入的參數獲取對應的信息。上述代碼的結果如下圖所示:
平臺與設備信息獲得後,則可以創建Context,將指定的CPU和GPU放在一個context下,則在Context中對其進行操作,其關係可以用下圖表示:
P.S:該素材來源於AMD OpenCL大學教程
對應的API
cl_context context = NULL;
context = clCreateContext( NULL, numDevices,devices,NULL,NULL,&status);
同樣可以通過 clGetContextInfo()查詢Context的相關信息。
Context用於管理對象,對對象的操作則是通過Commandqueue命令隊列實現的。
cl_command_queue cmdQueue;
cmdQueue = clCreateCommandQueue(context,devices[0],0,&status);
命令隊列創建後,則需要創建內存對象。因爲我們的數據一開始是在host上的,我們需要創建Context中的buffer,來實現host和compute device之間的數據傳輸。
cl_mem bufferA;
cl_mem bufferB;
cl_mem bufferC;
bufferA = clCreateBuffer(context,CL_MEM_READ_ONLY,datasize,NULL,&status);
bufferB = clCreateBuffer(context,CL_MEM_READ_ONLY,datasize,NULL,&status);
bufferC = clCreateBuffer(context,CL_MEM_WRITE_ONLY,datasize,NULL,&status);
通過上述命令創建的內存對象是空的,還沒有數據。通過以下命令,將宿主機上的數據傳輸至GPU上
status = clEnqueueWriteBuffer(cmdQueue,bufferA,CL_FALSE,0,datasize,A,0,NULL,NULL);
status = clEnqueueWriteBuffer(cmdQueue,bufferB,CL_FALSE,0,datasize,B,0,NULL,NULL);
在我們的應用中,只有矢量A和B一開始是有值,C是計算完成後才獲得值。一開始寫入GPU的數據只有A和B,C在計算完成後再讀回host.
接下來創建程序對象,創建結束後build,build爲所關聯的設備生成可執行體。創建程序對象時,加載的是Kernel字符串。如果Kernel是單獨寫在外面的cl程序,則加載對應.cl
文件即可。
cl_program program = clCreateProgramWithSource(context,1,(const char**)&programSource,NULL,&status);
status = clBuildProgram(program,numDevices,devices,NULL,NULL,NULL);
程序創建後,創建在GPU上執行的Kernel對象,並傳入相關參數。
cl_kernel kernel = NULL;
kernel = clCreateKernel(program, "vecadd", &status);
status = clSetKernelArg(kernel,0,sizeof(cl_mem),&bufferA);
status = clSetKernelArg(kernel,1,sizeof(cl_mem),&bufferB);
status = clSetKernelArg(kernel,2,sizeof(cl_mem),&bufferC);
Kernel不同於Program之處在於,Program是屬於host上的,只是加載程序字符並且關聯到設備
Kernel則是運行函數實體,然後通過設置內核參數API設置參數的值。
下面,則開始執行內核。
內核執行的時候,則根據上一章節所提到的執行模型生成索引空間,索引空間的大小以及每個工作組的大小都需要事先設定,如下所示。
size_t globalWorkSize[1];
globalWorkSize[0] = elements;
然後執行內核:
status = clEnqueueNDRangeKernel(cmdQueue,kernel,1,NULL,globalWorkSize,NULL,0,NULL,NULL);
需要注意的是,此時程序還沒有真正運行,只是將其入隊,具體什麼時候執行,由隊列決定。關於這部分我也還不是很清楚,目前知道的就是OpenCL有一定的執行機制,比如順序執行或者亂序執行等等,等後面有了比較全面的瞭解後再更新。
GPU計算結束後,我們需要將數據傳回給CPU。
clEnqueueReadBuffer(cmdQueue,bufferC,CL_TRUE,0,datasize,C,0,NULL,NULL);
操作完成後,需要釋放相關資源與內存。
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(cmdQueue);
clReleaseMemObject(bufferA);
clReleaseMemObject(bufferB);
clReleaseMemObject(bufferC);
clReleaseContext(context);
至此,我們通過OpenCL完成了CPU 、GPU的異構編程。結合Demo,相信你對OpenCL的相關概念以及其過程有了一定認識。
OpenCL還有很多東西需要進一步學習,比如版本之間的更新情況以及命令隊列執行機制,事件等等,等後面學習到了,也會分享出來的。
其實,我對這個過程以及相關概念已經比較熟悉了,但是我發現寫出來還是有點混亂,邏輯不是很清晰,特別是上一篇。自己也想通過業餘時間寫博客的方式,記錄所學的東西並訓練自己的表達能力吧。因爲是新手,有很多理解不到位的地方,煩請大家指出,我一定虛心接納。