Differences Between OpenCL and CUDA

A comparison of the similarities and differences between OpenCL and CUDA, based on NVIDIA's official documentation:

  • Pointer Traversal

OpenCL does not support CUDA-style pointer traversal inside buffers; you can only emulate it indirectly by storing indices (offsets) instead of pointers. Example code:

// CUDA
struct Node { Node* next; };
n = n->next;

// OpenCL
struct Node { unsigned int next; };  // next holds an index, not an address
n = bufBase + n->next;               // turn the stored index back into a pointer
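
To see the index-based idiom in context, here is a hedged, self-contained sketch of an OpenCL kernel that walks such a list; the Node layout, the sumList name, and the single-work-item design are all illustrative assumptions, not part of the original guide:

// Minimal sketch: all nodes live in one __global array, and `next`
// holds an index into that array rather than a device address.
typedef struct { unsigned int next; float value; } Node;

__kernel void sumList(__global const Node * bufBase, // base of the node array
                      unsigned int head,             // index of the first node
                      unsigned int count,            // number of nodes to visit
                      __global float * result)
{
    float sum = 0.0f;
    __global const Node * n = bufBase + head;  // index -> pointer, computed on the device
    for (unsigned int i = 0; i < count; ++i) {
        sum += n->value;
        n = bufBase + n->next;                 // traverse by index, not by stored pointer
    }
    *result = sum;
}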

  • Kernel Programs

CUDA code is ultimately compiled into a binary format for the GPU, then loaded onto the GPU and executed by the runtime (the author's guess: cudart.dll). In OpenCL, the runtime library contains the compiler: kernels are shipped as source text and compiled and loaded just-in-time when the program runs. This resembles Java and .NET programs, and for the same reason: cross-platform compatibility. The kernel syntax also differs slightly, as shown below:

// CUDA
__global__ void vectorAdd(const float * a, const float * b, float * c)
{
    int nIndex = blockIdx.x * blockDim.x + threadIdx.x;
    c[nIndex] = a[nIndex] + b[nIndex];
}

// OpenCL
__kernel void vectorAdd(__global const float * a, __global const float * b, __global float * c)
{
    int nIndex = get_global_id(0);
    c[nIndex] = a[nIndex] + b[nIndex];
}
 

Most of the code is identical; only the details differ:

1) A CUDA kernel function is declared with "__global__", while an OpenCL kernel function is declared with "__kernel".

2) All of OpenCL's pointer parameters carry the "__global" qualifier, indicating that the address each parameter points to is in global memory.

3) CUDA uses threadIdx.{x|y|z} and blockIdx.{x|y|z} to obtain the current thread's index, while OpenCL obtains the global index within the kernel through the get_global_id() function. To obtain the local index within the current work-group (the equivalent of a CUDA block), use get_local_id(). The sketch below makes this correspondence explicit.
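
Here is a hedged variant of the kernel (the name vectorAddExplicit is illustrative) that computes the global index the CUDA way using OpenCL's work-group functions; with a zero global work offset it is equivalent to get_global_id(0):

// OpenCL work-item functions and their CUDA counterparts (1D case):
//   get_local_id(0)   ~ threadIdx.x
//   get_group_id(0)   ~ blockIdx.x
//   get_local_size(0) ~ blockDim.x
__kernel void vectorAddExplicit(__global const float * a, __global const float * b, __global float * c)
{
    // blockIdx.x * blockDim.x + threadIdx.x, spelled in OpenCL:
    int nIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);
    c[nIndex] = a[nIndex] + b[nIndex];
}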

  • Host Code

Compile the kernel code above into "vectorAdd.cubin"; CUDA then calls it as follows:

const unsigned int cnBlockSize = 512;
const unsigned int cnBlocks = 3;
const unsigned int cnDimension = cnBlocks * cnBlockSize;
CUdevice hDevice;
CUcontext hContext;
CUmodule hModule;
CUfunction hFunction;
// create CUDA device & context
cuInit(0);
cuDeviceGet(&hDevice, 0); // pick first device
cuCtxCreate(&hContext, 0, hDevice);
cuModuleLoad(&hModule, "vectorAdd.cubin");
cuModuleGetFunction(&hFunction, hModule, "vectorAdd");
// allocate host vectors
float * pA = new float[cnDimension];
float * pB = new float[cnDimension];
float * pC = new float[cnDimension];
// initialize host memory
randomInit(pA, cnDimension);
randomInit(pB, cnDimension);
// allocate memory on the device
CUdeviceptr pDeviceMemA, pDeviceMemB, pDeviceMemC;
cuMemAlloc(&pDeviceMemA, cnDimension * sizeof(float));
cuMemAlloc(&pDeviceMemB, cnDimension * sizeof(float));
cuMemAlloc(&pDeviceMemC, cnDimension * sizeof(float));
// copy host vectors to device
cuMemcpyHtoD(pDeviceMemA, pA, cnDimension * sizeof(float));
cuMemcpyHtoD(pDeviceMemB, pB, cnDimension * sizeof(float));
// setup parameter values
cuFuncSetBlockShape(hFunction, cnBlockSize, 1, 1);
cuParamSeti(hFunction, 0, pDeviceMemA);
cuParamSeti(hFunction, 4, pDeviceMemB);
cuParamSeti(hFunction, 8, pDeviceMemC);
cuParamSetSize(hFunction, 12);
// execute kernel
cuLaunchGrid(hFunction, cnBlocks, 1);
// copy the result from device back to host
cuMemcpyDtoH((void *) pC, pDeviceMemC, cnDimension * sizeof(float));
delete[] pA;
delete[] pB;
delete[] pC;
cuMemFree(pDeviceMemA);
cuMemFree(pDeviceMemB);
cuMemFree(pDeviceMemC);
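
None of the calls above check their return values. Every Driver API function returns a CUresult, so a minimal hedged error-checking sketch (the CU_CHECK macro is the author's own convenience, not part of the API) could wrap each call:

#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

// Abort with a location message whenever a Driver API call fails.
#define CU_CHECK(call)                                    \
    do {                                                  \
        CUresult err = (call);                            \
        if (err != CUDA_SUCCESS) {                        \
            fprintf(stderr, "CUDA error %d at %s:%d\n",   \
                    (int)err, __FILE__, __LINE__);        \
            exit(EXIT_FAILURE);                           \
        }                                                 \
    } while (0)

// usage: CU_CHECK(cuInit(0));
//        CU_CHECK(cuDeviceGet(&hDevice, 0));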

The OpenCL code is stored as plain text in sProgramSource. It is called as follows:

const size_t cnBlockSize = 512;  // size_t: passed by pointer to clEnqueueNDRangeKernel
const size_t cnBlocks = 3;
const size_t cnDimension = cnBlocks * cnBlockSize;
// create OpenCL device & context
cl_context hContext;
hContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, 0, 0, 0);
// query all devices available to the context
size_t nContextDescriptorSize;
clGetContextInfo(hContext, CL_CONTEXT_DEVICES, 0, 0, &nContextDescriptorSize);
cl_device_id * aDevices = (cl_device_id *) malloc(nContextDescriptorSize);
clGetContextInfo(hContext, CL_CONTEXT_DEVICES, nContextDescriptorSize, aDevices, 0);
// create a command queue for the first device the context reported
cl_command_queue hCmdQueue;
hCmdQueue = clCreateCommandQueue(hContext, aDevices[0], 0, 0);
// create & compile program (sProgramSource is a const char * holding the kernel source)
cl_program hProgram;
hProgram = clCreateProgramWithSource(hContext, 1, &sProgramSource, 0, 0);
clBuildProgram(hProgram, 0, 0, 0, 0, 0);
// create kernel
cl_kernel hKernel;
hKernel = clCreateKernel(hProgram, "vectorAdd", 0);
// allocate host vectors
float * pA = new float[cnDimension];
float * pB = new float[cnDimension];
float * pC = new float[cnDimension];
// initialize host memory
randomInit(pA, cnDimension);
randomInit(pB, cnDimension);
// allocate device memory
cl_mem hDeviceMemA, hDeviceMemB, hDeviceMemC;
hDeviceMemA = clCreateBuffer(hContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cnDimension * sizeof(cl_float), pA, 0);
hDeviceMemB = clCreateBuffer(hContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cnDimension * sizeof(cl_float), pB, 0);
hDeviceMemC = clCreateBuffer(hContext, CL_MEM_WRITE_ONLY, cnDimension * sizeof(cl_float), 0, 0);
// setup parameter values
clSetKernelArg(hKernel, 0, sizeof(cl_mem), (void *)&hDeviceMemA);
clSetKernelArg(hKernel, 1, sizeof(cl_mem), (void *)&hDeviceMemB);
clSetKernelArg(hKernel, 2, sizeof(cl_mem), (void *)&hDeviceMemC);
// execute kernel (global size cnDimension, work-group size cnBlockSize)
clEnqueueNDRangeKernel(hCmdQueue, hKernel, 1, 0, &cnDimension, &cnBlockSize, 0, 0, 0);
// copy results from device back to host
clEnqueueReadBuffer(hCmdQueue, hDeviceMemC, CL_TRUE, 0, cnDimension * sizeof(cl_float), pC, 0, 0, 0);
delete[] pA;
delete[] pB;
delete[] pC;
clReleaseMemObject(hDeviceMemA);
clReleaseMemObject(hDeviceMemB);
clReleaseMemObject(hDeviceMemC);
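
The listing assumes sProgramSource already points at the kernel source text. A hedged helper for loading it from a file (the file name "vectorAdd.cl" below is an assumption) could look like this:

#include <stdio.h>
#include <stdlib.h>

// Read a whole file into a NUL-terminated buffer; the caller frees it.
// clCreateProgramWithSource accepts NUL-terminated strings when its
// lengths argument is 0.
static char * loadSource(const char * path)
{
    FILE * f = fopen(path, "rb");
    if (!f) return 0;
    fseek(f, 0, SEEK_END);
    long nSize = ftell(f);
    fseek(f, 0, SEEK_SET);
    char * pSrc = (char *) malloc(nSize + 1);
    fread(pSrc, 1, nSize, f);
    pSrc[nSize] = '\0';
    fclose(f);
    return pSrc;
}

// const char * sProgramSource = loadSource("vectorAdd.cl");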

  • Initialization

CUDA must call cuInit(0) before using any other API, then pick one of the system's available devices and create a context:
cuInit(0);
cuDeviceGet(&hDevice, 0);
cuCtxCreate(&hContext, 0, hDevice);

OpenCL needs no global initialization; simply specify the device type and obtain a context handle directly:
cl_context hContext;
hContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, 0, 0, 0);

Once the context has been created, the device list and other context information can be queried as follows:
size_t nContextDescriptorSize;
clGetContextInfo(hContext, CL_CONTEXT_DEVICES, 0, 0, &nContextDescriptorSize);
cl_device_id * aDevices = (cl_device_id *) malloc(nContextDescriptorSize);

clGetContextInfo(hContext, CL_CONTEXT_DEVICES, nContextDescriptorSize, aDevices, 0);
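
As a small hedged addition (assuming <stdio.h> is included), the devices returned by this query can be listed by name with the standard CL_DEVICE_NAME query:

// Print the name of every device the context reported.
size_t nDevices = nContextDescriptorSize / sizeof(cl_device_id);
for (size_t i = 0; i < nDevices; ++i) {
    char sName[256];
    clGetDeviceInfo(aDevices[i], CL_DEVICE_NAME, sizeof(sName), sName, 0);
    printf("device %u: %s\n", (unsigned) i, sName);
}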

OpenCL introduces an additional concept: Command Queues. Commands launching kernels and
reading or writing memory are always issued for a specific command queue. A command queue is
created on a specific device in a context. The following code creates a command queue for the
device and context created so far:
cl_command_queue hCmdQueue;
hCmdQueue = clCreateCommandQueue(hContext, aDevices[0], 0, 0);
With this the program has progressed to the point where data can be uploaded to the device’s
memory and processed by launching compute kernels on the device.

  • Kernel Creation

CUDA kernels are stored in binary form in CUBIN files. The calling convention resembles using a DLL: first load the binary module, then look up the function address by name, and finally launch the function on the GPU. Sample code:
CUmodule hModule;
cuModuleLoad(&hModule, "vectorAdd.cubin");
cuModuleGetFunction(&hFunction, hModule, "vectorAdd");

To support multiple platforms, OpenCL does not ship compiled code. Like Java, it loads the source text and compiles and runs it just-in-time.
Note that OpenCL also provides an API to retrieve a kernel's compiled binary, provided the kernel has already been compiled and is held in a particular cache.

// load the source text and compile it just-in-time
// (sProgramSource holds the contents of vectorAdd.c)
cl_program hProgram;
hProgram = clCreateProgramWithSource(hContext, 1, &sProgramSource, 0, 0);
clBuildProgram(hProgram, 0, 0, 0, 0, 0);
// obtain a handle to the kernel function
cl_kernel hKernel;
hKernel = clCreateKernel(hProgram, "vectorAdd", 0);
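
The return value of clBuildProgram goes unchecked above. When the just-in-time compile fails, the compiler messages can be retrieved with clGetProgramBuildInfo; a minimal sketch (the 4 KB log buffer size is an arbitrary choice):

// On build failure, fetch and print the compiler log for the first device.
if (clBuildProgram(hProgram, 0, 0, 0, 0, 0) != CL_SUCCESS) {
    char sLog[4096];
    clGetProgramBuildInfo(hProgram, aDevices[0], CL_PROGRAM_BUILD_LOG,
                          sizeof(sLog), sLog, 0);
    fprintf(stderr, "build log:\n%s\n", sLog);
}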

 

  • Device Memory Allocation

Memory allocation shows no big difference. OpenCL provides two special flags, CL_MEM_READ_ONLY and CL_MEM_WRITE_ONLY, to control a buffer's read/write access. Another useful flag is CL_MEM_COPY_HOST_PTR, which indicates that the buffer should be initialized from host memory: the runtime automatically copies the host memory contents to the GPU, so device allocation and the host-to-device copy are collapsed into a single call.
// CUDA

CUdeviceptr pDeviceMemA, pDeviceMemB, pDeviceMemC;
cuMemAlloc(&pDeviceMemA, cnDimension * sizeof(float));
cuMemAlloc(&pDeviceMemB, cnDimension * sizeof(float));
cuMemAlloc(&pDeviceMemC, cnDimension * sizeof(float));
cuMemcpyHtoD(pDeviceMemA, pA, cnDimension * sizeof(float));
cuMemcpyHtoD(pDeviceMemB, pB, cnDimension * sizeof(float));
// OpenCL
hDeviceMemA = clCreateBuffer(hContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cnDimension * sizeof(cl_float), pA, 0);
hDeviceMemB = clCreateBuffer(hContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cnDimension * sizeof(cl_float), pB, 0);
hDeviceMemC = clCreateBuffer(hContext, CL_MEM_WRITE_ONLY, cnDimension * sizeof(cl_float), 0, 0);
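
All three calls above pass 0 for clCreateBuffer's final errcode_ret parameter; a hedged sketch of checking it instead:

// clCreateBuffer reports failure through its optional errcode_ret argument.
cl_int nErr;
hDeviceMemA = clCreateBuffer(hContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                             cnDimension * sizeof(cl_float), pA, &nErr);
if (nErr != CL_SUCCESS) { /* handle the allocation failure */ }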

  • Kernel Parameter Specification

The next step in preparing the kernels for launch is to establish a mapping between the kernels' parameters, essentially pointers to the three vectors A, B and C, and the three device memory regions which were allocated in the previous section.
Parameter setting in both APIs is a pretty low-level affair. It requires knowledge of the total number, order, and types of a given kernel's parameters. The order and types of the parameters are used to determine a specific parameter's offset inside the data block made up of all parameters. The offset in bytes for the n-th parameter is essentially the sum of the sizes of all (n-1) preceding parameters.
Using the CUDA Driver API:
In CUDA device pointers are represented as unsigned int and the CUDA Driver API has a
dedicated method for setting that type. Here’s the code for setting the three parameters. Note how
the offset is incrementally computed as the sum of the previous parameters’ sizes.
cuParamSeti(hFunction, 0, pDeviceMemA);
cuParamSeti(hFunction, 4, pDeviceMemB);
cuParamSeti(hFunction, 8, pDeviceMemC);
cuParamSetSize(hFunction, 12);
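A hedged variant that derives the offsets instead of hard-coding 0, 4 and 8 (still assuming the legacy Driver API, where device pointers are passed as 32-bit unsigned ints):

// Each parameter's offset is the running sum of the preceding sizes.
int nOffset = 0;
cuParamSeti(hFunction, nOffset, pDeviceMemA); nOffset += sizeof(unsigned int);
cuParamSeti(hFunction, nOffset, pDeviceMemB); nOffset += sizeof(unsigned int);
cuParamSeti(hFunction, nOffset, pDeviceMemC); nOffset += sizeof(unsigned int);
cuParamSetSize(hFunction, nOffset);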
Using OpenCL:
In OpenCL parameter setting is done via a single function that takes a pointer to the location of the
parameter to be set.
clSetKernelArg(hKernel, 0, sizeof(cl_mem), (void *)&hDeviceMemA);
clSetKernelArg(hKernel, 1, sizeof(cl_mem), (void *)&hDeviceMemB);
clSetKernelArg(hKernel, 2, sizeof(cl_mem), (void *)&hDeviceMemC);

  • Kernel Launch

Launching a kernel requires the specification of the dimension and size of the “thread-grid”. The
CUDA Programming Guide and the OpenCL specification contain details about the structure of
those grids. For NVIDIA GPUs the permissible structures are the same for CUDA and OpenCL.
For the vectorAdd sample we need to start one thread per vector-element (of the output vector).
The number of elements in the vector is given in the cnDimension variable. It is defined to be
cnDimension = cnBlockSize * cnBlocks. This means that cnDimension threads
need to be executed. The threads are structured into cnBlocks one-dimensional thread blocks of
size cnBlockSize.
Using the CUDA Driver API:
A kernel's block size is specified in a call separate from the actual kernel launch using cuFuncSetBlockShape. The kernel launching function cuLaunchGrid then only specifies the number of blocks to be launched.
cuFuncSetBlockShape(hFunction, cnBlockSize, 1, 1);
cuLaunchGrid(hFunction, cnBlocks, 1);
Using OpenCL:
The OpenCL equivalent of kernel launching is to “enqueue” a kernel for execution into a command
queue. The enqueue function takes parameters for both the work group size (work group is the
OpenCL equivalent of a CUDA thread-block), and the global work size, which is the size of the
global array of threads.
Note: Where in CUDA the global work size is specified in terms of number of thread
blocks, it is given in number of threads in OpenCL.
Both work group size and global work size are potentially one-, two-, or three-dimensional arrays. The function expects pointers to size_t values for the global and local work sizes.
For the vectorAdd example, the work groups and the total work size form a one-dimensional grid of threads.
clEnqueueNDRangeKernel(hCmdQueue, hKernel, 1, 0,
&cnDimension, &cnBlockSize, 0, 0, 0);
cnDimension and cnBlockSize must therefore be declared as size_t (as in the host listing above). For work sizes with more than one dimension, the parameters point to arrays of sizes, as sketched below.
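
For example, a hedged two-dimensional launch (the sizes and the hKernel2D handle are illustrative) passes both sizes as size_t arrays:

// 1024x768 threads total, organized into 16x16 work-groups.
size_t aGlobalSize[2] = { 1024, 768 };
size_t aLocalSize[2]  = { 16, 16 };
clEnqueueNDRangeKernel(hCmdQueue, hKernel2D, 2, 0,
                       aGlobalSize, aLocalSize, 0, 0, 0);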

  • Result Data Retrieval

Both kernel launch functions (CUDA and OpenCL) are asynchronous, i.e. they return immediately after scheduling the kernel to be executed on the GPU. In order for a copy operation that retrieves the result vector C (device to host) to produce correct results, synchronization with the kernel's completion needs to happen.
CUDA memcpy functions automatically synchronize, completing any kernel launches issued beforehand. Both APIs also provide a set of asynchronous memory transfer functions that allow a user to overlap memory transfers with computation to increase throughput.
Using the CUDA Driver API:
Use cuMemcpyDtoH() to copy results back to the host.
cuMemcpyDtoH((void *)pC, pDeviceMemC, cnDimension * sizeof(float));
Using OpenCL:
OpenCL’s clEnqueueReadBuffer() function allows the user to specify whether a read is to
be synchronous or asynchronous (third argument). For the simple vectorAdd sample a
synchronizing read is used, which results in the same behavior as the simple synchronous CUDA
memory copy above:
clEnqueueReadBuffer(hCmdQueue, hDeviceMemC, CL_TRUE, 0, cnDimension * sizeof(cl_float), pC, 0, 0, 0);
When used for asynchronous reads, OpenCL has an event mechanism that allows the host
application to query the status or wait for the completion of a given call.
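
A hedged sketch of such an asynchronous read: CL_FALSE makes the call return immediately, and the returned event is then used to wait for completion.

// Enqueue a non-blocking read and capture its completion event.
cl_event hReadDone;
clEnqueueReadBuffer(hCmdQueue, hDeviceMemC, CL_FALSE, 0,
                    cnDimension * sizeof(cl_float), pC, 0, 0, &hReadDone);
// ... independent host work can overlap the transfer here ...
clWaitForEvents(1, &hReadDone);  // block until the copy has finished
clReleaseEvent(hReadDone);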

 
