簡介
下面一個例子介紹了向量加法的OpenCL版,相當於學習C語言中的“Hello World”,本篇教程中的代碼以及其餘相關教程都可以通過OLCF github下載
vecAdd.c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <CL/opencl.h>
// OpenCL kernel source, held as a C string so it can be handed to
// clCreateProgramWithSource and JIT-compiled for the target device.
// Each work item computes one element of c: c[id] = a[id] + b[id].
// The cl_khr_fp64 pragma enables double-precision support; devices
// without that extension will fail to build this kernel.
const char *kernelSource = "\n" \
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n" \
"__kernel void vecAdd( __global double *a, \n" \
" __global double *b, \n" \
" __global double *c, \n" \
" const unsigned int n) \n" \
"{ \n" \
" //Get our global thread ID \n" \
" int id = get_global_id(0); \n" \
" \n" \
" //Make sure we do not go out of bounds \n" \
" if (id < n) \n" \
" c[id] = a[id] + b[id]; \n" \
"} \n" \
"\n" ;
int main( int argc, char* argv[] )
{
// Length of vectors
unsigned int n = 100000;
// Host input vectors
double *h_a;
double *h_b;
// Host output vector
double *h_c;
// Device input buffers
cl_mem d_a;
cl_mem d_b;
// Device output buffer
cl_mem d_c;
cl_platform_id cpPlatform; // OpenCL platform
cl_device_id device_id; // device ID
cl_context context; // context
cl_command_queue queue; // command queue
cl_program program; // program
cl_kernel kernel; // kernel
// Size, in bytes, of each vector
size_t bytes = n*sizeof(double);
// Allocate memory for each vector on host
h_a = (double*)malloc(bytes);
h_b = (double*)malloc(bytes);
h_c = (double*)malloc(bytes);
// Initialize vectors on host
int i;
for( i = 0; i < n; i++ )
{
h_a[i] = sinf(i)*sinf(i);
h_b[i] = cosf(i)*cosf(i);
}
size_t globalSize, localSize;
cl_int err;
// Number of work items in each local work group
localSize = 64;
// Number of total work items - localSize must be devisor
globalSize = ceil(n/(float)localSize)*localSize;
// Bind to platform
err = clGetPlatformIDs(1, &cpPlatform, NULL);
// Get ID for the device
err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
// Create a context
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
// Create a command queue
queue = clCreateCommandQueue(context, device_id, 0, &err);
// Create the compute program from the source buffer
program = clCreateProgramWithSource(context, 1,
(const char **) & kernelSource, NULL, &err);
// Build the program executable
clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
// Create the compute kernel in the program we wish to run
kernel = clCreateKernel(program, "vecAdd", &err);
// Create the input and output arrays in device memory for our calculation
d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL);
// Write our data set into the input array in device memory
err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0,
bytes, h_a, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0,
bytes, h_b, 0, NULL, NULL);
// Set the arguments to our compute kernel
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &n);
// Execute the kernel over the entire range of the data set
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize,
0, NULL, NULL);
// Wait for the command queue to get serviced before reading back results
clFinish(queue);
// Read the results from the device
clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0,
bytes, h_c, 0, NULL, NULL );
//Sum up vector c and print result divided by n, this should equal 1 within error
double sum = 0;
for(i=0; i<n; i++)
sum += h_c[i];
printf("final result: %f\n", sum/n);
// release OpenCL resources
clReleaseMemObject(d_a);
clReleaseMemObject(d_b);
clReleaseMemObject(d_c);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(queue);
clReleaseContext(context);
//release host memory
free(h_a);
free(h_b);
free(h_c);
return 0;
}
代碼分析
內核(kernel):
kernel是OpenCL代碼的核心部分,整個內核必須通過C字符串的形式讀入,最簡單的辦法是像代碼一樣定義一個長長的字符串,在真實的項目代碼中通常都會從單獨的文件中讀入內核。
// OpenCL kernel. Each work item takes care of one element of c
const char *kernelSource = "\n" \
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n" \
"__kernel void vecAdd( __global double *a, \n" \
" __global double *b, \n" \
" __global double *c, \n" \
" const unsigned int n) \n" \
"{ \n" \
" //Get our global thread ID \n" \
" int id = get_global_id(0); \n" \
" \n" \
" //Make sure we do not go out of bounds \n" \
" if (id < n) \n" \
" c[id] = a[id] + b[id]; \n" \
"} \n" \
"\n" ;
下面是內核的函數聲明:
__kernel void vecAdd( __global double *a,
                      __global double *b,
                      __global double *c,
                      const unsigned int n)
__kernel是一個定義OpenCL內核的關鍵字,__global則定義函數指針指向全局設備內存空間,否則可以使用一般的C語言函數聲明語法。內核的返回值必須爲空void
int id = get_global_id(0);
通過get_global_id函數可以獲得當前工作單元(work item)的全局id,參數爲0表示獲取X維上的ID。
if (id < n) c[id] = a[id] + b[id];
工作組(work group)的個數必定是整數,由於工作組的大小不一定是需要的線程數的整數倍,因此通常使用的線程數比需要的線程數要多,在程序設計時可以將無用的線程簡單丟棄掉。
內存(Memory)
// Host input vectors
double *h_a;
double *h_b;
// Host output vector
double *h_c;
// Device input buffers
cl_mem d_a;
cl_mem d_b;
// Device output buffer
cl_mem d_c;
主機CPU和GPU有不同的內存空間,因此需要分別定義,上面的代碼中前半部分定義主機(host)CPU的內存指針,後半部分定義設備(device)內存的handle,分別用h_和d_前綴來區分。
線程映射(Thread Mapping)
// Number of work items in each local work group
localSize = 64;
// Number of total work items - localSize must be a divisor
globalSize = ceil(n/(float)localSize)*localSize;
爲了將我們要解決的問題映射到底層硬件結構,必須定義局部尺寸(local size)和全局尺寸(global size)。局部尺寸定義了每個工作組中的工作單元數,在NVIDIA GPU上等價於每個線程塊(thread block)中的線程數。全局尺寸定義了工作單元的總數目。全局尺寸必須是局部尺寸的整數倍。
OpenCL前期準備(setup)
// Bind to platform
err = clGetPlatformIDs(1, &cpPlatform, NULL);
每個硬件廠商都會綁定一個不同的平臺(platform),在這裏clGetPlatformIDs會將cpPlatform設置成包含系統可用平臺的變量。舉個例子,如果一個系統包含AMD CPU以及NVIDIA GPU,並且安裝了恰當的OpenCL驅動,那麼兩個OpenCL平臺會被返回。
// Get ID for the device
err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
可以詢問每一個平臺都包含哪些設備,在這裏我們通過使用CL_DEVICE_TYPE_GPU來查詢GPU設備。
// Create a context
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
在使用OpenCL設備之前,必須先準備一個上下文(context),上下文對象用來管理命令隊列(command queue)、內存(memory)、內核操作(Kernel activity),一個上下文對象一般可包含多個設備。
// Create a command queue
queue = clCreateCommandQueue(context, device_id, 0, &err);
命令隊列(command queue)用來流式地將命令從主機送到指定的設備,可以把數據傳輸和內核操作命令放到命令隊列上,當條件適宜的時候命令就會被執行。
編譯內核(Compile Kernel)
program = clCreateProgramWithSource(context, 1,
(const char **) & kernelSource, NULL, &err);
// Build the program executable
clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
// Create the compute kernel in the program we wish to run
kernel = clCreateKernel(program, "vecAdd", &err);
爲了保證OpenCL代碼可以移植到許多不同的設備上,運行kernel的默認方式是JIT(Just-in-time, 實時編譯)。首先創建一個program對象(包含一系列內核代碼),然後再創建一系列的內核。
準備數據(prepare data)
// Create the input and output arrays in device memory for our calculation
d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL);
// Write our data set into the input array in device memory
err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0,
bytes, h_a, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0,
bytes, h_b, 0, NULL, NULL);
// Set the arguments to our compute kernel
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &n);
在啓動內核之前,我們必須創建主機和設備之間的緩存(buffer),並將主機數據(host data)和這些新創建的設備緩存相綁定,最後再設定內核參數。
啓動內核(Launch Kernel)
// Execute the kernel over the entire range of the data set
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize,
0, NULL, NULL);
將結果拷貝回主機(Copy results to host)
// Wait for the command queue to get serviced before reading back results
clFinish(queue);
// Read the results from the device
clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0,
bytes, h_c, 0, NULL, NULL );
我們可以阻塞程序直到命令隊列變爲空,然後把結果拷貝回主機。
編譯(Compile)
$ module load cudatoolkit
$ cc -lOpenCL vecAdd.c -o vecAdd.out
運行(Running)
$ aprun ./vecAdd.out
final result: 1.000000
VecAdd.cc
C++綁定在OpenCL的開發中非常常用,它比標準C接口更爲流暢,下面是一個使用這些綁定的例子。
#define __CL_ENABLE_EXCEPTIONS
#include "cl.hpp"
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <math.h>
#include <vector>
// OpenCL kernel source as a C string, passed to cl::Program at runtime
// for JIT compilation. Each work item computes one element of c:
// c[id] = a[id] + b[id]. The cl_khr_fp64 pragma enables double
// precision; devices lacking that extension will fail the build.
const char *kernelSource = "\n" \
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n" \
"__kernel void vecAdd( __global double *a, \n" \
" __global double *b, \n" \
" __global double *c, \n" \
" const unsigned int n) \n" \
"{ \n" \
" //Get our global thread ID \n" \
" int id = get_global_id(0); \n" \
" \n" \
" //Make sure we do not go out of bounds \n" \
" if (id < n) \n" \
" c[id] = a[id] + b[id]; \n" \
"} \n" \
"\n" ;
int main(int argc, char *argv[])
{
// Length of vectors
unsigned int n = 1000;
// Host input vectors
double *h_a;
double *h_b;
// Host output vector
double *h_c;
// Device input buffers
cl::Buffer d_a;
cl::Buffer d_b;
// Device output buffer
cl::Buffer d_c;
// Size, in bytes, of each vector
size_t bytes = n*sizeof(double);
// Allocate memory for each vector on host
h_a = new double[n];
h_b = new double[n];
h_c = new double[n];
// Initialize vectors on host
for(int i = 0; i < n; i++ )
{
h_a[i] = sinf(i)*sinf(i);
h_b[i] = cosf(i)*cosf(i);
}
cl_int err = CL_SUCCESS;
try {
// Query platforms
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
if (platforms.size() == 0) {
std::cout << "Platform size 0\n";
return -1;
}
// Get list of devices on default platform and create context
cl_context_properties properties[] =
{ CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0};
cl::Context context(CL_DEVICE_TYPE_GPU, properties);
std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
// Create command queue for first device
cl::CommandQueue queue(context, devices[0], 0, &err);
// Create device memory buffers
d_a = cl::Buffer(context, CL_MEM_READ_ONLY, bytes);
d_b = cl::Buffer(context, CL_MEM_READ_ONLY, bytes);
d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, bytes);
// Bind memory buffers
queue.enqueueWriteBuffer(d_a, CL_TRUE, 0, bytes, h_a);
queue.enqueueWriteBuffer(d_b, CL_TRUE, 0, bytes, h_b);
//Build kernel from source string
cl::Program::Sources source(1,
std::make_pair(kernelSource,strlen(kernelSource)));
cl::Program program_ = cl::Program(context, source);
program_.build(devices);
// Create kernel object
cl::Kernel kernel(program_, "vecAdd", &err);
// Bind kernel arguments to kernel
kernel.setArg(0, d_a);
kernel.setArg(1, d_b);
kernel.setArg(2, d_c);
kernel.setArg(3, n);
// Number of work items in each local work group
cl::NDRange localSize(64);
// Number of total work items - localSize must be devisor
cl::NDRange globalSize((int)(ceil(n/(float)64)*64));
// Enqueue kernel
cl::Event event;
queue.enqueueNDRangeKernel(
kernel,
cl::NullRange,
globalSize,
localSize,
NULL,
&event);
// Block until kernel completion
event.wait();
// Read back d_c
queue.enqueueReadBuffer(d_c, CL_TRUE, 0, bytes, h_c);
}
catch (cl::Error err) {
std::cerr
<< "ERROR: "<<err.what()<<"("<<err.err()<<")"<<std::endl;
}
// Sum up vector c and print result divided by n, this should equal 1 within error
double sum = 0;
for(int i=0; i<n; i++)
sum += h_c[i];
std::cout<<"final result: "<<sum/n<<std::endl;
// Release host memory
delete(h_a);
delete(h_b);
delete(h_c);
return 0;
}
編譯(Compile)
需要先下載cl.hpp
$ module load cudatoolkit
$ CC vecAdd.cc -lOpenCL -o vecAdd.out
運行(Running)
$ aprun ./vecAdd.out
final result: 1.000000