一、案例簡述

　　本案例講述使用OpenCL計算矩陣乘法：C = A * B 。

　　設A、B、C分別是大小爲N*P、P*M和N*M的矩陣，那麼順序實現的C代碼可以如下所示：

 // C Function
// Sequential reference implementation of the matrix product C = A * B.
//
// All three matrices are stored row-major in flat arrays:
//   A holds Ndim x Pdim elements, B holds Pdim x Mdim, C holds Ndim x Mdim.
// A and B are only read; every element of C is overwritten. When Pdim <= 0
// the inner loop never runs and C is filled with zeros.
void mat_mul(
			 int Ndim, int Mdim, int Pdim,
			 const float* A, const float* B, float* C)
{
	for (int i = 0; i < Ndim; i++) {
		for (int j = 0; j < Mdim; j++) {
			// Dot product of row i of A with column j of B.
			float tmp = 0.0f;
			for (int k = 0; k < Pdim; k++)
				tmp += A[i*Pdim + k] * B[k*Mdim + j];
			C[i*Mdim + j] = tmp;
		}
	}
}

二、 OpenCL實現矩陣乘法

1. 內核函數實現

 // OpenCL Kernel Function
__kernel void HelloOpenCL(
						  const int Ndim,
						  const int Mdim,
						  const int Pdim,
						  __global const float* A, 
						  __global const float* B, 
						  __global float* C)
{
	// One work-item per element of C: NDRange dimension 0 selects the row,
	// dimension 1 selects the column.
	const int row = get_global_id(0);
	const int col = get_global_id(1);

	// Work-items that fall outside the Ndim x Mdim output do nothing.
	if ((row >= Ndim) || (col >= Mdim))
		return;

	// Dot product of row `row` of A with column `col` of B.
	float acc = 0.0f;
	for (int idx = 0; idx < Pdim; idx++)
		acc += A[row*Pdim + idx] * B[idx*Mdim + col];

	C[row*Mdim + col] = acc;
}

　　爲每個工作項分配一個要計算的乘法矩陣的元素。將針對i，j的外層循環刪除，替換爲函數調用，查找這兩維中對應工作項的全局ID。要特別當心，必須保證得到的工作項ID在矩陣C的範圍內。這三個矩陣都留在全局內存中。

2. 宿主機代碼實現

　　下面是以《基於CUDA的OpenCL開發環境搭建與入門程序示例》中main.cpp宿主機代碼爲基礎的補丁文件。測量運行時間的部分：首先，在clCreateCommandQueue()函數中設置CL_QUEUE_PROFILING_ENABLE標誌；然後，在clEnqueueNDRangeKernel()函數中設置事件對象；最後，通過clGetEventProfilingInfo()函數獲取命令入隊時間和命令執行結束時間。注意：時間的單位是納秒，在最後打印時轉換爲秒顯示。

--- /root/Desktop/main.cpp
+++ /root/Desktop/main_new.cpp
@@ -143,8 +143,10 @@
 	}
 
 	// 4. Choose the first device
-	commandQueue = clCreateCommandQueue(context,
-										devices[0], 0, NULL);
+	commandQueue = clCreateCommandQueue(context, 
+										devices[0], 
+										CL_QUEUE_PROFILING_ENABLE, // needed for the clGetEventProfilingInfo() timing later on
+										NULL);
 	if (commandQueue == NULL) {
 		perror("Failed to create commandQueue for device 0.");
 		exit(1);
@@ -183,14 +185,33 @@
 
 
 	/******** 第四部分 創建內核和內存對象 ********/
-	#define ARRAY_SIZE 10
+	const int Ndim = 3;
+	const int Mdim = 4;
+	const int Pdim = 5;
+
+	int szA = Ndim * Pdim;
+	int szB = Pdim * Mdim;
+	int szC = Ndim * Mdim;
 
 	cl_kernel kernel = 0;
 	cl_mem memObjects[3] = {0, 0, 0};
 
-	float a[ARRAY_SIZE];
-	float b[ARRAY_SIZE];
-	float result[ARRAY_SIZE];
+	// Host copies of the row-major matrices A (N x P), B (P x M), C (N x M).
+	float *A = (float *)malloc(szA * sizeof(float));
+	float *B = (float *)malloc(szB * sizeof(float));
+	float *C = (float *)malloc(szC * sizeof(float));
+	if (A == NULL || B == NULL || C == NULL) {
+		perror("Failed to allocate host matrices.");
+		exit(1);
+	}
+
+	int i, j;
+
+	// Fill A and B with 1, 2, 3, ... so the product is easy to check by hand.
+	for (i = 0; i < szA; i++)
+		A[i] = i + 1;
+	for (i = 0; i < szB; i++)
+		B[i] = i + 1;
 
 	// 8. Create the kernel
     kernel = clCreateKernel(program, "HelloOpenCL", NULL);
@@ -200,23 +221,18 @@
 	}
 
 	// 9. Create memory objects
-	for (int i = 0; i < ARRAY_SIZE; i++) {
-		a[i] = (float)i + 1;
-		b[i] = (float)i + 1;
-	}
-
 	memObjects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY |
 								   CL_MEM_COPY_HOST_PTR,
-								   sizeof(float) * ARRAY_SIZE,
-								   a, NULL);
+								   sizeof(float) * szA,
+								   A, NULL);
 	memObjects[1] = clCreateBuffer(context, CL_MEM_READ_ONLY |
 								   CL_MEM_COPY_HOST_PTR,
-								   sizeof(float) * ARRAY_SIZE,
-								   b, NULL);
+								   sizeof(float) * szB,
+								   B, NULL);
-	memObjects[2] = clCreateBuffer(context, CL_MEM_READ_WRITE |
-								   CL_MEM_COPY_HOST_PTR,
-								   sizeof(float) * ARRAY_SIZE,
-								   result, NULL);
+	// C is only written by the kernel: do not upload the uninitialized host array.
+	memObjects[2] = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
+								   sizeof(float) * szC,
+								   NULL, NULL);
 	if (memObjects[0] == NULL || memObjects[1] == NULL || 
 			memObjects[2] == NULL) {
 		perror("Error in clCreateBuffer.\n");
@@ -225,48 +241,97 @@
 
 
 	/******** 第五部分 執行內核 ********/
-	size_t globalWorkSize[1] = { ARRAY_SIZE };
-	size_t localWorkSize[1] = { 1 };
 
 	// 10. Set the kernel arguments
-	errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
-    errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
-    errNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
+	errNum = clSetKernelArg(kernel, 0, sizeof(int), &Ndim);
+    errNum |= clSetKernelArg(kernel, 1, sizeof(int), &Mdim);
+    errNum |= clSetKernelArg(kernel, 2, sizeof(int), &Pdim);
+	errNum |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &memObjects[0]);
+    errNum |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &memObjects[1]);
+    errNum |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &memObjects[2]);
 	if (errNum != CL_SUCCESS) {
 		perror("Error in clSetKernelArg.\n");
         exit(1);
 	}
 
 	// 11. Queue the kernel up for execution across the array
-	errNum = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL,
-									globalWorkSize, localWorkSize,
-									0, NULL, NULL);
+	size_t global[2];
+	cl_event prof_event;
+	cl_ulong ev_start_time = (cl_ulong)0;
+	cl_ulong ev_end_time = (cl_ulong)0;
+	double run_time;	// QUEUED -> END interval of the kernel command, in nanoseconds
+
+	global[0] = (size_t)Ndim;	// dimension 0: rows of C
+	global[1] = (size_t)Mdim;	// dimension 1: columns of C
+
+	errNum = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL,
+									global, NULL, 0, NULL, &prof_event);
 	if (errNum != CL_SUCCESS) {
 		perror("Error in clEnqueueNDRangeKernel.\n");
         exit(1);
 	}
 
+	clFinish(commandQueue);
+	errNum = clWaitForEvents(1, &prof_event);
+	if (errNum != CL_SUCCESS) {
+		perror("Error in clWaitForEvents.\n");
+        exit(1);
+	}
+
+	errNum = clGetEventProfilingInfo(prof_event,
+									 CL_PROFILING_COMMAND_QUEUED,
+									 sizeof(cl_ulong),
+									 &ev_start_time,
+									 NULL);
+
+	errNum |= clGetEventProfilingInfo(prof_event,
+									 CL_PROFILING_COMMAND_END,
+									 sizeof(cl_ulong),
+									 &ev_end_time,
+									 NULL);
+
+	if (errNum != CL_SUCCESS) {
+		perror("Error in clGetEventProfilingInfo.\n");
+        exit(1);
+	}
+	
 	// 12. Read the output buffer back to the Host
 	errNum = clEnqueueReadBuffer(commandQueue, memObjects[2],
 								 CL_TRUE, 0,
-								 ARRAY_SIZE * sizeof(float), result,
+								 sizeof(float) * szC, C,
 								 0, NULL, NULL);
 	if (errNum != CL_SUCCESS) {
 		perror("Error in clEnqueueReadBuffer.\n");
         exit(1);
 	}
 
+	run_time = (double)(ev_end_time - ev_start_time);
+
 
 	/******** 第六部分 測試結果 ********/
-	printf("\nTest: a * b = c\n\n");
-
-	printf("Input numbers:\n");
-	for (int i = 0; i < ARRAY_SIZE; i++)
-		printf("a[%d] = %f, b[%d] = %f\n", i, a[i], i, b[i]);
-
-	printf("\nOutput numbers:\n");
-	for (int i = 0; i < ARRAY_SIZE; i++)
-		printf("a[%d] * b[%d] = %f\n", i, i, result[i]);
+	
+	printf("\nArray A:\n");
+	for (i = 0; i < Ndim; i++) {
+		for (j = 0; j < Pdim; j++)
+			printf("%.3f\t", A[i*Pdim + j]);
+		printf("\n");
+	}
+
+	printf("\nArray B:\n");
+	for (i = 0; i < Pdim; i++) {
+		for (j = 0; j < Mdim; j++)
+			printf("%.3f\t", B[i*Mdim + j]);
+		printf("\n");
+	}
+
+	printf("\nArray C:\n");
+	for (i = 0; i < Ndim; i++) {
+		for (j = 0; j < Mdim; j++)
+			printf("%.3f\t", C[i*Mdim + j]);
+		printf("\n");
+	}
+	
+	printf("\n\nRunning Time:  %f s\n", run_time*1.0e-9);	// nanoseconds -> seconds
 
 	while(1);

3. 運行結果

　　(1). N = 3，M = 4，P = 5。

　　(2). N = 1000，M = 1000，P = 1000。

三、代碼優化 <工作項分組和減少數據移動>

　　矩陣乘法的核心是一個乘加計算。大多數處理器中的ALU都有足夠的帶寬，可以保證這個計算接近峯值性能運行，不過只有在數據移動的開銷被隱藏時才能做到這一點。因此，優化矩陣運算的根本就是儘量減少數據移動。上面的代碼中的矩陣乘法內核，三個矩陣都保留在全局內存中。這意味着每次乘法都要通過內存層次結構反覆地傳遞行和列(全局內存到私有內存)。

1. 第一次優化

　　在此優化版本中，每個工作項計算矩陣中的一行。NDRange從一個2D range(分區)集(匹配矩陣C的維度)變爲一個1D range(分區)集(匹配矩陣C的行數)。如下內核代碼中，每個工作項管理C中一整行的更新，不過在完成這個更新之前，要把矩陣A中相關的行從全局內存複製到私有內存。注：優化代碼以(N = 1000，M = 1000，P = 1000)爲例。

 // OpenCL Kernel Function
// Number of elements of one row of A that each work-item caches in Awrk.
#define AWRK_SIZE 1000

// Computes one full row of C = A * B per work-item (1-D NDRange over rows).
// A is Ndim x Pdim, B is Pdim x Mdim, C is Ndim x Mdim, all row-major.
__kernel void HelloOpenCL(
						  const int Ndim,
						  const int Mdim,
						  const int Pdim,
						  __global const float* A, 
						  __global const float* B, 
						  __global float* C)
{
	const int i = get_global_id(0);

	int j, k;
	float tmp;
	float Awrk[AWRK_SIZE];

	if (i < Ndim) {
		// Cache at most AWRK_SIZE elements of row i of A. Copying Pdim
		// elements unconditionally would write past the end of Awrk
		// whenever Pdim > AWRK_SIZE.
		const int ncached = (Pdim < AWRK_SIZE) ? Pdim : AWRK_SIZE;

		for (k = 0; k < ncached; k++)
			Awrk[k] = A[i*Pdim + k];

		for (j = 0; j < Mdim; j++) {
			tmp = 0.0f;
			for (k = 0; k < ncached; k++)
				tmp += Awrk[k] * B[k*Mdim + j];
			// Part of the row that did not fit in Awrk (only when
			// Pdim > AWRK_SIZE) is read directly from A, in the same order.
			for (; k < Pdim; k++)
				tmp += A[i*Pdim + k] * B[k*Mdim + j];
			C[i*Mdim + j] = tmp;
		}
	}
}

　　在宿主機代碼中，只需將clEnqueueNDRangeKernel()函數中的維度數由"2"改爲"1"。

// work_dim is 1 here (it was 2 for the element-per-work-item kernel);
// the local size is passed as NULL.
errNum = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL,
									global, NULL, 0, NULL, &prof_event);

　　最終運行結果： 0.295128 秒，是上面代碼中0.692887秒的42.6% ！

　　本次優化通過增加內核工作量，減少數據移動來提高運行速度。其實如果是在GPU中而不是CPU中，單獨使用上述方式增加內核工作量，運行時間反而變爲原來幾倍。

2. 第二次優化

　　首先獲取平臺上的設備數及其最大計算單元數，使用以下代碼。

	// OpenCL device information
	cl_uint numDevices;
	cl_device_id deviceIds[1];
	cl_uint maxComputeUnits;	// CL_DEVICE_MAX_COMPUTE_UNITS is returned as a cl_uint

	// First call: only count the GPU devices on the platform.
	errNum = clGetDeviceIDs (platformIds[0],
							 CL_DEVICE_TYPE_GPU,
							 0, NULL, &numDevices);
	if ((errNum != CL_SUCCESS) || (numDevices < 1)) {
		perror("Error in clGetDeviceIDs or no GPU deivce.");
		exit(1);
	}

	// Second call: fetch the ID of the first GPU device.
	errNum = clGetDeviceIDs (platformIds[0],
							 CL_DEVICE_TYPE_GPU,
							 1, &deviceIds[0], NULL);
	if (errNum != CL_SUCCESS) {
		perror("Error in clGetDeviceIDs.");
		exit(1);
	}

	// The size passed must match the variable that receives the value.
	errNum = clGetDeviceInfo(deviceIds[0],
							 CL_DEVICE_MAX_COMPUTE_UNITS,
							 sizeof(maxComputeUnits),
							 &maxComputeUnits,
							 NULL);
	if (errNum != CL_SUCCESS) {
		perror("Error in clGetDeviceInfo.");
		exit(1);
	}

	printf("numDevices = %u, maxComputeUnits = %u\n", numDevices, maxComputeUnits);

　　運行結果：平臺中有一個設備，最大計算單元數爲7。在此，我們劃分爲4個工作組，每個工作組大小設置爲250。具體代碼如下：

size_t global[1];
size_t local[1];

global[0] = (size_t)Ndim;
local[0] = (size_t)250;

errNum = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL,
								global, local, 0, NULL, &prof_event);

　　最終運行時間爲： 0.208867 秒，約爲原有基礎上的2/3 ！

4. 附錄：關於二維數據的工作組分組

　　在宿主機代碼中，根據clGetDeviceInfo()函數查找CL_DEVICE_MAX_WORK_GROUP_SIZE標誌所對應工作組內最多元素個數，在我的電腦上，爲1024。工作組的大小通常是64的倍數，最好不超過256。localx，localy也有要求，根據我的試驗，必須是4的倍數。

	// Work-group shape for the 2-D NDRange: 8 x 128 work-items.
	size_t local[2] = { 8, 128 };

	errNum = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL,
									global, local, 0, NULL, &prof_event);

　　由於globalx與globaly必須分別能被localx與localy整除，因此通過下述寫法將global向上取整爲local的整數倍，確保在圖像寬與高不能被對應的localx和localy整除時，仍能夠滿足條件。

	// Round each global dimension up to the next multiple of the
	// corresponding work-group dimension.
	global[0] = ((nWidth + local[0] - 1) / local[0]) * local[0];
	global[1] = ((nHeight + local[1] - 1) / local[1]) * local[1];

　　爲此，在相應內核代碼中，必須判斷工作項的全局ID(x, y)是否還在圖像範圍之內。如下：

	// Only work-items whose (x, y) lies inside the nWidth x nHeight range do any work.
	if((x < nWidth) && (y < nHeight))
	{
		...
	}

Johnson Lu

發佈了33 篇原創文章 · 獲贊 8 · 訪問量 10萬+

私信關注

OpenCL案例研究之一

一、案例簡述

二、 OpenCL實現矩陣乘法

1. 內核函數實現

2. 宿主機代碼實現

3. 運行結果

三、代碼優化 <工作項分組和減少數據移動>

1. 第一次優化

2. 第二次優化

4. 附錄：關於二維數據的工作組分組

OpenCL案例研究之一

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結

OpenCL案例研究之一

一、 案例簡述

二、 OpenCL實現矩陣乘法

1. 內核函數實現

2. 宿主機代碼實現

3. 運行結果

三、 代碼優化 <工作項分組和減少數據移動>

1. 第一次優化

2. 第二次優化

4. 附錄：關於二維數據的工作組分組

一、案例簡述

三、代碼優化 <工作項分組和減少數據移動>