- 刪除loop-carried依賴
- // Adds every element A[i*N + j] for i, j in [0, N) plus B[0] .. B[N-1]
- // and stores the grand total in *result.
- // N is not defined in this snippet; it has to be supplied elsewhere.
- // The single accumulator is updated by both loops, so each outer
- // iteration needs the value left behind by the inner loop.
- __kernel void unoptimized(__global int * restrict A,
- __global int * restrict B,
- __global int * restrict result)
- {
- int total = 0;
- for (unsigned row = 0; row < N; ++row) {
- // Walk one row of A, accumulating directly into the shared total.
- for (unsigned col = 0; col < N; ++col) {
- total = total + A[row * N + col];
- }
- // Fold in the matching element of B after the row is done.
- total = total + B[row];
- }
- *result = total;
- }
使用局部變量能夠解除依賴。
- // Computes the same total as the single-accumulator version: every
- // A[i*N + j] for i, j in [0, N) plus B[0] .. B[N-1], stored in *result.
- // N is not defined in this snippet; it has to be supplied elsewhere.
- // Each row is summed into its own fresh local, so the inner loop never
- // touches the outer accumulator.
- __kernel void optimized(__global int * restrict A,
- __global int * restrict B,
- __global int * restrict result)
- {
- int total = 0;
- for (unsigned row = 0; row < N; ++row) {
- // Per-row partial sum, reset on every outer iteration.
- int row_total = 0;
- for (unsigned col = 0; col < N; ++col) {
- row_total = row_total + A[row * N + col];
- }
- // Only here does the outer accumulator get updated.
- total = total + row_total;
- total = total + B[row];
- }
- *result = total;
- }
6.2
- #define N 128
- // Multiplies A[0] .. A[N-1] together in index order, starting from 1.0f,
- // and writes the product to *result.
- // Every iteration multiplies into the same variable, so iteration i+1
- // cannot start its multiply until iteration i has produced its result.
- __kernel void unoptimized(__global float * restrict A,
- __global float * restrict result)
- {
- float product = 1.0f;
- for (unsigned idx = 0; idx < N; ++idx) {
- product = product * A[idx];
- }
- *result = product;
- }
原因在於:未進行優化之前,float類型乘法造成的循環攜帶依賴使循環的II(initiation interval,啓動間隔)爲3;進行優化之後II爲1。思想是不使用單個變量來存儲乘法結果,而是對該變量的M個副本進行操作:每次迭代取數組中最後一個副本與A[i]相乘,然後將數組裏的數據整體移位,並把新的乘積放入第0個位置。這樣的話長爲M的數組中就各自存儲了一部分的乘法數據,最後將這M個副本相乘即爲最終結果。
- #define N 128
- #define M 8
- // Multiplies A[0] .. A[N-1] together and writes the product to *result.
- // Instead of one accumulator, M partial products are kept in
- // mul_copies[]. Each iteration multiplies A[i] into the oldest copy
- // (index M-1), shifts every copy up by one slot, and stores the new
- // value in slot 0. A given partial product is therefore only multiplied
- // again M iterations later. The M partial products are multiplied
- // together at the end.
- // Note: floating-point products are combined in a different order than
- // in a plain sequential loop, so the result may differ in rounding.
- __kernel void optimized(__global float * restrict A,
- __global float * restrict result)
- {
- float mul = 1.0f;
- // M independent partial products, all starting at the identity.
- float mul_copies[M];
- for(unsigned i = 0;i < M;i++)
- mul_copies[i] = 1.0f;
- for(unsigned i=0;i<N;i++){
- // Multiply the next input into the oldest partial product.
- float cur = mul_copies[M-1] * A[i];
- // Shift every copy one slot towards the end (slot M-1 is overwritten;
- // its value already lives on in cur).
- #pragma unroll
- for(unsigned j = M-1;j >0;j--){
- mul_copies[j] = mul_copies[j-1];
- }
- // Insert the updated copy only AFTER the shift is complete. Writing
- // slot 0 inside the shift loop would clobber the old slot-0 value
- // before the j == 1 step reads it, dropping one partial product and
- // duplicating cur.
- mul_copies[0] = cur;
- }
- // Final reduction: combine the M partial products.
- #pragma unroll
- for(unsigned i =0;i < M;i++)
- mul *= mul_copies[i];
- *result = mul;
- }
對於無法刪除的循環依賴,通過將循環攜帶依賴項的數組從全局內存移動到本地內存來改進II
- #define N 128
- // For i = 0 .. N-1 in increasing order, copies A[i] into A[N-i],
- // reading and writing the same global buffer.
- // Later iterations read elements that earlier iterations have already
- // overwritten, so the loads and stores must stay in order.
- // NOTE(review): the first iteration (i == 0) writes A[N], so the buffer
- // presumably needs at least N+1 elements -- confirm against the host code.
- __kernel void unoptimized(__global float * restrict A)
- {
- for (unsigned src = 0; src < N; ++src) {
- const unsigned dst = N - src;
- A[dst] = A[src];
- }
- }
- #define N 128
- // Performs the mirror copy B[N-i] = B[i] (i = 0 .. N-1) on a kernel-local
- // copy of A instead of on the global buffer, then writes the first N
- // elements back to A.
- // The dependent read-after-write sequence therefore touches only the
- // local array B; global memory sees one plain read pass and one plain
- // write pass.
- // NOTE(review): only A[0] .. A[N-1] are read and written here. A variant
- // that works in place on A would also store to A[N] on its first
- // iteration; this kernel deliberately does not touch A[N].
- __kernel void optimized(__global float * restrict A)
- {
- // N + 1 slots: the i == 0 iteration of the mirror loop writes B[N],
- // which would be out of bounds in an N-element array.
- float B[N + 1];
- // Stage the input in the local array.
- for(unsigned i =0;i< N;i++){
- B[i] = A[i];
- }
- // Mirror copy; the loop-carried dependency is now on B only.
- for(unsigned i =0;i< N;i++){
- B[N-i] = B[i];
- }
- // Write the result back to global memory (B[N] is never read).
- for(unsigned i =0;i< N;i++){
- A[i] = B[i];
- }
- }