轉自 http://blog.csdn.net/abcjennifer/article/details/42528569
下面通過一個經典例子來看shared memory作用:矩陣乘法
目的:實現C=A*B,方法:c[i,j] = A[i,:] * B[:,j],
其中矩陣用row-major表示,即c[i,j] = *(c.elements + i*c.width + j)
1. 不用shared memory優化版:
設A爲m*t的矩陣;B爲t*n的矩陣;
每個線程讀取A的一行,B的一列,計算C的對應值;
所以這樣需要從global memory中讀n次A,m次B。
// Matrices are stored in row-major order:
// M(row, col) = *(M.elements + row * M.width + col)
typedef struct {
    int width;       // number of columns
    int height;      // number of rows
    float* elements; // row-major data buffer (device or host, by context)
} Matrix;

// Thread block size: each block computes a BLOCK_SIZE x BLOCK_SIZE tile of C.
#define BLOCK_SIZE 16

// Forward declaration of the matrix multiplication kernel.
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);
// Matrix multiplication - Host code. Computes C = A * B on the device.
// Preconditions (unchecked, as in the original tutorial): all matrix
// dimensions are multiples of BLOCK_SIZE and A.width == B.height.
// NOTE(review): CUDA API return codes are not checked here, matching the
// tutorial; production code should wrap each call in an error check.
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
    // Copy A to device memory. Cast to size_t before multiplying so the
    // byte count cannot overflow int for large matrices.
    Matrix d_A;
    d_A.width = A.width; d_A.height = A.height;
    size_t size = (size_t)A.width * A.height * sizeof(float);
    cudaMalloc(&d_A.elements, size);
    cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice);

    // Copy B to device memory.
    Matrix d_B;
    d_B.width = B.width; d_B.height = B.height;
    size = (size_t)B.width * B.height * sizeof(float);
    cudaMalloc(&d_B.elements, size);
    cudaMemcpy(d_B.elements, B.elements, size, cudaMemcpyHostToDevice);

    // Allocate C in device memory (output only; no host-to-device copy).
    Matrix d_C;
    d_C.width = C.width; d_C.height = C.height;
    size = (size_t)C.width * C.height * sizeof(float);
    cudaMalloc(&d_C.elements, size);

    // Invoke kernel: one thread per element of C. The grid covers C
    // exactly because the dimensions are multiples of BLOCK_SIZE.
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);

    // Read C from device memory. Fix: the original read from the
    // undefined identifier "Cd" instead of d_C.
    cudaMemcpy(C.elements, d_C.elements, size, cudaMemcpyDeviceToHost);

    // Free device memory. Fix: a stray '}' in the original closed the
    // function before these frees, leaking all three device buffers.
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}
// Matrix multiplication kernel called by MatMul().
// Each thread computes one element of C = A * B by walking a full row of A
// and a full column of B in global memory (no data reuse between threads).
// Expected launch layout: 2D grid of BLOCK_SIZE x BLOCK_SIZE blocks
// covering C; assumes A.width == B.height.
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Bounds guard: a no-op when the dimensions are exact multiples of
    // BLOCK_SIZE (the tutorial's assumption), but makes the kernel safe
    // for grids that over-cover C.
    if (row >= C.height || col >= C.width)
        return;

    // Accumulate the dot product of row `row` of A and column `col` of B.
    float Cvalue = 0.0f;
    for (int e = 0; e < A.width; ++e)
        Cvalue += A.elements[row * A.width + e] * B.elements[e * B.width + col];
    C.elements[row * C.width + col] = Cvalue;
}
2. 利用shared memory
每個thread block負責計算一個子矩陣Csub, 其中每個thread負責計算Csub中的一個元素。如下圖所示。爲了適應(fit)設備資源,A,B都分割成很多block_size維的方形matrix,Csub將這些方形matrix的乘積求和而得。每次計算一個乘積時,先將兩個對應方形矩陣從global memory 載入 shared memory(一個thread負責載入A, B兩個sub matrix的元素),然後每個thread計算乘積的一個元素,再由每個thread將這些product加和,存入一個register,最後一次性寫入global memory。計算時注意同步,詳見代碼。
設A爲m*t的矩陣;B爲t*n的矩陣;
這樣呢,A只從global memory讀了n/block_size次,B只讀了m/block_size次;
Kernel Code:
// Tiled matrix multiplication kernel: each thread block computes one
// BLOCK_SIZE x BLOCK_SIZE sub-matrix Csub of C = A * B, and each thread
// computes a single element of Csub, staging tiles of A and B through
// shared memory for reuse.
// Relies on helpers defined elsewhere in the sample:
//   GetSubMatrix(M, r, c)   - BLOCK_SIZE-sized view into M
//   GetElement / SetElement - element access on such a view
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    // This block's tile coordinates within C.
    const int tileRow = blockIdx.y;
    const int tileCol = blockIdx.x;

    // The sub-matrix of C this block is responsible for.
    Matrix Csub = GetSubMatrix(C, tileRow, tileCol);

    // This thread's coordinates inside the tile, plus its running result,
    // which lives in a register until the final store.
    const int row = threadIdx.y;
    const int col = threadIdx.x;
    float acc = 0;

    // March across the tiles of A (left to right) and the tiles of B
    // (top to bottom), accumulating the product of each tile pair.
    const int numTiles = A.width / BLOCK_SIZE;
    for (int t = 0; t < numTiles; ++t) {
        Matrix Asub = GetSubMatrix(A, tileRow, t);
        Matrix Bsub = GetSubMatrix(B, t, tileCol);

        // Staging buffers in shared memory, filled cooperatively:
        // each thread loads exactly one element of each tile.
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
        As[row][col] = GetElement(Asub, row, col);
        Bs[row][col] = GetElement(Bsub, row, col);

        // Wait until the whole tile pair is resident before using it.
        __syncthreads();

        // Partial dot product over this tile pair.
        for (int k = 0; k < BLOCK_SIZE; ++k)
            acc += As[row][k] * Bs[k][col];

        // Wait until every thread has finished reading As/Bs before the
        // next iteration overwrites them.
        __syncthreads();
    }

    // Single global-memory write per thread.
    SetElement(Csub, row, col, acc);
}
Host Code:
// Invoke kernel: one thread per element of C, grouped into
// BLOCK_SIZE x BLOCK_SIZE blocks. Assumes B.width and A.height are
// multiples of BLOCK_SIZE, so the grid covers C exactly.
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);
MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);