前言
我們知道 GPU 硬件擁有其獨特的並行性。爲了發揮這一特色,我們要將平時串行執行的程序用並行算法重新改寫,才能充分發揮 GPU 的優勢。
實例:做求和:1+2+3+4+···
爲了做這樣一個累加和的加速,有兩種簡單的實現方法,分別是用 Reduce 進行歸約(二分),或者是用 Scan 通過控制步長進行掃描求和。
Reduce
如上圖所示爲了並行執行累加,我們要構造出一些線程,每個線程並行工作,從而達到加速的目的。
Reduce 的原理如上圖所示。我們構建1024個線程塊,每個線程塊中含有1024個線程,每個block中的線程(每次迭代只有上次迭代一半的線程數)並行的去計算求和。
求和規則是:每次迭代中左半邊的線程分別加上右半邊的線程分配得到的數,即只有一半的線程在工作,如此循環往復,最終0號線程得到的結果就是該線程塊所有數的和。最後再啓動一次只含單個線程塊的核函數,將之前得到的各線程塊的和當作輸入數據,進行最後一次並行計算,從而得到最終和。
實現代碼如下,通過兩種訪存方式進行實現:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
// Sums each block's slice of d_in directly in global memory and stores one
// partial sum per block.
//
// Launch layout: 1-D grid, 1-D block. Block b owns the blockDim.x consecutive
// elements starting at d_in[b * blockDim.x]; the kernel takes no element
// count, so the caller must make sure d_in holds gridDim.x * blockDim.x floats
// and d_out holds gridDim.x floats.
//
// d_out : receives the sum of block b's slice in d_out[b].
// d_in  : input values; OVERWRITTEN with intermediate partial sums, so the
//         original data is gone once the kernel has run.
//
// Any block size is accepted. For a power-of-two block size the additions are
// performed in exactly the classic halving order.
__global__ void global_reduce_kernel(float * d_out, float * d_in)
{
    const unsigned int tid = threadIdx.x;
    // size_t arithmetic keeps the slice offset from overflowing a 32-bit int.
    float * block_data = d_in + (size_t)blockDim.x * blockIdx.x;

    // `active` is the number of leading slots that still hold live partial
    // sums. Each pass folds the upper part onto the lower part. Rounding the
    // half up means that, for an odd count, the middle element is simply
    // carried into the next pass instead of being dropped.
    for (unsigned int active = blockDim.x; active > 1; active = (active + 1) / 2)
    {
        const unsigned int half = (active + 1) / 2;
        if (tid + half < active)
        {
            block_data[tid] += block_data[tid + half];
        }
        // The loop bounds depend only on blockDim.x, so every thread of the
        // block reaches this barrier the same number of times.
        __syncthreads(); // make sure all adds at one stage are done!
    }

    // only thread 0 writes result for this block back to global mem
    if (tid == 0)
    {
        d_out[blockIdx.x] = block_data[0];
    }
}
// Sums each block's slice of d_in using a shared-memory staging buffer and
// stores one partial sum per block. The input is left untouched.
//
// Launch layout: 1-D grid, 1-D block, with blockDim.x * sizeof(float) bytes of
// dynamic shared memory passed as the third <<<...>>> argument. Block b reads
// the blockDim.x consecutive elements starting at d_in[b * blockDim.x]; the
// kernel takes no element count, so the caller must make sure d_in holds
// gridDim.x * blockDim.x floats and d_out holds gridDim.x floats.
//
// d_out : receives the sum of block b's slice in d_out[b].
// d_in  : input values (read-only).
//
// Any block size is accepted. For a power-of-two block size the additions are
// performed in exactly the classic halving order.
__global__ void shmem_reduce_kernel(float * d_out, const float * __restrict__ d_in)
{
    // sdata is allocated in the kernel call: 3rd arg to <<<b, t, shmem>>>
    extern __shared__ float sdata[];

    const unsigned int tid = threadIdx.x;
    // size_t arithmetic keeps the global index from overflowing a 32-bit int.
    const size_t gid = (size_t)blockDim.x * blockIdx.x + tid;

    // load shared mem from global mem
    sdata[tid] = d_in[gid];
    __syncthreads(); // make sure entire block is loaded!

    // `active` is the number of leading slots that still hold live partial
    // sums. Rounding the half up means that, for an odd count, the middle
    // element is carried into the next pass instead of being dropped.
    for (unsigned int active = blockDim.x; active > 1; active = (active + 1) / 2)
    {
        const unsigned int half = (active + 1) / 2;
        if (tid + half < active)
        {
            sdata[tid] += sdata[tid + half];
        }
        // The loop bounds depend only on blockDim.x, so every thread of the
        // block reaches this barrier the same number of times.
        __syncthreads(); // make sure all adds at one stage are done!
    }

    // only thread 0 writes result for this block back to global mem
    if (tid == 0)
    {
        d_out[blockIdx.x] = sdata[0];
    }
}
// Launches one reduction pass with the requested kernel flavour and reports a
// launch failure on stderr. Returns true when the launch was accepted.
static bool launch_reduce_pass(float * d_dst, float * d_src,
                               int blocks, int threads, bool usesSharedMemory)
{
    if (usesSharedMemory)
    {
        // The shared-memory kernel stages one float per thread.
        shmem_reduce_kernel<<<blocks, threads, threads * sizeof(float)>>>
            (d_dst, d_src);
    }
    else
    {
        global_reduce_kernel<<<blocks, threads>>>
            (d_dst, d_src);
    }

    // A kernel launch returns nothing; a bad launch configuration only shows
    // up through cudaGetLastError().
    const cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess)
    {
        fprintf(stderr, "error: reduce kernel launch failed: %s\n",
                cudaGetErrorString(status));
        return false;
    }
    return true;
}

// Two-pass sum of `size` floats on the device.
//
// Pass 1 launches size / 1024 blocks of 1024 threads and writes one partial
// sum per block into d_intermediate. Pass 2 launches a single block with one
// thread per partial sum and writes the total to d_out[0].
//
// d_out             : device buffer, at least 1 float.
// d_intermediate    : device buffer, at least size / 1024 floats.
// d_in              : device buffer with `size` floats. With
//                     usesSharedMemory == false the global-memory kernel is
//                     used, which is handed d_in and d_intermediate as
//                     writable buffers.
// size              : element count; must be a positive multiple of 1024 and
//                     at most 1024 * 1024. Any other value is rejected with a
//                     message on stderr and nothing is launched.
// usesSharedMemory  : selects shmem_reduce_kernel (true) or
//                     global_reduce_kernel (false).
//
// NOTE(review): pass 2 runs with size / 1024 threads. Whether a count that is
// not a power of two is summed completely depends on how the kernels fold
// their block -- confirm against the kernel implementation in use.
//
// The launches are asynchronous; this function does not wait for completion.
void reduce(float * d_out, float * d_intermediate, float * d_in,
            int size, bool usesSharedMemory)
{
    const int maxThreadsPerBlock = 1024;

    // The kernels take no element count and cannot bounds-check, so the size
    // contract has to be enforced here before anything is launched.
    if (size < maxThreadsPerBlock
        || size % maxThreadsPerBlock != 0
        || size / maxThreadsPerBlock > maxThreadsPerBlock)
    {
        fprintf(stderr,
                "error: reduce size %d must be a positive multiple of %d and at most %d\n",
                size, maxThreadsPerBlock, maxThreadsPerBlock * maxThreadsPerBlock);
        return;
    }

    const int firstPassBlocks = size / maxThreadsPerBlock;
    if (!launch_reduce_pass(d_intermediate, d_in,
                            firstPassBlocks, maxThreadsPerBlock, usesSharedMemory))
    {
        return;
    }

    // now we're down to one block left, so reduce it:
    // launch one thread for each block in prev step
    launch_reduce_pass(d_out, d_intermediate,
                       1, firstPassBlocks, usesSharedMemory);
}
// Terminates the program with a diagnostic when a CUDA runtime call failed.
static void reduce_demo_check(cudaError_t status, const char * what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "error: %s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Benchmark driver for the two reduce kernels.
//
// Usage: prog [0|1]   0 (default) = global-memory kernel, 1 = shared-memory
// kernel. Fills 2^20 floats with random values, times 100 calls to reduce(),
// prints the average time, then re-uploads the input and runs one more
// reduction so the printed GPU sum can be compared with a host reference.
int main(int argc, char **argv)
{
    int deviceCount = 0;
    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess || deviceCount == 0) {
        fprintf(stderr, "error: no devices supporting CUDA.\n");
        exit(EXIT_FAILURE);
    }
    const int dev = 0;
    reduce_demo_check(cudaSetDevice(dev), "cudaSetDevice");

    cudaDeviceProp devProps;
    if (cudaGetDeviceProperties(&devProps, dev) == cudaSuccess)
    {
        // Read the clock via cudaDeviceGetAttribute so the code does not
        // depend on the cudaDeviceProp::clockRate field; stays 0 on failure.
        int clockKHz = 0;
        cudaDeviceGetAttribute(&clockKHz, cudaDevAttrClockRate, dev);
        printf("Using device %d:\n", dev);
        // totalGlobalMem is a size_t; printing it as int overflows above 2 GiB.
        printf("%s; global mem: %zuB; compute v%d.%d; clock: %d kHz\n",
               devProps.name, (size_t)devProps.totalGlobalMem,
               (int)devProps.major, (int)devProps.minor,
               clockKHz);
    }

    const int ARRAY_SIZE = 1 << 20;
    const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(float);

    // generate the input array on the host; 4 MiB is too large to place on
    // the stack safely, so it lives on the heap
    float * h_in = (float *)malloc(ARRAY_BYTES);
    if (h_in == NULL) {
        fprintf(stderr, "error: out of host memory\n");
        exit(EXIT_FAILURE);
    }
    double sum = 0.0; // host reference, accumulated in double
    for (int i = 0; i < ARRAY_SIZE; i++) {
        // generate random float in [-1.0f, 1.0f]; rand() is the generator
        // whose range RAND_MAX describes
        h_in[i] = -1.0f + (float)rand() / ((float)RAND_MAX / 2.0f);
        sum += h_in[i];
    }

    // declare GPU memory pointers
    float * d_in = NULL, * d_intermediate = NULL, * d_out = NULL;
    // allocate GPU memory
    reduce_demo_check(cudaMalloc((void **) &d_in, ARRAY_BYTES), "cudaMalloc(d_in)");
    reduce_demo_check(cudaMalloc((void **) &d_intermediate, ARRAY_BYTES),
                      "cudaMalloc(d_intermediate)"); // overallocated
    reduce_demo_check(cudaMalloc((void **) &d_out, sizeof(float)), "cudaMalloc(d_out)");
    // transfer the input array to the GPU
    reduce_demo_check(cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice),
                      "cudaMemcpy(host to device)");

    int whichKernel = 0;
    if (argc == 2) {
        whichKernel = atoi(argv[1]);
    }
    if (whichKernel != 0 && whichKernel != 1) {
        fprintf(stderr, "error: ran no kernel\n");
        exit(EXIT_FAILURE);
    }
    const bool useShared = (whichKernel == 1);
    printf("%s\n", useShared ? "Running reduce with shared mem"
                             : "Running global reduce");

    cudaEvent_t start, stop;
    reduce_demo_check(cudaEventCreate(&start), "cudaEventCreate(start)");
    reduce_demo_check(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    // launch the kernel: 100 timed trials. The global-memory kernel
    // overwrites d_in, so only the timing of this loop is meaningful, not the
    // value it leaves in d_out.
    const int trials = 100;
    reduce_demo_check(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    for (int i = 0; i < trials; i++)
    {
        reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, useShared);
    }
    reduce_demo_check(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    // Waiting on the stop event also surfaces faults raised inside the kernels.
    reduce_demo_check(cudaEventSynchronize(stop), "cudaEventSynchronize");

    float elapsedTime = 0.0f; // milliseconds
    reduce_demo_check(cudaEventElapsedTime(&elapsedTime, start, stop),
                      "cudaEventElapsedTime");
    elapsedTime /= (float)trials;
    printf("average time elapsed: %f\n", elapsedTime);

    // Verification pass on a fresh copy of the input, so that the reported
    // sum is valid for both kernel flavours.
    reduce_demo_check(cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice),
                      "cudaMemcpy(host to device)");
    reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, useShared);
    // copy back the sum from GPU (blocking copy: waits for the kernels)
    float h_out = 0.0f;
    reduce_demo_check(cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost),
                      "cudaMemcpy(device to host)");
    // The two values differ slightly because the additions happen in a
    // different order and precision.
    printf("GPU sum: %f; host reference sum: %f\n", h_out, sum);

    // release events and memory
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_in);
    cudaFree(d_intermediate);
    cudaFree(d_out);
    free(h_in);
    return 0;
}
Scan
Scan 算法便是通過控制每次迭代的步長,從而達到並行求和,原理如下圖所示:
即,第一次的求和步長是1,第二次是2,第三次是4……每次迭代步長翻倍。
實現代碼如下(也是通過兩種訪存方式實現):
#include <stdio.h>
// Inclusive prefix sum (running total) computed directly in global memory.
//
// Launch layout: exactly ONE block, 1-D, with one thread per element; only
// threadIdx.x is used for indexing, so additional blocks would all work on
// the same blockDim.x elements.
//
// d_out : receives the result; d_out[i] = d_in[0] + ... + d_in[i].
//         Must hold blockDim.x floats.
// d_in  : input values, blockDim.x floats.
__global__ void global_scan(float* d_out, float* d_in){
    const int idx = threadIdx.x;
    // The element count is the block size. sizeof(d_in) would only give the
    // size of a pointer, not the length of the array.
    const int count = blockDim.x;

    d_out[idx] = d_in[idx];
    __syncthreads();

    // The stride doubles every step: 1, 2, 4, ...
    for (int stride = 1; stride < count; stride *= 2) {
        const bool has_left = (idx >= stride);
        float partial = 0.00f;
        if (has_left) {
            partial = d_out[idx] + d_out[idx - stride];
        }
        // Every read of this step must finish before any thread overwrites
        // its slot.
        __syncthreads();
        if (has_left) {
            d_out[idx] = partial;
        }
        // Every write of this step must be visible before the next step
        // reads its neighbours.
        __syncthreads();
    }
}
// Inclusive prefix sum (running total) computed in shared memory.
//
// Launch layout: 1-D grid, 1-D block, at most 1024 threads per block, one
// thread per element. No dynamic shared memory argument is needed: the tile
// is statically sized (4 KiB). Each block scans its own blockDim.x-element
// segment independently; sums are NOT carried from one block to the next.
//
// d_out : receives the result. For the element at position i inside its
//         block's segment, the output is the sum of the segment's elements
//         0..i. Must hold gridDim.x * blockDim.x floats.
// d_in  : input values, gridDim.x * blockDim.x floats.
__global__ void shared_scan(float *d_out, float *d_in)
{
    // Capacity of the tile in elements; this is the upper bound on blockDim.x.
    enum { kMaxBlockThreads = 1024 };
    __shared__ float sdata[kMaxBlockThreads];

    const int idx = threadIdx.x + blockIdx.x * blockDim.x; // global element
    const int myid = threadIdx.x;                          // slot in the tile
    // The element count is the block size. sizeof(sdata) would give the tile
    // size in bytes, not in elements.
    const int count = blockDim.x;

    sdata[myid] = d_in[idx];
    __syncthreads(); // the whole segment is staged before anyone reads it

    // The stride doubles every step: 1, 2, 4, ...
    for (int stride = 1; stride < count; stride *= 2)
    {
        const bool has_left = (myid >= stride);
        float partial = 0.0f;
        if (has_left)
        {
            partial = sdata[myid] + sdata[myid - stride];
        }
        // Every read of this step must finish before any thread overwrites
        // its slot.
        __syncthreads();
        if (has_left)
        {
            sdata[myid] = partial;
        }
        // Every write of this step must be visible before the next step
        // reads its neighbours.
        __syncthreads();
    }

    // Each thread only publishes the slot it wrote itself, so no further
    // barrier is required here.
    d_out[idx] = sdata[myid];
}
// Reports a failed CUDA runtime call on stderr. Returns true on success so
// that calls can be chained with &&.
static bool scan_demo_ok(cudaError_t status, const char* what){
    if(status != cudaSuccess){
        fprintf(stderr, "error: %s failed: %s\n", what, cudaGetErrorString(status));
        return false;
    }
    return true;
}

// Demo driver for the scan kernels: scans the eight values 0..7 on the GPU
// with a single block and prints the result, four numbers per line.
// Returns 0 on success and 1 if any CUDA call failed.
int main(int argc,char** argv){
    (void)argc;
    (void)argv;

    const int ARRAY_SIZE = 8;
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);

    // generate the input array on the host
    float h_in[ARRAY_SIZE];
    for(int i=0;i<ARRAY_SIZE;i++){
        h_in[i] = float(i);
    }
    float h_out[ARRAY_SIZE];

    // declare GPU memory pointers; they start out NULL so the cleanup at the
    // end is valid no matter where a failure occurred
    float* d_in = NULL;
    float* d_out = NULL;
    int exit_code = 1;

    // allocate GPU memory and transfer the array to GPU
    if(scan_demo_ok(cudaMalloc((void**) &d_in,ARRAY_BYTES), "cudaMalloc(d_in)")
       && scan_demo_ok(cudaMalloc((void**) &d_out,ARRAY_BYTES), "cudaMalloc(d_out)")
       && scan_demo_ok(cudaMemcpy(d_in,h_in,ARRAY_BYTES,cudaMemcpyHostToDevice),
                       "cudaMemcpy(host to device)")){
        // launch the kernel
        //global_scan<<<1,ARRAY_SIZE>>>(d_out,d_in);
        shared_scan<<<1, ARRAY_SIZE>>>(d_out, d_in);

        // A launch returns nothing: configuration errors are read with
        // cudaGetLastError(), and the blocking copy below waits for the
        // kernel and reports faults raised while it ran.
        if(scan_demo_ok(cudaGetLastError(), "kernel launch")
           // copy back the result array to the CPU
           && scan_demo_ok(cudaMemcpy(h_out,d_out,ARRAY_BYTES,cudaMemcpyDeviceToHost),
                           "cudaMemcpy(device to host)")){
            // print out the resulting array, a line break after every 4th value
            for(int i=0;i<ARRAY_SIZE;i++){
                printf("%f%c", h_out[i], ((i%4) != 3) ? '\t' : '\n');
            }
            exit_code = 0;
        }
    }

    // free GPU memory allocation (cudaFree accepts NULL)
    cudaFree(d_in);
    cudaFree(d_out);
    return exit_code;
}