CUDA 流(streams)
異步併發執行
當使用異步函數時,即實現了設備和宿主之間的併發執行。
異步函數:在設備完成請求的任務之前,控制就會返回到應用程序(宿主)。
這些函數包括:
1.核函數
2.執行內存複製並以Async爲後綴的函數
3.執行 設備<->設備 內存複製的函數
4.設置內存的函數。
流的優點
應用程序通過流(streams)管理併發。流是一個順序執行的操作序列。另一方面,在同一時刻,不同的流之間可以不按順序執行操作。
在主機調用核函數時,我們實現了數據的並行計算;當我們需要實現任務級的並行,就得使用流(streams)
通過創建多個流對象,可以使用不同流之間的並行,實現數據傳輸的並行和不同流的重疊計算。(要求宿主內存爲頁鎖定內存(page-locked))
可以通過以下代碼測試自己的顯卡是否具備重疊計算的能力(注意:在較新的 CUDA 版本中 deviceOverlap 已被棄用,建議改查 asyncEngineCount 屬性)
cudaDeviceProp prop;
int deviceID;
cudaGetDevice(&deviceID);
cudaGetDeviceProperties(&prop, deviceID);
//檢查設備是否支持重疊功能
if (!prop.deviceOverlap)
{
printf("No device will handle overlaps. so no speed up from stream.\n");
return 0;
}
流的使用
1)創建流對象
cudaStream_t stream[2];
for (int i = 0; i < 2; ++i)
{
cudaStreamCreate(&stream[i]);
}
2)將數據拷貝(host -> device)添加進流
for (int i = 0; i < 2; ++i)
{
cudaMemcpyAsync(d_input + i * height*width, h_input + i * height*width, eachmemsize, cudaMemcpyHostToDevice, stream[i]);
}
3)將啓動核函數添加進流
//若兩個流所啓動的核函數一樣,可以用一個for循環解決
kernel_1 << <gridsize, blocksize, 0, stream[0] >> > (d_output, d_input,width,height);
kernel_2 << <gridsize, blocksize, 0, stream[1] >> > (d_output + height * width, d_input + height * width, width, height);
4)數據拷貝(device -> host)
for (int i = 0; i < 2; ++i)
{
cudaMemcpyAsync(h_input + i * height*width, d_output + i * height*width, eachmemsize, cudaMemcpyDeviceToHost, stream[i]);
}
5)主機與設備端同步
cudaDeviceSynchronize();(舊函數 cudaThreadSynchronize() 已被棄用,應使用 cudaDeviceSynchronize())
(若不同流之間的核函數有數據的關聯,可以使用流同步函數解決)
cudaStreamSynchronize(cudaStream_t stream);
代碼示例
以下代碼創建了兩個流,分別執行不同的核函數。
由於實驗的數據量較小,對比不出流的優勢,當數據量足夠大時,則可以體現出流在重疊計算和數據傳輸方面的優勢。
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include<iostream>
#include<stdlib.h>
using namespace std;
//Ceiling division: the smallest number of size-b chunks needed to cover a items.
//Written as quotient-plus-carry to avoid the overflow risk of (a + b - 1) / b.
int iDivUp(int a, int b)
{
	return a / b + ((a % b != 0) ? 1 : 0);
}
//Map a 2D coordinate (x, y) to a linear index; `offset` is the row pitch
//(number of elements per row).
__device__ size_t flatten_2d_index(size_t x, size_t y, size_t offset)
{
	return y * offset + x;
}
//Doubles every element of a width x height array.
//Expects a 2D launch whose grid covers at least width x height threads;
//threads outside the array exit via the bounds guard.
__global__ void kernel_1(float *output,float *input,unsigned int width,unsigned int height)
{
	size_t col = blockIdx.x * blockDim.x + threadIdx.x;
	size_t row = blockIdx.y * blockDim.y + threadIdx.y;
	if (col >= width || row >= height)
		return;
	size_t idx = flatten_2d_index(col, row, width);
	output[idx] = input[idx] * 2;
}
//Halves every element of a width x height array.
//Expects a 2D launch whose grid covers at least width x height threads;
//threads outside the array exit via the bounds guard.
__global__ void kernel_2(float *output, float *input,unsigned int width,unsigned int height)
{
	size_t col = blockIdx.x * blockDim.x + threadIdx.x;
	size_t row = blockIdx.y * blockDim.y + threadIdx.y;
	if (col >= width || row >= height)
		return;
	size_t idx = flatten_2d_index(col, row, width);
	output[idx] = input[idx] / 2;
}
//Report a CUDA runtime error and abort. CUDA API calls and async operations
//otherwise fail silently and poison every later call with a sticky error.
static void checkCuda(cudaError_t err, const char *what)
{
	if (err != cudaSuccess)
	{
		fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}
}
//Demonstrates two CUDA streams: each stream copies one height*width tile of
//pinned host memory to the device, runs its own kernel, and copies the result
//back, allowing copy/compute overlap on devices that support it.
int main()
{
	cudaDeviceProp prop;
	int deviceID;
	checkCuda(cudaGetDevice(&deviceID), "cudaGetDevice");
	checkCuda(cudaGetDeviceProperties(&prop, deviceID), "cudaGetDeviceProperties");
	//Check whether the device can overlap memory copies with kernel execution.
	if (!prop.deviceOverlap)
	{
		printf("No device will handle overlaps. so no speed up from stream.\n");
		return 0;
	}
	//Experiment data: `num` independent height*width tiles, one per stream.
	unsigned int height = 16;
	unsigned int width = 8;
	unsigned int num = 2;
	size_t allmemsize = sizeof(float)*height*width*num;
	size_t eachmemsize = sizeof(float)*height*width;
	//Allocate page-locked (pinned) host memory — required for truly
	//asynchronous cudaMemcpyAsync — and fill it with test data.
	float *h_input;
	checkCuda(cudaHostAlloc((void**)&h_input, allmemsize, cudaHostAllocDefault), "cudaHostAlloc");
	for (unsigned int k = 0; k < num; ++k)
	{
		for (unsigned int i = 0; i < height*width; ++i)
		{
			//h_input[k*height*width + i] = rand() % 100;
			h_input[k*height*width + i] = i;
			cout << h_input[k*height*width + i] << " ";
		}
		cout << endl;
	}
	//Device buffers sized for all tiles; each stream works on its own slice.
	float *d_input;
	checkCuda(cudaMalloc((void**)&d_input, allmemsize), "cudaMalloc d_input");
	float *d_output;
	checkCuda(cudaMalloc((void**)&d_output, allmemsize), "cudaMalloc d_output");
	//Create one stream per tile.
	cudaStream_t stream[2];
	for (int i = 0; i < 2; ++i)
	{
		checkCuda(cudaStreamCreate(&stream[i]), "cudaStreamCreate");
	}
	//Enqueue the host->device copy of each tile into its own stream.
	for (unsigned int i = 0; i < 2; ++i)
	{
		checkCuda(cudaMemcpyAsync(d_input + i * height*width, h_input + i * height*width, eachmemsize, cudaMemcpyHostToDevice, stream[i]), "cudaMemcpyAsync H2D");
	}
	cout << "GPU計算中..." << endl;
	dim3 blocksize(8, 8);
	dim3 gridsize(iDivUp(width, blocksize.x), iDivUp(height, blocksize.y));
	kernel_1 << <gridsize, blocksize, 0, stream[0] >> > (d_output, d_input,width,height);
	kernel_2 << <gridsize, blocksize, 0, stream[1] >> > (d_output + height * width, d_input + height * width, width, height);
	//Kernel launches do not return an error directly; launch-configuration
	//errors surface through cudaGetLastError().
	checkCuda(cudaGetLastError(), "kernel launch");
	cout << "計算完畢!" << endl;
	//Enqueue the device->host copy of each result tile; h_input is reused as
	//the output buffer.
	for (unsigned int i = 0; i < 2; ++i)
	{
		checkCuda(cudaMemcpyAsync(h_input + i * height*width, d_output + i * height*width, eachmemsize, cudaMemcpyDeviceToHost, stream[i]), "cudaMemcpyAsync D2H");
	}
	//Wait for all streams to finish before reading results on the host.
	//cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() replaces it.
	checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
	//Release device memory and streams.
	checkCuda(cudaFree(d_input), "cudaFree d_input");
	checkCuda(cudaFree(d_output), "cudaFree d_output");
	for (int i = 0; i < 2; ++i)
	{
		checkCuda(cudaStreamDestroy(stream[i]), "cudaStreamDestroy");
	}
	for (unsigned int k = 0; k < num; ++k)
	{
		for (unsigned int i = 0; i < height*width; ++i)
		{
			cout << h_input[k*height*width + i] << " ";
		}
		cout << endl;
	}
	//Pinned host memory must be released with cudaFreeHost (the original code
	//leaked this allocation).
	checkCuda(cudaFreeHost(h_input), "cudaFreeHost");
	system("pause"); //Windows-only: keep the console window open.
	return 0;
}