學習 CUDA：使用兩個流（stream）重疊資料傳輸與核函數運算

#include "cuda_runtime.h"
#include <iostream>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

// Number of int elements processed per chunk (one kernel launch / one async copy).
#define N (1024*1024)
// Total number of int elements in each host array: 20 chunks of N elements.
// Parenthesized so the expansion stays a single operand next to operators
// such as %, / or unary minus.
#define FULL_DATA_SIZE (N*20)

// Element-wise integer average: c[i] = (a[i] + b[i]) / 2 for every i < N.
//
// Each thread handles the single element selected by its flat 1D global
// index (only the .x components of blockIdx/blockDim/threadIdx are used).
// Threads whose index is N or larger do nothing. The division is integer
// division, so the result is truncated.
__global__ void kernel(int* a, int* b, int* c) {
	const int idx = threadIdx.x + blockDim.x * blockIdx.x;
	if (idx >= N) {
		return;
	}
	const int sum = a[idx] + b[idx];
	c[idx] = sum / 2;
}

// Evaluates a CUDA runtime call; if it does not return cudaSuccess, prints the
// file, line and CUDA error string to stderr and terminates the process.
#define CUDA_CHECK(call) \
	do { \
		cudaError_t cudaCheckErr_ = (call); \
		if (cudaCheckErr_ != cudaSuccess) { \
			fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
			        cudaGetErrorString(cudaCheckErr_)); \
			exit(EXIT_FAILURE); \
		} \
	} while (0)

// Averages two FULL_DATA_SIZE-element host arrays on the GPU, N elements at a
// time, alternating chunks between two CUDA streams. Each stream owns its own
// set of device buffers and performs copy-in, kernel and copy-out for its
// chunk, so work queued on one stream never touches the other stream's data.
// Prints the elapsed time and the first 10 results, then waits for a key press.
//
// Returns 0 on success or when the device cannot overlap copies with kernels;
// terminates with EXIT_FAILURE (via CUDA_CHECK) on any CUDA error.
int main() {
	// Query the properties of the current device.
	cudaDeviceProp prop;
	int deviceID;
	CUDA_CHECK(cudaGetDevice(&deviceID));
	CUDA_CHECK(cudaGetDeviceProperties(&prop, deviceID));

	// Check that the device can overlap memory copies with kernel execution.
	// asyncEngineCount is the documented replacement for the deprecated
	// deviceOverlap field.
	if (prop.asyncEngineCount == 0) {
		printf("No device will handle overlaps.so no speed up from stream.\n");
		return 0;
	}

	// Start the timer. Stream creation, allocation and host initialisation
	// below are all inside the timed region.
	cudaEvent_t start, stop;
	float elapsedTime;
	CUDA_CHECK(cudaEventCreate(&start));
	CUDA_CHECK(cudaEventCreate(&stop));
	CUDA_CHECK(cudaEventRecord(start, 0));

	// Create the two CUDA streams.
	const int kNumStreams = 2;
	cudaStream_t streams[kNumStreams];
	for (int s = 0; s < kNumStreams; ++s) {
		CUDA_CHECK(cudaStreamCreate(&streams[s]));
	}

	const size_t chunkBytes = N * sizeof(int);
	const size_t fullBytes = FULL_DATA_SIZE * sizeof(int);

	int *host_a, *host_b, *host_c;
	int *dev_a[kNumStreams], *dev_b[kNumStreams], *dev_c[kNumStreams];

	// Allocate one set of chunk-sized device buffers per stream.
	for (int s = 0; s < kNumStreams; ++s) {
		CUDA_CHECK(cudaMalloc((void**)&dev_a[s], chunkBytes));
		CUDA_CHECK(cudaMalloc((void**)&dev_b[s], chunkBytes));
		CUDA_CHECK(cudaMalloc((void**)&dev_c[s], chunkBytes));
	}

	// Allocate page-locked host memory for the asynchronous copies.
	CUDA_CHECK(cudaHostAlloc((void**)&host_a, fullBytes, cudaHostAllocDefault));
	CUDA_CHECK(cudaHostAlloc((void**)&host_b, fullBytes, cudaHostAllocDefault));
	CUDA_CHECK(cudaHostAlloc((void**)&host_c, fullBytes, cudaHostAllocDefault));

	// Fill the host input arrays.
	for (int i = 0; i < FULL_DATA_SIZE; i++) {
		host_a[i] = i;
		host_b[i] = FULL_DATA_SIZE - i;
	}

	// One thread per element of a chunk; ceil-div so the grid always covers N.
	const int threadsPerBlock = 1024;
	const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

	// Each iteration processes kNumStreams consecutive chunks, one per stream.
	// The operations are issued stage by stage across the streams.
	for (int i = 0; i < FULL_DATA_SIZE; i += kNumStreams * N) {
		// Asynchronous host-to-device copies (N elements each).
		for (int s = 0; s < kNumStreams; ++s) {
			const int offset = i + s * N;
			CUDA_CHECK(cudaMemcpyAsync(dev_a[s], host_a + offset, chunkBytes,
			                           cudaMemcpyHostToDevice, streams[s]));
			CUDA_CHECK(cudaMemcpyAsync(dev_b[s], host_b + offset, chunkBytes,
			                           cudaMemcpyHostToDevice, streams[s]));
		}

		// Each stream runs the kernel on its own buffers.
		for (int s = 0; s < kNumStreams; ++s) {
			kernel<<<blocksPerGrid, threadsPerBlock, 0, streams[s]>>>(dev_a[s], dev_b[s], dev_c[s]);
			// Launch-configuration errors are only reported through
			// cudaGetLastError().
			CUDA_CHECK(cudaGetLastError());
		}

		// Copy each result back on the stream that produced it, so the copy
		// is ordered after that stream's kernel.
		for (int s = 0; s < kNumStreams; ++s) {
			const int offset = i + s * N;
			CUDA_CHECK(cudaMemcpyAsync(host_c + offset, dev_c[s], chunkBytes,
			                           cudaMemcpyDeviceToHost, streams[s]));
		}
	}

	// Wait for both streams to finish.
	for (int s = 0; s < kNumStreams; ++s) {
		CUDA_CHECK(cudaStreamSynchronize(streams[s]));
	}

	CUDA_CHECK(cudaEventRecord(stop, 0));
	CUDA_CHECK(cudaEventSynchronize(stop));
	CUDA_CHECK(cudaEventElapsedTime(&elapsedTime, start, stop));

	std::cout << "消耗時間:" << elapsedTime << std::endl;

	// Print the first 10 results.
	for (int i = 0; i < 10; i++) {
		std::cout << host_c[i] << std::endl;
	}

	// Block until a character is read from stdin before cleaning up.
	getchar();

	// Release page-locked host memory.
	CUDA_CHECK(cudaFreeHost(host_a));
	CUDA_CHECK(cudaFreeHost(host_b));
	CUDA_CHECK(cudaFreeHost(host_c));

	// Release device memory (cudaFree, not cudaFreeHost) and the streams.
	for (int s = 0; s < kNumStreams; ++s) {
		CUDA_CHECK(cudaFree(dev_a[s]));
		CUDA_CHECK(cudaFree(dev_b[s]));
		CUDA_CHECK(cudaFree(dev_c[s]));
		CUDA_CHECK(cudaStreamDestroy(streams[s]));
	}

	// Release the timing events.
	CUDA_CHECK(cudaEventDestroy(start));
	CUDA_CHECK(cudaEventDestroy(stop));
	return 0;
}

實現的過程

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章