// CUDA, day 1: one-dimensional array (一維數組)

#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdlib.h>
#include <conio.h>
using namespace std;


// Forward declaration of the kernel defined after main(). It takes four
// int pointers; each thread stores its block index, thread index, warp index
// within the block, and flat global index at its own global position.
__global__ void func1(int*  block,int*  thread,int*  warp,int*  calc_thread);


// Number of int elements in every host and device buffer used by this program.
#define ARRAY_SIZE 128
// Size in bytes of one such buffer; used for cudaMalloc and cudaMemcpy.
#define ARRAY_SIZE_IN_BYTES (sizeof(int)*(ARRAY_SIZE))


// Host-side destination buffers for the values copied back from the device.
// They are file-scope arrays of ARRAY_SIZE ints each.
int cpu_block[ARRAY_SIZE];       // receives the per-thread blockIdx.x values
int cpu_thread[ARRAY_SIZE];      // receives the per-thread threadIdx.x values
int cpu_warp[ARRAY_SIZE];        // receives the per-thread threadIdx.x / warpSize values
int cpu_calc_thread[ARRAY_SIZE]; // receives the per-thread flat global index
// Evaluates a CUDA runtime call and terminates the program with a diagnostic
// (file, line, CUDA error string) if it does not return cudaSuccess.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t cuda_check_err_ = (call);                                 \
        if (cuda_check_err_ != cudaSuccess) {                                 \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(cuda_check_err_));                     \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)
#endif

/**
 * Launches func1 over num_blocks x num_threads threads, copies the four
 * result arrays back to the host, and prints one line per element showing
 * the calculated global thread index, block index, warp index and thread
 * index. Waits for a key press on stdin before returning.
 *
 * Returns 0 on success. Any CUDA runtime failure terminates the process
 * with EXIT_FAILURE after printing the error to stderr.
 */
int main()
{
    const int num_blocks = 2;
    const int num_threads = 64;

    // The host buffers and the copies below cover exactly ARRAY_SIZE
    // elements, so the launch must produce exactly that many threads;
    // otherwise part of the output would be uninitialized or out of range.
    if (num_blocks * num_threads != ARRAY_SIZE)
    {
        fprintf(stderr, "Launch shape %d x %d does not match ARRAY_SIZE %d\n",
                num_blocks, num_threads, ARRAY_SIZE);
        return EXIT_FAILURE;
    }

    int * gpu_block = NULL;
    int * gpu_thread = NULL;
    int * gpu_warp = NULL;
    int * gpu_calc_thread = NULL;

    // One device buffer per recorded quantity.
    CUDA_CHECK(cudaMalloc((void **)&gpu_block, ARRAY_SIZE_IN_BYTES));
    CUDA_CHECK(cudaMalloc((void **)&gpu_thread, ARRAY_SIZE_IN_BYTES));
    CUDA_CHECK(cudaMalloc((void **)&gpu_warp, ARRAY_SIZE_IN_BYTES));
    CUDA_CHECK(cudaMalloc((void **)&gpu_calc_thread, ARRAY_SIZE_IN_BYTES));

    func1 <<<num_blocks, num_threads >>>(gpu_block, gpu_thread, gpu_warp, gpu_calc_thread);
    // A kernel launch returns nothing: configuration errors are reported by
    // cudaGetLastError(), execution errors by the next synchronizing call.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    CUDA_CHECK(cudaMemcpy(cpu_block, gpu_block, ARRAY_SIZE_IN_BYTES, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(cpu_thread, gpu_thread, ARRAY_SIZE_IN_BYTES, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(cpu_warp, gpu_warp, ARRAY_SIZE_IN_BYTES, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(cpu_calc_thread, gpu_calc_thread, ARRAY_SIZE_IN_BYTES, cudaMemcpyDeviceToHost));

    // The results now live in the host arrays; the device buffers can go.
    CUDA_CHECK(cudaFree(gpu_block));
    CUDA_CHECK(cudaFree(gpu_thread));
    CUDA_CHECK(cudaFree(gpu_warp));
    CUDA_CHECK(cudaFree(gpu_calc_thread));

    for (int i = 0; i < ARRAY_SIZE; i++)
    {
        printf("Calculated Thread: %d - Block: %d - Warp %d -Thread %d\n",
               cpu_calc_thread[i], cpu_block[i], cpu_warp[i], cpu_thread[i]);
    }

    // Keep the console window open until the user presses Enter.
    cin.get();

    return 0;
}


/**
 * Records, for each launched thread, where it sits in the launch.
 *
 * Expects a 1D grid of 1D blocks (only the .x components are used). The
 * thread with flat global index i = blockIdx.x * blockDim.x + threadIdx.x
 * writes:
 *   block[i]       = blockIdx.x
 *   thread[i]      = threadIdx.x
 *   warp[i]        = threadIdx.x / warpSize  (warp index within the block)
 *   calc_thread[i] = i
 *
 * Each output pointer must address at least ARRAY_SIZE ints of device
 * memory. Threads whose index is ARRAY_SIZE or more write nothing, so an
 * oversized launch cannot write past the end of the buffers.
 */
__global__ void func1(int*  block, int*  thread, int*  warp, int*  calc_thread)
{
    const int i = (blockIdx.x * blockDim.x) + threadIdx.x;

    // Guard the tail: the grid may cover more threads than there are elements.
    if (i >= ARRAY_SIZE)
    {
        return;
    }

    block[i] = blockIdx.x;
    thread[i] = threadIdx.x;
    warp[i] = threadIdx.x / warpSize;
    calc_thread[i] = i;
}
// --- Web page footer captured with the listing (not part of the program) ---
// 發佈了29 篇原創文章 · 獲贊 7 · 訪問量 2萬+
// 發表評論
// 所有評論
// 還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
// 相關文章