CUDA並行編程
1.基於CPU的矢量求和
#include <stdio.h>
#include <time.h>
#include <iostream>
using namespace std;
#define N 50000
// Host-side element-wise vector sum: writes a[i] + b[i] into c[i] for every
// index i in [0, N), where N is the file-level macro.
//   a, b : input arrays, read at indices 0..N-1
//   c    : output array, written at indices 0..N-1
void add(int *a, int *b, int *c) {
    for (int idx = 0; idx < N; ++idx) {
        c[idx] = a[idx] + b[idx];
    }
}
// CPU demo driver: fills two N-element arrays, sums them with add(), prints
// one "a+b=c" line per element, then reports the time spent inside add() in
// milliseconds. Always returns 0.
int main(void) {
    int a[N], b[N], c[N];
    // Fill the input arrays 'a' and 'b' on the CPU.
    for (int i = 0; i < N; i++) {
        a[i] = -i; // 0,-1,-2,-3,-4,-5,-6,-7,-8,-9
        // With a 32-bit int, i*i exceeds INT_MAX once i > 46340, and signed
        // overflow is undefined behaviour. Square in unsigned arithmetic, where
        // wrap-around is well defined, then convert back to int.
        unsigned int ui = static_cast<unsigned int>(i);
        b[i] = static_cast<int>(ui * ui); // 0,1,4,9,16,25,36,49,64,81
    }
    clock_t t0 = clock();
    add(a, b, c);
    // Stop the clock before printing: N printf calls would otherwise dominate
    // the measurement and hide the cost of the addition itself.
    clock_t t1 = clock();
    // Show the results.
    for (int i = 0; i < N; i++) {
        printf("%d+%d=%d\n", a[i], b[i], c[i]);
    }
    cout << "Running time is: " << static_cast<double>(t1 - t0) / CLOCKS_PER_SEC * 1000 << "ms" << endl;
    return 0;
}
2.基於GPU的矢量求和
#include "cuda_runtime.h"
#include "device_launch_parameters.h"//包含blockIdx.x
#include "book.h"// also pulls in stdio.h
#include <time.h>
#include<iostream>
using namespace std;
#define N 50000//N<65535
// Device-side element-wise vector sum: c[i] = a[i] + b[i] for i in [0, N),
// where N is the file-level macro.
//
// Launch layout: any 1D grid and 1D block. Each thread starts at its flat
// global index (blockIdx.x * blockDim.x + threadIdx.x) and advances by the
// total number of launched threads, so every index below N is handled exactly
// once regardless of the launch shape. With a <<<N, 1>>> launch the start index
// equals blockIdx.x and the loop body runs once per block, which matches the
// earlier blockIdx.x-only indexing.
//
//   a, b : input arrays, read at indices below N
//   c    : output array, written at indices below N
// No shared memory is used.
__global__ void add(int *a, int *b, int *c) {
    // 64-bit index math so blockIdx.x * blockDim.x cannot overflow on large grids.
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    const size_t count = static_cast<size_t>(N);
    for (size_t tid = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
         tid < count; tid += stride) {
        c[tid] = a[tid] + b[tid];
    }
}
// GPU demo driver: allocates three N-int device buffers, fills 'a' and 'b' on
// the host, copies them to the device, launches add(), copies 'c' back, prints
// one "a+b=c" line per element, frees the device buffers, and reports the
// elapsed time for the copy-in + kernel + copy-out sequence in milliseconds.
// Every CUDA call is routed through HANDLE_ERROR. Always returns 0.
int main(void) {
    int a[N], b[N], c[N];
    int *dev_a, *dev_b, *dev_c;
    // Allocate memory on the GPU.
    HANDLE_ERROR(cudaMalloc((void**)&dev_a, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_b, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_c, N * sizeof(int)));
    // Fill the input arrays 'a' and 'b' on the CPU.
    for (int i = 0; i < N; i++) {
        a[i] = -i; // 0,-1,-2,-3,-4,-5,-6,-7,-8,-9
        // With a 32-bit int, i*i exceeds INT_MAX once i > 46340, and signed
        // overflow is undefined behaviour. Square in unsigned arithmetic, where
        // wrap-around is well defined, then convert back to int.
        unsigned int ui = static_cast<unsigned int>(i);
        b[i] = static_cast<int>(ui * ui); // 0,1,4,9,16,25,36,49,64,81
    }
    clock_t t0 = clock();
    // Copy the arrays 'a' and 'b' to the GPU (host -> device).
    HANDLE_ERROR(cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice));
    // One single-thread block per element. N must stay within the device's
    // maximum grid x-dimension for this launch shape to be valid.
    add<<<N, 1>>>(dev_a, dev_b, dev_c);
    // A kernel launch returns no status itself; an invalid launch configuration
    // is only reported through cudaGetLastError().
    HANDLE_ERROR(cudaGetLastError());
    // Copy the array 'c' back from the GPU to the CPU (device -> host).
    HANDLE_ERROR(cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost));
    // Stop the clock once the result is back on the host, before printing:
    // N printf calls would otherwise dominate the measurement.
    clock_t t1 = clock();
    // Show the results.
    for (int i = 0; i < N; i++) {
        printf("%d+%d=%d\n", a[i], b[i], c[i]);
    }
    // Release the memory allocated on the GPU.
    HANDLE_ERROR(cudaFree(dev_a));
    HANDLE_ERROR(cudaFree(dev_b));
    HANDLE_ERROR(cudaFree(dev_c));
    cout << "Running time is: " << static_cast<double>(t1 - t0) / CLOCKS_PER_SEC * 1000 << "ms" << endl;
    return 0;
}