#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <iostream>
#include <cmath>
using namespace std;
#define M 3200
#define N 3200
#define P 3200
#define B_S 32
//#define SHOW
//A[M][P]*B[P][N]=C[M][N]
// Naive dense matrix product C = A * B on the device.
//
// A is m x p, B is p x n and C is m x n, all stored row-major in flat arrays.
// Launch layout: 2D grid of 2D blocks; the x dimension walks the columns of C
// (0..n-1) and the y dimension walks the rows of C (0..m-1). Each thread
// produces exactly one element of C; threads that fall outside the matrix
// do nothing, so the grid may overshoot the matrix size.
__global__
void mulMatri_gpu(float* A, float* B, float* C, int m, int n, int p)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Only threads that map onto a real element of C do any work.
    if (row < m && col < n)
    {
        const float* aRow = A + row * p;   // start of row `row` of A
        const float* bCol = B + col;       // top of column `col` of B

        float acc = 0;
        int k = 0;
        while (k < p)
        {
            // Consecutive elements of the A row, stride-n elements of the B column.
            acc += aRow[k] * bCol[k * n];
            ++k;
        }

        C[row * n + col] = acc;
    }
}
// Reference matrix product on the host: C = A * B.
//
// A is m x p, B is p x n and C is m x n, all row-major flat arrays.
// Every element of C is first cleared and then accumulated in place,
// so C does not need to be initialised by the caller.
void mulMatri_cpu(float* A, float* B, float* C, int m, int n, int p)
{
    for (int row = 0; row < m; ++row)
    {
        const int aBase = row * p;   // offset of this row in A
        const int cBase = row * n;   // offset of this row in C

        for (int col = 0; col < n; ++col)
        {
            float& out = C[cBase + col];
            out = 0;

            // Dot product of row `row` of A with column `col` of B.
            for (int inner = 0; inner < p; ++inner)
            {
                out += A[aBase + inner] * B[inner * n + col];
            }
        }
    }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaCallSucceeded(cudaError_t err, const char* what)
{
    if (err == cudaSuccess)
        return true;
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Multiplies the host matrices A (M x P) and B (P x N) on the GPU and stores
// the M x N result in the host buffer C.
//
// The function allocates device buffers, copies A and B to the device,
// launches mulMatri_gpu, copies the result back and prints the elapsed time.
// The timed region deliberately spans both host<->device copies and the
// kernel, so the figure is an end-to-end time, not a kernel-only time.
//
// If any CUDA call fails, the error is printed to stderr, the remaining steps
// are skipped, no timing line is printed, and C may be left unmodified.
void compute_gpu(float* A, float *B, float *C)
{
    const size_t bytesA = sizeof(float) * M * P;
    const size_t bytesB = sizeof(float) * P * N;
    const size_t bytesC = sizeof(float) * M * N;

    float *da = NULL, *db = NULL, *dc = NULL;
    cudaEvent_t start = NULL, stop = NULL;
    float elapsedTime = 0.0f;

    bool ok = cudaCallSucceeded(cudaMalloc((void **)&da, bytesA), "cudaMalloc(da)")
           && cudaCallSucceeded(cudaMalloc((void **)&db, bytesB), "cudaMalloc(db)")
           && cudaCallSucceeded(cudaMalloc((void **)&dc, bytesC), "cudaMalloc(dc)")
           && cudaCallSucceeded(cudaEventCreate(&start), "cudaEventCreate(start)")
           && cudaCallSucceeded(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    if (ok)
    {
        // Timing starts here (before the host-to-device copies).
        ok = cudaCallSucceeded(cudaEventRecord(start, 0), "cudaEventRecord(start)")
          && cudaCallSucceeded(cudaMemcpy(da, A, bytesA, cudaMemcpyHostToDevice), "cudaMemcpy(A -> da)")
          && cudaCallSucceeded(cudaMemcpy(db, B, bytesB, cudaMemcpyHostToDevice), "cudaMemcpy(B -> db)");
    }

    if (ok)
    {
        // The kernel maps x to columns (N) and y to rows (M), so the grid
        // must be sized (N-blocks, M-blocks); ceil-div covers partial tiles.
        dim3 dimBlock(B_S, B_S);
        dim3 dimGrid((N + B_S - 1) / B_S, (M + B_S - 1) / B_S);
        mulMatri_gpu<<<dimGrid, dimBlock>>>(da, db, dc, M, N, P);

        // A kernel launch returns nothing; configuration errors are picked up
        // with cudaGetLastError(), execution errors by the blocking copy below.
        ok = cudaCallSucceeded(cudaGetLastError(), "mulMatri_gpu launch")
          && cudaCallSucceeded(cudaMemcpy(C, dc, bytesC, cudaMemcpyDeviceToHost), "cudaMemcpy(dc -> C)")
          // Timing ends here (after the device-to-host copy).
          && cudaCallSucceeded(cudaEventRecord(stop, 0), "cudaEventRecord(stop)")
          && cudaCallSucceeded(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)")
          && cudaCallSucceeded(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime");
    }

    if (ok)
        printf("the time on gpu is %f ms\n", elapsedTime);

    // Release everything that was successfully created; cudaFree accepts a
    // null pointer, events are only destroyed if they were created.
    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);
    if (start != NULL)
        cudaEventDestroy(start);
    if (stop != NULL)
        cudaEventDestroy(stop);
}
// Multiplies the host matrices A (M x P) and B (P x N) on the CPU into C
// (M x N) using mulMatri_cpu and prints how long it took.
//
// The time is measured with clock(), whose unit is 1/CLOCKS_PER_SEC seconds,
// so the tick difference is converted to milliseconds before printing.
void compute_cpu(float* A, float *B, float *C)
{
    const clock_t start = clock();
    mulMatri_cpu(A, B, C, M, N, P);
    const clock_t finish = clock();

    // Raw ticks are only milliseconds where CLOCKS_PER_SEC happens to be 1000.
    const double elapsedMs = 1000.0 * (double)(finish - start) / (double)CLOCKS_PER_SEC;
    printf("the time on cpu is %f ms\n", elapsedMs);
}
// Compares two m x n row-major matrices element by element.
//
// Prints a single error message to stdout and stops at the first element
// whose absolute difference exceeds 1e-5 (or is NaN). Prints nothing when the
// matrices agree. The tolerance is absolute, not relative.
void verify(float *C1, float *C2, int m, int n)
{
    for (int i = 0; i < m; i++)
        for (int j = 0; j < n; j++)
        {
            // Both matrices are m x n, so both use the row stride n.
            const float diff = std::fabs(C2[i*n + j] - C1[i*n + j]);

            // Written as !(diff <= tol) so that a NaN difference is also
            // reported instead of silently passing the comparison.
            if (!(diff <= 1e-5))
            {
                printf("error! results are not equel!");
                return;   // one report is enough; stop scanning entirely
            }
        }
}
#ifdef SHOW
// Prints a rows x cols row-major matrix to stdout, one matrix row per line.
static void printMatrix(const float* mat, int rows, int cols)
{
    for (int i = 0; i < rows; i++){
        for (int j = 0; j < cols; j++)
            cout << mat[i*cols + j] << " ";
        cout << endl;
    }
}
#endif

// Builds two random matrices A (M x P) and B (P x N) with entries in 0..9,
// multiplies them on the CPU and on the GPU, and compares the two results.
// When SHOW is defined, the inputs and both results are also printed.
// Returns 0 on completion, 1 if the host buffers could not be allocated.
int main()
{
    // sizeof first so the size arithmetic is carried out in size_t.
    float* A = (float*)malloc(sizeof(float)*M*P);
    float* B = (float*)malloc(sizeof(float)*P*N);
    float* C1 = (float*)malloc(sizeof(float)*M*N);   // CPU result
    float* C2 = (float*)malloc(sizeof(float)*M*N);   // GPU result

    if (A == NULL || B == NULL || C1 == NULL || C2 == NULL)
    {
        fprintf(stderr, "error! failed to allocate host matrices\n");
        // free(NULL) is a no-op, so releasing all four is safe.
        free(A);
        free(B);
        free(C1);
        free(C2);
        return 1;
    }

    // Fill A first, then B, so the rand() sequence matches run to run.
    for (int i = 0; i < M; i++)
        for (int j = 0; j < P; j++)
            A[i*P + j] = rand() % 10;
    for (int i = 0; i < P; i++)
        for (int j = 0; j < N; j++)
            B[i*N + j] = rand() % 10;
#ifdef SHOW
    printMatrix(A, M, P);
    printMatrix(B, P, N);
#endif
    compute_cpu(A, B, C1);
#ifdef SHOW
    printMatrix(C1, M, N);
#endif
    compute_gpu(A, B, C2);
#ifdef SHOW
    printMatrix(C2, M, N);
#endif
    verify(C1, C2, M, N);

    free(A);
    free(B);
    free(C1);
    free(C2);
    return 0;
}
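// 2D matrix multiplication: CPU vs. GPU
// (Residue from the web page this file was copied from:)
// Post a comment
// All comments
// No one has commented yet. Want to be the first? Enter your comment in the box above and click Publish.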