二維矩陣相乘——cpu&&gpu

原創

qulay

2020-06-22 02:32

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <iostream>
#include <cmath>
using namespace std;
#define M 3200
#define N 3200
#define P 3200
#define B_S 32
//#define SHOW
// Computes C[M][N] = A[M][P] * B[P][N]; every matrix is a flat array indexed row-major.
/**
 * Naive matrix-multiply kernel: C = A * B, one thread per element of C.
 *
 * A is indexed as an m x p row-major array, B as p x n and C as m x n.
 * The x component of the launch selects the column of C and the y component
 * selects the row; threads that fall outside the m x n result write nothing.
 *
 * @param A  left operand, m*p floats
 * @param B  right operand, p*n floats
 * @param C  output, m*n floats
 * @param m  rows of A and C
 * @param n  columns of B and C
 * @param p  columns of A / rows of B (length of each dot product)
 */
__global__
void mulMatri_gpu(float* A, float* B, float* C, int m, int n, int p)
{
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Only threads mapped to a real element of C do any work.
    if (row < m && col < n)
    {
        float acc = 0;
        int k = 0;
        while (k < p)
        {
            // Row `row` of A dotted with column `col` of B.
            acc += A[row*p + k] * B[k*n + col];
            ++k;
        }
        C[row*n + col] = acc;
    }
}
/**
 * Reference CPU matrix multiply: C = A * B.
 *
 * A is indexed as an m x p row-major array, B as p x n and C as m x n.
 * Every element of C is reset to zero and then accumulated in place, so the
 * previous contents of C do not matter.
 *
 * @param A  left operand, m*p floats
 * @param B  right operand, p*n floats
 * @param C  output, m*n floats (overwritten)
 * @param m  rows of A and C
 * @param n  columns of B and C
 * @param p  columns of A / rows of B
 */
void mulMatri_cpu(float* A, float* B, float* C, int m, int n, int p)
{
    for (int row = 0; row < m; ++row)
    {
        for (int col = 0; col < n; ++col)
        {
            // Accumulate straight into the output cell.
            float& cell = C[row*n + col];
            cell = 0;
            for (int inner = 0; inner < p; ++inner)
            {
                cell += A[row*p + inner] * B[inner*n + col];
            }
        }
    }
}
/** Aborts with a diagnostic when a CUDA runtime call reports failure. */
static void compute_gpu_check(cudaError_t status, const char* what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/**
 * Multiplies A (M x P) by B (P x N) on the GPU and stores the result in C (M x N).
 *
 * Allocates device buffers, copies A and B to the device, launches
 * mulMatri_gpu, copies the product back into C and prints the elapsed time.
 * The timed region covers both host-to-device copies, the kernel and the
 * device-to-host copy; the allocations are outside it.
 *
 * Any CUDA runtime failure prints a message to stderr and terminates the
 * process instead of being silently ignored.
 *
 * @param A  host pointer to M*P floats
 * @param B  host pointer to P*N floats
 * @param C  host pointer receiving M*N floats
 */
void compute_gpu(float* A, float *B, float *C)
{
    const size_t bytesA = sizeof(float) * M * P;
    const size_t bytesB = sizeof(float) * P * N;
    const size_t bytesC = sizeof(float) * M * N;

    float *da = NULL, *db = NULL, *dc = NULL;
    compute_gpu_check(cudaMalloc((void **)&da, bytesA), "cudaMalloc(A)");
    compute_gpu_check(cudaMalloc((void **)&db, bytesB), "cudaMalloc(B)");
    compute_gpu_check(cudaMalloc((void **)&dc, bytesC), "cudaMalloc(C)");

    // Start of the timed region.
    float elapsedTime = 0.0f;
    cudaEvent_t start, stop;
    compute_gpu_check(cudaEventCreate(&start), "cudaEventCreate(start)");
    compute_gpu_check(cudaEventCreate(&stop), "cudaEventCreate(stop)");
    compute_gpu_check(cudaEventRecord(start, 0), "cudaEventRecord(start)");

    compute_gpu_check(cudaMemcpy(da, A, bytesA, cudaMemcpyHostToDevice), "copy A to device");
    compute_gpu_check(cudaMemcpy(db, B, bytesB, cudaMemcpyHostToDevice), "copy B to device");

    // The kernel maps x to columns (N) and y to rows (M), so the grid must be
    // sized in that order; sizing x from M only works when M == N.
    dim3 dimBlock(B_S, B_S);
    dim3 dimGrid((N + B_S - 1) / B_S, (M + B_S - 1) / B_S);
    mulMatri_gpu<<<dimGrid, dimBlock>>>(da, db, dc, M, N, P);
    // A launch does not return a status; configuration errors show up here.
    compute_gpu_check(cudaGetLastError(), "mulMatri_gpu launch");

    // Blocking copy: it waits for the kernel and reports execution errors.
    compute_gpu_check(cudaMemcpy(C, dc, bytesC, cudaMemcpyDeviceToHost), "copy C to host");

    // End of the timed region.
    compute_gpu_check(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    compute_gpu_check(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
    compute_gpu_check(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime");
    printf("the time on gpu is %f ms\n", elapsedTime);

    compute_gpu_check(cudaFree(da), "cudaFree(A)");
    compute_gpu_check(cudaFree(db), "cudaFree(B)");
    compute_gpu_check(cudaFree(dc), "cudaFree(C)");
    compute_gpu_check(cudaEventDestroy(start), "cudaEventDestroy(start)");
    compute_gpu_check(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
}
/**
 * Multiplies A (M x P) by B (P x N) on the CPU via mulMatri_cpu, stores the
 * result in C (M x N) and prints the processor time consumed in milliseconds.
 *
 * clock() counts in units of 1/CLOCKS_PER_SEC seconds, so the tick difference
 * is scaled explicitly; printing raw ticks is only correct on platforms where
 * CLOCKS_PER_SEC happens to be 1000.
 *
 * @param A  pointer to M*P floats
 * @param B  pointer to P*N floats
 * @param C  pointer receiving M*N floats
 */
void compute_cpu(float* A, float *B, float *C)
{
    clock_t start = clock();
    mulMatri_cpu(A, B, C, M, N, P);
    clock_t finish = clock();

    double elapsedMs = 1000.0 * (double)(finish - start) / (double)CLOCKS_PER_SEC;
    printf("the time on cpu is %f ms\n", elapsedMs);
}
/**
 * Compares two m x n row-major matrices element by element and prints an
 * error message if any pair differs by more than 1e-5 in absolute value.
 *
 * Both matrices are indexed with the row stride n. The difference is compared
 * by magnitude, so a mismatch is reported regardless of which matrix holds
 * the larger value, and a NaN difference also counts as a mismatch. The
 * message is printed once, at the first mismatch.
 *
 * @param C1  first matrix, m*n floats
 * @param C2  second matrix, m*n floats
 * @param m   number of rows
 * @param n   number of columns
 */
void verify(float *C1, float *C2, int m, int n)
{
    for (int i = 0; i < m; i++)
        for (int j = 0; j < n; j++)
        {
            float diff = C2[i*n + j] - C1[i*n + j];
            // Negated "<=" so that a NaN difference is treated as a mismatch.
            if (!(std::fabs(diff) <= 1e-5))
            {
                printf("error! results are not equel！");
                return;  // one report is enough; stop scanning
            }
        }
}
#ifdef SHOW
/** Prints a rows x cols row-major matrix, one row per line (SHOW builds only). */
static void showMatrix(const float* mat, int rows, int cols)
{
    for (int i = 0; i < rows; i++){
        for (int j = 0; j < cols; j++)
            cout << mat[i*cols + j] << " ";
        cout << endl;
    }
}
#endif

/**
 * Fills A (M x P) and B (P x N) with pseudo-random integers in [0, 9],
 * multiplies them on the CPU (into C1) and on the GPU (into C2), and compares
 * the two results. When SHOW is defined the inputs and both products are
 * printed as well.
 *
 * @return 0 on completion, 1 if a host buffer could not be allocated
 */
int main()
{
    // sizeof first, so the byte count is computed in size_t rather than int.
    float* A = (float*)malloc(sizeof(float) * M * P);
    float* B = (float*)malloc(sizeof(float) * P * N);
    float* C1 = (float*)malloc(sizeof(float) * M * N);
    float* C2 = (float*)malloc(sizeof(float) * M * N);
    if (A == NULL || B == NULL || C1 == NULL || C2 == NULL)
    {
        fprintf(stderr, "failed to allocate host matrices\n");
        // free(NULL) is a no-op, so releasing all four is safe.
        free(A);
        free(B);
        free(C1);
        free(C2);
        return 1;
    }

    for (int i = 0; i < M; i++)
        for (int j = 0; j < P; j++)
            A[i*P + j] = (float)(rand() % 10);
    for (int i = 0; i < P; i++)
        for (int j = 0; j < N; j++)
            B[i*N + j] = (float)(rand() % 10);
#ifdef SHOW
    showMatrix(A, M, P);
    showMatrix(B, P, N);
#endif
    compute_cpu(A, B, C1);
#ifdef SHOW
    showMatrix(C1, M, N);
#endif
    compute_gpu(A, B, C2);
#ifdef SHOW
    showMatrix(C2, M, N);
#endif
    verify(C1, C2, M, N);

    free(A);
    free(B);
    free(C1);
    free(C2);
    return 0;
}

來自爲知筆記(Wiz)

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

二維矩陣相乘——cpu&&gpu

釘釘打卡速度慢

Nginx R31 doc 官方文檔-01-nginx 如何安裝

Python 潮流週刊#51：用 Python 繪製美觀的圖表

Qt/C++音視頻開發74-合併標籤圖形/生成yolo運算結果圖形/文字和圖形合併成一個/水印濾鏡

挑戰程序設計競賽 2.2章習題 POJ - 3617 Best Cow Line 貪心

字節面試：MySQL什麼時候鎖表？如何防止鎖表？

.NET8連接SQL SERVER 2008 R2 報：證書鏈是由不受信任的頒發機構頒發的

golang開發環境搭建(win10)

python計算機視覺學習筆記——PIL庫的用法

Golang初學：獲取程序內存使用情況，std runtime

多人多車求距離_cpu&&gpu_寄存器優化_sharememory優化

二維矩陣相乘——cpu&&gpu

菜鳥看論文——地面估計的研究趨勢（一）

菜鳥看論文——Stochastic Occupancy Grids and Dynamic Programming（一）

【轉載】參數估計(Parameter Estimation)：頻率學派（最大似然估計MLE、最大後驗估計MAP）與貝葉斯學派（貝葉斯估計BPE）

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結