二維矩陣相乘——CPU 與 GPU 實現

  1. #include <stdio.h>
  2. #include <stdlib.h>
  3. #include <time.h>
  4. #include <iostream>
  5. #include <cmath>
  6. using namespace std;
  7. #define M 3200
  8. #define N 3200
  9. #define P 3200
  10. #define B_S 32
  11. //#define SHOW
  12. //A[M][P]*B[P][N]=C[M][N]
  /*
   * Naive dense matrix multiply on the GPU: C = A * B, one thread per element of C.
   *
   * A holds m x p values, B holds p x n values and C receives m x n values; all
   * three are flat row-major arrays that must be dereferenceable from device code.
   *
   * Expected launch layout: a 2D grid of 2D blocks in which the x dimension
   * covers the columns of C (n) and the y dimension covers the rows of C (m).
   * Threads that fall outside the m x n output exit without touching memory.
   * No shared memory is used.
   */
  __global__
  void mulMatri_gpu(const float* A, const float* B, float* C, int m, int n, int p)
  {
      const int col = blockIdx.x * blockDim.x + threadIdx.x;
      const int row = blockIdx.y * blockDim.y + threadIdx.y;
      if (col >= n || row >= m) return;

      // Element offsets are formed in size_t so that row * p, k * n and row * n
      // cannot overflow a 32-bit int when the matrices are large.
      const size_t aRowStart = (size_t)row * p;
      float acc = 0.0f;
      for (int k = 0; k < p; k++)
      {
          acc += A[aRowStart + k] * B[(size_t)k * n + col];
      }
      C[(size_t)row * n + col] = acc;
  }
  /*
   * Reference CPU matrix multiply: C = A * B.
   *
   * A holds m x p values, B holds p x n values and C receives m x n values,
   * each stored as a flat row-major array.
   */
  void mulMatri_cpu(float* A, float* B, float* C, int m, int n, int p)
  {
      for (int row = 0; row < m; ++row)
      {
          for (int col = 0; col < n; ++col)
          {
              // Accumulate directly in the output cell, starting from zero.
              float& cell = C[row * n + col];
              cell = 0;
              for (int inner = 0; inner < p; ++inner)
              {
                  cell += A[row * p + inner] * B[inner * n + col];
              }
          }
      }
  }
  /* Reports a failed CUDA runtime call on stderr and terminates the program. */
  static void checkCuda(cudaError_t status, const char* what)
  {
      if (status != cudaSuccess)
      {
          fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
          exit(EXIT_FAILURE);
      }
  }

  /*
   * Multiplies the host matrices A (M x P) and B (P x N) on the GPU and writes
   * the product into the host matrix C (M x N).
   *
   * Prints the elapsed time measured with CUDA events. The timed region covers
   * the two host-to-device copies, the kernel, and the copy of C back to the host.
   * Any CUDA failure is reported and ends the program.
   */
  void compute_gpu(float* A, float *B, float *C)
  {
      const size_t bytesA = sizeof(float) * M * P;
      const size_t bytesB = sizeof(float) * P * N;
      const size_t bytesC = sizeof(float) * M * N;

      float *da = NULL, *db = NULL, *dc = NULL;
      checkCuda(cudaMalloc((void **)&da, bytesA), "cudaMalloc for A");
      checkCuda(cudaMalloc((void **)&db, bytesB), "cudaMalloc for B");
      checkCuda(cudaMalloc((void **)&dc, bytesC), "cudaMalloc for C");

      // Start of the timed region.
      float elapsedTime = 0.0f;
      cudaEvent_t start, stop;
      checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
      checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");
      checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");

      checkCuda(cudaMemcpy(da, A, bytesA, cudaMemcpyHostToDevice), "copy of A to device");
      checkCuda(cudaMemcpy(db, B, bytesB, cudaMemcpyHostToDevice), "copy of B to device");

      // The kernel maps x to columns of C (N) and y to rows of C (M), so the
      // grid must be sized the same way; sizing x from M only works when M == N.
      dim3 dimBlock(B_S, B_S);
      dim3 dimGrid((N + B_S - 1) / B_S, (M + B_S - 1) / B_S);
      mulMatri_gpu<<<dimGrid, dimBlock>>>(da, db, dc, M, N, P);
      // A kernel launch returns no status; launch errors must be queried explicitly.
      checkCuda(cudaGetLastError(), "mulMatri_gpu launch");

      // This blocking copy also waits for the kernel to finish.
      checkCuda(cudaMemcpy(C, dc, bytesC, cudaMemcpyDeviceToHost), "copy of C to host");

      // End of the timed region.
      checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
      checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
      checkCuda(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime");
      printf("the time on gpu is %f ms\n", elapsedTime);

      checkCuda(cudaFree(da), "cudaFree for A");
      checkCuda(cudaFree(db), "cudaFree for B");
      checkCuda(cudaFree(dc), "cudaFree for C");
      checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
      checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
  }
  /*
   * Runs the CPU reference multiply of A (M x P) by B (P x N) into C (M x N)
   * and prints how much processor time it took, in milliseconds.
   */
  void compute_cpu(float* A, float *B, float *C)
  {
      const clock_t start = clock();
      mulMatri_cpu(A, B, C, M, N, P);
      const clock_t finish = clock();

      // clock() counts implementation-defined ticks; CLOCKS_PER_SEC converts them
      // to seconds, so scale by 1000 to report real milliseconds.
      const double elapsedMs = 1000.0 * (double)(finish - start) / CLOCKS_PER_SEC;
      printf("the time on cpu is %f ms\n", elapsedMs);
  }
  /*
   * Compares two m x n row-major matrices element by element.
   *
   * Prints a single error message and stops at the first element whose absolute
   * difference exceeds 1e-5; prints nothing when the matrices agree.
   */
  void verify(float *C1, float *C2, int m, int n)
  {
      for (int i = 0; i < m; i++)
      {
          for (int j = 0; j < n; j++)
          {
              // Both matrices share the same row stride n.
              const int idx = i * n + j;
              // Absolute difference, so deviations in either direction are caught.
              if (std::fabs(C2[idx] - C1[idx]) > 1e-5)
              {
                  printf("error! results are not equel!");
                  return;  // one report is enough; leave both loops
              }
          }
      }
  }
  #ifdef SHOW
  /* Prints a rows x cols row-major matrix, one row per line, values separated by spaces. */
  static void printMatrix(const float* mat, int rows, int cols)
  {
      for (int i = 0; i < rows; i++) {
          for (int j = 0; j < cols; j++)
              std::cout << mat[i * cols + j] << " ";
          std::cout << std::endl;
      }
  }
  #endif

  /*
   * Fills A (M x P) and B (P x N) with pseudo-random integers in [0, 9],
   * multiplies them on the CPU (into C1) and on the GPU (into C2), and compares
   * the two results. When SHOW is defined, the inputs and both results are printed.
   *
   * Returns 0 on completion, 1 if a host buffer could not be allocated.
   */
  int main()
  {
      // sizeof(float) comes first so the byte count is computed in size_t, not int.
      float* A = (float*)malloc(sizeof(float) * M * P);
      float* B = (float*)malloc(sizeof(float) * P * N);
      float* C1 = (float*)malloc(sizeof(float) * M * N);
      float* C2 = (float*)malloc(sizeof(float) * M * N);
      if (A == NULL || B == NULL || C1 == NULL || C2 == NULL)
      {
          fprintf(stderr, "host memory allocation failed\n");
          // free(NULL) is a no-op, so releasing all four is safe.
          free(A);
          free(B);
          free(C1);
          free(C2);
          return 1;
      }

      // Fill order matches row-major traversal: all of A first, then all of B.
      for (int i = 0; i < M; i++)
          for (int j = 0; j < P; j++)
              A[i * P + j] = (float)(rand() % 10);
      for (int i = 0; i < P; i++)
          for (int j = 0; j < N; j++)
              B[i * N + j] = (float)(rand() % 10);
  #ifdef SHOW
      printMatrix(A, M, P);
      printMatrix(B, P, N);
  #endif

      compute_cpu(A, B, C1);
  #ifdef SHOW
      printMatrix(C1, M, N);
  #endif

      compute_gpu(A, B, C2);
  #ifdef SHOW
      printMatrix(C2, M, N);
  #endif

      verify(C1, C2, M, N);

      free(A);
      free(B);
      free(C1);
      free(C2);
      return 0;
  }





發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章