#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <iostream>
// Compile-time extent of each dimension of the matrices declared below.
#define N 1000// may be changed to resize the matrices
using namespace std;
using namespace std::chrono;
// File-scope N-by-N matrices of doubles. main() reads a and b from standard
// input and stores the product a * b in c.
double a[N][N], b[N][N], c[N][N];
int main()
{
double tmp;
int n;//矩陣大小
scanf("%d",&n);
//輸入矩陣
for(int i=0; i<n; i++){
for(int j=0; j<n; j++){
scanf("%lf",&a[i][j]);
}
}
for(int i=0; i<n; i++){
for(int j=0; j<n; j++){
scanf("%lf",&b[i][j]);
}
}
#pragma acc enter data create(a, b, c)
#pragma acc kernels present(a, b, c)
{
for(int i=0; i<n; i++){//初始化數組c,每次都清零
for(int j=0; j<n; j++){
c[i][j] = 0;
}
}
}
high_resolution_clock::time_point t1 = high_resolution_clock::now();
#pragma acc kernels present(a, b, c) // 最簡單的,每層循環都 auto
{
#pragma acc loop auto
for (int i=0; i<n; i++)
{
#pragma acc loop auto
for (int j=0; j<n; j++)
{
#pragma acc loop auto
for (int k=0; k<n; k++){
c[i][j] += a[i][k] * b[k][j];//矩陣乘法
}
printf("%.3lf ",c[i][j]);//這裏可以控制輸出精度
}
printf("\n");
}
}
//可以計算出所需要的時間
high_resolution_clock::time_point t2 = high_resolution_clock::now();
duration<double> time = duration_cast<duration<double>>(t2 - t1);
printf("Time OpenACC - Auto: %.6lf s.\n\n", time.count());
#pragma acc kernels present(c)
for (int i=0; i<n; i++)
{
for (int j=0; j<n; j++)
c[i][j] = 0.0;
}
t1 = high_resolution_clock::now();
#pragma acc kernels present(a, b, c) // 方法 2,外兩層 independent,最裏層串行
{
#pragma acc loop independent
for (int i=0; i<n; i++)
{
#pragma acc loop independent
for (int j=0; j<n; j++)
{
#pragma acc loop independent
for (int k=0; k<n; k++)
c[i][j] += a[i][k] * b[k][j];
printf("%.3lf ",c[i][j]);//這裏可以控制輸出精度
}
printf("\n");
}
}
t2 = high_resolution_clock::now();
time = duration_cast<duration<double>>(t2 - t1);
printf("Time OpenACC - Independent Seq: %.6lf s.\n\n", time.count());
#pragma acc kernels present(c)
for (int i = 0; i < N; i++)
{
for (int j = 0; j < N; j++)
c[i][j] = 0.0;
}
t1 = high_resolution_clock::now();
#pragma acc kernels present(a, b, c) // 方法 3,外兩層 independent,最裏層規約
{
#pragma acc loop independent
for (int i=0; i<n; i++)
{
#pragma acc loop independent
for (int j=0; j<n; j++)
{
tmp = 0.0f;
#pragma acc loop reduction(+: tmp)
for (int k=0; k<n; k++)
tmp += a[i][k] * b[k][j];
c[i][j] = tmp;
printf("%.3lf ",c[i][j]);//這裏可以控制輸出精度
}
printf("\n");
}
}
t2 = high_resolution_clock::now();
time = duration_cast<duration<double>>(t2 - t1);
printf("Time OpenACC - Independent Reduction: %.6lf s.\n\n", time.count());
#pragma acc kernels present(c)
for (int i=0; i<n; i++)
{
for (int j=0; j<n; j++)
c[i][j] = 0.0;
}
t1 = high_resolution_clock::now();
#pragma acc kernels present(a, b, c) // 方法 4,手動指定 gang 和 vector
{
#pragma acc loop gang(32)
for (int i=0; i<n; i++)
{
#pragma acc loop vector(16)
for (int j=0; j<n; j++)
{
tmp = 0.0f;
#pragma acc loop reduction(+: tmp)
for (int k=0; k<n; k++)
tmp += a[i][k] * b[k][j];
c[i][j] = tmp;
printf("%.3lf ",c[i][j]);//這裏可以控制輸出精度
}
printf("\n");
}
}
t2 = high_resolution_clock::now();
time = duration_cast<duration<double>>(t2 - t1);
printf("Time OpenACC - Gang Vector: %.6lf s.\n\n", time.count());
#pragma acc kernels present(c)
for (int i=0; i<n; i++)
{
for (int j=0; j<n; j++)
c[i][j] = 0.0;
}
t1 = high_resolution_clock::now();
#pragma acc kernels present(a, b, c) // 方法 5,分塊重排
{
#pragma acc loop tile(32, 32)
for (int i=0; i<n; i++)
{
for (int j=0; j<n; j++)
{
tmp = 0.0f;
#pragma acc loop reduction(+ \
: tmp)
for (int k=0; k<n; ++k)
tmp += a[i][k] * b[k][j];
c[i][j] = tmp;
printf("%.3lf ",c[i][j]);//這裏可以控制輸出精度
}
printf("\n");
}
}
t2 = high_resolution_clock::now();
time = duration_cast<duration<double>>(t2 - t1);
printf("Time OpenACC - tile: %.6lf s.\n\n", time.count());
#pragma acc kernels present(c)
for (int i=0; i<n; i++)
{
for (int j=0; j<n; j++)
c[i][j] = 0.0;
}
t1 = high_resolution_clock::now();
#pragma acc kernels present(a, b, c) // 方法 6,合併多層迭代
{
#pragma acc loop collapse(2) independent
for (int i=0; i<n; i++)
{
for (int j=0; j<n; j++)
{
tmp = 0.0f;
#pragma acc loop reduction(+: tmp)
for (int k=0; k<n; k++)
tmp += a[i][k] * b[k][j];
c[i][j] = tmp;
printf("%.3lf ",c[i][j]);//這裏可以控制輸出精度
}
printf("\n");
}
}
t2 = high_resolution_clock::now();
time = duration_cast<duration<double>>(t2 - t1);
printf("Time OpenACC - Collapse: %.6lf s.\n\n", time.count());
return 0;
}
// Reference links: