OpenACC 優化矩陣乘法

#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <iostream>

// Upper bound on the matrix order. The matrices below are statically sized
// N x N, so raise this value to accept larger inputs.
constexpr int N = 1000;

using namespace std;
using namespace std::chrono;

// Operand matrices a and b, and the product matrix c. They live at namespace
// scope, so they are zero-initialised at program start.
double a[N][N], b[N][N], c[N][N];

int main()
{
    double tmp;
    int n;//矩陣大小
    scanf("%d",&n);

    //輸入矩陣
    for(int i=0; i<n; i++){
        for(int j=0; j<n; j++){
            scanf("%lf",&a[i][j]);
        }
    }
    for(int i=0; i<n; i++){
        for(int j=0; j<n; j++){
            scanf("%lf",&b[i][j]);
        }
    }

#pragma acc enter data create(a, b, c)
#pragma acc kernels present(a, b, c)
    {
        for(int i=0; i<n; i++){//初始化數組c,每次都清零
            for(int j=0; j<n; j++){
                c[i][j] = 0;
            }
        }

    }

    high_resolution_clock::time_point t1 = high_resolution_clock::now();

#pragma acc kernels present(a, b, c)       // 最簡單的,每層循環都 auto
    {
#pragma acc loop auto
        for (int i=0; i<n; i++)
        {
#pragma acc loop auto
            for (int j=0; j<n; j++)
            {
#pragma acc loop auto
                for (int k=0; k<n; k++){
                        c[i][j] += a[i][k] * b[k][j];//矩陣乘法
                    }
                printf("%.3lf  ",c[i][j]);//這裏可以控制輸出精度
            }

            printf("\n");
        }
    }

    //可以計算出所需要的時間
    high_resolution_clock::time_point t2 = high_resolution_clock::now();
    duration<double> time = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenACC - Auto: %.6lf s.\n\n", time.count());


    
    
#pragma acc kernels present(c)
    for (int i=0; i<n; i++)
    {
        for (int j=0; j<n; j++)
            c[i][j] = 0.0;
    }

    t1 = high_resolution_clock::now();

#pragma acc kernels present(a, b, c)        // 方法 2,外兩層 independent,最裏層串行
    {
#pragma acc loop independent
        for (int i=0; i<n; i++)
        {
#pragma acc loop independent
            for (int j=0; j<n; j++)
            {
#pragma acc loop independent
                for (int k=0; k<n; k++)
                    c[i][j] += a[i][k] * b[k][j];
                printf("%.3lf  ",c[i][j]);//這裏可以控制輸出精度
            }
            printf("\n");
        }
    }

    t2 = high_resolution_clock::now();
    time = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenACC - Independent Seq: %.6lf s.\n\n", time.count());
    
    
    


#pragma acc kernels present(c)
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
            c[i][j] = 0.0;
    }

    t1 = high_resolution_clock::now();

#pragma acc kernels present(a, b, c)        // 方法 3,外兩層 independent,最裏層規約
    {
#pragma acc loop independent
        for (int i=0; i<n; i++)
        {
#pragma acc loop independent
            for (int j=0; j<n; j++)
            {
                tmp = 0.0f;
#pragma acc loop reduction(+: tmp)
                for (int k=0; k<n; k++)
                    tmp += a[i][k] * b[k][j];
                c[i][j] = tmp;
                printf("%.3lf  ",c[i][j]);//這裏可以控制輸出精度
            }
             printf("\n");
        }
    }

    t2 = high_resolution_clock::now();
    time = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenACC - Independent Reduction: %.6lf s.\n\n", time.count());
    
    
    
    
    
    
    
    
   

#pragma acc kernels present(c)
    for (int i=0; i<n; i++)
    {
        for (int j=0; j<n; j++)
            c[i][j] = 0.0;
    }

    t1 = high_resolution_clock::now();

#pragma acc kernels present(a, b, c)        // 方法 4,手動指定 gang 和 vector
    {
#pragma acc loop gang(32)
        for (int i=0; i<n; i++)
        {
#pragma acc loop vector(16)
            for (int j=0; j<n; j++)
            {
                tmp = 0.0f;
#pragma acc loop reduction(+: tmp)
                for (int k=0; k<n; k++)
                    tmp += a[i][k] * b[k][j];
                c[i][j] = tmp;
                printf("%.3lf  ",c[i][j]);//這裏可以控制輸出精度
            }
            printf("\n");
        }
    }

    t2 = high_resolution_clock::now();
    time = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenACC - Gang Vector: %.6lf s.\n\n", time.count());
    
    
    
    
    
    
    
    
    

#pragma acc kernels present(c)
    for (int i=0; i<n; i++)
    {
        for (int j=0; j<n; j++)
            c[i][j] = 0.0;
    }

    t1 = high_resolution_clock::now();

#pragma acc kernels present(a, b, c)        // 方法 5,分塊重排
    {
#pragma acc loop tile(32, 32)
        for (int i=0; i<n; i++)
        {
            for (int j=0; j<n; j++)
            {
                tmp = 0.0f;
#pragma acc loop reduction(+ \
                           : tmp)
                for (int k=0; k<n; ++k)
                    tmp += a[i][k] * b[k][j];
                c[i][j] = tmp;
                printf("%.3lf  ",c[i][j]);//這裏可以控制輸出精度
            }
            printf("\n");
        }
    }

    t2 = high_resolution_clock::now();
    time = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenACC - tile: %.6lf s.\n\n", time.count());
    
    
    
    
    
    
    

#pragma acc kernels present(c)
    for (int i=0; i<n; i++)
    {
        for (int j=0; j<n; j++)
            c[i][j] = 0.0;
    }

    t1 = high_resolution_clock::now();

#pragma acc kernels present(a, b, c)        // 方法 6,合併多層迭代
    {
#pragma acc loop collapse(2) independent
        for (int i=0; i<n; i++)
        {
            for (int j=0; j<n; j++)
            {
                tmp = 0.0f;
#pragma acc loop reduction(+: tmp)
                for (int k=0; k<n; k++)
                    tmp += a[i][k] * b[k][j];
                c[i][j] = tmp;
                printf("%.3lf  ",c[i][j]);//這裏可以控制輸出精度
            }
            printf("\n");
        }
    }

    t2 = high_resolution_clock::now();
    time = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenACC - Collapse: %.6lf s.\n\n", time.count());
    
    return 0;
}

參考鏈接:

https://www.cnblogs.com/cuancuancuanhao/p/9459007.html

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章