本系列文章均爲個人學習筆記
Linux環境對OpenMP的支持:
在Linux上編譯和運行OpenMP程序
編譯OpenMP程序: gcc -fopenmp a.c
運行OpenMP程序: ./a.out
1. 串行計算π
#include <stdio.h>
#include <omp.h>
static long num_steps = 100000000;
double step;

/*
 * Serial reference version.
 *
 * Approximates pi as the midpoint-rule sum of 4/(1+x^2) over [0,1] using
 * num_steps rectangles of width `step`, then prints the result together
 * with the elapsed wall-clock time reported by omp_get_wtime().
 */
int main ()
{
    double acc = 0.0;
    double t0, elapsed, pi;
    int k;

    step = 1.0/(double) num_steps;
    t0 = omp_get_wtime();

    /* Rectangle k (1-based) is sampled at its midpoint, (k - 0.5) * step. */
    k = 1;
    while (k <= num_steps) {
        double mid = (k-0.5)*step;
        acc = acc + 4.0/(1.0+mid*mid);
        k++;
    }

    pi = step * acc;
    elapsed = omp_get_wtime() - t0;
    printf("\n pi with %ld steps is %lf in %lf seconds\n ",num_steps,pi,elapsed);
}
結果:
2. 使用並行域並行化的程序:
#include <stdio.h>
#include <omp.h>
#define MAX_THREADS 4
static long num_steps = 100000000;
double step;

/*
 * SPMD version using a bare parallel region.
 *
 * For each requested thread count j = 1..MAX_THREADS, every thread
 * accumulates a cyclic share of the midpoint-rule sum into its own slot
 * of the shared array sum[], and the slots are combined serially after
 * the region. The run time and the resulting value of pi are printed
 * for every thread count.
 *
 * NOTE: the per-thread slots of sum[] are adjacent doubles, so they are
 * likely to sit in the same cache line and suffer from false sharing.
 * This layout is kept on purpose: the program exists to show that cost.
 */
int main ()
{
    int i,j;
    double pi, full_sum = 0.0;
    double start_time, run_time;
    double sum[MAX_THREADS];

    step = 1.0/(double) num_steps;
    for (j=1;j<=MAX_THREADS ;j++) {
        /*
         * Number of threads the runtime actually provided. It may be
         * smaller than the j we asked for, so the final summation must
         * use this value rather than j; otherwise it would read slots
         * of sum[] that no thread wrote in this round.
         */
        int team_size = 0;

        omp_set_num_threads(j);
        full_sum=0.0;
        start_time = omp_get_wtime();
#pragma omp parallel /* start of the parallel region: every thread in the team runs this block */
        {
            int k;
            int id = omp_get_thread_num();
            int numthreads = omp_get_num_threads();
            double x;

            sum[id] = 0.0;
            if (id == 0) { /* only one thread records and prints the team size */
                team_size = numthreads;
                printf(" num_threads = %d",numthreads);
            }
            /* Cyclic distribution: thread id handles steps id, id+numthreads, ... */
            for (k=id;k< num_steps; k+=numthreads){
                x = (k+0.5)*step;
                sum[id] = sum[id] + 4.0/(1.0+x*x);
            }
        }
        /* The implicit barrier at the end of the region makes team_size and sum[] safe to read. */
        for(full_sum = 0.0, i=0;i<team_size;i++){
            full_sum += sum[i];
        }
        pi = step * full_sum;
        run_time = omp_get_wtime() - start_time;
        printf("\n pi is %f in %f seconds %d thrds \n",pi,run_time,j);
    }
}
//共4個線程參加計算,其中線程0進行迭代步0,4,...線程1進行迭代步1,5,....
結果:
我們發現使用並行計算後耗時反而更多,原因是僞共享(False sharing):sum[] 中相鄰的元素很可能位於同一條緩存行上,各線程頻繁寫入各自的 sum[id] 時,會使其他核心上的這條緩存行反覆失效。(細節可自行百度)
3. 使用private子句和critical制導語句並行化:
#include <stdio.h>
#include <omp.h>
#define MAX_THREADS 4
static long num_steps = 100000000;
double step;

/*
 * SPMD version using a thread-private accumulator and a critical section.
 *
 * For each requested thread count j = 1..MAX_THREADS, every thread sums a
 * cyclic share of the midpoint-rule terms into a local variable and adds
 * that partial result to the shared total exactly once, inside a critical
 * section. The run time and the resulting value of pi are printed for
 * every thread count.
 */
int main ()
{
    int i,j;
    double pi, full_sum = 0.0;
    double start_time, run_time;

    step = 1.0/(double) num_steps;
    for(j = 1; j <= MAX_THREADS; j++)
    {
        omp_set_num_threads(j);
        full_sum = 0.0;
        start_time = omp_get_wtime();
#pragma omp parallel private(i) /* each thread gets its own copy of i */
        {
            int id = omp_get_thread_num();
            int numthreads = omp_get_num_threads();
            double x;
            double partial_sum = 0;

#pragma omp single /* exactly one thread prints the team size */
            printf(" num_threads = %d",numthreads);

            /* Cyclic distribution: thread id handles steps id, id+numthreads, ... */
            for (i = id; i < num_steps; i += numthreads){
                x = (i+0.5)*step;
                partial_sum += 4.0/(1.0+x*x);
            }

#pragma omp critical /* only one thread at a time may update the shared total */
            full_sum += partial_sum;
        }
        pi = step * full_sum;
        run_time = omp_get_wtime() - start_time;
        printf("\n pi is %f in %f seconds %d threds \n ",pi,run_time,j);
    }
}
//共4個線程參加計算,其中線程0進行迭代步0,4....,線程1進行迭代步1,5....
結果:
速度明顯快了很多!
4. 使用歸約(reduction)子句並行化:
#include <stdio.h>
#include <omp.h>
#define MAX_THREADS 4
static long num_steps = 100000000;
double step;

/*
 * Work-sharing version using `omp for` with a reduction.
 *
 * For each requested thread count 1..MAX_THREADS, the midpoint-rule loop
 * is divided among the threads by the `for` construct and the per-thread
 * sums are combined by the reduction clause. The run time and the
 * resulting value of pi are printed for every thread count.
 */
int main ()
{
    int i, nthreads;
    double pi, sum = 0.0;
    double start_time, run_time;

    step = 1.0/(double) num_steps;
    /*
     * The thread-count loop uses its own counter. Sharing one variable
     * with the work-shared loop below would leave the outer loop
     * depending on whatever value the inner loop left behind.
     */
    for (nthreads = 1; nthreads <= MAX_THREADS; nthreads++)
    {
        sum = 0.0;
        omp_set_num_threads(nthreads);
        start_time = omp_get_wtime();
#pragma omp parallel
        {
#pragma omp single /* exactly one thread prints the team size */
            printf(" num_threads = %d",omp_get_num_threads());

            /*
             * Each thread works on a private copy of sum; the copies are
             * added into the shared sum when the loop finishes.
             */
#pragma omp for reduction(+:sum)
            for (i=1;i<= num_steps; i++){
                /* Declared inside the loop body so every thread has its own x (no data race). */
                double x = (i-0.5)*step;
                sum = sum + 4.0/(1.0+x*x);
            }
        }
        pi = step * sum;
        run_time = omp_get_wtime() - start_time;
        printf("\n pi is %f in %f seconds and %d threads\n",pi,run_time,nthreads);
    }
}
//以4個線程爲例:在默認調度(通常爲靜態調度)下,迭代被劃分爲連續的區塊,線程0進行迭代步1~25000000,線程1進行迭代步25000001~50000000,依此類推。
結果:
速度一般般