Recently I have been studying parallel programming. Parallel programs can achieve better performance on multicore machines. For example, a program that creates two threads to do a job can cut its execution time by nearly 50% on a dual-core machine compared with a single-core machine. The same performance can also be obtained without splitting the work into two threads by hand, by using OpenMP directives. The code is below; it was compiled with the Intel C++ Compiler. The Visual C++ Compiler doesn't support OpenMP automatic optimization.
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <windows.h>
#include <intrin.h>
#include <omp.h>
#include <iostream>
using namespace std;
// Number of elements in Workload::data; the loop bounds of every summing
// routine in this file are derived from it.
#define N 1000000
// Benchmark payload: the array that gets summed plus a slot for the result.
struct Workload
{
// Values to add up; init_Workload() stores i in data[i].
long data[N];
// Combined result of the two worker threads, written by main().
_int64 sum;
};
// The single shared workload instance. main() allocates it with malloc();
// the thread routines CalSum/CalSum2 read it through this global.
struct Workload * work;
// Fills the workload so that element k holds the value k, for k in [0, N).
// `work` must point to a valid Workload; it is not checked here.
void init_Workload(struct Workload * work)
{
    long *slot = work->data;
    for (long value = 0; value < N; ++value, ++slot)
    {
        *slot = value;
    }
}
/**
 * Thread entry point for the interleaved split: adds up every other element
 * of the global workload.
 *
 * @param n  Worker index smuggled through the pointer value (not a real
 *           address). 0 sums the even indices 0, 2, 4, ...; any other value
 *           sums the odd indices 1, 3, 5, ...
 * @return   The partial sum as the thread exit code. The exit code is a
 *           DWORD, so the value wraps modulo 2^32.
 */
DWORD WINAPI CalSum(LPVOID n)
{
    // Go through INT_PTR: casting a pointer straight to int truncates it on
    // 64-bit builds (and is rejected outright by some compilers).
    const int workerId = static_cast<int>(reinterpret_cast<INT_PTR>(n));
    const int first = (workerId == 0) ? 0 : 1;

    DWORD sum = 0;
    for (int i = first; i < N; i += 2)
        sum += work->data[i];

    // Print the decoded index; passing the raw pointer for %d is undefined.
    printf("Work%d has finished.\n", workerId);
    return sum;
}
/**
 * Thread entry point for the contiguous split: adds up one half of the
 * global workload.
 *
 * @param n  Worker index smuggled through the pointer value (not a real
 *           address). 0 sums indices [0, N/2); any other value sums
 *           indices [N/2, N).
 * @return   The partial sum as the thread exit code. The exit code is a
 *           DWORD, so the value wraps modulo 2^32.
 */
DWORD WINAPI CalSum2(LPVOID n)
{
    // Go through INT_PTR: casting a pointer straight to int truncates it on
    // 64-bit builds (and is rejected outright by some compilers).
    const int workerId = static_cast<int>(reinterpret_cast<INT_PTR>(n));

    // Both halves meet at `middle`. Starting the upper half at middle + 1
    // would leave data[middle] out of both partial sums.
    const int middle = N / 2;
    const int first = (workerId == 0) ? 0 : middle;
    const int last = (workerId == 0) ? middle : N;

    DWORD sum = 0;
    for (int i = first; i < last; ++i)
        sum += work->data[i];

    // Print the decoded index; passing the raw pointer for %d is undefined.
    printf("Work%d has finished.\n", workerId);
    return sum;
}
/**
 * Returns the current value of the processor's time-stamp counter.
 *
 * Uses the __rdtsc() compiler intrinsic instead of inline assembly: the
 * value is returned explicitly rather than relying on RDTSC leaving the
 * result in the return registers of a function with no return statement,
 * and the intrinsic is also available where inline assembly is not.
 */
inline unsigned _int64 get_cycles()
{
    return __rdtsc();
}
int main()
{
//long n = N;
//printf("%ld/n%d,%d",n,sizeof(long),sizeof(_int64));
int i, n = 2;
unsigned _int64 start, end;
HANDLE hWorkers[2];
DWORD dwWorkerID[2];
DWORD dwExitCodes[2];
work = (struct Workload *)malloc(sizeof(struct Workload));
assert(work);
start = get_cycles();
init_Workload(work);
end = get_cycles();
cout<<"The init costs "<<end<<" - "<<start<<" = "<<end - start<<" cycles."<<endl;
//start = get_cycles();
cout<<endl;
cout<<"Well balanced win32 multithreading..."<<endl;
for(i = 0; i < n; ++i)
{
hWorkers[i] = CreateThread(NULL,0,CalSum,(LPVOID)i,0,&dwWorkerID[i]);
if(hWorkers[i])
printf("Worker%d starts to work./n",i);
else
printf("Worker%d has problem working./n",i);
}
start = get_cycles();
WaitForMultipleObjects(2,hWorkers,TRUE,INFINITE);
end = get_cycles();
GetExitCodeThread(hWorkers[0], &dwExitCodes[0]);
GetExitCodeThread(hWorkers[1], &dwExitCodes[1]);
work->sum = dwExitCodes[0] + dwExitCodes[1];
printf("The parallel result is: sum = %ld, ",work->sum);
cout<<"it costs "<<end<<" - "<<start<<" = "<<end - start<<" cycles."<<endl;
CloseHandle(hWorkers[0]);
CloseHandle(hWorkers[1]);
cout<<endl;
cout<<"Not well balanced win32 multithreading..."<<endl;
for(i = 0; i < n; ++i)
{
hWorkers[i] = CreateThread(NULL,0,CalSum2,(LPVOID)i,0,&dwWorkerID[i]);
if(hWorkers[i])
printf("Worker%d starts to work./n",i);
else
printf("Worker%d has problem working./n",i);
}
start = get_cycles();
WaitForMultipleObjects(2,hWorkers,TRUE,INFINITE);
end = get_cycles();
GetExitCodeThread(hWorkers[0], &dwExitCodes[0]);
GetExitCodeThread(hWorkers[1], &dwExitCodes[1]);
work->sum = dwExitCodes[0] + dwExitCodes[1];
printf("The parallel result is: sum = %ld, ",work->sum);
cout<<"it costs "<<end<<" - "<<start<<" = "<<end - start<<" cycles."<<endl;
CloseHandle(hWorkers[0]);
CloseHandle(hWorkers[1]);
cout<<endl;
cout<<"Sequential...(single thread)"<<endl;
_int64 sum = 0;
start = get_cycles();
for(i = 0; i < N; ++i)
sum += work->data[i];
end = get_cycles();
printf("The sequent result is: sum = %ld, ",sum);
cout<<"it costs "<<end<<" - "<<start<<" = "<<end - start<<" cycles."<<endl;
cout<<endl;
/**/
cout<<"OpenMP multithreading..."<<endl;
sum = 0;
start = get_cycles();
#pragma omp parallel for private(i) firstprivate(work) reduction(+:sum) num_threads(2)
for(i = 0; i < N; ++i)
sum += work->data[i];
end = get_cycles();
printf("The openmp result is: sum = %ld, ",sum);
cout<<"it costs "<<end<<" - "<<start<<" = "<<end - start<<" cycles."<<endl;
/**/
cout<<endl;
return EXIT_SUCCESS;
}