Parallel program running in dual-core environment

原創

2020-06-20 09:19

Recently I have been studying parallel programming. Parallel programs can achieve better performance on multicore machines. For example, a program that creates two threads to do a job can cut its execution time by nearly 50% on a dual-core machine compared with a single-core machine. The same performance can also be obtained without manually dividing the work into two threads, by using OpenMP directives. The code is below; it was compiled with the Intel C++ Compiler. The Visual C++ Compiler doesn't support OpenMP automatic optimization.

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#include <iostream>

#include <intrin.h>
#include <omp.h>
#include <windows.h>
using namespace std;

// Number of array elements that every benchmark variant in this file sums.
#define N 1000000

// Benchmark state: the input array plus a slot for a computed total.
struct Workload
{
long data[N];   // input values; init_Workload() stores i in data[i]
_int64 sum;     // written in main() with the combined result of the two worker threads
};

// The single Workload instance: allocated in main(), read by the thread entry points.
struct Workload * work;

/*
 * Fills the workload's input array so that every slot holds its own index,
 * i.e. data[i] == i for 0 <= i < N.
 *
 * work: the Workload to initialise; must not be null (it is dereferenced
 *       unconditionally).
 */
void init_Workload(struct Workload * work)
{
    long * const slots = work->data;
    int idx = 0;
    while(idx < N)
    {
        slots[idx] = idx;
        ++idx;
    }
}

/*
 * Thread entry point for the interleaved split of the global workload.
 *
 * n: the worker index smuggled through the thread argument. Index 0 sums the
 *    even positions of work->data (0, 2, 4, ...); any other index sums the odd
 *    positions (1, 3, 5, ...).
 *
 * Returns the partial sum as the thread exit code. The exit code is a 32-bit
 * DWORD, so the value is the partial sum modulo 2^32; totals that do not fit
 * in 32 bits are truncated.
 */
DWORD WINAPI CalSum(LPVOID n)
{
    // Convert through a pointer-sized integer so the cast is lossless on
    // 64-bit builds, then narrow to the small worker index.
    const int worker = static_cast<int>(reinterpret_cast<INT_PTR>(n));
    DWORD sum = 0;

    // Both workers stride by 2; only the starting position differs.
    for(int i = (worker == 0) ? 0 : 1; i < N; i += 2)
        sum += work->data[i];

    // Print the integer index (not the raw pointer) and end with a real newline.
    printf("Work%d has finished.\n", worker);
    return sum;
}

/*
 * Thread entry point for the contiguous split of the global workload.
 *
 * n: the worker index smuggled through the thread argument. Index 0 sums
 *    work->data[0 .. N/2 - 1]; any other index sums work->data[N/2 .. N - 1].
 *    The two ranges together cover every element exactly once.
 *
 * Returns the partial sum as the thread exit code. The exit code is a 32-bit
 * DWORD, so the value is the partial sum modulo 2^32; totals that do not fit
 * in 32 bits are truncated.
 */
DWORD WINAPI CalSum2(LPVOID n)
{
    // Convert through a pointer-sized integer so the cast is lossless on
    // 64-bit builds, then narrow to the small worker index.
    const int worker = static_cast<int>(reinterpret_cast<INT_PTR>(n));
    const int half = N / 2;

    // The upper range starts at `half` itself so element N/2 is not skipped.
    const int first = (worker == 0) ? 0 : half;
    const int last = (worker == 0) ? half : N;

    DWORD sum = 0;
    for(int i = first; i < last; ++i)
        sum += work->data[i];

    // Print the integer index (not the raw pointer) and end with a real newline.
    printf("Work%d has finished.\n", worker);
    return sum;
}

/*
 * Returns the current value of the processor's time-stamp counter (the RDTSC
 * instruction), used in this file as a cycle-granularity clock.
 *
 * Uses the compiler intrinsic rather than inline assembly so the function has
 * an explicit return value and also builds for x64 targets, where MSVC does
 * not accept inline assembly.
 */
inline unsigned _int64 get_cycles()
{
    return __rdtsc();
}

int main()
{
//long n = N;
//printf("%ld/n%d,%d",n,sizeof(long),sizeof(_int64));
int i, n = 2;
unsigned _int64 start, end;
HANDLE hWorkers[2];
DWORD dwWorkerID[2];
DWORD dwExitCodes[2];
work = (struct Workload *)malloc(sizeof(struct Workload));
assert(work);
start = get_cycles();
init_Workload(work);
end = get_cycles();
cout<<"The init costs "<<end<<" - "<<start<<" = "<<end - start<<" cycles."<<endl;
//start = get_cycles();

cout<<endl;

cout<<"Well balanced win32 multithreading..."<<endl;
for(i = 0; i < n; ++i)
{
  hWorkers[i] = CreateThread(NULL,0,CalSum,(LPVOID)i,0,&dwWorkerID[i]);
  if(hWorkers[i])
   printf("Worker%d starts to work./n",i);
  else
   printf("Worker%d has problem working./n",i);
}
start = get_cycles();
WaitForMultipleObjects(2,hWorkers,TRUE,INFINITE);
end = get_cycles();
GetExitCodeThread(hWorkers[0], &dwExitCodes[0]);
GetExitCodeThread(hWorkers[1], &dwExitCodes[1]);
work->sum = dwExitCodes[0] + dwExitCodes[1];
printf("The parallel result is: sum = %ld, ",work->sum);
cout<<"it costs "<<end<<" - "<<start<<" = "<<end - start<<" cycles."<<endl;
CloseHandle(hWorkers[0]);
CloseHandle(hWorkers[1]);

cout<<endl;

cout<<"Not well balanced win32 multithreading..."<<endl;
for(i = 0; i < n; ++i)
{
  hWorkers[i] = CreateThread(NULL,0,CalSum2,(LPVOID)i,0,&dwWorkerID[i]);
  if(hWorkers[i])
   printf("Worker%d starts to work./n",i);
  else
   printf("Worker%d has problem working./n",i);
}
start = get_cycles();
WaitForMultipleObjects(2,hWorkers,TRUE,INFINITE);
end = get_cycles();
GetExitCodeThread(hWorkers[0], &dwExitCodes[0]);
GetExitCodeThread(hWorkers[1], &dwExitCodes[1]);
work->sum = dwExitCodes[0] + dwExitCodes[1];
printf("The parallel result is: sum = %ld, ",work->sum);
cout<<"it costs "<<end<<" - "<<start<<" = "<<end - start<<" cycles."<<endl;
CloseHandle(hWorkers[0]);
CloseHandle(hWorkers[1]);

cout<<endl;

cout<<"Sequential...(single thread)"<<endl;
_int64 sum = 0;
start = get_cycles();
for(i = 0; i < N; ++i)
sum += work->data[i];
end = get_cycles();
printf("The sequent result is: sum = %ld, ",sum);
cout<<"it costs "<<end<<" - "<<start<<" = "<<end - start<<" cycles."<<endl;

cout<<endl;

/**/
cout<<"OpenMP multithreading..."<<endl;
sum = 0;
start = get_cycles();
#pragma omp parallel for private(i) firstprivate(work) reduction(+:sum) num_threads(2)
for(i = 0; i < N; ++i)
sum += work->data[i];
end = get_cycles();
printf("The openmp result is: sum = %ld, ",sum);
cout<<"it costs "<<end<<" - "<<start<<" = "<<end - start<<" cycles."<<endl;
/**/

cout<<endl;

return EXIT_SUCCESS;
}

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

Parallel program running in dual-core environment

salesforce零基礎學習（一百三十八）零碎知識點小總結（十）

關於接口協議，你必須要知道這些！

FolkMq v1.4.6 發佈（可以內嵌的消息中間件）

一鍵自動化博客發佈工具,用過的人都說好(頭條篇)

01 穩定性（一）如何應對事故並做好覆盤？

美團一面：項目中有 10000 個 if else 如何優化？想了半天，被問懵了！

線程池那些坑爹的參數-核心線程數&最大線程數&工作隊列

京東面試：如何進行JVM調優？

Stream流常用方法總結

Parallel program running in dual-core environment

The end, the new beginning

Print the bit pattern of a float

Two more accurate ways to get time elapsed than clock()

Communications of Processes and Threads

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結