背景
計數排序(Count Sort)基本思想是對於列表a中的每個元素a[i],計算小於a[i]的元素個數,將a[i]插入到由count決定的列表下標位置中,算法結束後,用臨時列表覆蓋原始列表。
問題
如果我們試圖並行化外層循環,哪些變量爲private,哪些變量爲shared?
答:a, n, temp爲shared;i, j, count爲private。
# pragma omp parallel for num_threads(thread_count) \
default(none) shared(a, n, temp) private(i, j, count)\
schedule(static, 2)
是否存在循環攜帶的數據依賴性?爲什麼?
答:不存在。外層循環的每次迭代只讀取a中的元素,並且只寫入temp[count]這一個位置;由於相等的元素按下標(j<i)區分先後,不同的i得到的count互不相同,所以任意兩次迭代不會寫入同一個位置,也不需要用到其他迭代的計算結果,因此不存在循環攜帶的數據依賴性。
編寫並行化的Count_sort。
答:只需要在串行化的Count_sort代碼的外層循環之前加上一句OpenMP的parallel for預處理指令即可。
並行化的Count_sort函數的源代碼如下:
/*-----------------------------------------------------------------
 * Function:    Count_sort_parallel
 * Purpose:     Sort list into ascending order using Count sort, with
 *              the iterations of the outer loop shared among
 *              thread_count OpenMP threads
 * In args:     n: number of elements in a
 * In/out args: a: list to sort; overwritten with the sorted list
 * Errors:      prints a message to stderr and exits with EXIT_FAILURE
 *              if the scratch list cannot be allocated
 */
void Count_sort_parallel(int a[], int n) {
   int i, j, count;
   int *temp;

   /* Lists of fewer than two elements are already sorted.  Returning
    * here also keeps a non-positive n away from malloc and memcpy. */
   if (n <= 1)
      return;

   temp = malloc(n * sizeof *temp);
   if (temp == NULL) {
      fprintf(stderr, "Count_sort_parallel: cannot allocate %d elements\n", n);
      exit(EXIT_FAILURE);
   }

#  pragma omp parallel for num_threads(thread_count) \
      default(none) shared(a, n, temp) private(i, j, count) \
      schedule(static, 2)
   for (i = 0; i < n; i++) {
      /* count = number of elements that must precede a[i].  Equal
       * values are ordered by index (j < i), so every i gets its own
       * count and no two iterations store into the same temp slot. */
      count = 0;
      for (j = 0; j < n; j++) {
         if (a[j] < a[i])
            count++;
         else if (a[j] == a[i] && j < i)
            count++;
      }
      temp[count] = a[i];
   }

   /* Overwrite the original list with the sorted scratch list */
   memcpy(a, temp, n * sizeof *temp);
   free(temp);
}/* Count_sort_parallel */
程序的main函數的源代碼如下:
/*-----------------------------------------------------------------
 * Function:  main
 * Purpose:   Build one list, make two copies of it, then sort the
 *            three lists with parallel count sort, serial count sort
 *            and qsort, printing the elapsed time of each sort
 * In args:   argc, argv: command line, checked by Get_args
 * Return:    0 on success, EXIT_FAILURE if the lists cannot be
 *            allocated
 */
int main(int argc, char* argv[]) {
   int n;
   char g_i;
   int* a;
   int* b;
   int* c;
   double start, finish;

   Get_args(argc, argv, &n, &g_i);

   a = malloc(n*sizeof(int));
   b = malloc(n*sizeof(int));
   c = malloc(n*sizeof(int));
   if (a == NULL || b == NULL || c == NULL) {
      fprintf(stderr, "%s: cannot allocate lists of %d elements\n", argv[0], n);
      free(a);
      free(b);
      free(c);
      return EXIT_FAILURE;
   }

   if (g_i == 'g') {
      Generate_list(a, n);
#     ifdef DEBUG
      Print_list(a, n, "Before sort");
#     endif
   } else {
      Read_list(a, n);
   }

   /* Every sort starts from the same unsorted input */
   memcpy(b, a, n*sizeof(int));
   memcpy(c, a, n*sizeof(int));

   /* All three sorts are timed with omp_get_wtime so that the printed
    * figures are the same kind of measurement (elapsed seconds).
    * clock() reports processor time, which is a different quantity. */
   start = omp_get_wtime();
   Count_sort_parallel(a, n);
   finish = omp_get_wtime();
   printf("Parallel count sort time = %e seconds\n", finish - start);
#  ifdef DEBUG
   Print_list(a, n, "After sort");
#  endif

   start = omp_get_wtime();
   Count_sort_serial(b, n);
   finish = omp_get_wtime();
   printf("Serial count sort time = %e seconds\n", finish - start);
#  ifdef DEBUG
   Print_list(b, n, "After sort");
#  endif

   start = omp_get_wtime();
   qsort(c, n, sizeof(int), cmp);
   finish = omp_get_wtime();
   printf("qsort time = %e seconds\n", finish - start);
#  ifdef DEBUG
   Print_list(c, n, "After sort");
#  endif

   free(a);
   free(b);
   free(c);
   return 0;
} /* main */
並行化的Count_sort與串行化的Count_sort相比,性能如何?
答:./omp後面的第一個數字是線程數、第二個數字是待排序的數組大小、第三個字母是數組的輸入方式。如圖1所示,當所排序的數組大小很小的時候,並行排序的時間反而比串行排序的時間長,但是當所排序的數組大小達到10^3數量級的時候,只要線程數大於1,並行的Count_sort比串行的Count_sort要快3倍以上。
兩者的排序速度受待排序列表的數量級的影響,主要是因爲並行排序存在着fork和join的時間開銷。當待排序列表很小的時候,這種開銷甚至會增加程序的運行時間,但是當待排序列表很大的時候,這種開銷相比於整體排序時間就微不足道了,所以串行的Count_sort比並行的Count_sort慢3倍以上。
並行化的Count_sort與串行化的qsort庫函數相比,性能如何?
答:如圖1所示,由於Count_sort的時間複雜度是O(n^2),而qsort的時間複雜度是O(n log n),所以無論線程數或者是待排序的數組的大小怎麼變化,qsort的運行時間總是要比並行化的Count_sort的運行時間要低。當待排序的數組的數據量小於10^3數量級時,qsort的排序時間要比並行化的Count_sort的排序時間要低10到100倍;當待排序的數組的數據量大於10^3數量級時,qsort的排序時間要比並行化的Count_sort的排序時間要低約10^3倍(三個數量級)。
程序運行方法
運行環境:Ubuntu16.04
在控制檯中輸入以下命令即可編譯:
gcc -g -Wall -fopenmp -I. -o omp omp.c
輸入以下命令即可運行:
./omp <thread count> <n> <g|i>
其中:
- n是待排序列表的元素個數
- g是通過隨機數生成器生成的待排序列表
- i是用戶輸入的列表
源代碼附後
/* File: omp.c
*
* Purpose: Compare parallel count sort, serial count sort, qsort.
*
* Compile: gcc -g -Wall -fopenmp -I. -o omp omp.c
* Usage: ./omp <thread count> <n> <g|i>
* n: number of elements in list
* 'g': generate list using a random number generator
* 'i': user input list
*
* Input: list (optional)
* Output: elapsed time for parallel count sort, serial count sort,
* qsort.
*
* Note:
* 1. DEBUG flag prints the contents of the list
* 2. This version forks and joins the threads only once.
* 3. Uses the OpenMP library function omp_get_wtime for timing.
* This function returns the number of seconds since some time
* in the past.
*
* IPP: Section 5.6.2 (pp. 235 and ff.)
*/
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <omp.h>
/* Modulus applied to random() in Generate_list when filling a list.
 * A build with DEBUG defined uses the smaller value. */
#ifdef DEBUG
const int RMAX = 100;
#else
const int RMAX = 10000000;
#endif
/* Thread count taken from argv[1] by Get_args; used in the num_threads
 * clause of Count_sort_parallel. */
int thread_count;
/* Command line handling */
void Usage(char* prog_name);
void Get_args(int argc, char* argv[], int* n_p, char* g_i_p);
/* List input and output */
void Generate_list(int a[], int n);
void Print_list(int a[], int n, char* title);
void Read_list(int a[], int n);
/* Sorts being compared, and the comparison callback passed to qsort */
void Count_sort_parallel(int a[], int n);
void Count_sort_serial(int a[], int n);
int cmp(const void * a, const void *b);
/*-----------------------------------------------------------------
 * Function:  main
 * Purpose:   Build one list, make two copies of it, then sort the
 *            three lists with parallel count sort, serial count sort
 *            and qsort, printing the elapsed time of each sort
 * In args:   argc, argv: command line, checked by Get_args
 * Return:    0 on success, EXIT_FAILURE if the lists cannot be
 *            allocated
 */
int main(int argc, char* argv[]) {
   int n;
   char g_i;
   int* a;
   int* b;
   int* c;
   double start, finish;

   Get_args(argc, argv, &n, &g_i);

   a = malloc(n*sizeof(int));
   b = malloc(n*sizeof(int));
   c = malloc(n*sizeof(int));
   if (a == NULL || b == NULL || c == NULL) {
      fprintf(stderr, "%s: cannot allocate lists of %d elements\n", argv[0], n);
      free(a);
      free(b);
      free(c);
      return EXIT_FAILURE;
   }

   if (g_i == 'g') {
      Generate_list(a, n);
#     ifdef DEBUG
      Print_list(a, n, "Before sort");
#     endif
   } else {
      Read_list(a, n);
   }

   /* Every sort starts from the same unsorted input */
   memcpy(b, a, n*sizeof(int));
   memcpy(c, a, n*sizeof(int));

   /* All three sorts are timed with omp_get_wtime so that the printed
    * figures are the same kind of measurement (elapsed seconds).
    * clock() reports processor time, which is a different quantity. */
   start = omp_get_wtime();
   Count_sort_parallel(a, n);
   finish = omp_get_wtime();
   printf("Parallel count sort time = %e seconds\n", finish - start);
#  ifdef DEBUG
   Print_list(a, n, "After sort");
#  endif

   start = omp_get_wtime();
   Count_sort_serial(b, n);
   finish = omp_get_wtime();
   printf("Serial count sort time = %e seconds\n", finish - start);
#  ifdef DEBUG
   Print_list(b, n, "After sort");
#  endif

   start = omp_get_wtime();
   qsort(c, n, sizeof(int), cmp);
   finish = omp_get_wtime();
   printf("qsort time = %e seconds\n", finish - start);
#  ifdef DEBUG
   Print_list(c, n, "After sort");
#  endif

   free(a);
   free(b);
   free(c);
   return 0;
} /* main */
/*-----------------------------------------------------------------
 * Function:  Usage
 * Purpose:   Write the command line synopsis and a description of
 *            the arguments to stderr
 * In args:   prog_name: name shown as the command in the synopsis
 */
void Usage(char* prog_name) {
   /* Fixed part of the help text, emitted after the synopsis line */
   static const char arg_help[] =
      " n: number of elements in list\n"
      " 'g': generate list using a random number generator\n"
      " 'i': user input list\n";

   fprintf(stderr, "usage: %s <thread count> <n> <g|i>\n", prog_name);
   fputs(arg_help, stderr);
} /* Usage */
/*-----------------------------------------------------------------
 * Function:  Get_args
 * Purpose:   Get and check command line arguments
 * In args:   argc, argv
 * Out args:  n_p:   number of elements in the list (argv[2])
 *            g_i_p: 'g' to generate the list, 'i' to read it (argv[3])
 * Globals:   thread_count is set from argv[1]
 * Errors:    on a wrong argument count, a thread count or n that is
 *            not in 1..INT_MAX, or a mode other than 'g'/'i', prints
 *            the usage message and exits with EXIT_FAILURE
 */
void Get_args(int argc, char* argv[], int* n_p, char* g_i_p) {
   long threads;
   long elems;

   if (argc != 4) {
      Usage(argv[0]);
      exit(EXIT_FAILURE);
   }

   /* Keep the values as long until they are known to fit in an int */
   threads = strtol(argv[1], NULL, 10);
   elems = strtol(argv[2], NULL, 10);
   *g_i_p = argv[3][0];

   /* The thread count is checked as well as n: it is later handed to
    * a num_threads clause, which needs a positive value. */
   if (threads <= 0 || threads > INT_MAX ||
       elems <= 0 || elems > INT_MAX ||
       (*g_i_p != 'g' && *g_i_p != 'i')) {
      Usage(argv[0]);
      exit(EXIT_FAILURE);
   }

   thread_count = (int) threads;
   *n_p = (int) elems;
} /* Get_args */
/*-----------------------------------------------------------------
 * Function:  Generate_list
 * Purpose:   Fill the list with values produced by random(), each
 *            reduced modulo RMAX
 * In args:   n: number of elements to store
 * Out args:  a: receives the n generated values
 * Note:      The generator is seeded with 1 on every call.
 */
void Generate_list(int a[], int n) {
   int filled = 0;

   srandom(1);
   while (filled < n) {
      a[filled] = random() % RMAX;
      filled++;
   }
} /* Generate_list */
/*-----------------------------------------------------------------
 * Function:  Print_list
 * Purpose:   Write a title line, then the list elements separated
 *            by spaces on one line, then a blank line, to stdout
 * In args:   a:     list to print
 *            n:     number of elements in a
 *            title: text printed (followed by a colon) above the list
 */
void Print_list(int a[], int n, char* title) {
   int pos = 0;

   printf("%s:\n", title);
   while (pos < n) {
      printf("%d ", a[pos]);
      ++pos;
   }
   fputs("\n\n", stdout);
} /* Print_list */
/*-----------------------------------------------------------------
 * Function:  Read_list
 * Purpose:   Prompt on stdout, then read n integers from stdin
 * In args:   n: number of elements to read
 * Out args:  a: receives the n values read
 * Errors:    if an element cannot be read (end of input or text that
 *            is not an integer), prints a message to stderr and exits
 *            with EXIT_FAILURE
 */
void Read_list(int a[], int n) {
   int i;

   printf("Please enter the elements of the list\n");
   for (i = 0; i < n; i++) {
      /* scanf returns the number of items it stored; anything other
       * than 1 means a[i] was not assigned, so stop here. */
      if (scanf("%d", &a[i]) != 1) {
         fprintf(stderr, "Read_list: expected %d integers, read only %d\n",
               n, i);
         exit(EXIT_FAILURE);
      }
   }
} /* Read_list */
/*-----------------------------------------------------------------
 * Function:    Count_sort_parallel
 * Purpose:     Sort list into ascending order using Count sort, with
 *              the iterations of the outer loop shared among
 *              thread_count OpenMP threads
 * In args:     n: number of elements in a
 * In/out args: a: list to sort; overwritten with the sorted list
 * Errors:      prints a message to stderr and exits with EXIT_FAILURE
 *              if the scratch list cannot be allocated
 */
void Count_sort_parallel(int a[], int n) {
   int i, j, count;
   int *temp;

   /* Lists of fewer than two elements are already sorted.  Returning
    * here also keeps a non-positive n away from malloc and memcpy. */
   if (n <= 1)
      return;

   temp = malloc(n * sizeof *temp);
   if (temp == NULL) {
      fprintf(stderr, "Count_sort_parallel: cannot allocate %d elements\n", n);
      exit(EXIT_FAILURE);
   }

#  pragma omp parallel for num_threads(thread_count) \
      default(none) shared(a, n, temp) private(i, j, count) \
      schedule(static, 2)
   for (i = 0; i < n; i++) {
      /* count = number of elements that must precede a[i].  Equal
       * values are ordered by index (j < i), so every i gets its own
       * count and no two iterations store into the same temp slot. */
      count = 0;
      for (j = 0; j < n; j++) {
         if (a[j] < a[i])
            count++;
         else if (a[j] == a[i] && j < i)
            count++;
      }
      temp[count] = a[i];
   }

   /* Overwrite the original list with the sorted scratch list */
   memcpy(a, temp, n * sizeof *temp);
   free(temp);
}/* Count_sort_parallel */
/*-----------------------------------------------------------------
 * Function:    Count_sort_serial
 * Purpose:     Sort list into ascending order using Count sort
 * In args:     n: number of elements in a
 * In/out args: a: list to sort; overwritten with the sorted list
 * Errors:      prints a message to stderr and exits with EXIT_FAILURE
 *              if the scratch list cannot be allocated
 */
void Count_sort_serial(int a[], int n) {
   int i, j, count;
   int *temp;

   /* Lists of fewer than two elements are already sorted.  Returning
    * here also keeps a non-positive n away from malloc and memcpy. */
   if (n <= 1)
      return;

   temp = malloc(n * sizeof *temp);
   if (temp == NULL) {
      fprintf(stderr, "Count_sort_serial: cannot allocate %d elements\n", n);
      exit(EXIT_FAILURE);
   }

   for (i = 0; i < n; i++) {
      /* count = number of elements that must precede a[i].  Equal
       * values are ordered by index (j < i), so every element gets
       * its own slot in temp. */
      count = 0;
      for (j = 0; j < n; j++) {
         if (a[j] < a[i])
            count++;
         else if (a[j] == a[i] && j < i)
            count++;
      }
      temp[count] = a[i];
   }

   /* Overwrite the original list with the sorted scratch list */
   memcpy(a, temp, n * sizeof *temp);
   free(temp);
}/* Count_sort_serial */
/*-----------------------------------------------------------------
 * Function:  cmp
 * Purpose:   Comparison callback for qsort: orders two ints from
 *            small to big
 * In args:   a, b: pointers to the two ints being compared
 * Return:    negative if *a < *b, zero if equal, positive if *a > *b
 */
int cmp(const void * a, const void *b) {
   const int lhs = *(const int *)a;
   const int rhs = *(const int *)b;

   /* Compare rather than subtract: lhs - rhs overflows int when the
    * operands are far apart (e.g. a large negative and a large
    * positive value), which would give the wrong sign. */
   return (lhs > rhs) - (lhs < rhs);
}