使用OpenMP編寫並行化的計數排序

背景

計數排序(Count Sort)基本思想是對於列表a中的每個元素a[i],計算小於a[i]的元素個數,將a[i]插入到由count決定的列表下標位置中,算法結束後,用臨時列表覆蓋原始列表。

問題

如果我們試圖並行化外層循環,哪些變量爲private,哪些變量爲shared?

答:a, n, temp爲shared;i, j, count爲private。

#  pragma omp parallel for num_threads(thread_count) \
    default(none) shared(a, n, temp) private(i, j, count)\
    schedule(static, 2)

是否存在循環攜帶的數據依賴性?爲什麼?

答:不存在。外層循環的每次迭代只讀取數組a(循環中不會修改a),而count和j是每次迭代私有的;又因爲相等的元素按下標先後區分,每個a[i]算出的count互不相同,所以各次迭代寫入的是temp中不同的位置。任何一次迭代都不依賴其他迭代的結果,因此不存在循環攜帶的數據依賴性。

編寫並行化的Count_sort。

全部源代碼請見github

答:只需要在串行化的Count_sort代碼的外層循環之前加上一條OpenMP預處理指令(#pragma omp parallel for)即可。
並行化的Count_sort函數的源代碼如下:

/*-----------------------------------------------------------------
 * Function:     Count_sort_parallel
 * Purpose:      Sort list into ascending order using Count sort, with
 *               the iterations of the outer loop divided among
 *               thread_count OpenMP threads
 * In args:      n:  number of elements in a
 * In/out args:  a:  the list; sorted in place on return
 * Globals in:   thread_count
 * Errors:       if the temporary list can't be allocated, a message
 *               is printed to stderr and the program exits
 * Notes:
 * 1.  Equal elements are ordered by their original index (the j<i
 *     test), so every element gets a distinct position in temp.
 * 2.  Lists with fewer than 2 elements are returned unchanged.
 */
void Count_sort_parallel(int a[], int n) {
	int  i, j, count;
	int* temp;

	if (n <= 1)
		return;  /* nothing to sort; also avoids malloc(0) */

	temp = malloc((size_t) n * sizeof *temp);
	if (temp == NULL) {
		fprintf(stderr, "Count_sort_parallel: can't allocate %d ints\n", n);
		exit(EXIT_FAILURE);
	}

	/* a is only read and each iteration writes a different element
	 * of temp, so the iterations are independent. */
#  pragma omp parallel for num_threads(thread_count) \
   default(none) shared(a, n, temp) private(i, j, count)\
   schedule(static, 2)
	for (i = 0; i < n; i++) {
		/* count = number of elements that must precede a[i] */
		count = 0;
		for (j = 0; j < n; j++) {
			if (a[j] < a[i])
				count++;
			else if (a[j] == a[i] && j < i)
				count++;
		}
		temp[count] = a[i];
	}

	memcpy(a, temp, (size_t) n * sizeof *temp);
	free(temp);
}/* Count_sort_parallel */

程序的main函數的源代碼如下:

/*-----------------------------------------------------------------
 * Function:  main
 * Purpose:   Sort three copies of the same list with the parallel
 *            count sort, the serial count sort and qsort, and print
 *            the elapsed time of each
 */
int main(int argc, char* argv[]) {
	int    n;
	char   g_i;
	int*   a;   /* sorted by Count_sort_parallel */
	int*   b;   /* sorted by Count_sort_serial   */
	int*   c;   /* sorted by qsort               */
	double start, finish;

	Get_args(argc, argv, &n, &g_i);

	a = malloc((size_t) n * sizeof *a);
	b = malloc((size_t) n * sizeof *b);
	c = malloc((size_t) n * sizeof *c);
	if (a == NULL || b == NULL || c == NULL) {
		fprintf(stderr, "%s: can't allocate three lists of %d ints\n",
		        argv[0], n);
		free(a);
		free(b);
		free(c);
		exit(EXIT_FAILURE);
	}

	if (g_i == 'g') {
		Generate_list(a, n);
#     ifdef DEBUG
		Print_list(a, n, "Before sort");
#     endif
	} else {
		Read_list(a, n);
	}

	/* Every sort gets its own copy of the unsorted input */
	memcpy(b, a, (size_t) n * sizeof *a);
	memcpy(c, a, (size_t) n * sizeof *a);

	/* All three sorts are timed with omp_get_wtime so that the
	 * reported times are all wall-clock times.  clock() reports
	 * processor time instead, which is not directly comparable and
	 * may include time used by other threads of the process. */
	start = omp_get_wtime();
	Count_sort_parallel(a, n);
	finish = omp_get_wtime();
	printf("Parallel count sort time = %e seconds\n", finish - start);

#  ifdef DEBUG
	Print_list(a, n, "After sort");
#  endif

	start = omp_get_wtime();
	Count_sort_serial(b, n);
	finish = omp_get_wtime();
	printf("Serial count sort time = %e seconds\n", finish - start);

#  ifdef DEBUG
	Print_list(b, n, "After sort");
#  endif

	start = omp_get_wtime();
	qsort(c, n, sizeof(int), cmp);
	finish = omp_get_wtime();
	printf("qsort time = %e seconds\n", finish - start);

#  ifdef DEBUG
	Print_list(c, n, "After sort");
#  endif

	free(a);
	free(b);
	free(c);
	return 0;
}  /* main */

並行化的Count_sort與串行化的Count_sort相比,性能如何?

答:./omp後面的第一個數字是線程數、第二個數字是待排序的數組大小、第三個字母是數組的輸入方式。如圖1所示,當所排序的數組大小很小的時候,並行排序的時間反而比串行排序的時間長,但是當所排序的數組大小達到10^3數量級的時候,只要線程數大於1,並行的Count_sort比串行的Count_sort要快3倍以上。
兩者的排序速度受待排序列表的數量級的影響,主要是因爲並行排序存在着fork和join的時間開銷。當待排序列表很小的時候,這種開銷甚至會增加程序的運行時間,但是當待排序列表很大的時候,這種開銷相比於整體排序時間就微不足道了,所以串行的Count_sort比並行的Count_sort慢3倍以上。
在這裏插入圖片描述

並行化的Count_sort與串行化的qsort庫函數相比,性能如何?

答:如圖1所示,由於Count_sort的時間複雜度是O(n^2),而qsort的時間複雜度是O(n log n),所以無論線程數或者是待排序的數組的大小怎麼變化,qsort的運行時間總是要比並行化的Count_sort的運行時間要低。當待排序的數組的數據量小於10^3數量級時,qsort的排序時間要比並行化的Count_sort的排序時間要低10到100倍;當待排序的數組的數據量大於10^3數量級時,qsort的排序時間要比並行化的Count_sort的排序時間要低約10^3倍。

程序運行方法

運行環境:Ubuntu16.04
在控制檯中輸入以下命令即可編譯:

gcc -g -Wall -fopenmp -I. -o omp omp.c

輸入以下命令即可運行:

./omp <thread count> <n> <g|i>

其中:

  • n是待排序列表的元素個數
  • g是通過隨機數生成器生成的待排序列表
  • i是用戶輸入的列表

源代碼附後

/* File:    omp.c
 *
 * Purpose: Compare parallel count sort, serial count sort, qsort.
 *
 * Compile: gcc -g -Wall -fopenmp -I. -o omp omp.c
 * Usage:   ./omp <thread count> <n> <g|i>
 *             n:   number of elements in list
 *            'g':  generate list using a random number generator
 *            'i':  user input list
 *
 * Input:   list (optional)
 * Output:  elapsed time for parallel count sort, serial count sort,
 *			qsort.
 *
 * Note:
 * 1.  DEBUG flag prints the contents of the list
 * 2.  This version forks and joins the threads only once.
 * 3.  Uses the OpenMP library function omp_get_wtime for timing.
 *     This function returns the number of seconds since some time
 *     in the past.
 *
 * IPP:  Section 5.6.2 (pp. 235 and ff.)
 */
#include <errno.h>
#include <limits.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

/* RMAX is the modulus Generate_list applies to each random number, so
 * generated elements lie in the range 0 .. RMAX-1.  A DEBUG build uses
 * a small value. */
#ifdef DEBUG
const int RMAX = 100;
#else
const int RMAX = 10000000;
#endif

/* Number of threads requested by Count_sort_parallel; set by Get_args
 * from the first command line argument. */
int thread_count;

/* Command line handling */
void Usage(char* prog_name);
void Get_args(int argc, char* argv[], int* n_p, char* g_i_p);
/* List creation and output */
void Generate_list(int a[], int n);
void Print_list(int a[], int n, char* title);
void Read_list(int a[], int n);
/* The count sorts being compared, and the comparison function that
 * main passes to qsort */
void Count_sort_parallel(int a[], int n);
void Count_sort_serial(int a[], int n);
int cmp(const void * a, const void *b);

/*-----------------------------------------------------------------
 * Function:  main
 * Purpose:   Sort three copies of the same list with the parallel
 *            count sort, the serial count sort and qsort, and print
 *            the elapsed time of each
 */
int main(int argc, char* argv[]) {
	int    n;
	char   g_i;
	int*   a;   /* sorted by Count_sort_parallel */
	int*   b;   /* sorted by Count_sort_serial   */
	int*   c;   /* sorted by qsort               */
	double start, finish;

	Get_args(argc, argv, &n, &g_i);

	a = malloc((size_t) n * sizeof *a);
	b = malloc((size_t) n * sizeof *b);
	c = malloc((size_t) n * sizeof *c);
	if (a == NULL || b == NULL || c == NULL) {
		fprintf(stderr, "%s: can't allocate three lists of %d ints\n",
		        argv[0], n);
		free(a);
		free(b);
		free(c);
		exit(EXIT_FAILURE);
	}

	if (g_i == 'g') {
		Generate_list(a, n);
#     ifdef DEBUG
		Print_list(a, n, "Before sort");
#     endif
	} else {
		Read_list(a, n);
	}

	/* Every sort gets its own copy of the unsorted input */
	memcpy(b, a, (size_t) n * sizeof *a);
	memcpy(c, a, (size_t) n * sizeof *a);

	/* All three sorts are timed with omp_get_wtime so that the
	 * reported times are all wall-clock times.  clock() reports
	 * processor time instead, which is not directly comparable and
	 * may include time used by other threads of the process. */
	start = omp_get_wtime();
	Count_sort_parallel(a, n);
	finish = omp_get_wtime();
	printf("Parallel count sort time = %e seconds\n", finish - start);

#  ifdef DEBUG
	Print_list(a, n, "After sort");
#  endif

	start = omp_get_wtime();
	Count_sort_serial(b, n);
	finish = omp_get_wtime();
	printf("Serial count sort time = %e seconds\n", finish - start);

#  ifdef DEBUG
	Print_list(b, n, "After sort");
#  endif

	start = omp_get_wtime();
	qsort(c, n, sizeof(int), cmp);
	finish = omp_get_wtime();
	printf("qsort time = %e seconds\n", finish - start);

#  ifdef DEBUG
	Print_list(c, n, "After sort");
#  endif

	free(a);
	free(b);
	free(c);
	return 0;
}  /* main */


/*-----------------------------------------------------------------
 * Function:  Usage
 * Purpose:   Print a summary of the command line to stderr
 * In arg:    prog_name:  name the program was started with
 */
void Usage(char* prog_name) {
	FILE* err = stderr;

	/* Only the first line needs formatting; the rest is fixed text */
	fprintf(err, "usage:   %s <thread count> <n> <g|i>\n", prog_name);
	fputs("   n:   number of elements in list\n", err);
	fputs("  'g':  generate list using a random number generator\n", err);
	fputs("  'i':  user input list\n", err);
}  /* Usage */


/*-----------------------------------------------------------------
 * Function:  Parse_positive_int
 * Purpose:   Convert a decimal string to an int greater than 0
 * In arg:    text
 * Out arg:   value_p:  the converted value; only set on success
 * Return:    1 if text is entirely a decimal number in the range
 *            1 .. INT_MAX, 0 otherwise
 */
static int Parse_positive_int(const char* text, int* value_p) {
	char* end;
	long  value;

	errno = 0;
	value = strtol(text, &end, 10);
	/* Reject empty input, trailing characters and long overflow */
	if (end == text || *end != '\0' || errno == ERANGE)
		return 0;
	if (value <= 0 || value > INT_MAX)
		return 0;

	*value_p = (int) value;
	return 1;
}  /* Parse_positive_int */


/*-----------------------------------------------------------------
 * Function:    Get_args
 * Purpose:     Get and check command line arguments
 * In args:     argc, argv
 * Out args:    n_p:    number of elements in the list
 *              g_i_p:  'g' (generate list) or 'i' (read list)
 * Globals out: thread_count
 * Errors:      if the number of arguments is wrong, the thread count
 *              or n isn't a positive int, or the third argument
 *              doesn't start with 'g' or 'i', the usage message is
 *              printed and the program exits
 * Note:        thread_count is checked because it is used in a
 *              num_threads clause, which requires a positive value.
 */
void Get_args(int argc, char* argv[], int* n_p, char* g_i_p) {
	if (argc != 4 ) {
		Usage(argv[0]);
		exit(0);
	}

	if (!Parse_positive_int(argv[1], &thread_count) ||
	    !Parse_positive_int(argv[2], n_p)) {
		Usage(argv[0]);
		exit(0);
	}

	/* Only the first character of the third argument is significant */
	*g_i_p = argv[3][0];
	if (*g_i_p != 'g' && *g_i_p != 'i') {
		Usage(argv[0]);
		exit(0);
	}
}  /* Get_args */


/*-----------------------------------------------------------------
 * Function:  Generate_list
 * Purpose:   Fill the list with pseudo-random values, each reduced
 *            modulo RMAX
 * In args:   n:  number of elements to generate
 * Out args:  a:  the generated list
 * Note:      The generator is always seeded with 1, so every run
 *            starts from the same seed.
 */
void Generate_list(int a[], int n) {
	int* slot = a;
	int  remaining;

	srandom(1);
	for (remaining = n; remaining > 0; remaining--) {
		*slot = random() % RMAX;
		slot++;
	}
}  /* Generate_list */


/*-----------------------------------------------------------------
 * Function:  Print_list
 * Purpose:   Print a title line, then the elements of the list on
 *            one line separated by spaces, then a blank line
 * In args:   a, n, title
 */
void Print_list(int a[], int n, char* title) {
	int pos = 0;

	printf("%s:\n", title);
	while (pos < n) {
		printf("%d ", a[pos]);
		pos++;
	}
	printf("\n\n");
}  /* Print_list */


/*-----------------------------------------------------------------
 * Function:  Read_list
 * Purpose:   Prompt for and read the elements of the list from stdin
 * In args:   n:  number of elements to read
 * Out args:  a:  the list that was read
 * Errors:    if fewer than n integers can be read (end of input or
 *            text that isn't a number), a message is printed to
 *            stderr and the program exits, so that a list containing
 *            uninitialized elements is never sorted
 */
void Read_list(int a[], int n) {
	int i;

	printf("Please enter the elements of the list\n");
	for (i = 0; i < n; i++) {
		/* scanf returns the number of items converted: 1 on success */
		if (scanf("%d", &a[i]) != 1) {
			fprintf(stderr,
			        "Read_list: expected %d integers, but only read %d\n",
			        n, i);
			exit(EXIT_FAILURE);
		}
	}
}  /* Read_list */


/*-----------------------------------------------------------------
 * Function:     Count_sort_parallel
 * Purpose:      Sort list into ascending order using Count sort, with
 *               the iterations of the outer loop divided among
 *               thread_count OpenMP threads
 * In args:      n:  number of elements in a
 * In/out args:  a:  the list; sorted in place on return
 * Globals in:   thread_count
 * Errors:       if the temporary list can't be allocated, a message
 *               is printed to stderr and the program exits
 * Notes:
 * 1.  Equal elements are ordered by their original index (the j<i
 *     test), so every element gets a distinct position in temp.
 * 2.  Lists with fewer than 2 elements are returned unchanged.
 */
void Count_sort_parallel(int a[], int n) {
	int  i, j, count;
	int* temp;

	if (n <= 1)
		return;  /* nothing to sort; also avoids malloc(0) */

	temp = malloc((size_t) n * sizeof *temp);
	if (temp == NULL) {
		fprintf(stderr, "Count_sort_parallel: can't allocate %d ints\n", n);
		exit(EXIT_FAILURE);
	}

	/* a is only read and each iteration writes a different element
	 * of temp, so the iterations are independent. */
#  pragma omp parallel for num_threads(thread_count) \
   default(none) shared(a, n, temp) private(i, j, count)\
   schedule(static, 2)
	for (i = 0; i < n; i++) {
		/* count = number of elements that must precede a[i] */
		count = 0;
		for (j = 0; j < n; j++) {
			if (a[j] < a[i])
				count++;
			else if (a[j] == a[i] && j < i)
				count++;
		}
		temp[count] = a[i];
	}

	memcpy(a, temp, (size_t) n * sizeof *temp);
	free(temp);
}/* Count_sort_parallel */


/*-----------------------------------------------------------------
 * Function:     Count_sort_serial
 * Purpose:      Sort list into ascending order using Count sort on a
 *               single thread
 * In args:      n:  number of elements in a
 * In/out args:  a:  the list; sorted in place on return
 * Errors:       if the temporary list can't be allocated, a message
 *               is printed to stderr and the program exits
 * Notes:
 * 1.  Equal elements are ordered by their original index (the j<i
 *     test), so every element gets a distinct position in temp.
 * 2.  Lists with fewer than 2 elements are returned unchanged.
 */
void Count_sort_serial(int a[], int n) {
	int  i, j, count;
	int* temp;

	if (n <= 1)
		return;  /* nothing to sort; also avoids malloc(0) */

	temp = malloc((size_t) n * sizeof *temp);
	if (temp == NULL) {
		fprintf(stderr, "Count_sort_serial: can't allocate %d ints\n", n);
		exit(EXIT_FAILURE);
	}

	for (i = 0; i < n; i++) {
		/* count = number of elements that must precede a[i] */
		count = 0;
		for (j = 0; j < n; j++) {
			if (a[j] < a[i])
				count++;
			else if (a[j] == a[i] && j < i)
				count++;
		}
		temp[count] = a[i];
	}

	memcpy(a, temp, (size_t) n * sizeof *temp);
	free(temp);
}/* Count_sort_serial */

/*-----------------------------------------------------------------
 * Function:     cmp
 * Purpose:      Comparison function for qsort: orders ints from
 *               small to big
 * In args:      a, b:  pointers to the two ints to compare
 * Return:       -1 if *a < *b, 0 if *a == *b, 1 if *a > *b
 * Note:         The values are compared rather than subtracted,
 *               since the difference of two ints can overflow.
 */
int cmp(const void * a, const void *b) {
	const int x = *(const int *)a;
	const int y = *(const int *)b;

	return (x > y) - (x < y);
}

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章