圖4爲排序階段CPU的使用率,可以看到只有一個核達到了100%的利用率。下面爲一個多線程(線程的數量爲核的數量)的排序版本,每個線程只對1G數據中的一部分進行快速排序,排序完成後再由另外一個線程進行歸併,將結果寫入文件。
多線程排序代碼如下:
/*multi_thread_sort.c*/
- /*
- * Author: Chaos Lee
- * Date: 2012-06-30
- * Description: load, merge , store data with single core, but sorting data with all the cores provided by the SMP
- */
- #include<stdio.h>
- #include<pthread.h>
- #include<sys/sysinfo.h>
- #include<sys/stat.h>
- #include<sys/types.h>
- #include<stdint.h>
- #include<stdlib.h>
- #include<assert.h>
- #include "../error.h"
- #include "timer.h"
/* Shared buffer holding the entire data file as 64-bit integers;
 * allocated and filled by main(), sorted in place by the worker threads. */
uint64_t * buffer = NULL;
/* Protects `counter` and pairs with the `merge_start` condition variable. */
pthread_mutex_t counter_mutex = PTHREAD_MUTEX_INITIALIZER;
/* Signaled (from barrier()) once all sorting threads have finished. */
pthread_cond_t merge_start = PTHREAD_COND_INITIALIZER;
int cores_number;   /* number of CPU cores == number of sorting threads */
int counter;        /* how many sorting threads have completed so far */
/* qsort() comparator for uint64_t elements: returns -1, 0 or 1 as the
 * first value is less than, equal to, or greater than the second. */
int uint64_compare(const void * ptr1,const void * ptr2)
{
    uint64_t lhs = *(const uint64_t *)ptr1;
    uint64_t rhs = *(const uint64_t *)ptr2;

    if (lhs < rhs)
        return -1;
    if (lhs > rhs)
        return 1;
    return 0;
}
/* Half-open index range [start, end) into `buffer` assigned to one sorting
 * thread; during the merge phase, `start` is advanced as elements are
 * consumed, so start == end means the segment is exhausted. */
typedef struct segment_tag
{
    uint64_t start;
    uint64_t end;
}segment_t,*segment_p;
- void barrier()
- {
- int status;
- status = pthread_mutex_lock(&counter_mutex);
- if(0 != status)
- err_abort("locking error.",status);
- counter++;
- if(cores_number == counter)
- {
- pthread_cond_signal(&merge_start);
- }
- status = pthread_mutex_unlock(&counter_mutex);
- if(0 != status)
- err_abort("unlocking error.",status);
- }
- void * sort_thread_routin(void * args)
- {
- DPRINTF(("%s","sorting thread start...\n"));
- segment_p seg = (segment_p) args;
- assert(buffer != NULL);
- DPRINTF(("%s","begin to sort...\n"));
- qsort(buffer+seg->start,seg->end-seg->start,sizeof(uint64_t),uint64_compare);
- DPRINTF(("%s","Entering barrier...\n"));
- barrier();
- pthread_exit((void *)0);
- }
- void * merge_thread_routin(void * args)
- {
- int status,i,finish_count,elapsed_seconds;
- FILE * fp_result;
- uint64_t tmp;
- restart_timer();
- DPRINTF(("%s","merging thread start...\n"));
- fp_result = fopen("multi-result.dat","wb");
- while(cores_number != counter)
- {
- status = pthread_cond_wait(&merge_start,&counter_mutex);
- if(0 != status)
- err_abort("waiting condition error.",status);
- }
- elapsed_seconds = get_elapsed_time();
- fprintf(stdout,"sorting cost %d seconds.\n",elapsed_seconds);
- status = pthread_mutex_unlock(&counter_mutex);
- if(0 != status)
- err_abort("unlocking error.",status);
- DPRINTF(("begin to merge...\n"));
- finish_count = 0;
- segment_p segs = (segment_p) args;
- restart_timer();
- while(finish_count<cores_number)
- {
- int i,first=0,j;
- for(i=0;i<cores_number;i++)
- {
- if( 0 == first)
- {
- if(segs[i].start<segs[i].end)
- {
- tmp = buffer[segs[i].start];
- j = i;
- first = 1;
- }
- }
- else
- {
- if(segs[i].start<segs[i].end && buffer[segs[i].start]<tmp)
- {
- tmp = buffer[segs[i].start];
- j = i;
- }
- }
- }
- segs[j].start++;
- if(segs[j].start >= segs[j].end)
- {
- finish_count++;
- }
- fwrite(&tmp,sizeof(uint64_t),1,fp_result);
- }
- elapsed_seconds = get_elapsed_time();
- fprintf(stdout,"merging cost %d seconds.\n",elapsed_seconds);
- DPRINTF(("merging is over\n"));
- fclose(fp_result);
- pthread_exit((void *)0);
- }
- int main(int argc,char *argv[])
- {
- int elapsed_seconds,status,i;
- segment_p segments;
- pthread_t * sort_threads;
- pthread_t * merge_thread;
- uint64_t size,length,seg_len;
- FILE * fp;
- struct stat data_stat;
- cores_number = get_nprocs();
- status = stat("data.dat",&data_stat);
- if(0 != status)
- error_abort("stat file error.\n");
- size = data_stat.st_size;
- length = size / sizeof(uint64_t);
- seg_len = length / cores_number;
- buffer = (uint64_t *) malloc(size);
- if(NULL == buffer)
- {
- fprintf(stderr,"mallocing error.\n");
- exit(1);
- }
- fp = fopen("data.dat","rb");
- if(NULL == fp)
- {
- fprintf(stderr,"file open error.\n");
- exit(1);
- }
- start_timer();
- fread(buffer,size,1,fp);
- elapsed_seconds = get_elapsed_time();
- fprintf(stdout,"loading cost %d seconds\n",elapsed_seconds);
- segments = (segment_p)malloc(sizeof(segment_t)*cores_number);
- if(NULL == segments)
- {
- fprintf(stderr,"at %s:%d : %s",__FILE__,__LINE__,"malloc error.\n");
- exit(1);
- }
- for(i=0;i<cores_number;i++)
- {
- segments[i].start = i * seg_len;
- if(i != cores_number-1)
- segments[i].end = (i + 1 ) * seg_len;
- else
- segments[i].end = length;
- }
- sort_threads = (pthread_t *)malloc(sizeof(pthread_t) * cores_number);
- if(NULL == sort_threads)
- {
- fprintf(stderr,"at %s:%d :%s",__FILE__,__LINE__,"malloc failuer.\n");
- exit(1);
- }
- merge_thread = (pthread_t *)malloc(sizeof(pthread_t));
- if(NULL == merge_thread)
- {
- fprintf(stderr,"at %s:%d :%s",__FILE__,__LINE__,"malloc failuer.\n");
- exit(1);
- }
- for(i=0;i<cores_number;i++)
- {
- status = pthread_create(&sort_threads[i],NULL,sort_thread_routin,(void *)&segments[i]);
- if(0 != status)
- err_abort("creating threads faulire.\n",status);
- }
- status = pthread_create(merge_thread,NULL,merge_thread_routin,(void *)segments);
- if(0 != status)
- err_abort("creating thread faulier.\n",status);
- for(i=0;i<cores_number;i++)
- {
- status = pthread_join(sort_threads[i],NULL);
- if(0 != status)
- err_abort("joining threads error.\n",status);
- }
- status = pthread_join(*merge_thread,NULL);
- if(0 != status)
- err_abort("joining thread error.\n",status);
- free(buffer);
- fclose(fp);
- return 0;
- }
再編譯運行下,以下爲測試結果:
- [lichao@sg01 thread_power]$ gcc multi_thread_sort.c -o multi_thread_sort timer.o -lpthread
- [lichao@sg01 thread_power]$ ./multi_thread_sort
- loading cost 14 seconds
- sorting cost 22 seconds.
- merging cost 44 seconds.
下圖5爲多線程排序時CPU的利用率,可以看到CPU的四個核都已經達到100%的利用率,即:硬件沒有白投資:D。當然排序的時間效果也很好,幾乎達到了之前的4倍的加速比。另外可以看到文件的加載速度和回寫速度也有所提高,這點也是讓我比較疑惑的。下面再次運行單線程排序版本。
圖5 排序階段CPU的利用率
- [lichao@sg01 thread_power]$ ./single_thread_sort
- loading cost 17 seconds
- sorting cost 81 seconds
- writing results cost 12 seconds
可以看到加載速度和回寫速度有了顯著的提升,雖然排序時間還是沒有多大變化。
再次運行多線程排序版本試試:
- [lichao@sg01 thread_power]$ ./multi_thread_sort
- loading cost 31 seconds
- sorting cost 22 seconds.
- merging cost 23 seconds.
加載時間又延長了,排序速度幾乎不變,回寫速度也提高了不少。我想這主要是因爲文件系統本身提供了緩衝的作用,即上次用過的文件內容可以保留在頁緩存中,便於迅速載入內存。這樣第二次使用的時候,由於這些數據還存放在頁緩存中,所以能以很高的速度傳入內存。回寫的原理應該也一樣。對於1G的文件回寫到磁盤,只用了23s,大致的回寫速度約爲50MB/s。
假設文件系統一直起作用,並能達到第二次實驗的效果,即分塊排序22s,歸併排序並回寫文件系統23s,那麼計算和歸併回寫是能夠重合的。對於200G的文件A來說,分塊排序的處理時間大致爲:200*22s =~1.2h,就擴大爲1小時15分鐘吧。這樣對文件B來說也差不多爲1小時15分鐘,一共需要2個半小時,接下來開始歸併比較了,假設文件的緩衝系統能夠啓作用,即速度能達到50MB/s,這樣,對於2個200G的文件都需要在內存中過一遍,大致時間應該爲400*10^3/50 = 8000s,大致爲2小時15分鐘,所以加上前面的2個半小時,對於2個200G的文件尋找相同值共需要的時間爲 5個小時左右,至少比300萬年好點。
PS: =~這個符號表示約等於。