The process_vertices function
It takes two parameters: process, the function applied to each active vertex, and active, the bitmap marking the set of vertices to process.
template<typename R>
R process_vertices(std::function<R(VertexId)> process, Bitmap * active);
It is called like this (the example is PageRank's initialization step):
double delta = graph->process_vertices<double>(
  [&](VertexId vtx){
    curr[vtx] = (double)1;
    if (graph->out_degree[vtx]>0) {
      curr[vtx] /= graph->out_degree[vtx];
    }
    return (double)1;
  },
  active_in
);
Since process returns (double)1 for every active vertex, delta here ends up being the number of active vertices. Inside the function body:
First, threads are assigned to partitions based on the total thread count threads and the configured threads_per_socket; s_i is the partition (socket) id and s_j is the thread's offset within its socket.
// process vertices
template<typename R>
R process_vertices(std::function<R(VertexId)> process, Bitmap * active) {
  double stream_time = 0;
  stream_time -= MPI_Wtime();
  R reducer = 0;
  size_t basic_chunk = 64;
  for (int t_i=0;t_i<threads;t_i++) {
    int s_i = get_socket_id(t_i);
    int s_j = get_socket_offset(t_i);
Next the partition size partition_size is computed; thread_state[t_i]->curr and thread_state[t_i]->end mark the start and end of the sub-range that thread t_i handles within its partition. Each thread's share is rounded down to a multiple of basic_chunk, so every thread starts on a 64-vertex (one bitmap word) boundary.
    VertexId partition_size = local_partition_offset[s_i+1] - local_partition_offset[s_i];
    thread_state[t_i]->curr = local_partition_offset[s_i] + partition_size / threads_per_socket / basic_chunk * basic_chunk * s_j;
    thread_state[t_i]->end = local_partition_offset[s_i] + partition_size / threads_per_socket / basic_chunk * basic_chunk * (s_j+1);
Because of the rounding, the per-thread chunks may not cover the whole partition, so the last thread of each socket (s_j == threads_per_socket - 1) extends its end to the start of the next partition and picks up the remainder. Every thread's status is then set to WORKING, meaning it is processing its own range.
    if (s_j == threads_per_socket - 1) {
      thread_state[t_i]->end = local_partition_offset[s_i+1];
    }
    thread_state[t_i]->status = WORKING;
  }
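To make the range arithmetic concrete, here is a standalone sketch with made-up numbers (the partition boundaries and thread count are hypothetical; only the curr/end formulas are taken from the code above):

#include <cstdio>

int main() {
  // Hypothetical socket partition covering vertices [0, 1000),
  // split among 4 threads in chunks of 64 vertices.
  unsigned begin = 0, end_of_partition = 1000;
  unsigned threads_per_socket = 4, basic_chunk = 64;
  unsigned partition_size = end_of_partition - begin;
  for (unsigned s_j = 0; s_j < threads_per_socket; s_j++) {
    // 1000 / 4 / 64 * 64 = 192 vertices per thread, rounded down to whole chunks
    unsigned curr = begin + partition_size / threads_per_socket / basic_chunk * basic_chunk * s_j;
    unsigned end = begin + partition_size / threads_per_socket / basic_chunk * basic_chunk * (s_j + 1);
    if (s_j == threads_per_socket - 1) end = end_of_partition; // last thread takes the leftover
    printf("thread %u: [%u, %u)\n", s_j, curr, end);
  }
  return 0;
}

This prints [0,192), [192,384), [384,576) and [576,1000): the first three threads each get exactly three chunks, and the last one absorbs the 424 leftover vertices.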
Everything below executes in parallel.
__sync_fetch_and_add is a GCC atomic builtin: it atomically adds the second argument to the variable pointed to by the first and returns the variable's previous value, which is what lets multiple threads claim disjoint chunks from a shared counter.
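A minimal sketch of the fetch-then-add semantics (not from the Gemini source):

#include <cstdio>

int main() {
  long counter = 100;
  // old receives the value BEFORE the addition; counter becomes 164.
  long old = __sync_fetch_and_add(&counter, 64);
  printf("old=%ld counter=%ld\n", old, counter); // prints old=100 counter=164
  return 0;
}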
The first loop is each thread working through its own range. The thread atomically reads its current position into v_i and advances curr by basic_chunk (64 vertices, i.e. one word of the bitmap). When v_i reaches end, its own work is done: it breaks out of the loop and sets its status to STEALING, signalling that it will now look for other threads' leftover work to help with.
  #pragma omp parallel reduction(+:reducer)
  {
    R local_reducer = 0;
    int thread_id = omp_get_thread_num();
    while (true) {
      VertexId v_i = __sync_fetch_and_add(&thread_state[thread_id]->curr, basic_chunk);
      if (v_i >= thread_state[thread_id]->end) break;
The next snippet scans one word of the active bitmap. Because v_i is aligned to basic_chunk, active->data[WORD_OFFSET(v_i)] is the 64-bit word holding the active flags for vertices v_i through v_i+63. Each iteration tests the lowest bit: if it is set, vertex v_i is active, so process(v_i) is called and its result accumulated into local_reducer; then v_i advances by one and word shifts right by one bit, until no set bits remain.
      unsigned long word = active->data[WORD_OFFSET(v_i)];
      while (word != 0) {
        if (word & 1) {
          local_reducer += process(v_i);
        }
        v_i++;
        word = word >> 1;
      }
    }
    thread_state[thread_id]->status = STEALING;
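Before moving on to stealing, here is that bit-scan in isolation as a minimal standalone sketch; it assumes WORD_OFFSET(i) is i / 64 (one flag bit per vertex, 64 flags per word), and the toy bitmap is hypothetical:

#include <cstdio>

#define WORD_OFFSET(i) ((i) >> 6) // which 64-bit word holds the flag for vertex i

int main() {
  // Toy one-word bitmap: vertices 1 and 3 of chunk [0, 64) are active.
  unsigned long data[1] = { (1ul << 1) | (1ul << 3) };
  unsigned v_i = 0; // chunk start, aligned to 64
  unsigned long word = data[WORD_OFFSET(v_i)];
  while (word != 0) {
    if (word & 1) {
      printf("would call process(%u)\n", v_i);
    }
    v_i++;
    word = word >> 1;
  }
  return 0; // prints: would call process(1), would call process(3)
}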
The next block is the work-stealing phase. The thread walks over the other threads' ids in order, t_i = (thread_id + t_offset) % threads. As long as victim t_i has not itself reached the STEALING state, i.e. it may still have unclaimed work, the thief grabs a basic_chunk-sized piece from t_i's range with the same atomic __sync_fetch_and_add on t_i's curr and processes it with the same bitmap scan as above. Once every other thread's status is STEALING, all work is finished.
    for (int t_offset=1;t_offset<threads;t_offset++) {
      int t_i = (thread_id + t_offset) % threads;
      while (thread_state[t_i]->status!=STEALING) {
        VertexId v_i = __sync_fetch_and_add(&thread_state[t_i]->curr, basic_chunk);
        if (v_i >= thread_state[t_i]->end) continue;
        unsigned long word = active->data[WORD_OFFSET(v_i)];
        while (word != 0) {
          if (word & 1) {
            local_reducer += process(v_i);
          }
          v_i++;
          word = word >> 1;
        }
      }
    }
    reducer += local_reducer;
  }
Everything up to here runs inside the parallel region; afterwards reducer holds this process's local sum, combined from the per-thread local_reducer values by the reduction(+:reducer) clause.
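A minimal sketch of that OpenMP reduction pattern (the loop body is a stand-in for process(v_i)):

#include <cstdio>
#include <omp.h>

int main() {
  long total = 0;
  // Each thread gets a private copy of total, initialized to 0;
  // on leaving the region OpenMP sums all copies into the original.
  #pragma omp parallel reduction(+:total)
  {
    long local = 0;
    for (int i = 0; i < 1000; i++) local += 1; // stand-in for process(v_i)
    total += local;
  }
  printf("total = %ld\n", total); // 1000 * number of threads
  return 0;
}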
The MPI_Allreduce function: each MPI process (one per graph partition) holds its own reducer, and MPI_Allreduce sums these values across all processes and delivers the global result to every one of them.
  R global_reducer;
  MPI_Datatype dt = get_mpi_data_type<R>();
  MPI_Allreduce(&reducer, &global_reducer, 1, dt, MPI_SUM, MPI_COMM_WORLD);
  stream_time += MPI_Wtime();
#ifdef PRINT_DEBUG_MESSAGES
  if (partition_id==0) {
    printf("process_vertices took %lf (s)\n", stream_time);
  }
#endif
  return global_reducer;
}
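For reference, a minimal standalone MPI_Allreduce example using the same MPI_SUM pattern (the contributed values are made up):

#include <cstdio>
#include <mpi.h>

int main(int argc, char ** argv) {
  MPI_Init(&argc, &argv);
  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  double local = rank + 1.0; // each process contributes its own value
  double global = 0;
  // Sum local across all processes; every process receives the total.
  MPI_Allreduce(&local, &global, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  printf("rank %d: global sum = %lf\n", rank, global);
  MPI_Finalize();
  return 0;
}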