The process_vertices function
It takes two parameters: process, the per-vertex function to apply, and active, a bitmap marking the set of vertices to process. The template parameter R is the type of the reduced return value.
template<typename R> R process_vertices(std::function<R(VertexId)> process, Bitmap * active);
A typical call looks like this:
double delta = graph->process_vertices<double>(
  [&](VertexId vtx){
    curr[vtx] = (double)1;
    if (graph->out_degree[vtx] > 0) {
      curr[vtx] /= graph->out_degree[vtx];
    }
    return (double)1;
  },
  active_in
);
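Each active vertex here gets its value set to 1 / out_degree, and the lambda returns 1 for every vertex it touches, so delta comes back as the total number of active vertices. A minimal hypothetical variation that uses the same mechanism purely to count active vertices:

VertexId active_count = graph->process_vertices<VertexId>(
  [&](VertexId vtx){
    return (VertexId)1; // contribute 1 per active vertex
  },
  active_in
);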
The function body works as follows:
Threads are first assigned to partitions according to the total thread count threads and the configured threads_per_socket; s_i is the socket (partition) id and s_j is the thread's offset within that socket, as computed by get_socket_id and get_socket_offset (see the sketch below).
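A plausible sketch of those two helpers, assuming threads are numbered contiguously socket by socket (the actual implementation may differ):

int get_socket_id(int thread_id) {
  return thread_id / threads_per_socket; // which socket this thread lives on
}
int get_socket_offset(int thread_id) {
  return thread_id % threads_per_socket; // thread index within its socket
}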
// process vertices
template<typename R>
R process_vertices(std::function<R(VertexId)> process, Bitmap * active) {
  double stream_time = 0;
  stream_time -= MPI_Wtime();
  R reducer = 0;
  size_t basic_chunk = 64;
  for (int t_i=0;t_i<threads;t_i++) {
    int s_i = get_socket_id(t_i);
    int s_j = get_socket_offset(t_i);
partition_size is the size of this socket's partition; thread_state[t_i]->curr and thread_state[t_i]->end mark the start and end of the sub-range that thread t_i is responsible for within the partition. Each thread's share is partition_size / threads_per_socket, rounded down to a multiple of basic_chunk.
    VertexId partition_size = local_partition_offset[s_i+1] - local_partition_offset[s_i];
    thread_state[t_i]->curr = local_partition_offset[s_i] + partition_size / threads_per_socket / basic_chunk * basic_chunk * s_j;
    thread_state[t_i]->end = local_partition_offset[s_i] + partition_size / threads_per_socket / basic_chunk * basic_chunk * (s_j+1);
For the last thread of each socket (s_j == threads_per_socket - 1), end is extended to the start of the next partition so that the rounding remainder is covered as well. Each thread's status is then set to WORKING, meaning it is executing its own task.
    if (s_j == threads_per_socket - 1) {
      thread_state[t_i]->end = local_partition_offset[s_i+1];
    }
    thread_state[t_i]->status = WORKING;
  }
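A quick worked example with hypothetical numbers: for a partition [0, 1000) with threads_per_socket = 4 and basic_chunk = 64, each thread's share is 1000 / 4 / 64 * 64 = 192 vertices, so the four threads get [0,192), [192,384), [384,576), and, thanks to the remainder rule above, the last thread gets [576,1000).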
The code below executes in parallel.
__sync_fetch_and_add is a GCC atomic builtin: it atomically adds a value to a memory location and returns the location's previous value.
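A minimal sketch of its semantics:

long counter = 0;
// atomically: old = counter; counter += 64; return old
long old = __sync_fetch_and_add(&counter, 64); // old == 0, counter == 64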
The next piece of code is each thread working through its own range.
The thread's current position is read into v_i while curr is atomically advanced by basic_chunk (64 vertices, i.e. one bitmap word). Once v_i reaches end, the thread breaks out of the loop: its own range is exhausted, so it sets its status to STEALING and goes looking for leftover work on other threads.
#pragma omp parallel reduction(+:reducer)
{
  R local_reducer = 0;
  int thread_id = omp_get_thread_num();
  while (true) {
    VertexId v_i = __sync_fetch_and_add(&thread_state[thread_id]->curr, basic_chunk);
    if (v_i >= thread_state[thread_id]->end) break;
Here word is the 64-bit slice of the active bitmap covering this chunk, one bit per vertex starting at v_i. The inner loop scans it bit by bit: when the lowest bit is set, the vertex is active and process(v_i) is accumulated into local_reducer; in either case v_i advances and word is shifted right, until no set bits remain (a standalone sketch of this pattern follows the parallel region below).
    unsigned long word = active->data[WORD_OFFSET(v_i)];
    while (word != 0) {
      if (word & 1) {
        local_reducer += process(v_i);
      }
      v_i++;
      word = word >> 1;
    }
  }
  thread_state[thread_id]->status = STEALING;
The next piece of code is the thread stealing work.
t_i enumerates the other threads in round-robin order starting from the current thread's id. As long as victim t_i is still WORKING (its status is not yet STEALING), the thief atomically grabs basic_chunk-sized pieces from the victim's curr and processes them exactly as above; when the grabbed position is already past the victim's end, it simply loops back and re-checks the status until the victim itself finishes and flips to STEALING.
  for (int t_offset=1;t_offset<threads;t_offset++) {
    int t_i = (thread_id + t_offset) % threads;
    while (thread_state[t_i]->status!=STEALING) {
      VertexId v_i = __sync_fetch_and_add(&thread_state[t_i]->curr, basic_chunk);
      if (v_i >= thread_state[t_i]->end) continue;
      unsigned long word = active->data[WORD_OFFSET(v_i)];
      while (word != 0) {
        if (word & 1) {
          local_reducer += process(v_i);
        }
        v_i++;
        word = word >> 1;
      }
    }
  }
  reducer += local_reducer;
}
Everything above runs inside the OpenMP parallel region; each thread's local_reducer is folded into reducer by the reduction(+:reducer) clause.
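To make the bitmap walk concrete, here is a self-contained sketch of the same bit-scanning pattern. It assumes WORD_OFFSET(i) is i >> 6 (64 vertices per word); the real macro in the codebase may differ:

#include <cstdio>
int main() {
  unsigned long words[1] = { 0x16UL };  // binary 10110: vertices 1, 2 and 4 active
  unsigned v_i = 0;                     // first vertex of the chunk
  unsigned long word = words[v_i >> 6]; // WORD_OFFSET(v_i)
  while (word != 0) {
    if (word & 1) printf("vertex %u is active\n", v_i);
    v_i++;            // move to the next vertex
    word = word >> 1; // drop the bit just examined
  }
  return 0;
}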
The MPI_Allreduce function
MPI_Allreduce combines a value from every MPI process (here with MPI_SUM over MPI_COMM_WORLD) and delivers the combined result back to all processes, so each partition ends up with the same global sum.
  R global_reducer;
  MPI_Datatype dt = get_mpi_data_type<R>();
  MPI_Allreduce(&reducer, &global_reducer, 1, dt, MPI_SUM, MPI_COMM_WORLD);
  stream_time += MPI_Wtime();
#ifdef PRINT_DEBUG_MESSAGES
  if (partition_id==0) {
    printf("process_vertices took %lf (s)\n", stream_time);
  }
#endif
  return global_reducer;
}
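get_mpi_data_type<R>() maps the C++ type R to the matching MPI datatype. A plausible sketch of such a mapping (the actual helper in the codebase may differ):

#include <mpi.h>
#include <type_traits>
template<typename T>
MPI_Datatype get_mpi_data_type() {
  if (std::is_same<T, int>::value) return MPI_INT;
  if (std::is_same<T, unsigned int>::value) return MPI_UNSIGNED;
  if (std::is_same<T, long>::value) return MPI_LONG;
  if (std::is_same<T, float>::value) return MPI_FLOAT;
  if (std::is_same<T, double>::value) return MPI_DOUBLE;
  return MPI_BYTE; // fallback; a real implementation would cover more types
}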