The process_edges function
It takes six parameters: sparse_signal, sparse_slot, dense_signal, dense_slot, the bitmap of active vertices, and an optional dense_selective bitmap. The first signal/slot pair drives sparse (push) mode; the second pair drives dense (pull) mode.
pagegraph.cpp:
First the next array is cleared. curr holds the vertex values from the previous round and next accumulates this round's values; the two are swapped at the end of each iteration.
for (int i_i=0;i_i<iterations;i_i++) {
if (graph->partition_id==0) {
printf("delta(%d)=%lf\n", i_i, delta);
}
VertexId activated = 0;
graph->fill_vertex_array(next, (double)0);
graph.hpp:
The process_edges function itself is as follows:
// process edges
template<typename R, typename M>
R process_edges(std::function<void(VertexId)> sparse_signal,
                std::function<R(VertexId, M, VertexAdjList<EdgeData>)> sparse_slot,
                std::function<void(VertexId, VertexAdjList<EdgeData>)> dense_signal,
                std::function<R(VertexId, M)> dense_slot,
                Bitmap * active,
                Bitmap * dense_selective = nullptr) {
double stream_time = 0;
stream_time -= MPI_Wtime();
Local send-buffer space is allocated for each thread:
for (int t_i=0;t_i<threads;t_i++) {
local_send_buffer[t_i]->resize( sizeof(MsgUnit<M>) * local_send_buffer_limit );
local_send_buffer[t_i]->count = 0;
}
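The signal callbacks never touch these buffers directly: they call emit (graph->emit from the application side), which appends a MsgUnit<M> record to the calling thread's local buffer and spills it into the shared per-partition send buffer once it fills up. A minimal sketch of what MsgUnit, emit, and flush_local_send_buffer presumably look like, reconstructed from how they are used in this listing (the exact definitions in graph.hpp may differ):

// Hypothetical reconstruction -- names are inferred from how the buffers are
// indexed below (buffer[b_i].vertex, buffer[b_i].msg_data); not copied from graph.hpp.
template <typename M>
struct MsgUnit {
  VertexId vertex;  // vertex the message refers to
  M msg_data;       // payload produced by a signal callback
};

template <typename M>
void emit(VertexId vtx, M msg) {
  int t_i = omp_get_thread_num();
  MsgUnit<M> * buffer = (MsgUnit<M> *)local_send_buffer[t_i]->data;
  buffer[local_send_buffer[t_i]->count].vertex = vtx;
  buffer[local_send_buffer[t_i]->count].msg_data = msg;
  local_send_buffer[t_i]->count += 1;
  if (local_send_buffer[t_i]->count == local_send_buffer_limit) {
    flush_local_send_buffer<M>(t_i);   // spill when the thread-local buffer fills up
  }
}

template <typename M>
void flush_local_send_buffer(int t_i) {
  int s_i = get_socket_id(t_i);
  // reserve a range in the shared per-partition buffer, then copy in one block
  int pos = __sync_fetch_and_add(&send_buffer[current_send_part_id][s_i]->count,
                                 local_send_buffer[t_i]->count);
  memcpy(send_buffer[current_send_part_id][s_i]->data + sizeof(MsgUnit<M>) * pos,
         local_send_buffer[t_i]->data,
         sizeof(MsgUnit<M>) * local_send_buffer[t_i]->count);
  local_send_buffer[t_i]->count = 0;
}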
Next, process_vertices is called to sum the out-degrees of the active vertices; the resulting number of active out-edges decides between the sparse (push) and dense (pull) path (sparse when it is below edges/20):
R reducer = 0;
EdgeId active_edges = process_vertices<EdgeId>(
[&](VertexId vtx){
return (EdgeId)out_degree[vtx];
},
active
);
bool sparse = (active_edges < edges / 20);
if (sparse) {
for (int i=0;i<partitions;i++) {
for (int s_i=0;s_i<sockets;s_i++) {
recv_buffer[i][s_i]->resize( sizeof(MsgUnit<M>) * (partition_offset[i+1] - partition_offset[i]) * sockets );
send_buffer[i][s_i]->resize( sizeof(MsgUnit<M>) * owned_vertices * sockets );
send_buffer[i][s_i]->count = 0;
recv_buffer[i][s_i]->count = 0;
}
}
} else {
for (int i=0;i<partitions;i++) {
for (int s_i=0;s_i<sockets;s_i++) {
recv_buffer[i][s_i]->resize( sizeof(MsgUnit<M>) * owned_vertices * sockets );
send_buffer[i][s_i]->resize( sizeof(MsgUnit<M>) * (partition_offset[i+1] - partition_offset[i]) * sockets );
send_buffer[i][s_i]->count = 0;
recv_buffer[i][s_i]->count = 0;
}
}
}
size_t basic_chunk = 64;
if (sparse) {
#ifdef PRINT_DEBUG_MESSAGES
if (partition_id==0) {
printf("sparse mode\n");
}
#endif
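// Sparse (push) phase: this partition scans its own active master vertices
// (walking the active bitmap word by word below), lets sparse_signal emit
// messages for them into send_buffer[partition_id], and later ships that one
// buffer to every other partition; sparse_slot then runs on each receiver over
// the sender vertex's outgoing edges that are stored locally.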
int * recv_queue = new int [partitions];
int recv_queue_size = 0;
std::mutex recv_queue_mutex;
current_send_part_id = partition_id;
#pragma omp parallel for
for (VertexId begin_v_i=partition_offset[partition_id];begin_v_i<partition_offset[partition_id+1];begin_v_i+=basic_chunk) {
VertexId v_i = begin_v_i;
unsigned long word = active->data[WORD_OFFSET(v_i)];
while (word != 0) {
if (word & 1) {
sparse_signal(v_i);
}
v_i++;
word = word >> 1;
}
}
#pragma omp parallel for
for (int t_i=0;t_i<threads;t_i++) {
flush_local_send_buffer<M>(t_i);
}
recv_queue[recv_queue_size] = partition_id;
recv_queue_mutex.lock();
recv_queue_size += 1;
recv_queue_mutex.unlock();
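// The local buffer is queued first so it can be processed without any network
// round trip. The send thread then relays send_buffer[partition_id] to the
// other partitions in ring order (partition_id-1, partition_id-2, ...), while
// the recv thread receives the peers' buffers from partition_id+1, +2, ... and
// appends each completed one to recv_queue, so the processing loop further
// down can start consuming buffers as soon as they arrive.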
std::thread send_thread([&](){
for (int step=1;step<partitions;step++) {
int i = (partition_id - step + partitions) % partitions;
for (int s_i=0;s_i<sockets;s_i++) {
MPI_Send(send_buffer[partition_id][s_i]->data, sizeof(MsgUnit<M>) * send_buffer[partition_id][s_i]->count, MPI_CHAR, i, PassMessage, MPI_COMM_WORLD);
}
}
});
std::thread recv_thread([&](){
for (int step=1;step<partitions;step++) {
int i = (partition_id + step) % partitions;
for (int s_i=0;s_i<sockets;s_i++) {
MPI_Status recv_status;
MPI_Probe(i, PassMessage, MPI_COMM_WORLD, &recv_status);
MPI_Get_count(&recv_status, MPI_CHAR, &recv_buffer[i][s_i]->count);
MPI_Recv(recv_buffer[i][s_i]->data, recv_buffer[i][s_i]->count, MPI_CHAR, i, PassMessage, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
recv_buffer[i][s_i]->count /= sizeof(MsgUnit<M>);
}
recv_queue[recv_queue_size] = i;
recv_queue_mutex.lock();
recv_queue_size += 1;
recv_queue_mutex.unlock();
}
});
for (int step=0;step<partitions;step++) {
while (true) {
recv_queue_mutex.lock();
bool condition = (recv_queue_size<=step);
recv_queue_mutex.unlock();
if (!condition) break;
__asm volatile ("pause" ::: "memory");
}
int i = recv_queue[step];
MessageBuffer ** used_buffer;
if (i==partition_id) {
used_buffer = send_buffer[i];
} else {
used_buffer = recv_buffer[i];
}
for (int s_i=0;s_i<sockets;s_i++) {
MsgUnit<M> * buffer = (MsgUnit<M> *)used_buffer[s_i]->data;
size_t buffer_size = used_buffer[s_i]->count;
for (int t_i=0;t_i<threads;t_i++) {
// int s_i = get_socket_id(t_i);
int s_j = get_socket_offset(t_i);
VertexId partition_size = buffer_size;
thread_state[t_i]->curr = partition_size / threads_per_socket / basic_chunk * basic_chunk * s_j;
thread_state[t_i]->end = partition_size / threads_per_socket / basic_chunk * basic_chunk * (s_j+1);
if (s_j == threads_per_socket - 1) {
thread_state[t_i]->end = buffer_size;
}
thread_state[t_i]->status = WORKING;
}
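// Every socket's threads scan this whole buffer: it is cut into
// basic_chunk-sized chunks, each thread claims chunks from its own range with
// fetch-and-add, and sparse_slot runs only for messages whose source vertex
// has out-edges stored on that thread's socket. A thread that exhausts its
// range switches to STEALING and drains the remaining chunks of other threads.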
#pragma omp parallel reduction(+:reducer)
{
R local_reducer = 0;
int thread_id = omp_get_thread_num();
int s_i = get_socket_id(thread_id);
while (true) {
VertexId b_i = __sync_fetch_and_add(&thread_state[thread_id]->curr, basic_chunk);
if (b_i >= thread_state[thread_id]->end) break;
VertexId begin_b_i = b_i;
VertexId end_b_i = b_i + basic_chunk;
if (end_b_i>thread_state[thread_id]->end) {
end_b_i = thread_state[thread_id]->end;
}
for (b_i=begin_b_i;b_i<end_b_i;b_i++) {
VertexId v_i = buffer[b_i].vertex;
M msg_data = buffer[b_i].msg_data;
if (outgoing_adj_bitmap[s_i]->get_bit(v_i)) {
local_reducer += sparse_slot(v_i, msg_data, VertexAdjList<EdgeData>(outgoing_adj_list[s_i] + outgoing_adj_index[s_i][v_i], outgoing_adj_list[s_i] + outgoing_adj_index[s_i][v_i+1]));
}
}
}
thread_state[thread_id]->status = STEALING;
for (int t_offset=1;t_offset<threads;t_offset++) {
int t_i = (thread_id + t_offset) % threads;
if (thread_state[t_i]->status==STEALING) continue;
while (true) {
VertexId b_i = __sync_fetch_and_add(&thread_state[t_i]->curr, basic_chunk);
if (b_i >= thread_state[t_i]->end) break;
VertexId begin_b_i = b_i;
VertexId end_b_i = b_i + basic_chunk;
if (end_b_i>thread_state[t_i]->end) {
end_b_i = thread_state[t_i]->end;
}
int s_i = get_socket_id(t_i);
for (b_i=begin_b_i;b_i<end_b_i;b_i++) {
VertexId v_i = buffer[b_i].vertex;
M msg_data = buffer[b_i].msg_data;
if (outgoing_adj_bitmap[s_i]->get_bit(v_i)) {
local_reducer += sparse_slot(v_i, msg_data, VertexAdjList<EdgeData>(outgoing_adj_list[s_i] + outgoing_adj_index[s_i][v_i], outgoing_adj_list[s_i] + outgoing_adj_index[s_i][v_i+1]));
}
}
}
}
reducer += local_reducer;
}
}
}
send_thread.join();
recv_thread.join();
delete [] recv_queue;
} else {
// dense selective bitmap
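// Each partition owns only the slice of dense_selective covering its own
// vertices, so before the pull phase every partition broadcasts its slice and
// receives everyone else's, leaving all nodes with the complete bitmap.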
if (dense_selective!=nullptr && partitions>1) {
double sync_time = 0;
sync_time -= get_time();
std::thread send_thread([&](){
for (int step=1;step<partitions;step++) {
int recipient_id = (partition_id + step) % partitions;
MPI_Send(dense_selective->data + WORD_OFFSET(partition_offset[partition_id]), owned_vertices / 64, MPI_UNSIGNED_LONG, recipient_id, PassMessage, MPI_COMM_WORLD);
}
});
std::thread recv_thread([&](){
for (int step=1;step<partitions;step++) {
int sender_id = (partition_id - step + partitions) % partitions;
MPI_Recv(dense_selective->data + WORD_OFFSET(partition_offset[sender_id]), (partition_offset[sender_id + 1] - partition_offset[sender_id]) / 64, MPI_UNSIGNED_LONG, sender_id, PassMessage, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
});
send_thread.join();
recv_thread.join();
MPI_Barrier(MPI_COMM_WORLD);
sync_time += get_time();
#ifdef PRINT_DEBUG_MESSAGES
if (partition_id==0) {
printf("sync_time = %lf\n", sync_time);
}
#endif
}
#ifdef PRINT_DEBUG_MESSAGES
if (partition_id==0) {
printf("dense mode\n");
}
#endif
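// Dense (pull) phase: partitions are processed in ring order. For the current
// target partition i, every node walks its local mirrors of i's vertices
// (via compressed_incoming_adj_index), dense_signal emits partial results for
// the destinations into send_buffer[i], the buffer is shipped to i, and i
// finally combines the partial results on its master vertices with dense_slot.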
int * send_queue = new int [partitions];
int * recv_queue = new int [partitions];
volatile int send_queue_size = 0;
volatile int recv_queue_size = 0;
std::mutex send_queue_mutex;
std::mutex recv_queue_mutex;
std::thread send_thread([&](){
for (int step=0;step<partitions;step++) {
if (step==partitions-1) {
break;
}
while (true) {
send_queue_mutex.lock();
bool condition = (send_queue_size<=step);
send_queue_mutex.unlock();
if (!condition) break;
__asm volatile ("pause" ::: "memory");
}
int i = send_queue[step];
for (int s_i=0;s_i<sockets;s_i++) {
MPI_Send(send_buffer[i][s_i]->data, sizeof(MsgUnit<M>) * send_buffer[i][s_i]->count, MPI_CHAR, i, PassMessage, MPI_COMM_WORLD);
}
}
});
std::thread recv_thread([&](){
std::vector<std::thread> threads;
for (int step=1;step<partitions;step++) {
int i = (partition_id - step + partitions) % partitions;
threads.emplace_back([&](int i){
for (int s_i=0;s_i<sockets;s_i++) {
MPI_Status recv_status;
MPI_Probe(i, PassMessage, MPI_COMM_WORLD, &recv_status);
MPI_Get_count(&recv_status, MPI_CHAR, &recv_buffer[i][s_i]->count);
MPI_Recv(recv_buffer[i][s_i]->data, recv_buffer[i][s_i]->count, MPI_CHAR, i, PassMessage, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
recv_buffer[i][s_i]->count /= sizeof(MsgUnit<M>);
}
}, i);
}
for (int step=1;step<partitions;step++) {
int i = (partition_id - step + partitions) % partitions;
threads[step-1].join();
recv_queue[recv_queue_size] = i;
recv_queue_mutex.lock();
recv_queue_size += 1;
recv_queue_mutex.unlock();
}
recv_queue[recv_queue_size] = partition_id;
recv_queue_mutex.lock();
recv_queue_size += 1;
recv_queue_mutex.unlock();
});
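// Compute loop: step k produces the messages destined for partition
// (partition_id + k + 1) % partitions, so the node's own partition is handled
// last and consumed locally. Chunks are claimed with fetch-and-add and stolen
// between threads exactly as in the sparse phase; after each step the local
// buffers are flushed and, for remote targets, queued on send_queue.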
current_send_part_id = partition_id;
for (int step=0;step<partitions;step++) {
current_send_part_id = (current_send_part_id + 1) % partitions;
int i = current_send_part_id;
for (int t_i=0;t_i<threads;t_i++) {
*thread_state[t_i] = tuned_chunks_dense[i][t_i];
}
#pragma omp parallel
{
int thread_id = omp_get_thread_num();
int s_i = get_socket_id(thread_id);
VertexId final_p_v_i = thread_state[thread_id]->end;
while (true) {
VertexId begin_p_v_i = __sync_fetch_and_add(&thread_state[thread_id]->curr, basic_chunk);
if (begin_p_v_i >= final_p_v_i) break;
VertexId end_p_v_i = begin_p_v_i + basic_chunk;
if (end_p_v_i > final_p_v_i) {
end_p_v_i = final_p_v_i;
}
for (VertexId p_v_i = begin_p_v_i; p_v_i < end_p_v_i; p_v_i ++) {
VertexId v_i = compressed_incoming_adj_index[s_i][p_v_i].vertex;
dense_signal(v_i, VertexAdjList<EdgeData>(incoming_adj_list[s_i] + compressed_incoming_adj_index[s_i][p_v_i].index, incoming_adj_list[s_i] + compressed_incoming_adj_index[s_i][p_v_i+1].index));
}
}
thread_state[thread_id]->status = STEALING;
for (int t_offset=1;t_offset<threads;t_offset++) {
int t_i = (thread_id + t_offset) % threads;
int s_i = get_socket_id(t_i);
while (thread_state[t_i]->status!=STEALING) {
VertexId begin_p_v_i = __sync_fetch_and_add(&thread_state[t_i]->curr, basic_chunk);
if (begin_p_v_i >= thread_state[t_i]->end) break;
VertexId end_p_v_i = begin_p_v_i + basic_chunk;
if (end_p_v_i > thread_state[t_i]->end) {
end_p_v_i = thread_state[t_i]->end;
}
for (VertexId p_v_i = begin_p_v_i; p_v_i < end_p_v_i; p_v_i ++) {
VertexId v_i = compressed_incoming_adj_index[s_i][p_v_i].vertex;
dense_signal(v_i, VertexAdjList<EdgeData>(incoming_adj_list[s_i] + compressed_incoming_adj_index[s_i][p_v_i].index, incoming_adj_list[s_i] + compressed_incoming_adj_index[s_i][p_v_i+1].index));
}
}
}
}
#pragma omp parallel for
for (int t_i=0;t_i<threads;t_i++) {
flush_local_send_buffer<M>(t_i);
}
if (i!=partition_id) {
send_queue[send_queue_size] = i;
send_queue_mutex.lock();
send_queue_size += 1;
send_queue_mutex.unlock();
}
}
for (int step=0;step<partitions;step++) {
while (true) {
recv_queue_mutex.lock();
bool condition = (recv_queue_size<=step);
recv_queue_mutex.unlock();
if (!condition) break;
__asm volatile ("pause" ::: "memory");
}
int i = recv_queue[step];
MessageBuffer ** used_buffer;
if (i==partition_id) {
used_buffer = send_buffer[i];
} else {
used_buffer = recv_buffer[i];
}
for (int t_i=0;t_i<threads;t_i++) {
int s_i = get_socket_id(t_i);
int s_j = get_socket_offset(t_i);
VertexId partition_size = used_buffer[s_i]->count;
thread_state[t_i]->curr = partition_size / threads_per_socket / basic_chunk * basic_chunk * s_j;
thread_state[t_i]->end = partition_size / threads_per_socket / basic_chunk * basic_chunk * (s_j+1);
if (s_j == threads_per_socket - 1) {
thread_state[t_i]->end = used_buffer[s_i]->count;
}
thread_state[t_i]->status = WORKING;
}
#pragma omp parallel reduction(+:reducer)
{
R local_reducer = 0;
int thread_id = omp_get_thread_num();
int s_i = get_socket_id(thread_id);
MsgUnit<M> * buffer = (MsgUnit<M> *)used_buffer[s_i]->data;
while (true) {
VertexId b_i = __sync_fetch_and_add(&thread_state[thread_id]->curr, basic_chunk);
if (b_i >= thread_state[thread_id]->end) break;
VertexId begin_b_i = b_i;
VertexId end_b_i = b_i + basic_chunk;
if (end_b_i>thread_state[thread_id]->end) {
end_b_i = thread_state[thread_id]->end;
}
for (b_i=begin_b_i;b_i<end_b_i;b_i++) {
VertexId v_i = buffer[b_i].vertex;
M msg_data = buffer[b_i].msg_data;
local_reducer += dense_slot(v_i, msg_data);
}
}
thread_state[thread_id]->status = STEALING;
reducer += local_reducer;
}
}
send_thread.join();
recv_thread.join();
delete [] send_queue;
delete [] recv_queue;
}
R global_reducer;
MPI_Datatype dt = get_mpi_data_type<R>();
MPI_Allreduce(&reducer, &global_reducer, 1, dt, MPI_SUM, MPI_COMM_WORLD);
stream_time += MPI_Wtime();
#ifdef PRINT_DEBUG_MESSAGES
if (partition_id==0) {
printf("process_edges took %lf (s)\n", stream_time);
}
#endif
return global_reducer;
}
};
The process_edges call in the PageRank loop looks like this. The first two lambdas are the sparse-mode (push) signal/slot pair; the last two are the dense-mode (pull) pair:
graph->process_edges<int,double>(
[&](VertexId src){
graph->emit(src, curr[src]);
},
[&](VertexId src, double msg, VertexAdjList<Empty> outgoing_adj){
for (AdjUnit<Empty> * ptr=outgoing_adj.begin;ptr!=outgoing_adj.end;ptr++) {
VertexId dst = ptr->neighbour;
write_add(&next[dst], msg);
}
return 0;
},
[&](VertexId dst, VertexAdjList<Empty> incoming_adj) {
double sum = 0;
for (AdjUnit<Empty> * ptr=incoming_adj.begin;ptr!=incoming_adj.end;ptr++) {
VertexId src = ptr->neighbour;
sum += curr[src];
}
graph->emit(dst, sum);
},
[&](VertexId dst, double msg) {
write_add(&next[dst], msg);
return 0;
},
active_in
);
if (i_i==iterations-1) {
delta = graph->process_vertices<double>(
[&](VertexId vtx) {
next[vtx] = 1 - d + d * next[vtx];
// if( fabs(next[vtx] - curr[vtx])>0.000001){
// // printf("\nthe progrem has compute the next[%d]\n",vtx);
// active_out->set_bit(vtx);
// activated += 1;
// }
return 0;
},
active_in
);
} else {
delta = graph->process_vertices<double>(
[&](VertexId vtx) {
next[vtx] = 1 - d + d * next[vtx];
//printf("\nthe progrem has compute the next[%d]\n",vtx);p
// if( fabs(next[vtx] - curr[vtx])>0.000001){
// // printf("\nthe progrem has compute the next[%d]\n",vtx);
// active_out->set_bit(vtx);
// activated += 1;
// }
if (graph->out_degree[vtx]>0) {
next[vtx] /= graph->out_degree[vtx];
return fabs(next[vtx] - curr[vtx]) * graph->out_degree[vtx];
}
return fabs(next[vtx] - curr[vtx]);
},
active_in
);
}
delta /= graph->vertices;
std::swap(curr, next);
printf("\nnext iteraction will process %d vertices, accounted for %lf\n\n\n", activated, 100*activated/graph->vertices);
std::swap(active_in, active_out);
}
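Putting the pieces together: curr[u] holds PR_t(u)/outdeg(u) from the previous round (the division at the end of every non-final iteration prepares this), the edge phase accumulates next[v] = Σ_{u→v} curr[u], and the vertex phase then applies the standard damped PageRank update

PR_{t+1}(v) = (1 - d) + d · Σ_{u→v} PR_t(u) / outdeg(u).

On the last iteration the division by out-degree is skipped, so next ends up holding the final rank values; delta (the summed, degree-weighted change divided by the number of vertices) is only printed at the top of the loop for monitoring.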