Pipe核心分析及消息體結構
Pipe是什麼
Pipe是SimpleMessenger中的一個複雜的組件,每個Pipe實例都擁有工作在socket上的兩個線程,分別負責socket連接上的讀取和寫入的數據傳輸,除此之外還會將socket上的錯誤信息彙報給SimpleMessenger,同時處於一個穩定狀態,來持續不斷的爲SimpleMessenger提供數據服務。Pipe的不同狀態:
STATE_ACCEPTING, STATE_CONNECTING, STATE_OPEN, STATE_STANDBY, STATE_CLOSED, STATE_CLOSING, STATE_WAIT
Pipe::reader()從socket中讀取消息:
主要處理邏輯過程:- 判斷pipe的當前狀態,如果處於STATE_ACCEPTING,則執行accept(),接收連接。
- 判斷Pipe的當前狀態,Pipe的狀態不是STATE_CONNECTING且也不是STATE_CLOSED,進入主循環體。
- 從連接的socket中讀取消息的tag,tag的取值有多種對應着不同的消息類型。
- 根據tag類型,做相應的處理操作,以tag == CEPH_MSGR_TAG_MSG爲例。
- 調用read_message()讀取消息(message)
- 根據message的類型判斷是快速處理(fast_dispatch)還是先放到in_q隊列中。
- 執行主體循環邏輯,如果Pipe的狀態不再滿足循環條件,則跳出主循環,結束reader。
Pipe::reader的函數調用:
Pipe::reader() >Pipe::accept() >Pipe::tcp_read()//讀取tag >Pipe::read_message() //讀取消息 >Pipe::tcp_read()->Pipe::tcp_read_nonblocking() or >Pipe::tcp_read_nonblocking() >Pipe::buffered_read() >Pipe::do_recv() >recv() //syscall
Pipe::reader部分源代碼
// Main logic of the Pipe reader thread: reads one-byte tags from the connected
// socket and handles what follows each tag; for CEPH_MSGR_TAG_MSG it reads a
// full message and hands it to the dispatch queue (in_q) or the delay thread.
//
// pipe_lock is taken on entry and is released only around the blocking socket
// reads and around fast_dispatch(); every visible path back to the top of the
// loop re-acquires it first. The loop ends when state becomes STATE_CLOSED or
// STATE_CONNECTING, or when a CLOSE tag is received.
void Pipe::reader()
{
pipe_lock.Lock();
//1. If the pipe is in STATE_ACCEPTING, run accept() to take the incoming connection.
if (state == STATE_ACCEPTING) {
accept();
// accept() is expected to return with pipe_lock still held
assert(pipe_lock.is_locked());
}
//2. Enter the main loop while the state is neither STATE_CONNECTING nor STATE_CLOSED.
// loop.
while (state != STATE_CLOSED &&
state != STATE_CONNECTING) {
assert(pipe_lock.is_locked());
// sleep if (re)connecting
if (state == STATE_STANDBY) {
ldout(msgr->cct,20) << "reader sleeping during reconnect|standby" << dendl;
cond.Wait(pipe_lock);
continue;
}
// get a reference to the AuthSessionHandler while we have the pipe_lock
ceph::shared_ptr<AuthSessionHandler> auth_handler = session_security;
// drop pipe_lock for the socket read below
pipe_lock.Unlock();
//3. Read the message tag from the connected socket; different tag values
//   correspond to different message types.
char tag = -1;
ldout(msgr->cct,20) << "reader reading tag..." << dendl;
if (tcp_read((char*)&tag, 1) < 0) {
// re-take pipe_lock before reporting the failure and re-checking the loop condition
pipe_lock.Lock();
ldout(msgr->cct,2) << "reader couldn't read tag, " << cpp_strerror(errno) << dendl;
fault(true);
continue;
}
//4. Act on the tag type; CEPH_MSGR_TAG_MSG is used as the example here.
...
if(tag == "..."){ // handling of the other tag types is omitted here
}
else if (tag == CEPH_MSGR_TAG_MSG) {
ldout(msgr->cct,20) << "reader got MSG" << dendl;
Message *m = 0;
//5. Call read_message() to read the message.
int r = read_message(&m, auth_handler.get());
pipe_lock.Lock();
// no message produced: a negative r is treated as a fault, otherwise just loop again
if (!m) {
if (r < 0)
fault(true);
continue;
}
// the pipe state changed while the lock was dropped: give back the dispatch
// throttle reservation and drop the message
if (state == STATE_CLOSED ||
state == STATE_CONNECTING) {
msgr->dispatch_throttle_release(m->get_dispatch_throttle_size());
m->put();
continue;
}
// sequence-number checks that keep message delivery reliable
// check received seq#. if it is old, drop the message.
// note that incoming messages may skip ahead. this is convenient for the client
// side queueing because messages can't be renumbered, but the (kernel) client will
// occasionally pull a message out of the sent queue to send elsewhere. in that case
// it doesn't matter if we "got" it or not.
if (m->get_seq() <= in_seq) {
ldout(msgr->cct,0) << "reader got old message "
<< m->get_seq() << " <= " << in_seq << " " << m << " " << *m
<< ", discarding" << dendl;
msgr->dispatch_throttle_release(m->get_dispatch_throttle_size());
m->put();
// optionally abort when the peer advertises RECONNECT_SEQ and
// ms_die_on_old_message is set
if (connection_state->has_feature(CEPH_FEATURE_RECONNECT_SEQ) &&
msgr->cct->_conf->ms_die_on_old_message)
assert(0 == "old msgs despite reconnect_seq feature");
continue;
}
// a gap in sequence numbers is only logged (and optionally asserted on);
// the message is still accepted
if (m->get_seq() > in_seq + 1) {
ldout(msgr->cct,0) << "reader missed message? skipped from seq "
<< in_seq << " to " << m->get_seq() << dendl;
if (msgr->cct->_conf->ms_die_on_skipped_message)
assert(0 == "skipped incoming seq");
}
m->set_connection(connection_state.get());
// note last received message.
in_seq = m->get_seq();
cond.Signal(); // wake up writer, to ack this
ldout(msgr->cct,10) << "reader got message "
<< m->get_seq() << " " << m << " " << *m
<< dendl;
in_q->fast_preprocess(m);
if (delay_thread) {
// release keeps its default-constructed value unless a delay is injected below
utime_t release;
if (rand() % 10000 < msgr->cct->_conf->ms_inject_delay_probability * 10000.0) {
release = m->get_recv_stamp();
release += msgr->cct->_conf->ms_inject_delay_max * (double)(rand() % 10000) / 10000.0;
lsubdout(msgr->cct, ms, 1) << "queue_received will delay until " << release << " on " << m << " " << *m << dendl;
}
delay_thread->queue(release, m);
} else {
//6. Depending on the message, either fast_dispatch it directly or put it on the in_q queue.
if (in_q->can_fast_dispatch(m)) {
// reader_dispatching is set for the duration of the unlocked fast_dispatch() call
reader_dispatching = true;
pipe_lock.Unlock();
in_q->fast_dispatch(m);
pipe_lock.Lock();
reader_dispatching = false;
if (state == STATE_CLOSED ||
notify_on_dispatch_done) { // there might be somebody waiting
notify_on_dispatch_done = false;
cond.Signal();
}
} else {
in_q->enqueue(m, m->get_priority(), conn_id);
}
}
}
else if (tag == CEPH_MSGR_TAG_CLOSE) {
ldout(msgr->cct,20) << "reader got CLOSE" << dendl;
pipe_lock.Lock();
// a CLOSE received while already STATE_CLOSING completes the close;
// otherwise the pipe moves to STATE_CLOSING
if (state == STATE_CLOSING) {
state = STATE_CLOSED;
state_closed.set(1);
} else {
state = STATE_CLOSING;
}
cond.Signal();
break;
}
else {
ldout(msgr->cct,0) << "reader bad tag " << (int)tag << dendl;
pipe_lock.Lock();
fault(true);
}
}
//7. The loop condition no longer holds: leave the main loop and finish the reader.
// reap?
reader_running = false;
reader_needs_join = true;
// NOTE(review): presumably releases pipe_lock (and may reap the pipe) -- confirm
// against unlock_maybe_reap()
unlock_maybe_reap();
ldout(msgr->cct,10) << "reader done" << dendl;
}
int Pipe::read_message(Message **pm, AuthSessionHandler* auth_handler)
{
int ret = -1;
// envelope
//ldout(msgr->cct,10) << "receiver.read_message from sd " << sd << dendl;
ceph_msg_header header;
ceph_msg_footer footer;
__u32 header_crc = 0;
//接收消息的頭部
if (connection_state->has_feature(CEPH_FEATURE_NOSRCADDR)) {
¦ if (tcp_read((char*)&header, sizeof(header)) < 0)
¦ ¦ return -1;
¦ if (msgr->crcflags & MSG_CRC_HEADER) {
¦ ¦ header_crc = ceph_crc32c(0, (unsigned char *)&header, sizeof(header) - sizeof(header.crc));
¦ }
} else {
¦ ceph_msg_header_old oldheader;
¦ if (tcp_read((char*)&oldheader, sizeof(oldheader)) < 0)
¦ ¦ return -1;
¦ // this is fugly
¦ memcpy(&header, &oldheader, sizeof(header));
¦ header.src = oldheader.src.name;
¦ header.reserved = oldheader.reserved;
¦ if (msgr->crcflags & MSG_CRC_HEADER) {
¦ ¦ header.crc = oldheader.crc;
¦ ¦ header_crc = ceph_crc32c(0, (unsigned char *)&oldheader, sizeof(oldheader) - sizeof(oldheader.crc));
¦ }
}
ldout(msgr->cct,20) << "reader got envelope type=" << header.type
¦ ¦ ¦ ¦ ¦<< " src " << entity_name_t(header.src)
¦ ¦ ¦ ¦ ¦<< " front=" << header.front_len
¦ ¦<< " data=" << header.data_len
¦ ¦<< " off " << header.data_off
¦ ¦ ¦ ¦ ¦<< dendl;
//驗證頭部的crc
// verify header crc
if ((msgr->crcflags & MSG_CRC_HEADER) && header_crc != header.crc) {
¦ ldout(msgr->cct,0) << "reader got bad header crc " << header_crc << " != " << header.crc << dendl;
¦ return -1;
}
bufferlist front, middle, data;
int front_len, middle_len;
unsigned data_len, data_off;
int aborted;
Message *message;
utime_t recv_stamp = ceph_clock_now(msgr->cct);
//執行simpleMessenger的throttler的策略
if (policy.throttler_messages) {
¦ ldout(msgr->cct,10) << "reader wants " << 1 << " message from policy throttler "
<< policy.throttler_messages->get_current() << "/"
<< policy.throttler_messages->get_max() << dendl;
¦ policy.throttler_messages->get();
}
uint64_t message_size = header.front_len + header.middle_len + header.data_len;
if (message_size) {
¦ if (policy.throttler_bytes) {
¦ ¦ ldout(msgr->cct,10) << "reader wants " << message_size << " bytes from policy throttler "
¦ ¦ ¦ ¦<< policy.throttler_bytes->get_current() << "/"
¦ ¦ ¦ ¦<< policy.throttler_bytes->get_max() << dendl;
¦ ¦ policy.throttler_bytes->get(message_size);
¦ }
¦ // throttle total bytes waiting for dispatch. do this _after_ the
¦ // policy throttle, as this one does not deadlock (unless dispatch
¦ // blocks indefinitely, which it shouldn't). in contrast, the
¦ // policy throttle carries for the lifetime of the message.
¦ ldout(msgr->cct,10) << "reader wants " << message_size << " from dispatch throttler "
¦ ¦ ¦<< msgr->dispatch_throttler.get_current() << "/"
¦ ¦ ¦<< msgr->dispatch_throttler.get_max() << dendl;
¦ msgr->dispatch_throttler.get(message_size);
}
utime_t throttle_stamp = ceph_clock_now(msgr->cct);
//接收消息的front部分
// read front
front_len = header.front_len;
if (front_len) {
¦ bufferptr bp = buffer::create(front_len);
¦ if (tcp_read(bp.c_str(), front_len) < 0)
¦ ¦ goto out_dethrottle;
¦ front.push_back(bp);
¦ ldout(msgr->cct,20) << "reader got front " << front.length() << dendl;
}
//讀取消息的middle部分
// read middle
middle_len = header.middle_len;
if (middle_len) {
¦ bufferptr bp = buffer::create(middle_len);
¦ if (tcp_read(bp.c_str(), middle_len) < 0)
¦ ¦ goto out_dethrottle;
¦ middle.push_back(bp);
¦ ldout(msgr->cct,20) << "reader got middle " << middle.length() << dendl;
}
//接收消息的數據體(data)
// read data
data_len = le32_to_cpu(header.data_len);
data_off = le32_to_cpu(header.data_off);
if (data_len) {
¦ unsigned offset = 0;
¦ unsigned left = data_len;
¦ bufferlist newbuf, rxbuf;
¦ bufferlist::iterator blp;
¦ int rxbuf_version = 0;
¦ while (left > 0) {
¦ ¦ // wait for data
¦ ¦ if (tcp_read_wait() < 0)
goto out_dethrottle;
¦ ¦ // get a buffer
¦ ¦ connection_state->lock.Lock();
¦ ¦ map<ceph_tid_t,pair<bufferlist,int> >::iterator p = connection_state->rx_buffers.find(header.tid);
¦ ¦ if (p != connection_state->rx_buffers.end()) {
if (rxbuf.length() == 0 || p->second.second != rxbuf_version) {
¦ ldout(msgr->cct,10) << "reader seleting rx buffer v " << p->second.second
¦ ¦<< " at offset " << offset
¦ ¦<< " len " << p->second.first.length() << dendl;
¦ rxbuf = p->second.first;
¦ rxbuf_version = p->second.second;
¦ // make sure it's big enough
¦ if (rxbuf.length() < data_len)
¦ ¦ rxbuf.push_back(buffer::create(data_len - rxbuf.length()));
¦ blp = p->second.first.begin();
¦ blp.advance(offset);
}
¦ ¦ } else {
if (!newbuf.length()) {
¦ ldout(msgr->cct,20) << "reader allocating new rx buffer at offset " << offset << dendl;
¦ alloc_aligned_buffer(newbuf, data_len, data_off);
¦ blp = newbuf.begin();
¦ blp.advance(offset);
}
¦ ¦ }
¦ ¦ bufferptr bp = blp.get_current_ptr();
¦ ¦ int read = MIN(bp.length(), left);
¦ ¦ ldout(msgr->cct,20) << "reader reading nonblocking into " << (void*)bp.c_str() << " len " << bp.length() << dendl;
¦ ¦ int got = tcp_read_nonblocking(bp.c_str(), read);
¦ ¦ ldout(msgr->cct,30) << "reader read " << got << " of " << read << dendl;
¦ ¦ connection_state->lock.Unlock();
¦ ¦ if (got < 0)
goto out_dethrottle;
¦ ¦ if (got > 0) {
blp.advance(got);
data.append(bp, 0, got);
offset += got;
left -= got;
¦ ¦ } // else we got a signal or something; just loop.
¦ }
}
//接收消息的footer部分
// footer
if (connection_state->has_feature(CEPH_FEATURE_MSG_AUTH)) {
¦ if (tcp_read((char*)&footer, sizeof(footer)) < 0)
¦ ¦ goto out_dethrottle;
} else {
¦ ceph_msg_footer_old old_footer;
¦ if (tcp_read((char*)&old_footer, sizeof(old_footer)) < 0)
¦ ¦ goto out_dethrottle;
¦ footer.front_crc = old_footer.front_crc;
¦ footer.middle_crc = old_footer.middle_crc;
¦ footer.data_crc = old_footer.data_crc;
¦ footer.sig = 0;
¦ footer.flags = old_footer.flags;
}
aborted = (footer.flags & CEPH_MSG_FOOTER_COMPLETE) == 0;
ldout(msgr->cct,10) << "aborted = " << aborted << dendl;
if (aborted) {
¦ ldout(msgr->cct,0) << "reader got " << front.length() << " + " << middle.length() << " + " << data.length()
¦ ¦ << " byte message.. ABORTED" << dendl;
¦ ret = 0;
¦ goto out_dethrottle;
}
ldout(msgr->cct,20) << "reader got " << front.length() << " + " << middle.length() << " + " << data.length()
¦ ¦<< " byte message" << dendl;
message = decode_message(msgr->cct, msgr->crcflags, header, footer, front, middle, data);
if (!message) {
¦ ret = -EINVAL;
¦ goto out_dethrottle;
}
//
// Check the signature if one should be present. A zero return indicates success. PLR
//
//驗證消息的簽名
if (auth_handler == NULL) {
¦ ldout(msgr->cct, 10) << "No session security set" << dendl;
} else {
¦ if (auth_handler->check_message_signature(message)) {
¦ ¦ ldout(msgr->cct, 0) << "Signature check failed" << dendl;
¦ ¦ ret = -EINVAL;
¦ ¦ goto out_dethrottle;
¦ }
}
message->set_byte_throttler(policy.throttler_bytes);
message->set_message_throttler(policy.throttler_messages);
// store reservation size in message, so we don't get confused
// by messages entering the dispatch queue through other paths.
message->set_dispatch_throttle_size(message_size);
message->set_recv_stamp(recv_stamp);
message->set_throttle_stamp(throttle_stamp);
message->set_recv_complete_stamp(ceph_clock_now(msgr->cct));
*pm = message;
return 0;
out_dethrottle:
// release bytes reserved from the throttlers on failure
if (policy.throttler_messages) {
¦ ldout(msgr->cct,10) << "reader releasing " << 1 << " message to policy throttler "
<< policy.throttler_messages->get_current() << "/"
<< policy.throttler_messages->get_max() << dendl;
¦ policy.throttler_messages->put();
}
if (message_size) {
¦ if (policy.throttler_bytes) {
¦ ¦ ldout(msgr->cct,10) << "reader releasing " << message_size << " bytes to policy throttler "
<< policy.throttler_bytes->get_current() << "/"
<< policy.throttler_bytes->get_max() << dendl;
¦ ¦ policy.throttler_bytes->put(message_size);
¦ }
¦ msgr->dispatch_throttle_release(message_size);
}
return ret;
}
`
6. 總結:
reader函數是Pipe reader thread的主要邏輯,負責從連接的socket上接收消息。