Pipe核心分析及消息體結構-reader

Pipe核心分析及消息體結構

  1. Pipe是什麼
    Pipe是SimpleMessenger中的一個複雜的組件,每個Pipe實例都擁有工作在socket上的兩個線程,分別負責socket連接上的讀取和寫入的數據傳輸,除此之外還會將socket上的錯誤信息彙報給SimpleMessenger,同時處於一個穩定狀態,來持續不斷的爲SimpleMessenger提供數據服務。

  2. Pipe的不同狀態

    STATE_ACCEPTING,
    STATE_CONNECTING,
    STATE_OPEN,
    STATE_STANDBY,
    STATE_CLOSED,
    STATE_CLOSING,
    STATE_WAIT
  3. Pipe::reader()從socket中讀取消息:
    主要處理邏輯過程:

    1. 判斷pipe的當前狀態,如果處於STATE_ACCEPTING,則執行accept(),接收連接。
    2. 判斷Pipe的當前狀態,Pipe的狀態不是STATE_CONNECTING且也不是STATE_CLOSED,進入主循環體。
    3. 從連接的socket中讀取消息的tag,tag的取值有多種對應着不同的消息類型。
    4. 根據tag類型,做相應的處理操作,以tag == CEPH_MSGR_TAG_MSG爲例。
    5. 調用read_message()讀取消息(message)
    6. 根據message的類型判斷是快速處理(fast_dispatch)還是先放到in_q隊列中。
    7. 執行主體循環邏輯,如果Pipe的狀態不再滿足循環條件,則跳出主循環,結束reader。
  4. Pipe::reader的函數調用

    Pipe::reader()
    >Pipe::accept()
    >Pipe::tcp_read()//讀取tag
    >Pipe::read_message() //讀取消息
        >Pipe::tcp_read()->Pipe::tcp_read_nonblocking()
        or
        >Pipe::tcp_read_nonblocking()
            >Pipe::buffered_read()
                >Pipe::do_recv()
                    >recv() //syscall 
  5. Pipe::reader部分源代碼

// Reader loop for this Pipe: repeatedly reads a one-byte tag from the socket
// and handles it, until the pipe state becomes STATE_CLOSED or
// STATE_CONNECTING, or a CLOSE tag is received.
//
// Locking: pipe_lock is taken on entry and is held at the top of every loop
// iteration (asserted below). It is dropped around the socket reads (the tag
// read and read_message()) and around fast_dispatch(), and re-taken before
// any pipe state is examined or modified.
//
// NOTE: this listing is abridged -- the handlers for tag types other than
// CEPH_MSGR_TAG_MSG and CEPH_MSGR_TAG_CLOSE are elided (see the "..."
// placeholder below).
void Pipe::reader()
{
  pipe_lock.Lock();
  // 1. If the pipe is still in STATE_ACCEPTING, run accept() first to take
  //    the incoming connection. The lock must still be held afterwards.
  if (state == STATE_ACCEPTING) {
    accept();
    assert(pipe_lock.is_locked());
  }
  // 2. Enter the main loop as long as the pipe is neither STATE_CLOSED nor
  //    STATE_CONNECTING.
  // loop.
  while (state != STATE_CLOSED &&
         state != STATE_CONNECTING) {
    assert(pipe_lock.is_locked());

    // sleep if (re)connecting
    // cond.Wait() is given pipe_lock; once woken, re-evaluate the loop
    // condition from the top.
    if (state == STATE_STANDBY) {
      ldout(msgr->cct,20) << "reader sleeping during reconnect|standby" << dendl;
      cond.Wait(pipe_lock);
      continue;
    }  

    // get a reference to the AuthSessionHandler while we have the pipe_lock
    ceph::shared_ptr<AuthSessionHandler> auth_handler = session_security;

    pipe_lock.Unlock();
    // 3. Read the one-byte message tag from the socket; the tag value selects
    //    which kind of message follows. The lock is not held during the read.
    char tag = -1;
    ldout(msgr->cct,20) << "reader reading tag..." << dendl;
    if (tcp_read((char*)&tag, 1) < 0) { 
      // Re-take the lock before calling fault() and looping: the loop top
      // asserts that pipe_lock is held.
      pipe_lock.Lock();
      ldout(msgr->cct,2) << "reader couldn't read tag, " << cpp_strerror(errno) << dendl;
      fault(true);
      continue;
    }  
    // 4. Dispatch on the tag type; tag == CEPH_MSGR_TAG_MSG is the case shown
    //    in detail here.
        ...
    if(tag == "..."){ // handling of the other tag types is omitted here
    }
    else if (tag == CEPH_MSGR_TAG_MSG) {
      ldout(msgr->cct,20) << "reader got MSG" << dendl;
      Message *m = 0;
      // 5. Call read_message() to read the message itself.
      int r = read_message(&m, auth_handler.get());

      pipe_lock.Lock();

      // No message was produced. A negative return is treated as a fault;
      // a zero return without a message (read_message() returns 0 for an
      // aborted message) is simply skipped.
      if (!m) {
        if (r < 0)
          fault(true);
        continue;
      }

      // The pipe state may have changed while the lock was dropped for the
      // read; if so, give back the dispatch-throttle reservation and drop
      // the message.
      if (state == STATE_CLOSED ||
          state == STATE_CONNECTING) {
        msgr->dispatch_throttle_release(m->get_dispatch_throttle_size());
        m->put();
        continue;
      }
      // Sequence-number checks on the received message.
      // check received seq#.  if it is old, drop the message.  
      // note that incoming messages may skip ahead.  this is convenient for the client
      // side queueing because messages can't be renumbered, but the (kernel) client will
      // occasionally pull a message out of the sent queue to send elsewhere.  in that case
      // it doesn't matter if we "got" it or not.
      if (m->get_seq() <= in_seq) {
        ldout(msgr->cct,0) << "reader got old message "
                << m->get_seq() << " <= " << in_seq << " " << m << " " << *m
                << ", discarding" << dendl;
        msgr->dispatch_throttle_release(m->get_dispatch_throttle_size());
        m->put();
        // Optionally abort (config: ms_die_on_old_message) when the peer
        // advertises CEPH_FEATURE_RECONNECT_SEQ yet still sent an old seq.
        if (connection_state->has_feature(CEPH_FEATURE_RECONNECT_SEQ) &&
            msgr->cct->_conf->ms_die_on_old_message)
          assert(0 == "old msgs despite reconnect_seq feature");
        continue;
      }
      // A gap in the sequence is only logged, unless the
      // ms_die_on_skipped_message config option asks for an abort.
      if (m->get_seq() > in_seq + 1) {
        ldout(msgr->cct,0) << "reader missed message?  skipped from seq "
                           << in_seq << " to " << m->get_seq() << dendl;
        if (msgr->cct->_conf->ms_die_on_skipped_message)
          assert(0 == "skipped incoming seq");
      }

      m->set_connection(connection_state.get());

      // note last received message.
      in_seq = m->get_seq();

      cond.Signal();  // wake up writer, to ack this

      ldout(msgr->cct,10) << "reader got message "
               << m->get_seq() << " " << m << " " << *m
               << dendl;
      in_q->fast_preprocess(m);

      if (delay_thread) {
        // Delay injection: with probability ms_inject_delay_probability,
        // set the release time to the receive stamp plus a random fraction
        // of ms_inject_delay_max; otherwise `release` keeps its
        // default-constructed value. Either way the message is handed to
        // the delay thread.
        utime_t release;
        if (rand() % 10000 < msgr->cct->_conf->ms_inject_delay_probability * 10000.0) {
          release = m->get_recv_stamp();
          release += msgr->cct->_conf->ms_inject_delay_max * (double)(rand() % 10000) / 10000.0;
          lsubdout(msgr->cct, ms, 1) << "queue_received will delay until " << release << " on " << m << " " << *m << dendl;
        }
        delay_thread->queue(release, m);
      } else {
        // 6. Depending on the message, either dispatch it immediately
        //    (fast_dispatch) or enqueue it on in_q.
        if (in_q->can_fast_dispatch(m)) {
          // reader_dispatching is set while the lock is dropped for the
          // fast_dispatch() call and cleared once the lock is re-taken.
          reader_dispatching = true;
          pipe_lock.Unlock();
          in_q->fast_dispatch(m);
          pipe_lock.Lock();
          reader_dispatching = false;
          if (state == STATE_CLOSED ||
              notify_on_dispatch_done) { // there might be somebody waiting
            notify_on_dispatch_done = false;
            cond.Signal();
          }
        } else {
          in_q->enqueue(m, m->get_priority(), conn_id);
        }
      }
    }

    else if (tag == CEPH_MSGR_TAG_CLOSE) {
      // CLOSE tag: if we were already STATE_CLOSING, finish the transition to
      // STATE_CLOSED; otherwise move to STATE_CLOSING. Either way, signal
      // cond and leave the loop.
      ldout(msgr->cct,20) << "reader got CLOSE" << dendl;
      pipe_lock.Lock();
      if (state == STATE_CLOSING) {
        state = STATE_CLOSED;
        state_closed.set(1);
      } else {
        state = STATE_CLOSING;
      }
      cond.Signal();
      break;
    }
    else {
      // Unrecognised tag: log it, re-take the lock and call fault(); the
      // loop condition is then re-checked.
      ldout(msgr->cct,0) << "reader bad tag " << (int)tag << dendl;
      pipe_lock.Lock();
      fault(true);
    }
  }
  // 7. The loop keeps running until the pipe state no longer satisfies the
  //    loop condition (or a CLOSE tag breaks out); then the reader finishes.

  // reap?
  // pipe_lock is held here on every exit path from the loop.
  // NOTE(review): unlock_maybe_reap() presumably releases pipe_lock, going by
  // its name -- confirm against its definition.
  reader_running = false;
  reader_needs_join = true;
  unlock_maybe_reap();
  ldout(msgr->cct,10) << "reader done" << dendl;
}



int Pipe::read_message(Message **pm, AuthSessionHandler* auth_handler)
{
  int ret = -1;
  // envelope
  //ldout(msgr->cct,10) << "receiver.read_message from sd " << sd  << dendl;

  ceph_msg_header header; 
  ceph_msg_footer footer;
  __u32 header_crc = 0;
  //接收消息的頭部
  if (connection_state->has_feature(CEPH_FEATURE_NOSRCADDR)) {
  ¦ if (tcp_read((char*)&header, sizeof(header)) < 0)
  ¦ ¦ return -1;
  ¦ if (msgr->crcflags & MSG_CRC_HEADER) {
  ¦ ¦ header_crc = ceph_crc32c(0, (unsigned char *)&header, sizeof(header) - sizeof(header.crc));
  ¦ }
  } else {
  ¦ ceph_msg_header_old oldheader;
  ¦ if (tcp_read((char*)&oldheader, sizeof(oldheader)) < 0)
  ¦ ¦ return -1;
  ¦ // this is fugly
  ¦ memcpy(&header, &oldheader, sizeof(header));
  ¦ header.src = oldheader.src.name;
  ¦ header.reserved = oldheader.reserved;
  ¦ if (msgr->crcflags & MSG_CRC_HEADER) {
  ¦ ¦ header.crc = oldheader.crc;
  ¦ ¦ header_crc = ceph_crc32c(0, (unsigned char *)&oldheader, sizeof(oldheader) - sizeof(oldheader.crc));
  ¦ }
  }

  ldout(msgr->cct,20) << "reader got envelope type=" << header.type
  ¦ ¦ ¦ ¦ ¦<< " src " << entity_name_t(header.src)
  ¦ ¦ ¦ ¦ ¦<< " front=" << header.front_len
        ¦ ¦<< " data=" << header.data_len
        ¦ ¦<< " off " << header.data_off
  ¦ ¦ ¦ ¦ ¦<< dendl;
//驗證頭部的crc
  // verify header crc
  if ((msgr->crcflags & MSG_CRC_HEADER) && header_crc != header.crc) {                                                                                                                                            
  ¦ ldout(msgr->cct,0) << "reader got bad header crc " << header_crc << " != " << header.crc << dendl;
  ¦ return -1;
  }
  bufferlist front, middle, data;
  int front_len, middle_len;
  unsigned data_len, data_off;
  int aborted;
  Message *message;
  utime_t recv_stamp = ceph_clock_now(msgr->cct);
//執行simpleMessenger的throttler的策略
  if (policy.throttler_messages) {
  ¦ ldout(msgr->cct,10) << "reader wants " << 1 << " message from policy throttler "
                        << policy.throttler_messages->get_current() << "/"
                        << policy.throttler_messages->get_max() << dendl;
  ¦ policy.throttler_messages->get();
  }

  uint64_t message_size = header.front_len + header.middle_len + header.data_len;
  if (message_size) {
  ¦ if (policy.throttler_bytes) {
  ¦ ¦ ldout(msgr->cct,10) << "reader wants " << message_size << " bytes from policy throttler "
        ¦ ¦ ¦ ¦<< policy.throttler_bytes->get_current() << "/"
        ¦ ¦ ¦ ¦<< policy.throttler_bytes->get_max() << dendl;
  ¦ ¦ policy.throttler_bytes->get(message_size);
  ¦ }

  ¦ // throttle total bytes waiting for dispatch.  do this _after_ the
  ¦ // policy throttle, as this one does not deadlock (unless dispatch
  ¦ // blocks indefinitely, which it shouldn't).  in contrast, the
  ¦ // policy throttle carries for the lifetime of the message.
  ¦ ldout(msgr->cct,10) << "reader wants " << message_size << " from dispatch throttler "
        ¦ ¦ ¦<< msgr->dispatch_throttler.get_current() << "/"
        ¦ ¦ ¦<< msgr->dispatch_throttler.get_max() << dendl;
  ¦ msgr->dispatch_throttler.get(message_size);
  }

  utime_t throttle_stamp = ceph_clock_now(msgr->cct);
//接收消息的front部分
  // read front
  front_len = header.front_len;
  if (front_len) {                                                                                                                                                                                                
  ¦ bufferptr bp = buffer::create(front_len);
  ¦ if (tcp_read(bp.c_str(), front_len) < 0)
  ¦ ¦ goto out_dethrottle;
  ¦ front.push_back(bp);
  ¦ ldout(msgr->cct,20) << "reader got front " << front.length() << dendl;
  }
//讀取消息的middle部分
  // read middle
  middle_len = header.middle_len;
  if (middle_len) {
  ¦ bufferptr bp = buffer::create(middle_len);
  ¦ if (tcp_read(bp.c_str(), middle_len) < 0)
  ¦ ¦ goto out_dethrottle;
  ¦ middle.push_back(bp);
  ¦ ldout(msgr->cct,20) << "reader got middle " << middle.length() << dendl;
  }

//接收消息的數據體(data)
  // read data
  data_len = le32_to_cpu(header.data_len);
  data_off = le32_to_cpu(header.data_off);
  if (data_len) {
  ¦ unsigned offset = 0;
  ¦ unsigned left = data_len;

  ¦ bufferlist newbuf, rxbuf;
  ¦ bufferlist::iterator blp;
  ¦ int rxbuf_version = 0;

  ¦ while (left > 0) {
  ¦ ¦ // wait for data
  ¦ ¦ if (tcp_read_wait() < 0)
        goto out_dethrottle;

  ¦ ¦ // get a buffer
  ¦ ¦ connection_state->lock.Lock();
  ¦ ¦ map<ceph_tid_t,pair<bufferlist,int> >::iterator p = connection_state->rx_buffers.find(header.tid);
  ¦ ¦ if (p != connection_state->rx_buffers.end()) {
        if (rxbuf.length() == 0 || p->second.second != rxbuf_version) {
        ¦ ldout(msgr->cct,10) << "reader seleting rx buffer v " << p->second.second
                ¦ ¦<< " at offset " << offset                                                                                                                                                                     
                ¦ ¦<< " len " << p->second.first.length() << dendl;
        ¦ rxbuf = p->second.first;
        ¦ rxbuf_version = p->second.second;
        ¦ // make sure it's big enough
        ¦ if (rxbuf.length() < data_len)
        ¦ ¦ rxbuf.push_back(buffer::create(data_len - rxbuf.length()));
        ¦ blp = p->second.first.begin();
        ¦ blp.advance(offset);
        }
  ¦ ¦ } else {
        if (!newbuf.length()) {
        ¦ ldout(msgr->cct,20) << "reader allocating new rx buffer at offset " << offset << dendl;
        ¦ alloc_aligned_buffer(newbuf, data_len, data_off);
        ¦ blp = newbuf.begin();
        ¦ blp.advance(offset);
        }
  ¦ ¦ }
  ¦ ¦ bufferptr bp = blp.get_current_ptr();
  ¦ ¦ int read = MIN(bp.length(), left);
  ¦ ¦ ldout(msgr->cct,20) << "reader reading nonblocking into " << (void*)bp.c_str() << " len " << bp.length() << dendl;
  ¦ ¦ int got = tcp_read_nonblocking(bp.c_str(), read);
  ¦ ¦ ldout(msgr->cct,30) << "reader read " << got << " of " << read << dendl;
  ¦ ¦ connection_state->lock.Unlock();
  ¦ ¦ if (got < 0)
        goto out_dethrottle;
  ¦ ¦ if (got > 0) {
        blp.advance(got);
        data.append(bp, 0, got);
        offset += got;
        left -= got;
  ¦ ¦ } // else we got a signal or something; just loop.
  ¦ }
  }
//接收消息的footer部分
  // footer
  if (connection_state->has_feature(CEPH_FEATURE_MSG_AUTH)) {
  ¦ if (tcp_read((char*)&footer, sizeof(footer)) < 0)
  ¦ ¦ goto out_dethrottle;
  } else {                                                                                                                                                                                                        
  ¦ ceph_msg_footer_old old_footer;
  ¦ if (tcp_read((char*)&old_footer, sizeof(old_footer)) < 0)
  ¦ ¦ goto out_dethrottle;
  ¦ footer.front_crc = old_footer.front_crc;
  ¦ footer.middle_crc = old_footer.middle_crc;
  ¦ footer.data_crc = old_footer.data_crc;
  ¦ footer.sig = 0;
  ¦ footer.flags = old_footer.flags;
  }

  aborted = (footer.flags & CEPH_MSG_FOOTER_COMPLETE) == 0;
  ldout(msgr->cct,10) << "aborted = " << aborted << dendl;
  if (aborted) {
  ¦ ldout(msgr->cct,0) << "reader got " << front.length() << " + " << middle.length() << " + " << data.length()
        ¦ ¦ << " byte message.. ABORTED" << dendl;
  ¦ ret = 0;
  ¦ goto out_dethrottle;
  }

  ldout(msgr->cct,20) << "reader got " << front.length() << " + " << middle.length() << " + " << data.length()
        ¦ ¦<< " byte message" << dendl;
  message = decode_message(msgr->cct, msgr->crcflags, header, footer, front, middle, data);
  if (!message) {
  ¦ ret = -EINVAL;
  ¦ goto out_dethrottle;
  }

  //
  //  Check the signature if one should be present.  A zero return indicates success. PLR
  //
//驗證消息的簽名
  if (auth_handler == NULL) {
  ¦ ldout(msgr->cct, 10) << "No session security set" << dendl;
  } else {
  ¦ if (auth_handler->check_message_signature(message)) {
  ¦ ¦ ldout(msgr->cct, 0) << "Signature check failed" << dendl;
  ¦ ¦ ret = -EINVAL;
  ¦ ¦ goto out_dethrottle;
  ¦ } 
  }

  message->set_byte_throttler(policy.throttler_bytes);
  message->set_message_throttler(policy.throttler_messages);
  // store reservation size in message, so we don't get confused
  // by messages entering the dispatch queue through other paths.
  message->set_dispatch_throttle_size(message_size);

  message->set_recv_stamp(recv_stamp);
  message->set_throttle_stamp(throttle_stamp);
  message->set_recv_complete_stamp(ceph_clock_now(msgr->cct));

  *pm = message;
  return 0;

 out_dethrottle:
  // release bytes reserved from the throttlers on failure
  if (policy.throttler_messages) {
  ¦ ldout(msgr->cct,10) << "reader releasing " << 1 << " message to policy throttler "
                        << policy.throttler_messages->get_current() << "/"
                        << policy.throttler_messages->get_max() << dendl;
  ¦ policy.throttler_messages->put();
  }
  if (message_size) {
  ¦ if (policy.throttler_bytes) {
  ¦ ¦ ldout(msgr->cct,10) << "reader releasing " << message_size << " bytes to policy throttler "
                          << policy.throttler_bytes->get_current() << "/"
                          << policy.throttler_bytes->get_max() << dendl;
  ¦ ¦ policy.throttler_bytes->put(message_size);
  ¦ }

  ¦ msgr->dispatch_throttle_release(message_size);
  }
  return ret;
}

6. 總結
reader函數是Pipe reader thread的主要邏輯,負責從連接的socket上接收消息。

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章