ZooKeeper架構 - Leader選舉 - FastLeaderElection

Leader選舉概述

  1. 更新邏輯時鐘。

  2. 初始化投票,投給自己。

  3. 發送通知給所有的參與者。

  4. 如果還沒有選舉出leader,而且這個選舉還在進行中,就循環以下步驟。

  5. 從接收隊列中拉取通知。

    如果通知爲空(拉取超時),從連接管理器中檢查所有的阻塞隊列,只要有一個隊列爲空,再次發送通知;否則重新連接集羣中的其他機器。之後將拉取通知的超時時間加倍(指數退避),但不超過maxNotificationInterval。

  6. 如果通知不爲空,並且發送者屬於集羣的投票成員(voting view),則根據通知中的狀態分別處理;如果發送者不是集羣成員,則忽略該通知並記錄警告日誌。

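  • 如果是looking狀態:如果外部的選舉週期大於內部的選舉週期,更新內部的選舉週期,清空收到的投票集合,將外部選票與自己的初始選票依次比較epoch、zxid、sid,選出更優者作爲自己的選票,發送通知給所有的參與者;如果外部的選舉週期小於內部的選舉週期,忽略該通知,執行下一次循環;如果外部的選舉週期等於內部的選舉週期,只有當外部選票比自己當前的選票更優時,才更新自己的選票並發送通知給所有的參與者。
    然後將該通知攜帶的選票(以發送者的sid爲key)加入到接收到的投票集合中。如果自己當前的選票在投票集合中得到了半數以上的支持,再從接收隊列中循環拉取通知(每次最多等待finalizeWait毫秒):一旦拉取到更優的選票,就把它放回接收隊列並繼續選舉循環;如果直到超時都沒有更優的選票,則更新服務器狀態,清空接收隊列,返回最終選票。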

  • 如果是observing狀態,僅僅日誌記錄。

  • 如果是following或者leading狀態,如果外部的選舉週期等於內部的選舉週期,如果該通知得到了接收的投票集合中半數以上的支持並且此時leader有效,更新服務器狀態,清空接收隊列,退出選舉過程。否則,將通知的id、以及選票加入到不參與選舉的HashMap中,也就是加入到已有集羣中。如果該通知在不參與選舉的集合(已有集羣)中得到了半數以上的支持,並且此時leader有效,更新內部選舉週期,更新服務器狀態,清空接收隊列。

其他

在這裏插入圖片描述

源碼分析

/**
 * Starts a new round of leader election and exchanges notifications with the
 * other peers until a leader is agreed upon.
 *
 * <p>Outline, as visible in this method: register a JMX bean, bump the logical
 * clock, propose the initial vote, broadcast it, then loop while this peer is
 * still {@code LOOKING} and {@code stop} is not set, handling each received
 * notification according to the sender's state.
 *
 * @return the final vote once a leader has been settled on, or {@code null} if
 *         the loop ends without a decision (the peer left {@code LOOKING} or
 *         {@code stop} was set)
 * @throws InterruptedException presumably propagated from the blocking
 *         {@code recvqueue} poll/put calls -- confirm against the queue type
 */
public Vote lookForLeader() throws InterruptedException {
    // Best-effort JMX registration: a failure is logged and election proceeds.
    try {
        self.jmxLeaderElectionBean = new LeaderElectionBean();
        MBeanRegistry.getInstance().register(
                self.jmxLeaderElectionBean, self.jmxLocalPeerBean);
    } catch (Exception e) {
        LOG.warn("Failed to register with JMX", e);
        self.jmxLeaderElectionBean = null;
    }
    // Record the start time only if it has not been set yet.
    if (self.start_fle == 0) {
       self.start_fle = System.currentTimeMillis();
    }
    try {
        // Votes collected during the current election round, keyed by sender sid.
        HashMap<Long, Vote> recvset = new HashMap<Long, Vote>();

        // Votes reported by peers that are already FOLLOWING or LEADING.
        HashMap<Long, Vote> outofelection = new HashMap<Long, Vote>();

        // Current poll timeout in milliseconds; starts at finalizeWait.
        int notTimeout = finalizeWait;

        synchronized(this){
            /* 1. Advance the logical clock. */
            /* logicalclock is declared elsewhere; the original note describes it
             * as a volatile long that identifies the election round. */
            logicalclock++;
            /* 2. Initial vote: propose ourselves. */
            /* Arguments are (myid, zxid, epoch). */
            updateProposal(getInitId(), getInitLastLoggedZxid(), getPeerEpoch());
        }

        LOG.info("New election. My id =  " + self.getId() +
                ", proposed zxid=0x" + Long.toHexString(proposedZxid));
        /* 3. Send the vote to every participant-type server. */
        /* Per the original note this builds ToSend instances and enqueues them on
         * a blocking queue; sendNotifications() is defined elsewhere -- confirm. */
        sendNotifications();

        /*
         * Loop in which we exchange notifications until we find a leader
         */
        /* No leader has been elected yet and the election has not been stopped. */
        while ((self.getPeerState() == ServerState.LOOKING) &&
                (!stop)){
            /*
             * Remove next notification from queue, times out after 2 times
             * the termination time
             */
             /* 4. Pull the next notification from recvqueue (a LinkedBlockingQueue
              * per the original note), waiting at most notTimeout ms. */
            Notification n = recvqueue.poll(notTimeout,
                    TimeUnit.MILLISECONDS);

            /*
             * Sends more notifications if haven't received enough.
             * Otherwise processes new notification.
             */
            if(n == null){
                /* Poll timed out: ask the connection manager whether the outgoing
                 * messages have been delivered. haveDelivered() is defined
                 * elsewhere; per the original note it inspects the manager's
                 * send queues for an empty one -- confirm. */
                if(manager.haveDelivered()){
                    /* Delivered: send our notifications again. */
                    sendNotifications();
                } else {
                    /* Not delivered: reconnect to the other servers in the ensemble. */
                    manager.connectAll();
                }

                /*
                 * Exponential backoff
                 */
                // Double the timeout, capped at maxNotificationInterval.
                int tmpTimeOut = notTimeout*2;
                notTimeout = (tmpTimeOut < maxNotificationInterval?
                        tmpTimeOut : maxNotificationInterval);
                LOG.info("Notification time out: " + notTimeout);
            }
            /* A notification was received and its sender is in the voting view. */
            else if(self.getVotingView().containsKey(n.sid)) {
                /*
                 * Only proceed if the vote comes from a replica in the
                 * voting view.
                 */
                switch (n.state) {
                case LOOKING:
                    // If notification > current, replace and send messages out
                    if (n.electionEpoch > logicalclock) {
                        // Sender is in a newer round: adopt its round, discard the
                        // votes gathered so far, and compare its vote with our
                        // initial vote to pick the new proposal.
                        logicalclock = n.electionEpoch;
                        recvset.clear();
                        if(totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
                                getInitId(), getInitLastLoggedZxid(), getPeerEpoch())) {
                            updateProposal(n.leader, n.zxid, n.peerEpoch);
                        } else {
                            updateProposal(getInitId(),
                                    getInitLastLoggedZxid(),
                                    getPeerEpoch());
                        }
                        sendNotifications();
                    } else if (n.electionEpoch < logicalclock) {
                        // Sender is in an older round: ignore this notification.
                        if(LOG.isDebugEnabled()){
                            LOG.debug("Notification election epoch is smaller than logicalclock. n.electionEpoch = 0x"
                                    + Long.toHexString(n.electionEpoch)
                                    + ", logicalclock=0x" + Long.toHexString(logicalclock));
                        }
                        break;
                    } else if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
                            proposedLeader, proposedZxid, proposedEpoch)) {
                        // Same round and the sender's vote wins against our current
                        // proposal: adopt it and re-broadcast.
                        updateProposal(n.leader, n.zxid, n.peerEpoch);
                        sendNotifications();
                    }

                    if(LOG.isDebugEnabled()){
                        LOG.debug("Adding vote: from=" + n.sid +
                                ", proposed leader=" + n.leader +
                                ", proposed zxid=0x" + Long.toHexString(n.zxid) +
                                ", proposed election epoch=0x" + Long.toHexString(n.electionEpoch));
                    }

                    // Record the sender's vote, keyed by the sender's sid.
                    recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch));

                    /* If our current proposal has the support of more than half of
                     * the voters (as decided by termPredicate, defined elsewhere). */
                    if (termPredicate(recvset,
                            new Vote(proposedLeader, proposedZxid,
                                    logicalclock, proposedEpoch))) {

                        // Verify if there is any change in the proposed leader
                        /* Keep pulling notifications from the receive queue, waiting
                         * up to finalizeWait ms for each one. */
                        while((n = recvqueue.poll(finalizeWait,
                                TimeUnit.MILLISECONDS)) != null){
                            /* The pulled notification beats the proposal that just
                             * reached a majority. */
                            if(totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
                                    proposedLeader, proposedZxid, proposedEpoch)){
                                /* Put it back on the receive queue so the outer loop
                                 * processes it, and stop finalizing. */
                                recvqueue.put(n);
                                break;
                            }
                        }

                        /*
                         * This predicate is true once we don't read any new
                         * relevant message from the reception queue
                         */
                        if (n == null) {
                            // The inner poll timed out without a better vote:
                            // settle on the proposal and leave the election.
                            self.setPeerState((proposedLeader == self.getId()) ?
                                    ServerState.LEADING: learningState());

                            Vote endVote = new Vote(proposedLeader,
                                                    proposedZxid,
                                                    logicalclock,
                                                    proposedEpoch);
                            leaveInstance(endVote);
                            return endVote;
                        }
                    }
                    break;
                case OBSERVING:
                    // Notifications from observers are only logged.
                    LOG.debug("Notification from observer: " + n.sid);
                    break;
                case FOLLOWING:
                case LEADING:
                    /*
                     * Consider all notifications from the same epoch
                     * together.
                     */
                    if(n.electionEpoch == logicalclock){
                        recvset.put(n.sid, new Vote(n.leader,
                                                      n.zxid,
                                                      n.electionEpoch,
                                                      n.peerEpoch));
                        /* Per the original note, ooePredicate (defined elsewhere) is
                         * true when n is backed by more than half of recvset and the
                         * leader is currently valid. */
                        if(ooePredicate(recvset, outofelection, n)) {
                            self.setPeerState((n.leader == self.getId()) ?
                                    ServerState.LEADING: learningState());

                            Vote endVote = new Vote(n.leader, 
                                    n.zxid, 
                                    n.electionEpoch, 
                                    n.peerEpoch);
                            leaveInstance(endVote);
                            return endVote;
                        }
                    }

                    /*
                     * Before joining an established ensemble, verify
                     * a majority is following the same leader.
                     */
                    outofelection.put(n.sid, new Vote(n.version,
                                                        n.leader,
                                                        n.zxid,
                                                        n.electionEpoch,
                                                        n.peerEpoch,
                                                        n.state));
       
                    if(ooePredicate(outofelection, outofelection, n)) {
                        synchronized(this){
                            // Adopt the election epoch reported by the ensemble
                            // before switching state.
                            logicalclock = n.electionEpoch;
                            self.setPeerState((n.leader == self.getId()) ?
                                    ServerState.LEADING: learningState());
                        }
                        Vote endVote = new Vote(n.leader,
                                                n.zxid,
                                                n.electionEpoch,
                                                n.peerEpoch);
                        leaveInstance(endVote);
                        return endVote;
                    }
                    break;
                default:
                    LOG.warn("Notification state unrecognized: {} (n.state), {} (n.sid)",
                            n.state, n.sid);
                    break;
                }
            } else {
                // Sender is not in the voting view: drop the notification.
                LOG.warn("Ignoring notification from non-cluster member " + n.sid);
            }
        }
        // Loop ended without a decision (peer left LOOKING or stop was set).
        return null;
    } finally {
        // Always unregister the election bean, whichever way the method exits.
        try {
            if(self.jmxLeaderElectionBean != null){
                MBeanRegistry.getInstance().unregister(
                        self.jmxLeaderElectionBean);
            }
        } catch (Exception e) {
            LOG.warn("Failed to unregister with JMX", e);
        }
        self.jmxLeaderElectionBean = null;
    }
}


	/**
	 * Checks whether the proposed leader can be trusted, which lets the caller
	 * decide whether to accept an already established leader.
	 *
	 * <p>If this peer is the proposed leader, the vote is valid only when the
	 * given election epoch equals the local {@code logicalclock}. Otherwise the
	 * leader itself must have reported in, and reported the {@code LEADING}
	 * state, in {@code votes}.
	 *
	 * @param votes         votes received so far, looked up by server id
	 * @param leader        server id of the proposed leader
	 * @param electionEpoch election epoch attached to the proposal
	 * @return {@code true} if the leader passes the checks above
	 */
	protected boolean checkLeader(
            HashMap<Long, Vote> votes,
            long leader,
            long electionEpoch){

        if (leader == self.getId()) {
            // I am the proposed leader: accept only if the election rounds match.
            return logicalclock == electionEpoch;
        }

        // I am not the leader: the leader must have sent a message ...
        Vote leaderVote = votes.get(leader);
        if (leaderVote == null) {
            return false;
        }

        // ... and that message must state that it is actually leading.
        return leaderVote.getState() == ServerState.LEADING;
    }
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章