ZooKeeper架构 - Leader选举 - FastLeaderElection

Leader选举概述

  1. 更新逻辑时钟。

  2. 初始化投票,投给自己。

  3. 发送通知给所有的参与者。

  4. 如果还没有选举出leader,而且这个选举还在进行中,就循环以下步骤。

  5. 从接收队列中拉取通知。

    如果通知为空,从连接管理器中检查所有的阻塞队列,只要有一个队列为空,再次发送通知;否则重新连接集群中的其他机器。随后把拉取通知的超时时间加倍(上限为maxNotificationInterval),再进入下一次循环。

  6. 如果通知不为空,并且发送者属于集群的投票成员(voting view),则根据通知中的服务器状态分别处理;如果发送者不属于集群成员,则记录警告并忽略该通知。

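  • 如果是looking状态:如果外部的选举周期大于内部的选举周期,更新内部的选举周期,清空收到的投票集合,将收到的选票与自己的初始选票比较epoch、zxid、sid,选出更优的一方作为自己的选票,并发送通知给所有的参与者;如果外部的选举周期小于内部的选举周期,忽略该通知,执行下一次循环;如果外部的选举周期等于内部的选举周期,只有当收到的选票比自己当前的选票更优时,才更新自己的选票并发送通知给所有的参与者。
    然后将收到的这张选票加入到接收到的投票集合中。如果自己当前的选票已经得到半数以上的支持,再从接收队列中循环拉取通知(每次最多等待finalizeWait毫秒):一旦拉取到更优的选票,就把该通知放回接收队列并继续选举;如果在等待时间内没有拉取到新的通知,则更新服务器状态,清空接收队列,返回最终选票。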

  • 如果是observing状态,仅仅日志记录。

  • 如果是following或者leading状态,如果外部的选举周期等于内部的选举周期,如果该通知得到了接收的投票集合中半数以上的支持并且此时leader有效,更新服务器状态,清空接收队列,退出选举过程。否则,将通知的id、以及选票加入到不参与选举的HashMap中,也就是加入到已有集群中。如果该通知在不参与选举的集合(已有集群)中得到了半数以上的支持,并且此时leader有效,更新内部选举周期,更新服务器状态,清空接收队列。

其他

在这里插入图片描述

源码分析

/**
 * Starts a new round of leader election and keeps exchanging notifications
 * until a final vote is chosen, this peer is no longer LOOKING, or the
 * election is stopped.
 *
 * <p>Outline, as implemented below: register a JMX bean (best effort),
 * advance the logical clock and set the initial proposal, broadcast that
 * proposal, then repeatedly poll {@code recvqueue} and react to each
 * notification according to the state reported by its sender.
 *
 * @return the vote this peer settled on, or {@code null} if the loop ends
 *         (peer state is no longer LOOKING, or {@code stop} is set) before
 *         any vote was finalized
 * @throws InterruptedException declared by this method; presumably raised by
 *         the blocking {@code recvqueue} operations (the queue's declaration
 *         is not visible in this excerpt)
 */
public Vote lookForLeader() throws InterruptedException {
    try {
        self.jmxLeaderElectionBean = new LeaderElectionBean();
        MBeanRegistry.getInstance().register(
                self.jmxLeaderElectionBean, self.jmxLocalPeerBean);
    } catch (Exception e) {
        // JMX registration is best effort: log and carry on without a bean.
        LOG.warn("Failed to register with JMX", e);
        self.jmxLeaderElectionBean = null;
    }
    // Record the start time only if it has not been set yet.
    if (self.start_fle == 0) {
       self.start_fle = System.currentTimeMillis();
    }
    try {
        // Votes collected for the current election round, keyed by sender id.
        HashMap<Long, Vote> recvset = new HashMap<Long, Vote>();

        // Votes from peers that reported FOLLOWING or LEADING, keyed by sender id.
        HashMap<Long, Vote> outofelection = new HashMap<Long, Vote>();

        // Current poll timeout in milliseconds; doubled after every empty poll.
        int notTimeout = finalizeWait;

        synchronized(this){
            /* 1. Advance the logical clock, i.e. the election round
             *    (the field is declared outside this excerpt). */
            logicalclock++;
            /* 2. Set the initial proposal from the getInit* helpers and the peer epoch
             *    (presumably this peer's own id, last logged zxid and epoch). */
            updateProposal(getInitId(), getInitLastLoggedZxid(), getPeerEpoch());
        }

        LOG.info("New election. My id =  " + self.getId() +
                ", proposed zxid=0x" + Long.toHexString(proposedZxid));
        /* 3. Broadcast the current proposal. sendNotifications() is defined elsewhere;
         *    presumably it queues one message for every voting participant. */
        sendNotifications();

        /*
         * Loop in which we exchange notifications until we find a leader
         */
        /* Keep going while this peer is still LOOKING and the election was not stopped. */
        while ((self.getPeerState() == ServerState.LOOKING) &&
                (!stop)){
            /*
             * Remove next notification from queue, times out after 2 times
             * the termination time
             */
            /* 4. Poll the receive queue, waiting at most notTimeout milliseconds. */
            Notification n = recvqueue.poll(notTimeout,
                    TimeUnit.MILLISECONDS);

            /*
             * Sends more notifications if haven't received enough.
             * Otherwise processes new notification.
             */
            if(n == null){
                /* Nothing arrived in time. manager.haveDelivered() is not shown here;
                 * presumably it reports whether queued outgoing messages were sent. */
                if(manager.haveDelivered()){
                    /* If so, broadcast the proposal again. */
                    sendNotifications();
                } else {
                    /* Otherwise ask the connection manager to connect to all peers. */
                    manager.connectAll();
                }

                /*
                 * Exponential backoff
                 */
                // Double the timeout, capped at maxNotificationInterval.
                int tmpTimeOut = notTimeout*2;
                notTimeout = (tmpTimeOut < maxNotificationInterval?
                        tmpTimeOut : maxNotificationInterval);
                LOG.info("Notification time out: " + notTimeout);
            }
            /* A notification arrived: handle it only if its sender is in the voting view. */
            else if(self.getVotingView().containsKey(n.sid)) {
                /*
                 * Only proceed if the vote comes from a replica in the
                 * voting view.
                 */
                switch (n.state) {
                case LOOKING:
                    // If notification > current, replace and send messages out
                    if (n.electionEpoch > logicalclock) {
                        // Sender is in a newer round: adopt its epoch, drop the votes
                        // gathered so far, then pick the better of the sender's vote and
                        // our own initial vote, and broadcast the result.
                        logicalclock = n.electionEpoch;
                        recvset.clear();
                        if(totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
                                getInitId(), getInitLastLoggedZxid(), getPeerEpoch())) {
                            updateProposal(n.leader, n.zxid, n.peerEpoch);
                        } else {
                            updateProposal(getInitId(),
                                    getInitLastLoggedZxid(),
                                    getPeerEpoch());
                        }
                        sendNotifications();
                    } else if (n.electionEpoch < logicalclock) {
                        // Sender is in an older round: ignore this notification.
                        if(LOG.isDebugEnabled()){
                            LOG.debug("Notification election epoch is smaller than logicalclock. n.electionEpoch = 0x"
                                    + Long.toHexString(n.electionEpoch)
                                    + ", logicalclock=0x" + Long.toHexString(logicalclock));
                        }
                        break;
                    } else if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
                            proposedLeader, proposedZxid, proposedEpoch)) {
                        // Same round: switch to the sender's vote only when
                        // totalOrderPredicate ranks it above our current proposal.
                        updateProposal(n.leader, n.zxid, n.peerEpoch);
                        sendNotifications();
                    }

                    if(LOG.isDebugEnabled()){
                        LOG.debug("Adding vote: from=" + n.sid +
                                ", proposed leader=" + n.leader +
                                ", proposed zxid=0x" + Long.toHexString(n.zxid) +
                                ", proposed election epoch=0x" + Long.toHexString(n.electionEpoch));
                    }

                    // Record the vote carried by the notification under its sender's id.
                    recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch));

                    /* termPredicate is defined elsewhere; presumably it is true once our
                     * current proposal is backed by a quorum of the votes in recvset. */
                    if (termPredicate(recvset,
                            new Vote(proposedLeader, proposedZxid,
                                    logicalclock, proposedEpoch))) {

                        // Verify if there is any change in the proposed leader
                        /* Drain the receive queue, waiting up to finalizeWait ms per poll. */
                        while((n = recvqueue.poll(finalizeWait,
                                TimeUnit.MILLISECONDS)) != null){
                            /* A notification that outranks our current proposal showed up. */
                            if(totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
                                    proposedLeader, proposedZxid, proposedEpoch)){
                                /* Put it back on the queue so the outer loop processes it. */
                                recvqueue.put(n);
                                break;
                            }
                        }

                        /*
                         * This predicate is true once we don't read any new
                         * relevant message from the reception queue
                         */
                        if (n == null) {
                            // Become LEADING if we proposed ourselves, otherwise take the
                            // state returned by learningState().
                            self.setPeerState((proposedLeader == self.getId()) ?
                                    ServerState.LEADING: learningState());

                            Vote endVote = new Vote(proposedLeader,
                                                    proposedZxid,
                                                    logicalclock,
                                                    proposedEpoch);
                            // leaveInstance is defined elsewhere (presumably it clears the
                            // receive queue before we return the final vote).
                            leaveInstance(endVote);
                            return endVote;
                        }
                    }
                    break;
                case OBSERVING:
                    // Notifications from observers are only logged.
                    LOG.debug("Notification from observer: " + n.sid);
                    break;
                case FOLLOWING:
                case LEADING:
                    /*
                     * Consider all notifications from the same epoch
                     * together.
                     */
                    if(n.electionEpoch == logicalclock){
                        recvset.put(n.sid, new Vote(n.leader,
                                                      n.zxid,
                                                      n.electionEpoch,
                                                      n.peerEpoch));
                        /* ooePredicate is defined elsewhere; presumably it checks that n's
                         * vote has quorum support in recvset and that the leader is valid. */
                        if(ooePredicate(recvset, outofelection, n)) {
                            self.setPeerState((n.leader == self.getId()) ?
                                    ServerState.LEADING: learningState());

                            Vote endVote = new Vote(n.leader, 
                                    n.zxid, 
                                    n.electionEpoch, 
                                    n.peerEpoch);
                            leaveInstance(endVote);
                            return endVote;
                        }
                    }

                    /*
                     * Before joining an established ensemble, verify
                     * a majority is following the same leader.
                     */
                    outofelection.put(n.sid, new Vote(n.version,
                                                        n.leader,
                                                        n.zxid,
                                                        n.electionEpoch,
                                                        n.peerEpoch,
                                                        n.state));
       
                    // Same predicate, this time evaluated only over the votes from peers
                    // that reported FOLLOWING or LEADING.
                    if(ooePredicate(outofelection, outofelection, n)) {
                        synchronized(this){
                            // Adopt the sender's election epoch before leaving the election.
                            logicalclock = n.electionEpoch;
                            self.setPeerState((n.leader == self.getId()) ?
                                    ServerState.LEADING: learningState());
                        }
                        Vote endVote = new Vote(n.leader,
                                                n.zxid,
                                                n.electionEpoch,
                                                n.peerEpoch);
                        leaveInstance(endVote);
                        return endVote;
                    }
                    break;
                default:
                    LOG.warn("Notification state unrecognized: {} (n.state), {} (n.sid)",
                            n.state, n.sid);
                    break;
                }
            } else {
                // Sender is not in the voting view: ignore the notification.
                LOG.warn("Ignoring notification from non-cluster member " + n.sid);
            }
        }
        // The loop ended without finalizing a vote.
        return null;
    } finally {
        // Always unregister the JMX bean if one was registered, then clear the reference.
        try {
            if(self.jmxLeaderElectionBean != null){
                MBeanRegistry.getInstance().unregister(
                        self.jmxLeaderElectionBean);
            }
        } catch (Exception e) {
            LOG.warn("Failed to unregister with JMX", e);
        }
        self.jmxLeaderElectionBean = null;
    }
}


	/**
	 * Checks whether the proposed leader is valid, given the votes seen so far.
	 *
	 * <p>If the proposed leader is another server, that server must itself have
	 * a vote in {@code votes} whose state is LEADING. If the proposed leader is
	 * this server, the given election epoch must equal the local logical clock.
	 *
	 * @param votes         votes received so far, keyed by server id
	 * @param leader        id of the proposed leader
	 * @param electionEpoch election epoch attached to the proposal
	 * @return {@code true} if the proposed leader passes the check described above
	 */
	protected boolean checkLeader(
			HashMap<Long, Vote> votes,
			long leader,
			long electionEpoch) {
		if (leader == self.getId()) {
			// We are the proposed leader: accept only if the epochs match.
			return logicalclock == electionEpoch;
		}

		// Another server is proposed: we must have heard from it, and it must
		// report that it is leading.
		Vote leaderVote = votes.get(leader);
		return leaderVote != null
				&& leaderVote.getState() == ServerState.LEADING;
	}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章