Leader选举概述
-
更新逻辑时钟。
-
初始化投票,投给自己。
-
发送通知给所有的参与者。
-
如果还没有选举出leader,而且这个选举还在进行中,就循环以下步骤。
-
从接收队列中拉取通知。
如果通知为空,从连接管理器中检查所有的阻塞队列,只要有一个队列为空,再次发送通知;否则重新连接集群中的其他机器。
-
如果通知不为空,并且属于集群成员的通知。然后判断通知的状态。
-
如果是looking状态:如果外部的选举周期大于内部的选举周期,更新内部的选举周期,清空收到的投票集合,将外部选票与自己的初始选票依次比较epoch、zxid、sid,选出更优的一方作为自己的选票,发送通知给所有的参与者;如果外部的选举周期小于内部的选举周期,忽略该通知,执行下一次循环;如果外部的选举周期等于内部的选举周期,只有当外部选票优于自己当前的选票时,才更新自己的选票并发送通知给所有的参与者。
然后将收到的选票加入到接收到的投票集合中。如果自己当前的选票在投票集合中得到了半数以上的支持,再从接收队列中循环拉取通知,检查是否还有更优的选票:如果有,将该通知放回接收队列,继续下一次循环;如果没有,更新服务器状态,清空接收队列,退出选举过程。
-
如果是observing状态,仅仅日志记录。
-
如果是following或者leading状态:当外部的选举周期等于内部的选举周期时,先将该通知的选票加入到接收到的投票集合中,如果该通知得到了投票集合中半数以上的支持并且此时leader有效,则更新服务器状态,清空接收队列,退出选举过程。否则,将通知的id以及选票加入到不参与选举的HashMap(outofelection)中,也就是加入到已有集群中;如果该通知在不参与选举的集合(已有集群)中得到了半数以上的支持,并且此时leader有效,则更新内部选举周期,更新服务器状态,清空接收队列,退出选举过程。
其他
源码分析
/**
 * Runs a new round of leader election and returns the vote this peer ends up with.
 *
 * <p>Steps visible in this method:
 * <ol>
 *   <li>Register a {@code LeaderElectionBean} with JMX (a failure is only logged).</li>
 *   <li>Increment {@code logicalclock} and propose this peer's own initial vote.</li>
 *   <li>Call {@code sendNotifications()} to announce the proposal.</li>
 *   <li>While the peer state is {@code LOOKING} and {@code stop} is not set, poll
 *       {@code recvqueue} and handle each notification according to the sender's state.</li>
 *   <li>Always unregister the JMX bean on the way out.</li>
 * </ol>
 *
 * @return the final vote when the election is decided, or {@code null} when the loop ends
 *         without a decision (peer no longer {@code LOOKING}, or {@code stop} was set)
 * @throws InterruptedException declared by this method; presumably propagated from the
 *         blocking {@code recvqueue.poll}/{@code recvqueue.put} calls
 */
public Vote lookForLeader() throws InterruptedException {
try {
self.jmxLeaderElectionBean = new LeaderElectionBean();
MBeanRegistry.getInstance().register(
self.jmxLeaderElectionBean, self.jmxLocalPeerBean);
} catch (Exception e) {
LOG.warn("Failed to register with JMX", e);
self.jmxLeaderElectionBean = null;
}
/* Record the election start time only if it has not been set yet. */
if (self.start_fle == 0) {
self.start_fle = System.currentTimeMillis();
}
try {
/* Votes received during the current election round, keyed by sender sid. */
HashMap<Long, Vote> recvset = new HashMap<Long, Vote>();
/* Votes reported by peers that are already FOLLOWING or LEADING, keyed by sender sid. */
HashMap<Long, Vote> outofelection = new HashMap<Long, Vote>();
/* Current poll timeout in milliseconds; starts at finalizeWait and backs off below. */
int notTimeout = finalizeWait;
synchronized(this){
/* 1. Advance the logical clock. */
/* Original note: logicalclock is a volatile long (declared outside this view); it can also be read as the election round number. */
logicalclock++;
/* 2. Initial vote: propose this peer itself. */
/* myid, zxid, epoch */
updateProposal(getInitId(), getInitLastLoggedZxid(), getPeerEpoch());
}
LOG.info("New election. My id = " + self.getId() +
", proposed zxid=0x" + Long.toHexString(proposedZxid));
/* 3. Send the vote to all participant servers taking part in the election. */
/* Original note: this builds ToSend instances and adds them to a blocking queue (sendNotifications is defined elsewhere). */
sendNotifications();
/*
* Loop in which we exchange notifications until we find a leader
*/
/* No leader has been elected yet and the election has not been stopped. */
while ((self.getPeerState() == ServerState.LOOKING) &&
(!stop)){
/*
* Remove next notification from queue, times out after 2 times
* the termination time
*/
/* 4. Poll a notification from the receive queue (a LinkedBlockingQueue per the original note). */
Notification n = recvqueue.poll(notTimeout,
TimeUnit.MILLISECONDS);
/*
* Sends more notifications if haven't received enough.
* Otherwise processes new notification.
*/
if(n == null){
/* Original note: haveDelivered() checks the connection manager's blocking queues and reports whether some queue is empty, i.e. whether messages have been delivered (implementation not visible here). */
if(manager.haveDelivered()){
/* Delivered: send the notifications again. */
sendNotifications();
} else {
/* Not delivered: reconnect to the other machines in the cluster. */
manager.connectAll();
}
/*
* Exponential backoff
*/
/* Double the poll timeout, capped at maxNotificationInterval. */
int tmpTimeOut = notTimeout*2;
notTimeout = (tmpTimeOut < maxNotificationInterval?
tmpTimeOut : maxNotificationInterval);
LOG.info("Notification time out: " + notTimeout);
}
/* A notification was received and its sender is a member of the voting view. */
else if(self.getVotingView().containsKey(n.sid)) {
/*
* Only proceed if the vote comes from a replica in the
* voting view.
*/
switch (n.state) {
case LOOKING:
// If notification > current, replace and send messages out
/* Newer election round: adopt its epoch, drop the votes collected so far, and */
/* pick between the received vote and this peer's own initial vote. */
if (n.electionEpoch > logicalclock) {
logicalclock = n.electionEpoch;
recvset.clear();
if(totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
getInitId(), getInitLastLoggedZxid(), getPeerEpoch())) {
updateProposal(n.leader, n.zxid, n.peerEpoch);
} else {
updateProposal(getInitId(),
getInitLastLoggedZxid(),
getPeerEpoch());
}
sendNotifications();
} else if (n.electionEpoch < logicalclock) {
/* Older election round: ignore the notification; the break leaves the switch */
/* so the vote is not added to recvset. */
if(LOG.isDebugEnabled()){
LOG.debug("Notification election epoch is smaller than logicalclock. n.electionEpoch = 0x"
+ Long.toHexString(n.electionEpoch)
+ ", logicalclock=0x" + Long.toHexString(logicalclock));
}
break;
} else if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
proposedLeader, proposedZxid, proposedEpoch)) {
/* Same round and the received vote wins the comparison: adopt it and rebroadcast. */
updateProposal(n.leader, n.zxid, n.peerEpoch);
sendNotifications();
}
if(LOG.isDebugEnabled()){
LOG.debug("Adding vote: from=" + n.sid +
", proposed leader=" + n.leader +
", proposed zxid=0x" + Long.toHexString(n.zxid) +
", proposed election epoch=0x" + Long.toHexString(n.electionEpoch));
}
/* Record the vote carried by this notification under the sender's sid. */
recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch));
/* Original note: true when the current proposal is backed by more than half of the votes (termPredicate is defined elsewhere). */
if (termPredicate(recvset,
new Vote(proposedLeader, proposedZxid,
logicalclock, proposedEpoch))) {
// Verify if there is any change in the proposed leader
/* Keep polling the receive queue, waiting up to finalizeWait ms for each notification. */
while((n = recvqueue.poll(finalizeWait,
TimeUnit.MILLISECONDS)) != null){
/* The polled notification beats the current proposal. */
if(totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
proposedLeader, proposedZxid, proposedEpoch)){
/* Put it back on the receive queue so the outer loop processes it. */
recvqueue.put(n);
break;
}
}
/*
* This predicate is true once we don't read any new
* relevant message from the reception queue
*/
if (n == null) {
/* LEADING if the proposed leader is this peer, otherwise the value of learningState(). */
self.setPeerState((proposedLeader == self.getId()) ?
ServerState.LEADING: learningState());
Vote endVote = new Vote(proposedLeader,
proposedZxid,
logicalclock,
proposedEpoch);
leaveInstance(endVote);
return endVote;
}
}
break;
case OBSERVING:
/* Notifications from observers are only logged. */
LOG.debug("Notification from observer: " + n.sid);
break;
case FOLLOWING:
case LEADING:
/*
* Consider all notifications from the same epoch
* together.
*/
if(n.electionEpoch == logicalclock){
recvset.put(n.sid, new Vote(n.leader,
n.zxid,
n.electionEpoch,
n.peerEpoch));
/* Original note: the notification is backed by more than half of the received votes and the leader is valid (ooePredicate is defined elsewhere). */
if(ooePredicate(recvset, outofelection, n)) {
self.setPeerState((n.leader == self.getId()) ?
ServerState.LEADING: learningState());
Vote endVote = new Vote(n.leader,
n.zxid,
n.electionEpoch,
n.peerEpoch);
leaveInstance(endVote);
return endVote;
}
}
/*
* Before joining an established ensemble, verify
* a majority is following the same leader.
*/
outofelection.put(n.sid, new Vote(n.version,
n.leader,
n.zxid,
n.electionEpoch,
n.peerEpoch,
n.state));
if(ooePredicate(outofelection, outofelection, n)) {
synchronized(this){
/* Adopt the election epoch reported by the notification before changing state. */
logicalclock = n.electionEpoch;
self.setPeerState((n.leader == self.getId()) ?
ServerState.LEADING: learningState());
}
Vote endVote = new Vote(n.leader,
n.zxid,
n.electionEpoch,
n.peerEpoch);
leaveInstance(endVote);
return endVote;
}
break;
default:
LOG.warn("Notification state unrecognized: {} (n.state), {} (n.sid)",
n.state, n.sid);
break;
}
} else {
LOG.warn("Ignoring notification from non-cluster member " + n.sid);
}
}
/* The loop ended without a decision (state left LOOKING or stop was set). */
return null;
} finally {
/* Always unregister the JMX bean, whichever path leaves the method. */
try {
if(self.jmxLeaderElectionBean != null){
MBeanRegistry.getInstance().unregister(
self.jmxLeaderElectionBean);
}
} catch (Exception e) {
LOG.warn("Failed to unregister with JMX", e);
}
self.jmxLeaderElectionBean = null;
}
}
/**
 * Checks whether the given leader can be accepted as valid, which decides whether this peer
 * may follow (or act as) that leader.
 *
 * <p>If everyone else thinks I'm the leader, I must be the leader; in that case the only
 * requirement is that {@code electionEpoch} equals this peer's {@code logicalclock}.
 * If the leader is another server, this peer must hold a vote received from that server
 * and that vote must report the {@code LEADING} state.
 *
 * @param votes         votes keyed by the sid of the server that sent them
 * @param leader        sid of the leader being checked
 * @param electionEpoch election epoch attached to the leader claim
 * @return {@code true} if the leader passes the checks above, {@code false} otherwise
 */
protected boolean checkLeader(
        HashMap<Long, Vote> votes,
        long leader,
        long electionEpoch){
    // I am the claimed leader: valid only when the election epochs match.
    if (leader == self.getId()) {
        return logicalclock == electionEpoch;
    }

    // Someone else is the claimed leader: I need a message from it ...
    Vote leaderVote = votes.get(leader);
    if (leaderVote == null) {
        return false;
    }
    // ... and that message must state that it is leading.
    return leaderVote.getState() == ServerState.LEADING;
}