一、ZooKeeper's sequential consistency
- Google's Chubby was built to solve leader election among multiple services in a distributed environment: because the network is unreliable, messages can be lost or tampered with (the Byzantine generals problem), so the services use the Paxos algorithm to implement distributed locks and master election. ZooKeeper is an open-source counterpart of Chubby.
- ZooKeeper synchronizes data with the ZAB protocol. What it provides is not strong consistency but a simplified form of it, sequential consistency, as the following points illustrate:
1. Suppose clients B and C read x while it is being modified in ZooKeeper: if B reads x=0, then C may read either 0 or 1; but if B reads x=1 and C reads at a later point in time than B, then C is guaranteed to read x=1. This is ZooKeeper's sequential consistency.
2. Because of network delays, requests sent by clients at the same moment may still arrive in some order, so the values read are not necessarily ordered by the clients' own timelines.
3. ZooKeeper does not guarantee that different clients see exactly the same data view. Suppose clients A and B: A changes x from 1 to 2, yet B may still read x=1. If B must always read the updated value, it can first call the sync method that ZooKeeper provides (see the sketch after this list).
4. ZooKeeper uses the zxid plus blocking queues to guarantee ordered processing of requests. When client A reads data, ZooKeeper returns the data together with a zxid. If client A then disconnects and reconnects to the cluster, landing on a follower that has not yet synchronized that data, A compares the largest zxid it has recorded against the follower's zxid; since A's zxid is larger, the connection attempt fails. Arguably this is what the ZooKeeper website means by "single system image": once a client has connected to ZooKeeper, it will never read older data than it has already seen.
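To make point 3 concrete, here is a minimal sketch of forcing an up-to-date read with the Java client's sync call; the connect string, session timeout and the /x path are assumptions for illustration only:

```java
import java.util.concurrent.CountDownLatch;
import org.apache.zookeeper.ZooKeeper;

public class SyncReadExample {
    public static void main(String[] args) throws Exception {
        // Hypothetical connect string and session timeout.
        ZooKeeper zk = new ZooKeeper("127.0.0.1:2181", 30000, event -> {});

        CountDownLatch synced = new CountDownLatch(1);
        // sync() is asynchronous: it asks the follower this session is
        // attached to to catch up with the leader before the read below.
        zk.sync("/x", (rc, path, ctx) -> synced.countDown(), null);
        synced.await();

        // After sync() completes, this read reflects all writes that
        // committed before the sync was issued.
        byte[] data = zk.getData("/x", false, null);
        System.out.println(new String(data));
        zk.close();
    }
}
```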
二、Analysis of leader election in a ZooKeeper cluster
- When setting up a ZooKeeper cluster, we write a myid file under the /data directory of each installation; assume three nodes with myid values 1, 2 and 3. Then in the /conf directory we edit zoo.cfg and add server.1=127.0.0.1:2887:3887, server.2=127.0.0.1:2888:3888 and server.3=127.0.0.1:2889:3889, one entry per myid; in each entry the first port is used for intra-cluster communication and the second for leader election.
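For concreteness, a sketch of the resulting configuration for the three-node setup assumed above (tickTime and the limits are typical defaults, not prescribed by the text; on a single host each node would additionally need its own clientPort and dataDir):

```properties
# conf/zoo.cfg (same server list on every node)
tickTime=2000
initLimit=10
syncLimit=5
dataDir=/data
clientPort=2181
# server.<myid>=<host>:<quorum port>:<election port>
server.1=127.0.0.1:2887:3887
server.2=127.0.0.1:2888:3888
server.3=127.0.0.1:2889:3889
```

On node 1 the myid file contains just the digit 1 (echo 1 > /data/myid), and likewise 2 and 3 on the other nodes.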
- Starting ZooKeeper runs the sh zkServer.sh script, which executes the main method of the class org.apache.zookeeper.server.quorum.QuorumPeerMain.
1.1. Entry point: QuorumPeerMain's main method

```java
public static void main(String[] args) {
    QuorumPeerMain main = new QuorumPeerMain();
    try {
        // #1.2: parse the startup args (the path to zoo.cfg is passed in)
        main.initializeAndRun(args);
    } catch (IllegalArgumentException e) {
        LOG.error("Invalid arguments, exiting abnormally", e);
        LOG.info(USAGE);
        System.err.println(USAGE);
        System.exit(2);
    } catch (ConfigException e) {
        LOG.error("Invalid config, exiting abnormally", e);
        System.err.println("Invalid config, exiting abnormally");
        System.exit(2);
    } catch (Exception e) {
        LOG.error("Unexpected exception, exiting abnormally", e);
        System.exit(1);
    }
    LOG.info("Exiting normally");
    System.exit(0);
}
```

1.2. initializeAndRun

```java
protected void initializeAndRun(String[] args)
        throws ConfigException, IOException {
    // Holds every parameter parsed out of zoo.cfg (all used later)
    QuorumPeerConfig config = new QuorumPeerConfig();
    if (args.length == 1) {
        // #1.3
        config.parse(args[0]);
    }

    // Start and schedule the purge task
    DatadirCleanupManager purgeMgr = new DatadirCleanupManager(config
            .getDataDir(), config.getDataLogDir(), config
            .getSnapRetainCount(), config.getPurgeInterval());
    purgeMgr.start();

    if (args.length == 1 && config.servers.size() > 0) {
        // #1.4: args.length == 1 and servers are configured -> cluster mode
        runFromConfig(config);
    } else {
        LOG.warn("Either no config or no quorum defined in config, running "
                + " in standalone mode");
        // there is only server in the quorum -- run as standalone
        ZooKeeperServerMain.main(args);
    }
}
```

1.3. QuorumPeerConfig.parse

```java
public void parse(String path) throws ConfigException {
    File configFile = new File(path);
    LOG.info("Reading configuration from: " + configFile);
    try {
        if (!configFile.exists()) {
            throw new IllegalArgumentException(configFile.toString()
                    + " file is missing");
        }
        // Load the file into a java.util.Properties object
        Properties cfg = new Properties();
        FileInputStream in = new FileInputStream(configFile);
        try {
            cfg.load(in);
        } finally {
            in.close();
        }
        // Parse the properties; the three server.N=ip:port1:port2
        // entries are saved here
        parseProperties(cfg);
    } catch (IOException e) {
        throw new ConfigException("Error processing " + path, e);
    } catch (IllegalArgumentException e) {
        throw new ConfigException("Error processing " + path, e);
    }
}
```
1.4. runFromConfig: run from the parsed configuration

```java
public void runFromConfig(QuorumPeerConfig config) throws IOException {
    try {
        ManagedUtil.registerLog4jMBeans();
    } catch (JMException e) {
        LOG.warn("Unable to register log4j JMX control", e);
    }

    LOG.info("Starting quorum peer");
    try {
        // Create the connection factory: NIO by default, Netty if configured
        ServerCnxnFactory cnxnFactory = ServerCnxnFactory.createFactory();
        // #1.5
        cnxnFactory.configure(config.getClientPortAddress(),
                config.getMaxClientCnxns());

        quorumPeer = getQuorumPeer();

        // getView(): the servers parsed from zoo.cfg
        quorumPeer.setQuorumPeers(config.getServers());
        quorumPeer.setTxnFactory(new FileTxnSnapLog(
                new File(config.getDataLogDir()),
                new File(config.getDataDir())));
        // Which election algorithm to use; the default is 3
        quorumPeer.setElectionType(config.getElectionAlg());
        quorumPeer.setMyid(config.getServerId());
        quorumPeer.setTickTime(config.getTickTime());
        quorumPeer.setInitLimit(config.getInitLimit());
        quorumPeer.setSyncLimit(config.getSyncLimit());
        quorumPeer.setQuorumListenOnAllIPs(config.getQuorumListenOnAllIPs());
        // Set the cnxnFactory
        quorumPeer.setCnxnFactory(cnxnFactory);
        quorumPeer.setQuorumVerifier(config.getQuorumVerifier());
        quorumPeer.setClientPortAddress(config.getClientPortAddress());
        quorumPeer.setMinSessionTimeout(config.getMinSessionTimeout());
        quorumPeer.setMaxSessionTimeout(config.getMaxSessionTimeout());
        quorumPeer.setZKDatabase(new ZKDatabase(quorumPeer.getTxnFactory()));
        quorumPeer.setLearnerType(config.getPeerType());
        quorumPeer.setSyncEnabled(config.getSyncEnabled());

        // sets quorum sasl authentication configurations
        quorumPeer.setQuorumSaslEnabled(config.quorumEnableSasl);
        if (quorumPeer.isQuorumSaslAuthEnabled()) {
            quorumPeer.setQuorumServerSaslRequired(config.quorumServerRequireSasl);
            quorumPeer.setQuorumLearnerSaslRequired(config.quorumLearnerRequireSasl);
            quorumPeer.setQuorumServicePrincipal(config.quorumServicePrincipal);
            quorumPeer.setQuorumServerLoginContext(config.quorumServerLoginContext);
            quorumPeer.setQuorumLearnerLoginContext(config.quorumLearnerLoginContext);
        }
        quorumPeer.setQuorumCnxnThreadsSize(config.quorumCnxnThreadsSize);
        quorumPeer.initialize();

        // #1.6
        quorumPeer.start();
        quorumPeer.join();
    } catch (InterruptedException e) {
        // warn, but generally this is ok
        LOG.warn("Quorum Peer interrupted", e);
    }
}
```

1.5. Initializing the communication layer (NIOServerCnxnFactory.configure)

```java
@Override
public void configure(InetSocketAddress addr, int maxcc) throws IOException {
    configureSaslLogin();

    thread = new ZooKeeperThread(this, "NIOServerCxn.Factory:" + addr);
    // Daemon thread: it keeps running as long as user threads are alive,
    // and exits together with them
    thread.setDaemon(true);
    maxClientCnxns = maxcc;
    this.ss = ServerSocketChannel.open();
    // Allow the port to be reused
    ss.socket().setReuseAddress(true);
    LOG.info("binding to port " + addr);
    // Bind the IP and the client port 2181
    ss.socket().bind(addr);
    // Non-blocking mode
    ss.configureBlocking(false);
    // Register for OP_ACCEPT events
    ss.register(selector, SelectionKey.OP_ACCEPT);
}
```

1.6. Starting the QuorumPeer

```java
@Override
public synchronized void start() {
    // Load data (snapshot + txn log) from disk
    loadDataBase();
    // #1.7: cnxnFactory handles client communication on port 2181
    cnxnFactory.start();
    // #2.1: start leader election -> launch the vote listener and
    // initialize the FastLeaderElection algorithm
    startLeaderElection();
    // #4.1: QuorumPeer extends Thread, so Thread.start() -> QuorumPeer.run()
    super.start();
}
```
1.7. The factory thread runs NIOServerCnxnFactory's run method

```java
// Client create/delete/setData requests enter through this loop
public void run() {
    while (!ss.socket().isClosed()) {
        try {
            selector.select(1000);
            Set<SelectionKey> selected;
            synchronized (this) {
                selected = selector.selectedKeys();
            }
            ArrayList<SelectionKey> selectedList =
                new ArrayList<SelectionKey>(selected);
            Collections.shuffle(selectedList);
            for (SelectionKey k : selectedList) {
                if ((k.readyOps() & SelectionKey.OP_ACCEPT) != 0) {
                    SocketChannel sc = ((ServerSocketChannel) k
                            .channel()).accept();
                    InetAddress ia = sc.socket().getInetAddress();
                    int cnxncount = getClientCnxnCount(ia);
                    if (maxClientCnxns > 0 && cnxncount >= maxClientCnxns) {
                        LOG.warn("Too many connections from " + ia
                                + " - max is " + maxClientCnxns);
                        sc.close();
                    } else {
                        LOG.info("Accepted socket connection from "
                                + sc.socket().getRemoteSocketAddress());
                        sc.configureBlocking(false);
                        SelectionKey sk = sc.register(selector,
                                SelectionKey.OP_READ);
                        NIOServerCnxn cnxn = createConnection(sc, sk);
                        sk.attach(cnxn);
                        addCnxn(cnxn);
                    }
                } else if ((k.readyOps()
                        & (SelectionKey.OP_READ | SelectionKey.OP_WRITE)) != 0) {
                    NIOServerCnxn c = (NIOServerCnxn) k.attachment();
                    c.doIO(k);
                } else {
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Unexpected ops in select " + k.readyOps());
                    }
                }
            }
            selected.clear();
        } catch (RuntimeException e) {
            LOG.warn("Ignoring unexpected runtime exception", e);
        } catch (Exception e) {
            LOG.warn("Ignoring exception", e);
        }
    }
    closeAll();
    LOG.info("NIOServerCnxn factory exited run method");
}
```

2.1. Starting leader election

```java
synchronized public void startLeaderElection() {
    try {
        // Build the vote to cast: (myid, zxid, epoch)
        currentVote = new Vote(myid, getLastLoggedZxid(), getCurrentEpoch());
    } catch (IOException e) {
        RuntimeException re = new RuntimeException(e.getMessage());
        re.setStackTrace(e.getStackTrace());
        throw re;
    }
    // getView() returns the saved address map Map<Long, QuorumServer>
    for (QuorumServer p : getView().values()) {
        if (p.id == myid) {
            // e.g. myid 1 -> myQuorumAddr = 192.168.13.102
            myQuorumAddr = p.addr;
            break;
        }
    }
    if (myQuorumAddr == null) {
        throw new RuntimeException("My id " + myid + " not in the peer list");
    }
    // Election strategy
    if (electionType == 0) {
        try {
            udpSocket = new DatagramSocket(myQuorumAddr.getPort());
            responder = new ResponderThread();
            responder.start();
        } catch (SocketException e) {
            throw new RuntimeException(e);
        }
    }
    // #2.2: create the election algorithm
    this.electionAlg = createElectionAlgorithm(electionType);
}
```

2.2. createElectionAlgorithm

```java
protected Election createElectionAlgorithm(int electionAlgorithm) {
    Election le = null;

    // TODO: use a factory rather than a switch
    switch (electionAlgorithm) {
    case 0:
        le = new LeaderElection(this);
        break;
    case 1:
        le = new AuthFastLeaderElection(this);
        break;
    case 2:
        le = new AuthFastLeaderElection(this, true);
        break;
    case 3:
        // QuorumCnxManager receives the votes; responsibilities are
        // cleanly separated
        qcm = createCnxnManager();
        QuorumCnxManager.Listener listener = qcm.listener;
        if (listener != null) {
            // #2.3: start the listener whose worker threads send and
            // receive votes
            listener.start();
            // #3.1: create the FastLeaderElection algorithm to process votes
            le = new FastLeaderElection(this, qcm);
        } else {
            LOG.error("Null listener when initializing cnx manager");
        }
        break;
    default:
        assert false;
    }
    return le;
}
```
2.3. The Listener's run method

```java
@Override
public void run() {
    int numRetries = 0;
    InetSocketAddress addr;
    while ((!shutdown) && (numRetries < 3)) {
        try {
            ss = new ServerSocket();
            ss.setReuseAddress(true);
            if (listenOnAllIPs) {
                int port = view.get(QuorumCnxManager.this.mySid)
                    .electionAddr.getPort();
                addr = new InetSocketAddress(port);
            } else {
                addr = view.get(QuorumCnxManager.this.mySid)
                    .electionAddr;
            }
            LOG.info("My election bind port: " + addr.toString());
            setName(view.get(QuorumCnxManager.this.mySid)
                    .electionAddr.toString());
            ss.bind(addr);
            while (!shutdown) {
                // Blocks here waiting for connections. Jump ahead to 3.1
                // for the election algorithm, then come back to 2.4.
                Socket client = ss.accept();
                setSockOpts(client);
                LOG.info("Received connection request "
                        + client.getRemoteSocketAddress());
                // Receive and handle the connection request
                // asynchronously if the quorum sasl authentication is
                // enabled. This is required because sasl server
                // authentication process may take few seconds to finish,
                // this may delay next peer connection requests.
                if (quorumSaslAuthEnabled) {
                    receiveConnectionAsync(client);
                } else {
                    // #2.4: receive the connection
                    receiveConnection(client);
                }
                numRetries = 0;
            }
        } catch (IOException e) {
            LOG.error("Exception while listening", e);
            numRetries++;
            try {
                ss.close();
                Thread.sleep(1000);
            } catch (IOException ie) {
                LOG.error("Error closing server socket", ie);
            } catch (InterruptedException ie) {
                LOG.error("Interrupted while sleeping. "
                        + "Ignoring exception", ie);
            }
        }
    }
    LOG.info("Leaving listener");
    if (!shutdown) {
        LOG.error("As I'm leaving the listener thread, "
                + "I won't be able to participate in leader "
                + "election any longer: "
                + view.get(QuorumCnxManager.this.mySid).electionAddr);
    }
}
```

2.4. receiveConnection

```java
public void receiveConnection(final Socket sock) {
    DataInputStream din = null;
    try {
        din = new DataInputStream(
                new BufferedInputStream(sock.getInputStream()));
        // #2.5: handle the received packet
        handleConnection(sock, din);
    } catch (IOException e) {
        LOG.error("Exception handling connection, addr: {}, closing server connection",
                sock.getRemoteSocketAddress());
        closeSocket(sock);
    }
}
```

2.5. handleConnection

```java
private void handleConnection(Socket sock, DataInputStream din)
        throws IOException {
    Long sid = null;
    try {
        // Read server id (the peer's myid)
        sid = din.readLong();
        if (sid < 0) {
            // this is not a server id but a protocol version (see ZOOKEEPER-1633)
            sid = din.readLong();
            // next comes the #bytes in the remainder of the message
            // note that 0 bytes is fine (old servers)
            int num_remaining_bytes = din.readInt();
            if (num_remaining_bytes < 0 || num_remaining_bytes > maxBuffer) {
                LOG.error("Unreasonable buffer length: {}", num_remaining_bytes);
                closeSocket(sock);
                return;
            }
            byte[] b = new byte[num_remaining_bytes];
            // remove the remainder of the message from din
            int num_read = din.read(b);
            if (num_read != num_remaining_bytes) {
                LOG.error("Read only " + num_read + " bytes out of "
                        + num_remaining_bytes + " sent by server " + sid);
            }
        }
        if (sid == QuorumPeer.OBSERVER_ID) {
            /*
             * Choose identifier at random. We need a value to identify
             * the connection.
             */
            sid = observerCounter.getAndDecrement();
            LOG.info("Setting arbitrary identifier to observer: " + sid);
        }
    } catch (IOException e) {
        closeSocket(sock);
        LOG.warn("Exception reading or writing challenge: " + e.toString());
        return;
    }

    // do authenticating learner
    LOG.debug("Authenticating learner server.id: {}", sid);
    authServer.authenticate(sock, din);

    // Avoid duplicate connections: only the peer with the larger myid
    // may connect to the one with the smaller myid.
    // If wins the challenge, then close the new connection.
    if (sid < this.mySid) {
        /*
         * This replica might still believe that the connection to sid is
         * up, so we have to shut down the workers before trying to open a
         * new connection.
         */
        SendWorker sw = senderWorkerMap.get(sid);
        if (sw != null) {
            sw.finish();
        }

        /*
         * Now we start a new connection
         */
        LOG.debug("Create new connection to server: " + sid);
        closeSocket(sock);
        // Initiate the connection from our side instead
        connectOne(sid);

    // Otherwise start worker threads to receive data.
    } else {
        SendWorker sw = new SendWorker(sock, sid);
        RecvWorker rw = new RecvWorker(sock, din, sid, sw);
        sw.setRecv(rw);

        SendWorker vsw = senderWorkerMap.get(sid);
        if (vsw != null)
            vsw.finish();

        senderWorkerMap.put(sid, sw);
        queueSendMap.putIfAbsent(sid,
                new ArrayBlockingQueue<ByteBuffer>(SEND_CAPACITY));

        sw.start();
        rw.start();

        return;
    }
}
```

3.1. The FastLeaderElection constructor calls starter, which creates two blocking queues and a new Messenger

```java
private void starter(QuorumPeer self, QuorumCnxManager manager) {
    this.self = self;
    proposedLeader = -1;
    proposedZxid = -1;

    // Outbound queue
    sendqueue = new LinkedBlockingQueue<ToSend>();
    // Inbound queue
    recvqueue = new LinkedBlockingQueue<Notification>();
    // #3.2
    this.messenger = new Messenger(manager);
}
```

3.2. The Messenger constructor

```java
Messenger(QuorumCnxManager manager) {
    // #3.3: the thread that sends votes (consumes sendqueue)
    this.ws = new WorkerSender(manager);

    Thread t = new Thread(this.ws,
            "WorkerSender[myid=" + self.getId() + "]");
    // Daemon thread
    t.setDaemon(true);
    t.start();

    // #3.4: the thread that receives votes (feeds recvqueue)
    this.wr = new WorkerReceiver(manager);

    t = new Thread(this.wr,
            "WorkerReceiver[myid=" + self.getId() + "]");
    t.setDaemon(true);
    t.start();
}
```

3.3. WorkerSender

```java
public void run() {
    while (!stop) {
        try {
            // Poll the blocking send queue with a timeout
            ToSend m = sendqueue.poll(3000, TimeUnit.MILLISECONDS);
            if (m == null) continue;

            process(m);
        } catch (InterruptedException e) {
            break;
        }
    }
    LOG.info("WorkerSender is down");
}

/**
 * Called by run() once there is a new message to send.
 *
 * @param m     message to send
 */
void process(ToSend m) {
    ByteBuffer requestBuffer = buildMsg(m.state.ordinal(),
            m.leader,
            m.zxid,
            m.electionEpoch,
            m.peerEpoch);
    // #3.3.1
    manager.toSend(m.sid, requestBuffer);
}
```

3.3.1. QuorumCnxManager.toSend

```java
public void toSend(Long sid, ByteBuffer b) {
    /*
     * If sending message to myself, then simply enqueue it (loopback).
     */
    if (this.mySid == sid) {
        b.position(0);
        addToRecvQueue(new Message(b.duplicate(), sid));
    /*
     * Otherwise send to the corresponding thread to send.
     */
    } else {
        /*
         * Start a new connection if doesn't have one already.
         */
        ArrayBlockingQueue<ByteBuffer> bq =
            new ArrayBlockingQueue<ByteBuffer>(SEND_CAPACITY);
        ArrayBlockingQueue<ByteBuffer> bqExisting =
            queueSendMap.putIfAbsent(sid, bq);
        if (bqExisting != null) {
            // Enqueue for sending
            addToSendQueue(bqExisting, b);
        } else {
            addToSendQueue(bq, b);
        }
        // #3.3.2: connect to the machine identified by sid
        connectOne(sid);
    }
}
```

3.3.2. connectOne() → initiateConnection() → startConnection()

```java
private boolean startConnection(Socket sock, Long sid)
        throws IOException {
    DataOutputStream dout = null;
    DataInputStream din = null;
    try {
        // Sending id and challenge
        dout = new DataOutputStream(sock.getOutputStream());
        dout.writeLong(this.mySid);
        dout.flush();

        din = new DataInputStream(
                new BufferedInputStream(sock.getInputStream()));
    } catch (IOException e) {
        LOG.warn("Ignoring exception reading or writing challenge: ", e);
        closeSocket(sock);
        return false;
    }

    // authenticate learner
    authLearner.authenticate(sock, view.get(sid).hostname);

    // Avoid duplicate connections: the larger myid actively connects
    // to the smaller one.
    // If lost the challenge, then drop the new connection
    if (sid > this.mySid) {
        LOG.info("Have smaller server identifier, so dropping the "
                + "connection: (" + sid + ", " + this.mySid + ")");
        closeSocket(sock);
        // Otherwise proceed with the connection
    } else {
        // Start a thread that sends votes over the socket created by
        // WorkerSender, consuming QuorumCnxManager's queueSendMap
        SendWorker sw = new SendWorker(sock, sid);
        // Start a thread that puts received votes into QuorumCnxManager's
        // recvQueue
        RecvWorker rw = new RecvWorker(sock, din, sid, sw);
        sw.setRecv(rw);

        SendWorker vsw = senderWorkerMap.get(sid);
        if (vsw != null)
            vsw.finish();

        senderWorkerMap.put(sid, sw);
        queueSendMap.putIfAbsent(sid,
                new ArrayBlockingQueue<ByteBuffer>(SEND_CAPACITY));

        sw.start();
        rw.start();

        return true;
    }
    return false;
}
```

3.4. WorkerReceiver

```java
public void run() {
    Message response;
    while (!stop) {
        // Sleeps on receive
        try {
            // Take a vote out of QuorumCnxManager's recvQueue
            response = manager.pollRecvQueue(3000, TimeUnit.MILLISECONDS);
            if (response == null) continue;

            /*
             * If it is from an observer, respond right away.
             * Note that the following predicate assumes that
             * if a server is not a follower, then it must be
             * an observer. If we ever have any other type of
             * learner in the future, we'll have to change the
             * way we check for observers.
             */
            if (!validVoter(response.sid)) {
                Vote current = self.getCurrentVote();
                ToSend notmsg = new ToSend(ToSend.mType.notification,
                        current.getId(),
                        current.getZxid(),
                        logicalclock.get(),
                        self.getPeerState(),
                        response.sid,
                        current.getPeerEpoch());

                sendqueue.offer(notmsg);
            } else {
                // Receive new message
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Receive new notification message. My id = "
                            + self.getId());
                }

                /*
                 * We check for 28 bytes for backward compatibility
                 */
                if (response.buffer.capacity() < 28) {
                    LOG.error("Got a short response: "
                            + response.buffer.capacity());
                    continue;
                }
                boolean backCompatibility = (response.buffer.capacity() == 28);
                response.buffer.clear();

                // Instantiate Notification and set its attributes
                Notification n = new Notification();

                // State of peer that sent this message
                QuorumPeer.ServerState ackstate = QuorumPeer.ServerState.LOOKING;
                switch (response.buffer.getInt()) {
                case 0:
                    ackstate = QuorumPeer.ServerState.LOOKING;
                    break;
                case 1:
                    ackstate = QuorumPeer.ServerState.FOLLOWING;
                    break;
                case 2:
                    ackstate = QuorumPeer.ServerState.LEADING;
                    break;
                case 3:
                    ackstate = QuorumPeer.ServerState.OBSERVING;
                    break;
                default:
                    continue;
                }

                n.leader = response.buffer.getLong();
                n.zxid = response.buffer.getLong();
                n.electionEpoch = response.buffer.getLong();
                n.state = ackstate;
                n.sid = response.sid;
                if (!backCompatibility) {
                    n.peerEpoch = response.buffer.getLong();
                } else {
                    if (LOG.isInfoEnabled()) {
                        LOG.info("Backward compatibility mode, server id="
                                + n.sid);
                    }
                    n.peerEpoch = ZxidUtils.getEpochFromZxid(n.zxid);
                }

                /*
                 * Version added in 3.4.6
                 */
                n.version = (response.buffer.remaining() >= 4) ?
                        response.buffer.getInt() : 0x0;

                /*
                 * Print notification info
                 */
                if (LOG.isInfoEnabled()) {
                    printNotification(n);
                }

                /*
                 * If this server is looking, then send proposed leader
                 */
                if (self.getPeerState() == QuorumPeer.ServerState.LOOKING) {
                    recvqueue.offer(n);

                    /*
                     * Send a notification back if the peer that sent this
                     * message is also looking and its logical clock is
                     * lagging behind.
                     */
                    if ((ackstate == QuorumPeer.ServerState.LOOKING)
                            && (n.electionEpoch < logicalclock.get())) {
                        Vote v = getVote();
                        ToSend notmsg = new ToSend(ToSend.mType.notification,
                                v.getId(),
                                v.getZxid(),
                                logicalclock.get(),
                                self.getPeerState(),
                                response.sid,
                                v.getPeerEpoch());
                        sendqueue.offer(notmsg);
                    }
                } else {
                    /*
                     * If this server is not looking, but the one that sent the ack
                     * is looking, then send back what it believes to be the leader.
                     */
                    Vote current = self.getCurrentVote();
                    if (ackstate == QuorumPeer.ServerState.LOOKING) {
                        if (LOG.isDebugEnabled()) {
                            LOG.debug("Sending new notification. My id = "
                                    + self.getId() + " recipient="
                                    + response.sid + " zxid=0x"
                                    + Long.toHexString(current.getZxid())
                                    + " leader=" + current.getId());
                        }

                        ToSend notmsg;
                        if (n.version > 0x0) {
                            notmsg = new ToSend(
                                    ToSend.mType.notification,
                                    current.getId(),
                                    current.getZxid(),
                                    current.getElectionEpoch(),
                                    self.getPeerState(),
                                    response.sid,
                                    current.getPeerEpoch());
                        } else {
                            Vote bcVote = self.getBCVote();
                            notmsg = new ToSend(
                                    ToSend.mType.notification,
                                    bcVote.getId(),
                                    bcVote.getZxid(),
                                    bcVote.getElectionEpoch(),
                                    self.getPeerState(),
                                    response.sid,
                                    bcVote.getPeerEpoch());
                        }
                        sendqueue.offer(notmsg);
                    }
                }
            }
        } catch (InterruptedException e) {
            System.out.println("Interrupted Exception while waiting for new message"
                    + e.toString());
        }
    }
    LOG.info("WorkerReceiver is down");
}
```
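Aside: the 28-byte backward-compatibility check above follows from the wire layout that buildMsg writes: state (int, 4 bytes) + leader (long) + zxid (long) + electionEpoch (long) = 28 bytes in the pre-3.4.6 format, with peerEpoch (long) and version (int) appended later. A minimal sketch that round-trips one notification using that layout (field order reconstructed from the parsing code above; all values are made up):

```java
import java.nio.ByteBuffer;

public class NotificationWire {
    public static void main(String[] args) {
        // state(int) + leader(long) + zxid(long) + electionEpoch(long) = 28 bytes;
        // peerEpoch(long) and version(int) extend the buffer to 40.
        ByteBuffer b = ByteBuffer.allocate(40);
        b.putInt(0);        // state: 0 = LOOKING
        b.putLong(3L);      // proposed leader (myid)
        b.putLong(0x100L);  // zxid of the proposed leader
        b.putLong(1L);      // electionEpoch (logical clock)
        b.putLong(1L);      // peerEpoch
        b.putInt(0x1);      // version (added in 3.4.6)
        b.clear();          // rewind for reading, like WorkerReceiver does

        boolean backCompatibility = (b.capacity() == 28);
        int state = b.getInt();
        long leader = b.getLong();
        long zxid = b.getLong();
        long electionEpoch = b.getLong();
        // In a 28-byte message the peerEpoch is derived from the zxid instead
        long peerEpoch = backCompatibility ? (zxid >> 32) : b.getLong();
        int version = (b.remaining() >= 4) ? b.getInt() : 0x0;

        System.out.println("state=" + state + " leader=" + leader
                + " zxid=0x" + Long.toHexString(zxid)
                + " electionEpoch=" + electionEpoch
                + " peerEpoch=" + peerEpoch + " version=" + version);
    }
}
```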
4.1. The QuorumPeer thread runs and drives the election

```java
@Override
public void run() {
    setName("QuorumPeer" + "[myid=" + getId() + "]"
            + cnxnFactory.getLocalAddress());

    LOG.debug("Starting quorum peer");
    try {
        jmxQuorumBean = new QuorumBean(this);
        MBeanRegistry.getInstance().register(jmxQuorumBean, null);
        for (QuorumServer s : getView().values()) {
            ZKMBeanInfo p;
            if (getId() == s.id) {
                p = jmxLocalPeerBean = new LocalPeerBean(this);
                try {
                    MBeanRegistry.getInstance().register(p, jmxQuorumBean);
                } catch (Exception e) {
                    LOG.warn("Failed to register with JMX", e);
                    jmxLocalPeerBean = null;
                }
            } else {
                p = new RemotePeerBean(s);
                try {
                    MBeanRegistry.getInstance().register(p, jmxQuorumBean);
                } catch (Exception e) {
                    LOG.warn("Failed to register with JMX", e);
                }
            }
        }
    } catch (Exception e) {
        LOG.warn("Failed to register with JMX", e);
        jmxQuorumBean = null;
    }

    try {
        /*
         * Main loop (runs until shutdown)
         */
        while (running) {
            // On first startup the state is LOOKING
            switch (getPeerState()) {
            case LOOKING:
                LOG.info("LOOKING");

                if (Boolean.getBoolean("readonlymode.enabled")) {
                    LOG.info("Attempting to start ReadOnlyZooKeeperServer");

                    // Create read-only server but don't start it immediately
                    final ReadOnlyZooKeeperServer roZk = new ReadOnlyZooKeeperServer(
                            logFactory, this,
                            new ZooKeeperServer.BasicDataTreeBuilder(),
                            this.zkDb);

                    // Instead of starting roZk immediately, wait some grace
                    // period before we decide we're partitioned.
                    //
                    // Thread is used here because otherwise it would require
                    // changes in each of election strategy classes which is
                    // unnecessary code coupling.
                    Thread roZkMgr = new Thread() {
                        public void run() {
                            try {
                                // lower-bound grace period to 2 secs
                                sleep(Math.max(2000, tickTime));
                                if (ServerState.LOOKING.equals(getPeerState())) {
                                    roZk.startup();
                                }
                            } catch (InterruptedException e) {
                                LOG.info("Interrupted while attempting to start ReadOnlyZooKeeperServer, not started");
                            } catch (Exception e) {
                                LOG.error("FAILED to start ReadOnlyZooKeeperServer", e);
                            }
                        }
                    };
                    try {
                        roZkMgr.start();
                        setBCVote(null);
                        setCurrentVote(makeLEStrategy().lookForLeader());
                    } catch (Exception e) {
                        LOG.warn("Unexpected exception", e);
                        setPeerState(ServerState.LOOKING);
                    } finally {
                        // If the thread is in the grace period, interrupt
                        // to come out of waiting.
                        roZkMgr.interrupt();
                        roZk.shutdown();
                    }
                } else {
                    try {
                        setBCVote(null);
                        // #4.1.1: setCurrentVote -> the leader is decided here
                        setCurrentVote(makeLEStrategy().lookForLeader());
                    } catch (Exception e) {
                        LOG.warn("Unexpected exception", e);
                        setPeerState(ServerState.LOOKING);
                    }
                }
                break;
            case OBSERVING:
                try {
                    LOG.info("OBSERVING");
                    setObserver(makeObserver(logFactory));
                    observer.observeLeader();
                } catch (Exception e) {
                    LOG.warn("Unexpected exception", e);
                } finally {
                    observer.shutdown();
                    setObserver(null);
                    setPeerState(ServerState.LOOKING);
                }
                break;
            case FOLLOWING:
                try {
                    LOG.info("FOLLOWING");
                    setFollower(makeFollower(logFactory));
                    // Connect to the leader
                    follower.followLeader();
                } catch (Exception e) {
                    LOG.warn("Unexpected exception", e);
                } finally {
                    follower.shutdown();
                    setFollower(null);
                    setPeerState(ServerState.LOOKING);
                }
                break;
            case LEADING:
                LOG.info("LEADING");
                try {
                    setLeader(makeLeader(logFactory));
                    // Run as leader
                    leader.lead();
                    setLeader(null);
                } catch (Exception e) {
                    LOG.warn("Unexpected exception", e);
                } finally {
                    if (leader != null) {
                        leader.shutdown("Forcing shutdown");
                        setLeader(null);
                    }
                    setPeerState(ServerState.LOOKING);
                }
                break;
            }
        }
    } finally {
        LOG.warn("QuorumPeer main thread exited");
        try {
            MBeanRegistry.getInstance().unregisterAll();
        } catch (Exception e) {
            LOG.warn("Failed to unregister with JMX", e);
        }
        jmxQuorumBean = null;
        jmxLocalPeerBean = null;
    }
}
```

4.1.1. FastLeaderElection.lookForLeader

```java
public Vote lookForLeader() throws InterruptedException {
    try {
        self.jmxLeaderElectionBean = new LeaderElectionBean();
        MBeanRegistry.getInstance().register(
                self.jmxLeaderElectionBean, self.jmxLocalPeerBean);
    } catch (Exception e) {
        LOG.warn("Failed to register with JMX", e);
        self.jmxLeaderElectionBean = null;
    }
    if (self.start_fle == 0) {
        self.start_fle = Time.currentElapsedTime();
    }
    try {
        // Votes received in the current election round
        HashMap<Long, Vote> recvset = new HashMap<Long, Vote>();

        // Votes from peers that are already FOLLOWING/LEADING
        HashMap<Long, Vote> outofelection = new HashMap<Long, Vote>();

        int notTimeout = finalizeWait;

        synchronized (this) {
            // Logical clock -> election epoch
            logicalclock.incrementAndGet();
            // Propose myself first
            updateProposal(getInitId(), getInitLastLoggedZxid(), getPeerEpoch());
        }

        LOG.info("New election. My id =  " + self.getId()
                + ", proposed zxid=0x" + Long.toHexString(proposedZxid));
        // Broadcast my own vote
        sendNotifications();

        /*
         * Loop in which we exchange notifications until we find a leader
         */
        while ((self.getPeerState() == ServerState.LOOKING) && (!stop)) {
            /*
             * Remove next notification from queue, times out after 2 times
             * the termination time
             */
            // recvqueue holds the Notifications received from other machines
            Notification n = recvqueue.poll(notTimeout, TimeUnit.MILLISECONDS);

            /*
             * Sends more notifications if haven't received enough.
             * Otherwise processes new notification.
             */
            if (n == null) {
                if (manager.haveDelivered()) {
                    sendNotifications();
                } else {
                    // Reconnect to every node in the cluster
                    manager.connectAll();
                }

                /*
                 * Exponential backoff
                 */
                int tmpTimeOut = notTimeout * 2;
                notTimeout = (tmpTimeOut < maxNotificationInterval ?
                        tmpTimeOut : maxNotificationInterval);
                LOG.info("Notification time out: " + notTimeout);
            } else if (validVoter(n.sid) && validVoter(n.leader)) {
                // The vote is valid: only proceed if it comes from a replica
                // in the voting view, for a replica in the voting view
                switch (n.state) {
                case LOOKING:
                    // On first startup we land in this case.
                    // If notification > current, replace and send messages out
                    if (n.electionEpoch > logicalclock.get()) {
                        logicalclock.set(n.electionEpoch);
                        // Clear the votes collected so far
                        recvset.clear();
                        // Decide whose vote this server should adopt (it could
                        // be server1's, server2's or server3's): the ZAB leader
                        // election comparison
                        if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
                                getInitId(), getInitLastLoggedZxid(),
                                getPeerEpoch())) {
                            // Adopt the peer's vote, so the next vote we send
                            // out is the new one
                            updateProposal(n.leader, n.zxid, n.peerEpoch);
                        } else {
                            // The received vote is smaller than this node's;
                            // keep sending our own
                            updateProposal(getInitId(),
                                    getInitLastLoggedZxid(),
                                    getPeerEpoch());
                        }
                        // Keep broadcasting
                        sendNotifications();
                    } else if (n.electionEpoch < logicalclock.get()) {
                        // The received vote is from an older epoch; drop it
                        if (LOG.isDebugEnabled()) {
                            LOG.debug("Notification election epoch is smaller than logicalclock. n.electionEpoch = 0x"
                                    + Long.toHexString(n.electionEpoch)
                                    + ", logicalclock=0x"
                                    + Long.toHexString(logicalclock.get()));
                        }
                        break;
                    } else if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
                            proposedLeader, proposedZxid, proposedEpoch)) {
                        // Same epoch: compare epoch, zxid and myid in that
                        // order; if the peer's vote wins, adopt it
                        updateProposal(n.leader, n.zxid, n.peerEpoch);
                        sendNotifications();
                    }

                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Adding vote: from=" + n.sid
                                + ", proposed leader=" + n.leader
                                + ", proposed zxid=0x" + Long.toHexString(n.zxid)
                                + ", proposed election epoch=0x"
                                + Long.toHexString(n.electionEpoch));
                    }

                    recvset.put(n.sid, new Vote(n.leader, n.zxid,
                            n.electionEpoch, n.peerEpoch));

                    // Decision point: tally this node's updated vote against
                    // the votes collected in recvset
                    if (termPredicate(recvset,
                            new Vote(proposedLeader, proposedZxid,
                                    logicalclock.get(), proposedEpoch))) {

                        // Reaching here means the vote satisfies the quorum
                        // requirement. Before changing state, wait up to
                        // finalizeWait ms for new votes so no decisive vote
                        // is missed; if a vote arrives that could change the
                        // leader, recount.
                        // Verify if there is any change in the proposed leader
                        while ((n = recvqueue.poll(finalizeWait,
                                TimeUnit.MILLISECONDS)) != null) {
                            if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
                                    proposedLeader, proposedZxid, proposedEpoch)) {
                                recvqueue.put(n);
                                break;
                            }
                        }

                        /*
                         * This predicate is true once we don't read any new
                         * relevant message from the reception queue
                         */
                        // If the notification is null, the leader is settled
                        if (n == null) {
                            // Set this node's state: if the elected leader is
                            // myself, become LEADING; otherwise FOLLOWING or
                            // OBSERVING depending on this node's type
                            self.setPeerState((proposedLeader == self.getId()) ?
                                    ServerState.LEADING : learningState());

                            Vote endVote = new Vote(proposedLeader,
                                    proposedZxid,
                                    logicalclock.get(),
                                    proposedEpoch);
                            leaveInstance(endVote);
                            // Return the winning vote of this election
                            return endVote;
                        }
                    }
                    break;
                case OBSERVING:
                    LOG.debug("Notification from observer: " + n.sid);
                    break;
                case FOLLOWING:
                case LEADING:
                    /*
                     * Consider all notifications from the same epoch
                     * together.
                     */
                    if (n.electionEpoch == logicalclock.get()) {
                        recvset.put(n.sid, new Vote(n.leader,
                                n.zxid, n.electionEpoch, n.peerEpoch));

                        if (ooePredicate(recvset, outofelection, n)) {
                            self.setPeerState((n.leader == self.getId()) ?
                                    ServerState.LEADING : learningState());

                            Vote endVote = new Vote(n.leader,
                                    n.zxid, n.electionEpoch, n.peerEpoch);
                            leaveInstance(endVote);
                            return endVote;
                        }
                    }

                    /*
                     * Before joining an established ensemble, verify
                     * a majority is following the same leader.
                     */
                    outofelection.put(n.sid, new Vote(n.version,
                            n.leader, n.zxid, n.electionEpoch,
                            n.peerEpoch, n.state));

                    if (ooePredicate(outofelection, outofelection, n)) {
                        synchronized (this) {
                            logicalclock.set(n.electionEpoch);
                            self.setPeerState((n.leader == self.getId()) ?
                                    ServerState.LEADING : learningState());
                        }
                        Vote endVote = new Vote(n.leader, n.zxid,
                                n.electionEpoch, n.peerEpoch);
                        leaveInstance(endVote);
                        return endVote;
                    }
                    break;
                default:
                    LOG.warn("Notification state unrecognized: {} (n.state), {} (n.sid)",
                            n.state, n.sid);
                    break;
                }
            } else {
                if (!validVoter(n.leader)) {
                    LOG.warn("Ignoring notification for non-cluster member sid {} from sid {}",
                            n.leader, n.sid);
                }
                if (!validVoter(n.sid)) {
                    LOG.warn("Ignoring notification for sid {} from non-quorum member sid {}",
                            n.leader, n.sid);
                }
            }
        }
        return null;
    } finally {
        try {
            if (self.jmxLeaderElectionBean != null) {
                MBeanRegistry.getInstance().unregister(
                        self.jmxLeaderElectionBean);
            }
        } catch (Exception e) {
            LOG.warn("Failed to unregister with JMX", e);
        }
        self.jmxLeaderElectionBean = null;
        LOG.debug("Number of connection processing threads: {}",
                manager.getConnectionThreadCount());
    }
}
```
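The walkthrough above leans on totalOrderPredicate without listing it. The rule it implements: a received vote wins if its epoch is larger; on equal epochs the larger zxid wins; on equal zxids the larger myid wins. A self-contained sketch of that comparison follows (the real method additionally consults the quorum verifier's vote weight, which is omitted here):

```java
public class VoteComparison {
    /** True if the vote (newId, newZxid, newEpoch) beats the current one. */
    static boolean totalOrderPredicate(long newId, long newZxid, long newEpoch,
                                       long curId, long curZxid, long curEpoch) {
        return (newEpoch > curEpoch)
                || (newEpoch == curEpoch
                    && (newZxid > curZxid
                        || (newZxid == curZxid && newId > curId)));
    }

    public static void main(String[] args) {
        // Same epoch and zxid: the larger myid wins (server 3 beats server 1).
        System.out.println(totalOrderPredicate(3, 0x100, 1, 1, 0x100, 1)); // true
        // A larger zxid beats a larger myid.
        System.out.println(totalOrderPredicate(1, 0x101, 1, 3, 0x100, 1)); // true
    }
}
```

This is also why, in a freshly started cluster where every zxid is equal, the node with the largest myid among the servers that have exchanged votes so far ends up as leader.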
- Voting flow diagram
- ZooKeeper election flow diagram