Blog: http://blog.csdn.net/yueqian_zhu/
This section walks through Master election and the processing flow that follows.
As noted in the previous section, during Master startup the Akka actor's preStart method is invoked first.
override def preStart() {
  logInfo("Starting Spark master at " + masterUrl)
  logInfo(s"Running Spark version ${org.apache.spark.SPARK_VERSION}")
  // Listen for remote client disconnection events, since they don't go through Akka's watch()
  context.system.eventStream.subscribe(self, classOf[RemotingLifecycleEvent])
  webUi.bind()
  masterWebUiUrl = "http://" + masterPublicAddress + ":" + webUi.boundPort
  context.system.scheduler.schedule(0 millis, WORKER_TIMEOUT millis, self, CheckForWorkerTimeOut)
  masterMetricsSystem.registerSource(masterSource)
  masterMetricsSystem.start()
  applicationMetricsSystem.start()
  // Attach the master and app metrics servlet handler to the web ui after the metrics systems are
  // started.
  masterMetricsSystem.getServletHandlers.foreach(webUi.attachHandler)
  applicationMetricsSystem.getServletHandlers.foreach(webUi.attachHandler)
  // The HA flow starts here.
  // RECOVERY_MODE decides where the Master's metadata is persisted; we use ZooKeeper as the example.
  // Apache Curator serves as the ZooKeeper client, wrapping ZooKeeper's complex client API.
  val (persistenceEngine_, leaderElectionAgent_) = RECOVERY_MODE match {
    case "ZOOKEEPER" =>
      logInfo("Persisting recovery state to ZooKeeper")
      val zkFactory =
        new ZooKeeperRecoveryModeFactory(conf, SerializationExtension(context.system))
      (zkFactory.createPersistenceEngine(), zkFactory.createLeaderElectionAgent(this))
    case "FILESYSTEM" =>
      val fsFactory =
        new FileSystemRecoveryModeFactory(conf, SerializationExtension(context.system))
      (fsFactory.createPersistenceEngine(), fsFactory.createLeaderElectionAgent(this))
    case "CUSTOM" =>
      val clazz = Class.forName(conf.get("spark.deploy.recoveryMode.factory"))
      val factory = clazz.getConstructor(classOf[SparkConf], classOf[Serialization])
        .newInstance(conf, SerializationExtension(context.system))
        .asInstanceOf[StandaloneRecoveryModeFactory]
      (factory.createPersistenceEngine(), factory.createLeaderElectionAgent(this))
    case _ =>
      (new BlackHolePersistenceEngine(), new MonarchyLeaderAgent(this))
  }
  persistenceEngine = persistenceEngine_
  leaderElectionAgent = leaderElectionAgent_
}
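For reference, RECOVERY_MODE above is driven by configuration. A minimal sketch of the settings that select the ZOOKEEPER branch (the quorum address is a placeholder):

import org.apache.spark.SparkConf

// Minimal sketch: the settings that drive the RECOVERY_MODE match above.
// "zk1:2181,zk2:2181,zk3:2181" is a placeholder quorum, not a real deployment.
val conf = new SparkConf()
  .set("spark.deploy.recoveryMode", "ZOOKEEPER")                   // selects the ZOOKEEPER branch
  .set("spark.deploy.zookeeper.url", "zk1:2181,zk2:2181,zk3:2181") // ZooKeeper quorum
  .set("spark.deploy.zookeeper.dir", "/spark")                     // base znode for recovery state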
persistenceEngine_ above encapsulates reading and writing metadata in ZooKeeper, along with the serialization/deserialization interfaces.
leaderElectionAgent_ encapsulates the master election process; see the explanations in the code comments below.
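The PersistenceEngine contract itself is small; a trimmed sketch (method names follow the Spark trait, but helper methods and implicits are elided):

import scala.reflect.ClassTag

// Trimmed sketch of the PersistenceEngine contract, not the full Spark source.
trait PersistenceEngine {
  def persist(name: String, obj: Object)        // write one object under a key
  def unpersist(name: String)                   // remove the object stored under a key
  def read[T: ClassTag](prefix: String): Seq[T] // read every object whose key starts with prefix
  // readPersistedData() builds on read() and returns the persisted
  // (apps, drivers, workers) triple consumed in the ElectedLeader handler below.
}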
private[master] class ZooKeeperLeaderElectionAgent(val masterActor: LeaderElectable,
    conf: SparkConf) extends LeaderLatchListener with LeaderElectionAgent with Logging {

  // Leader election hinges on a znode under this path in ZooKeeper
  val WORKING_DIR = conf.get("spark.deploy.zookeeper.dir", "/spark") + "/leader_election"

  private var zk: CuratorFramework = _
  private var leaderLatch: LeaderLatch = _
  private var status = LeadershipStatus.NOT_LEADER

  // start() is invoked as soon as this object is constructed
  start()

  // Once leaderLatch.start() is called, the LeaderLatch negotiates with every other
  // LeaderLatch using the same latch path, and exactly one of them is chosen as leader
  private def start() {
    logInfo("Starting ZooKeeper LeaderElection agent")
    zk = SparkCuratorUtil.newClient(conf)
    leaderLatch = new LeaderLatch(zk, WORKING_DIR)
    leaderLatch.addListener(this)
    leaderLatch.start()
  }

  override def stop() {
    leaderLatch.close()
    zk.close()
  }

  // isLeader() is called back on the master that wins this election round and becomes active
  override def isLeader() {
    synchronized {
      // could have lost leadership by now.
      if (!leaderLatch.hasLeadership) {
        return
      }
      logInfo("We have gained leadership")
      updateLeadershipStatus(true)
    }
  }

  // notLeader() is called back on a master that loses this election round and becomes standby
  override def notLeader() {
    synchronized {
      // could have gained leadership by now.
      if (leaderLatch.hasLeadership) {
        return
      }
      logInfo("We have lost leadership")
      updateLeadershipStatus(false)
    }
  }

  private def updateLeadershipStatus(isLeader: Boolean) {
    // previously standby, now elected leader
    if (isLeader && status == LeadershipStatus.NOT_LEADER) {
      status = LeadershipStatus.LEADER
      masterActor.electedLeader() // invokes the Master's electedLeader method
    // previously leader, now demoted to standby
    } else if (!isLeader && status == LeadershipStatus.LEADER) {
      status = LeadershipStatus.NOT_LEADER
      masterActor.revokedLeadership() // invokes the Master's revokedLeadership method
    }
  }

  private object LeadershipStatus extends Enumeration {
    type LeadershipStatus = Value
    val LEADER, NOT_LEADER = Value
  }
}
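Stripped of the Spark wiring, the Curator latch mechanics look like this. A self-contained sketch (connect string, path, and timings are placeholders); run two copies and kill the leader to watch the other take over:

import org.apache.curator.framework.CuratorFrameworkFactory
import org.apache.curator.framework.recipes.leader.{LeaderLatch, LeaderLatchListener}
import org.apache.curator.retry.ExponentialBackoffRetry

object LeaderLatchDemo extends App {
  // Placeholder connect string; any reachable ZooKeeper quorum works.
  val zk = CuratorFrameworkFactory.newClient(
    "localhost:2181", new ExponentialBackoffRetry(1000, 3))
  zk.start()

  // Every process that creates a LeaderLatch on the same path joins the same election.
  val latch = new LeaderLatch(zk, "/demo/leader_election")
  latch.addListener(new LeaderLatchListener {
    // The same two callbacks ZooKeeperLeaderElectionAgent implements above.
    override def isLeader(): Unit = println("gained leadership")
    override def notLeader(): Unit = println("lost leadership")
  })
  latch.start() // join the election; exactly one participant per path holds leadership

  Thread.sleep(30000) // hold our position for a while, then step down
  latch.close()       // releases leadership; another participant takes over
  zk.close()
}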
Back to the logic in Master:
override def receiveWithLogging: PartialFunction[Any, Unit] = {
  case ElectedLeader => {
    // Having just moved from standby to leader, the master reads the necessary
    // state from ZooKeeper to rebuild its metadata
    val (storedApps, storedDrivers, storedWorkers) = persistenceEngine.readPersistedData()
    state = if (storedApps.isEmpty && storedDrivers.isEmpty && storedWorkers.isEmpty) {
      RecoveryState.ALIVE // nothing to rebuild, go straight to ALIVE
    } else {
      RecoveryState.RECOVERING // otherwise enter the recovering state
    }
    logInfo("I have been elected leader! New state: " + state)
    if (state == RecoveryState.RECOVERING) {
      beginRecovery(storedApps, storedDrivers, storedWorkers) // explained below
      recoveryCompletionTask = context.system.scheduler.scheduleOnce(WORKER_TIMEOUT millis, self,
        CompleteRecovery)
    }
  }

  case CompleteRecovery => completeRecovery()

  // previously leader, now demoted to standby: no extra work is needed, just exit
  case RevokedLeadership => {
    logError("Leadership has been revoked -- master shutting down.")
    System.exit(0)
  }
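The states seen here (ALIVE, RECOVERING, and later COMPLETING_RECOVERY) come from a small enumeration; for orientation, it looks like this in the Spark source (access modifier may vary by version):

// Master lifecycle states; on failover a standby moves
// STANDBY -> RECOVERING -> COMPLETING_RECOVERY -> ALIVE.
private[spark] object RecoveryState extends Enumeration {
  type MasterState = Value
  val STANDBY, ALIVE, RECOVERING, COMPLETING_RECOVERY = Value
}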
Recovery then begins:

private def beginRecovery(storedApps: Seq[ApplicationInfo], storedDrivers: Seq[DriverInfo],
    storedWorkers: Seq[WorkerInfo]) {
  for (app <- storedApps) {
    logInfo("Trying to recover app: " + app.id)
    try {
      registerApplication(app) // load the recovered app into memory
      app.state = ApplicationState.UNKNOWN // mark its state UNKNOWN
      app.driver ! MasterChanged(masterUrl, masterWebUiUrl) // tell the driver the master changed
    } catch {
      case e: Exception => logInfo("App " + app.id + " had exception on reconnect")
    }
  }

  for (driver <- storedDrivers) {
    // Here we just read in the list of drivers. Any drivers associated with now-lost workers
    // will be re-launched when we detect that the worker is missing.
    drivers += driver // load the recovered driver into memory
  }

  for (worker <- storedWorkers) {
    logInfo("Trying to recover worker: " + worker.id)
    try {
      registerWorker(worker) // load the recovered worker info into memory
      worker.state = WorkerState.UNKNOWN // mark it UNKNOWN; it only counts as usable once it replies
      worker.actor ! MasterChanged(masterUrl, masterWebUiUrl) // tell the worker the master changed
    } catch {
      case e: Exception => logInfo("Worker " + worker.id + " had exception on reconnect")
    }
  }
}
What happens on the driver side when MasterChanged arrives? See AppClient.scala. Only the active master sends MasterChanged, so the masterUrl here is guaranteed to be that of the new active master.
case MasterChanged(masterUrl, masterWebUiUrl) =>
  logInfo("Master has changed, new master is at " + masterUrl)
  // On receiving this message, the driver replaces its saved master info so that
  // subsequent communication targets the new master
  changeMaster(masterUrl)
  alreadyDisconnected = false
  sender ! MasterChangeAcknowledged(appId) // reply to the master with MasterChangeAcknowledged
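changeMaster itself is small; a simplified sketch of what it does on the driver side (field names approximated from the Spark 1.x AppClient, not quoted verbatim):

// Approximate sketch: swap the saved master URL and re-resolve the Akka actor
// reference, so every later message goes to the new active master.
def changeMaster(url: String) {
  activeMasterUrl = url
  master = context.actorSelection(Master.toAkkaUrl(activeMasterUrl))
  masterAddress = Master.toAkkaAddress(activeMasterUrl)
}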
The master then receives this acknowledgement from every app's driver. Here is how the master handles MasterChangeAcknowledged; the argument is the appId:
case MasterChangeAcknowledged(appId) => {
  idToApp.get(appId) match {
    case Some(app) =>
      logInfo("Application has been re-registered: " + appId)
      app.state = ApplicationState.WAITING // on receipt, mark the app WAITING
    case None =>
      logWarning("Master change ack from unknown app: " + appId)
  }
  // an early check for whether all acknowledgements are in, so completeRecovery
  // can run without waiting out the full WORKER_TIMEOUT interval
  if (canCompleteRecovery) { completeRecovery() }
}
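canCompleteRecovery, used here and again below, is essentially a readiness check over whatever is still marked UNKNOWN; in Master.scala it is a one-liner:

// Recovery can complete early once no worker or app remains UNKNOWN.
def canCompleteRecovery =
  workers.count(_.state == WorkerState.UNKNOWN) == 0 &&
  apps.count(_.state == ApplicationState.UNKNOWN) == 0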
What happens on the worker side when MasterChanged arrives? See Worker.scala:
case MasterChanged(masterUrl, masterWebUiUrl) =>
  logInfo("Master has changed, new master is at " + masterUrl)
  changeMaster(masterUrl, masterWebUiUrl) // same idea as on the driver side
  // The master never talks to executors directly, so the worker reports its
  // executors' state back to the master
  val execs = executors.values.
    map(e => new ExecutorDescription(e.appId, e.execId, e.cores, e.state))
  sender ! WorkerSchedulerStateResponse(workerId, execs.toList, drivers.keys.toSeq)
Back in Master, the response is handled as follows:
case WorkerSchedulerStateResponse(workerId, executors, driverIds) => {
  idToWorker.get(workerId) match {
    case Some(worker) =>
      logInfo("Worker has been re-registered: " + workerId)
      worker.state = WorkerState.ALIVE // flip the worker from UNKNOWN to ALIVE, marking it usable

      // fold the reported executor info back into the corresponding app and worker
      val validExecutors = executors.filter(exec => idToApp.get(exec.appId).isDefined)
      for (exec <- validExecutors) {
        val app = idToApp.get(exec.appId).get
        val execInfo = app.addExecutor(worker, exec.cores, Some(exec.execId))
        worker.addExecutor(execInfo)
        execInfo.copyState(exec)
      }

      // update the master's driver records and mark them RUNNING
      for (driverId <- driverIds) {
        drivers.find(_.id == driverId).foreach { driver =>
          driver.worker = Some(worker)
          driver.state = DriverState.RUNNING
          worker.drivers(driverId) = driver
        }
      }
    case None =>
      logWarning("Scheduler state from unknown worker: " + workerId)
  }
  if (canCompleteRecovery) { completeRecovery() } // same early check as above
}
Once all of this messaging is done, look at the master's completeRecovery. It runs WORKER_TIMEOUT after beginRecovery was called; under normal conditions the message round-trips above have all finished by then.
private def completeRecovery() {
  // Ensure "only-once" recovery semantics using a short synchronization period.
  synchronized {
    if (state != RecoveryState.RECOVERING) { return }
    state = RecoveryState.COMPLETING_RECOVERY // recovery is now in its completion phase
  }

  // Kill off any workers and apps that didn't respond to us.
  // i.e. clean up whatever still has not re-registered after the WORKER_TIMEOUT interval
  workers.filter(_.state == WorkerState.UNKNOWN).foreach(removeWorker)
  apps.filter(_.state == ApplicationState.UNKNOWN).foreach(finishApplication)

  // Reschedule drivers which were not claimed by any workers
  // A driver whose worker reference was never restored during the message exchange
  // is either relaunched or removed
  drivers.filter(_.worker.isEmpty).foreach { d =>
    logWarning(s"Driver ${d.id} was not found after master recovery")
    if (d.desc.supervise) {
      logWarning(s"Re-launching ${d.id}")
      relaunchDriver(d)
    } else {
      removeDriver(d.id, DriverState.ERROR, None)
      logWarning(s"Did not re-launch ${d.id} because it was not supervised")
    }
  }

  state = RecoveryState.ALIVE // recovery is now truly finished
  schedule() // with the whole election flow over, run scheduling once more
  logInfo("Recovery complete - resuming operations!")
}