1、查看kafka-reassign-partitions.sh腳本
cd kafka_home/bin
cat kafka-reassign-partitions.sh
#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Hand over to kafka-run-class.sh (located next to this script) to run
# kafka.admin.ReassignPartitionsCommand, forwarding all CLI arguments unchanged.
# The directory lookup is quoted so an install path containing spaces or glob
# characters is not word-split before exec.
exec "$(dirname "$0")"/kafka-run-class.sh kafka.admin.ReassignPartitionsCommand "$@"
由上可得:kafka-reassign-partitions.sh腳本實際執行kafka-run-class.sh腳本,調用kafka.admin.ReassignPartitionsCommand類
2、ReassignPartitionsCommand類詳情
/**
 * Option-driven entry point of the execute action.
 *
 * Reads the file named by `opts.reassignmentJsonFileOpt` into a string, takes the
 * throttle from `opts.throttleOpt` when that option is present (-1 otherwise) and
 * delegates to the string-based `executeAssignment` overload.
 *
 * @param zkUtils handle passed straight through to the overload
 * @param opts    parsed command-line options
 */
def executeAssignment(zkUtils: ZkUtils, opts: ReassignPartitionsCommandOptions): Unit = {
  // Load the reassignment plan (JSON) from the file given on the command line.
  val planPath = opts.options.valueOf(opts.reassignmentJsonFileOpt)
  val planJson = Utils.readFileAsString(planPath)
  // -1 is handed on when no throttle option was supplied.
  val throttleLimit =
    if (opts.options.has(opts.throttleOpt)) opts.options.valueOf(opts.throttleOpt)
    else -1
  executeAssignment(zkUtils, planJson, throttleLimit)
}
/**
 * Executes a reassignment described by a JSON string.
 *
 * The JSON is parsed and validated first. If the reassignment path already exists in
 * ZooKeeper, only `maybeLimit(throttle)` is applied to the running reassignment.
 * Otherwise the current assignment is printed, a warning is shown when a throttle
 * (>= 0) is requested, and the reassignment is started; the outcome is printed.
 *
 * @param zkUtils                handle used for the existence check and passed to helpers
 * @param reassignmentJsonString the reassignment plan as JSON text
 * @param throttle               throttle value; -1 (the default) means none was given
 */
def executeAssignment(zkUtils: ZkUtils, reassignmentJsonString: String, throttle: Long = -1): Unit = {
  val plan = parseAndValidate(zkUtils, reassignmentJsonString)
  val command = new ReassignPartitionsCommand(zkUtils, plan.toMap)

  // If there is an existing rebalance running, attempt to change its throttle
  val alreadyRunning = zkUtils.pathExists(ZkUtils.ReassignPartitionsPath)
  if (alreadyRunning) {
    println("There is an existing assignment running.")
    command.maybeLimit(throttle)
  } else {
    printCurrentAssignment(zkUtils, plan)
    if (throttle >= 0) {
      println("Warning: You must run Verify periodically, until the reassignment completes, to ensure the throttle is removed. You can also alter the throttle by rerunning the Execute command passing a new value.")
    }
    val started = command.reassignPartitions(throttle)
    val outcome =
      if (started) "Successfully started reassignment of partitions."
      else "Failed to reassign partitions %s".format(plan)
    println(outcome)
  }
}
/**
 * Starts the reassignment by creating the reassignment node in ZooKeeper.
 *
 * `maybeThrottle(throttle)` is invoked first (outside the try block). Entries of
 * `proposedAssignment` that do not pass `validatePartition` are dropped; the remaining
 * ones are serialised with `ZkUtils.formatAsReassignmentJson` and written to
 * `ZkUtils.ReassignPartitionsPath` as a persistent path.
 *
 * @param throttle value handed to `maybeThrottle`; defaults to -1
 * @return true when the node was created; false when no partition passed validation
 *         or a non-fatal error occurred (it is logged)
 * @throws AdminCommandFailedException when the reassignment node already exists
 */
def reassignPartitions(throttle: Long = -1): Boolean = {
  maybeThrottle(throttle)
  try {
    val validPartitions = proposedAssignment.filter { case (p, _) => validatePartition(zkUtils, p.topic, p.partition) }
    if (validPartitions.isEmpty) false
    else {
      val jsonReassignmentData = ZkUtils.formatAsReassignmentJson(validPartitions)
      zkUtils.createPersistentPath(ZkUtils.ReassignPartitionsPath, jsonReassignmentData)
      true
    }
  } catch {
    case _: ZkNodeExistsException =>
      // Another reassignment already owns the node: report what is in flight and abort.
      val partitionsBeingReassigned = zkUtils.getPartitionsBeingReassigned()
      throw new AdminCommandFailedException("Partition reassignment currently in " +
        "progress for %s. Aborting operation".format(partitionsBeingReassigned))
    // Only non-fatal failures are turned into a `false` result; fatal errors
    // (VirtualMachineError, InterruptedException, control throwables, ...) propagate
    // instead of being reported as an ordinary failed command.
    case scala.util.control.NonFatal(e) =>
      error("Admin command failed", e)
      false
  }
}
由上可知
- executeAssignment(zkUtils: ZkUtils, opts: ReassignPartitionsCommandOptions)方法獲取ReassignPartitionsCommandOptions參數對應的執行計劃文件內容
- executeAssignment(zkUtils: ZkUtils, reassignmentJsonString: String, throttle: Long = -1)方法解析並校驗執行計劃,檢查是否已有重分配任務正在執行,並傳遞限流值(限流是爲了防止分區重分配/遷移佔用過多網絡帶寬而影響到leader,導致生產消費異常)
- reassignPartitions(throttle: Long = -1)方法中的
zkUtils.createPersistentPath(ZkUtils.ReassignPartitionsPath, jsonReassignmentData)
在zk的/admin/reassign_partitions創建一個執行計劃的節點,至此,execute命令執行結束,zk節點創建完畢,等待監聽器監聽、回調具體的執行邏輯
3、Controller監聽動作
當某個broker當選爲主controller時,會回調 KafkaController.onControllerFailover 方法,該方法通過 registerReassignedPartitionsListener() 註冊了對 “/admin/reassign_partitions” 節點的監聽,如下代碼
/**
* This callback is invoked by the zookeeper leader elector on electing the current broker as the new controller.
* It does the following things on the become-controller state change -
* 1. Register controller epoch changed listener
* 2. Increments the controller epoch
* 3. Initializes the controller's context object that holds cache objects for current topics, live brokers and
* leaders for all existing partitions.
* 4. Starts the controller's channel manager
* 5. Starts the replica state machine
* 6. Starts the partition state machine
* If it encounters any unexpected exception/error while becoming controller, it resigns as the current controller.
* This ensures another controller election will be triggered and there will always be an actively serving controller
*
* As visible in the body: all listeners are registered before initializeControllerContext() is called, the two
* state machines are started afterwards, and only then are maybeTriggerPartitionReassignment() and
* maybeTriggerPreferredReplicaElection() invoked. When isRunning is false, nothing but an info message is produced.
* NOTE(review): no exception handling/resignation is visible in this body - presumably done by the caller; confirm.
*/
def onControllerFailover() {
if(isRunning) {
info("Broker %d starting become controller state transition".format(config.brokerId))
//read controller epoch from zk
readControllerEpochFromZookeeper()
// increment the controller epoch
incrementControllerEpoch(zkUtils.zkClient)
// before reading source of truth from zookeeper, register the listeners to get broker/topic callbacks
// NOTE(review): registerReassignedPartitionsListener() is presumably what installs PartitionsReassignedListener
// on the /admin/reassign_partitions path - the helper body is not shown here; confirm.
registerReassignedPartitionsListener()
registerIsrChangeNotificationListener()
registerPreferredReplicaElectionListener()
partitionStateMachine.registerListeners()
replicaStateMachine.registerListeners()
initializeControllerContext()
replicaStateMachine.startup()
partitionStateMachine.startup()
// register the partition change listeners for all existing topics on failover
controllerContext.allTopics.foreach(topic => partitionStateMachine.registerPartitionChangeListener(topic))
info("Broker %d is ready to serve as the new controller with epoch %d".format(config.brokerId, epoch))
// NOTE(review): presumably resumes reassignments / preferred replica elections that were already
// pending when this broker became controller - confirm against the helper bodies.
maybeTriggerPartitionReassignment()
maybeTriggerPreferredReplicaElection()
/* send partition leadership info to all live brokers */
sendUpdateMetadataRequest(controllerContext.liveOrShuttingDownBrokerIds.toSeq)
// The periodic leader rebalance check is only scheduled when auto leader rebalance is enabled in the config.
if (config.autoLeaderRebalanceEnable) {
info("starting the partition rebalance scheduler")
autoRebalanceScheduler.startup()
// NOTE(review): 5 and leaderImbalanceCheckIntervalSeconds look like initial delay and period (in seconds) -
// confirm against the scheduler's signature.
autoRebalanceScheduler.schedule("partition-rebalance-thread", checkAndTriggerPartitionRebalance,
5, config.leaderImbalanceCheckIntervalSeconds.toLong, TimeUnit.SECONDS)
}
deleteTopicManager.start()
}
else
info("Controller has been shut down, aborting startup/failover")
}
4、Controller處理動作
“/admin/reassign_partitions” 節點的數據變更事件由 PartitionsReassignedListener 的 handleDataChange 方法處理。
實際的遷移邏輯最終由 onPartitionReassignment 方法完成
/**
 * ZooKeeper data listener that reacts to data changes carrying partition reassignment requests.
 *
 * On a data change the payload is parsed into (partition -> new replicas) entries. Entries whose
 * partition is already present in `controllerContext.partitionsBeingReassigned` are ignored.
 * For each remaining entry, under the controller lock:
 *  - if the topic is queued up for deletion, an error is logged and the partition is removed
 *    from the reassigned partitions;
 *  - otherwise the reassignment is handed to
 *    `controller.initiateReassignReplicasForTopicPartition`.
 */
class PartitionsReassignedListener(controller: KafkaController) extends IZkDataListener with Logging {
  this.logIdent = s"[PartitionsReassignedListener on ${controller.config.brokerId}]: "
  val zkUtils = controller.controllerContext.zkUtils
  val controllerContext = controller.controllerContext

  /**
   * Invoked when some partitions are reassigned by the admin command
   *
   * @param dataPath path of the node whose data changed (only used for logging)
   * @param data     node content; its string form is parsed as reassignment data
   * @throws Exception On any error.
   */
  @throws(classOf[Exception])
  def handleDataChange(dataPath: String, data: Object): Unit = {
    debug("Partitions reassigned listener fired for path %s. Record partitions to be reassigned %s"
      .format(dataPath, data))
    val requested = ZkUtils.parsePartitionReassignmentData(data.toString)
    // Drop partitions that are already tracked as being reassigned.
    val notYetStarted = inLock(controllerContext.controllerLock) {
      requested.filterNot { case (topicAndPartition, _) =>
        controllerContext.partitionsBeingReassigned.contains(topicAndPartition)
      }
    }
    notYetStarted.foreach { case (topicAndPartition, newReplicas) =>
      inLock(controllerContext.controllerLock) {
        val topicBeingDeleted = controller.deleteTopicManager.isTopicQueuedUpForDeletion(topicAndPartition.topic)
        if (!topicBeingDeleted) {
          // Hands the partition and its target replicas to the controller, which starts the reassignment.
          controller.initiateReassignReplicasForTopicPartition(
            topicAndPartition, new ReassignedPartitionsContext(newReplicas))
        } else {
          error("Skipping reassignment of partition %s for topic %s since it is currently being deleted"
            .format(topicAndPartition, topicAndPartition.topic))
          controller.removePartitionFromReassignedPartitions(topicAndPartition)
        }
      }
    }
  }

  /**
   * Invoked when the watched node is deleted. Intentionally a no-op.
   *
   * @param dataPath path of the deleted node (unused)
   * @throws Exception On any error.
   */
  @throws(classOf[Exception])
  def handleDataDeleted(dataPath: String): Unit = {
  }
}
controller.initiateReassignReplicasForTopicPartition(partitionToBeReassigned._1, context)
會調用onPartitionReassignment方法按照分配/遷移計劃內容進行具體的數據遷移
/**
 * Validates a requested reassignment for one partition and, when valid, kicks it off.
 *
 * The request is rejected (by throwing a KafkaException that is caught below) when the partition
 * has no entry in `controllerContext.partitionReplicaAssignment`, or when its currently assigned
 * replicas already equal the requested ones. Otherwise an ISR change watch is registered, the
 * partition is recorded in `partitionsBeingReassigned`, the topic is marked ineligible for
 * deletion and `onPartitionReassignment` is invoked.
 *
 * Any Throwable raised along the way is logged and the partition is removed from the reassigned
 * partitions so that the admin client is unblocked; nothing is rethrown to the caller.
 *
 * NOTE(review): liveness of the new replicas is not checked here; the previously computed but
 * never used `aliveNewReplicas` local has been removed.
 *
 * @param topicAndPartition          partition to reassign
 * @param reassignedPartitionContext carries the requested new replica list
 */
def initiateReassignReplicasForTopicPartition(topicAndPartition: TopicAndPartition,
                                              reassignedPartitionContext: ReassignedPartitionsContext): Unit = {
  val newReplicas = reassignedPartitionContext.newReplicas
  val topic = topicAndPartition.topic
  val partition = topicAndPartition.partition
  try {
    controllerContext.partitionReplicaAssignment.get(topicAndPartition) match {
      case Some(assignedReplicas) =>
        if (assignedReplicas == newReplicas) {
          throw new KafkaException("Partition %s to be reassigned is already assigned to replicas".format(topicAndPartition) +
            " %s. Ignoring request for partition reassignment".format(newReplicas.mkString(",")))
        } else {
          info("Handling reassignment of partition %s to new replicas %s".format(topicAndPartition, newReplicas.mkString(",")))
          // first register ISR change listener
          watchIsrChangesForReassignedPartition(topic, partition, reassignedPartitionContext)
          controllerContext.partitionsBeingReassigned.put(topicAndPartition, reassignedPartitionContext)
          // mark topic ineligible for deletion for the partitions being reassigned
          deleteTopicManager.markTopicIneligibleForDeletion(Set(topic))
          // carry out the reassignment according to the plan
          onPartitionReassignment(topicAndPartition, reassignedPartitionContext)
        }
      case None =>
        throw new KafkaException("Attempt to reassign partition %s that doesn't exist"
          .format(topicAndPartition))
    }
  } catch {
    // Deliberately broad: whatever went wrong, the cleanup below must run.
    case e: Throwable =>
      error("Error completing reassignment of partition %s".format(topicAndPartition), e)
      // remove the partition from the admin path to unblock the admin client
      removePartitionFromReassignedPartitions(topicAndPartition)
  }
}
5、onPartitionReassignment核心方法
/**
* This callback is invoked by the reassigned partitions listener. When an admin command initiates a partition
* reassignment, it creates the /admin/reassign_partitions path that triggers the zookeeper listener.
* Reassigning replicas for a partition goes through a few steps listed in the code.
* RAR = Reassigned replicas
* OAR = Original list of replicas for partition
* AR = current assigned replicas
*
* 1. Update AR in ZK with OAR + RAR.
* 2. Send LeaderAndIsr request to every replica in OAR + RAR (with AR as OAR + RAR). We do this by forcing an update
* of the leader epoch in zookeeper.
* 3. Start new replicas RAR - OAR by moving replicas in RAR - OAR to NewReplica state.
* 4. Wait until all replicas in RAR are in sync with the leader.
* 5. Move all replicas in RAR to OnlineReplica state.
* 6. Set AR to RAR in memory.
* 7. If the leader is not in RAR, elect a new leader from RAR. If new leader needs to be elected from RAR, a LeaderAndIsr
* will be sent. If not, then leader epoch will be incremented in zookeeper and a LeaderAndIsr request will be sent.
* In any case, the LeaderAndIsr request will have AR = RAR. This will prevent the leader from adding any replica in
* RAR - OAR back in the isr.
* 8. Move all replicas in OAR - RAR to OfflineReplica state. As part of OfflineReplica state change, we shrink the
* isr to remove OAR - RAR in zookeeper and send a LeaderAndIsr ONLY to the Leader to notify it of the shrunk isr.
* After that, we send a StopReplica (delete = false) to the replicas in OAR - RAR.
* 9. Move all replicas in OAR - RAR to NonExistentReplica state. This will send a StopReplica (delete = true) to
* the replicas in OAR - RAR to physically delete the replicas on disk.
* 10. Update AR in ZK with RAR.
* 11. Update the /admin/reassign_partitions path in ZK to remove this partition.
* 12. After electing leader, the replicas and isr information changes. So resend the update metadata request to every broker.
*
* For example, if OAR = {1, 2, 3} and RAR = {4,5,6}, the values in the assigned replica (AR) and leader/isr path in ZK
* may go through the following transition.
* AR leader/isr
* {1,2,3} 1/{1,2,3} (initial state)
* {1,2,3,4,5,6} 1/{1,2,3} (step 2)
* {1,2,3,4,5,6} 1/{1,2,3,4,5,6} (step 4)
* {1,2,3,4,5,6} 4/{1,2,3,4,5,6} (step 7)
* {1,2,3,4,5,6} 4/{4,5,6} (step 8)
* {4,5,6} 4/{4,5,6} (step 10)
*
* Note that we have to update AR in ZK with RAR last since it's the only place where we store OAR persistently.
* This way, if the controller crashes before that step, we can still recover.
*/
def onPartitionReassignment(topicAndPartition: TopicAndPartition, reassignedPartitionContext: ReassignedPartitionsContext) {
// reassignedPartitionContext describes one partition of the reassignment plan; reassignedReplicas is the
// list of replicas (broker ids) the partition is to be moved to (RAR).
val reassignedReplicas = reassignedPartitionContext.newReplicas
/*
The method has two phases, selected by areReplicasInIsr(topic, partition, reassignedReplicas):
- false: not every new replica is in the ISR yet. The first branch runs steps 1-3 (widen AR to OAR + RAR,
notify the replicas, start the new replicas) and then returns, leaving the new replicas to catch up.
- true: all new replicas are in the ISR. The else branch runs steps 5-12 and completes the reassignment.
NOTE(review): areReplicasInIsr presumably reads the partition's leader/ISR state from ZooKeeper - its body is
not shown here; confirm.
*/
//{1,2,3} 1/{1,2,3} (initial state)
if (!areReplicasInIsr(topicAndPartition.topic, topicAndPartition.partition, reassignedReplicas)) {
info("New replicas %s for partition %s being ".format(reassignedReplicas.mkString(","), topicAndPartition) +
"reassigned not yet caught up with the leader")
// RAR - OAR: replicas in the new list that are not part of the current assignment.
val newReplicasNotInOldReplicaList = reassignedReplicas.toSet -- controllerContext.partitionReplicaAssignment(topicAndPartition).toSet
// OAR + RAR: union of the new list and the current assignment.
val newAndOldReplicas = (reassignedPartitionContext.newReplicas ++ controllerContext.partitionReplicaAssignment(topicAndPartition)).toSet
//1. Update AR in ZK with OAR + RAR.
// NOTE(review): per the original annotation (helper body not shown here) this stores the merged replica list
// for the partition in partitionReplicaAssignment and writes the topic's full assignment to the
// /brokers/topics/<topic> node in ZooKeeper, where the topic change listener picks it up - confirm.
updateAssignedReplicasForPartition(topicAndPartition, newAndOldReplicas.toSeq)
//{1,2,3,4,5,6} 1/{1,2,3} (step 2)
//2. Send LeaderAndIsr request to every replica in OAR + RAR (with AR as OAR + RAR).
// NOTE(review): per the original annotation (helper body not shown here) the helper
// (1) queues a LeaderAndIsrRequest carrying the partition's new replica list for each broker hosting a replica,
// (2) queues an UpdateMetadataRequest for all brokers (leader id -2 if the topic has been deleted, otherwise a
// normal request with the changed replica list), and
// (3) sends both requests to the brokers, provided they are up - confirm.
updateLeaderEpochAndSendRequest(topicAndPartition, controllerContext.partitionReplicaAssignment(topicAndPartition),
newAndOldReplicas.toSeq)
//3. replicas in RAR - OAR -> NewReplica
// NOTE(review): presumably moves each newly added replica to the NewReplica state via replicaStateMachine - confirm.
startNewReplicasForReassignedPartition(topicAndPartition, reassignedPartitionContext, newReplicasNotInOldReplicaList)
info("Waiting for new replicas %s for partition %s being ".format(reassignedReplicas.mkString(","), topicAndPartition) +
"reassigned to catch up with the leader")
} else {
// All replicas of the new assignment are already in the ISR.
//4. Wait until all replicas in RAR are in sync with the leader.
// OAR - RAR: currently assigned replicas that are not part of the new assignment (to be retired below).
val oldReplicas = controllerContext.partitionReplicaAssignment(topicAndPartition).toSet -- reassignedReplicas.toSet
// Move every replica of the NEW assignment (RAR) to the OnlineReplica state.
//5. replicas in RAR -> OnlineReplica
reassignedReplicas.foreach { replica =>
replicaStateMachine.handleStateChanges(Set(new PartitionAndReplica(topicAndPartition.topic, topicAndPartition.partition,
replica)), OnlineReplica)
}
//6. Set AR to RAR in memory.
//7. Send LeaderAndIsr request with a potential new leader (if current leader not in RAR) and
// a new AR (using RAR) and same isr to every broker in RAR
// NOTE(review): per the original annotation (helper body not shown here; no in-memory AR update is visible in
// this method, so step 6 presumably happens inside the helper):
// (1) if the current leader is not in the new replica list, the partition is moved to OnlinePartition via
// partitionStateMachine and a new leader is chosen with ReassignedPartitionLeaderSelector;
// (2) if the current leader is in the list and its broker is alive, LeaderAndIsr/metadata updates are sent;
// (3) if the current leader's broker is offline, a new leader is elected as in (1) - confirm.
moveReassignedPartitionLeaderIfRequired(topicAndPartition, reassignedPartitionContext)
// NOTE(review): per the original annotation (helper body not shown here) the helper
// (1) moves the old replicas to OfflineReplica, sends them a StopReplicaRequest, removes them from the ISR
// stored in ZooKeeper and sends a LeaderAndIsrRequest for the shrunk ISR;
// (2) moves them to ReplicaDeletionStarted and sends a StopReplicaRequest with deletePartition = true;
// (3) moves them to ReplicaDeletionSuccessful;
// (4) removes their state from replicaStateMachine and drops them from partitionReplicaAssignment - confirm.
//8. replicas in OAR - RAR -> Offline (force those replicas out of isr)
//9. replicas in OAR - RAR -> NonExistentReplica (force those replicas to be deleted)
stopOldReplicasOfReassignedPartition(topicAndPartition, reassignedPartitionContext, oldReplicas)
// Same helper as in step 1, now called with RAR only.
//10. Update AR in ZK with RAR.
updateAssignedReplicasForPartition(topicAndPartition, reassignedReplicas)
// NOTE(review): per the original annotation the helper also unregisters the ISR change listener for this
// partition - confirm. The partition is dropped from partitionsBeingReassigned two lines below.
//11. Update the /admin/reassign_partitions path in ZK to remove this partition.
removePartitionFromReassignedPartitions(topicAndPartition)
info("Removed partition %s from the list of reassigned partitions in zookeeper".format(topicAndPartition))
controllerContext.partitionsBeingReassigned.remove(topicAndPartition)
// Send an update metadata request for this partition to all live or shutting-down brokers.
//12. After electing leader, the replicas and isr information changes, so resend the update metadata request to every broker
sendUpdateMetadataRequest(controllerContext.liveOrShuttingDownBrokerIds.toSeq, Set(topicAndPartition))
// signal delete topic thread if reassignment for some partitions belonging to topics being deleted just completed
deleteTopicManager.resumeDeletionForTopics(Set(topicAndPartition.topic))
}
}