[Source Code Analysis] Kafka Partition Reassignment/Migration (kafka-reassign-partitions.sh)

1. Inspect the kafka-reassign-partitions.sh script

cd kafka_home/bin
cat kafka-reassign-partitions.sh


#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

exec $(dirname $0)/kafka-run-class.sh kafka.admin.ReassignPartitionsCommand "$@"

From the above: kafka-reassign-partitions.sh simply delegates to kafka-run-class.sh, which invokes the kafka.admin.ReassignPartitionsCommand class.
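For context, a reassignment is normally driven end to end through this CLI. Below is a minimal sketch of the usual generate/execute workflow; the topic name (my-topic), target broker ids (4,5,6) and the ZooKeeper address are hypothetical placeholders, and the --throttle value is in bytes/sec.

# 1) Generate a candidate plan for the topics listed in topics.json (hypothetical topic name)
cat > topics.json <<'EOF'
{"version": 1, "topics": [{"topic": "my-topic"}]}
EOF
bin/kafka-reassign-partitions.sh --zookeeper localhost:2181 \
  --topics-to-move-json-file topics.json --broker-list "4,5,6" --generate

# 2) Save the proposed assignment as reassign.json, then execute it (optionally throttled)
bin/kafka-reassign-partitions.sh --zookeeper localhost:2181 \
  --reassignment-json-file reassign.json --execute --throttle 50000000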

2. Inside the ReassignPartitionsCommand class


  def executeAssignment(zkUtils: ZkUtils, opts: ReassignPartitionsCommandOptions) {
    // Read the path of the reassignment plan (JSON file) from the command-line options
    val reassignmentJsonFile =  opts.options.valueOf(opts.reassignmentJsonFileOpt)
    // Read the JSON plan file into a string
    val reassignmentJsonString = Utils.readFileAsString(reassignmentJsonFile)
    // If a throttle option was supplied, read it and pass it to executeAssignment; otherwise default to -1 (no throttle)
    val throttle = if (opts.options.has(opts.throttleOpt)) opts.options.valueOf(opts.throttleOpt) else -1
    executeAssignment(zkUtils, reassignmentJsonString, throttle)
  }


  def executeAssignment(zkUtils: ZkUtils, reassignmentJsonString: String, throttle: Long = -1) {
    val partitionsToBeReassigned = parseAndValidate(zkUtils, reassignmentJsonString)
    val reassignPartitionsCommand = new ReassignPartitionsCommand(zkUtils, partitionsToBeReassigned.toMap)

    // If there is an existing rebalance running, attempt to change its throttle
    if (zkUtils.pathExists(ZkUtils.ReassignPartitionsPath)) {
      println("There is an existing assignment running.")
      reassignPartitionsCommand.maybeLimit(throttle)
    }
    else {
      printCurrentAssignment(zkUtils, partitionsToBeReassigned)
      if (throttle >= 0)
        println(String.format("Warning: You must run Verify periodically, until the reassignment completes, to ensure the throttle is removed. You can also alter the throttle by rerunning the Execute command passing a new value."))
      if (reassignPartitionsCommand.reassignPartitions(throttle)) {
        println("Successfully started reassignment of partitions.")
      } else
        println("Failed to reassign partitions %s".format(partitionsToBeReassigned))
    }
  }
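The warning printed above is worth acting on: when a throttle is set, it is only removed once the verify step reports the reassignment as complete. A minimal sketch, reusing the hypothetical reassign.json and ZooKeeper address from the earlier example:

# Re-run until every partition reports "completed successfully";
# on completion this also removes the replication throttle applied by --execute --throttle.
bin/kafka-reassign-partitions.sh --zookeeper localhost:2181 \
  --reassignment-json-file reassign.json --verify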



  def reassignPartitions(throttle: Long = -1): Boolean = {
    maybeThrottle(throttle)
    try {
      val validPartitions = proposedAssignment.filter { case (p, _) => validatePartition(zkUtils, p.topic, p.partition) }
      if (validPartitions.isEmpty) false
      else {
        val jsonReassignmentData = ZkUtils.formatAsReassignmentJson(validPartitions)
        zkUtils.createPersistentPath(ZkUtils.ReassignPartitionsPath, jsonReassignmentData)
        true
      }
    } catch {
      case ze: ZkNodeExistsException =>
        val partitionsBeingReassigned = zkUtils.getPartitionsBeingReassigned()
        throw new AdminCommandFailedException("Partition reassignment currently in " +
        "progress for %s. Aborting operation".format(partitionsBeingReassigned))
      case e: Throwable => error("Admin command failed", e); false
    }
  }

From the above we can see:

  • executeAssignment(zkUtils: ZkUtils, opts: ReassignPartitionsCommandOptions) reads the reassignment plan file specified in the ReassignPartitionsCommandOptions.
  • executeAssignment(zkUtils: ZkUtils, reassignmentJsonString: String, throttle: Long = -1) parses and validates the plan and applies the throttle value, if any (the throttle limits replication traffic so the reassignment/migration does not saturate the leader's network and disrupt producers and consumers).
  • In reassignPartitions(throttle: Long = -1), zkUtils.createPersistentPath(ZkUtils.ReassignPartitionsPath, jsonReassignmentData) writes the plan to the /admin/reassign_partitions node in ZooKeeper. At this point the execute command is finished: the znode has been created, and the actual work is left to the controller's listener, which will be notified and run the reassignment logic (see the sketch below for how to inspect this znode).
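To see what createPersistentPath actually writes, the znode can be read directly. A minimal sketch using zookeeper-shell.sh; the JSON layout is what formatAsReassignmentJson produces, while the topic name and replica ids are hypothetical:

bin/zookeeper-shell.sh localhost:2181 get /admin/reassign_partitions
# Typical content while a reassignment is pending:
# {"version":1,"partitions":[{"topic":"my-topic","partition":0,"replicas":[4,5,6]}]}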

3. The controller's listener registration

When a broker is elected as the controller, the ZooKeeper leader elector invokes KafkaController.onControllerFailover. Among other things, this method registers a listener for the "/admin/reassign_partitions" path, as shown in the code below.

 /**
   * This callback is invoked by the zookeeper leader elector on electing the current broker as the new controller.
   * It does the following things on the become-controller state change -
   * 1. Register controller epoch changed listener
   * 2. Increments the controller epoch
   * 3. Initializes the controller's context object that holds cache objects for current topics, live brokers and
   *    leaders for all existing partitions.
   * 4. Starts the controller's channel manager
   * 5. Starts the replica state machine
   * 6. Starts the partition state machine
   * If it encounters any unexpected exception/error while becoming controller, it resigns as the current controller.
   * This ensures another controller election will be triggered and there will always be an actively serving controller
   */
  def onControllerFailover() {
    if(isRunning) {
      info("Broker %d starting become controller state transition".format(config.brokerId))
      //read controller epoch from zk
      readControllerEpochFromZookeeper()
      // increment the controller epoch
      incrementControllerEpoch(zkUtils.zkClient)
      // before reading source of truth from zookeeper, register the listeners to get broker/topic callbacks
      registerReassignedPartitionsListener()
      registerIsrChangeNotificationListener()
      registerPreferredReplicaElectionListener()
      partitionStateMachine.registerListeners()
      replicaStateMachine.registerListeners()
      initializeControllerContext()
      replicaStateMachine.startup()
      partitionStateMachine.startup()
      // register the partition change listeners for all existing topics on failover
      controllerContext.allTopics.foreach(topic => partitionStateMachine.registerPartitionChangeListener(topic))
      info("Broker %d is ready to serve as the new controller with epoch %d".format(config.brokerId, epoch))
      maybeTriggerPartitionReassignment()
      maybeTriggerPreferredReplicaElection()
      /* send partition leadership info to all live brokers */
      sendUpdateMetadataRequest(controllerContext.liveOrShuttingDownBrokerIds.toSeq)
      if (config.autoLeaderRebalanceEnable) {
        info("starting the partition rebalance scheduler")
        autoRebalanceScheduler.startup()
        autoRebalanceScheduler.schedule("partition-rebalance-thread", checkAndTriggerPartitionRebalance,
          5, config.leaderImbalanceCheckIntervalSeconds.toLong, TimeUnit.SECONDS)
      }
      deleteTopicManager.start()
    }
    else
      info("Controller has been shut down, aborting startup/failover")
  }
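Only the active controller registers this listener, so when debugging it helps to know which broker currently holds that role. A minimal sketch (the exact fields of the znode may differ slightly between versions):

bin/zookeeper-shell.sh localhost:2181 get /controller
# Example output: {"version":1,"brokerid":1,"timestamp":"1545123456789"}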

4. The controller's handling logic

The data-change event on /admin/reassign_partitions is handled by PartitionsReassignedListener.handleDataChange, and the actual reassignment work is ultimately carried out by the onPartitionReassignment method.

/**
 * Starts the partition reassignment process unless -
 * 1. Partition previously existed
 * 2. New replicas are the same as existing replicas
 * 3. Any replica in the new set of replicas are dead
 * If any of the above conditions are satisfied, it logs an error and removes the partition from list of reassigned
 * partitions.
 */
class PartitionsReassignedListener(controller: KafkaController) extends IZkDataListener with Logging {
  this.logIdent = "[PartitionsReassignedListener on " + controller.config.brokerId + "]: "
  val zkUtils = controller.controllerContext.zkUtils
  val controllerContext = controller.controllerContext

  /**
   * Invoked when some partitions are reassigned by the admin command
   *
   * @throws Exception On any error.
   */
  @throws(classOf[Exception])
  def handleDataChange(dataPath: String, data: Object) {
    debug("Partitions reassigned listener fired for path %s. Record partitions to be reassigned %s"
      .format(dataPath, data))
    val partitionsReassignmentData = ZkUtils.parsePartitionReassignmentData(data.toString)
    val partitionsToBeReassigned = inLock(controllerContext.controllerLock) {
      partitionsReassignmentData.filterNot(p => controllerContext.partitionsBeingReassigned.contains(p._1))
    }
    partitionsToBeReassigned.foreach { partitionToBeReassigned =>
      inLock(controllerContext.controllerLock) {
        if(controller.deleteTopicManager.isTopicQueuedUpForDeletion(partitionToBeReassigned._1.topic)) {
          error("Skipping reassignment of partition %s for topic %s since it is currently being deleted"
            .format(partitionToBeReassigned._1, partitionToBeReassigned._1.topic))
          controller.removePartitionFromReassignedPartitions(partitionToBeReassigned._1)
        } else {
          val context = new ReassignedPartitionsContext(partitionToBeReassigned._2)
          // initiateReassignReplicasForTopicPartition calls onPartitionReassignment, which performs the actual
          // data migration according to the reassignment plan
          controller.initiateReassignReplicasForTopicPartition(partitionToBeReassigned._1, context)
        }
      }
    }
  }

  /**
   * Called when the leader information stored in zookeeper has been delete. Try to elect as the leader
   *
   * @throws Exception On any error.
   */
  @throws(classOf[Exception])
  def handleDataDeleted(dataPath: String) {
  }
}

controller.initiateReassignReplicasForTopicPartition(partitionToBeReassigned._1, context) calls onPartitionReassignment, which performs the actual data migration according to the reassignment plan:

def initiateReassignReplicasForTopicPartition(topicAndPartition: TopicAndPartition,
                                        reassignedPartitionContext: ReassignedPartitionsContext) {
    val newReplicas = reassignedPartitionContext.newReplicas
    val topic = topicAndPartition.topic
    val partition = topicAndPartition.partition
    val aliveNewReplicas = newReplicas.filter(r => controllerContext.liveBrokerIds.contains(r))
    try {
      val assignedReplicasOpt = controllerContext.partitionReplicaAssignment.get(topicAndPartition)
      assignedReplicasOpt match {
        case Some(assignedReplicas) =>
          if(assignedReplicas == newReplicas) {
            throw new KafkaException("Partition %s to be reassigned is already assigned to replicas".format(topicAndPartition) +
              " %s. Ignoring request for partition reassignment".format(newReplicas.mkString(",")))
          } else {
            info("Handling reassignment of partition %s to new replicas %s".format(topicAndPartition, newReplicas.mkString(",")))
            // first register ISR change listener
            watchIsrChangesForReassignedPartition(topic, partition, reassignedPartitionContext)
            controllerContext.partitionsBeingReassigned.put(topicAndPartition, reassignedPartitionContext)
            // mark topic ineligible for deletion for the partitions being reassigned
            deleteTopicManager.markTopicIneligibleForDeletion(Set(topic))
            // Perform the actual data migration according to the reassignment plan
            onPartitionReassignment(topicAndPartition, reassignedPartitionContext)
          }
        case None => throw new KafkaException("Attempt to reassign partition %s that doesn't exist"
          .format(topicAndPartition))
      }
    } catch {
      case e: Throwable => error("Error completing reassignment of partition %s".format(topicAndPartition), e)
      // remove the partition from the admin path to unblock the admin client
      removePartitionFromReassignedPartitions(topicAndPartition)
    }
  }
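Both error branches above (a plan identical to the current assignment, and a partition that does not exist) are easy to avoid by checking the current assignment before submitting a plan. A minimal sketch with kafka-topics.sh, using the hypothetical topic name from earlier:

# Shows each partition's current replica list (AR), leader and ISR,
# which can be compared against the replicas in the reassignment plan.
bin/kafka-topics.sh --zookeeper localhost:2181 --describe --topic my-topic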

5. The core method: onPartitionReassignment

/**
   * This callback is invoked by the reassigned partitions listener. When an admin command initiates a partition
   * reassignment, it creates the /admin/reassign_partitions path that triggers the zookeeper listener.
   * Reassigning replicas for a partition goes through a few steps listed in the code.
   * RAR = Reassigned replicas
   * OAR = Original list of replicas for partition
   * AR = current assigned replicas
   *
   * 1. Update AR in ZK with OAR + RAR.
   * 2. Send LeaderAndIsr request to every replica in OAR + RAR (with AR as OAR + RAR). We do this by forcing an update
   *    of the leader epoch in zookeeper.
   * 3. Start new replicas RAR - OAR by moving replicas in RAR - OAR to NewReplica state.
   * 4. Wait until all replicas in RAR are in sync with the leader.
   * 5  Move all replicas in RAR to OnlineReplica state.
   * 6. Set AR to RAR in memory.
   * 7. If the leader is not in RAR, elect a new leader from RAR. If new leader needs to be elected from RAR, a LeaderAndIsr
   *    will be sent. If not, then leader epoch will be incremented in zookeeper and a LeaderAndIsr request will be sent.
   *    In any case, the LeaderAndIsr request will have AR = RAR. This will prevent the leader from adding any replica in
   *    RAR - OAR back in the isr.
   * 8. Move all replicas in OAR - RAR to OfflineReplica state. As part of OfflineReplica state change, we shrink the
   *    isr to remove OAR - RAR in zookeeper and send a LeaderAndIsr ONLY to the Leader to notify it of the shrunk isr.
   *    After that, we send a StopReplica (delete = false) to the replicas in OAR - RAR.
   * 9. Move all replicas in OAR - RAR to NonExistentReplica state. This will send a StopReplica (delete = true) to
   *    the replicas in OAR - RAR to physically delete the replicas on disk.
   * 10. Update AR in ZK with RAR.
   * 11. Update the /admin/reassign_partitions path in ZK to remove this partition.
   * 12. After electing leader, the replicas and isr information changes. So resend the update metadata request to every broker.
   *
   * For example, if OAR = {1, 2, 3} and RAR = {4,5,6}, the values in the assigned replica (AR) and leader/isr path in ZK
   * may go through the following transition.
   * AR                 leader/isr
   * {1,2,3}            1/{1,2,3}           (initial state)
   * {1,2,3,4,5,6}      1/{1,2,3}           (step 2)
   * {1,2,3,4,5,6}      1/{1,2,3,4,5,6}     (step 4)
   * {1,2,3,4,5,6}      4/{1,2,3,4,5,6}     (step 7)
   * {1,2,3,4,5,6}      4/{4,5,6}           (step 8)
   * {4,5,6}            4/{4,5,6}           (step 10)
   *
   * Note that we have to update AR in ZK with RAR last since it's the only place where we store OAR persistently.
   * This way, if the controller crashes before that step, we can still recover.
   */
  def onPartitionReassignment(topicAndPartition: TopicAndPartition, reassignedPartitionContext: ReassignedPartitionsContext) {
    // reassignedPartitionContext describes one partition of the reassignment plan; reassignedReplicas (RAR) is the
    // set of broker ids this partition is to be moved onto.
    val reassignedReplicas = reassignedPartitionContext.newReplicas
    /*
    For the topic-partition being reassigned, read the partition's state znode from ZK to obtain its current ISR.
    areReplicasInIsr returns true when every reassigned replica (RAR) is already in the ISR, and false when at
    least one of the new replicas has not yet joined the ISR.
    When it returns false, the controller expands the partition's replica assignment with the new replicas and
    moves the newly added replicas to the NewReplica state; when it returns true, it carries out the remaining
    concrete reassignment steps.
     */
    //{1,2,3}            1/{1,2,3}           (initial state)
    if (!areReplicasInIsr(topicAndPartition.topic, topicAndPartition.partition, reassignedReplicas)) {
      info("New replicas %s for partition %s being ".format(reassignedReplicas.mkString(","), topicAndPartition) +
        "reassigned not yet caught up with the leader")
      // RAR - OAR: the newly assigned replicas that are not in the current replica list
      val newReplicasNotInOldReplicaList = reassignedReplicas.toSet -- controllerContext.partitionReplicaAssignment(topicAndPartition).toSet
      // OAR + RAR: the union of the current replicas and the newly assigned replicas
      val newAndOldReplicas = (reassignedPartitionContext.newReplicas ++ controllerContext.partitionReplicaAssignment(topicAndPartition)).toSet
      //1. Update AR in ZK with OAR + RAR.
      // Merge the newly assigned replicas with the existing ones into a new replica set,
      // update partitionReplicaAssignment for this partition with that set,
      // and write the replica assignment of all partitions of this topic to the /brokers/topics/topicName znode in ZK;
      // the topic-change listener then handles the follow-up processing triggered by this znode change.
      updateAssignedReplicasForPartition(topicAndPartition, newAndOldReplicas.toSeq)
      //{1,2,3,4,5,6}      1/{1,2,3}           (step 2)
      //2. Send LeaderAndIsr request to every replica in OAR + RAR (with AR as OAR + RAR).
      // (1) Using the partition's new replica list (the original replicas plus the newly assigned ones), build a
      //     LeaderAndIsrRequest for every broker hosting one of those replicas (carrying the partition's replica
      //     set) and queue it in leaderAndIsrRequestMap; requests are read from this map when they are sent.
      // (2) Queue in updateMetadataRequestMap an UpdateMetadataRequest to be sent to all brokers with the
      //     partition's new metadata; if the topic has been deleted the request carries a leader id of -2,
      //     otherwise it is a normal request that only reflects the changed replica set.
      // (3) Send the two kinds of requests built above to the corresponding brokers, provided they are up.
      updateLeaderEpochAndSendRequest(topicAndPartition, controllerContext.partitionReplicaAssignment(topicAndPartition),
        newAndOldReplicas.toSeq)
      //3. replicas in RAR - OAR -> NewReplica
      // Iterate over all newly added replicas and move them to the NewReplica state via replicaStateMachine
      startNewReplicasForReassignedPartition(topicAndPartition, reassignedPartitionContext, newReplicasNotInOldReplicaList)
      info("Waiting for new replicas %s for partition %s being ".format(reassignedReplicas.mkString(","), topicAndPartition) +
        "reassigned to catch up with the leader")
    } else {
      // At this point every reassigned replica (RAR) is already in the ISR.
      // First compute OAR - RAR, i.e. the old replicas that are not part of the new assignment.
      //4. Wait until all replicas in RAR are in sync with the leader.
      val oldReplicas = controllerContext.partitionReplicaAssignment(topicAndPartition).toSet -- reassignedReplicas.toSet
      // Move every replica in RAR to the OnlineReplica state.
      //5. replicas in RAR -> OnlineReplica
      reassignedReplicas.foreach { replica =>
        replicaStateMachine.handleStateChanges(Set(new PartitionAndReplica(topicAndPartition.topic, topicAndPartition.partition,
          replica)), OnlineReplica)
      }
      //6. Set AR to RAR in memory.
      //7. Send LeaderAndIsr request with a potential new leader (if current leader not in RAR) and
      //   a new AR (using RAR) and same isr to every broker in RAR
      // (1) If the new replica set does not contain the partition's current leader, move the partition to the
      //     OnlinePartition state via partitionStateMachine and re-elect a leader with ReassignedPartitionLeaderSelector.
      // (2) If the current leader is among the live brokers, send LeaderAndIsr and metadata update requests to
      //     all brokers (only the leader epoch is bumped, as described in step 7 of the doc comment).
      // (3) If the current leader has gone offline, likewise move the partition to OnlinePartition via
      //     partitionStateMachine and re-elect a leader with ReassignedPartitionLeaderSelector.

      moveReassignedPartitionLeaderIfRequired(topicAndPartition, reassignedPartitionContext)
      // (1) Move the old replicas (OAR - RAR) to the OfflineReplica state in replicaStateMachine and send a
      //     StopReplicaRequest to the brokers hosting them; remove them from the ISR in the partition's state
      //     znode in ZK and send a LeaderAndIsrRequest to the remaining ISR members.
      // (2) Move those replicas to the ReplicaDeletionStarted state and send a StopReplicaRequest with
      //     deletePartition = true to the brokers that must delete them; the following steps only run after
      //     this has completed.
      // (3) Move those replicas to the ReplicaDeletionSuccessful state.
      // (4) Remove those replicas from replicaStateMachine's replicaState map and drop them from the partition's
      //     replica set in partitionReplicaAssignment.

      //8. replicas in OAR - RAR -> Offline (force those replicas out of isr)
      //9. replicas in OAR - RAR -> NonExistentReplica (force those replicas to be deleted)
      stopOldReplicasOfReassignedPartition(topicAndPartition, reassignedPartitionContext, oldReplicas)
      // Update this partition's replica set in partitionReplicaAssignment and rewrite the topic's znode in ZK
      // (the replica sets of all of the topic's partitions are written again); the topic-change listener handles
      // the follow-up processing of this znode change.
      //10. Update AR in ZK with RAR.
      updateAssignedReplicasForPartition(topicAndPartition, reassignedReplicas)
      // Unregister the listener watching this partition's ISR changes and remove the partition from the set of
      // partitions being reassigned.
      //11. Update the /admin/reassign_partitions path in ZK to remove this partition.
      removePartitionFromReassignedPartitions(topicAndPartition)
      info("Removed partition %s from the list of reassigned partitions in zookeeper".format(topicAndPartition))
      controllerContext.partitionsBeingReassigned.remove(topicAndPartition)
      // Send an UpdateMetadataRequest carrying this partition's new metadata to all brokers.
      //12. After electing leader, the replicas and isr information changes, so resend the update metadata request to every broker
      sendUpdateMetadataRequest(controllerContext.liveOrShuttingDownBrokerIds.toSeq, Set(topicAndPartition))
      // If the partition's topic is queued for deletion, resume that deletion now that the reassignment has completed.
      // signal delete topic thread if reassignment for some partitions belonging to topics being deleted just completed
      deleteTopicManager.resumeDeletionForTopics(Set(topicAndPartition.topic))
    }
  }
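The AR and leader/ISR transitions listed in the comment above can be observed from the outside while a reassignment is running. A minimal sketch, again with the hypothetical names used earlier:

# During steps 2-4, AR shows OAR + RAR and the ISR grows as the new replicas catch up;
# after steps 8-10, AR and ISR shrink back to RAR.
bin/kafka-topics.sh --zookeeper localhost:2181 --describe --topic my-topic
bin/kafka-reassign-partitions.sh --zookeeper localhost:2181 \
  --reassignment-json-file reassign.json --verify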