Flink源碼-10-CheckPoint實現

CheckPoint

Checkpoint 是 Flink 實現精確一次(exactly-once)語義的核心,接下來我們看一下它是如何實現的。

CheckpointCoordinator

checkpoint協調器
由單線程的定時器按固定週期觸發checkpoint

/**
 * Registers the periodic checkpoint trigger with the timer.
 *
 * @param initDelay delay in milliseconds before the trigger runs for the first time
 * @return the future representing the scheduled periodic task
 */
private ScheduledFuture<?> scheduleTriggerWithDelay(long initDelay) {
   final Runnable periodicTrigger = new ScheduledTrigger();
   // Fixed-rate schedule: first run after initDelay, then one run every baseInterval milliseconds.
   return timer.scheduleAtFixedRate(periodicTrigger, initDelay, baseInterval, TimeUnit.MILLISECONDS);
}

定時觸發checkpoint的任務類(Runnable)

/**
 * Task handed to the timer: each run triggers one checkpoint stamped with the
 * current wall-clock time. Failures are logged rather than rethrown.
 */
private final class ScheduledTrigger implements Runnable {

   @Override
   public void run() {
      final long triggerTimestamp = System.currentTimeMillis();
      try {
         // NOTE(review): the boolean is presumably the isPeriodic flag of the overload - confirm.
         triggerCheckpoint(triggerTimestamp, true);
      } catch (Exception e) {
         // NOTE(review): if the timer is a ScheduledExecutorService, an exception escaping
         // run() would suppress all later periodic executions, hence log-and-continue.
         LOG.error("Exception while triggering checkpoint for job {}.", job, e);
      }
   }
}

具體的實現方法

public CompletableFuture<CompletedCheckpoint> triggerCheckpoint(
      long timestamp,
      CheckpointProperties props,
      @Nullable String externalSavepointLocation,
      boolean isPeriodic,
      boolean advanceToEndOfTime) throws CheckpointException {
          // send the messages to the tasks that trigger their checkpoint
for (Execution execution: executions) {
   if (props.isSynchronous()) {
      execution.triggerSynchronousSavepoint(checkpointID, timestamp, checkpointOptions, advanceToEndOfTime);
   } else {
       對每個execution觸發checkpoint
      execution.triggerCheckpoint(checkpointID, timestamp, checkpointOptions);
   }
} 
          }

接下來從Execution跳到Task

/**
 * Task-side entry point of a checkpoint trigger: hands the request over to the
 * task's invokable via {@code triggerCheckpointAsync}.
 *
 * <p>NOTE: abridged excerpt. {@code checkpointMetaData} is used below but its
 * construction is not shown; it is presumably built from {@code checkpointID}
 * and {@code checkpointTimestamp} - confirm against the full source.
 *
 * @param checkpointID id of the checkpoint to trigger
 * @param checkpointTimestamp timestamp of the checkpoint
 * @param checkpointOptions options forwarded to the invokable
 * @param advanceToEndOfEventTime flag forwarded unchanged to the invokable
 */
public void triggerCheckpointBarrier(
			final long checkpointID,
			final long checkpointTimestamp,
			final CheckpointOptions checkpointOptions,
			final boolean advanceToEndOfEventTime) {

         // Familiar territory: this invokable is the concrete task implementation.
        final AbstractInvokable invokable = this.invokable;
		invokable.triggerCheckpointAsync(checkpointMetaData, checkpointOptions, advanceToEndOfEventTime);

	}

StreamTask

/**
 * Performs a checkpoint on this task in three steps: pre-barrier preparation of the
 * operators, broadcasting the barrier downstream, and snapshotting the state.
 *
 * <p>NOTE: abridged excerpt. The declaration of {@code checkpointId}, the enclosing
 * block closed by the extra brace near the end, and the boolean return statements
 * are omitted.
 *
 * @param checkpointMetaData metadata of the checkpoint; supplies the barrier timestamp
 * @param checkpointOptions options carried by the barrier and passed to the snapshot
 * @param checkpointMetrics metrics object passed to the state snapshot
 * @param advanceToEndOfTime unused in the visible excerpt
 * @throws Exception declared by the signature
 */
private boolean performCheckpoint(
			CheckpointMetaData checkpointMetaData,
			CheckpointOptions checkpointOptions,
			CheckpointMetrics checkpointMetrics,
			boolean advanceToEndOfTime) throws Exception {

				// Step (1): Prepare the checkpoint, allow operators to do some pre-barrier work.
				// Gives every operator in the chain the chance to prepare in advance.
				operatorChain.prepareSnapshotPreBarrier(checkpointId);

				// Step (2): Send the checkpoint barrier downstream
				// The barrier is broadcast before the state snapshot is taken.
				operatorChain.broadcastCheckpointBarrier(
						checkpointId,
						checkpointMetaData.getTimestamp(),
						checkpointOptions);

				// Step (3): Take the state snapshot. This should be largely asynchronous, to not
				//           impact progress of the streaming topology
				// Persists the state; kept largely asynchronous because snapshotting would
				// otherwise hold up stream processing.
				checkpointState(checkpointMetaData, checkpointOptions, checkpointMetrics);
		}
	}

發送barrier
從這裏可以看出,barrier是作爲事件和普通數據混在同一條流裏,通過同樣的輸出發往下游的

/**
 * Creates one checkpoint barrier and broadcasts it as an event on every stream output.
 *
 * @param id checkpoint id carried by the barrier
 * @param timestamp checkpoint timestamp carried by the barrier
 * @param checkpointOptions checkpoint options carried by the barrier
 * @throws IOException declared by the signature; presumably raised by an output's broadcast - confirm
 */
public void broadcastCheckpointBarrier(long id, long timestamp, CheckpointOptions checkpointOptions) throws IOException {
	final CheckpointBarrier checkpointBarrier = new CheckpointBarrier(id, timestamp, checkpointOptions);

	// The same barrier instance is emitted on each output in turn.
	for (RecordWriterOutput<?> output : streamOutputs) {
		output.broadcastEvent(checkpointBarrier);
	}
}

CheckpointedInputGate
處理讀取到的Barrier

/**
 * Polls the next buffer or event and, when it is a checkpoint barrier, hands it to the
 * barrier handler.
 *
 * <p>NOTE: abridged excerpt. The leading {@code if} branch that the {@code else if}
 * below belongs to, and all return statements, are omitted.
 */
@Override
	public Optional<BufferOrEvent> pollNext() throws Exception {
		while (true) {
		     // Fetch the next buffer or event.
		    BufferOrEvent bufferOrEvent = next.get();
             // The event is a checkpoint barrier: run the barrier handling.
			else if (bufferOrEvent.getEvent().getClass() == CheckpointBarrier.class) {
				CheckpointBarrier checkpointBarrier = (CheckpointBarrier) bufferOrEvent.getEvent();
				if (!endOfInputGate) {
					// process barriers only if there is a chance of the checkpoint completing
					// When the handler returns true, the buffer storage is rolled over.
					if (barrierHandler.processBarrier(checkpointBarrier, offsetChannelIndex(bufferOrEvent.getChannelIndex()), bufferStorage.getPendingBytes())) {
						bufferStorage.rollOver();
					}
				}
			}
			
		}
	}

CheckpointBarrierAligner
處理barrier對齊

/**
 * Handles a barrier received on one input channel and triggers the checkpoint once
 * barriers (or closed channels) account for all input channels.
 *
 * <p>NOTE: abridged excerpt. The branches preceding the {@code else if} below and the
 * declaration of {@code checkpointAborted} are omitted.
 *
 * @param receivedBarrier the barrier that was read from the input
 * @param channelIndex index of the channel the barrier arrived on
 * @param bufferedBytes byte count passed on to {@code notifyCheckpoint}
 * @return {@code true} when all barriers were received and the checkpoint was triggered;
 *         otherwise the value of {@code checkpointAborted}
 * @throws Exception declared by the signature
 */
@Override
	public boolean processBarrier(CheckpointBarrier receivedBarrier, int channelIndex, long bufferedBytes) throws Exception {
		final long barrierId = receivedBarrier.getId();

		// The barrier id is newer than the current checkpoint id: begin a new alignment.
		else if (barrierId > currentCheckpointId) {
			beginNewAlignment(barrierId, channelIndex);
		}
		
		// check if we have all barriers - since canceled checkpoints always have zero barriers
		// this can only happen on a non canceled checkpoint
		// Alignment is complete: release the blocked channels and notify the task so that
		// the checkpoint is triggered.
		if (numBarriersReceived + numClosedChannels == totalNumberOfInputChannels) {
			// actually trigger checkpoint
			if (LOG.isDebugEnabled()) {
				LOG.debug("{}: Received all barriers, triggering checkpoint {} at {}.",
					taskName,
					receivedBarrier.getId(),
					receivedBarrier.getTimestamp());
			}

			releaseBlocksAndResetBarriers();
			notifyCheckpoint(receivedBarrier, bufferedBytes, latestAlignmentDurationNanos);
			return true;
		}
		return checkpointAborted;
	}

onBarrier
收到barrier的處理
將收到barrier的通道標記爲阻塞:該通道後續的數據暫不處理,先緩存在buffer裏,同時已收到的barrier計數加1

/**
 * Records the arrival of a barrier on the given channel: marks the channel as blocked
 * and increments the count of received barriers.
 *
 * @param channelIndex index of the channel the barrier arrived on
 * @throws IOException if the channel is already marked as blocked, i.e. a barrier was
 *         already recorded for it
 */
protected void onBarrier(int channelIndex) throws IOException {
	// Guard: a channel that is already blocked must not deliver another barrier.
	if (blockedChannels[channelIndex]) {
		throw new IOException("Stream corrupt: Repeated barrier for same checkpoint on input " + channelIndex);
	}

	blockedChannels[channelIndex] = true;
	numBarriersReceived++;

	if (LOG.isDebugEnabled()) {
		LOG.debug("{}: Received barrier from channel {}.", taskName, channelIndex);
	}
}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章