十八、Flink源碼閱讀--JobManager啓動過程

本文我們詳細看下Jm的啓動步驟,主要分析stand-alone模式下Jm的啓動,並在關鍵步驟處穿插on-yarn模式的說明

入口分析

從flink安裝包的bin/start-cluster.sh開始分析,會發現Jm的主類是org.apache.flink.runtime.entrypoint.StandaloneSessionClusterEntrypoint,Tm的主類是
org.apache.flink.runtime.taskexecutor.TaskManagerRunner

StandaloneSessionClusterEntrypoint
/**
 * Entry point of the standalone session cluster (excerpt).
 *
 * <p>Loads a {@code Configuration} from {@code entrypointClusterConfiguration}, builds a
 * {@code StandaloneSessionClusterEntrypoint} from it and calls {@code startCluster()}.
 * The argument-parsing step is elided in this excerpt; {@code entrypointClusterConfiguration}
 * is presumably its result (not shown here).
 *
 * @param args command-line arguments, e.g. {@code --configDir xxx --executionMode cluster}
 */
public static void main(String[] args) { // args: --configDir xxx --executionMode cluster
	// (elided in the original excerpt: argument parsing)
	... 解析參數

	Configuration configuration = loadConfiguration(entrypointClusterConfiguration);
	StandaloneSessionClusterEntrypoint entrypoint = new StandaloneSessionClusterEntrypoint(configuration);
	entrypoint.startCluster(); // start the cluster
}

===>

/**
 * Starts the cluster entrypoint.
 *
 * <p>Configures the file systems, installs the security context and then runs
 * {@code runCluster(configuration)} inside that context. Any {@code Throwable} raised
 * along the way is logged and handed to {@code shutDownAndTerminate} with the startup
 * failure return code and {@code ApplicationStatus.FAILED}.
 */
protected void startCluster() {
	final String entrypointName = getClass().getSimpleName();
	LOG.info("Starting {}.", entrypointName);

	try {
		configureFileSystems(configuration);

		final SecurityContext context = installSecurityContext(configuration);

		// The actual cluster start-up, wrapped so it can be executed by the security context.
		final Callable<Void> clusterLauncher = () -> {
			runCluster(configuration);
			return null;
		};

		context.runSecured(clusterLauncher);
	} catch (Throwable failure) {
		LOG.error("Cluster initialization failed.", failure);

		final String diagnostics = failure.getMessage();
		shutDownAndTerminate(
			STARTUP_FAILURE_RETURN_CODE,
			ApplicationStatus.FAILED,
			diagnostics,
			false);
	}
}

===>

/**
 * Initializes the cluster services, starts the cluster components and registers the
 * general shutdown hook on the dispatcher's termination future.
 *
 * <p>The whole sequence runs while holding {@code lock}.
 *
 * @param configuration cluster configuration; the RPC service's address and port are
 *                      written back into it before the components are started
 * @throws Exception if service initialization or component start-up fails
 */
protected void runCluster(Configuration configuration) throws Exception {
	synchronized (lock) {
		// Step 1: bring up the shared services.
		initializeServices(configuration);

		// Step 2: publish the RPC endpoint's host information through the configuration.
		configuration.setString(JobManagerOptions.ADDRESS, commonRpcService.getAddress());
		configuration.setInteger(JobManagerOptions.PORT, commonRpcService.getPort());

		// Step 3: create and start the cluster components.
		startClusterComponents(
			configuration,
			commonRpcService,
			haServices,
			blobServer,
			heartbeatServices,
			metricRegistry);

		// Step 4: general shutdown path, triggered once the dispatcher has terminated.
		// If a separate, more specific shutdown was already triggered, this does nothing.
		dispatcher.getTerminationFuture().whenComplete(
			(ignored, terminationError) -> {
				String diagnostics = null;

				if (terminationError != null) {
					LOG.info("Could not properly terminate the Dispatcher.", terminationError);
					diagnostics = terminationError.getMessage();
				}

				shutDownAndTerminate(
					SUCCESS_RETURN_CODE,
					ApplicationStatus.SUCCEEDED,
					diagnostics,
					true);
			});
	}
}

接着我們進入到initializeServices初始化方法中,看看具體初始化了哪些服務

/**
 * Creates the services shared by the cluster components and stores them in fields.
 *
 * <p>In order, while holding {@code lock}: the common RPC service, the high-availability
 * services, the blob server (which is also started), the heartbeat services, the metric
 * registry (whose query service is started), the archived execution graph store, the
 * cluster information and the transient blob cache.
 *
 * @param configuration cluster configuration; its JobManager address and port are
 *                      overwritten with the values reported by the RPC service
 * @throws Exception if any of the services cannot be created or started
 */
protected void initializeServices(Configuration configuration) throws Exception {

	LOG.info("Initializing cluster services.");

	synchronized (lock) {
		// e.g. localhost:6123 (example value from the article's author)
		final String bindAddress = configuration.getString(JobManagerOptions.ADDRESS);
		final String portRange = getRPCPortRange(configuration);

		commonRpcService = createRpcService(configuration, bindAddress, portRange); // create the Akka RPC service

		// update the configuration used to create the high availability services
		configuration.setString(JobManagerOptions.ADDRESS, commonRpcService.getAddress());
		configuration.setInteger(JobManagerOptions.PORT, commonRpcService.getPort());

		haServices = createHaServices(configuration, commonRpcService.getExecutor());
		blobServer = new BlobServer(configuration, haServices.createBlobStore());
		blobServer.start(); // start the BlobServer (author's note: it is a thread listening on a random port — not shown here, confirm)
		heartbeatServices = createHeartbeatServices(configuration); // create the heartbeat services
		metricRegistry = createMetricRegistry(configuration); // create the metric registry

		// TODO: This is a temporary hack until we have ported the MetricQueryService to the new RpcEndpoint
		// start the MetricQueryService
		final ActorSystem actorSystem = ((AkkaRpcService) commonRpcService).getActorSystem();
		metricRegistry.startQueryService(actorSystem, null);

		archivedExecutionGraphStore = createSerializableExecutionGraphStore(configuration, commonRpcService.getScheduledExecutor()); // store for archived ExecutionGraphs (author's note: kept on disk and in a cache — confirm)

		// the cluster information carries the RPC address and the blob server port
		clusterInformation = new ClusterInformation(
			commonRpcService.getAddress(),
			blobServer.getPort());
		// instantiate the transient blob cache, pointed at the blob server's host and port
		transientBlobCache = new TransientBlobCache(
			configuration,
			new InetSocketAddress(
				clusterInformation.getBlobServerHostname(),
				clusterInformation.getBlobServerPort()));
	}
}

實例化了這些服務後,我們來看下startClusterComponents方法,其中有三個核心步驟:webMonitorEndpoint、resourceManager、dispatcher的創建與啓動

/**
 * Creates and starts the three cluster components: the REST endpoint
 * ({@code webMonitorEndpoint}), the {@code resourceManager} and the {@code dispatcher}.
 *
 * <p>While holding {@code lock}, the method obtains the dispatcher and resource manager
 * leader retrieval services, builds gateway retrievers for both, creates and starts the
 * REST endpoint, creates the resource manager and the dispatcher, and finally starts the
 * resource manager and the dispatcher, each followed by its leader retrieval service.
 *
 * @param configuration cluster configuration
 * @param rpcService RPC service used by the components; cast to {@code AkkaRpcService} to
 *                   obtain the actor system
 * @param highAvailabilityServices source of the leader retrieval and election services
 * @param blobServer blob server passed to the dispatcher
 * @param heartbeatServices heartbeat services passed to the resource manager and dispatcher
 * @param metricRegistry metric registry used for the JobManager metric group
 * @throws Exception if a component cannot be created or started
 */
protected void startClusterComponents(
		Configuration configuration,
		RpcService rpcService,
		HighAvailabilityServices highAvailabilityServices,
		BlobServer blobServer,
		HeartbeatServices heartbeatServices,
		MetricRegistry metricRegistry) throws Exception {
	synchronized (lock) {
		dispatcherLeaderRetrievalService = highAvailabilityServices.getDispatcherLeaderRetriever();

		resourceManagerRetrievalService = highAvailabilityServices.getResourceManagerLeaderRetriever();

		// NOTE(review): 10 and 50 ms are presumably the retry count and retry delay — confirm against RpcGatewayRetriever
		LeaderGatewayRetriever<DispatcherGateway> dispatcherGatewayRetriever = new RpcGatewayRetriever<>(
			rpcService,
			DispatcherGateway.class,
			DispatcherId::fromUuid,
			10,
			Time.milliseconds(50L));

		LeaderGatewayRetriever<ResourceManagerGateway> resourceManagerGatewayRetriever = new RpcGatewayRetriever<>(
			rpcService,
			ResourceManagerGateway.class,
			ResourceManagerId::fromUuid,
			10,
			Time.milliseconds(50L));

		// TODO: Remove once we have ported the MetricFetcher to the RpcEndpoint
		final ActorSystem actorSystem = ((AkkaRpcService) rpcService).getActorSystem();
		final Time timeout = Time.milliseconds(configuration.getLong(WebOptions.TIMEOUT));

		// create the REST endpoint with its own executor service
		webMonitorEndpoint = createRestEndpoint(
			configuration,
			dispatcherGatewayRetriever,
			resourceManagerGatewayRetriever,
			transientBlobCache,
			WebMonitorEndpoint.createExecutorService(
				configuration.getInteger(RestOptions.SERVER_NUM_THREADS),
				configuration.getInteger(RestOptions.SERVER_THREAD_PRIORITY),
				"DispatcherRestEndpoint"),
			new AkkaQueryServiceRetriever(actorSystem, timeout),
			highAvailabilityServices.getWebMonitorLeaderElectionService());

		LOG.debug("Starting Dispatcher REST endpoint.");
		webMonitorEndpoint.start(); // start the web monitor (REST endpoint)

		jobManagerMetricGroup = MetricUtils.instantiateJobManagerMetricGroup(metricRegistry, rpcService.getAddress());

		// the REST endpoint is started first; its base URL is passed to the resource manager and dispatcher
		resourceManager = createResourceManager(
			configuration,
			ResourceID.generate(),
			rpcService,
			highAvailabilityServices,
			heartbeatServices,
			metricRegistry,
			this,
			clusterInformation,
			webMonitorEndpoint.getRestBaseUrl(),
			jobManagerMetricGroup);

		final HistoryServerArchivist historyServerArchivist = HistoryServerArchivist.createHistoryServerArchivist(configuration, webMonitorEndpoint);

		// the dispatcher receives the resource manager's own gateway
		dispatcher = createDispatcher(
			configuration,
			rpcService,
			highAvailabilityServices,
			resourceManager.getSelfGateway(ResourceManagerGateway.class),
			blobServer,
			heartbeatServices,
			jobManagerMetricGroup,
			metricRegistry.getMetricQueryServicePath(),
			archivedExecutionGraphStore,
			this,
			webMonitorEndpoint.getRestBaseUrl(),
			historyServerArchivist);

		LOG.debug("Starting ResourceManager.");
		resourceManager.start(); // start the resource manager
		resourceManagerRetrievalService.start(resourceManagerGatewayRetriever);

		LOG.debug("Starting Dispatcher.");
		dispatcher.start(); // start the dispatcher
		dispatcherLeaderRetrievalService.start(dispatcherGatewayRetriever);
	}
}

webMonitor 負責web服務,checkpoint,異常恢復
resourceManager 負責leader選舉,slot註冊管理,與taskExecutor監控,心跳管理,負責和yarn交互
dispatcher 負責任務的接收,負責JobManagerRunner(封裝了JobMaster)的啓動。
這三個核心組件啓動後,Jm就可以正常工作了。

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章