十八、Flink源码阅读--JobManager启动过程

本文我们详细看下Jm的启动步骤,主要分析stand-alone模式下Jm的启动过程,并在关键步骤处穿插on-yarn模式的说明。

入口分析

从flink安装包的bin/start-cluster.sh开始分析,会发现Jm的主类是org.apache.flink.runtime.entrypoint.StandaloneSessionClusterEntrypoint,Tm的主类是org.apache.flink.runtime.taskexecutor.TaskManagerRunner。

StandaloneSessionClusterEntrypoint
/**
 * Process entry point of the stand-alone session cluster entrypoint.
 *
 * <p>Loads the configuration, constructs a {@code StandaloneSessionClusterEntrypoint}
 * from it and calls {@code startCluster()} on it.
 *
 * @param args command-line arguments, e.g. {@code --configDir xxx --executionMode cluster}
 */
public static void main(String[] args) { // args: --configDir xxx --executionMode cluster
	// NOTE: the argument-parsing code is elided in this excerpt; it presumably
	// produces the entrypointClusterConfiguration used below.
	... 解析参数

	Configuration configuration = loadConfiguration(entrypointClusterConfiguration);
	StandaloneSessionClusterEntrypoint entrypoint = new StandaloneSessionClusterEntrypoint(configuration);
	entrypoint.startCluster();// start the cluster
}

===>

/**
 * Boots the cluster entrypoint.
 *
 * <p>Configures the file systems, installs the security context and then runs
 * {@code runCluster(configuration)} inside that security context. Any
 * {@link Throwable} raised on the way is logged and turned into a shutdown with
 * {@code STARTUP_FAILURE_RETURN_CODE} and {@code ApplicationStatus.FAILED}.
 */
protected void startCluster() {
	final String entrypointName = getClass().getSimpleName();
	LOG.info("Starting {}.", entrypointName);

	try {
		configureFileSystems(configuration);

		final SecurityContext installedContext = installSecurityContext(configuration);

		// The actual cluster start-up, executed under the installed security context.
		final Callable<Void> securedStartup = () -> {
			runCluster(configuration);
			return null;
		};

		installedContext.runSecured(securedStartup);
	} catch (Throwable startupFailure) {
		LOG.error("Cluster initialization failed.", startupFailure);

		final String diagnostics = startupFailure.getMessage();
		shutDownAndTerminate(
			STARTUP_FAILURE_RETURN_CODE,
			ApplicationStatus.FAILED,
			diagnostics,
			false);
	}
}

===>

/**
 * Initializes the cluster services, starts the cluster components and registers
 * the shutdown hook on the dispatcher's termination future.
 *
 * <p>All work happens while holding {@code lock}.
 *
 * @param configuration cluster configuration; the RPC service's address and port
 *                      are written back into it before the components are started
 * @throws Exception if service initialization or component start-up fails
 */
protected void runCluster(Configuration configuration) throws Exception {
	synchronized (lock) {
		// Step 1: bring up the shared services.
		initializeServices(configuration);

		// Step 2: publish the RPC endpoint's host information in the configuration.
		configuration.setString(JobManagerOptions.ADDRESS, commonRpcService.getAddress());
		configuration.setInteger(JobManagerOptions.PORT, commonRpcService.getPort());

		// Step 3: create and start the cluster components.
		startClusterComponents(
			configuration,
			commonRpcService,
			haServices,
			blobServer,
			heartbeatServices,
			metricRegistry);

		// Step 4: once the dispatcher terminates, shut the whole entrypoint down.
		dispatcher.getTerminationFuture().whenComplete(
			(ignored, terminationFailure) -> {
				final String diagnostics;
				if (terminationFailure == null) {
					diagnostics = null;
				} else {
					LOG.info("Could not properly terminate the Dispatcher.", terminationFailure);
					diagnostics = terminationFailure.getMessage();
				}

				// General shutdown path; a no-op if a more specific shutdown
				// has already been triggered.
				shutDownAndTerminate(
					SUCCESS_RETURN_CODE,
					ApplicationStatus.SUCCEEDED,
					diagnostics,
					true);
			});
	}
}

接着我们进入到initializeServices初始化方法中,看看具体初始化了哪些服务

/**
 * Creates the services shared by the cluster components and stores them in fields.
 *
 * <p>In order, while holding {@code lock}: the common RPC service, the high
 * availability services, the blob server (which is also started), the heartbeat
 * services, the metric registry (plus its query service), the archived execution
 * graph store, the cluster information and the transient blob cache.
 *
 * @param configuration cluster configuration; its JobManager address and port are
 *                      overwritten with the values reported by the RPC service
 * @throws Exception if any of the services cannot be created or started
 */
protected void initializeServices(Configuration configuration) throws Exception {

	LOG.info("Initializing cluster services.");

	synchronized (lock) {
		// e.g. localhost:6123 (example from the article author — confirm against your config)
		final String bindAddress = configuration.getString(JobManagerOptions.ADDRESS);
		final String portRange = getRPCPortRange(configuration);

		commonRpcService = createRpcService(configuration, bindAddress, portRange);// create the Akka RPC service

		// update the configuration used to create the high availability services
		configuration.setString(JobManagerOptions.ADDRESS, commonRpcService.getAddress());
		configuration.setInteger(JobManagerOptions.PORT, commonRpcService.getPort());

		haServices = createHaServices(configuration, commonRpcService.getExecutor());
		blobServer = new BlobServer(configuration, haServices.createBlobStore());
		blobServer.start();// start the BlobServer; per the article author it is a thread listening on a random port — TODO confirm
		heartbeatServices = createHeartbeatServices(configuration);// create the heartbeat services
		metricRegistry = createMetricRegistry(configuration);// create the metric registry

		// TODO: This is a temporary hack until we have ported the MetricQueryService to the new RpcEndpoint
		// start the MetricQueryService
		// NOTE(review): the cast below requires commonRpcService to be an AkkaRpcService.
		final ActorSystem actorSystem = ((AkkaRpcService) commonRpcService).getActorSystem();
		metricRegistry.startQueryService(actorSystem, null);

		archivedExecutionGraphStore = createSerializableExecutionGraphStore(configuration, commonRpcService.getScheduledExecutor());// store for archived ExecutionGraphs; per the article author kept on disk and in a cache — TODO confirm

		clusterInformation = new ClusterInformation(
			commonRpcService.getAddress(),
			blobServer.getPort());
		// instantiate the transient blob cache, pointing at the blob server address above
		transientBlobCache = new TransientBlobCache(
			configuration,
			new InetSocketAddress(
				clusterInformation.getBlobServerHostname(),
				clusterInformation.getBlobServerPort()));
	}
}

实例化了这些服务后,我们来看下startClusterComponents方法,其中有三个核心步骤:webMonitorEndpoint、resourceManager、dispatcher的创建与启动。

/**
 * Creates and starts the three cluster components: the REST/web monitor endpoint,
 * the ResourceManager and the Dispatcher.
 *
 * <p>Sequence, all while holding {@code lock}: obtain the leader retrieval services,
 * build the gateway retrievers, create and start the web monitor endpoint, create
 * the ResourceManager and the Dispatcher, then start the ResourceManager followed
 * by the Dispatcher, each together with its leader retrieval service.
 *
 * @param configuration            cluster configuration
 * @param rpcService               RPC service handed to the components; must be an
 *                                 {@code AkkaRpcService} because of the cast below
 * @param highAvailabilityServices source of the leader retrievers and of the web
 *                                 monitor leader election service
 * @param blobServer               blob server passed to the dispatcher
 * @param heartbeatServices        heartbeat services passed to ResourceManager and Dispatcher
 * @param metricRegistry           metric registry used for the JobManager metric group
 * @throws Exception if a component cannot be created or started
 */
protected void startClusterComponents(
		Configuration configuration,
		RpcService rpcService,
		HighAvailabilityServices highAvailabilityServices,
		BlobServer blobServer,
		HeartbeatServices heartbeatServices,
		MetricRegistry metricRegistry) throws Exception {
	synchronized (lock) {
		dispatcherLeaderRetrievalService = highAvailabilityServices.getDispatcherLeaderRetriever();

		resourceManagerRetrievalService = highAvailabilityServices.getResourceManagerLeaderRetriever();

		// NOTE(review): 10 and 50 ms are presumably the retry count and retry delay
		// of the gateway lookup — confirm against RpcGatewayRetriever.
		LeaderGatewayRetriever<DispatcherGateway> dispatcherGatewayRetriever = new RpcGatewayRetriever<>(
			rpcService,
			DispatcherGateway.class,
			DispatcherId::fromUuid,
			10,
			Time.milliseconds(50L));

		LeaderGatewayRetriever<ResourceManagerGateway> resourceManagerGatewayRetriever = new RpcGatewayRetriever<>(
			rpcService,
			ResourceManagerGateway.class,
			ResourceManagerId::fromUuid,
			10,
			Time.milliseconds(50L));

		// TODO: Remove once we have ported the MetricFetcher to the RpcEndpoint
		final ActorSystem actorSystem = ((AkkaRpcService) rpcService).getActorSystem();
		final Time timeout = Time.milliseconds(configuration.getLong(WebOptions.TIMEOUT));

		webMonitorEndpoint = createRestEndpoint(
			configuration,
			dispatcherGatewayRetriever,
			resourceManagerGatewayRetriever,
			transientBlobCache,
			WebMonitorEndpoint.createExecutorService(
				configuration.getInteger(RestOptions.SERVER_NUM_THREADS),
				configuration.getInteger(RestOptions.SERVER_THREAD_PRIORITY),
				"DispatcherRestEndpoint"),
			new AkkaQueryServiceRetriever(actorSystem, timeout),
			highAvailabilityServices.getWebMonitorLeaderElectionService());

		LOG.debug("Starting Dispatcher REST endpoint.");
		webMonitorEndpoint.start();// start the web monitor endpoint

		jobManagerMetricGroup = MetricUtils.instantiateJobManagerMetricGroup(metricRegistry, rpcService.getAddress());

		resourceManager = createResourceManager(
			configuration,
			ResourceID.generate(),
			rpcService,
			highAvailabilityServices,
			heartbeatServices,
			metricRegistry,
			this,
			clusterInformation,
			webMonitorEndpoint.getRestBaseUrl(),
			jobManagerMetricGroup);

		final HistoryServerArchivist historyServerArchivist = HistoryServerArchivist.createHistoryServerArchivist(configuration, webMonitorEndpoint);

		// The dispatcher receives the ResourceManager's self gateway, so the
		// ResourceManager must already have been created at this point.
		dispatcher = createDispatcher(
			configuration,
			rpcService,
			highAvailabilityServices,
			resourceManager.getSelfGateway(ResourceManagerGateway.class),
			blobServer,
			heartbeatServices,
			jobManagerMetricGroup,
			metricRegistry.getMetricQueryServicePath(),
			archivedExecutionGraphStore,
			this,
			webMonitorEndpoint.getRestBaseUrl(),
			historyServerArchivist);

		LOG.debug("Starting ResourceManager.");
		resourceManager.start();// start the ResourceManager
		resourceManagerRetrievalService.start(resourceManagerGatewayRetriever);

		LOG.debug("Starting Dispatcher.");
		dispatcher.start();// start the Dispatcher
		dispatcherLeaderRetrievalService.start(dispatcherGatewayRetriever);
	}
}

webMonitor 负责web服务,checkpoint,异常恢复
resourceManager 负责leader选举、slot的注册与管理、对taskExecutor的监控与心跳管理,在on-yarn模式下还负责和yarn交互
dispatcher 负责任务的接收,负责JobManagerRunner(封装了JobMaster)的启动。
这三个核心组件启动后,Jm就可以正常工作了。

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章