Following on from the previous two articles:
Reading the Flink 1.9.0 Job Submission Source Code (Part 1): the flink script
Reading the Flink 1.9.0 Job Submission Source Code (Part 2): the entry class CliFrontend
Today we continue with the run() method of the Flink 1.9.0 job submission source code; this method contains the core logic of submitting a Flink job.
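For orientation, the arguments that reach run() are simply everything after the run action on the command line. A typical submission (the jar path, class name and program arguments below are made up) looks like

./bin/flink run -d -p 4 -c com.example.WordCountJob /path/to/wordcount.jar --input /tmp/in

where -d selects detached mode, -p sets the parallelism and -c names the entry class; all three options resurface in the code analyzed below.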
Execution logic of run()
The code:
/**
 * Executes the run action.
 *
 * @param args Command line arguments for the run action.
 */
protected void run(String[] args) throws Exception {
    LOG.info("Running 'run' command.");

    // Parse the command line into run options
    final Options commandOptions = CliFrontendParser.getRunCommandOptions();
    final Options commandLineOptions = CliFrontendParser.mergeOptions(commandOptions, customCommandLineOptions);
    final CommandLine commandLine = CliFrontendParser.parse(commandLineOptions, args, true);
    final RunOptions runOptions = new RunOptions(commandLine);

    // 1. If this is a help request, print the help text and return
    if (runOptions.isPrintHelp()) {
        CliFrontendParser.printHelpForRun(customCommandLines);
        return;
    }

    if (!runOptions.isPython()) {
        // Java program should be specified a JAR file
        // 2. Check that a JAR file path was given; if it is missing, throw a CliArgsException
        if (runOptions.getJarFilePath() == null) {
            throw new CliArgsException("Java program should be specified a JAR file.");
        }
    }

    /**
     * 3. Build a PackagedProgram. org.apache.flink.client.program.PackagedProgram works on the user-supplied JAR to
     *
     * <p>(1) find the program entry point,
     *
     * <p>(2) parse the user code to obtain the job topology,
     *
     * <p>(3) extract nested libraries.
     *
     * <p>Here it is used to locate the user's program entry point.
     */
    final PackagedProgram program;
    try {
        LOG.info("Building program from JAR file");
        program = buildProgram(runOptions);
    }
    catch (FileNotFoundException e) {
        throw new CliArgsException("Could not build the program from JAR file.", e);
    }

    final CustomCommandLine<?> customCommandLine = getActiveCustomCommandLine(commandLine);

    try {
        // 4. [Key step] Run the program: call runProgram with the program entry point and the options prepared above
        runProgram(customCommandLine, commandLine, runOptions, program);
    } finally {
        // Delete the temporary files that were extracted for packaging
        program.deleteExtractedLibraries();
    }
}
Next we follow runProgram(customCommandLine, commandLine, runOptions, program):
/**
 * Runs the program.
 *
 * @param customCommandLine
 * @param commandLine
 * @param runOptions
 * @param program
 * @param <T>
 * @throws ProgramInvocationException
 * @throws FlinkException
 */
private <T> void runProgram(
        CustomCommandLine<T> customCommandLine,
        CommandLine commandLine,
        RunOptions runOptions,
        PackagedProgram program) throws ProgramInvocationException, FlinkException {
    // Create a ClusterDescriptor from the command-line arguments. A ClusterDescriptor describes a cluster
    // and is used to deploy one (e.g. on YARN or Mesos), returning a client that talks to that cluster.
    final ClusterDescriptor<T> clusterDescriptor = customCommandLine.createClusterDescriptor(commandLine);

    try {
        final T clusterId = customCommandLine.getClusterId(commandLine);

        // The cluster client; ClusterClient encapsulates everything needed to submit a program to a remote cluster
        final ClusterClient<T> client;

        // directly deploy the job if the cluster is started in job mode and detached
        // i.e. no clusterId was given and we run in detached mode (the -d flag on the command line)
        if (clusterId == null && runOptions.getDetachedMode()) {
            // (1) Determine the parallelism, falling back to the default
            int parallelism = runOptions.getParallelism() == -1 ? defaultParallelism : runOptions.getParallelism();

            // (2) Build the JobGraph from the user program (the parsed DAG)
            final JobGraph jobGraph = PackagedProgramUtils.createJobGraph(program, configuration, parallelism);

            // (3) Get the cluster specification from the command line
            final ClusterSpecification clusterSpecification = customCommandLine.getClusterSpecification(commandLine);

            // (4) Deploy the job: pass the JobGraph, the cluster specification and the run mode to
            // clusterDescriptor.deployJobCluster, which deploys the job to the cluster and returns a ClusterClient
            client = clusterDescriptor.deployJobCluster(
                clusterSpecification,
                jobGraph,
                runOptions.getDetachedMode());

            logAndSysout("Job has been submitted with JobID " + jobGraph.getJobID());

            try {
                // (5) Shut down the client
                client.shutdown();
            } catch (Exception e) {
                LOG.info("Could not properly shut down the client.", e);
            }
        } else {
            // Regular (attached / session) submission path
            // (1) A shutdown hook for shutting down the cluster: in non-detached mode the cluster must be
            // shut down via this hook once the client exits
            final Thread shutdownHook;

            // (2) If a cluster id was given, retrieve a ClusterClient for it; otherwise deploy a session cluster
            // and shut it down after the client exits
            if (clusterId != null) {
                client = clusterDescriptor.retrieve(clusterId);
                shutdownHook = null;
            } else {
                // also in job mode we have to deploy a session cluster because the job
                // might consist of multiple parts (e.g. when using collect)
                final ClusterSpecification clusterSpecification = customCommandLine.getClusterSpecification(commandLine);
                client = clusterDescriptor.deploySessionCluster(clusterSpecification);
                // if not running in detached mode, add a shutdown hook to shut down cluster if client exits
                // there's a race-condition here if cli is killed before shutdown hook is installed
                if (!runOptions.getDetachedMode() && runOptions.isShutdownOnAttachedExit()) {
                    shutdownHook = ShutdownHookUtil.addShutdownHook(client::shutDownCluster, client.getClass().getSimpleName(), LOG);
                } else {
                    shutdownHook = null;
                }
            }

            try {
                client.setPrintStatusDuringExecution(runOptions.getStdoutLogging());
                client.setDetached(runOptions.getDetachedMode());

                LOG.debug("Client slots is set to {}", client.getMaxSlots());
                LOG.debug("{}", runOptions.getSavepointRestoreSettings());

                int userParallelism = runOptions.getParallelism();
                LOG.debug("User parallelism is set to {}", userParallelism);
                if (client.getMaxSlots() != MAX_SLOTS_UNKNOWN && userParallelism == -1) {
                    logAndSysout("Using the parallelism provided by the remote cluster ("
                        + client.getMaxSlots() + "). "
                        + "To use another parallelism, set it at the ./bin/flink client.");
                    userParallelism = client.getMaxSlots();
                } else if (ExecutionConfig.PARALLELISM_DEFAULT == userParallelism) {
                    userParallelism = defaultParallelism;
                }

                // The core step: call executeProgram to run the program
                executeProgram(program, client, userParallelism);
            } finally {
                if (clusterId == null && !client.isDetached()) {
                    // terminate the cluster only if we have started it before and if it's not detached
                    try {
                        client.shutDownCluster();
                    } catch (final Exception e) {
                        LOG.info("Could not properly terminate the Flink cluster.", e);
                    }
                    if (shutdownHook != null) {
                        // we do not need the hook anymore as we have just tried to shutdown the cluster.
                        ShutdownHookUtil.removeShutdownHook(shutdownHook, client.getClass().getSimpleName(), LOG);
                    }
                }

                try {
                    client.shutdown();
                } catch (Exception e) {
                    LOG.info("Could not properly shut down the client.", e);
                }
            }
        }
    } finally {
        try {
            clusterDescriptor.close();
        } catch (Exception e) {
            LOG.info("Could not properly close the cluster descriptor.", e);
        }
    }
}
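In the detached job-mode branch above, the JobGraph is built directly from the user's packaged program via PackagedProgramUtils.createJobGraph. Below is a minimal standalone sketch of that step, not the CliFrontend code itself: the jar path and entry class are made up, and the two-argument PackagedProgram constructor is my assumption about the 1.9 API; the createJobGraph call mirrors the one in the source above.

import java.io.File;

import org.apache.flink.client.program.PackagedProgram;
import org.apache.flink.client.program.PackagedProgramUtils;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.jobgraph.JobGraph;

public class JobGraphSketch {
    public static void main(String[] args) throws Exception {
        // Hypothetical jar and entry class -- replace with a real user job.
        // Assumption: PackagedProgram(File jarFile, String entryPointClassName, String... args) exists in 1.9.
        PackagedProgram program = new PackagedProgram(
            new File("/path/to/wordcount.jar"), "com.example.WordCountJob");

        // Same call as in the detached branch of runProgram(): compile the user program into a JobGraph.
        JobGraph jobGraph = PackagedProgramUtils.createJobGraph(program, new Configuration(), 4);

        System.out.println("Built JobGraph with JobID " + jobGraph.getJobID());

        // Clean up the temporary files extracted from the jar, like run() does in its finally block.
        program.deleteExtractedLibraries();
    }
}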
Next, let's look at the logic of executeProgram(program, client, userParallelism):
protected void executeProgram(PackagedProgram program, ClusterClient<?> client, int parallelism) throws ProgramMissingJobException, ProgramInvocationException {
    logAndSysout("Starting execution of program");

    // Run the job
    final JobSubmissionResult result = client.run(program, parallelism);

    if (null == result) {
        throw new ProgramMissingJobException("No JobSubmissionResult returned, please make sure you called " +
            "ExecutionEnvironment.execute()");
    }

    // If a JobExecutionResult is available, the job has run to completion
    if (result.isJobExecutionResult()) {
        logAndSysout("Program execution finished");
        JobExecutionResult execResult = result.getJobExecutionResult();
        System.out.println("Job with JobID " + execResult.getJobID() + " has finished.");
        System.out.println("Job Runtime: " + execResult.getNetRuntime() + " ms");
        Map<String, Object> accumulatorsResult = execResult.getAllAccumulatorResults();
        if (accumulatorsResult.size() > 0) {
            System.out.println("Accumulator Results: ");
            System.out.println(AccumulatorHelper.getResultsFormatted(accumulatorsResult));
        }
    } else {
        logAndSysout("Job has been submitted with JobID " + result.getJobID());
    }
}
Here the ClusterClient runs the already packaged program and returns a JobSubmissionResult once submission (or, in blocking mode, execution) has completed.
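As a reminder of what such a packaged program looks like from the user's side, here is a minimal batch job with an explicit entry point; the class name and output path are made up. Its main() method is what PackagedProgram locates and what the client eventually invokes, and the env.execute() call is what yields the result; omitting it leads to the ProgramMissingJobException seen above.

import org.apache.flink.api.java.ExecutionEnvironment;

// Hypothetical user job; its class would be named via -c or the jar manifest.
public class WordCountJob {
    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        // A trivial pipeline with a sink
        env.fromElements("flink", "job", "submission")
           .writeAsText("/tmp/flink-demo-out");

        // Without this call the client reports ProgramMissingJobException.
        env.execute("word-count-job");
    }
}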
The ClusterClient's logic for running the job is as follows:
/**
 * Runs a user-defined JAR as invoked from the CliFrontend. Execution is either blocking or detached,
 * depending on {@code setDetached(true)} or {@code setDetached(false)}.
 *
 * @param prog the packaged program
 * @param parallelism the parallelism to execute the Flink job with
 * @return the result of the execution
 * @throws ProgramMissingJobException
 * @throws ProgramInvocationException
 */
public JobSubmissionResult run(PackagedProgram prog, int parallelism)
        throws ProgramInvocationException, ProgramMissingJobException {
    Thread.currentThread().setContextClassLoader(prog.getUserCodeClassLoader());

    // 1. The program specifies an entry point
    if (prog.isUsingProgramEntryPoint()) {
        final JobWithJars jobWithJars;
        if (hasUserJarsInClassPath(prog.getAllLibraries())) {
            jobWithJars = prog.getPlanWithoutJars();
        } else {
            jobWithJars = prog.getPlanWithJars();
        }
        // The main execution path
        return run(jobWithJars, parallelism, prog.getSavepointSettings());
    }
    // 2. No entry point was specified, so execute the program in interactive mode
    else if (prog.isUsingInteractiveMode()) {
        log.info("Starting program in interactive mode (detached: {})", isDetached());

        final List<URL> libraries;
        if (hasUserJarsInClassPath(prog.getAllLibraries())) {
            libraries = Collections.emptyList();
        } else {
            libraries = prog.getAllLibraries();
        }

        ContextEnvironmentFactory factory = new ContextEnvironmentFactory(this, libraries,
            prog.getClasspaths(), prog.getUserCodeClassLoader(), parallelism, isDetached(),
            prog.getSavepointSettings());
        ContextEnvironment.setAsContext(factory);

        try {
            // invoke main method
            prog.invokeInteractiveModeForExecution();
            if (lastJobExecutionResult == null && factory.getLastEnvCreated() == null) {
                throw new ProgramMissingJobException("The program didn't contain a Flink job.");
            }
            if (isDetached()) {
                // in detached mode, we execute the whole user code to extract the Flink job, afterwards we run it here
                return ((DetachedEnvironment) factory.getLastEnvCreated()).finalizeExecute();
            }
            else {
                // in blocking mode, we execute all Flink jobs contained in the user code and then return here
                return this.lastJobExecutionResult;
            }
        }
        finally {
            ContextEnvironment.unsetContext();
        }
    }
    else {
        throw new ProgramInvocationException("PackagedProgram does not have a valid invocation mode.");
    }
}
We will not consider interactive mode here, i.e. we only look at the case where the program's entry point is given, so the focus is on the logic of run(jobWithJars, parallelism, prog.getSavepointSettings()).
/**
 * Runs a program on the Flink cluster through the client. The call blocks until the execution result is returned.
 *
 * @param jobWithJars the program together with its JAR files
 * @param parallelism the parallelism to run the job with
 *
 */
public JobSubmissionResult run(JobWithJars jobWithJars, int parallelism, SavepointRestoreSettings savepointSettings)
        throws CompilerException, ProgramInvocationException {
    // Get the user-code class loader
    ClassLoader classLoader = jobWithJars.getUserCodeClassLoader();
    if (classLoader == null) {
        throw new IllegalArgumentException("The given JobWithJars does not provide a usercode class loader.");
    }

    // Compile the optimized execution plan
    OptimizedPlan optPlan = getOptimizedPlan(compiler, jobWithJars, parallelism);

    // Run it
    return run(optPlan, jobWithJars.getJarFiles(), jobWithJars.getClasspaths(), classLoader, savepointSettings);
}
The key question here is how the optimized execution plan is generated. //TODO
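As a brief detour on that point: the ClusterClient holds a batch Optimizer (its compiler field) that compiles the user's Plan into an OptimizedPlan. The standalone sketch below illustrates that same compilation step outside of the client; it is not the ClusterClient code itself, and the Optimizer constructor arguments are my assumption about the 1.9 batch optimizer API.

import org.apache.flink.api.common.Plan;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.io.DiscardingOutputFormat;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.optimizer.DataStatistics;
import org.apache.flink.optimizer.Optimizer;
import org.apache.flink.optimizer.costs.DefaultCostEstimator;
import org.apache.flink.optimizer.plan.OptimizedPlan;

public class OptimizerSketch {
    public static void main(String[] args) throws Exception {
        // Build a trivial batch program and extract its Plan.
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        env.fromElements(1, 2, 3).output(new DiscardingOutputFormat<>());
        Plan plan = env.createProgramPlan("optimizer-sketch");
        plan.setDefaultParallelism(2);

        // Compile the Plan into an OptimizedPlan, conceptually what getOptimizedPlan() does inside the client.
        Optimizer compiler = new Optimizer(new DataStatistics(), new DefaultCostEstimator(), new Configuration());
        OptimizedPlan optimizedPlan = compiler.compile(plan);

        System.out.println("Optimized plan has " + optimizedPlan.getAllNodes().size() + " nodes");
    }
}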
Next, let's follow run(optPlan, jobWithJars.getJarFiles(), jobWithJars.getClasspaths(), classLoader, savepointSettings).
public JobSubmissionResult run(FlinkPlan compiledPlan,
        List<URL> libraries, List<URL> classpaths, ClassLoader classLoader, SavepointRestoreSettings savepointSettings)
        throws ProgramInvocationException {
    // Build the JobGraph
    JobGraph job = getJobGraph(flinkConfig, compiledPlan, libraries, classpaths, savepointSettings);
    // Submit the job for execution
    return submitJob(job, classLoader);
}
The actual submission logic lives in submitJob(job, classLoader). //TODO
Reference: https://blog.csdn.net/hxcaifly/article/details/87864154