This post follows on from the previous two:
Flink 1.9.0 Job Submission Source Code Reading (1): the flink script
Flink 1.9.0 Job Submission Source Code Reading (2): the entry class CliFrontend
Today we continue reading the Flink 1.9.0 job submission source code with the run() method, which contains the core logic of submitting a Flink job.
The execution logic of run()
The code:
/**
 * Executes the run action.
 *
 * @param args command-line arguments for the run action.
 */
protected void run(String[] args) throws Exception {
    LOG.info("Running 'run' command.");

    // Parse the command line into the run options
    final Options commandOptions = CliFrontendParser.getRunCommandOptions();
    final Options commandLineOptions = CliFrontendParser.mergeOptions(commandOptions, customCommandLineOptions);
    final CommandLine commandLine = CliFrontendParser.parse(commandLineOptions, args, true);
    final RunOptions runOptions = new RunOptions(commandLine);

    // 1. Check whether this is a help request
    if (runOptions.isPrintHelp()) {
        CliFrontendParser.printHelpForRun(customCommandLines);
        return;
    }

    if (!runOptions.isPython()) {
        // Java program should be specified a JAR file
        // 2. Check that the user supplied a JAR file path; if not, throw a CliArgsException
        if (runOptions.getJarFilePath() == null) {
            throw new CliArgsException("Java program should be specified a JAR file.");
        }
    }

    /**
     * 3. Build a PackagedProgram. org.apache.flink.client.program.PackagedProgram operates on the
     * user-specified JAR file to
     *
     * <p>(1) find the program entry point,
     *
     * <p>(2) parse the user code to obtain the job topology, and
     *
     * <p>(3) extract nested libraries.
     *
     * <p>Here it is used to locate the user's program entry point.
     */
    final PackagedProgram program;
    try {
        LOG.info("Building program from JAR file");
        program = buildProgram(runOptions);
    }
    catch (FileNotFoundException e) {
        throw new CliArgsException("Could not build the program from JAR file.", e);
    }

    final CustomCommandLine<?> customCommandLine = getActiveCustomCommandLine(commandLine);
    try {
        // 4. [The key step] Run the program: hand the program entry point and the
        // options prepared above to runProgram
        runProgram(customCommandLine, commandLine, runOptions, program);
    } finally {
        // Delete the temporary files created while extracting nested libraries
        program.deleteExtractedLibraries();
    }
}
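CliFrontendParser is built on top of Apache Commons CLI. The following minimal, self-contained sketch shows roughly what the Options/CommandLine pair above is doing; the two options registered here (-d/--detached and -p/--parallelism) are just illustrative picks, not the full set defined by CliFrontendParser:

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;

public class RunOptionsSketch {
    public static void main(String[] args) throws Exception {
        // Register a small subset of run options (illustrative only)
        Options options = new Options();
        options.addOption(Option.builder("d").longOpt("detached")
                .desc("submit the job in detached mode").build());
        options.addOption(Option.builder("p").longOpt("parallelism").hasArg()
                .desc("parallelism with which to run the program").build());

        // CliFrontendParser.parse(commandLineOptions, args, true) wraps a parser call
        // like this; the final `true` means "stop at the first non-option token",
        // which is how the user's JAR path and program arguments survive untouched
        CommandLine line = new DefaultParser().parse(options, args, true);

        System.out.println("detached:    " + line.hasOption("d"));
        System.out.println("parallelism: " + line.getOptionValue("p", "-1"));
        System.out.println("remaining:   " + String.join(" ", line.getArgs()));
    }
}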
Next we follow the call into runProgram(customCommandLine, commandLine, runOptions, program):
/**
 * Runs the user program.
 *
 * @param customCommandLine the active custom command line
 * @param commandLine the parsed command line
 * @param runOptions the options of the run action
 * @param program the packaged user program
 * @param <T> the type of the cluster id
 * @throws ProgramInvocationException
 * @throws FlinkException
 */
private <T> void runProgram(
        CustomCommandLine<T> customCommandLine,
        CommandLine commandLine,
        RunOptions runOptions,
        PackagedProgram program) throws ProgramInvocationException, FlinkException {
    // Create a ClusterDescriptor from the user's command-line arguments. A ClusterDescriptor
    // describes a cluster deployment target (e.g. YARN, Mesos): it can deploy a cluster there
    // and returns a client for communicating with it.
    final ClusterDescriptor<T> clusterDescriptor = customCommandLine.createClusterDescriptor(commandLine);
    try {
        final T clusterId = customCommandLine.getClusterId(commandLine);

        // The cluster client; ClusterClient encapsulates everything needed to submit a
        // program to a remote cluster.
        final ClusterClient<T> client;

        // directly deploy the job if the cluster is started in job mode and detached
        // i.e. there is no existing cluster id and the -d flag was passed on the command line
        if (clusterId == null && runOptions.getDetachedMode()) {
            // --(1) Determine the parallelism, falling back to the default
            int parallelism = runOptions.getParallelism() == -1 ? defaultParallelism : runOptions.getParallelism();

            // --(2) Parse the user program's DAG topology and build the JobGraph
            final JobGraph jobGraph = PackagedProgramUtils.createJobGraph(program, configuration, parallelism);

            // --(3) Derive the cluster specification from the command line
            final ClusterSpecification clusterSpecification = customCommandLine.getClusterSpecification(commandLine);

            // --(4) Hand the JobGraph, the cluster specification and the run mode to
            // clusterDescriptor.deployJobCluster, which deploys the job to the cluster
            // and returns a ClusterClient
            client = clusterDescriptor.deployJobCluster(
                clusterSpecification,
                jobGraph,
                runOptions.getDetachedMode());

            logAndSysout("Job has been submitted with JobID " + jobGraph.getJobID());

            try {
                // --(5) Shut down the client
                client.shutdown();
            } catch (Exception e) {
                LOG.info("Could not properly shut down the client.", e);
            }
        } else {
            // -- The regular (attached or session) submission path
            // --(1) A shutdown hook for tearing down the cluster: in attached (non-detached)
            // mode the hook is needed to shut the cluster down once the client exits
            final Thread shutdownHook;
            // --(2) If a cluster id was given, retrieve a client for that existing cluster;
            // otherwise deploy a session cluster and, in attached mode, register the hook
            // so the cluster is shut down when the client exits
            if (clusterId != null) {
                client = clusterDescriptor.retrieve(clusterId);
                shutdownHook = null;
            } else {
                // also in job mode we have to deploy a session cluster because the job
                // might consist of multiple parts (e.g. when using collect)
                final ClusterSpecification clusterSpecification = customCommandLine.getClusterSpecification(commandLine);
                client = clusterDescriptor.deploySessionCluster(clusterSpecification);
                // if not running in detached mode, add a shutdown hook to shut down cluster if client exits
                // there's a race-condition here if cli is killed before shutdown hook is installed
                if (!runOptions.getDetachedMode() && runOptions.isShutdownOnAttachedExit()) {
                    shutdownHook = ShutdownHookUtil.addShutdownHook(client::shutDownCluster, client.getClass().getSimpleName(), LOG);
                } else {
                    shutdownHook = null;
                }
            }

            try {
                client.setPrintStatusDuringExecution(runOptions.getStdoutLogging());
                client.setDetached(runOptions.getDetachedMode());

                LOG.debug("Client slots is set to {}", client.getMaxSlots());
                LOG.debug("{}", runOptions.getSavepointRestoreSettings());

                int userParallelism = runOptions.getParallelism();
                LOG.debug("User parallelism is set to {}", userParallelism);
                if (client.getMaxSlots() != MAX_SLOTS_UNKNOWN && userParallelism == -1) {
                    logAndSysout("Using the parallelism provided by the remote cluster ("
                        + client.getMaxSlots() + "). "
                        + "To use another parallelism, set it at the ./bin/flink client.");
                    userParallelism = client.getMaxSlots();
                } else if (ExecutionConfig.PARALLELISM_DEFAULT == userParallelism) {
                    userParallelism = defaultParallelism;
                }

                // The core execution step: delegate to executeProgram
                executeProgram(program, client, userParallelism);
            } finally {
                if (clusterId == null && !client.isDetached()) {
                    // terminate the cluster only if we have started it before and if it's not detached
                    try {
                        client.shutDownCluster();
                    } catch (final Exception e) {
                        LOG.info("Could not properly terminate the Flink cluster.", e);
                    }
                    if (shutdownHook != null) {
                        // we do not need the hook anymore as we have just tried to shutdown the cluster.
                        ShutdownHookUtil.removeShutdownHook(shutdownHook, client.getClass().getSimpleName(), LOG);
                    }
                }
                try {
                    client.shutdown();
                } catch (Exception e) {
                    LOG.info("Could not properly shut down the client.", e);
                }
            }
        }
    } finally {
        try {
            clusterDescriptor.close();
        } catch (Exception e) {
            LOG.info("Could not properly close the cluster descriptor.", e);
        }
    }
}
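The shutdownHook bookkeeping above is worth a closer look. ShutdownHookUtil is a thin wrapper around plain JVM shutdown hooks; a simplified sketch of the pattern is below (the real utility additionally deduplicates hooks and tolerates removal while the JVM is already shutting down; CloseableCluster is an illustrative stand-in for the ClusterClient):

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class ShutdownHookSketch {

    private static final Logger LOG = LoggerFactory.getLogger(ShutdownHookSketch.class);

    /** Illustrative stand-in for the ClusterClient used in runProgram. */
    interface CloseableCluster {
        void shutDownCluster() throws Exception;
    }

    /** Roughly what ShutdownHookUtil.addShutdownHook(...) does. */
    static Thread addShutdownHook(CloseableCluster cluster, String name) {
        Thread hook = new Thread(() -> {
            try {
                cluster.shutDownCluster();
            } catch (Exception e) {
                LOG.info("Could not shut down the cluster from the shutdown hook.", e);
            }
        }, name);
        Runtime.getRuntime().addShutdownHook(hook);
        return hook;
    }

    /** Roughly what ShutdownHookUtil.removeShutdownHook(...) does: once the cluster
     *  has been shut down normally, the hook is no longer needed. */
    static void removeShutdownHook(Thread hook) {
        Runtime.getRuntime().removeShutdownHook(hook);
    }
}

This also explains the race-condition comment in the code: if the CLI process is killed between deploySessionCluster(...) and the hook installation, no hook exists yet and the freshly started session cluster would be left running.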
Next we analyze the logic of executeProgram(program, client, userParallelism):
protected void executeProgram(PackagedProgram program, ClusterClient<?> client, int parallelism) throws ProgramMissingJobException, ProgramInvocationException {
    logAndSysout("Starting execution of program");

    // Run the job
    final JobSubmissionResult result = client.run(program, parallelism);

    if (null == result) {
        throw new ProgramMissingJobException("No JobSubmissionResult returned, please make sure you called " +
            "ExecutionEnvironment.execute()");
    }

    // Check whether an execution result was attached, i.e. the job ran to completion
    if (result.isJobExecutionResult()) {
        logAndSysout("Program execution finished");
        JobExecutionResult execResult = result.getJobExecutionResult();
        System.out.println("Job with JobID " + execResult.getJobID() + " has finished.");
        System.out.println("Job Runtime: " + execResult.getNetRuntime() + " ms");
        Map<String, Object> accumulatorsResult = execResult.getAllAccumulatorResults();
        if (accumulatorsResult.size() > 0) {
            System.out.println("Accumulator Results: ");
            System.out.println(AccumulatorHelper.getResultsFormatted(accumulatorsResult));
        }
    } else {
        logAndSysout("Job has been submitted with JobID " + result.getJobID());
    }
}
Here the ClusterClient runs the packaged program and returns a JobSubmissionResult, which in attached mode also carries the JobExecutionResult of the finished job.
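For context, the ProgramMissingJobException above concerns the user's code, not the client: client.run(...) only yields a JobSubmissionResult if the user program actually triggered an execution. A minimal, illustrative batch job that satisfies this check (class name and paths are made up):

import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;

public class WordCountJob {
    public static void main(String[] args) throws Exception {
        // When run through the CLI, this returns a ContextEnvironment wired to the ClusterClient
        final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        DataSet<String> lines = env.fromElements("to be", "or not to be");
        lines.writeAsText(args.length > 0 ? args[0] : "/tmp/wordcount-out");

        // Without this call (or a sink method that triggers it internally), client.run(...)
        // returns no JobSubmissionResult and CliFrontend throws ProgramMissingJobException
        env.execute("word-count");
    }
}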
The ClusterClient's logic for running the job is as follows:
/**
 * Runs a user-defined JAR program handed over from the CliFrontend. The job runs in either
 * blocking or detached mode, depending on {@code setDetached(true)} or {@code setDetached(false)}.
 * @param prog the packaged program
 * @param parallelism the parallelism with which to run the Flink job
 * @return the execution result
 * @throws ProgramMissingJobException
 * @throws ProgramInvocationException
 */
public JobSubmissionResult run(PackagedProgram prog, int parallelism)
        throws ProgramInvocationException, ProgramMissingJobException {
    Thread.currentThread().setContextClassLoader(prog.getUserCodeClassLoader());

    // 1. The program declares an explicit entry point
    if (prog.isUsingProgramEntryPoint()) {
        final JobWithJars jobWithJars;
        if (hasUserJarsInClassPath(prog.getAllLibraries())) {
            jobWithJars = prog.getPlanWithoutJars();
        } else {
            jobWithJars = prog.getPlanWithJars();
        }
        // Run the main logic
        return run(jobWithJars, parallelism, prog.getSavepointSettings());
    }
    // 2. No explicit entry point: run the program in interactive mode
    else if (prog.isUsingInteractiveMode()) {
        log.info("Starting program in interactive mode (detached: {})", isDetached());

        final List<URL> libraries;
        if (hasUserJarsInClassPath(prog.getAllLibraries())) {
            libraries = Collections.emptyList();
        } else {
            libraries = prog.getAllLibraries();
        }

        ContextEnvironmentFactory factory = new ContextEnvironmentFactory(this, libraries,
            prog.getClasspaths(), prog.getUserCodeClassLoader(), parallelism, isDetached(),
            prog.getSavepointSettings());
        ContextEnvironment.setAsContext(factory);

        try {
            // invoke main method
            prog.invokeInteractiveModeForExecution();
            if (lastJobExecutionResult == null && factory.getLastEnvCreated() == null) {
                throw new ProgramMissingJobException("The program didn't contain a Flink job.");
            }
            if (isDetached()) {
                // in detached mode, we execute the whole user code to extract the Flink job, afterwards we run it here
                return ((DetachedEnvironment) factory.getLastEnvCreated()).finalizeExecute();
            }
            else {
                // in blocking mode, we execute all Flink jobs contained in the user code and then return here
                return this.lastJobExecutionResult;
            }
        }
        finally {
            ContextEnvironment.unsetContext();
        }
    }
    else {
        throw new ProgramInvocationException("PackagedProgram does not have a valid invocation mode.");
    }
}
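A note on the two branches: in Flink 1.9, isUsingProgramEntryPoint() is true when the main class implements the old org.apache.flink.api.common.Program interface, i.e. it hands its Plan to the client directly, while isUsingInteractiveMode() covers the ordinary case of a main() method that calls execute() itself. A minimal illustration of the former (the class is made up):

import org.apache.flink.api.common.Plan;
import org.apache.flink.api.common.Program;
import org.apache.flink.api.java.ExecutionEnvironment;

/** A main class implementing Program takes the isUsingProgramEntryPoint() branch:
 *  it builds a Plan for the client instead of executing the job itself. */
public class PlanEntryPoint implements Program {

    @Override
    public Plan getPlan(String... args) {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        env.fromElements(1, 2, 3).writeAsText(args.length > 0 ? args[0] : "/tmp/plan-out");
        // Turn the sinks defined above into a Plan without executing anything
        return env.createProgramPlan();
    }
}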
We will not consider interactive mode here and restrict ourselves to the case where the program has an explicit entry point, so the interesting call is run(jobWithJars, parallelism, prog.getSavepointSettings()):
/**
 * Runs a program on the Flink cluster through this client. The call blocks until the
 * execution result is returned.
 *
 * @param jobWithJars the job, together with its JAR files
 * @param parallelism the parallelism with which to run the job
 *
 */
public JobSubmissionResult run(JobWithJars jobWithJars, int parallelism, SavepointRestoreSettings savepointSettings)
        throws CompilerException, ProgramInvocationException {
    // Get the user-code class loader
    ClassLoader classLoader = jobWithJars.getUserCodeClassLoader();
    if (classLoader == null) {
        throw new IllegalArgumentException("The given JobWithJars does not provide a usercode class loader.");
    }

    // Produce the optimized execution plan
    OptimizedPlan optPlan = getOptimizedPlan(compiler, jobWithJars, parallelism);

    // Run it
    return run(optPlan, jobWithJars.getJarFiles(), jobWithJars.getClasspaths(), classLoader, savepointSettings);
}
The key question here is how the optimized execution plan is generated. //TODO
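Although the optimizer deserves its own article (hence the TODO), the getOptimizedPlan step itself is small: it applies the requested parallelism to the Plan and hands it to the batch Optimizer. A hedged sketch of that step, modeled on the Flink 1.9 APIs (treat it as illustrative, not as the exact ClusterClient code):

import org.apache.flink.api.common.Plan;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.optimizer.DataStatistics;
import org.apache.flink.optimizer.Optimizer;
import org.apache.flink.optimizer.plan.OptimizedPlan;

public class OptimizeSketch {
    /** Illustrative: how a Plan becomes an OptimizedPlan. */
    static OptimizedPlan optimize(Plan plan, int parallelism, Configuration config) {
        // Apply the requested parallelism if the plan does not set one itself
        if (parallelism > 0 && plan.getDefaultParallelism() <= 0) {
            plan.setDefaultParallelism(parallelism);
        }
        // The batch optimizer chooses execution strategies (shipping, sorting, ...)
        Optimizer compiler = new Optimizer(new DataStatistics(), config);
        return compiler.compile(plan);
    }
}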
Let us follow run(optPlan, jobWithJars.getJarFiles(), jobWithJars.getClasspaths(), classLoader, savepointSettings) further:
public JobSubmissionResult run(FlinkPlan compiledPlan,
        List<URL> libraries, List<URL> classpaths, ClassLoader classLoader, SavepointRestoreSettings savepointSettings)
        throws ProgramInvocationException {
    // Build the JobGraph
    JobGraph job = getJobGraph(flinkConfig, compiledPlan, libraries, classpaths, savepointSettings);
    // Submit the job for execution
    return submitJob(job, classLoader);
}
The submission logic itself lives in submitJob(job, classLoader). //TODO
Reference: https://blog.csdn.net/hxcaifly/article/details/87864154