spark-core_03: Source Code Analysis of org.apache.spark.launcher.Main

Continuing from the previous article, "spark-core_02: Analysis of the spark-submit and spark-class Scripts".

The main job of launcher.Main is to validate the spark-submit arguments, inject them into the Spark environment, and then return the arguments that SparkSubmit needs, which are finally executed by exec "${CMD[@]}" in the spark-class script.

class Main {
  public static void main(String[] argsArray) throws Exception {
    checkArgument(argsArray.length > 0, "Not enough arguments: missing class name.");
    /**
     * Invoked as:
     * java -cp spark_home/lib/spark-assembly-1.6.0-hadoop2.6.0.jar org.apache.spark.launcher.Main org.apache.spark.deploy.SparkSubmit
     *      --class org.apache.spark.repl.Main --name "Spark shell" --master spark://luyl152:7077
     * This main method ultimately hands "org.apache.spark.deploy.SparkSubmit --class org.apache.spark.repl.Main
     * --name 'Spark shell' --master spark://luyl152:7077" to exec "${CMD[@]}" in spark-class.
     */
    List<String> args = new ArrayList<String>(Arrays.asList(argsArray));
    String className = args.remove(0);

    // You can export SPARK_PRINT_LAUNCH_COMMAND=<any non-empty value> in spark-class or another config file;
    // it causes the cmd, i.e. the value executed by exec "${CMD[@]}" in spark-class, to be printed.
    boolean printLaunchCommand = !isEmpty(System.getenv("SPARK_PRINT_LAUNCH_COMMAND"));

    AbstractCommandBuilder builder; // create the command builder
    // When spark-shell is run, the first argument is SparkSubmit
    if (className.equals("org.apache.spark.deploy.SparkSubmit")) {
      try {
        // Parse the arguments into the corresponding Spark fields, e.g. the value of --class goes into mainClass.
        // Any extra arguments are added to the sparkArgs list of SparkSubmitCommandBuilder.
        builder = new SparkSubmitCommandBuilder(args);

I. Analyzing the code of new SparkSubmitCommandBuilder(args):

// The args here are: --class org.apache.spark.repl.Main --name "Spark shell" --master spark://luyl152:7077
SparkSubmitCommandBuilder(List<String> args) {
  this.sparkArgs = new ArrayList<String>();
  List<String> submitArgs = args;

  // If launched from Python, the first argument is pyspark-shell-main
  if (args.size() > 0 && args.get(0).equals(PYSPARK_SHELL)) {
    this.allowsMixedArguments = true;
    appResource = PYSPARK_SHELL_RESOURCE;
    submitArgs = args.subList(1, args.size());
  // If launched from R, the first argument is sparkr-shell-main
  } else if (args.size() > 0 && args.get(0).equals(SPARKR_SHELL)) {
    this.allowsMixedArguments = true;
    appResource = SPARKR_SHELL_RESOURCE;
    submitArgs = args.subList(1, args.size());
  } else {
    // Neither Python nor R, so allowsMixedArguments is false
    this.allowsMixedArguments = false;
  }

  OptionParser parser = new OptionParser();
  // Assign the values passed in by spark-submit to the corresponding Spark fields,
  // e.g. the value of --class goes into mainClass
  parser.parse(submitArgs); // parse() is implemented in the parent class SparkSubmitOptionParser
  this.printInfo = parser.infoRequested;
}

1. A look at how OptionParser's parent class, SparkSubmitOptionParser, implements parse()

/**
 * Parse a list of spark-submit command line options.
 * <p>
 * See SparkSubmitArguments.scala for a more formal description of available options.
 *
 * @throws IllegalArgumentException If an error is found during parsing.
 *
 * The arguments here are: --class org.apache.spark.repl.Main --name "Spark shell" --master spark://luyl152:7077.
 * The method assigns the values passed in by spark-submit to the corresponding Spark fields,
 * e.g. the value of --class goes into mainClass.
 */
protected final void parse(List<String> args) {
  // spark-submit can also take SparkConf properties via --conf PROP=VALUE; see the end of
  // org.apache.spark.deploy.SparkSubmitArguments, or run spark-submit -h, for the available options.
  // This pattern matches options written in the form --option=value.
  Pattern eqSeparatedOpt = Pattern.compile("(--[^=]+)=(.+)");

  int idx = 0;
  for (idx = 0; idx < args.size(); idx++) {
    String arg = args.get(idx);
    String value = null;

    // For an argument of the form --option=value (e.g. --master=spark://luyl152:7077),
    // split it into the option name and the value.
    Matcher m = eqSeparatedOpt.matcher(arg);
    if (m.matches()) {
      arg = m.group(1);   // the option name, e.g. --master
      value = m.group(2); // the value
    }

    // Look for options with a value.
    // Match the "--" arguments passed to spark-submit, e.g. "--class", against the opts 2-D array;
    // a match returns the canonical option name (e.g. --class), otherwise null.
    String name = findCliOption(arg, opts);
    if (name != null) {
      if (value == null) {
        if (idx == args.size() - 1) {
          // Matched but no value follows, e.g. a bare "--class": size is 1, idx is 0, 1 - 1 = 0, so throw
          throw new IllegalArgumentException(
              String.format("Missing argument for option '%s'.", arg));
        }
        idx++;
        value = args.get(idx); // if a value exists, it is the element at the next index
      }
      // name is a spark-submit option such as --class and value is its value.
      // handle() is overridden in OptionParser itself; it assigns the values passed in by spark-submit
      // to the corresponding Spark fields, e.g. the value of --class goes into mainClass (see below).
      if (!handle(name, value)) {
        break;
      }
      continue; // idx is only advanced past the value when an option matched
    }

    // Look for a switch. If nothing matched above, try flag-style options such as --verbose
    name = findCliOption(arg, switches);
    if (name != null) {
      if (!handle(name, null)) {
        break;
      }
      continue;
    }

    if (!handleUnknown(arg)) {
      break;
    }
  }

  if (idx < args.size()) {
    idx++;
  }
  // Add the remaining arguments to sparkArgs, the list initialized in the
  // SparkSubmitCommandBuilder constructor as this.sparkArgs = new ArrayList<String>()
  handleExtraArgs(args.subList(idx, args.size()));
}
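To make the --option=value handling concrete, here is a minimal standalone sketch (the class name is made up, not part of the Spark source) showing what the eqSeparatedOpt pattern does to such an argument:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Illustrative only: reproduces the regex used by SparkSubmitOptionParser.parse()
// and shows how an "--option=value" argument is split into its two parts.
public class EqSeparatedOptDemo {
  public static void main(String[] args) {
    Pattern eqSeparatedOpt = Pattern.compile("(--[^=]+)=(.+)");
    Matcher m = eqSeparatedOpt.matcher("--master=spark://luyl152:7077");
    if (m.matches()) {
      System.out.println(m.group(1)); // prints: --master
      System.out.println(m.group(2)); // prints: spark://luyl152:7077
    }
  }
}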

===> The handle(name, value) method used above is implemented in OptionParser as follows:

/**
 * Assigns the values passed in by spark-submit to the corresponding Spark fields.
 */
@Override
protected boolean handle(String opt, String value) {
  if (opt.equals(MASTER)) {
    master = value;
  } else if (opt.equals(DEPLOY_MODE)) {
    deployMode = value;
  } else if (opt.equals(PROPERTIES_FILE)) {
    propertiesFile = value;
  } else if (opt.equals(DRIVER_MEMORY)) {
    conf.put(SparkLauncher.DRIVER_MEMORY, value);
  } else if (opt.equals(DRIVER_JAVA_OPTIONS)) {
    conf.put(SparkLauncher.DRIVER_EXTRA_JAVA_OPTIONS, value);
  } else if (opt.equals(DRIVER_LIBRARY_PATH)) {
    conf.put(SparkLauncher.DRIVER_EXTRA_LIBRARY_PATH, value);
  } // ... the remaining options are handled in the same way and are omitted here
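The findCliOption() helper called twice in parse() above simply scans a table of known option names. A simplified sketch of that lookup (illustrative only, not the exact Spark code): each row of the 2-D table holds the equivalent spellings of one option, and the first entry of a matching row is returned, otherwise null.

// Illustrative sketch of the lookup performed by findCliOption (assumed shape,
// the real implementation lives in SparkSubmitOptionParser).
private String findCliOption(String name, String[][] available) {
  for (String[] candidates : available) {
    for (String candidate : candidates) {
      if (candidate.equals(name)) {
        return candidates[0]; // canonical name, e.g. --class
      }
    }
  }
  return null; // no known option matched
}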


2. At this point the new SparkSubmitCommandBuilder(args) constructor has returned; we continue with the rest of launcher.Main#main():

      } catch (IllegalArgumentException e) {
        // Fallback path for when constructing SparkSubmitCommandBuilder throws
        printLaunchCommand = false;
        System.err.println("Error: " + e.getMessage());
        System.err.println();

        MainClassOptionParser parser = new MainClassOptionParser();
        try {
          parser.parse(args);
        } catch (Exception ignored) {
          // Ignore parsing exceptions.
        }

        List<String> help = new ArrayList<String>();
        if (parser.className != null) {
          help.add(parser.CLASS);
          help.add(parser.className);
        }
        help.add(parser.USAGE_ERROR);
        builder = new SparkSubmitCommandBuilder(help);
      }
    } else {
      // If the first argument is not org.apache.spark.deploy.SparkSubmit, use SparkClassCommandBuilder as the parser
      builder = new SparkClassCommandBuilder(className, args);
    }

    Map<String, String> env = new HashMap<String, String>();
    // Build and return all of the JVM- and SparkSubmit-related arguments
    List<String> cmd = builder.buildCommand(env);

II. A look at the SparkSubmitCommandBuilder.buildCommand(Map) method

@Override
public List<String> buildCommand(Map<String, String> env) throws IOException {
  // PYSPARK_SHELL_RESOURCE means Python, SPARKR_SHELL_RESOURCE means R
  if (PYSPARK_SHELL_RESOURCE.equals(appResource) && !printInfo) {
    return buildPySparkShellCommand(env);
  } else if (SPARKR_SHELL_RESOURCE.equals(appResource) && !printInfo) {
    return buildSparkRCommand(env);
  } else {
    // env is still an empty Map here, so buildSparkSubmitCommand() is called
    return buildSparkSubmitCommand(env);
  }
}

1. A look at buildSparkSubmitCommand(env)


private List<String> buildSparkSubmitCommand(Map<String, String> env) throws IOException {
  // Load the properties file and check whether spark-submit will be running the app's driver
  // or just launching a cluster app. When running the driver, the JVM's arguments will be
  // modified to cover the driver's configuration.
  Map<String, String> config = getEffectiveConfig();
  boolean isClientMode = isClientMode(config);

  // For standalone mode, unless --deploy-mode cluster is given, the deploy mode defaults to client, so this is true.
  // In client mode DRIVER_EXTRA_CLASSPATH cannot be set directly in SparkConf, because the driver's JVM
  // has already been started by spark-submit via reflection; it has to be set with --driver-class-path instead.
  String extraClassPath = isClientMode ? config.get(SparkLauncher.DRIVER_EXTRA_CLASSPATH) : null;

  List<String> cmd = buildJavaCommand(extraClassPath);
  // Take Thrift Server as daemon
  if (isThriftServer(mainClass)) {
    addOptionString(cmd, System.getenv("SPARK_DAEMON_JAVA_OPTS"));
  }
  // SPARK_SUBMIT_OPTS was mentioned in the spark-shell article: the Java classpath has to be handed to Scala
  // manually via SPARK_SUBMIT_OPTS="$SPARK_SUBMIT_OPTS -Dscala.usejavacp=true"
  addOptionString(cmd, System.getenv("SPARK_SUBMIT_OPTS"));
  addOptionString(cmd, System.getenv("SPARK_JAVA_OPTS"));

  if (isClientMode) {
    // Figuring out where the memory value comes from is a little tricky due to precedence.
    // Precedence is observed in the following order:
    // - explicit configuration (setConf()), which also covers the --driver-memory cli argument.
    // - properties file.
    // - SPARK_DRIVER_MEMORY env variable
    // - SPARK_MEM env variable
    // - default value (1g)
    // Take Thrift Server as daemon
    String tsMemory =
      isThriftServer(mainClass) ? System.getenv("SPARK_DAEMON_MEMORY") : null;
    String memory = firstNonEmpty(tsMemory, config.get(SparkLauncher.DRIVER_MEMORY),
      System.getenv("SPARK_DRIVER_MEMORY"), System.getenv("SPARK_MEM"), DEFAULT_MEM);
    cmd.add("-Xms" + memory); // min and max heap size both default to 1g
    cmd.add("-Xmx" + memory);
    addOptionString(cmd, config.get(SparkLauncher.DRIVER_EXTRA_JAVA_OPTIONS));
    mergeEnvPathList(env, getLibPathEnvName(),
      config.get(SparkLauncher.DRIVER_EXTRA_LIBRARY_PATH));
  }

  addPermGenSizeOpt(cmd);
  cmd.add("org.apache.spark.deploy.SparkSubmit");
  // buildSparkSubmitArgs() returns a list of the spark-submit arguments and values that were injected above
  cmd.addAll(buildSparkSubmitArgs());
  return cmd;
}

==> At this point the cmd to be executed has been returned; we continue with the rest of launcher.Main#main():


    if (printLaunchCommand) {
      System.err.println("Spark Command: " + join(" ", cmd));
      System.err.println("========================================");
    }

    if (isWindows()) {
      System.out.println(prepareWindowsCommand(cmd, env));
    } else {
      // In bash, use NULL as the arg separator since it cannot be used in an argument.
      // The effective arguments are printed back and picked up by exec "${CMD[@]}" in spark-class.
      /**
       * '\0' and a space are not the same character:
       * '\0' is the string terminator (ASCII 0) that marks the end of a string,
       * while a space is an ordinary printable character (ASCII 32) that can appear inside text.
       */
      List<String> bashCmd = prepareBashCommand(cmd, env);
      for (String c : bashCmd) {
        System.out.print(c);
        System.out.print('\0');
      }
    }
  }
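To see why '\0' is a safe separator here, note that an argument such as --name "Spark shell" contains a space: splitting on spaces would break it into two tokens, while splitting on NUL keeps it intact. A small self-contained sketch (not Spark code) that joins arguments with '\0' and splits them back:

import java.util.Arrays;
import java.util.List;

// Illustrative only: joining arguments with '\0' and splitting on '\0' preserves
// arguments with embedded spaces, which is why launcher.Main prints NUL-separated
// output for spark-class to read back into the CMD array.
public class NulSeparatorDemo {
  public static void main(String[] args) {
    List<String> cmd = Arrays.asList("--name", "Spark shell", "--master", "spark://luyl152:7077");

    StringBuilder out = new StringBuilder();
    for (String c : cmd) {
      out.append(c).append('\0');
    }

    // Recovers exactly the original four arguments, including the one with a space.
    String[] recovered = out.toString().split("\0");
    System.out.println(Arrays.asList(recovered)); // [--name, Spark shell, --master, spark://luyl152:7077]
  }
}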

 

The resulting CMD looks like this:

/usr/local/java/jdk1.8.0_91/bin/java -cp
/data/spark-1.6.0-bin-hadoop2.6/conf/:/data/spark-1.6.0-bin-hadoop2.6/lib/spark-assembly-1.6.0-hadoop2.6.0.jar:/data/spark-1.6.0-bin-hadoop2.6/lib/datanucleus-api-jdo-3.2.6.jar:/data/spark-1.6.0-bin-hadoop2.6/lib/datanucleus-rdbms-3.2.9.jar:/data/spark-1.6.0-bin-hadoop2.6/lib/datanucleus-core-3.2.10.jar:/data/hadoop-2.6.5/etc/hadoop/
-Xms1g -Xmx1g -Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=5005
org.apache.spark.deploy.SparkSubmit
--class org.apache.spark.repl.Main
--name Spark shell
--master spark://luyl152:7077,luyl153:7077,luyl154:7077
--verbose /tool/jarDir/maven_scala-1.0-SNAPSHOT.jar
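As a side note, the same command-building machinery (SparkSubmitCommandBuilder) also backs the public org.apache.spark.launcher.SparkLauncher API, which lets you start an application programmatically instead of going through the shell scripts. A minimal sketch (Spark home, jar path, and main class below are placeholders):

import org.apache.spark.launcher.SparkLauncher;

// Minimal sketch of launching an application through the public launcher API;
// the paths and the main class are placeholders, not values from this article.
public class LauncherApiExample {
  public static void main(String[] args) throws Exception {
    Process spark = new SparkLauncher()
        .setSparkHome("/data/spark-1.6.0-bin-hadoop2.6")
        .setAppResource("/tool/jarDir/maven_scala-1.0-SNAPSHOT.jar")
        .setMainClass("com.example.MyApp")   // placeholder main class
        .setMaster("spark://luyl152:7077")
        .launch();                            // builds and runs the same kind of CMD shown above
    spark.waitFor();
  }
}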

Next, we will analyze the source code of org.apache.spark.deploy.SparkSubmit.