1、spark-env（spark-env.sh）
export SPARK_LOCAL_DIRS=/home/hadoop/spark/tmp
export SPARK_HOME=/usr/install/spark
2、spark-default（spark-defaults.conf）
// This requires spark.shuffle.service.enabled to be set. The following configurations are also relevant: spark.dynamicAllocation.minExecutors, spark.dynamicAllocation.maxExecutors, and spark.dynamicAllocation.initialExecutors
spark.dynamicAllocation.enabled true
spark.shuffle.service.enabled true
spark.dynamicAllocation.minExecutors 0
spark.dynamicAllocation.maxExecutors 20
spark.dynamicAllocation.executorIdleTimeout 120s
spark.dynamicAllocation.cachedExecutorIdleTimeout 1800s
spark.shuffle.service.port 7338
spark.shuffle.io.connectionTimeout 600s
spark.yarn.jars hdfs://master:9000/user/yarn_jars/spark2.0/*
spark.yarn.executor.memoryOverhead 3g
spark.driver.memory 3g
spark.yarn.am.memory 3g
spark.executor.memory 8g
spark.executor.cores 3
spark.yarn.queue test
spark.ui.enabled true
spark.port.maxRetries 50
spark.locality.wait 0s
spark.master yarn
應用程序上載到HDFS的複製份數
spark.yarn.submit.file.replication 3
spark.yarn.am.waitTime 100s
設置爲true,在job結束後,將stage相關的文件保留而不是刪除。 (一般無需保留,設置成false)
spark.preserve.staging.files false
Spark application master給YARN ResourceManager 發送心跳的時間間隔(ms)
spark.yarn.scheduler.heartbeat.interval-ms 5000
僅適用於HashShuffleMananger的實現,同樣是爲了解決生成過多文件的問題,採用的方式是在不同批次運行的Map任務之間重用Shuffle輸出文件,也就是說合並的是不同批次的Map任務的輸出數據,但是每個Map任務所需要的文件還是取決於Reduce分區的數量,因此,它並不減少同時打開的輸出文件的數量,因此對內存使用量的減少並沒有幫助。只是HashShuffleManager裏的一個折中的解決方案。
spark.shuffle.consolidateFiles true
spark.serializer org.apache.spark.serializer.KryoSerializer
spark.executor.extraJavaOptions -XX:+PrintGCDetails -XX:+PrintGCTimeStamps
spark.driver.cores 1
spark.driver.maxResultSize 1g
spark.driver.memory 1g
spark.executor.memory 1g
//including map output files and RDDs that get stored on disk
spark.local.dir /tmp
spark.submit.deployMode client/cluster
spark.reducer.maxSizeInFlight 48m
spark.shuffle.compress true
spark.shuffle.file.buffer 32k
spark.shuffle.io.maxRetries 3
spark.shuffle.io.preferDirectBufs true
spark.shuffle.io.retryWait 5s
//This must be enabled if spark.dynamicAllocation.enabled is "true".
spark.shuffle.service.enabled false
spark.shuffle.service.port 7337
//在sort-shuffle裏面如果沒有map-side 聚合,避免合併排序數據,最多允許有這麼多分區
spark.shuffle.sort.bypassMergeThreshold 200
spark.shuffle.spill.compress true
spark.io.compression.codec lz4
org.apache.spark.io.LZ4CompressionCodec, org.apache.spark.io.LZFCompressionCodec, and org.apache.spark.io.SnappyCompressionCodec.
spark.broadcast.compress true
spark.io.compression.snappy.blockSize 32k
spark.io.compression.lz4.blockSize 32k
spark.kryoserializer.buffer.max 64m
spark.kryoserializer.buffer 64k
spark.rdd.compress false
spark.memory.fraction 0.6
spark.memory.storageFraction 0.5
spark.memory.offHeap.enabled false
spark.memory.offHeap.size 0
spark.executor.cores 1
spark.default.parallelism 2
spark.executor.heartbeatInterval 10s
spark.files.useFetchCache true
spark.storage.memoryMapThreshold 2m
//This config will be used in place of spark.core.connection.ack.wait.timeout, spark.storage.blockManagerSlaveTimeoutMs, spark.shuffle.io.connectionTimeout, spark.rpc.askTimeout or spark.rpc.lookupTimeout
spark.network.timeout 120s
spark.cores.max (not set)
spark.locality.wait 3s
//Useful for multi-user services.
spark.scheduler.mode FIFO
//任務推測機制
spark.speculation false
//檢查任務推測的頻率
spark.speculation.interval 100ms
//任務慢多少倍開始推測
//完成任務的百分比 開始啓用
spark.speculation.quantile 0.75
spark.speculation.multiplier 1.5
spark.sql.autoBroadcastJoinThreshold -1
spark.sql.shuffle.partitions 800
spark.shuffle.manager tungsten-sort
//Spark SQL在每次執行時，先把SQL查詢編譯成JAVA字節碼。針對執行時間長的SQL查詢或頻繁執行的SQL查詢，此配置能加快查詢速度，因爲它產生特殊的字節碼去執行。但是針對很短的查詢，可能會增加開銷，因爲它必須先編譯每一個查詢
spark.sql.codegen true
//shuffle默認情況下的文件數據爲map tasks * reduce tasks,通過設置其爲true,可以使spark合併shuffle的中間文件爲reduce的tasks數目
spark.shuffle.consolidateFiles true