1.生產集羣若是Spark/Yarn,方便集成(docker內部鏈接到現有spark on yarn集羣)
2.自定義鏡像
2.1 work機上安裝python3.7 link到/opt/conda/bin/python
FROM jupyter/all-spark-notebook:2ce7c06a61a1

# Build-time knobs so the HDP version and the HDFS namenode can be overridden
# with --build-arg; defaults reproduce the original image exactly.
ARG HDP_VERSION=2.5.3.0-37
ARG HDFS_NAMENODE=hdfs://192.168.56.103:9000

# Hadoop/Java locations plus the Python interpreter PySpark must use on both
# driver and executors (worker hosts link python3.7 to /opt/conda/bin/python
# so the versions match — see the note above this Dockerfile).
ENV HADOOP_HOME=/usr/local/hadoop \
    JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 \
    HADOOP_CONF_HOME=/usr/local/hadoop/etc/hadoop \
    HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop \
    PYSPARK_PYTHON=/opt/conda/bin/python \
    PYSPARK_DRIVER_PYTHON=/opt/conda/bin/python

USER root

# Hadoop client configuration for the existing Spark-on-YARN cluster.
COPY hadoop /usr/local/hadoop

# Append YARN-cluster settings to spark-defaults.conf in a single layer,
# then hand the file to the notebook user so Spark can read it at runtime.
RUN { \
      echo "spark.driver.extraJavaOptions -Dhdp.version=${HDP_VERSION}"; \
      echo "spark.yarn.am.extraJavaOptions -Dhdp.version=${HDP_VERSION}"; \
      echo "spark.master=yarn"; \
      echo "spark.yarn.jars=${HDFS_NAMENODE}/spark/jars/*"; \
      echo "spark.eventLog.dir=${HDFS_NAMENODE}/spark/logs"; \
      echo "spark.hadoop.yarn.timeline-service.enabled=false"; \
    } >> /usr/local/spark/conf/spark-defaults.conf && \
    chown -R $NB_USER:users /usr/local/spark/conf/spark-defaults.conf

# Register the Toree (Scala) kernel, submitting to YARN in cluster mode.
# timeline-service is disabled because the HDP timeline server is not
# reachable from inside the container.
RUN jupyter toree install --sys-prefix --spark_opts="--master yarn --deploy-mode cluster --driver-memory 512m \
    --executor-memory 512m --executor-cores 1 --driver-java-options -Dhdp.version=${HDP_VERSION} --conf spark.hadoop.yarn.timeline-service.enabled=false"

# Toree writes kernel specs under .local as root; give them back to $NB_USER.
RUN chown -R $NB_USER /home/jovyan/.local

# Cluster worker list and Spark environment overrides.
COPY slaves /usr/local/spark/conf
COPY spark-env.sh /usr/local/spark/conf

# Drop root: the notebook server runs as the unprivileged jovyan user.
USER $NB_USER
3.測試(查看yarn集羣application)(scala/python)