Creating an EMR Cluster with the Python Boto3 API


Creating EMR with the API

Installing the Python dependency

pip install boto3
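
The sample below passes access keys directly to boto3.client. Alternatively, boto3 can resolve credentials from a shared profile or environment variables, which keeps keys out of source code. A minimal sketch, assuming a profile named dev created with `aws configure --profile dev` (the profile name is an assumption, not part of the original sample):

import boto3

# Assumption: a local profile named "dev" holds the access keys;
# boto3 resolves them itself, so no keys appear in the source.
session = boto3.Session(profile_name='dev', region_name='cn-north-1')
emr_client = session.client('emr')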

API Reference

EMR clusters are created mainly through the boto3 EMR.Client.run_job_flow API; see the official documentation for a detailed description of its parameters.
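
Once a cluster is created, its state can be checked even without knowing its ID up front. A minimal sketch using EMR.Client.list_clusters (region and credential setup as in the sample below):

import boto3

client = boto3.client('emr', region_name='cn-north-1')

# List clusters that are starting, running, or idle; handy for
# confirming that a run_job_flow call actually produced a cluster.
resp = client.list_clusters(ClusterStates=['STARTING', 'RUNNING', 'WAITING'])
for cluster in resp['Clusters']:
    print(cluster['Id'], cluster['Name'], cluster['Status']['State'])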

Sample Code

# -*- coding: utf-8 -*-
"""
    author: quanbin_zhu
    time  : 2017/11/13 15:19
"""

import re
import boto3

class EmrClient(object):

    def __init__(self):
        self.emr_version = "emr-5.9.0"
        self.emr_log_url = "s3://dev-xxx/borey-zhu/sparklog"
        self.aws_ec2_key = "borey.zhu"
        self.aws_access_key = "xxxxxxxxxxxxxxxxxxxxxxxx"
        self.aws_secret_key = "xxxxxxxxxxxxxxxxxxxxxxxx+xxxxxxxxxx"
        self.client = boto3.client('emr', region_name='cn-north-1', aws_access_key_id=self.aws_access_key,
                                   aws_secret_access_key=self.aws_secret_key)
        self.job_flow_id = None

    def generate_step(self, step_name, step_command):
        # Strip first so the leading/trailing whitespace of a multi-line
        # command string does not produce empty arguments after the split.
        cmds = re.split(r'\s+', step_command.strip())
        if not cmds or not cmds[0]:
            raise ValueError("step_command must not be empty")
        return {
            'Name': step_name,
            'ActionOnFailure': 'CANCEL_AND_WAIT',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': cmds
            }
        }

    def add_job_flow_steps(self, steps, flow_id=None):
        if flow_id:
            return self.client.add_job_flow_steps(JobFlowId=flow_id, Steps=steps)

        elif self.job_flow_id:
            return self.client.add_job_flow_steps(JobFlowId=self.job_flow_id, Steps=steps)

        else:
            return self.run_job_flow(steps)

    def run_job_flow(self, steps=None):
        response = self.client.run_job_flow(
            # Cluster name
            Name="SnifferAnalyse",
            # Release label, see http://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-release-components.html
            ReleaseLabel=self.emr_version,
            # Amazon S3 path for the cluster logs
            LogUri=self.emr_log_url,
            Instances={
                # Instance types
                # https://amazonaws-china.com/cn/ec2/purchasing-options/dedicated-instances/
                'MasterInstanceType': 'm3.xlarge',
                'SlaveInstanceType':  'm3.xlarge',

                # Number of instances
                'InstanceCount': 9,

                # Keep the EMR cluster running when there are no steps
                # True  - switch to WAITING
                # False - shut down the cluster
                'KeepJobFlowAliveWhenNoSteps': False,

                # Termination protection
                # True  - lock the cluster so it cannot be terminated by API calls, user intervention, or job-flow errors
                'TerminationProtected': False,

                # Name of an existing EC2 key pair for SSH access to the master node of the EMR cluster
                'Ec2KeyName': self.aws_ec2_key,

                # Subnet ID of the cluster's Amazon Virtual Private Cloud (VPC)
                'Ec2SubnetId': 'subnet-xxxxxxx'
            },
            Applications=[
                {
                    'Name': 'Hadoop'
                },
                {
                    'Name': 'Spark'
                },
            ],
            BootstrapActions=[
                {
                    # Maximize YARN resource allocation for Spark
                    'Name': 'Spark Default Config',
                    'ScriptBootstrapAction': {
                        # Initialization shell for spark, you will find the doc in
                        # https://github.com/awslabs/emr-bootstrap-actions/tree/master/spark
                        'Path': 's3://support.elasticmapreduce/spark/maximize-spark-default-config',
                    }
                },
            ],
            Steps=steps or [],  # boto3 rejects Steps=None, so default to an empty list
            # Visible only to the account that created it
            VisibleToAllUsers=False,
            # IAM role assumed by the EC2 instances (instance profile)
            JobFlowRole='EMR_EC2_DefaultRole',
            # IAM role for the EMR service
            ServiceRole='EMR_DefaultRole'
        )
        self.job_flow_id = response['JobFlowId']

        return response

    def describe_cluster(self, jobFlowId=None):
        job_id = jobFlowId if jobFlowId else self.job_flow_id
        if job_id:
            # Use the resolved ID rather than the (possibly None) argument.
            return self.client.describe_cluster(
                ClusterId=job_id
            )

if __name__ == "__main__":

    emr = EmrClient()

    steps = []

    steps.append(
        emr.generate_step("Load spark project from S3", "hadoop fs -get s3://dev-xxx/borey-zhu/spark/example.jar /home/hadoop/spark-example.jar")
    )

    spark_submit_cmd = """
            spark-submit --master yarn  
                --packages org.apache.hbase:hbase-hadoop-compat:1.3.0,org.apache.hbase:hbase-server:1.3.0,org.apache.hbase:hbase-common:1.3.0,org.apache.hbase:hbase-client:1.3.0 --repositories http://maven.aliyun.com/nexus/content/groups/public/ 
                --driver-memory 2G 
                --executor-cores 2 
                --num-executors 32 
                --executor-memory 2200M 
                --conf spark.memory.fraction=0.75
                --conf spark.memory.storageFraction=0.1
                --conf spark.serializer=org.apache.spark.serializer.KryoSerializer
                --conf spark.driver.extraJavaOptions=-XX:+UseCompressedOops
                --conf spark.executor.extraJavaOptions=-XX:+UseCompressedOops
                --conf mapreduce.input.fileinputformat.list-status.num-threads=2
                --class com.borey.spark.SparkTest
                /home/hadoop/spark-example.jar
"""

    steps.append(
        emr.generate_step("Spark Example", spark_submit_cmd)
    )


    response = emr.add_job_flow_steps(steps)
    print(response)
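
Since KeepJobFlowAliveWhenNoSteps is False, the cluster shuts itself down once all steps finish. To block until then, boto3 ships built-in EMR waiters; a sketch reusing the emr object from the example above (looking the step IDs up via list_steps is one possible wiring, not part of the original sample):

# Wait for every submitted step to finish, using boto3's built-in EMR waiters.
step_ids = [s['Id'] for s in emr.client.list_steps(ClusterId=emr.job_flow_id)['Steps']]
step_waiter = emr.client.get_waiter('step_complete')
for step_id in step_ids:
    step_waiter.wait(ClusterId=emr.job_flow_id, StepId=step_id)

# KeepJobFlowAliveWhenNoSteps=False means the cluster terminates on its
# own after the last step; this waiter returns once it does.
emr.client.get_waiter('cluster_terminated').wait(ClusterId=emr.job_flow_id)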
