EMR 指南
[PDF]Amazon EMR - 開發人員指南 - Amazon.com
使用API創建EMR
Python依賴包安裝
pip install boto3
API參考
EMR 集羣主要採用 boto3 的 EMR.Client.run_job_flow API 進行集羣創建，詳細描述請參考官方文檔。
樣例代碼
# -*- encoding:utf8 -*-
"""
author: quanbin_zhu
time : 2017/11/13 15:19
"""
import re
import boto3
class EmrClient(object):
    """Thin wrapper around the boto3 EMR client for creating a cluster
    (job flow), submitting steps to it, and describing it.

    Credentials and cluster parameters are hard-coded placeholders here;
    replace them with real values (or load them from the environment)
    before use.
    """

    def __init__(self):
        # EMR release label, see
        # http://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-release-components.html
        self.emr_version = "emr-5.9.0"
        # S3 path where EMR writes cluster logs
        self.emr_log_url = "s3://dev-xxx/borey-zhu/sparklog"
        # Name of an existing EC2 key pair for SSH access to the master node
        self.aws_ec2_key = "borey.zhu"
        self.aws_access_key = "xxxxxxxxxxxxxxxxxxxxxxxx"
        self.aws_secret_key = "xxxxxxxxxxxxxxxxxxxxxxxx+xxxxxxxxxx"
        self.client = boto3.client('emr', region_name='cn-north-1',
                                   aws_access_key_id=self.aws_access_key,
                                   aws_secret_access_key=self.aws_secret_key)
        # Remembered id of the last cluster created via run_job_flow();
        # None until a cluster has been created.
        self.job_flow_id = None

    def generate_step(self, step_name, step_command):
        """Build an EMR step dict that runs *step_command* via command-runner.jar.

        :param step_name: display name of the step.
        :param step_command: shell-like command string; it is split on
            whitespace (quoting is NOT honored, so arguments must not
            contain embedded spaces).
        :returns: a dict in the shape expected by run_job_flow /
            add_job_flow_steps ``Steps``.
        :raises ValueError: if the command is empty or whitespace-only.
        """
        # Strip first: multi-line commands (e.g. a triple-quoted
        # spark-submit invocation) start/end with whitespace, and splitting
        # without stripping would inject empty-string arguments.
        command = step_command.strip()
        if not command:
            # NOTE: the previous `if not cmds` check was dead code, since
            # re.split always returns at least one element.
            raise ValueError("step_command must contain at least one token")
        cmds = re.split(r'\s+', command)
        return {
            'Name': step_name,
            'ActionOnFailure': 'CANCEL_AND_WAIT',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': cmds
            }
        }

    def add_job_flow_steps(self, steps, flow_id=None):
        """Submit *steps* to an existing cluster, or create a new one.

        Target resolution order: explicit *flow_id* argument, then the
        cached ``self.job_flow_id``; if neither is available a brand-new
        cluster is created via :meth:`run_job_flow` with the given steps.
        """
        if flow_id:
            return self.client.add_job_flow_steps(JobFlowId=flow_id, Steps=steps)
        elif self.job_flow_id:
            return self.client.add_job_flow_steps(JobFlowId=self.job_flow_id, Steps=steps)
        else:
            return self.run_job_flow(steps)

    def run_job_flow(self, steps=None):
        """Create a new EMR cluster running Hadoop + Spark and return the
        raw run_job_flow response. Caches the new cluster id on
        ``self.job_flow_id`` as a side effect.
        """
        response = self.client.run_job_flow(
            # Cluster name
            Name="SnifferAnalyse",
            # Cluster release label
            # http://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-release-components.html
            ReleaseLabel=self.emr_version,
            # S3 location for cluster logs
            LogUri=self.emr_log_url,
            Instances={
                # Instance types, see
                # https://amazonaws-china.com/cn/ec2/purchasing-options/dedicated-instances/
                'MasterInstanceType': 'm3.xlarge',
                'SlaveInstanceType': 'm3.xlarge',
                # Total instance count (master + core nodes)
                'InstanceCount': 9,
                # Keep the cluster running when there are no steps:
                #   True  - cluster switches to WAITING
                #   False - cluster shuts down after the last step
                'KeepJobFlowAliveWhenNoSteps': False,
                # Termination protection:
                #   True - prevents termination via API calls, user
                #   intervention, or job-flow errors
                'TerminationProtected': False,
                # Existing EC2 key pair for SSH access to the master node
                'Ec2KeyName': self.aws_ec2_key,
                # Subnet id of the Amazon VPC to launch the cluster in
                'Ec2SubnetId': 'subnet-xxxxxxx'
            },
            Applications=[
                {
                    'Name': 'Hadoop'
                },
                {
                    'Name': 'Spark'
                },
            ],
            BootstrapActions=[
                {
                    # Maximize YARN resource allocation for Spark
                    'Name': 'Spark Default Config',
                    'ScriptBootstrapAction': {
                        # Initialization shell for spark, you will find the doc in
                        # https://github.com/awslabs/emr-bootstrap-actions/tree/master/spark
                        'Path': 's3://support.elasticmapreduce/spark/maximize-spark-default-config',
                    }
                },
            ],
            Steps=steps,
            # Visible only to the creating account
            VisibleToAllUsers=False,
            # IAM role assumed by the EC2 instances
            JobFlowRole='EMR_EC2_DefaultRole',
            # IAM role assumed by the EMR service
            ServiceRole='EMR_DefaultRole'
        )
        self.job_flow_id = response['JobFlowId']
        return response

    def describe_cluster(self, jobFlowId=None):
        """Describe the cluster identified by *jobFlowId*, falling back to
        the cached ``self.job_flow_id``. Returns None when neither is set.
        """
        job_id = jobFlowId if jobFlowId else self.job_flow_id
        if job_id:
            # BUGFIX: previously passed the raw `jobFlowId` argument here,
            # so calling with no argument sent ClusterId=None even when a
            # cached job_flow_id existed.
            return self.client.describe_cluster(
                ClusterId=job_id
            )
if __name__ == "__main__":
    emr = EmrClient()
    steps = []
    # Step 1: copy the application jar from S3 onto the master node.
    steps.append(
        emr.generate_step("Load spark project from S3", "hadoop fs -get s3://dev-xxx/borey-zhu/spark/example.jar /home/hadoop/spark-example.jar")
    )
    # Step 2: run the Spark job. The multi-line string is whitespace-split
    # by generate_step into spark-submit arguments.
    spark_submit_cmd = """
        spark-submit --master yarn
        --packages org.apache.hbase:hbase-hadoop-compat:1.3.0,org.apache.hbase:hbase-server:1.3.0,org.apache.hbase:hbase-common:1.3.0,org.apache.hbase:hbase-client:1.3.0 --repositories http://maven.aliyun.com/nexus/content/groups/public/
        --driver-memory 2G
        --executor-cores 2
        --num-executors 32
        --executor-memory 2200M
        --conf spark.memory.fraction=0.75
        --conf spark.memory.storageFraction=0.1
        --conf spark.serializer=org.apache.spark.serializer.KryoSerializer
        --conf spark.driver.extraJavaOptions=-XX:+UseCompressedOops
        --conf spark.executor.extraJavaOptions=-XX:+UseCompressedOops
        --conf mapreduce.input.fileinputformat.list-status.num-threads=2
        --class com.borey.spark.SparkTest
        /home/hadoop/spark-example.jar
    """
    steps.append(
        emr.generate_step("Spark Example", spark_submit_cmd),
    )
    # No cluster exists yet, so this creates one with both steps attached.
    response = emr.add_job_flow_steps(steps)
    # BUGFIX: `print response` is a Python 2 print statement and a syntax
    # error on Python 3; the call form works on both.
    print(response)