# Submit spark_demo.py to YARN in client deploy mode. The combined "yarn-client"
# master URL has been deprecated since Spark 2.0; the master and the deploy mode
# are passed as separate options instead.
spark-submit --master yarn --deploy-mode client --num-executors 8 --driver-memory 4g --executor-memory 2g spark_demo.py
# Session profile "version_1": Kryo serialization, dynamic allocation backed by
# the shuffle service, and relaxed RPC / network timeouts.
# NOTE(review): when the script is launched through spark-submit in client mode
# the driver JVM is already running, so spark.driver.memory set here may be
# ignored -- pass --driver-memory on the command line instead (confirm for your
# deploy mode).
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("version_1")
    # Explicit unit: a bare number is read as milliseconds, which would make
    # executors heartbeat every 120 ms. Keep this well below
    # spark.network.timeout (240s below).
    .config("spark.executor.heartbeatInterval", "120s")
    .config("spark.debug.maxToStringFields", 1000)
    # Serialization.
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer.max", "256m")
    # RPC, port binding and scheduler start-up waits.
    .config("spark.rpc.message.maxSize", 256)
    .config("spark.port.maxRetries", 32)
    .config("spark.rpc.numRetries", 9)
    .config("spark.rpc.retry.wait", "9s")
    .config("spark.scheduler.maxRegisteredResourcesWaitingTime", "120s")
    # Dynamic allocation between 1 and 108 executors.
    .config("spark.shuffle.service.enabled", "true")
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.maxExecutors", 108)
    .config("spark.dynamicAllocation.minExecutors", 1)
    .config("spark.dynamicAllocation.schedulerBacklogTimeout", "10s")
    .config("spark.dynamicAllocation.executorIdleTimeout", "120s")
    .config("spark.network.timeout", "240s")
    # Executor and driver sizing.
    .config("spark.executor.memory", "4g")
    .config("spark.executor.memoryOverhead", "2g")
    .config("spark.driver.cores", 16)
    .config("spark.driver.memory", "64g")
    .config("spark.driver.maxResultSize", "4g")
    # Python worker processes.
    .config("spark.python.worker.memory", "1g")
    .config("spark.python.worker.reuse", "true")
    .enableHiveSupport()
    .getOrCreate()
)
# Session profile "version_2": Kryo serialization, longer network timeouts,
# more shuffle retries and Arrow-based pandas conversion.
# NOTE(review): when the script is launched through spark-submit in client mode
# the driver JVM is already running, so spark.driver.memory set here may be
# ignored -- pass --driver-memory on the command line instead (confirm for your
# deploy mode).
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("version_2")
    # Serialization and plan/debug output.
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer.max", "512m")
    .config("spark.debug.maxToStringFields", 1024)
    .config("spark.broadcast.blockSize", "4m")
    # Explicit unit: a bare number is read as milliseconds, which would make
    # executors heartbeat every 240 ms. Keep this well below
    # spark.network.timeout (360s below).
    .config("spark.executor.heartbeatInterval", "240s")
    .config("spark.network.timeout", "360s")
    # RPC and port binding.
    .config("spark.rpc.message.maxSize", 256)
    .config("spark.port.maxRetries", 32)
    .config("spark.rpc.numRetries", 24)
    .config("spark.rpc.retry.wait", "3s")
    # Shuffle I/O.
    .config("spark.shuffle.io.numConnectionsPerPeer", 3)
    .config("spark.shuffle.io.maxRetries", 9)
    .config("spark.sql.execution.arrow.enabled", "true")
    .config("spark.scheduler.maxRegisteredResourcesWaitingTime", "120s")
    # Driver and executor sizing.
    .config("spark.driver.cores", 16)
    .config("spark.driver.memory", "64g")
    .config("spark.driver.maxResultSize", "4g")
    .config("spark.driver.memoryOverhead", "2G")
    .config("spark.executor.memory", "4g")
    .config("spark.executor.memoryOverhead", "2g")
    .enableHiveSupport()
    .getOrCreate()
)
# Python 2 only: force the process-wide default string encoding to UTF-8.
# reload(sys) is needed because Python 2 removes sys.setdefaultencoding during
# start-up. Python 3 has neither the builtin reload() nor
# sys.setdefaultencoding (its default encoding is already UTF-8), so the hack
# is skipped there instead of raising NameError.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf-8')
def _build_version_3_session():
    """Create (or fetch) the Hive-enabled "version_3" session on YARN.

    Every setting is applied through ``builder.config(key, value)`` in the
    order listed, then Hive support is enabled and the session is obtained
    with ``getOrCreate()``.
    """
    settings = (
        # Serialization and debug output.
        ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
        ("spark.kryoserializer.buffer.max", "512m"),
        ("spark.debug.maxToStringFields", 1024),
        ("spark.sql.execution.arrow.enabled", "true"),
        # Driver sizing.
        ("spark.driver.cores", 4),
        ("spark.driver.memory", "128g"),
        ("spark.driver.maxResultSize", "2g"),
        ("spark.driver.memoryOverhead", "4g"),
        # Executor sizing.
        ("spark.executor.cores", 4),
        ("spark.executor.memory", "6g"),
        ("spark.executor.memoryOverhead", "4g"),
        # Scale-out limits and housekeeping.
        ("spark.dynamicAllocation.maxExecutors", 1000),
        ("spark.default.parallelism", 1000),
        ("spark.cleaner.periodicGC.interval", "10min"),
    )
    builder = SparkSession.builder.master("yarn").appName("version_3")
    for key, value in settings:
        builder = builder.config(key, value)
    return builder.enableHiveSupport().getOrCreate()


spark = _build_version_3_session()
from __future__ import division, print_function, absolute_import
import sys, time, datetime
from pyspark.sql import SparkSession
import pyspark.sql.functions as fn
# Python 2 only: force the process-wide default string encoding to UTF-8.
# reload(sys) is needed because Python 2 removes sys.setdefaultencoding during
# start-up. Python 3 has neither the builtin reload() nor
# sys.setdefaultencoding (its default encoding is already UTF-8), so the hack
# is skipped there instead of raising NameError.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf-8')
# Hive-enabled session "xxx" on YARN; settings are grouped by concern.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("xxx")
    # Serialization and debug output.
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer.max", "1024m")
    .config("spark.debug.maxToStringFields", 1024)
    # Driver sizing.
    .config("spark.driver.cores", 4)
    .config("spark.driver.memory", "16g")
    .config("spark.driver.maxResultSize", "4g")
    .config("spark.driver.memoryOverhead", "4g")
    # Executor sizing.
    .config("spark.executor.cores", 4)
    .config("spark.executor.memory", "16g")
    .config("spark.executor.memoryOverhead", "4g")
    # Scale-out limits and housekeeping.
    .config("spark.dynamicAllocation.maxExecutors", 1000)
    .config("spark.default.parallelism", 1000)
    .config("spark.cleaner.periodicGC.interval", "10min")
    .enableHiveSupport()
    .getOrCreate()
)
def getDate(fakeToday='today', moveDays=0, isReturnPt=False):
    """Return the day ``moveDays`` days away from a reference day, as a string.

    Args:
        fakeToday: ``'today'`` to use the current local date, otherwise a
            ``'%Y-%m-%d'`` string naming the reference day.
        moveDays: Number of days added to the reference day (negative values
            move backwards).
        isReturnPt: If true the result is formatted as ``'%Y%m%d'``, otherwise
            as ``'%Y-%m-%d'``.

    Returns:
        The shifted day formatted as described above.
    """
    if fakeToday == 'today':
        base_day = datetime.date.today()
    else:
        base_day = datetime.datetime.strptime(fakeToday, '%Y-%m-%d')

    shifted_day = base_day + datetime.timedelta(days=moveDays)

    if isReturnPt:
        return shifted_day.strftime('%Y%m%d')
    return shifted_day.strftime('%Y-%m-%d')
def ptToDate(pt, isDetial=False, isMinTime=True):
    """Convert a ``'%Y%m%d'`` string into a dashed date or date-time string.

    Args:
        pt: Day written as ``'%Y%m%d'``.
        isDetial: If true, return the date followed by a time of day;
            otherwise only the first 10 characters (the date) are returned.
        isMinTime: Selects the time of day that is appended: ``' 00:00:00'``
            when true, ``' 23:59:59'`` when false.

    Returns:
        ``'%Y-%m-%d'`` text, optionally followed by the chosen time of day.
    """
    day_text = datetime.datetime.strptime(pt, '%Y%m%d').strftime('%Y-%m-%d')

    if isMinTime:
        stamped = day_text + ' 00:00:00'
    else:
        stamped = day_text + ' 23:59:59'

    if isDetial:
        return stamped
    return stamped[:10]
def toTimestamp(datetime):
    """Convert a date / date-time string to epoch milliseconds.

    The parse format is picked from the length of the input: 8 characters ->
    ``'%Y%m%d'``, 10 -> ``'%Y-%m-%d'``, 19 (or any other length) ->
    ``'%Y-%m-%d %H:%M:%S'``. The conversion goes through ``time.mktime``, so
    the string is interpreted as local time.

    Args:
        datetime: The text to convert. Note that inside this function the name
            refers to the argument, not to the ``datetime`` module.

    Returns:
        Integer number of milliseconds since the epoch.

    Raises:
        ValueError: If the text does not match the selected format.
    """
    full_layout = '%Y-%m-%d %H:%M:%S'
    layout_by_length = {
        8: '%Y%m%d',
        10: '%Y-%m-%d',
        19: full_layout,
    }
    layout = layout_by_length.get(len(datetime), full_layout)
    epoch_seconds = time.mktime(time.strptime(datetime, layout))
    return int(epoch_seconds * 1000)
def getLastMonday(fakeToday='today',includeTodayMonday=False,isReturnPt=False):
    """Return the most recent Monday relative to a reference day, as a string.

    When the reference day is itself a Monday, the Monday one week earlier is
    returned unless ``includeTodayMonday`` is true, in which case the
    reference day itself is returned.

    Args:
        fakeToday: ``'today'`` to use the current local date, otherwise a
            ``'%Y-%m-%d'`` string naming the reference day.
        includeTodayMonday: Whether a reference day that is a Monday counts as
            the answer.
        isReturnPt: If true the result is formatted as ``'%Y%m%d'``, otherwise
            as ``'%Y-%m-%d'``.

    Returns:
        The selected Monday formatted as described above.
    """
    if fakeToday == 'today':
        reference = datetime.date.today()
    else:
        reference = datetime.datetime.strptime(fakeToday, '%Y-%m-%d')

    days_since_monday = reference.weekday()
    if days_since_monday == 0:
        # The reference day is a Monday: stay on it only when asked to.
        days_back = 0 if includeTodayMonday else 7
    else:
        days_back = days_since_monday

    monday = reference - datetime.timedelta(days=days_back)

    if isReturnPt:
        return monday.strftime('%Y%m%d')
    return monday.strftime('%Y-%m-%d')
def timeStat(func):
    """Decorator that prints how many minutes each call to ``func`` took.

    The wrapped function's return value is passed through unchanged. The
    elapsed wall-clock time is measured with ``time.time()``, converted to
    minutes, rounded to two decimals and printed after the call returns
    (nothing is printed if ``func`` raises).

    Args:
        func: The callable to time.

    Returns:
        A wrapper with the same call signature as ``func``; its metadata
        (``__name__``, ``__doc__``, ...) is copied from ``func``.
    """
    import functools
    import time

    # functools.wraps keeps func's name/docstring on the wrapper so that
    # introspection and stacked decorators still see the original function.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        started = time.time()
        result = func(*args, **kwargs)
        finished = time.time()
        print('UseTimeMinutes(fn: {}): '.format(func.__name__), round((finished - started) / 60, 2))
        return result

    return wrapper
import pandas as pd
# 30 consecutive daily values ending at last_1_date, each reduced to the first
# 10 characters of its string form with the dashes removed.
pt_list = [
    ''.join(str(day)[:10].split('-'))
    for day in pd.date_range(end=last_1_date, periods=30, freq='D')
]