PySpark configuration settings and common time functions

# The yarn-client master is deprecated since Spark 2.0; use --master yarn with a deploy mode instead
spark-submit --master yarn --deploy-mode client --num-executors 8 --driver-memory 4g --executor-memory 2g spark_demo.py
# Optional configs, kept outside the chain (a comment line inside a
# backslash-continued chain is a SyntaxError):
#   .config("spark.sql.warehouse.dir", "spark.warehouse")
#   .config("spark.sql.execution.arrow.enabled", "true")
spark = SparkSession.builder \
    .master("yarn") \
    .appName("version_1") \
    .config("spark.executor.heartbeatInterval", "120s") \
    .config("spark.debug.maxToStringFields", 1000) \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.kryoserializer.buffer.max", "256m") \
    .config("spark.rpc.message.maxSize",256) \
    .config("spark.port.maxRetries", 32) \
    .config("spark.rpc.numRetries", 9) \
    .config("spark.rpc.retry.wait","9s") \
    .config("spark.scheduler.maxRegisteredResourcesWaitingTime","120s") \
    .config("spark.shuffle.service.enabled","true") \
    .config("spark.dynamicAllocation.enabled","true") \
    .config("spark.dynamicAllocation.maxExecutors",108) \
    .config("spark.dynamicAllocation.minExecutors",1) \
    .config("spark.dynamicAllocation.schedulerBacklogTimeout","10s") \
    .config("spark.dynamicAllocation.executorIdleTimeout","120s") \
    .config("spark.network.timeout","240s") \
    .config("spark.executor.memory", "4g") \
    .config("spark.executor.memoryOverhead", "2g") \
    .config("spark.driver.cores", 16) \
    .config("spark.driver.memory", "64g") \
    .config("spark.driver.maxResultSize", "4g") \
    .config("spark.python.worker.memory", "1g") \
    .config("spark.python.worker.reuse", "true") \
    .enableHiveSupport() \
    .getOrCreate()
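
A quick sanity check for the session above; db.table and the pt value are hypothetical placeholders for your own Hive table:
# Hypothetical table and partition; substitute your own.
df = spark.sql("SELECT * FROM db.table WHERE pt = '20240101'")
df.show(5)
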
spark = SparkSession.builder \
    .master("yarn") \
    .appName("version_2") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.kryoserializer.buffer.max", "512m") \
    .config("spark.debug.maxToStringFields", 1024) \
    .config("spark.broadcast.blockSize", "4m") \
    .config("spark.executor.heartbeatInterval", 240) \
    .config("spark.network.timeout", "360s") \
    .config("spark.rpc.message.maxSize", 256) \
    .config("spark.port.maxRetries", 32) \
    .config("spark.rpc.numRetries", 24) \
    .config("spark.rpc.retry.wait", "3s") \
    .config("spark.shuffle.io.numConnectionsPerPeer", 3) \
    .config("spark.shuffle.io.maxRetries", 9) \
    .config("spark.sql.execution.arrow.enabled", "true") \
    .config("spark.scheduler.maxRegisteredResourcesWaitingTime", "120s") \
    .config("spark.driver.cores", 16) \
    .config("spark.driver.memory", "64g") \
    .config("spark.driver.maxResultSize", "4g") \
    .config("spark.driver.memoryOverhead","2G") \
    .config("spark.executor.memory", "4g") \
    .config("spark.executor.memoryOverhead", "2g") \
    .enableHiveSupport() \
    .getOrCreate()
# Python 2 only: reset the default string encoding to UTF-8
# (reload/sys.setdefaultencoding do not exist in Python 3).
reload(sys)
sys.setdefaultencoding('utf-8')
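
version_2 turns on spark.sql.execution.arrow.enabled, which mainly speeds up Spark-to-pandas conversion; a minimal illustrative check:
# toPandas() is the main beneficiary of the Arrow flag above.
pdf = spark.range(1000).toPandas()
print(pdf.shape)  # (1000, 1)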

spark = SparkSession.builder \
    .master("yarn") \
    .appName("version_3") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.kryoserializer.buffer.max", "512m") \
    .config("spark.debug.maxToStringFields", 1024) \
    .config("spark.sql.execution.arrow.enabled", "true") \
    .config("spark.driver.cores", 4) \
    .config("spark.driver.memory", "128g") \
    .config("spark.driver.maxResultSize", "2g") \
    .config("spark.driver.memoryOverhead", "4g") \
    .config("spark.executor.cores", 4) \
    .config("spark.executor.memory", "6g") \
    .config("spark.executor.memoryOverhead", "4g") \
    .config("spark.dynamicAllocation.maxExecutors", 1000) \
    .config("spark.default.parallelism", 1000) \
    .config("spark.cleaner.periodicGC.interval", "10min") \
    .enableHiveSupport() \
    .getOrCreate()
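
With spark.default.parallelism set as above, RDD operations that do not pass an explicit partition count pick it up; a small illustrative check:
print(spark.sparkContext.defaultParallelism)  # 1000 under the config above
rdd = spark.sparkContext.parallelize(range(10))
print(rdd.getNumPartitions())
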
# Common time functions
from __future__ import division, print_function, absolute_import
import sys, time, datetime
from pyspark.sql import SparkSession
import pyspark.sql.functions as fn

# Python 2 only, as above; not needed in Python 3.
reload(sys)
sys.setdefaultencoding('utf-8')

spark = SparkSession.builder \
    .master("yarn") \
    .appName("xxx") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.kryoserializer.buffer.max", "1024m") \
    .config("spark.debug.maxToStringFields", 1024) \
    .config("spark.driver.cores", 4) \
    .config("spark.driver.memory", "16g") \
    .config("spark.driver.maxResultSize", "4g") \
    .config("spark.driver.memoryOverhead", "4g") \
    .config("spark.executor.cores", 4) \
    .config("spark.executor.memory", "16g") \
    .config("spark.executor.memoryOverhead", "4g") \
    .config("spark.dynamicAllocation.maxExecutors", 1000) \
    .config("spark.default.parallelism", 1000) \
    .config("spark.cleaner.periodicGC.interval", "10min") \
    .enableHiveSupport() \
    .getOrCreate()


def getDate(fakeToday='today', moveDays=0, isReturnPt=False):
    """Date moveDays away from fakeToday ('today' or 'YYYY-MM-DD');
    isReturnPt=True returns the YYYYMMDD partition format."""
    fakeToday = datetime.date.today() if fakeToday == 'today' else datetime.datetime.strptime(fakeToday, '%Y-%m-%d')
    the_day = fakeToday + datetime.timedelta(days=moveDays)
    date_format = '%Y%m%d' if isReturnPt else '%Y-%m-%d'
    return the_day.strftime(date_format)
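
For example (illustrative values):
print(getDate(moveDays=-1))                        # yesterday, e.g. '2024-01-04'
print(getDate('2024-01-05', -7, isReturnPt=True))  # '20231229'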


def ptToDate(pt, isDetail=False, isMinTime=True):
    """Convert a YYYYMMDD partition string to 'YYYY-MM-DD'; with isDetail=True,
    append 00:00:00 (isMinTime) or 23:59:59 as the time part."""
    dt = datetime.datetime.strptime(pt, '%Y%m%d').strftime('%Y-%m-%d')
    dt = dt + ' 00:00:00' if isMinTime else dt + ' 23:59:59'
    return dt if isDetail else dt[:10]
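
For example:
print(ptToDate('20240105'))                                  # '2024-01-05'
print(ptToDate('20240105', isDetail=True, isMinTime=False))  # '2024-01-05 23:59:59'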


def toTimestamp(dt_str):
    # Accepts 'YYYYMMDD', 'YYYY-MM-DD' or 'YYYY-MM-DD HH:MM:SS' and returns a
    # 13-digit millisecond timestamp (local time). The parameter must not be
    # named `datetime`, which would shadow the module.
    fmt = '%Y-%m-%d %H:%M:%S'
    if len(dt_str) == 8:
        fmt = '%Y%m%d'
    elif len(dt_str) == 10:
        fmt = '%Y-%m-%d'
    return int(time.mktime(time.strptime(dt_str, fmt)) * 1000)  # len(x)==13
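
For example (the exact value depends on the local timezone):
print(toTimestamp('2024-01-05'))           # e.g. 1704384000000 in UTC+8
print(toTimestamp('2024-01-05 12:30:00'))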


def getLastMonday(fakeToday='today', includeTodayMonday=False, isReturnPt=False):
    # Monday of fakeToday's week; if fakeToday is itself a Monday, step back a
    # full week unless includeTodayMonday=True.
    fakeToday = datetime.date.today() if fakeToday == 'today' else datetime.datetime.strptime(fakeToday, '%Y-%m-%d')
    moveDays = -7 if fakeToday.weekday() == 0 else -fakeToday.weekday()
    moveDays = 0 if includeTodayMonday and moveDays == -7 else moveDays
    the_day = fakeToday + datetime.timedelta(days=moveDays)
    date_format = '%Y%m%d' if isReturnPt else '%Y-%m-%d'
    return the_day.strftime(date_format)
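
For example (2024-01-01 was a Monday):
print(getLastMonday('2024-01-05'))                           # '2024-01-01'
print(getLastMonday('2024-01-01'))                           # '2023-12-25'
print(getLastMonday('2024-01-01', includeTodayMonday=True))  # '2024-01-01'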

def timeStat(func):
    # Decorator that prints the wall-clock minutes a function took.
    import functools

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        t1 = time.time()
        r = func(*args, **kwargs)
        t2 = time.time()
        print('UseTimeMinutes(fn: {}): '.format(func.__name__), round((t2 - t1) / 60, 2))
        return r
    return wrapper
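
Applied to a throwaway function:
@timeStat
def demo_job():
    time.sleep(1)

demo_job()  # prints: UseTimeMinutes(fn: demo_job):  0.02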

import pandas as pd
last_1_date = getDate(moveDays=-1)  # yesterday, 'YYYY-MM-DD'
# The 30 most recent daily partitions (YYYYMMDD), ending at last_1_date.
pt_list = [str(x)[:10].replace('-', '') for x in pd.date_range(end=last_1_date, freq='D', periods=30)]
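
pt_list is ordered oldest-first; for example, with last_1_date = '2024-01-05':
print(pt_list[0], pt_list[-1])  # 20231207 20240105
print(len(pt_list))             # 30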