PySpark DataFrame Operations

Getting a DataFrame from a SQL query

# coding:utf-8
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException

if __name__ == '__main__':
    spark = SparkSession.builder.enableHiveSupport().appName(name="AppName").getOrCreate()

    payDebtWholeDta_df = spark.sql(
        "SELECT xxxx ...")
        
    # Convert to an RDD and return it
    # .rdd.distinct()  (a round-trip sketch appears after this script)
    
    print(payDebtWholeDta_df)
    # DataFrame[omsOrderItemId: string, billType: string, ...]
    
    print(payDebtWholeDta_df.collect())
    # [Row(omsOrderItemId=u'20190122332201', billType=u'1', ...)]
    # payDebtWholeDta_df = spark.createDataFrame(payDebtWholeDta_rRdd)

    # A view created with createOrReplaceTempView lives only as long as its SparkSession
    payDebtWholeDta_df.createOrReplaceTempView("payDebtWholeDataTable")
    # Does not raise an error; the existing temporary view is simply replaced
    payDebtWholeDta_df.createOrReplaceTempView("payDebtWholeDataTable")

    df3 = spark.sql("select * from payDebtWholeDataTable ")
    print(df3)
    # Same contents as payDebtWholeDta_df above
    payDebtWholeDta_df.createGlobalTempView("payDebtWholeDataTable")
    
    # Without catching the exception, the error below is raised, because createGlobalTempView cannot create a view that already exists
    # pyspark.sql.utils.AnalysisException: u"Temporary view 'paydebtwholedatatable' already exists;"
    
    try:
        payDebtWholeDta_df.createGlobalTempView("payDebtWholeDataTable")
    except AnalysisException as e:
        # err is u"Temporary view 'paydebtwholedatatable' already exists;"
        print("err is {}".format(e))

    if len(payDebtWholeDta_df.take(1)) > 0:
        # Print the first row of the DF
        print(payDebtWholeDta_df.take(1))
        
        # Write out as JSON to path "xxxx"
        payDebtWholeDta_df.write.json("xxx")
        payDebtWholeDta_rdd00 = spark.read.json("xxxx")

    else:
        print("無結果")


Other common operations (a short usage sketch follows this list)

DataFrame.crossJoin(other)
DataFrame.describe(*cols)
DataFrame.distinct()
DataFrame.drop(*cols)
DataFrame.toJSON()
DataFrame.toPandas()
DataFrame.union(other)
DataFrame.write          # property, returns a DataFrameWriter
DataFrame.take(num)
DataFrame.sort(*cols, **kwargs)
DataFrame.select(*cols)
DataFrame.rdd            # property, the underlying RDD of Row objects
DataFrame.orderBy(*cols, **kwargs)
DataFrame.limit(num)
DataFrame.groupBy(*cols)
DataFrame.dtypes         # property, list of (column name, type) pairs
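
A hedged usage sketch tying a few of these calls together; the column names id and amount and the sample rows are made-up placeholders, not from the original post.

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("DfOpsDemo").getOrCreate()
# Made-up sample data just to exercise the API
df = spark.createDataFrame([(1, 10.0), (2, 5.0), (2, 5.0)], ["id", "amount"])

print(df.dtypes)            # [('id', 'bigint'), ('amount', 'double')]
df.distinct().groupBy("id").sum("amount").orderBy("id").show()
print(df.select("id").limit(1).toJSON().collect())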
