Reading and Processing CSV Files with PySpark

Reading a CSV file with PySpark

Method 1: SparkSession

# Read the CSV file
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('bianzu').getOrCreate()
raw_data = spark.read.format('com.databricks.spark.csv') \
    .options(header='true', inferSchema='true', encoding='GBK') \
    .load(r'D:\tmp\bianzu\2020-05-13-夜班-新能源-編組統計信息.csv')
raw_data.show(5)
total_number = raw_data.count()  # total number of rows
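Since Spark 2.0 the CSV reader is built in, so the external com.databricks.spark.csv package is no longer needed. A minimal equivalent sketch, reading the same sample file:

# Built-in CSV reader (Spark 2.0+), equivalent to the databricks package above
raw_data = spark.read.csv(
    r'D:\tmp\bianzu\2020-05-13-夜班-新能源-編組統計信息.csv',
    header=True, inferSchema=True, encoding='GBK')
raw_data.show(5)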


# Process the loaded CSV data: count the '異常' (abnormal) rows per status column.
# filter().count() runs on the executors, avoiding collect()ing rows to the driver.
palo_fail_num = raw_data.filter(raw_data['數倉編組關聯面型狀態'] == '異常').count()
palo_success_num = total_number - palo_fail_num
diaodian_fail_num = raw_data.filter(raw_data['掉點數據狀態'] == '異常').count()
diaodian_success_num = total_number - diaodian_fail_num
cp_fail_num = raw_data.filter(raw_data['計算平臺結果狀態'] == '異常').count()
cp_success_num = total_number - cp_fail_num
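Each filter above launches its own Spark job over the data. As a sketch, the three failure counts can also be computed in a single pass with conditional aggregation (column names assumed to match the CSV header):

from pyspark.sql import functions as F

# One aggregation pass that counts every abnormal status at once;
# count() over a when() expression counts only the non-null (matching) rows
fail_counts = raw_data.agg(
    F.count(F.when(F.col('數倉編組關聯面型狀態') == '異常', 1)).alias('palo_fail'),
    F.count(F.when(F.col('掉點數據狀態') == '異常', 1)).alias('diaodian_fail'),
    F.count(F.when(F.col('計算平臺結果狀態') == '異常', 1)).alias('cp_fail')
).first()
# fail_counts['palo_fail'] etc. can then replace the per-filter counts above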


# Convert the decimals to percentage strings
palo_success_rate = palo_success_num / total_number
palo_success_rate = "%.2f%%" % (palo_success_rate * 100)
print(palo_success_rate)
cp_success_rate = (total_number - cp_fail_num + palo_fail_num) / total_number
cp_success_rate = "%.2f%%" % (cp_success_rate * 100)
print(cp_success_rate)
diaodian_success_rate = (total_number - diaodian_fail_num + cp_success_num) / total_number
diaodian_success_rate = "%.2f%%" % (diaodian_success_rate * 100)
print(diaodian_success_rate)
total_number_rate = (total_number - diaodian_fail_num) / total_number
total_number_rate = "%.2f%%" % (total_number_rate * 100)
print(total_number_rate)
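The same two-step decimal-to-percentage conversion repeats for every rate, so a small helper keeps the formatting in one place (the name to_percent is illustrative, not from the original script):

def to_percent(numerator, denominator):
    # Format a ratio as a percentage string with two decimal places
    return "%.2f%%" % (numerator / denominator * 100)

# e.g. palo_success_rate = to_percent(palo_success_num, total_number)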

# Build the summary row for the DataFrame.
# pre_today_str (the grouping date) and pairshift (the shift name) are
# defined elsewhere in the full script.
total_success_rate = []
total_success_rate.append(pre_today_str)
total_success_rate.append(pairshift)
total_success_rate.append(total_number)
total_success_rate.append(total_number_rate)
total_success_rate.append(palo_success_num)
total_success_rate.append(palo_success_rate)
total_success_rate.append(cp_success_num)
total_success_rate.append(cp_success_rate)
total_success_rate.append(diaodian_success_num)
total_success_rate.append(diaodian_success_rate)
print(total_success_rate)
total_success_rate_info = spark.sparkContext.parallelize([total_success_rate])

# Define the schema for the summary DataFrame
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

success_rate_fields = [
    StructField("編組日期", StringType(), True),
    StructField("班次", StringType(), True),
    StructField("編組總數", IntegerType(), True),
    StructField("任務整體成功率", StringType(), True),
    StructField("數倉成功數", IntegerType(), True),
    StructField("數倉成功率", StringType(), True),
    StructField("計算平臺成功數", IntegerType(), True),
    StructField("計算平臺成功率", StringType(), True),
    StructField("掉點計算成功數", IntegerType(), True),
    StructField("掉點計算成功率", StringType(), True)
]
success_rate_schema = StructType(success_rate_fields)
success_rate_info_df = spark.createDataFrame(total_success_rate_info, success_rate_schema)
# Indexing the DataFrame returns a Column object rather than its values,
# so select the column and show it instead
success_rate_info_df.select('數倉成功率').show()
success_rate_info_df.show(10)
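If the one-row summary needs to be persisted, the DataFrame writer can export it back to CSV. A sketch, with an assumed output directory:

# Write the summary out as a single CSV file (the output path is an assumption)
success_rate_info_df.coalesce(1) \
    .write.mode('overwrite') \
    .option('header', 'true') \
    .option('encoding', 'GBK') \
    .csv(r'D:\tmp\bianzu\success_rate_output')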

Method 2: SparkContext

from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext()
sqlContext = SQLContext(sc)
raw_data = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferSchema='true', encoding='GBK').load(r'D:\tmp\bianzu\2020-05-13-夜班-新能源-編組統計信息.csv')
raw_data.show(5)
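Note that SQLContext has been deprecated since Spark 2.0; an existing SparkContext can instead be wrapped in a SparkSession. A sketch of the equivalent setup:

from pyspark import SparkContext
from pyspark.sql import SparkSession

sc = SparkContext()
spark = SparkSession(sc)  # wrap the existing SparkContext in a session
raw_data = spark.read.csv(
    r'D:\tmp\bianzu\2020-05-13-夜班-新能源-編組統計信息.csv',
    header=True, inferSchema=True, encoding='GBK')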