PySpark 讀取 CSV 文件的兩種方式
方式一:SparkSession
# Approach 1: read the grouping-statistics CSV through a SparkSession and
# compute per-stage success counts/rates, then assemble them into a one-row
# summary DataFrame.
spark = SparkSession.builder.appName('bianzu').getOrCreate()
# NOTE(review): SQLContext is deprecated in Spark 2.x+; kept here because the
# original code used it, but spark.read would be the modern entry point.
sqlContext = SQLContext(spark)
raw_data = sqlContext.read.format('com.databricks.spark.csv').options(
    header='true', inferSchema='true', encoding='GBK'  # fixed option casing: inferSchema
).load(r'D:\tmp\bianzu\2020-05-13-夜班-新能源-編組統計信息.csv')
raw_data.show(5)

# count() runs on the cluster; len(collect()) would pull every row to the driver.
total_number = raw_data.count()

# Rows whose data-warehouse (Palo) grouping-association status is abnormal.
palo_fail_num = raw_data[raw_data['數倉編組關聯面型狀態'] == '異常'].count()
palo_success_num = total_number - palo_fail_num

# Rows whose drop-point data status is abnormal.
diaodian_fail_num = raw_data[raw_data['掉點數據狀態'] == '異常'].count()
diaodian_success_num = total_number - diaodian_fail_num

# Rows whose compute-platform result status is abnormal.
cp_fail_num = raw_data[raw_data['計算平臺結果狀態'] == '異常'].count()
cp_success_num = total_number - cp_fail_num

# Each rate is rendered as a percentage string like "97.50%".
palo_success_rate = "%.2f%%" % (palo_success_num / total_number * 100)
print(palo_success_rate)

# NOTE(review): the previous stage's failures (palo_fail_num) are added back,
# presumably so the compute platform is not blamed for upstream failures —
# confirm this definition with the metric owner.
cp_success_rate = "%.2f%%" % ((total_number - cp_fail_num + palo_fail_num) / total_number * 100)
print(cp_success_rate)

# BUGFIX: the original added cp_success_num here, which makes the rate exceed
# 100%. Following the pattern above, the previous stage's FAIL count is what
# should be forgiven.
diaodian_success_rate = "%.2f%%" % ((total_number - diaodian_fail_num + cp_fail_num) / total_number * 100)

# Overall task success rate: rows that survived the final (drop-point) stage.
total_number_rate = "%.2f%%" % ((total_number - diaodian_fail_num) / total_number * 100)
print(total_number_rate)

# One summary row; pre_today_str (date) and pairshift (shift) are defined
# earlier in the file. Order must match success_rate_fields below.
total_success_rate = [
    pre_today_str,          # 編組日期
    pairshift,              # 班次
    total_number,           # 編組總數
    total_number_rate,      # 任務整體成功率
    palo_success_num,       # 數倉成功數
    palo_success_rate,      # 數倉成功率
    cp_success_num,         # 計算平臺成功數
    cp_success_rate,        # 計算平臺成功率
    diaodian_success_num,   # 掉點計算成功數
    diaodian_success_rate,  # 掉點計算成功率 (now the formatted string, fixing the
                            # original _rate_rate typo that left the raw float here)
]
print("9888888888888888888888888888888888888888")  # debug marker kept from original
print(total_success_rate)

# Build a single-row DataFrame with an explicit schema so column names/types
# are stable regardless of the Python values above.
total_success_rate_info = spark.sparkContext.parallelize([total_success_rate])
success_rate_fields = [
    StructField("編組日期", StringType(), True),
    StructField("班次", StringType(), True),
    StructField("編組總數", IntegerType(), True),
    StructField("任務整體成功率", StringType(), True),
    StructField("數倉成功數", IntegerType(), True),
    StructField("數倉成功率", StringType(), True),
    StructField("計算平臺成功數", IntegerType(), True),
    StructField("計算平臺成功率", StringType(), True),
    StructField("掉點計算成功數", IntegerType(), True),
    StructField("掉點計算成功率", StringType(), True)
]
success_rate_schema = StructType(success_rate_fields)
success_rate_info_df = spark.createDataFrame(total_success_rate_info, success_rate_schema)
# This prints a Column object (e.g. Column<'數倉成功率'>), not the cell value.
print(success_rate_info_df['數倉成功率'])
success_rate_info_df.show(10)
方式二:SparkContext
# Approach 2: build a SQLContext from a bare SparkContext, then load the same
# GBK-encoded CSV with the databricks csv data source (header row used as
# column names, schema inferred).
sc = SparkContext()
sqlContext = SQLContext(sc)
csv_reader = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferschema='true', encoding='GBK')
)
raw_date = csv_reader.load(r'D:\tmp\bianzu\2020-05-13-夜班-新能源-編組統計信息.csv')
# Preview the first few rows to sanity-check the parse.
raw_date.show(5)