Spark 改寫信用違約預測

信用違約預測
傳送門：https://www.kesci.com/home/project/5dadfeb675df5c002b20fa45
package LittleTask

/**
 * @CreateUser: eshter
 * @CreateDate: 2019/10/29
 * @UpdateUser:
 */

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions.{col, udf,mean}
import utils.session.IgnoreErrorAndINFO
import org.apache.spark.sql.types.DoubleType

object Credit02 {
      // Logging setup. The direct log4j call is kept only as a commented-out
      // alternative; the project helper below is what actually runs.
      // NOTE(review): presumably ignoreErrorAndInfo() lowers Spark/log4j verbosity —
      // confirm against utils.session.IgnoreErrorAndINFO.
      // Being a statement in the object body, it executes when Credit02 is initialised.
      //Logger.getLogger("org").setLevel(Level.ERROR)
      new IgnoreErrorAndINFO().ignoreErrorAndInfo()
      /**
       * Renames columns positionally and returns only the renamed columns.
       *
       * The i-th entry of `columns` is exposed under the i-th entry of `newCol`; the
       * result holds exactly those columns, in the order given by `newCol`.
       *
       * All aliases are resolved against the input frame in one projection. A chain of
       * `withColumn` calls, by contrast, can overwrite a source column that has not been
       * read yet whenever a new name equals a later old name.
       *
       * @param df      source DataFrame
       * @param columns existing column names to read from
       * @param newCol  replacement names, same length and order as `columns`
       * @return DataFrame containing only the renamed columns
       * @throws IllegalArgumentException if the two arrays differ in length
       */
      def renameColumn(df:DataFrame,columns:Array[String],newCol:Array[String]):DataFrame={
            require(
                  columns.length == newCol.length,
                  s"renameColumn: got ${columns.length} source columns but ${newCol.length} new names"
            )
            val renamed = columns.zip(newCol).map { case (oldName, newName) =>
                  df(oldName).as(newName)
            }
            df.select(renamed: _*)
      }

      /**
       * Prints a two-column table with, for every column of `df`, the number of rows
       * whose value equals the literal string "NA".
       *
       * One Spark count job is executed per column.
       *
       * @param df    DataFrame to inspect
       * @param spark session whose implicits provide `toDF` for the local result
       */
      def missingValueDistribute(df:DataFrame,spark:SparkSession):Unit={
            import spark.implicits._
            val naCounts = df.columns.map { name =>
                  name -> df.select(name).filter(df(name).equalTo("NA")).count().toString
            }
            // show() prints the table itself and returns Unit, so it must not be wrapped
            // in println: doing so appends a stray "()" line after the table.
            naCounts.toSeq.toDF("列名","缺失值個數").show()
      }

      /**
       * Reports box-plot (IQR) outliers of a numeric column and optionally removes them.
       *
       * The bounds are `Q1 - whis * IQR` and `Q3 + whis * IQR`, with the quartiles taken
       * from `approxQuantile` at relative error 0. The number of rows strictly above the
       * upper bound and strictly below the lower bound is printed.
       *
       * Comparisons use the Column API. A SQL string such as `'name' > 1.0` treats the
       * quoted name as a string literal, never matches a row, and reports 0 outliers.
       *
       * @param df      source DataFrame
       * @param colName numeric column to inspect
       * @param whis    whisker length as a multiple of the inter-quartile range
       * @param show    when true, and outliers exist on a side, the result is reduced to
       *                the single column `colName` and to rows inside that bound
       *                (bound values themselves are kept); when false `df` is returned
       *                unchanged
       * @return `df`, or the filtered single-column frame described under `show`
       */
      def showDropError(df:DataFrame,colName:String,whis:Double=1.5, show: Boolean=false):DataFrame={
            val quartiles = df.stat.approxQuantile(colName, Array(0.25, 0.5, 0.75), 0)
            if (quartiles.isEmpty) {
                  // approxQuantile returns an empty array when the column has no
                  // non-null, non-NaN values; there is nothing to bound in that case.
                  println(colName+"->no quantiles available, outlier check skipped")
                  df
            } else {
                  val lowerQ = quartiles.head
                  val upperQ = quartiles.last
                  val iqr = upperQ - lowerQ
                  val upbound = upperQ + whis * iqr
                  val downbound = lowerQ - whis * iqr
                  val target = col(colName)

                  val upper_count = df.select(colName).filter(target > upbound).count()
                  println(colName+"->upper_count="+upper_count)
                  // Inclusive comparison: a value equal to the bound is not an outlier.
                  // This matters when IQR is 0 and both bounds collapse onto the median.
                  val withoutUpper =
                        if (show && upper_count > 0) df.select(colName).filter(target <= upbound)
                        else df

                  val down_count = df.select(colName).filter(target < downbound).count()
                  println(colName+"->down_count"+down_count)
                  if (show && down_count > 0) withoutUpper.select(colName).filter(target >= downbound)
                  else withoutUpper
            }
      }

      /**
       * Cleans the training frame: turns "NA" markers into nulls, casts every column
       * to double, then applies a fixed sequence of range filters.
       *
       * For each filtered column `showDropError` is called first (only for the report
       * it prints; its return value is ignored), the filter is applied, and the
       * cumulative number of rows removed since the start is printed.
       *
       * @param df DataFrame using the renamed (Chinese) column names
       * @return the filtered DataFrame with all columns cast to DoubleType
       */
      def filterData(df:DataFrame)={
            val all_n=df.count()

            // Replace the textual "NA" placeholder with a real null in the two columns
            // that are expected to contain it.
            val naToNull=udf((raw: String) => if (raw == "NA") null else raw)
            val withNulls=Array("月收入","家屬數量").foldLeft(df) { (acc, name) =>
                  acc.withColumn(name, naToNull(df(name)))
            }

            // Change the data type: every column becomes a double.
            val asDouble=withNulls.select(withNulls.columns.map(name => col(name).cast(DoubleType)):_*)

            // (column, SQL condition for the rows to keep), applied in this order.
            // Thresholds were chosen from box plots; for the income and dependents
            // columns null rows are deliberately kept for later imputation.
            // Note: the dependents filter keeps values below 12.
            val steps=Seq(
                  "年齡" -> "`年齡`>18 and `年齡`<100",
                  "逾期30-59天的筆數" -> "`逾期30-59天的筆數`<80",
                  "逾期90天+的筆數" -> "`逾期90天+的筆數`<80 ",
                  "逾期60-89天的筆數" -> "`逾期60-89天的筆數`<80",
                  "負債率" -> "`負債率`<100000",
                  "月收入" -> "`月收入`<500000 or `月收入` is null",
                  "固定資產貸款數" -> "`固定資產貸款數`<20",
                  "家屬數量" -> "`家屬數量`<12 or `家屬數量`is null"
            )

            steps.foldLeft(asDouble) { case (current, (name, condition)) =>
                  showDropError(current, name)
                  val kept=current.filter(condition)
                  println("共刪除數據"+(all_n-kept.count())+"條")
                  kept
            }
      }
      /**
       * Appends a ratio column `name` = `col1 / col2`, using 0 where `col2` is 0.
       *
       * @param df   source DataFrame
       * @param col  column passed to the UDF as first argument; its value is not used
       *             in the computation
       * @param col1 numerator column
       * @param col2 denominator column
       * @param name name of the column to add
       * @return `df` with the extra ratio column
       */
      def collineationProcessing(df:DataFrame,col:String,col1:String,col2:String,name:String)={
            // The first argument is accepted but ignored, mirroring the three-column call below.
            val ratioOf=udf((ignored: Double, numerator: Double, denominator: Double) =>
                  if (denominator == 0) 0.0 else numerator / denominator
            )
            df.withColumn(name, ratioOf(df(col), df(col1), df(col2)))
      }

      /**
       * Handles missing values in the dependents ("家屬數量") and income ("月收入") columns.
       *
       *  - `func1 == 1` (default): nulls in the dependents column are filled with its
       *    mode (most frequent non-null value; ties resolved towards the smaller value).
       *    The income column is left untouched.
       *  - `func1 == 0`: rows with a null dependents value are dropped, then nulls in
       *    the income column are filled with the mean income of the remaining rows.
       *  - any other value: `df` is returned unchanged.
       *
       * There is no separate `func2` switch for the income strategy; only the behaviour
       * above is implemented.
       *
       * @param df    source DataFrame
       * @param func1 strategy selector, see above
       * @return DataFrame with the selected imputation applied
       */
      def  missingValuesProcessing(df:DataFrame,func1:Int=1):DataFrame={
            val dependents = "家屬數量"
            val income = "月收入"
            func1 match {
                  case 1 =>
                        // Sort by frequency DESCENDING so the first row is the mode; an
                        // ascending sort would pick the rarest value instead.
                        val modeRow = df.filter(col(dependents).isNotNull)
                              .groupBy(dependents)
                              .count()
                              .orderBy(col("count").desc, col(dependents))
                              .head(1)
                              .headOption
                        // No non-null value at all: nothing to impute with.
                        modeRow.fold(df) { row =>
                              df.na.fill(Map(dependents -> row.get(0).toString.toDouble))
                        }
                  case 0 =>
                        // drop(Seq(col)) restricts the null check to that column;
                        // drop(String) would interpret the argument as "any"/"all".
                        val complete = df.na.drop(Seq(dependents))
                        val meanRow = complete.select(mean(col(income))).first()
                        if (meanRow.isNullAt(0)) complete
                        else complete.na.fill(Map(income -> meanRow.get(0).toString.toDouble))
                  case _ =>
                        df
            }
      }

      /**
       * Down-samples the label-0 rows so that both values of "未來兩年可能違約" are
       * roughly equally represented, which is intended to help the downstream model.
       *
       * All label-1 rows are kept; label-0 rows are sampled without replacement with
       * fraction `count(label1) / count(label0)`, capped at 1.0. Sampling is random, so
       * the resulting size is approximate and differs between runs.
       *
       * @param df labelled DataFrame
       * @return union of all label-1 rows and the sampled label-0 rows
       */
      def resampleDataFrame(df:DataFrame):DataFrame={
            val label1=df.filter("`未來兩年可能違約`=1")
            val label0=df.filter("`未來兩年可能違約`=0")
            val positives=label1.count()
            val negatives=label0.count()
            if (negatives == 0L) {
                  // Nothing to sample from; also avoids a division by zero.
                  label1
            } else {
                  // sample() rejects fractions above 1 when sampling without replacement,
                  // which would happen whenever label 1 is the majority class.
                  val fraction=math.min(1.0, positives.toDouble/negatives)
                  label1.union(label0.sample(fraction))
            }
      }


      /**
       * Entry point: loads the training and test CSV files, renames the columns,
       * prints basic statistics and runs the cleaning steps on the training set.
       *
       * @param args optional: `args(0)` training CSV path, `args(1)` test CSV path;
       *             the original hard-coded paths are used when they are absent
       */
      def main(args: Array[String]): Unit = {
            val trainPath = args.lift(0).getOrElse("/Users/eshter/Desktop/give_credit5464/cs-training.csv")
            val testPath = args.lift(1).getOrElse("/Users/eshter/Desktop/give_credit5464/cs-test.csv")

            val spark = SparkSession.builder()
                .master("local[2]")
                .appName("ty")
                .getOrCreate()
            try {
                  // Shared reader: header row, inferred column types, comma separated.
                  // The unnamed leading index column ("_c0") carries no information.
                  def readCsv(path: String): DataFrame =
                        spark.read
                            .format("csv")
                            .option("header", true)
                            .option("inferSchema", true)
                            .option("delimiter", ",")
                            .load(path)
                            .drop("_c0")

                  val rawTrain = readCsv(trainPath)
                  val rawTest = readCsv(testPath)

                  // Both files are renamed using the training set's header names.
                  val columns = rawTrain.columns
                  val newCol = Array(
                        "未來兩年可能違約"
                        ,"可用信貸額度比例"
                        ,"年齡"
                        ,"逾期30-59天的筆數"
                        ,"負債率"
                        ,"月收入"
                        ,"信貸數量"
                        ,"逾期90天+的筆數"
                        ,"固定資產貸款數"
                        ,"逾期60-89天的筆數"
                        ,"家屬數量"
                  )
                  val train = renameColumn(rawTrain, columns, newCol)
                  val test = renameColumn(rawTest, columns, newCol)

                  println(train.count())
                  println(test.count())

                  // show() prints by itself and returns Unit, so the heading is printed
                  // first and show() is called as a statement; concatenating it into
                  // the message would print the table first and then "()".
                  println("訓練集的分佈查看：")
                  train.summary().show()
                  println("測試集的分佈查看：")
                  test.summary().show()

                  // Distribution of missing ("NA") values per column.
                  missingValueDistribute(train, spark)
                  train.show(4)

                  // Outlier handling, derived ratio feature, missing-value imputation.
                  val cleaned = filterData(train)
                  val withRatio = collineationProcessing(
                        cleaned, "逾期90天+的筆數", "逾期60-89天的筆數", "逾期30-59天的筆數", "逾期60-89天/30-59天")
                  val prepared = missingValuesProcessing(withRatio, func1=1)
            } finally {
                  // Release the local Spark context even when a step above fails.
                  spark.stop()
            }
      }

}