Common Spark syntax

Driver: the Driver is the process that runs the Spark application's main program, i.e. the Spark code we write, so there is exactly one per application. It is responsible for creating the SparkContext, and the SparkContext in turn creates the RDDs (Spark's basic data structure, an abstract logical concept rather than a physical one).
The Driver's other responsibility is to distribute tasks to the Executors. Task assignment mainly follows data locality: a task is preferably sent to an Executor on the machine that already holds the data it needs.
In short: the application class that news up the SparkContext acts as the Driver, and the Driver's job is to hand tasks to the Executors for computation.

Executor: the compute resource that actually runs tasks in Spark; think of it as distributed CPU. A single machine may host several Executors (a machine's CPU has multiple cores), so a cluster contains many of them. Executors carry out the actual computation of Spark's operators, such as map. A minimal sketch of this split follows.
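The sketch below is self-contained and not part of the project code that follows (the object name, app name, and local[2] master are illustrative only): everything in main() runs on the Driver, which creates the SparkSession/SparkContext, while the function passed to map() is serialized and executed on the Executors; collect() brings the results back to the Driver.

import org.apache.spark.sql.SparkSession

object DriverExecutorSketch {
  def main(args: Array[String]): Unit = {
    // Driver side: the main program creates the SparkSession (and, through it, the SparkContext)
    val spark = SparkSession.builder()
      .appName("driver-executor-sketch")
      .master("local[2]")
      .getOrCreate()

    // the lambda passed to map() is shipped to the Executors and runs there in parallel
    val squared = spark.sparkContext.parallelize(1 to 10).map(x => x * x)

    // collect() pulls the results back to the Driver
    println(squared.collect().mkString(", "))

    spark.stop()
  }
}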

package com.ts.app.task.Ld

import com.ts.config.{MysqlConnect, SparkBuilder}
import com.ts.model.calendar.BusinessCalendarModel
import com.ts.model.location.{RdcSaleQtyModel, SomeMysqlDataModel}
import com.ts.utils.UdfUtil
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DateType, DecimalType, IntegerType, StringType}
import org.apache.spark.sql.{Row, SparkSession}

import scala.collection.mutable.ListBuffer

/**
 * @author: wujiajian
 * @time: 2021/7/5
 * @description: scratch file collecting common Spark SQL / DataFrame / RDD usage examples
 */
object SparkDemo {

  def main(args: Array[String]): Unit = {

    //    val spark = SparkSession.builder()
    //      .appName(this.getClass.getSimpleName)
    //      .master("local[2]")
    //      .config("dfs.client.use.datanode.hostname", "true")
    //      .enableHiveSupport()
    //      .getOrCreate()

    val spark = SparkBuilder.getSparkSession("sf-app-gtm-dpt-sparkHiveTest")
    spark.sql("use loreal_deploy").collect().foreach(println)

    test28(spark)

    spark.stop()
  }

  def test28(spark: SparkSession): Unit = {
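    // Build per-store order-calendar rows, collect the no-ship date ranges with collect_list(struct(...)),
    // derive ship/receive days via project UDFs, then use lead() and row_number() window functions
    // to keep the earliest business day per store/brand.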
    import spark.implicits._
    val df = spark.sparkContext.parallelize(Seq(
      ("loreal", "CN30_A006", "93102", "1002", "31", "2021-10-14", "2021-01-01 00:00:00", null, null, "weekly", "3", "7"),
      ("loreal", "CN30_A006", "99103", "1002", "31", "2021-09-30", "2021-01-09 00:00:00", null, null, "weekly", "3", "10"),
      ("loreal", "CN30_A006", "99103", "1002", "31", "2021-10-08", "2021-01-01 00:00:00", null, null, "weekly", "3", "10"),
      ("loreal", "CN30_A006", "99103", "1002", "31", "2021-10-14", "2021-01-01 00:00:00", null, null, "weekly", "3", "10"),
      ("loreal", "CN30_A006", "99104", "1002", "31", "2021-09-30", "2021-01-09 00:00:00", null, null, "weekly", "3", "10"),
      ("loreal", "CN30_A006", "99104", "1002", "31", "2021-10-08", "2021-01-01 00:00:00", null, null, "weekly", "3", "10"),
      ("loreal", "CN30_A006", "99104", "1002", "31", "2021-10-14", "2021-01-01 00:00:00", null, null, "weekly", "3", "10"),
      ("loreal", "CN30_A006", "99102", "1002", "31", "2021-09-30", "2021-01-09 00:00:00", null, null, "weekly", "3", "10"),
      ("loreal", "CN30_A006", "99102", "1002", "31", "2021-10-08", "2021-01-01 00:00:00", null, null, "weekly", "3", "10"),
      ("loreal", "CN30_A006", "99102", "1002", "31", "2021-10-14", "2021-01-01 00:00:00", null, null, "weekly", "3", "10"),
      ("loreal", "CN30_A006", "2905", "1002", "31", "2021-09-30", "2021-01-09 00:00:00", null, null, "weekly", "1", "3", "5"),
      ("loreal", "CN30_A006", "2905", "1002", "31", "2021-10-08", "2021-01-01 00:00:00", null, null, "weekly", "1", "3", "5"),
      ("loreal", "CN30_A006", "2905", "1002", "31", "2021-10-14", "2021-01-01 00:00:00", null, null, "weekly", "1", "3", "5"),
      ("loreal", "CN30_A006", "2907", "1002", "31", "2021-09-30", "2021-01-09 00:00:00", null, null, "weekly", "1", "3", "5"),
      ("loreal", "CN30_A006", "2907", "1002", "31", "2021-10-08", "2021-01-01 00:00:00", null, null, "weekly", "1", "3", "5"),
      ("loreal", "CN30_A006", "2907", "1002", "31", "2021-10-14", "2021-01-01 00:00:00", null, null, "weekly", "1", "3", "5"),
      ("loreal", "CN30_A006", "99105", "1002", "31", "2021-09-30", "2021-01-09 00:00:00", null, null, "weekly", "3", "10"),
      ("loreal", "CN30_A006", "99105", "1002", "31", "2021-10-08", "2021-01-01 00:00:00", null, null, "weekly", "3", "10")
    )).toDF("company_code", "channel_code", "store_code", "warehouse_code", "brand_code", "business_day_string", "soq_day", "from_date", "end_date", "calendar_type", "calendar_day", "lead_time")

    val orderCalendarDF = df.filter("calendar_day is not null") // if the shipping calendar is abnormal, do not replenish
      .groupBy("company_code", "channel_code", "warehouse_code", "store_code", "brand_code", "soq_day", "business_day_string", "calendar_type", "calendar_day", "lead_time")
      // collect_list + date_format: gather the (from_date, end_date) no-ship ranges per group
      .agg(collect_list(struct(date_format($"from_date", "yyyy-MM-dd"), date_format($"end_date", "yyyy-MM-dd"))).as("no_ship_day_list"))
    orderCalendarDF.show()

    // UdfUtil: project UDFs derive the ship day and the receive day (ship day + lead time)
    val resultDF = orderCalendarDF.withColumn("ship_day", UdfUtil.getShipDateByDateUdf(col("calendar_day"), col("calendar_type"), col("business_day_string"), col("no_ship_day_list")))
      // variant without the no-ship-day list:
      // .withColumn("ship_day", UdfUtil.getShipDateByDateUdf(col("calendar_day"), col("calendar_type"), col("business_day_string")))
      .withColumn("receive_day", UdfUtil.dateAddUdf(col("ship_day").cast(DateType), col("lead_time").cast(IntegerType)))
      // lead(): look ahead to the next receive_day within each store/brand partition
      .withColumn("next_receive_day", lead($"receive_day", 1, "").over(Window.partitionBy("company_code", "channel_code", "store_code", "brand_code").orderBy("receive_day", "business_day_string")))
      // row_number(): keep only the first business day per store/brand
      .withColumn("rank", row_number().over(Window.partitionBy("company_code", "channel_code", "brand_code", "store_code").orderBy("business_day_string")))
      .filter("rank = 1")
    resultDF.show()

  }

  def test27(spark: SparkSession): Unit = {
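    // Coverage-days prep: join inventory with the demand forecast, keep forecasts for the next few days,
    // and aggregate the quantities into an ordered, comma-separated list per key; several alternative ways
    // to build the sorted list (struct + sort_array, a sorting UDF, regexp_replace, a window max) are kept
    // below as commented-out variants.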
    import spark.implicits._

    val itemInventoryDF = spark.sparkContext.parallelize(Seq(
      ("31", "LA1111", 20, "2021-09-24"),
      ("31", "LA1111", 20, "2021-09-25"),
      ("31", "LA1111", 20, "2021-09-28"),
      ("31", "LA1111", 20, "2021-09-21"),
      ("31", "LA1111", 20, "2021-09-23"),
      ("31", "LA1111", 20, "2021-09-30"),
      ("31", "LA1111", 20, "2021-09-27"),
      ("31", "LA1111", 20, "2021-09-26"),
      ("31", "LA1111", 20, "2021-09-22"),
      ("31", "LA1111", 20, "2021-09-29")
    )).toDF("brand_code", "item_code", "inv_qty", "business_day_string")

    val itemDemandFcstDF = spark.sparkContext.parallelize(Seq(
      ("31", "LA1111", 4, "2021-09-24"),
      ("31", "LA1111", 5, "2021-09-25"),
      ("31", "LA1111", 8, "2021-09-28"),
      ("31", "LA1111", 1, "2021-09-21"),
      ("31", "LA1111", 3, "2021-09-23"),
      ("31", "LA1111", 10, "2021-09-30"),
      ("31", "LA1111", 7, "2021-09-27"),
      ("31", "LA1111", 6, "2021-09-26"),
      ("31", "LA1111", 2, "2021-09-22"),
      ("31", "LA1111", 15, "2021-10-05"),
      ("31", "LA1111", 9, "2021-09-29"),
      ("31", "LA1111", 14, "2021-10-04"),
      ("31", "LA1111", 13, "2021-10-03"),
      ("31", "LA1111", 11, "2021-10-01"),
      ("31", "LA1111", 12, "2021-10-02"),
      ("31", "LA1111", 16, "2021-10-06")
    )).toDF("brand_code", "item_code", "fcst_qty", "fcst_day_string")

    // coverage_days at the item level
    val itemDemandCoverageDayDF = itemInventoryDF.join(itemDemandFcstDF, Seq("brand_code", "item_code"), "left")
      .selectExpr("brand_code", "item_code", "ifnull(inv_qty,0) as inv_qty", "fcst_qty", "fcst_day_string", "business_day_string")
      // keep forecast days within [business_day, business_day + 5 days]
      .filter($"fcst_day_string" >= $"business_day_string" and $"fcst_day_string" <= UdfUtil.dateAddUdf($"business_day_string".cast(DateType), lit(5).cast(IntegerType)))
      .selectExpr("brand_code", "item_code", "inv_qty", "fcst_qty", "fcst_day_string", "business_day_string")
      .groupBy("brand_code", "item_code", "inv_qty", "business_day_string")

      // approach 1: combine the columns with struct, sort the structs with sort_array, then extract fcst_qty
      .agg(concat_ws(",", sort_array(collect_list(struct("fcst_day_string", "fcst_qty"))).getField("fcst_qty")))

      // approach 2: sort the (day, qty) pairs inside a UDF
//      .agg(collect_list(struct("fcst_day_string", "fcst_qty")) as "fcst_day_qty")
//      .select(col("brand_code"), col("item_code"), col("inv_qty"), col("business_day_string"), sortUdf(col("fcst_day_qty")).alias("fcst_list"))

      // approach 3: sort "day:qty" strings, then strip the date prefix with regexp_replace
//      .agg(concat_ws(",", sort_array(collect_list(concat_ws(":", col("fcst_day_string"), col("fcst_qty"))))).as("list"))
//      .withColumn("fcst_list", regexp_replace(col("list"), "\\d{4}-\\d{2}-\\d{2}\\:", ""))

      // approach 4: running collect_list over a window, then taking the max; inefficient
//      .withColumn("list", concat_ws(",", collect_list($"fcst_qty") over Window.partitionBy("brand_code", "item_code", "inv_qty", "business_day_string").orderBy("fcst_day_string")))
//      .groupBy("brand_code", "item_code", "inv_qty", "business_day_string")
//      .agg(max($"list"))
      .show()
  }

  import org.apache.spark.sql.functions._
  import org.apache.spark.sql._

  val sortUdf = udf((rows: Seq[Row]) => {
    val tuples = rows.map { case Row(dayStr: String, value: Int) => (dayStr, value) }
    val tuples1 = tuples.sortBy { case (dayStr, value) => dayStr } // sort ascending by the date string
    tuples1.map { case (dayStr, value) => value }
  })

  def test26(): Unit = {
    val list = List((-1, -1), (-1, 2), (1, 2), (-1, -3), (1, -1), (0, 1), (1, 0), (0, 0))
    val list2 = list.filter(x => {
      x._1 <= 0 || x._2 <= 0
    })
    list2.foreach(println)
  }

  def test25(spark: SparkSession): Unit = {
    import spark.implicits._
    val df = spark.sparkContext.parallelize(Seq(
      ("G1", "1221", null),
      ("G3", "2222", null),
      ("G2", "3333", null),
      ("G2", "1111", "第一輪")
    )).toDF("item_code", "store_code", "order_name")
    df.selectExpr("item_code", "order_name", "ifnull(order_name,concat('新店訂單-', store_code))").show()
  }

  def test24(spark: SparkSession): Unit = {
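    // Group rows by item_code on the RDD, then sort each group by several keys with mixed directions
    // using Ordering.Tuple4 (store_class_code1 and store_code are sorted in reverse).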
    import spark.implicits._
    val df = spark.sparkContext.parallelize(Seq(
      (1, "g1", "Boutique", "Y", "1", "0122"),
      (2, "g1", "Department", "N", "2", "0123"),
      (3, "g1", "Boutique", "Y", "3", "0124"),
      (4, "g1", "Department", "N", "4", "0120"),
      (5, "g1", "Boutique", "Y", "3", "0110")
    )).toDF("id", "item_code", "store_class_code2", "store_class_code1", "counter_level_sort", "store_code")

    val allocationItemMap = df.rdd.groupBy(r => {
      r.getAs("item_code").toString
    })

    allocationItemMap.flatMap(lines => {
      var resultList = ListBuffer[Row]()
      lines._2.toList.sortBy(row => (
        row.getAs("store_class_code2").toString, //boutique
        row.getAs("store_class_code1").toString, //is-key
        row.getAs("counter_level_sort").toString, //level
        row.getAs("store_code").toString))(Ordering.Tuple4(Ordering.String, Ordering.String.reverse, Ordering.String, Ordering.String.reverse))
        .foreach(row => {
          println(row.getAs[Int]("id"))
        })
      resultList
    }).foreach(println)
  }

  def test23(spark: SparkSession): Unit = {
    val driver = MysqlConnect.getDbDriver("CN30")
    val batchLogDF = SomeMysqlDataModel.getBatchLog(spark, "20210810131012000").selectExpr("decision_day")
    println(batchLogDF.take(1)(0).getString(0))
  }

  def test22(spark: SparkSession): Unit = {
    val df = BusinessCalendarModel.getLastWeekCalendarDay(spark, "2021-01-01", "channel_code = 'A005'")
    df.show()
  }

  def test21(spark: SparkSession): Unit = {
    import spark.implicits._
    val df = spark.sparkContext.parallelize(Seq(
      ("wh_9901", "G01", "2021-06-22 12:21:21"),
      ("wh_9901", "G01", "2021-06-23 12:21:21"),
      ("wh_9901", "G01", "2021-06-24 12:21:21"),
      ("wh_9901", "G01", "2021-06-26 12:21:21"),
      ("wh_9902", "G01", "2021-06-25 12:21:21"),
      ("wh_9902", "G01", "2021-06-27 12:21:21"),
      ("wh_9903", "G01", "2021-06-29 12:21:21")
    )).toDF("location_code", "item_code", "receive_day")
    df.select(date_format(col("receive_day"), "yyyy-MM-dd")).show()
  }

  def test20(spark: SparkSession): Unit = {
    import spark.implicits._
    val df = spark.sparkContext.parallelize(Seq(
      (20, "1,2,1,2,1,2,2,2,2,2,1,1"),
      (10, "1,1,1,1,1"),
      (12, "1,1,1,1,,1,1,1,1,1")
    )).toDF("inv_all_value", "fcst_list")

    val resultDF = df.withColumn("coverage_days", UdfUtil.coverageDaysUdf(lit(0), col("fcst_list"), col("inv_all_value")))
    resultDF.show()
  }

  def test19(spark: SparkSession): Unit = {
    val businessMonthStringList = BusinessCalendarModel.getBusinessMonthListString(spark, "2022-01-01", "2022-01-01", "channel_code = 'A005'")
    val monthPartitionConditionStr = s"business_month_string in (${businessMonthStringList})"
    println(monthPartitionConditionStr)
  }

  def test18(spark: SparkSession): Unit = {
    val rdc91AvgSaleDF = RdcSaleQtyModel.calculateRdcScopeAvgSale(spark, "2021-08-01", "2021-08-04", 2, "1=1", "channel_code = 'CN10_A005'", "CN10_A005")
      .selectExpr("company_code", "channel_code", "location_code", "item_code", "business_day_string", "total_sale_qty", "avg_sale_qty_his")
      .na.fill("", cols = Array("company_code", "channel_code", "location_code", "item_code", "business_day_string"))
    rdc91AvgSaleDF.show()
  }

  def test17(spark: SparkSession): Unit = {
    import spark.implicits._
    val df = spark.sparkContext.parallelize(Seq(
      ("CN30", "A005"),
      ("CN30", "A007"),
      ("CN10", "A005")
    )).toDF("division_code", "channel_code")
    val result = df.filter("division_code <> 'CN10'").groupBy("division_code")
      .agg(concat_ws(",", collect_set($"channel_code")).as("channel_code_list"))
    println(result.take(1)(0).getString(1))
  }

  def test16(spark: SparkSession): Unit = {
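    // lead() with a default value plus when/otherwise to fall back to the last day of the month,
    // then row_number() to keep only the earliest receive_day per (location, item).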
    import spark.implicits._
    val df = spark.sparkContext.parallelize(Seq(
      ("wh_9901", "G01", "2021-06-22"),
      ("wh_9901", "G01", "2021-06-23"),
      ("wh_9901", "G01", "2021-06-24"),
      ("wh_9901", "G01", "2021-06-26"),
      ("wh_9902", "G01", "2021-06-25"),
      ("wh_9902", "G01", "2021-06-27"),
      ("wh_9903", "G01", "2021-06-29")
    )).toDF("location_code", "item_code", "receive_day")
    df.withColumn("firstDayOfMonth", UdfUtil.getFirstDayStrOfMonthUdf(col("receive_day")))
      .withColumn("lastDayOfMonth", UdfUtil.getLastDayStrOfMonthUdf(col("receive_day")))
      .withColumn("1st_receive", lead(col("receive_day"), 1, "9999-01-01").over(Window.partitionBy("location_code", "item_code").orderBy("receive_day")))
      .withColumn("next_receive_day", when($"1st_receive".equalTo("9999-01-01"), col("lastDayOfMonth")).otherwise(col("1st_receive")))
      .withColumn("rank", row_number().over(Window.partitionBy("location_code", "item_code").orderBy($"receive_day".asc_nulls_last)))
      .filter("rank == 1")
      .show()
  }

  def test15(spark: SparkSession): Unit = {
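    // Aggregate item_qty values into a list and a comma-separated string per (location, item).
    // The DataFrame is sorted by business_day_string first, but note that collect_list order after
    // a groupBy shuffle is not guaranteed.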
    import spark.implicits._
    val df = spark.sparkContext.parallelize(Seq(
      ("wh_9901", "G01", "2021-06-22", "6"),
      ("wh_9901", "G01", "2021-06-23", "4"),
      ("wh_9901", "G02", "2021-06-24", "2"),
      ("wh_9901", "G02", "2021-06-22", "1"),
      ("wh_9901", "G03", "2021-06-24", "2"),
      ("wh_9902", "G01", "2021-06-23", "4"),
      ("wh_9902", "G01", "2021-06-24", "2"),
      ("wh_9902", "G02", "2021-06-24", "3"),
      ("wh_9903", "G02", "2021-06-22", "6"),
      ("wh_9903", "G01", "2021-06-24", "5"),
      ("wh_9903", "G02", "2021-06-22", "3"),
      ("wh_9903", "G03", "2021-06-22", "2")
    )).toDF("location_code", "item_code", "business_day_string", "item_qty")

    df.sort("business_day_string").groupBy("location_code", "item_code")
      .agg(concat_ws(",", collect_list($"item_qty")).as("forecast_qty"),
        collect_list($"item_qty").as("forecast_qty_list"))
      .show()
  }


  def test14(spark: SparkSession): Unit = {
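    // Replace item codes with their latest substitutes via a left join, sum in-transit quantities per
    // (location, item, rdd day), then roll the day:qty pairs up into one detail string with concat_ws.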
    import spark.implicits._
    val df = spark.sparkContext.parallelize(Seq(
      ("wh_9901", "G01", "2021-06-22", "6"),
      ("wh_9901", "G01", "2021-06-23", "4"),
      ("wh_9901", "G02", "2021-06-24", "2"),
      ("wh_9901", "G02", "2021-06-22", "1"),
      ("wh_9901", "G03", "2021-06-24", "2"),
      ("wh_9902", "G01", "2021-06-23", "4"),
      ("wh_9902", "G01", "2021-06-24", "2"),
      ("wh_9902", "G02", "2021-06-24", "3"),
      ("wh_9903", "G02", "2021-06-22", "6"),
      ("wh_9903", "G01", "2021-06-24", "5"),
      ("wh_9903", "G02", "2021-06-22", "3"),
      ("wh_9903", "G03", "2021-06-22", "2")
    )).toDF("location_code", "item_code", "rdd_day_string", "item_qty")

    var itemSubstituteDF = Seq(
      ("G01", "G02", "2021-05-24")
    ).toDF("item_code", "item_code_latest", "business_day_string")

    itemSubstituteDF = itemSubstituteDF.select("item_code", "item_code_latest")
    df.join(itemSubstituteDF, Seq("item_code"), "left")
      .selectExpr("location_code", "rdd_day_string", "if(item_code_latest is null, item_code, item_code_latest) as item_code", "item_qty")
      .groupBy("location_code", "item_code", "rdd_day_string")
      .agg(sum("item_qty").as("intransit_qty"))
      .na.fill(value = 0, cols = Array("intransit_qty"))
      .groupBy("location_code", "item_code")
      //      .agg(concat_ws(";", collect_list(concat($"rdd_day_string", lit(":"), $"intransit_qty"))) as "intransit_detail")
      .agg(concat_ws(";", collect_list(concat_ws(":", $"rdd_day_string", $"intransit_qty"))) as "intransit_detail")
      .show()
  }

  def test13(spark: SparkSession): Unit = {
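    // Group rows by (location, item) on the RDD and append an extra value to each Row with
    // Row.fromSeq(row.toSeq ++ ...).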
    import spark.implicits._
    val df = spark.sparkContext.parallelize(Seq(
      ("G1111", "st_1001", "6"),
      ("G1111", "st_1002", "1"),
      ("G1111", "st_1003", "2"),
      ("G2222", "st_1001", "2"),
      ("G2222", "st_1002", "3"),
      ("G2222", "st_1003", "3"),
      ("G3333", "st_1001", "2"),
      ("G3333", "st_1003", "1")
    )).toDF("location_code", "item_code", "fcst_qty")

    val allocationItemMap = df.rdd.groupBy(row => {
      val location_code: String = row.getAs("location_code").toString
      val item_code: String = row.getAs("item_code").toString
      (location_code, item_code)
    })

    allocationItemMap.flatMap(line => {
      val resultList = ListBuffer[Row]()
      val list = line._2.to[ListBuffer]
      list.foreach(row => {
        // append one extra field to each Row (the StructType returned by schema.add below is not attached to the new Row)
        val allocationInventoryQty: BigDecimal = 23
        row.schema.add("allocationInventoryQty", IntegerType, true)
        val newRow = Row.fromSeq(row.toSeq ++ Array[Any](allocationInventoryQty))
        resultList.append(newRow)
      })
      resultList
    }).foreach(println)

  }

  def test12(spark: SparkSession): Unit = {
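    // Build Map-typed columns from grouped data: one UDF wraps the whole list in a single Map,
    // the other produces a List of single-entry Maps.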
    import spark.implicits._

    val data = spark.sparkContext.parallelize(Seq(
      ("G1111", "st_1001", "12"),
      ("G1111", "st_1002", "1"),
      ("G1111", "st_1003", "2"),
      ("G2222", "st_1001", "2"),
      ("G2222", "st_1002", "3"),
      ("G2222", "st_1003", "3"),
      ("G3333", "st_1001", "2"),
      ("G3333", "st_1003", "1")
    )).toDF("location_code", "item_code", "fcst_qty")

    val toMap = udf((header: String, line: Seq[String]) => {
      Map(header -> line)
    })
    val toMapList = udf((header: String, line: Seq[String]) => {
      line.map(l => Map(header -> l)).toList
    })

    val grouped = data.groupBy("location_code", "item_code").agg(collect_list("fcst_qty").alias("fcst_qty"))

    grouped.withColumn("headerLineMapGroup", toMap($"item_code", $"fcst_qty"))
      .drop("location_code", "item_code", "fcst_qty")
      .show(false)

    grouped.withColumn("headerLineMapGroupList", toMapList($"item_code", $"fcst_qty"))
      .drop("item_code", "fcst_qty")
      .show(false)
  }

  def test11(spark: SparkSession): Unit = {
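    // RDD top-K per group: parse a local score file, group the records by (department, class), and take
    // the top 3 students per subject with sortBy(...)(Ordering.Int.reverse).take(3).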
    // map on a plain Scala List
    println(List(1, 2, 3, 4, 6) map (_ + 1)) // add 1 to every element, producing List(2, 3, 4, 5, 7)

    /** parse the raw data */
    val studentsScore = spark.sparkContext.textFile("file:///D:/data.txt").map(_.split(","))
    val groups = studentsScore.map(scoreInfo => (scoreInfo(1), scoreInfo(2).toInt, scoreInfo(3).toInt, scoreInfo(4).toInt, scoreInfo(5), scoreInfo(6)))

    /** group by (department, class) and take the top K per subject */
    val topK = groups.groupBy(item => (item._6, item._5)).map(subG => {
      val (departmentId, classId) = subG._1
      // top 3 in Chinese
      val languageTopK = subG._2.toList.sortBy(_._2)(Ordering.Int.reverse).take(3).map(item => item._2 + "分:學號" + item._1)
      // top 3 in Math
      val mathTopK = subG._2.toList.sortBy(_._3)(Ordering.Int.reverse).take(3).map(item => item._3 + "分:學號" + item._1)
      // top 3 in English
      val englishTopK = subG._2.toList.sortBy(_._4)(Ordering.Int.reverse).take(3).map(item => item._4 + "分:學號" + item._1)
      (departmentId, classId, Map("語文前3" -> languageTopK, "數學前3" -> mathTopK, "外語前3" -> englishTopK))
    })

    /** print the results */
    topK.foreach(println)
  }

  def test10(spark: SparkSession): Unit = {
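    // Map sold items to their latest substitute codes via a left join, total the sales per
    // (location, item), then derive a daily average by dividing by a hard-coded 91-day window.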
    import org.apache.spark.sql.functions._
    import spark.implicits._

    val rdcScopeAllDaySaleDF = Seq(("wh_9901", "A1", "12", "2021-05-23"),
      ("wh_9901", "A1", "23", "2021-02-21"),
      ("wh_9901", "B1", "1", "2021-05-23"),
      ("wh_9901", "B2", "3", "2021-05-23"),
      ("wh_9901", "A1", "2", "2021-02-20")).toDF("location_code", "item_code", "item_qty", "business_day_string")

    val itemSubstituteDF = Seq(("A1", "B1", "2021-05-23"),
      //      ("B1","B1","2021-05-23"),
      ("A1", "B2", "2021-05-24")
    ).toDF("item_code", "item_code_latest", "business_day_string")

    rdcScopeAllDaySaleDF.where($"business_day_string".between("2021-02-21", "2021-05-23"))
      .join(itemSubstituteDF.where($"business_day_string".equalTo("2021-05-23")), Seq("item_code"), "left")
      .selectExpr("location_code", "if(item_code_latest is null, item_code, item_code_latest) as item_code", "item_qty")
      .groupBy("location_code", "item_code")
      .agg(sum("item_qty").as("total_sale_qty"))
      .withColumn("avg_sale_qty_his", col("total_sale_qty").divide(91))
      .show()
  }

  def test9(spark: SparkSession): Unit = {
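    // Share-of-total per level: window sums partitioned by category / brand / division give each
    // store row its percentage contribution at every level.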
    import spark.implicits._

    val storeQtyDF = Seq(("d1", "b1", "c1", "s1", 100, 1),
      ("d1", "b1", "c2", "s1", 100, 2),
      ("d1", "b2", "c3", "s1", 100, 3),
      ("d1", "b2", "c4", "s1", 40, 10),
      ("d2", "b3", "c5", "s1", 50, 10),
      ("d2", "b3", "c6", "s1", 200, 3),
      ("d1", "b1", "c1", "s2", 100, 7),
      ("d1", "b1", "c2", "s2", 200, 4),
      ("d1", "b2", "c3", "s2", 300, 3),
      ("d1", "b2", "c4", "s2", 500, 2),
      ("d2", "b3", "c5", "s2", 100, 11),
      ("d2", "b3", "c6", "s2", 200, 6)).toDF("division", "brand", "category", "store", "sale_qty", "price")

    import org.apache.spark.sql.expressions._
    import org.apache.spark.sql.functions._
    val storeQtyGroupDF = storeQtyDF
      .withColumn("sale_value", col("sale_qty").multiply(col("price")))
      .groupBy("division", "brand", "category", "store")
      .agg(sum("sale_qty").as("sale_qty"), sum("sale_value").as("sale_value"))
    storeQtyGroupDF.show()

    val d = "2021-05-13"

    storeQtyGroupDF
      .withColumn("category_qty_value", sum("sale_value").over(Window.partitionBy("category")))
      .withColumn("brand_qty_value", sum("sale_value").over(Window.partitionBy("brand")))
      .withColumn("division_qty_value", sum("sale_value").over(Window.partitionBy("division")))
      .withColumn("category_percent", format_number(col("sale_value").divide(col("category_qty_value")).multiply(100), 6))
      .withColumn("brand_percent", format_number(col("sale_value").divide(col("brand_qty_value")).multiply(100), 6))
      .withColumn("division_percent", format_number(col("sale_value").divide(col("division_qty_value")).multiply(100), 6))
      .withColumn("business_day_string", lit(d))
      .show()
  }

  def test8(spark: SparkSession): Unit = {
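    // Percent of the grand total: sum("total_num").over with an empty window spec sums the whole
    // DataFrame, so each row gets its share of the overall total.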
    import org.apache.spark.sql.functions._
    import spark.implicits._

    val orderDF = Seq(
      ("新鄉市", "華北", "上海市", 2),
      ("南昌市", "華北", "天津市", 3),
      ("撫州市", "華北", "溫州市", 36),
      ("婁底市", "華東", "中山市", 1),
      ("廣州市", "華東", "孝感市", 3),
      ("天門市", "華東", "岳陽市", 1),
      ("長春市", "華南", "瀋陽市", 6),
      ("成都市", "東北", "眉山市", 10),
      ("廣州市", "東北", "懷集縣", 2),
      ("北海市", "東北", "廣州市", 8)).toDF("from_city_name", "district", "to_city_name", "total_num")

    orderDF.withColumn("percent", format_number(col("total_num").divide(sum("total_num").over).multiply(100), 5))
      .show()
  }

  def test7(spark: SparkSession): Unit = {
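    // Multi-level aggregation: chained groupBy/count on the DataFrame API, then the same data via
    // Spark SQL with GROUPING SETS and WITH ROLLUP.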
    import spark.implicits._

    val orders = Seq(
      ("深圳", "鑽石會員", "鑽石會員1個月", 25),
      ("深圳", "鑽石會員", "鑽石會員1個月", 25),
      ("深圳", "鑽石會員", "鑽石會員3個月", 70),
      ("深圳", "鑽石會員", "鑽石會員12個月", 300),
      ("深圳", "鉑金會員", "鉑金會員3個月", 60),
      ("深圳", "鉑金會員", "鉑金會員3個月", 60),
      ("深圳", "鉑金會員", "鉑金會員6個月", 120),
      ("深圳", "黃金會員", "黃金會員1個月", 15),
      ("深圳", "黃金會員", "黃金會員1個月", 15),
      ("深圳", "黃金會員", "黃金會員3個月", 45),
      ("深圳", "黃金會員", "黃金會員12個月", 180),
      ("北京", "鑽石會員", "鑽石會員1個月", 25),
      ("北京", "鑽石會員", "鑽石會員2個月", 30),
      ("北京", "鉑金會員", "鉑金會員3個月", 60),
      ("北京", "黃金會員", "黃金會員3個月", 45),
      ("上海", "鑽石會員", "鑽石會員1個月", 25),
      ("上海", "鑽石會員", "鑽石會員1個月", 25),
      ("上海", "鉑金會員", "鉑金會員3個月", 60),
      ("上海", "黃金會員", "黃金會員3個月", 45)
    )
    // convert the Seq into a DataFrame
    val memberDF = orders.toDF("area", "memberType", "product", "price")

    memberDF.groupBy("area", "memberType", "price")
      .count().withColumnRenamed("count", "cnt1")
      .groupBy("area", "memberType")
      .count().withColumnRenamed("count", "cnt2")
      .show()

    memberDF.createTempView("orderTempTable")

    spark.sql("select area,memberType,product,sum(price) as total " +
      "from orderTempTable group by area,memberType,product grouping sets((area,memberType),(memberType,product))").show()

    spark.sql("select area,memberType,product,sum(price) as total " +
      "from orderTempTable group by area,memberType,product with rollup").show()
  }

  def test6(spark: SparkSession) {
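    // Pivot: turn the 項目 column into one column per project, sum 收入 for each, and fill missing cells with 0.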
    import org.apache.spark.sql.functions._

    val df = spark.createDataFrame(Seq(
      ("2018-01", "項目1", 100), ("2018-01", "項目2", 200), ("2018-01", "項目3", 300),
      ("2018-02", "項目1", 1000), ("2018-02", "項目2", 2000), ("2018-03", "項目x", 999)
    )).toDF("年月", "項目", "收入")
    df.groupBy("年月")
      .pivot("項目", Seq("項目1", "項目2", "項目3", "項目x"))
      .agg(sum("收入"))
      .na.fill(0).show()
  }

  def test5(spark: SparkSession) {
    import spark.implicits._
    val source = Seq(
      ("G1", "Off POG", 1),
      ("G2", "Off POG", 2),
      ("G3", "normal", 3),
      ("G4", null, 3)
    ).toDF("item_code", "item_status", "sale_value")
    source.createTempView("kpi_view")

    spark.sql(s"select item_code,item_status,IF(isnull(item_status) or item_status = 'Off POG',0, sale_value/100) as weigth " +
      s"from kpi_view").show()
    println(UdfUtil.getDataDelayDay("CN10"))
  }

  def test4(spark: SparkSession): Unit = {
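    // Null semantics in filters: notEqual("A") silently drops null rows; add an explicit isNull
    // check to keep them.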
    val df = spark.createDataFrame(Seq(
      (1, "A"),
      (2, "B"),
      (3, null)
    )).toDF("id", "source")

    import spark.implicits._
    df.show()
    df.filter($"source".notEqual("A")).show()
    df.filter($"source".notEqual("A").or($"source".isNull)).show()
  }

  def test3(spark: SparkSession): Unit = {
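    // Full outer join on the three key columns, fill missing quantities with 0, then add the two
    // sides together via a project UDF (the commented-out line shows the plain column addition).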
    import spark.implicits._
    val left = Seq(("loreal", 1002, "G234238", 2), ("loreal", 1001, "G27453453", 1)).toDF("company_code", "location_code", "item_code", "item_qty")
    val right = Seq(("loreal", 1001, "G27453453", 3), ("loreal", 1003, "Y236423", 5)).toDF("company_code", "location_code", "item_code", "item_qty")
    left.as("a").join(right.as("b"), Seq("company_code", "location_code", "item_code"), "full")
      .selectExpr("company_code", "location_code", "item_code", "a.item_qty as a_qty", "b.item_qty as b_qty")
      .na.fill(value = 0, cols = Array("a_qty", "b_qty"))
      //      .withColumn("total", $"a_qty" + $"b_qty")
      .withColumn("total", UdfUtil.addDoubleUdf($"a_qty", $"b_qty"))
      .show()
  }

  def test2(spark: SparkSession): Unit = {
    val sc = spark.sparkContext
    // build a basic pair RDD of (ID, name), with IDs 1, 2, 3, 4
    val rdd1 = sc.makeRDD(Array(("1", "Spark"), ("2", "Hadoop"), ("3", "Scala"), ("4", "Java")), 2)
    // build a pair RDD of (ID, salary), with IDs 1, 2, 3, 5
    val rdd2 = sc.makeRDD(Array(("1", "30K"), ("2", "15K"), ("3", "25K"), ("5", "10K")), 2)

    println("// join: expecting keys 1, 2, 3")
    rdd1.join(rdd2).collect.foreach(println)

    println("// leftOuterJoin: expecting keys 1, 2, 3, 4")
    rdd1.leftOuterJoin(rdd2).collect.foreach(println)
    println("// rightOuterJoin: expecting keys 1, 2, 3, 5")
    rdd1.rightOuterJoin(rdd2).collect.foreach(println)
  }

  def test1(spark: SparkSession): Unit = {
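    // Classic word count: flatMap to split words, map to (word, 1) pairs, reduceByKey to sum the counts.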
    val textRdd = spark.sparkContext.parallelize(Seq("hadoop spark", "hadoop flume", "spark sqoop"))

    val splitRdd = textRdd.flatMap(_.split(" "))

    val tupleRdd = splitRdd.map((_, 1))

    //    tupleRdd.reduceByKey((curr,agg)=> curr + agg)
    val reduceRdd = tupleRdd.reduceByKey(_ + _)

    reduceRdd.map(item => s"${item._1} ,${item._2}").collect().foreach(println(_))
  }

}
