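// Using Spark SQL to handle the data skew caused by GROUP BY: add a random prefix to the key, aggregate, strip the prefix, then aggregate again.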
import java.util.Random
import org.apache.spark.sql.SparkSession
/**
  * Two-stage aggregation in Spark SQL that spreads a skewed `group by` key over
  * several random buckets.
  *
  * The query built in [[main]] works in four steps:
  *  1. prefix every key with a random bucket number (`random_prefix`),
  *  2. sum `id` per prefixed key,
  *  3. strip the prefix again (`remove_random_prefix`),
  *  4. sum the partial results per original key.
  */
object TestUDF {

  /**
    * Registers the two UDFs, builds the staged query against `ggg.test` and prints
    * the final aggregation.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val spark =
      SparkSession.builder()
        .appName("TestUDF")
        .enableHiveSupport()
        .getOrCreate()

    try {
      // NOTE(review): random_prefix takes an Int key but is applied to the `name`
      // column below. If `name` is not numeric, Spark presumably turns it into NULL
      // before the UDF runs -- confirm the column type in ggg.test.
      spark.udf.register("random_prefix", (value: Int, num: Int) => randomPrefixUDF(value, num))
      spark.udf.register("remove_random_prefix", (value: String) => removeRandomPrefixUDF(value))

      // Number of random buckets each key is spread over.
      val saltBuckets = 6

      // Step 1: add the random prefix.
      val saltedSql =
        s"""
           |select
           |  random_prefix(name, $saltBuckets) product,
           |  id
           |from
           |  ggg.test
         """.stripMargin

      // Step 2: partial sum per prefixed key.
      val partialSumSql =
        s"""
           |select
           |  product,
           |  sum(id) click
           |from
           |  ($saltedSql) t1
           |group by
           |  product
         """.stripMargin

      // Step 3: remove the random prefix.
      val unsaltedSql =
        s"""
           |select
           |  remove_random_prefix(product) product,
           |  click
           |from
           |  ($partialSumSql) t2
         """.stripMargin

      // Step 4: final sum per original key.
      val finalSumSql =
        s"""
           |select
           |  product,
           |  sum(click) click
           |from
           |  ($unsaltedSql) t3
           |group by
           |  product
         """.stripMargin

      // Uncomment to inspect the intermediate steps:
      // spark.sql(saltedSql).show()
      // spark.sql(partialSumSql).show()
      // spark.sql(unsaltedSql).show()
      spark.sql(finalSumSql).show()
    } finally {
      // Release the session even when the query fails.
      spark.stop()
    }
  }

  /**
    * Prepends a random bucket number to `value`, e.g. `42` may become `"3_42"`.
    *
    * @param value key to prefix
    * @param num   number of buckets; the prefix is drawn from `[0, num)`
    * @return `"<prefix>_<value>"`
    * @throws IllegalArgumentException if `num` is not positive
    */
  def randomPrefixUDF(value: Int, num: Int): String = {
    // Reuse the per-thread generator instead of allocating a Random for every row.
    val prefix = java.util.concurrent.ThreadLocalRandom.current().nextInt(num)
    s"${prefix}_$value"
  }

  /**
    * Strips the `"<prefix>_"` part added by [[randomPrefixUDF]].
    *
    * Only the text up to and including the FIRST underscore is removed, so keys
    * that themselves contain underscores (or are empty) survive intact. A value
    * without any underscore is returned unchanged.
    *
    * @param value prefixed key, may be `null` (SQL NULL)
    * @return the key without its prefix, or `null` when `value` is `null`
    */
  def removeRandomPrefixUDF(value: String): String =
    // null is how Spark hands SQL NULL to a String UDF; pass it through as NULL.
    if (value == null) null
    else value.substring(value.indexOf('_') + 1) // indexOf == -1 yields substring(0)
}