Spark SQL 查詢
SQL語法查詢
單行查詢
// Single-row query: build a one-row DataFrame, register it as a temp view, select by exact name.
val userDF = List((1, "張三", true, 18, 15000, 1))
.toDF("id", "name", "sex", "age", "salary", "dept")
// createOrReplaceTempView: plain createTempView throws if "t_employee" already exists
// (later snippets re-register the same view name)
userDF.createOrReplaceTempView("t_employee")
val sql = "select * from t_employee where name = '張三'"
spark.sql(sql)
.show()
+---+----+----+---+------+----+
| id|name| sex|age|salary|dept|
+---+----+----+---+------+----+
| 1|張三|true| 18| 15000| 1|
+---+----+----+---+------+----+
模糊查詢
// Fuzzy (LIKE) query against the employee temp view.
val userDF = List((1, "張三", true, 18, 15000, 1))
.toDF("id", "name", "sex", "age", "salary", "dept")
// createOrReplace so re-running the document does not throw on a duplicate view name
userDF.createOrReplaceTempView("t_employee")
val sql = "select * from t_employee where name like '%三%'"
spark.sql(sql)
.show()
+---+----+----+---+------+----+
| id|name| sex|age|salary|dept|
+---+----+----+---+------+----+
| 1|張三|true| 18| 15000| 1|
+---+----+----+---+------+----+
排序查詢
// Sort query: salaries above 10000, descending.
// FIX: second row used name "ls" while the sample output shows "李四" — aligned with the output.
val userDF = List((1, "張三", true, 18, 15000, 1), (2, "李四", false, 18, 12000, 1))
.toDF("id", "name", "sex", "age", "salary", "dept")
// register the temp view (createOrReplace so duplicate registration does not throw)
userDF.createOrReplaceTempView("t_employee")
val sql =
"""
|select * from t_employee where salary > 10000 order by salary desc
""".stripMargin
spark.sql(sql)
.show()
+---+----+-----+---+------+----+
| id|name| sex|age|salary|dept|
+---+----+-----+---+------+----+
| 1|張三| true| 18| 15000| 1|
| 2|李四|false| 18| 12000| 1|
+---+----+-----+---+------+----+
limit查詢
// Limit query: top 2 salaries above 10000.
val userDF = List(
(1, "張三", true, 18, 15000, 1),
(2, "李四", false, 18, 12000, 1),
(3, "王五", false, 18, 16000, 2)
).toDF("id", "name", "sex", "age", "salary", "dept")
// register the temp view (createOrReplace so duplicate registration does not throw)
userDF.createOrReplaceTempView("t_employee")
val sql =
"""
|select * from t_employee where salary > 10000 order by salary desc limit 2
""".stripMargin
spark.sql(sql)
.show()
+---+----+-----+---+------+----+
| id|name| sex|age|salary|dept|
+---+----+-----+---+------+----+
| 3|王五|false| 18| 16000| 2|
| 1|張三| true| 18| 15000| 1|
+---+----+-----+---+------+----+
分組查詢
// Group-by query: average salary per department, highest first.
val userDF = List(
(1, "張三", true, 18, 15000, 1),
(2, "李四", false, 18, 12000, 1),
(3, "王五", false, 18, 16000, 2)
).toDF("id", "name", "sex", "age", "salary", "dept")
// register the temp view (createOrReplace so duplicate registration does not throw)
userDF.createOrReplaceTempView("t_employee")
// NOTE(review): alias "avg_slalary" is a typo for "avg_salary"; kept unchanged so the SQL
// matches the sample output table shown below.
val sql =
"""
|select dept ,avg(salary) as avg_slalary from t_employee
|group by dept order by avg_slalary desc
""".stripMargin
spark.sql(sql)
.show()
+----+-----------+
|dept|avg_slalary|
+----+-----------+
| 2| 16000.0|
| 1| 13500.0|
+----+-----------+
having過濾
// HAVING filter: keep only departments whose average salary exceeds 13500.
val userDF = List(
(1, "張三", true, 18, 15000, 1),
(2, "李四", false, 18, 12000, 1),
(3, "王五", false, 18, 16000, 2)
).toDF("id", "name", "sex", "age", "salary", "dept")
// register the temp view (createOrReplace so duplicate registration does not throw)
userDF.createOrReplaceTempView("t_employee")
// NOTE(review): "avg_slalary" typo kept to match the sample output below.
val sql =
"""
| select dept ,avg(salary) as avg_slalary
| from t_employee group by dept
| having avg_slalary > 13500
| order by avg_slalary desc
""".stripMargin
spark.sql(sql)
.show()
+----+-----------+
|dept|avg_slalary|
+----+-----------+
| 2| 16000.0|
+----+-----------+
case-when
// CASE WHEN: map the boolean sex column to a display string.
val userDF = List(
(1, "張三", true, 18, 15000, 1),
(2, "李四", false, 18, 12000, 1),
(3, "王五", false, 18, 16000, 2)
).toDF("id", "name", "sex", "age", "salary", "dept")
// register the temp view (createOrReplace so duplicate registration does not throw)
userDF.createOrReplaceTempView("t_employee")
val sql =
"""
|select id,name,case sex when true then '男' else '女' end as sex_alias
|from t_employee
""".stripMargin
spark.sql(sql)
.show()
+---+----+---------+
| id|name|sex_alias|
+---+----+---------+
| 1|張三| 男|
| 2|李四| 女|
| 3|王五| 女|
+---+----+---------+
行轉列
// Rows-to-columns: pivot course rows into one column per course via MAX(CASE ...).
val scoreDF = List(
(1, "語文", 100),
(1, "數學", 100),
(1, "英語", 100),
(2, "數學", 79),
(2, "語文", 80),
(2, "英語", 100)
).toDF("id", "course", "score")
scoreDF.createOrReplaceTempView("t_course")
val sql =
"""
| select id,
| max(case course when '數學' then score else 0 end) as math,
| max(case course when '英語' then score else 0 end) as english,
| max(case course when '語文' then score else 0 end) as chinese
| from t_course group by id
""".stripMargin
spark.sql(sql)
.show()
+---+----+-------+-------+
| id|math|english|chinese|
+---+----+-------+-------+
| 1| 100| 100| 100|
| 2| 79| 100| 80|
+---+----+-------+-------+
pivot
// Same rows-to-columns transformation using the SQL PIVOT clause.
val scoreDF = List(
(1, "語文", 100),
(1, "數學", 100),
(1, "英語", 100),
(2, "數學", 79),
(2, "語文", 80),
(2, "英語", 100)
).toDF("id", "course", "score")
scoreDF.createOrReplaceTempView("t_course")
val sql =
"""
|select *
|from t_course
|pivot(max(score) for course in ('數學' ,'語文','英語'))
|
""".stripMargin
spark.sql(sql)
.show()
+---+----+----+----+
| id|數學|語文|英語|
+---+----+----+----+
| 1| 100| 100| 100|
| 2| 79| 80| 100|
+---+----+----+----+
在書寫SQL的時候,除去聚合字段和輸出列名字段,其他字段作爲group by後的隱藏分組字段。
Cube計算
// CUBE: aggregate over every combination of (height, weight), including subtotals (null rows).
val frame = List(
(110, 50, 80, 80),
(120, 60, 95, 75),
(120, 50, 96, 70)
).toDF("height", "weight", "uiq", "ueq")
// createOrReplace: "t_user" is registered again by the join snippet further down
frame.createOrReplaceTempView("t_user")
val sql =
"""
|select height,weight,avg(uiq),avg(ueq)
|from t_user
|group by cube(height,weight)
""".stripMargin
spark.sql(sql)
.show()
+------+------+-----------------+--------+
|height|weight| avg(uiq)|avg(ueq)|
+------+------+-----------------+--------+
| 110| 50| 80.0| 80.0|
| 120| null| 95.5| 72.5|
| 120| 60| 95.0| 75.0|
| null| 60| 95.0| 75.0| // weight 是60的所有數據 的uiq、ueq平均值
| null| null|90.33333333333333| 75.0| // 所有數據的uiq、ueq平均值
| 120| 50| 96.0| 70.0|
| 110| null| 80.0| 80.0|
| null| 50| 88.0| 75.0|
+------+------+-----------------+--------+
Join表連接
// LEFT JOIN users to their purchase records; the "uid is not null" filter drops
// users without purchases, making this behave like an inner join.
val userCatagoryCostDF = List(
(1, "電腦配件", 100),
(1, "母嬰用品", 100),
(1, "生活用品", 100),
(2, "居家美食", 79),
(2, "消費電子", 80),
(2, "生活用品", 100)
).toDF("uid", "category", "cost")
val usersDF = List(
(1, "張曉三", true, 18, 15000),
(2, "李曉四", true, 18, 18000),
(3, "王曉五", false, 18, 10000)
).toDF("id", "name", "sex", "age", "salary")
// createOrReplace: "t_user" is also registered by the cube snippet above
usersDF.createOrReplaceTempView("t_user")
userCatagoryCostDF.createOrReplaceTempView("t_user_cost")
val sql =
"""
|select u.*,o.*
|from t_user u
|left join t_user_cost o
|on u.id=o.uid
|where uid is not null
""".stripMargin
spark.sql(sql)
.show()
+---+------+----+---+------+---+--------+----+
| id| name| sex|age|salary|uid|category|cost|
+---+------+----+---+------+---+--------+----+
| 1|張曉三|true| 18| 15000| 1|電腦配件| 100|
| 1|張曉三|true| 18| 15000| 1|母嬰用品| 100|
| 1|張曉三|true| 18| 15000| 1|生活用品| 100|
| 2|李曉四|true| 18| 18000| 2|居家美食| 79|
| 2|李曉四|true| 18| 18000| 2|消費電子| 80|
| 2|李曉四|true| 18| 18000| 2|生活用品| 100|
+---+------+----+---+------+---+--------+----+
子查詢
// Correlated subquery: each employee row joined with its department's average salary.
val df = List(
(1, "zs", true, 1, 15000),
(2, "ls", false, 2, 18000),
(3, "ww", false, 2, 14000),
(4, "zl", false, 1, 18000),
(5, "win7", false, 1, 16000)
).toDF("id", "name", "sex", "dept", "salary")
// createOrReplace so duplicate registration of "t_employee" does not throw
df.createOrReplaceTempView("t_employee")
val sql =
"""
|select id,name,salary,dept,
|(select avg(salary) from t_employee t2 where t1.dept=t2.dept) as avg_salary
|from t_employee t1
|order by dept desc
""".stripMargin
spark.sql(sql)
.show()
+---+----+------+----+------------------+
| id|name|salary|dept| avg_salary|
+---+----+------+----+------------------+
| 2| ls| 18000| 2| 16000.0|
| 3| ww| 14000| 2| 16000.0|
| 5|win7| 16000| 1|16333.333333333334|
| 1| zs| 15000| 1|16333.333333333334|
| 4| zl| 18000| 1|16333.333333333334|
+---+----+------+----+------------------+
在spark SQL不允許在子查詢中使用非等值連接。(MySQL|Oracle支持)
開窗函數
在正常的統計分析中,通常使用聚合函數作分析。聚合函數的特點是將n行記錄合併成一行;在數據庫的統計當中還有一種統計稱爲開窗統計。開窗函數可以實現將一行變成多行:可以將數據庫查詢的每一條記錄比作是一幢高樓的一層,開窗函數就是在每一層開一扇窗,讓每一層能看到整棟樓的全貌或一部分。
// Window functions: per-department running counts and averages alongside each row.
val df = List(
(1, "zs", true, 1, 15000),
(2, "ls", false, 2, 18000),
(3, "ww", false, 2, 14000),
(4, "zl", false, 1, 18000),
(5, "win7", false, 1, 16000)
).toDF("id", "name", "sex", "dept", "salary")
// createOrReplace so duplicate registration of "t_employee" does not throw
df.createOrReplaceTempView("t_employee")
val sql =
"""
|select id,name,salary,dept,
|count(id) over(partition by dept order by salary desc) as rank,
|(count(id) over(partition by dept order by salary desc rows between current row and unbounded following) - 1) as low_than_me,
|avg(salary) over(partition by dept rows between unbounded preceding and unbounded following) as avg_salary,
|avg(salary) over() as all_avg_salary
|from t_employee t1 order by dept desc
""".stripMargin
spark.sql(sql)
.show()
// end of this demo application — stops the SparkSession
spark.stop()
+---+----+------+----+----+-----------+------------------+--------------+
| id|name|salary|dept|rank|low_than_me| avg_salary|all_avg_salary|
+---+----+------+----+----+-----------+------------------+--------------+
| 2| ls| 18000| 2| 1| 1| 16000.0| 16200.0|
| 3| ww| 14000| 2| 2| 0| 16000.0| 16200.0|
| 4| zl| 18000| 1| 1| 2|16333.333333333334| 16200.0|
| 5|win7| 16000| 1| 2| 1|16333.333333333334| 16200.0|
| 1| zs| 15000| 1| 3| 0|16333.333333333334| 16200.0|
+---+----+------+----+----+-----------+------------------+--------------+
開窗函數SQL解讀
select id,name,salary,dept,
# 按部門分組、工資倒敘排序展示 當前部門的id總數
count(id) over(partition by dept order by salary desc) as rank,
# 按部門分組、工資倒敘排序展示當前行至最後一行id總數-1
(count(id) over(partition by dept order by salary desc rows between current row and unbounded following) - 1) as low_than_me,
# 按部門分組展示首行至尾行的平均工資 如:2部門平均工資16000 1部門平均工資16333.333333333334
avg(salary) over(partition by dept rows between unbounded preceding and unbounded following) as avg_salary,
# 展示所有員工的平均工資
avg(salary) over() as all_avg_salary
from t_employee t1 order by dept desc
- 總結
聚合函數(字段) over ([[partition by 字段] order by 字段 asc [rows between 起始行偏移量 and 終止偏移量]] )
- 其中:偏移量的取值
preceding:用於累加前N行(分區之內)。若是從分區第一行開始,則爲unbounded。N爲:相對當前行向前的偏移量(負數)。
following:與preceding相反,累加後N行(分區之內)。若是累加到該分區結束,則爲unbounded。N爲:相對當前行向後的偏移量(正數)。
current row:顧名思義,當前行,偏移量爲0
ROW_NUMBER
統計當前記錄所在的行號
// ROW_NUMBER: sequential row number within each department, ordered by salary desc.
val df = List(
(1, "zs", true, 1, 15000),
(2, "ls", false, 2, 18000),
(3, "ww", false, 2, 14000),
(4, "zl", false, 1, 18000),
(5, "win7", false, 1, 16000)
).toDF("id", "name", "sex", "dept", "salary")
// createOrReplace so duplicate registration of "t_employee" does not throw
df.createOrReplaceTempView("t_employee")
val sql =
"""
|select id,name,salary,dept,
|ROW_NUMBER() over(partition by dept order by salary desc) as rank
|from t_employee t1
|order by dept desc
""".stripMargin
spark.sql(sql)
.show()
+---+----+------+----+----+
| id|name|salary|dept|rank|
+---+----+------+----+----+
| 2| ls| 18000| 2| 1|
| 3| ww| 14000| 2| 2|
| 4| zl| 18000| 1| 1|
| 5|win7| 16000| 1| 2|
| 1| zs| 15000| 1| 3|
+---+----+------+----+----+
如果部門存在相同薪資,此時ROW_NUMBER只能表示當前記錄在窗口中的行號,無法體現並列名次。
RANK()
// RANK: ties share a rank and the following rank is skipped (non-contiguous numbering).
val df = List(
(1, "zs", true, 1, 15000),
(2, "ls", false, 2, 18000),
(3, "ww", false, 2, 14000),
(4, "zl", false, 1, 18000),
(6, "zl1", true, 1, 18000),
(5, "win7", false, 1, 16000)
).toDF("id", "name", "sex", "dept", "salary")
// createOrReplace so duplicate registration of "t_employee" does not throw
df.createOrReplaceTempView("t_employee")
val sql =
"""
|select id,name,salary,dept,
|RANK() over(partition by dept order by salary desc) as rank
|from t_employee t1
|order by dept desc
""".stripMargin
spark.sql(sql)
.show()
+---+----+------+----+----+
| id|name|salary|dept|rank|
+---+----+------+----+----+
| 2| ls| 18000| 2| 1|
| 3| ww| 14000| 2| 2|
| 4| zl| 18000| 1| 1|
| 6| zl1| 18000| 1| 1|
| 5|win7| 16000| 1| 3| //因爲出現兩個排名爲1的,所以這裏是3,故而排名序號不連續
| 1| zs| 15000| 1| 4|
+---+----+------+----+----+
與ROW_NUMBER相比,RANK的排名特點是不連續。
DENSE_RANK() /密集排名
// DENSE_RANK: ties share a rank and numbering stays contiguous (no gaps).
val df = List(
(1, "zs", true, 1, 15000),
(2, "ls", false, 2, 18000),
(3, "ww", false, 2, 14000),
(4, "zl", false, 1, 18000),
(6, "zl1", true, 1, 18000),
(5, "win7", false, 1, 16000)
).toDF("id", "name", "sex", "dept", "salary")
// createOrReplace so duplicate registration of "t_employee" does not throw
df.createOrReplaceTempView("t_employee")
val sql =
"""
|select id,name,salary,dept,
|DENSE_RANK() over(partition by dept order by salary desc) as rank
|from t_employee t1
|order by dept desc
""".stripMargin
spark.sql(sql)
.show()
+---+----+------+----+----+
| id|name|salary|dept|rank|
+---+----+------+----+----+
| 3| ww| 14000| 2| 2|
| 2| ls| 18000| 2| 1|
| 4| zl| 18000| 1| 1|
| 6| zl1| 18000| 1| 1|
| 1| zs| 15000| 1| 3|
| 5|win7| 16000| 1| 2|
+---+----+------+----+----+
自定義函數
單行函數
// Custom scalar (single-row) UDF: map the boolean sex column to a display string in SQL.
val df = List(
(1, "zs", true, 1, 15000),
(2, "ls", false, 2, 18000),
(3, "ww", false, 2, 14000),
(4, "zl", false, 1, 18000),
(6, "zl1", true, 1, 18000),
(5, "win7", false, 1, 16000)
).toDF("id", "name", "sex", "dept", "salary")
// createOrReplace so duplicate registration of "t_employee" does not throw
df.createOrReplaceTempView("t_employee")
// register the UDF under the name used in the SQL below
spark.udf
.register("convertSex", (sex: Boolean) => {
sex match {
case true => "男"
case false => "女"
}
})
val sql =
"""
|select id,name,convertSex(sex) as usex
|from t_employee
""".stripMargin
spark.sql(sql)
.show()
+---+----+----+
| id|name|usex|
+---+----+----+
| 1| zs| 男|
| 2| ls| 女|
| 3| ww| 女|
| 4| zl| 女|
| 6| zl1| 男|
| 5|win7| 女|
+---+----+----+
聚合函數(untyped)
只需要寫一個類繼承 UserDefinedAggregateFunction 即可。
import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types.{DataType, DoubleType, StructType}
// Untyped user-defined aggregate function: a hand-rolled SUM over Double values.
// NOTE(review): UserDefinedAggregateFunction is deprecated since Spark 3.0 in favour of
// org.apache.spark.sql.expressions.Aggregator — confirm the target Spark version before reuse.
class CustomSum extends UserDefinedAggregateFunction {
//1. Schema of the input argument(s); the field name here is arbitrary.
override def inputSchema: StructType = {
new StructType().add("salary", DoubleType)
}
//2. Schema of the intermediate aggregation buffer.
// NOTE(review): "taotalsalary" looks like a typo for "totalsalary"; it is only an internal
// buffer field name, so it is left unchanged here.
override def bufferSchema: StructType = {
new StructType().add("taotalsalary", DoubleType)
}
//3. Data type of the final result returned by evaluate().
override def deterministic: Boolean = true
//4. Whether the function always returns the same output for the same input.
override def dataType: DataType = DoubleType
//5. Initialize the intermediate buffer.
override def initialize(buffer: MutableAggregationBuffer): Unit = {
// running total in slot 0 starts at 0.0
buffer.update(0, 0.0)
}
//6. Fold one input row into the intermediate buffer.
override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
val history = buffer.getAs[Double](0)
val current = input.getAs[Double](0)
buffer.update(0, history + current)
}
//7. Merge a partial aggregate (buffer2) into buffer1.
override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
val result = buffer1.getAs[Double](0) + buffer2.getAs[Double](0)
buffer1.update(0, result)
}
//8. Produce the final aggregated value.
override def evaluate(buffer: Row): Any = {
buffer.getAs[Double](0)
}
}
- spark 代碼
// Register the custom untyped aggregate function and use it in a GROUP BY query.
val df = List(
(1, "zs", true, 1, 15000),
(2, "ls", false, 2, 18000),
(3, "ww", false, 2, 14000),
(4, "zl", false, 1, 18000),
(6, "zl1", true, 1, 18000),
(5, "win7", false, 1, 16000)
).toDF("id", "name", "sex", "dept", "salary")
// createOrReplace so duplicate registration of "t_employee" does not throw
df.createOrReplaceTempView("t_employee")
spark.udf
.register("customSum", new CustomSum)
val sql =
"""
|select dept,customSum(salary)
|from t_employee
|group by dept
""".stripMargin
spark.sql(sql)
.show()
+----+---------------------------------+
|dept|customsum(CAST(salary AS DOUBLE))|
+----+---------------------------------+
| 1| 67000.0|
| 2| 32000.0|
+----+---------------------------------+
Load/Save
Parquet
- Parquet簡介
Parquet是面向分析型業務的列式存儲格式,由Twitter和Cloudera合作開發,2015年5月從Apache的孵化器裏畢業成爲Apache頂級項目
http://parquet.apache.org/
// Parquet: write the DataFrame to local parquet files, then read them back.
// FIX: the URI used "file:////" (four slashes); normalized to "file:///" as in the other snippets.
val df = List(
(1, "zs", true, 1, 15000),
(2, "ls", false, 2, 18000),
(3, "ww", false, 2, 14000),
(4, "zl", false, 1, 18000),
(6, "zl1", true, 1, 18000),
(5, "win7", false, 1, 16000)
).toDF("id", "name", "sex", "dept", "salary")
df.write
.format("parquet")
.save("file:///Users/mashikang/IdeaProjects/spark_sql/src/main/resources/parquet")
spark.read
.parquet("file:///Users/mashikang/IdeaProjects/spark_sql/src/main/resources/parquet")
.show()
+---+----+-----+----+------+
| id|name| sex|dept|salary|
+---+----+-----+----+------+
| 5|win7|false| 1| 16000|
| 6| zl1| true| 1| 18000|
| 4| zl|false| 1| 18000|
| 3| ww|false| 2| 14000|
| 1| zs| true| 1| 15000|
| 2| ls|false| 2| 18000|
+---+----+-----+----+------+
- 存儲文件樣式
JSON
// JSON: write the DataFrame as JSON lines, then read them back
// (note: JSON read orders columns alphabetically, as the sample output shows).
val df = List(
(1, "zs", true, 1, 15000),
(2, "ls", false, 2, 18000),
(3, "ww", false, 2, 14000),
(4, "zl", false, 1, 18000),
(6, "zl1", true, 1, 18000),
(5, "win7", false, 1, 16000)
).toDF("id", "name", "sex", "dept", "salary")
df.write
.format("json")
.save("file:///Users/mashikang/IdeaProjects/spark_sql/src/main/resources/json")
spark.read
.json("file:///Users/mashikang/IdeaProjects/spark_sql/src/main/resources/json")
.show()
+----+---+----+------+-----+
|dept| id|name|salary| sex|
+----+---+----+------+-----+
| 1| 5|win7| 16000|false|
| 2| 3| ww| 14000|false|
| 1| 4| zl| 18000|false|
| 2| 2| ls| 18000|false|
| 1| 6| zl1| 18000| true|
| 1| 1| zs| 15000| true|
+----+---+----+------+-----+
ORC(存儲壓縮格式,比較節省空間)
// ORC: write the DataFrame in the compressed columnar ORC format, then read it back.
val df = List(
(1, "zs", true, 1, 15000),
(2, "ls", false, 2, 18000),
(3, "ww", false, 2, 14000),
(4, "zl", false, 1, 18000),
(6, "zl1", true, 1, 18000),
(5, "win7", false, 1, 16000)
).toDF("id", "name", "sex", "dept", "salary")
df.write
.format("orc")
.save("file:///Users/mashikang/IdeaProjects/spark_sql/src/main/resources/orc")
spark.read
.orc("file:///Users/mashikang/IdeaProjects/spark_sql/src/main/resources/orc")
.show()
+---+----+-----+----+------+
| id|name| sex|dept|salary|
+---+----+-----+----+------+
| 5|win7|false| 1| 16000|
| 4| zl|false| 1| 18000|
| 3| ww|false| 2| 14000|
| 6| zl1| true| 1| 18000|
| 1| zs| true| 1| 15000|
| 2| ls|false| 2| 18000|
+---+----+-----+----+------+
CSV
// CSV: write with a header row, then read back inferring column types.
val df = List(
(1, "zs", true, 1, 15000),
(2, "ls", false, 2, 18000),
(3, "ww", false, 2, 14000),
(4, "zl", false, 1, 18000),
(6, "zl1", true, 1, 18000),
(5, "win7", false, 1, 16000)
).toDF("id", "name", "sex", "dept", "salary")
// FIX: "inferSchema" is a read-side option and is ignored on write — removed from the writer.
df.write
.format("csv")
.option("sep", ",")
.option("header", "true")
.save("file:///Users/mashikang/IdeaProjects/spark_sql/src/main/resources/csv")
spark.read
.option("sep", ",")
.option("inferSchema", "true")
.option("header", "true")
.csv("file:///Users/mashikang/IdeaProjects/spark_sql/src/main/resources/csv")
.show()
+---+----+-----+----+------+
| id|name| sex|dept|salary|
+---+----+-----+----+------+
| 5|win7|false| 1| 16000|
| 4| zl|false| 1| 18000|
| 3| ww|false| 2| 14000|
| 2| ls|false| 2| 18000|
| 6| zl1| true| 1| 18000|
| 1| zs| true| 1| 15000|
+---+----+-----+----+------+
JDBC
// JDBC: overwrite a MySQL table with the DataFrame, then read it back via a Properties object.
// Requires org.apache.spark.sql.SaveMode and java.util.Properties in scope.
val usersDF = List(
(1, "張曉三", 1, 15000),
(2, "李曉四", 1, 18000),
(3, "王曉五", 1, 10000)
).toDF("id", "name", "dept", "salary")
usersDF.write
.format("jdbc")
.mode(SaveMode.Overwrite)
.option("url", "jdbc:mysql://localhost:3306/test")
.option("dbtable", "t_user")
.option("user", "root")
.option("password", "root")
.save()
// connection credentials for the read side
val props = new Properties()
props.put("user", "root")
props.put("password", "root")
spark.read
.jdbc("jdbc:mysql://localhost:3306/test", "t_user", props)
.show()
或者
// JDBC variant: same write, but the read side uses format("jdbc") + options instead of Properties.
val usersDF = List(
(1, "張曉三", 1, 15000),
(2, "李曉四", 1, 18000),
(3, "王曉五", 1, 10000)
).toDF("id", "name", "dept", "salary")
usersDF.write
.format("jdbc")
.mode(SaveMode.Overwrite)
.option("user", "root")
.option("password", "root")
.option("url", "jdbc:mysql://localhost:3306/test")
.option("dbtable", "t_user")
.save()
// read the table back through the generic load() path
spark.read.format("jdbc")
.option("url", "jdbc:mysql://localhost:3306/test")
.option("dbtable", "t_user")
.option("user", "root")
.option("password", "root")
.load()
.show()
DataFrame轉爲RDD
// Drop down from DataFrame to RDD[Row] and print selected columns, one partition at a time.
val usersDF = List(
(1, "張曉三", 1, 15000.0),
(2, "李曉四", 1, 18000.0),
(3, "王曉五", 1, 10000.0)
).toDF("id", "name", "dept", "salary")
usersDF.rdd.foreachPartition { partition =>
partition.foreach { row =>
// extract typed column values by name from the generic Row
val id = row.getAs[Int]("id")
val name = row.getAs[String]("name")
val salary = row.getAs[Double]("salary")
println(s"$id,$name,$salary")
}
}
2,李曉四,18000.0
3,王曉五,10000.0
1,張曉三,15000.0