# Ranking window functions demo: the four ranking flavors plus ntile,
# all over the same window (per-department, salary ascending).
spark.sql("""
SELECT
    name
    ,department
    ,salary
    ,ROW_NUMBER()   OVER (PARTITION BY department ORDER BY salary) AS index        -- unique 1..n sequence, no gaps
    ,RANK()         OVER (PARTITION BY department ORDER BY salary) AS rank         -- ties share a rank, gaps after ties
    ,DENSE_RANK()   OVER (PARTITION BY department ORDER BY salary) AS dense_rank   -- ties share a rank, no gaps
    ,PERCENT_RANK() OVER (PARTITION BY department ORDER BY salary) AS percent_rank -- (rank - 1) / (partition rows - 1)
    ,NTILE(2)       OVER (PARTITION BY department ORDER BY salary) AS ntile        -- split each partition into 2 buckets
FROM salary
""").toPandas()
# Positional window functions demo: cume_dist, lag/lead offsets and
# first_value/last_value within each department, salary ascending.
spark.sql("""
SELECT
    name
    ,department
    ,salary
    ,ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary) AS index
    ,CUME_DIST()  OVER (PARTITION BY department ORDER BY salary) AS cume_dist
    ,LAG(salary, 1)  OVER (PARTITION BY department ORDER BY salary) AS lag    -- value from one row BEFORE the current row
    ,LEAD(salary, 1) OVER (PARTITION BY department ORDER BY salary) AS lead   -- value from one row AFTER the current row
    ,LAG(salary, 0)  OVER (PARTITION BY department ORDER BY salary) AS lag_0  -- offset 0: the current row itself
    ,LEAD(salary, 0) OVER (PARTITION BY department ORDER BY salary) AS lead_0 -- offset 0: the current row itself
    ,FIRST_VALUE(salary) OVER (PARTITION BY department ORDER BY salary) AS first_value
    -- NOTE: with the default frame (RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
    -- last_value returns the current row's peer-group value, NOT the partition maximum.
    ,LAST_VALUE(salary)  OVER (PARTITION BY department ORDER BY salary) AS last_value
FROM salary
""").toPandas()
name
department
salary
index
cume_dist
lag
lead
lag_0
lead_0
first_value
last_value
0
Patricio
Marketing
2500.0
1
0.500000
NaN
3100.0
2500.0
2500.0
2500.0
2500.0
1
Jeff
Marketing
3100.0
2
1.000000
2500.0
NaN
3100.0
3100.0
2500.0
3100.0
2
Berni
Sales
NaN
1
0.166667
NaN
3000.0
NaN
NaN
NaN
NaN
3
Kyoichi
Sales
3000.0
2
0.333333
NaN
4200.0
3000.0
3000.0
NaN
3000.0
4
Georgi
Sales
4200.0
3
0.666667
3000.0
4200.0
4200.0
4200.0
NaN
4200.0
5
Guoxiang
Sales
4200.0
4
0.666667
4200.0
4500.0
4200.0
4200.0
NaN
4200.0
6
Tom
Sales
4500.0
5
0.833333
4200.0
4700.0
4500.0
4500.0
NaN
4500.0
7
Berni
Sales
4700.0
6
1.000000
4500.0
NaN
4700.0
4700.0
NaN
4700.0
8
Parto
Finance
2700.0
1
0.333333
NaN
3300.0
2700.0
2700.0
2700.0
2700.0
9
Anneke
Finance
3300.0
2
0.666667
2700.0
3900.0
3300.0
3300.0
2700.0
3300.0
10
Sumant
Finance
3900.0
3
1.000000
3300.0
NaN
3900.0
3900.0
2700.0
3900.0
aggregate functions
只是在一定窗口裏實現一些普通的聚合函數。
sql
功能
avg
平均值
sum
求和
min
最小值
max
最大值
# Aggregates used as window functions. Because the window has an ORDER BY,
# the default frame is RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW,
# so each aggregate is a *running* value up to the current peer group,
# not a whole-partition total (visible in the output above: Marketing sum
# is 2500 then 5600).
spark.sql("""
SELECT
    name
    ,department
    ,salary
    ,ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary) AS index
    ,SUM(salary) OVER (PARTITION BY department ORDER BY salary) AS sum -- running total
    ,AVG(salary) OVER (PARTITION BY department ORDER BY salary) AS avg -- running average
    ,MIN(salary) OVER (PARTITION BY department ORDER BY salary) AS min -- running minimum
    ,MAX(salary) OVER (PARTITION BY department ORDER BY salary) AS max -- running maximum
FROM salary
""").toPandas()
name
department
salary
index
sum
avg
min
max
0
Patricio
Marketing
2500.0
1
2500.0
2500.0
2500.0
2500.0
1
Jeff
Marketing
3100.0
2
5600.0
2800.0
2500.0
3100.0
2
Berni
Sales
NaN
1
NaN
NaN
NaN
NaN
3
Kyoichi
Sales
3000.0
2
3000.0
3000.0
3000.0
3000.0
4
Georgi
Sales
4200.0
3
11400.0
3800.0
3000.0
4200.0
5
Guoxiang
Sales
4200.0
4
11400.0
3800.0
3000.0
4200.0
6
Tom
Sales
4500.0
5
15900.0
3975.0
3000.0
4500.0
7
Berni
Sales
4700.0
6
20600.0
4120.0
3000.0
4700.0
8
Parto
Finance
2700.0
1
2700.0
2700.0
2700.0
2700.0
9
Anneke
Finance
3300.0
2
6000.0
3000.0
2700.0
3300.0
10
Sumant
Finance
3900.0
3
9900.0
3300.0
2700.0
3900.0
窗口子句
ROWS/RANGE窗口子句: 用於控制窗口的尺寸邊界,有兩種(ROWS,RANGE)
ROWS: 物理窗口,數據篩選基於排序後的index
RANGE: 邏輯窗口,數據篩選基於值
語法:OVER (PARTITION BY … ORDER BY … frame_type BETWEEN start AND end)
有以下5種邊界
CURRENT ROW: 當前行
UNBOUNDED PRECEDING: 分區第一行
UNBOUNDED FOLLOWING: 分區最後一行
n PRECEDING: 當前行,向前n行
n FOLLOWING: 當前行,向後n行
UNBOUNDED: 起點
# Frame-clause (ROWS BETWEEN ...) demo: for each employee, collect the
# salaries of colleagues hired earlier in the same department.
spark.sql("""
SELECT
    name
    ,department
    ,create_time
    ,ROW_NUMBER() OVER (PARTITION BY department ORDER BY create_time) AS index
    -- rows whose salary is NULL get a NULL sort key, which sorts first in
    -- ascending order (Spark default NULLS FIRST)
    ,ROW_NUMBER() OVER (PARTITION BY department ORDER BY (CASE WHEN salary IS NOT NULL THEN create_time END)) AS index_ignore_null
    ,salary
    -- all earlier salaries in the department; collect_list silently drops NULLs
    ,COLLECT_LIST(salary) OVER (PARTITION BY department ORDER BY create_time ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) AS before_salarys
    ,LAST(salary) OVER (PARTITION BY department ORDER BY create_time ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) AS before_salary1
    ,LAG(salary, 1)  OVER (PARTITION BY department ORDER BY create_time) AS before_salary2 -- previous row's salary (one row up; may be NULL)
    ,LEAD(salary, 1) OVER (PARTITION BY department ORDER BY create_time) AS after_salary   -- next row's salary (one row down)
FROM salary
ORDER BY department, index
""").toPandas()
name
department
create_time
index
index_ignore_null
salary
before_salarys
before_salary1
before_salary2
after_salary
0
Anneke
Finance
2020-01-02 08:20:00
1
1
3300.0
[]
NaN
NaN
3900.0
1
Sumant
Finance
2020-01-30 12:01:05
2
2
3900.0
[3300]
3300.0
3300.0
2700.0
2
Parto
Finance
2020-02-20 12:01:00
3
3
2700.0
[3300, 3900]
3900.0
3900.0
NaN
3
Jeff
Marketing
2020-01-02 12:01:00
1
1
3100.0
[]
NaN
NaN
2500.0
4
Patricio
Marketing
2020-01-05 12:18:00
2
2
2500.0
[3100]
3100.0
3100.0
NaN
5
Tom
Sales
2020-01-01 00:01:00
1
2
4500.0
[]
NaN
NaN
4200.0
6
Georgi
Sales
2020-01-02 12:01:00
2
3
4200.0
[4500]
4500.0
4500.0
NaN
7
Berni
Sales
2020-01-07 11:01:00
3
1
NaN
[4500, 4200]
4200.0
4200.0
4200.0
8
Guoxiang
Sales
2020-01-08 12:11:00
4
4
4200.0
[4500, 4200]
NaN
NaN
4700.0
9
Berni
Sales
2020-01-10 11:01:00
5
5
4700.0
[4500, 4200, 4200]
4200.0
4200.0
3000.0
10
Kyoichi
Sales
2020-02-02 12:10:00
6
6
3000.0
[4500, 4200, 4200, 4700]
4700.0
4700.0
NaN
# Within the same department: the most recent NON-NULL salary among
# colleagues hired earlier (collect_list drops NULLs, unlike lag()),
# taken as the last element of the collected list.
spark.sql("""
SELECT
    name
    ,department
    ,create_time
    ,index
    ,salary
    ,before_salarys[size(before_salarys)-1] AS before_salary -- last collected value; NULL when the list is empty
FROM (
    SELECT
        name
        ,department
        ,create_time
        ,ROW_NUMBER() OVER (PARTITION BY department ORDER BY create_time) AS index
        ,salary
        ,COLLECT_LIST(salary) OVER (PARTITION BY department ORDER BY create_time ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) AS before_salarys
    FROM salary
) AS base
-- FIX: ORDER BY moved out of the subquery — sorting a derived table is wasted
-- work and its order is not guaranteed to survive the outer SELECT
ORDER BY department, index
""").toPandas()
name
department
create_time
index
salary
before_salary
0
Anneke
Finance
2020-01-02 08:20:00
1
3300.0
NaN
1
Sumant
Finance
2020-01-30 12:01:05
2
3900.0
3300.0
2
Parto
Finance
2020-02-20 12:01:00
3
2700.0
3900.0
3
Jeff
Marketing
2020-01-02 12:01:00
1
3100.0
NaN
4
Patricio
Marketing
2020-01-05 12:18:00
2
2500.0
3100.0
5
Tom
Sales
2020-01-01 00:01:00
1
4500.0
NaN
6
Georgi
Sales
2020-01-02 12:01:00
2
4200.0
4500.0
7
Berni
Sales
2020-01-07 11:01:00
3
NaN
4200.0
8
Guoxiang
Sales
2020-01-08 12:11:00
4
4200.0
4200.0
9
Berni
Sales
2020-01-10 11:01:00
5
4700.0
4200.0
10
Kyoichi
Sales
2020-02-02 12:10:00
6
3000.0
4700.0
混合應用
# Mixed examples: aggregates, first/last_value, lag/lead and frame clauses
# combined. NULL salaries are filtered out so the whole-table windows are
# well defined.
spark.sql("""
SELECT
    name
    ,department
    ,salary
    ,ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary) AS index
    ,salary - (MIN(salary) OVER (PARTITION BY department ORDER BY salary)) AS salary_diff -- how much above the department minimum
    ,MIN(salary) OVER () AS min_salary_0 -- overall minimum salary
    -- FIX: first_value over an ASCENDING order yields the minimum, so the alias
    -- is min_salary_1 (it was mislabelled max_salary_1)
    ,FIRST_VALUE(salary) OVER (ORDER BY salary) AS min_salary_1
    ,MAX(salary) OVER (ORDER BY salary) AS current_max_salary_0 -- running maximum up to the current row
    -- the default frame ends at the current peer group, so last_value here
    -- equals the running maximum as well
    ,LAST_VALUE(salary) OVER (ORDER BY salary) AS current_max_salary_1
    ,MAX(salary) OVER (PARTITION BY department ORDER BY salary ROWS BETWEEN 1 FOLLOWING AND 1 FOLLOWING) AS next_salary_0 -- next record in salary order
    ,LEAD(salary) OVER (PARTITION BY department ORDER BY salary) AS next_salary_1
FROM salary
WHERE salary IS NOT NULL
""").toPandas()