你真的懂Hive窗口函數嗎,如何開窗聚合?

目錄

1 窗口函數 Windowing functions

FIRST_VALUE(col, bool DEFAULT)

LAST_VALUE(col, bool DEFAULT)

LEAD(col, n, DEFAULT)

LAG(col, n, DEFAULT)

2 OVER詳解 The OVER clause

FUNCTION(expr) OVER([PARTITION BY statement] [ORDER BY statement] [window clause])

2.1 標準聚合函數

COUNT(expr) OVER()

SUM(expr) OVER()

MIN(expr) OVER()

MAX(expr) OVER()

AVG(expr) OVER()

2.2 分析函數 Analytics functions

RANK() OVER()

ROW_NUMBER() OVER()

DENSE_RANK() OVER()

CUME_DIST() OVER()

PERCENT_RANK() OVER()

NTILE(INTEGER x) OVER()

2.3 OVER子句也支持聚合函數

2.4 window clause 的另一種寫法


1 窗口函數 Windowing functions

FIRST_VALUE(col, bool DEFAULT)

返回分組窗口內第一行col的值,DEFAULT默認爲false,如果指定爲true,則跳過NULL後再取值

WITH tmp AS
(
  SELECT 1 AS group_id, 'a' AS col 
  UNION ALL SELECT 1 AS group_id,  'b' AS col 
  UNION ALL SELECT 1 AS group_id,  'c' AS col 
  UNION ALL SELECT 2 AS group_id,  NULL AS col 
  UNION ALL SELECT 2 AS group_id,  'e' AS col
)
SELECT group_id,
       col,
       FIRST_VALUE(col) over(partition by group_id order by col) as col_new
FROM tmp;
group_id col col_new
1 a a
1 b a
1 c a
2 NULL NULL
2 e NULL
WITH tmp AS
(
  SELECT 1 AS group_id, NULL AS col 
  UNION ALL SELECT 1 AS group_id,  'b' AS col 
  UNION ALL SELECT 1 AS group_id,  'c' AS col 
  UNION ALL SELECT 2 AS group_id,  NULL AS col 
  UNION ALL SELECT 2 AS group_id,  'e' AS col
)
SELECT group_id,
       col,
       FIRST_VALUE(col, true) over(partition by group_id order by col) as col_new
FROM tmp;
group_id col col_new
1 NULL NULL
1 b b
1 c b
2 NULL NULL
2 e e

LAST_VALUE(col, bool DEFAULT)

返回分組窗口內最後一行col的值,DEFAULT默認爲false,如果指定爲true,則跳過NULL後再取值

WITH tmp AS
(
  SELECT 1 AS group_id, 'a' AS col 
  UNION ALL SELECT 1 AS group_id,  NULL AS col 
  UNION ALL SELECT 1 AS group_id,  'c' AS col 
  UNION ALL SELECT 2 AS group_id,  'd' AS col 
  UNION ALL SELECT 2 AS group_id,  'e' AS col
)
SELECT group_id,
       col,
       LAST_VALUE(col) over(partition by group_id order by col desc) as col_new
FROM tmp;
group_id col col_new
1 c c
1 a a
1 NULL NULL
2 e e
2 d d
WITH tmp AS
(
  SELECT 1 AS group_id, 'a' AS col 
  UNION ALL SELECT 1 AS group_id,  NULL AS col 
  UNION ALL SELECT 1 AS group_id,  'c' AS col 
  UNION ALL SELECT 2 AS group_id,  'd' AS col 
  UNION ALL SELECT 2 AS group_id,  'e' AS col
)
SELECT group_id,
       col,
       LAST_VALUE(col, true) over(order by group_id,col desc rows between 1 preceding and 1 following) as col_new
FROM tmp;
group_id col col_new
1 c a
1 a a
1 NULL e
2 e d
2 d d

LEAD(col, n, DEFAULT)

返回分組窗口內往下第n行col的值,n默認爲1,往下第n沒有時返回DEFAULT(DEFAULT默認爲NULL)

WITH tmp AS
(
  SELECT 1 AS group_id, 'a' AS col 
  UNION ALL SELECT 1 AS group_id,  'b' AS col 
  UNION ALL SELECT 1 AS group_id,  'c' AS col 
  UNION ALL SELECT 2 AS group_id,  'd' AS col 
  UNION ALL SELECT 2 AS group_id,  'e' AS col
)
SELECT group_id,
       col,
       LEAD(col) over(partition by group_id order by col) as col_new
FROM tmp;

等同於:

WITH tmp AS
(
  SELECT 1 AS group_id, 'a' AS col 
  UNION ALL SELECT 1 AS group_id,  'b' AS col 
  UNION ALL SELECT 1 AS group_id,  'c' AS col 
  UNION ALL SELECT 2 AS group_id,  'd' AS col 
  UNION ALL SELECT 2 AS group_id,  'e' AS col
)
SELECT group_id,
       col,
       LAST_VALUE(col) over(partition by group_id order by col rows between 1 FOLLOWING and 1 FOLLOWING) as col_new
FROM tmp;

返回結果都是:

group_id col col_new
1 a b
1 b c
1 c NULL
2 d e
2 e NULL
WITH tmp AS
(
  SELECT 1 AS group_id, 'a' AS col 
  UNION ALL SELECT 1 AS group_id,  'b' AS col 
  UNION ALL SELECT 1 AS group_id,  'c' AS col 
  UNION ALL SELECT 2 AS group_id,  'd' AS col 
  UNION ALL SELECT 2 AS group_id,  'e' AS col
)
SELECT group_id,
       col,
       LEAD(col, 2, 'z') over(partition by group_id order by col) as col_new
FROM tmp;

返回結果:

group_id col col_new
1 a c
1 b z
1 c z
2 d z
2 e z

LAG(col, n, DEFAULT)

返回分組窗口內往上第n行col的值,n默認爲1,往上第n沒有時返回DEFAULT(DEFAULT默認爲NULL)

WITH tmp AS
(
  SELECT 1 AS group_id, 'a' AS col 
  UNION ALL SELECT 1 AS group_id,  'b' AS col 
  UNION ALL SELECT 1 AS group_id,  'c' AS col 
  UNION ALL SELECT 2 AS group_id,  'd' AS col 
  UNION ALL SELECT 2 AS group_id,  'e' AS col
)
SELECT group_id,
       col,
       LAG(col) over(partition by group_id order by col) as col_new
FROM tmp;

等同於:

WITH tmp AS
(
  SELECT 1 AS group_id, 'a' AS col 
  UNION ALL SELECT 1 AS group_id,  'b' AS col 
  UNION ALL SELECT 1 AS group_id,  'c' AS col 
  UNION ALL SELECT 2 AS group_id,  'd' AS col 
  UNION ALL SELECT 2 AS group_id,  'e' AS col
)
SELECT group_id,
       col,
       FIRST_VALUE(col) over(partition by group_id order by col rows BETWEEN 1 PRECEDING and 1 PRECEDING) as col_new
FROM tmp;

返回結果都是:

group_id col col_new
1 a NULL
1 b a
1 c b
2 d NULL
2 e d
WITH tmp AS
(
  SELECT 1 AS group_id, 'a' AS col 
  UNION ALL SELECT 1 AS group_id,  'b' AS col 
  UNION ALL SELECT 1 AS group_id,  'c' AS col 
  UNION ALL SELECT 2 AS group_id,  'd' AS col 
  UNION ALL SELECT 2 AS group_id,  'e' AS col
)
SELECT group_id,
       col,
       LAG(col, 2, 'zz') over(partition by group_id order by col) as col_new
FROM tmp;

返回結果:

group_id col col_new
1 a zz
1 b zz
1 c a
2 d zz
2 e zz

2 OVER詳解 The OVER clause

FUNCTION(expr) OVER([PARTITION BY statement] [ORDER BY statement] [window clause])

  • FUNCTION:包括標準聚合函數(COUNT、SUM、MIN、MAX、AVG)和一些分析函數(RANK、ROW_NUMBER、DENSE_RANK等)
  • PARTITION BY:可以由一個或者多個列組成
  • ORDER BY:可以由一個或者多個列組成
  • window clause:(ROWS | RANGE) BETWEEN (UNBOUNDED PRECEDING | num PRECEDING | CURRENT ROW) AND (UNBOUNDED PRECEDING | num PRECEDING | CURRENT ROW)
  • 當 window clause 未指定時,默認爲 RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW,即分組內第一行至當前行作爲窗口
  • 當 window clause 和 ORDER BY 都未指定時,默認爲 ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING,即分組內第一行至最後一行作爲窗口

2.1 標準聚合函數

COUNT(expr) OVER()

返回窗口內行數

WITH tmp AS
(
  SELECT 1 AS group_id, 'a' AS col 
  UNION ALL SELECT 1 AS group_id,  'b' AS col 
  UNION ALL SELECT 1 AS group_id,  'c' AS col 
  UNION ALL SELECT 2 AS group_id,  'e' AS col 
  UNION ALL SELECT 2 AS group_id,  'e' AS col
)
SELECT group_id,
       col,
       count(col) over(partition by group_id) as cnt1,
       count(col) over(partition by group_id order by col) as cnt2,
       count(col) over(partition by group_id order by col rows between CURRENT ROW and UNBOUNDED following) as cnt3,
       count(distinct col) over(partition by group_id order by col rows between CURRENT ROW and UNBOUNDED following) as cnt4
FROM tmp;
group_id col cnt1 cnt2 cnt3 cnt4
1 a 3 1 3 3
1 b 3 2 2 2
1 c 3 3 1 1
2 e 2 2 2 1
2 e 2 2 1 1

SUM(expr) OVER()

返回窗口內求和值

WITH tmp AS
(
  SELECT 1 AS group_id, 1 AS col 
  UNION ALL SELECT 1 AS group_id,  2 AS col 
  UNION ALL SELECT 1 AS group_id,  3 AS col 
  UNION ALL SELECT 2 AS group_id,  4 AS col 
  UNION ALL SELECT 2 AS group_id,  4 AS col
)
SELECT group_id,
       col,
       SUM(col) over(partition by group_id) as sum1,
       SUM(col) over(partition by group_id order by col) as sum2,
       SUM(col) over(partition by group_id order by col rows between CURRENT ROW and UNBOUNDED following) as sum3,
       SUM(distinct col) over(partition by group_id order by col rows between CURRENT ROW and UNBOUNDED following) as sum4
FROM tmp;
group_id col sum1 sum2 sum3 sum4
1 1 6 1 6 6
1 2 6 3 5 5
1 3 6 6 3 3
2 4 8 8 8 4
2 4 8 8 4 4

MIN(expr) OVER()

返回窗口內最小值

WITH tmp AS
(
  SELECT 1 AS group_id, 1 AS col 
  UNION ALL SELECT 1 AS group_id,  2 AS col 
  UNION ALL SELECT 1 AS group_id,  3 AS col 
  UNION ALL SELECT 2 AS group_id,  4 AS col 
  UNION ALL SELECT 2 AS group_id,  5 AS col
)
SELECT group_id,
       col,
       MIN(col) over(partition by group_id) as min1,
       MIN(col) over(partition by group_id order by col) as min2,
       MIN(col) over(partition by group_id order by col rows between CURRENT ROW and UNBOUNDED following) as min3
FROM tmp;
group_id col min1 min2 min3
1 1 1 1 1
1 2 1 1 2
1 3 1 1 3
2 4 4 4 4
2 5 4 4 5

MAX(expr) OVER()

返回窗口內最大值

WITH tmp AS
(
  SELECT 1 AS group_id, 1 AS col 
  UNION ALL SELECT 1 AS group_id,  2 AS col 
  UNION ALL SELECT 1 AS group_id,  3 AS col 
  UNION ALL SELECT 2 AS group_id,  4 AS col 
  UNION ALL SELECT 2 AS group_id,  5 AS col
)
SELECT group_id,
       col,
       MAX(col) over(partition by group_id) as max1,
       MAX(col) over(partition by group_id order by col) as max2,
       MAX(col) over(partition by group_id order by col rows between CURRENT ROW and UNBOUNDED following) as max3
FROM tmp;
group_id col max1 max2 max3
1 1 3 1 3
1 2 3 2 3
1 3 3 3 3
2 4 5 4 5
2 5 5 5 5

AVG(expr) OVER()

返回窗口內平均值

WITH tmp AS
(
  SELECT 1 AS group_id, 1 AS col 
  UNION ALL SELECT 1 AS group_id,  2 AS col 
  UNION ALL SELECT 1 AS group_id,  3 AS col 
  UNION ALL SELECT 2 AS group_id,  4 AS col 
  UNION ALL SELECT 2 AS group_id,  4 AS col
)
SELECT group_id,
       col,
       AVG(col) over(partition by group_id) as avg1,
       AVG(col) over(partition by group_id order by col) as avg2,
       AVG(col) over(partition by group_id order by col rows between CURRENT ROW and UNBOUNDED following) as avg3,
       AVG(distinct col) over(partition by group_id order by col rows between CURRENT ROW and UNBOUNDED following) as avg4
FROM tmp;
group_id col avg1 avg2 avg3 avg4
1 1 2.0 1.0 2.0 2.0
1 2 2.0 1.5 2.5 2.5
1 3 2.0 2.0 3.0 3.0
2 4 4.0 4.0 4.0 4.0
2 4 4.0 4.0 4.0 4.0

2.2 分析函數 Analytics functions

RANK() OVER()

返回分組內排名(不支持自定義窗口)

WITH tmp AS
(
  SELECT 1 AS group_id, 1 AS col 
  UNION ALL SELECT 1 AS group_id,  3 AS col 
  UNION ALL SELECT 1 AS group_id,  3 AS col 
  UNION ALL SELECT 2 AS group_id,  4 AS col 
  UNION ALL SELECT 2 AS group_id,  5 AS col
)
SELECT group_id,
       col,
       RANK() over(partition by group_id order by col desc) as r
FROM tmp;
group_id col r
1 3 1
1 3 1
1 1 3
2 5 1
2 4 2

ROW_NUMBER() OVER()

返回分組內行號(不支持自定義窗口)

WITH tmp AS
(
  SELECT 1 AS group_id, 1 AS col 
  UNION ALL SELECT 1 AS group_id,  3 AS col 
  UNION ALL SELECT 1 AS group_id,  3 AS col 
  UNION ALL SELECT 2 AS group_id,  4 AS col 
  UNION ALL SELECT 2 AS group_id,  5 AS col
)
SELECT group_id,
       col,
       ROW_NUMBER() over(partition by group_id order by col desc) as r
FROM tmp;
group_id col r
1 3 1
1 3 2
1 1 3
2 5 1
2 4 2

DENSE_RANK() OVER()

返回分組內排名(排名相等不會留下空位,不支持自定義窗口)

WITH tmp AS
(
  SELECT 1 AS group_id, 1 AS col 
  UNION ALL SELECT 1 AS group_id,  3 AS col 
  UNION ALL SELECT 1 AS group_id,  3 AS col 
  UNION ALL SELECT 2 AS group_id,  4 AS col 
  UNION ALL SELECT 2 AS group_id,  5 AS col
)
SELECT group_id,
       col,
       DENSE_RANK() over(partition by group_id order by col desc) as r
FROM tmp;
group_id col r
1 3 1
1 3 1
1 1 2
2 5 1
2 4 2

CUME_DIST() OVER()

返回分組內累計分佈值,即分組內小於(或者大於)等於當前值行數/分組內總行數

WITH tmp AS
(
  SELECT 1 AS group_id, 1 AS col 
  UNION ALL SELECT 1 AS group_id,  3 AS col 
  UNION ALL SELECT 1 AS group_id,  3 AS col 
  UNION ALL SELECT 2 AS group_id,  4 AS col 
  UNION ALL SELECT 2 AS group_id,  5 AS col
)
SELECT group_id,
       col,
       CUME_DIST() over(partition by group_id order by col asc) as d1,
       CUME_DIST() over(partition by group_id order by col desc) as d2
FROM tmp;
group_id col d1 d2
1 3 1.0 0.6666666666666666
1 3 1.0 0.6666666666666666
1 1 0.3333333333333333 1.0
2 5 1.0 0.5
2 4 0.5 1.0

PERCENT_RANK() OVER()

返回百分比排序值,即分組內當前行的RANK值-1/分組內總行數-1

WITH tmp AS
(
  SELECT 1 AS group_id, 1 AS col 
  UNION ALL SELECT 1 AS group_id,  3 AS col 
  UNION ALL SELECT 1 AS group_id,  3 AS col 
  UNION ALL SELECT 2 AS group_id,  4 AS col 
  UNION ALL SELECT 2 AS group_id,  5 AS col
)
SELECT group_id,
       col,
       RANK() over(partition by group_id order by col asc) as r1,
       PERCENT_RANK() over(partition by group_id order by col asc) as p1,
       RANK() over(partition by group_id order by col desc) as r2,
       PERCENT_RANK() over(partition by group_id order by col desc) as p2
FROM tmp;
group_id col r1 p1 r2 p2
1 3 2 0.5 1 0.0
1 3 2 0.5 1 0.0
1 1 1 0.0 3 1.0
2 5 2 1.0 1 0.0
2 4 1 0.0 2 1.0

NTILE(INTEGER x) OVER()

返回分區編號(將有序分區劃分爲x個組,稱爲bucket,併爲分區中的每一行分配一個bucket編號)

WITH tmp AS
(
  SELECT 1 AS group_id, 1 AS col 
  UNION ALL SELECT 1 AS group_id,  3 AS col 
  UNION ALL SELECT 1 AS group_id,  3 AS col 
  UNION ALL SELECT 1 AS group_id,  3 AS col 
  UNION ALL SELECT 2 AS group_id,  4 AS col 
  UNION ALL SELECT 2 AS group_id,  5 AS col
)
SELECT group_id,
       col,
       NTILE(2) over(partition by group_id order by col asc) as bucket_id
FROM tmp;
group_id col bucket_id
1 1 1
1 3 1
1 3 2
1 3 2
2 4 1
2 5 2

2.3 OVER子句也支持聚合函數

Hive 2.1.0及之後版本,OVER子句也支持聚合函數,如:

WITH tmp AS
(
  SELECT 1 AS group_id, 1 AS col 
  UNION ALL SELECT 1 AS group_id,  3 AS col 
  UNION ALL SELECT 1 AS group_id,  3 AS col 
  UNION ALL SELECT 2 AS group_id,  4 AS col 
  UNION ALL SELECT 2 AS group_id,  5 AS col
)
SELECT group_id,
       RANK() over(order by sum(col) desc) as r
FROM tmp
group by group_id;

結果爲:

group_id r
2 1
1 2

2.4 window clause 的另一種寫法

將window子句寫在from後面,在over後使用別名進行引用,如下:

WITH tmp AS
(
  SELECT 1 AS group_id, 1 AS col 
  UNION ALL SELECT 1 AS group_id,  2 AS col 
  UNION ALL SELECT 1 AS group_id,  3 AS col 
  UNION ALL SELECT 2 AS group_id,  4 AS col 
  UNION ALL SELECT 2 AS group_id,  4 AS col
)
SELECT group_id,
       col,
       AVG(col) over w1 as avg1,
       AVG(distinct col) over(partition by group_id order by col rows between CURRENT ROW and UNBOUNDED following) as avg2
FROM tmp
WINDOW w1 AS (partition by group_id order by col rows between CURRENT ROW and UNBOUNDED following);

結果爲:

group_id col avg1 avg2
1 1 2.0 2.0
1 2 2.5 2.5
1 3 3.0 3.0
2 4 4.0 4.0
2 4 4.0 4.0
WITH tmp AS
(
  SELECT 1 AS group_id, 1 AS col 
  UNION ALL SELECT 1 AS group_id,  2 AS col 
  UNION ALL SELECT 1 AS group_id,  3 AS col 
  UNION ALL SELECT 2 AS group_id,  4 AS col 
  UNION ALL SELECT 2 AS group_id,  4 AS col
)
SELECT group_id,
       col,
       AVG(col) over w1 as avg1,
       AVG(distinct col) over w2 as avg2
FROM tmp
WINDOW w1 AS (partition by group_id order by col rows between CURRENT ROW and UNBOUNDED following),
w2 AS (partition by group_id order by col rows between CURRENT ROW and UNBOUNDED following);

結果爲:

group_id col avg1 avg2
1 1 2.0 2.0
1 2 2.5 2.5
1 3 3.0 3.0
2 4 4.0 4.0
2 4 4.0 4.0
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章