一 hive mysql create table as/like區別
mysql
-- MySQL: with GTID disabled this copies no data -- it only creates an
-- identical table structure (including indexes and constraints). Combined
-- with the INSERT below, it replicates both the structure and the data of
-- the source table.
create table tbl_test_bak like tbl_test;
insert into tbl_test_bak select * from tbl_test;
-- MySQL: CTAS also copies the structure plus the data, but it drops indexes
-- and constraints, so it is no longer recommended. With GTID enabled the
-- statement is rejected outright (CREATE TABLE ... SELECT violates GTID
-- consistency and fails with an error).
create table tbl_test_bak as select * from tbl_test;
hive
create table as
CTAS建表語句(CREATE TABLE AS SELECT)
使用查詢創建並填充表,select中選取的列名會作爲新表的列名(所以通常是要取別名)
1)會改變表的屬性、結構,比如只能是內部表、分區分桶也沒了
2)目標表不允許使用分區分桶的,FAILED: SemanticException [Error 10068]: CREATE-TABLE-AS-SELECT does not support partitioning in the target table
3)對於舊錶中的分區字段,如果通過select * 的方式,新表會把它看作一個新的字段,這裏要注意
目標表不允許使用外部表,如create external table … as select…報錯 FAILED: SemanticException [Error 10070]: CREATE-TABLE-AS-SELECT cannot create external table
4)CTAS創建的表存儲格式會變成默認的格式TEXTFILE
5)對了,還有字段的註釋comment也會丟掉,同時新表也無法加上註釋
6)但可以在CTAS語句中指定表的存儲格式,行和列的分隔符等
create table xxx as select ...
create table xxx
row format delimited
fields terminated by ' '
stored as parquet
as
select ...
create table like
如何快速複製一張分區表?完全複製備份表。
方法1
create table... like...
+insert into table ... partition(xxx)...select...
-- Method 1: duplicate a partitioned table's schema with CREATE TABLE LIKE,
-- then reload it with a dynamic-partition INSERT (the partition column dt
-- must come last in the SELECT list).
-- NOTE(review): "[external]" is placeholder syntax from the notes -- pick
-- either the plain or the EXTERNAL form before actually running this.
create [external] table partition_test like old_table;
insert into table partition_test
partition(dt)
select
trim(userid) userid,
trim(cardno) cardno,
if(lower(trim(flag)) in ("true","1","01"),"1","0") flag,
substr(cardtype,1,1) cardtype,
trim(times) times,
-- substr start index 0 behaves like 1 in Hive; 1 is the idiomatic form.
substr(times,0,10) dt
from old_table
-- NOTE(review): ORDER BY rand() LIMIT 100 keeps only 100 random rows; for
-- the "full backup copy" the heading above describes, drop these two lines.
order by rand()
limit 100;
方法2
- 或者通過hdfs複製數據並修復新表的分區相關元數據
create table newtable like oldtable
hdfs dfs -cp /old_table/ /path/new_table/
msck repair table newtable
二 幾個sql題目的網址
https://www.cnblogs.com/qingyunzong/p/8747656.html
1 列轉行、行轉列
explode就是將hive一行中複雜的array或者map結構拆分成多行。
select explode(split('a,b,c,d,e',','));
--------------------------------------------------------------------------------
OK
a
b
c
d
e
Time taken: 19.317 seconds, Fetched: 5 row(s)
Lateral View一般與用戶自定義表生成函數(如explode())結合使用。 如內置表生成函數中所述,UDTF爲每個輸入行生成零個或多個輸出行。 Lateral View 首先將UDTF應用於基表的每一行,然後將結果輸出行連接到輸入行,以形成具有提供的表別名的虛擬表。
select 'test' a,split('a,b,c,d,e',',') b
--------------------------------------------------------------------------------
OK
test ["a","b","c","d","e"]
Time taken: 10.905 seconds, Fetched: 1 row(s)
-- LATERAL VIEW applies the UDTF explode() to each row of test, then joins
-- every produced element back to its source row (virtual table alias
-- adTable, element column adid) -- yielding 5 rows here, one per element.
with test as (
select 'test' a,split('a,b,c,d,e',',') b
)
select a,adid from test LATERAL VIEW explode(b) adTable AS adid;
--------------------------------------------------------------------------------
OK
test a
test b
test c
test d
test e
Time taken: 6.063 seconds, Fetched: 5 row(s)
正式的行列轉換的例子
hive> select * from col_lie limit 10;
OK
col_lie.user_id col_lie.order_id
104399 1715131
104399 2105395
104399 1758844
104399 981085
104399 2444143
104399 1458638
104399 968412
104400 1609001
104400 2986088
104400 1795054
行轉列#########################
hive>
-- Rows to columns: collapse each user's order_ids into a single
-- comma-separated string. collect_list keeps duplicates (visible in the
-- output below); use collect_set instead to deduplicate.
create table lie_col as
select user_id,
concat_ws(',',collect_list(order_id)) as order_value
from col_lie
group by user_id;
--------------------------------------------------------------------------------
OK
104399 1715131,2105395,1758844,981085,2444143,1458638,968412,1715131,2105395,1758844,981085,2444143,1458638,968412
104400 1609001,2986088,1795054,1609001,2986088,1795054
Time taken: 23.739 seconds, Fetched: 2 row(s)
列轉行###################
hive>
-- Columns to rows: split the CSV string back into an array and explode each
-- element into its own row via LATERAL VIEW (alias num, column order_id).
select user_id,order_id
from lie_col
lateral view explode(split(order_value,',')) num as order_id;
--------------------------------------------------------------------------------
OK
104399 1715131
104399 2105395
104399 1758844
104399 981085
104399 2444143
104399 1458638
104399 968412
104399 1715131
104399 2105395
104399 1758844
104399 981085
104399 2444143
104399 1458638
104399 968412
104400 1609001
104400 2986088
104400 1795054
104400 1609001
104400 2986088
104400 1795054
Time taken: 8.171 seconds, Fetched: 20 row(s)
2 窗口函數(主要用途是篩選沒有出現在group by後面的字段的需求)
over()指定分析函數工作的數據窗口大小,這個窗口大小可能隨着行的變化而變化
current row:當前行
n preceding 往前n行數據
n following 往後n行數據
unbounded 起點
unbounded preceding 從前面的起點
unbounded following 到後面的終點
lag(col,n)往前第n行數據
lead(col,n) 往後第n行數據
ntile(n) 把有序分區中的行分發到指定數據的組中,各個組有編號,編號從1開始,對於每一行,ntile返回此行所屬的組的編號。
例子:
表student中的數據格式如下:
name month degree
s1 201801 A
s1 201802 A
s1 201803 C
s1 201804 A
s1 201805 A
s1 201806 A
s2 201801 A
s2 201802 B
s2 201803 C
s2 201804 A
s2 201805 D
s2 201806 A
s3 201801 C
s3 201802 A
s3 201803 A
s3 201804 A
s3 201805 B
s3 201806 A
現要查詢表中連續三個月以上degree均爲A的記錄
-- Find every record that belongs to a run of >= 3 consecutive months with
-- degree 'A'. Each scoreN counts the A's inside a 3-row window anchored
-- differently (ending at, centered on, or starting at the current row); a
-- row is part of some all-A run of 3 iff any of the three counts equals 3.
-- NOTE(review): this assumes each name has exactly one row per month with
-- no gaps, so adjacent rows really are adjacent months -- confirm upstream.
select
a1.name,
a1.month,
a1.degree
from
(
select
name,
month,
degree,
-- window: the two previous rows plus the current row
sum(if(degree = 'A', 1, 0)) OVER(PARTITION BY name ORDER BY month ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS score1,
-- window: one row on either side of the current row
sum(if(degree = 'A', 1, 0)) OVER(PARTITION BY name ORDER BY month ROWS BETWEEN 1 PRECEDING AND 1 following) AS score2,
-- window: the current row plus the two following rows
sum(if(degree = 'A', 1, 0)) OVER(PARTITION BY name ORDER BY month ROWS BETWEEN CURRENT ROW AND 2 following) AS score3
from student
) as a1
where
a1.score1 = 3 or
a1.score2 = 3 or
a1.score3 = 3
例子2
兩者效果是一樣的, 窗口是開在了group by聚合之後的結果集,臨時表上。
-- count(*) over () runs on the result of the GROUP BY subquery, so every
-- surviving name row carries the total number of distinct names in 2017-01.
select a.name, count(*) over ()
from
(
select name
from business
where substr(orderdate,1,7)="2017-01"
group by name) a;
--------------------------------------------------------------------------------
OK
andy 4
jack 4
lucy 4
tom 4
Time taken: 7.586 seconds, Fetched: 4 row(s)
-- Equivalent to the subquery form: the window function is evaluated AFTER
-- GROUP BY, so count(*) over() counts the grouped (distinct-name) rows.
select name, count(*) over()
from business
where substr(orderdate,1,7)="2017-01"
group by name;
--------------------------------------------------------------------------------
OK
andy 4
jack 4
lucy 4
tom 4
Time taken: 7.586 seconds, Fetched: 4 row(s)
例子3
over(partition by uname order by umonth) 窗口函數裏面加order by 跟不加有什麼變化。
經過測試,加 order by 相當於後面再加上 rows between unbounded preceding and current row,兩者結合效果是一樣的。(嚴格地說,只寫 order by 時默認框架是 range between unbounded preceding and current row;當排序鍵沒有重複值時,它與 rows 框架的結果纔完全一致,排序鍵有並列值時 range 會把並列行一起計入。)
order by;
order by rows between unbounded preceding and current row;
###背景如下:
-- Background data: monthly visit totals per user (one row per user/month).
select
uname,
umonth,
sum(ucount) mncounts
from t_access
group by uname,umonth;
--------------------------------------------------------------------------------
OK
A 2015-01 33
A 2015-02 10
A 2015-03 38
B 2015-01 30
B 2015-02 15
B 2015-03 34
Time taken: 17.804 seconds, Fetched: 6 row(s)
###第一次計算
with t1 as (
select
uname,
umonth,
sum(ucount) mncounts
from t_access
group by uname,umonth
)
select
uname,
umonth,
mncounts,
max(mncounts) over(partition by uname rows between unbounded preceding and current row),
sum(mncounts) over(partition by uname order by umonth) totalcounts
-- 上面這句換成這句效果一樣: sum(mncounts) over(partition by uname order by umonth rows between unbounded preceding and current row) totalcounts
from
t1;
--------------------------------------------------------------------------------
OK
A 2015-01 33 33 33
A 2015-02 10 33 43
A 2015-03 38 38 81
B 2015-01 30 30 30
B 2015-02 15 30 45
B 2015-03 34 34 79
Time taken: 14.4 seconds, Fetched: 6 row(s)
####注意這次窗口函數沒有加order by umonth
-- Same query as before, but the sum() window now has no ORDER BY, so its
-- frame is the entire partition: totalcounts becomes the per-user grand
-- total on every row instead of a running total.
with t1 as (
select
uname,
umonth,
sum(ucount) mncounts
from t_access
group by uname,umonth
)
select
uname,
umonth,
mncounts,
-- NOTE(review): a ROWS frame with no ORDER BY depends on the engine's row
-- order inside the partition; the output below happens to be month-ordered,
-- but that is not guaranteed.
max(mncounts) over(partition by uname rows between unbounded preceding and current row),
sum(mncounts) over(partition by uname) totalcounts
from
t1;
--------------------------------------------------------------------------------
OK
A 2015-01 33 33 81
A 2015-02 10 33 81
A 2015-03 38 38 81
B 2015-01 30 30 79
B 2015-02 15 30 79
B 2015-03 34 34 79
Time taken: 22.425 seconds, Fetched: 6 row(s)
例子4
分析顧客上次購物的時間?
=====================背景=====================
hive> select * from business;
OK
jack 2017-01-01 10
jack 2017-01-02 10
tom 2017-01-01 10
tom 2017-01-02 10
tom 2017-01-03 10
andy 2017-01-01 10
andy 2017-01-02 10
andy 2017-01-03 10
lucy 2017-01-01 10
lucy 2017-01-02 10
lucy 2017-01-03 10
jack 2017-01-03 10
jack 2017-02-01 10
jack 2017-02-02 10
tom 2017-02-01 10
tom 2017-02-02 10
tom 2017-02-03 10
andy 2017-02-01 10
Time taken: 0.071 seconds, Fetched: 18 row(s)
===================================正式計算============================
-- Previous purchase date per customer: lag(orderdate,1) looks one row back
-- within each name, ordered by orderdate (distribute by / sort by are Hive's
-- equivalents of partition by / order by inside OVER); the first row of each
-- customer gets NULL, as the output below shows.
select *, lag(orderdate,1) over(distribute by name sort by orderdate) from business;
--------------------------------------------------------------------------------
OK
andy 2017-01-01 10 NULL
andy 2017-01-02 10 2017-01-01
andy 2017-01-03 10 2017-01-02
andy 2017-02-01 10 2017-01-03
jack 2017-01-01 10 NULL
jack 2017-01-02 10 2017-01-01
jack 2017-01-03 10 2017-01-02
jack 2017-02-01 10 2017-01-03
jack 2017-02-02 10 2017-02-01
lucy 2017-01-01 10 NULL
lucy 2017-01-02 10 2017-01-01
lucy 2017-01-03 10 2017-01-02
tom 2017-01-01 10 NULL
tom 2017-01-02 10 2017-01-01
tom 2017-01-03 10 2017-01-02
tom 2017-02-01 10 2017-01-03
tom 2017-02-02 10 2017-02-01
tom 2017-02-03 10 2017-02-02
Time taken: 9.021 seconds, Fetched: 18 row(s)