【Hive】SQL語句大全

文章目錄

庫操作

創建數據庫

-- Create a database; its default location on HDFS is /user/hive/warehouse/*.db
create database mydatabase;
-- Use `if not exists` to skip creation when the database already exists
create database if not exists mydatabase;
-- Create a database and specify where it is stored
create database mydatabase location '/mydatabase.db';
-- Create a database that uses an existing directory (my) as its storage location
create database mydatabase location '/databases/my/';

查詢數據庫

-- List every database
show databases;
-- Pattern (fuzzy) search
show databases like 'my*';
-- Show information about a database
describe database mydatabase;
-- Show detailed information about a database
describe database extended mydatabase;
-- Switch the current database
use mydatabase;

修改數據庫

可以修改一些其他的附加信息，不能修改元數據信息

-- Attach a custom key/value property to the database
alter database mydatabase
set dbproperties ('createtime' = '202003');
-- Display the property added by the statement above
describe database extended mydatabase;

刪除數據庫

-- Drop an empty database
drop database mydatabase;
-- Prefer `if exists` so the existence of the database is checked first
drop database if exists mydatabase;
-- If the database is not empty, `cascade` forces the drop
drop database mydatabase cascade;

表操作

創建表

-- Create a table
create table student(id int, name string);
-- Create an external table
create external table student(id int, name string);
-- Create a table and set the field delimiter of its data (tab as an example)
create table student(id int, name string)
row format delimited fields terminated by '\t';
-- Create a table and set the delimiter between collection (array) elements (tab as an example).
-- `collection items terminated by` is only valid inside a `row format delimited` clause.
create table student(id int, name string)
row format delimited collection items terminated by '\t';

查看表

-- List all tables in the current database
show tables;

修改表

-- Rename a table
alter table student
rename to new_student;
-- Add columns (new columns are placed before the partition columns;
-- separate several columns with commas inside the parentheses)
alter table student
add columns (gender string);
-- Change a column (old name, new name and new type must all be written)
alter table student
change column name name2 string;
-- Replace all columns of the table (here every column is replaced by a single one)
alter table student
replace columns (replace string);

刪除表

-- Drop the table
drop table student;

內部表(管理表)和外部表

兩者的區別

刪除時，內部表把元數據和具體數據都刪除，而外部表只刪除元數據。

互相轉換

注意：這裏區分大小寫，括號中的內容要大寫！

如果不是大寫，該屬性會變成普通的附加屬性。

-- Convert to an external table
alter table student
set tblproperties ('EXTERNAL' = 'TRUE');
-- Convert to a managed (internal) table
alter table student
set tblproperties ('EXTERNAL' = 'FALSE');

分區表

分區在HDFS上對應一個獨立的文件夾，屬於元數據，但用法相當於一個字段，可以用來過濾

創建分區表

-- Create a table partitioned by the "month" column
create table student(id int, name string)
partitioned by(month string);
-- Create a table with two partition levels (month, then day)
create table student(id int, name string)
partitioned by(month string, day string);

添加分區

-- Add one partition to a partitioned table
alter table student add partition(month='202003');
-- Add several partitions at once (partition specs are separated by spaces);
-- each spec must name a different partition
alter table student add partition(month='202003') partition(month='202004');

往分區表中添加數據

-- Name the target partition with partition(...); if it does not exist yet it is created automatically
load data local inpath '/opt/file.txt' into table student partition(month='202003');
insert into table student partition(month='202003') values(1,'abc');

查詢分區表數據

-- Look up rows through the partition columns
select *
from student
where month = '202003';

select *
from student
where month = '202003'
  and day = '01';

刪除分區

-- Drop one partition of a partitioned table
alter table student drop partition(month='202003');
-- Drop several partitions at once (partition specs are separated by commas)
alter table student drop partition(month='202003'),partition(month='202004');

查看分區

-- List all partitions of the table
show partitions student;

修復分區

如果數據是通過HDFS直接上傳到分區目錄，而該分區尚未在元數據中定義，則會查詢不到剛上傳的數據

-- Repair command: makes data uploaded directly into partition directories queryable
msck repair table student;
-- Alternatively, register the directory as a partition directly (month='20200316' as an example)
alter table student add partition(month='20200316');

數據操作

數據導入

Load導入

-- Load a local file into a Hive table
load data local inpath '/opt/student.txt' into table student;
-- Load an HDFS file into a Hive table (equivalent to moving the file into Hive's storage directory)
load data inpath '/student.txt' into table student;
-- Alternatively, put the file straight into the table's directory on HDFS.
-- NOTE: the next line is a shell command, not HiveQL; run it from a terminal.
hadoop fs -put student.txt /user/hive/warehouse/student
-- Load and overwrite all data already in the table
load data local inpath '/opt/student.txt' overwrite into table student;
-- Point the table at a data directory with `location` when creating it
create table student(id int, name string)
row format delimited fields terminated by '\t'
location '/dir';

Insert插入

-- Append a single row
insert into table student
values (1, 'abc');
-- Insert a row, overwriting all data already in the table
insert overwrite table student
values (1, 'abc');

Import導入

只能導入被export導出的文件

-- Import data with `import`
import table student2 from '/export/student';

數據查詢

基本查詢

-- Select every column of the table
select *
from student;
-- Select specific columns
select
    id,
    name
from student;
-- Insert the query result into another table
insert into student2
select *
from student;
-- Create a new table from the query result
create table student2 as
select
    id,
    name
from student;
-- Show columns under aliases (the `as` keyword is optional)
select
    id as sid,
    name as sname
from student;
-- Show each id value increased by 100
select id + 100
from student;
-- Common aggregate functions (count, max, min, sum, avg)
select count(*)
from student;
-- `limit` restricts the number of rows returned
select *
from student
limit 3;
-- `where` filters rows
select *
from student
where id = 1;

Floor 取整

-- Round 123.123 down to an integer; the result is 123
select floor(123.123);

Like 和 Rlike

like: 選擇類似的值

% 代表零個或多個字符(任意個字符)。
_ 代表一個字符。

rlike: Java的正則匹配

-- Students whose name starts with '小'
select * from student where name like '小%';
-- Students whose name starts with '小' and is exactly two characters long
select * from student where name like '小_';
-- Rows whose age consists of digits only. rlike succeeds on a partial match,
-- so the pattern is anchored with ^ and $ to cover the whole value.
select * from student where age rlike '^\\d+$';

Distinct 去重

會將數據放入同一個Reducer，可能會報內存溢出，數據量大時慎用

-- However many people share an age, only 1 is shown per age after de-duplication
select
    age,
    count(distinct age)
from mydatabase.student
group by age;

Group By 分組查詢

-- Group by age and use count to show the size of each group
select age,count(*) from student group by age;
-- Group by grade and use avg to show the average age of each group
select grade,avg(age) from student group by grade;
-- Group by age, then by gender, to count males and females per age.
-- The grouping columns are selected so each count can be attributed to its group.
select age,gender,count(gender) from student group by age,gender;

Having 語句

where：對表中的列發揮作用，不可跟聚合函數

having：對查詢結果中的列發揮作用，相當於二次篩選，可跟聚合函數，只能用於 group by 分組統計語句

-- Group by grade and show the grades whose average age is greater than 18
select grade
from student
group by grade
having avg(age) > 18;

Join 語句

Hive 2.2.0 之前的版本只支持等值連接，不支持非等值連接（2.2.0 起 ON 子句可使用複雜表達式）

-- Assume two tables: dept (departments) and employee (employees)
-- Inner join (only rows present on both sides are shown)
-- Match employees and departments on the same department id and show employee name and department name
select
    employee.name,
    dept.name
from employee
inner join dept
    on dept.d_id = employee.d_id;
-- Left outer join (every row of the left table is shown)
select
    employee.name,
    dept.name
from employee
left join dept
    on dept.d_id = employee.d_id;
-- Right outer join (every row of the right table is shown)
select
    employee.name,
    dept.name
from employee
right join dept
    on dept.d_id = employee.d_id;
-- Full outer join (all rows are shown; unmatched values are replaced by NULL)
select
    employee.name,
    dept.name
from employee
full join dept
    on dept.d_id = employee.d_id;

常用查詢函數

NVL 空字段賦值

NVL(string1, replace_with)

如果string1爲NULL，該函數返回replace_with的值，否則返回string1的值

-- Use 18 when age is null
select nvl(age, 18)
from student;
-- The replacement may also be a column: use id when age is null
select nvl(age, id)
from student;

時間類

Date_format

格式化時間，注意:只能匹配橫杆 "-"

-- Format a date string with the given pattern
select date_format('2020-03-19','yyyy-MM-dd HH:mm:ss');
-- Result: 2020-03-19 00:00:00

Date_add

時間跟天數相加，天數可以爲負

-- Add 10 days to the date
select date_add('2020-03-19', 10); 
-- Result: 2020-03-29

Date_sub

時間跟天數相減，天數可以爲負

-- Subtract 10 days from the date
select date_sub('2020-03-19', 10);
-- Result: 2020-03-09

Datediff

兩個時間相減，結果爲天數，注意：是參數1 - 參數2

時分秒不影響最後的結果

-- The result is argument 1 minus argument 2, in days
select datediff('2020-03-19', '2020-03-29');
-- Result: -10
select datediff('2020-03-29', '2020-03-19');
-- Result: 10
-- The time-of-day part does not change the result
select datediff('2020-03-29 13:13:13','2020-03-19 12:12:12');
-- Result: 10

CASE WHEN 語句

-- Emit 1 for each '男' (male) or '女' (female) row, then sum per age
-- to count males and females in each age group
select
    age,
    sum(case when gender = '男' then 1 else 0 end) as male_count,
    sum(case when gender = '女' then 1 else 0 end) as female_count
from student
group by age;

IF 語句

-- Equivalent to the preceding case-when query, written with if()
select
    age,
    sum(if(gender = '男', 1, 0)) as male_count,
    sum(if(gender = '女', 1, 0)) as female_count
from student
group by age;

行轉列

Concat

concat(string1/col, string2/col, …)

輸入任意個字符串(或字段,可以爲int類型等),返回拼接後的結果

-- Concatenate id, name and age with '-' between them
select concat(id, '-', name, '-', age)
from student;

Concat_ws

concat_ws(separator, str1, str2, …)

特殊形式的concat()，參數只能爲字符串，第一個參數爲後面參數的分隔符

-- Join name and gender using '-' as the separator
select concat_ws('-', name, gender)
from student;

Collect_set

collect_set(col)

將某字段進行去重處理，返回array類型；該函數只接受基本數據類型

-- Collect the de-duplicated age values into an array
select collect_set(age)
from student;

列轉行

Explode

explode(col)

將一列中複雜的array或map結構拆分成多行

-- Split the array produced by collect_set back into one row per element
select explode(ages)
from (
    select collect_set(age) as ages
    from student
) as n1;

Lateral View

LATERAL VIEW udtf(expression) tableAlias AS columnAlias

配合split, explode等UDTF一起使用，它能夠將一列數據拆成多行數據，並且對拆分後的結果進行聚合

-- Assume a movies table with columns movie (string) and category (array<string>):
-- movie	category
--《疑犯追蹤》	懸疑,動作,科幻,劇情
--《海豹突擊隊》	動作,劇情,罪案
--《戰狼2》	戰爭,動作,災難
select
    movie,
    cate
from movies
lateral view explode(category) tmp_table as cate;
-- Result:
--《疑犯追蹤》	懸疑
--《疑犯追蹤》	動作
--《疑犯追蹤》	科幻
--《疑犯追蹤》	劇情
--《海豹突擊隊》	動作
-- ...

窗口函數

OVER()：指定分析函數工作的數據窗口大小，這個數據窗口大小可能會隨着行的變化而變化。

注意：該函數會對結果數據產生影響(比如在over(order by id)中排序後,結果也會被排序)

CURRENT ROW：當前行；
n PRECEDING：往前 n 行數據；
n FOLLOWING：往後 n 行數據；
UNBOUNDED：起點，UNBOUNDED PRECEDING 表示從前面的起點， UNBOUNDED FOLLOWING 表示到後面的終點；
LAG(col,n)：往前第 n 行數據；
LEAD(col,n)：往後第 n 行數據；
NTILE(n)：把有序分區中的行分發到指定數量的組中，各個組有編號，編號從 1 開始，
對於每一行，NTILE 返回此行所屬的組的編號。注意：n 必須爲 int 類型。

-- Fixed syntax for the frame bounds
-- Count gender over the rows 2 to 4 positions after the current row
select
    *,
    count(gender) over (rows between 2 following and 4 following)
from student;

假設有如下business表

name orderdate cost
------------------
jack,2017-01-01,10
tony,2017-01-02,15
jack,2017-02-03,23
tony,2017-01-04,29
jack,2017-01-05,46
jack,2017-04-06,42
tony,2017-01-07,50
jack,2017-01-08,55
mart,2017-04-08,62
mart,2017-04-09,68
neil,2017-05-10,12
mart,2017-04-11,75
neil,2017-06-12,80
mart,2017-04-13,94
------------------
需求 
（1）查詢在 2017 年 4 月份購買過的顧客及總人數 
（2）查詢顧客的購買明細及月購買總額 
（3）上述的場景,要將 cost 按照日期進行累加 
（4）上述的場景，分別累加每個用戶每個月的開銷
（5）查詢顧客上次的購買時間 
（6）查詢前20%時間的訂單信息

查詢在 2017 年 4 月份購買過的顧客及總人數

-- group by reduces the April 2017 rows to one row per customer;
-- count(*) over () then counts those rows, i.e. the total number of customers
select
    name,
    count(*) over () as all_person
from business
where date_format(orderdate, 'yyyy-MM') = '2017-04'
group by name;

查詢顧客的購買明細及該用戶月購買總額

-- Each purchase row also carries that customer's total for the row's month
select
    name,
    orderdate,
    date_format(orderdate, 'yyyy-MM') as this_month,
    cost,
    sum(cost) over (
        partition by name, date_format(orderdate, 'yyyy-MM')
    ) as this_user_this_month_sum
from business;

上述的場景，要將 cost 按照日期進行累加

-- Running total of cost per customer, accumulated in orderdate order
select
    name,
    orderdate,
    cost,
    sum(cost) over (partition by name order by orderdate)
from business;

上述的場景，分別累加每個用戶每個月的開銷

-- Running total of cost per customer within each calendar month.
-- The window is keyed on the 'yyyy-MM' month (not the bare month number) so the
-- same month of different years is never merged; rows accumulate in orderdate order.
select
name,
orderdate,
cost,
sum(cost) over(distribute by name, date_format(orderdate,'yyyy-MM') sort by orderdate)
from business;

查詢顧客上次的購買時間

-- lag's third argument is the value used when no earlier row is found;
-- without it the result is NULL
select
    name,
    orderdate,
    cost,
    lag(orderdate, 1, '0000-00-00') over (partition by name order by orderdate)
from business;

查詢前20%時間的訂單信息

-- ntile(5) splits the rows, ordered by orderdate, into 5 groups;
-- group 1 holds the earliest 20% of the orders
with tmp_table as (
    select
        name,
        orderdate,
        cost,
        ntile(5) over (order by orderdate) as sorted
    from business
)
select *
from tmp_table
where sorted = 1;

Rank 排序

該函數配合OVER()使用

RANK() 排序相同時會重複，總數不會變
DENSE_RANK() 排序相同時會重複，總數會減少
ROW_NUMBER() 會根據順序計算

假設有如下score表

name  subject  score 
--------------------
小明	語文	87
小明	數學	95
小明	英語	68
小綠	語文	94
小綠	數學	56
小綠	英語	84
小紅	語文	64
小紅	數學	86
小紅	英語	84
小藍	語文	65
小藍	數學	85
小藍	英語	78
---------------------

-- Requirement: rank the scores within each subject
select
    *,
    rank() over (partition by subject order by score desc),
    dense_rank() over (partition by subject order by score desc),
    row_number() over (partition by subject order by score desc)
from score;

Regexp_replace 正則替換

regexp_replace(string A, string B, replace)

將字符串A中的符合JAVA正則表達式B的部分替換爲replace。

注意，在有些情況下要使用轉義字符

-- Replace every "/" in the string with "-"
select regexp_replace('2020/03/21', '/', '-');
-- Result: 2020-03-21

數據排序

Order By 全局排序

整張表的排序，只有一個Reducer

-- Sort by id ascending (ascending is the default, so `asc` may be omitted)
select *
from student
order by id asc;
-- Sort by id descending
select *
from student
order by id desc;

Sort By 內部排序

對每個Reducer進行排序，不影響全局結果集

直接使用會將結果平均分配給每個文件(避免數據傾斜)

一般配合Distribute By使用

-- Set the number of reducers first
set mapreduce.job.reduces=3;
-- Viewing the result directly shows no visible difference
select *
from student
sort by id;
-- Export the sorted result to files
insert overwrite local directory '/opt/datas/sort-out'
select *
from student
sort by id;

Distribute By 分區排序

類似MapReduce中的Partition分區，一般配合Sort By排序使用

需要分配多個reduce才能看到效果

注意：該語句需要寫在 Sort By 語句之前！

-- Set the number of reducers first
set mapreduce.job.reduces=3;
-- Distribute rows by id, then sort by age ascending
insert overwrite local directory '/opt/datas/dis-out'
select *
from student
distribute by id
sort by age;

Cluster By 排序

當 Distribute By 和 Sort By 字段相同時，可以使用 Cluster By 方式

該排序只能是升序排序

-- The two statements below are equivalent
select *
from student
cluster by grade;

select *
from student
distribute by grade
sort by grade;

分桶和抽樣查詢

分區針對的是數據的存儲路徑，分桶針對的是數據文件

創建分桶表

-- Create a bucketed table
create table studentbucket (
    id int,
    name string,
    age int
)
clustered by (id) into 4 buckets
row format delimited fields terminated by '\t';
-- Inspect the table structure to see the number of buckets
describe formatted studentbucket;

在導入數據之前，要先設置一些屬性

-- Enable bucketing
-- NOTE(review): this property is reportedly removed in Hive 2.x, where bucketing is always enforced — confirm for the target version
set hive.enforce.bucketing=true;
-- Set the reducer count to -1 so it is chosen automatically from the number of buckets
set mapreduce.job.reduces=-1;

插入數據

-- Bucketing requires the data to be inserted through a MapReduce job.
-- Note: `load` does not run a MapReduce job,
-- so the rows are inserted here by selecting them from another table.
insert into table studentbucket select * from student;

分桶表抽樣查詢

抽樣語法：TABLESAMPLE(BUCKET x OUT OF y)

注意：x的值必須小於等於y的值！

含義：x表示從哪個bucket開始抽取，

y表示總共抽取 (bucket數量 / y) 個bucket的數據，每隔一個y取下一個bucket

-- Sampling queries
-- Starts at bucket 1 and reads one bucket's worth of data (4/4 = 1)
select *
from studentbucket tablesample (bucket 1 out of 4 on id);
-- Starts at bucket 1 and reads 2 buckets (bucket x = 1 and bucket x + y = 3)
select *
from studentbucket tablesample (bucket 1 out of 2 on id);

數據導出

Insert 導出

-- Export the query result to the local file system
insert overwrite local directory '/opt/datas'
select *
from student;
-- Export the query result to the local file system, fields separated by '\t'
insert overwrite local directory '/opt/datas'
row format delimited
fields terminated by '\t'
select *
from student;
-- Export the query result to HDFS
insert overwrite directory '/opt/datas'
select *
from student;

Hadoop 命令導出

# Copy the files from HDFS straight to the local file system
hadoop fs -get /user/hive/warehouse/student /opt/datas

Hive Shell 命令導出

# Use the Linux redirection operator to write the query result to a file
bin/hive -e "select * from student" > /opt/datas/student.txt;

Export 導出

-- Export to HDFS with `export`; the metadata is saved as well
export table student to '/export/student';

數據刪除

Truncate 刪除

清空表中數據，只能刪除內部表，不能刪除外部表中的數據

-- Empty the table's data with truncate
truncate table student;

函數

系統內置函數

-- List the built-in functions
show functions;
-- Show how a built-in function is used (split as an example)
describe function split;
-- Show detailed information about a built-in function (split as an example)
describe function extended split;

自定義函數

UDF

User-Defined-Function

一進一出

如：split，datediff

繼承 org.apache.hadoop.hive.ql.exec.UDF

實現 evaluate 方法

UDAF

User-Defined Aggregation Function

聚集函數，多進一出

類似：count/max/min

UDTF

User-Defined Table-Generating Functions

一進多出

如：lateral view explode()

繼承 org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;

實現三個方法 initialize，process，close

編程步驟

繼承 org.apache.hadoop.hive.ql.exec.UDF
需要實現 evaluate 函數；evaluate 函數支持重載；

在 hive 的命令行窗口創建函數

添加 jar包
```
add jar linux_jar_path
```

創建 function

create [temporary] function [dbname.]function_name AS class_name;

在 hive 的命令行窗口刪除函數

Drop [temporary] function [if exists] [dbname.]function_name;

注意事項

UDF 必須要有返回類型，可以返回 null，但是返回類型不能爲 void；

Maven依賴

<dependencies>
  <!-- hive-exec artifact page: https://mvnrepository.com/artifact/org.apache.hive/hive-exec -->
  <dependency>
    <groupId>org.apache.hive</groupId>
    <artifactId>hive-exec</artifactId>
    <version>1.2.1</version>
  </dependency>
</dependencies>