hive特有數據類型
array數組
創建表
-- Movie table with an array-typed actor column.
-- In the data file, fields are separated by ',' and the elements of the
-- actor array by ':'.
CREATE TABLE t_movie (
    movie           STRING,
    actor           ARRAY<STRING>,
    first_show_date STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    COLLECTION ITEMS TERMINATED BY ':';
判斷取值
-- Movies whose actor array contains '吳京'.
SELECT
    movie,
    actor,
    first_show_date
FROM t_movie
WHERE array_contains(actor, '吳京');
獲取array數據類型長度
-- Every movie together with the number of elements in its actor array.
-- The statement is terminated explicitly so that it does not run into
-- whatever text follows it in the script.
SELECT
    movie,
    actor,
    first_show_date,
    size(actor)
FROM t_movie;
map類型
創建表
-- Table with a map-typed family_members column.
-- Delimiters used in the data file:
--   ','  between columns
--   '#'  between the entries of the map
--   ':'  between a key and its value inside one entry
CREATE TABLE t_map (
    id             INT,
    name           STRING,
    family_members MAP<STRING, STRING>,
    age            INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    COLLECTION ITEMS TERMINATED BY '#'
    MAP KEYS TERMINATED BY ':';
判斷值
-- Look up a single key of the map per row.
SELECT
    id,
    name,
    family_members['father'] AS father,
    age
FROM t_map;

-- Look up two keys of the map per row.
SELECT
    id,
    name,
    family_members['father'] AS father,
    family_members['sister'] AS sister,
    age
FROM t_map;
獲取map的key
-- List the keys of each row's family_members map.
-- The statement is terminated explicitly so that it does not run into
-- whatever text follows it in the script.
SELECT
    id,
    name,
    map_keys(family_members),
    age
FROM t_map;
函數嵌套查詢
-- Rows whose family_members map has a 'brother' key:
-- map_keys() yields the keys as an array, array_contains() tests membership.
SELECT
    id,
    name,
    family_members,
    age
FROM t_map
WHERE array_contains(map_keys(family_members), 'brother');
struct類型
創建表
-- User table with a struct-typed info column (age, sex, addr).
-- In the data file, columns are separated by ',' and the members of the
-- struct by ':'.
CREATE TABLE t_user (
    id   INT,
    name STRING,
    info STRUCT<age:INT, sex:STRING, addr:STRING>
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    COLLECTION ITEMS TERMINATED BY ':';
查詢字段信息
-- Struct members are read with dot notation.
SELECT
    id,
    name,
    info.sex,
    info.addr
FROM t_user;
基本內置函數
類型轉換函數
-- String literal converted to INT.
SELECT CAST("123" AS INT);

-- String literal converted to DATE.
SELECT CAST("2017-01-01" AS DATE);
表生成函數
行轉列函數explode()
-- Expand the subjects collection of every row into one output row per element.
SELECT explode(subjects)
FROM t_stu_subject;
表生成函數 lateral view
-- LATERAL VIEW joins each source row with the rows produced by explode(),
-- so the exploded element (sub) can be selected and filtered next to id and name.
SELECT
    id,
    name,
    tmp.sub
FROM t_stu_subject
LATERAL VIEW explode(subjects) tmp AS sub
WHERE sub = '生物';
日新日活統計
-- Daily active users: for each day we need, per user, the ip, the account
-- name, and the url and time of that user's earliest visit.

-- Raw access log, partitioned by day (dt), comma-separated on disk.
CREATE TABLE q_logs (
    ip         STRING,
    name       STRING,
    login_time STRING,
    url        STRING
)
PARTITIONED BY (dt STRING)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',';

-- One local file per day, each loaded into its own partition.
LOAD DATA LOCAL INPATH '/root/hive_file/q_0915.txt'
    INTO TABLE q_logs PARTITION (dt = '20170915');
LOAD DATA LOCAL INPATH '/root/hive_file/q_0916.txt'
    INTO TABLE q_logs PARTITION (dt = '20170916');
LOAD DATA LOCAL INPATH '/root/hive_file/q_0917.txt'
    INTO TABLE q_logs PARTITION (dt = '20170917');
-- Daily active users.
-- Target table: one row per active user per day.
CREATE TABLE q_active (
    ip         STRING,
    name       STRING,
    login_time STRING,
    url        STRING
)
PARTITIONED BY (dt STRING);

-- Fill one day. Change both partition values below to the day being processed.
-- Rows are numbered per user by ascending login_time, and only row 1
-- (the earliest visit of that user on that day) is kept.
INSERT INTO TABLE q_active PARTITION (dt = '20170917')
SELECT
    ip,
    name,
    login_time,
    url
FROM (
    SELECT
        ip,
        name,
        login_time,
        url,
        row_number() OVER (PARTITION BY name ORDER BY login_time) AS rn
    FROM q_logs
    WHERE dt = '20170917'
) ranked
WHERE rn = 1;

SELECT * FROM q_active;
-- Daily new users.
-- q_logs_history_user holds every user name seen on any earlier day.
-- q_logs_new_user has the same layout as q_logs and holds, per day, the
-- active users that were not yet in the history table.
CREATE TABLE q_logs_history_user (name STRING);
CREATE TABLE q_logs_new_user LIKE q_logs;

-- Step 1: a user active on the day is new when the left join finds no
-- matching name in the history table (history_name IS NULL).
INSERT INTO TABLE q_logs_new_user PARTITION (dt = '20170917')
SELECT
    ip,
    name,
    login_time,
    url
FROM (
    SELECT
        a.ip,
        a.name,
        a.login_time,
        a.url,
        h.name AS history_name
    FROM q_active a
    LEFT JOIN q_logs_history_user h
        ON a.name = h.name
    WHERE a.dt = '20170917'
) tmp
WHERE history_name IS NULL;

-- Step 2: only after step 1, append the day's new users to the history table.
-- Running this first would make step 1 find every user in the history.
-- Written as INSERT INTO TABLE to match the other inserts in this script.
INSERT INTO TABLE q_logs_history_user
SELECT name
FROM q_logs_new_user
WHERE dt = '20170917';

SELECT * FROM q_logs_new_user;
SELECT * FROM q_logs_history_user;
條件控制函數
-- Label each user by age band using a searched CASE expression.
-- Branches are evaluated top to bottom, and anything matching neither WHEN
-- falls through to "old".
SELECT
    id,
    name,
    CASE
        WHEN info.age <= 18                  THEN "young"
        WHEN info.age > 18 AND info.age < 40 THEN "zhongnian"
        ELSE "old"
    END AS zhuangtai
FROM t_user;
-- Two-way branch with the if() function: "working" when age is above 25,
-- otherwise "worked".
SELECT
    id,
    `if`(info.age > 25, "working", "worked") AS is_worked
FROM t_user;
-- Label each movie depending on whether its actor array contains "吳剛".
-- The statement is terminated explicitly so that it does not run into
-- whatever text follows it in the script.
SELECT
    movie,
    `if`(array_contains(actor, "吳剛"), "好電影", "爛電影") AS move_like
FROM t_movie;
窗口分析函數row_number() over ()
-- Top-N per group: number the rows of each sex by ascending age and keep the
-- first two, i.e. the two youngest rows per sex.
SELECT *
FROM (
    SELECT
        id,
        age,
        name,
        sex,
        row_number() OVER (PARTITION BY sex ORDER BY age) AS row_number
    FROM top_n
) ranked
WHERE ranked.row_number <= 2;
-- Exercise: running (cumulative) total per user, done with a self-join.
-- monthly = total counts per user and month.
-- Each month (cur) is joined to every month of the same user that is not
-- later than it (prev). Summing prev gives the running total, and MAX over cur
-- returns that month's own total (it is constant within the group).
WITH monthly AS (
    SELECT
        username,
        month,
        SUM(counts) AS t_amount
    FROM t_access
    GROUP BY username, month
)
SELECT
    cur.username,
    cur.month,
    MAX(cur.t_amount)  AS amount,
    SUM(prev.t_amount) AS sum_amount
FROM monthly prev
INNER JOIN monthly cur
    ON prev.username = cur.username
WHERE prev.month <= cur.month
GROUP BY cur.username, cur.month;
窗口分析函數—sum over
-- Running total per user with a window function: for each row, sum t_amount
-- from the user's first month up to and including the current row.
SELECT
    username,
    month,
    t_amount,
    SUM(t_amount) OVER (
        PARTITION BY username
        ORDER BY month
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS acount
FROM t_access_amount;
json自定義函數
-- Single-column table, filled from the local file rating.json.
CREATE TABLE t_ratingjson (json STRING);
LOAD DATA LOCAL INPATH '/root/hive_file/rating.json' INTO TABLE t_ratingjson;

-- Add the jar to the session and register class MyJsonParser as the
-- temporary function myjson.
ADD JAR /root/hive-1.0-SNAPSHOT.jar;
CREATE TEMPORARY FUNCTION myjson AS 'MyJsonParser';

-- Call myjson with positions 1 to 4 and name the results
-- movie, rate, ts and uid. Only the first 10 rows are shown.
SELECT
    myjson(json, 1) AS movie,
    myjson(json, 2) AS rate,
    myjson(json, 3) AS ts,
    myjson(json, 4) AS uid
FROM t_ratingjson
LIMIT 10;
-
寫java類,實現想要的函數的功能
-
java程序打包成jar包,上傳到hive服務器
-
import org.apache.hadoop.hive.ql.exec.UDF; /** * @Author: lifx * @Description: * @Date: Created in 11:15 2019/9/18 * @Modified By: */ public class MyJsonParser extends UDF { //重載父類中的方法evaluate() public String evaluate(String json,int index){ // {"movie":"1096","rate":"4","timeStamp":"956715648","uid":"6040"} // {"movie":"1097","rate":"4","timeStamp":"956715569","uid":"6040"} String[] split = json.split("\""); return split[4*index - 1]; } }
-
hive命令行中將jar包添加到classpath
-
-- Add the packaged UDF jar to the classpath of the current Hive session.
add jar /root/hive-1.0-SNAPSHOT.jar;
-
hive命令行中創建一個函數叫做xxx 關聯你這個類
-- Register the class MyJsonParser as a temporary function named myjson.
create temporary function myjson as 'MyJsonParser';
hive json解析函數
-- Built-in alternative to a custom UDF: json_tuple() extracts several keys of
-- a JSON string in one call, one output column per requested key.
-- The first alias is spelled movie (it was misspelled as moive), and the
-- statement is terminated explicitly.
SELECT
    json_tuple(json, "movie", "rate", "timeStamp", "uid") AS (movie, rate, ts, uid)
FROM t_ratingjson;
hive問題處理
-
-- Hive is currently running in strict mode. In this mode:
--   - Cartesian-product joins between tables are not allowed.
--   - An ORDER BY statement must carry a LIMIT: ORDER BY runs in a single
--     reducer and easily becomes a performance bottleneck.
--   - A query on a partitioned table must filter on a partition column in its
--     WHERE clause.
-- Workaround: switch the session to nonstrict mode.
set hive.mapred.mode=nonstrict;
hive練習題
-- Student table. Example row: 01 趙雷 1990-01-01 男
CREATE TABLE a_student (
    stu_id   INT,
    stu_name STRING,
    birthday STRING,
    gender   STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ';
LOAD DATA LOCAL INPATH '/root/hive_file/a_student.data' INTO TABLE a_student;
-- Sample rows (presumably the contents of a_student.data):
--   01 趙雷 1990-01-01 男
--   02 錢電 1990-12-21 男
--   03 孫風 1990-05-20 男
--   04 李雲 1990-08-06 男
--   05 周梅 1991-12-01 女
--   06 吳蘭 1992-03-01 女
--   07 鄭竹 1989-07-01 女
--   08 王菊 1990-01-20 女
---------------------

-- Course table. Example row: 01 語文 02
CREATE TABLE a_class (
    class_id   INT,
    class      STRING,
    teacher_id INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY " ";
LOAD DATA LOCAL INPATH '/root/hive_file/a_class.data' INTO TABLE a_class;
-- Sample rows (presumably the contents of a_class.data):
--   01 語文 02
--   02 數學 01
--   03 英語 03
-------------------

-- Teacher table. Example row: 01 張三
CREATE TABLE a_teacher (
    teacher_id   INT,
    teacher_name STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY " ";
LOAD DATA LOCAL INPATH '/root/hive_file/a_teacher.data' INTO TABLE a_teacher;
-- Sample rows (presumably the contents of a_teacher.data):
--   01 張三
--   02 李四
--   03 王五
-----------------

-- Score table. Example row: 01 01 80
CREATE TABLE a_score (
    stu_id   INT,
    class_id INT,
    score    INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY " ";
LOAD DATA LOCAL INPATH '/root/hive_file/a_score.data' INTO TABLE a_score;
-- Sample rows (presumably the contents of a_score.data):
--   01 01 80
--   01 02 90
--   01 03 99
--   02 01 70
--   02 02 60
--   02 03 80
--   03 01 80
--   03 02 80
--   03 03 80
--   04 01 50
--   04 02 30
--   04 03 20
--   05 01 76
--   05 02 87
--   06 01 31
--   06 03 34
--   07 02 89
--   07 03 98
-------------------
-- Quick look at the four exercise tables.
SELECT * FROM a_student;
SELECT * FROM a_class;
SELECT * FROM a_teacher;
SELECT * FROM a_score;
-- Students whose score in course "01" is higher than their score in course
-- "02", with both scores. Students who have a "01" score but no "02" score are
-- returned as well (sc2.score is NULL for them).
--
-- The inner join on sc1 already restricts the result to students that have a
-- score row, so no further join against a_score is needed to drop students
-- without any score. Such a join would only multiply the rows.
-- GROUP BY over all selected columns removes duplicate result rows.
SELECT
    stu.stu_id,
    stu.stu_name,
    stu.birthday,
    stu.gender,
    sc1.score,
    sc2.score
FROM a_student stu
INNER JOIN a_score sc1
    ON sc1.class_id = '01'
    AND stu.stu_id = sc1.stu_id
LEFT JOIN a_score sc2
    ON sc2.class_id = '02'
    AND stu.stu_id = sc2.stu_id
WHERE sc1.score > sc2.score
    OR sc2.score IS NULL
GROUP BY
    stu.stu_id,
    stu.stu_name,
    stu.birthday,
    stu.gender,
    sc1.score,
    sc2.score;
-- Students whose score in course "01" is lower than their score in course
-- "02", with both scores. Students who have some score but no "01" score are
-- returned as well (sc1.score is NULL for them).
--
-- sc1 is attached with a LEFT JOIN: score rows that match no student would
-- carry a NULL stu.stu_id and could never pass the inner join on sc3, so an
-- outer join on the a_score side adds nothing to the result.
-- The inner join on sc3 drops students without any score row. It repeats a
-- student once per score row, and the GROUP BY over all selected columns
-- collapses those repeats again.
SELECT
    stu.stu_id,
    stu.stu_name,
    stu.birthday,
    stu.gender,
    sc1.score,
    sc2.score
FROM a_student stu
LEFT JOIN a_score sc1
    ON sc1.class_id = '01'
    AND stu.stu_id = sc1.stu_id
LEFT JOIN a_score sc2
    ON sc2.class_id = '02'
    AND stu.stu_id = sc2.stu_id
INNER JOIN a_score sc3
    ON stu.stu_id = sc3.stu_id
WHERE sc1.score < sc2.score
    OR sc1.score IS NULL
GROUP BY
    stu.stu_id,
    stu.stu_name,
    stu.birthday,
    stu.gender,
    sc1.score,
    sc2.score;
-- Student id, name and average score (rounded to 2 decimals) of the students
-- whose rounded average is 60 or more. The inner join leaves out students
-- without any score row.
SELECT
    stu.stu_id,
    stu.stu_name,
    round(avg(b.score), 2) AS avg_score
FROM a_student stu
INNER JOIN a_score b
    ON stu.stu_id = b.stu_id
GROUP BY
    stu.stu_id,
    stu.stu_name
HAVING round(avg(b.score), 2) >= 60;
-- Student id, name and average score (rounded to 2 decimals) of the students
-- whose average is below 60, ordered by that average.
--
-- The LEFT JOIN keeps students without any score row. avg() is NULL for them,
-- and COALESCE turns that into 0 so they are reported with an average of 0.
-- One aggregate covers both cases, so a student is never listed twice (once
-- with the real average and once with 0) when a_score contains a NULL score.
SELECT
    stu.stu_id,
    stu.stu_name,
    coalesce(round(avg(b.score), 2), cast(0 AS DOUBLE)) AS avg_score
FROM a_student stu
LEFT JOIN a_score b
    ON stu.stu_id = b.stu_id
GROUP BY
    stu.stu_id,
    stu.stu_name
HAVING coalesce(round(avg(b.score), 2), cast(0 AS DOUBLE)) < 60
ORDER BY avg_score;
-- For every student: id, name, number of courses taken and total score.
-- The LEFT JOIN keeps students without score rows. For them count() gives 0
-- and sum() gives NULL.
SELECT
    stu.stu_id,
    stu.stu_name,
    count(sc.class_id) AS count_class,
    sum(sc.score)      AS sum_score
FROM a_student stu
LEFT JOIN a_score sc
    ON stu.stu_id = sc.stu_id
GROUP BY
    stu.stu_id,
    stu.stu_name;
-- Number of teachers whose name starts with "李".
-- No GROUP BY here: grouping by teacher_name would return one count per
-- distinct name instead of the single total that is asked for.
SELECT
    count(1) AS count_teacher
FROM a_teacher
WHERE teacher_name LIKE '李%';
-- Students who have taken a course taught by "張三".
-- Innermost query: class ids of the courses whose teacher name contains 張三.
-- Middle query: ids of the students that have a score row for such a class.
SELECT *
FROM a_student
WHERE stu_id IN (
    SELECT stu_id
    FROM a_score
    WHERE class_id IN (
        SELECT b.class_id
        FROM a_teacher a
        INNER JOIN a_class b
            ON a.teacher_id = b.teacher_id
        WHERE a.teacher_name LIKE "%張三%"
    )
);
-- Students who have never taken a course taught by "張三".
--
-- The derived table c holds every score row that belongs to one of 張三's
-- courses. A student is reported when the LEFT JOIN finds no such row at all,
-- which stays correct when the teacher has more than one course and needs no
-- join without an equality condition (a cartesian product).
-- c.score is selected only to keep the established result columns. It is
-- always NULL for the rows that are returned.
SELECT
    stu.stu_id,
    stu.stu_name,
    stu.birthday,
    stu.gender,
    c.score
FROM a_student stu
LEFT JOIN (
    SELECT
        sc.stu_id,
        sc.score
    FROM a_teacher a
    INNER JOIN a_class b
        ON a.teacher_id = b.teacher_id
    INNER JOIN a_score sc
        ON sc.class_id = b.class_id
    WHERE a.teacher_name = '張三'
) c
    ON c.stu_id = stu.stu_id
WHERE c.stu_id IS NULL;
-- Students who have taken both course "01" and course "02":
-- one inner join against a_score per required course.
SELECT
    stu.stu_id,
    stu.stu_name,
    stu.gender,
    stu.birthday
FROM a_student stu
INNER JOIN a_score a
    ON a.class_id = '01'
    AND a.stu_id = stu.stu_id
INNER JOIN a_score b
    ON b.class_id = '02'
    AND b.stu_id = stu.stu_id;
-- Students who have taken course "01" but not course "02":
-- inner join for the required course, LEFT JOIN for the excluded one, then
-- keep the rows where the excluded course contributed no score.
SELECT
    stu.stu_id,
    stu.stu_name,
    stu.gender,
    stu.birthday
FROM a_student stu
INNER JOIN a_score a
    ON a.class_id = '01'
    AND a.stu_id = stu.stu_id
LEFT JOIN a_score b
    ON b.class_id = '02'
    AND b.stu_id = stu.stu_id
WHERE b.score IS NULL;