具體練習數據可從Hive sql數據獲取
#0.創建數據庫
-- Create the practice database and make it the current database for the session.
create database traindb;
-- Hive's statement is `use <database_name>` (no `database` keyword), and the
-- name has to match the database created on the previous line.
use traindb;
#1.建表 student表
-- Student roster, partitioned by class number (sclass).
-- Backing files are plain text: comma-separated fields, one record per line.
create table student (
    sid    varchar(10),
    sname  varchar(10),
    sbirth date,
    ssex   varchar(10)
)
partitioned by (sclass int)
row format delimited
    fields terminated by ','
    lines terminated by '\n';
#1.1導入數據
#法1. load data local(默認爲hdfs) inpath '數據地址' into(overwrite into) table 表名;
#into是追加到表尾 ,overwrite into是覆蓋
#法2. 先用hadoop將數據傳到hdfs表的存儲地址下,然後修復表;
hdfs dfs -put '本地地址' 'hdfs:///'
msck repair table 表名;
-- Append each local class file to the matching sclass partition of student
-- (`into` appends; it does not overwrite existing partition data).
load data local inpath '/home/sparknode/桌面/student1.txt'
    into table student partition (sclass = 1);
load data local inpath '/home/sparknode/桌面/student2.txt'
    into table student partition (sclass = 2);
load data local inpath '/home/sparknode/桌面/student3.txt'
    into table student partition (sclass = 3);
# 表2 course表
-- Course table: course id, course name and the id of the teacher giving it.
-- Backing files are plain text: comma-separated fields, one record per line.
create table course (
    cid   varchar(10),
    cname varchar(10),
    tid   varchar(10)
)
row format delimited
    fields terminated by ','
    lines terminated by '\n';

-- Append the local course file to the table.
load data local inpath '/home/sparknode/桌面/course.txt'
    into table course;
# 表3 teacher表
-- Teacher table: teacher id and name.
-- Backing files are plain text: comma-separated fields, one record per line.
create table teacher (
    tid   varchar(10),
    tname varchar(10)
)
row format delimited
    fields terminated by ','
    lines terminated by '\n';
-- Shell step: copy the data file straight into the table's HDFS directory
-- with hadoop instead of using `load data`.
hdfs dfs -put '/home/sparknode/桌面/teacher.txt' 'hdfs:///home/sparknode/hive/traindb.db/teacher/';
-- Back in Hive: run a repair on the table after the manual copy.
-- NOTE(review): teacher is not partitioned, so the copied file is presumably
-- readable without this step; MSCK REPAIR TABLE is documented as registering
-- missing partitions (relevant for a partitioned table such as student) - confirm.
msck repair table teacher;
# 表4 score表
-- Score table: one row per (student, course) with an integer score.
-- Backing files are plain text: comma-separated fields, one record per line.
create table score (
    sid   varchar(10),
    cid   varchar(10),
    score int
)
row format delimited
    fields terminated by ','
    lines terminated by '\n';

-- `overwrite` replaces whatever the table already contains with this file.
load data local inpath '/home/sparknode/桌面/score.txt'
    overwrite into table score;
hive 查詢 基礎篇,該部分與mysql一致,可參考sql面試50題
#***********練習*****基礎版*******************
#一、模糊查詢
#1.查詢 李姓 學生信息
-- Prefix match: students whose name starts with 李.
select
    sid,
    sname,
    sbirth,
    ssex,
    sclass
from student
where sname like '李%';
#2.查詢姓名中帶有 四 的學生信息
-- Substring match: students whose name contains 四 anywhere.
-- Both sides need a `%` wildcard; '%四' alone would only match names that
-- END with 四 and would miss names where 四 is followed by more characters.
select
    sid,
    sname,
    sbirth,
    ssex,
    sclass
from student
where sname like '%四%';
#3.查詢姓名第二位爲 四 的學生信息
-- Positional match: `_` consumes exactly one character, so this finds
-- students whose second character is 四.
select
    sid,
    sname,
    sbirth,
    ssex,
    sclass
from student
where sname like '_四%';
#二、彙總分析
#1.查詢課程編號爲“2”的總成績
-- Total of all scores recorded for course '2'.
select
    sum(score) as sumScore
from score
where cid = '2';
#2.查詢選了課程的學生人數
-- Number of distinct students that have at least one score row.
select
    count(distinct sid)
from score;
#3.查詢課程編號爲“2”的最高分
-- Highest score recorded for course '2'.
select
    max(score)
from score
where cid = '2';
#三、分組
#1.查詢各科成績的最高及最低分
-- Highest and lowest score for every course.
select
    cid,
    max(score) as maxScore,
    min(score) as minScore
from score
group by cid;
#2.查詢每門課程被選修的學生數
-- Number of score rows (i.e. enrolled students) per course.
select
    cid,
    count(*) as num
from score
group by cid;
#3.查詢男、女生人數
-- Head count per value of ssex.
select
    ssex,
    count(*)
from student
group by ssex;
#四、分組+條件過濾
#1.查詢平均成績大於60分學生的學號和平均成績
-- Students whose average score is above 60.
-- The aggregate is repeated in HAVING instead of referring to the select alias.
select
    sid,
    avg(score) as avgScore
from score
group by sid
having avg(score) > 60;
#2.查詢選修三門課的學生學號
-- Students that have exactly three score rows.
select
    sid
from score
group by sid
having count(1) = 3;
#3.查詢同名學生名單,並統計同名人數
-- Names shared by more than one student, with how many students share each.
-- The aggregate is repeated in HAVING instead of referring to the select alias.
select
    sname,
    count(*) as num
from student
group by sname
having count(*) > 1;
#五、複雜查詢
#1.查詢存在成績小於60分的學生的學號、姓名
-- Students with at least one score below 60, expressed as an IN subquery.
-- IN only tests membership, so the subquery needs no explicit de-duplication.
select
    sid,
    sname
from student
where sid in (
    select sid
    from score
    where score < 60
);
-- Same question answered with a join: the de-duplicated list of failing
-- student ids drives the join and student details are looked up for each id.
-- (A left join from the id list is the mirror image of the right join form.)
select
    s.sid,
    s.sname
from (
    select distinct sid
    from score
    where score < 60
) as tmp
left join student as s
    on s.sid = tmp.sid;
#2.查詢所有成績均小於60的學生的學號,姓名(所有課全不及格的學生)
-- Students whose every score is below 60 (failed all of their courses).
-- The inner query counts passing rows per student and keeps students with none.
-- A pass is `score >= 60`: with a strict `> 60`, a score of exactly 60 would
-- not count as a pass and such a student would wrongly be reported here.
select
    s.sid,
    s.sname
from (
    select sid
    from score
    group by sid
    having sum(case when score >= 60 then 1 else 0 end) = 0
) tmp
left join student s
    on s.sid = tmp.sid;
#3.查詢沒有學全所有課的學生的學號、姓名
-- Students who have NOT taken every course.
--   * The required number of courses is read from the course table instead of
--     a hard-coded 5, so the query stays correct when courses are added.
--   * The query is driven from student, so students with no score rows at all
--     (taken = 0 after coalesce) are reported as well.
--   * count(distinct cid) keeps duplicate score rows for one course from
--     inflating a student's course count.
select
    s.sid,
    s.sname
from student s
left join (
    select
        sid,
        count(distinct cid) as taken
    from score
    group by sid
) sc
    on s.sid = sc.sid
cross join (
    select count(*) as total
    from course
) c
where coalesce(sc.taken, 0) < c.total;
分組 top-N問題 ****重點
#查詢各科成績前兩名的記錄(top-N問題)
#情景0.僅能用於沒有重複的情況
-- Top 2 scores per course via self-join; only valid when scores within a
-- course are unique. For each t1 row, the rows of t2 that survive the WHERE
-- filter are the scores in the same course that are >= t1.score (itself
-- included), so the group size is the row's position from the top.
select
    t1.cid,
    t1.score
from score t1
left join score t2
    on t1.cid = t2.cid
where t1.score <= t2.score
group by
    t1.cid,
    t1.score
having count(t1.score) <= 2;
#情景1.有並列 相同分數算並列
-- Top 2 score levels per course via self-join, treating equal scores as tied.
-- t2 holds the strictly higher scores of the same course; the top score joins
-- to NULL because the comparison sits in ON rather than WHERE.
-- count(distinct ...) is required: grouping by (cid, score) merges tied t1
-- rows, so a plain count(t2.score) is multiplied by the number of tied rows
-- (and by duplicate higher scores) and would wrongly push tied groups past
-- the limit. Counting distinct higher scores gives the dense rank minus one.
select
    t1.cid,
    t1.score
from score t1
left join score t2
    on t1.cid = t2.cid
    and t1.score < t2.score
group by
    t1.cid,
    t1.score
having count(distinct t2.score) < 2;
#用on t1.cid = t2.cid 然後通過where過濾 where t1.score < t2.score,會將最高分直接過濾沒
#而on t1.cid = t2.cid and t1.score < t2.score ,最高分則會連接到Null,
#或使用dense_rank()、rank()、row_number() + 窗口函數 over()
#1. dense_rank() 相當於並列
-- Top 2 per course with dense_rank(): ties share a rank and the next rank
-- follows without a gap.
with t as (
    select
        cid,
        score,
        dense_rank() over (partition by cid order by score desc) as rank
    from score
)
select
    cid,
    score,
    t.rank
from t
where t.rank <= 2;
#結果
cid score t.rank
1 100 1
1 99 2
2 100 1
2 99 2
3 100 1
3 100 1
3 100 1
3 100 1
3 98 2
4 97 1
4 90 2
5 100 1
5 96 2
#2.rank() 也並列,但是並列的人佔了不止一個名額
-- Top 2 per course with rank(): ties share a rank, and the ranks after a tie
-- are skipped by the number of tied rows.
with t as (
    select
        cid,
        score,
        rank() over (partition by cid order by score desc) as rank
    from score
)
select
    cid,
    score,
    t.rank
from t
where t.rank <= 2;
#結果
cid score t.rank
1 100 1
1 99 2
2 100 1
2 99 2
3 100 1
3 100 1
3 100 1
3 100 1
4 97 1
4 90 2
5 100 1
5 96 2
#3.row_number() 無並列
-- Top 2 per course with row_number(): every row gets its own sequence number,
-- so exactly two rows per course are kept even when scores are tied.
with t as (
    select
        cid,
        score,
        row_number() over (partition by cid order by score desc) as rank
    from score
)
select
    cid,
    score,
    t.rank
from t
where t.rank <= 2;
#結果
cid score t.rank
1 100 1
1 99 2
2 100 1
2 99 2
3 100 1
3 100 2
4 97 1
4 90 2
5 100 1
5 96 2
行列互換問題
#1.某個字段的多行合併爲該字段一行
#統計每個學生選擇的課程,並存爲一張表(每個學生佔用一行)
-- One row per student; cname holds that student's course names joined into
-- a single delimited string.
create table elective_info (
    sid   varchar(10),
    sname varchar(10),
    cname varchar(100)
);
-- Rows to one row: resolve every score row to (sid, sname, cname), then per
-- student collect the de-duplicated course names and join them with "|".
insert into table elective_info
select
    sid,
    sname,
    concat_ws("|", collect_set(cname)) as course
from (
    select
        s.sid,
        st.sname,
        c.cname
    from score s
    left join course c
        on s.cid = c.cid
    left join student st
        on s.sid = st.sid
) t
group by
    sid,
    sname;
concat(字段1,...,字段n) 可拼接多個字段(必須爲string類型),也可拼接任何string類型
concat_ws(分隔符,字段1,...,字段n)
concat和concat_ws的區別:如用'|'爲分隔符,拼接字段'a'、'b'、'c',操作爲
concat(a,'|',b,'|',c)
concat_ws('|',a,b,c)
且concat_ws可拼接array中的字段
collect_set(字段),可將字段去重,然後將結果以array形式返回
結合collect_set+concat_ws+group by 就可完成按key將同key中其它字段合併爲一行的操作
elective_info.sid elective_info.sname elective_info.cname
1 趙雷 語文|數學|英語|政治
10 李四 語文|數學|英語|政治
11 李四 語文
12 趙六 語文
13 孫七 數學
14 鄭雙 語文|數學|英語|政治|歷史
15 王一 語文|數學|英語|政治|歷史
16 馮二 語文|數學|英語|政治|歷史
17 陳三 語文|數學|英語|政治|歷史
2 錢電 語文|數學|英語|歷史
3 孫風 語文|數學|英語|政治|歷史
4 李雲 語文|數學|英語|政治|歷史
5 周梅 語文|數學|英語|歷史
6 吳蘭 語文|數學|歷史
7 鄭竹 語文|數學|英語|政治
8 梅梅 數學|英語|政治
9 張三 語文|數學|政治
#2.某個字段拆分爲多行
#將上述表格還原爲 類似
sid sname t.cname
1 趙雷 語文
1 趙雷 數學
1 趙雷 英語
1 趙雷 政治
10 李四 語文
10 李四 數學
10 李四 英語
10 李四 政治
11 李四 語文
-- One row to many rows: split the "|"-joined course string back into an
-- array (the pipe is escaped because split() takes a regular expression),
-- then emit one output row per array element next to the student columns.
select
    sid,
    sname,
    t.cname
from elective_info
lateral view explode(split(cname, "\\|")) t as cname;
explode(col) 其中col爲array類型,可將每一行的array拆分爲多行,array中的每個元素各佔一行返回
lateral view 爲側視圖,用於將explode等表生成函數(UDTF)產生的多行結果與原表該行的其它字段關聯起來
待續...