hive sql練習

具體練習數據可從Hive sql數據獲取

#0.創建數據庫
-- Create the practice database; `if not exists` keeps the script re-runnable.
create database if not exists traindb;

-- Switch the session to it. Hive's statement is `use <db_name>` (no `database`
-- keyword), and the name must match the database created above.
use traindb;

#1.建表 student表
-- student: comma-delimited text rows, one row per line, partitioned by sclass.
create table student (
    sid    varchar(10),
    sname  varchar(10),
    sbirth date,
    ssex   varchar(10)
)
partitioned by (sclass int)
row format delimited
    fields terminated by ','
    lines terminated by '\n';

#1.1導入數據
#法1. load data local(默認爲hdfs) inpath '數據地址' into(overwrite into) table 表名;
#into是追加到表尾 ，overwrite into是覆蓋
#法2. 先用hadoop將數據傳到hdfs上該表(或其分區)的存儲目錄下，然後修復表(分區表需要此步驟，把hdfs上新增的分區同步到元數據);
     hdfs dfs -put '本地地址' 'hdfs:///'
     msck repair table 表名;
-- Append each local class file to its own sclass partition of student.
load data local inpath '/home/sparknode/桌面/student1.txt'
    into table student
    partition (sclass = 1);

load data local inpath '/home/sparknode/桌面/student2.txt'
    into table student
    partition (sclass = 2);

load data local inpath '/home/sparknode/桌面/student3.txt'
    into table student
    partition (sclass = 3);

#  表2 course表
-- course: comma-delimited text rows; tid refers to the teacher of the course.
create table course (
    cid   varchar(10),
    cname varchar(10),
    tid   varchar(10)
)
row format delimited
    fields terminated by ','
    lines terminated by '\n';

-- Append the local file to the (unpartitioned) course table.
load data local inpath '/home/sparknode/桌面/course.txt'
    into table course;
  
#  表3 teacher表
-- teacher: comma-delimited text rows, unpartitioned.
create table teacher (
    tid   varchar(10),
    tname varchar(10)
)
row format delimited
    fields terminated by ','
    lines terminated by '\n';
-- Alternative to `load data`: copy the file directly into the table's HDFS directory.
-- This line is a shell command; run it from the OS shell rather than the Hive prompt.
hdfs dfs -put '/home/sparknode/桌面/teacher.txt' 'hdfs:///home/sparknode/hive/traindb.db/teacher/';
-- teacher is not partitioned, so Hive reads whatever files sit in the table directory and
-- no metastore repair is needed. `msck repair table` only registers partition directories
-- that are missing from the metastore, so it is left out here.

#  表4 score表
-- score: one comma-delimited row per (student, course) result.
create table score (
    sid   varchar(10),
    cid   varchar(10),
    score int
)
row format delimited
    fields terminated by ','
    lines terminated by '\n';

-- `overwrite` replaces any rows already in the table instead of appending.
load data local inpath '/home/sparknode/桌面/score.txt'
    overwrite into table score;

hive 查詢基礎篇，該部分與mysql一致，可參考sql面試50題

#***********練習*****基礎版*******************

#一、模糊查詢
#1.查詢 李姓 學生信息
-- Students whose name starts with 李 (prefix match).
select
    sid,
    sname,
    sbirth,
    ssex,
    sclass
from student
where sname like '李%';

#2.查詢姓名中帶有 四 的學生信息
-- Students whose name contains 四 anywhere.
-- A wildcard is needed on both sides: '%四' alone only matches names that END with 四.
select
    sid,
    sname,
    sbirth,
    ssex,
    sclass
from student
where sname like '%四%';

#3.查詢姓名第二位爲 四 的學生信息
-- Students whose second character is 四: `_` matches exactly one character.
select
    sid,
    sname,
    sbirth,
    ssex,
    sclass
from student
where sname like '_四%';

#二、彙總分析
#1.查詢課程編號爲“2”的總成績
-- Total of all scores recorded for course '2'.
select
    sum(score) as sumScore
from score
where cid = '2';
#2.查詢選了課程的學生人數
-- Number of distinct students that have at least one score row.
select
    count(distinct sid)
from score;
#3.查詢課程編號爲“2”的最高分
-- Highest score recorded for course '2'.
select
    max(score)
from score
where cid = '2';

#三、分組
#1.查詢各科成績的最高及最低分
-- Highest and lowest score per course.
select
    cid,
    max(score) as maxScore,
    min(score) as minScore
from score
group by cid;

#2.查詢每門課程被選修的學生數
-- Number of score rows (enrolments) per course.
select
    cid,
    count(*) as num
from score
group by cid;

#3.查詢男、女生人數
-- Head count per value of ssex.
select
    ssex,
    count(*)
from student
group by ssex;


#四、分組+條件過濾
#1.查詢平均成績大於60分學生的學號和平均成績
-- Students whose average score is strictly above 60, with that average.
select
    sid,
    avg(score) as avgScore
from score
group by sid
having avg(score) > 60;

#2.查詢選修三門課的學生學號
-- Students that have exactly three score rows.
select
    sid
from score
group by sid
having count(*) = 3;

#3.查詢同名學生名單，並統計同名人數
-- Names shared by more than one student, with the number of students per name.
select
    sname,
    count(*) as num
from student
group by sname
having count(*) > 1;

#五、複雜查詢
#1.查詢存在成績小於60分的學生的學號、姓名
-- Students that have at least one score below 60 (membership test via IN).
select
    sid,
    sname
from student
where sid in (
    select sid
    from score
    where score < 60
);

-- Join form of the same question: students with at least one score below 60.
-- LEFT SEMI JOIN returns each student row at most once, and only when a matching
-- failing score exists. A RIGHT JOIN against the failing sids would instead emit a
-- (NULL, NULL) row for any failing sid that has no row in student, so it would not
-- be equivalent to the IN-subquery form.
select
    s.sid,
    s.sname
from student s
left semi join score sc
    on  s.sid = sc.sid
    and sc.score < 60;

#2.查詢所有成績均小於60的學生的學號，姓名(所有課全不及格的學生)
-- Students whose every recorded score is below 60.
-- A score counts as "not below 60" when it is >= 60; testing `> 60` would wrongly
-- treat a score of exactly 60 as a fail.
-- INNER JOIN (rather than RIGHT JOIN) avoids (NULL, NULL) rows for sids that appear
-- in score but not in student.
select
    s.sid,
    s.sname
from student s
inner join (
    select sid
    from score
    group by sid
    having sum(case when score >= 60 then 1 else 0 end) = 0
) failed
    on s.sid = failed.sid;

#3.查詢沒有學全所有課的學生的學號、姓名
-- Students who have not taken every course.
-- * The number of courses is read from the course table instead of being hard-coded.
-- * The query starts from student with a LEFT JOIN, so students with no score rows
--   at all (zero courses taken) are included.
-- * count(distinct cid) keeps duplicate score rows for one course from inflating the count.
-- The cross join is against a single-row aggregate, so it does not multiply rows.
select
    s.sid,
    s.sname
from student s
left join (
    select
        sid,
        count(distinct cid) as taken_cnt
    from score
    group by sid
) taken
    on s.sid = taken.sid
cross join (
    select count(*) as total_cnt
    from course
) total
where coalesce(taken.taken_cnt, 0) < total.total_cnt;

分組 top-N問題 ****重點

#查詢各科成績前兩名的記錄(top-N問題)

#情景0.僅能用於沒有重複的情況
-- Top two scores per course via a self join; only valid when a course has no tied scores.
-- Each (cid, score) group counts the rows of the same course scoring at least as high
-- (itself included); a count of 1 or 2 means the score is in the top two.
-- The score comparison in WHERE discards unmatched rows, so an inner join is equivalent here.
select
    t1.cid,
    t1.score
from score t1
inner join score higher
    on t1.cid = higher.cid
where t1.score <= higher.score
group by
    t1.cid,
    t1.score
having count(*) <= 2;

#情景1.有並列  相同分數算並列
-- Top two scores per course via a self join, tolerant of tied scores.
-- The strict comparison lives in ON, so a top score keeps its row with a NULL match;
-- count(higher.score) ignores that NULL and yields 0 strictly-higher rows.
select
    t1.cid,
    t1.score
from score t1
left join score higher
    on  t1.cid = higher.cid
    and t1.score < higher.score
group by
    t1.cid,
    t1.score
having count(higher.score) < 2;

#用on t1.cid = t2.cid  然後通過where過濾 where t1.score < t2.score,會將最高分直接過濾沒
#而on t1.cid = t2.cid and t1.score < t2.score ,最高分則會連接到Null,



#或使用dense_rank()、rank()、row_number()  + 窗口函數 over()
#1. dense_rank()  相當於並列

-- dense_rank(): tied scores share a rank and the next distinct score gets the next
-- consecutive rank, so "rank <= 2" keeps every row holding one of the two best scores.
select
    cid,
    score,
    t.rank
from (
    select
        cid,
        score,
        dense_rank() over (partition by cid order by score desc) as rank
    from score
) t
where t.rank <= 2;


#結果
cid	score	t.rank
1	100	1
1	99	2
2	100	1
2	99	2
3	100	1
3	100	1
3	100	1
3	100	1
3	98	2
4	97	1
4	90	2
5	100	1
5	96	2

#2.rank()  也並列，但是並列的人佔了不止一個名額

-- rank(): tied scores share a rank but each tied row consumes a position, leaving gaps;
-- with two or more rows tied for first, no row receives rank 2.
select
    cid,
    score,
    t.rank
from (
    select
        cid,
        score,
        rank() over (partition by cid order by score desc) as rank
    from score
) t
where t.rank <= 2;

#結果
cid	score	t.rank
1	100	1
1	99	2
2	100	1
2	99	2
3	100	1
3	100	1
3	100	1
3	100	1
4	97	1
4	90	2
5	100	1
5	96	2

#3.row_number()  無並列
-- row_number(): every row gets a unique position within its course, so exactly two rows
-- per course are kept even when scores tie.
select
    cid,
    score,
    t.rank
from (
    select
        cid,
        score,
        row_number() over (partition by cid order by score desc) as rank
    from score
) t
where t.rank <= 2;

#結果
cid	score	t.rank
1	100	1
1	99	2
2	100	1
2	99	2
3	100	1
3	100	2
4	97	1
4	90	2
5	100	1
5	96	2

行列互換問題

#1.某個字段的多行合併爲該字段一行
#統計每個學生選擇的課程，並存爲一張表(每個學生佔用一行)
-- One row per student, with the course names joined into a single '|'-separated string.
create table elective_info (
    sid   varchar(10),
    sname varchar(10),
    cname varchar(100)
);

-- collect_set gathers the distinct course names of each (sid, sname) group into an array;
-- concat_ws then joins that array with "|". The left joins keep every score row even when
-- its course or student has no match.
insert into table elective_info
select
    s.sid,
    st.sname,
    concat_ws("|", collect_set(c.cname)) as course
from score s
left join course c
    on s.cid = c.cid
left join student st
    on s.sid = st.sid
group by
    s.sid,
    st.sname;

concat(字段1,...,字段n)  可拼接多個字段（必須爲string類型），也可拼接任何string類型
concat_ws(分隔符,字段1,...,字段n）

concat和concat_ws的區別：如用'|'爲分隔符，拼接字段'a'、'b'、'c'，操作爲
                        concat(a,'|',b,'|',c)
                        concat_ws('|',a,b,c)
                       且concat_ws可拼接array中的字段
collect_set(字段)，可將字段去重，然後將結果以array形式返回

結合collect_set+concat_ws+group by 就可完成按key將同key中其它字段合併爲一行的操作

elective_info.sid	elective_info.sname	elective_info.cname
1	趙雷	語文|數學|英語|政治
10	李四	語文|數學|英語|政治
11	李四	語文
12	趙六	語文
13	孫七	數學
14	鄭雙	語文|數學|英語|政治|歷史
15	王一	語文|數學|英語|政治|歷史
16	馮二	語文|數學|英語|政治|歷史
17	陳三	語文|數學|英語|政治|歷史
2	錢電	語文|數學|英語|歷史
3	孫風	語文|數學|英語|政治|歷史
4	李雲	語文|數學|英語|政治|歷史
5	周梅	語文|數學|英語|歷史
6	吳蘭	語文|數學|歷史
7	鄭竹	語文|數學|英語|政治
8	梅梅	數學|英語|政治
9	張三	語文|數學|政治

#2.某個字段拆分爲多行
#將上述表格還原爲 類似
sid	sname	t.cname
1	趙雷	語文
1	趙雷	數學
1	趙雷	英語
1	趙雷	政治
10	李四	語文
10	李四	數學
10	李四	英語
10	李四	政治
11	李四	語文

-- Turn each '|'-separated cname string back into one row per course name.
-- split takes a regex, so the pipe is escaped; lateral view pairs every exploded
-- element with the sid and sname of the row it came from.
select
    sid,
    sname,
    t.cname
from elective_info
lateral view explode(split(cname, "\\|")) t as cname;

explode(col) 其中col爲array類型，可將array中的每個元素拆分爲單獨的一行返回（一行變多行）
lateral view 爲側視圖，用於把explode生成的多行與原表該行的其它字段關聯起來

待續...

使用pyinstaller生成exe可執行文件

Shell腳本的基本操作

樹莓派製作電視盒子---國內部分m3u8直播源

kafka掃盲篇

sparkstreaming下的第一個word count程序（python版）

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結