Hive练习题

#选出城市在北京,性别为女的十个用户

-- Sample of up to ten user names for female users whose city is 'beijing'.
select
    user_name
from user_info
where sex = 'female'
  and city = 'beijing'
limit 10;

#选出在2019年4月9号,购买的商品品类是food的用户名、购买数量、支付金额

-- User name, purchased quantity (piece) and payment amount for every
-- 'food' trade recorded under dt = '2019-04-09'.
select
    user_name,
    piece,
    pay_amount
from user_trade
where goods_category = 'food'
  and dt = '2019-04-09';


#2019年一月到四月,每个品类有多少人购买,累计金额是多少

-- Per goods category, for dt from 2019-01-01 through 2019-04-30 (both inclusive):
-- the number of distinct buyers and the summed payment amount.
select
    goods_category,
    count(distinct user_name) as num,
    sum(pay_amount)           as total_amount
from user_trade
where dt >= '2019-01-01'
  and dt <= '2019-04-30'
group by goods_category;


#group by的作用:分类汇总
常用聚合函数:
1.count():计数;count(distinct ...):去重计数
2.sum():求和
3.avg():平均值
4.max():最大值
5.min():最小值

group by .....having

#2019年4月,支付金额超过5万元的用户

-- Users whose summed payment amount within April 2019 exceeds 50000.
-- The threshold applies to the per-user total, so it is checked in HAVING.
select
    user_name,
    sum(pay_amount) as total_amount
from user_trade
where dt >= '2019-04-01'
  and dt <= '2019-04-30'
group by user_name
having sum(pay_amount) > 50000;

#having :对group by 的对象进行筛选


#2019年4月,支付金额最多的top5用户

-- The five users with the largest summed payment amount in April 2019.
select
    user_name,
    sum(pay_amount) as total_amount
from user_trade
where dt >= '2019-04-01'
  and dt <= '2019-04-30'
group by user_name
order by total_amount desc
limit 5;

#常用函数
1、如何把时间戳转化为日期?

-- Render the pay_time Unix timestamp as a readable date-time string for
-- trades recorded under dt = '2019-04-09'.
-- Pattern letters follow Java SimpleDateFormat: 'HH' is the 24-hour clock (00-23).
-- The previous pattern used 'hh' (12-hour clock), which prints 15:30 as 03:30 and,
-- with no am/pm marker, makes morning and afternoon times indistinguishable.
select
    pay_time,
    from_unixtime(pay_time, 'yyyy-MM-dd HH:mm:ss') as pay_datetime
from user_trade
where dt = '2019-04-09';


#如何计算日期间隔

用户的首次激活时间,与2019年5月1日的日期间隔

-- Number of days from each user's first-activation date to 2019-05-01
-- (sample of up to ten rows).
select
    user_name,
    datediff('2019-05-01', to_date(firstactivetime))
from user_info
limit 10;

条件函数
case when

#统计一下四个年龄段(20岁以下、20-30岁、30-40岁、40岁以上)的用户数:

         
-- Distinct user count per age bracket: under 20, 20-30, 30-40, and 40 or older.
-- The bracket labels were corrected: the previous '2osui' contained a letter 'o'
-- instead of the digit zero, and the labels now use the same wording as the other
-- age-bracket query in this file ('20岁以下', '20-30岁', '30-40岁', '40岁以上').
-- The bracket is computed once in a derived table so the CASE expression does not
-- have to be repeated in GROUP BY.
select
    count(distinct t.user_id) as user_num,
    t.age_type                as age_type
from (
    select
        user_id,
        case
            when age < 20 then '20岁以下'
            when age >= 20 and age < 30 then '20-30岁'
            when age >= 30 and age < 40 then '30-40岁'
            else '40岁以上'  -- NOTE(review): rows with a NULL age also fall into this branch
        end as age_type
    from user_info
) as t
group by t.age_type;

#if函数
#统计每个性别用户等级高低的分布情况(level大于5为高级)

-- Distinct user count per sex and level tier ('高' when level > 5, otherwise '低').
-- A stray "from user_info" line that followed the terminating semicolon was removed;
-- it belonged to no statement and left a dangling fragment in the script.
-- NOTE(review): a NULL level is reported as '低' because if() takes its else branch
-- when the condition is not true.
select
    sex,
    if(level > 5, '高', '低') as level_type,
    count(distinct user_id) as user_num
from user_info
group by
    sex,
    if(level > 5, '高', '低');


4、字符串函数
#每个月新激活的用户数

-- Distinct users per activation month. The first seven characters of
-- firstactivetime serve as the month key (presumably 'yyyy-MM' — confirm the
-- stored format).
select
    count(distinct user_id)       as user_num,
    substr(firstactivetime, 1, 7) as month
from user_info
group by substr(firstactivetime, 1, 7);


substr(string A,int start,int len)
备注:如果不指定截取长度则从起始位一直截取到最后

-- Distinct users per phone brand, where the brand is read from the
-- 'phonebrand' key of the JSON string stored in extra1.
select
    get_json_object(extra1, '$.phonebrand') as phone_brand,
    count(distinct user_id)                 as user_num
from user_info
group by get_json_object(extra1, '$.phonebrand');


5、聚合统计函数

#ELLA用户的2018年的平均支付金额,以及2018年最大的支付日期与最小的支付日期的间隔

-- For user 'ELLA' in 2018: the average payment amount, and the number of days
-- between the latest and the earliest payment date.
-- The year filter is a plain range on dt instead of year(dt) = '2018'. This keeps
-- the column free of a function call, avoids comparing an int with a string, and
-- matches how the other queries in this file filter dt.
-- Assumes dt holds 'yyyy-MM-dd' strings, as the other dt filters here do.
select
    avg(pay_amount) as avg_amount,
    datediff(
        max(from_unixtime(pay_time, 'yyyy-MM-dd')),
        min(from_unixtime(pay_time, 'yyyy-MM-dd'))
    ) as pay_date_span_days
from user_trade
where dt between '2018-01-01' and '2018-12-31'
  and user_name = 'ELLA';

#2018年购买的商品品类在两个以上的用户数
-- Number of users who bought more than two distinct goods categories in 2018.
-- The year filter is a plain range on dt instead of year(dt) = '2018'. This keeps
-- the column free of a function call and avoids an int-versus-string comparison.
-- Assumes dt holds 'yyyy-MM-dd' strings, as the other dt filters in this file do.
-- NOTE(review): "> 2" excludes users with exactly two categories — confirm this
-- matches the exercise wording.
select
    count(a.user_name) as user_num
from (
    select
        user_name,
        count(distinct goods_category) as category_num
    from user_trade
    where dt between '2018-01-01' and '2018-12-31'
    group by user_name
    having count(distinct goods_category) > 2
) as a;


#用户激活时间在2018年,年龄段在20-30岁和30-40岁的婚姻状况分布

-- Marital-status distribution (distinct users) for the 20-30 and 30-40 age
-- brackets, restricted to users whose first-activation date falls in 2018.
-- The inner query derives the age bracket and pulls the marital flag out of the
-- extra1 JSON string; the outer query keeps the two brackets of interest.
-- NOTE(review): the JSON key is spelled 'marraige_status'; it is kept as-is because
-- it must match the key actually stored in extra1 — confirm against the data.
select
    u.age_type,
    if(u.marraige_status = 1, '已婚', '未婚'),
    count(distinct u.user_id)
from (
    select
        user_id,
        get_json_object(extra1, '$.marraige_status') as marraige_status,
        case
            when age < 20 then '20岁以下'
            when age >= 20 and age < 30 then '20-30岁'
            when age >= 30 and age < 40 then '30-40岁'
            else '40岁以上'
        end as age_type
    from user_info
    where to_date(firstactivetime) >= '2018-01-01'
      and to_date(firstactivetime) <= '2018-12-31'
) as u
where u.age_type in ('20-30岁', '30-40岁')
group by
    u.age_type,
    if(u.marraige_status = 1, '已婚', '未婚');

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章