hive笔记----函数、练习等

hive特有数据类型

array数组

创建表

create table t_movie (movie string,actor array<STRING>,first_show_date string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
COLLECTION ITEMS TERMINATED BY ':';

判断取值

select movie,actor,first_show_date from t_movie where array_contains(actor,'吴京');

获取array数据类型长度

select movie,actor,first_show_date,size(actor) from t_movie

map类型

创建表

create table t_map (id int,name string,family_members map<string,string>,age int)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' --列分割
COLLECTION ITEMS TERMINATED BY '#'		--map数据分割
MAP KEYS TERMINATED BY ':';			--key:value分割

判断值

select id,name,family_members['father'] as father,age from t_map;
select id,name,family_members['father'] as father,family_members['sister'] as sister,age from t_map;

获取map的key

select id,name,map_keys(family_members),age from t_map

函数嵌套查询

select id,name,family_members,age from t_map where array_contains(map_keys(family_members),'brother');

struct类型

创建表

create table t_user(id int,name string,info struct<age:int,sex:string,addr:string>)
row format delimited fields terminated by ','
collection items terminated by ':';

查询字段信息

select id,name,info.sex,info.addr from t_user;

基本内置函数

类型转换函数

select cast("123" as int);
select cast("2017-01-01" as date);

表生成函数

行转列函数explode()

select explode(subjects) from t_stu_subject;

表生产函数 lateral view

select id,name,tmp.sub from t_stu_subject lateral view explode(subjects) tmp as sub where sub = '生物';

日新日活统计

--统计每天的活跃用户(日活)（需要用户的ip，用户的账号，用户访问时间最早的一条url和时间）

create table q_logs(ip string,name string,login_time string,url string) partitioned by (dt string) row format delimited fields terminated by ',';

load data local inpath '/root/hive_file/q_0915.txt' into table q_logs partition (dt = '20170915');
load data local inpath '/root/hive_file/q_0916.txt' into table q_logs partition (dt = '20170916');
load data local inpath '/root/hive_file/q_0917.txt' into table q_logs partition (dt = '20170917');

--日活统计
    --创建表
create table q_active(ip string,name string,login_time string,url string) partitioned by (dt string);
    --导入数据，分区修改对应的日期
insert into table q_active partition (dt='20170917')
select ip,name,login_time,url from (
select ip,name,login_time,url,row_number()over(partition by name order by login_time) as num from q_logs where dt = '20170917') tmp where num = 1;
select * from q_active;

--日新统计
    --创建表
    create table q_logs_history_user(name string);
    create table q_logs_new_user like q_logs;

    insert into table q_logs_new_user partition (dt='20170917')
    select ip,name,login_time,url from (
    select a.ip,a.name,a.login_time,a.url,h.name as history_name from q_active a left join q_logs_history_user h on a.name = h.name where a.dt = '20170917') tmp where history_name is null;

    insert into q_logs_history_user
    select name from q_logs_new_user where dt = '20170917';

    select * from q_logs_new_user;
    select * from q_logs_history_user;

条件控制函数

select id,name,
       case
           when info.age <=18 then "young"
            when info.age >18 and info.age <40 then "zhongnian"
            else "old"
        end as zhuangtai
        from t_user;

select id,`if`(info.age>25,"working","worked") as is_worked from t_user;

select movie,`if`(array_contains(actor,"吴刚"),"好电影","烂电影") as move_like from t_movie

窗口分析函数row_number() over ()

select * from (select id,age,name,sex,row_number() over (partition by sex order by age) as row_number from top_n ) tmp where tmp.row_number<=2;

--练习题----累计查询
select kk.username,kk.month,max(kk.t_amount) as amount,sum(kk.amount_a) as sum_amount from
               (select a.t_amount as amount_a,b.username,b.month,b.t_amount from
                            (select username,month,sum(counts) as t_amount from t_access group by username,month) a
    inner join (select username,month,sum(counts) as t_amount from t_access group by username,month) b
        on a.username = b.username  where a.month<=b.month) kk group by kk.username,kk.month;

窗口分析函数—sum over

select username,month,t_amount,sum(t_amount) over (partition by username order by month rows between unbounded preceding and current row ) as acount from t_access_amount;

json自定义函数

create table t_ratingjson(json string);
load data local inpath '/root/hive_file/rating.json' into table t_ratingjson;

add jar /root/hive-1.0-SNAPSHOT.jar;

create temporary function myjson as 'MyJsonParser';

select myjson(json,1) as movie,myjson(json,2) as rate,myjson(json,3) as ts,myjson(json,4) as uid from t_ratingjson limit 10;

写java类，实现想要的函数的功能
java程序打包成jar包，上传到hive服务器

import org.apache.hadoop.hive.ql.exec.UDF;

/**
 * @Author: lifx
 * @Description:
 * @Date: Created in 11:15 2019/9/18
 * @Modified By:
 */
public class MyJsonParser extends UDF {
    //重载父类中的方法evaluate()
    public String evaluate(String json,int index){
//        {"movie":"1096","rate":"4","timeStamp":"956715648","uid":"6040"}
//        {"movie":"1097","rate":"4","timeStamp":"956715569","uid":"6040"}
        String[] split = json.split("\"");
        return split[4*index - 1];
    }
}

hive命令行中将jar包添加到classpath
```
add jar /root/hive-1.0-SNAPSHOT.jar;
```

hive命令行中创建一个函数叫做xxx 关联你这个类

create temporary function myjson as 'MyJsonParser';

hive json解析函数

select json_tuple(json,"movie","rate","timeStamp","uid") as (moive,rate,ts,uid) from t_ratingjson

hive问题处理

-- 当前hive运行在strict模式，该模式下：
-- - 不能进行表的笛卡尔积连接
-- - order by语句必须带有limit：order by在一个reducer中执行，容易成为性能瓶颈
-- - 带分区表的查询必须使用分区字段，在where条件中　
解决:
set hive.mapred.mode=nonstrict;

hive练习题

--学生表
--01 赵雷 1990-01-01 男
create table a_student (stu_id int,stu_name string,birthday string,gender string)
row format delimited fields terminated by ' ';
load data local inpath '/root/hive_file/a_student.data' into table a_student;
01 赵雷 1990-01-01 男
02 钱电 1990-12-21 男
03 孙风 1990-05-20 男
04 李云 1990-08-06 男
05 周梅 1991-12-01 女
06 吴兰 1992-03-01 女
07 郑竹 1989-07-01 女
08 王菊 1990-01-20 女
---------------------

--课程表
--01      语文    02
create table a_class (class_id int,class string,teacher_id int)
row format delimited fields terminated by " ";
load data local inpath '/root/hive_file/a_class.data' into table a_class;
01      语文    02
02      数学    01
03      英语    03
-------------------

--教师表
--01      张三
create table a_teacher (teacher_id int,teacher_name string)
row format delimited fields terminated by " ";
load data local inpath '/root/hive_file/a_teacher.data' into table a_teacher;
01      张三
02      李四
03      王五
-----------------

--成绩表
--01      01      80
create table a_score(stu_id int,class_id int,score int)
row format delimited fields terminated by " ";
load data local inpath '/root/hive_file/a_score.data' into table a_score;
01      01      80
01      02      90
01      03      99
02      01      70
02      02      60
02      03      80
03      01      80
03      02      80
03      03      80
04      01      50
04      02      30
04      03      20
05      01      76
05      02      87
06      01      31
06      03      34
07      02      89
07      03      98
-------------------

select * from a_student;
select * from a_class;
select * from a_teacher;
select * from a_score;
--查询"01"课程比"02"课程成绩高的学生的信息及课程分数
select stu.stu_id, stu.stu_name, stu.birthday, stu.gender, sc1.score, sc2.score
from a_student stu
         join a_score sc1 on sc1.class_id = '01' and stu.stu_id = sc1.stu_id
         left join a_score sc2 on sc2.class_id = '02' and stu.stu_id = sc2.stu_id
        --加入下面这句是为了过滤缺考人员
         join a_score sc3 on stu.stu_id = sc3.stu_id
where sc1.score > sc2.score
   or sc2.score is null
group by stu.stu_id, stu.stu_name, stu.birthday, stu.gender, sc1.score, sc2.score;
--查询"01"课程比"02"课程成绩低的学生的信息及课程分数
select stu.stu_id, stu.stu_name, stu.birthday, stu.gender, sc1.score, sc2.score
from a_student stu
         full outer join a_score sc1 on sc1.class_id = '01' and stu.stu_id = sc1.stu_id
         left join a_score sc2 on sc2.class_id = '02' and stu.stu_id = sc2.stu_id
         join a_score sc3 on stu.stu_id = sc3.stu_id
where sc1.score < sc2.score
   or sc1.score is null
group by stu.stu_id, stu.stu_name, stu.birthday, stu.gender, sc1.score, sc2.score;
--查询平均成绩大于等于60分的同学的学生编号和学生姓名和平均成绩
select stu.stu_id, stu.stu_name, round(avg(b.score), 2) as avg_score
from a_student stu
         join a_score b on stu.stu_id = b.stu_id
group by stu.stu_id, stu.stu_name
having round(avg(b.score), 2) >= 60;
--查询平均成绩小于60分的同学的学生编号和学生姓名和平均成绩
select stu.stu_id, stu.stu_name, round(avg(b.score), 2) as avg_score
from a_student stu
         left join a_score b on stu.stu_id = b.stu_id
group by stu.stu_id, stu.stu_name
having round(avg(b.score), 2) < 60
union
select stu.stu_id, stu.stu_name, 0 as avg_score
from a_student stu
         left join a_score b on stu.stu_id = b.stu_id
where b.score is null order by avg_score;
--查询所有同学的学生编号、学生姓名、选课总数、所有课程的总成绩
select stu.stu_id, stu.stu_name, count(sc.class_id) as count_class, sum(sc.score) as sum_score
from a_student stu
         left join a_score sc on stu.stu_id = sc.stu_id
group by stu.stu_id, stu.stu_name;
--查询"李"姓老师的数量
select teacher_name, count(1) as count_teacher
from a_teacher
where teacher_name like '李%'
group by teacher_name;
--查询学过"张三"老师授课的同学的信息
select *
from a_student
where stu_id in (
    select stu_id
    from a_score
    where class_id in
          (select b.class_id
           from a_teacher a,
                a_class b
           where teacher_name like "%张三%"
             and a.teacher_id = b.teacher_id));
--查询没学过"张三"老师授课的同学的信息
select stu.stu_id, stu.stu_name, stu.birthday, stu.gender, c.score
from a_student stu
         join a_teacher a on a.teacher_name = '张三'
         join a_class b on a.teacher_id = b.teacher_id
         left join a_score c on c.stu_id = stu.stu_id and c.class_id = b.class_id
where c.score is null;
--查询学过编号为"01"并且也学过编号为"02"的课程的同学的信息
select stu.stu_id, stu.stu_name, stu.gender, stu.birthday
from a_student stu
         join a_score a on a.class_id = '01' and a.stu_id = stu.stu_id
         join a_score b on b.class_id = '02' and b.stu_id = stu.stu_id;
--查询学过编号为"01"但是没有学过编号为"02"的课程的同学的信息
select stu.stu_id, stu.stu_name, stu.gender, stu.birthday
from a_student stu
         join a_score a on a.class_id = '01' and a.stu_id = stu.stu_id
         left join a_score b on b.class_id = '02' and b.stu_id = stu.stu_id
where b.score is null;

hive笔记----函数、练习等

hive特有数据类型

array数组

创建表

判断取值

获取array数据类型长度

map类型

创建表

判断值

获取map的key

函数嵌套查询

struct类型

创建表

查询字段信息

基本内置函数

类型转换函数

表生成函数

行转列函数explode()

表生产函数 lateral view

日新日活统计

条件控制函数

窗口分析函数row_number() over ()

窗口分析函数—sum over

json自定义函数

hive json解析函数

hive问题处理

hive练习题

ComfyUI 完全入门：ControlNet 使用教程

基於docker創建apline版本jupyter-server及應用

基於CDH6.2kylin2.6.2安裝部署

協同過濾算法始末記錄

hive筆記----函數、練習等

千萬不要被pyquery的方便所矇蔽了雙眼-----新坑

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結