[hadoop@hadoop102 /opt/soft1]$ll /opt/module/hive/ |grep hivefunction-1.0-SNAPSHOT.jar
-rw-r–r--. 1 hadoop hadoop 5528 Feb 10 02:19 hivefunction-1.0-SNAPSHOT.jar
hive (gmall)> add jar /opt/module/hive/hivefunction-1.0-SNAPSHOT.jar;
Added [/opt/module/hive/hivefunction-1.0-SNAPSHOT.jar] to class path
Added resources: [/opt/module/hive/hivefunction-1.0-SNAPSHOT.jar]
4.2 DWD层事件表数据解析
4.2.1 创建基础明细表
明细表用于存储ODS层原始表转换过来的明细数据。
1)创建事件日志基础明细表
hive (gmall)>
drop table if exists dwd_base_event_log;
CREATE EXTERNAL TABLE dwd_base_event_log(
mid_id
string,
user_id
string,
version_code
string,
version_name
string,
lang
string,
source
string,
os
string,
area
string,
model
string,
brand
string,
sdk_version
string,
gmail
string,
height_width
string,
app_time
string,
network
string,
lng
string,
lat
string,
event_name
string,
event_json
string,
server_time
string)
PARTITIONED BY (dt
string)
stored as parquet
location ‘/warehouse/gmall/dwd/dwd_base_event_log/’;
2)说明:其中event_name和event_json用来对应事件名和整个事件。这个地方将原始日志1对多的形式拆分出来了。操作的时候我们需要将原始日志展平,需要用到UDF和UDTF。
4.2.2 自定义UDF函数(解析公共字段)
4.2.3 自定义UDTF函数(解析具体事件字段)
2)打包
3)将hivefunction-1.0-SNAPSHOT上传到hadoop102的/opt/module/hive/
4)将jar包添加到Hive的classpath
hive (gmall)> add jar /opt/module/hive/hivefunction-1.0-SNAPSHOT.jar;
5)创建临时函数与开发好的java class关联
hive (gmall)>
create temporary function base_analizer as ‘com.atguigu.udf.BaseFieldUDF’;
create temporary function flat_analizer as ‘com.atguigu.udtf.EventJsonUDTF’;
4.2.4 解析事件日志基础明细表
1)解析事件日志基础明细表
hive (gmall)>
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table dwd_base_event_log
PARTITION (dt=‘2019-02-10’)
select
mid_id,
user_id,
version_code,
version_name,
lang,
source,
os,
area,
model,
brand,
sdk_version,
gmail,
height_width,
app_time,
network,
lng,
lat,
event_name,
event_json,
server_time
from
(
select
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[0] as mid_id,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[1] as user_id,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[2] as version_code,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[3] as version_name,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[4] as lang,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[5] as source,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[6] as os,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[7] as area,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[8] as model,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[9] as brand,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[10] as sdk_version,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[11] as gmail,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[12] as height_width,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[13] as app_time,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[14] as network,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[15] as lng,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[16] as lat,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[17] as ops,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[18] as server_time
from ods_event_log where dt=‘2019-02-10’ and base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’)<>’’
) sdk_log lateral view flat_analizer(ops) tmp_k as event_name, event_json;
2)测试
hive (gmall)> select * from dwd_base_event_log limit 2;
4.2.5 DWD层数据解析脚本
1)在hadoop102的/home/atguigu/bin目录下创建脚本
[atguigu@hadoop102 bin]$ vim dwd_base_log.sh
在脚本中编写如下内容
#!/bin/bash
定义变量方便修改
APP=gmall
hive=/opt/module/hive/bin/hive
如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天
if [ -n “$1” ] ;then
do_date=$1
else
do_date=date -d "-1 day" +%F
fi
sql="
add jar /opt/module/hive/hivefunction-1.0-SNAPSHOT.jar;
create temporary function base_analizer as ‘com.atguigu.udf.BaseFieldUDF’;
create temporary function flat_analizer as ‘com.atguigu.udtf.EventJsonUDTF’;
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table "do_date’)
select
mid_id,
user_id,
version_code,
version_name,
lang,
source ,
os ,
area ,
model ,
brand ,
sdk_version ,
gmail ,
height_width ,
network ,
lng ,
lat ,
app_time ,
event_name ,
event_json ,
server_time
from
(
select
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[0] as mid_id,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[1] as user_id,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[2] as version_code,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[3] as version_name,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[4] as lang,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[5] as source,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[6] as os,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[7] as area,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[8] as model,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[9] as brand,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[10] as sdk_version,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[11] as gmail,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[12] as height_width,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[13] as app_time,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[14] as network,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[15] as lng,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[16] as lat,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[17] as ops,
split(base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’),’\t’)[18] as server_time
from "do_date’ and base_analizer(line,‘mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la’)<>’’
) sdk_log lateral view flat_analizer(ops) tmp_k as event_name, event_json;
"
sql"
2)增加脚本执行权限
[atguigu@hadoop102 bin]$ chmod 777 dwd_base_log.sh
3)脚本使用
[atguigu@hadoop102 module]$ dwd_base_log.sh 2019-02-11
4)查询导入结果
hive (gmall)>
select * from dwd_base_event_log where dt=‘2019-02-11’ limit 2;
5)脚本执行时间
企业开发中一般在每日凌晨30分~1点