拉鍊表流程圖:
需求:在mysql數據庫中有一個人員信息表(userinfo)
數據字段如下:
在Mysql中的建表語句爲:
-- Create the database
create database if not exists exam;
-- Create the person table (all columns, dates included, are stored as varchar(50))
create table if not exists `exam`.`userinfo`
(
    user_id      varchar(50), -- person ID
    phone_number varchar(50), -- person's mobile phone number
    health_code  varchar(50), -- health code color
    reg_time     varchar(50), -- user registration date
    modify_time  varchar(50)  -- date the health code color was last modified
);
2020年04月30日的數據如下:
插入語句:
-- Seed rows as of 2020-04-30: four users, each with modify_time '2020-04-30'.
insert into `exam`.`userinfo`
    (user_id, phone_number, health_code, reg_time, modify_time)
values
    ('001', '15953100001', '綠色', '2020-03-14', '2020-04-30'),
    ('002', '15953100002', '綠色', '2020-03-14', '2020-04-30'),
    ('003', '15953100003', '黃色', '2020-03-14', '2020-04-30'),
    ('004', '15953100004', '紅色', '2020-03-14', '2020-04-30');
結果:
2020年05月01日的數據如下:
修改和添加的語句:
-- Changes dated 2020-05-01: users 003 and 004 get a new health code color,
-- and user 005 is registered.
UPDATE `exam`.`userinfo`
SET health_code = '綠色', modify_time = '2020-05-01'
WHERE user_id = '003';

UPDATE `exam`.`userinfo`
SET health_code = '黃色', modify_time = '2020-05-01'
WHERE user_id = '004';

INSERT INTO `exam`.`userinfo`
    (user_id, phone_number, health_code, reg_time, modify_time)
VALUES
    ('005', '15953100005', '綠色', '2020-05-01', '2020-05-01');
第一次最後結果視圖:
2020年05月02日的數據如下:
修改和添加語句:
-- Changes dated 2020-05-02: users 004 and 005 get a new health code color,
-- and user 006 is registered.
UPDATE `exam`.`userinfo`
SET health_code = '綠色', modify_time = '2020-05-02'
WHERE user_id = '004';

UPDATE `exam`.`userinfo`
SET health_code = '黃色', modify_time = '2020-05-02'
WHERE user_id = '005';

INSERT INTO `exam`.`userinfo`
    (user_id, phone_number, health_code, reg_time, modify_time)
VALUES
    ('006', '15953100006', '綠色', '2020-05-02', '2020-05-02');
從以上數據可以看出,人員的健康碼等信息會隨着時間推移而變化,我們需要將人員所有變化的歷史信息都保存下來,請在Hive中使用拉鍊表實現。
說明:分區字段爲dt,判斷字段爲modify_time,在拉鍊表中添加的字段爲dw_start_date(生效時間),dw_end_date(結束時間)。
第二次最後結果視圖:
解題思路:
1、創建mysql庫和表
-- Make sure the `exam` database is present before creating anything in it.
create database if not exists exam;
-- Person table; every column, dates included, is a varchar(50).
create table if not exists `exam`.`userinfo`
(
    `user_id`      varchar(50),  -- person ID
    `phone_number` varchar(50),  -- person's mobile phone number
    `health_code`  varchar(50),  -- health code color
    `reg_time`     varchar(50),  -- user registration date
    `modify_time`  varchar(50)   -- date the health code color was last modified
);
-- Rows with modify_time '2020-04-30': the four initial users.
insert into `exam`.`userinfo`
    (user_id, phone_number, health_code, reg_time, modify_time)
values
    ('001', '15953100001', '綠色', '2020-03-14', '2020-04-30'),
    ('002', '15953100002', '綠色', '2020-03-14', '2020-04-30'),
    ('003', '15953100003', '黃色', '2020-03-14', '2020-04-30'),
    ('004', '15953100004', '紅色', '2020-03-14', '2020-04-30');

-- Changes stamped '2020-05-01': recolor users 003 and 004, add user 005.
update `exam`.`userinfo`
set health_code = '綠色',
    modify_time = '2020-05-01'
where user_id = '003';

update `exam`.`userinfo`
set health_code = '黃色',
    modify_time = '2020-05-01'
where user_id = '004';

insert into `exam`.`userinfo`
    (user_id, phone_number, health_code, reg_time, modify_time)
values
    ('005', '15953100005', '綠色', '2020-05-01', '2020-05-01');

-- Changes stamped '2020-05-02': recolor users 004 and 005, add user 006.
update `exam`.`userinfo`
set health_code = '綠色',
    modify_time = '2020-05-02'
where user_id = '004';

update `exam`.`userinfo`
set health_code = '黃色',
    modify_time = '2020-05-02'
where user_id = '005';

insert into `exam`.`userinfo`
    (user_id, phone_number, health_code, reg_time, modify_time)
values
    ('006', '15953100006', '綠色', '2020-05-02', '2020-05-02');
2、使用kettle把mysql表數據輸出到hive表
3、在hive裏面創建數據庫和表
-- ODS layer: copy of the MySQL `exam`.`userinfo` rows, partitioned by dt.
-- The database is created first so this script also runs on a fresh Hive instance.
CREATE DATABASE IF NOT EXISTS `itcast_ods`;
CREATE TABLE IF NOT EXISTS `itcast_ods`.`userinfo`(
    user_id      string, -- person ID
    phone_number string, -- person's mobile phone number
    health_code  string, -- health code color
    reg_time     string, -- user registration date
    modify_time  string  -- date the health code color was last modified
)
-- dt is the partition key; the load queries filter on values such as '20200430'
PARTITIONED BY (dt string)
STORED AS PARQUET;
-- DW layer: zipper (history) table for userinfo. Each row is one version of a
-- person, valid from dw_start_date until dw_end_date.
-- The database is created first so this script also runs on a fresh Hive instance.
CREATE DATABASE IF NOT EXISTS `itcast_dw`;
CREATE TABLE IF NOT EXISTS `itcast_dw`.`userinfo`(
    user_id       string, -- person ID
    phone_number  string, -- person's mobile phone number
    health_code   string, -- health code color
    reg_time      string, -- user registration date
    modify_time   string, -- date the health code color was last modified
    dw_start_date string, -- effective (start) date of this version
    dw_end_date   string  -- expiry (end) date of this version
)
STORED AS PARQUET;
-- Initial full load into the DW dimension (zipper) table.
-- Every row of the 20200430 ODS partition becomes an open version: it takes
-- effect on its modify_time and its end date is the open marker '9999-12-31'.
-- The select list names plain columns only; the `string` tokens that followed
-- each column were type names left over from the DDL, not part of the query.
insert overwrite table `itcast_dw`.`userinfo`
select
    user_id,
    phone_number,
    health_code,
    reg_time,
    modify_time,
    modify_time  as dw_start_date,
    '9999-12-31' as dw_end_date
from
    `itcast_ods`.`userinfo`
where
    dt = '20200430';
-- Incremental load into the DW dimension (zipper) table (question 1, 20200501).
-- Part 1 keeps every existing version. A version that is still open
-- (dw_end_date = '9999-12-31') is closed when its user_id appears in the
-- 20200501 ODS partition; already-closed versions are left untouched.
-- Part 2 appends every row of that partition as a new open version.
-- The closing date is written as 'yyyy-MM-dd', the same format as dw_start_date
-- and the open marker, so string comparisons on the date columns stay valid.
-- NOTE(review): assumes the 20200501 partition holds only rows added or changed
-- on that day, at most one per user_id -- confirm against the Kettle job.
insert overwrite table `itcast_dw`.`userinfo`
select
    dw.user_id,
    dw.phone_number,
    dw.health_code,
    dw.reg_time,
    dw.modify_time,
    dw.dw_start_date,
    case
        when chg.user_id is not null and dw.dw_end_date = '9999-12-31'
            then '2020-05-01'
        else dw.dw_end_date
    end as dw_end_date
from
    `itcast_dw`.`userinfo` dw
left join
    (select user_id from `itcast_ods`.`userinfo` where dt = '20200501') chg
    on dw.user_id = chg.user_id
union all
select
    user_id,
    phone_number,
    health_code,
    reg_time,
    modify_time,
    modify_time  as dw_start_date,
    '9999-12-31' as dw_end_date
from
    `itcast_ods`.`userinfo`
where
    dt = '20200501'
order by
    dw_start_date, user_id;
-- Incremental load into the DW dimension (zipper) table (question 1, 20200502).
-- Part 1 keeps every existing version. A version that is still open
-- (dw_end_date = '9999-12-31') is closed when its user_id appears in the
-- 20200502 ODS partition; already-closed versions are left untouched,
-- whatever format their end date was written in.
-- Part 2 appends every row of that partition as a new open version.
-- The closing date is written as 'yyyy-MM-dd', the same format as dw_start_date
-- and the open marker, so string comparisons on the date columns stay valid.
-- NOTE(review): assumes the 20200502 partition holds only rows added or changed
-- on that day, at most one per user_id -- confirm against the Kettle job.
insert overwrite table `itcast_dw`.`userinfo`
select
    dw.user_id,
    dw.phone_number,
    dw.health_code,
    dw.reg_time,
    dw.modify_time,
    dw.dw_start_date,
    case
        when chg.user_id is not null and dw.dw_end_date = '9999-12-31'
            then '2020-05-02'
        else dw.dw_end_date
    end as dw_end_date
from
    `itcast_dw`.`userinfo` dw
left join
    (select user_id from `itcast_ods`.`userinfo` where dt = '20200502') chg
    on dw.user_id = chg.user_id
union all
select
    user_id,
    phone_number,
    health_code,
    reg_time,
    modify_time,
    modify_time  as dw_start_date,
    '9999-12-31' as dw_end_date
from
    `itcast_ods`.`userinfo`
where
    dt = '20200502'
order by
    dw_start_date, user_id;