備註:由於不是專業的大數據開發人員,所以,難免有錯誤或不專業的地方,此處,記錄一下是爲了以後開發方便。
一、創建表
- 注意:表名和字段名起的要有意義。
- 獲取當前hive表的創建語句:show create table base_xxxx_d_material_hive;
- 搜索表名含有某個關鍵詞的hive表: hive> show tables '*material*';
- 查看hive表的表結構:desc base_xxxx_d_material_hive ;
非分區表-demo:
-- Demo: non-partitioned table stored as Parquet.
-- IF NOT EXISTS makes the DDL idempotent, so the script can be re-run safely.
CREATE TABLE IF NOT EXISTS `base_xxx_d_material_hive` (
    `qipu_id`                bigint        COMMENT '描述信息xxx',
    `first_source`           string        COMMENT '描述信息xxx',
    `cut_from`               int           COMMENT '描述信息xxxx',
    `cut_type`               double        COMMENT '描述信息xxxx',
    `reject_recommend`       boolean       COMMENT '描述信息xxxx',
    `has_original_site_logo` boolean       COMMENT '描述信息xxxx',
    `v_cate_type`            int           COMMENT '1-長 2-短 3-小',
    `user_tag_list`          array<string> COMMENT '用戶標籤結構化的數據,數組類型'
)
STORED AS PARQUET;
分區表-demo:
-- Demo: partitioned table (partition column `dt`) stored as Parquet.
-- IF NOT EXISTS makes the DDL idempotent, so the script can be re-run safely.
CREATE TABLE IF NOT EXISTS `test_base_xx_d_album_audit_hive` (
    `id`               bigint        COMMENT '主鍵ID,自動遞增',
    `title`            string        COMMENT '標題',
    `uid`              bigint        COMMENT '上傳者的uid',
    `xxxx_name`        string        COMMENT '用戶品類,多個用英文逗號分隔',
    `page_url`         string        COMMENT '鏈接地址',
    `create_time`      string        COMMENT '該條記錄的創建時間',
    `create_user`      string        COMMENT '該條記錄的創建人',
    `update_time`      string        COMMENT '該條記錄的最後一次修改時間',
    `update_user`      string        COMMENT '該條記錄的最後一次修改者',
    `version`          bigint        COMMENT '暫無意義',
    `pending_time`     string        COMMENT '提審時間,以最新的爲準(歷史數據沒有)',
    `low_quality_rate` double        COMMENT '低質率(和播放量一樣,每天更新一次)',
    `short_title`      array<string> COMMENT '標題集合'
)
PARTITIONED BY (`dt` string)
STORED AS PARQUET;
二、新增字段
//添加新的字段和註釋
-- Step 1: add the new column.
ALTER TABLE base_xxx_d_material_hive ADD COLUMNS (user_tag_list array<string>);
-- Step 2: attach a comment to the freshly added column via CHANGE COLUMN
-- (old name, new name, and type must all be restated).
ALTER TABLE beexxx.base_xxx_d_material_hive
    CHANGE COLUMN user_tag_list user_tag_list array<string>
    COMMENT '用戶標籤結構化的數據,數組類型';
//如果一次想增多個字段,可以這樣
-- Several columns can be added in a single ALTER TABLE statement.
ALTER TABLE test_xxx_biz_d_mp_tag_audit_data_tbl ADD COLUMNS (
    main_tag_info_list_short        array<string> COMMENT '主標籤-簡潔版(hanliwei-add)',
    content_tag_info_list_short     array<string> COMMENT '內容標籤-簡潔版(hanliwei-add)',
    nlp_main_tag_info_list_short    array<string> COMMENT 'nlp主標籤-簡潔版(hanliwei-add)',
    nlp_content_tag_info_list_short array<string> COMMENT 'nlp內容標籤-簡潔版(hanliwei-add)',
    key_tag_list_short              array<string> COMMENT '重點標籤-簡潔版(hanliwei-add)'
);
//添加字段,分區表需要實時生效的,
-- CASCADE also propagates the new column to existing partition metadata,
-- so already-created partitions pick up values when rewritten (see the
-- note below on CASCADE behavior).
ALTER TABLE open_platform.xxx_biz_d_mp_tag_audit_data_tbl
    ADD COLUMNS (author_content_tag_info_list_short array<string>) CASCADE;
ALTER TABLE open_platform.xxx_biz_d_mp_tag_audit_data_tbl
    CHANGE COLUMN author_content_tag_info_list_short author_content_tag_info_list_short array<string>
    COMMENT '用戶標籤-簡潔版(xxx-add)';
CASCADE 可以保證現有分區的數據在重新寫入時,新字段會有值;否則新字段爲 NULL,只有新的分區纔會生效。
三、刪除字段
//刪除hive字段:Hive 沒有直接的 drop column,實際做法是用 replace columns 重新聲明除待刪字段之外的所有字段。
//比如刪除user_tag_list字段,就是下面的語句少了user_tag_list字段。
-- Hive has no direct "drop column": REPLACE COLUMNS re-declares the entire
-- column list, so listing every column EXCEPT the one being removed
-- effectively deletes it (user_tag_list is absent from this list).
ALTER TABLE `base_xxxx_d_material_hive` REPLACE COLUMNS (
    `qipu_id`                bigint  COMMENT '描述信息xxx',
    `first_source`           string  COMMENT '描述信息xxx',
    `cut_from`               int     COMMENT '描述信息xxxx',
    `cut_type`               double  COMMENT '描述信息xxxx',
    `reject_recommend`       boolean COMMENT '描述信息xxxx',
    `has_original_site_logo` boolean COMMENT '描述信息xxxx',
    `xxx_cate_type`          int     COMMENT '1-長 2-短 3-小'
);
四、修改字段註釋
//修改現有字段的註釋
-- Rewrite an existing column's comment (name and type stay unchanged,
-- but CHANGE COLUMN requires restating them).
ALTER TABLE beexxxx.base_xxxx_d_material_hive
    CHANGE COLUMN user_tag user_tag string
    COMMENT '用戶標籤[字符類型],結構化的數據請使用user_tag_list';
五、刪除表或數據
//永久刪除測試表
-- PURGE bypasses the trash folder: the data is unrecoverable. IF EXISTS
-- keeps the statement re-runnable instead of erroring when the table is gone.
DROP TABLE IF EXISTS xxx.base_xxx_d_material_hive_test PURGE;
//如果還想恢復數據,不需要加purge
-- Without PURGE the files are moved to the trash folder and can be restored.
-- IF EXISTS makes the drop idempotent.
DROP TABLE IF EXISTS xxx.base_xxx_d_material_hive_test;
//truncate table 表名
truncate 用於刪除表中的所有的行, 或 delete from 表名 where 1 = 1;
//刪除分區數據
-- Drop a single day's partition; IF EXISTS avoids an error when it is absent.
ALTER TABLE xxx_biz_d_mp_tag_audit_data_tbl DROP IF EXISTS PARTITION (dt='2008-07-28');
批量刪除
-- A comparison operator in the partition spec drops every matching partition
-- (all days before the cut-off) in one statement; IF EXISTS keeps it safe to
-- re-run after the partitions are already gone.
ALTER TABLE xxx_biz_d_mp_tag_audit_data_tbl DROP IF EXISTS PARTITION (dt<'2020-10-01');
ALTER TABLE xxx_orig_d_mp_tag_audit_data_tbl DROP IF EXISTS PARTITION (dt<'2020-10-01');
六、explode和LATERAL VIEW的簡單使用
參考:https://www.jianshu.com/p/8689a2283cae
1.去掉[]
select regexp_replace('["愛我就別想太多","曹慧生","李一桐","夏可可"]','\\["|"\\]','');
//結果:愛我就別想太多","曹慧生","李一桐","夏可可
select regexp_replace('[9000001026042900,9000001026041200,9000001026043600,9000001026056100]','\\[|\\]','');
//結果:9000001026042900,9000001026041200,9000001026043600,9000001026056100
2.轉爲數組
select split(regexp_replace('[9000001026042900,9000001026041200,9000001026043600,9000001026056100]','\\[|\\]',''),",");
//結果:["9000001026042900","9000001026041200","9000001026043600","9000001026056100"]
3.用表中的數據-測試
select id,qipu_id,split(regexp_replace(video_ids,'\\[|\\]',''),",") from beehive.test_base_beehive_d_album_audit_hive;
//結果:
234 247921701 ["6104181288969800","5367793040785600","5406297001473500","1141181189251800","5730372003942100","3629499648811200"]
592 253091801 ["17006224200","17129812000"]
244 3080818208933901 ["9000001000368700"]
4.使用explode方法
select explode(split(regexp_replace(video_ids,'\\[|\\]',''),",")) AS video_id from beehive.test_base_xxx_d_album_audit_hive;
結果:
5406297001473500
1141181189251800
5730372003942100
3629499648811200
5.使用 LATERAL VIEW 和 explode
-- LATERAL VIEW explode() fans each JSON-array string in video_ids out into
-- one output row per (qipu_id, video_id) pair; regexp_replace strips the
-- surrounding [ ] before split() turns the remainder into an array.
SELECT qipu_id, video_id
FROM beehive.test_base_xxx_d_album_audit_hive t
LATERAL VIEW explode(split(regexp_replace(t.video_ids,'\\[|\\]',''),",")) tmp_tab AS video_id;
結果:
247921701 6104181288969800
247921701 5367793040785600
247921701 5406297001473500
247921701 1141181189251800
247921701 5730372003942100
247921701 3629499648811200
253091801 17006224200
253091801 17129812000
3080818208933901 9000001000368700
//加入where條件的sql
-- Explode video_ids into one row per video_id (for one partition day), then
-- enrich each video with its quality level.
-- Fix: video_id produced by split() is a string while qipu_id is bigint;
-- without an explicit cast Hive coerces both sides to double, which can lose
-- precision on 16-digit ids and silently drop matches.
select
    t1.qipu_id,
    t1.video_id,
    t2.quality_level
from (
    select qipu_id, video_id
    from beehive.test_base_xxx_d_album_audit_hive t
    LATERAL VIEW explode(split(regexp_replace(t.video_ids,'\\[|\\]',''),",")) tmp_tab as video_id
    where t.dt = '2022-04-01'
) t1
left join xxx.base_xxx_d_material_hive_test t2
    on cast(t1.video_id as bigint) = t2.qipu_id;
七、insert插入數據
- 注意:如果我們想給新增的hive表添加一些測試數據,可以用此部分的內容。
//insert overwrite table beehive.test_base_xxx_d_album_quality_level_hive partition (dt='${dt}')
-- Rebuild the 2022-04-01 partition with exploded (album, video, quality) rows;
-- in production replace the literal date with the scheduler variable '${dt}'
-- (see the commented template above).
-- Fix: cast the string video_id explicitly so the join compares bigints
-- instead of letting Hive coerce both sides to double (precision loss on
-- 16-digit ids can silently drop matches).
insert overwrite table beehive.test_base_xxx_d_album_quality_level_hive partition (dt='2022-04-01')
select
    t1.qipu_id,
    t1.video_id,
    t2.quality_level
from (
    select qipu_id, video_id
    from beehive.test_base_xxx_d_album_audit_hive t
    LATERAL VIEW explode(split(regexp_replace(t.video_ids,'\\[|\\]',''),",")) tmp_tab as video_id
) t1
left join beehive.base_xxxx_d_material_hive_test t2
    on cast(t1.video_id as bigint) = t2.qipu_id;
八、case when的例子
參考:https://blog.csdn.net/u011944141/article/details/79133692
//查詢結果
-- Per-album counts: total rows vs. rows flagged LOW quality.
-- The CASE without ELSE yields NULL for non-LOW rows, and COUNT ignores
-- NULLs, so the second count only tallies LOW-quality videos.
SELECT
    album_qipu_id,
    COUNT(1) AS total_count,
    COUNT(CASE WHEN video_quality_level = 'LOW' THEN 1 END) AS low_quality_num
FROM xxx.test_base_xxx_d_album_quality_level_hive
GROUP BY album_qipu_id;
九、其他例子
select regexp_replace('["愛我就別想太多","曹慧生","李一桐","夏可可"]','\\["|"\\]','') ;
結果:愛我就別想太多","曹慧生","李一桐","夏可可
select regexp_replace(regexp_replace('["愛我就別想太多","曹慧生","李一桐","夏可可"]','\\["|"\\]',''),'","','%%') ;
結果:愛我就別想太多%%曹慧生%%李一桐%%夏可可
select split(regexp_replace(regexp_replace('["愛我就別想太多","曹慧生","李一桐","夏可可"]','\\["|"\\]',''),'","','%%'),'%%');
結果:["愛我就別想太多","曹慧生","李一桐","夏可可"]
select split(regexp_replace(regexp_replace('["愛我就別想太多"]','\\["|"\\]',''),'","','%%'),'%%');
結果:["愛我就別想太多"]
//---------------中文正則過濾case
select regexp_replace('["娛樂","電視劇-香港","新聞","電影-周邊","張柏芝"]','"[\\u4E00-\\u9FA5]+-','"');
結果:["娛樂","香港","新聞","周邊","張柏芝"]
select regexp_replace('["娛樂","電視劇-香港","新聞","周邊","電影-張柏芝"]','"[^-"]+-','"');
結果:["娛樂","香港","新聞","周邊","張柏芝"]
// ------------------------截取小數點位數
測試:
select regexp_replace('0.9902856349945068,0.9902856349945068','(0\\.[0-9]{1,3})[0-9]+','$1');
結果:0.990,0.990
select regexp_replace('0.9902856349945068','(0\\.[0-9]{1,3})[0-9]+','$1');
結果:0.990
十、get_json_object()的用法
https://blog.csdn.net/qq_34105362/article/details/80454697
https://sjq597.github.io/2015/11/05/Hive-get-json-object%E7%94%A8%E6%B3%95/
https://www.cnblogs.com/drjava/p/10486134.html
https://zhuanlan.zhihu.com/p/40914513
-- Flatten per-video NLP tag JSON into comma-separated tag names and
-- confidences (confidences truncated to 3 decimal places).
-- Bug fix: the outer SELECT referenced qixx_id / paxx_url, but the inner
-- subquery aliases those columns as qipu_id / page_url, so the original
-- query could not resolve its columns; the outer list now matches.
-- NOTE(review): get_json_xpath is not a Hive built-in — presumably a UDF
-- registered in this environment; confirm before reusing elsewhere.
insert overwrite table test_nlp_tag_confidence_tbl_hanliwei
select
    qipu_id,
    page_url,
    -- Merge main + content tag-name lists: strip the '分類-' style prefixes
    -- and the [" "] punctuation; when the main list is the literal '[]',
    -- fall back to the content list alone.
    if(nlp_main_tag_info_list_short='[]',regexp_replace(regexp_replace(nlp_content_tag_info_list_short,'"[\\u4E00-\\u9FA5]+-','"'),'\\["|"\\]|"',''),concat(regexp_replace(regexp_replace(nlp_main_tag_info_list_short,'"[\\u4E00-\\u9FA5]+-','"'),'\\["|"\\]|"',''),',',regexp_replace(regexp_replace(nlp_content_tag_info_list_short,'"[\\u4E00-\\u9FA5]+-','"'),'\\["|"\\]|"',''))) as nlp_tag_info,
    -- Matching confidence lists, truncated to at most 3 decimal places.
    if(nlp_main_tag_confidence_list_short='[]',regexp_replace(regexp_replace(nlp_content_tag_confidence_list_short,'\\[|\\]',''),'(0\\.[0-9]{1,3})[0-9]+','$1'),regexp_replace(concat(regexp_replace(nlp_main_tag_confidence_list_short,'\\[|\\]',''),',',regexp_replace(nlp_content_tag_confidence_list_short,'\\[|\\]','')),'(0\\.[0-9]{1,3})[0-9]+','$1')) as nlp_tag_confidence,
    algorithm_version
from (
    select get_json_object(video_info, '$.qixxId') as qipu_id,
           get_json_object(video_info, '$.paxxUrl') as page_url,
           get_json_xpath(nlp_tag_info, '$.mainCategoryTagInfoList[*].tagName') as nlp_main_tag_info_list_short,
           get_json_xpath(nlp_tag_info, '$.contentTagInfoList[*].tagName') as nlp_content_tag_info_list_short,
           get_json_xpath(nlp_tag_info, '$.mainCategoryTagInfoList[*].confidence') as nlp_main_tag_confidence_list_short,
           get_json_xpath(nlp_tag_info, '$.contentTagInfoList[*].confidence') as nlp_content_tag_confidence_list_short,
           get_json_xpath(nlp_tag_info, '$.algorithmVersion') as algorithm_version
    from xxx_orig_d_xxx_tag_audit_data_tbl
    where dt = '2020-07-30'
) t;
//結果:
179xxx49600 http://www.xx.com/v_xxrz4uov64.html 綜藝,搞笑,小品,片段,周雲鵬 0.992,0.603,0.603,0.944,0.970
1758xxx8700 http://www.xxx.com/v_1xxxzgq4wn4.html 資訊,新型冠狀病毒,社會,災難意外,抗洪救災,鄱陽縣 0.986,0.986,0.971,0.837,0.825,0.805
1795xxx9700 http://www.xxx.com/v_1xxxxlvs.html 兒童,玩具,母嬰,幼兒,試玩,植物大戰殭屍,模型玩具,益智 0.991,0.991,0.631,0.631,0.625,0.899,0.787,0.799