hive中hive-json格式的表使用

1、create table as 方式生成json表及數據

sql腳本:

-- Rebuild tmp.xx_toutiao_userinfo with CREATE TABLE AS from the rows of the user-info source table where dt is 2020-01-01.
-- No row format is given here, so the table gets whatever SerDe CREATE TABLE AS picks by default.
DROP TABLE IF EXISTS tmp.xx_toutiao_userinfo;
CREATE TABLE tmp.xx_toutiao_userinfo AS
WITH tab_user_basic AS (
    SELECT
        a1.uri AS id,
        a1.buyerlevelscores AS membership_level,
        get_json_object(a1.snsjson, '$.sex') AS gender,
        get_json_object(a1.snsjson, '$.country') AS country,
        get_json_object(a1.snsjson, '$.province') AS province,
        get_json_object(a1.snsjson, '$.city') AS city,
        a1.bail AS extra_info__bail,
        a1.balance AS extra_info__balance,
        a1.balancefrozen AS extra_info__balanceFrozen,
        get_json_object(a1.snsjson, '$.language') AS extra_info__language,
        get_json_object(a1.creditsjson, '$.buyTotalNum') AS extra_info__creditsJson__buyTotalNum,
        get_json_object(a1.creditsjson, '$.buyReturnedNum') AS extra_info__creditsJson__buyReturnedNum,
        get_json_object(a1.creditsjson, '$.buyFaultNum') AS extra_info__creditsJson__buyFaultNum,
        get_json_object(a1.creditsjson, '$.buyNum') AS extra_info__creditsJson__buyNum,
        get_json_object(a1.creditsjson, '$.sellTotalNum') AS extra_info__creditsJson__sellTotalNum,
        get_json_object(a1.creditsjson, '$.sellDisputeNum') AS extra_info__creditsJson__sellDisputeNum,
        get_json_object(a1.creditsjson, '$.sellFaultNum') AS extra_info__creditsJson__sellFaultNum,
        get_json_object(a1.creditsjson, '$.sellMultiWinsNum') AS extra_info__creditsJson__sellMultiWinsNum,
        get_json_object(a1.creditsjson, '$.sellNum') AS extra_info__creditsJson__sellNum,
        a1.lastpaymethod AS extra_info__lastPayMethod,
        a1.membertime AS extra_info__memeberTime,
        a1.scene AS extra_info__scene,
        a1.sellerlevelscores AS extra_info__sellerLevelScores
    FROM ods.ods_userinfo_userinfo_full_1d a1
    WHERE a1.dt = '2020-01-01'
)
-- gender is remapped: 1 becomes 0, 2 becomes 1, anything else becomes NULL.
-- extra_info is JSON text assembled by hand. The line breaks inside the quoted pieces are part of the value, so do not re-indent them.
-- NOTE(review): the dumped rows in this file mostly show extra_info as NULL, presumably because concat returns NULL when any argument is NULL - confirm before relying on this column.
SELECT
    a1.id,
    a1.membership_level,
    CASE WHEN a1.gender = 1 THEN 0 WHEN a1.gender = 2 THEN 1 ELSE NULL END AS gender,
    a1.country,
    a1.province,
    a1.city,
    concat("\{
\"bail\":", a1.extra_info__bail, ",
\"balance\":", a1.extra_info__balance, ",
\"balanceFrozen\":", a1.extra_info__balanceFrozen, ",
\"language\":\"", a1.extra_info__language, "\",
 \"creditsJson\":
{
\"buyTotalNum\":", a1.extra_info__creditsJson__buyTotalNum, ",
\"buyReturnedNum\":", a1.extra_info__creditsJson__buyReturnedNum, ",
\"buyFaultNum\":", a1.extra_info__creditsJson__buyFaultNum, ",
\"buyNum\":", a1.extra_info__creditsJson__buyNum, ",
\"sellTotalNum\":", a1.extra_info__creditsJson__sellTotalNum, ",
\"sellDisputeNum\":", a1.extra_info__creditsJson__sellDisputeNum, ",
\"sellFaultNum\":", a1.extra_info__creditsJson__sellFaultNum, ",
\"sellMultiWinsNum\":", a1.extra_info__creditsJson__sellMultiWinsNum, ",
\"sellNum\":", a1.extra_info__creditsJson__sellNum, "
},
\"lastPayMethod\":", a1.extra_info__lastPayMethod, ",
\"memeberTime\":", a1.extra_info__memeberTime, ",
\"scene\":\"", a1.extra_info__scene, "\",
\"sellerLevelScores\":", a1.extra_info__sellerLevelScores, "
\}") AS extra_info
FROM tab_user_basic a1
LIMIT 1000;

查詢:

select * from tmp.xx_toutiao_userinfo;
Time taken: 0.109 seconds, Fetched: 1861 row(s)

生成的表結構查看:

hive> show create table tmp.xx_toutiao_userinfo;;
OK
CREATE TABLE tmp.xx_toutiao_userinfo(
  id string, 
  membership_level int, 
  gender int, 
  country string, 
  province string, 
  city string, 
  extra_info string)
ROW FORMAT SERDE 
  'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' 
STORED AS INPUTFORMAT 
  'org.apache.hadoop.mapred.TextInputFormat' 
OUTPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
  'hdfs://HDFS41368/usr/hive/warehouse/tmp.db/xx_toutiao_userinfo'
TBLPROPERTIES (
  'transient_lastDdlTime'='1577963203')
Time taken: 0.072 seconds, Fetched: 18 row(s)

2、正常建表指定[ROW FORMAT]

建表語句:

-- Recreate the target table with an explicit JSON SerDe instead of the default text SerDe.
-- extra_info stays a plain STRING column holding JSON text rather than a nested type.
DROP TABLE IF EXISTS tmp.xx_toutiao_userinfo_json;
CREATE TABLE tmp.xx_toutiao_userinfo_json (
    id               STRING,
    membership_level INT,
    gender           INT,
    country          STRING,
    province         STRING,
    city             STRING,
    extra_info       STRING
)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
STORED AS TEXTFILE;

插入數據:

-- Load tmp.xx_toutiao_userinfo_json from the rows of the user-info source table where dt is 2020-01-01.
-- tab_user_basic flattens plain columns and fields extracted from the snsjson and creditsjson strings.
WITH tab_user_basic AS (
    SELECT
        a1.uri AS id,
        a1.buyerlevelscores AS membership_level,
        get_json_object(a1.snsjson, '$.sex') AS gender,
        get_json_object(a1.snsjson, '$.country') AS country,
        get_json_object(a1.snsjson, '$.province') AS province,
        get_json_object(a1.snsjson, '$.city') AS city,
        a1.bail AS extra_info__bail,
        a1.balance AS extra_info__balance,
        a1.balancefrozen AS extra_info__balanceFrozen,
        get_json_object(a1.snsjson, '$.language') AS extra_info__language,
        get_json_object(a1.creditsjson, '$.buyTotalNum') AS extra_info__creditsJson__buyTotalNum,
        get_json_object(a1.creditsjson, '$.buyReturnedNum') AS extra_info__creditsJson__buyReturnedNum,
        get_json_object(a1.creditsjson, '$.buyFaultNum') AS extra_info__creditsJson__buyFaultNum,
        get_json_object(a1.creditsjson, '$.buyNum') AS extra_info__creditsJson__buyNum,
        get_json_object(a1.creditsjson, '$.sellTotalNum') AS extra_info__creditsJson__sellTotalNum,
        get_json_object(a1.creditsjson, '$.sellDisputeNum') AS extra_info__creditsJson__sellDisputeNum,
        get_json_object(a1.creditsjson, '$.sellFaultNum') AS extra_info__creditsJson__sellFaultNum,
        get_json_object(a1.creditsjson, '$.sellMultiWinsNum') AS extra_info__creditsJson__sellMultiWinsNum,
        get_json_object(a1.creditsjson, '$.sellNum') AS extra_info__creditsJson__sellNum,
        a1.lastpaymethod AS extra_info__lastPayMethod,
        a1.membertime AS extra_info__memeberTime,
        a1.scene AS extra_info__scene,
        a1.sellerlevelscores AS extra_info__sellerLevelScores
    FROM ods.ods_userinfo_userinfo_full_1d a1
    WHERE a1.dt = '2020-01-01'
)
-- gender is remapped: 1 becomes 0, 2 becomes 1, anything else becomes NULL.
-- extra_info is JSON text assembled by hand. The line breaks inside the quoted pieces are part of the value, so do not re-indent them.
-- NOTE(review): the dumped rows in this file mostly show extra_info as null, presumably because concat returns NULL when any argument is NULL - confirm before relying on this column.
INSERT OVERWRITE TABLE tmp.xx_toutiao_userinfo_json
SELECT
    a1.id,
    a1.membership_level,
    CASE WHEN a1.gender = 1 THEN 0 WHEN a1.gender = 2 THEN 1 ELSE NULL END AS gender,
    a1.country,
    a1.province,
    a1.city,
    concat("\{
\"bail\":", a1.extra_info__bail, ",
\"balance\":", a1.extra_info__balance, ",
\"balanceFrozen\":", a1.extra_info__balanceFrozen, ",
\"language\":\"", a1.extra_info__language, "\",
 \"creditsJson\":
{
\"buyTotalNum\":", a1.extra_info__creditsJson__buyTotalNum, ",
\"buyReturnedNum\":", a1.extra_info__creditsJson__buyReturnedNum, ",
\"buyFaultNum\":", a1.extra_info__creditsJson__buyFaultNum, ",
\"buyNum\":", a1.extra_info__creditsJson__buyNum, ",
\"sellTotalNum\":", a1.extra_info__creditsJson__sellTotalNum, ",
\"sellDisputeNum\":", a1.extra_info__creditsJson__sellDisputeNum, ",
\"sellFaultNum\":", a1.extra_info__creditsJson__sellFaultNum, ",
\"sellMultiWinsNum\":", a1.extra_info__creditsJson__sellMultiWinsNum, ",
\"sellNum\":", a1.extra_info__creditsJson__sellNum, "
},
\"lastPayMethod\":", a1.extra_info__lastPayMethod, ",
\"memeberTime\":", a1.extra_info__memeberTime, ",
\"scene\":\"", a1.extra_info__scene, "\",
\"sellerLevelScores\":", a1.extra_info__sellerLevelScores, "
\}") AS extra_info
FROM tab_user_basic a1
LIMIT 1000;

查詢:

select * from tmp.xx_toutiao_userinfo_json;
Time taken: 0.09 seconds, Fetched: 1000 row(s)

3、分析說明

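create table as 默認的 ROW FORMAT SERDE 是 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe',底層文本格式以換行符作為記錄分隔符,不能夠很好的解析含有換行的 json 格式數據:一條帶換行的記錄在讀取時會被拆成多行,所以第 1 步雖然 limit 1000,查詢卻返回了 1861 行。使用建表指定 [ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'] 的方式,每條記錄被序列化為單行 json(字符串中的換行被轉義為 \n),可以很好的對 json 數據進行解析,避免了 Json 數據換行所造成的影響,查詢返回正確的 1000 行。所以,兩者查詢的結果不一致。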
SerDe 是 Serializer/Deserializer 的簡稱,用於數據的序列化和反序列化。
SerDe包括內置類型:
Avro (Hive 0.9.1 and later) 
ORC (Hive 0.11 and later) 
RegEx 
Thrift 
Parquet (Hive 0.13 and later) 
CSV (Hive 0.14 and later) 
JsonSerDe (Hive 0.12 and later)。

同時,hadoop dfs -cat /usr/hive/warehouse/tmp.db/xx_toutiao_userinfo_json/000000_0

{"id":"1701031708gi9wjc","membership_level":0,"gender":0,"country":"中國","province":"甘肅","city":"慶陽","extra_info":null}
{"id":"1701031631tnz19z","membership_level":-100,"gender":1,"country":"中國","province":"湖北","city":"黃岡","extra_info":null}
{"id":"17010315549re5j9","membership_level":0,"gender":0,"country":"中國","province":"江西","city":"撫州","extra_info":null}
{"id":"150724123819CeDq","membership_level":-36,"gender":0,"country":"中國","province":"北京","city":"朝陽","extra_info":null}
{"id":"1701031517umzfaa","membership_level":0,"gender":null,"country":"中國","province":"","city":"","extra_info":null}
{"id":"1701031457ctwb4d","membership_level":0,"gender":0,"country":"中國","province":"浙江","city":"紹興","extra_info":null}
{"id":"15050520171xCOXi","membership_level":0,"gender":0,"country":"中國","province":"安徽","city":"","extra_info":null}
{"id":"1701031420um02x7","membership_level":0,"gender":0,"country":"中國","province":"","city":"","extra_info":null}
{"id":"1701031341h8zete","membership_level":0,"gender":1,"country":"中國","province":"遼寧","city":"丹東","extra_info":null}
{"id":"1701031302gv9dfh","membership_level":1086,"gender":0,"country":"中國","province":"重慶","city":"九龍坡","extra_info":"{\n\"bail\":0,\n\"balance\":651,\n\"balanceFrozen\":0,\n\"language\":\"zh_CN\",\n \"creditsJson\":\n{\n\"buyTotalNum\":54,\n\"buyReturnedNum\":0,\n\"buyFaultNum\":1,\n\"buyNum\":53,\n\"sellTotalNum\":0,\n\"sellDisputeNum\":0,\n\"sellFaultNum\":0,\n\"sellMultiWinsNum\":0,\n\"sellNum\":0\n},\n\"lastPayMethod\":2,\n\"memeberTime\":1509761515,\n\"scene\":\"yingyongbao\",\n\"sellerLevelScores\":0\n}"}

hadoop dfs -cat /usr/hive/warehouse/tmp.db/xx_toutiao_userinfo/000000_0 是:

1701031840lqny7q01\N
1701031812bgpk3b20500中國江蘇蘇州\N
1701031741kmuv7w00中國甘肅慶陽\N
1701031708gi9wjc00中國甘肅慶陽\N
1701031631tnz19z-1001中國湖北黃岡\N
17010315549re5j900中國江西撫州\N
150724123819CeDq-360中國北京朝陽\N
1701031517umzfaa0\N中國\N
1701031457ctwb4d00中國浙江紹興\N
15050520171xCOXi00中國安徽\N
1701031420um02x700中國\N
1701031341h8zete01中國遼寧丹東\N
1701031302gv9dfh10860中國重慶九龍坡{
"bail":0,
"balance":651,
"balanceFrozen":0,
"language":"zh_CN",
 "creditsJson":
{
"buyTotalNum":54,
"buyReturnedNum":0,
"buyFaultNum":1,
"buyNum":53,
"sellTotalNum":0,
"sellDisputeNum":0,
"sellFaultNum":0,
"sellMultiWinsNum":0,
"sellNum":0
},
"lastPayMethod":2,
"memeberTime":1509761515,
"scene":"yingyongbao",
"sellerLevelScores":0
}
1701031228bm817y01中國廣東廣州\N
1701031150mzaaf9331中國廣東廣州\N
1701031115wtcxaa00中國廣東廣州\N
17010310409gvku9181中國浙江杭州\N
1701030957xganje01中國浙江衢州\N
1507241157p8zh1X01中國浙江台州\N
170103085827pewa950中國甘肅慶陽\N
1701030738lzwdwv00中國湖北孝感\N
1701030140higkns00中國福建福州\N
1701022335n4drrm0\N\N
1701022252j8hhr501巴林\N
17010222190gxh1h0\N中國\N

另外,使用JsonSerDe,可直接定義Hive複雜數據類型,方便使用。
如:

-- External table with three string-to-string map columns, read through JsonSerDe from text files.
CREATE EXTERNAL TABLE IF NOT EXISTS dw_stg.student (
    student MAP<STRING, STRING> COMMENT "學生信息",
    class   MAP<STRING, STRING> COMMENT "課程信息",
    teacher MAP<STRING, STRING> COMMENT "授課老師信息"
)
COMMENT "學生課程信息"
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
STORED AS TEXTFILE;

查詢:

-- Look up one student by name and read individual keys out of the map columns.
-- The table is qualified with dw_stg to match the CREATE statement in this file, so the query no longer depends on the current database.
SELECT
    student['name'] AS stuName,
    class['book']   AS cls_book,
    class['score']  AS cls_score,
    teacher['name'] AS tech_name
FROM dw_stg.student
WHERE student['name'] = 'test4';

此時,數據文件裏可以直接存儲json數據:

{"student":{"name":"king","age":11,"sex":"M"},"class":{"book":"語文","level":2,"score":80},"teacher":{"name":"t1","class":"語文"}}
{"student":{"name":"wang","age":12,"sex":"M"},"class":{"book":"語文","level":2,"score":80},"teacher":{"name":"t1","class":"語文"}}
{"student":{"name":"test","age":13,"sex":"M"},"class":{"book":"語文","level":2,"score":80},"teacher":{"name":"t1","class":"語文"}}
{"student":{"name":"test2","age":14,"sex":"M"},"class":{"book":"語文","level":2,"score":80},"teacher":{"name":"t1","class":"語文"}}
{"student":{"name":"test3","age":15,"sex":"M"},"class":{"book":"語文","level":2,"score":80},"teacher":{"name":"t1","class":"語文"}}
{"student":{"name":"test4","age":16,"sex":"M"},"class":{"book":"語文","level":2,"score":80},"teacher":{"name":"t1","class":"語文"}}

具體可參考https://www.cnblogs.com/30go/p/8318542.html
 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章