1、create table as 方式生成json表及數據
sql腳本:
-- Rebuild the demo table from scratch so this script can be rerun.
DROP TABLE IF EXISTS tmp.xx_toutiao_userinfo;

-- CTAS without any ROW FORMAT clause, so the table gets the default storage
-- settings of the Hive installation.
CREATE TABLE tmp.xx_toutiao_userinfo AS
WITH user_basic AS (
    -- Flatten the ODS user rows with dt = 2020-01-01. Plain columns are
    -- renamed, the remaining fields are extracted from the snsjson and
    -- creditsjson JSON strings with get_json_object.
    SELECT
        src.uri AS id,
        src.buyerlevelscores AS membership_level,
        get_json_object(src.snsjson, '$.sex') AS gender,
        get_json_object(src.snsjson, '$.country') AS country,
        get_json_object(src.snsjson, '$.province') AS province,
        get_json_object(src.snsjson, '$.city') AS city,
        src.bail AS extra_info__bail,
        src.balance AS extra_info__balance,
        src.balancefrozen AS extra_info__balanceFrozen,
        get_json_object(src.snsjson, '$.language') AS extra_info__language,
        get_json_object(src.creditsjson, '$.buyTotalNum') AS extra_info__creditsJson__buyTotalNum,
        get_json_object(src.creditsjson, '$.buyReturnedNum') AS extra_info__creditsJson__buyReturnedNum,
        get_json_object(src.creditsjson, '$.buyFaultNum') AS extra_info__creditsJson__buyFaultNum,
        get_json_object(src.creditsjson, '$.buyNum') AS extra_info__creditsJson__buyNum,
        get_json_object(src.creditsjson, '$.sellTotalNum') AS extra_info__creditsJson__sellTotalNum,
        get_json_object(src.creditsjson, '$.sellDisputeNum') AS extra_info__creditsJson__sellDisputeNum,
        get_json_object(src.creditsjson, '$.sellFaultNum') AS extra_info__creditsJson__sellFaultNum,
        get_json_object(src.creditsjson, '$.sellMultiWinsNum') AS extra_info__creditsJson__sellMultiWinsNum,
        get_json_object(src.creditsjson, '$.sellNum') AS extra_info__creditsJson__sellNum,
        src.lastpaymethod AS extra_info__lastPayMethod,
        src.membertime AS extra_info__memeberTime,
        src.scene AS extra_info__scene,
        src.sellerlevelscores AS extra_info__sellerLevelScores
    FROM ods.ods_userinfo_userinfo_full_1d AS src
    WHERE src.dt = '2020-01-01'
)
SELECT
    ub.id,
    ub.membership_level,
    -- Recode sex: 1 becomes 0, 2 becomes 1, anything else becomes NULL.
    CASE
        WHEN ub.gender = 1 THEN 0
        WHEN ub.gender = 2 THEN 1
        ELSE NULL
    END AS gender,
    ub.country,
    ub.province,
    ub.city,
    -- extra_info is a hand-assembled JSON string. The line breaks inside the
    -- quoted pieces are part of the stored value, so the continuation lines
    -- below must stay unindented.
    -- NOTE(review): Hive concat returns NULL when any argument is NULL, which
    -- is presumably why most sample rows carry a NULL extra_info - confirm.
    concat("\{
\"bail\":", ub.extra_info__bail, ",
\"balance\":", ub.extra_info__balance, ",
\"balanceFrozen\":", ub.extra_info__balanceFrozen, ",
\"language\":\"", ub.extra_info__language, "\",
\"creditsJson\":
{
\"buyTotalNum\":", ub.extra_info__creditsJson__buyTotalNum, ",
\"buyReturnedNum\":", ub.extra_info__creditsJson__buyReturnedNum, ",
\"buyFaultNum\":", ub.extra_info__creditsJson__buyFaultNum, ",
\"buyNum\":", ub.extra_info__creditsJson__buyNum, ",
\"sellTotalNum\":", ub.extra_info__creditsJson__sellTotalNum, ",
\"sellDisputeNum\":", ub.extra_info__creditsJson__sellDisputeNum, ",
\"sellFaultNum\":", ub.extra_info__creditsJson__sellFaultNum, ",
\"sellMultiWinsNum\":", ub.extra_info__creditsJson__sellMultiWinsNum, ",
\"sellNum\":", ub.extra_info__creditsJson__sellNum, "
},
\"lastPayMethod\":", ub.extra_info__lastPayMethod, ",
\"memeberTime\":", ub.extra_info__memeberTime, ",
\"scene\":\"", ub.extra_info__scene, "\",
\"sellerLevelScores\":", ub.extra_info__sellerLevelScores, "
\}") AS extra_info
FROM user_basic AS ub
-- No ORDER BY, so which 1000 rows are kept is not defined.
LIMIT 1000;
查詢:
-- Read back every row of the CTAS table to inspect the fetched row count.
SELECT *
FROM tmp.xx_toutiao_userinfo;
Time taken: 0.109 seconds, Fetched: 1861 row(s)
生成的表結構查看:
hive> show create table tmp.xx_toutiao_userinfo;
OK
CREATE TABLE tmp.xx_toutiao_userinfo(
id string,
membership_level int,
gender int,
country string,
province string,
city string,
extra_info string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
'hdfs://HDFS41368/usr/hive/warehouse/tmp.db/xx_toutiao_userinfo'
TBLPROPERTIES (
'transient_lastDdlTime'='1577963203')
Time taken: 0.072 seconds, Fetched: 18 row(s)
2、正常建表指定[ROW FORMAT]
建表語句:
-- Start clean so the explicit DDL below always takes effect.
DROP TABLE IF EXISTS tmp.xx_toutiao_userinfo_json;

-- Seven-column user table whose rows are serialized and deserialized by the
-- HCatalog JsonSerDe, stored as plain text files.
CREATE TABLE tmp.xx_toutiao_userinfo_json (
    id               STRING,
    membership_level INT,
    gender           INT,
    country          STRING,
    province         STRING,
    city             STRING,
    extra_info       STRING
)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
STORED AS TEXTFILE;
插入數據:
-- Load tmp.xx_toutiao_userinfo_json, replacing whatever it currently holds.
WITH user_basic AS (
    -- Flatten the ODS user rows with dt = 2020-01-01. Plain columns are
    -- renamed, the remaining fields are extracted from the snsjson and
    -- creditsjson JSON strings with get_json_object.
    SELECT
        src.uri AS id,
        src.buyerlevelscores AS membership_level,
        get_json_object(src.snsjson, '$.sex') AS gender,
        get_json_object(src.snsjson, '$.country') AS country,
        get_json_object(src.snsjson, '$.province') AS province,
        get_json_object(src.snsjson, '$.city') AS city,
        src.bail AS extra_info__bail,
        src.balance AS extra_info__balance,
        src.balancefrozen AS extra_info__balanceFrozen,
        get_json_object(src.snsjson, '$.language') AS extra_info__language,
        get_json_object(src.creditsjson, '$.buyTotalNum') AS extra_info__creditsJson__buyTotalNum,
        get_json_object(src.creditsjson, '$.buyReturnedNum') AS extra_info__creditsJson__buyReturnedNum,
        get_json_object(src.creditsjson, '$.buyFaultNum') AS extra_info__creditsJson__buyFaultNum,
        get_json_object(src.creditsjson, '$.buyNum') AS extra_info__creditsJson__buyNum,
        get_json_object(src.creditsjson, '$.sellTotalNum') AS extra_info__creditsJson__sellTotalNum,
        get_json_object(src.creditsjson, '$.sellDisputeNum') AS extra_info__creditsJson__sellDisputeNum,
        get_json_object(src.creditsjson, '$.sellFaultNum') AS extra_info__creditsJson__sellFaultNum,
        get_json_object(src.creditsjson, '$.sellMultiWinsNum') AS extra_info__creditsJson__sellMultiWinsNum,
        get_json_object(src.creditsjson, '$.sellNum') AS extra_info__creditsJson__sellNum,
        src.lastpaymethod AS extra_info__lastPayMethod,
        src.membertime AS extra_info__memeberTime,
        src.scene AS extra_info__scene,
        src.sellerlevelscores AS extra_info__sellerLevelScores
    FROM ods.ods_userinfo_userinfo_full_1d AS src
    WHERE src.dt = '2020-01-01'
)
INSERT OVERWRITE TABLE tmp.xx_toutiao_userinfo_json
-- Columns are matched to the target table by position, so keep this order:
-- id, membership_level, gender, country, province, city, extra_info.
SELECT
    ub.id,
    ub.membership_level,
    -- Recode sex: 1 becomes 0, 2 becomes 1, anything else becomes NULL.
    CASE
        WHEN ub.gender = 1 THEN 0
        WHEN ub.gender = 2 THEN 1
        ELSE NULL
    END AS gender,
    ub.country,
    ub.province,
    ub.city,
    -- extra_info is a hand-assembled JSON string. The line breaks inside the
    -- quoted pieces are part of the stored value, so the continuation lines
    -- below must stay unindented.
    -- NOTE(review): Hive concat returns NULL when any argument is NULL, which
    -- is presumably why most sample rows carry a NULL extra_info - confirm.
    concat("\{
\"bail\":", ub.extra_info__bail, ",
\"balance\":", ub.extra_info__balance, ",
\"balanceFrozen\":", ub.extra_info__balanceFrozen, ",
\"language\":\"", ub.extra_info__language, "\",
\"creditsJson\":
{
\"buyTotalNum\":", ub.extra_info__creditsJson__buyTotalNum, ",
\"buyReturnedNum\":", ub.extra_info__creditsJson__buyReturnedNum, ",
\"buyFaultNum\":", ub.extra_info__creditsJson__buyFaultNum, ",
\"buyNum\":", ub.extra_info__creditsJson__buyNum, ",
\"sellTotalNum\":", ub.extra_info__creditsJson__sellTotalNum, ",
\"sellDisputeNum\":", ub.extra_info__creditsJson__sellDisputeNum, ",
\"sellFaultNum\":", ub.extra_info__creditsJson__sellFaultNum, ",
\"sellMultiWinsNum\":", ub.extra_info__creditsJson__sellMultiWinsNum, ",
\"sellNum\":", ub.extra_info__creditsJson__sellNum, "
},
\"lastPayMethod\":", ub.extra_info__lastPayMethod, ",
\"memeberTime\":", ub.extra_info__memeberTime, ",
\"scene\":\"", ub.extra_info__scene, "\",
\"sellerLevelScores\":", ub.extra_info__sellerLevelScores, "
\}") AS extra_info
FROM user_basic AS ub
-- No ORDER BY, so which 1000 rows are kept is not defined.
LIMIT 1000;
查詢:
-- Read back every row of the JsonSerDe table to inspect the fetched row count.
SELECT *
FROM tmp.xx_toutiao_userinfo_json;
Time taken: 0.09 seconds, Fetched: 1000 row(s)
3、分析說明
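create table as 默認的 ROW FORMAT SERDE 是 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe',數據以文本格式存儲,並以換行符作為記錄分隔符;而 extra_info 中拼接出來的 JSON 字符串本身帶有換行,寫入後一條記錄會被拆成多行,所以雖然寫入時使用了 limit 1000,查詢時卻返回了 1861 行。建表時指定 ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe' 的方式,寫入時會把字符串中的換行轉義為 \n,每條記錄仍然只佔一行,查詢結果為 1000 行,避免了 JSON 數據換行所造成的影響。所以,兩者查詢的結果不一致。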
SerDe 是 Serializer/Deserializer 的簡稱,用於數據的序列化和反序列化。
SerDe包括內置類型:
Avro (Hive 0.9.1 and later)
ORC (Hive 0.11 and later)
RegEx
Thrift
Parquet (Hive 0.13 and later)
CSV (Hive 0.14 and later)
JsonSerDe (Hive 0.12 and later)。
同時,hadoop dfs -cat /usr/hive/warehouse/tmp.db/xx_toutiao_userinfo_json/000000_0 的輸出是:
{"id":"1701031708gi9wjc","membership_level":0,"gender":0,"country":"中國","province":"甘肅","city":"慶陽","extra_info":null}
{"id":"1701031631tnz19z","membership_level":-100,"gender":1,"country":"中國","province":"湖北","city":"黃岡","extra_info":null}
{"id":"17010315549re5j9","membership_level":0,"gender":0,"country":"中國","province":"江西","city":"撫州","extra_info":null}
{"id":"150724123819CeDq","membership_level":-36,"gender":0,"country":"中國","province":"北京","city":"朝陽","extra_info":null}
{"id":"1701031517umzfaa","membership_level":0,"gender":null,"country":"中國","province":"","city":"","extra_info":null}
{"id":"1701031457ctwb4d","membership_level":0,"gender":0,"country":"中國","province":"浙江","city":"紹興","extra_info":null}
{"id":"15050520171xCOXi","membership_level":0,"gender":0,"country":"中國","province":"安徽","city":"","extra_info":null}
{"id":"1701031420um02x7","membership_level":0,"gender":0,"country":"中國","province":"","city":"","extra_info":null}
{"id":"1701031341h8zete","membership_level":0,"gender":1,"country":"中國","province":"遼寧","city":"丹東","extra_info":null}
{"id":"1701031302gv9dfh","membership_level":1086,"gender":0,"country":"中國","province":"重慶","city":"九龍坡","extra_info":"{\n\"bail\":0,\n\"balance\":651,\n\"balanceFrozen\":0,\n\"language\":\"zh_CN\",\n \"creditsJson\":\n{\n\"buyTotalNum\":54,\n\"buyReturnedNum\":0,\n\"buyFaultNum\":1,\n\"buyNum\":53,\n\"sellTotalNum\":0,\n\"sellDisputeNum\":0,\n\"sellFaultNum\":0,\n\"sellMultiWinsNum\":0,\n\"sellNum\":0\n},\n\"lastPayMethod\":2,\n\"memeberTime\":1509761515,\n\"scene\":\"yingyongbao\",\n\"sellerLevelScores\":0\n}"}
而hadoop dfs -cat /usr/hive/warehouse/tmp.db/xx_toutiao_userinfo/000000_0 ,是:
1701031840lqny7q01\N
1701031812bgpk3b20500中國江蘇蘇州\N
1701031741kmuv7w00中國甘肅慶陽\N
1701031708gi9wjc00中國甘肅慶陽\N
1701031631tnz19z-1001中國湖北黃岡\N
17010315549re5j900中國江西撫州\N
150724123819CeDq-360中國北京朝陽\N
1701031517umzfaa0\N中國\N
1701031457ctwb4d00中國浙江紹興\N
15050520171xCOXi00中國安徽\N
1701031420um02x700中國\N
1701031341h8zete01中國遼寧丹東\N
1701031302gv9dfh10860中國重慶九龍坡{
"bail":0,
"balance":651,
"balanceFrozen":0,
"language":"zh_CN",
"creditsJson":
{
"buyTotalNum":54,
"buyReturnedNum":0,
"buyFaultNum":1,
"buyNum":53,
"sellTotalNum":0,
"sellDisputeNum":0,
"sellFaultNum":0,
"sellMultiWinsNum":0,
"sellNum":0
},
"lastPayMethod":2,
"memeberTime":1509761515,
"scene":"yingyongbao",
"sellerLevelScores":0
}
1701031228bm817y01中國廣東廣州\N
1701031150mzaaf9331中國廣東廣州\N
1701031115wtcxaa00中國廣東廣州\N
17010310409gvku9181中國浙江杭州\N
1701030957xganje01中國浙江衢州\N
1507241157p8zh1X01中國浙江台州\N
170103085827pewa950中國甘肅慶陽\N
1701030738lzwdwv00中國湖北孝感\N
1701030140higkns00中國福建福州\N
1701022335n4drrm0\N\N
1701022252j8hhr501巴林\N
17010222190gxh1h0\N中國\N
另外,使用JsonSerDe,可直接定義Hive複雜數據類型,方便使用。
如:
-- External table over JSON text files, read through the HCatalog JsonSerDe.
-- Each of the three columns is declared as a string-to-string map.
CREATE EXTERNAL TABLE IF NOT EXISTS dw_stg.student (
    student MAP<STRING, STRING> COMMENT "學生信息",
    class   MAP<STRING, STRING> COMMENT "課程信息",
    teacher MAP<STRING, STRING> COMMENT "授課老師信息"
)
COMMENT "學生課程信息"
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
STORED AS TEXTFILE;
查詢:
-- Look up one student by name and read individual keys out of the map columns.
-- The table is referenced as dw_stg.student, matching the database it is
-- created in, so the query does not depend on the current session database.
-- The alias also keeps the student column distinct from the table name.
SELECT
    s.student['name'] AS stuName,
    s.class['book'] AS cls_book,
    s.class['score'] AS cls_score,
    s.teacher['name'] AS tech_name
FROM dw_stg.student AS s
WHERE s.student['name'] = 'test4';
此時,數據文件裏可以直接存儲json數據:
{"student":{"name":"king","age":11,"sex":"M"},"class":{"book":"語文","level":2,"score":80},"teacher":{"name":"t1","class":"語文"}}
{"student":{"name":"wang","age":12,"sex":"M"},"class":{"book":"語文","level":2,"score":80},"teacher":{"name":"t1","class":"語文"}}
{"student":{"name":"test","age":13,"sex":"M"},"class":{"book":"語文","level":2,"score":80},"teacher":{"name":"t1","class":"語文"}}
{"student":{"name":"test2","age":14,"sex":"M"},"class":{"book":"語文","level":2,"score":80},"teacher":{"name":"t1","class":"語文"}}
{"student":{"name":"test3","age":15,"sex":"M"},"class":{"book":"語文","level":2,"score":80},"teacher":{"name":"t1","class":"語文"}}
{"student":{"name":"test4","age":16,"sex":"M"},"class":{"book":"語文","level":2,"score":80},"teacher":{"name":"t1","class":"語文"}}
具體可參考:https://www.cnblogs.com/30go/p/8318542.html