Incrementally ingesting data into a Hive partitioned table, running ETL, and syncing the results to SqlServer with Sqoop

I recently ran into the following data-processing requirement at work:

1. A table in the client's SqlServer is ingested into Hive every 5 minutes, with each 5-minute batch landing in its own partition.

2. Write an HQL script that reads the partitioned table, transforms the data, and writes it to a result partitioned table in Hive.

3. Sync the result data to the target table in the client's SqlServer.

The 5-minute ingestion into Hive (step 1) is handled by colleagues on another team, so it is not described in detail here.

The script data.sh for steps 2 and 3 is shown below; once it is in place it can be run every five minutes by a job scheduler (for example cron, as sketched after the script).

#!/bin/bash
#Note: the system clock of the scheduling server must be correct. It is recommended to start the script only after the source table's partition data has been ingested successfully.

echo 'job start...'
nowtime=`date -d today "+%Y-%m-%d-%H"`
echo 'nowtime='$nowtime

#grepCondition="2020-05-14-22-25"
grepCondition=$nowtime

partition="set hive.cli.print.header=flase;show partitions test.hn_csgcjs_spgl_xmjbxxb;"
v_partition=`hive -e  "$partition" | grep data_dt="${grepCondition}" | sort | tail -n 1` #grep matches all 5-minute partitions of the current hour; sort | tail -n 1 keeps the latest one

echo $v_partition
#extract the latest partition value of the source table
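#e.g. v_partition='data_dt=2020-05-14-22-25': skip the 8-character 'data_dt=' prefix and keep the 16-character partition value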
partition_nums=${v_partition:8:16}

echo 'partition_nums='$partition_nums

#=================== valid data logic start ==========================================
echo 'correct data job start....'
#create the partitioned table used for the Sqoop export
HIVE_SQL_CREATE_TABLE="CREATE TABLE IF NOT EXISTS test.hn_csgcjs_spgl_xmjbxxb_transfer_sqoop_p2 (
    dfsjzj string,
    xzqhdm string,
    xmdm string,
    xmmc string,
    gcdm string,
    gcfw string,
    qjdgcdm string,
    xmtzly INT,
    tdhqfs INT,
    tdsfdsjfa INT,
    sfwcqypg INT,
    splclx INT,
    lxlx INT,
    gcfl INT,
    jsxz INT,
    xmzjsx INT,
    gbhydmfbnd string,
    gbhy string,
    nkgsj date,
    njcsj date,
    xmsfwqbj INT,
    xmwqbjsj TIMESTAMP,
    ztze DOUBLE,
    jsddxzqh string,
    jsdd string,
    xmjsddx DOUBLE,
    xmjsddy DOUBLE,
    jsgmjnr string,
    ydmj DOUBLE,
    jzmj DOUBLE,
    sbsj TIMESTAMP,
    splcbm string,
    splbbh DOUBLE,
    sjyxbs INT,
    sjwxyy string,
    SJSCZT INT,
    SBYY string,
    ctime TIMESTAMP,
    utime TIMESTAMP,
    SYNC_TIME TIMESTAMP,
    SystemSource string,
    First_Sync_time TIMESTAMP
) partitioned by(data_dt string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001';"


#insert data into the partition
HIVE_SQL_INSERT_TABLE="insert overwrite TABLE test.hn_csgcjs_spgl_xmjbxxb_transfer_sqoop_p2 partition(data_dt='$partition_nums')
SELECT
    DFSJZJ,
    CASE
WHEN m.country_xzqhdm IS NOT NULL THEN
    m.country_xzqhdm
ELSE
    a.XZQHDM
END AS XZQHDM,
 XMDM,
 XMMC,
 GCDM,
 GCFW,
 QJDGCDM,
 XMTZLY,
 TDHQFS,
 TDSFDSJFA,
 SFWCQYPG,
 SPLCLX,
 LXLX,
 GCFL,
 JSXZ,
 XMZJSX,
 GBHYDMFBND,
 CONCAT (
    CASE
    WHEN c.priv_code IS NULL THEN
        b.priv_code
    ELSE
        c.priv_code
    END,
    CAST (
        SUBSTRING (
            regexp_replace (
                CASE
                WHEN c.priv_code IS NULL THEN
                    b.priv_code
                ELSE
                    c.priv_code
                END,
                'X',
                ''
            ),
            2,
            6
        ) AS VARCHAR (6)
    )
) AS GBHY,
 NKGSJ,
 NJCSJ,
 XMSFWQBJ,
 XMWQBJSJ,
 ZTZE,
 JSDDXZQH,
 JSDD,
 XMJSDDX,
 XMJSDDY,
 JSGMJNR,
 YDMJ,
 JZMJ,
 SBSJ,
 SPLCBM,
 SPLBBH,
 SJYXBS,
 SJWXYY,
 SJSCZT,
 '' AS SBYY,
 CTIME,
 UTIME,
 from_unixtime(unix_timestamp()) AS SYNC_TIME,
 '' AS SystemSource,
 from_unixtime(unix_timestamp()) AS First_Sync_time
FROM
    test.HN_CSGCJS_SPGL_XMJBXXB a 
LEFT JOIN test.hn_zwfw_xzqhdm_map m ON a.XZQHDM = m.city_xzqhdm
LEFT JOIN (
    SELECT
        a.hy_name,
        a.hy_code,
        a.old_code,
        regexp_replace (c.p_code, 'X', '') AS priv_code
    FROM
        test.hn_zwfw_hyfl a
    LEFT JOIN test.hn_zwfw_hyfl b ON a.p_code = b.hy_code
    LEFT JOIN test.hn_zwfw_hyfl c ON b.p_code = c.hy_code
) b ON a.GBHY = b.old_code
LEFT JOIN (
    SELECT
        a.hy_name,
        a.hy_code,
        a.old_code,
        regexp_replace (c.p_code, 'X', '') AS priv_code
    FROM
        test.hn_zwfw_hyfl a
    LEFT JOIN test.hn_zwfw_hyfl b ON a.p_code = b.hy_code
    LEFT JOIN test.hn_zwfw_hyfl c ON b.p_code = c.hy_code
) c ON a.GBHY = c.hy_code 
WHERE GBHYDMFBND='2017' and a.data_dt='$partition_nums';"

#run the HQL
hive -e "$HIVE_SQL_CREATE_TABLE"


echo 'insert correct data start....'
hive -e "$HIVE_SQL_INSERT_TABLE"
echo 'insert correct data end...'

echo 'correct data job sqoop start...'
sqoop export --connect 'jdbc:sqlserver://59.212.146.201:1433;DatabaseName=Exchange_GCJSXM_HNS_TEST' --username 'GCJSXM_GGFW' --password 'GCJSXM_GGFW!@#$20200511' --table 'SPGL_XMJBXXB' --columns 'DFSJZJ,XZQHDM,XMDM,XMMC,GCDM,GCFW,QJDGCDM,XMTZLY,TDHQFS,TDSFDSJFA,SFWCQYPG,SPLCLX,LXLX,GCFL,JSXZ,XMZJSX,GBHYDMFBND,GBHY,NKGSJ,NJCSJ,XMSFWQBJ,XMWQBJSJ,ZTZE,JSDDXZQH,JSDD,XMJSDDX,XMJSDDY,JSGMJNR,YDMJ,JZMJ,SBSJ,SPLCBM,SPLBBH,SJYXBS,SJWXYY,SJSCZT,SBYY,CTIME,UTIME,SYNC_TIME,SystemSource,First_Sync_time' --fields-terminated-by '\001' --export-dir 'hdfs://hainan/apps/hive/warehouse/test.db/hn_csgcjs_spgl_xmjbxxb_transfer_sqoop_p2/data_dt='$partition_nums'' --input-null-string '\\N'  --input-null-non-string '\\N' --hive-partition-key data_dt --hive-partition-value $partition_nums
echo 'correct data job sqoop end...'
echo 'correct data job end....'

#=================== invalid data logic start ==========================================
echo 'invalid data job start....'
#create the partitioned table used for the Sqoop export
HIVE_SQL_CREATE_TABLE_INVALID="CREATE TABLE IF NOT EXISTS test.hn_csgcjs_spgl_xmjbxxb_invalid_sqoop_p2 (
    dfsjzj string,
    xzqhdm string,
    xmdm string,
    xmmc string,
    gcdm string,
    gcfw string,
    qjdgcdm string,
    xmtzly INT,
    tdhqfs INT,
    tdsfdsjfa INT,
    sfwcqypg INT,
    splclx INT,
    lxlx INT,
    gcfl INT,
    jsxz INT,
    xmzjsx INT,
    gbhydmfbnd string,
    gbhy string,
    nkgsj date,
    njcsj date,
    xmsfwqbj INT,
    xmwqbjsj TIMESTAMP,
    ztze DOUBLE,
    jsddxzqh string,
    jsdd string,
    xmjsddx DOUBLE,
    xmjsddy DOUBLE,
    jsgmjnr string,
    ydmj DOUBLE,
    jzmj DOUBLE,
    sbsj TIMESTAMP,
    splcbm string,
    splbbh DOUBLE,
    sjyxbs INT,
    sjwxyy string,
    SJSCZT INT,
    SBYY string,
    ctime TIMESTAMP,
    utime TIMESTAMP,
    SYNC_TIME TIMESTAMP,
    SystemSource string,
    First_Sync_time TIMESTAMP
) partitioned by(data_dt string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001';"


#insert data into the partition
HIVE_SQL_INSERT_TABLE_INVALID="insert overwrite TABLE test.hn_csgcjs_spgl_xmjbxxb_invalid_sqoop_p2 partition(data_dt='$partition_nums')
SELECT
    DFSJZJ,
    CASE
WHEN m.country_xzqhdm IS NOT NULL THEN
    m.country_xzqhdm
ELSE
    a.XZQHDM
END AS XZQHDM,
 XMDM,
 XMMC,
 GCDM,
 GCFW,
 QJDGCDM,
 XMTZLY,
 TDHQFS,
 TDSFDSJFA,
 SFWCQYPG,
 SPLCLX,
 LXLX,
 GCFL,
 JSXZ,
 XMZJSX,
 GBHYDMFBND,
 CONCAT (
    CASE
    WHEN c.priv_code IS NULL THEN
        b.priv_code
    ELSE
        c.priv_code
    END,
    CAST (
        SUBSTRING (
            regexp_replace (
                CASE
                WHEN c.priv_code IS NULL THEN
                    b.priv_code
                ELSE
                    c.priv_code
                END,
                'X',
                ''
            ),
            2,
            6
        ) AS VARCHAR (6)
    )
) AS GBHY,
 NKGSJ,
 NJCSJ,
 XMSFWQBJ,
 XMWQBJSJ,
 ZTZE,
 JSDDXZQH,
 JSDD,
 XMJSDDX,
 XMJSDDY,
 JSGMJNR,
 YDMJ,
 JZMJ,
 SBSJ,
 SPLCBM,
 SPLBBH,
 SJYXBS,
 SJWXYY,
 SJSCZT,
 '' AS SBYY,
 CTIME,
 UTIME,
 from_unixtime(unix_timestamp()) AS SYNC_TIME,
 '' AS SystemSource,
 from_unixtime(unix_timestamp()) AS First_Sync_time
FROM
    test.HN_CSGCJS_SPGL_XMJBXXB a 
LEFT JOIN test.hn_zwfw_xzqhdm_map m ON a.XZQHDM = m.city_xzqhdm
LEFT JOIN (
    SELECT
        a.hy_name,
        a.hy_code,
        a.old_code,
        regexp_replace (c.p_code, 'X', '') AS priv_code
    FROM
        test.hn_zwfw_hyfl a
    LEFT JOIN test.hn_zwfw_hyfl b ON a.p_code = b.hy_code
    LEFT JOIN test.hn_zwfw_hyfl c ON b.p_code = c.hy_code
) b ON a.GBHY = b.old_code
LEFT JOIN (
    SELECT
        a.hy_name,
        a.hy_code,
        a.old_code,
        regexp_replace (c.p_code, 'X', '') AS priv_code
    FROM
        test.hn_zwfw_hyfl a
    LEFT JOIN test.hn_zwfw_hyfl b ON a.p_code = b.hy_code
    LEFT JOIN test.hn_zwfw_hyfl c ON b.p_code = c.hy_code
) c ON a.GBHY = c.hy_code 
WHERE (GBHYDMFBND<>'2017' or GBHYDMFBND is null) and a.data_dt='$partition_nums';"

#run the HQL
hive -e "$HIVE_SQL_CREATE_TABLE_INVALID"

echo 'insert invalid data start....'
hive -e "$HIVE_SQL_INSERT_TABLE_INVALID"
echo 'insert invalid data end...'

echo 'invalid data job sqoop start...'
#it has not yet been decided where the invalid data should be synced, so the export below is commented out
#sqoop export --connect 'jdbc:sqlserver://59.212.146.201:1433;DatabaseName=Exchange_GCJSXM_HNS_TEST' --username 'GCJSXM_GGFW' --password 'GCJSXM_GGFW!@#$20200511' --table 'SPGL_XMJBXXB' --columns 'DFSJZJ,XZQHDM,XMDM,XMMC,GCDM,GCFW,QJDGCDM,XMTZLY,TDHQFS,TDSFDSJFA,SFWCQYPG,SPLCLX,LXLX,GCFL,JSXZ,XMZJSX,GBHYDMFBND,GBHY,NKGSJ,NJCSJ,XMSFWQBJ,XMWQBJSJ,ZTZE,JSDDXZQH,JSDD,XMJSDDX,XMJSDDY,JSGMJNR,YDMJ,JZMJ,SBSJ,SPLCBM,SPLBBH,SJYXBS,SJWXYY,SJSCZT,SBYY,CTIME,UTIME,SYNC_TIME,SystemSource,First_Sync_time' --fields-terminated-by '\001' --export-dir 'hdfs://hainan/apps/hive/warehouse/test.db/hn_csgcjs_spgl_xmjbxxb_invalid_sqoop_p2/data_dt='$partition_nums'' --input-null-string '\\N'  --input-null-non-string '\\N' --hive-partition-key data_dt --hive-partition-value $partition_nums
echo 'invalid data job sqoop end...'
echo 'invalid data job end....'


echo 'job end...'
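
For reference, if plain cron were used as the scheduler mentioned above, an entry along these lines would run the script every five minutes. The paths are hypothetical placeholders, not taken from the original setup:

# run data.sh every 5 minutes and append stdout/stderr to a log file (paths are examples only)
*/5 * * * * /opt/jobs/data.sh >> /var/log/data_job.log 2>&1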

The main knowledge points involved here are creating Hive partitioned tables, inserting data into table partitions, and exporting data from Hive to a relational database with Sqoop.

The Sqoop export statement to SqlServer is as follows:

sqoop export --connect 'jdbc:sqlserver://59.212.146.201:1433;DatabaseName=Exchange_GCJSXM_HNS_TEST' --username 'GCJSXM_GGFW' --password 'GCJSXM_GGFW!@#$20200511' --table 'SPGL_XMJBXXB' --columns 'DFSJZJ,XZQHDM,XMDM,XMMC,GCDM,GCFW,QJDGCDM,XMTZLY,TDHQFS,TDSFDSJFA,SFWCQYPG,SPLCLX,LXLX,GCFL,JSXZ,XMZJSX,GBHYDMFBND,GBHY,NKGSJ,NJCSJ,XMSFWQBJ,XMWQBJSJ,ZTZE,JSDDXZQH,JSDD,XMJSDDX,XMJSDDY,JSGMJNR,YDMJ,JZMJ,SBSJ,SPLCBM,SPLBBH,SJYXBS,SJWXYY,SJSCZT,SBYY,CTIME,UTIME,SYNC_TIME,SystemSource,First_Sync_time' --fields-terminated-by '\001' --export-dir 'hdfs://hainan/apps/hive/warehouse/test.db/hn_csgcjs_spgl_xmjbxxb_transfer_sqoop_p2/data_dt='$partition_nums'' --input-null-string '\\N'  --input-null-non-string '\\N' --hive-partition-key data_dt --hive-partition-value $partition_nums

A short summary of points worth noting:

1. Script files edited on Windows must be converted to Unix format, otherwise they will not run on Linux (see the sketch after this list).

2. Handle NULL fields in the Hive table, otherwise the export fails with "can't parse input data '\N'"; try the --input-null-string '\\N' --input-null-non-string '\\N' options.

3. Sqoop export does not support --query; use --columns instead, and the listed columns must correspond one-to-one with the columns of the relational database table.

4. For an auto-increment primary key in the relational database, do not create that column in the Hive table and do not include it in the column list for the Sqoop export; the relational database generates it automatically on insert.

5. When exporting, pay attention to what the Hive table's field delimiter is (--fields-terminated-by '\001'); the data is split into columns by this delimiter, and --columns refers to the columns of the relational database table, not the Hive columns. Use show create table <table name> to view the table definition, including the Hive table's location on HDFS (see the example after this list).
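
For point 1, a minimal sketch of the conversion, assuming dos2unix is available (otherwise sed can strip the carriage returns):

# convert Windows (CRLF) line endings to Unix (LF) before running on Linux
dos2unix data.sh
# alternative without dos2unix: remove trailing \r characters in place
sed -i 's/\r$//' data.sh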
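
For point 5, the table definition (including the field delimiter and the HDFS LOCATION used as --export-dir) can be checked like this:

# print the full DDL of the result table, including ROW FORMAT and LOCATION
hive -e "show create table test.hn_csgcjs_spgl_xmjbxxb_transfer_sqoop_p2;"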


 
