# ---- Stage 1: clean the data (清洗數據) ----
#!/bin/bash
# Stage 1: run the NginxAccessETL MapReduce job over the raw nginx access
# logs, then load the cleaned output into the day-partitioned Hive table
# dwb_nginx_access_log.
#
# Arguments:
#   $1 - input path handed to the NginxAccessETL MapReduce job
#   $2 - partition value for `day` (used in the Hive LOAD below)
set -euo pipefail

if [[ $# -lt 2 ]]; then
  printf 'Usage: %s <etl-input-path> <day>\n' "${0##*/}" >&2
  exit 2
fi

# NOTE(review): "jar路徑" (jar path) and "類路徑" (class package) are
# placeholders — replace with the real jar path and package before running.
# With set -e, a failed MapReduce job now aborts before the Hive LOAD runs.
hadoop jar jar路徑 類路徑.NginxAccessETL "$1"

# LOAD DATA INPATH *moves* the MapReduce output file into the table's
# partition directory; $2 is expanded by the shell into the HiveQL string.
hive -e "
USE 數據庫;
CREATE TABLE IF NOT EXISTS dwb_nginx_access_log(
ip string,
time string,
path string
)
PARTITIONED BY (day string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t';
LOAD DATA INPATH '/nginx-access-etl/part-m-00000' INTO TABLE dwb_nginx_access_log PARTITION(day='$2');
"
# ---- Stage 2: analyze the data and export it to MySQL (分析數據並導入MySQL) ----
#!/bin/bash
# Stage 2: derive an hourly page-view (PV) aggregate from the cleaned nginx
# log table and export the result table to MySQL via Sqoop.
#
# Arguments:
#   $1 - day to process; must be a prefix of the `time` column values
#        (also written into the result rows as the `day` column)
set -euo pipefail

if [[ $# -lt 1 ]]; then
  printf 'Usage: %s <day>\n' "${0##*/}" >&2
  exit 2
fi

hive -e "
USE 數據庫;
DROP TABLE IF EXISTS dwd_hour_page_log;
CREATE TABLE IF NOT EXISTS dwd_hour_page_log(
ip string,
time string,
path string
)
PARTITIONED BY (hour string);
-- Dynamic partitioning: the partition column (hour) is taken from the last
-- SELECT column. Enable it explicitly and allow all partitions to be dynamic.
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
INSERT INTO TABLE dwd_hour_page_log PARTITION(hour)
SELECT ip
, time
, path
, substring(time, 12, 2) hour
FROM dwb_nginx_access_log
WHERE time LIKE '$1%';
-- Register the UDF jar for this session. (The original DELETE jar was
-- removed: a fresh 'hive -e' session has no resources added yet, so the
-- delete could only warn or fail.)
ADD jar /root/nginx_log_UDF.jar;
CREATE TEMPORARY FUNCTION getPathPage as 'com.zhiyou100.udf.path.GetPathPage';
CREATE TEMPORARY FUNCTION getPathId as 'com.zhiyou100.udf.path.GetPathId';
DROP TABLE IF EXISTS dt_hour_page;
CREATE TABLE dt_hour_page AS
SELECT ip
, path
, time
, getPathPage(path) page
, getPathId(path) id
, hour
, substring(time, 15, 2) minute
, substring(time, 18, 2) second
FROM dwd_hour_page_log;
DROP TABLE IF EXISTS r_hour_page_pv;
CREATE TABLE r_hour_page_pv ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' AS
SELECT page
, hour
, COUNT(*) pv
, '$1' day
FROM dt_hour_page
GROUP BY hour, page;
"

# Export the tab-delimited result table to MySQL. Sqoop's console password
# prompt is -P (single dash); '--P' is not a recognized option.
# NOTE(review): -P blocks waiting for input — for unattended runs prefer
# --password-file.
sqoop-export --connect jdbc:mysql://master:3306/nginx_access_log --username root --table hour_page_pv --export-dir /user/hive/warehouse/dw_nginx_access_log.db/r_hour_page_pv --input-fields-terminated-by '\t' -m 1 -P

# Reached only if hive and sqoop both succeeded (set -e aborts otherwise).
echo "數據導出成功,請去 MySQL 查看"