一、需求
監控 Azkaban-web 和 Azkaban-exec 的進程狀態以及 Azkaban 任務的執行狀態,並將這些狀態同步到 SQLServer 數據庫;同步任務每 5 分鐘執行一次,發現異常即告警。
1.1.Azkaban 任務狀態
30--------正在運行
50--------運行成功
60--------kill任務
70--------運行失敗
1.2.存儲 Azkaban任務狀態的表
-- List Azkaban flow executions that have finished: 50 = succeeded, 70 = failed.
-- start_time / end_time are divided by 1000 before FROM_UNIXTIME
-- (presumably stored as epoch milliseconds -- confirm against the schema).
SELECT
    exec_id,
    flow_id,
    status,
    FROM_UNIXTIME(start_time / 1000) AS start_time,
    FROM_UNIXTIME(end_time / 1000) AS end_time
FROM execution_flows
WHERE status IN (50, 70);
二、腳本編寫
2.1.編寫table.ini 的數據庫配置文件
xxx 代表不同的客戶,每個客戶對應各自的數據庫;腳本可以讀取多個數據庫的配置,並把數據分別寫入不同的 SQLServer 數據庫。
[pcsprd@client ~]$ cat /hadoop/datadir/script/hadoop/table.ini
[xxx_CONNECT]
url=xxx
port=1433
username=PCS.Support
password=321@win#
dbname=HDP_TEST
customer=xxx_
2.2.任務狀態寫入SQLServer的shell
# Trace every command as it is executed (same as `set -x`).
# NOTE(review): the trace prints expanded command lines, so the passwords
# used on the mysql/sqlcmd command lines end up in the trace output.
set -o xtrace

# MySQL connection settings, consumed by the mysql command built later on.
HOSTNAME='xxx'
PORT='3306'
USER='root'
PASSWD='winner@001'
DBNAME='azkaban'
# ReadConnect FILE SECTION KEY
# Look up KEY inside the [SECTION] block of the INI file FILE and store the
# result in the global variable ReadINI (empty when the key is not found).
#   $1 - path of the INI file
#   $2 - section name without the surrounding brackets
#   $3 - key name
function ReadConnect(){
    ReadINI=$(awk -F '=' -v section="$2" -v key="$3" '
        # Every line that starts with "[" is a section header. The flag is
        # recomputed on each header, so a key that is missing from the
        # requested section is never taken from a later section.
        /^[ \t]*\[/ { in_section = (index($0, "[" section "]") > 0); next }
        # Print everything after the first "=" so that values which
        # themselves contain "=" are returned in full.
        in_section && $1 == key { print substr($0, index($0, "=") + 1); exit }
    ' "$1")
}
# Customer prefix; together with the suffix CONNECT it names the section of
# table.ini that holds this customer's SQL Server settings.
batchCustomer=xxx_
table_ini=/hadoop/datadir/script/hadoop/table.ini
connect_section="${batchCustomer}CONNECT"

# ReadConnect leaves each looked-up value in ReadINI; copy it out right away.
# NOTE(review): `port` is read here but is not part of either command string
# built below.
ReadConnect "$table_ini" "$connect_section" url;      server=$ReadINI
ReadConnect "$table_ini" "$connect_section" port;     port=$ReadINI
ReadConnect "$table_ini" "$connect_section" dbname;   database=$ReadINI
ReadConnect "$table_ini" "$connect_section" username; user=$ReadINI
ReadConnect "$table_ini" "$connect_section" password; paw=$ReadINI

# File that receives the rows exported from MySQL.
azkaban_exec_tmp_file=/hadoop/datadir/temp/monitor/exec_tmp_file.txt

# Command prefixes; callers append the SQL text as the final argument.
sqlserver_cmd="/opt/mssql-tools/bin/sqlcmd -S $server -U $user -P $paw -d ${database} -Q "
mysql_cmd="mysql -h${HOSTNAME} -P${PORT} -u${USER} -p${PASSWD} ${DBNAME} -e"
# ReportHeartbeat TASK_ID PROCESS_COUNT
# Insert one heartbeat row into task_monitor for the given task id.
# Status 80 means the process is running, 90 means the process is down.
#   $1 - value stored in the taskId column
#   $2 - number of matching processes found (treated as 0 when empty)
function ReportHeartbeat(){
    local hb_status=80
    if [ "${2:-0}" -eq 0 ];then
        hb_status=90
    fi
    ${sqlserver_cmd} "INSERT into task_monitor (flowId,taskId,status,startTime,endTime) VALUES(DATEDIFF(S,'1970-01-01 00:00:00', GETDATE()),'$1',${hb_status},GETDATE(),GETDATE())"
}

# Monitor the Azkaban web server process.
# The count used to be read from a variable that was never assigned, which
# made the test fail and the heartbeat always report 80.
# NOTE(review): the pattern "azkaban-web" mirrors the "azkaban-exe" pattern
# used for the executor below -- confirm it matches the real command line.
azwebCount=`ps -ef |grep azkaban-web |grep -v "grep" |wc -l`
ReportHeartbeat 'azkban-web-heartbeat' "$azwebCount"

# Monitor the Azkaban executor process.
azexeCount=`ps -ef |grep azkaban-exe |grep -v "grep" |wc -l`
ReportHeartbeat 'azkban-exec-heartbeat' "$azexeCount"
# Sync finished Azkaban flows (status 50 = succeeded, 70 = failed) from MySQL
# into SQL Server: export to a file, bulk-load it into task_monitor_tmp, move
# the rows into task_monitor, then advance the watermark in task_lastTime.

# Upper bound of this sync window, read from the MySQL clock before the
# export. It is stored as the new watermark afterwards, so a flow that
# finishes while the load is running falls into the next window instead of
# being skipped. `tail -n 1` drops the column header printed by mysql.
sync_cutoff=`${mysql_cmd} "select NOW()" | tail -n 1`

# Select the flows that FINISHED inside the window (watermark, cutoff].
# Filtering on end_time rather than start_time matters: a flow that started
# before the previous sync and finished after it would otherwise never match.
select_exec_sql="select exec_id,flow_id,status,FROM_UNIXTIME(start_time/1000) as start_time,FROM_UNIXTIME(end_time/1000) as end_time from execution_flows where (FROM_UNIXTIME(end_time/1000)>(select task_lastTime from task_lastTime where id=1)) and (FROM_UNIXTIME(end_time/1000)<='${sync_cutoff}') and (status = 70 or status = 50)
into outfile \"${azkaban_exec_tmp_file}\" fields terminated by \",\" ;"
# Move the rows from the staging table into the final table.
task_move_sql="insert into ${database}.[dbo].[task_monitor] (flowId,taskId,status,startTime,endTime) select flowId,taskId,status,startTime,endTime from ${database}.[dbo].[task_monitor_tmp];"
update_task_lastTime_sql="UPDATE task_lastTime SET task_lastTime = '${sync_cutoff}' WHERE id=1;"

if [ -z "${sync_cutoff}" ];then
    # Without a cutoff the window is undefined; leave the watermark untouched.
    echo "cannot read the current time from MySQL, sync skipped"
else
    # INTO OUTFILE refuses to overwrite an existing file, so remove the old one.
    rm -f "${azkaban_exec_tmp_file}"
    ${mysql_cmd} "${select_exec_sql}"
    if [ -f "${azkaban_exec_tmp_file}" ];then
        # Advance the watermark only after every step reported success, so a
        # failed load is retried by the next run instead of being lost.
        # NOTE(review): sqlcmd reports SQL errors through its exit status only
        # when -b is set; sqlserver_cmd is built elsewhere -- confirm.
        ${sqlserver_cmd} "truncate table ${database}.[dbo].[task_monitor_tmp]" \
        && /opt/mssql-tools/bin/bcp ${database}.dbo.task_monitor_tmp in "${azkaban_exec_tmp_file}" -S${server} -U${user} -P${paw} -c -t, -r'\n' -b 1000 \
        && ${sqlserver_cmd} "${task_move_sql}" \
        && ${mysql_cmd} "${update_task_lastTime_sql}"
    else
        echo file ${azkaban_exec_tmp_file} not exist!
    fi
fi