任務:需要在nagios中定義服務去檢測3個DC的狀態(1.主機狀態、2.consul cluster狀態、3.nomad cluster狀態),只要其中某個服務狀態失效,就觸發nagios eventhandler去改變dns服務器的鏈接文件,如上圖所示。
腳本:腳本中的服務器地址和實際的不同
腳本1:該腳本檢測3個DC的服務狀態,根據檢測到的結果會輸出目前dns應該鏈接的文件名,nagios上會顯示該文件名。如果dns沒有鏈接到正確的文件名,nagios就會報警並觸發event-handler。
#!/bin/bash
# Nagios check, part 1: probes host, consul-cluster and nomad-cluster status of
# the three DCs and fetches the file the DNS server's link currently points to.
#
# DC layout:
#   US: tier1001, tier1002
#   EU: tier2001, tier2002
#   AS: tier3001, tier3002
# Link file expected for each situation:
#   all DCs healthy -> axel-geo_us_eu_as.yml (default)
#   DC-EU down      -> axel-geo_us_as.yml
#   DC-AS down      -> axel-geo_us_eu.yml
#   DC-US down      -> axel-geo_eu_as.yml
PLUGIN_DIR=/usr/lib64/nagios/plugins

# Prints the second word of check_ping's output for host $1.
# "OK" is the only value the rest of the script treats as healthy.
ping_state() {
  "${PLUGIN_DIR}/check_ping" -4 -H "$1" -w 3000.0,80% -c 5000.0,100% -p 5 \
    | awk '{print $2}'
}

# Prints 0 when NRPE command $2 on host $1 exits successfully, 1 otherwise.
# The plugin's text output is discarded; only its exit status is used.
nrpe_flag() {
  if "${PLUGIN_DIR}/check_nrpe" -H "$1" -c "$2" &>/dev/null; then
    echo 0
  else
    echo 1
  fi
}

# Host status of every DC member, via the check_ping plugin shipped with Nagios.
PING_1001=$(ping_state tier1001)
PING_1002=$(ping_state tier1002)
PING_2001=$(ping_state tier2001)
PING_2002=$(ping_state tier2002)
PING_3001=$(ping_state tier3001)
PING_3002=$(ping_state tier3002)

# Consul cluster status per DC, checked by a script on the remote host via NRPE.
CON_US=$(nrpe_flag tier1001.axel.network check_consul_cluster)
CON_EU=$(nrpe_flag tier2001.axel.network check_consul_cluster)
CON_AS=$(nrpe_flag tier3001.axel.network check_consul_cluster)

# Nomad cluster status per DC, checked by a script on the remote host via NRPE.
NOM_US=$(nrpe_flag tier1001.axel.network check_nomad_cluster)
NOM_EU=$(nrpe_flag tier2001.axel.network check_nomad_cluster)
NOM_AS=$(nrpe_flag tier3001.axel.network check_nomad_cluster)

# Ask the DNS server which file its link currently points to. The exit status
# is checked as well as the text: when the query itself fails, check_nrpe may
# still print an error message, which must not be mistaken for a file name.
if ! FILE=$("${PLUGIN_DIR}/check_nrpe" -H romeo.zencoo.com -c check_pdns_link); then
  echo "check_pdns_link failed: ${FILE}"
  # same exit code as the empty-answer case below
  exit 1
fi
if [ -z "$FILE" ]; then
  echo '$FILE is NULL'
  exit 1
fi
# Collapses the individual probe results into one health flag per DC.
# A DC is healthy (flag 0) only when both of its hosts pinged "OK" and its
# consul and nomad flags are both 0; in every other case the flag is 1.
# Globals read:    PING_1001 PING_1002 PING_2001 PING_2002 PING_3001 PING_3002
#                  CON_US CON_EU CON_AS NOM_US NOM_EU NOM_AS
# Globals written: PING_US PING_EU PING_AS US EU AS
service() {
  # Host reachability: both hosts of a DC must have answered "OK".
  if [ "$PING_1001" == "OK" ] && [ "$PING_1002" == "OK" ]; then
    PING_US=0
  else
    PING_US=1
  fi
  # The second operand must be tier2002; testing tier2001 twice would hide a
  # tier2002 outage.
  if [ "$PING_2001" == "OK" ] && [ "$PING_2002" == "OK" ]; then
    PING_EU=0
  else
    PING_EU=1
  fi
  if [ "$PING_3001" == "OK" ] && [ "$PING_3002" == "OK" ]; then
    PING_AS=0
  else
    PING_AS=1
  fi

  # Overall DC status: ping, consul and nomad all have to be 0.
  if [ "$PING_US" -eq 0 ] && [ "$CON_US" -eq 0 ] && [ "$NOM_US" -eq 0 ]; then
    US=0
  else
    US=1
  fi
  if [ "$PING_EU" -eq 0 ] && [ "$CON_EU" -eq 0 ] && [ "$NOM_EU" -eq 0 ]; then
    EU=0
  else
    EU=1
  fi
  if [ "$PING_AS" -eq 0 ] && [ "$CON_AS" -eq 0 ] && [ "$NOM_AS" -eq 0 ]; then
    AS=0
  else
    AS=1
  fi
}
# Decide whether the DNS link has to be switched. The name of the file the
# link should point to is printed (Nagios shows it as the service output);
# exit status 2 makes Nagios raise the alert that triggers the event handler.
service

if [ "${US}" -eq 0 ] && [ "${EU}" -eq 0 ] && [ "${AS}" -eq 0 ]; then
  # Every DC is healthy: only the default file is acceptable.
  if [ "$FILE" == "axel-geo_us_eu_as.yml" ]; then
    echo "all-DC-is ok,->already axel-geo_us_eu_as.yml"
    exit 0
  fi
  echo "axel-geo_us_eu_as.yml"
  exit 2
fi

# At least one DC is down. DCs are examined in the order US, EU, AS; the first
# failed DC whose file is not linked yet determines the requested file.
# NOTE(review): no link file covers two DCs being down at once - confirm that
# this case is acceptable.
if [ "${US}" -eq 1 ] && [ "$FILE" != "axel-geo_eu_as.yml" ]; then
  echo "axel-geo_eu_as.yml"
  exit 2
fi
if [ "${EU}" -eq 1 ] && [ "$FILE" != "axel-geo_us_as.yml" ]; then
  echo "axel-geo_us_as.yml"
  exit 2
fi
if [ "${AS}" -eq 1 ] && [ "$FILE" != "axel-geo_us_eu.yml" ]; then
  echo "axel-geo_us_eu.yml"
  exit 2
fi

# The link already matches the failed DC: nothing to switch.
echo "link file is ${FILE}"
exit 0
腳本2:event-handler腳本,nagios報警(CRITICAL)時會觸發並執行該腳本
#!/bin/bash
# Nagios event handler for the DC status check (check_service_status.sh, which
# detects host, consul and nomad status of all DCs).
#   $1 - service state handed over by Nagios; only CRITICAL leads to a switch
#   $2 - service output, i.e. the name of the file the DNS link should target:
#          axel-geo_us_eu_as.yml  all DCs healthy (default)
#          axel-geo_us_as.yml     DC-EU down
#          axel-geo_us_eu.yml     DC-AS down
#          axel-geo_eu_as.yml     DC-US down
# Everything this script prints is appended to ${LOG}.
WORKDIR=/usr/lib64/nagios/plugins
DATE=$(date +%Y%m%d%H%M%S)
LOG=/tmp/.dns_linkfile
exec &>>"${LOG}"

# OK means the link file is already correct; any other non-CRITICAL state is
# ignored as well.
if [ "$1" != "CRITICAL" ]; then
  exit 0
fi

# The link file has to be switched: translate the requested file name into the
# NRPE command defined on the DNS servers.
case "$2" in
  axel-geo_us_eu_as.yml | axel-geo_us_as.yml | axel-geo_us_eu.yml | axel-geo_eu_as.yml)
    # e.g. axel-geo_us_as.yml -> update_us_as
    dc_list=${2#axel-geo_}
    REMOTE_CMD="update_${dc_list%.yml}"
    ;;
  *)
    # Output that is not one of the four known file names.
    echo "${DATE}--warining,no file match"
    exit 1
    ;;
esac

echo "${DATE}--${WORKDIR}/check_nrpe -H {ns1,ns2}.zencoo.com -c ${REMOTE_CMD}"
"${WORKDIR}/check_nrpe" -H DNS1 -c "${REMOTE_CMD}"
"${WORKDIR}/check_nrpe" -H DNS2 -c "${REMOTE_CMD}"
exit 0
腳本3:更改DNS服務器上的鏈接文件(該腳本運行在DNS服務器上,由腳本1和腳本2通過nrpe調用)
#!/bin/bash
# Called through NRPE by the check_dc_status and change_dns_linkfile scripts.
# Reports or switches the target of the ${DIR}/${LN} symlink.
#   $1 = check                           print the file name the link currently
#                                        targets and exit 0
#   $1 = us_eu_as | us_as | us_eu | eu_as
#                                        point the link at axel-geo_<$1>.yml
#                                        and reload pdns
# Everything except the answer to "check" is appended to ${LOG}.
# Exit status: 0 on success or when nothing had to change; 1 on a bad argument,
# a missing target file or a failed ln.
LOG=/tmp/.dns_linkfile
DATE=$(date +%Y%m%d%H%M%S)
DIR=/etc/pdns
LN=axel-geo.yml

# Current link target, reduced to its bare file name. The link is created below
# with an absolute target while check_dc_status compares against bare names, so
# the directory part is stripped to make both sides comparable.
# readlink prints nothing when ${DIR}/${LN} is missing or is not a symlink.
FILE=$(readlink "${DIR}/${LN}")
FILE=${FILE##*/}

case "$1" in
  check)
    echo "$FILE"
    exit 0
    ;;
  us_eu_as | us_as | us_eu | eu_as)
    TAGETFILE="${DIR}/axel-geo_$1.yml"
    ;;
  *)
    echo '$1 error' >>"${LOG}"
    exit 1
    ;;
esac

if [ ! -f "${TAGETFILE}" ]; then
  # Double quotes so the file name and timestamp are actually expanded.
  echo "${TAGETFILE} does not exist/${DATE}" >>"${LOG}"
  exit 1
fi

if [ "$FILE" == "${TAGETFILE##*/}" ]; then
  echo "${DATE}-Link file is correct, no need to switch" >>"${LOG}"
  exit 0
fi

echo "${HOSTNAME}/${DATE} ln -snf $TAGETFILE ${DIR}/${LN}" >>"${LOG}"
# Do not reload pdns when the link could not be changed.
if ! sudo /usr/bin/ln -snf "$TAGETFILE" "${DIR}/${LN}"; then
  echo "${DATE}-ln failed, dns not reloaded" >>"${LOG}"
  exit 1
fi
if sudo /bin/pdns_control reload; then
  echo "${DATE}-reload dns ok" >>"${LOG}"
else
  echo "${DATE}-reload dns failed" >>"${LOG}"
fi
exit 0
nagios配置 #定義檢測服務,定義event-handler
# Service that runs the DC status check and attaches the link-switching event
# handler. Inline remarks use ';' because '#' only starts a comment at the
# beginning of a line.
define service{
    use                     generic-service
    host_name               xxx
    service_description     check_dc_status
    contact_groups          admins,admins_jabber
    check_command           check_nrpe_t60!check_dc_status    ; runs the status check script (script 1)
    event_handler           change_dns_linkfile               ; command invoked as event handler
}
# Event-handler command: $SERVICESTATE$ and $SERVICEOUTPUT$ arrive in the
# handler script (script 2) as $1 and $2.
define command {
    command_name    change_dns_linkfile
    command_line    $USER1$/eventhandlers/change_dns_linkfile $SERVICESTATE$ $SERVICEOUTPUT$
}
puppet配置 #腳本1和腳本2會通過nrpe調用腳本3,需要定義相應的命令以及對應的參數
<%# NRPE commands rendered only for the two DNS servers; each one runs dns_file_check.sh with the argument that script expects. -%>
<% if ['dns1xxxx', 'dns2xxxx'].include?(@fqdn) -%>
command[check_pdns_link]=<%= @pluginsdir %>/dns_file_check.sh check
command[update_us_eu_as]=<%= @pluginsdir %>/dns_file_check.sh us_eu_as
command[update_us_eu]=<%= @pluginsdir %>/dns_file_check.sh us_eu
command[update_us_as]=<%= @pluginsdir %>/dns_file_check.sh us_as
command[update_eu_as]=<%= @pluginsdir %>/dns_file_check.sh eu_as
<% end -%>
第一次弄nagios event-handler,感覺很亂,腳本還要再繼續完善