腳本背景:
我所在的公司是運營CDN業務的IDC公司,客戶域名的流量圖經常會出現毛刺,但是服務的域名非常多,每天挨個查看流量圖耗時耗力。因此用python寫了個可以自動檢測rrd文件裏的異常數值並發送報警郵件的腳本。
由於我們的rrd文件是以服務域名命名的,所以先在相應的API上獲取服務域名,然後根據域名掃描rrd文件。我設的是掃描半小時的數值,每10分鐘執行一次,大概有2000來個rrd文件,執行一次6、7秒左右。
代碼如下:
#!/usr/bin/env python
#coding:utf-8
"""Scan per-domain RRD files for traffic spikes and e-mail an alert.

For every domain returned by the domain-list URL, the newest rows of
``<rrd_dir>/<domain>.rrd`` are fetched with ``rrdtool fetch``.  When one row's
second column exceeds SPIKE_RATIO times the average of that column, a graph of
the last 12 hours is rendered and an HTML mail with the graph inline is sent
to every address in ``to_all``.

This is a Python 2 script (urllib2, pyrrd).  The functions share state through
module-level names assigned in the ``__main__`` section: the configuration
values plus ``text`` (accumulated mail body) and ``dict_data`` (rows of the
file currently being checked).

NOTE(review): the ``p_w_picpath`` fragments in identifiers and in the mail
Content-ID look like a publishing-platform rewrite of the word "image".  They
are kept because they are internally consistent; only the standard-library
module path (``email.mime.image``) had to be restored for the import to work.
"""
from pyrrd.graph import DEF,CDEF,AREA
from pyrrd.graph import Graph
from pyrrd.graph import ColorAttributes
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.image import MIMEImage
from datetime import datetime
import calendar
import os
import time
import urllib2
import smtplib
import email
import email.utils
import subprocess
import sys

# Time span covered by the alert graph, in seconds (12 hours).
GRAPH_SPAN_SECONDS = 43200
# Multiplier applied to both data sources before graphing.
# NOTE(review): the unit conversion behind 0.026 is not visible here - confirm.
GRAPH_SCALE = '0.026'
# Files whose average second column is below this value are not checked.
# NOTE(review): the log text says "200M"; confirm the unit of this raw value.
MIN_AVERAGE = 3500000000
# A row is reported when its second column exceeds the average by this factor.
SPIKE_RATIO = 1.6
# Divisor used (together with *8) to turn a raw value into the "M" figure
# shown in the mail.  NOTE(review): presumably bytes per 300 s step -> Mbit/s.
RAW_TO_M_DIVISOR = 300000000


def _domain_of(rrd_file):
    """Return the file name of *rrd_file* without directory and extension."""
    return os.path.splitext(os.path.basename(rrd_file))[0]


def _command_output(argv):
    """Run *argv* without a shell and return its stdout ('' if it cannot start)."""
    try:
        proc = subprocess.Popen(argv, stdout=subprocess.PIPE)
    except OSError:
        # Missing executable: behave like an empty command output so the
        # caller's "not enough record" path reports it.
        return ''
    return proc.communicate()[0]


def graphrrd(files):
    """Render the last 12 hours of the RX/TX data sources of *files*.

    The PNG is written to the module-level path ``p_w_picpath_dir``; the graph
    title is the file name without directory and extension.
    """
    now_utc = calendar.timegm(datetime.utcnow().utctimetuple())
    def1 = DEF(rrdfile=files, vname='back', dsName='RX')
    def2 = DEF(rrdfile=files, vname='CDN', dsName='TX')
    cdef1 = CDEF(vname='back_flow', rpn='%s,%s,*' % (def1.vname, GRAPH_SCALE))
    cdef2 = CDEF(vname='CDN_flow', rpn='%s,%s,*' % (def2.vname, GRAPH_SCALE))
    area1 = AREA(defObj=cdef1, color='#002A97FF', legend='back_flow')
    area2 = AREA(defObj=cdef2, color='#00CF00FF', legend='CDN_flow')
    ca = ColorAttributes()
    ca.back = '#333333'
    ca.canvas = '#333333'
    ca.shadea = '#000000'
    ca.shadeb = '#111111'
    ca.mgrid = '#CCCCCC'
    ca.axis = '#FFFFFF'
    ca.frame = '#AAAAAA'
    ca.font = '#FFFFFF'
    ca.arrow = '#FFFFFF'
    # NOTE(review): `ca` is built but, as in the original, never handed to the
    # Graph, so these colours have no effect - confirm whether that is intended.
    graphfile = p_w_picpath_dir
    title_url = _domain_of(files)
    g = Graph(graphfile, start=now_utc - GRAPH_SPAN_SECONDS, end=now_utc,
              vertical_label='flow', title=title_url)
    # area2 is added before area1, so area1 is drawn on top of it.
    g.data.extend([def1, def2, cdef1, cdef2, area2, area1])
    g.write()


def connect():
    """Open an SMTP session to ``smtpserver`` and log in with the configured account."""
    server = smtplib.SMTP(smtpserver)
    server.ehlo()
    server.login(smtpuser, smtppass)
    return server


def sendmessage(server,to,subj,content):
    """Send *content* as an HTML mail with the alert graph attached inline.

    server  -- logged-in smtplib.SMTP object
    to      -- recipient address
    subj    -- subject line
    content -- HTML body; it refers to the attached graph by its Content-ID

    A failed send is printed and otherwise ignored (best effort).
    """
    msg = MIMEMultipart('related')
    msg['Subject'] = subj
    msg['From'] = smtpuser
    msg['To'] = to
    msg['Date'] = email.utils.formatdate()
    msg.attach(MIMEText(content, "html", "utf-8"))
    with open(p_w_picpath_dir, 'rb') as fp:
        msgImage = MIMEImage(fp.read())
    msgImage.add_header('Content-ID', '<p_w_picpath1>')
    msg.attach(msgImage)
    try:
        server.sendmail(smtpuser, to, msg.as_string())
    except Exception as ex:
        print('%s %s' % (Exception, ex))
        print('Error - send failed')


def aver(rrd_file,n=6):
    """Load the newest rows of *rrd_file* and return the column averages.

    The last *n* lines of ``rrdtool fetch <file> AVERAGE -s -1d`` are taken,
    lines containing 'nan' or 'RX' are dropped, and the remaining rows are
    stored in the module-level ``dict_data`` as
    ``{timestamp: [col0, col1, col2]}`` (values kept as strings).

    Returns ``[avg0, avg1, avg2]``, or ``[]`` when fewer than n/2 usable lines
    remain or when the average of the second column is below MIN_AVERAGE.
    """
    raw = _command_output(['rrdtool', 'fetch', rrd_file, 'AVERAGE', '-s', '-1d'])
    tail = raw.splitlines(True)[-n:] if n > 0 else []
    data = [line for line in tail if 'nan' not in line and 'RX' not in line]
    if len(data) < n // 2:
        log("[ERRORS: %s] has not enough record ! please check it!!\n" % rrd_file)
        return []
    for line in data:
        # Short lines cannot hold a 10-digit timestamp plus values.
        if len(line) > 25:
            dict_data[line[:10]] = line.strip()[12:].split()
    sum1 = 0
    sum2 = 0
    sum3 = 0
    for row in dict_data.values():
        try:
            sum1 = sum1 + float(row[0])
            sum2 = sum2 + float(row[1])
            sum3 = sum3 + float(row[2])
        except (ValueError, IndexError):
            log('%s %s\n' % (rrd_file, row))
    count = len(data)
    if sum2 / count < MIN_AVERAGE:
        log('WARNING: %s was less then 200M\n' % rrd_file)
        return []
    return [sum1 / count, sum2 / count, sum3 / count]


def check(average):
    """Return the timestamps in ``dict_data`` whose second column exceeds
    SPIKE_RATIO times *average*."""
    wrong_t = []
    for key in dict_data:
        try:
            value = float(dict_data[key][1])
        except (ValueError, IndexError):
            # Unparsable rows were already logged by aver(); skip them here
            # instead of aborting the whole run.
            continue
        if value / average > SPIKE_RATIO:
            wrong_t.append(key)
    return wrong_t


def update(rrd_file,t,aver1,aver2,aver3):
    """Append the alert section for timestamp *t* of *rrd_file* to the mail body.

    Also writes one line to the error log.  aver1..aver3 are accepted for
    interface compatibility and are not used.
    """
    global text
    # `date` is still used for formatting so the timestamp text in mails and
    # logs keeps its existing, locale-dependent layout.
    errors_time = _command_output(
        ['date', '-d', '1970-01-01 UTC %s seconds' % t]).strip()
    row = dict_data[t]
    domain = _domain_of(rrd_file)
    content = ('<br/><br/>%s 異常信息:<br/> 域名: %s <br/> 時間: %s<br/> '
               '流量值: 回源帶寬: %.2fM , cdn帶寬 : %dM <br/> <br/>'
               'rrd 異常信息:<br/> 路徑: %s<br/> UTC 時間: %s<br/> '
               '異常值: [%s], [%s], [%s]<br/><br/><img src="cid:p_w_picpath1">'
               % (domain, domain, errors_time,
                  float(row[0]) * 8 / RAW_TO_M_DIVISOR,
                  int(float(row[1]) * 8 / RAW_TO_M_DIVISOR),
                  rrd_file, t, row[0], row[1], row[2]))
    write_error('[ %s ]: at[ %s(%s) ],the value was [%s] [%s] [%s] \n'
                % (rrd_file, errors_time, t, row[0], row[1], row[2]))
    text = text + content


def log(log_write):
    """Append *log_write* to rrd_alt1.log in the ``rrd_bak`` directory."""
    with open('%s/rrd_alt1.log' % rrd_bak, 'a') as f:
        f.write(log_write)


def write_error(log_write):
    """Append *log_write* to rrd_error1.log in the ``rrd_bak`` directory."""
    with open('%s/rrd_error1.log' % rrd_bak, 'a') as f:
        f.write(log_write)


def run_script(rrd_file):
    """Check one rrd file and mail every recipient if it contains a spike."""
    aver_rrd = aver(rrd_file)
    if not aver_rrd:
        return
    wrong_time = check(aver_rrd[1])
    if not wrong_time:
        log('[%s] no errors !\n' % (rrd_file))
        return
    for t in wrong_time:
        update(rrd_file, t, aver_rrd[0], aver_rrd[1], aver_rrd[2])
    graphrrd(rrd_file)
    if text:
        # One SMTP session serves all recipients and is always closed.
        server = connect()
        try:
            for to in to_all:
                sendmessage(server, to, subj, text)
                log('sendmail to %s\n' % to)
        finally:
            try:
                server.quit()
            except smtplib.SMTPException:
                # The mails are already handed over; a failed QUIT is harmless.
                pass


if __name__=='__main__':
    p_w_picpath_time = time.strftime("%d-%H-%M")
    rrd_dir = '/data/rrd/db/1/billing'
    rrd_bak = '/data/rrd/db/1/billing/bak'
    smtpserver = 'xxx'
    p_w_picpath_dir = '%s/rrdgraph_%s.png' % (rrd_bak, p_w_picpath_time)
    smtpuser = 'xxx'
    smtppass = 'yyy'
    to_all = ['xxx', 'yyy']
    subj = 'check the flow of CDN!!!!'

    local_time = time.strftime("%m-%d %H:%M:%S")
    # The URL returns one domain name per line.
    response = urllib2.urlopen('xxx')
    try:
        domains = response.readlines()
    finally:
        response.close()
    url_list = ["%s/%s.rrd" % (rrd_dir, u.strip()) for u in domains]

    log("-"*60+"\n")
    log("the script run time at %s \n" % local_time)
    # Files are processed last-to-first, matching the original pop() order.
    for rrd_file in reversed(url_list):
        # Per-file state shared with aver()/check()/update()/run_script().
        text = ''
        dict_data = {}
        if os.path.exists(rrd_file):
            run_script(rrd_file)
    log("-"*60+"\n")
郵件截圖