Notify組件定義了路由處理過程中的receiver pipeline(本文不包含Silence和Inhibit部分),包含處理等待間隔的WaitStage、去重處理的DedupStage、重試處理的RetryStage和SetNotifiesStage,實現上類似於中間件的方式,一層層地順序處理。創建pipeline的函數定義如下:
// createStage builds the notification pipeline for a single receiver.
//
// For every integration returned by BuildReceiverIntegrations it assembles a
// MultiStage that chains, in order, a wait stage, a dedup stage, a retry stage
// and a set-notifies stage. The per-integration chains are collected into a
// FanoutStage, which is returned as the receiver's Stage.
func createStage(rc *config.Receiver, tmpl *template.Template, wait func() time.Duration, notificationLog NotificationLog, logger log.Logger) Stage {
	var fanout FanoutStage
	for _, integration := range BuildReceiverIntegrations(rc, tmpl, logger) {
		// Receiver key handed to the dedup and set-notifies stages together
		// with the notification log.
		recv := &nflogpb.Receiver{
			GroupName:   rc.Name,
			Integration: integration.name,
			Idx:         uint32(integration.idx),
		}
		pipeline := MultiStage{
			NewWaitStage(wait),
			NewDedupStage(integration, notificationLog, recv),
			NewRetryStage(integration, rc.Name),
			NewSetNotifiesStage(notificationLog, recv),
		}
		fanout = append(fanout, pipeline)
	}
	return fanout
}
WaitStage
等待間隔用來設置發送告警前的等待時間。在集羣模式下,需要根據不同的peer設置不同的超時時間;如果只有一個Server本身,等待間隔設置爲0:
// clusterWait returns a function that inspects the current peer state and returns
// a duration of one base timeout for each peer with a higher ID than ourselves.
func clusterWait(p *cluster.Peer, timeout time.Duration) func() time.Duration {
return func() time.Duration {
return time.Duration(p.Position()) * timeout
}
具體的實現上採用一個timer來傳遞信號,等待時間到達後才返回對應的alerts;由於是串行執行的,所以消息傳遞會暫停一段時間。
// Exec implements the Stage interface.
//
// It blocks for the duration reported by the stage's wait function and then
// hands the alerts through unchanged. If the context is done first, it returns
// no alerts together with the context's error.
func (ws *WaitStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
	delay := ws.wait()
	select {
	case <-ctx.Done():
		return ctx, nil, ctx.Err()
	case <-time.After(delay):
		return ctx, alerts, nil
	}
}
DedupStage
DedupStage用於管理告警的去重,傳遞的參數中包含了一個NotificationLog,用來保存告警的發送記錄。當有多個機器組成集羣的時候,NotificationLog會通過協議進行通信,傳遞彼此的記錄信息。假如集羣中的A機器發送了告警,該記錄會傳遞給B機器,並進行merge操作,這樣B機器在發送告警的時候如果查詢到該告警已經發送,則不再進行告警發送。關於NotificationLog的實現nflog可以查看nflog/nflog.go文件。
// DedupStage filters alerts.
// Filtering happens based on a notification log.
type DedupStage struct {
// nflog is the notification log that Exec queries for existing entries.
nflog NotificationLog
// recv is the receiver used, together with the group key, to query nflog.
recv *nflogpb.Receiver
// conf is the notifier configuration.
// NOTE(review): its use is not visible in this excerpt.
conf notifierConfig
// now supplies the current time.
// NOTE(review): presumably injectable for tests — confirm.
now func() time.Time
// hash maps an alert to a uint64.
// NOTE(review): presumably used to build the firing/resolved sets — confirm.
hash func(*types.Alert) uint64
}
具體的處理邏輯如下:
// Exec implements the Stage interface.
//
// It queries the notification log for entries matching the group key and this
// stage's receiver. A "not found" result is not an error; any other query
// error is returned. At most one entry is expected: zero entries leave entry
// nil, one entry is used as-is, and any larger result is reported as an error.
// The alerts are passed on only when needsUpdate reports true; otherwise no
// alerts are returned.
func (n *DedupStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
	...
	entries, err := n.nflog.Query(nflog.QGroupKey(gkey), nflog.QReceiver(n.recv))
	if err != nil && err != nflog.ErrNotFound {
		return ctx, nil, err
	}
	var entry *nflogpb.Entry
	switch len(entries) {
	case 0:
	case 1:
		entry = entries[0]
	default:
		// Any size above one is unexpected, not only exactly two; matching
		// just "2" would let larger results through with a nil entry.
		return ctx, nil, fmt.Errorf("unexpected entry result size %d", len(entries))
	}
	if n.needsUpdate(entry, firingSet, resolvedSet, repeatInterval) {
		return ctx, alerts, nil
	}
	return ctx, nil, nil
}
其中的nflog.Query將根據接收者(receiver)和group key進行查詢;如果查到已有的發送記錄且needsUpdate判斷無需更新,則不再返回對應的alerts。nflog設置了GC用來刪除過期的日誌記錄,防止記錄一直存在於log中導致告警無法繼續發送。
RetryStage
RetryStage利用backoff策略來管理告警的重發,對於沒有發送成功的告警將不斷重試,直到超時爲止。numFailedNotifications用來統計發送失敗次數的metrics,numNotifications用來統計發送成功次數的metrics。
select {
case <-tick.C:
	// Time the attempt; the latency is observed whether or not it succeeded.
	start := time.Now()
	retry, err := r.integration.Notify(ctx, sent...)
	notificationLatencySeconds.WithLabelValues(r.integration.name).Observe(time.Since(start).Seconds())

	// Success: count it and hand the alerts on.
	if err == nil {
		numNotifications.WithLabelValues(r.integration.name).Inc()
		return ctx, alerts, nil
	}

	numFailedNotifications.WithLabelValues(r.integration.name).Inc()
	level.Debug(l).Log("msg", "Notify attempt failed", "attempt", i, "integration", r.integration.name, "receiver", r.groupName, "err", err)

	// The integration reported the error as unrecoverable: stop retrying.
	if !retry {
		return ctx, alerts, fmt.Errorf("cancelling notify retry for %q due to unrecoverable error: %s", r.integration.name, err)
	}

	// Remember the most recent integration error so it can be returned
	// instead of the bare context error once the context is done.
	iErr = err
case <-ctx.Done():
	if iErr == nil {
		return ctx, nil, ctx.Err()
	}
	return ctx, nil, iErr
}
SetNotifiesStage
SetNotifiesStage用來設置發送告警的信息到nfLog,該模塊僅僅用於被該AM發送的告警的記錄(Retry組件傳遞的alerts和Dedup組件中發送出去的告警信息)。
// Exec implements the Stage interface.
//
// It reads the group key, the firing alerts and the resolved alerts from the
// context, failing with a descriptive error if any of them is missing, and
// records them in the notification log for this stage's receiver. The alerts
// are returned unchanged along with whatever error the log call produced.
func (n SetNotifiesStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
	groupKey, hasKey := GroupKey(ctx)
	if !hasKey {
		return ctx, nil, fmt.Errorf("group key missing")
	}

	firingSet, hasFiring := FiringAlerts(ctx)
	if !hasFiring {
		return ctx, nil, fmt.Errorf("firing alerts missing")
	}

	resolvedSet, hasResolved := ResolvedAlerts(ctx)
	if !hasResolved {
		return ctx, nil, fmt.Errorf("resolved alerts missing")
	}

	err := n.nflog.Log(n.recv, groupKey, firingSet, resolvedSet)
	return ctx, alerts, err
}
告警接收處理
Notify組件根據用戶的配置路由發送告警信息,比如通過webhook,email,wechat,slack等。 我們內部消息系統是通過mattermost實現,可以兼容slack的配置。用戶可以自定義自己的消息發送路由,notify/impl.go中有不同方式具體的實現細節.
// Notifier is implemented by anything that can deliver alerts, bounded by the
// given context.
//
// On failure Notify returns a non-nil error together with a flag telling
// whether the error is recoverable, which retry logic can use to decide
// whether another attempt is worthwhile.
type Notifier interface {
	Notify(ctx context.Context, alerts ...*types.Alert) (retry bool, err error)
}
// An Integration wraps a notifier and its config to be uniquely identified by
// name and index from its origin in the configuration.
type Integration struct {
// notifier is the wrapped Notifier implementation.
notifier Notifier
// conf is the configuration belonging to the wrapped notifier.
conf notifierConfig
// name is the name part of the integration's identity.
name string
// idx is the index part of the integration's identity.
idx int
}
Integration定義一個集成路由組件,包含用戶的配置信息和名稱以及發送告警的實現。自定義的notify路由需要滿足該Notifier接口,實現Notify方法。比如下面是webhook的實現:首先定義一個管理webhook的結構體Webhook,包含基本的配置和模板信息;WebhookMessage則定義了發送給webhook的消息內容。
// Webhook implements a Notifier for generic webhooks.
type Webhook struct {
// conf is the webhook configuration; Notify reads its URL and HTTPConfig.
conf *config.WebhookConfig
// tmpl produces the template data embedded in each outgoing message.
tmpl *template.Template
// logger is used by Notify, e.g. to report a missing group key.
logger log.Logger
}
// NewWebhook constructs a Webhook notifier from the given webhook
// configuration, template and logger.
func NewWebhook(conf *config.WebhookConfig, t *template.Template, l log.Logger) *Webhook {
	w := new(Webhook)
	w.conf = conf
	w.tmpl = t
	w.logger = l
	return w
}
// WebhookMessage defines the JSON object sent to webhook endpoints.
type WebhookMessage struct {
// Data is the embedded template data carried by the message.
*template.Data
// The protocol version.
Version string `json:"version"`
// GroupKey is the group key of the notification; Notify fills it from the context.
GroupKey string `json:"groupKey"`
}
基本結構定義完成後就可以編寫具體的發送函數Notify來實現告警的發送:根據告警系統發送的告警信息(可能不止一個),通過模板生成對應的消息;由於可能包含多個告警,因此用GroupKey來標識聚合組的相關信息。生成的WebhookMessage經過JSON序列化後通過http協議傳遞到配置的web接口中,最後返回的w.retry(resp.StatusCode)將檢查是否發送成功,如果失敗則返回錯誤信息。
// Notify implements the Notifier interface.
//
// It builds the template data for the alerts, wraps it in a version "4"
// WebhookMessage together with the group key, JSON-encodes the message and
// POSTs it to the configured URL. The boolean result is the retry flag:
// encoding and HTTP-client construction failures return false, request
// construction and transport failures return true, and for a completed
// request the decision is delegated to w.retry with the response status code.
func (w *Webhook) Notify(ctx context.Context, alerts ...*types.Alert) (bool, error) {
	recvName := receiverName(ctx, w.logger)
	labels := groupLabels(ctx, w.logger)
	data := w.tmpl.Data(recvName, labels, alerts...)

	key, found := GroupKey(ctx)
	if !found {
		// A missing group key is only logged; the message is still sent.
		level.Error(w.logger).Log("msg", "group key missing")
	}

	var payload bytes.Buffer
	encoder := json.NewEncoder(&payload)
	if err := encoder.Encode(&WebhookMessage{Data: data, Version: "4", GroupKey: key}); err != nil {
		return false, err
	}

	request, err := http.NewRequest("POST", w.conf.URL, &payload)
	if err != nil {
		return true, err
	}
	request.Header.Set("Content-Type", contentTypeJSON)
	request.Header.Set("User-Agent", userAgentHeader)

	client, err := commoncfg.NewHTTPClientFromConfig(w.conf.HTTPConfig)
	if err != nil {
		return false, err
	}

	resp, err := ctxhttp.Do(ctx, client, request)
	if err != nil {
		return true, err
	}
	// Only the status code is needed, so the body is closed unread.
	resp.Body.Close()

	return w.retry(resp.StatusCode)
}