fix judge prom

This commit is contained in:
Ulric Qin 2021-08-14 23:17:06 +08:00
parent ca8a8701b4
commit 0b4e3b9656
3 changed files with 59 additions and 32 deletions

View File

@ -449,14 +449,14 @@ func sendEventIfNeed(status []bool, event *models.AlertEvent, stra *models.Alert
} }
now := time.Now().Unix() now := time.Now().Unix()
lastEvent, exists := LastEvents.Get(event.HashId) lastEvent, exists := LastEvents.Get(event.RuleId, event.HashId)
switch event.IsPromePull { switch event.IsPromePull {
case 0: case 0:
// push型的 && 与条件型的 // push型的 && 与条件型的
if exists && lastEvent.IsPromePull == 1 { if exists && lastEvent.IsPromePull == 1 {
// 之前内存中的事件是pull型的先清空内存中的事件 // 之前内存中的事件是pull型的先清空内存中的事件
LastEvents.Del(event.HashId) LastEvents.Del(event.RuleId, event.HashId)
} }
if isTriggered { if isTriggered {
@ -476,7 +476,7 @@ func sendEventIfNeed(status []bool, event *models.AlertEvent, stra *models.Alert
// pull型的产生的事件一定是触发了阈值的即这个case里不存在recovery的场景recovery的场景用resolve_timeout的cron来处理 // pull型的产生的事件一定是触发了阈值的即这个case里不存在recovery的场景recovery的场景用resolve_timeout的cron来处理
if exists && lastEvent.IsPromePull == 0 { if exists && lastEvent.IsPromePull == 0 {
// 之前内存中的事件是push型的先清空内存中的事件 // 之前内存中的事件是push型的先清空内存中的事件
LastEvents.Del(event.HashId) LastEvents.Del(event.RuleId, event.HashId)
} }
// 1. 第一次来并且AlertDuration=0直接发送 // 1. 第一次来并且AlertDuration=0直接发送
@ -490,7 +490,7 @@ func sendEventIfNeed(status []bool, event *models.AlertEvent, stra *models.Alert
SendEvent(event) SendEvent(event)
} else { } else {
// 只有一条事件显然无法满足for AlertDuration的时间放到内存里等待 // 只有一条事件显然无法满足for AlertDuration的时间放到内存里等待
LastEvents.Set(event.HashId, event) LastEvents.Set(event)
} }
return return
} }
@ -529,7 +529,7 @@ func sendEventIfNeed(status []bool, event *models.AlertEvent, stra *models.Alert
func SendEvent(event *models.AlertEvent) { func SendEvent(event *models.AlertEvent) {
// update last event // update last event
LastEvents.Set(event.HashId, event) LastEvents.Set(event)
ok := EventQueue.PushFront(event) ok := EventQueue.PushFront(event)
if !ok { if !ok {
logger.Errorf("push event:%v err", event) logger.Errorf("push event:%v err", event)

View File

@ -4,58 +4,85 @@ import (
"sync" "sync"
"time" "time"
"github.com/toolkits/pkg/logger"
"github.com/didi/nightingale/v5/models" "github.com/didi/nightingale/v5/models"
"github.com/toolkits/pkg/logger"
) )
// rule_id -> hash_id -> *models.AlertEvent
type SafeEventMap struct { type SafeEventMap struct {
sync.RWMutex sync.RWMutex
M map[string]*models.AlertEvent M map[int64]map[string]*models.AlertEvent
} }
var ( var (
LastEvents = &SafeEventMap{M: make(map[string]*models.AlertEvent)} LastEvents = &SafeEventMap{M: make(map[int64]map[string]*models.AlertEvent)}
) )
func (s *SafeEventMap) Get(key string) (*models.AlertEvent, bool) { func (s *SafeEventMap) Get(ruleId int64, hashId string) (*models.AlertEvent, bool) {
s.RLock() s.RLock()
defer s.RUnlock() defer s.RUnlock()
event, exists := s.M[key]
return event, exists m, has := s.M[ruleId]
if !has {
return nil, false
}
event, has := m[hashId]
return event, has
} }
func (s *SafeEventMap) Set(key string, event *models.AlertEvent) { func (s *SafeEventMap) Set(event *models.AlertEvent) {
s.Lock() s.Lock()
defer s.Unlock() defer s.Unlock()
s.M[key] = event
m, has := s.M[event.RuleId]
if !has {
m = make(map[string]*models.AlertEvent)
m[event.HashId] = event
s.M[event.RuleId] = m
} else {
s.M[event.RuleId][event.HashId] = event
}
} }
func (s *SafeEventMap) Del(key string) { func (s *SafeEventMap) Del(ruleId int64, hashId string) {
s.Lock() s.Lock()
defer s.Unlock() defer s.Unlock()
delete(s.M, key)
_, has := s.M[ruleId]
if !has {
return
}
delete(s.M[ruleId], hashId)
} }
func (s *SafeEventMap) DeleteOrSendRecovery(promql string, toKeepKeys map[string]struct{}) { func (s *SafeEventMap) DeleteOrSendRecovery(ruleId int64, toKeepKeys map[string]struct{}) {
s.Lock() s.Lock()
defer s.Unlock() defer s.Unlock()
for k, ev := range s.M {
m, has := s.M[ruleId]
if !has {
return
}
for k, ev := range m {
if _, loaded := toKeepKeys[k]; loaded { if _, loaded := toKeepKeys[k]; loaded {
continue continue
} }
if ev.ReadableExpression == promql {
logger.Debugf("[to_del][ev.IsRecovery:%+v][ev.LastSend:%+v][promql:%v]", ev.IsRecovery, ev.LastSend, promql) // 如果因为promql修改导致本来是告警状态变成了恢复也接受
now := time.Now().Unix() logger.Debugf("[to_del][ev.IsRecovery:%+v][ev.LastSend:%+v]", ev.IsRecovery, ev.LastSend)
// promql 没查询到结果,需要将告警标记为已恢复并发送
// 同时需要满足 已经发送过触发信息,并且时间差满足 大于AlertDuration // promql 没查询到结果,需要将告警标记为已恢复并发送
// 为了避免 发送告警后 一个点 断点了就立即发送恢复信息的case // 同时需要满足 已经发送过触发信息,并且时间差满足 大于AlertDuration
if ev.IsAlert() && ev.LastSend && now-ev.TriggerTime > ev.AlertDuration { // 为了避免 发送告警后 一个点 断点了就立即发送恢复信息的case
logger.Debugf("[prom.alert.MarkRecov][promql:%v][ev.RuleName:%v]", promql, ev.RuleName) now := time.Now().Unix()
ev.MarkRecov() if ev.IsAlert() && ev.LastSend && now-ev.TriggerTime > ev.AlertDuration {
EventQueue.PushFront(ev) logger.Debugf("[prom.alert.MarkRecov][ev.RuleName:%v]", ev.RuleName)
delete(s.M, k) ev.MarkRecov()
} EventQueue.PushFront(ev)
delete(s.M[ruleId], k)
} }
} }
} }

View File

@ -121,7 +121,7 @@ func handlePromqlVector(pv promql.Vector, r models.AlertRule) {
toKeepKeys := map[string]struct{}{} toKeepKeys := map[string]struct{}{}
if len(pv) == 0 { if len(pv) == 0 {
// 说明没触发或者没查询到删掉rule-id开头的所有event // 说明没触发或者没查询到删掉rule-id开头的所有event
LastEvents.DeleteOrSendRecovery(r.PullExpr.PromQl, toKeepKeys) LastEvents.DeleteOrSendRecovery(r.Id, toKeepKeys)
return return
} }
@ -191,6 +191,6 @@ func handlePromqlVector(pv promql.Vector, r models.AlertRule) {
logger.Debugf("[handlePromqlVector_has_value][event:%+v]\n", event) logger.Debugf("[handlePromqlVector_has_value][event:%+v]\n", event)
sendEventIfNeed([]bool{true}, event, &r) sendEventIfNeed([]bool{true}, event, &r)
} }
LastEvents.DeleteOrSendRecovery(r.PullExpr.PromQl, toKeepKeys) LastEvents.DeleteOrSendRecovery(r.Id, toKeepKeys)
} }