!1 fix judge prom
Merge pull request !1 from Ulric Qin/judge_prom_bugfix
This commit is contained in:
commit
a7cf8f9ec9
|
@ -449,14 +449,14 @@ func sendEventIfNeed(status []bool, event *models.AlertEvent, stra *models.Alert
|
||||||
}
|
}
|
||||||
|
|
||||||
now := time.Now().Unix()
|
now := time.Now().Unix()
|
||||||
lastEvent, exists := LastEvents.Get(event.HashId)
|
lastEvent, exists := LastEvents.Get(event.RuleId, event.HashId)
|
||||||
|
|
||||||
switch event.IsPromePull {
|
switch event.IsPromePull {
|
||||||
case 0:
|
case 0:
|
||||||
// push型的 && 与条件型的
|
// push型的 && 与条件型的
|
||||||
if exists && lastEvent.IsPromePull == 1 {
|
if exists && lastEvent.IsPromePull == 1 {
|
||||||
// 之前内存中的事件是pull型的,先清空内存中的事件
|
// 之前内存中的事件是pull型的,先清空内存中的事件
|
||||||
LastEvents.Del(event.HashId)
|
LastEvents.Del(event.RuleId, event.HashId)
|
||||||
}
|
}
|
||||||
|
|
||||||
if isTriggered {
|
if isTriggered {
|
||||||
|
@ -476,7 +476,7 @@ func sendEventIfNeed(status []bool, event *models.AlertEvent, stra *models.Alert
|
||||||
// pull型的,产生的事件一定是触发了阈值的,即这个case里不存在recovery的场景,recovery的场景用resolve_timeout的cron来处理
|
// pull型的,产生的事件一定是触发了阈值的,即这个case里不存在recovery的场景,recovery的场景用resolve_timeout的cron来处理
|
||||||
if exists && lastEvent.IsPromePull == 0 {
|
if exists && lastEvent.IsPromePull == 0 {
|
||||||
// 之前内存中的事件是push型的,先清空内存中的事件
|
// 之前内存中的事件是push型的,先清空内存中的事件
|
||||||
LastEvents.Del(event.HashId)
|
LastEvents.Del(event.RuleId, event.HashId)
|
||||||
}
|
}
|
||||||
|
|
||||||
// 1. 第一次来,并且AlertDuration=0,直接发送
|
// 1. 第一次来,并且AlertDuration=0,直接发送
|
||||||
|
@ -490,7 +490,7 @@ func sendEventIfNeed(status []bool, event *models.AlertEvent, stra *models.Alert
|
||||||
SendEvent(event)
|
SendEvent(event)
|
||||||
} else {
|
} else {
|
||||||
// 只有一条事件,显然无法满足for AlertDuration的时间,放到内存里等待
|
// 只有一条事件,显然无法满足for AlertDuration的时间,放到内存里等待
|
||||||
LastEvents.Set(event.HashId, event)
|
LastEvents.Set(event)
|
||||||
}
|
}
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
@ -529,7 +529,7 @@ func sendEventIfNeed(status []bool, event *models.AlertEvent, stra *models.Alert
|
||||||
|
|
||||||
func SendEvent(event *models.AlertEvent) {
|
func SendEvent(event *models.AlertEvent) {
|
||||||
// update last event
|
// update last event
|
||||||
LastEvents.Set(event.HashId, event)
|
LastEvents.Set(event)
|
||||||
ok := EventQueue.PushFront(event)
|
ok := EventQueue.PushFront(event)
|
||||||
if !ok {
|
if !ok {
|
||||||
logger.Errorf("push event:%v err", event)
|
logger.Errorf("push event:%v err", event)
|
||||||
|
|
|
@ -4,58 +4,85 @@ import (
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/toolkits/pkg/logger"
|
|
||||||
|
|
||||||
"github.com/didi/nightingale/v5/models"
|
"github.com/didi/nightingale/v5/models"
|
||||||
|
"github.com/toolkits/pkg/logger"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// rule_id -> hash_id -> *models.AlertEvent
|
||||||
type SafeEventMap struct {
|
type SafeEventMap struct {
|
||||||
sync.RWMutex
|
sync.RWMutex
|
||||||
M map[string]*models.AlertEvent
|
M map[int64]map[string]*models.AlertEvent
|
||||||
}
|
}
|
||||||
|
|
||||||
var (
|
var (
|
||||||
LastEvents = &SafeEventMap{M: make(map[string]*models.AlertEvent)}
|
LastEvents = &SafeEventMap{M: make(map[int64]map[string]*models.AlertEvent)}
|
||||||
)
|
)
|
||||||
|
|
||||||
func (s *SafeEventMap) Get(key string) (*models.AlertEvent, bool) {
|
func (s *SafeEventMap) Get(ruleId int64, hashId string) (*models.AlertEvent, bool) {
|
||||||
s.RLock()
|
s.RLock()
|
||||||
defer s.RUnlock()
|
defer s.RUnlock()
|
||||||
event, exists := s.M[key]
|
|
||||||
return event, exists
|
m, has := s.M[ruleId]
|
||||||
|
if !has {
|
||||||
|
return nil, false
|
||||||
|
}
|
||||||
|
|
||||||
|
event, has := m[hashId]
|
||||||
|
return event, has
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *SafeEventMap) Set(key string, event *models.AlertEvent) {
|
func (s *SafeEventMap) Set(event *models.AlertEvent) {
|
||||||
s.Lock()
|
s.Lock()
|
||||||
defer s.Unlock()
|
defer s.Unlock()
|
||||||
s.M[key] = event
|
|
||||||
|
m, has := s.M[event.RuleId]
|
||||||
|
if !has {
|
||||||
|
m = make(map[string]*models.AlertEvent)
|
||||||
|
m[event.HashId] = event
|
||||||
|
s.M[event.RuleId] = m
|
||||||
|
} else {
|
||||||
|
s.M[event.RuleId][event.HashId] = event
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *SafeEventMap) Del(key string) {
|
func (s *SafeEventMap) Del(ruleId int64, hashId string) {
|
||||||
s.Lock()
|
s.Lock()
|
||||||
defer s.Unlock()
|
defer s.Unlock()
|
||||||
delete(s.M, key)
|
|
||||||
|
_, has := s.M[ruleId]
|
||||||
|
if !has {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
delete(s.M[ruleId], hashId)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *SafeEventMap) DeleteOrSendRecovery(promql string, toKeepKeys map[string]struct{}) {
|
func (s *SafeEventMap) DeleteOrSendRecovery(ruleId int64, toKeepKeys map[string]struct{}) {
|
||||||
s.Lock()
|
s.Lock()
|
||||||
defer s.Unlock()
|
defer s.Unlock()
|
||||||
for k, ev := range s.M {
|
|
||||||
|
m, has := s.M[ruleId]
|
||||||
|
if !has {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
for k, ev := range m {
|
||||||
if _, loaded := toKeepKeys[k]; loaded {
|
if _, loaded := toKeepKeys[k]; loaded {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if ev.ReadableExpression == promql {
|
|
||||||
logger.Debugf("[to_del][ev.IsRecovery:%+v][ev.LastSend:%+v][promql:%v]", ev.IsRecovery, ev.LastSend, promql)
|
// 如果因为promql修改,导致本来是告警状态变成了恢复,也接受
|
||||||
now := time.Now().Unix()
|
logger.Debugf("[to_del][ev.IsRecovery:%+v][ev.LastSend:%+v]", ev.IsRecovery, ev.LastSend)
|
||||||
// promql 没查询到结果,需要将告警标记为已恢复并发送
|
|
||||||
// 同时需要满足 已经发送过触发信息,并且时间差满足 大于AlertDuration
|
// promql 没查询到结果,需要将告警标记为已恢复并发送
|
||||||
// 为了避免 发送告警后 一个点 断点了就立即发送恢复信息的case
|
// 同时需要满足 已经发送过触发信息,并且时间差满足 大于AlertDuration
|
||||||
if ev.IsAlert() && ev.LastSend && now-ev.TriggerTime > ev.AlertDuration {
|
// 为了避免 发送告警后 一个点 断点了就立即发送恢复信息的case
|
||||||
logger.Debugf("[prom.alert.MarkRecov][promql:%v][ev.RuleName:%v]", promql, ev.RuleName)
|
now := time.Now().Unix()
|
||||||
ev.MarkRecov()
|
if ev.IsAlert() && ev.LastSend && now-ev.TriggerTime > ev.AlertDuration {
|
||||||
EventQueue.PushFront(ev)
|
logger.Debugf("[prom.alert.MarkRecov][ev.RuleName:%v]", ev.RuleName)
|
||||||
delete(s.M, k)
|
ev.MarkRecov()
|
||||||
}
|
EventQueue.PushFront(ev)
|
||||||
|
delete(s.M[ruleId], k)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -121,7 +121,7 @@ func handlePromqlVector(pv promql.Vector, r models.AlertRule) {
|
||||||
toKeepKeys := map[string]struct{}{}
|
toKeepKeys := map[string]struct{}{}
|
||||||
if len(pv) == 0 {
|
if len(pv) == 0 {
|
||||||
// 说明没触发,或者没查询到,删掉rule-id开头的所有event
|
// 说明没触发,或者没查询到,删掉rule-id开头的所有event
|
||||||
LastEvents.DeleteOrSendRecovery(r.PullExpr.PromQl, toKeepKeys)
|
LastEvents.DeleteOrSendRecovery(r.Id, toKeepKeys)
|
||||||
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
@ -191,6 +191,6 @@ func handlePromqlVector(pv promql.Vector, r models.AlertRule) {
|
||||||
logger.Debugf("[handlePromqlVector_has_value][event:%+v]\n", event)
|
logger.Debugf("[handlePromqlVector_has_value][event:%+v]\n", event)
|
||||||
sendEventIfNeed([]bool{true}, event, &r)
|
sendEventIfNeed([]bool{true}, event, &r)
|
||||||
}
|
}
|
||||||
LastEvents.DeleteOrSendRecovery(r.PullExpr.PromQl, toKeepKeys)
|
LastEvents.DeleteOrSendRecovery(r.Id, toKeepKeys)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue