extract IamLeader function and fix repeat

This commit is contained in:
Ulric Qin 2021-12-15 20:52:00 +08:00
parent 23b6cf1a68
commit a71edc4040
9 changed files with 99 additions and 68 deletions

View File

@ -104,9 +104,8 @@ func AlertMuteDel(ids []int64) error {
return DB().Where("id in ?", ids).Delete(new(AlertMute)).Error return DB().Where("id in ?", ids).Delete(new(AlertMute)).Error
} }
func AlertMuteStatistics(cluster string, btime int64) (*Statistics, error) { func AlertMuteStatistics(cluster string) (*Statistics, error) {
session := DB().Model(&AlertMute{}).Select("count(*) as total", "max(create_at) as last_updated").Where("btime <= ?", btime) session := DB().Model(&AlertMute{}).Select("count(*) as total", "max(create_at) as last_updated")
if cluster != "" { if cluster != "" {
session = session.Where("cluster = ?", cluster) session = session.Where("cluster = ?", cluster)
} }
@ -120,7 +119,7 @@ func AlertMuteStatistics(cluster string, btime int64) (*Statistics, error) {
return stats[0], nil return stats[0], nil
} }
func AlertMuteGetsByCluster(cluster string, btime int64) ([]*AlertMute, error) { func AlertMuteGetsByCluster(cluster string) ([]*AlertMute, error) {
// clean expired first // clean expired first
buf := int64(30) buf := int64(30)
err := DB().Where("etime < ?", time.Now().Unix()+buf).Delete(new(AlertMute)).Error err := DB().Where("etime < ?", time.Now().Unix()+buf).Delete(new(AlertMute)).Error
@ -129,7 +128,7 @@ func AlertMuteGetsByCluster(cluster string, btime int64) ([]*AlertMute, error) {
} }
// get my cluster's mutes // get my cluster's mutes
session := DB().Model(&AlertMute{}).Where("btime <= ?", btime) session := DB().Model(&AlertMute{})
if cluster != "" { if cluster != "" {
session = session.Where("cluster = ?", cluster) session = session.Where("cluster = ?", cluster)
} }

View File

@ -12,7 +12,6 @@ import (
"github.com/didi/nightingale/v5/src/pkg/httpx" "github.com/didi/nightingale/v5/src/pkg/httpx"
"github.com/didi/nightingale/v5/src/pkg/logx" "github.com/didi/nightingale/v5/src/pkg/logx"
"github.com/didi/nightingale/v5/src/server/naming"
"github.com/didi/nightingale/v5/src/server/reader" "github.com/didi/nightingale/v5/src/server/reader"
"github.com/didi/nightingale/v5/src/server/writer" "github.com/didi/nightingale/v5/src/server/writer"
"github.com/didi/nightingale/v5/src/storage" "github.com/didi/nightingale/v5/src/storage"
@ -77,7 +76,6 @@ func MustLoad(fpaths ...string) {
} }
C.Heartbeat.Endpoint = fmt.Sprintf("%s:%d", C.Heartbeat.IP, C.HTTP.Port) C.Heartbeat.Endpoint = fmt.Sprintf("%s:%d", C.Heartbeat.IP, C.HTTP.Port)
C.Heartbeat.Cluster = C.ClusterName
C.Alerting.RedisPub.ChannelKey = C.Alerting.RedisPub.ChannelPrefix + C.ClusterName C.Alerting.RedisPub.ChannelKey = C.Alerting.RedisPub.ChannelPrefix + C.ClusterName
@ -93,7 +91,7 @@ type Config struct {
Log logx.Config Log logx.Config
HTTP httpx.Config HTTP httpx.Config
BasicAuth gin.Accounts BasicAuth gin.Accounts
Heartbeat naming.HeartbeatConfig Heartbeat HeartbeatConfig
Alerting Alerting Alerting Alerting
NoData NoData NoData NoData
Redis storage.RedisConfig Redis storage.RedisConfig
@ -106,6 +104,12 @@ type Config struct {
Ibex Ibex Ibex Ibex
} }
type HeartbeatConfig struct {
IP string
Interval int64
Endpoint string
}
type Alerting struct { type Alerting struct {
NotifyScriptPath string NotifyScriptPath string
NotifyConcurrency int NotifyConcurrency int

View File

@ -4,10 +4,12 @@ import (
"context" "context"
"time" "time"
"github.com/toolkits/pkg/logger"
"github.com/didi/nightingale/v5/src/models" "github.com/didi/nightingale/v5/src/models"
"github.com/didi/nightingale/v5/src/server/config" "github.com/didi/nightingale/v5/src/server/config"
"github.com/didi/nightingale/v5/src/server/memsto" "github.com/didi/nightingale/v5/src/server/memsto"
"github.com/toolkits/pkg/logger" "github.com/didi/nightingale/v5/src/server/naming"
) )
func loopRepeat(ctx context.Context) { func loopRepeat(ctx context.Context) {
@ -24,6 +26,17 @@ func loopRepeat(ctx context.Context) {
// 拉取未恢复的告警表中需要重复通知的数据 // 拉取未恢复的告警表中需要重复通知的数据
func repeat() { func repeat() {
isLeader, err := naming.IamLeader()
if err != nil {
logger.Errorf("repeat: %v", err)
return
}
if !isLeader {
logger.Info("repeat: i am not leader")
return
}
events, err := models.AlertCurEventNeedRepeat(config.C.ClusterName) events, err := models.AlertCurEventNeedRepeat(config.C.ClusterName)
if err != nil { if err != nil {
logger.Errorf("repeat: AlertCurEventNeedRepeat: %v", err) logger.Errorf("repeat: AlertCurEventNeedRepeat: %v", err)
@ -35,35 +48,39 @@ func repeat() {
} }
for i := 0; i < len(events); i++ { for i := 0; i < len(events); i++ {
event := events[i] rule := memsto.AlertRuleCache.Get(events[i].RuleId)
rule := memsto.AlertRuleCache.Get(event.RuleId)
if rule == nil { if rule == nil {
// 可能告警规则已经被删了,理论上不应该出现这种情况,因为删除告警规则的时候,会顺带删除活跃告警,无论如何,自保一下
continue continue
} }
if rule.NotifyRepeatStep == 0 { repeatOne(events[i], rule)
// 用户后来调整了这个字段,不让继续发送了
continue
}
event.DB2Mem() if err = events[i].IncRepeatStep(int64(rule.NotifyRepeatStep * 60)); err != nil {
// 重复通知的告警,应该用新的时间来判断是否生效和是否屏蔽,
// 不能使用TriggerTime因为TriggerTime是触发时的时间是一个比较老的时间
// 先发了告警又做了屏蔽本质是不想发了如果继续用TriggerTime判断就还是会发不符合预期
if isNoneffective(event.NotifyRepeatNext, rule) {
continue
}
if isMuted(event, event.NotifyRepeatNext) {
continue
}
fillUsers(event)
notify(event)
if err = event.IncRepeatStep(int64(rule.NotifyRepeatStep * 60)); err != nil {
logger.Errorf("repeat: IncRepeatStep: %v", err) logger.Errorf("repeat: IncRepeatStep: %v", err)
} }
} }
} }
func repeatOne(event *models.AlertCurEvent, rule *models.AlertRule) {
if rule.NotifyRepeatStep == 0 {
// 用户后来调整了这个字段,不让继续发送了
return
}
event.DB2Mem()
// 重复通知的告警,应该用新的时间来判断是否生效和是否屏蔽,
// 不能使用TriggerTime因为TriggerTime是触发时的时间是一个比较老的时间
// 先发了告警又做了屏蔽本质是不想发了如果继续用TriggerTime判断就还是会发不符合预期
if isNoneffective(event.NotifyRepeatNext, rule) {
return
}
if isMuted(event, event.NotifyRepeatNext) {
return
}
fillUsers(event)
notify(event)
}

View File

@ -187,6 +187,10 @@ func (r RuleEval) judge(vectors []Vector) {
alertingKeys := make(map[string]struct{}) alertingKeys := make(map[string]struct{})
now := time.Now().Unix() now := time.Now().Unix()
for i := 0; i < count; i++ { for i := 0; i < count; i++ {
// compute hash
hash := str.MD5(fmt.Sprintf("%d_%s", r.rule.Id, vectors[i].Key))
alertingKeys[hash] = struct{}{}
// rule disabled in this time span? // rule disabled in this time span?
if isNoneffective(vectors[i].Timestamp, r.rule) { if isNoneffective(vectors[i].Timestamp, r.rule) {
continue continue
@ -226,10 +230,6 @@ func (r RuleEval) judge(vectors []Vector) {
continue continue
} }
// compute hash
hash := str.MD5(fmt.Sprintf("%d_%s", r.rule.Id, vectors[i].Key))
alertingKeys[hash] = struct{}{}
tagsArr := labelMapToArr(tagsMap) tagsArr := labelMapToArr(tagsMap)
sort.Strings(tagsArr) sort.Strings(tagsArr)

View File

@ -3,7 +3,6 @@ package idents
import ( import (
"context" "context"
"fmt" "fmt"
"sort"
"strconv" "strconv"
"time" "time"
@ -92,20 +91,13 @@ func loopPushMetrics(ctx context.Context) {
} }
func pushMetrics() { func pushMetrics() {
servers, err := naming.ActiveServers(context.Background(), config.C.ClusterName) isLeader, err := naming.IamLeader()
if err != nil { if err != nil {
logger.Errorf("handle_idents: failed to get active servers: %v", err) logger.Errorf("handle_idents: %v", err)
return return
} }
if len(servers) == 0 { if !isLeader {
logger.Errorf("handle_idents: active servers empty")
return
}
sort.Strings(servers)
if config.C.Heartbeat.Endpoint != servers[0] {
logger.Info("handle_idents: i am not leader") logger.Info("handle_idents: i am not leader")
return return
} }

View File

@ -89,9 +89,8 @@ func loopSyncAlertMutes() {
func syncAlertMutes() error { func syncAlertMutes() error {
start := time.Now() start := time.Now()
btime := start.Unix() - int64(30)
stat, err := models.AlertMuteStatistics(config.C.ClusterName, btime) stat, err := models.AlertMuteStatistics(config.C.ClusterName)
if err != nil { if err != nil {
return errors.WithMessage(err, "failed to exec AlertMuteStatistics") return errors.WithMessage(err, "failed to exec AlertMuteStatistics")
} }
@ -103,7 +102,7 @@ func syncAlertMutes() error {
return nil return nil
} }
lst, err := models.AlertMuteGetsByCluster(config.C.ClusterName, btime) lst, err := models.AlertMuteGetsByCluster(config.C.ClusterName)
if err != nil { if err != nil {
return errors.WithMessage(err, "failed to exec AlertMuteGetsByCluster") return errors.WithMessage(err, "failed to exec AlertMuteGetsByCluster")
} }

View File

@ -10,34 +10,28 @@ import (
"github.com/toolkits/pkg/logger" "github.com/toolkits/pkg/logger"
"github.com/didi/nightingale/v5/src/server/config"
"github.com/didi/nightingale/v5/src/storage" "github.com/didi/nightingale/v5/src/storage"
) )
// local servers // local servers
var localss string var localss string
type HeartbeatConfig struct { func Heartbeat(ctx context.Context) error {
IP string if err := heartbeat(ctx); err != nil {
Interval int64
Endpoint string
Cluster string
}
func Heartbeat(ctx context.Context, cfg HeartbeatConfig) error {
if err := heartbeat(ctx, cfg); err != nil {
fmt.Println("failed to heartbeat:", err) fmt.Println("failed to heartbeat:", err)
return err return err
} }
go loopHeartbeat(ctx, cfg) go loopHeartbeat(ctx)
return nil return nil
} }
func loopHeartbeat(ctx context.Context, cfg HeartbeatConfig) { func loopHeartbeat(ctx context.Context) {
interval := time.Duration(cfg.Interval) * time.Millisecond interval := time.Duration(config.C.Heartbeat.Interval) * time.Millisecond
for { for {
time.Sleep(interval) time.Sleep(interval)
if err := heartbeat(ctx, cfg); err != nil { if err := heartbeat(ctx); err != nil {
logger.Warning(err) logger.Warning(err)
} }
} }
@ -52,15 +46,15 @@ func redisKey(cluster string) string {
return fmt.Sprintf("/server/heartbeat/%s", cluster) return fmt.Sprintf("/server/heartbeat/%s", cluster)
} }
func heartbeat(ctx context.Context, cfg HeartbeatConfig) error { func heartbeat(ctx context.Context) error {
now := time.Now().Unix() now := time.Now().Unix()
key := redisKey(cfg.Cluster) key := redisKey(config.C.ClusterName)
err := storage.Redis.HSet(ctx, key, cfg.Endpoint, now).Err() err := storage.Redis.HSet(ctx, key, config.C.Heartbeat.Endpoint, now).Err()
if err != nil { if err != nil {
return err return err
} }
servers, err := ActiveServers(ctx, cfg.Cluster) servers, err := ActiveServers(ctx, config.C.ClusterName)
if err != nil { if err != nil {
return err return err
} }

View File

@ -0,0 +1,26 @@
package naming
import (
"context"
"sort"
"github.com/didi/nightingale/v5/src/server/config"
"github.com/toolkits/pkg/logger"
)
func IamLeader() (bool, error) {
servers, err := ActiveServers(context.Background(), config.C.ClusterName)
if err != nil {
logger.Errorf("failed to get active servers: %v", err)
return false, err
}
if len(servers) == 0 {
logger.Errorf("active servers empty")
return false, err
}
sort.Strings(servers)
return config.C.Heartbeat.Endpoint == servers[0], nil
}

View File

@ -134,7 +134,7 @@ func (s Server) initialize() (func(), error) {
memsto.Sync() memsto.Sync()
// start heartbeat // start heartbeat
if err = naming.Heartbeat(ctx, config.C.Heartbeat); err != nil { if err = naming.Heartbeat(ctx); err != nil {
return fns.Ret(), err return fns.Ret(), err
} }