extract IamLeader function and fix repeat
This commit is contained in:
parent
23b6cf1a68
commit
a71edc4040
|
@ -104,9 +104,8 @@ func AlertMuteDel(ids []int64) error {
|
||||||
return DB().Where("id in ?", ids).Delete(new(AlertMute)).Error
|
return DB().Where("id in ?", ids).Delete(new(AlertMute)).Error
|
||||||
}
|
}
|
||||||
|
|
||||||
func AlertMuteStatistics(cluster string, btime int64) (*Statistics, error) {
|
func AlertMuteStatistics(cluster string) (*Statistics, error) {
|
||||||
session := DB().Model(&AlertMute{}).Select("count(*) as total", "max(create_at) as last_updated").Where("btime <= ?", btime)
|
session := DB().Model(&AlertMute{}).Select("count(*) as total", "max(create_at) as last_updated")
|
||||||
|
|
||||||
if cluster != "" {
|
if cluster != "" {
|
||||||
session = session.Where("cluster = ?", cluster)
|
session = session.Where("cluster = ?", cluster)
|
||||||
}
|
}
|
||||||
|
@ -120,7 +119,7 @@ func AlertMuteStatistics(cluster string, btime int64) (*Statistics, error) {
|
||||||
return stats[0], nil
|
return stats[0], nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func AlertMuteGetsByCluster(cluster string, btime int64) ([]*AlertMute, error) {
|
func AlertMuteGetsByCluster(cluster string) ([]*AlertMute, error) {
|
||||||
// clean expired first
|
// clean expired first
|
||||||
buf := int64(30)
|
buf := int64(30)
|
||||||
err := DB().Where("etime < ?", time.Now().Unix()+buf).Delete(new(AlertMute)).Error
|
err := DB().Where("etime < ?", time.Now().Unix()+buf).Delete(new(AlertMute)).Error
|
||||||
|
@ -129,7 +128,7 @@ func AlertMuteGetsByCluster(cluster string, btime int64) ([]*AlertMute, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// get my cluster's mutes
|
// get my cluster's mutes
|
||||||
session := DB().Model(&AlertMute{}).Where("btime <= ?", btime)
|
session := DB().Model(&AlertMute{})
|
||||||
if cluster != "" {
|
if cluster != "" {
|
||||||
session = session.Where("cluster = ?", cluster)
|
session = session.Where("cluster = ?", cluster)
|
||||||
}
|
}
|
||||||
|
|
|
@ -12,7 +12,6 @@ import (
|
||||||
|
|
||||||
"github.com/didi/nightingale/v5/src/pkg/httpx"
|
"github.com/didi/nightingale/v5/src/pkg/httpx"
|
||||||
"github.com/didi/nightingale/v5/src/pkg/logx"
|
"github.com/didi/nightingale/v5/src/pkg/logx"
|
||||||
"github.com/didi/nightingale/v5/src/server/naming"
|
|
||||||
"github.com/didi/nightingale/v5/src/server/reader"
|
"github.com/didi/nightingale/v5/src/server/reader"
|
||||||
"github.com/didi/nightingale/v5/src/server/writer"
|
"github.com/didi/nightingale/v5/src/server/writer"
|
||||||
"github.com/didi/nightingale/v5/src/storage"
|
"github.com/didi/nightingale/v5/src/storage"
|
||||||
|
@ -77,7 +76,6 @@ func MustLoad(fpaths ...string) {
|
||||||
}
|
}
|
||||||
|
|
||||||
C.Heartbeat.Endpoint = fmt.Sprintf("%s:%d", C.Heartbeat.IP, C.HTTP.Port)
|
C.Heartbeat.Endpoint = fmt.Sprintf("%s:%d", C.Heartbeat.IP, C.HTTP.Port)
|
||||||
C.Heartbeat.Cluster = C.ClusterName
|
|
||||||
|
|
||||||
C.Alerting.RedisPub.ChannelKey = C.Alerting.RedisPub.ChannelPrefix + C.ClusterName
|
C.Alerting.RedisPub.ChannelKey = C.Alerting.RedisPub.ChannelPrefix + C.ClusterName
|
||||||
|
|
||||||
|
@ -93,7 +91,7 @@ type Config struct {
|
||||||
Log logx.Config
|
Log logx.Config
|
||||||
HTTP httpx.Config
|
HTTP httpx.Config
|
||||||
BasicAuth gin.Accounts
|
BasicAuth gin.Accounts
|
||||||
Heartbeat naming.HeartbeatConfig
|
Heartbeat HeartbeatConfig
|
||||||
Alerting Alerting
|
Alerting Alerting
|
||||||
NoData NoData
|
NoData NoData
|
||||||
Redis storage.RedisConfig
|
Redis storage.RedisConfig
|
||||||
|
@ -106,6 +104,12 @@ type Config struct {
|
||||||
Ibex Ibex
|
Ibex Ibex
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type HeartbeatConfig struct {
|
||||||
|
IP string
|
||||||
|
Interval int64
|
||||||
|
Endpoint string
|
||||||
|
}
|
||||||
|
|
||||||
type Alerting struct {
|
type Alerting struct {
|
||||||
NotifyScriptPath string
|
NotifyScriptPath string
|
||||||
NotifyConcurrency int
|
NotifyConcurrency int
|
||||||
|
|
|
@ -4,10 +4,12 @@ import (
|
||||||
"context"
|
"context"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"github.com/toolkits/pkg/logger"
|
||||||
|
|
||||||
"github.com/didi/nightingale/v5/src/models"
|
"github.com/didi/nightingale/v5/src/models"
|
||||||
"github.com/didi/nightingale/v5/src/server/config"
|
"github.com/didi/nightingale/v5/src/server/config"
|
||||||
"github.com/didi/nightingale/v5/src/server/memsto"
|
"github.com/didi/nightingale/v5/src/server/memsto"
|
||||||
"github.com/toolkits/pkg/logger"
|
"github.com/didi/nightingale/v5/src/server/naming"
|
||||||
)
|
)
|
||||||
|
|
||||||
func loopRepeat(ctx context.Context) {
|
func loopRepeat(ctx context.Context) {
|
||||||
|
@ -24,6 +26,17 @@ func loopRepeat(ctx context.Context) {
|
||||||
|
|
||||||
// 拉取未恢复的告警表中需要重复通知的数据
|
// 拉取未恢复的告警表中需要重复通知的数据
|
||||||
func repeat() {
|
func repeat() {
|
||||||
|
isLeader, err := naming.IamLeader()
|
||||||
|
if err != nil {
|
||||||
|
logger.Errorf("repeat: %v", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if !isLeader {
|
||||||
|
logger.Info("repeat: i am not leader")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
events, err := models.AlertCurEventNeedRepeat(config.C.ClusterName)
|
events, err := models.AlertCurEventNeedRepeat(config.C.ClusterName)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logger.Errorf("repeat: AlertCurEventNeedRepeat: %v", err)
|
logger.Errorf("repeat: AlertCurEventNeedRepeat: %v", err)
|
||||||
|
@ -35,35 +48,39 @@ func repeat() {
|
||||||
}
|
}
|
||||||
|
|
||||||
for i := 0; i < len(events); i++ {
|
for i := 0; i < len(events); i++ {
|
||||||
event := events[i]
|
rule := memsto.AlertRuleCache.Get(events[i].RuleId)
|
||||||
rule := memsto.AlertRuleCache.Get(event.RuleId)
|
|
||||||
if rule == nil {
|
if rule == nil {
|
||||||
|
// 可能告警规则已经被删了,理论上不应该出现这种情况,因为删除告警规则的时候,会顺带删除活跃告警,无论如何,自保一下
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
if rule.NotifyRepeatStep == 0 {
|
repeatOne(events[i], rule)
|
||||||
// 用户后来调整了这个字段,不让继续发送了
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
event.DB2Mem()
|
if err = events[i].IncRepeatStep(int64(rule.NotifyRepeatStep * 60)); err != nil {
|
||||||
|
|
||||||
// 重复通知的告警,应该用新的时间来判断是否生效和是否屏蔽,
|
|
||||||
// 不能使用TriggerTime,因为TriggerTime是触发时的时间,是一个比较老的时间
|
|
||||||
// 先发了告警,又做了屏蔽,本质是不想发了,如果继续用TriggerTime判断,就还是会发,不符合预期
|
|
||||||
if isNoneffective(event.NotifyRepeatNext, rule) {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
if isMuted(event, event.NotifyRepeatNext) {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
fillUsers(event)
|
|
||||||
notify(event)
|
|
||||||
|
|
||||||
if err = event.IncRepeatStep(int64(rule.NotifyRepeatStep * 60)); err != nil {
|
|
||||||
logger.Errorf("repeat: IncRepeatStep: %v", err)
|
logger.Errorf("repeat: IncRepeatStep: %v", err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func repeatOne(event *models.AlertCurEvent, rule *models.AlertRule) {
|
||||||
|
if rule.NotifyRepeatStep == 0 {
|
||||||
|
// 用户后来调整了这个字段,不让继续发送了
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
event.DB2Mem()
|
||||||
|
|
||||||
|
// 重复通知的告警,应该用新的时间来判断是否生效和是否屏蔽,
|
||||||
|
// 不能使用TriggerTime,因为TriggerTime是触发时的时间,是一个比较老的时间
|
||||||
|
// 先发了告警,又做了屏蔽,本质是不想发了,如果继续用TriggerTime判断,就还是会发,不符合预期
|
||||||
|
if isNoneffective(event.NotifyRepeatNext, rule) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if isMuted(event, event.NotifyRepeatNext) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
fillUsers(event)
|
||||||
|
notify(event)
|
||||||
|
}
|
||||||
|
|
|
@ -187,6 +187,10 @@ func (r RuleEval) judge(vectors []Vector) {
|
||||||
alertingKeys := make(map[string]struct{})
|
alertingKeys := make(map[string]struct{})
|
||||||
now := time.Now().Unix()
|
now := time.Now().Unix()
|
||||||
for i := 0; i < count; i++ {
|
for i := 0; i < count; i++ {
|
||||||
|
// compute hash
|
||||||
|
hash := str.MD5(fmt.Sprintf("%d_%s", r.rule.Id, vectors[i].Key))
|
||||||
|
alertingKeys[hash] = struct{}{}
|
||||||
|
|
||||||
// rule disabled in this time span?
|
// rule disabled in this time span?
|
||||||
if isNoneffective(vectors[i].Timestamp, r.rule) {
|
if isNoneffective(vectors[i].Timestamp, r.rule) {
|
||||||
continue
|
continue
|
||||||
|
@ -226,10 +230,6 @@ func (r RuleEval) judge(vectors []Vector) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// compute hash
|
|
||||||
hash := str.MD5(fmt.Sprintf("%d_%s", r.rule.Id, vectors[i].Key))
|
|
||||||
alertingKeys[hash] = struct{}{}
|
|
||||||
|
|
||||||
tagsArr := labelMapToArr(tagsMap)
|
tagsArr := labelMapToArr(tagsMap)
|
||||||
sort.Strings(tagsArr)
|
sort.Strings(tagsArr)
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,6 @@ package idents
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
"sort"
|
|
||||||
"strconv"
|
"strconv"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
@ -92,20 +91,13 @@ func loopPushMetrics(ctx context.Context) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func pushMetrics() {
|
func pushMetrics() {
|
||||||
servers, err := naming.ActiveServers(context.Background(), config.C.ClusterName)
|
isLeader, err := naming.IamLeader()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logger.Errorf("handle_idents: failed to get active servers: %v", err)
|
logger.Errorf("handle_idents: %v", err)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(servers) == 0 {
|
if !isLeader {
|
||||||
logger.Errorf("handle_idents: active servers empty")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
sort.Strings(servers)
|
|
||||||
|
|
||||||
if config.C.Heartbeat.Endpoint != servers[0] {
|
|
||||||
logger.Info("handle_idents: i am not leader")
|
logger.Info("handle_idents: i am not leader")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
|
@ -89,9 +89,8 @@ func loopSyncAlertMutes() {
|
||||||
|
|
||||||
func syncAlertMutes() error {
|
func syncAlertMutes() error {
|
||||||
start := time.Now()
|
start := time.Now()
|
||||||
btime := start.Unix() - int64(30)
|
|
||||||
|
|
||||||
stat, err := models.AlertMuteStatistics(config.C.ClusterName, btime)
|
stat, err := models.AlertMuteStatistics(config.C.ClusterName)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return errors.WithMessage(err, "failed to exec AlertMuteStatistics")
|
return errors.WithMessage(err, "failed to exec AlertMuteStatistics")
|
||||||
}
|
}
|
||||||
|
@ -103,7 +102,7 @@ func syncAlertMutes() error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
lst, err := models.AlertMuteGetsByCluster(config.C.ClusterName, btime)
|
lst, err := models.AlertMuteGetsByCluster(config.C.ClusterName)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return errors.WithMessage(err, "failed to exec AlertMuteGetsByCluster")
|
return errors.WithMessage(err, "failed to exec AlertMuteGetsByCluster")
|
||||||
}
|
}
|
||||||
|
|
|
@ -10,34 +10,28 @@ import (
|
||||||
|
|
||||||
"github.com/toolkits/pkg/logger"
|
"github.com/toolkits/pkg/logger"
|
||||||
|
|
||||||
|
"github.com/didi/nightingale/v5/src/server/config"
|
||||||
"github.com/didi/nightingale/v5/src/storage"
|
"github.com/didi/nightingale/v5/src/storage"
|
||||||
)
|
)
|
||||||
|
|
||||||
// local servers
|
// local servers
|
||||||
var localss string
|
var localss string
|
||||||
|
|
||||||
type HeartbeatConfig struct {
|
func Heartbeat(ctx context.Context) error {
|
||||||
IP string
|
if err := heartbeat(ctx); err != nil {
|
||||||
Interval int64
|
|
||||||
Endpoint string
|
|
||||||
Cluster string
|
|
||||||
}
|
|
||||||
|
|
||||||
func Heartbeat(ctx context.Context, cfg HeartbeatConfig) error {
|
|
||||||
if err := heartbeat(ctx, cfg); err != nil {
|
|
||||||
fmt.Println("failed to heartbeat:", err)
|
fmt.Println("failed to heartbeat:", err)
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
go loopHeartbeat(ctx, cfg)
|
go loopHeartbeat(ctx)
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func loopHeartbeat(ctx context.Context, cfg HeartbeatConfig) {
|
func loopHeartbeat(ctx context.Context) {
|
||||||
interval := time.Duration(cfg.Interval) * time.Millisecond
|
interval := time.Duration(config.C.Heartbeat.Interval) * time.Millisecond
|
||||||
for {
|
for {
|
||||||
time.Sleep(interval)
|
time.Sleep(interval)
|
||||||
if err := heartbeat(ctx, cfg); err != nil {
|
if err := heartbeat(ctx); err != nil {
|
||||||
logger.Warning(err)
|
logger.Warning(err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -52,15 +46,15 @@ func redisKey(cluster string) string {
|
||||||
return fmt.Sprintf("/server/heartbeat/%s", cluster)
|
return fmt.Sprintf("/server/heartbeat/%s", cluster)
|
||||||
}
|
}
|
||||||
|
|
||||||
func heartbeat(ctx context.Context, cfg HeartbeatConfig) error {
|
func heartbeat(ctx context.Context) error {
|
||||||
now := time.Now().Unix()
|
now := time.Now().Unix()
|
||||||
key := redisKey(cfg.Cluster)
|
key := redisKey(config.C.ClusterName)
|
||||||
err := storage.Redis.HSet(ctx, key, cfg.Endpoint, now).Err()
|
err := storage.Redis.HSet(ctx, key, config.C.Heartbeat.Endpoint, now).Err()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
servers, err := ActiveServers(ctx, cfg.Cluster)
|
servers, err := ActiveServers(ctx, config.C.ClusterName)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,26 @@
|
||||||
|
package naming
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"sort"
|
||||||
|
|
||||||
|
"github.com/didi/nightingale/v5/src/server/config"
|
||||||
|
"github.com/toolkits/pkg/logger"
|
||||||
|
)
|
||||||
|
|
||||||
|
func IamLeader() (bool, error) {
|
||||||
|
servers, err := ActiveServers(context.Background(), config.C.ClusterName)
|
||||||
|
if err != nil {
|
||||||
|
logger.Errorf("failed to get active servers: %v", err)
|
||||||
|
return false, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(servers) == 0 {
|
||||||
|
logger.Errorf("active servers empty")
|
||||||
|
return false, err
|
||||||
|
}
|
||||||
|
|
||||||
|
sort.Strings(servers)
|
||||||
|
|
||||||
|
return config.C.Heartbeat.Endpoint == servers[0], nil
|
||||||
|
}
|
|
@ -134,7 +134,7 @@ func (s Server) initialize() (func(), error) {
|
||||||
memsto.Sync()
|
memsto.Sync()
|
||||||
|
|
||||||
// start heartbeat
|
// start heartbeat
|
||||||
if err = naming.Heartbeat(ctx, config.C.Heartbeat); err != nil {
|
if err = naming.Heartbeat(ctx); err != nil {
|
||||||
return fns.Ret(), err
|
return fns.Ret(), err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue