diff --git a/docker/initsql/a-n9e.sql b/docker/initsql/a-n9e.sql index 543af628..6561fc45 100644 --- a/docker/initsql/a-n9e.sql +++ b/docker/initsql/a-n9e.sql @@ -14,6 +14,7 @@ CREATE TABLE `users` ( `portrait` varchar(255) not null default '' comment 'portrait image url', `roles` varchar(255) not null comment 'Admin | Standard | Guest, split by space', `contacts` varchar(1024) comment 'json e.g. {wecom:xx, dingtalk_robot_token:yy}', + `maintainer` tinyint(1) not null default 0, `create_at` bigint not null default 0, `create_by` varchar(64) not null default '', `update_at` bigint not null default 0, diff --git a/src/models/user.go b/src/models/user.go index 6cca7efb..c82df91e 100644 --- a/src/models/user.go +++ b/src/models/user.go @@ -17,21 +17,22 @@ import ( ) type User struct { - Id int64 `json:"id" gorm:"primaryKey"` - Username string `json:"username"` - Nickname string `json:"nickname"` - Password string `json:"-"` - Phone string `json:"phone"` - Email string `json:"email"` - Portrait string `json:"portrait"` - Roles string `json:"-"` // 这个字段写入数据库 - RolesLst []string `json:"roles" gorm:"-"` // 这个字段和前端交互 - Contacts ormx.JSONObj `json:"contacts"` // 内容为 map[string]string 结构 - CreateAt int64 `json:"create_at"` - CreateBy string `json:"create_by"` - UpdateAt int64 `json:"update_at"` - UpdateBy string `json:"update_by"` - Admin bool `json:"admin" gorm:"-"` // 方便前端使用 + Id int64 `json:"id" gorm:"primaryKey"` + Username string `json:"username"` + Nickname string `json:"nickname"` + Password string `json:"-"` + Phone string `json:"phone"` + Email string `json:"email"` + Portrait string `json:"portrait"` + Roles string `json:"-"` // 这个字段写入数据库 + RolesLst []string `json:"roles" gorm:"-"` // 这个字段和前端交互 + Contacts ormx.JSONObj `json:"contacts"` // 内容为 map[string]string 结构 + Maintainer int `json:"maintainer"` // 是否给管理员发消息 0:not send 1:send + CreateAt int64 `json:"create_at"` + CreateBy string `json:"create_by"` + UpdateAt int64 `json:"update_at"` + UpdateBy string `json:"update_by"` + Admin bool `json:"admin" gorm:"-"` // 方便前端使用 } func (u *User) TableName() string { diff --git a/src/server/engine/notify_maintainer.go b/src/server/engine/notify_maintainer.go new file mode 100644 index 00000000..c2a1cae8 --- /dev/null +++ b/src/server/engine/notify_maintainer.go @@ -0,0 +1,108 @@ +package engine + +import ( + "time" + + "github.com/didi/nightingale/v5/src/server/common/sender" + "github.com/didi/nightingale/v5/src/server/config" + "github.com/didi/nightingale/v5/src/server/memsto" + "github.com/tidwall/gjson" + "github.com/toolkits/pkg/logger" +) + +// notify to maintainer to handle the error +func notifyToMaintainer(e error, title string) { + + logger.Errorf("notifyToMaintainer,title:%s, error:%v", title, e) + + if len(config.C.Alerting.NotifyBuiltinChannels) == 0 { + return + } + + maintainerUsers := memsto.UserCache.GetMaintainerUsers() + if len(maintainerUsers) == 0 { + return + } + + emailset := make(map[string]struct{}) + phoneset := make(map[string]struct{}) + wecomset := make(map[string]struct{}) + dingtalkset := make(map[string]struct{}) + feishuset := make(map[string]struct{}) + + for _, user := range maintainerUsers { + if user.Email != "" { + emailset[user.Email] = struct{}{} + } + + if user.Phone != "" { + phoneset[user.Phone] = struct{}{} + } + + bs, err := user.Contacts.MarshalJSON() + if err != nil { + logger.Errorf("handle_notice: failed to marshal contacts: %v", err) + continue + } + + ret := gjson.GetBytes(bs, "dingtalk_robot_token") + if ret.Exists() { + dingtalkset[ret.String()] = struct{}{} + } + + ret = gjson.GetBytes(bs, "wecom_robot_token") + if ret.Exists() { + wecomset[ret.String()] = struct{}{} + } + + ret = gjson.GetBytes(bs, "feishu_robot_token") + if ret.Exists() { + feishuset[ret.String()] = struct{}{} + } + } + + phones := StringSetKeys(phoneset) + triggerTime := time.Now().Format("2006/01/02 - 15:04:05") + + for _, ch := range config.C.Alerting.NotifyBuiltinChannels { + switch ch { + case "email": + if len(emailset) == 0 { + continue + } + content := "【内部处理错误】当前标题: " + title + "\n【内部处理错误】当前异常: " + e.Error() + "\n【内部处理错误】发送时间: " + triggerTime + sender.WriteEmail(title, content, StringSetKeys(emailset)) + case "dingtalk": + if len(dingtalkset) == 0 { + continue + } + content := "**【内部处理错误】当前标题: **" + title + "\n**【内部处理错误】当前异常: **" + e.Error() + "\n**【内部处理错误】发送时间: **" + triggerTime + sender.SendDingtalk(sender.DingtalkMessage{ + Title: title, + Text: content, + AtMobiles: phones, + Tokens: StringSetKeys(dingtalkset), + }) + case "wecom": + if len(wecomset) == 0 { + continue + } + content := "**【内部处理错误】当前标题: **" + title + "\n**【内部处理错误】当前异常: **" + e.Error() + "\n**【内部处理错误】发送时间: **" + triggerTime + sender.SendWecom(sender.WecomMessage{ + Text: content, + Tokens: StringSetKeys(wecomset), + }) + case "feishu": + if len(feishuset) == 0 { + continue + } + + content := "【内部处理错误】当前标题: " + title + "\n【内部处理错误】当前异常: " + e.Error() + "\n【内部处理错误】发送时间: " + triggerTime + sender.SendFeishu(sender.FeishuMessage{ + Text: content, + AtMobiles: phones, + Tokens: StringSetKeys(feishuset), + }) + } + } +} diff --git a/src/server/engine/worker.go b/src/server/engine/worker.go index 84e2be2e..85054222 100644 --- a/src/server/engine/worker.go +++ b/src/server/engine/worker.go @@ -39,6 +39,7 @@ func loopFilterRules(ctx context.Context) { func filterRules() { ids := memsto.AlertRuleCache.GetRuleIds() + logger.Infof("AlertRuleCache.GetRuleIds success,ids.len: %d", len(ids)) count := len(ids) mines := make([]int64, 0, count) @@ -83,6 +84,7 @@ func (r RuleEval) Start() { return default: r.Work() + logger.Infof("rule executed,rule_id=%d", r.RuleID()) interval := r.rule.PromEvalInterval if interval <= 0 { interval = 10 @@ -111,6 +113,8 @@ func (r RuleEval) Work() { value, warnings, err = reader.Reader.Client.Query(context.Background(), promql, time.Now()) if err != nil { logger.Errorf("rule_eval:%d promql:%s, error:%v", r.RuleID(), promql, err) + // 告警查询prometheus逻辑出错,发告警信息给管理员 + notifyToMaintainer(err, "查询prometheus出错") return } @@ -182,6 +186,7 @@ func (ws *WorkersType) Build(rids []int64) { elst, err := models.AlertCurEventGetByRule(rules[hash].Id) if err != nil { logger.Errorf("worker_build: AlertCurEventGetByRule failed: %v", err) + notifyToMaintainer(err, "AlertCurEventGetByRule Error,ruleID="+fmt.Sprint(rules[hash].Id)) continue } diff --git a/src/server/memsto/user_cache.go b/src/server/memsto/user_cache.go index 21d89c7f..4cee8c2b 100644 --- a/src/server/memsto/user_cache.go +++ b/src/server/memsto/user_cache.go @@ -78,6 +78,24 @@ func (uc *UserCacheType) GetByUserIds(ids []int64) []*models.User { return users } +func (uc *UserCacheType) GetMaintainerUsers() []*models.User { + uc.RLock() + defer uc.RUnlock() + + var users []*models.User + for _, v := range uc.users { + if v.Maintainer == 1 { + users = append(users, v) + } + } + + if users == nil { + users = []*models.User{} + } + + return users +} + func SyncUsers() { err := syncUsers() if err != nil {