feat: 告警处理出错给Maintainer管理员发送告警信息 (#955)

* feat: 告警处理出错给管理员发送告警信息

* feat: 告警处理出错给管理员发送告警信息,发送信息自己拼接,不使用模版

* feat: 告警处理出错给管理员发送告警信息,不使用AlertCurEvent结构

* feat: 告警处理出错给管理员发送告警信息,日志打印、文本发送优化
This commit is contained in:
caojiaqiang 2022-05-27 19:00:41 +08:00 committed by GitHub
parent 070e5051c6
commit e22a4394f7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 148 additions and 15 deletions

View File

@ -14,6 +14,7 @@ CREATE TABLE `users` (
`portrait` varchar(255) not null default '' comment 'portrait image url',
`roles` varchar(255) not null comment 'Admin | Standard | Guest, split by space',
`contacts` varchar(1024) comment 'json e.g. {wecom:xx, dingtalk_robot_token:yy}',
`maintainer` tinyint(1) not null default 0,
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,

View File

@ -27,6 +27,7 @@ type User struct {
Roles string `json:"-"` // 这个字段写入数据库
RolesLst []string `json:"roles" gorm:"-"` // 这个字段和前端交互
Contacts ormx.JSONObj `json:"contacts"` // 内容为 map[string]string 结构
Maintainer int `json:"maintainer"` // 是否给管理员发消息 0:not send 1:send
CreateAt int64 `json:"create_at"`
CreateBy string `json:"create_by"`
UpdateAt int64 `json:"update_at"`

View File

@ -0,0 +1,108 @@
package engine
import (
"time"
"github.com/didi/nightingale/v5/src/server/common/sender"
"github.com/didi/nightingale/v5/src/server/config"
"github.com/didi/nightingale/v5/src/server/memsto"
"github.com/tidwall/gjson"
"github.com/toolkits/pkg/logger"
)
// notifyToMaintainer logs an internal processing error and forwards it to the
// users returned by memsto.UserCache.GetMaintainerUsers, using each channel
// listed in config.C.Alerting.NotifyBuiltinChannels.
//
// e is the error being reported and title is a short description of where it
// happened; title is also used as the email subject and the dingtalk title.
// The message text is assembled here by plain concatenation rather than
// rendered from a template.
//
// The function returns without sending anything when e is nil, when no builtin
// channel is configured, or when there are no maintainer users. A channel is
// skipped when no maintainer has a contact for it.
func notifyToMaintainer(e error, title string) {
	logger.Errorf("notifyToMaintainertitle:%s, error:%v", title, e)

	// A nil error carries nothing to report, and calling e.Error() on it
	// below would panic.
	if e == nil {
		return
	}

	if len(config.C.Alerting.NotifyBuiltinChannels) == 0 {
		return
	}

	maintainerUsers := memsto.UserCache.GetMaintainerUsers()
	if len(maintainerUsers) == 0 {
		return
	}

	// Sets de-duplicate contacts shared by several maintainers.
	emailset := make(map[string]struct{})
	phoneset := make(map[string]struct{})
	wecomset := make(map[string]struct{})
	dingtalkset := make(map[string]struct{})
	feishuset := make(map[string]struct{})

	// addToken stores the value found under key in the contacts JSON. A
	// missing, null or empty value yields "" and is ignored, so no send is
	// attempted with a blank robot token.
	addToken := func(contacts []byte, key string, set map[string]struct{}) {
		if token := gjson.GetBytes(contacts, key).String(); token != "" {
			set[token] = struct{}{}
		}
	}

	for _, user := range maintainerUsers {
		if user.Email != "" {
			emailset[user.Email] = struct{}{}
		}

		if user.Phone != "" {
			phoneset[user.Phone] = struct{}{}
		}

		bs, err := user.Contacts.MarshalJSON()
		if err != nil {
			// Email and phone were already recorded; only the robot
			// tokens of this user are lost.
			logger.Errorf("handle_notice: failed to marshal contacts: %v", err)
			continue
		}

		addToken(bs, "dingtalk_robot_token", dingtalkset)
		addToken(bs, "wecom_robot_token", wecomset)
		addToken(bs, "feishu_robot_token", feishuset)
	}

	phones := StringSetKeys(phoneset)
	triggerTime := time.Now().Format("2006/01/02 - 15:04:05")
	errText := e.Error()

	// The text is loop-invariant, so build both variants once: a plain one
	// for email/feishu and one with bold markers for dingtalk/wecom.
	plainContent := "【内部处理错误】当前标题: " + title + "\n【内部处理错误】当前异常: " + errText + "\n【内部处理错误】发送时间: " + triggerTime
	boldContent := "**【内部处理错误】当前标题: **" + title + "\n**【内部处理错误】当前异常: **" + errText + "\n**【内部处理错误】发送时间: **" + triggerTime

	for _, ch := range config.C.Alerting.NotifyBuiltinChannels {
		switch ch {
		case "email":
			if len(emailset) == 0 {
				continue
			}
			sender.WriteEmail(title, plainContent, StringSetKeys(emailset))
		case "dingtalk":
			if len(dingtalkset) == 0 {
				continue
			}
			sender.SendDingtalk(sender.DingtalkMessage{
				Title:     title,
				Text:      boldContent,
				AtMobiles: phones,
				Tokens:    StringSetKeys(dingtalkset),
			})
		case "wecom":
			if len(wecomset) == 0 {
				continue
			}
			sender.SendWecom(sender.WecomMessage{
				Text:   boldContent,
				Tokens: StringSetKeys(wecomset),
			})
		case "feishu":
			if len(feishuset) == 0 {
				continue
			}
			sender.SendFeishu(sender.FeishuMessage{
				Text:      plainContent,
				AtMobiles: phones,
				Tokens:    StringSetKeys(feishuset),
			})
		}
	}
}

View File

@ -39,6 +39,7 @@ func loopFilterRules(ctx context.Context) {
func filterRules() {
ids := memsto.AlertRuleCache.GetRuleIds()
logger.Infof("AlertRuleCache.GetRuleIds successids.len: %d", len(ids))
count := len(ids)
mines := make([]int64, 0, count)
@ -83,6 +84,7 @@ func (r RuleEval) Start() {
return
default:
r.Work()
logger.Infof("rule executedrule_id=%d", r.RuleID())
interval := r.rule.PromEvalInterval
if interval <= 0 {
interval = 10
@ -111,6 +113,8 @@ func (r RuleEval) Work() {
value, warnings, err = reader.Reader.Client.Query(context.Background(), promql, time.Now())
if err != nil {
logger.Errorf("rule_eval:%d promql:%s, error:%v", r.RuleID(), promql, err)
// 告警查询prometheus逻辑出错发告警信息给管理员
notifyToMaintainer(err, "查询prometheus出错")
return
}
@ -182,6 +186,7 @@ func (ws *WorkersType) Build(rids []int64) {
elst, err := models.AlertCurEventGetByRule(rules[hash].Id)
if err != nil {
logger.Errorf("worker_build: AlertCurEventGetByRule failed: %v", err)
notifyToMaintainer(err, "AlertCurEventGetByRule ErrorruleID="+fmt.Sprint(rules[hash].Id))
continue
}

View File

@ -78,6 +78,24 @@ func (uc *UserCacheType) GetByUserIds(ids []int64) []*models.User {
return users
}
// GetMaintainerUsers returns the cached users whose Maintainer field equals 1.
// uc.RLock is held while uc.users is scanned. The result is never nil: when no
// cached user is flagged, an empty slice is returned.
func (uc *UserCacheType) GetMaintainerUsers() []*models.User {
	// Start from an empty, non-nil slice so callers never receive nil.
	maintainers := make([]*models.User, 0)

	uc.RLock()
	defer uc.RUnlock()

	for _, u := range uc.users {
		if u.Maintainer != 1 {
			continue
		}
		maintainers = append(maintainers, u)
	}

	return maintainers
}
func SyncUsers() {
err := syncUsers()
if err != nil {