feat: 告警处理出错给Maintainer管理员发送告警信息 (#955)
* feat: 告警处理出错给管理员发送告警信息 * feat: 告警处理出错给管理员发送告警信息,发送信息自己拼接,不使用模版 * feat: 告警处理出错给管理员发送告警信息,不使用AlertCurEvent结构 * feat: 告警处理出错给管理员发送告警信息,日志打印、文本发送优化
This commit is contained in:
parent
070e5051c6
commit
e22a4394f7
|
@ -14,6 +14,7 @@ CREATE TABLE `users` (
|
||||||
`portrait` varchar(255) not null default '' comment 'portrait image url',
|
`portrait` varchar(255) not null default '' comment 'portrait image url',
|
||||||
`roles` varchar(255) not null comment 'Admin | Standard | Guest, split by space',
|
`roles` varchar(255) not null comment 'Admin | Standard | Guest, split by space',
|
||||||
`contacts` varchar(1024) comment 'json e.g. {wecom:xx, dingtalk_robot_token:yy}',
|
`contacts` varchar(1024) comment 'json e.g. {wecom:xx, dingtalk_robot_token:yy}',
|
||||||
|
`maintainer` tinyint(1) not null default 0,
|
||||||
`create_at` bigint not null default 0,
|
`create_at` bigint not null default 0,
|
||||||
`create_by` varchar(64) not null default '',
|
`create_by` varchar(64) not null default '',
|
||||||
`update_at` bigint not null default 0,
|
`update_at` bigint not null default 0,
|
||||||
|
|
|
@ -17,21 +17,22 @@ import (
|
||||||
)
|
)
|
||||||
|
|
||||||
type User struct {
|
type User struct {
|
||||||
Id int64 `json:"id" gorm:"primaryKey"`
|
Id int64 `json:"id" gorm:"primaryKey"`
|
||||||
Username string `json:"username"`
|
Username string `json:"username"`
|
||||||
Nickname string `json:"nickname"`
|
Nickname string `json:"nickname"`
|
||||||
Password string `json:"-"`
|
Password string `json:"-"`
|
||||||
Phone string `json:"phone"`
|
Phone string `json:"phone"`
|
||||||
Email string `json:"email"`
|
Email string `json:"email"`
|
||||||
Portrait string `json:"portrait"`
|
Portrait string `json:"portrait"`
|
||||||
Roles string `json:"-"` // 这个字段写入数据库
|
Roles string `json:"-"` // 这个字段写入数据库
|
||||||
RolesLst []string `json:"roles" gorm:"-"` // 这个字段和前端交互
|
RolesLst []string `json:"roles" gorm:"-"` // 这个字段和前端交互
|
||||||
Contacts ormx.JSONObj `json:"contacts"` // 内容为 map[string]string 结构
|
Contacts ormx.JSONObj `json:"contacts"` // 内容为 map[string]string 结构
|
||||||
CreateAt int64 `json:"create_at"`
|
Maintainer int `json:"maintainer"` // 是否给管理员发消息 0:not send 1:send
|
||||||
CreateBy string `json:"create_by"`
|
CreateAt int64 `json:"create_at"`
|
||||||
UpdateAt int64 `json:"update_at"`
|
CreateBy string `json:"create_by"`
|
||||||
UpdateBy string `json:"update_by"`
|
UpdateAt int64 `json:"update_at"`
|
||||||
Admin bool `json:"admin" gorm:"-"` // 方便前端使用
|
UpdateBy string `json:"update_by"`
|
||||||
|
Admin bool `json:"admin" gorm:"-"` // 方便前端使用
|
||||||
}
|
}
|
||||||
|
|
||||||
func (u *User) TableName() string {
|
func (u *User) TableName() string {
|
||||||
|
|
|
@ -0,0 +1,108 @@
|
||||||
|
package engine
|
||||||
|
|
||||||
|
import (
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/didi/nightingale/v5/src/server/common/sender"
|
||||||
|
"github.com/didi/nightingale/v5/src/server/config"
|
||||||
|
"github.com/didi/nightingale/v5/src/server/memsto"
|
||||||
|
"github.com/tidwall/gjson"
|
||||||
|
"github.com/toolkits/pkg/logger"
|
||||||
|
)
|
||||||
|
|
||||||
|
// notify to maintainer to handle the error
|
||||||
|
func notifyToMaintainer(e error, title string) {
|
||||||
|
|
||||||
|
logger.Errorf("notifyToMaintainer,title:%s, error:%v", title, e)
|
||||||
|
|
||||||
|
if len(config.C.Alerting.NotifyBuiltinChannels) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
maintainerUsers := memsto.UserCache.GetMaintainerUsers()
|
||||||
|
if len(maintainerUsers) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
emailset := make(map[string]struct{})
|
||||||
|
phoneset := make(map[string]struct{})
|
||||||
|
wecomset := make(map[string]struct{})
|
||||||
|
dingtalkset := make(map[string]struct{})
|
||||||
|
feishuset := make(map[string]struct{})
|
||||||
|
|
||||||
|
for _, user := range maintainerUsers {
|
||||||
|
if user.Email != "" {
|
||||||
|
emailset[user.Email] = struct{}{}
|
||||||
|
}
|
||||||
|
|
||||||
|
if user.Phone != "" {
|
||||||
|
phoneset[user.Phone] = struct{}{}
|
||||||
|
}
|
||||||
|
|
||||||
|
bs, err := user.Contacts.MarshalJSON()
|
||||||
|
if err != nil {
|
||||||
|
logger.Errorf("handle_notice: failed to marshal contacts: %v", err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
ret := gjson.GetBytes(bs, "dingtalk_robot_token")
|
||||||
|
if ret.Exists() {
|
||||||
|
dingtalkset[ret.String()] = struct{}{}
|
||||||
|
}
|
||||||
|
|
||||||
|
ret = gjson.GetBytes(bs, "wecom_robot_token")
|
||||||
|
if ret.Exists() {
|
||||||
|
wecomset[ret.String()] = struct{}{}
|
||||||
|
}
|
||||||
|
|
||||||
|
ret = gjson.GetBytes(bs, "feishu_robot_token")
|
||||||
|
if ret.Exists() {
|
||||||
|
feishuset[ret.String()] = struct{}{}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
phones := StringSetKeys(phoneset)
|
||||||
|
triggerTime := time.Now().Format("2006/01/02 - 15:04:05")
|
||||||
|
|
||||||
|
for _, ch := range config.C.Alerting.NotifyBuiltinChannels {
|
||||||
|
switch ch {
|
||||||
|
case "email":
|
||||||
|
if len(emailset) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
content := "【内部处理错误】当前标题: " + title + "\n【内部处理错误】当前异常: " + e.Error() + "\n【内部处理错误】发送时间: " + triggerTime
|
||||||
|
sender.WriteEmail(title, content, StringSetKeys(emailset))
|
||||||
|
case "dingtalk":
|
||||||
|
if len(dingtalkset) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
content := "**【内部处理错误】当前标题: **" + title + "\n**【内部处理错误】当前异常: **" + e.Error() + "\n**【内部处理错误】发送时间: **" + triggerTime
|
||||||
|
sender.SendDingtalk(sender.DingtalkMessage{
|
||||||
|
Title: title,
|
||||||
|
Text: content,
|
||||||
|
AtMobiles: phones,
|
||||||
|
Tokens: StringSetKeys(dingtalkset),
|
||||||
|
})
|
||||||
|
case "wecom":
|
||||||
|
if len(wecomset) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
content := "**【内部处理错误】当前标题: **" + title + "\n**【内部处理错误】当前异常: **" + e.Error() + "\n**【内部处理错误】发送时间: **" + triggerTime
|
||||||
|
sender.SendWecom(sender.WecomMessage{
|
||||||
|
Text: content,
|
||||||
|
Tokens: StringSetKeys(wecomset),
|
||||||
|
})
|
||||||
|
case "feishu":
|
||||||
|
if len(feishuset) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
content := "【内部处理错误】当前标题: " + title + "\n【内部处理错误】当前异常: " + e.Error() + "\n【内部处理错误】发送时间: " + triggerTime
|
||||||
|
sender.SendFeishu(sender.FeishuMessage{
|
||||||
|
Text: content,
|
||||||
|
AtMobiles: phones,
|
||||||
|
Tokens: StringSetKeys(feishuset),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -39,6 +39,7 @@ func loopFilterRules(ctx context.Context) {
|
||||||
|
|
||||||
func filterRules() {
|
func filterRules() {
|
||||||
ids := memsto.AlertRuleCache.GetRuleIds()
|
ids := memsto.AlertRuleCache.GetRuleIds()
|
||||||
|
logger.Infof("AlertRuleCache.GetRuleIds success,ids.len: %d", len(ids))
|
||||||
|
|
||||||
count := len(ids)
|
count := len(ids)
|
||||||
mines := make([]int64, 0, count)
|
mines := make([]int64, 0, count)
|
||||||
|
@ -83,6 +84,7 @@ func (r RuleEval) Start() {
|
||||||
return
|
return
|
||||||
default:
|
default:
|
||||||
r.Work()
|
r.Work()
|
||||||
|
logger.Infof("rule executed,rule_id=%d", r.RuleID())
|
||||||
interval := r.rule.PromEvalInterval
|
interval := r.rule.PromEvalInterval
|
||||||
if interval <= 0 {
|
if interval <= 0 {
|
||||||
interval = 10
|
interval = 10
|
||||||
|
@ -111,6 +113,8 @@ func (r RuleEval) Work() {
|
||||||
value, warnings, err = reader.Reader.Client.Query(context.Background(), promql, time.Now())
|
value, warnings, err = reader.Reader.Client.Query(context.Background(), promql, time.Now())
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logger.Errorf("rule_eval:%d promql:%s, error:%v", r.RuleID(), promql, err)
|
logger.Errorf("rule_eval:%d promql:%s, error:%v", r.RuleID(), promql, err)
|
||||||
|
// 告警查询prometheus逻辑出错,发告警信息给管理员
|
||||||
|
notifyToMaintainer(err, "查询prometheus出错")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -182,6 +186,7 @@ func (ws *WorkersType) Build(rids []int64) {
|
||||||
elst, err := models.AlertCurEventGetByRule(rules[hash].Id)
|
elst, err := models.AlertCurEventGetByRule(rules[hash].Id)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logger.Errorf("worker_build: AlertCurEventGetByRule failed: %v", err)
|
logger.Errorf("worker_build: AlertCurEventGetByRule failed: %v", err)
|
||||||
|
notifyToMaintainer(err, "AlertCurEventGetByRule Error,ruleID="+fmt.Sprint(rules[hash].Id))
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -78,6 +78,24 @@ func (uc *UserCacheType) GetByUserIds(ids []int64) []*models.User {
|
||||||
return users
|
return users
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (uc *UserCacheType) GetMaintainerUsers() []*models.User {
|
||||||
|
uc.RLock()
|
||||||
|
defer uc.RUnlock()
|
||||||
|
|
||||||
|
var users []*models.User
|
||||||
|
for _, v := range uc.users {
|
||||||
|
if v.Maintainer == 1 {
|
||||||
|
users = append(users, v)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if users == nil {
|
||||||
|
users = []*models.User{}
|
||||||
|
}
|
||||||
|
|
||||||
|
return users
|
||||||
|
}
|
||||||
|
|
||||||
func SyncUsers() {
|
func SyncUsers() {
|
||||||
err := syncUsers()
|
err := syncUsers()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|
Loading…
Reference in New Issue