feat: 告警处理出错给Maintainer管理员发送告警信息 (#955)
* feat: 告警处理出错给管理员发送告警信息 * feat: 告警处理出错给管理员发送告警信息,发送信息自己拼接,不使用模版 * feat: 告警处理出错给管理员发送告警信息,不使用AlertCurEvent结构 * feat: 告警处理出错给管理员发送告警信息,日志打印、文本发送优化
This commit is contained in:
parent
070e5051c6
commit
e22a4394f7
|
@ -14,6 +14,7 @@ CREATE TABLE `users` (
|
|||
`portrait` varchar(255) not null default '' comment 'portrait image url',
|
||||
`roles` varchar(255) not null comment 'Admin | Standard | Guest, split by space',
|
||||
`contacts` varchar(1024) comment 'json e.g. {wecom:xx, dingtalk_robot_token:yy}',
|
||||
`maintainer` tinyint(1) not null default 0,
|
||||
`create_at` bigint not null default 0,
|
||||
`create_by` varchar(64) not null default '',
|
||||
`update_at` bigint not null default 0,
|
||||
|
|
|
@ -17,21 +17,22 @@ import (
|
|||
)
|
||||
|
||||
// User lists the account fields together with their json/gorm tags.
// NOTE(review): this span is a diff rendering, so the field list appears twice:
// first the pre-change list, then the post-change list that adds Maintainer.
type User struct {
|
||||
Id int64 `json:"id" gorm:"primaryKey"`
|
||||
Username string `json:"username"`
|
||||
Nickname string `json:"nickname"`
|
||||
Password string `json:"-"`
|
||||
Phone string `json:"phone"`
|
||||
Email string `json:"email"`
|
||||
Portrait string `json:"portrait"`
|
||||
Roles string `json:"-"` // this field is written to the database
|
||||
RolesLst []string `json:"roles" gorm:"-"` // this field is exchanged with the frontend
|
||||
Contacts ormx.JSONObj `json:"contacts"` // content is a map[string]string structure
|
||||
CreateAt int64 `json:"create_at"`
|
||||
CreateBy string `json:"create_by"`
|
||||
UpdateAt int64 `json:"update_at"`
|
||||
UpdateBy string `json:"update_by"`
|
||||
Admin bool `json:"admin" gorm:"-"` // convenience field for the frontend
|
||||
Id int64 `json:"id" gorm:"primaryKey"`
|
||||
Username string `json:"username"`
|
||||
Nickname string `json:"nickname"`
|
||||
Password string `json:"-"`
|
||||
Phone string `json:"phone"`
|
||||
Email string `json:"email"`
|
||||
Portrait string `json:"portrait"`
|
||||
Roles string `json:"-"` // this field is written to the database
|
||||
RolesLst []string `json:"roles" gorm:"-"` // this field is exchanged with the frontend
|
||||
Contacts ormx.JSONObj `json:"contacts"` // content is a map[string]string structure
|
||||
Maintainer int `json:"maintainer"` // whether to send messages to this administrator 0:not send 1:send
|
||||
CreateAt int64 `json:"create_at"`
|
||||
CreateBy string `json:"create_by"`
|
||||
UpdateAt int64 `json:"update_at"`
|
||||
UpdateBy string `json:"update_by"`
|
||||
Admin bool `json:"admin" gorm:"-"` // convenience field for the frontend
|
||||
}
|
||||
|
||||
func (u *User) TableName() string {
|
||||
|
|
|
@ -0,0 +1,108 @@
|
|||
package engine
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"github.com/didi/nightingale/v5/src/server/common/sender"
|
||||
"github.com/didi/nightingale/v5/src/server/config"
|
||||
"github.com/didi/nightingale/v5/src/server/memsto"
|
||||
"github.com/tidwall/gjson"
|
||||
"github.com/toolkits/pkg/logger"
|
||||
)
|
||||
|
||||
// notify to maintainer to handle the error
|
||||
func notifyToMaintainer(e error, title string) {
|
||||
|
||||
logger.Errorf("notifyToMaintainer,title:%s, error:%v", title, e)
|
||||
|
||||
if len(config.C.Alerting.NotifyBuiltinChannels) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
maintainerUsers := memsto.UserCache.GetMaintainerUsers()
|
||||
if len(maintainerUsers) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
emailset := make(map[string]struct{})
|
||||
phoneset := make(map[string]struct{})
|
||||
wecomset := make(map[string]struct{})
|
||||
dingtalkset := make(map[string]struct{})
|
||||
feishuset := make(map[string]struct{})
|
||||
|
||||
for _, user := range maintainerUsers {
|
||||
if user.Email != "" {
|
||||
emailset[user.Email] = struct{}{}
|
||||
}
|
||||
|
||||
if user.Phone != "" {
|
||||
phoneset[user.Phone] = struct{}{}
|
||||
}
|
||||
|
||||
bs, err := user.Contacts.MarshalJSON()
|
||||
if err != nil {
|
||||
logger.Errorf("handle_notice: failed to marshal contacts: %v", err)
|
||||
continue
|
||||
}
|
||||
|
||||
ret := gjson.GetBytes(bs, "dingtalk_robot_token")
|
||||
if ret.Exists() {
|
||||
dingtalkset[ret.String()] = struct{}{}
|
||||
}
|
||||
|
||||
ret = gjson.GetBytes(bs, "wecom_robot_token")
|
||||
if ret.Exists() {
|
||||
wecomset[ret.String()] = struct{}{}
|
||||
}
|
||||
|
||||
ret = gjson.GetBytes(bs, "feishu_robot_token")
|
||||
if ret.Exists() {
|
||||
feishuset[ret.String()] = struct{}{}
|
||||
}
|
||||
}
|
||||
|
||||
phones := StringSetKeys(phoneset)
|
||||
triggerTime := time.Now().Format("2006/01/02 - 15:04:05")
|
||||
|
||||
for _, ch := range config.C.Alerting.NotifyBuiltinChannels {
|
||||
switch ch {
|
||||
case "email":
|
||||
if len(emailset) == 0 {
|
||||
continue
|
||||
}
|
||||
content := "【内部处理错误】当前标题: " + title + "\n【内部处理错误】当前异常: " + e.Error() + "\n【内部处理错误】发送时间: " + triggerTime
|
||||
sender.WriteEmail(title, content, StringSetKeys(emailset))
|
||||
case "dingtalk":
|
||||
if len(dingtalkset) == 0 {
|
||||
continue
|
||||
}
|
||||
content := "**【内部处理错误】当前标题: **" + title + "\n**【内部处理错误】当前异常: **" + e.Error() + "\n**【内部处理错误】发送时间: **" + triggerTime
|
||||
sender.SendDingtalk(sender.DingtalkMessage{
|
||||
Title: title,
|
||||
Text: content,
|
||||
AtMobiles: phones,
|
||||
Tokens: StringSetKeys(dingtalkset),
|
||||
})
|
||||
case "wecom":
|
||||
if len(wecomset) == 0 {
|
||||
continue
|
||||
}
|
||||
content := "**【内部处理错误】当前标题: **" + title + "\n**【内部处理错误】当前异常: **" + e.Error() + "\n**【内部处理错误】发送时间: **" + triggerTime
|
||||
sender.SendWecom(sender.WecomMessage{
|
||||
Text: content,
|
||||
Tokens: StringSetKeys(wecomset),
|
||||
})
|
||||
case "feishu":
|
||||
if len(feishuset) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
content := "【内部处理错误】当前标题: " + title + "\n【内部处理错误】当前异常: " + e.Error() + "\n【内部处理错误】发送时间: " + triggerTime
|
||||
sender.SendFeishu(sender.FeishuMessage{
|
||||
Text: content,
|
||||
AtMobiles: phones,
|
||||
Tokens: StringSetKeys(feishuset),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
|
@ -39,6 +39,7 @@ func loopFilterRules(ctx context.Context) {
|
|||
|
||||
func filterRules() {
|
||||
ids := memsto.AlertRuleCache.GetRuleIds()
|
||||
logger.Infof("AlertRuleCache.GetRuleIds success,ids.len: %d", len(ids))
|
||||
|
||||
count := len(ids)
|
||||
mines := make([]int64, 0, count)
|
||||
|
@ -83,6 +84,7 @@ func (r RuleEval) Start() {
|
|||
return
|
||||
default:
|
||||
r.Work()
|
||||
logger.Infof("rule executed,rule_id=%d", r.RuleID())
|
||||
interval := r.rule.PromEvalInterval
|
||||
if interval <= 0 {
|
||||
interval = 10
|
||||
|
@ -111,6 +113,8 @@ func (r RuleEval) Work() {
|
|||
value, warnings, err = reader.Reader.Client.Query(context.Background(), promql, time.Now())
|
||||
if err != nil {
|
||||
logger.Errorf("rule_eval:%d promql:%s, error:%v", r.RuleID(), promql, err)
|
||||
// 告警查询prometheus逻辑出错,发告警信息给管理员
|
||||
notifyToMaintainer(err, "查询prometheus出错")
|
||||
return
|
||||
}
|
||||
|
||||
|
@ -182,6 +186,7 @@ func (ws *WorkersType) Build(rids []int64) {
|
|||
elst, err := models.AlertCurEventGetByRule(rules[hash].Id)
|
||||
if err != nil {
|
||||
logger.Errorf("worker_build: AlertCurEventGetByRule failed: %v", err)
|
||||
notifyToMaintainer(err, "AlertCurEventGetByRule Error,ruleID="+fmt.Sprint(rules[hash].Id))
|
||||
continue
|
||||
}
|
||||
|
||||
|
|
|
@ -78,6 +78,24 @@ func (uc *UserCacheType) GetByUserIds(ids []int64) []*models.User {
|
|||
return users
|
||||
}
|
||||
|
||||
func (uc *UserCacheType) GetMaintainerUsers() []*models.User {
|
||||
uc.RLock()
|
||||
defer uc.RUnlock()
|
||||
|
||||
var users []*models.User
|
||||
for _, v := range uc.users {
|
||||
if v.Maintainer == 1 {
|
||||
users = append(users, v)
|
||||
}
|
||||
}
|
||||
|
||||
if users == nil {
|
||||
users = []*models.User{}
|
||||
}
|
||||
|
||||
return users
|
||||
}
|
||||
|
||||
func SyncUsers() {
|
||||
err := syncUsers()
|
||||
if err != nil {
|
||||
|
|
Loading…
Reference in New Issue