optimize error report (#1109)
* optimize error report * code refactor * add /-/reload as reload route like prometheus Co-authored-by: ziv <xiaozheng@tuya.com>
This commit is contained in:
parent
635369e3fd
commit
3b5c8d8357
|
@ -2,6 +2,7 @@ package engine
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"fmt"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/toolkits/pkg/logger"
|
"github.com/toolkits/pkg/logger"
|
||||||
|
@ -27,6 +28,18 @@ func Start(ctx context.Context) error {
|
||||||
|
|
||||||
go sender.StartEmailSender()
|
go sender.StartEmailSender()
|
||||||
|
|
||||||
|
go initReporter(func(em map[ErrorType]uint64) {
|
||||||
|
if len(em) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
title := fmt.Sprintf("server %s has some errors, please check server logs for detail", config.C.Heartbeat.IP)
|
||||||
|
msg := ""
|
||||||
|
for k, v := range em {
|
||||||
|
msg += fmt.Sprintf("error: %s, count: %d\n", k, v)
|
||||||
|
}
|
||||||
|
notifyToMaintainer(title, msg)
|
||||||
|
})
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -19,7 +19,22 @@ type MaintainMessage struct {
|
||||||
Content string `json:"content"`
|
Content string `json:"content"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func notifyMaintainerWithPlugin(e error, title, triggerTime string, users []*models.User) {
|
// notify to maintainer to handle the error
|
||||||
|
func notifyToMaintainer(title, msg string) {
|
||||||
|
logger.Errorf("notifyToMaintainer, msg: %s", msg)
|
||||||
|
|
||||||
|
users := memsto.UserCache.GetMaintainerUsers()
|
||||||
|
if len(users) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
triggerTime := time.Now().Format("2006/01/02 - 15:04:05")
|
||||||
|
|
||||||
|
notifyMaintainerWithPlugin(title, msg, triggerTime, users)
|
||||||
|
notifyMaintainerWithBuiltin(title, msg, triggerTime, users)
|
||||||
|
}
|
||||||
|
|
||||||
|
func notifyMaintainerWithPlugin(title, msg, triggerTime string, users []*models.User) {
|
||||||
if !config.C.Alerting.CallPlugin.Enable {
|
if !config.C.Alerting.CallPlugin.Enable {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
@ -27,7 +42,7 @@ func notifyMaintainerWithPlugin(e error, title, triggerTime string, users []*mod
|
||||||
stdinBytes, err := json.Marshal(MaintainMessage{
|
stdinBytes, err := json.Marshal(MaintainMessage{
|
||||||
Tos: users,
|
Tos: users,
|
||||||
Title: title,
|
Title: title,
|
||||||
Content: "Title: " + title + "\nContent: " + e.Error() + "\nTime: " + triggerTime,
|
Content: "Title: " + title + "\nContent: " + msg + "\nTime: " + triggerTime,
|
||||||
})
|
})
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -39,22 +54,7 @@ func notifyMaintainerWithPlugin(e error, title, triggerTime string, users []*mod
|
||||||
logger.Debugf("notify maintainer with plugin done")
|
logger.Debugf("notify maintainer with plugin done")
|
||||||
}
|
}
|
||||||
|
|
||||||
// notify to maintainer to handle the error
|
func notifyMaintainerWithBuiltin(title, msg, triggerTime string, users []*models.User) {
|
||||||
func notifyToMaintainer(e error, title string) {
|
|
||||||
logger.Errorf("notifyToMaintainer, title:%s, error:%v", title, e)
|
|
||||||
|
|
||||||
users := memsto.UserCache.GetMaintainerUsers()
|
|
||||||
if len(users) == 0 {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
triggerTime := time.Now().Format("2006/01/02 - 15:04:05")
|
|
||||||
|
|
||||||
notifyMaintainerWithPlugin(e, title, triggerTime, users)
|
|
||||||
notifyMaintainerWithBuiltin(e, title, triggerTime, users)
|
|
||||||
}
|
|
||||||
|
|
||||||
func notifyMaintainerWithBuiltin(e error, title, triggerTime string, users []*models.User) {
|
|
||||||
if len(config.C.Alerting.NotifyBuiltinChannels) == 0 {
|
if len(config.C.Alerting.NotifyBuiltinChannels) == 0 {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
@ -104,13 +104,13 @@ func notifyMaintainerWithBuiltin(e error, title, triggerTime string, users []*mo
|
||||||
if len(emailset) == 0 {
|
if len(emailset) == 0 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
content := "Title: " + title + "\nContent: " + e.Error() + "\nTime: " + triggerTime
|
content := "Title: " + title + "\nContent: " + msg + "\nTime: " + triggerTime
|
||||||
sender.WriteEmail(title, content, StringSetKeys(emailset))
|
sender.WriteEmail(title, content, StringSetKeys(emailset))
|
||||||
case "dingtalk":
|
case "dingtalk":
|
||||||
if len(dingtalkset) == 0 {
|
if len(dingtalkset) == 0 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
content := "**Title: **" + title + "\n**Content: **" + e.Error() + "\n**Time: **" + triggerTime
|
content := "**Title: **" + title + "\n**Content: **" + msg + "\n**Time: **" + triggerTime
|
||||||
sender.SendDingtalk(sender.DingtalkMessage{
|
sender.SendDingtalk(sender.DingtalkMessage{
|
||||||
Title: title,
|
Title: title,
|
||||||
Text: content,
|
Text: content,
|
||||||
|
@ -121,7 +121,7 @@ func notifyMaintainerWithBuiltin(e error, title, triggerTime string, users []*mo
|
||||||
if len(wecomset) == 0 {
|
if len(wecomset) == 0 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
content := "**Title: **" + title + "\n**Content: **" + e.Error() + "\n**Time: **" + triggerTime
|
content := "**Title: **" + title + "\n**Content: **" + msg + "\n**Time: **" + triggerTime
|
||||||
sender.SendWecom(sender.WecomMessage{
|
sender.SendWecom(sender.WecomMessage{
|
||||||
Text: content,
|
Text: content,
|
||||||
Tokens: StringSetKeys(wecomset),
|
Tokens: StringSetKeys(wecomset),
|
||||||
|
@ -131,7 +131,7 @@ func notifyMaintainerWithBuiltin(e error, title, triggerTime string, users []*mo
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
content := "Title: " + title + "\nContent: " + e.Error() + "\nTime: " + triggerTime
|
content := "Title: " + title + "\nContent: " + msg + "\nTime: " + triggerTime
|
||||||
sender.SendFeishu(sender.FeishuMessage{
|
sender.SendFeishu(sender.FeishuMessage{
|
||||||
Text: content,
|
Text: content,
|
||||||
AtMobiles: phones,
|
AtMobiles: phones,
|
||||||
|
|
|
@ -0,0 +1,65 @@
|
||||||
|
package engine
|
||||||
|
|
||||||
|
import (
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ErrorType names a category of error that is counted by the reporter.
type ErrorType string

// register new error here
const (
	QueryPrometheusError ErrorType = "QueryPrometheusError"
	RuntimeError         ErrorType = "RuntimeError"
)

// reporter accumulates per-type error counts and periodically hands the
// accumulated counts to a callback. The embedded mutex guards em and cb.
// The zero value is ready to use: the count map is allocated on first report.
type reporter struct {
	sync.Mutex
	em map[ErrorType]uint64
	cb func(em map[ErrorType]uint64)
}

// rp is the package-wide reporter used by Report.
var rp reporter

// initReporter installs cb as the callback of the package-wide reporter and
// then runs its reporting loop; it does not return, so callers start it in
// its own goroutine.
//
// The fields are set in place under the lock instead of overwriting rp as a
// whole: Report may already be running concurrently, and replacing the struct
// would copy its mutex and discard counts recorded before initialization.
func initReporter(cb func(em map[ErrorType]uint64)) {
	rp.Lock()
	rp.cb = cb
	if rp.em == nil {
		rp.em = make(map[ErrorType]uint64)
	}
	rp.Unlock()

	rp.Start()
}

// Report increments the counter of errorType on the package-wide reporter.
// It is safe to call before initReporter has run.
func Report(errorType ErrorType) {
	rp.report(errorType)
}

// reset returns the counts accumulated so far and starts a fresh, empty
// count map. It returns nil when nothing has been recorded.
func (r *reporter) reset() map[ErrorType]uint64 {
	r.Lock()
	defer r.Unlock()
	if len(r.em) == 0 {
		return nil
	}

	oem := r.em
	r.em = make(map[ErrorType]uint64)
	return oem
}

// report adds one to the counter of errorType.
func (r *reporter) report(errorType ErrorType) {
	r.Lock()
	defer r.Unlock()
	// Writing to a nil map panics; allocate lazily so a report that arrives
	// before initialization is counted instead of crashing the process.
	if r.em == nil {
		r.em = make(map[ErrorType]uint64)
	}
	r.em[errorType]++
}

// flush hands the counts accumulated since the previous flush to the
// callback, if there are any counts and a callback is installed. The callback
// runs without the lock held, so report is not blocked while it executes.
func (r *reporter) flush() {
	cur := r.reset()
	if cur == nil {
		return
	}

	r.Lock()
	cb := r.cb
	r.Unlock()

	if cb != nil {
		cb(cur)
	}
}

// Start flushes the accumulated counts once per minute. It never returns.
func (r *reporter) Start() {
	ticker := time.NewTicker(time.Minute)
	defer ticker.Stop()

	for range ticker.C {
		r.flush()
	}
}
|
|
@ -116,7 +116,8 @@ func (r RuleEval) Work() {
|
||||||
value, warnings, err = reader.Client.Query(context.Background(), promql, time.Now())
|
value, warnings, err = reader.Client.Query(context.Background(), promql, time.Now())
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logger.Errorf("rule_eval:%d promql:%s, error:%v", r.RuleID(), promql, err)
|
logger.Errorf("rule_eval:%d promql:%s, error:%v", r.RuleID(), promql, err)
|
||||||
notifyToMaintainer(err, "failed to query prometheus")
|
//notifyToMaintainer(err, "failed to query prometheus")
|
||||||
|
Report(QueryPrometheusError)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -18,7 +18,7 @@ import (
|
||||||
promstat "github.com/didi/nightingale/v5/src/server/stat"
|
promstat "github.com/didi/nightingale/v5/src/server/stat"
|
||||||
)
|
)
|
||||||
|
|
||||||
func New(version string) *gin.Engine {
|
func New(version string, reloadFunc func()) *gin.Engine {
|
||||||
gin.SetMode(config.C.RunMode)
|
gin.SetMode(config.C.RunMode)
|
||||||
|
|
||||||
loggerMid := aop.Logger()
|
loggerMid := aop.Logger()
|
||||||
|
@ -37,12 +37,12 @@ func New(version string) *gin.Engine {
|
||||||
r.Use(loggerMid)
|
r.Use(loggerMid)
|
||||||
}
|
}
|
||||||
|
|
||||||
configRoute(r, version)
|
configRoute(r, version, reloadFunc)
|
||||||
|
|
||||||
return r
|
return r
|
||||||
}
|
}
|
||||||
|
|
||||||
func configRoute(r *gin.Engine, version string) {
|
func configRoute(r *gin.Engine, version string, reloadFunc func()) {
|
||||||
if config.C.HTTP.PProf {
|
if config.C.HTTP.PProf {
|
||||||
pprof.Register(r, "/api/debug/pprof")
|
pprof.Register(r, "/api/debug/pprof")
|
||||||
}
|
}
|
||||||
|
@ -63,6 +63,11 @@ func configRoute(r *gin.Engine, version string) {
|
||||||
c.String(200, version)
|
c.String(200, version)
|
||||||
})
|
})
|
||||||
|
|
||||||
|
r.POST("/-/reload", func(c *gin.Context) {
|
||||||
|
reloadFunc()
|
||||||
|
c.String(200, "reload success")
|
||||||
|
})
|
||||||
|
|
||||||
r.GET("/servers/active", func(c *gin.Context) {
|
r.GET("/servers/active", func(c *gin.Context) {
|
||||||
lst, err := naming.ActiveServers(c.Request.Context(), config.C.ClusterName)
|
lst, err := naming.ActiveServers(c.Request.Context(), config.C.ClusterName)
|
||||||
ginx.NewRender(c).Data(lst, err)
|
ginx.NewRender(c).Data(lst, err)
|
||||||
|
|
|
@ -76,9 +76,7 @@ EXIT:
|
||||||
break EXIT
|
break EXIT
|
||||||
case syscall.SIGHUP:
|
case syscall.SIGHUP:
|
||||||
// reload configuration?
|
// reload configuration?
|
||||||
logger.Info("start reload configs")
|
reload()
|
||||||
engine.Reload()
|
|
||||||
logger.Info("reload configs finished")
|
|
||||||
default:
|
default:
|
||||||
break EXIT
|
break EXIT
|
||||||
}
|
}
|
||||||
|
@ -147,7 +145,7 @@ func (s Server) initialize() (func(), error) {
|
||||||
stat.Init()
|
stat.Init()
|
||||||
|
|
||||||
// init http server
|
// init http server
|
||||||
r := router.New(s.Version)
|
r := router.New(s.Version, reload)
|
||||||
httpClean := httpx.Init(config.C.HTTP, r)
|
httpClean := httpx.Init(config.C.HTTP, r)
|
||||||
fns.Add(httpClean)
|
fns.Add(httpClean)
|
||||||
|
|
||||||
|
@ -177,3 +175,9 @@ func (fs *Functions) Ret() func() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func reload() {
|
||||||
|
logger.Info("start reload configs")
|
||||||
|
engine.Reload()
|
||||||
|
logger.Info("reload configs finished")
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in New Issue