optimize error report (#1109)

* optimize error report

* code refactor

* add /-/reload as reload route like prometheus

Co-authored-by: ziv <xiaozheng@tuya.com>
This commit is contained in:
xiaoziv 2022-08-12 14:10:03 +08:00 committed by GitHub
parent 635369e3fd
commit 3b5c8d8357
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 118 additions and 30 deletions

View File

@ -2,6 +2,7 @@ package engine
import (
"context"
"fmt"
"time"
"github.com/toolkits/pkg/logger"
@ -27,6 +28,18 @@ func Start(ctx context.Context) error {
go sender.StartEmailSender()
go initReporter(func(em map[ErrorType]uint64) {
if len(em) == 0 {
return
}
title := fmt.Sprintf("server %s has some errors, please check server logs for detail", config.C.Heartbeat.IP)
msg := ""
for k, v := range em {
msg += fmt.Sprintf("error: %s, count: %d\n", k, v)
}
notifyToMaintainer(title, msg)
})
return nil
}

View File

@ -19,7 +19,22 @@ type MaintainMessage struct {
Content string `json:"content"`
}
func notifyMaintainerWithPlugin(e error, title, triggerTime string, users []*models.User) {
// notify to maintainer to handle the error
func notifyToMaintainer(title, msg string) {
logger.Errorf("notifyToMaintainer, msg: %s", msg)
users := memsto.UserCache.GetMaintainerUsers()
if len(users) == 0 {
return
}
triggerTime := time.Now().Format("2006/01/02 - 15:04:05")
notifyMaintainerWithPlugin(title, msg, triggerTime, users)
notifyMaintainerWithBuiltin(title, msg, triggerTime, users)
}
func notifyMaintainerWithPlugin(title, msg, triggerTime string, users []*models.User) {
if !config.C.Alerting.CallPlugin.Enable {
return
}
@ -27,7 +42,7 @@ func notifyMaintainerWithPlugin(e error, title, triggerTime string, users []*mod
stdinBytes, err := json.Marshal(MaintainMessage{
Tos: users,
Title: title,
Content: "Title: " + title + "\nContent: " + e.Error() + "\nTime: " + triggerTime,
Content: "Title: " + title + "\nContent: " + msg + "\nTime: " + triggerTime,
})
if err != nil {
@ -39,22 +54,7 @@ func notifyMaintainerWithPlugin(e error, title, triggerTime string, users []*mod
logger.Debugf("notify maintainer with plugin done")
}
// notify to maintainer to handle the error
func notifyToMaintainer(e error, title string) {
logger.Errorf("notifyToMaintainer, title:%s, error:%v", title, e)
users := memsto.UserCache.GetMaintainerUsers()
if len(users) == 0 {
return
}
triggerTime := time.Now().Format("2006/01/02 - 15:04:05")
notifyMaintainerWithPlugin(e, title, triggerTime, users)
notifyMaintainerWithBuiltin(e, title, triggerTime, users)
}
func notifyMaintainerWithBuiltin(e error, title, triggerTime string, users []*models.User) {
func notifyMaintainerWithBuiltin(title, msg, triggerTime string, users []*models.User) {
if len(config.C.Alerting.NotifyBuiltinChannels) == 0 {
return
}
@ -104,13 +104,13 @@ func notifyMaintainerWithBuiltin(e error, title, triggerTime string, users []*mo
if len(emailset) == 0 {
continue
}
content := "Title: " + title + "\nContent: " + e.Error() + "\nTime: " + triggerTime
content := "Title: " + title + "\nContent: " + msg + "\nTime: " + triggerTime
sender.WriteEmail(title, content, StringSetKeys(emailset))
case "dingtalk":
if len(dingtalkset) == 0 {
continue
}
content := "**Title: **" + title + "\n**Content: **" + e.Error() + "\n**Time: **" + triggerTime
content := "**Title: **" + title + "\n**Content: **" + msg + "\n**Time: **" + triggerTime
sender.SendDingtalk(sender.DingtalkMessage{
Title: title,
Text: content,
@ -121,7 +121,7 @@ func notifyMaintainerWithBuiltin(e error, title, triggerTime string, users []*mo
if len(wecomset) == 0 {
continue
}
content := "**Title: **" + title + "\n**Content: **" + e.Error() + "\n**Time: **" + triggerTime
content := "**Title: **" + title + "\n**Content: **" + msg + "\n**Time: **" + triggerTime
sender.SendWecom(sender.WecomMessage{
Text: content,
Tokens: StringSetKeys(wecomset),
@ -131,7 +131,7 @@ func notifyMaintainerWithBuiltin(e error, title, triggerTime string, users []*mo
continue
}
content := "Title: " + title + "\nContent: " + e.Error() + "\nTime: " + triggerTime
content := "Title: " + title + "\nContent: " + msg + "\nTime: " + triggerTime
sender.SendFeishu(sender.FeishuMessage{
Text: content,
AtMobiles: phones,

View File

@ -0,0 +1,65 @@
package engine
import (
"sync"
"time"
)
type ErrorType string
// register new error here
const (
QueryPrometheusError ErrorType = "QueryPrometheusError"
RuntimeError ErrorType = "RuntimeError"
)
type reporter struct {
sync.Mutex
em map[ErrorType]uint64
cb func(em map[ErrorType]uint64)
}
var rp reporter
func initReporter(cb func(em map[ErrorType]uint64)) {
rp = reporter{cb: cb, em: make(map[ErrorType]uint64)}
rp.Start()
}
func Report(errorType ErrorType) {
rp.report(errorType)
}
func (r *reporter) reset() map[ErrorType]uint64 {
r.Lock()
defer r.Unlock()
if len(r.em) == 0 {
return nil
}
oem := r.em
r.em = make(map[ErrorType]uint64)
return oem
}
func (r *reporter) report(errorType ErrorType) {
r.Lock()
defer r.Unlock()
if count, has := r.em[errorType]; has {
r.em[errorType] = count + 1
} else {
r.em[errorType] = 1
}
}
func (r *reporter) Start() {
for {
select {
case <-time.After(time.Minute):
cur := r.reset()
if cur != nil {
r.cb(cur)
}
}
}
}

View File

@ -116,7 +116,8 @@ func (r RuleEval) Work() {
value, warnings, err = reader.Client.Query(context.Background(), promql, time.Now())
if err != nil {
logger.Errorf("rule_eval:%d promql:%s, error:%v", r.RuleID(), promql, err)
notifyToMaintainer(err, "failed to query prometheus")
//notifyToMaintainer(err, "failed to query prometheus")
Report(QueryPrometheusError)
return
}

View File

@ -18,7 +18,7 @@ import (
promstat "github.com/didi/nightingale/v5/src/server/stat"
)
func New(version string) *gin.Engine {
func New(version string, reloadFunc func()) *gin.Engine {
gin.SetMode(config.C.RunMode)
loggerMid := aop.Logger()
@ -37,12 +37,12 @@ func New(version string) *gin.Engine {
r.Use(loggerMid)
}
configRoute(r, version)
configRoute(r, version, reloadFunc)
return r
}
func configRoute(r *gin.Engine, version string) {
func configRoute(r *gin.Engine, version string, reloadFunc func()) {
if config.C.HTTP.PProf {
pprof.Register(r, "/api/debug/pprof")
}
@ -63,6 +63,11 @@ func configRoute(r *gin.Engine, version string) {
c.String(200, version)
})
r.POST("/-/reload", func(c *gin.Context) {
reloadFunc()
c.String(200, "reload success")
})
r.GET("/servers/active", func(c *gin.Context) {
lst, err := naming.ActiveServers(c.Request.Context(), config.C.ClusterName)
ginx.NewRender(c).Data(lst, err)

View File

@ -76,9 +76,7 @@ EXIT:
break EXIT
case syscall.SIGHUP:
// reload configuration?
logger.Info("start reload configs")
engine.Reload()
logger.Info("reload configs finished")
reload()
default:
break EXIT
}
@ -147,7 +145,7 @@ func (s Server) initialize() (func(), error) {
stat.Init()
// init http server
r := router.New(s.Version)
r := router.New(s.Version, reload)
httpClean := httpx.Init(config.C.HTTP, r)
fns.Add(httpClean)
@ -177,3 +175,9 @@ func (fs *Functions) Ret() func() {
}
}
}
func reload() {
logger.Info("start reload configs")
engine.Reload()
logger.Info("reload configs finished")
}