optimize error report (#1109)
* optimize error report * code refactor * add /-/reload as reload route like prometheus Co-authored-by: ziv <xiaozheng@tuya.com>
This commit is contained in:
parent
635369e3fd
commit
3b5c8d8357
|
@ -2,6 +2,7 @@ package engine
|
|||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/toolkits/pkg/logger"
|
||||
|
@ -27,6 +28,18 @@ func Start(ctx context.Context) error {
|
|||
|
||||
go sender.StartEmailSender()
|
||||
|
||||
go initReporter(func(em map[ErrorType]uint64) {
|
||||
if len(em) == 0 {
|
||||
return
|
||||
}
|
||||
title := fmt.Sprintf("server %s has some errors, please check server logs for detail", config.C.Heartbeat.IP)
|
||||
msg := ""
|
||||
for k, v := range em {
|
||||
msg += fmt.Sprintf("error: %s, count: %d\n", k, v)
|
||||
}
|
||||
notifyToMaintainer(title, msg)
|
||||
})
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
|
|
|
@ -19,7 +19,22 @@ type MaintainMessage struct {
|
|||
Content string `json:"content"`
|
||||
}
|
||||
|
||||
func notifyMaintainerWithPlugin(e error, title, triggerTime string, users []*models.User) {
|
||||
// notify to maintainer to handle the error
|
||||
func notifyToMaintainer(title, msg string) {
|
||||
logger.Errorf("notifyToMaintainer, msg: %s", msg)
|
||||
|
||||
users := memsto.UserCache.GetMaintainerUsers()
|
||||
if len(users) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
triggerTime := time.Now().Format("2006/01/02 - 15:04:05")
|
||||
|
||||
notifyMaintainerWithPlugin(title, msg, triggerTime, users)
|
||||
notifyMaintainerWithBuiltin(title, msg, triggerTime, users)
|
||||
}
|
||||
|
||||
func notifyMaintainerWithPlugin(title, msg, triggerTime string, users []*models.User) {
|
||||
if !config.C.Alerting.CallPlugin.Enable {
|
||||
return
|
||||
}
|
||||
|
@ -27,7 +42,7 @@ func notifyMaintainerWithPlugin(e error, title, triggerTime string, users []*mod
|
|||
stdinBytes, err := json.Marshal(MaintainMessage{
|
||||
Tos: users,
|
||||
Title: title,
|
||||
Content: "Title: " + title + "\nContent: " + e.Error() + "\nTime: " + triggerTime,
|
||||
Content: "Title: " + title + "\nContent: " + msg + "\nTime: " + triggerTime,
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
|
@ -39,22 +54,7 @@ func notifyMaintainerWithPlugin(e error, title, triggerTime string, users []*mod
|
|||
logger.Debugf("notify maintainer with plugin done")
|
||||
}
|
||||
|
||||
// notify to maintainer to handle the error
|
||||
func notifyToMaintainer(e error, title string) {
|
||||
logger.Errorf("notifyToMaintainer, title:%s, error:%v", title, e)
|
||||
|
||||
users := memsto.UserCache.GetMaintainerUsers()
|
||||
if len(users) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
triggerTime := time.Now().Format("2006/01/02 - 15:04:05")
|
||||
|
||||
notifyMaintainerWithPlugin(e, title, triggerTime, users)
|
||||
notifyMaintainerWithBuiltin(e, title, triggerTime, users)
|
||||
}
|
||||
|
||||
func notifyMaintainerWithBuiltin(e error, title, triggerTime string, users []*models.User) {
|
||||
func notifyMaintainerWithBuiltin(title, msg, triggerTime string, users []*models.User) {
|
||||
if len(config.C.Alerting.NotifyBuiltinChannels) == 0 {
|
||||
return
|
||||
}
|
||||
|
@ -104,13 +104,13 @@ func notifyMaintainerWithBuiltin(e error, title, triggerTime string, users []*mo
|
|||
if len(emailset) == 0 {
|
||||
continue
|
||||
}
|
||||
content := "Title: " + title + "\nContent: " + e.Error() + "\nTime: " + triggerTime
|
||||
content := "Title: " + title + "\nContent: " + msg + "\nTime: " + triggerTime
|
||||
sender.WriteEmail(title, content, StringSetKeys(emailset))
|
||||
case "dingtalk":
|
||||
if len(dingtalkset) == 0 {
|
||||
continue
|
||||
}
|
||||
content := "**Title: **" + title + "\n**Content: **" + e.Error() + "\n**Time: **" + triggerTime
|
||||
content := "**Title: **" + title + "\n**Content: **" + msg + "\n**Time: **" + triggerTime
|
||||
sender.SendDingtalk(sender.DingtalkMessage{
|
||||
Title: title,
|
||||
Text: content,
|
||||
|
@ -121,7 +121,7 @@ func notifyMaintainerWithBuiltin(e error, title, triggerTime string, users []*mo
|
|||
if len(wecomset) == 0 {
|
||||
continue
|
||||
}
|
||||
content := "**Title: **" + title + "\n**Content: **" + e.Error() + "\n**Time: **" + triggerTime
|
||||
content := "**Title: **" + title + "\n**Content: **" + msg + "\n**Time: **" + triggerTime
|
||||
sender.SendWecom(sender.WecomMessage{
|
||||
Text: content,
|
||||
Tokens: StringSetKeys(wecomset),
|
||||
|
@ -131,7 +131,7 @@ func notifyMaintainerWithBuiltin(e error, title, triggerTime string, users []*mo
|
|||
continue
|
||||
}
|
||||
|
||||
content := "Title: " + title + "\nContent: " + e.Error() + "\nTime: " + triggerTime
|
||||
content := "Title: " + title + "\nContent: " + msg + "\nTime: " + triggerTime
|
||||
sender.SendFeishu(sender.FeishuMessage{
|
||||
Text: content,
|
||||
AtMobiles: phones,
|
||||
|
|
|
@ -0,0 +1,65 @@
|
|||
package engine
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
type ErrorType string
|
||||
|
||||
// register new error here
|
||||
const (
|
||||
QueryPrometheusError ErrorType = "QueryPrometheusError"
|
||||
RuntimeError ErrorType = "RuntimeError"
|
||||
)
|
||||
|
||||
type reporter struct {
|
||||
sync.Mutex
|
||||
em map[ErrorType]uint64
|
||||
cb func(em map[ErrorType]uint64)
|
||||
}
|
||||
|
||||
var rp reporter
|
||||
|
||||
func initReporter(cb func(em map[ErrorType]uint64)) {
|
||||
rp = reporter{cb: cb, em: make(map[ErrorType]uint64)}
|
||||
rp.Start()
|
||||
}
|
||||
|
||||
func Report(errorType ErrorType) {
|
||||
rp.report(errorType)
|
||||
}
|
||||
|
||||
func (r *reporter) reset() map[ErrorType]uint64 {
|
||||
r.Lock()
|
||||
defer r.Unlock()
|
||||
if len(r.em) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
oem := r.em
|
||||
r.em = make(map[ErrorType]uint64)
|
||||
return oem
|
||||
}
|
||||
|
||||
func (r *reporter) report(errorType ErrorType) {
|
||||
r.Lock()
|
||||
defer r.Unlock()
|
||||
if count, has := r.em[errorType]; has {
|
||||
r.em[errorType] = count + 1
|
||||
} else {
|
||||
r.em[errorType] = 1
|
||||
}
|
||||
}
|
||||
|
||||
func (r *reporter) Start() {
|
||||
for {
|
||||
select {
|
||||
case <-time.After(time.Minute):
|
||||
cur := r.reset()
|
||||
if cur != nil {
|
||||
r.cb(cur)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -116,7 +116,8 @@ func (r RuleEval) Work() {
|
|||
value, warnings, err = reader.Client.Query(context.Background(), promql, time.Now())
|
||||
if err != nil {
|
||||
logger.Errorf("rule_eval:%d promql:%s, error:%v", r.RuleID(), promql, err)
|
||||
notifyToMaintainer(err, "failed to query prometheus")
|
||||
//notifyToMaintainer(err, "failed to query prometheus")
|
||||
Report(QueryPrometheusError)
|
||||
return
|
||||
}
|
||||
|
||||
|
|
|
@ -18,7 +18,7 @@ import (
|
|||
promstat "github.com/didi/nightingale/v5/src/server/stat"
|
||||
)
|
||||
|
||||
func New(version string) *gin.Engine {
|
||||
func New(version string, reloadFunc func()) *gin.Engine {
|
||||
gin.SetMode(config.C.RunMode)
|
||||
|
||||
loggerMid := aop.Logger()
|
||||
|
@ -37,12 +37,12 @@ func New(version string) *gin.Engine {
|
|||
r.Use(loggerMid)
|
||||
}
|
||||
|
||||
configRoute(r, version)
|
||||
configRoute(r, version, reloadFunc)
|
||||
|
||||
return r
|
||||
}
|
||||
|
||||
func configRoute(r *gin.Engine, version string) {
|
||||
func configRoute(r *gin.Engine, version string, reloadFunc func()) {
|
||||
if config.C.HTTP.PProf {
|
||||
pprof.Register(r, "/api/debug/pprof")
|
||||
}
|
||||
|
@ -63,6 +63,11 @@ func configRoute(r *gin.Engine, version string) {
|
|||
c.String(200, version)
|
||||
})
|
||||
|
||||
r.POST("/-/reload", func(c *gin.Context) {
|
||||
reloadFunc()
|
||||
c.String(200, "reload success")
|
||||
})
|
||||
|
||||
r.GET("/servers/active", func(c *gin.Context) {
|
||||
lst, err := naming.ActiveServers(c.Request.Context(), config.C.ClusterName)
|
||||
ginx.NewRender(c).Data(lst, err)
|
||||
|
|
|
@ -76,9 +76,7 @@ EXIT:
|
|||
break EXIT
|
||||
case syscall.SIGHUP:
|
||||
// reload configuration?
|
||||
logger.Info("start reload configs")
|
||||
engine.Reload()
|
||||
logger.Info("reload configs finished")
|
||||
reload()
|
||||
default:
|
||||
break EXIT
|
||||
}
|
||||
|
@ -147,7 +145,7 @@ func (s Server) initialize() (func(), error) {
|
|||
stat.Init()
|
||||
|
||||
// init http server
|
||||
r := router.New(s.Version)
|
||||
r := router.New(s.Version, reload)
|
||||
httpClean := httpx.Init(config.C.HTTP, r)
|
||||
fns.Add(httpClean)
|
||||
|
||||
|
@ -177,3 +175,9 @@ func (fs *Functions) Ret() func() {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
func reload() {
|
||||
logger.Info("start reload configs")
|
||||
engine.Reload()
|
||||
logger.Info("reload configs finished")
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue