diff --git a/src/server/engine/engine.go b/src/server/engine/engine.go index d32e683c..7648e974 100644 --- a/src/server/engine/engine.go +++ b/src/server/engine/engine.go @@ -2,6 +2,7 @@ package engine import ( "context" + "fmt" "time" "github.com/toolkits/pkg/logger" @@ -27,6 +28,18 @@ func Start(ctx context.Context) error { go sender.StartEmailSender() + go initReporter(func(em map[ErrorType]uint64) { + if len(em) == 0 { + return + } + title := fmt.Sprintf("server %s has some errors, please check server logs for detail", config.C.Heartbeat.IP) + msg := "" + for k, v := range em { + msg += fmt.Sprintf("error: %s, count: %d\n", k, v) + } + notifyToMaintainer(title, msg) + }) + return nil } diff --git a/src/server/engine/notify_maintainer.go b/src/server/engine/notify_maintainer.go index 9f61708c..c3f8f678 100644 --- a/src/server/engine/notify_maintainer.go +++ b/src/server/engine/notify_maintainer.go @@ -19,7 +19,22 @@ type MaintainMessage struct { Content string `json:"content"` } -func notifyMaintainerWithPlugin(e error, title, triggerTime string, users []*models.User) { +// notify to maintainer to handle the error +func notifyToMaintainer(title, msg string) { + logger.Errorf("notifyToMaintainer, msg: %s", msg) + + users := memsto.UserCache.GetMaintainerUsers() + if len(users) == 0 { + return + } + + triggerTime := time.Now().Format("2006/01/02 - 15:04:05") + + notifyMaintainerWithPlugin(title, msg, triggerTime, users) + notifyMaintainerWithBuiltin(title, msg, triggerTime, users) +} + +func notifyMaintainerWithPlugin(title, msg, triggerTime string, users []*models.User) { if !config.C.Alerting.CallPlugin.Enable { return } @@ -27,7 +42,7 @@ func notifyMaintainerWithPlugin(e error, title, triggerTime string, users []*mod stdinBytes, err := json.Marshal(MaintainMessage{ Tos: users, Title: title, - Content: "Title: " + title + "\nContent: " + e.Error() + "\nTime: " + triggerTime, + Content: "Title: " + title + "\nContent: " + msg + "\nTime: " + triggerTime, }) if err != nil { @@ -39,22 +54,7 @@ func notifyMaintainerWithPlugin(e error, title, triggerTime string, users []*mod logger.Debugf("notify maintainer with plugin done") } -// notify to maintainer to handle the error -func notifyToMaintainer(e error, title string) { - logger.Errorf("notifyToMaintainer, title:%s, error:%v", title, e) - - users := memsto.UserCache.GetMaintainerUsers() - if len(users) == 0 { - return - } - - triggerTime := time.Now().Format("2006/01/02 - 15:04:05") - - notifyMaintainerWithPlugin(e, title, triggerTime, users) - notifyMaintainerWithBuiltin(e, title, triggerTime, users) -} - -func notifyMaintainerWithBuiltin(e error, title, triggerTime string, users []*models.User) { +func notifyMaintainerWithBuiltin(title, msg, triggerTime string, users []*models.User) { if len(config.C.Alerting.NotifyBuiltinChannels) == 0 { return } @@ -104,13 +104,13 @@ func notifyMaintainerWithBuiltin(e error, title, triggerTime string, users []*mo if len(emailset) == 0 { continue } - content := "Title: " + title + "\nContent: " + e.Error() + "\nTime: " + triggerTime + content := "Title: " + title + "\nContent: " + msg + "\nTime: " + triggerTime sender.WriteEmail(title, content, StringSetKeys(emailset)) case "dingtalk": if len(dingtalkset) == 0 { continue } - content := "**Title: **" + title + "\n**Content: **" + e.Error() + "\n**Time: **" + triggerTime + content := "**Title: **" + title + "\n**Content: **" + msg + "\n**Time: **" + triggerTime sender.SendDingtalk(sender.DingtalkMessage{ Title: title, Text: content, @@ -121,7 +121,7 @@ func notifyMaintainerWithBuiltin(e error, title, triggerTime string, users []*mo if len(wecomset) == 0 { continue } - content := "**Title: **" + title + "\n**Content: **" + e.Error() + "\n**Time: **" + triggerTime + content := "**Title: **" + title + "\n**Content: **" + msg + "\n**Time: **" + triggerTime sender.SendWecom(sender.WecomMessage{ Text: content, Tokens: StringSetKeys(wecomset), @@ -131,7 +131,7 @@ func notifyMaintainerWithBuiltin(e error, title, triggerTime string, users []*mo continue } - content := "Title: " + title + "\nContent: " + e.Error() + "\nTime: " + triggerTime + content := "Title: " + title + "\nContent: " + msg + "\nTime: " + triggerTime sender.SendFeishu(sender.FeishuMessage{ Text: content, AtMobiles: phones, diff --git a/src/server/engine/reporter.go b/src/server/engine/reporter.go new file mode 100644 index 00000000..3b8fcf33 --- /dev/null +++ b/src/server/engine/reporter.go @@ -0,0 +1,65 @@ +package engine + +import ( + "sync" + "time" +) + +type ErrorType string + +// register new error here +const ( + QueryPrometheusError ErrorType = "QueryPrometheusError" + RuntimeError ErrorType = "RuntimeError" +) + +type reporter struct { + sync.Mutex + em map[ErrorType]uint64 + cb func(em map[ErrorType]uint64) +} + +var rp reporter + +func initReporter(cb func(em map[ErrorType]uint64)) { + rp = reporter{cb: cb, em: make(map[ErrorType]uint64)} + rp.Start() +} + +func Report(errorType ErrorType) { + rp.report(errorType) +} + +func (r *reporter) reset() map[ErrorType]uint64 { + r.Lock() + defer r.Unlock() + if len(r.em) == 0 { + return nil + } + + oem := r.em + r.em = make(map[ErrorType]uint64) + return oem +} + +func (r *reporter) report(errorType ErrorType) { + r.Lock() + defer r.Unlock() + if count, has := r.em[errorType]; has { + r.em[errorType] = count + 1 + } else { + r.em[errorType] = 1 + } +} + +func (r *reporter) Start() { + for { + select { + case <-time.After(time.Minute): + cur := r.reset() + if cur != nil { + r.cb(cur) + } + } + } +} diff --git a/src/server/engine/worker.go b/src/server/engine/worker.go index 3e2fefaa..0d2954b7 100644 --- a/src/server/engine/worker.go +++ b/src/server/engine/worker.go @@ -116,7 +116,8 @@ func (r RuleEval) Work() { value, warnings, err = reader.Client.Query(context.Background(), promql, time.Now()) if err != nil { logger.Errorf("rule_eval:%d promql:%s, error:%v", r.RuleID(), promql, err) - notifyToMaintainer(err, "failed to query prometheus") + //notifyToMaintainer(err, "failed to query prometheus") + Report(QueryPrometheusError) return } diff --git a/src/server/router/router.go b/src/server/router/router.go index 299c1184..415a06db 100644 --- a/src/server/router/router.go +++ b/src/server/router/router.go @@ -18,7 +18,7 @@ import ( promstat "github.com/didi/nightingale/v5/src/server/stat" ) -func New(version string) *gin.Engine { +func New(version string, reloadFunc func()) *gin.Engine { gin.SetMode(config.C.RunMode) loggerMid := aop.Logger() @@ -37,12 +37,12 @@ func New(version string) *gin.Engine { r.Use(loggerMid) } - configRoute(r, version) + configRoute(r, version, reloadFunc) return r } -func configRoute(r *gin.Engine, version string) { +func configRoute(r *gin.Engine, version string, reloadFunc func()) { if config.C.HTTP.PProf { pprof.Register(r, "/api/debug/pprof") } @@ -63,6 +63,11 @@ func configRoute(r *gin.Engine, version string) { c.String(200, version) }) + r.POST("/-/reload", func(c *gin.Context) { + reloadFunc() + c.String(200, "reload success") + }) + r.GET("/servers/active", func(c *gin.Context) { lst, err := naming.ActiveServers(c.Request.Context(), config.C.ClusterName) ginx.NewRender(c).Data(lst, err) diff --git a/src/server/server.go b/src/server/server.go index d694a3c4..d36d208e 100644 --- a/src/server/server.go +++ b/src/server/server.go @@ -76,9 +76,7 @@ EXIT: break EXIT case syscall.SIGHUP: // reload configuration? - logger.Info("start reload configs") - engine.Reload() - logger.Info("reload configs finished") + reload() default: break EXIT } @@ -147,7 +145,7 @@ func (s Server) initialize() (func(), error) { stat.Init() // init http server - r := router.New(s.Version) + r := router.New(s.Version, reload) httpClean := httpx.Init(config.C.HTTP, r) fns.Add(httpClean) @@ -177,3 +175,9 @@ func (fs *Functions) Ret() func() { } } } + +func reload() { + logger.Info("start reload configs") + engine.Reload() + logger.Info("reload configs finished") +}