Merge branch 'master' of github.com:didi/nightingale

This commit is contained in:
Ulric Qin 2020-03-21 15:01:23 +08:00
commit 178acfb4c2
18 changed files with 50 additions and 27 deletions

View File

@ -74,7 +74,7 @@ func getCollects() (CollectResp, error) {
url := fmt.Sprintf("http://%s%s%s", addr, StraConfig.Api, identity.Identity)
err = httplib.Get(url).SetTimeout(time.Duration(StraConfig.Timeout) * time.Millisecond).ToJSON(&res)
if err != nil {
err = fmt.Errorf("get collects from remote failed, error:%v", err)
err = fmt.Errorf("get collects from remote:%s failed, error:%v", url, err)
}
return res, err

View File

@ -48,7 +48,9 @@ func Push(items []*dataobj.MetricValue) {
logger.Error(err)
continue
} else {
logger.Info("push succ, reply: ", reply)
if reply.Msg != "ok" {
logger.Error("some item push err", reply)
}
return
}
}

View File

@ -6,6 +6,7 @@ import (
"time"
"github.com/didi/nightingale/src/toolkits/address"
"github.com/didi/nightingale/src/toolkits/stats"
"github.com/toolkits/pkg/concurrent/semaphore"
"github.com/toolkits/pkg/logger"
@ -49,11 +50,12 @@ func reportEndpoint(endpoints []interface{}) {
err := httplib.Post(url).JSONBodyQuiet(m).SetTimeout(3*time.Second).Header("x-srv-token", "monapi-builtin-token").ToJSON(&body)
if err != nil {
logger.Warningf("curl %s fail: %v. retry", url, err)
stats.Counter.Set("report.endpoint.err", 1)
continue
}
if body.Err != "" {
if body.Err != "" { //数据库连接出错会出现此情况
logger.Warningf("curl %s fail: %s. retry", url, body.Err)
stats.Counter.Set("report.endpoint.err", 1)
continue
}

View File

@ -19,6 +19,7 @@ import (
"github.com/didi/nightingale/src/toolkits/compress"
"github.com/didi/nightingale/src/toolkits/identity"
"github.com/didi/nightingale/src/toolkits/report"
"github.com/didi/nightingale/src/toolkits/stats"
)
type CacheSection struct {
@ -72,8 +73,8 @@ func StartPersist(interval int) {
err := Persist("normal")
if err != nil {
logger.Error("Persist err:", err)
stats.Counter.Set("persist.err", 1)
}
//logger.Infof("clean %+v, took %.2f ms\n", cleanRet, float64(time.Since(start).Nanoseconds())*1e-6)
}
}

View File

@ -42,6 +42,5 @@ func Push(event *dataobj.Event) error {
return nil
}
stats.Counter.Set("redis.failed", 1)
return fmt.Errorf("redis publish failed finally:%v", err)
}

View File

@ -4,6 +4,7 @@ import (
"log"
"time"
"github.com/didi/nightingale/src/toolkits/stats"
"github.com/garyburd/redigo/redis"
"github.com/toolkits/pkg/logger"
)
@ -44,6 +45,7 @@ func Init(cfg RedisSection) {
c, err := redis.Dial("tcp", addr, redis.DialConnectTimeout(connTimeout), redis.DialReadTimeout(readTimeout), redis.DialWriteTimeout(writeTimeout))
if err != nil {
logger.Errorf("conn redis err:%v", err)
stats.Counter.Set("redis.conn.failed", 1)
return nil, err
}
@ -51,6 +53,8 @@ func Init(cfg RedisSection) {
if _, err := c.Do("AUTH", pass); err != nil {
c.Close()
logger.Errorf("ERR: redis auth fail:%v", err)
stats.Counter.Set("redis.conn.failed", 1)
return nil, err
}
}

View File

@ -65,7 +65,7 @@ func Judge(stra *model.Stra, exps []model.Exp, historyData []*dataobj.RRDData, f
stats.Counter.Set("running", 1)
if len(exps) < 1 {
stats.Counter.Set("stra.err", 1)
stats.Counter.Set("stra.illegal", 1)
logger.Warningf("stra:%v exp is null", stra)
return
}
@ -110,16 +110,16 @@ func Judge(stra *model.Stra, exps []model.Exp, historyData []*dataobj.RRDData, f
}()
leftValue, isTriggered = judgeItemWithStrategy(stra, historyData, exps[0], firstItem, now)
if !isTriggered {
return
}
if value == "" {
value = fmt.Sprintf("%s: %v", exp.Metric, leftValue)
} else {
value += fmt.Sprintf("; %s: %v", exp.Metric, leftValue)
}
if !isTriggered {
return
}
//与条件情况下执行
if len(exps) > 1 {
if exps[1].Func == "nodata" { //nodata重新查询索引来进行告警判断
@ -421,6 +421,7 @@ func sendEvent(event *dataobj.Event) {
err := redi.Push(event)
if err != nil {
stats.Counter.Set("redis.push.failed", 1)
logger.Errorf("push event:%v err:%v", event, err)
}
}

View File

@ -56,11 +56,13 @@ func getStrategy(opts StrategySection) {
if err != nil {
logger.Warningf("get strategy from remote failed, error:%v", err)
stats.Counter.Set("stra.get.err", 1)
continue
}
if resp.Err != "" {
logger.Warningf("get strategy from remote failed, error:%v", resp.Err)
stats.Counter.Set("stra.get.err", 1)
continue
}

View File

@ -11,6 +11,7 @@ import (
"github.com/didi/nightingale/src/model"
"github.com/didi/nightingale/src/modules/monapi/config"
"github.com/didi/nightingale/src/modules/monapi/scache"
"github.com/didi/nightingale/src/toolkits/stats"
)
func CheckJudgeLoop() {
@ -19,6 +20,7 @@ func CheckJudgeLoop() {
time.Sleep(duration)
err := CheckJudge()
if err != nil {
stats.Counter.Set("get.judge.err", 1)
logger.Error("check judge fail: ", err)
}
}

View File

@ -7,6 +7,7 @@ import (
"github.com/toolkits/pkg/logger"
"github.com/didi/nightingale/src/modules/monapi/config"
"github.com/didi/nightingale/src/toolkits/stats"
)
var RedisConnPool *redis.Pool
@ -29,6 +30,8 @@ func InitRedis() {
Dial: func() (redis.Conn, error) {
c, err := redis.Dial("tcp", addr, redis.DialConnectTimeout(connTimeout), redis.DialReadTimeout(readTimeout), redis.DialWriteTimeout(writeTimeout))
if err != nil {
logger.Errorf("conn redis err:%v", err)
stats.Counter.Set("redis.conn.failed", 1)
return nil, err
}
@ -36,6 +39,7 @@ func InitRedis() {
if _, err := c.Do("AUTH", pass); err != nil {
c.Close()
logger.Error("redis auth fail, pass: ", pass)
stats.Counter.Set("redis.conn.failed", 1)
return nil, err
}
}

View File

@ -12,6 +12,7 @@ import (
"github.com/didi/nightingale/src/dataobj"
"github.com/didi/nightingale/src/modules/transfer/calc"
"github.com/didi/nightingale/src/toolkits/address"
"github.com/didi/nightingale/src/toolkits/stats"
"github.com/toolkits/pkg/logger"
"github.com/toolkits/pkg/net/httplib"
@ -161,10 +162,12 @@ func fetchDataSync(start, end int64, consolFun, endpoint, counter string, step i
defer func() {
<-worker
}()
stats.Counter.Set("query.tsdb", 1)
data, err := fetchData(start, end, consolFun, endpoint, counter, step)
if err != nil {
logger.Warning(err)
stats.Counter.Set("query.data.err", 1)
}
dataChan <- data
return

View File

@ -107,6 +107,7 @@ func Send2TsdbTask(Q *list.SafeListLimited, node string, addr string, concurrent
// 将数据 打入 某个Tsdb的发送缓存队列, 具体是哪一个Tsdb 由一致性哈希 决定
func Push2TsdbSendQueue(items []*dataobj.MetricValue) {
errCnt := 0
for _, item := range items {
tsdbItem := convert2TsdbItem(item)
stats.Counter.Set("tsdb.queue.push", 1)
@ -118,19 +119,18 @@ func Push2TsdbSendQueue(items []*dataobj.MetricValue) {
}
cnode := Config.ClusterList[node]
errCnt := 0
for _, addr := range cnode.Addrs {
Q := TsdbQueues[node+addr]
if !Q.PushFront(tsdbItem) {
errCnt += 1
}
}
}
// statistics
if errCnt > 0 {
stats.Counter.Set("tsdb.queue.err", errCnt)
logger.Error("Push2TsdbSendQueue err num: ", errCnt)
}
// statistics
if errCnt > 0 {
stats.Counter.Set("tsdb.queue.err", errCnt)
logger.Error("Push2TsdbSendQueue err num: ", errCnt)
}
}
@ -172,7 +172,7 @@ func Send2JudgeTask(Q *list.SafeListLimited, addr string, concurrent int) {
if !sendOk {
stats.Counter.Set("points.out.judge.err", 1)
logger.Errorf("send judge %s fail: %v", addr, err)
logger.Errorf("send %v to judge %s fail: %v", judgeItems, addr, err)
}
}(addr, judgeItems, count)
@ -180,6 +180,7 @@ func Send2JudgeTask(Q *list.SafeListLimited, addr string, concurrent int) {
}
func Push2JudgeSendQueue(items []*dataobj.MetricValue) {
errCnt := 0
for _, item := range items {
key := str.PK(item.Metric, item.Endpoint)
stras := cache.StraMap.GetByKey(key)
@ -203,11 +204,13 @@ func Push2JudgeSendQueue(items []*dataobj.MetricValue) {
q, exists := JudgeQueues.Get(stra.JudgeInstance)
if exists {
q.PushFront(judgeItem)
if !q.PushFront(judgeItem) {
errCnt += 1
}
}
}
}
stats.Counter.Set("judge.queue.err", errCnt)
}
// 打到Tsdb的数据,要根据rrdtool的特定 来限制 step、counterType、timestamp

View File

@ -27,9 +27,10 @@ func (t *Transfer) Push(args []*dataobj.MetricValue, reply *dataobj.TransferResp
err := v.CheckValidity()
if err != nil {
stats.Counter.Set("points.in.err", 1)
logger.Warningf("item is illegal item:%s err:%v", v, err)
msg := fmt.Sprintf("item is illegal item:%s err:%v", v, err)
logger.Warningf(msg)
reply.Invalid += 1
reply.Msg += fmt.Sprintf("%v\n", err)
reply.Msg += msg
continue
}

View File

@ -47,7 +47,7 @@ func GetIndexLoop() {
func GetIndex() {
instances, err := report.GetAlive("index", Config.HbsMod)
if err != nil {
stats.Counter.Set("index.get.err", 1)
stats.Counter.Set("get.index.err", 1)
logger.Warningf("get index list err:%v", err)
return
}

View File

@ -58,7 +58,6 @@ func handleItems(items []*dataobj.TsdbItem) {
//todo hash冲突问题需要解决
if err := cache.Caches.Push(item.Key, item.Timestamp, item.Value); err != nil {
stats.Counter.Set("points.in.err", 1)
logger.Warningf("push obj error, obj: %v, error: %v\n", items[i], err)
fail++
}

View File

@ -231,6 +231,7 @@ func FlushRRD(flushChunks map[interface{}][]*cache.Chunk) {
err := FlushFile(seriesID, items)
if err != nil {
stats.Counter.Set("flush.rrd.err", 1)
logger.Errorf("flush %v data to rrd err:%v", seriesID, err)
continue
}

View File

@ -18,11 +18,12 @@ type IdentitySection struct {
func Init(identity IdentitySection) {
if identity.Specify != "" {
Identity = identity.Specify
return
}
var err error
Identity, err = sys.CmdOutTrim("bash", "-c", identity.Shell)
if err != nil {
log.Fatalln("[F] cannot get hostname")
log.Fatalln("[F] cannot get identity")
}
}

View File

@ -7,7 +7,6 @@ import (
"time"
"github.com/didi/nightingale/src/dataobj"
"github.com/didi/nightingale/src/toolkits/identity"
"github.com/toolkits/pkg/logger"
)
@ -42,7 +41,6 @@ func Push() {
func NewMetricValue(metric string, value int64) *dataobj.MetricValue {
item := &dataobj.MetricValue{
Metric: metric,
Endpoint: identity.Identity,
Timestamp: time.Now().Unix(),
ValueUntyped: value,
CounterType: "GAUGE",