version 5.1

This commit is contained in:
UlricQin 2021-11-28 18:57:49 +08:00
parent 7a2b07eebd
commit 6e3ad3dd6b
259 changed files with 24719 additions and 18122 deletions

38
Makefile Normal file
View File

@ -0,0 +1,38 @@
# Build instructions for the n9e (Nightingale) server binary.
.PHONY: start build

# UTC build timestamp.
# NOTE(review): '%I' is the 12-hour clock; '%H' (24-hour) is likely intended — confirm.
NOW = $(shell date -u '+%Y%m%d%I%M%S')
RELEASE_VERSION = 5.1.0
APP = n9e
SERVER_BIN = ${APP}
# RELEASE_ROOT = release
# RELEASE_SERVER = release/${APP}
# GIT_COUNT = $(shell git rev-list --all --count)
# GIT_HASH = $(shell git rev-parse --short HEAD)
# RELEASE_TAG = $(RELEASE_VERSION).$(GIT_COUNT).$(GIT_HASH)

all: build

# Compile the server from ./src, stripping symbols (-w -s) and stamping
# the release version into main.VERSION via the linker.
build:
	@go build -ldflags "-w -s -X main.VERSION=$(RELEASE_VERSION)" -o $(SERVER_BIN) ./src

# start:
# @go run -ldflags "-X main.VERSION=$(RELEASE_TAG)" ./cmd/${APP}/main.go web -c ./configs/config.toml -m ./configs/model.conf --menu ./configs/menu.yaml
# swagger:
# @swag init --parseDependency --generalInfo ./cmd/${APP}/main.go --output ./internal/app/swagger
# wire:
# @wire gen ./internal/app
# test:
# cd ./internal/app/test && go test -v
# clean:
# rm -rf data release $(SERVER_BIN) internal/app/test/data cmd/${APP}/data
# pack: build
# rm -rf $(RELEASE_ROOT) && mkdir -p $(RELEASE_SERVER)
# cp -r $(SERVER_BIN) configs $(RELEASE_SERVER)
# cd $(RELEASE_ROOT) && tar -cvf $(APP).tar ${APP} && rm -rf ${APP}

View File

@ -1,10 +1,18 @@
## 基本信息
- 官网:[n9e.didiyun.com](https://n9e.didiyun.com/) 右上角切换版本
- 招聘前后端都要base北京薪资open可将简历发至邮箱 `echo cWlueWVuaW5nQGRpZGlnbG9iYWwuY29t | base64 -d` 一起来做开源
## 大本营
微信公号:`__n9e__`(夜莺监控)
知识星球:夜莺开源社区
钉钉交流群:
# todo
- [x] deploy nightingale in docker
- [x] export /metrics endpoint
- [ ] notify.py support feishu
- [ ] notify.py support sms
- [ ] notify.py support voice

View File

@ -1,9 +0,0 @@
package alert
import (
"context"
)
// Start launches the alert event consumer loop in a background goroutine.
// NOTE(review): ctx is currently unused — popEvent has no cancellation
// path, so the goroutine runs for the process lifetime; confirm intended.
func Start(ctx context.Context) {
	go popEvent()
}

View File

@ -1,325 +0,0 @@
package alert
import (
"bytes"
"encoding/json"
"fmt"
"os/exec"
"sort"
"strconv"
"strings"
"time"
"github.com/didi/nightingale/v5/cache"
"github.com/didi/nightingale/v5/config"
"github.com/didi/nightingale/v5/judge"
"github.com/didi/nightingale/v5/models"
"github.com/toolkits/pkg/concurrent/semaphore"
"github.com/toolkits/pkg/logger"
"github.com/toolkits/pkg/net/httplib"
"github.com/toolkits/pkg/sys"
)
// popEvent continuously drains the judge event queue in batches of up to
// 200 events and hands each batch to consume. The semaphore bounds how
// many notify scripts may run concurrently; when the queue is empty the
// loop idles briefly before polling again.
func popEvent() {
	sema := semaphore.NewSemaphore(config.Config.Alert.NotifyScriptConcurrency)
	const idleSleep = 100 * time.Millisecond
	for {
		batch := judge.EventQueue.PopBackBy(200)
		if len(batch) == 0 {
			time.Sleep(idleSleep)
			continue
		}
		consume(batch, sema)
	}
}
// consume processes one batch of popped alert events: each event is
// enriched from the cached rule and resource data, filtered when muted or
// outside the rule's effective window, persisted, and finally dispatched
// to callbacks and the notify script.
func consume(events []interface{}, sema *semaphore.Semaphore) {
	for i := range events {
		if events[i] == nil {
			continue
		}
		event := events[i].(*models.AlertEvent)
		alertRule, exists := cache.AlertRules.Get(event.RuleId)
		if !exists {
			logger.Errorf("event_consume: alert rule not found, event:%+v", event)
			continue
		}
		logger.Debugf("[event_consume_success][type:%v][event:%+v]", event.IsPromePull, event)
		if isNoneffective(event, alertRule) {
			// the alert rule is outside its effective time window
			continue
		}
		// copy rule metadata onto the event before persisting/notifying
		event.RuleName = alertRule.Name
		event.RuleNote = alertRule.Note
		event.NotifyChannels = alertRule.NotifyChannels
		classpaths := cache.ResClasspath.GetValues(event.ResIdent)
		sort.Strings(classpaths)
		event.ResClasspaths = strings.Join(classpaths, " ")
		enrichTag(event, alertRule)
		if isEventMute(event) && event.IsAlert() {
			// event is muted; optionally persist it for auditing
			event.MarkMuted()
			if config.Config.Alert.MutedAlertPersist {
				persist(event)
			}
			continue
		}
		// write event state to the database
		persist(event)
		// callbacks fire for both alerts and recoveries; receivers decide what to do
		if alertRule.Callbacks != "" {
			go callback(event, alertRule)
		}
		uids := genNotifyUserIDs(alertRule)
		if len(uids) == 0 {
			logger.Warningf("event_consume: notify users not found, event_hash_id: %s, rule_id: %d, rule_name: %s, res_ident: %s", event.HashId, event.RuleId, event.RuleName, event.ResIdent)
			continue
		}
		users := cache.UserCache.GetByIds(uids)
		if len(users) == 0 {
			logger.Warningf("event_consume: notify users not found, event_hash_id: %s, rule_id: %d, rule_name: %s, res_ident: %s", event.HashId, event.RuleId, event.RuleName, event.ResIdent)
			continue
		}
		alertMsg := AlertMsg{
			Event: event,
			Rule:  alertRule,
			Users: users,
		}
		logger.Infof("event_consume: notify alert:%+v", alertMsg)
		// bound the number of concurrently running notify scripts
		sema.Acquire()
		go func(alertMsg AlertMsg) {
			defer sema.Release()
			notify(alertMsg)
		}(alertMsg)
	}
}
// genNotifyUserIDs collects the distinct user ids to notify for a rule:
// the members of every notify group plus the directly listed users.
// Unparseable ids are logged and skipped.
func genNotifyUserIDs(alertRule *models.AlertRule) []int64 {
	seen := make(map[int64]struct{})

	for _, field := range strings.Fields(alertRule.NotifyGroups) {
		gid, err := strconv.ParseInt(field, 10, 64)
		if err != nil {
			logger.Warningf("event_consume: strconv groupid(%s) fail: %v", field, err)
			continue
		}
		if members, ok := cache.UserGroupMember.Get(gid); ok {
			for uid := range members {
				seen[uid] = struct{}{}
			}
		}
	}

	for _, field := range strings.Fields(alertRule.NotifyUsers) {
		uid, err := strconv.ParseInt(field, 10, 64)
		if err != nil {
			logger.Warningf("event_consume: strconv userid(%s) fail: %v", field, err)
			continue
		}
		seen[uid] = struct{}{}
	}

	out := make([]int64, 0, len(seen))
	for uid := range seen {
		out = append(out, uid)
	}
	return out
}
// persist writes the event's state to the database: an alert is inserted
// into the current-alerts table, while a recovery deletes the matching
// row. In both cases the event is appended to the history table.
func persist(event *models.AlertEvent) {
	if event.IsRecov() {
		logger.Debugf("[event.Recovery.db.DelByHashId]: delete recovery event:%+v", event)
		err := event.DelByHashId()
		if err != nil {
			logger.Warningf("event_consume: delete recovery event err:%v, event:%+v", err, event)
		}
	} else {
		err := event.Add()
		if err != nil {
			logger.Warningf("event_consume: insert alert event err:%v, event:%+v", err, event)
		}
	}
	obj := ToHistoryAlertEvent(event)
	err := obj.Add()
	if err != nil {
		logger.Warningf("event_consume: insert history alert event err:%v, event:%+v", err, event)
	}
}

// AlertMsg is the JSON payload piped to the notify script: the event, the
// rule that produced it, and the users to be notified.
type AlertMsg struct {
	Event *models.AlertEvent `json:"event"`
	Rule  *models.AlertRule  `json:"rule"`
	Users []*models.User     `json:"users"`
}
// notify serializes alertMsg to JSON and pipes it on stdin to the
// configured notify script. stdout and stderr are combined into one
// buffer; the script is killed if it runs longer than 10 seconds.
// Concurrency is bounded by the caller via the semaphore.
func notify(alertMsg AlertMsg) {
	bs, err := json.Marshal(alertMsg)
	if err != nil {
		logger.Errorf("notify: marshal alert %+v err:%v", alertMsg, err)
		// bug fix: previously fell through and executed the script with
		// empty stdin after a marshal failure
		return
	}
	fpath := config.Config.Alert.NotifyScriptPath
	cmd := exec.Command(fpath)
	cmd.Stdin = bytes.NewReader(bs)
	// combine stdout and stderr
	var buf bytes.Buffer
	cmd.Stdout = &buf
	cmd.Stderr = &buf
	err = cmd.Start()
	if err != nil {
		logger.Errorf("notify: run cmd err:%v", err)
		return
	}
	// WrapTimeout waits for the process, killing it at the deadline
	err, isTimeout := sys.WrapTimeout(cmd, time.Duration(10)*time.Second)
	if isTimeout {
		if err == nil {
			logger.Errorf("notify: timeout and killed process %s", fpath)
		}
		if err != nil {
			logger.Errorf("notify: kill process %s occur error %v", fpath, err)
		}
		return
	}
	if err != nil {
		logger.Errorf("notify: exec script %s occur error: %v, output: %s", fpath, err, buf.String())
		return
	}
	logger.Infof("notify: exec %s output: %s", fpath, buf.String())
}
// callback POSTs the event as JSON to every callback URL configured on
// the rule. URLs without an explicit scheme default to http://. Each
// attempt has a 5 second timeout; results are only logged.
func callback(event *models.AlertEvent, alertRule *models.AlertRule) {
	for _, u := range strings.Fields(alertRule.Callbacks) {
		if u == "" {
			continue
		}
		if !strings.HasPrefix(u, "http://") && !strings.HasPrefix(u, "https://") {
			u = "http://" + u
		}
		resp, code, err := httplib.PostJSON(u, 5*time.Second, event, map[string]string{})
		if err != nil {
			logger.Errorf("callback[%s] fail, callback content: %+v, resp: %s, err: %v, code:%d", u, event, string(resp), err, code)
			continue
		}
		logger.Infof("callback[%s] succ, callback content: %+v, resp: %s, code:%d", u, event, string(resp), code)
	}
}
// isNoneffective reports whether the event falls outside the rule's
// effective window: the rule is disabled, the trigger time is outside the
// daily start/end range (which may wrap midnight), or the trigger weekday
// is not among the enabled days.
//
// Bug fix: the weekday check previously rewrote alertRule.EnableDaysOfWeek
// in place via strings.Replace, mutating the shared cached rule object;
// the normalization now works on a local copy.
func isNoneffective(event *models.AlertEvent, alertRule *models.AlertRule) bool {
	if alertRule.Status == models.ALERT_RULE_DISABLED {
		logger.Debugf("event:%+v alert rule:%+v disable", event, alertRule)
		return true
	}
	tm := time.Unix(event.TriggerTime, 0)
	triggerTime := tm.Format("15:04")
	triggerWeek := strconv.Itoa(int(tm.Weekday()))
	if alertRule.EnableStime <= alertRule.EnableEtime {
		// normal window, e.g. 09:00-18:00
		if triggerTime < alertRule.EnableStime || triggerTime > alertRule.EnableEtime {
			logger.Debugf("event:%+v alert rule:%+v triggerTime Noneffective", event, alertRule)
			return true
		}
	} else {
		// window wraps midnight, e.g. 22:00-06:00
		if triggerTime < alertRule.EnableStime && triggerTime > alertRule.EnableEtime {
			logger.Debugf("event:%+v alert rule:%+v triggerTime Noneffective", event, alertRule)
			return true
		}
	}
	// config uses 7 for Sunday while time.Weekday uses 0
	enableDays := strings.Replace(alertRule.EnableDaysOfWeek, "7", "0", 1)
	if !strings.Contains(enableDays, triggerWeek) {
		logger.Debugf("event:%+v alert rule:%+v triggerWeek Noneffective", event, alertRule)
		return true
	}
	return false
}
// enrichTag merges the event's tag sources into TagMap — the resource
// ident, the data's own tags (resource tags were already folded in
// upstream), and the rule's append-tags — then renders them as a sorted
// space-separated "k=v" string on event.Tags.
func enrichTag(event *models.AlertEvent, alertRule *models.AlertRule) {
	if event.ResIdent != "" {
		event.TagMap["ident"] = event.ResIdent
	}
	if alertRule.AppendTags != "" {
		for _, pair := range strings.Fields(alertRule.AppendTags) {
			kv := strings.Split(pair, "=")
			if len(kv) != 2 {
				logger.Warningf("alertRule AppendTags:%+v illagel", alertRule.AppendTags)
				continue
			}
			event.TagMap[kv[0]] = kv[1]
		}
	}
	tagList := make([]string, 0, len(event.TagMap))
	for k, v := range event.TagMap {
		tagList = append(tagList, fmt.Sprintf("%s=%s", k, v))
	}
	sort.Strings(tagList)
	event.Tags = strings.Join(tagList, " ")
}
// ToHistoryAlertEvent copies an alert event into its history-table
// representation, field by field.
func ToHistoryAlertEvent(ae *models.AlertEvent) *models.HistoryAlertEvent {
	return &models.HistoryAlertEvent{
		RuleId:             ae.RuleId,
		RuleName:           ae.RuleName,
		RuleNote:           ae.RuleNote,
		HashId:             ae.HashId,
		IsPromePull:        ae.IsPromePull,
		ResClasspaths:      ae.ResClasspaths,
		ResIdent:           ae.ResIdent,
		Priority:           ae.Priority,
		Status:             ae.Status,
		IsRecovery:         ae.IsRecovery,
		HistoryPoints:      ae.HistoryPoints,
		TriggerTime:        ae.TriggerTime,
		Values:             ae.Values,
		NotifyChannels:     ae.NotifyChannels,
		NotifyGroups:       ae.NotifyGroups,
		NotifyUsers:        ae.NotifyUsers,
		RunbookUrl:         ae.RunbookUrl,
		ReadableExpression: ae.ReadableExpression,
		Tags:               ae.Tags,
		NotifyGroupObjs:    ae.NotifyGroupObjs,
		NotifyUserObjs:     ae.NotifyUserObjs,
	}
}

View File

@ -1,89 +0,0 @@
package alert
import (
"strings"
"github.com/didi/nightingale/v5/cache"
"github.com/didi/nightingale/v5/models"
"github.com/toolkits/pkg/logger"
)
// isEventMute reports whether the event is muted, either by a mute rule
// (matched with an empty metric or with any of the event's history-point
// metrics) or by a resource-level mute time window.
func isEventMute(event *models.AlertEvent) bool {
	historyPoints, err := event.GetHistoryPoints()
	if err != nil {
		logger.Errorf("get event HistoryPoints:%+v failed, err: %v", event.HistoryPoints, err)
		return false
	}
	// first try mute rules configured without a metric
	if matchMute("", event.ResIdent, event.TagMap, event.ResClasspaths) {
		return true
	}
	// an AND-condition rule may carry several metrics; any single
	// metric matching a mute rule mutes the whole event
	for i := 0; i < len(historyPoints); i++ {
		if matchMute(historyPoints[i].Metric, event.ResIdent, event.TagMap, event.ResClasspaths) {
			return true
		}
	}
	// resource-level mute window on the resource itself
	resAndTags, exists := cache.ResTags.Get(event.ResIdent)
	if exists {
		if event.TriggerTime > resAndTags.Resource.MuteBtime && event.TriggerTime < resAndTags.Resource.MuteEtime {
			return true
		}
	}
	return false
}
// matchMute reports whether any mute rule registered under metric matches
// the given ident, tags and classpaths. A single matching rule is enough
// to mute the event.
func matchMute(metric, ident string, tags map[string]string, classpaths string) bool {
	filters, ok := cache.AlertMute.GetByKey(metric)
	if !ok {
		// no mute rules concern this metric
		return false
	}
	for i := range filters {
		if matchMuteOnce(filters[i], ident, tags, classpaths) {
			return true
		}
	}
	return false
}
// matchMuteOnce reports whether one mute filter matches the event. Every
// configured dimension (classpath prefix, resource regex, tags) must
// match; any miss short-circuits to false.
func matchMuteOnce(filter cache.Filter, ident string, tags map[string]string, classpaths string) bool {
	if prefix := filter.ClasspathPrefix; len(prefix) > 0 {
		// Event classpaths are space separated, e.g. "n9e.mon n9e.rdb ccp.web".
		// A configured prefix like "n9e.rdb" matches when any classpath
		// starts with it: either the whole string has the prefix, or the
		// string contains " "+prefix.
		if !strings.HasPrefix(classpaths, prefix) && !strings.Contains(classpaths, " "+prefix) {
			return false
		}
	}
	if filter.ResReg != nil && !filter.ResReg.MatchString(ident) {
		// e.g. the rule is configured with "c3-ceph.*" while the event's
		// resource ident is "c4-ceph01.bj" — no match, stop here
		return false
	}
	// every tag of the mute rule must appear in the event's tags
	return mapContains(tags, filter.TagsMap)
}
// mapContains reports whether every key/value pair in small is also
// present, with an equal value, in big.
func mapContains(big, small map[string]string) bool {
	for k, want := range small {
		if got, ok := big[k]; !ok || got != want {
			return false
		}
	}
	return true
}

View File

@ -1,89 +0,0 @@
package backend
import (
"fmt"
"github.com/prometheus/prometheus/promql"
"github.com/didi/nightingale/v5/vos"
"github.com/toolkits/pkg/container/list"
pp "github.com/didi/nightingale/v5/backend/prome"
)
// BackendSection is the yaml configuration for time-series storage
// backends; DataSource names the default backend.
type BackendSection struct {
	DataSource string          `yaml:"datasource"`
	Prometheus pp.PromeSection `yaml:"prometheus"`
}

// DataSource is the full read/write interface a storage backend implements.
type DataSource interface {
	PushEndpoint
	QueryData(inputs vos.DataQueryParam) []*vos.DataQueryResp // query a time range
	QueryDataInstant(ql string) []*vos.DataQueryInstanceResp // query a single point in time, like prometheus instant_query
	QueryTagKeys(recv vos.CommonTagQueryParam) *vos.TagKeyQueryResp // list label names
	QueryTagValues(recv vos.CommonTagQueryParam) *vos.TagValueQueryResp // list values for one label name
	QueryTagPairs(recv vos.CommonTagQueryParam) *vos.TagPairQueryResp // list all matching series; shares the request struct with the two above
	QueryMetrics(recv vos.MetricQueryParam) *vos.MetricQueryResp // look up metric names by labels
	QueryVector(ql string) promql.Vector // used by prometheus pull alerts; other datasources may leave this empty
	CleanUp() // cleanup work to run when the datasource shuts down
}

// PushEndpoint receives metric points for asynchronous writing.
type PushEndpoint interface {
	Push2Queue(items []*vos.MetricPoint)
}

// Registries of configured backends, keyed by plugin id; populated by
// RegisterDataSource during Init.
var (
	defaultDataSource     string
	registryDataSources   = make(map[string]DataSource)
	registryPushEndpoints = make(map[string]PushEndpoint)
)

// Init wires up the configured backends. Currently only the prometheus
// datasource is supported.
func Init(cfg BackendSection) {
	defaultDataSource = cfg.DataSource
	// init prometheus
	if cfg.Prometheus.Enable {
		promeDs := &pp.PromeDataSource{
			Section: cfg.Prometheus,
			// bounded queue guards memory when remote write falls behind
			PushQueue: list.NewSafeListLimited(10240000),
		}
		promeDs.Init()
		RegisterDataSource(cfg.Prometheus.Name, promeDs)
	}
}
// GetDataSourceFor returns the registered datasource for pluginId; an
// empty pluginId selects the default datasource. An error is returned
// when no datasource is registered under that id.
func GetDataSourceFor(pluginId string) (DataSource, error) {
	key := pluginId
	if key == "" {
		key = defaultDataSource
	}
	source, ok := registryDataSources[key]
	if !ok {
		return nil, fmt.Errorf("could not find datasource for plugin: %s", key)
	}
	return source, nil
}
// DatasourceCleanUp runs every registered datasource's shutdown cleanup.
func DatasourceCleanUp() {
	for _, ds := range registryDataSources {
		ds.CleanUp()
	}
}
// GetPushEndpoints returns every registered push endpoint, or an error
// when none have been registered yet.
func GetPushEndpoints() ([]PushEndpoint, error) {
	if len(registryPushEndpoints) == 0 {
		return nil, fmt.Errorf("could not find any pushendpoint")
	}
	items := make([]PushEndpoint, 0, len(registryPushEndpoints))
	for _, ep := range registryPushEndpoints {
		items = append(items, ep)
	}
	return items, nil
}
// RegisterDataSource registers datasource under pluginId for both
// querying and pushing (every DataSource is also a PushEndpoint).
func RegisterDataSource(pluginId string, datasource DataSource) {
	registryDataSources[pluginId] = datasource
	registryPushEndpoints[pluginId] = datasource
}

View File

@ -1,183 +0,0 @@
package backend
import (
"bufio"
"bytes"
"context"
"io"
"io/ioutil"
"net/http"
"regexp"
"time"
"github.com/gogo/protobuf/proto"
"github.com/golang/snappy"
"github.com/opentracing-contrib/go-stdlib/nethttp"
"github.com/opentracing/opentracing-go"
"github.com/pkg/errors"
"github.com/prometheus/common/model"
"github.com/prometheus/prometheus/pkg/labels"
"github.com/prometheus/prometheus/prompb"
"github.com/toolkits/pkg/logger"
"github.com/didi/nightingale/v5/vos"
)
// MetricNameRE validates prometheus metric names.
var MetricNameRE = regexp.MustCompile(`^[a-zA-Z_:][a-zA-Z0-9_:]*$`)

// sample is the intermediate form of one point: its label set, timestamp
// (seconds) and value.
type sample struct {
	labels labels.Labels
	t      int64
	v      float64
}
// labelsToLabelsProto converts a labels.Labels set into prompb.Label
// values, reusing buf's backing array when its capacity suffices.
func labelsToLabelsProto(lbls labels.Labels, buf []prompb.Label) []prompb.Label {
	out := buf[:0]
	if cap(buf) < len(lbls) {
		out = make([]prompb.Label, 0, len(lbls))
	}
	for _, l := range lbls {
		out = append(out, prompb.Label{Name: l.Name, Value: l.Value})
	}
	return out
}
// convertOne converts a single MetricPoint into a prometheus TimeSeries,
// attaching __name__ and ident labels plus every tag whose key is a legal
// prometheus label name. Returns an error when the metric name is invalid.
func (pd *PromeDataSource) convertOne(item *vos.MetricPoint) (prompb.TimeSeries, error) {
	pt := prompb.TimeSeries{}
	pt.Samples = []prompb.Sample{{}}
	s := sample{}
	s.t = item.Time
	s.v = item.Value
	// the metric name must be a valid prometheus identifier
	if !MetricNameRE.MatchString(item.Metric) {
		return pt, errors.New("invalid metrics name")
	}
	nameLs := labels.Label{
		Name:  LABEL_NAME,
		Value: item.Metric,
	}
	s.labels = append(s.labels, nameLs)
	if item.Ident != "" {
		identLs := labels.Label{
			Name:  LABEL_IDENT,
			Value: item.Ident,
		}
		s.labels = append(s.labels, identLs)
	}
	// tags with illegal label names are silently dropped
	for k, v := range item.TagsMap {
		if model.LabelNameRE.MatchString(k) {
			ls := labels.Label{
				Name:  k,
				Value: v,
			}
			s.labels = append(s.labels, ls)
		}
	}
	pt.Labels = labelsToLabelsProto(s.labels, pt.Labels)
	// remote write expects millisecond timestamps
	tsMs := time.Unix(s.t, 0).UnixNano() / 1e6
	pt.Samples[0].Timestamp = tsMs
	pt.Samples[0].Value = s.v
	return pt, nil
}
// RecoverableError marks an error as retryable (network failures, 5xx
// responses) so the write loop will attempt the request again.
type RecoverableError struct {
	error
}

// remoteWritePost sends one snappy-compressed remote-write payload to c.
// Network failures and 5xx responses are wrapped in RecoverableError so
// the caller may retry; a 400 response is only logged at debug level and
// deliberately reported as success (client-side errors are not retryable).
func remoteWritePost(c *HttpClient, req []byte) error {
	httpReq, err := http.NewRequest("POST", c.url.String(), bytes.NewReader(req))
	if err != nil {
		// Errors from NewRequest are from unparsable URLs, so are not
		// recoverable.
		return err
	}
	httpReq.Header.Add("Content-Encoding", "snappy")
	httpReq.Header.Set("Content-Type", "application/x-protobuf")
	httpReq.Header.Set("User-Agent", "n9e-v5")
	httpReq.Header.Set("X-Prometheus-Remote-Write-Version", "0.1.0")
	ctx, cancel := context.WithTimeout(context.Background(), c.timeout)
	defer cancel()
	httpReq = httpReq.WithContext(ctx)
	if parentSpan := opentracing.SpanFromContext(ctx); parentSpan != nil {
		var ht *nethttp.Tracer
		httpReq, ht = nethttp.TraceRequest(
			parentSpan.Tracer(),
			httpReq,
			nethttp.OperationName("Remote Store"),
			nethttp.ClientTrace(false),
		)
		defer ht.Finish()
	}
	httpResp, err := c.Client.Do(httpReq)
	if err != nil {
		// Errors from Client.Do are from (for example) network errors, so are
		// recoverable.
		return RecoverableError{err}
	}
	// drain and close the body so the transport can reuse the connection
	defer func() {
		io.Copy(ioutil.Discard, httpResp.Body)
		httpResp.Body.Close()
	}()
	if httpResp.StatusCode/100 != 2 {
		scanner := bufio.NewScanner(io.LimitReader(httpResp.Body, 512))
		line := ""
		if scanner.Scan() {
			line = scanner.Text()
		}
		if httpResp.StatusCode == 400 {
			// 400 is a client-side problem: don't propagate, log to debug
			logger.Debugf("server returned HTTP status %s: %s req:%v", httpResp.Status, line, getSamples(req))
		} else {
			err = errors.Errorf("server returned HTTP status %s: %s", httpResp.Status, line)
		}
	}
	if httpResp.StatusCode/100 == 5 {
		return RecoverableError{err}
	}
	return err
}
// buildWriteRequest marshals the series into a snappy-compressed
// prometheus remote-write request body.
func (pd *PromeDataSource) buildWriteRequest(samples []prompb.TimeSeries) ([]byte, error) {
	data, err := proto.Marshal(&prompb.WriteRequest{
		Timeseries: samples,
		Metadata:   nil,
	})
	if err != nil {
		return nil, err
	}
	return snappy.Encode(nil, data), nil
}
// getSamples decodes a snappy-compressed remote-write payload back into
// its time series; used only when debug-logging rejected requests.
// Returns nil when the payload cannot be decoded.
func getSamples(compressed []byte) []prompb.TimeSeries {
	// bug fix: decode/unmarshal errors were previously ignored, which could
	// log series from a half-parsed request
	d, err := snappy.Decode(nil, compressed)
	if err != nil {
		logger.Errorf("getSamples: snappy decode err:%v", err)
		return nil
	}
	req := &prompb.WriteRequest{}
	if err := proto.Unmarshal(d, req); err != nil {
		logger.Errorf("getSamples: proto unmarshal err:%v", err)
		return nil
	}
	return req.Timeseries
}

View File

@ -1,257 +0,0 @@
package backend
import (
"io/ioutil"
"net/http"
"net/url"
"os"
"time"
"github.com/go-kit/kit/log"
"github.com/prometheus/client_golang/prometheus"
config_util "github.com/prometheus/common/config"
"github.com/prometheus/common/model"
"github.com/prometheus/common/promlog"
pc "github.com/prometheus/prometheus/config"
"github.com/prometheus/prometheus/prompb"
"github.com/prometheus/prometheus/promql"
"github.com/prometheus/prometheus/storage"
"github.com/prometheus/prometheus/storage/remote"
"github.com/toolkits/pkg/container/list"
"github.com/toolkits/pkg/logger"
"go.uber.org/atomic"
"github.com/didi/nightingale/v5/vos"
)
const (
	// DefaultPopNum is the fallback batch size for the remote-write loop.
	DefaultPopNum = 1000
)

// PromeSection is the yaml configuration of the prometheus datasource.
type PromeSection struct {
	Enable bool `yaml:"enable"`
	Name string `yaml:"name"`
	Batch int `yaml:"batch"` // points per remote-write request
	MaxRetry int `yaml:"maxRetry"` // retries for recoverable write errors
	LookbackDeltaMinute int `yaml:"lookbackDeltaMinute"` // promql lookback delta
	MaxConcurrentQuery int `yaml:"maxConcurrentQuery"`
	MaxSamples int `yaml:"maxSamples"`
	MaxFetchAllSeriesLimitMinute int64 `yaml:"maxFetchAllSeriesLimitMinute"` // cap on unfiltered series-fetch time range
	SlowLogRecordSecond float64 `yaml:"slowLogRecordSecond"` // slow-query logging threshold
	DefaultFetchSeriesQl string `yaml:"defaultFetchSeriesQl"` // selector used when the caller supplied none
	RemoteWrite []RemoteConfig `yaml:"remoteWrite"`
	RemoteRead []RemoteConfig `yaml:"remoteRead"`
}

// RemoteConfig describes one remote read/write target.
type RemoteConfig struct {
	Name string `yaml:"name"`
	Url string `yaml:"url"`
	RemoteTimeoutSecond int `yaml:"remoteTimeoutSecond"`
}

// PromeDataSource implements DataSource on top of prometheus remote
// read/write endpoints plus a local promql engine.
type PromeDataSource struct {
	Section     PromeSection
	LocalTmpDir string // scratch dir required by the remote-storage layer
	// backing store for queries other than raw promql
	Queryable storage.SampleAndChunkQueryable
	// engine for promql evaluation
	QueryEngine  *promql.Engine
	PushQueue    *list.SafeListLimited
	WriteTargets []*HttpClient
}

// safePromQLNoStepSubqueryInterval holds the no-step subquery interval
// (milliseconds) behind an atomic for concurrent access.
type safePromQLNoStepSubqueryInterval struct {
	value atomic.Int64
}

// HttpClient is one remote-write target.
type HttpClient struct {
	remoteName string // Used to differentiate clients in metrics.
	url        *url.URL
	Client     *http.Client
	timeout    time.Duration
}
func durationToInt64Millis(d time.Duration) int64 {
return int64(d / time.Millisecond)
}
// Set stores the interval, converted to milliseconds, atomically.
func (i *safePromQLNoStepSubqueryInterval) Set(ev model.Duration) {
	i.value.Store(durationToInt64Millis(time.Duration(ev)))
}

// Get returns the stored interval in milliseconds. The int64 argument is
// required by promql's NoStepSubqueryIntervalFn signature and is ignored.
func (i *safePromQLNoStepSubqueryInterval) Get(int64) int64 {
	return i.value.Load()
}

// CleanUp removes the temporary local tsdb directory created by Init.
func (pd *PromeDataSource) CleanUp() {
	err := os.RemoveAll(pd.LocalTmpDir)
	logger.Infof("[remove_prome_tmp_dir_err][dir:%+v][err: %v]", pd.LocalTmpDir, err)
}
// Init builds the prometheus datasource: it creates a throwaway local
// directory for the remote-storage bookkeeping, loads the remote-read
// queryables, constructs the promql engine, builds the remote-write
// clients, and starts the queue-consumer goroutine.
func (pd *PromeDataSource) Init() {
	// create the local scratch dir the remote-storage layer requires
	dbDir, err := ioutil.TempDir("", "tsdb-api-ready")
	if err != nil {
		logger.Errorf("[error_create_local_tsdb_dir][err: %v]", err)
		return
	}
	pd.LocalTmpDir = dbDir
	promlogConfig := promlog.Config{}
	// build the remote storage on top of the local dir
	remoteS := remote.NewStorage(promlog.New(&promlogConfig), prometheus.DefaultRegisterer, func() (int64, error) {
		return 0, nil
	}, dbDir, 1*time.Minute, nil)
	// ApplyConfig loads the queryables
	remoteReadC := make([]*pc.RemoteReadConfig, 0)
	for _, u := range pd.Section.RemoteRead {
		ur, err := url.Parse(u.Url)
		if err != nil {
			logger.Errorf("[prome_ds_init_error][parse_url_error][url:%+v][err:%+v]", u.Url, err)
			continue
		}
		remoteReadC = append(remoteReadC,
			&pc.RemoteReadConfig{
				URL:           &config_util.URL{URL: ur},
				RemoteTimeout: model.Duration(time.Duration(u.RemoteTimeoutSecond) * time.Second),
				ReadRecent:    true,
			},
		)
	}
	if len(remoteReadC) == 0 {
		logger.Errorf("[prome_ds_error_got_zero_remote_read_storage]")
		return
	}
	err = remoteS.ApplyConfig(&pc.Config{RemoteReadConfigs: remoteReadC})
	if err != nil {
		logger.Errorf("[error_load_remote_read_config][err: %v]", err)
		return
	}
	pLogger := log.NewNopLogger()
	noStepSubqueryInterval := &safePromQLNoStepSubqueryInterval{}
	queryQueueDir, err := ioutil.TempDir(dbDir, "prom_query_concurrency")
	if err != nil {
		// bug fix: this error was previously ignored, letting an invalid
		// directory reach promql.NewActiveQueryTracker
		logger.Errorf("[error_create_query_concurrency_dir][err: %v]", err)
		return
	}
	opts := promql.EngineOpts{
		Logger:                   log.With(pLogger, "component", "query engine"),
		Reg:                      prometheus.DefaultRegisterer,
		MaxSamples:               pd.Section.MaxSamples,
		Timeout:                  30 * time.Second,
		ActiveQueryTracker:       promql.NewActiveQueryTracker(queryQueueDir, pd.Section.MaxConcurrentQuery, log.With(pLogger, "component", "activeQueryTracker")),
		LookbackDelta:            time.Duration(pd.Section.LookbackDeltaMinute) * time.Minute,
		NoStepSubqueryIntervalFn: noStepSubqueryInterval.Get,
		EnableAtModifier:         true,
	}
	queryEngine := promql.NewEngine(opts)
	pd.QueryEngine = queryEngine
	pd.Queryable = remoteS
	// build the remote-write clients
	if len(pd.Section.RemoteWrite) == 0 {
		logger.Warningf("[prome_ds_init_with_zero_RemoteWrite_target]")
		logger.Infof("[successfully_init_prometheus_datasource][remote_read_num:%+v][remote_write_num:%+v]",
			len(pd.Section.RemoteRead),
			len(pd.Section.RemoteWrite),
		)
		return
	}
	writeTs := make([]*HttpClient, 0)
	for _, u := range pd.Section.RemoteWrite {
		ur, err := url.Parse(u.Url)
		if err != nil {
			logger.Errorf("[prome_ds_init_error][parse_url_error][url:%+v][err:%+v]", u.Url, err)
			continue
		}
		writeTs = append(writeTs,
			&HttpClient{
				remoteName: u.Name,
				url:        ur,
				Client:     &http.Client{},
				timeout:    time.Duration(u.RemoteTimeoutSecond) * time.Second,
			})
	}
	pd.WriteTargets = writeTs
	// start the remote-write queue consumer goroutine
	go pd.remoteWrite()
	logger.Infof("[successfully_init_prometheus_datasource][remote_read_num:%+v][remote_write_num:%+v]",
		len(remoteReadC),
		len(writeTs),
	)
}
// Push2Queue converts metric points into prometheus TimeSeries and
// enqueues them for the remote-write loop. Conversion failures and a full
// queue are logged and the affected point is dropped.
func (pd *PromeDataSource) Push2Queue(points []*vos.MetricPoint) {
	for _, p := range points {
		ts, err := pd.convertOne(p)
		if err != nil {
			logger.Errorf("[prome_convertOne_error][point: %+v][err:%s]", p, err)
			continue
		}
		if !pd.PushQueue.PushFront(ts) {
			logger.Errorf("[prome_push_queue_error][point: %+v] ", p)
		}
	}
}
// remoteWrite is the push-queue consumer loop: it pops batches of
// converted TimeSeries, marshals them into one write request and fans it
// out to every write target. It idles briefly when the queue is empty.
func (pd *PromeDataSource) remoteWrite() {
	batch := pd.Section.Batch // max points per request
	if batch <= 0 {
		batch = DefaultPopNum
	}
	for {
		items := pd.PushQueue.PopBackBy(batch)
		count := len(items)
		if count == 0 {
			time.Sleep(time.Millisecond * 100)
			continue
		}
		pbItems := make([]prompb.TimeSeries, count)
		for i := 0; i < count; i++ {
			pbItems[i] = items[i].(prompb.TimeSeries)
		}
		payload, err := pd.buildWriteRequest(pbItems)
		if err != nil {
			logger.Errorf("[prome_remote_write_error][pb_marshal_error][items: %+v][pb.err: %v]: ", items, err)
			continue
		}
		pd.processWrite(payload)
	}
}
// processWrite sends the payload to every write target concurrently,
// retrying recoverable errors up to MaxRetry times per target with a
// short backoff. Unrecoverable errors abort the retry loop immediately.
func (pd *PromeDataSource) processWrite(payload []byte) {
	retry := pd.Section.MaxRetry
	for _, c := range pd.WriteTargets {
		// capture a fresh copy for the goroutine (pre-Go1.22 loop-var semantics)
		newC := c
		go func(cc *HttpClient, payload []byte) {
			sendOk := false
			var rec bool
			var finalErr error
			for i := 0; i < retry; i++ {
				err := remoteWritePost(cc, payload)
				if err == nil {
					sendOk = true
					break
				}
				_, rec = err.(RecoverableError)
				if !rec {
					// unrecoverable: give up on this target immediately
					finalErr = err
					break
				}
				logger.Warningf("[send prome fail recoverableError][retry: %d/%d][err:%v]", i+1, retry, err)
				time.Sleep(time.Millisecond * 100)
			}
			if !sendOk {
				logger.Errorf("send prome finally fail: %v", finalErr)
			} else {
				logger.Debugf("send to prome %s ok", cc.url.String())
			}
		}(newC, payload)
	}
}

View File

@ -1,754 +0,0 @@
package backend
import (
"context"
"errors"
"fmt"
"math"
"sort"
"strings"
"time"
"github.com/prometheus/prometheus/pkg/labels"
"github.com/prometheus/prometheus/promql"
"github.com/prometheus/prometheus/promql/parser"
"github.com/prometheus/prometheus/storage"
"github.com/toolkits/pkg/logger"
"github.com/didi/nightingale/v5/cache"
"github.com/didi/nightingale/v5/models"
"github.com/didi/nightingale/v5/vos"
)
const (
	LABEL_IDENT  = "ident"    // label carrying the resource identity
	LABEL_NAME   = "__name__" // prometheus metric-name label
	DEFAULT_STEP = 15         // default query resolution, in seconds
)

// commonQueryObj is the internal request for series and label lookups.
type commonQueryObj struct {
	Idents          []string
	TagPairs        []*vos.TagPair
	Metric          string
	Start           int64
	End             int64
	MetricNameExact bool   // exact metric-name match; true when querying chart data
	From            string // name of the caller, for logging
}
// convertToPromql builds a promql label selector for index/label related
// lookups; the metric name and tag values are mostly matched with
// substring regexes.
func convertToPromql(recv *commonQueryObj) string {
	qlStr := ""
	qlStrFinal := ""
	metricName := ""
	labelIdent := ""
	labelStrSlice := make([]string, 0)
	// match the metric name: __name__=~".*xx.*" (exact when requested)
	if recv.Metric != "" {
		if recv.MetricNameExact {
			metricName = fmt.Sprintf(`__name__="%s"`, recv.Metric)
		} else {
			metricName = fmt.Sprintf(`__name__=~".*%s.*"`, recv.Metric)
		}
		labelStrSlice = append(labelStrSlice, metricName)
	}
	// match idents: ident=~"k1|k2"
	labelIdent = strings.Join(recv.Idents, "|")
	if labelIdent != "" {
		labelStrSlice = append(labelStrSlice, fmt.Sprintf(`ident=~"%s"`, labelIdent))
	}
	// match tags: values sharing a key are OR-ed into one substring regex
	labelM := make(map[string]string)
	for _, i := range recv.TagPairs {
		if i.Key == "" {
			continue
		}
		lastStr, _ := labelM[i.Key]
		lastStr += fmt.Sprintf(`.*%s.*|`, i.Value)
		labelM[i.Key] = lastStr
	}
	for k, v := range labelM {
		// drop the trailing "|" left by the accumulation above
		thisLabel := strings.TrimRight(v, "|")
		labelStrSlice = append(labelStrSlice, fmt.Sprintf(`%s=~"%s"`, k, thisLabel))
	}
	qlStr = strings.Join(labelStrSlice, ",")
	qlStrFinal = fmt.Sprintf(`{%s}`, qlStr)
	logger.Debugf("[convertToPromql][type=queryLabel][recv:%+v][qlStrFinal:%s]", recv, qlStrFinal)
	return qlStrFinal
}

// convertToPromqlForQueryData builds the selector used for data queries:
// the metric name and tag values are matched exactly (values sharing a
// key are OR-ed).
func convertToPromqlForQueryData(recv *commonQueryObj) string {
	qlStr := ""
	qlStrFinal := ""
	metricName := ""
	labelIdent := ""
	labelStrSlice := make([]string, 0)
	// exact metric-name match: __name__="xx"
	if recv.Metric != "" {
		metricName = fmt.Sprintf(`__name__="%s"`, recv.Metric)
		labelStrSlice = append(labelStrSlice, metricName)
	}
	// match idents: ident=~"k1|k2"
	labelIdent = strings.Join(recv.Idents, "|")
	if labelIdent != "" {
		labelStrSlice = append(labelStrSlice, fmt.Sprintf(`ident=~"%s"`, labelIdent))
	}
	// match tags: values sharing a key are OR-ed into one alternation
	labelM := make(map[string]string)
	for _, i := range recv.TagPairs {
		if i.Key == "" {
			continue
		}
		lastStr, _ := labelM[i.Key]
		lastStr += fmt.Sprintf(`%s|`, i.Value)
		labelM[i.Key] = lastStr
	}
	for k, v := range labelM {
		// drop the trailing "|" left by the accumulation above
		thisLabel := strings.TrimRight(v, "|")
		labelStrSlice = append(labelStrSlice, fmt.Sprintf(`%s=~"%s"`, k, thisLabel))
	}
	qlStr = strings.Join(labelStrSlice, ",")
	qlStrFinal = fmt.Sprintf(`{%s}`, qlStr)
	logger.Debugf("[convertToPromql][type=queryData][recv:%+v][qlStrFinal:%s]", recv, qlStrFinal)
	return qlStrFinal
}
// parseMatchersParam parses each match[] selector string into label
// matchers and rejects any set in which every matcher also matches the
// empty string (such a selector would select everything).
func parseMatchersParam(matchers []string) ([][]*labels.Matcher, error) {
	var matcherSets [][]*labels.Matcher
	for _, s := range matchers {
		matchers, err := parser.ParseMetricSelector(s)
		if err != nil {
			return nil, err
		}
		matcherSets = append(matcherSets, matchers)
	}
	// a set is valid as soon as one of its matchers rejects ""
OUTER:
	for _, ms := range matcherSets {
		for _, lm := range ms {
			if lm != nil && !lm.Matches("") {
				continue OUTER
			}
		}
		return nil, errors.New("match[] must contain at least one non-empty matcher")
	}
	return matcherSets, nil
}
// QueryData runs one promql range query per input parameter and converts
// the resulting matrix into DataQueryResp values. Each series is aligned
// to the requested [Start, End] range at the chosen resolution, with
// missing points padded as NaN.
func (pd *PromeDataSource) QueryData(inputs vos.DataQueryParam) []*vos.DataQueryResp {
	respD := make([]*vos.DataQueryResp, 0)
	for _, input := range inputs.Params {
		var qlStrFinal string
		if input.PromeQl != "" {
			// caller supplied raw promql: use it verbatim
			qlStrFinal = input.PromeQl
		} else {
			// derive idents from "ident" tag pairs when not given explicitly
			if len(input.Idents) == 0 {
				for i := range input.TagPairs {
					if input.TagPairs[i].Key == "ident" {
						input.Idents = append(input.Idents, input.TagPairs[i].Value)
					}
				}
			}
			// otherwise resolve idents from the classpath (optionally expanding by path prefix)
			if len(input.Idents) == 0 && input.ClasspathId != 0 {
				if input.ClasspathPrefix == 0 {
					classpathAndRes, exists := cache.ClasspathRes.Get(input.ClasspathId)
					if exists {
						input.Idents = classpathAndRes.Res
					}
				} else {
					classpath, err := models.ClasspathGet("id=?", input.ClasspathId)
					if err != nil {
						continue
					}
					cps, _ := models.ClasspathGetsByPrefix(classpath.Path)
					for _, classpath := range cps {
						classpathAndRes, exists := cache.ClasspathRes.Get(classpath.Id)
						if exists {
							idents := classpathAndRes.Res
							input.Idents = append(input.Idents, idents...)
						}
					}
				}
			}
			cj := &commonQueryObj{
				Idents:          input.Idents,
				TagPairs:        input.TagPairs,
				Metric:          input.Metric,
				Start:           inputs.Start,
				End:             inputs.End,
				MetricNameExact: true,
			}
			qlStrFinal = convertToPromqlForQueryData(cj)
		}
		logger.Debugf("[input:%+v][qlStrFinal:%s]\n", input, qlStrFinal)
		// convert to UTC time
		startT := tsToUtcTs(inputs.Start)
		endT := tsToUtcTs(inputs.End)
		resolution := time.Second * time.Duration(inputs.Step)
		if inputs.Step == 0 {
			// step==0 means we compute the resolution ourselves; grafana
			// and the prometheus UI normally pass it from the frontend
			delta := (inputs.End - inputs.Start) / 3600
			if delta <= 0 {
				delta = 1
			}
			resolution = time.Second * time.Duration(delta*DEFAULT_STEP)
		}
		q, err := pd.QueryEngine.NewRangeQuery(pd.Queryable, qlStrFinal, startT, endT, resolution)
		if err != nil {
			logger.Errorf("[prome_query_error][QueryData_error_may_be_parse_ql_error][args:%+v][err:%+v]", input, err)
			continue
		}
		// NOTE(review): the CancelFunc is discarded (go vet lostcancel) —
		// each loop iteration leaks the timeout context until its deadline
		// fires; consider calling cancel() after Exec.
		ctx, _ := context.WithTimeout(context.Background(), time.Second*30)
		res := q.Exec(ctx)
		if res.Err != nil {
			logger.Errorf("[prome_query_error][rangeQuery_exec_error][args:%+v][err:%+v]", input, res.Err)
			q.Close()
			continue
		}
		mat, ok := res.Value.(promql.Matrix)
		if !ok {
			logger.Errorf("[promql.Engine.exec: invalid expression type %q]", res.Value.Type())
			q.Close()
			continue
		}
		if res.Err != nil {
			logger.Errorf("[prome_query_error][res.Matrix_error][args:%+v][err:%+v]", input, res.Err)
			q.Close()
			continue
		}
		for index, m := range mat {
			// NOTE(review): uses continue rather than break once Limit is
			// reached, so remaining series are still iterated (and skipped)
			if inputs.Limit > 0 && index+1 > inputs.Limit {
				continue
			}
			tagStr := ""
			oneResp := &vos.DataQueryResp{}
			ident := m.Metric.Get(LABEL_IDENT)
			name := m.Metric.Get(LABEL_NAME)
			oneResp.Metric = name
			oneResp.Ident = ident
			pNum := len(m.Points)
			interval := int64(resolution / time.Second)
			pNumExpect := int((inputs.End - inputs.Start) / interval)
			remotePIndex := 0
			for i := 0; i <= pNumExpect; i++ {
				// prepare a NaN point for this expected timestamp first
				tsLocal := inputs.Start + interval*int64(i)
				tmpP := &vos.Point{
					Timestamp: tsLocal,
					Value:     vos.JsonFloat(math.NaN()),
				}
				// the remote points array is not exhausted yet:
				// try to consume one point from m.Points
				if remotePIndex < pNum {
					pointOne := m.Points[remotePIndex]
					tsRemote := pointOne.T / 1e3
					// timestamps within 1 second are considered aligned
					if math.Abs(float64(tsRemote-tsLocal)) <= 1 {
						tmpP.Timestamp = tsRemote
						tmpP.Value = vos.JsonFloat(pointOne.V)
						// the remote point at this index was consumed; advance
						remotePIndex++
					}
				}
				oneResp.Values = append(oneResp.Values, tmpP)
			}
			for _, x := range m.Metric {
				if x.Name == LABEL_NAME {
					continue
				}
				tagStr += fmt.Sprintf("%s=%s,", x.Name, x.Value)
			}
			tagStr = strings.TrimRight(tagStr, ",")
			oneResp.Tags = tagStr
			oneResp.Resolution = interval
			oneResp.PNum = pNum
			respD = append(respD, oneResp)
		}
		q.Close()
	}
	return respD
}
func tsToUtcTs(s int64) time.Time {
return time.Unix(s, 0).UTC()
}
func timeParse(ts int64) time.Time {
t := float64(ts)
s, ns := math.Modf(t)
ns = math.Round(ns*1000) / 1000
return time.Unix(int64(s), int64(ns*float64(time.Second))).UTC()
}
func millisecondTs(t time.Time) int64 {
return t.Unix()*1000 + int64(t.Nanosecond())/int64(time.Millisecond)
}
func tsToStr(timestamp int64) string {
timeNow := time.Unix(timestamp, 0)
return timeNow.Format("2006-01-02 15:04:05")
}
// CommonQuerySeries runs a series (metadata) lookup against the remote
// storage and returns the matching SeriesSet. It is the shared backend
// for QueryTagKeys / QueryTagValues / QueryMetrics / QueryTagPairs.
//
// It returns nil when the selector cannot be parsed or a querier cannot
// be created, so callers must nil-check the result before using it.
func (pd *PromeDataSource) CommonQuerySeries(cj *commonQueryObj) storage.SeriesSet {
	qlStrFinal := convertToPromql(cj)

	if qlStrFinal == "{}" {
		// The caller specified nothing: fall back to the configured default
		// selector and clamp the time range, to avoid an expensive
		// high-cardinality fetch-all query.
		qlStrFinal = pd.Section.DefaultFetchSeriesQl
		reqMinute := (cj.End - cj.Start) / 60
		if reqMinute > pd.Section.MaxFetchAllSeriesLimitMinute {
			now := time.Now().Unix()
			cj.End = now
			cj.Start = now - pd.Section.MaxFetchAllSeriesLimitMinute*60
			logger.Debugf("[CommonQuerySeries.FetchAllSeries.LimitQueryTimeRange][start:%v][end:%v]", cj.Start, cj.End)
		}
	}

	matcherSets, err := parseMatchersParam([]string{qlStrFinal})
	if err != nil {
		logger.Errorf("[prome_query_error][parse_label_match_error][err:%+v]", err)
		return nil
	}

	// Default the time range when the caller left it open-ended.
	now := time.Now().Unix()
	if cj.Start == 0 {
		cj.Start = now - 60*pd.Section.MaxFetchAllSeriesLimitMinute
	}
	if cj.End == 0 {
		cj.End = now
	}

	startT := millisecondTs(timeParse(cj.Start))
	endT := millisecondTs(timeParse(cj.End))

	// Fix: the original discarded the CancelFunc, leaking the timeout's
	// resources until the 30s timer fired. Cancel as soon as we return.
	ctx, cancel := context.WithTimeout(context.Background(), time.Second*30)
	defer cancel()

	q, err := pd.Queryable.Querier(ctx, startT, endT)
	if err != nil {
		// Fix: include the error that was previously dropped from the log.
		logger.Errorf("[prome_query_error][get_querier_error][err:%+v]", err)
		return nil
	}
	logger.Debugf("[CommonQuerySeries.Result][from:%s][cj.start_ts:%+v cj.start_str:%+v SelectHints.startT:%+v][cj.end_ts:%+v cj.end_str:%+v SelectHints.endT:%+v][qlStrFinal:%s][cj:%+v]",
		cj.From,
		cj.Start,
		tsToStr(cj.Start),
		startT,
		cj.End,
		tsToStr(cj.End),
		endT,
		qlStrFinal,
		cj,
	)
	// NOTE(review): q is closed when this function returns, before the
	// caller iterates the returned SeriesSet — this preserves the original
	// behavior, but confirm the remote querier supports reads after Close.
	defer q.Close()

	hints := &storage.SelectHints{
		Start: startT,
		End:   endT,
		Func:  "series", // There is no series function, this token is used for lookups that don't need samples.
	}

	// Get all series which match matchers.
	startTs := time.Now()
	s := q.Select(true, hints, matcherSets[0]...)
	timeTookSecond := time.Since(startTs).Seconds()
	if timeTookSecond > pd.Section.SlowLogRecordSecond {
		logger.Warningf("[prome_remote_read_show_slow_log_CommonQuerySeries_select][threshold:%v][timeTookSecond:%v][from:%v][args:%+v][promql:%v]",
			pd.Section.SlowLogRecordSecond,
			timeTookSecond,
			cj.From,
			cj,
			qlStrFinal,
		)
	}
	return s
}
// QueryTagKeys returns the distinct label names (tag keys) present on the
// series matching the request — the equivalent of Prometheus' /api/v1/labels.
// Everything is expressed as a selector like {__name__="a",label_a!="b",...}.
// TODO: switch to native remote_read label_values once Prometheus supports it.
// Implement: https://github.com/prometheus/prometheus/issues/3351
func (pd *PromeDataSource) QueryTagKeys(recv vos.CommonTagQueryParam) *vos.TagKeyQueryResp {
	startTs := time.Now()
	respD := &vos.TagKeyQueryResp{
		Keys: make([]string, 0),
	}
	labelNamesSet := make(map[string]struct{})
	// An empty request means "all idents / all metrics".
	if len(recv.Params) == 0 {
		recv.Params = append(recv.Params, vos.TagPairQueryParamOne{
			Idents: []string{},
			Metric: "",
		})
	}
	resultSeries := ""
	for _, x := range recv.Params {
		cj := &commonQueryObj{
			Idents:   x.Idents,
			TagPairs: recv.TagPairs,
			Metric:   x.Metric,
			Start:    recv.Start,
			End:      recv.End,
			From:     "QueryTagKeys",
		}

		s := pd.CommonQuerySeries(cj)
		// Fix: CommonQuerySeries returns nil on parse/querier errors;
		// calling Warnings() on a nil interface would panic.
		if s == nil {
			continue
		}
		if s.Warnings() != nil {
			logger.Warningf("[prome_query_error][series_set_iter_error][warning:%+v]", s.Warnings())
		}
		if err := s.Err(); err != nil {
			logger.Errorf("[prome_query_error][series_set_iter_error][err:%+v]", err)
			continue
		}
		thisSeriesNum := 0
		for s.Next() {
			series := s.At()
			thisSeriesNum++
			for _, lb := range series.Labels() {
				if lb.Name == LABEL_NAME {
					continue
				}
				// Optional substring filter on the key name.
				if recv.TagKey != "" {
					if !strings.Contains(lb.Name, recv.TagKey) {
						continue
					}
				}
				labelNamesSet[lb.Name] = struct{}{}
			}
		}
		resultSeries += fmt.Sprintf(" %d ", thisSeriesNum)
	}

	names := make([]string, len(labelNamesSet))
	i := 0
	for key := range labelNamesSet {
		names[i] = key
		i++
	}
	// Map iteration order is random; sort for stable output.
	sort.Strings(names)
	if recv.Limit > 0 && len(names) > recv.Limit {
		names = names[:recv.Limit]
	}
	respD.Keys = names
	timeTookSecond := time.Since(startTs).Seconds()
	if timeTookSecond > pd.Section.SlowLogRecordSecond {
		logger.Warningf("[prome_remote_read_show_slow_log][threshold:%v][timeTookSecond:%v][func:QueryTagKeys][args:%+v][resultSeries:%v]", pd.Section.SlowLogRecordSecond, timeTookSecond, recv, resultSeries)
	}
	return respD
}
// QueryTagValues returns the distinct values of one label name on the
// series matching the request — the equivalent of Prometheus'
// /api/v1/label/<label_name>/values.
func (pd *PromeDataSource) QueryTagValues(recv vos.CommonTagQueryParam) *vos.TagValueQueryResp {
	startTs := time.Now()
	labelValuesSet := make(map[string]struct{})
	// An empty request means "all idents / all metrics".
	if len(recv.Params) == 0 {
		recv.Params = append(recv.Params, vos.TagPairQueryParamOne{
			Idents: []string{},
			Metric: "",
		})
	}
	resultSeries := ""
	for _, x := range recv.Params {
		cj := &commonQueryObj{
			Idents:   x.Idents,
			Metric:   x.Metric,
			TagPairs: recv.TagPairs,
			Start:    recv.Start,
			End:      recv.End,
			From:     "QueryTagValues",
		}
		s := pd.CommonQuerySeries(cj)
		// Fix: guard against the nil SeriesSet CommonQuerySeries returns on
		// error — calling Warnings() on a nil interface would panic.
		if s == nil {
			continue
		}
		if s.Warnings() != nil {
			logger.Warningf("[prome_query_error][series_set_iter_error][warning:%+v]", s.Warnings())
		}
		if err := s.Err(); err != nil {
			logger.Errorf("[prome_query_error][series_set_iter_error][err:%+v]", err)
			continue
		}
		thisSeriesNum := 0
		for s.Next() {
			series := s.At()
			thisSeriesNum++
			for _, lb := range series.Labels() {
				if lb.Name == recv.TagKey {
					// Optional substring filter on the value.
					if recv.TagValue != "" {
						if !strings.Contains(lb.Value, recv.TagValue) {
							continue
						}
					}
					labelValuesSet[lb.Value] = struct{}{}
				}
			}
		}
		resultSeries += fmt.Sprintf(" %d ", thisSeriesNum)
	}
	vals := make([]string, len(labelValuesSet))
	i := 0
	for val := range labelValuesSet {
		vals[i] = val
		i++
	}
	// Sort for stable output (map iteration order is random).
	sort.Strings(vals)
	if recv.Limit > 0 && len(vals) > recv.Limit {
		vals = vals[:recv.Limit]
	}
	respD := &vos.TagValueQueryResp{}
	respD.Values = vals
	timeTookSecond := time.Since(startTs).Seconds()
	if timeTookSecond > pd.Section.SlowLogRecordSecond {
		logger.Warningf("[prome_remote_read_show_slow_log][threshold:%v][timeTookSecond:%v][func:QueryTagValues][args:%+v][resultSeries:%v]", pd.Section.SlowLogRecordSecond, timeTookSecond, recv, resultSeries)
	}
	return respD
}
// QueryMetrics returns the distinct metric names (__name__ values) of the
// series matching the request — the equivalent of Prometheus'
// /api/v1/label/__name__/values.
func (pd *PromeDataSource) QueryMetrics(recv vos.MetricQueryParam) *vos.MetricQueryResp {
	startTs := time.Now()
	cj := &commonQueryObj{
		Idents:   recv.Idents,
		Metric:   recv.Metric,
		TagPairs: recv.TagPairs,
		Start:    recv.Start,
		End:      recv.End,
		From:     "QueryMetrics",
	}
	respD := &vos.MetricQueryResp{}
	respD.Metrics = make([]string, 0)
	s := pd.CommonQuerySeries(cj)
	// Fix: CommonQuerySeries returns nil on parse/querier errors; calling
	// Warnings() on a nil interface would panic.
	if s == nil {
		return respD
	}
	if s.Warnings() != nil {
		logger.Warningf("[prome_query_error][series_set_iter_error][warning:%+v]", s.Warnings())
	}
	if err := s.Err(); err != nil {
		logger.Errorf("[prome_query_error][series_set_iter_error][err:%+v]", err)
		return respD
	}
	var sets []storage.SeriesSet
	sets = append(sets, s)
	set := storage.NewMergeSeriesSet(sets, storage.ChainedSeriesMerge)
	labelValuesSet := make(map[string]struct{})
	resultSeries := ""
	thisSeriesNum := 0
	for set.Next() {
		series := set.At()
		thisSeriesNum++
		for _, lb := range series.Labels() {
			if lb.Name == LABEL_NAME {
				labelValuesSet[lb.Value] = struct{}{}
			}
		}
	}
	resultSeries += fmt.Sprintf(" %d ", thisSeriesNum)
	vals := make([]string, len(labelValuesSet))
	i := 0
	for val := range labelValuesSet {
		vals[i] = val
		i++
	}
	// Sort for stable output (map iteration order is random).
	sort.Strings(vals)
	if recv.Limit > 0 && len(vals) > recv.Limit {
		vals = vals[:recv.Limit]
	}
	respD.Metrics = vals
	timeTookSecond := time.Since(startTs).Seconds()
	if timeTookSecond > pd.Section.SlowLogRecordSecond {
		logger.Warningf("[prome_remote_read_show_slow_log][threshold:%v][timeTookSecond:%v][func:QueryMetrics][args:%+v][resultSeries:%v]", pd.Section.SlowLogRecordSecond, timeTookSecond, recv, resultSeries)
	}
	return respD
}
// QueryTagPairs returns de-duplicated "key=value" label pairs of the
// series matching the request — the equivalent of Prometheus' /api/v1/series.
func (pd *PromeDataSource) QueryTagPairs(recv vos.CommonTagQueryParam) *vos.TagPairQueryResp {
	startTs := time.Now()
	respD := &vos.TagPairQueryResp{
		TagPairs: make([]string, 0),
		Idents:   make([]string, 0),
	}
	tps := make(map[string]struct{})
	// An empty request means "all idents / all metrics".
	if len(recv.Params) == 0 {
		recv.Params = append(recv.Params, vos.TagPairQueryParamOne{
			Idents: []string{},
			Metric: "",
		})
	}
	resultSeries := ""
	for _, x := range recv.Params {
		cj := &commonQueryObj{
			Idents:   x.Idents,
			TagPairs: recv.TagPairs,
			Metric:   x.Metric,
			Start:    recv.Start,
			End:      recv.End,
			From:     "QueryTagPairs",
		}
		s := pd.CommonQuerySeries(cj)
		// Fix: CommonQuerySeries returns nil on parse/querier errors;
		// calling Warnings() on a nil interface would panic.
		if s == nil {
			continue
		}
		if s.Warnings() != nil {
			logger.Warningf("[prome_query_error][series_set_iter_error][warning:%+v]", s.Warnings())
		}
		if err := s.Err(); err != nil {
			logger.Errorf("[prome_query_error][series_set_iter_error][err:%+v]", err)
			continue
		}
		var sets []storage.SeriesSet
		sets = append(sets, s)
		set := storage.NewMergeSeriesSet(sets, storage.ChainedSeriesMerge)
		labelIdents := make([]string, 0)
		thisSeriesNum := 0
		for set.Next() {
			// Fix: read from the merged set being iterated — the original
			// called s.At() here, which is not the set being advanced.
			series := set.At()
			thisSeriesNum++
			labelsS := series.Labels()
			for _, i := range labelsS {
				if i.Name == LABEL_NAME {
					continue
				}
				if i.Name == LABEL_IDENT {
					labelIdents = append(labelIdents, i.Value)
				}
				if recv.Search != "" {
					// With a search string, a match on either the key or the
					// value is enough.
					if strings.Contains(i.Name, recv.Search) || strings.Contains(i.Value, recv.Search) {
						tps[fmt.Sprintf("%s=%s", i.Name, i.Value)] = struct{}{}
					}
				} else {
					tps[fmt.Sprintf("%s=%s", i.Name, i.Value)] = struct{}{}
				}
			}
		}
		// NOTE(review): labelIdents is collected but never surfaced, so
		// respD.Idents always stays empty — presumably intended for it;
		// behavior kept as-is, confirm with the API consumers.
		resultSeries += fmt.Sprintf(" %d ", thisSeriesNum)
	}
	newTags := make([]string, len(tps))
	i := 0
	for k := range tps {
		newTags[i] = k
		i++
	}
	// Sort for stable output (map iteration order is random).
	sort.Strings(newTags)
	if recv.Limit > 0 && len(newTags) > recv.Limit {
		newTags = newTags[:recv.Limit]
	}
	respD.TagPairs = newTags
	timeTookSecond := time.Since(startTs).Seconds()
	if timeTookSecond > pd.Section.SlowLogRecordSecond {
		logger.Warningf("[prome_remote_read_show_slow_log][threshold:%v][timeTookSecond:%v][func:QueryTagPairs][args:%+v][resultSeries:%v]", pd.Section.SlowLogRecordSecond, timeTookSecond, recv, resultSeries)
	}
	return respD
}
// QueryDataInstant evaluates an instant PromQL query and converts the
// resulting vector into the API response shape. An empty slice is
// returned when the query fails.
func (pd *PromeDataSource) QueryDataInstant(ql string) []*vos.DataQueryInstanceResp {
	out := make([]*vos.DataQueryInstanceResp, 0)
	vector := pd.QueryVector(ql)
	if vector == nil {
		return out
	}
	for _, sample := range vector {
		labelMap := make(map[string]interface{})
		for _, lb := range sample.Metric {
			if lb.Name == LABEL_NAME {
				continue
			}
			labelMap[lb.Name] = lb.Value
		}
		// Convert the millisecond sample timestamp to seconds.
		valuePair := make([]float64, 0)
		valuePair = append(valuePair, float64(sample.Point.T)/1e3)
		valuePair = append(valuePair, sample.Point.V)
		out = append(out, &vos.DataQueryInstanceResp{
			Metric: labelMap,
			Value:  valuePair,
		})
	}
	return out
}
// QueryVector evaluates ql as an instant query at the current time and
// returns the resulting vector. A scalar result is wrapped in a
// single-sample vector; nil is returned on any error.
func (pd *PromeDataSource) QueryVector(ql string) promql.Vector {
	t := time.Now()
	q, err := pd.QueryEngine.NewInstantQuery(pd.Queryable, ql, t)
	if err != nil {
		logger.Errorf("[prome_query_error][new_insQuery_error][err:%+v][ql:%+v]", err, ql)
		return nil
	}
	// Fix: register Close before Exec so the query is released on every
	// path — the original leaked q when Exec returned an error.
	defer q.Close()

	ctx := context.Background()
	res := q.Exec(ctx)
	if res.Err != nil {
		// Fix: log the execution error itself; the original logged the nil
		// err left over from NewInstantQuery.
		logger.Errorf("[prome_query_error][insQuery_exec_error][err:%+v][ql:%+v]", res.Err, ql)
		return nil
	}
	switch v := res.Value.(type) {
	case promql.Vector:
		return v
	case promql.Scalar:
		// Promote the scalar into a one-sample vector with empty labels.
		return promql.Vector{promql.Sample{
			Point:  promql.Point(v),
			Metric: labels.Labels{},
		}}
	default:
		// Fix: report the unexpected result type; err is always nil here.
		logger.Errorf("[prome_query_error][insQuery_res_error rule result is not a vector or scalar][type:%T][ql:%+v]", res.Value, ql)
		return nil
	}
}

View File

@ -1,9 +0,0 @@
#!/bin/bash
# Build script for the n9e-server binary.
# release version
version=5.0.0-rc7-1
# Optional Go module / proxy settings, kept for reference:
#export GO111MODULE=on
#export GOPROXY=https://goproxy.cn
# Embed the release version into config.Version via -ldflags.
go build -ldflags "-X github.com/didi/nightingale/v5/config.Version=${version}" -o n9e-server main.go

33
cache/alert_mute.go vendored
View File

@ -1,33 +0,0 @@
package cache
import (
"regexp"
"sync"
)
// AlertMuteMap is a concurrency-safe cache of alert-mute filters,
// keyed by string (grouping key) with a list of Filters per key.
type AlertMuteMap struct {
	sync.RWMutex
	Data map[string][]Filter
}

// Filter describes one mute condition: a classpath prefix, a resource
// regexp, and an exact tag map.
type Filter struct {
	ClasspathPrefix string
	ResReg          *regexp.Regexp
	TagsMap         map[string]string
}

// AlertMute is the process-wide mute cache.
var AlertMute = &AlertMuteMap{Data: make(map[string][]Filter)}

// SetAll atomically replaces the whole cache content.
func (a *AlertMuteMap) SetAll(m map[string][]Filter) {
	a.Lock()
	a.Data = m
	a.Unlock()
}

// GetByKey returns the filters stored under key and whether the key exists.
func (a *AlertMuteMap) GetByKey(key string) ([]Filter, bool) {
	a.RLock()
	filters, ok := a.Data[key]
	a.RUnlock()
	return filters, ok
}

75
cache/alert_rule.go vendored
View File

@ -1,75 +0,0 @@
package cache
import (
"sync"
"github.com/didi/nightingale/v5/models"
)
// AlertRulesByMetricCache indexes alert rules by metric name for fast
// lookup, together with bookkeeping from the last database sync.
type AlertRulesByMetricCache struct {
	sync.RWMutex
	Data        map[string][]*models.AlertRule // keyed by metric for fast retrieval
	MaxUpdateTs int64                          // max update_at observed in the DB
	RuleNum     int64                          // row count reported by the DB
	LastSync    int64                          // timestamp of the last full sync
}

var (
	AlertRulesByMetric = &AlertRulesByMetricCache{Data: make(map[string][]*models.AlertRule)}
)

// GetBy returns the rules indexed under the given metric key.
func (a *AlertRulesByMetricCache) GetBy(instance string) []*models.AlertRule {
	a.RLock()
	rules := a.Data[instance]
	a.RUnlock()
	return rules
}

// SetAll atomically replaces the index and its sync bookkeeping.
func (a *AlertRulesByMetricCache) SetAll(alertRulesMap map[string][]*models.AlertRule, lastUpdateTs, ruleNum, lastSync int64) {
	a.Lock()
	a.Data = alertRulesMap
	a.MaxUpdateTs = lastUpdateTs
	a.RuleNum = ruleNum
	a.LastSync = lastSync
	a.Unlock()
}

// AlertRulesTotalCache caches every alert rule by id.
type AlertRulesTotalCache struct {
	sync.RWMutex
	Data map[int64]*models.AlertRule
}

var AlertRules = &AlertRulesTotalCache{Data: make(map[int64]*models.AlertRule)}

// Get returns the rule with the given id and whether it exists.
func (a *AlertRulesTotalCache) Get(id int64) (*models.AlertRule, bool) {
	a.RLock()
	rule, ok := a.Data[id]
	a.RUnlock()
	return rule, ok
}

// SetAll atomically replaces the whole cache content.
func (a *AlertRulesTotalCache) SetAll(alertRulesMap map[int64]*models.AlertRule) {
	a.Lock()
	a.Data = alertRulesMap
	a.Unlock()
}

// Pulls returns the list of all PULL-type rules.
func (a *AlertRulesTotalCache) Pulls() []*models.AlertRule {
	a.RLock()
	defer a.RUnlock()
	ret := make([]*models.AlertRule, 0, len(a.Data))
	for _, rule := range a.Data {
		if rule.Type == models.PULL {
			ret = append(ret, rule)
		}
	}
	return ret
}

7
cache/cache.go vendored
View File

@ -1,7 +0,0 @@
package cache
import (
cmap "github.com/orcaman/concurrent-map"
)
var MetricDescMapper = cmap.New()

View File

@ -1,27 +0,0 @@
package cache
import (
"sync"
)
// ClasspathPrefixMap maps a classpath id to a list of related classpath
// ids, guarded by an RWMutex for concurrent use.
type ClasspathPrefixMap struct {
	sync.RWMutex
	Data map[int64][]int64
}

// ClasspathPrefix is the process-wide prefix cache.
var ClasspathPrefix = &ClasspathPrefixMap{Data: make(map[int64][]int64)}

// Get returns the ids stored under the given classpath id and whether
// the key exists.
func (c *ClasspathPrefixMap) Get(id int64) ([]int64, bool) {
	c.RLock()
	defer c.RUnlock()
	ids, exists := c.Data[id]
	return ids, exists
}

// SetAll atomically replaces the whole mapping.
// (The redundant trailing bare "return" of the original was removed.)
func (c *ClasspathPrefixMap) SetAll(data map[int64][]int64) {
	c.Lock()
	defer c.Unlock()
	c.Data = data
}

View File

@ -1,33 +0,0 @@
package cache
import (
"sync"
"github.com/didi/nightingale/v5/models"
)
// ClasspathResMap caches, per classpath id, the classpath row itself plus
// the idents of the resources mounted under it.
type ClasspathResMap struct {
	sync.RWMutex
	Data map[int64]*ClasspathAndRes
}

// ClasspathAndRes bundles a classpath with its resource idents.
type ClasspathAndRes struct {
	Res       []string
	Classpath *models.Classpath
}

// ClasspathRes maps classpath_id -> classpath & res_idents.
var ClasspathRes = &ClasspathResMap{Data: make(map[int64]*ClasspathAndRes)}

// Get returns the entry for the given classpath id and whether it exists.
func (c *ClasspathResMap) Get(id int64) (*ClasspathAndRes, bool) {
	c.RLock()
	entry, ok := c.Data[id]
	c.RUnlock()
	return entry, ok
}

// SetAll atomically replaces the whole cache content.
func (c *ClasspathResMap) SetAll(collectRulesMap map[int64]*ClasspathAndRes) {
	c.Lock()
	c.Data = collectRulesMap
	c.Unlock()
}

32
cache/collect_rule.go vendored
View File

@ -1,32 +0,0 @@
package cache
import (
"sync"
"github.com/didi/nightingale/v5/models"
)
// CollectRuleOfIdentMap caches collect rules keyed by resource ident.
type CollectRuleOfIdentMap struct {
	sync.RWMutex
	Data map[string][]*models.CollectRule
}

// CollectRulesOfIdent is the process-wide collect-rule cache.
var CollectRulesOfIdent = &CollectRuleOfIdentMap{Data: make(map[string][]*models.CollectRule)}

// GetBy returns the collect rules of a single ident.
func (c *CollectRuleOfIdentMap) GetBy(ident string) []*models.CollectRule {
	c.RLock()
	rules := c.Data[ident]
	c.RUnlock()
	return rules
}

// Set replaces the rules of a single ident.
func (c *CollectRuleOfIdentMap) Set(node string, collectRules []*models.CollectRule) {
	c.Lock()
	c.Data[node] = collectRules
	c.Unlock()
}

// SetAll atomically replaces the whole cache content.
func (c *CollectRuleOfIdentMap) SetAll(collectRulesMap map[string][]*models.CollectRule) {
	c.Lock()
	c.Data = collectRulesMap
	c.Unlock()
}

View File

@ -1,76 +0,0 @@
package cache
import (
"sync"
)
// SafeDoubleMap is a concurrency-safe two-level set:
// outer key -> inner key -> struct{}{}.
type SafeDoubleMap struct {
	sync.RWMutex
	M map[string]map[string]struct{}
}

// ResClasspath maps res_ident -> classpath_path -> struct{}{}.
var ResClasspath = &SafeDoubleMap{M: make(map[string]map[string]struct{})}

// GetKeys returns all first-level keys.
func (s *SafeDoubleMap) GetKeys() []string {
	s.RLock()
	defer s.RUnlock()
	keys := make([]string, 0, len(s.M))
	for k := range s.M {
		keys = append(keys, k)
	}
	return keys
}

// GetValues returns the second-level keys under key, or an empty slice
// when the key is absent.
func (s *SafeDoubleMap) GetValues(key string) []string {
	s.RLock()
	defer s.RUnlock()
	inner, ok := s.M[key]
	if !ok {
		return []string{}
	}
	values := make([]string, 0, len(inner))
	for v := range inner {
		values = append(values, v)
	}
	return values
}

// Exists reports whether the (key, value) pair is present.
func (s *SafeDoubleMap) Exists(key string, value string) bool {
	s.RLock()
	defer s.RUnlock()
	inner, ok := s.M[key]
	if !ok {
		return false
	}
	_, ok = inner[value]
	return ok
}

// Set inserts the (key, value) pair, creating the inner set on demand.
func (s *SafeDoubleMap) Set(key string, value string) {
	s.Lock()
	defer s.Unlock()
	inner, ok := s.M[key]
	if !ok {
		inner = make(map[string]struct{})
		s.M[key] = inner
	}
	inner[value] = struct{}{}
}

// SetAll atomically replaces the whole structure.
func (s *SafeDoubleMap) SetAll(data map[string]map[string]struct{}) {
	s.Lock()
	defer s.Unlock()
	s.M = data
}

36
cache/res_tags.go vendored
View File

@ -1,36 +0,0 @@
package cache
import (
"sync"
"github.com/didi/nightingale/v5/models"
)
// resource_ident -> tags_map
// 监控数据上报的时候要把资源的tags附到指标数据上
type ResTagsMap struct {
sync.RWMutex
Data map[string]ResourceAndTags
}
type ResourceAndTags struct {
Tags map[string]string
Resource models.Resource
}
var ResTags = &ResTagsMap{Data: make(map[string]ResourceAndTags)}
func (r *ResTagsMap) SetAll(m map[string]ResourceAndTags) {
r.Lock()
defer r.Unlock()
r.Data = m
}
func (r *ResTagsMap) Get(key string) (ResourceAndTags, bool) {
r.RLock()
defer r.RUnlock()
value, exists := r.Data[key]
return value, exists
}

48
cache/user.go vendored
View File

@ -1,48 +0,0 @@
package cache
import (
"sync"
"github.com/didi/nightingale/v5/models"
)
// UserMap caches users by id behind an RWMutex.
type UserMap struct {
	sync.RWMutex
	Data map[int64]*models.User
}

// UserCache is the process-wide user cache.
var UserCache = &UserMap{Data: make(map[int64]*models.User)}

// GetBy returns the cached user with the given id, or nil.
// Kept as a backward-compatible alias of GetById (the two bodies were
// byte-identical duplicates).
func (s *UserMap) GetBy(id int64) *models.User {
	return s.GetById(id)
}

// GetByIds returns the cached users for the given ids, skipping misses.
func (s *UserMap) GetByIds(ids []int64) []*models.User {
	s.RLock()
	defer s.RUnlock()
	var users []*models.User
	for _, id := range ids {
		if s.Data[id] == nil {
			continue
		}
		users = append(users, s.Data[id])
	}
	return users
}

// GetById returns the cached user with the given id, or nil.
func (s *UserMap) GetById(id int64) *models.User {
	s.RLock()
	defer s.RUnlock()
	return s.Data[id]
}

// SetAll atomically replaces the whole cache content.
func (s *UserMap) SetAll(users map[int64]*models.User) {
	s.Lock()
	defer s.Unlock()
	s.Data = users
}

41
cache/user_group.go vendored
View File

@ -1,41 +0,0 @@
package cache
import (
"sync"
"github.com/didi/nightingale/v5/models"
)
// UserGroupMap caches user groups by id behind an RWMutex.
type UserGroupMap struct {
	sync.RWMutex
	Data map[int64]*models.UserGroup
}

// UserGroupCache is the process-wide user-group cache.
var UserGroupCache = &UserGroupMap{Data: make(map[int64]*models.UserGroup)}

// GetBy returns the cached group with the given id, or nil.
func (s *UserGroupMap) GetBy(id int64) *models.UserGroup {
	s.RLock()
	g := s.Data[id]
	s.RUnlock()
	return g
}

// GetByIds returns the cached groups for the given ids, skipping misses.
func (s *UserGroupMap) GetByIds(ids []int64) []*models.UserGroup {
	s.RLock()
	defer s.RUnlock()
	var userGroups []*models.UserGroup
	for _, id := range ids {
		if g := s.Data[id]; g != nil {
			userGroups = append(userGroups, g)
		}
	}
	return userGroups
}

// SetAll atomically replaces the whole cache content.
func (s *UserGroupMap) SetAll(userGroups map[int64]*models.UserGroup) {
	s.Lock()
	s.Data = userGroups
	s.Unlock()
}

View File

@ -1,38 +0,0 @@
package cache
import (
"sync"
)
// UserGroupMemberMap caches group membership: group id -> set of user ids.
type UserGroupMemberMap struct {
	sync.RWMutex
	Data map[int64]map[int64]struct{}
}

// UserGroupMember maps groupid -> userid set.
var UserGroupMember = &UserGroupMemberMap{Data: make(map[int64]map[int64]struct{})}

// Get returns the member set of one group and whether the group exists.
func (m *UserGroupMemberMap) Get(id int64) (map[int64]struct{}, bool) {
	m.RLock()
	members, ok := m.Data[id]
	m.RUnlock()
	return members, ok
}

// Exists reports whether uid is a member of gid.
func (m *UserGroupMemberMap) Exists(gid, uid int64) bool {
	m.RLock()
	defer m.RUnlock()
	members, ok := m.Data[gid]
	if !ok {
		return false
	}
	_, ok = members[uid]
	return ok
}

// SetAll atomically replaces the whole membership table.
func (m *UserGroupMemberMap) SetAll(data map[int64]map[int64]struct{}) {
	m.Lock()
	m.Data = data
	m.Unlock()
}

186
changelog
View File

@ -1,186 +0,0 @@
3.1.1
影响模块n9e-job
更新内容job模块之前给监控用的callback地址method误设置为了get是不对的改成了post
3.1.2
影响模块n9e-rdb
更新内容:子节点修改的时候,不允许修改为租户节点
3.1.3
影响模块n9e-monapi
更新内容对于P2、P3的告警会发送重复的两条
3.1.4
影响模块n9e-index n9e-judge n9e-monapi n9e-rdb n9e-transfer n9e-tsdb
更新内容把hbs的逻辑从monapi挪到rdb拆分监控的权限点
3.1.5
影响模块n9e-monapi
更新内容清理策略的时候会空指针node删除了策略还在此时会复现
3.1.6
影响模块n9e-ams etc/gop.yml
更新内容主机设备增加了扩展字段的管理用于维护一些位置信息、过保信息增加了新的sqlsql/n9e_ams_3.1.6.sql
3.2.0
影响模块n9e-agent etc/agent.yml
更新内容agent支持metrics指标采集能力这个版本是为商业版本服务的开源用户无需更新
3.3.0
影响模块n9e-rdb n9e-transfer n9e-judge n9e-ams n9e-monapi sql/n9e_rdb_3.3.0.sql etc/*.tpl
更新内容增强安全性密码复杂度、cookie处理优化等支持M3DB作为存储后端如果要尝试M3需要修改transfer、monapi配置文件修复告警引擎与条件串数的问题为主机设备增加自定义字段的能力
3.3.1
影响模块n9e-job n9e-rdb n9e-agent n9e-ams n9e-judge
更新内容修复job模块的一个调度bugrdb支持根据org搜索useragent在fields变化时及时感知fields和host扩展字段联动解决上个版本引入的judge处理nodata的问题
3.4.0
升级内容:
- 增强了安全性引入了session机制写入cookie的内容从user.uuid变更为随机session.id
- 修复部分sql注入漏洞
- 告警引擎函数优化all、c_avg_rate_abs等
- 告警消息内容优化可以展示设备名称和设备备注感谢冯骐的PR
- 增加了大盘导入导出功能
升级方法:
- 除了agent、tsdb、index的二进制不用升级其他所有模块的二进制都要升级
- job ams monapi rdb 四个模块的配置文件中的cookieName全部换成ecmc-sid
- rdb的配置文件发生了较大变化需要对照升级
- sql目录下有几个3.4.0的sql需要导入
3.4.1
升级内容:
- 修复日志监控采集策略配置了tag但是无法编辑的问题
升级方法:
- 更新monapi的二进制即可
3.5.0
升级内容:
- 引入了组件监控模块prober内置了mysql、redis、mongo监控采集能力
- 引入了内置监控大盘和内置告警策略,可以在任意节点一键导入内置大盘和策略
升级方法:
- n9e-monapi n9e-rdb n9e-transfer n9e-ams n9e-job 的二进制要升级
- n9e-agent也可以升级解决了进程监控的性能问题如果不在意可以不升级
- n9e-prober 模块需要新部署
- sql目录下有个3.5.0的sql patch文件需要导入
- etc目录下新增了screen、alert两个目录需要拷贝到生产环境
- etc目录下新增了plugins目录需要随着prober模块走
- etc/address.yml里增加prober的配置
3.5.1
升级内容:
- monapi里的alarmEnabled默认值设置为true
- agent进程采集忽略EOF日志
- agent增加一个接口获取endpoint
- agent日志监控支持一种新的日志时间格式
- 修复组件监控调整采集频率不生效的问题
升级方法:
- 替换n9e-monapi n9e-prober n9e-agent二进制升级pub下的前端资源文件
3.5.2
升级内容:
- prober模板支持匿名结构体结构体嵌套
- prober插件添加了对TLS的支持
- 修复prober上报没有port的问题
升级方法:
- 替换n9e-prober n9e-monapi二进制升级pub下的前端资源文件
3.6.0
升级内容:
- prober模块支持nginx、elasticsearch、prometheus的监控采集prometheus转换时姑且干掉了 Histogram 和 Summary
- 告警消息中节点挂载关系做了去重处理
升级方法:
- 替换n9e-prober n9e-monapi二进制
3.7.0
升级内容:
- 调整session清理频率
- 新增zookeeper、tengine、rabbitmq、haproxy、ping、telnet相关采集工具
- bugfix集群部署的时候多个redis实例judge只能识别最后一个实例的问题
升级方法:
- sql/n9e_rdb-v3.7.0.sql 有个新的表结构,需要导入一下
- 替换n9e-rdb n9e-prober n9e-judge n9e-monapi二进制前端没有升级
- 将etc/plugins里zookeeper.yml,tengine.yml等新增的yml文件复制到配置文件里
3.7.1
升级内容:
- prober采集增加dryrun测试方法可以测试是否真的能采集到数据
- 增加dns_query插件对dns做监控
- 内置大盘增加n9e内置模块大盘
- 如果存储使用m3支持在transfer配置一次查询每条线最多返回的原始点数
- 日志监控可以把最后一条日志放到extra字段报警的时候可以展示需要升级n9e-agent n9e-monapi
- 修复agent对进程监控采集的bug进程cpu使用采集的不准确
- 修改告警策略配置多个团队的时候不生效的问题
- monapi支持一个新的timestamp格式
升级方法:
- sql/n9e_mon-v3.7.1.sql变更了表结构需要执行一下
- 将etc/plugins里的dns_query.yml放到生产环境的etc/plugins目录下
- 将etc/screen/n9e_modules放到生产环境的etc/screen目录下
- 替换n9e-rdb n9e-prober n9e-monapi n9e-transfer n9e-agent二进制
3.8.0
升级内容:
- monapi优化告警策略中用户信息补全逻辑
- rdb新增接口,查询项目下用户拥有的资源权限点
- transfer查询索引接口支持指定时间范围
- prober去掉组件采集默认的白名单设置
升级方法:
- 替换n9e-rdb n9e-prober n9e-monapi n9e-transfer二进制
- 将etc/password-changed-email.tpl放到生产环境的etc目录下
4.0.0
升级内容:
- 服务端模块合并为一个模块
- agentd和server的调用全部走rpc
重新安装:见 https://n9e.didiyun.com/v4/docs/install/
升级方法:
- 使用新的etc替换掉原来的etc
- 使用etc/nginx.conf替换原来的nginx.conf
- n9e-prober替换旧的n9e-prober
- n9e-agentd替换n9e-agent
- n9e-server替换n9e-rdb、n9e-ams、n9e-job、n9e-monapi、n9e-transfer、n9e-judge
4.0.1
升级内容:
- 修复消息通知的问题
重新安装:见 https://n9e.didiyun.com/v4/docs/install/
升级方法:
- 将 *.tpl 文件放到 etc/tpl 下
- 替换etc/server.yml
- 替换n9e-server
4.0.2
升级内容:
- 优化告警接收人补全逻辑
- 增加pospostgresql监控插件
重新安装:见 https://n9e.didiyun.com/v4/docs/install/
升级方法:
- 替换n9e-server n9e-prober
4.0.3
升级内容:
- 修复nodata恢复告警重复问题
升级方法:
- 替换n9e-server
5.0.0-rc1
升级内容:
- 发布v5预览版
部署方式:
- 见文档 https://n9e.didiyun.com/docs/install/
5.0.0-rc2
升级内容:
- 修复若干问题
- 新增告警策略,监控大盘导入、导出和内置模板功能
- 新增概览页面
部署方式:
- 见文档 https://n9e.didiyun.com/docs/install/

View File

@ -1,176 +0,0 @@
package config
import (
"bytes"
"fmt"
"net"
"os"
"strings"
"github.com/spf13/viper"
"github.com/toolkits/pkg/file"
"github.com/didi/nightingale/v5/backend"
"github.com/didi/nightingale/v5/models"
"github.com/didi/nightingale/v5/pkg/i18n"
"github.com/didi/nightingale/v5/pkg/iconf"
"github.com/didi/nightingale/v5/pkg/ilog"
)
// ConfigStruct is the root of the server yml configuration; see Parse().
type ConfigStruct struct {
	Logger         ilog.Config         `yaml:"logger"`
	HTTP           httpSection         `yaml:"http"`
	RPC            rpcSection          `yaml:"rpc"`
	LDAP           models.LdapSection  `yaml:"ldap"`
	MySQL          models.MysqlSection `yaml:"mysql"`
	Heartbeat      heartbeatSection    `yaml:"heartbeat"`
	I18N           i18n.Config         `yaml:"i18n"`
	Judge          judgeSection        `yaml:"judge"`
	Alert          alertSection        `yaml:"alert"`
	Trans          transSection        `yaml:"trans"`
	ContactKeys    []contactKey        `yaml:"contactKeys"`
	NotifyChannels []string            `yaml:"notifyChannels"`
	Tpl            tplSection          `yaml:"tpl"`
}

// tplSection points at the built-in alert-rule / dashboard template dirs
// (defaults ./etc/alert_rule and ./etc/dashboard, see Parse).
type tplSection struct {
	AlertRulePath string `yaml:"alertRulePath"`
	DashboardPath string `yaml:"dashboardPath"`
}

// alertSection configures the notify script and muted-alert persistence.
type alertSection struct {
	NotifyScriptPath        string `yaml:"notifyScriptPath"`
	NotifyScriptConcurrency int    `yaml:"notifyScriptConcurrency"`
	MutedAlertPersist       bool   `yaml:"mutedAlertPersist"`
}

// transSection enables and configures the transfer backend.
type transSection struct {
	Enable  bool                   `yaml:"enable"`
	Backend backend.BackendSection `yaml:"backend"`
}

// judgeSection tunes judge batching and connection-pool limits
// (defaults are applied in Parse via viper.SetDefault).
type judgeSection struct {
	ReadBatch   int `yaml:"readBatch"`
	ConnTimeout int `yaml:"connTimeout"`
	CallTimeout int `yaml:"callTimeout"`
	WriterNum   int `yaml:"writerNum"`
	ConnMax     int `yaml:"connMax"`
	ConnIdle    int `yaml:"connIdle"`
}

// heartbeatSection controls how this server announces itself.
type heartbeatSection struct {
	IP        string `yaml:"ip"`
	LocalAddr string `yaml:"-"` // derived in Parse(): IP + RPC listen port
	Interval  int64  `yaml:"interval"`
}

// httpSection configures the HTTP listener and cookie/CSRF settings.
type httpSection struct {
	Mode           string `yaml:"mode"`
	Access         bool   `yaml:"access"`
	Listen         string `yaml:"listen"`
	Pprof          bool   `yaml:"pprof"`
	CookieName     string `yaml:"cookieName"`
	CookieDomain   string `yaml:"cookieDomain"`
	CookieSecure   bool   `yaml:"cookieSecure"`
	CookieHttpOnly bool   `yaml:"cookieHttpOnly"`
	CookieMaxAge   int    `yaml:"cookieMaxAge"`
	CookieSecret   string `yaml:"cookieSecret"`
	CsrfSecret     string `yaml:"csrfSecret"`
}

// rpcSection configures the RPC listener address.
type rpcSection struct {
	Listen string `yaml:"listen"`
}

// contactKey is one configurable contact channel (display label + key).
type contactKey struct {
	Label string `yaml:"label" json:"label"`
	Key   string `yaml:"key" json:"key"`
}
// Config holds the parsed server configuration; populated by Parse().
var Config *ConfigStruct

// Parse locates the server yml config file, loads it through viper,
// applies default values, unmarshals it into the package-level Config,
// and resolves the heartbeat IP/address. Returns an error when the file
// is missing or unreadable; exits the process when no heartbeat IP can
// be determined.
func Parse() error {
	ymlFile := iconf.GetYmlFile("server")
	if ymlFile == "" {
		return fmt.Errorf("configuration file of server not found")
	}
	bs, err := file.ReadBytes(ymlFile)
	if err != nil {
		return fmt.Errorf("cannot read yml[%s]: %v", ymlFile, err)
	}
	viper.SetConfigType("yaml")
	err = viper.ReadConfig(bytes.NewBuffer(bs))
	if err != nil {
		return fmt.Errorf("cannot read yml[%s]: %v", ymlFile, err)
	}
	// default value settings
	viper.SetDefault("i18n.lang", "zh")
	viper.SetDefault("heartbeat.interval", 1000)
	viper.SetDefault("judge.readBatch", 2000)
	viper.SetDefault("judge.connTimeout", 2000)
	viper.SetDefault("judge.callTimeout", 5000)
	viper.SetDefault("judge.writerNum", 256)
	viper.SetDefault("judge.connMax", 2560)
	viper.SetDefault("judge.connIdle", 256)
	viper.SetDefault("alert.notifyScriptPath", "./etc/script/notify.py")
	viper.SetDefault("alert.notifyScriptConcurrency", 200)
	viper.SetDefault("alert.mutedAlertPersist", true)
	viper.SetDefault("trans.backend.prometheus.lookbackDeltaMinute", 2)
	viper.SetDefault("trans.backend.prometheus.maxConcurrentQuery", 30)
	viper.SetDefault("trans.backend.prometheus.maxSamples", 50000000)
	viper.SetDefault("trans.backend.prometheus.maxFetchAllSeriesLimitMinute", 5)
	viper.SetDefault("trans.backend.prometheus.slowLogRecordSecond", 3)
	viper.SetDefault("trans.backend.prometheus.defaultFetchSeriesQl", `{__name__=~"system.*"}`)
	viper.SetDefault("tpl.alertRulePath", "./etc/alert_rule")
	viper.SetDefault("tpl.dashboardPath", "./etc/dashboard")
	err = viper.Unmarshal(&Config)
	if err != nil {
		return fmt.Errorf("cannot read yml[%s]: %v", ymlFile, err)
	}
	fmt.Println("config.file:", ymlFile)
	if Config.Heartbeat.IP == "" {
		// auto detect
		Config.Heartbeat.IP = fmt.Sprint(GetOutboundIP())
		if Config.Heartbeat.IP == "" {
			fmt.Println("heartbeat ip auto got is blank")
			os.Exit(1)
		}
	}
	// heartbeat.ip may be set explicitly in the config file for hosts with
	// no outbound network, where auto-detection fails with
	// "auto get outbound ip fail: dial udp 8.8.8.8:80: connect: network is unreachable";
	// LocalAddr still has to be derived here from the IP and the RPC port.
	port := strings.Split(Config.RPC.Listen, ":")[1]
	Config.Heartbeat.LocalAddr = Config.Heartbeat.IP + ":" + port
	// Normally the IP is not 127.0.0.1, but it can be for a single offline
	// machine (e.g. local debugging without network), so the hard exit
	// below is intentionally left disabled.
	// if Config.Heartbeat.IP == "127.0.0.1" {
	// fmt.Println("heartbeat ip is 127.0.0.1 and it is useless, so, exit")
	// os.Exit(1)
	// }
	fmt.Println("heartbeat.ip:", Config.Heartbeat.IP)
	fmt.Printf("heartbeat.interval: %dms\n", Config.Heartbeat.Interval)
	return nil
}
// GetOutboundIP returns the preferred outbound IP of this machine by
// "dialing" a public UDP address (no packet is actually sent for UDP)
// and reading back the local address the OS chose. It exits the process
// when no route is available.
func GetOutboundIP() net.IP {
	conn, err := net.Dial("udp", "8.8.8.8:80")
	if err != nil {
		fmt.Println("auto get outbound ip fail:", err)
		os.Exit(1)
	}
	defer conn.Close()
	return conn.LocalAddr().(*net.UDPAddr).IP
}

View File

@ -1,6 +0,0 @@
package config
// EndpointName is the name the server registers for itself when it
// periodically heartbeats into the database.
const EndpointName = "server_rpc"

// Version is the build version; overridden at build time via
// -ldflags "-X .../config.Version=...". The default marks unversioned builds.
var Version = "not specified"

View File

@ -1,71 +0,0 @@
package config
import "github.com/didi/nightingale/v5/pkg/i18n"
var (
dict = map[string]string{
"Login fail, check your username and password": "登录失败,请检查您的用户名和密码",
"Internal server error, try again later please": "系统内部错误,请稍后再试",
"Each user has at most two tokens": "每个用户至多创建两个密钥",
"No such token": "密钥不存在",
"Username is blank": "用户名不能为空",
"Username has invalid characters": "用户名含有非法字符",
"Nickname has invalid characters": "用户昵称含有非法字符",
"Phone invalid": "手机号格式有误",
"Email invalid": "邮箱格式有误",
"Incorrect old password": "旧密码错误",
"Username %s already exists": "用户名(%s)已存在",
"No such user": "用户不存在",
"UserGroup %s already exists": "用户组(%s)已存在",
"Group name has invalid characters": "分组名称含有非法字符",
"Group note has invalid characters": "分组备注含有非法字符",
"No such user group": "用户组不存在",
"Classpath path has invalid characters": "机器分组路径含有非法字符",
"Classpath note has invalid characters": "机器分组路径备注含有非法字符",
"There are still resources under the classpath": "机器分组路径下仍然挂有资源",
"There are still collect rules under the classpath": "机器分组路径下仍然存在采集策略",
"No such classpath": "机器分组路径不存在",
"Classpath %s already exists": "机器分组路径(%s)已存在",
"Preset classpath %s cannot delete": "内置机器分组(%s)不允许删除",
"No such mute config": "此屏蔽配置不存在",
"DashboardGroup name has invalid characters": "大盘分组名称含有非法字符",
"DashboardGroup name is blank": "大盘分组名称为空",
"DashboardGroup %s already exists": "大盘分组(%s)已存在",
"No such dashboard group": "大盘分组不存在",
"Dashboard name has invalid characters": "大盘名称含有非法字符",
"Dashboard %s already exists": "监控大盘(%s)已存在",
"ChartGroup name has invalid characters": "图表分组名称含有非法字符",
"No such dashboard": "监控大盘不存在",
"No such chart group": "图表分组不存在",
"No such chart": "图表不存在",
"There are still dashboards under the group": "分组下面仍然存在监控大盘,请先从组内移出",
"AlertRuleGroup name has invalid characters": "告警规则分组含有非法字符",
"AlertRuleGroup %s already exists": "告警规则分组(%s)已存在",
"There are still alert rules under the group": "分组下面仍然存在告警规则",
"AlertRule name has invalid characters": "告警规则含有非法字符",
"No such alert rule": "告警规则不存在",
"No such alert rule group": "告警规则分组不存在",
"No such alert event": "告警事件不存在",
"Alert rule %s already exists": "告警规则(%s)已存在",
"No such collect rule": "采集规则不存在",
"Decoded metric description empty": "导入的指标释义列表为空",
"User disabled": "用户已被禁用",
"Tags(%s) invalid": "标签(%s)格式不合法",
"Resource filter(Func:%s)'s param invalid": "资源过滤条件(函数:%s)参数不合法(为空或包含空格都不合法)",
"Tags filter(Func:%s)'s param invalid": "标签过滤条件(函数:%s)参数不合法(为空或包含空格都不合法)",
"Regexp: %s cannot be compiled": "正则表达式(%s)不合法,无法编译",
"AppendTags(%s) invalid": "附件标签(%s)格式不合法",
"Regexp %s matching failed": "正则表达式 %s 匹配失败",
"Regexp %s matched, but cannot get substring()": "主正则 %s 匹配成功,但无法匹配到子串",
"TagKey or TagValue contains illegal characters[:,/=\r\n\t]": "标签KEY或者标签值包含非法字符串[:,/=\r\n\t]",
"Resource cannot delete in preset classpath": "预置分组不能删除资源",
"No such resource %s": "不存在该资源(%s)",
}
langDict = map[string]map[string]string{
"zh": dict,
}
)
// init registers the zh translation table with the i18n package at load time.
func init() {
	i18n.DictRegister(langDict)
}

7
docker/.dockerignore Normal file
View File

@ -0,0 +1,7 @@
ibexetc
initsql
mysqletc
n9eetc
prometc
build.sh
docker-compose.yaml

10
docker/Dockerfile Normal file
View File

@ -0,0 +1,10 @@
# Minimal runtime image for the n9e binary.
FROM ubuntu:21.04
WORKDIR /app
# The n9e binary is built outside the image and copied in by docker/build.sh.
ADD n9e /app
RUN chmod +x n9e
# 19000: nserver, 18000: webapi (see the port mappings in docker-compose.yaml).
EXPOSE 19000
EXPOSE 18000
# Default to printing help; docker-compose overrides the command per service.
CMD ["/app/n9e", "-h"]

15
docker/build.sh Executable file
View File

@ -0,0 +1,15 @@
#!/bin/sh
# Build and publish the nightingale Docker image for a given version tag.
# Usage: ./build.sh <tag>
if [ $# -ne 1 ]; then
	echo "$0 <tag>"
	# Fix: wrong usage is an error; the original exited 0 here.
	exit 1
fi
tag="$1"
echo "tag: ${tag}"
# Copy the freshly built binary next to the Dockerfile, then build.
# Fix: abort if the build fails instead of tagging/pushing a stale image,
# and quote ${tag} so an accidental space cannot split the arguments.
rm -rf n9e && cp ../n9e . && docker build -t "nightingale:${tag}" . || exit 1
docker tag "nightingale:${tag}" "ulric2019/nightingale:${tag}" || exit 1
docker push "ulric2019/nightingale:${tag}"

175
docker/docker-compose.yaml Normal file
View File

@ -0,0 +1,175 @@
version: "3.7"
networks:
nightingale:
driver: bridge
services:
mysql:
image: "mysql:5.7"
container_name: mysql
hostname: mysql
restart: always
ports:
- "3306:3306"
environment:
TZ: Asia/Shanghai
MYSQL_ROOT_PASSWORD: 1234
volumes:
- ./mysqldata:/var/lib/mysql/
- ./initsql:/docker-entrypoint-initdb.d/
- ./mysqletc/my.cnf:/etc/my.cnf
networks:
- nightingale
redis:
image: "redis:6.2"
container_name: redis
hostname: redis
restart: always
ports:
- "6379:6379"
environment:
TZ: Asia/Shanghai
networks:
- nightingale
prometheus:
image: prom/prometheus
container_name: prometheus
hostname: prometheus
restart: always
environment:
TZ: Asia/Shanghai
volumes:
- ./prometc:/etc/prometheus
ports:
- "9090:9090"
networks:
- nightingale
command:
- "--config.file=/etc/prometheus/prometheus.yml"
- "--storage.tsdb.path=/prometheus"
- "--web.console.libraries=/usr/share/prometheus/console_libraries"
- "--web.console.templates=/usr/share/prometheus/consoles"
- "--enable-feature=remote-write-receiver"
- "--query.lookback-delta=2m"
ibex:
image: ulric2019/ibex:0.2
container_name: ibex
hostname: ibex
restart: always
environment:
GIN_MODE: release
TZ: Asia/Shanghai
ports:
- "10090:10090"
- "20090:20090"
volumes:
- ./ibexetc:/app/etc
networks:
- nightingale
depends_on:
- mysql
links:
- mysql:mysql
command:
- "/app/ibex"
- "server"
nwebapi:
image: ulric2019/nightingale:0.4
container_name: nwebapi
hostname: nwebapi
restart: always
environment:
GIN_MODE: release
TZ: Asia/Shanghai
volumes:
- ./n9eetc:/app/etc
ports:
- "18000:18000"
networks:
- nightingale
depends_on:
- mysql
- redis
- prometheus
- ibex
links:
- mysql:mysql
- redis:redis
- prometheus:prometheus
- ibex:ibex
command:
- "/app/n9e"
- "webapi"
nserver:
image: ulric2019/nightingale:0.4
container_name: nserver
hostname: nserver
restart: always
environment:
GIN_MODE: release
TZ: Asia/Shanghai
volumes:
- ./n9eetc:/app/etc
ports:
- "19000:19000"
networks:
- nightingale
depends_on:
- mysql
- redis
- prometheus
- ibex
links:
- mysql:mysql
- redis:redis
- prometheus:prometheus
- ibex:ibex
command:
- "/app/n9e"
- "server"
telegraf:
image: "telegraf:1.20.3"
container_name: "telegraf"
hostname: "telegraf01"
restart: always
environment:
TZ: Asia/Shanghai
volumes:
- ./telegrafetc/telegraf.conf:/etc/telegraf/telegraf.conf
ports:
- "8125:8125/udp"
- "8092:8092/udp"
- "8094:8094/tcp"
networks:
- nightingale
depends_on:
- nserver
links:
- nserver:nserver
agentd:
image: ulric2019/ibex:0.2
container_name: agentd
hostname: agentd
restart: always
environment:
GIN_MODE: release
TZ: Asia/Shanghai
volumes:
- ./ibexetc:/app/etc
networks:
- nightingale
depends_on:
- ibex
links:
- ibex:ibex
command:
- "/app/ibex"
- "agentd"

View File

@ -0,0 +1,38 @@
# debug, release
RunMode = "release"
# task meta storage dir
MetaDir = "./meta"
[HTTP]
Enable = true
# http listening address
Host = "0.0.0.0"
# http listening port
Port = 2090
# https cert file path
CertFile = ""
# https key file path
KeyFile = ""
# whether print access log
PrintAccessLog = true
# whether enable pprof
PProf = false
# http graceful shutdown timeout, unit: s
ShutdownTimeout = 30
# max content length: 64M
MaxContentLength = 67108864
# http server read timeout, unit: s
ReadTimeout = 20
# http server write timeout, unit: s
WriteTimeout = 40
# http server idle timeout, unit: s
IdleTimeout = 120
[Heartbeat]
# unit: ms
Interval = 1000
# rpc servers
Servers = ["ibex:20090"]
# $ip or $hostname or specified string
Host = "telegraf01"

View File

@ -0,0 +1,97 @@
# debug, release
RunMode = "release"
[Log]
# log write dir
Dir = "logs-server"
# log level: DEBUG INFO WARNING ERROR
Level = "DEBUG"
# stdout, stderr, file
Output = "stdout"
# # rotate by time
# KeepHours: 4
# # rotate by size
# RotateNum = 3
# # unit: MB
# RotateSize = 256
[HTTP]
Enable = true
# http listening address
Host = "0.0.0.0"
# http listening port
Port = 10090
# https cert file path
CertFile = ""
# https key file path
KeyFile = ""
# whether print access log
PrintAccessLog = true
# whether enable pprof
PProf = false
# http graceful shutdown timeout, unit: s
ShutdownTimeout = 30
# max content length: 64M
MaxContentLength = 67108864
# http server read timeout, unit: s
ReadTimeout = 20
# http server write timeout, unit: s
WriteTimeout = 40
# http server idle timeout, unit: s
IdleTimeout = 120
[BasicAuth]
# using when call apis
ibex = "ibex"
[RPC]
Listen = "0.0.0.0:20090"
[Heartbeat]
# auto detect if blank
IP = ""
# unit: ms
Interval = 1000
[Output]
# database | remote
ComeFrom = "database"
AgtdPort = 2090
[Gorm]
# enable debug mode or not
Debug = false
# mysql postgres
DBType = "mysql"
# unit: s
MaxLifetime = 7200
# max open connections
MaxOpenConns = 150
# max idle connections
MaxIdleConns = 50
# table prefix
TablePrefix = ""
[MySQL]
# mysql address host:port
Address = "mysql:3306"
# mysql username
User = "root"
# mysql password
Password = "1234"
# database name
DBName = "ibex"
# connection params
Parameters = "charset=utf8mb4&parseTime=True&loc=Local&allowNativePasswords=true"
[Postgres]
# pg address host:port
Address = "postgres:5432"
# pg user
User = "root"
# pg password
Password = "1234"
# database name
DBName = "ibex"
# ssl mode
SSLMode = "disable"

1362
docker/initsql/ibex.sql Normal file

File diff suppressed because it is too large Load Diff

3
docker/initsql/init.sql Normal file
View File

@ -0,0 +1,3 @@
GRANT ALL ON *.* TO 'root'@'127.0.0.1' IDENTIFIED BY '1234';
GRANT ALL ON *.* TO 'root'@'localhost' IDENTIFIED BY '1234';
GRANT ALL ON *.* TO 'root'@'%' IDENTIFIED BY '1234';

372
docker/initsql/n9e.sql Normal file
View File

@ -0,0 +1,372 @@
set names utf8mb4;
drop database if exists n9e_v5;
create database n9e_v5;
use n9e_v5;
CREATE TABLE `user` (
`id` bigint unsigned not null auto_increment,
`username` varchar(64) not null comment 'login name, cannot rename',
`nickname` varchar(64) not null comment 'display name, chinese name',
`password` varchar(128) not null default '',
`phone` varchar(16) not null default '',
`email` varchar(64) not null default '',
`portrait` varchar(255) not null default '' comment 'portrait image url',
`roles` varchar(255) not null comment 'Admin | Standard | Guest, split by space',
`contacts` varchar(1024) comment 'json e.g. {wecom:xx, dingtalk_robot_token:yy}',
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,
`update_by` varchar(64) not null default '',
PRIMARY KEY (`id`),
UNIQUE KEY (`username`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
insert into `user`(id, username, nickname, password, roles, create_at, create_by, update_at, update_by) values(1, 'root', '超管', 'root.2020', 'Admin', unix_timestamp(now()), 'system', unix_timestamp(now()), 'system');
CREATE TABLE `user_group` (
`id` bigint unsigned not null auto_increment,
`name` varchar(128) not null default '',
`note` varchar(255) not null default '',
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,
`update_by` varchar(64) not null default '',
PRIMARY KEY (`id`),
KEY (`create_by`),
KEY (`update_at`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
insert into user_group(id, name, create_at, create_by, update_at, update_by) values(1, 'demo-root-group', unix_timestamp(now()), 'root', unix_timestamp(now()), 'root');
CREATE TABLE `user_group_member` (
`group_id` bigint unsigned not null,
`user_id` bigint unsigned not null,
KEY (`group_id`),
KEY (`user_id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
insert into user_group_member(group_id, user_id) values(1, 1);
CREATE TABLE `configs` (
`id` bigint unsigned not null auto_increment,
`ckey` varchar(191) not null,
`cval` varchar(1024) not null default '',
PRIMARY KEY (`id`),
UNIQUE KEY (`ckey`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
CREATE TABLE `role` (
`id` bigint unsigned not null auto_increment,
`name` varchar(191) not null default '',
`note` varchar(255) not null default '',
PRIMARY KEY (`id`),
UNIQUE KEY (`name`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
insert into `role`(name, note) values('Admin', 'Administrator role');
insert into `role`(name, note) values('Standard', 'Ordinary user role');
insert into `role`(name, note) values('Guest', 'Readonly user role');
CREATE TABLE `role_operation`(
`role_name` varchar(128) not null,
`operation` varchar(191) not null,
KEY (`role_name`),
KEY (`operation`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- Admin is special, who has no concrete operation but can do anything.
insert into `role_operation`(role_name, operation) values('Guest', 'menu_prom_dash');
insert into `role_operation`(role_name, operation) values('Guest', 'menu_target_dash');
insert into `role_operation`(role_name, operation) values('Guest', 'menu_dashboard');
insert into `role_operation`(role_name, operation) values('Standard', 'menu_prom_dash');
insert into `role_operation`(role_name, operation) values('Standard', 'menu_target_dash');
insert into `role_operation`(role_name, operation) values('Standard', 'menu_dashboard');
insert into `role_operation`(role_name, operation) values('Standard', 'menu_user');
insert into `role_operation`(role_name, operation) values('Standard', 'menu_user_group');
insert into `role_operation`(role_name, operation) values('Standard', 'menu_busi_group');
insert into `role_operation`(role_name, operation) values('Standard', 'menu_target');
insert into `role_operation`(role_name, operation) values('Standard', 'menu_alert_rule');
insert into `role_operation`(role_name, operation) values('Standard', 'menu_alert_mute');
insert into `role_operation`(role_name, operation) values('Standard', 'menu_alert_subscribe');
insert into `role_operation`(role_name, operation) values('Standard', 'menu_alert_cur_event');
insert into `role_operation`(role_name, operation) values('Standard', 'menu_alert_his_event');
-- for alert_rule | collect_rule | mute | dashboard grouping
CREATE TABLE `busi_group` (
`id` bigint unsigned not null auto_increment,
`name` varchar(191) not null,
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,
`update_by` varchar(64) not null default '',
PRIMARY KEY (`id`),
UNIQUE KEY (`name`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
insert into busi_group(id, name, create_at, create_by, update_at, update_by) values(1, 'Default Business Group', unix_timestamp(now()), 'root', unix_timestamp(now()), 'root');
CREATE TABLE `busi_group_member` (
`id` bigint unsigned not null auto_increment,
`busi_group_id` bigint not null comment 'busi group id',
`user_group_id` bigint not null comment 'user group id',
`perm_flag` char(2) not null comment 'ro | rw',
PRIMARY KEY (`id`),
KEY (`busi_group_id`),
KEY (`user_group_id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
insert into busi_group_member(busi_group_id, user_group_id, perm_flag) values(1, 1, "rw");
CREATE TABLE `dashboard` (
`id` bigint unsigned not null auto_increment,
`group_id` bigint not null default 0 comment 'busi group id',
`name` varchar(191) not null,
`tags` varchar(255) not null comment 'split by space',
`configs` varchar(4096) comment 'dashboard variables',
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,
`update_by` varchar(64) not null default '',
PRIMARY KEY (`id`),
UNIQUE KEY (`group_id`, `name`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- auto create the first subclass 'Default chart group' of dashboard
CREATE TABLE `chart_group` (
`id` bigint unsigned not null auto_increment,
`dashboard_id` bigint unsigned not null,
`name` varchar(255) not null,
`weight` int not null default 0,
PRIMARY KEY (`id`),
KEY (`dashboard_id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
CREATE TABLE `chart` (
`id` bigint unsigned not null auto_increment,
`group_id` bigint unsigned not null comment 'chart group id',
`configs` varchar(8192),
`weight` int not null default 0,
PRIMARY KEY (`id`),
KEY (`group_id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
CREATE TABLE `chart_share` (
`id` bigint unsigned not null auto_increment,
`cluster` varchar(128) not null,
`configs` varchar(8192),
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
primary key (`id`),
key (`create_at`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
CREATE TABLE `alert_rule` (
`id` bigint unsigned not null auto_increment,
`group_id` bigint not null default 0 comment 'busi group id',
`cluster` varchar(128) not null,
`name` varchar(255) not null,
`note` varchar(255) not null,
`severity` tinyint(1) not null comment '0:Emergency 1:Warning 2:Notice',
`disabled` tinyint(1) not null comment '0:enabled 1:disabled',
`prom_for_duration` int not null comment 'prometheus for, unit:s',
`prom_ql` varchar(4096) not null comment 'promql',
`prom_eval_interval` int not null comment 'evaluate interval',
`enable_stime` char(5) not null default '00:00',
`enable_etime` char(5) not null default '23:59',
`enable_days_of_week` varchar(32) not null default '' comment 'split by space: 0 1 2 3 4 5 6',
`notify_recovered` tinyint(1) not null comment 'whether notify when recovery',
`notify_channels` varchar(255) not null default '' comment 'split by space: sms voice email dingtalk wecom',
`notify_groups` varchar(255) not null default '' comment 'split by space: 233 43',
`notify_repeat_step` int not null default 0 comment 'unit: min',
`callbacks` varchar(255) not null default '' comment 'split by space: http://a.com/api/x http://a.com/api/y',
`runbook_url` varchar(255),
`append_tags` varchar(255) not null default '' comment 'split by space: service=n9e mod=api',
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,
`update_by` varchar(64) not null default '',
PRIMARY KEY (`id`),
KEY (`group_id`),
KEY (`update_at`)
) ENGINE=InnoDB DEFAULT CHARSET = utf8mb4;
CREATE TABLE `alert_mute` (
`id` bigint unsigned not null auto_increment,
`group_id` bigint not null default 0 comment 'busi group id',
`cluster` varchar(128) not null,
`tags` varchar(2048) not null default '' comment 'json,map,tagkey->regexp|value',
`cause` varchar(255) not null default '',
`btime` bigint not null default 0 comment 'begin time',
`etime` bigint not null default 0 comment 'end time',
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
PRIMARY KEY (`id`),
KEY (`create_at`),
KEY (`group_id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
CREATE TABLE `alert_subscribe` (
`id` bigint unsigned not null auto_increment,
`group_id` bigint not null default 0 comment 'busi group id',
`cluster` varchar(128) not null,
`rule_id` bigint not null default 0,
`tags` varchar(2048) not null default '' comment 'json,map,tagkey->regexp|value',
`redefine_severity` tinyint(1) default 0 comment 'is redefine severity?',
`new_severity` tinyint(1) not null comment '0:Emergency 1:Warning 2:Notice',
`redefine_channels` tinyint(1) default 0 comment 'is redefine channels?',
`new_channels` varchar(255) not null default '' comment 'split by space: sms voice email dingtalk wecom',
`user_group_ids` varchar(250) not null comment 'split by space 1 34 5, notify cc to user_group_ids',
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,
`update_by` varchar(64) not null default '',
PRIMARY KEY (`id`),
KEY (`update_at`),
KEY (`group_id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
CREATE TABLE `target` (
`id` bigint unsigned not null auto_increment,
`group_id` bigint not null default 0 comment 'busi group id',
`cluster` varchar(128) not null comment 'append to alert event as field',
`ident` varchar(191) not null comment 'target id',
`note` varchar(255) not null default '' comment 'append to alert event as field',
`tags` varchar(512) not null default '' comment 'append to series data as tags, split by space, append external space at suffix',
`update_at` bigint not null default 0,
PRIMARY KEY (`id`),
UNIQUE KEY (`ident`),
KEY (`group_id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
-- case1: target_idents; case2: target_tags
-- CREATE TABLE `collect_rule` (
-- `id` bigint unsigned not null auto_increment,
-- `group_id` bigint not null default 0 comment 'busi group id',
-- `cluster` varchar(128) not null,
-- `target_idents` varchar(512) not null default '' comment 'ident list, split by space',
-- `target_tags` varchar(512) not null default '' comment 'filter targets by tags, split by space',
-- `name` varchar(191) not null default '',
-- `note` varchar(255) not null default '',
-- `step` int not null,
-- `type` varchar(64) not null comment 'e.g. port proc log plugin',
-- `data` text not null,
-- `append_tags` varchar(255) not null default '' comment 'split by space: e.g. mod=n9e dept=cloud',
-- `create_at` bigint not null default 0,
-- `create_by` varchar(64) not null default '',
-- `update_at` bigint not null default 0,
-- `update_by` varchar(64) not null default '',
-- PRIMARY KEY (`id`),
-- KEY (`group_id`, `type`, `name`)
-- ) ENGINE=InnoDB DEFAULT CHARSET = utf8mb4;
CREATE TABLE `alert_cur_event` (
`id` bigint unsigned not null comment 'use alert_his_event.id',
`cluster` varchar(128) not null,
`group_id` bigint unsigned not null comment 'busi group id of rule',
`hash` varchar(64) not null comment 'rule_id + vector_pk',
`rule_id` bigint unsigned not null,
`rule_name` varchar(255) not null,
`rule_note` varchar(512) not null default 'alert rule note',
`severity` tinyint(1) not null comment '0:Emergency 1:Warning 2:Notice',
`prom_for_duration` int not null comment 'prometheus for, unit:s',
`prom_ql` varchar(4096) not null comment 'promql',
`prom_eval_interval` int not null comment 'evaluate interval',
`callbacks` varchar(255) not null default '' comment 'split by space: http://a.com/api/x http://a.com/api/y',
`runbook_url` varchar(255),
`notify_recovered` tinyint(1) not null comment 'whether notify when recovery',
`notify_channels` varchar(255) not null default '' comment 'split by space: sms voice email dingtalk wecom',
`notify_groups` varchar(255) not null default '' comment 'split by space: 233 43',
`notify_repeat_next` bigint not null default 0 comment 'next timestamp to notify, get repeat settings from rule',
`target_ident` varchar(191) not null default '' comment 'target ident, also in tags',
`target_note` varchar(191) not null default '' comment 'target note',
`trigger_time` bigint not null,
`trigger_value` varchar(255) not null,
`tags` varchar(1024) not null default '' comment 'merge data_tags rule_tags, split by ,,',
PRIMARY KEY (`id`),
KEY (`hash`),
KEY (`rule_id`),
KEY (`trigger_time`, `group_id`),
KEY (`notify_repeat_next`)
) ENGINE=InnoDB DEFAULT CHARSET = utf8mb4;
CREATE TABLE `alert_his_event` (
`id` bigint unsigned not null AUTO_INCREMENT,
`is_recovered` tinyint(1) not null,
`cluster` varchar(128) not null,
`group_id` bigint unsigned not null comment 'busi group id of rule',
`hash` varchar(64) not null comment 'rule_id + vector_pk',
`rule_id` bigint unsigned not null,
`rule_name` varchar(255) not null,
`rule_note` varchar(512) not null default 'alert rule note',
`severity` tinyint(1) not null comment '0:Emergency 1:Warning 2:Notice',
`prom_for_duration` int not null comment 'prometheus for, unit:s',
`prom_ql` varchar(4096) not null comment 'promql',
`prom_eval_interval` int not null comment 'evaluate interval',
`callbacks` varchar(255) not null default '' comment 'split by space: http://a.com/api/x http://a.com/api/y',
`runbook_url` varchar(255),
`notify_recovered` tinyint(1) not null comment 'whether notify when recovery',
`notify_channels` varchar(255) not null default '' comment 'split by space: sms voice email dingtalk wecom',
`notify_groups` varchar(255) not null default '' comment 'split by space: 233 43',
`target_ident` varchar(191) not null default '' comment 'target ident, also in tags',
`target_note` varchar(191) not null default '' comment 'target note',
`trigger_time` bigint not null,
`trigger_value` varchar(255) not null,
`tags` varchar(1024) not null default '' comment 'merge data_tags rule_tags, split by ,,',
PRIMARY KEY (`id`),
KEY (`hash`),
KEY (`rule_id`),
KEY (`trigger_time`, `group_id`)
) ENGINE=InnoDB DEFAULT CHARSET = utf8mb4;
CREATE TABLE `task_tpl`
(
`id` int unsigned NOT NULL AUTO_INCREMENT,
`group_id` int unsigned not null comment 'busi group id',
`title` varchar(255) not null default '',
`account` varchar(64) not null,
`batch` int unsigned not null default 0,
`tolerance` int unsigned not null default 0,
`timeout` int unsigned not null default 0,
`pause` varchar(255) not null default '',
`script` text not null,
`args` varchar(512) not null default '',
`tags` varchar(255) not null default '' comment 'split by space',
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
`update_at` bigint not null default 0,
`update_by` varchar(64) not null default '',
PRIMARY KEY (`id`),
KEY (`group_id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
CREATE TABLE `task_tpl_host`
(
`ii` int unsigned NOT NULL AUTO_INCREMENT,
`id` int unsigned not null comment 'task tpl id',
`host` varchar(128) not null comment 'ip or hostname',
PRIMARY KEY (`ii`),
KEY (`id`, `host`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;
CREATE TABLE `task_record`
(
`id` bigint unsigned not null comment 'ibex task id',
`group_id` bigint not null comment 'busi group id',
`ibex_address` varchar(128) not null,
`ibex_auth_user` varchar(128) not null default '',
`ibex_auth_pass` varchar(128) not null default '',
`title` varchar(255) not null default '',
`account` varchar(64) not null,
`batch` int unsigned not null default 0,
`tolerance` int unsigned not null default 0,
`timeout` int unsigned not null default 0,
`pause` varchar(255) not null default '',
`script` text not null,
`args` varchar(512) not null default '',
`create_at` bigint not null default 0,
`create_by` varchar(64) not null default '',
PRIMARY KEY (`id`),
KEY (`create_at`, `group_id`),
KEY (`create_by`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8mb4;

5
docker/mysqletc/my.cnf Normal file
View File

@ -0,0 +1,5 @@
[mysqld]
pid-file = /var/run/mysqld/mysqld.pid
socket = /var/run/mysqld/mysqld.sock
datadir = /var/lib/mysql
bind-address = 0.0.0.0

BIN
docker/n9e Executable file

Binary file not shown.

View File

@ -0,0 +1 @@
cpu_usage_idle: CPU空闲率单位%

162
docker/n9eetc/script/notify.py Executable file
View File

@ -0,0 +1,162 @@
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import sys
import json
import urllib2
import smtplib
from email.mime.text import MIMEText
notify_channel_funcs = {
"email":"email",
"sms":"sms",
"voice":"voice",
"dingtalk":"dingtalk",
"wecom":"wecom"
}
mail_host = "smtp.163.com"
mail_port = 994
mail_user = "ulricqin"
mail_pass = "password"
mail_from = "ulricqin@163.com"
class Sender(object):
    """Dispatch alert notifications to the configured channels.

    Each ``send_<channel>`` classmethod receives the full payload decoded
    from stdin. ``payload['event']['notify_users_obj']`` carries the user
    records (with ``email``, ``phone`` and a ``contacts`` dict), and
    ``payload['tpls']`` carries the pre-rendered message bodies keyed by
    template file name.

    NOTE(review): this script uses Python 2-only syntax (``except X, err``)
    and ``urllib2``; it must be executed with a Python 2 interpreter.
    """

    @classmethod
    def send_email(cls, payload):
        # Collect distinct recipient addresses (a dict is used as a set).
        users = payload.get('event').get("notify_users_obj")
        emails = {}
        for u in users:
            if u.get("email"):
                emails[u.get("email")] = 1
        if not emails:
            return
        recipients = emails.keys()
        # Body and subject come pre-rendered from the server-side templates.
        mail_body = payload.get('tpls').get("mailbody.tpl", "mailbody.tpl not found")
        message = MIMEText(mail_body, 'html', 'utf-8')
        message['From'] = mail_from
        message['To'] = ", ".join(recipients)
        message["Subject"] = payload.get('tpls').get("subject.tpl", "subject.tpl not found")
        try:
            # Port 994 is implicit-TLS SMTP, hence SMTP_SSL rather than SMTP.
            smtp = smtplib.SMTP_SSL(mail_host, mail_port)
            smtp.login(mail_user, mail_pass)
            smtp.sendmail(mail_from, recipients, message.as_string())
            smtp.close()
        except smtplib.SMTPException, error:
            # Best effort: report and let other channels proceed.
            print(error)

    @classmethod
    def send_wecom(cls, payload):
        # Deduplicate WeCom robot tokens across all notified users.
        users = payload.get('event').get("notify_users_obj")
        tokens = {}
        for u in users:
            contacts = u.get("contacts")
            if contacts.get("wecom_robot_token", ""):
                tokens[contacts.get("wecom_robot_token", "")] = 1
        opener = urllib2.build_opener(urllib2.HTTPHandler())
        method = "POST"
        for t in tokens:
            # One webhook POST per distinct robot token.
            url = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key={}".format(t)
            body = {
                "msgtype": "markdown",
                "markdown": {
                    "content": payload.get('tpls').get("wecom.tpl", "wecom.tpl not found")
                }
            }
            request = urllib2.Request(url, data=json.dumps(body))
            request.add_header("Content-Type",'application/json;charset=utf-8')
            request.get_method = lambda: method
            try:
                connection = opener.open(request)
                print(connection.read())
            except urllib2.HTTPError, error:
                print(error)

    @classmethod
    def send_dingtalk(cls, payload):
        # Collect robot tokens to post to and phone numbers to @-mention.
        users = payload.get('event').get("notify_users_obj")
        tokens = {}
        phones = {}
        for u in users:
            if u.get("phone"):
                phones[u.get("phone")] = 1
            contacts = u.get("contacts")
            if contacts.get("dingtalk_robot_token", ""):
                tokens[contacts.get("dingtalk_robot_token", "")] = 1
        opener = urllib2.build_opener(urllib2.HTTPHandler())
        method = "POST"
        for t in tokens:
            url = "https://oapi.dingtalk.com/robot/send?access_token={}".format(t)
            body = {
                "msgtype": "text",
                "text": {
                    "content": payload.get('tpls').get("dingtalk.tpl", "dingtalk.tpl not found")
                },
                "at": {
                    "atMobiles": phones.keys(),
                    "isAtAll": False
                }
            }
            request = urllib2.Request(url, data=json.dumps(body))
            request.add_header("Content-Type",'application/json;charset=utf-8')
            request.get_method = lambda: method
            try:
                connection = opener.open(request)
                print(connection.read())
            except urllib2.HTTPError, error:
                print(error)

    @classmethod
    def send_sms(cls, payload):
        # Placeholder: SMS delivery is site-specific and not implemented.
        users = payload.get('event').get("notify_users_obj")
        phones = {}
        for u in users:
            if u.get("phone"):
                phones[u.get("phone")] = 1
        if phones:
            print("send_sms not implemented, phones: {}".format(phones.keys()))

    @classmethod
    def send_voice(cls, payload):
        # Placeholder: voice-call delivery is site-specific and not implemented.
        users = payload.get('event').get("notify_users_obj")
        phones = {}
        for u in users:
            if u.get("phone"):
                phones[u.get("phone")] = 1
        if phones:
            print("send_voice not implemented, phones: {}".format(phones.keys()))
def main():
    """Read the alert payload (JSON) from stdin and fan it out to every
    channel listed in the event's ``notify_channels``.

    The raw payload is also dumped to ``.payload`` in the working
    directory to ease debugging of template rendering.
    """
    payload = json.load(sys.stdin)
    with open(".payload", 'w') as f:
        f.write(json.dumps(payload, indent=4))
    for ch in payload.get('event').get('notify_channels'):
        # Map the channel name to a Sender method; unknown channels map to
        # "send_None" and fall through to the warning below.
        send_func_name = "send_{}".format(notify_channel_funcs.get(ch.strip()))
        if not hasattr(Sender, send_func_name):
            # BUG FIX: the original passed send_func_name as a second
            # argument to print() instead of formatting it into the message.
            print("function: {} not found".format(send_func_name))
            continue
        send_func = getattr(Sender, send_func_name)
        send_func(payload)
def hello():
    """Smoke-test entry point: print a greeting so operators can verify
    the script is runnable (invoked as ``notify.py hello``)."""
    greeting = "hello nightingale"
    print(greeting)
if __name__ == "__main__":
    # CLI dispatch: no arguments -> consume an alert payload from stdin;
    # "hello" -> smoke test; any other argument is rejected.
    if len(sys.argv) == 1:
        main()
    elif sys.argv[1] == "hello":
        hello()
    else:
        print("I am confused")

188
docker/n9eetc/server.conf Normal file
View File

@ -0,0 +1,188 @@
# debug, release
RunMode = "release"
# my cluster name
ClusterName = "Default"
[Log]
# log write dir
Dir = "logs"
# log level: DEBUG INFO WARNING ERROR
Level = "INFO"
# stdout, stderr, file
Output = "stdout"
# # rotate by time
# KeepHours: 4
# # rotate by size
# RotateNum = 3
# # unit: MB
# RotateSize = 256
[HTTP]
# http listening address
Host = "0.0.0.0"
# http listening port
Port = 19000
# https cert file path
CertFile = ""
# https key file path
KeyFile = ""
# whether print access log
PrintAccessLog = false
# whether enable pprof
PProf = false
# http graceful shutdown timeout, unit: s
ShutdownTimeout = 30
# max content length: 64M
MaxContentLength = 67108864
# http server read timeout, unit: s
ReadTimeout = 20
# http server write timeout, unit: s
WriteTimeout = 40
# http server idle timeout, unit: s
IdleTimeout = 120
# [BasicAuth]
# user002 = "ccc26da7b9aba533cbb263a36c07dcc9"
[Heartbeat]
# auto detect if blank
IP = ""
# unit: ms
Interval = 1000
[Alerting]
NotifyScriptPath = "./etc/script/notify.py"
NotifyConcurrency = 100
[Alerting.RedisPub]
Enable = false
# complete redis key: ${ChannelPrefix} + ${Cluster}
ChannelPrefix = "/alerts/"
[NoData]
Metric = "target_up"
# unit: second
Interval = 15
[Ibex]
# callback: ${ibex}/${tplid}/${host}
Address = "ibex:10090"
# basic auth
BasicAuthUser = "ibex"
BasicAuthPass = "ibex"
# unit: ms
Timeout = 3000
[Redis]
# address, ip:port
Address = "redis:6379"
# requirepass
Password = ""
# # db
# DB = 0
[Gorm]
# enable debug mode or not
Debug = false
# mysql postgres
DBType = "mysql"
# unit: s
MaxLifetime = 7200
# max open connections
MaxOpenConns = 150
# max idle connections
MaxIdleConns = 50
# table prefix
TablePrefix = ""
# enable auto migrate or not
EnableAutoMigrate = false
[MySQL]
# mysql address host:port
Address = "mysql:3306"
# mysql username
User = "root"
# mysql password
Password = "1234"
# database name
DBName = "n9e_v5"
# connection params
Parameters = "charset=utf8mb4&parseTime=True&loc=Local&allowNativePasswords=true"
[Postgres]
# pg address host:port
Address = "postgres:5432"
# pg user
User = "root"
# pg password
Password = "1234"
# database name
DBName = "n9e_v5"
# ssl mode
SSLMode = "disable"
[Reader]
# prometheus base url
Url = "http://prometheus:9090"
# Basic auth username
BasicAuthUser = ""
# Basic auth password
BasicAuthPass = ""
# timeout settings, unit: ms
Timeout = 30000
DialTimeout = 10000
TLSHandshakeTimeout = 30000
ExpectContinueTimeout = 1000
IdleConnTimeout = 90000
# time duration, unit: ms
KeepAlive = 30000
MaxConnsPerHost = 0
MaxIdleConns = 100
MaxIdleConnsPerHost = 10
[WriterOpt]
# queue max size
QueueMaxSize = 10000000
# once pop samples number from queue
QueuePopSize = 2000
# unit: ms
SleepInterval = 50
[[Writers]]
Name = "prom"
Url = "http://prometheus:9090/api/v1/write"
# Basic auth username
BasicAuthUser = ""
# Basic auth password
BasicAuthPass = ""
# timeout settings, unit: ms
Timeout = 30000
DialTimeout = 10000
TLSHandshakeTimeout = 30000
ExpectContinueTimeout = 1000
IdleConnTimeout = 90000
# time duration, unit: ms
KeepAlive = 30000
MaxConnsPerHost = 0
MaxIdleConns = 100
MaxIdleConnsPerHost = 100
# [[Writers]]
# Name = "m3db"
# Url = "http://m3db:7201/api/v1/prom/remote/write"
# # Basic auth username
# BasicAuthUser = ""
# # Basic auth password
# BasicAuthPass = ""
# timeout settings, unit: ms
# Timeout = 30000
# DialTimeout = 10000
# TLSHandshakeTimeout = 30000
# ExpectContinueTimeout = 1000
# IdleConnTimeout = 90000
# # time duration, unit: ms
# KeepAlive = 30000
# MaxConnsPerHost = 0
# MaxIdleConns = 100
# MaxIdleConnsPerHost = 100

View File

@ -0,0 +1,6 @@
级别状态: S{{.Severity}} {{if .IsRecovered}}Recovered{{else}}Triggered{{end}}
规则名称: {{.RuleName}}{{if .RuleNote}}
规则备注: {{.RuleNote}}{{end}}
监控指标: {{.TagsJSON}}
触发时间: {{timeformat .TriggerTime}}
触发时值: {{.TriggerValue}}

View File

@ -129,7 +129,7 @@
<div class="wrapper"> <div class="wrapper">
<div class="main"> <div class="main">
<header> <header>
<h3 class="title">{{Sname}}</h3> <h3 class="title">{{.RuleName}}</h3>
<p class="sub-desc"></p> <p class="sub-desc"></p>
</header> </header>
@ -138,61 +138,45 @@
<div class="body"> <div class="body">
<table cellspacing="0" cellpadding="0" border="0"> <table cellspacing="0" cellpadding="0" border="0">
<tbody> <tbody>
% if IsAlert: {{if .IsRecovered}}
<tr class="fail">
<th>级别状态:</th>
<td>{{Status}}</td>
</tr>
% else:
<tr class="succ"> <tr class="succ">
<th>级别状态:</th> <th>级别状态:</th>
<td>{{Status}}</td> <td>S{{.Severity}} Recovered</td>
</tr> </tr>
% end {{else}}
<tr class="fail">
<th>级别状态:</th>
<td>S{{.Severity}} Triggered</td>
</tr>
{{end}}
% if IsMachineDep:
<tr> <tr>
<th>告警设备:</th> <th>策略备注</th>
<td>{{Ident}}</td> <td>{{.RuleNote}}</td>
</tr> </tr>
<tr> <tr>
<th>所属分组:</th> <th>设备备注:</th>
<td> <td>{{.TargetNote}}</td>
{{Classpath}}<br />
</td>
</tr> </tr>
% end
<tr> <tr>
<th>监控指标:</th> <th>监控指标:</th>
<td>{{Metric}}</td> <td>{{.TagsJSON}}</td>
</tr> </tr>
<tr> <tr>
<th>tags</th> <th>触发时值:</th>
<td>{{Tags}}</td> <td>{{.TriggerValue}}</td>
</tr>
<tr>
<th>当前值:</th>
<td>{{Value}}</td>
</tr>
<tr>
<th>报警说明:</th>
<td>
{{ReadableExpression}}
</td>
</tr> </tr>
<tr> <tr>
<th>触发时间:</th> <th>触发时间:</th>
<td> <td>
{{TriggerTime}} {{timeformat .TriggerTime}}
</td> </td>
</tr> </tr>
<tr> <tr>
<th>报警详情:</th> <th>PromQL</th>
<td>{{Elink}}</td> <td>
</tr> {{.PromQl}}
<tr> </td>
<th>报警策略:</th>
<td>{{Slink}}</td>
</tr> </tr>
</tbody> </tbody>
</table> </table>
@ -200,11 +184,6 @@
<hr> <hr>
<footer> <footer>
<div class="footer-logo">
<a href="https://n9e.didiyun.com">
<img src="https://s3-gz01.didistatic.com/n9e-pub/image/n9e-logo-bg-white.png" class="footer-logo-image" alt="">
</a>
</div>
<div class="copyright" style="font-style: italic"> <div class="copyright" style="font-style: italic">
我们希望与您一起,将监控这个事情,做到极致! 我们希望与您一起,将监控这个事情,做到极致!
</div> </div>

View File

@ -0,0 +1 @@
{{if .IsRecovered}}Recovered{{else}}Triggered{{end}}: {{.RuleName}} {{.TagsJSON}}

View File

@ -0,0 +1,6 @@
**级别状态**: {{if .IsRecovered}}<font color="info">S{{.Severity}} Recovered</font>{{else}}<font color="warning">S{{.Severity}} Triggered</font>{{end}}
**规则标题**: {{.RuleName}}{{if .RuleNote}}
**规则备注**: {{.RuleNote}}{{end}}
**监控指标**: {{.TagsJSON}}
**触发时间**: {{timeformat .TriggerTime}}
**触发时值**: {{.TriggerValue}}

166
docker/n9eetc/webapi.conf Normal file
View File

@ -0,0 +1,166 @@
# debug, release
RunMode = "release"
# # custom i18n dict config
# I18N = "./etc/i18n.json"
# do not change
AdminRole = "Admin"
# Linkage with notify.py script
NotifyChannels = [ "email", "dingtalk", "wecom" ]
[[ContactKeys]]
Label = "Wecom Robot Token"
Key = "wecom_robot_token"
[[ContactKeys]]
Label = "Dingtalk Robot Token"
Key = "dingtalk_robot_token"
[Log]
# log write dir
Dir = "logs"
# log level: DEBUG INFO WARNING ERROR
Level = "DEBUG"
# stdout, stderr, file
Output = "stdout"
# # rotate by time
# KeepHours: 4
# # rotate by size
# RotateNum = 3
# # unit: MB
# RotateSize = 256
[HTTP]
# http listening address
Host = "0.0.0.0"
# http listening port
Port = 18000
# https cert file path
CertFile = ""
# https key file path
KeyFile = ""
# whether print access log
PrintAccessLog = true
# whether enable pprof
PProf = false
# http graceful shutdown timeout, unit: s
ShutdownTimeout = 30
# max content length: 64M
MaxContentLength = 67108864
# http server read timeout, unit: s
ReadTimeout = 20
# http server write timeout, unit: s
WriteTimeout = 40
# http server idle timeout, unit: s
IdleTimeout = 120
[JWTAuth]
# signing key
SigningKey = "5b94a0fd640fe2765af826acfe42d151"
# unit: min
AccessExpired = 1500
# unit: min
RefreshExpired = 10080
RedisKeyPrefix = "/jwt/"
[BasicAuth]
user001 = "ccc26da7b9aba533cbb263a36c07dcc5"
[LDAP]
Enable = false
Host = "ldap.example.org"
Port = 389
BaseDn = "dc=example,dc=org"
# AD: manange@example.org
BindUser = "cn=manager,dc=example,dc=org"
BindPass = "*******"
# openldap format e.g. (&(uid=%s))
# AD format e.g. (&(sAMAccountName=%s))
AuthFilter = "(&(uid=%s))"
CoverAttributes = true
TLS = false
StartTLS = true
[LDAP.Attributes]
Nickname = "cn"
Phone = "mobile"
Email = "mail"
[Redis]
# address, ip:port
Address = "redis:6379"
# requirepass
Password = ""
# # db
# DB = 0
[Gorm]
# enable debug mode or not
Debug = true
# mysql postgres
DBType = "mysql"
# unit: s
MaxLifetime = 7200
# max open connections
MaxOpenConns = 150
# max idle connections
MaxIdleConns = 50
# table prefix
TablePrefix = ""
# enable auto migrate or not
EnableAutoMigrate = false
[MySQL]
# mysql address host:port
Address = "mysql:3306"
# mysql username
User = "root"
# mysql password
Password = "1234"
# database name
DBName = "n9e_v5"
# connection params
Parameters = "charset=utf8mb4&parseTime=True&loc=Local&allowNativePasswords=true"
[Postgres]
# pg address host:port
Address = "postgres:5432"
# pg user
User = "root"
# pg password
Password = "1234"
# database name
DBName = "n9e_v5"
# ssl mode
SSLMode = "disable"
[[Clusters]]
# Prometheus cluster name
Name = "Default"
# Prometheus APIs base url
Prom = "http://prometheus:9090"
# Basic auth username
BasicAuthUser = ""
# Basic auth password
BasicAuthPass = ""
# timeout settings, unit: ms
Timeout = 30000
DialTimeout = 10000
TLSHandshakeTimeout = 30000
ExpectContinueTimeout = 1000
IdleConnTimeout = 90000
# time duration, unit: ms
KeepAlive = 30000
MaxConnsPerHost = 0
MaxIdleConns = 100
MaxIdleConnsPerHost = 100
[Ibex]
Address = "http://ibex:10090"
# basic auth
BasicAuthUser = "ibex"
BasicAuthPass = "ibex"
# unit: ms
Timeout = 3000

View File

@ -0,0 +1,29 @@
# my global config
global:
scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
# scrape_timeout is set to the global default (10s).
# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
- targets:
# - alertmanager:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
# - "first_rules.yml"
# - "second_rules.yml"
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
# The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
- job_name: 'prometheus'
# metrics_path defaults to '/metrics'
# scheme defaults to 'http'.
static_configs:
- targets: ['localhost:9090']

File diff suppressed because it is too large Load Diff

View File

@ -1,11 +0,0 @@
<!DOCTYPE html>
<html>
<head>
<title>Nightingale</title>
</head>
<body>
<h1>Hello, Nightingale</h1>
</body>
</html>

View File

@ -1,33 +0,0 @@
#!/bin/sh
now=$(date +%s)
echo '[
{
"metric": "plugin_example_gauge",
"tags": {
"type": "testcase",
"author": "ulric"
},
"value": '${now}',
"type": "gauge"
},
{
"metric": "plugin_example_rate",
"tags": {
"type": "testcase",
"author": "ulric"
},
"value": '${now}',
"type": "rate"
},
{
"metric": "plugin_example_increase",
"tags": {
"type": "testcase",
"author": "ulric"
},
"value": '${now}',
"type": "increase"
}
]'

View File

@ -1,191 +0,0 @@
[
{
"name": "dns解析时间超过2秒",
"type": 1,
"expression": {
"evaluation_interval": 10,
"promql": "probe_dns_lookup_time_seconds>2"
},
"status": 0,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": "1 2 3 4 5 6 7",
"recovery_notify": 0,
"priority": 3,
"notify_channels": "",
"runbook_url": "",
"note": "",
"create_at": 1626935980,
"alert_duration": 60,
"notify_users_detail": null,
"notify_groups_detail": null
},
{
"name": "https证书过期时间小于7天",
"type": 1,
"expression": {
"evaluation_interval": 10,
"promql": "(probe_ssl_earliest_cert_expiry - time()) / 3600 / 24 <7"
},
"status": 0,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": "1 2 3 4 5 6 7",
"recovery_notify": 0,
"priority": 1,
"notify_channels": "",
"runbook_url": "",
"note": "",
"create_at": 1626935909,
"alert_duration": 60,
"notify_users_detail": null,
"notify_groups_detail": null
},
{
"name": "http响应数据传输占比超过70%",
"type": 1,
"expression": {
"evaluation_interval": 10,
"promql": "100 * avg(probe_http_duration_seconds{phase=\"transfer\"})by(instance) / sum(probe_http_duration_seconds) by(instance) >70"
},
"status": 0,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": "1 2 3 4 5 6 7",
"recovery_notify": 0,
"priority": 2,
"notify_channels": "",
"runbook_url": "",
"note": "",
"create_at": 1626936324,
"alert_duration": 60,
"notify_users_detail": null,
"notify_groups_detail": null
},
{
"name": "http接口探测失败",
"type": 1,
"expression": {
"evaluation_interval": 10,
"promql": "probe_success{job=~\".*http.*\"}==0"
},
"status": 0,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": "1 2 3 4 5 6 7",
"recovery_notify": 0,
"priority": 1,
"notify_channels": "",
"runbook_url": "",
"note": "",
"create_at": 1626935627,
"alert_duration": 60,
"notify_users_detail": null,
"notify_groups_detail": null
},
{
"name": "http接口探测耗时超过3秒",
"type": 1,
"expression": {
"evaluation_interval": 10,
"promql": "sum(probe_http_duration_seconds) by (instance) >3\n"
},
"status": 0,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": "1 2 3 4 5 6 7",
"recovery_notify": 0,
"priority": 3,
"notify_channels": "",
"runbook_url": "",
"note": "",
"create_at": 1626936059,
"alert_duration": 60,
"notify_users_detail": null,
"notify_groups_detail": null
},
{
"name": "http接口返回状态码4xx/5xx错误",
"type": 1,
"expression": {
"evaluation_interval": 10,
"promql": "probe_http_status_code >=400"
},
"status": 0,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": "1 2 3 4 5 6 7",
"recovery_notify": 0,
"priority": 1,
"notify_channels": "",
"runbook_url": "",
"note": "",
"create_at": 1626936145,
"alert_duration": 60,
"notify_users_detail": null,
"notify_groups_detail": null
},
{
"name": "icmp探测失败",
"type": 1,
"expression": {
"evaluation_interval": 10,
"promql": "probe_success{job=~\".*icmp.*\"}==0"
},
"status": 0,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": "1 2 3 4 5 6 7",
"recovery_notify": 0,
"priority": 1,
"notify_channels": "",
"runbook_url": "",
"note": "",
"create_at": 1626935855,
"alert_duration": 60,
"notify_users_detail": null,
"notify_groups_detail": null
},
{
"name": "tcp端口探测失败",
"type": 1,
"expression": {
"evaluation_interval": 10,
"promql": "probe_success{job=~\".*tcp.*\"}==0"
},
"status": 0,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": "1 2 3 4 5 6 7",
"recovery_notify": 0,
"priority": 1,
"notify_channels": "",
"runbook_url": "",
"note": "",
"create_at": 1626935874,
"alert_duration": 60,
"notify_users_detail": null,
"notify_groups_detail": null
},
{
"name": "机器ssh探测失败",
"type": 1,
"expression": {
"evaluation_interval": 10,
"promql": "probe_success{job=~\".*ssh.*\"}==0\n"
},
"status": 0,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": "1 2 3 4 5 6 7",
"recovery_notify": 0,
"priority": 1,
"notify_channels": "",
"runbook_url": "",
"note": "",
"create_at": 1626935827,
"alert_duration": 60,
"notify_users_detail": null,
"notify_groups_detail": null
}
]

View File

@ -1,271 +0,0 @@
[
{
"name": "cpu使用率大于85%",
"type": 0,
"expression": {
"together_or_any": 0,
"trigger_conditions": [
{
"optr": ">",
"func": "all",
"metric": "system_cpu_util",
"params": [],
"threshold": 85
}
],
"tags_filters": []
},
"status": 0,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": "1 2 3 4 5 6 7",
"recovery_notify": 0,
"priority": 2,
"notify_channels": "",
"runbook_url": "",
"note": "",
"create_at": 1626517658,
"alert_duration": 60,
"notify_users_detail": null,
"notify_groups_detail": null
},
{
"name": "内存利用率大于75%",
"type": 0,
"expression": {
"together_or_any": 0,
"trigger_conditions": [
{
"func": "all",
"metric": "system_mem_used_percent",
"optr": ">",
"params": [],
"threshold": 75
}
],
"tags_filters": []
},
"status": 0,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": "1 2 3 4 5 6 7",
"recovery_notify": 0,
"priority": 2,
"notify_channels": "sms email",
"runbook_url": "",
"note": "",
"create_at": 1626517103,
"alert_duration": 60,
"notify_users_detail": null,
"notify_groups_detail": null
},
{
"name": "机器loadavg大于16",
"type": 0,
"expression": {
"tags_filters": [],
"trigger_conditions": [
{
"func": "all",
"metric": "system_cpu_load1",
"optr": ">",
"params": [],
"threshold": 16
}
]
},
"status": 0,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": "1 2 3 4 5 6 7",
"recovery_notify": 0,
"priority": 1,
"notify_channels": "sms email",
"runbook_url": "",
"note": "",
"create_at": 1626517103,
"alert_duration": 60,
"notify_users_detail": null,
"notify_groups_detail": null
},
{
"name": "磁盘利用率达到85%",
"type": 0,
"expression": {
"tags_filters": [],
"trigger_conditions": [
{
"func": "all",
"metric": "system_disk_used_percent",
"optr": ">",
"params": [],
"threshold": 85
}
]
},
"status": 0,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": "1 2 3 4 5 6 7",
"recovery_notify": 0,
"priority": 3,
"notify_channels": "email",
"runbook_url": "",
"note": "",
"create_at": 1626517103,
"alert_duration": 60,
"notify_users_detail": null,
"notify_groups_detail": null
},
{
"name": "磁盘利用率达到88%",
"type": 0,
"expression": {
"tags_filters": [],
"trigger_conditions": [
{
"func": "all",
"metric": "system_disk_used_percent",
"optr": ">",
"params": [],
"threshold": 88
}
]
},
"status": 0,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": "1 2 3 4 5 6 7",
"recovery_notify": 0,
"priority": 2,
"notify_channels": "email sms",
"runbook_url": "",
"note": "",
"create_at": 1626517103,
"alert_duration": 60,
"notify_users_detail": null,
"notify_groups_detail": null
},
{
"name": "磁盘利用率达到92%",
"type": 0,
"expression": {
"tags_filters": [],
"trigger_conditions": [
{
"func": "all",
"metric": "system_disk_used_percent",
"optr": ">",
"params": [],
"threshold": 88
}
]
},
"status": 0,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": "1 2 3 4 5 6 7",
"recovery_notify": 0,
"priority": 1,
"notify_channels": "email sms voice",
"runbook_url": "",
"note": "",
"create_at": 1626517103,
"alert_duration": 60,
"notify_users_detail": null,
"notify_groups_detail": null
},
{
"name": "端口挂了",
"type": 0,
"expression": {
"tags_filters": [],
"trigger_conditions": [
{
"func": "all",
"metric": "proc_port_listen",
"optr": "<",
"params": [],
"threshold": 1
}
]
},
"status": 0,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": "1 2 3 4 5 6 7",
"recovery_notify": 0,
"priority": 2,
"notify_channels": "sms email",
"runbook_url": "",
"note": "",
"create_at": 1626517103,
"alert_duration": 60,
"notify_users_detail": null,
"notify_groups_detail": null
},
{
"name": "网卡入方向错包",
"type": 0,
"expression": {
"together_or_any": 0,
"trigger_conditions": [
{
"optr": ">",
"func": "all",
"metric": "system_net_packets_in_error",
"params": [
1
],
"threshold": 3
}
],
"tags_filters": []
},
"status": 0,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": "1 2 3 4 5 6 7",
"recovery_notify": 0,
"priority": 2,
"notify_channels": "",
"runbook_url": "",
"note": "",
"create_at": 1626517809,
"alert_duration": 60,
"notify_users_detail": null,
"notify_groups_detail": null
},
{
"name": "网卡出方向错包",
"type": 0,
"expression": {
"together_or_any": 0,
"trigger_conditions": [
{
"optr": ">",
"func": "all",
"metric": "system_net_packets_out_error",
"params": [
1
],
"threshold": 3
}
],
"tags_filters": []
},
"status": 0,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": "1 2 3 4 5 6 7",
"recovery_notify": 0,
"priority": 2,
"notify_channels": "",
"runbook_url": "",
"note": "",
"create_at": 1626517838,
"alert_duration": 60,
"notify_users_detail": null,
"notify_groups_detail": null
}
]

View File

@ -1,226 +0,0 @@
[
{
"id": 0,
"name": "blackbox_exporter",
"tags": "",
"configs": "{\"tags\":[{\"tagName\":\"http_probe_job\",\"key\":\"job\",\"value\":\"blackbox-http\",\"prefix\":false},{\"tagName\":\"http_probe_instance\",\"key\":\"instance\",\"value\":\"*\",\"prefix\":false}]}",
"chart_groups": [
{
"id": 0,
"dashboard_id": 0,
"name": "http接口探测",
"weight": 0,
"charts": [
{
"id": 440,
"group_id": 109,
"configs": "{\"name\":\"https的探测\",\"mode\":\"promethues\",\"prome_ql\":[\"probe_http_ssl==1\"],\"layout\":{\"h\":2,\"w\":6,\"x\":0,\"y\":0,\"i\":\"0\"}}",
"weight": 0
},
{
"id": 441,
"group_id": 109,
"configs": "{\"name\":\"http的探测\",\"mode\":\"promethues\",\"prome_ql\":[\"probe_http_ssl==0\"],\"layout\":{\"h\":2,\"w\":6,\"x\":6,\"y\":0,\"i\":\"1\"}}",
"weight": 0
},
{
"id": 442,
"group_id": 109,
"configs": "{\"name\":\"https探测目标个数\",\"mode\":\"promethues\",\"prome_ql\":[\"count(probe_http_ssl==1)\"],\"layout\":{\"h\":2,\"w\":6,\"x\":12,\"y\":0,\"i\":\"2\"}}",
"weight": 0
},
{
"id": 443,
"group_id": 109,
"configs": "{\"name\":\"http探测目标个数\",\"mode\":\"promethues\",\"prome_ql\":[\"count(probe_http_ssl==0)\"],\"layout\":{\"h\":2,\"w\":6,\"x\":18,\"y\":0,\"i\":\"3\"}}",
"weight": 0
},
{
"id": 446,
"group_id": 109,
"configs": "{\"name\":\"http探测成功个数\",\"mode\":\"promethues\",\"prome_ql\":[\"count(probe_success{job=~\\\".*http.*\\\"}==1)\"],\"layout\":{\"h\":2,\"w\":6,\"x\":6,\"y\":2,\"i\":\"4\"}}",
"weight": 0
},
{
"id": 447,
"group_id": 109,
"configs": "{\"name\":\"http探测失败列表\",\"mode\":\"promethues\",\"prome_ql\":[\"probe_success{job=~\\\".*http.*\\\"}==0\"],\"layout\":{\"h\":2,\"w\":6,\"x\":12,\"y\":2,\"i\":\"5\"}}",
"weight": 0
},
{
"id": 448,
"group_id": 109,
"configs": "{\"name\":\"http探测失败个数\",\"mode\":\"promethues\",\"prome_ql\":[\"count(probe_success{job=~\\\".*http.*\\\"}==0)\"],\"layout\":{\"h\":2,\"w\":6,\"x\":0,\"y\":2,\"i\":\"6\"}}",
"weight": 0
},
{
"id": 449,
"group_id": 109,
"configs": "{\"name\":\"http探测总耗时 单位秒\",\"mode\":\"promethues\",\"prome_ql\":[\"sum(probe_http_duration_seconds) by (instance)\"],\"layout\":{\"h\":2,\"w\":6,\"x\":18,\"y\":2,\"i\":\"7\"}}",
"weight": 0
}
]
},
{
"id": 0,
"dashboard_id": 0,
"name": "https接口探测汇总",
"weight": 1,
"charts": [
{
"id": 444,
"group_id": 110,
"configs": "{\"name\":\"tls版本信息\",\"mode\":\"promethues\",\"prome_ql\":[\"probe_tls_version_info\"],\"layout\":{\"h\":2,\"w\":6,\"x\":0,\"y\":0,\"i\":\"0\"}}",
"weight": 0
},
{
"id": 445,
"group_id": 110,
"configs": "{\"name\":\"tls证书过期时间 单位:天\",\"mode\":\"promethues\",\"prome_ql\":[\"(probe_ssl_earliest_cert_expiry - time()) / 3600 / 24\"],\"layout\":{\"h\":2,\"w\":6,\"x\":6,\"y\":0,\"i\":\"1\"}}",
"weight": 0
}
]
},
{
"id": 0,
"dashboard_id": 0,
"name": "http接口各阶段耗时详情",
"weight": 2,
"charts": [
{
"id": 450,
"group_id": 111,
"configs": "{\"name\":\"单个目标的各阶段耗时\",\"mode\":\"promethues\",\"prome_ql\":[\"probe_http_duration_seconds{instance=~\\\"$instance\\\"}\"],\"layout\":{\"h\":2,\"w\":6,\"x\":0,\"y\":0,\"i\":\"0\"}}",
"weight": 0
},
{
"id": 451,
"group_id": 111,
"configs": "{\"name\":\"[阶段1] dns解析时间\",\"mode\":\"promethues\",\"prome_ql\":[\"probe_http_duration_seconds{instance=~\\\"$instance\\\",phase=\\\"resolve\\\"}\"],\"layout\":{\"h\":2,\"w\":6,\"x\":6,\"y\":0,\"i\":\"1\"}}",
"weight": 0
},
{
"id": 452,
"group_id": 111,
"configs": "{\"name\":\"[可无]tls握手时间\",\"mode\":\"promethues\",\"prome_ql\":[\"probe_http_duration_seconds{instance=~\\\"$instance\\\",phase=\\\"tls\\\"}\"],\"layout\":{\"h\":2,\"w\":6,\"x\":12,\"y\":0,\"i\":\"2\"}}",
"weight": 0
},
{
"id": 453,
"group_id": 111,
"configs": "{\"name\":\"[阶段2] tcp连接耗时\",\"mode\":\"promethues\",\"prome_ql\":[\"probe_http_duration_seconds{instance=~\\\"$instance\\\",phase=\\\"connect\\\"}\"],\"layout\":{\"h\":2,\"w\":6,\"x\":18,\"y\":0,\"i\":\"3\"}}",
"weight": 0
},
{
"id": 454,
"group_id": 111,
"configs": "{\"name\":\"[阶段3] 服务端处理耗时\",\"mode\":\"promethues\",\"prome_ql\":[\"probe_http_duration_seconds{instance=~\\\"$instance\\\",phase=\\\"processing\\\"}\"],\"layout\":{\"h\":2,\"w\":6,\"x\":0,\"y\":2,\"i\":\"4\"}}",
"weight": 0
},
{
"id": 455,
"group_id": 111,
"configs": "{\"name\":\"[阶段4] 传输响应耗时\",\"mode\":\"promethues\",\"prome_ql\":[\"probe_http_duration_seconds{instance=~\\\"$instance\\\",phase=\\\"transfer\\\"}\"],\"layout\":{\"h\":2,\"w\":6,\"x\":6,\"y\":2,\"i\":\"5\"}}",
"weight": 0
}
]
},
{
"id": 0,
"dashboard_id": 0,
"name": "ssh存活探测(配置了ssh探测job才有)",
"weight": 3,
"charts": [
{
"id": 456,
"group_id": 112,
"configs": "{\"name\":\"ssh探测成功个数\",\"mode\":\"promethues\",\"prome_ql\":[\"count(probe_success{job=~\\\".*ssh.*\\\"}==1)\"],\"layout\":{\"h\":2,\"w\":6,\"x\":0,\"y\":0,\"i\":\"0\"}}",
"weight": 0
},
{
"id": 457,
"group_id": 112,
"configs": "{\"name\":\"ssh探测失败个数\",\"mode\":\"promethues\",\"prome_ql\":[\"count(probe_success{job=~\\\".*ssh.*\\\"}==0)\"],\"layout\":{\"h\":2,\"w\":6,\"x\":6,\"y\":0,\"i\":\"1\"}}",
"weight": 0
},
{
"id": 458,
"group_id": 112,
"configs": "{\"name\":\"ssh探测失败详情\",\"mode\":\"promethues\",\"prome_ql\":[\"probe_success{job=~\\\".*ssh.*\\\"}==0\"],\"layout\":{\"h\":2,\"w\":6,\"x\":12,\"y\":0,\"i\":\"2\"}}",
"weight": 0
},
{
"id": 459,
"group_id": 112,
"configs": "{\"name\":\"ssh探测耗时\",\"mode\":\"promethues\",\"prome_ql\":[\"probe_duration_seconds{job=~\\\".*ssh.*\\\"}\"],\"layout\":{\"h\":2,\"w\":6,\"x\":18,\"y\":0,\"i\":\"3\"}}",
"weight": 0
}
]
},
{
"id": 0,
"dashboard_id": 0,
"name": "icmp探测(配置了icmp探测job才有)",
"weight": 4,
"charts": [
{
"id": 460,
"group_id": 113,
"configs": "{\"name\":\"icmp探测成功个数\",\"mode\":\"promethues\",\"prome_ql\":[\"count(probe_success{job=~\\\".*icmp.*\\\"}==1)\"],\"layout\":{\"h\":2,\"w\":6,\"x\":0,\"y\":0,\"i\":\"0\"}}",
"weight": 0
},
{
"id": 461,
"group_id": 113,
"configs": "{\"name\":\"icmp探测失败个数\",\"mode\":\"promethues\",\"prome_ql\":[\"count(probe_success{job=~\\\".*icmp.*\\\"}==0)\"],\"layout\":{\"h\":2,\"w\":6,\"x\":6,\"y\":0,\"i\":\"1\"}}",
"weight": 0
},
{
"id": 462,
"group_id": 113,
"configs": "{\"name\":\"icmp探测失败详情\",\"mode\":\"promethues\",\"prome_ql\":[\"probe_success{job=~\\\".*icmp.*\\\"}==0\"],\"layout\":{\"h\":2,\"w\":6,\"x\":12,\"y\":0,\"i\":\"2\"}}",
"weight": 0
},
{
"id": 463,
"group_id": 113,
"configs": "{\"name\":\"icmp探测总耗时\",\"mode\":\"promethues\",\"prome_ql\":[\"probe_duration_seconds{job=~\\\".*icmp.*\\\"}\"],\"layout\":{\"h\":2,\"w\":6,\"x\":18,\"y\":0,\"i\":\"3\"}}",
"weight": 0
}
]
},
{
"id": 0,
"dashboard_id": 0,
"name": "tcp端口探测(配置了tcp探测job才有)",
"weight": 5,
"charts": [
{
"id": 464,
"group_id": 114,
"configs": "{\"name\":\"tcp端口探测成功个数\",\"mode\":\"promethues\",\"prome_ql\":[\"count(probe_success{job=~\\\".*tcp.*\\\"}==1)\"],\"layout\":{\"h\":2,\"w\":6,\"x\":0,\"y\":0,\"i\":\"0\"}}",
"weight": 0
},
{
"id": 465,
"group_id": 114,
"configs": "{\"name\":\"tcp端口探测失败个数\",\"mode\":\"promethues\",\"prome_ql\":[\"count(probe_success{job=~\\\".*tcp.*\\\"}==0)\"],\"layout\":{\"h\":2,\"w\":6,\"x\":6,\"y\":0,\"i\":\"1\"}}",
"weight": 0
},
{
"id": 466,
"group_id": 114,
"configs": "{\"name\":\"tcp端口探测失败列表\",\"mode\":\"promethues\",\"prome_ql\":[\"probe_success{job=~\\\".*tcp.*\\\"}==0\"],\"layout\":{\"h\":2,\"w\":6,\"x\":12,\"y\":0,\"i\":\"2\"}}",
"weight": 0
},
{
"id": 467,
"group_id": 114,
"configs": "{\"name\":\"tcp端口探测耗时\",\"mode\":\"promethues\",\"prome_ql\":[\"probe_duration_seconds{job=~\\\".*tcp.*\\\"}\"],\"layout\":{\"h\":2,\"w\":6,\"x\":18,\"y\":0,\"i\":\"3\"}}",
"weight": 0
}
]
}
]
}
]

View File

@ -1,306 +0,0 @@
[
{
"id": 0,
"name": "jmx_exporter",
"tags": "",
"configs": "{\"tags\":[{\"tagName\":\"java_app\",\"key\":\"java_app\",\"value\":\"*\",\"prefix\":false}]}",
"chart_groups": [
{
"id": 0,
"dashboard_id": 0,
"name": "jvm统计",
"weight": 1,
"charts": [
{
"id": 278,
"group_id": 75,
"configs": "{\"name\":\"jvm版本信息\",\"mode\":\"promethues\",\"prome_ql\":[\"avg(jvm_info{java_app=~\\\"$java_app\\\"}) without (runtime,vendor)\"],\"layout\":{\"h\":2,\"w\":12,\"x\":0,\"y\":0,\"i\":\"0\"}}",
"weight": 0
},
{
"id": 309,
"group_id": 75,
"configs": "{\"name\":\"java进程启动时间 单位:小时\",\"mode\":\"promethues\",\"prome_ql\":[\"(time() - process_start_time_seconds{java_app=~\\\"$java_app\\\"})/3600\"],\"layout\":{\"h\":2,\"w\":12,\"x\":12,\"y\":0,\"i\":\"1\"}}",
"weight": 0
}
]
},
{
"id": 0,
"dashboard_id": 0,
"name": "jvm内存使用",
"weight": 2,
"charts": [
{
"id": 279,
"group_id": 76,
"configs": "{\"name\":\"jvm内存使用 - nonheap 非堆区\",\"mode\":\"promethues\",\"prome_ql\":[\"jvm_memory_bytes_used{java_app=~\\\"$java_app\\\",area=\\\"nonheap\\\"}\"],\"layout\":{\"h\":2,\"w\":6,\"x\":0,\"y\":0,\"i\":\"0\"}}",
"weight": 0
},
{
"id": 280,
"group_id": 76,
"configs": "{\"name\":\"jvm内存使用 - heap堆区\",\"mode\":\"promethues\",\"prome_ql\":[\"jvm_memory_bytes_used{java_app=~\\\"$java_app\\\",area=\\\"heap\\\"}\"],\"layout\":{\"h\":2,\"w\":6,\"x\":6,\"y\":0,\"i\":\"1\"}}",
"weight": 0
},
{
"id": 281,
"group_id": 76,
"configs": "{\"name\":\"提交给 Java虚拟机使用的内存量 heap 堆区\",\"mode\":\"promethues\",\"prome_ql\":[\"jvm_memory_bytes_committed{java_app=~\\\"$java_app\\\",area=\\\"heap\\\"}\"],\"layout\":{\"h\":2,\"w\":6,\"x\":12,\"y\":0,\"i\":\"2\"}}",
"weight": 0
},
{
"id": 282,
"group_id": 76,
"configs": "{\"name\":\"提交给 Java虚拟机使用的内存量 nonheap 非堆区\",\"mode\":\"promethues\",\"prome_ql\":[\"jvm_memory_bytes_committed{java_app=~\\\"$java_app\\\",area=\\\"nonheap\\\"}\"],\"layout\":{\"h\":2,\"w\":6,\"x\":18,\"y\":0,\"i\":\"3\"}}",
"weight": 0
},
{
"id": 283,
"group_id": 76,
"configs": "{\"name\":\"jvm最大内存 \",\"mode\":\"promethues\",\"prome_ql\":[\"jvm_memory_bytes_max{java_app=~\\\"$java_app\\\",area=\\\"heap\\\"}\"],\"layout\":{\"h\":2,\"w\":6,\"x\":0,\"y\":2,\"i\":\"4\"}}",
"weight": 0
},
{
"id": 285,
"group_id": 76,
"configs": "{\"name\":\"jvm 初始化内存\",\"mode\":\"promethues\",\"prome_ql\":[\"jvm_memory_bytes_init{java_app=~\\\"$java_app\\\",area=\\\"heap\\\"}\"],\"layout\":{\"h\":2,\"w\":6,\"x\":6,\"y\":2,\"i\":\"5\"}}",
"weight": 0
},
{
"id": 286,
"group_id": 76,
"configs": "{\"name\":\"jvm内存使用百分比% heap堆区 \",\"mode\":\"promethues\",\"prome_ql\":[\"100 * jvm_memory_bytes_used{java_app=~\\\"$java_app\\\",area=\\\"heap\\\"}/jvm_memory_bytes_max{java_app=~\\\"$java_app\\\",area=\\\"heap\\\"}\"],\"layout\":{\"h\":2,\"w\":6,\"x\":12,\"y\":2,\"i\":\"6\"}}",
"weight": 0
}
]
},
{
"id": 0,
"dashboard_id": 0,
"name": "jvm内存池",
"weight": 3,
"charts": [
{
"id": 287,
"group_id": 77,
"configs": "{\"name\":\"jvm内存池分pool展示\",\"mode\":\"promethues\",\"prome_ql\":[\"jvm_memory_pool_bytes_max{java_app=~\\\"$java_app\\\"}\"],\"layout\":{\"h\":2,\"w\":24,\"x\":0,\"y\":0,\"i\":\"0\"}}",
"weight": 0
},
{
"id": 316,
"group_id": 77,
"configs": "{\"name\":\" JVM 缓冲池使用缓存大小\",\"mode\":\"promethues\",\"prome_ql\":[\"jvm_buffer_pool_used_bytes{java_app=~\\\"$java_app\\\"}\"],\"layout\":{\"h\":2,\"w\":6,\"x\":0,\"y\":2,\"i\":\"1\"}}",
"weight": 0
},
{
"id": 317,
"group_id": 77,
"configs": "{\"name\":\"JVM 缓冲池的字节容量\",\"mode\":\"promethues\",\"prome_ql\":[\"jvm_buffer_pool_capacity_bytes{java_app=~\\\"$java_app\\\"}\"],\"layout\":{\"h\":2,\"w\":6,\"x\":6,\"y\":2,\"i\":\"2\"}}",
"weight": 0
},
{
"id": 318,
"group_id": 77,
"configs": "{\"name\":\"JVM 缓冲池使用的字节大小\",\"mode\":\"promethues\",\"prome_ql\":[\"jvm_buffer_pool_used_bytes{java_app=~\\\"$java_app\\\"}\"],\"layout\":{\"h\":2,\"w\":6,\"x\":12,\"y\":2,\"i\":\"3\"}}",
"weight": 0
}
]
},
{
"id": 0,
"dashboard_id": 0,
"name": "jvm gc情况",
"weight": 4,
"charts": [
{
"id": 288,
"group_id": 78,
"configs": "{\"name\":\"新生代gc耗时 1分钟\",\"mode\":\"promethues\",\"prome_ql\":[\"increase(jvm_gc_collection_seconds_sum{java_app=~\\\"$java_app\\\",gc=\\\"G1 Young Generation\\\" }[1m])\"],\"layout\":{\"h\":2,\"w\":8,\"x\":0,\"y\":0,\"i\":\"0\"}}",
"weight": 0
},
{
"id": 289,
"group_id": 78,
"configs": "{\"name\":\"老生代gc耗时 1分钟\",\"mode\":\"promethues\",\"prome_ql\":[\"increase(jvm_gc_collection_seconds_sum{java_app=~\\\"$java_app\\\",gc=\\\"G1 Old Generation\\\" }[1m])\"],\"layout\":{\"h\":2,\"w\":8,\"x\":8,\"y\":0,\"i\":\"1\"}}",
"weight": 0
},
{
"id": 290,
"group_id": 78,
"configs": "{\"name\":\"新生代gc次数 1分钟\",\"mode\":\"promethues\",\"prome_ql\":[\"increase(jvm_gc_collection_seconds_count{java_app=~\\\"$java_app\\\",gc=\\\"G1 Young Generation\\\" }[1m])\"],\"layout\":{\"h\":2,\"w\":8,\"x\":16,\"y\":0,\"i\":\"2\"}}",
"weight": 0
},
{
"id": 291,
"group_id": 78,
"configs": "{\"name\":\"老生代gc次数 1分钟\",\"mode\":\"promethues\",\"prome_ql\":[\"increase(jvm_gc_collection_seconds_count{java_app=~\\\"$java_app\\\",gc=\\\"G1 Old Generation\\\" }[1m])\"],\"layout\":{\"h\":2,\"w\":8,\"x\":0,\"y\":2,\"i\":\"3\"}}",
"weight": 0
},
{
"id": 292,
"group_id": 78,
"configs": "{\"name\":\"新生代平均gc耗时 秒\",\"mode\":\"promethues\",\"prome_ql\":[\"jvm_gc_collection_seconds_sum{java_app=~\\\"$java_app\\\",gc=\\\"G1 Young Generation\\\" }/jvm_gc_collection_seconds_count{java_app=~\\\"$java_app\\\",gc=\\\"G1 Young Generation\\\" }\"],\"layout\":{\"h\":2,\"w\":8,\"x\":8,\"y\":2,\"i\":\"4\"}}",
"weight": 0
},
{
"id": 293,
"group_id": 78,
"configs": "{\"name\":\"老生代平均gc耗时\",\"mode\":\"promethues\",\"prome_ql\":[\"jvm_gc_collection_seconds_sum{java_app=~\\\"$java_app\\\",gc=\\\"G1 Old Generation\\\"}/jvm_gc_collection_seconds_count{java_app=~\\\"$java_app\\\",gc=\\\"G1 Old Generation\\\" }\"],\"layout\":{\"h\":2,\"w\":8,\"x\":16,\"y\":2,\"i\":\"5\"}}",
"weight": 0
}
]
},
{
"id": 0,
"dashboard_id": 0,
"name": "jvm线程情况",
"weight": 5,
"charts": [
{
"id": 294,
"group_id": 79,
"configs": "{\"name\":\"当前线程数\",\"mode\":\"promethues\",\"prome_ql\":[\"jvm_threads_current{java_app=~\\\"$java_app\\\"}\"],\"layout\":{\"h\":2,\"w\":6,\"x\":0,\"y\":0,\"i\":\"0\"}}",
"weight": 0
},
{
"id": 295,
"group_id": 79,
"configs": "{\"name\":\"守护线程数\",\"mode\":\"promethues\",\"prome_ql\":[\"jvm_threads_daemon{java_app=~\\\"$java_app\\\"}\"],\"layout\":{\"h\":2,\"w\":6,\"x\":6,\"y\":0,\"i\":\"1\"}}",
"weight": 0
},
{
"id": 296,
"group_id": 79,
"configs": "{\"name\":\"死锁线程数\",\"mode\":\"promethues\",\"prome_ql\":[\"jvm_threads_deadlocked{java_app=~\\\"$java_app\\\"}\"],\"layout\":{\"h\":2,\"w\":6,\"x\":12,\"y\":0,\"i\":\"2\"}}",
"weight": 0
},
{
"id": 297,
"group_id": 79,
"configs": "{\"name\":\"活动线程峰值\",\"mode\":\"promethues\",\"prome_ql\":[\"jvm_threads_peak{java_app=~\\\"$java_app\\\"}\"],\"layout\":{\"h\":2,\"w\":6,\"x\":18,\"y\":0,\"i\":\"3\"}}",
"weight": 0
},
{
"id": 298,
"group_id": 79,
"configs": "{\"name\":\"自JVM启动后启动的线程总量包括daemon,non-daemon和终止了的\",\"mode\":\"promethues\",\"prome_ql\":[\"jvm_threads_started_total{java_app=~\\\"$java_app\\\"}\"],\"layout\":{\"h\":2,\"w\":6,\"x\":0,\"y\":2,\"i\":\"4\"}}",
"weight": 0
},
{
"id": 299,
"group_id": 79,
"configs": "{\"name\":\"当前TERMINATED线程个数\",\"mode\":\"promethues\",\"prome_ql\":[\"jvm_threads_state{java_app=~\\\"$java_app\\\",state=\\\"TERMINATED\\\"}\"],\"layout\":{\"h\":2,\"w\":6,\"x\":6,\"y\":4,\"i\":\"5\"}}",
"weight": 0
},
{
"id": 300,
"group_id": 79,
"configs": "{\"name\":\"当前RUNNABLE线程个数\",\"mode\":\"promethues\",\"prome_ql\":[\"jvm_threads_state{java_app=~\\\"$java_app\\\",state=\\\"RUNNABLE\\\"}\"],\"layout\":{\"h\":2,\"w\":6,\"x\":12,\"y\":2,\"i\":\"6\"}}",
"weight": 0
},
{
"id": 301,
"group_id": 79,
"configs": "{\"name\":\"当前NEW线程个数\",\"mode\":\"promethues\",\"prome_ql\":[\"jvm_threads_state{java_app=~\\\"$java_app\\\",state=\\\"NEW\\\"}\"],\"layout\":{\"h\":2,\"w\":6,\"x\":18,\"y\":2,\"i\":\"7\"}}",
"weight": 0
},
{
"id": 302,
"group_id": 79,
"configs": "{\"name\":\"当前TIMED_WAITING线程个数\",\"mode\":\"promethues\",\"prome_ql\":[\"jvm_threads_state{java_app=~\\\"$java_app\\\",state=\\\"TIMED_WAITING\\\"}\"],\"layout\":{\"h\":2,\"w\":6,\"x\":0,\"y\":4,\"i\":\"8\"}}",
"weight": 0
},
{
"id": 303,
"group_id": 79,
"configs": "{\"name\":\"当前BLOCKED线程个数\",\"mode\":\"promethues\",\"prome_ql\":[\"jvm_threads_state{java_app=~\\\"$java_app\\\",state=\\\"BLOCKED\\\"}\"],\"layout\":{\"h\":2,\"w\":6,\"x\":18,\"y\":4,\"i\":\"9\"}}",
"weight": 0
},
{
"id": 304,
"group_id": 79,
"configs": "{\"name\":\"当前WAITING线程个数\",\"mode\":\"promethues\",\"prome_ql\":[\"jvm_threads_state{java_app=~\\\"$java_app\\\",state=\\\"WAITING\\\"}\"],\"layout\":{\"h\":2,\"w\":6,\"x\":12,\"y\":4,\"i\":\"10\"}}",
"weight": 0
},
{
"id": 305,
"group_id": 79,
"configs": "{\"name\":\"当前线程状态汇总\",\"mode\":\"promethues\",\"prome_ql\":[\"jvm_threads_state{java_app=~\\\"$java_app\\\"}\"],\"layout\":{\"h\":2,\"w\":6,\"x\":6,\"y\":2,\"i\":\"11\"}}",
"weight": 0
}
]
},
{
"id": 0,
"dashboard_id": 0,
"name": "加载类情况",
"weight": 6,
"charts": [
{
"id": 306,
"group_id": 80,
"configs": "{\"name\":\"jvm 当前加载的类个数 \",\"mode\":\"promethues\",\"prome_ql\":[\"jvm_classes_loaded{java_app=~\\\"$java_app\\\"}\"],\"layout\":{\"h\":2,\"w\":8,\"x\":0,\"y\":0,\"i\":\"0\"}}",
"weight": 0
},
{
"id": 307,
"group_id": 80,
"configs": "{\"name\":\"jvm启动以来加载的类总个数\",\"mode\":\"promethues\",\"prome_ql\":[\"jvm_classes_loaded_total{java_app=~\\\"$java_app\\\"}\"],\"layout\":{\"h\":2,\"w\":8,\"x\":8,\"y\":0,\"i\":\"1\"}}",
"weight": 0
},
{
"id": 308,
"group_id": 80,
"configs": "{\"name\":\"jvm启动以来卸载的类总个数\",\"mode\":\"promethues\",\"prome_ql\":[\"jvm_classes_unloaded_total{java_app=~\\\"$java_app\\\"}\"],\"layout\":{\"h\":2,\"w\":8,\"x\":16,\"y\":0,\"i\":\"2\"}}",
"weight": 0
}
]
},
{
"id": 0,
"dashboard_id": 0,
"name": "机器指标(配置了java.lang才有)",
"weight": 7,
"charts": [
{
"id": 311,
"group_id": 81,
"configs": "{\"name\":\"java进程打开fd数\",\"mode\":\"promethues\",\"prome_ql\":[\"os_open_file_descriptor_count{java_app=~\\\"$java_app\\\"}\"],\"layout\":{\"h\":2,\"w\":8,\"x\":0,\"y\":0,\"i\":\"0\"}}",
"weight": 0
},
{
"id": 312,
"group_id": 81,
"configs": "{\"name\":\"机器总内存\",\"mode\":\"promethues\",\"prome_ql\":[\"os_total_memory_size{java_app=~\\\"$java_app\\\"}\"],\"layout\":{\"h\":2,\"w\":8,\"x\":8,\"y\":0,\"i\":\"1\"}}",
"weight": 0
},
{
"id": 313,
"group_id": 81,
"configs": "{\"name\":\"机器可用内存数\",\"mode\":\"promethues\",\"prome_ql\":[\"os_free_memory_size{java_app=~\\\"$java_app\\\"}\"],\"layout\":{\"h\":2,\"w\":8,\"x\":16,\"y\":0,\"i\":\"2\"}}",
"weight": 0
},
{
"id": 314,
"group_id": 81,
"configs": "{\"name\":\"机器近期cpu使用率\",\"mode\":\"promethues\",\"link\":\"https://docs.oracle.com/javase/7/docs/jre/api/management/extension/com/sun/management/OperatingSystemMXBean.html#getSystemCpuLoad()\",\"prome_ql\":[\"100 * os_system_cpu_load{java_app=~\\\"$java_app\\\"}\"],\"layout\":{\"h\":2,\"w\":8,\"x\":0,\"y\":2,\"i\":\"3\"}}",
"weight": 0
},
{
"id": 315,
"group_id": 81,
"configs": "{\"name\":\"java进程cpu使用\",\"mode\":\"promethues\",\"link\":\"https://docs.oracle.com/javase/7/docs/jre/api/management/extension/com/sun/management/OperatingSystemMXBean.html#getProcessCpuLoad()\",\"prome_ql\":[\"os_process_cpu_load{java_app=~\\\"$java_app\\\"}\"],\"layout\":{\"h\":2,\"w\":8,\"x\":8,\"y\":2,\"i\":\"4\"}}",
"weight": 0
},
{
"id": 319,
"group_id": 81,
"configs": "{\"name\":\"jvm cpu百分比\",\"mode\":\"promethues\",\"prome_ql\":[\"100 *(os_process_cpu_load{java_app=~\\\"$java_app\\\"}/os_system_cpu_load{java_app=~\\\"$java_app\\\"})\"],\"layout\":{\"h\":2,\"w\":8,\"x\":16,\"y\":2,\"i\":\"5\"}}",
"weight": 0
}
]
}
]
}
]

View File

@ -1,42 +0,0 @@
[
{
"id": 0,
"name": "linux_host",
"tags": "",
"configs": "{\"tags\":[{\"tagName\":\"ident\",\"key\":\"ident\",\"value\":\"*\",\"prefix\":false}]}",
"chart_groups": [
{
"id": 0,
"dashboard_id": 0,
"name": "Default chart group",
"weight": 0,
"charts": [
{
"id": 1,
"group_id": 1,
"configs": "{\"name\":\"CPU使用率\",\"mode\":\"nightingale\",\"metric\":[\"system_cpu_util\"],\"tags\":{},\"layout\":{\"h\":2,\"w\":6,\"x\":0,\"y\":0,\"i\":\"0\"}}",
"weight": 0
},
{
"id": 2,
"group_id": 1,
"configs": "{\"name\":\"硬盘使用率\",\"mode\":\"nightingale\",\"metric\":[\"system_disk_used_percent\"],\"tags\":{},\"layout\":{\"h\":2,\"w\":6,\"x\":6,\"y\":0,\"i\":\"1\"}}",
"weight": 0
},
{
"id": 3,
"group_id": 1,
"configs": "{\"name\":\"内存使用率\",\"mode\":\"nightingale\",\"metric\":[\"system_mem_used_percent\"],\"tags\":{},\"layout\":{\"h\":2,\"w\":6,\"x\":12,\"y\":0,\"i\":\"2\"}}",
"weight": 0
},
{
"id": 4,
"group_id": 1,
"configs": "{\"name\":\"IO使用率\",\"mode\":\"nightingale\",\"metric\":[\"system_io_util\"],\"tags\":{},\"layout\":{\"h\":2,\"w\":6,\"x\":18,\"y\":0,\"i\":\"3\"}}",
"weight": 0
}
]
}
]
}
]

1
etc/metrics.yaml Normal file
View File

@ -0,0 +1 @@
cpu_usage_idle: CPU空闲率单位%

View File

@ -1,36 +1,12 @@
#!/usr/bin/python #!/usr/bin/env python
# -*- coding: UTF-8 -*- # -*- coding: UTF-8 -*-
#
# n9e-server把告警事件通过stdin的方式传入notify.pynotify.py从事件中解析出接收人信息、拼出通知内容发送通知
# 脚本的灵活性高要接入短信、电话、jira、飞书等都非常容易只要有接口notify.py去调用即可
#
import sys import sys
import json import json
import os import urllib2
import smtplib import smtplib
import time
import requests
from email.mime.text import MIMEText from email.mime.text import MIMEText
from email.header import Header
from bottle import template
reload(sys) # reload 才能调用 setdefaultencoding 方法 notify_channel_funcs = {
sys.setdefaultencoding('utf-8') # 设置 'utf-8'
################################
## 邮件告警,修改下面的配置 ##
################################
mail_host = "smtp.163.com"
mail_port = 994
mail_user = "ulricqin"
mail_pass = "password"
mail_from = "ulricqin@163.com"
# 本地告警event json存储目录
LOCAL_EVENT_FILE_DIR = ".alerts"
NOTIFY_CHANNELS_SPLIT_STR = " "
NOTIFY_CHANNEL_DICT = {
"email":"email", "email":"email",
"sms":"sms", "sms":"sms",
"voice":"voice", "voice":"voice",
@ -38,290 +14,149 @@ NOTIFY_CHANNEL_DICT = {
"wecom":"wecom" "wecom":"wecom"
} }
# stdin 告警json实例 mail_host = "smtp.163.com"
TEST_ALERT_JSON = { mail_port = 994
"event": { mail_user = "ulricqin"
"alert_duration": 10, mail_pass = "password"
"notify_channels": "dingtalk", mail_from = "ulricqin@163.com"
"res_classpaths": "all",
"id": 4,
"notify_group_objs": None,
"rule_note": "",
"history_points": [
{
"metric": "go_goroutines",
"points": [
{
"t": 1625213114,
"v": 33.0
}
],
"tags": {
"instance": "localhost:9090",
"job": "prometheus"
}
}
],
"priority": 1,
"last_sent": True,
"tag_map": {
"instance": "localhost:9090",
"job": "prometheus"
},
"hash_id": "ecb258d2ca03454ee390a352913c461b",
"status": 0,
"tags": "instance=localhost:9090 job=prometheus",
"trigger_time": 1625213114,
"res_ident": "ident1",
"rule_name": "alert_test",
"is_prome_pull": 1,
"notify_users": "1",
"notify_groups": "",
"runbook_url": "",
"values": "[vector={__name__=\"go_goroutines\", instance=\"localhost:9090\", job=\"prometheus\"}]: [value=33.000000]",
"readable_expression": "go_goroutines>0",
"notify_user_objs": None,
"is_recovery": 1,
"rule_id": 1
},
"rule": {
"alert_duration": 10,
"notify_channels": "dingtalk",
"enable_stime": "00:00",
"id": 1,
"note": "",
"create_by": "root",
"append_tags": "",
"priority": 1,
"update_by": "root",
"type": 1,
"status": 0,
"recovery_notify": 0,
"enable_days_of_week": "1 2 3 4 5 6 7",
"callbacks": "localhost:10000",
"notify_users": "1",
"notify_groups": "",
"runbook_url": "",
"name": "a",
"update_at": 1625211576,
"create_at": 1625211576,
"enable_etime": "23:59",
"group_id": 1,
"expression": {
"evaluation_interval": 4,
"promql": "go_goroutines>0"
}
},
"users": [
{
"username": "root",
"status": 0,
"contacts": {
"dingtalk_robot_token": "xxxxxx"
},
"create_by": "system",
"update_at": 1625211432,
"create_at": 1624871926,
"email": "",
"phone": "",
"role": "Admin",
"update_by": "root",
"portrait": "",
"nickname": "\u8d85\u7ba1",
"id": 1
}
]
}
class Sender(object):
def main():
payload = json.load(sys.stdin)
trigger_time = payload['event']['trigger_time']
event_id = payload['event']['id']
rule_id = payload['rule']['id']
notify_channels = payload['event'].get('notify_channels').strip().split(NOTIFY_CHANNELS_SPLIT_STR)
if len(notify_channels) == 0:
msg = "notify_channels_empty"
print(msg)
return
# 持久化到本地json文件
persist(payload, rule_id, event_id, trigger_time)
# 生成告警内容
alert_content = sms_content_gen(values_gen(payload))
for ch in notify_channels:
send_func_name = "send_{}".format(NOTIFY_CHANNEL_DICT.get(ch.strip()))
has_func = hasattr(Send, send_func_name)
if not has_func:
msg = "[send_func_name_err][func_not_found_in_Send_class:{}]".format(send_func_name)
print(msg)
continue
send_func = getattr(Send, send_func_name)
send_func(alert_content, payload)
def values_gen(payload):
event_obj = payload.get("event")
values = {
"IsAlert": event_obj.get("is_recovery") == 0,
"IsMachineDep": event_obj.get("res_classpaths") != "",
"Status": status_gen(event_obj.get("priority"),event_obj.get("is_recovery")),
"Sname": event_obj.get("rule_name"),
"Ident": event_obj.get("res_ident"),
"Classpath": event_obj.get("res_classpaths"),
"Metric": metric_gen(event_obj.get("history_points")),
"Tags": event_obj.get("tags"),
"Value": event_obj.get("values"),
"ReadableExpression": event_obj.get("readable_expression"),
"TriggerTime": time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(event_obj.get("trigger_time"))),
"Elink": "http://n9e.didiyun.com/strategy/edit/{}".format(event_obj.get("rule_id")),
"Slink": "http://n9e.didiyun.com/event/{}".format(event_obj.get("id"))
}
return values
def email_content_gen(values):
return template('etc/script/tpl/mail.tpl', values)
def sms_content_gen(values):
return template('etc/script/tpl/sms.tpl', values)
def status_gen(priority,is_recovery):
is_recovery_str_m = {1: "恢复", 0: "告警"}
status = "P{} {}".format(priority, is_recovery_str_m.get(is_recovery))
return status
def subject_gen(priority,is_recovery,rule_name):
is_recovery_str_m = {1: "恢复", 0: "告警"}
subject = "P{} {} {}".format(priority, is_recovery_str_m.get(is_recovery), rule_name)
return subject
def metric_gen(history_points):
metrics = []
for item in history_points:
metrics.append(item.get("metric"))
return ",".join(metrics)
def persist(payload, rule_id, event_id, trigger_time):
if not os.path.exists(LOCAL_EVENT_FILE_DIR):
os.makedirs(LOCAL_EVENT_FILE_DIR)
filename = '%d_%d_%d' % (rule_id, event_id, trigger_time)
filepath = os.path.join(LOCAL_EVENT_FILE_DIR, filename)
with open(filepath, 'w') as f:
f.write(json.dumps(payload, indent=4))
class Send(object):
@classmethod @classmethod
def send_email(cls, alert_content, payload): def send_email(cls, payload):
users = payload.get("users") users = payload.get('event').get("notify_users_obj")
emails = [x.get("email") for x in users]
emails = {}
for u in users:
if u.get("email"):
emails[u.get("email")] = 1
if not emails: if not emails:
return return
recipients = emails recipients = emails.keys()
mail_body = email_content_gen(values_gen(payload)) mail_body = payload.get('tpls').get("mailbody.tpl", "mailbody.tpl not found")
message = MIMEText(mail_body, 'html', 'utf-8') message = MIMEText(mail_body, 'html', 'utf-8')
message['From'] = mail_from message['From'] = mail_from
message['To'] = ", ".join(recipients) message['To'] = ", ".join(recipients)
message["Subject"] = subject_gen(payload.get("event").get("priority"),payload.get("event").get("is_recovery"),payload.get("event").get("rule_name")) message["Subject"] = payload.get('tpls').get("subject.tpl", "subject.tpl not found")
try:
smtp = smtplib.SMTP_SSL(mail_host, mail_port) smtp = smtplib.SMTP_SSL(mail_host, mail_port)
smtp.login(mail_user, mail_pass) smtp.login(mail_user, mail_pass)
smtp.sendmail(mail_from, recipients, message.as_string()) smtp.sendmail(mail_from, recipients, message.as_string())
smtp.close() smtp.close()
except smtplib.SMTPException, error:
print("send_mail_success") print(error)
@classmethod @classmethod
def send_wecom(cls, alert_content, payload): def send_wecom(cls, payload):
users = payload.get("users") users = payload.get('event').get("notify_users_obj")
tokens = {}
for u in users: for u in users:
contacts = u.get("contacts") contacts = u.get("contacts")
wecom_robot_token = contacts.get("wecom_robot_token", "") if contacts.get("wecom_robot_token", ""):
tokens[contacts.get("wecom_robot_token", "")] = 1
if wecom_robot_token == "": opener = urllib2.build_opener(urllib2.HTTPHandler())
continue method = "POST"
wecom_api_url = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key={}".format(wecom_robot_token) for t in tokens:
atMobiles = [u.get("phone")] url = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key={}".format(t)
headers = {'Content-Type': 'application/json;charset=utf-8'} body = {
payload = { "msgtype": "markdown",
"msgtype": "text", "markdown": {
"text": { "content": payload.get('tpls').get("wecom.tpl", "wecom.tpl not found")
"content": alert_content
},
"at": {
"atMobiles": atMobiles,
"isAtAll": False
} }
} }
res = requests.post(wecom_api_url, json.dumps(payload), headers=headers) request = urllib2.Request(url, data=json.dumps(body))
print(res.status_code) request.add_header("Content-Type",'application/json;charset=utf-8')
print(res.text) request.get_method = lambda: method
print("send_wecom") try:
connection = opener.open(request)
print(connection.read())
except urllib2.HTTPError, error:
print(error)
@classmethod @classmethod
def send_dingtalk(cls, alert_content, payload): def send_dingtalk(cls, payload):
# 钉钉发群信息需要群的webhook机器人 token这个信息可以在user的contacts map中 users = payload.get('event').get("notify_users_obj")
users = payload.get("users") tokens = {}
phones = {}
for u in users: for u in users:
if u.get("phone"):
phones[u.get("phone")] = 1
contacts = u.get("contacts") contacts = u.get("contacts")
if contacts.get("dingtalk_robot_token", ""):
tokens[contacts.get("dingtalk_robot_token", "")] = 1
dingtalk_robot_token = contacts.get("dingtalk_robot_token", "") opener = urllib2.build_opener(urllib2.HTTPHandler())
method = "POST"
if dingtalk_robot_token == "": for t in tokens:
print("dingtalk_robot_token_not_found") url = "https://oapi.dingtalk.com/robot/send?access_token={}".format(t)
continue body = {
dingtalk_api_url = "https://oapi.dingtalk.com/robot/send?access_token={}".format(dingtalk_robot_token)
atMobiles = [u.get("phone")]
headers = {'Content-Type': 'application/json;charset=utf-8'}
payload = {
"msgtype": "text", "msgtype": "text",
"text": { "text": {
"content": alert_content "content": payload.get('tpls').get("dingtalk.tpl", "dingtalk.tpl not found")
}, },
"at": { "at": {
"atMobiles": atMobiles, "atMobiles": phones.keys(),
"isAtAll": False "isAtAll": False
} }
} }
res = requests.post(dingtalk_api_url, json.dumps(payload), headers=headers) request = urllib2.Request(url, data=json.dumps(body))
print(res.status_code) request.add_header("Content-Type",'application/json;charset=utf-8')
print(res.text) request.get_method = lambda: method
try:
connection = opener.open(request)
print(connection.read())
except urllib2.HTTPError, error:
print(error)
print("send_dingtalk") @classmethod
def send_sms(cls, payload):
users = payload.get('event').get("notify_users_obj")
phones = {}
for u in users:
if u.get("phone"):
phones[u.get("phone")] = 1
if phones:
print("send_sms not implemented, phones: {}".format(phones.keys()))
@classmethod
def send_voice(cls, payload):
users = payload.get('event').get("notify_users_obj")
phones = {}
for u in users:
if u.get("phone"):
phones[u.get("phone")] = 1
if phones:
print("send_voice not implemented, phones: {}".format(phones.keys()))
def mail_test(): def main():
print("mail_test_todo") payload = json.load(sys.stdin)
with open(".payload", 'w') as f:
f.write(json.dumps(payload, indent=4))
for ch in payload.get('event').get('notify_channels'):
send_func_name = "send_{}".format(notify_channel_funcs.get(ch.strip()))
if not hasattr(Sender, send_func_name):
print("function: {} not found", send_func_name)
continue
send_func = getattr(Sender, send_func_name)
send_func(payload)
recipients = ["ulricqin@qq.com", "ulric@163.com"] def hello():
print("hello nightingale")
payload = json.loads(json.dumps(TEST_ALERT_JSON))
mail_body = email_content_gen(values_gen(payload))
message = MIMEText(mail_body, 'html', 'utf-8')
message['From'] = mail_from
message['To'] = ", ".join(recipients)
message["Subject"] = subject_gen(payload.get("event").get("priority"),payload.get("event").get("is_recovery"),payload.get("event").get("rule_name"))
smtp = smtplib.SMTP_SSL(mail_host, mail_port)
smtp.login(mail_user, mail_pass)
smtp.sendmail(mail_from, recipients, message.as_string())
smtp.close()
print("mail_test_done")
if __name__ == "__main__": if __name__ == "__main__":
if len(sys.argv) == 1: if len(sys.argv) == 1:
main() main()
elif sys.argv[1] == "mail": elif sys.argv[1] == "hello":
mail_test() hello()
else: else:
print("I am confused") print("I am confused")

View File

@ -1,14 +0,0 @@
级别状态:{{Status}}
策略名称:{{Sname}}
% if IsMachineDep:
告警设备:{{Ident}}
挂载节点:{{Classpath}}
% end
监控指标:{{Metric}}
指标标签:{{Tags}}
当前值:{{!Value}}
报警说明:{{!ReadableExpression}}
触发时间:{{TriggerTime}}
报警详情:{{Elink}}
报警策略:{{Slink}}

188
etc/server.conf Normal file
View File

@ -0,0 +1,188 @@
# debug, release
RunMode = "release"
# my cluster name
ClusterName = "Default"
[Log]
# log write dir
Dir = "logs"
# log level: DEBUG INFO WARNING ERROR
Level = "INFO"
# stdout, stderr, file
Output = "stdout"
# # rotate by time
# KeepHours = 4
# # rotate by size
# RotateNum = 3
# # unit: MB
# RotateSize = 256
[HTTP]
# http listening address
Host = "0.0.0.0"
# http listening port
Port = 19000
# https cert file path
CertFile = ""
# https key file path
KeyFile = ""
# whether print access log
PrintAccessLog = false
# whether enable pprof
PProf = false
# http graceful shutdown timeout, unit: s
ShutdownTimeout = 30
# max content length: 64M
MaxContentLength = 67108864
# http server read timeout, unit: s
ReadTimeout = 20
# http server write timeout, unit: s
WriteTimeout = 40
# http server idle timeout, unit: s
IdleTimeout = 120
# [BasicAuth]
# user002 = "ccc26da7b9aba533cbb263a36c07dcc9"
[Heartbeat]
# auto detect if blank
IP = ""
# unit ms
Interval = 1000
[Alerting]
NotifyScriptPath = "./etc/script/notify.py"
NotifyConcurrency = 100
[Alerting.RedisPub]
Enable = false
# complete redis key: ${ChannelPrefix} + ${Cluster}
ChannelPrefix = "/alerts/"
[NoData]
Metric = "target_up"
# unit: second
Interval = 15
[Ibex]
# callback: ${ibex}/${tplid}/${host}
Address = "127.0.0.1:10090"
# basic auth
BasicAuthUser = "ibex"
BasicAuthPass = "ibex"
# unit: ms
Timeout = 3000
[Redis]
# address, ip:port
Address = "127.0.0.1:6379"
# requirepass
Password = ""
# # db
# DB = 0
[Gorm]
# enable debug mode or not
Debug = false
# mysql postgres
DBType = "mysql"
# unit: s
MaxLifetime = 7200
# max open connections
MaxOpenConns = 150
# max idle connections
MaxIdleConns = 50
# table prefix
TablePrefix = ""
# enable auto migrate or not
EnableAutoMigrate = false
[MySQL]
# mysql address host:port
Address = "127.0.0.1:3306"
# mysql username
User = "root"
# mysql password
Password = "1234"
# database name
DBName = "n9e_v5"
# connection params
Parameters = "charset=utf8mb4&parseTime=True&loc=Local&allowNativePasswords=true"
[Postgres]
# pg address host:port
Address = "127.0.0.1:5432"
# pg user
User = "root"
# pg password
Password = "1234"
# database name
DBName = "n9e_v5"
# ssl mode
SSLMode = "disable"
[Reader]
# prometheus base url
Url = "http://127.0.0.1:9090"
# Basic auth username
BasicAuthUser = ""
# Basic auth password
BasicAuthPass = ""
# timeout settings, unit: ms
Timeout = 30000
DialTimeout = 10000
TLSHandshakeTimeout = 30000
ExpectContinueTimeout = 1000
IdleConnTimeout = 90000
# time duration, unit: ms
KeepAlive = 30000
MaxConnsPerHost = 0
MaxIdleConns = 100
MaxIdleConnsPerHost = 10
[WriterOpt]
# queue max size
QueueMaxSize = 10000000
# once pop samples number from queue
QueuePopSize = 2000
# unit: ms
SleepInterval = 50
[[Writers]]
Name = "prom"
Url = "http://127.0.0.1:9090/api/v1/write"
# Basic auth username
BasicAuthUser = ""
# Basic auth password
BasicAuthPass = ""
# timeout settings, unit: ms
Timeout = 30000
DialTimeout = 10000
TLSHandshakeTimeout = 30000
ExpectContinueTimeout = 1000
IdleConnTimeout = 90000
# time duration, unit: ms
KeepAlive = 30000
MaxConnsPerHost = 0
MaxIdleConns = 100
MaxIdleConnsPerHost = 100
# [[Writers]]
# Name = "m3db"
# Url = "http://127.0.0.1:7201/api/v1/prom/remote/write"
# # Basic auth username
# BasicAuthUser = ""
# # Basic auth password
# BasicAuthPass = ""
# timeout settings, unit: ms
# Timeout = 30000
# DialTimeout = 10000
# TLSHandshakeTimeout = 30000
# ExpectContinueTimeout = 1000
# IdleConnTimeout = 90000
# # time duration, unit: ms
# KeepAlive = 30000
# MaxConnsPerHost = 0
# MaxIdleConns = 100
# MaxIdleConnsPerHost = 100

View File

@ -1,126 +0,0 @@
logger:
dir: logs
level: DEBUG
# # rotate by time
# keepHours: 4
# rotate by size
rotatenum: 3
rotatesize: 256 # unit: MB
http:
mode: release
# whether print access log to DEBUG.log
access: false
listen: 0.0.0.0:8000
pprof: false
cookieName: n9e
cookieDomain: ""
cookieMaxAge: 86400
cookieSecure: false
cookieHttpOnly: true
cookieSecret: 4696709ab8cc3ff2fea17b930158516b
csrfSecret: 15b8ea164b5d3d9254677053c72a19f1
rpc:
listen: 0.0.0.0:9000
mysql:
addr: "root:1234@tcp(127.0.0.1:3306)/n9e?charset=utf8&parseTime=True&loc=Asia%2FShanghai"
max: 128
idle: 16
debug: false
# i18n:
# # zh | en
# lang: zh
# dictPath: etc/i18n.json
# heartbeat:
# # auto detect if blank
# ip: ""
# # unit: ms
# interval: 1000
# ldap:
# enable: false
# host: ldap.example.org
# port: 389
# baseDn: "dc=example,dc=org"
# # AD: manange@example.org
# bindUser: "cn=manager,dc=example,dc=org"
# bindPass: "*******"
# # openldap: (&(uid=%s))
# # AD: (&(sAMAccountName=%s))
# authFilter: "(&(uid=%s))"
# attributes:
# nickname: "cn"
# email: "mail"
# phone: "mobile"
# coverAttributes: false
# autoRegist: true
# tls: false
# startTLS: false
# judge:
# readBatch: 2000
# connTimeout: 2000
# callTimeout: 5000
# writerNum: 256
# connMax: 2560
# connIdle: 256
# alert:
# notifyScriptPath: ./etc/script/notify.py
# notifyConcurrency: 200
# mutedAlertPersist: true
trans:
enable: true
backend:
datasource: "prometheus"
prometheus:
enable: true
name: prometheus
batch: 100000
maxRetry: 5
# prometheus 查询返回最大点数query.max-samples
maxSamples: 50000000
# prometheus并发的查询 query.max-concurrency
maxConcurrentQuery: 20
# prometheus 回查窗口 query.lookback-delta
lookbackDeltaMinute: 2
# 查询全量索引时时间窗口限制,降低高基数
maxFetchAllSeriesLimitMinute: 5
# 查询接口耗时超过多少秒就打印warning日志记录
slowLogRecordSecond: 3
# remote_read时如果没有查询条件则用这条默认的ql查询
# 注意! ql匹配series越多造成的oom或者慢查询可能越大
defaultFetchSeriesQl: '{__name__=~"system.*"}'
remoteWrite:
# m3db的配置
#- name: m3db01
# url: http://localhost:7201/api/v1/prom/remote/write
# remoteTimeoutSecond: 5
# prometheus的配置
- name: prome01
url: http://localhost:9090/api/v1/write
remoteTimeoutSecond: 5
remoteRead:
- name: prome01
url: http://localhost:9090/api/v1/read
remoteTimeoutSecond: 5
contactKeys:
- label: "Wecom Robot Token"
key: wecom_robot_token
- label: "Dingtalk Robot Token"
key: dingtalk_robot_token
notifyChannels:
- email
- sms
- voice
- dingtalk
- wecom

View File

@ -1,13 +1,14 @@
[Unit] [Unit]
Description="n9e-server" Description="n9e-server"
After=network.target
[Service] [Service]
Type=simple Type=simple
ExecStart=/opt/n9e/server/n9e-server
WorkingDirectory=/opt/n9e/server
Restart=always ExecStart=/root/gopath/src/n9e/n9e server
RestartSecs=1s WorkingDirectory=/root/gopath/src/n9e
Restart=on-failure
SuccessExitStatus=0 SuccessExitStatus=0
LimitNOFILE=65536 LimitNOFILE=65536
StandardOutput=syslog StandardOutput=syslog

View File

@ -0,0 +1,20 @@
[Unit]
Description="n9e-webapi"
After=network.target
[Service]
Type=simple
ExecStart=/root/gopath/src/n9e/n9e webapi
WorkingDirectory=/root/gopath/src/n9e
Restart=on-failure
SuccessExitStatus=0
LimitNOFILE=65536
StandardOutput=syslog
StandardError=syslog
SyslogIdentifier=n9e-webapi
[Install]
WantedBy=multi-user.target

View File

@ -0,0 +1,6 @@
级别状态: S{{.Severity}} {{if .IsRecovered}}Recovered{{else}}Triggered{{end}}
规则名称: {{.RuleName}}{{if .RuleNote}}
规则备注: {{.RuleNote}}{{end}}
监控指标: {{.TagsJSON}}
触发时间: {{timeformat .TriggerTime}}
触发时值: {{.TriggerValue}}

195
etc/template/mailbody.tpl Normal file
View File

@ -0,0 +1,195 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="ie=edge">
<title>夜莺告警通知</title>
<style type="text/css">
.wrapper {
background-color: #f8f8f8;
padding: 15px;
height: 100%;
}
.main {
width: 600px;
padding: 30px;
margin: 0 auto;
background-color: #fff;
font-size: 12px;
font-family: verdana,'Microsoft YaHei',Consolas,'Deja Vu Sans Mono','Bitstream Vera Sans Mono';
}
header {
border-radius: 2px 2px 0 0;
}
header .title {
font-size: 16px;
color: #333333;
margin: 0;
}
header .sub-desc {
color: #333;
font-size: 14px;
margin-top: 6px;
margin-bottom: 0;
}
hr {
margin: 20px 0;
height: 0;
border: none;
border-top: 1px solid #e5e5e5;
}
em {
font-weight: 600;
}
table {
margin: 20px 0;
width: 100%;
}
table tbody tr{
font-weight: 200;
font-size: 12px;
color: #666;
height: 32px;
}
.succ {
background-color: green;
color: white;
}
.fail {
background-color: red;
color: white;
}
table tbody tr th {
width: 80px;
text-align: right;
}
.text-right {
text-align: right;
}
.body {
margin-top: 24px;
}
.body-text {
color: #666666;
-webkit-font-smoothing: antialiased;
}
.body-extra {
-webkit-font-smoothing: antialiased;
}
.body-extra.text-right a {
text-decoration: none;
color: #333;
}
.body-extra.text-right a:hover {
color: #666;
}
.button {
width: 200px;
height: 50px;
margin-top: 20px;
text-align: center;
border-radius: 2px;
background: #2D77EE;
line-height: 50px;
font-size: 20px;
color: #FFFFFF;
cursor: pointer;
}
.button:hover {
background: rgb(25, 115, 255);
border-color: rgb(25, 115, 255);
color: #fff;
}
footer {
margin-top: 10px;
text-align: right;
}
.footer-logo {
text-align: right;
}
.footer-logo-image {
width: 108px;
height: 27px;
margin-right: 10px;
}
.copyright {
margin-top: 10px;
font-size: 12px;
text-align: right;
color: #999;
-webkit-font-smoothing: antialiased;
}
</style>
</head>
<body>
<div class="wrapper">
<div class="main">
<header>
<h3 class="title">{{.RuleName}}</h3>
<p class="sub-desc"></p>
</header>
<hr>
<div class="body">
<table cellspacing="0" cellpadding="0" border="0">
<tbody>
{{if .IsRecovered}}
<tr class="succ">
<th>级别状态:</th>
<td>S{{.Severity}} Recovered</td>
</tr>
{{else}}
<tr class="fail">
<th>级别状态:</th>
<td>S{{.Severity}} Triggered</td>
</tr>
{{end}}
<tr>
<th>策略备注:</th>
<td>{{.RuleNote}}</td>
</tr>
<tr>
<th>设备备注:</th>
<td>{{.TargetNote}}</td>
</tr>
<tr>
<th>监控指标:</th>
<td>{{.TagsJSON}}</td>
</tr>
<tr>
<th>触发时值:</th>
<td>{{.TriggerValue}}</td>
</tr>
<tr>
<th>触发时间:</th>
<td>
{{timeformat .TriggerTime}}
</td>
</tr>
<tr>
<th>PromQL</th>
<td>
{{.PromQl}}
</td>
</tr>
</tbody>
</table>
<hr>
<footer>
<div class="copyright" style="font-style: italic">
我们希望与您一起,将监控这个事情,做到极致!
</div>
</footer>
</div>
</div>
</div>
</body>
</html>

1
etc/template/subject.tpl Normal file
View File

@ -0,0 +1 @@
{{if .IsRecovered}}Recovered{{else}}Triggered{{end}}: {{.RuleName}} {{.TagsJSON}}

6
etc/template/wecom.tpl Normal file
View File

@ -0,0 +1,6 @@
**级别状态**: {{if .IsRecovered}}<font color="info">S{{.Severity}} Recovered</font>{{else}}<font color="warning">S{{.Severity}} Triggered</font>{{end}}
**规则标题**: {{.RuleName}}{{if .RuleNote}}
**规则备注**: {{.RuleNote}}{{end}}
**监控指标**: {{.TagsJSON}}
**触发时间**: {{timeformat .TriggerTime}}
**触发时值**: {{.TriggerValue}}

166
etc/webapi.conf Normal file
View File

@ -0,0 +1,166 @@
# debug, release
RunMode = "release"
# # custom i18n dict config
# I18N = "./etc/i18n.json"
# do not change
AdminRole = "Admin"
# Linkage with notify.py script
NotifyChannels = [ "email", "dingtalk", "wecom" ]
[[ContactKeys]]
Label = "Wecom Robot Token"
Key = "wecom_robot_token"
[[ContactKeys]]
Label = "Dingtalk Robot Token"
Key = "dingtalk_robot_token"
[Log]
# log write dir
Dir = "logs"
# log level: DEBUG INFO WARNING ERROR
Level = "DEBUG"
# stdout, stderr, file
Output = "stdout"
# # rotate by time
# KeepHours = 4
# # rotate by size
# RotateNum = 3
# # unit: MB
# RotateSize = 256
[HTTP]
# http listening address
Host = "0.0.0.0"
# http listening port
Port = 18000
# https cert file path
CertFile = ""
# https key file path
KeyFile = ""
# whether print access log
PrintAccessLog = true
# whether enable pprof
PProf = false
# http graceful shutdown timeout, unit: s
ShutdownTimeout = 30
# max content length: 64M
MaxContentLength = 67108864
# http server read timeout, unit: s
ReadTimeout = 20
# http server write timeout, unit: s
WriteTimeout = 40
# http server idle timeout, unit: s
IdleTimeout = 120
[JWTAuth]
# signing key
SigningKey = "5b94a0fd640fe2765af826acfe42d151"
# unit: min
AccessExpired = 1500
# unit: min
RefreshExpired = 10080
RedisKeyPrefix = "/jwt/"
[BasicAuth]
user001 = "ccc26da7b9aba533cbb263a36c07dcc5"
[LDAP]
Enable = false
Host = "ldap.example.org"
Port = 389
BaseDn = "dc=example,dc=org"
# AD: manange@example.org
BindUser = "cn=manager,dc=example,dc=org"
BindPass = "*******"
# openldap format e.g. (&(uid=%s))
# AD format e.g. (&(sAMAccountName=%s))
AuthFilter = "(&(uid=%s))"
CoverAttributes = true
TLS = false
StartTLS = true
[LDAP.Attributes]
Nickname = "cn"
Phone = "mobile"
Email = "mail"
[Redis]
# address, ip:port
Address = "127.0.0.1:6379"
# requirepass
Password = ""
# # db
# DB = 0
[Gorm]
# enable debug mode or not
Debug = true
# mysql postgres
DBType = "mysql"
# unit: s
MaxLifetime = 7200
# max open connections
MaxOpenConns = 150
# max idle connections
MaxIdleConns = 50
# table prefix
TablePrefix = ""
# enable auto migrate or not
EnableAutoMigrate = false
[MySQL]
# mysql address host:port
Address = "127.0.0.1:3306"
# mysql username
User = "root"
# mysql password
Password = "1234"
# database name
DBName = "n9e_v5"
# connection params
Parameters = "charset=utf8mb4&parseTime=True&loc=Local&allowNativePasswords=true"
[Postgres]
# pg address host:port
Address = "127.0.0.1:5432"
# pg user
User = "root"
# pg password
Password = "1234"
# database name
DBName = "n9e_v5"
# ssl mode
SSLMode = "disable"
[[Clusters]]
# Prometheus cluster name
Name = "Default"
# Prometheus APIs base url
Prom = "http://127.0.0.1:9090"
# Basic auth username
BasicAuthUser = ""
# Basic auth password
BasicAuthPass = ""
# timeout settings, unit: ms
Timeout = 30000
DialTimeout = 10000
TLSHandshakeTimeout = 30000
ExpectContinueTimeout = 1000
IdleConnTimeout = 90000
# time duration, unit: ms
KeepAlive = 30000
MaxConnsPerHost = 0
MaxIdleConns = 100
MaxIdleConnsPerHost = 100
[Ibex]
Address = "http://127.0.0.1:10090"
# basic auth
BasicAuthUser = "ibex"
BasicAuthPass = "ibex"
# unit: ms
Timeout = 3000

70
go.mod
View File

@ -3,53 +3,31 @@ module github.com/didi/nightingale/v5
go 1.14 go 1.14
require ( require (
github.com/armon/go-metrics v0.3.4 // indirect github.com/dgrijalva/jwt-go v3.2.0+incompatible
github.com/gin-contrib/gzip v0.0.3 github.com/fatih/camelcase v1.0.0 // indirect
github.com/fatih/structs v1.1.0 // indirect
github.com/gin-contrib/pprof v1.3.0 github.com/gin-contrib/pprof v1.3.0
github.com/gin-contrib/sessions v0.0.3 github.com/gin-gonic/gin v1.7.4
github.com/gin-gonic/gin v1.7.0 github.com/go-ldap/ldap/v3 v3.4.1
github.com/go-kit/kit v0.10.0 github.com/go-redis/redis/v8 v8.11.3
github.com/go-ldap/ldap/v3 v3.2.4 github.com/golang/protobuf v1.5.2
github.com/go-sql-driver/mysql v1.5.0 github.com/golang/snappy v0.0.4
github.com/gogo/protobuf v1.3.2 github.com/google/uuid v1.3.0
github.com/golang/snappy v0.0.3 github.com/json-iterator/go v1.1.12
github.com/gopherjs/gopherjs v0.0.0-20190910122728-9d188e94fb99 // indirect github.com/koding/multiconfig v0.0.0-20171124222453-69c27309b2d7
github.com/gorilla/sessions v1.2.0 // indirect
github.com/hashicorp/go-immutable-radix v1.2.0 // indirect
github.com/hashicorp/go-msgpack v0.5.5 // indirect
github.com/hashicorp/go-uuid v1.0.2 // indirect
github.com/hashicorp/golang-lru v0.5.4 // indirect
github.com/hashicorp/hcl v1.0.1-0.20190611123218-cf7d376da96d // indirect
github.com/magiconair/properties v1.8.2 // indirect
github.com/mattn/go-isatty v0.0.12 github.com/mattn/go-isatty v0.0.12
github.com/n9e/agent-payload v0.0.0-20210619031503-b72325474651 github.com/orcaman/concurrent-map v0.0.0-20210501183033-44dafcb38ecc
github.com/opentracing-contrib/go-stdlib v1.0.0
github.com/opentracing/opentracing-go v1.2.0
github.com/orcaman/concurrent-map v0.0.0-20210106121528-16402b402231
github.com/pkg/errors v0.9.1 github.com/pkg/errors v0.9.1
github.com/prometheus/client_golang v1.9.0 github.com/prometheus/client_golang v1.11.0
github.com/prometheus/common v0.17.0 github.com/prometheus/common v0.26.0
github.com/prometheus/prometheus v1.8.2-0.20210220213500-8c8de46003d1 github.com/prometheus/prometheus v2.5.0+incompatible
github.com/smartystreets/assertions v1.0.0 // indirect github.com/toolkits/pkg v1.2.9
github.com/spaolacci/murmur3 v1.1.0 // indirect github.com/urfave/cli/v2 v2.3.0
github.com/spf13/cast v1.3.1-0.20190531151931-f31dc0aaab5a // indirect golang.org/x/net v0.0.0-20210805182204-aaa1db679c0d // indirect
github.com/spf13/jwalterweatherman v1.1.0 // indirect golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e // indirect
github.com/spf13/viper v1.7.1 google.golang.org/genproto v0.0.0-20211007155348-82e027067bd4 // indirect
github.com/subosito/gotenv v1.2.1-0.20190917103637-de67a6614a4d // indirect google.golang.org/grpc v1.41.0 // indirect
github.com/toolkits/pkg v1.1.3 gorm.io/driver/mysql v1.1.2
github.com/ugorji/go/codec v1.1.7 gorm.io/driver/postgres v1.1.1
go.uber.org/atomic v1.7.0 gorm.io/gorm v1.21.15
go.uber.org/automaxprocs v1.4.0 // indirect
golang.org/x/text v0.3.5
gopkg.in/ini.v1 v1.51.1 // indirect
xorm.io/builder v0.3.7
xorm.io/xorm v1.0.7
) )
// branch 0.9.3-pool-read-binary-3
replace github.com/apache/thrift => github.com/m3db/thrift v0.0.0-20190820191926-05b5a2227fe4
// Fix legacy import path - https://github.com/uber-go/atomic/pull/60
replace github.com/uber-go/atomic => github.com/uber-go/atomic v1.4.0
replace google.golang.org/grpc => google.golang.org/grpc v1.26.0

1328
go.sum

File diff suppressed because it is too large Load Diff

View File

@ -1,425 +0,0 @@
package http
import (
"fmt"
"net/http"
"strconv"
"strings"
"github.com/gin-contrib/sessions"
"github.com/gin-gonic/gin"
"github.com/didi/nightingale/v5/models"
"github.com/didi/nightingale/v5/pkg/i18n"
"github.com/didi/nightingale/v5/pkg/ierr"
)
const defaultLimit = 20
func _e(format string, a ...interface{}) error {
return fmt.Errorf(_s(format, a...))
}
// _s renders format with args a through the i18n translation layer and
// returns the localized string.
func _s(format string, a ...interface{}) string {
	msg := i18n.Sprintf(format, a...)
	return msg
}
// dangerous delegates to ierr.Dangerous, which aborts request handling when
// v carries an error; code optionally overrides the HTTP status used.
// NOTE(review): exact abort semantics (panic vs. flag) live in pkg/ierr —
// confirm there.
func dangerous(v interface{}, code ...int) {
	ierr.Dangerous(v, code...)
}
func bomb(code int, format string, a ...interface{}) {
ierr.Bomb(code, _s(format, a...))
}
// bind unmarshals the request's JSON body into ptr, aborting the request
// with 400 Bad Request when the body cannot be bound.
func bind(c *gin.Context, ptr interface{}) {
	err := c.ShouldBindJSON(ptr)
	dangerous(err, http.StatusBadRequest)
}
// urlParamStr returns the named URL path parameter, aborting the request
// with 400 Bad Request when the parameter is absent or empty.
func urlParamStr(c *gin.Context, field string) string {
	value := c.Param(field)
	if len(value) == 0 {
		bomb(http.StatusBadRequest, "url param[%s] is blank", field)
	}
	return value
}
// urlParamInt64 reads the named URL path parameter and parses it as a
// base-10 int64, aborting with 400 Bad Request when it is blank or not a
// valid integer.
func urlParamInt64(c *gin.Context, field string) int64 {
	raw := urlParamStr(c, field)
	parsed, err := strconv.ParseInt(raw, 10, 64)
	if err != nil {
		bomb(http.StatusBadRequest, "cannot convert %s to int64", raw)
	}
	return parsed
}
// urlParamInt is the int-typed convenience wrapper around urlParamInt64;
// it carries the same abort-on-invalid behavior.
func urlParamInt(c *gin.Context, field string) int {
	v := urlParamInt64(c, field)
	return int(v)
}
// queryStr reads the query-string parameter key. A non-empty value is
// returned as-is; otherwise the first defaultVal is returned, and when no
// default was supplied the request is aborted with 400 Bad Request.
func queryStr(c *gin.Context, key string, defaultVal ...string) string {
	value := c.Query(key)
	if value != "" {
		return value
	}
	if len(defaultVal) == 0 {
		bomb(http.StatusBadRequest, "query param[%s] is necessary", key)
	}
	return defaultVal[0]
}
func queryInt(c *gin.Context, key string, defaultVal ...int) int {
strv := c.Query(key)
if strv != "" {
intv, err := strconv.Atoi(strv)
if err != nil {
bomb(http.StatusBadRequest, "cannot convert [%s] to int", strv)
}
return intv
}
if len(defaultVal) == 0 {
bomb(http.StatusBadRequest, "query param[%s] is necessary", key)
}
return defaultVal[0]
}
func queryInt64(c *gin.Context, key string, defaultVal ...int64) int64 {
strv := c.Query(key)
if strv != "" {
intv, err := strconv.ParseInt(strv, 10, 64)
if err != nil {
bomb(http.StatusBadRequest, "cannot convert [%s] to int64", strv)
}
return intv
}
if len(defaultVal) == 0 {
bomb(http.StatusBadRequest, "query param[%s] is necessary", key)
}
return defaultVal[0]
}
// queryBool parses a boolean query parameter.
// Truthy values: true/1/on/checked/yes/y/Y; falsy: false/0/off/no/n/N.
// The original accepted uppercase "Y"/"N" but not lowercase "y"/"n",
// an asymmetry with "yes"/"no"; lowercase is now accepted too
// (backward compatible). Any other non-empty value aborts with 400.
// When the parameter is absent the first default is returned, or 400
// when no default is given.
func queryBool(c *gin.Context, key string, defaultVal ...bool) bool {
	strv := c.Query(key)
	if strv != "" {
		switch strv {
		case "true", "1", "on", "checked", "yes", "y", "Y":
			return true
		case "false", "0", "off", "no", "n", "N":
			return false
		default:
			bomb(http.StatusBadRequest, "unknown arg[%s] value: %s", key, strv)
		}
	}
	if len(defaultVal) == 0 {
		bomb(http.StatusBadRequest, "arg[%s] is necessary", key)
	}
	return defaultVal[0]
}
// offset converts the 1-based "p" (page) query parameter into a SQL
// offset for the given page size. A non-positive limit falls back to 10.
// The page is clamped to >= 1 so a crafted "p=0" or negative page can
// no longer produce a negative offset (which would make the DB query
// fail or misbehave).
func offset(c *gin.Context, limit int) int {
	if limit <= 0 {
		limit = 10
	}
	page := queryInt(c, "p", 1)
	if page < 1 {
		page = 1
	}
	return (page - 1) * limit
}
// renderMessage writes a {"err": ...} JSON body. v may be nil (success,
// empty err), a string (translated via i18n) or an error. statusCode
// optionally overrides the default 200.
// A default branch is added: previously any other type produced no
// response at all, leaving the client with an empty body.
func renderMessage(c *gin.Context, v interface{}, statusCode ...int) {
	code := 200
	if len(statusCode) > 0 {
		code = statusCode[0]
	}

	if v == nil {
		c.JSON(code, gin.H{"err": ""})
		return
	}

	switch t := v.(type) {
	case string:
		c.JSON(code, gin.H{"err": _s(t)})
	case error:
		c.JSON(code, gin.H{"err": t.Error()})
	default:
		// Unknown value type: stringify it rather than silently
		// sending nothing.
		c.JSON(code, gin.H{"err": fmt.Sprint(t)})
	}
}
func renderData(c *gin.Context, data interface{}, err error, statusCode ...int) {
code := 200
if len(statusCode) > 0 {
code = statusCode[0]
}
if err == nil {
c.JSON(code, gin.H{"dat": data, "err": ""})
return
}
renderMessage(c, err.Error(), code)
}
func renderZeroPage(c *gin.Context) {
renderData(c, gin.H{
"list": []int{},
"total": 0,
}, nil)
}
type idsForm struct {
Ids []int64 `json:"ids"`
}
func (f idsForm) Validate() {
if len(f.Ids) == 0 {
bomb(http.StatusBadRequest, "ids empty")
}
}
// cookieUsername extracts the username stored in the session cookie,
// or "" when the visitor carries no session identity.
func cookieUsername(c *gin.Context) string {
	v := sessions.Default(c).Get("username")
	if v == nil {
		return ""
	}
	return v.(string)
}
// headerUsername resolves the username from an "Authorization: Bearer
// <token>" header via the user_token table, or "" when the header is
// missing or the token is unknown.
func headerUsername(c *gin.Context) string {
	raw := c.GetHeader("Authorization")
	if raw == "" {
		return ""
	}

	ut, err := models.UserTokenGet("token=?", strings.TrimPrefix(raw, "Bearer "))
	if err != nil || ut == nil {
		return ""
	}

	return ut.Username
}
// loginUsername resolves the caller's username, aborting the request
// with 401 when no identity can be established. Lookup order:
//  1. the "username" key already cached on the gin context,
//  2. the session cookie,
//  3. an Authorization bearer token,
//  4. loopback callers sending "X-Local: 1", treated as root.
// The resolved name is cached back onto the context for later handlers.
func loginUsername(c *gin.Context) string {
usernameInterface, has := c.Get("username")
if has {
return usernameInterface.(string)
}
username := cookieUsername(c)
if username == "" {
username = headerUsername(c)
}
if username == "" {
// No cookie/token identity: allow loopback callers that explicitly
// mark themselves with the X-Local header to act as root.
remoteAddr := c.Request.RemoteAddr
idx := strings.LastIndex(remoteAddr, ":")
ip := ""
if idx > 0 {
ip = remoteAddr[0:idx]
}
if (ip == "127.0.0.1" || ip == "[::1]") && c.GetHeader("X-Local") == "1" {
// local calls are treated as the root user
username = "root"
}
}
if username == "" {
ierr.Bomb(http.StatusUnauthorized, "unauthorized")
}
c.Set("username", username)
return username
}
func loginUser(c *gin.Context) *models.User {
username := loginUsername(c)
user, err := models.UserGetByUsername(username)
dangerous(err)
if user == nil {
ierr.Bomb(http.StatusUnauthorized, "unauthorized")
}
if user.Status == 1 {
ierr.Bomb(http.StatusUnauthorized, "unauthorized")
}
return user
}
func User(id int64) *models.User {
obj, err := models.UserGet("id=?", id)
dangerous(err)
if obj == nil {
bomb(http.StatusNotFound, "No such user")
}
return obj
}
func UserGroup(id int64) *models.UserGroup {
obj, err := models.UserGroupGet("id=?", id)
dangerous(err)
if obj == nil {
bomb(http.StatusNotFound, "No such user group")
}
return obj
}
func Classpath(id int64) *models.Classpath {
obj, err := models.ClasspathGet("id=?", id)
dangerous(err)
if obj == nil {
bomb(http.StatusNotFound, "No such classpath")
}
return obj
}
func Mute(id int64) *models.Mute {
obj, err := models.MuteGet("id=?", id)
dangerous(err)
if obj == nil {
bomb(http.StatusNotFound, "No such mute config")
}
return obj
}
func Dashboard(id int64) *models.Dashboard {
obj, err := models.DashboardGet("id=?", id)
dangerous(err)
if obj == nil {
bomb(http.StatusNotFound, "No such dashboard")
}
return obj
}
func ChartGroup(id int64) *models.ChartGroup {
obj, err := models.ChartGroupGet("id=?", id)
dangerous(err)
if obj == nil {
bomb(http.StatusNotFound, "No such chart group")
}
return obj
}
func Chart(id int64) *models.Chart {
obj, err := models.ChartGet("id=?", id)
dangerous(err)
if obj == nil {
bomb(http.StatusNotFound, "No such chart")
}
return obj
}
func AlertRule(id int64) *models.AlertRule {
obj, err := models.AlertRuleGet("id=?", id)
dangerous(err)
if obj == nil {
bomb(http.StatusNotFound, "No such alert rule")
}
return obj
}
func AlertRuleGroup(id int64) *models.AlertRuleGroup {
obj, err := models.AlertRuleGroupGet("id=?", id)
dangerous(err)
if obj == nil {
bomb(http.StatusNotFound, "No such alert rule group")
}
return obj
}
func AlertEvent(id int64) *models.AlertEvent {
obj, err := models.AlertEventGet("id=?", id)
dangerous(err)
if obj == nil {
bomb(http.StatusNotFound, "No such alert event")
}
return obj
}
func HistoryAlertEvent(id int64) *models.HistoryAlertEvent {
obj, err := models.HistoryAlertEventGet("id=?", id)
dangerous(err)
if obj == nil {
bomb(http.StatusNotFound, "No such alert all event")
}
return obj
}
func CollectRule(id int64) *models.CollectRule {
obj, err := models.CollectRuleGet("id=?", id)
dangerous(err)
if obj == nil {
bomb(http.StatusNotFound, "No such collect rule")
}
return obj
}
func MetricDescription(id int64) *models.MetricDescription {
obj, err := models.MetricDescriptionGet("id=?", id)
dangerous(err)
if obj == nil {
bomb(http.StatusNotFound, "No such metric description")
}
return obj
}
func Resource(id int64) *models.Resource {
obj, err := models.ResourceGet("id=?", id)
dangerous(err)
if obj == nil {
bomb(http.StatusNotFound, "No such resource")
}
classpathResources, err := models.ClasspathResourceGets("res_ident=?", obj.Ident)
dangerous(err)
for _, cr := range classpathResources {
obj.ClasspathIds = append(obj.ClasspathIds, cr.ClasspathId)
}
return obj
}

View File

@ -1,43 +0,0 @@
package http
import (
"net/http"
"strings"
"github.com/didi/nightingale/v5/pkg/ierr"
"github.com/gin-gonic/gin"
)
func login() gin.HandlerFunc {
return func(c *gin.Context) {
username := loginUsername(c)
c.Set("username", username)
// 这里调用loginUser主要是为了判断当前用户是否被disable了
loginUser(c)
c.Next()
}
}
// admin returns middleware that authenticates the caller (see login)
// and additionally requires the "Admin" role, aborting with 403
// otherwise.
func admin() gin.HandlerFunc {
	return func(c *gin.Context) {
		username := loginUsername(c)
		c.Set("username", username)

		// loginUser also rejects disabled accounts.
		user := loginUser(c)

		isAdmin := false
		for _, role := range strings.Fields(user.RolesForDB) {
			if role == "Admin" {
				isAdmin = true
				break
			}
		}

		if !isAdmin {
			ierr.Bomb(http.StatusForbidden, "forbidden")
		}

		c.Next()
	}
}

View File

@ -1,108 +0,0 @@
package http
import (
"context"
"fmt"
"net/http"
"os"
"path"
"strings"
"time"
"github.com/gin-contrib/sessions"
"github.com/gin-contrib/sessions/cookie"
"github.com/gin-gonic/gin"
"github.com/didi/nightingale/v5/config"
"github.com/didi/nightingale/v5/pkg/iaop"
)
var srv = &http.Server{
ReadTimeout: 30 * time.Second,
WriteTimeout: 30 * time.Second,
MaxHeaderBytes: 1 << 30,
}
var skipPaths = []string{
"/api/n9e/auth/login",
"/api/n9e/self/password",
"/api/n9e/push",
"/v1/n9e/series",
}
func Start() {
c := config.Config
loggerMid := iaop.LoggerWithConfig(iaop.LoggerConfig{SkipPaths: skipPaths})
recoveryMid := iaop.Recovery()
if strings.ToLower(c.HTTP.Mode) == "release" {
gin.SetMode(gin.ReleaseMode)
iaop.DisableConsoleColor()
}
r := gin.New()
r.Use(recoveryMid)
// whether print access log
if c.HTTP.Access {
r.Use(loggerMid)
}
// use cookie to save session
store := cookie.NewStore([]byte(config.Config.HTTP.CookieSecret))
store.Options(sessions.Options{
Domain: config.Config.HTTP.CookieDomain,
MaxAge: config.Config.HTTP.CookieMaxAge,
Secure: config.Config.HTTP.CookieSecure,
HttpOnly: config.Config.HTTP.CookieHttpOnly,
Path: "/",
})
session := sessions.Sessions(config.Config.HTTP.CookieName, store)
r.Use(session)
configRoutes(r)
configNoRoute(r)
srv.Addr = c.HTTP.Listen
srv.Handler = r
go func() {
fmt.Println("http.listening:", srv.Addr)
if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
fmt.Printf("listening %s occur error: %s\n", srv.Addr, err)
os.Exit(3)
}
}()
}
// Shutdown http server
func Shutdown() {
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
if err := srv.Shutdown(ctx); err != nil {
fmt.Println("cannot shutdown http server:", err)
os.Exit(2)
}
// catching ctx.Done(). timeout of 5 seconds.
select {
case <-ctx.Done():
fmt.Println("shutdown http server timeout of 5 seconds.")
default:
fmt.Println("http server stopped")
}
}
func configNoRoute(r *gin.Engine) {
r.NoRoute(func(c *gin.Context) {
arr := strings.Split(c.Request.URL.Path, ".")
suffix := arr[len(arr)-1]
switch suffix {
case "png", "jpeg", "jpg", "svg", "ico", "gif", "css", "js", "html", "htm", "gz", "map":
c.File(path.Join(strings.Split("pub/"+c.Request.URL.Path, "/")...))
default:
c.File(path.Join("pub", "index.html"))
}
})
}

View File

@ -1,220 +0,0 @@
package http
import (
"fmt"
"os"
"github.com/gin-contrib/gzip"
"github.com/gin-contrib/pprof"
"github.com/gin-gonic/gin"
"github.com/didi/nightingale/v5/config"
)
func configRoutes(r *gin.Engine) {
/*
csrfMid := csrf.Middleware(csrf.Options{
Secret: config.Config.HTTP.CsrfSecret,
ErrorFunc: func(c *gin.Context) {
c.JSON(452, gin.H{"err": "csrf token mismatch"})
c.Abort()
},
})
*/
if config.Config.HTTP.Pprof {
pprof.Register(r, "/api/debug/pprof")
}
guest := r.Group("/api/n9e")
{
guest.GET("/ping", func(c *gin.Context) {
c.String(200, "pong")
})
guest.GET("/pid", func(c *gin.Context) {
c.String(200, fmt.Sprintf("%d", os.Getpid()))
})
guest.GET("/addr", func(c *gin.Context) {
c.String(200, c.Request.RemoteAddr)
})
guest.GET("/version", func(c *gin.Context) {
c.String(200, config.Version)
})
guest.POST("/auth/login", loginPost)
guest.GET("/auth/logout", logoutGet)
// 开源版本,为了支持图表分享功能,允许匿名查询数据
guest.POST("/query", GetData)
guest.POST("/instant-query", GetDataInstant)
guest.POST("/tag-pairs", GetTagPairs)
guest.POST("/tag-keys", GetTagKeys)
guest.POST("/tag-values", GetTagValues)
guest.POST("/tag-metrics", GetMetrics)
guest.GET("/check-promql", checkPromeQl)
}
// for brower, expose location in nginx.conf
pages := r.Group("/api/n9e")
{
pages.GET("/csrf", func(c *gin.Context) {
// renderData(c, csrf.GetToken(c), nil)
renderData(c, "not supported", nil)
})
pages.GET("/roles", rolesGet)
pages.GET("/self/profile", selfProfileGet)
pages.PUT("/self/profile", selfProfilePut)
pages.PUT("/self/password", selfPasswordPut)
pages.GET("/self/token", selfTokenGets)
pages.POST("/self/token", selfTokenPost)
pages.PUT("/self/token", selfTokenPut)
pages.GET("/users", login(), userGets)
pages.POST("/users", admin(), userAddPost)
pages.GET("/user/:id/profile", login(), userProfileGet)
pages.PUT("/user/:id/profile", admin(), userProfilePut)
pages.PUT("/user/:id/status", admin(), userStatusPut)
pages.PUT("/user/:id/password", admin(), userPasswordPut)
pages.DELETE("/user/:id", admin(), userDel)
pages.GET("/user-groups", login(), userGroupListGet)
pages.GET("/user-groups/mine", login(), userGroupMineGet)
pages.POST("/user-groups", login(), userGroupAdd)
pages.PUT("/user-group/:id", login(), userGroupPut)
pages.GET("/user-group/:id", login(), userGroupGet)
pages.POST("/user-group/:id/members", login(), userGroupMemberAdd)
pages.DELETE("/user-group/:id/members", login(), userGroupMemberDel)
pages.DELETE("/user-group/:id", login(), userGroupDel)
pages.GET("/classpaths", login(), classpathListGets)
pages.GET("/classpaths/tree-node/:id", login(), classpathListNodeGetsById)
pages.POST("/classpaths", login(), classpathAdd)
pages.PUT("/classpath/:id", login(), classpathPut)
pages.DELETE("/classpath/:id", login(), classpathDel)
pages.POST("/classpath/:id/resources", login(), classpathAddResources)
pages.DELETE("/classpath/:id/resources", login(), classpathDelResources)
pages.GET("/classpath/:id/resources", login(), classpathGetsResources)
pages.GET("/classpaths/favorites", login(), classpathFavoriteGet)
pages.POST("/classpath/:id/favorites", login(), classpathFavoriteAdd)
pages.DELETE("/classpath/:id/favorites", login(), classpathFavoriteDel)
pages.GET("/resources", login(), resourcesQuery)
pages.PUT("/resources/note", resourceNotePut)
pages.PUT("/resources/tags", resourceTagsPut)
pages.PUT("/resources/classpaths", resourceClasspathsPut)
pages.PUT("/resources/mute", resourceMutePut)
pages.GET("/resource/:id", login(), resourceGet)
pages.DELETE("/resource/:id", login(), resourceDel)
pages.GET("/mutes", login(), muteGets)
pages.POST("/mutes", login(), muteAdd)
pages.GET("/mute/:id", login(), muteGet)
pages.DELETE("/mute/:id", login(), muteDel)
pages.GET("/dashboards", login(), dashboardGets)
pages.POST("/dashboards", login(), dashboardAdd)
pages.POST("/dashboards-clone", login(), dashboardClone)
pages.POST("/dashboards/import", login(), dashboardImport)
pages.POST("/dashboards/export", login(), dashboardExport)
pages.GET("/dashboard/:id", login(), dashboardGet)
pages.PUT("/dashboard/:id", login(), dashboardPut)
pages.DELETE("/dashboard/:id", login(), dashboardDel)
pages.POST("/dashboard/:id/favorites", login(), dashboardFavoriteAdd)
pages.DELETE("/dashboard/:id/favorites", login(), dashboardFavoriteDel)
pages.GET("/dashboard/:id/chart-groups", login(), chartGroupGets)
pages.POST("/dashboard/:id/chart-groups", login(), chartGroupAdd)
pages.PUT("/chart-groups", login(), chartGroupsPut)
pages.DELETE("/chart-group/:id", login(), chartGroupDel)
pages.GET("/chart-group/:id/charts", login(), chartGets)
pages.POST("/chart-group/:id/charts", login(), chartAdd)
pages.PUT("/chart/:id", login(), chartPut)
pages.DELETE("/chart/:id", login(), chartDel)
pages.PUT("/charts/configs", login(), chartConfigsPut)
pages.GET("/charts/tmps", chartTmpGets)
pages.POST("/charts/tmps", login(), chartTmpAdd)
pages.GET("/alert-rule-groups", login(), alertRuleGroupGets)
pages.GET("/alert-rule-groups/favorites", login(), alertRuleGroupFavoriteGet)
pages.POST("/alert-rule-groups", login(), alertRuleGroupAdd)
pages.GET("/alert-rule-group/:id", login(), alertRuleGroupGet)
pages.GET("/alert-rule-group/:id/alert-rules", login(), alertRuleOfGroupGet)
pages.DELETE("/alert-rule-group/:id/alert-rules", login(), alertRuleOfGroupDel)
pages.PUT("/alert-rule-group/:id", login(), alertRuleGroupPut)
pages.DELETE("/alert-rule-group/:id", login(), alertRuleGroupDel)
pages.POST("/alert-rule-group/:id/favorites", login(), alertRuleGroupFavoriteAdd)
pages.DELETE("/alert-rule-group/:id/favorites", login(), alertRuleGroupFavoriteDel)
pages.POST("/alert-rules", login(), alertRuleAdd)
pages.PUT("/alert-rules/status", login(), alertRuleStatusPut)
pages.PUT("/alert-rules/notify-groups", login(), alertRuleNotifyGroupsPut)
pages.PUT("/alert-rules/notify-channels", login(), alertRuleNotifyChannelsPut)
pages.PUT("/alert-rules/append-tags", login(), alertRuleAppendTagsPut)
pages.GET("/alert-rule/:id", login(), alertRuleGet)
pages.PUT("/alert-rule/:id", login(), alertRulePut)
pages.DELETE("/alert-rule/:id", login(), alertRuleDel)
pages.GET("/alert-events", login(), alertEventGets)
pages.DELETE("/alert-events", login(), alertEventsDel)
pages.GET("/alert-event/:id", login(), alertEventGet)
pages.DELETE("/alert-event/:id", login(), alertEventDel)
// pages.PUT("/alert-event/:id", login(), alertEventNotePut)
pages.GET("/history-alert-events", login(), historyAlertEventGets)
pages.GET("/history-alert-event/:id", login(), historyAlertEventGet)
pages.GET("/classpath/:id/collect-rules", login(), collectRuleGets)
pages.POST("/collect-rules", login(), collectRuleAdd)
pages.DELETE("/collect-rules", login(), collectRuleDel)
pages.PUT("/collect-rule/:id", login(), collectRulePut)
pages.POST("/log/check", regExpCheck)
pages.GET("/metric-descriptions", metricDescriptionGets)
pages.POST("/metric-descriptions", login(), metricDescriptionAdd)
pages.DELETE("/metric-descriptions", login(), metricDescriptionDel)
pages.PUT("/metric-description/:id", login(), metricDescriptionPut)
pages.GET("/contact-channels", contactChannelsGet)
pages.GET("/notify-channels", notifyChannelsGet)
pages.GET("/tpl/list", tplNameGets)
pages.GET("/tpl/content", tplGet)
pages.GET("/status", Status)
}
// for brower, expose location in nginx.conf
pagesV2 := r.Group("/api/n9e/v2")
{
pagesV2.POST("/collect-rules", login(), collectRulesAdd)
}
// for thirdparty, do not expose location in nginx.conf
v1 := r.Group("/v1/n9e")
{
v1.POST("/query", GetData)
v1.POST("/instant-query", GetDataInstant)
v1.POST("/tag-keys", GetTagKeys)
v1.POST("/tag-values", GetTagValues)
v1.POST("/tag-pairs", GetTagPairs)
v1.POST("/tag-metrics", GetMetrics)
v1.POST("/push", PushData)
v1.GET("/collect-rules-belong-to-ident", collectRuleGetsByIdent)
v1.GET("/collect-rules-summary", collectRuleSummaryGetByIdent)
v1.GET("/can-do-op-by-name", login(), canDoOpByName)
v1.GET("/can-do-op-by-token", login(), canDoOpByToken)
v1.GET("/get-user-by-name", login(), getUserByName)
v1.GET("/get-user-by-token", login(), getUserByToken)
}
push := r.Group("/v1/n9e/series").Use(gzip.Gzip(gzip.DefaultCompression))
{
push.POST("", PushSeries)
}
}

View File

@ -1,82 +0,0 @@
package http
import (
"time"
"github.com/gin-gonic/gin"
"github.com/didi/nightingale/v5/models"
)
// alertEventGets lists active alert events with paging and filters.
// Time window resolution (in priority order):
//   - "hours" != 0: window is [now - hours, now + 24h];
//   - explicit "stime" with no "etime": etime defaults to now + 24h;
//   - otherwise the raw stime/etime values are used (0 meaning unset).
// Optional filters: query (fuzzy), priority, status (-1 = any).
// Renders {"total": N, "list": [...]} or a zero page when empty.
func alertEventGets(c *gin.Context) {
stime := queryInt64(c, "stime", 0)
etime := queryInt64(c, "etime", 0)
hours := queryInt64(c, "hours", 0)
now := time.Now().Unix()
if hours != 0 {
stime = now - 3600*hours
etime = now + 3600*24
}
if stime != 0 && etime == 0 {
etime = now + 3600*24
}
query := queryStr(c, "query", "")
priority := queryInt(c, "priority", -1)
status := queryInt(c, "status", -1)
limit := queryInt(c, "limit", defaultLimit)
total, err := models.AlertEventTotal(stime, etime, query, status, priority)
dangerous(err)
list, err := models.AlertEventGets(stime, etime, query, status, priority, limit, offset(c, limit))
dangerous(err)
// FillObjs resolves related objects for display; any failure aborts.
for i := 0; i < len(list); i++ {
dangerous(list[i].FillObjs())
}
if len(list) == 0 {
renderZeroPage(c)
return
}
renderData(c, map[string]interface{}{
"total": total,
"list": list,
}, nil)
}
func alertEventGet(c *gin.Context) {
ae := AlertEvent(urlParamInt64(c, "id"))
dangerous(ae.FillObjs())
renderData(c, ae, nil)
}
type alertEventNoteForm struct {
EventNote string `json:"event_note"`
}
// func alertEventNotePut(c *gin.Context) {
// var f alertEventNoteForm
// bind(c, &f)
// me := loginUser(c).MustPerm("alert_event_modify")
// ae := AlertEvent(urlParamInt64(c, "id"))
// renderMessage(c, models.AlertEventUpdateEventNote(ae.Id, ae.HashId, f.EventNote, me.Id))
// }
func alertEventDel(c *gin.Context) {
loginUser(c).MustPerm("alert_event_delete")
renderMessage(c, AlertEvent(urlParamInt64(c, "id")).Del())
}
func alertEventsDel(c *gin.Context) {
var f idsForm
bind(c, &f)
f.Validate()
loginUser(c).MustPerm("alert_event_delete")
renderMessage(c, models.AlertEventsDel(f.Ids))
}

View File

@ -1,351 +0,0 @@
package http
import (
"encoding/json"
"fmt"
"net/http"
"strconv"
"strings"
"time"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/logger"
"github.com/didi/nightingale/v5/cache"
"github.com/didi/nightingale/v5/config"
"github.com/didi/nightingale/v5/models"
)
func alertRuleGet(c *gin.Context) {
alertRule := AlertRule(urlParamInt64(c, "id"))
alertRuleFillUserAndGroups(alertRule)
renderData(c, alertRule, nil)
}
type alertRuleForm struct {
GroupId int64 `json:"group_id"`
Name string `json:"name"`
Note string `json:"note"`
Type int `json:"type"`
Status int `json:"status"`
Expression json.RawMessage `json:"expression"`
AppendTags string `json:"append_tags"`
EnableStime string `json:"enable_stime"`
EnableEtime string `json:"enable_etime"`
EnableDaysOfWeek string `json:"enable_days_of_week"`
AlertDuration int `json:"alert_duration"`
RecoveryNotify int `json:"recovery_notify"`
Priority int `json:"priority"`
NotifyChannels string `json:"notify_channels"`
NotifyGroups string `json:"notify_groups"`
NotifyUsers string `json:"notify_users"`
Callbacks string `json:"callbacks"`
RunbookUrl string `json:"runbook_url"`
}
func alertRuleAdd(c *gin.Context) {
var f []alertRuleForm
bind(c, &f)
me := loginUser(c).MustPerm("alert_rule_create")
var ids []int64
for _, alertRule := range f {
arg := AlertRuleGroup(alertRule.GroupId)
alertRuleWritePermCheck(arg, me)
ar := models.AlertRule{
GroupId: alertRule.GroupId,
Name: alertRule.Name,
Type: alertRule.Type,
Note: alertRule.Note,
Status: alertRule.Status,
Expression: alertRule.Expression,
AlertDuration: alertRule.AlertDuration,
AppendTags: alertRule.AppendTags,
EnableStime: alertRule.EnableStime,
EnableEtime: alertRule.EnableEtime,
EnableDaysOfWeek: alertRule.EnableDaysOfWeek,
RecoveryNotify: alertRule.RecoveryNotify,
Priority: alertRule.Priority,
NotifyChannels: alertRule.NotifyChannels,
NotifyGroups: alertRule.NotifyGroups,
NotifyUsers: alertRule.NotifyUsers,
Callbacks: alertRule.Callbacks,
RunbookUrl: alertRule.RunbookUrl,
CreateBy: me.Username,
UpdateBy: me.Username,
}
dangerous(ar.Add())
ids = append(ids, ar.Id)
}
renderData(c, ids, nil)
}
func alertRulePut(c *gin.Context) {
var f alertRuleForm
bind(c, &f)
me := loginUser(c).MustPerm("alert_rule_modify")
ar := AlertRule(urlParamInt64(c, "id"))
arg := AlertRuleGroup(ar.GroupId)
alertRuleWritePermCheck(arg, me)
if ar.Name != f.Name {
num, err := models.AlertRuleCount("group_id=? and name=? and id<>?", ar.GroupId, f.Name, ar.Id)
dangerous(err)
if num > 0 {
bomb(200, "Alert rule %s already exists", f.Name)
}
}
ar.Name = f.Name
ar.Note = f.Note
ar.Type = f.Type
ar.Status = f.Status
ar.AlertDuration = f.AlertDuration
ar.Expression = f.Expression
ar.AppendTags = f.AppendTags
ar.EnableStime = f.EnableStime
ar.EnableEtime = f.EnableEtime
ar.EnableDaysOfWeek = f.EnableDaysOfWeek
ar.RecoveryNotify = f.RecoveryNotify
ar.Priority = f.Priority
ar.NotifyChannels = f.NotifyChannels
ar.NotifyGroups = f.NotifyGroups
ar.NotifyUsers = f.NotifyUsers
ar.Callbacks = f.Callbacks
ar.RunbookUrl = f.RunbookUrl
ar.CreateBy = me.Username
ar.UpdateAt = time.Now().Unix()
ar.UpdateBy = me.Username
renderMessage(c, ar.Update(
"name",
"note",
"type",
"status",
"alert_duration",
"expression",
"res_filters",
"tags_filters",
"append_tags",
"enable_stime",
"enable_etime",
"enable_days_of_week",
"recovery_notify",
"priority",
"notify_channels",
"notify_groups",
"notify_users",
"callbacks",
"runbook_url",
"update_at",
"update_by",
))
}
type alertRuleStatusForm struct {
Ids []int64 `json:"ids"`
Status int `json:"status"`
}
func alertRuleStatusPut(c *gin.Context) {
var f alertRuleStatusForm
bind(c, &f)
me := loginUser(c).MustPerm("alert_rule_modify")
if len(f.Ids) == 0 {
bomb(http.StatusBadRequest, "ids is empty")
}
for _, id := range f.Ids {
alertRule := AlertRule(id)
arg := AlertRuleGroup(alertRule.GroupId)
alertRuleWritePermCheck(arg, me)
}
renderMessage(c, models.AlertRuleUpdateStatus(f.Ids, f.Status, me.Username))
}
type alertRuleNotifyGroupsForm struct {
Ids []int64 `json:"ids"`
NotifyGroups string `json:"notify_groups"`
NotifyUsers string `json:"notify_users"`
}
func alertRuleNotifyGroupsPut(c *gin.Context) {
var f alertRuleNotifyGroupsForm
bind(c, &f)
//用户有修改告警策略的权限
me := loginUser(c).MustPerm("alert_rule_modify")
//id不存在
if len(f.Ids) == 0 {
bomb(http.StatusBadRequest, "ids is empty")
}
for _, id := range f.Ids {
alertRule := AlertRule(id)
arg := AlertRuleGroup(alertRule.GroupId)
alertRuleWritePermCheck(arg, me)
}
renderMessage(c, models.AlertRuleUpdateNotifyGroups(f.Ids, f.NotifyGroups, f.NotifyUsers, me.Username))
}
type alertRuleNotifyChannelsForm struct {
Ids []int64 `json:"ids"`
NotifyChannels string `json:"notify_channels"`
}
func alertRuleNotifyChannelsPut(c *gin.Context) {
var f alertRuleNotifyChannelsForm
bind(c, &f)
me := loginUser(c).MustPerm("alert_rule_modify")
if len(f.Ids) == 0 {
bomb(http.StatusBadRequest, "ids is empty")
}
for _, id := range f.Ids {
alertRule := AlertRule(id)
arg := AlertRuleGroup(alertRule.GroupId)
alertRuleWritePermCheck(arg, me)
}
renderMessage(c, models.AlertRuleUpdateNotifyChannels(f.Ids, f.NotifyChannels, me.Username))
}
type alertRuleAppendTagsForm struct {
Ids []int64 `json:"ids"`
AppendTags string `json:"append_tags"`
}
func alertRuleAppendTagsPut(c *gin.Context) {
var f alertRuleAppendTagsForm
bind(c, &f)
me := loginUser(c).MustPerm("alert_rule_modify")
if len(f.Ids) == 0 {
bomb(http.StatusBadRequest, "ids is empty")
}
for _, id := range f.Ids {
alertRule := AlertRule(id)
arg := AlertRuleGroup(alertRule.GroupId)
alertRuleWritePermCheck(arg, me)
}
renderMessage(c, models.AlertRuleUpdateAppendTags(f.Ids, f.AppendTags, me.Username))
}
func alertRuleDel(c *gin.Context) {
me := loginUser(c).MustPerm("alert_rule_delete")
alertRule := AlertRule(urlParamInt64(c, "id"))
arg := AlertRuleGroup(alertRule.GroupId)
alertRuleWritePermCheck(arg, me)
renderMessage(c, alertRule.Del())
}
func notifyChannelsGet(c *gin.Context) {
renderData(c, config.Config.NotifyChannels, nil)
}
func alertRuleFillUserAndGroups(alertRule *models.AlertRule) {
uidStrs := strings.Fields(alertRule.NotifyUsers)
userlen := len(uidStrs)
users := make([]*models.User, 0, userlen)
if userlen > 0 {
// 是否有用户已经被删除的情况出现
userMiss := false
for _, uidStr := range uidStrs {
uid, err := strconv.ParseInt(uidStr, 10, 64)
if err != nil {
userMiss = true
continue
}
user := cache.UserCache.GetById(uid)
if user != nil {
users = append(users, user)
continue
}
// uid在cache里找不到可能是还没来得及缓存也可能是被删除了
// 去查一下数据库,如果确实找不到了,就更新一下
user, err = models.UserGetById(uid)
if err != nil {
logger.Error("UserGetById fail:", err)
continue
}
if user != nil {
users = append(users, user)
} else {
userMiss = true
}
}
if userMiss {
userIdsNew := make([]string, len(users))
for i := 0; i < len(users); i++ {
userIdsNew[i] = fmt.Sprint(users[i].Id)
}
alertRule.NotifyUsers = strings.Join(userIdsNew, " ")
alertRule.UpdateAt = time.Now().Unix()
alertRule.Update("notify_users", "update_at")
}
}
// 最终存活的user列表赋值给alertRule
alertRule.NotifyUsersDetail = users
gidStrs := strings.Fields(alertRule.NotifyGroups)
grplen := len(gidStrs)
grps := make([]*models.UserGroup, 0, grplen)
if grplen > 0 {
grpMiss := false
for _, gidStr := range gidStrs {
gid, err := strconv.ParseInt(gidStr, 10, 64)
if err != nil {
grpMiss = true
continue
}
grp := cache.UserGroupCache.GetBy(gid)
if grp != nil {
grps = append(grps, grp)
continue
}
grp, err = models.UserGroupGet("id=?", gid)
if err != nil {
logger.Error("UserGroupGet fail:", err)
continue
}
if grp != nil {
grps = append(grps, grp)
} else {
grpMiss = true
}
}
if grpMiss {
grpIdsNew := make([]string, len(grps))
for i := 0; i < len(grps); i++ {
grpIdsNew[i] = fmt.Sprint(grps[i].Id)
}
alertRule.NotifyGroups = strings.Join(grpIdsNew, " ")
alertRule.UpdateAt = time.Now().Unix()
alertRule.Update("notify_groups", "update_at")
}
}
alertRule.NotifyGroupsDetail = grps
}

View File

@ -1,191 +0,0 @@
package http
import (
"fmt"
"net/http"
"strconv"
"strings"
"time"
"github.com/gin-gonic/gin"
"github.com/didi/nightingale/v5/cache"
"github.com/didi/nightingale/v5/models"
)
func alertRuleGroupGets(c *gin.Context) {
limit := queryInt(c, "limit", defaultLimit)
query := queryStr(c, "query", "")
total, err := models.AlertRuleGroupTotal(query)
dangerous(err)
list, err := models.AlertRuleGroupGets(query, limit, offset(c, limit))
dangerous(err)
renderData(c, gin.H{
"list": list,
"total": total,
}, nil)
}
func alertRuleGroupFavoriteGet(c *gin.Context) {
lst, err := loginUser(c).FavoriteAlertRuleGroups()
renderData(c, lst, err)
}
type alertRuleGroupForm struct {
Name string `json:"name"`
UserGroupIds string `json:"user_group_ids"`
}
func alertRuleGroupAdd(c *gin.Context) {
var f alertRuleGroupForm
bind(c, &f)
me := loginUser(c).MustPerm("alert_rule_group_create")
arg := models.AlertRuleGroup{
Name: f.Name,
UserGroupIds: f.UserGroupIds,
CreateBy: me.Username,
UpdateBy: me.Username,
}
err := arg.Add()
if err == nil {
// 我创建的,顺便设置为我关注的
models.AlertRuleGroupFavoriteAdd(arg.Id, me.Id)
}
renderMessage(c, err)
}
func alertRuleGroupGet(c *gin.Context) {
alertRuleGroup := AlertRuleGroup(urlParamInt64(c, "id"))
alertRuleGroup.FillUserGroups()
renderData(c, alertRuleGroup, nil)
}
func alertRuleOfGroupGet(c *gin.Context) {
ars, err := models.AlertRulesOfGroup(urlParamInt64(c, "id"))
for i := range ars {
alertRuleFillUserAndGroups(&ars[i])
}
renderData(c, ars, err)
}
func alertRuleOfGroupDel(c *gin.Context) {
var f idsForm
bind(c, &f)
f.Validate()
me := loginUser(c).MustPerm("alert_rule_delete")
// 可能大部分alert_rule都来自同一个alert_rule_group所以权限判断可以无需重复判断
cachePerm := make(map[string]struct{})
for i := 0; i < len(f.Ids); i++ {
ar := AlertRule(f.Ids[i])
cacheKey := fmt.Sprintf("%d,%d", f.Ids[i], ar.GroupId)
if _, has := cachePerm[cacheKey]; has {
continue
}
arg := AlertRuleGroup(ar.GroupId)
alertRuleWritePermCheck(arg, me)
cachePerm[cacheKey] = struct{}{}
}
renderMessage(c, models.AlertRulesDel(f.Ids))
}
func alertRuleGroupPut(c *gin.Context) {
var f alertRuleGroupForm
bind(c, &f)
me := loginUser(c).MustPerm("alert_rule_group_modify")
arg := AlertRuleGroup(urlParamInt64(c, "id"))
alertRuleWritePermCheck(arg, me)
if arg.Name != f.Name {
num, err := models.AlertRuleGroupCount("name=? and id<>?", f.Name, arg.Id)
dangerous(err)
if num > 0 {
bomb(200, "AlertRuleGroup %s already exists", f.Name)
}
}
arg.Name = f.Name
arg.UserGroupIds = f.UserGroupIds
arg.UpdateBy = me.Username
arg.UpdateAt = time.Now().Unix()
renderMessage(c, arg.Update("name", "update_by", "update_at", "user_group_ids"))
}
func alertRuleGroupDel(c *gin.Context) {
me := loginUser(c).MustPerm("alert_rule_group_delete")
arg := AlertRuleGroup(urlParamInt64(c, "id"))
alertRuleWritePermCheck(arg, me)
renderMessage(c, arg.Del())
}
func alertRuleGroupFavoriteAdd(c *gin.Context) {
me := loginUser(c)
arg := AlertRuleGroup(urlParamInt64(c, "id"))
renderMessage(c, models.AlertRuleGroupFavoriteAdd(arg.Id, me.Id))
}
func alertRuleGroupFavoriteDel(c *gin.Context) {
me := loginUser(c)
arg := AlertRuleGroup(urlParamInt64(c, "id"))
renderMessage(c, models.AlertRuleGroupFavoriteDel(arg.Id, me.Id))
}
// alertRuleWritePermCheck aborts with 403 unless user may modify rules
// in alertRuleGroup. Admins always pass; otherwise the group must either
// have no managing user groups configured (open to all Standard users)
// or the user must belong to at least one of them.
func alertRuleWritePermCheck(alertRuleGroup *models.AlertRuleGroup, user *models.User) {
	for _, role := range strings.Fields(user.RolesForDB) {
		if role == "Admin" {
			return
		}
	}

	gids := IdsInt64(alertRuleGroup.UserGroupIds)
	if len(gids) == 0 {
		// No managing team configured: open to every Standard role,
		// so no further check is needed.
		return
	}

	for _, gid := range gids {
		if cache.UserGroupMember.Exists(gid, user.Id) {
			return
		}
	}

	bomb(http.StatusForbidden, "no permission")
}
// IdsInt64 parses a whitespace-separated list of ids into int64s.
// Tokens that do not parse are silently skipped; an empty input yields
// an empty (non-nil) slice.
func IdsInt64(ids string) []int64 {
	if ids == "" {
		return []int64{}
	}

	fields := strings.Fields(ids)
	ret := make([]int64, 0, len(fields))
	for _, f := range fields {
		if id, err := strconv.ParseInt(f, 10, 64); err == nil {
			ret = append(ret, id)
		}
	}
	return ret
}

View File

@ -1,92 +0,0 @@
package http
import (
"github.com/gin-contrib/sessions"
"github.com/gin-gonic/gin"
"github.com/didi/nightingale/v5/config"
"github.com/didi/nightingale/v5/models"
)
type loginForm struct {
Username string `json:"username"`
Password string `json:"password"`
}
// loginPost authenticates by password first and, when that fails and
// LDAP is enabled, by LDAP. On success the username is written into the
// session cookie and the user object is rendered; on failure the
// password-login error is returned to the client.
// The previously duplicated success path (disabled-check + session set
// + render) for the two branches is extracted into establishSession.
func loginPost(c *gin.Context) {
	var f loginForm
	bind(c, &f)

	user, err1 := models.PassLogin(f.Username, f.Password)
	if err1 == nil {
		establishSession(c, user, f.Username)
		return
	}

	// password login fail, try ldap
	if config.Config.LDAP.Enable {
		if user, err2 := models.LdapLogin(f.Username, f.Password); err2 == nil {
			establishSession(c, user, f.Username)
			return
		}
	}

	// password and ldap both fail: surface the password-login error
	renderMessage(c, err1)
}

// establishSession rejects disabled accounts, stores the username in
// the session cookie and renders the authenticated user.
func establishSession(c *gin.Context, user *models.User, username string) {
	// Status == 1 marks a disabled account (same check as loginUser).
	if user.Status == 1 {
		renderMessage(c, "User disabled")
		return
	}
	session := sessions.Default(c)
	session.Set("username", username)
	// NOTE(review): the Save error was ignored in the original flow
	// too; a failed save means the cookie is never set while we still
	// report success — TODO consider surfacing it.
	session.Save()
	renderData(c, user, nil)
}
// logoutGet clears the username bound to the current session.
func logoutGet(c *gin.Context) {
	sess := sessions.Default(c)
	sess.Set("username", "")
	sess.Save()
	renderMessage(c, nil)
}

// canDoOpByName reports whether the user named by the "name" query parameter
// may perform the operation given in "op". Unknown users yield false.
func canDoOpByName(c *gin.Context) {
	user, err := models.UserGetByUsername(queryStr(c, "name"))
	dangerous(err)
	if user == nil {
		renderData(c, false, err)
		return
	}

	allowed, err := user.CanDo(queryStr(c, "op"))
	renderData(c, allowed, err)
}

// canDoOpByToken is the token-based variant of canDoOpByName: the caller is
// identified by the "token" query parameter instead of a username.
func canDoOpByToken(c *gin.Context) {
	token, err := models.UserTokenGet("token=?", queryStr(c, "token"))
	dangerous(err)
	if token == nil {
		renderData(c, false, err)
		return
	}

	user, err := models.UserGetByUsername(token.Username)
	dangerous(err)
	if user == nil {
		renderData(c, false, err)
		return
	}

	allowed, err := user.CanDo(queryStr(c, "op"))
	renderData(c, allowed, err)
}

View File

@ -1,82 +0,0 @@
package http
import (
"github.com/gin-gonic/gin"
"github.com/didi/nightingale/v5/models"
)
// chartGets lists all charts of the chart group given by the "id" URL param.
func chartGets(c *gin.Context) {
	list, err := models.ChartGets(urlParamInt64(c, "id"))
	renderData(c, list, err)
}

// chartForm is the payload for creating a chart.
type chartForm struct {
	Configs string `json:"configs"`
	Weight  int    `json:"weight"`
}

// chartAdd creates a chart inside the chart group given by the "id" URL param.
func chartAdd(c *gin.Context) {
	var f chartForm
	bind(c, &f)

	loginUser(c).MustPerm("dashboard_modify")

	group := ChartGroup(urlParamInt64(c, "id"))
	chart := models.Chart{
		GroupId: group.Id,
		Configs: f.Configs,
		Weight:  f.Weight,
	}
	dangerous(chart.Add())

	renderData(c, chart, nil)
}

// chartPutForm is the payload for replacing a chart's configs.
type chartPutForm struct {
	Configs string `json:"configs"`
}

// chartPut overwrites the configs of the chart given by the "id" URL param.
func chartPut(c *gin.Context) {
	var f chartPutForm
	bind(c, &f)

	loginUser(c).MustPerm("dashboard_modify")

	chart := Chart(urlParamInt64(c, "id"))
	chart.Configs = f.Configs
	dangerous(chart.Update("configs"))

	renderData(c, chart, nil)
}

// chartDel deletes the chart given by the "id" URL param.
func chartDel(c *gin.Context) {
	loginUser(c).MustPerm("dashboard_modify")
	renderMessage(c, Chart(urlParamInt64(c, "id")).Del())
}
// chartConfig is one element of the bulk-update payload: a chart id, its new
// configs, and optionally a new owning group.
type chartConfig struct {
	Id      int64  `json:"id"`
	GroupId int64  `json:"group_id"`
	Configs string `json:"configs"`
}

// chartConfigsPut updates configs (and optionally group membership) for a
// batch of charts in a single request.
func chartConfigsPut(c *gin.Context) {
	var items []chartConfig
	bind(c, &items)

	loginUser(c).MustPerm("dashboard_modify")

	for _, item := range items {
		chart := Chart(item.Id)
		chart.Configs = item.Configs
		// a non-positive group id means "keep the current group"
		if item.GroupId > 0 {
			chart.GroupId = item.GroupId
		}
		dangerous(chart.Update("configs", "group_id"))
	}

	renderMessage(c, nil)
}

View File

@ -1,55 +0,0 @@
package http
import (
"github.com/gin-gonic/gin"
"github.com/didi/nightingale/v5/models"
)
// chartGroupGets lists the chart groups of the dashboard given by "id".
func chartGroupGets(c *gin.Context) {
	groups, err := models.ChartGroupGets(urlParamInt64(c, "id"))
	renderData(c, groups, err)
}

// chartGroupForm is the payload for creating a chart group.
type chartGroupForm struct {
	Name   string `json:"name"`
	Weight int    `json:"weight"`
}

// chartGroupAdd creates a chart group under the dashboard given by "id".
func chartGroupAdd(c *gin.Context) {
	var f chartGroupForm
	bind(c, &f)

	loginUser(c).MustPerm("dashboard_modify")

	dashboard := Dashboard(urlParamInt64(c, "id"))
	group := models.ChartGroup{
		DashboardId: dashboard.Id,
		Name:        f.Name,
		Weight:      f.Weight,
	}
	dangerous(group.Add())

	renderData(c, group, nil)
}

// chartGroupsPut renames/reorders a batch of chart groups in one request.
func chartGroupsPut(c *gin.Context) {
	var groups []models.ChartGroup
	bind(c, &groups)

	loginUser(c).MustPerm("dashboard_modify")

	for i := range groups {
		dangerous(groups[i].Update("name", "weight"))
	}

	renderMessage(c, nil)
}

// chartGroupDel deletes the chart group given by the "id" URL parameter.
func chartGroupDel(c *gin.Context) {
	loginUser(c).MustPerm("dashboard_modify")
	group := models.ChartGroup{Id: urlParamInt64(c, "id")}
	renderMessage(c, group.Del())
}

View File

@ -1,50 +0,0 @@
package http
import (
"strconv"
"strings"
"time"
"github.com/gin-gonic/gin"
"github.com/didi/nightingale/v5/models"
)
// chartTmpForm is the payload for sharing a temporary chart snapshot.
type chartTmpForm struct {
	Configs string `json:"configs"`
}

// chartTmpAdd stores a batch of temporary chart snapshots and returns their
// newly assigned ids, in input order.
func chartTmpAdd(c *gin.Context) {
	var forms []chartTmpForm
	bind(c, &forms)

	ids := []int64{}
	for _, f := range forms {
		snapshot := models.ChartTmp{
			Configs:  f.Configs,
			CreateBy: loginUsername(c),
			CreateAt: time.Now().Unix(),
		}
		dangerous(snapshot.Add())
		ids = append(ids, snapshot.Id)
	}

	renderData(c, ids, nil)
}

// chartTmpGets fetches the snapshots whose ids arrive as a comma-separated
// "ids" query parameter.
func chartTmpGets(c *gin.Context) {
	objs := []*models.ChartTmp{}

	for _, raw := range strings.Split(queryStr(c, "ids"), ",") {
		id, err := strconv.ParseInt(raw, 10, 64)
		dangerous(err)

		obj, err := models.ChartTmpGet("id=?", id)
		dangerous(err)
		objs = append(objs, obj)
	}

	renderData(c, objs, nil)
}

View File

@ -1,152 +0,0 @@
package http
import (
"time"
"github.com/gin-gonic/gin"
"github.com/didi/nightingale/v5/models"
)
// classpathListGets returns a page of classpaths matching the optional
// "query" filter, together with the total match count.
func classpathListGets(c *gin.Context) {
	limit := queryInt(c, "limit", defaultLimit)
	query := queryStr(c, "query", "")

	total, err := models.ClasspathTotal(query)
	dangerous(err)

	list, err := models.ClasspathGets(query, limit, offset(c, limit))
	dangerous(err)

	renderData(c, gin.H{
		"list":  list,
		"total": total,
	}, nil)
}

// classpathListNodeGets returns the classpath tree nodes matching "query".
// This endpoint is not exposed publicly for now.
func classpathListNodeGets(c *gin.Context) {
	query := queryStr(c, "query", "")

	nodes, err := models.ClasspathNodeGets(query)
	dangerous(err)

	renderData(c, nodes, nil)
}

// classpathListNodeGetsById lists the direct children of one classpath.
func classpathListNodeGetsById(c *gin.Context) {
	parent := Classpath(urlParamInt64(c, "id"))
	children, err := parent.DirectChildren()
	renderData(c, children, err)
}

// classpathFavoriteGet lists the classpaths the current user has favorited.
func classpathFavoriteGet(c *gin.Context) {
	favorites, err := loginUser(c).FavoriteClasspaths()
	renderData(c, favorites, err)
}
// classpathForm is the payload for creating or updating a classpath.
type classpathForm struct {
	Path string `json:"path"`
	Note string `json:"note"`
}

// classpathAdd creates a non-preset classpath owned by the current user.
func classpathAdd(c *gin.Context) {
	var f classpathForm
	bind(c, &f)

	me := loginUser(c).MustPerm("classpath_create")

	classpath := models.Classpath{
		Path:     f.Path,
		Note:     f.Note,
		Preset:   0,
		CreateBy: me.Username,
		UpdateBy: me.Username,
	}
	renderMessage(c, classpath.Add())
}

// classpathPut renames/annotates a classpath, refusing a rename that would
// collide with an existing path.
func classpathPut(c *gin.Context) {
	var f classpathForm
	bind(c, &f)

	me := loginUser(c).MustPerm("classpath_modify")
	classpath := Classpath(urlParamInt64(c, "id"))

	if classpath.Path != f.Path {
		num, err := models.ClasspathCount("path=? and id<>?", f.Path, classpath.Id)
		dangerous(err)
		if num > 0 {
			bomb(200, "Classpath %s already exists", f.Path)
		}
	}

	classpath.Path = f.Path
	classpath.Note = f.Note
	classpath.UpdateBy = me.Username
	classpath.UpdateAt = time.Now().Unix()

	renderMessage(c, classpath.Update("path", "note", "update_by", "update_at"))
}

// classpathDel deletes a classpath unless it is a preset one.
func classpathDel(c *gin.Context) {
	loginUser(c).MustPerm("classpath_delete")

	classpath := Classpath(urlParamInt64(c, "id"))
	if classpath.Preset == 1 {
		bomb(200, "Preset classpath %s cannot delete", classpath.Path)
	}

	renderMessage(c, classpath.Del())
}
// classpathAddResources attaches the given resource idents to a classpath
// and refreshes the classpath's audit fields.
func classpathAddResources(c *gin.Context) {
	var idents []string
	bind(c, &idents)

	me := loginUser(c).MustPerm("classpath_add_resource")

	classpath := Classpath(urlParamInt64(c, "id"))
	dangerous(classpath.AddResources(idents))

	classpath.UpdateAt = time.Now().Unix()
	classpath.UpdateBy = me.Username
	classpath.Update("update_at", "update_by")

	renderMessage(c, nil)
}

// classpathDelResources detaches resource idents from a classpath. The
// preset classpath (id 1) never loses resources this way.
func classpathDelResources(c *gin.Context) {
	var idents []string
	bind(c, &idents)

	classpathId := urlParamInt64(c, "id")
	me := loginUser(c).MustPerm("classpath_del_resource")

	if classpathId == 1 {
		bomb(200, _s("Resource cannot delete in preset classpath"))
	}

	classpath := Classpath(classpathId)
	dangerous(classpath.DelResources(idents))

	classpath.UpdateAt = time.Now().Unix()
	classpath.UpdateBy = me.Username
	classpath.Update("update_at", "update_by")

	renderMessage(c, nil)
}
// classpathFavoriteAdd stars a classpath for the current user.
func classpathFavoriteAdd(c *gin.Context) {
	user := loginUser(c)
	classpath := Classpath(urlParamInt64(c, "id"))
	renderMessage(c, models.ClasspathFavoriteAdd(classpath.Id, user.Id))
}

// classpathFavoriteDel unstars a classpath for the current user.
func classpathFavoriteDel(c *gin.Context) {
	user := loginUser(c)
	classpath := Classpath(urlParamInt64(c, "id"))
	renderMessage(c, models.ClasspathFavoriteDel(classpath.Id, user.Id))
}

View File

@ -1,283 +0,0 @@
package http
import (
"regexp"
"strings"
"time"
"github.com/gin-gonic/gin"
"github.com/didi/nightingale/v5/cache"
"github.com/didi/nightingale/v5/models"
)
type collectRuleForm struct {
ClasspathId int64 `json:"classpath_id"`
PrefixMatch int `json:"prefix_match"`
Name string `json:"name"`
Note string `json:"note"`
Step int `json:"step"`
Type string `json:"type"`
Data string `json:"data"`
AppendTags string `json:"append_tags"`
}
func collectRuleAdd(c *gin.Context) {
var f collectRuleForm
bind(c, &f)
me := loginUser(c).MustPerm("collect_rule_create")
cr := models.CollectRule{
ClasspathId: f.ClasspathId,
PrefixMatch: f.PrefixMatch,
Name: f.Name,
Note: f.Note,
Step: f.Step,
Type: f.Type,
Data: f.Data,
AppendTags: f.AppendTags,
CreateBy: me.Username,
UpdateBy: me.Username,
}
renderMessage(c, cr.Add())
}
func collectRulesAdd(c *gin.Context) {
var forms []collectRuleForm
bind(c, &forms)
me := loginUser(c).MustPerm("collect_rule_create")
for _, f := range forms {
cr := models.CollectRule{
ClasspathId: f.ClasspathId,
PrefixMatch: f.PrefixMatch,
Name: f.Name,
Note: f.Note,
Step: f.Step,
Type: f.Type,
Data: f.Data,
AppendTags: f.AppendTags,
CreateBy: me.Username,
UpdateBy: me.Username,
}
dangerous(cr.Add())
}
renderMessage(c, nil)
}
// collectRulePut updates an existing collect rule in place; the owning
// classpath cannot be changed through this endpoint.
func collectRulePut(c *gin.Context) {
	var f collectRuleForm
	bind(c, &f)

	me := loginUser(c).MustPerm("collect_rule_modify")

	cr := CollectRule(urlParamInt64(c, "id"))
	cr.PrefixMatch = f.PrefixMatch
	cr.Name = f.Name
	cr.Note = f.Note
	cr.Step = f.Step
	cr.Type = f.Type
	cr.Data = f.Data
	cr.AppendTags = f.AppendTags
	cr.UpdateAt = time.Now().Unix()
	cr.UpdateBy = me.Username

	renderMessage(c, cr.Update(
		"prefix_match",
		"name",
		"note",
		"step",
		"type",
		"data",
		"update_at",
		"update_by",
		"append_tags",
	))
}

// collectRuleDel deletes a batch of collect rules by id.
func collectRuleDel(c *gin.Context) {
	var f idsForm
	bind(c, &f)
	f.Validate()

	loginUser(c).MustPerm("collect_rule_delete")

	renderMessage(c, models.CollectRulesDel(f.Ids))
}

// collectRuleGets lists the collect rules of one classpath, optionally
// narrowed to a single rule type.
func collectRuleGets(c *gin.Context) {
	where := "classpath_id = ?"
	param := []interface{}{urlParamInt64(c, "id")}

	if typ := queryStr(c, "type", ""); typ != "" {
		where += " and type = ?"
		param = append(param, typ)
	}

	rules, err := models.CollectRuleGets(where, param...)
	renderData(c, rules, err)
}
// collectRuleGetsByIdent returns the cached collect rules of one resource ident.
func collectRuleGetsByIdent(c *gin.Context) {
	rules := cache.CollectRulesOfIdent.GetBy(queryStr(c, "ident"))
	renderData(c, rules, nil)
}

// Summary condenses a resource's collect-rule set: how many rules exist and
// when the most recently changed one was updated.
type Summary struct {
	LatestUpdatedAt int64 `json:"latest_updated_at"`
	Total           int   `json:"total"`
}

// collectRuleSummaryGetByIdent builds a Summary from the cached rules of one
// ident; an ident without rules yields the zero Summary.
func collectRuleSummaryGetByIdent(c *gin.Context) {
	var summary Summary

	rules := cache.CollectRulesOfIdent.GetBy(queryStr(c, "ident"))
	if len(rules) > 0 {
		summary.Total = len(rules)
		for _, rule := range rules {
			if rule.UpdateAt > summary.LatestUpdatedAt {
				summary.LatestUpdatedAt = rule.UpdateAt
			}
		}
	}

	renderData(c, summary, nil)
}
// RegExpCheck is the response envelope for regExpCheck: Success turns false
// as soon as any pattern fails, and Data carries one {key: message-or-match}
// entry per checked pattern.
type RegExpCheck struct {
	Success bool                `json:"success"`
	Data    []map[string]string `json:"tags"`
}

// regExpCheck validates the regular expressions of a log collect rule
// against a sample log line. The request body is a flat string map with:
//   - "func": the aggregation method (e.g. "histogram") — required;
//   - "re":   the main pattern — required; for histogram it must capture a substring;
//   - "log":  the sample line the patterns are tried on;
//   - any other key: a tag name whose value is a pattern that must capture a
//     substring from the sample line.
// Results are accumulated into a RegExpCheck and always rendered with a nil
// error; per-pattern problems are reported inside Data.
func regExpCheck(c *gin.Context) {
	param := make(map[string]string)
	dangerous(c.ShouldBind(&param))

	ret := &RegExpCheck{
		Success: true,
		Data:    make([]map[string]string, 0),
	}

	calcMethod := param["func"]
	if calcMethod == "" {
		tmp := map[string]string{"func": "is empty"}
		ret.Data = append(ret.Data, tmp)
		renderData(c, ret, nil)
		return
	}

	// validate the main pattern's presence
	if re, ok := param["re"]; !ok || re == "" {
		tmp := map[string]string{"re": "regex does not exist or is empty"}
		ret.Data = append(ret.Data, tmp)
		renderData(c, ret, nil)
		return
	}

	// try the main pattern against the sample line
	suc, reRes, isSub := checkRegex(param["re"], param["log"])
	if !suc {
		ret.Success = false
		reRes = genErrMsg(param["re"])
		ret.Data = append(ret.Data, map[string]string{"re": reRes})
		renderData(c, ret, nil)
		return
	}
	// histogram needs a captured substring (the value to aggregate)
	if calcMethod == "histogram" && !isSub {
		ret.Success = false
		reRes = genSubErrMsg(param["re"])
		ret.Data = append(ret.Data, map[string]string{"re": reRes})
		renderData(c, ret, nil)
		return
	}
	ret.Data = append(ret.Data, map[string]string{"re": reRes})

	// every key other than the reserved ones is treated as a tag pattern
	var nonTagKey = map[string]bool{
		"re":   true,
		"log":  true,
		"func": true,
	}
	for tagk, pat := range param {
		// skip reserved keys; everything else is a tag
		if _, ok := nonTagKey[tagk]; ok {
			continue
		}
		suc, tagRes, isSub := checkRegex(pat, param["log"])
		if !suc {
			// the pattern itself is invalid or did not match
			ret.Success = false
			tagRes = genErrMsg(pat)
		} else if !isSub {
			// matched, but no substring was captured
			ret.Success = false
			tagRes = genSubErrMsg(pat)
		} else if includeIllegalChar(tagRes) || includeIllegalChar(tagk) {
			// reserved characters in the tag key or captured value
			ret.Success = false
			tagRes = genIllegalCharErrMsg()
		}
		tmp := map[string]string{tagk: tagRes}
		ret.Data = append(ret.Data, tmp)
	}
	renderData(c, ret, nil)
}
// checkRegex matches log against the pattern pat and reports:
//   - succ:   whether the pattern compiled and matched at all;
//   - result: the first capture group if present, otherwise the full match;
//   - isSub:  whether a capture group (substring) was extracted.
// An empty or invalid pattern, or no match, yields (false, "", false); the
// error message itself is delivered in the response body by the caller.
func checkRegex(pat string, log string) (succ bool, result string, isSub bool) {
	if pat == "" {
		return false, "", false
	}

	reg, err := regexp.Compile(pat)
	if err != nil {
		return false, "", false
	}

	groups := reg.FindStringSubmatch(log)
	if len(groups) == 0 {
		// no match at all
		return false, "", false
	}
	if len(groups) == 1 {
		// matched, but the pattern has no capture group: return whole match
		return true, groups[0], false
	}
	// matched with capture groups: by convention the first one is used
	return true, groups[1], true
}
// includeIllegalChar reports whether s contains a character reserved by the
// tag encoding: colon, comma, equals, CR, LF, or tab.
func includeIllegalChar(s string) bool {
	const illegalChars = ":,=\r\n\t"
	return strings.ContainsAny(s, illegalChars)
}
// genErrMsg builds the localized "pattern failed to match" message.
func genErrMsg(pattern string) string {
	return _s("Regexp %s matching failed", pattern)
}

// genSubErrMsg builds the localized "matched but no capture group" message.
func genSubErrMsg(pattern string) string {
	return _s("Regexp %s matched, but cannot get substring()", pattern)
}

// genIllegalCharErrMsg builds the localized reserved-character message.
// NOTE(review): the message lists "/" among the illegal characters, but
// includeIllegalChar does not reject "/" — confirm which side is intended.
func genIllegalCharErrMsg() string {
	return _s(`TagKey or TagValue contains illegal characters[:,/=\r\n\t]`)
}

View File

@ -1,244 +0,0 @@
package http
import (
"time"
"github.com/gin-gonic/gin"
"github.com/didi/nightingale/v5/models"
)
// dashboardGets returns a page of dashboards. With onlyfavorite=true only
// the current user's favorites are listed; otherwise every row is flagged
// with whether it is one of the user's favorites.
func dashboardGets(c *gin.Context) {
	limit := queryInt(c, "limit", defaultLimit)
	query := queryStr(c, "query", "")
	onlyfavorite := queryBool(c, "onlyfavorite", false)

	me := loginUser(c)
	favoriteIds, err := me.FavoriteDashboardIds()
	dangerous(err)

	// nothing favorited: the favorites-only view is necessarily empty
	if onlyfavorite && len(favoriteIds) == 0 {
		renderZeroPage(c)
		return
	}

	total, err := models.DashboardTotal(onlyfavorite, favoriteIds, query)
	dangerous(err)

	list, err := models.DashboardGets(onlyfavorite, favoriteIds, query, limit, offset(c, limit))
	dangerous(err)

	for i := range list {
		if onlyfavorite {
			list[i].Favorite = 1
		} else {
			list[i].FillFavorite(favoriteIds)
		}
	}

	renderData(c, gin.H{
		"list":  list,
		"total": total,
	}, nil)
}

// dashboardGet returns one dashboard by the "id" URL parameter.
func dashboardGet(c *gin.Context) {
	renderData(c, Dashboard(urlParamInt64(c, "id")), nil)
}
// dashboardForm is the payload for creating, updating, or cloning a
// dashboard; Id is only consulted by the clone endpoint.
type dashboardForm struct {
	Id      int64  `json:"id"`
	Name    string `json:"name"`
	Tags    string `json:"tags"`
	Configs string `json:"configs"`
}

// dashboardAdd creates a dashboard owned by the current user.
func dashboardAdd(c *gin.Context) {
	var f dashboardForm
	bind(c, &f)

	me := loginUser(c).MustPerm("dashboard_create")

	dashboard := &models.Dashboard{
		Name:     f.Name,
		Tags:     f.Tags,
		Configs:  f.Configs,
		CreateBy: me.Username,
		UpdateBy: me.Username,
	}
	dangerous(dashboard.Add())

	renderData(c, dashboard, nil)
}

// dashboardPut updates a dashboard, refusing a rename that would collide
// with another dashboard's name.
func dashboardPut(c *gin.Context) {
	var f dashboardForm
	bind(c, &f)

	me := loginUser(c).MustPerm("dashboard_modify")
	dashboard := Dashboard(urlParamInt64(c, "id"))

	if dashboard.Name != f.Name {
		num, err := models.DashboardCount("name=? and id<>?", f.Name, dashboard.Id)
		dangerous(err)
		if num > 0 {
			bomb(200, "Dashboard %s already exists", f.Name)
		}
	}

	dashboard.Name = f.Name
	dashboard.Tags = f.Tags
	dashboard.Configs = f.Configs
	dashboard.UpdateAt = time.Now().Unix()
	dashboard.UpdateBy = me.Username
	dangerous(dashboard.Update("name", "tags", "configs", "update_at", "update_by"))

	renderData(c, dashboard, nil)
}
// dashboardClone copies the dashboard identified by f.Id — including its
// chart groups and charts — into a new dashboard owned by the current user.
func dashboardClone(c *gin.Context) {
	var f dashboardForm
	bind(c, &f)

	me := loginUser(c).MustPerm("dashboard_create")

	clone := &models.Dashboard{
		Name:     f.Name,
		Tags:     f.Tags,
		Configs:  f.Configs,
		CreateBy: me.Username,
		UpdateBy: me.Username,
	}
	dangerous(clone.AddOnly())

	chartGroups, err := models.ChartGroupGets(f.Id)
	dangerous(err)

	for _, chartGroup := range chartGroups {
		charts, err := models.ChartGets(chartGroup.Id)
		dangerous(err)

		// re-insert the group under the clone by zeroing its primary key
		chartGroup.DashboardId = clone.Id
		chartGroup.Id = 0
		dangerous(chartGroup.Add())

		for _, chart := range charts {
			chart.Id = 0
			chart.GroupId = chartGroup.Id
			dangerous(chart.Add())
		}
	}

	renderData(c, clone, nil)
}

// dashboardDel deletes the dashboard given by the "id" URL parameter.
func dashboardDel(c *gin.Context) {
	loginUser(c).MustPerm("dashboard_delete")
	renderMessage(c, Dashboard(urlParamInt64(c, "id")).Del())
}

// dashboardFavoriteAdd stars a dashboard for the current user.
func dashboardFavoriteAdd(c *gin.Context) {
	user := loginUser(c)
	dashboard := Dashboard(urlParamInt64(c, "id"))
	renderMessage(c, models.DashboardFavoriteAdd(dashboard.Id, user.Id))
}

// dashboardFavoriteDel unstars a dashboard for the current user.
func dashboardFavoriteDel(c *gin.Context) {
	user := loginUser(c)
	dashboard := Dashboard(urlParamInt64(c, "id"))
	renderMessage(c, models.DashboardFavoriteDel(dashboard.Id, user.Id))
}
// ChartGroupDetail is the export/import representation of a chart group:
// the group's own fields plus the full charts it contains.
type ChartGroupDetail struct {
	Id          int64          `json:"id"`
	DashboardId int64          `json:"dashboard_id"`
	Name        string         `json:"name"`
	Weight      int            `json:"weight"`
	Charts      []models.Chart `json:"charts"`
}

// DashboardDetail is the export/import representation of a dashboard with
// all of its chart groups inlined; ids are omitted on export so that import
// assigns fresh ones.
type DashboardDetail struct {
	Id          int64              `json:"id"`
	Name        string             `json:"name"`
	Tags        string             `json:"tags"`
	Configs     string             `json:"configs"`
	ChartGroups []ChartGroupDetail `json:"chart_groups"`
}
// dashboardExport serializes the dashboards named by ids — including their
// chart groups and charts — into DashboardDetail records.
func dashboardExport(c *gin.Context) {
	var f idsForm
	bind(c, &f)

	dashboards, err := models.DashboardGetsByIds(f.Ids)
	dangerous(err)

	var details []DashboardDetail
	for _, dashboard := range dashboards {
		detail := DashboardDetail{
			Name:    dashboard.Name,
			Tags:    dashboard.Tags,
			Configs: dashboard.Configs,
		}

		chartGroups, err := models.ChartGroupGets(dashboard.Id)
		dangerous(err)

		var groupDetails []ChartGroupDetail
		for _, chartGroup := range chartGroups {
			charts, err := models.ChartGets(chartGroup.Id)
			dangerous(err)

			groupDetails = append(groupDetails, ChartGroupDetail{
				Name:   chartGroup.Name,
				Weight: chartGroup.Weight,
				Charts: charts,
			})
		}

		detail.ChartGroups = groupDetails
		details = append(details, detail)
	}

	renderData(c, details, nil)
}
// dashboardImport recreates dashboards from exported DashboardDetail
// records, owned by the current user; ids are reassigned on insert.
func dashboardImport(c *gin.Context) {
	var details []DashboardDetail
	bind(c, &details)

	me := loginUser(c).MustPerm("dashboard_create")

	for _, detail := range details {
		dashboard := &models.Dashboard{
			Name:     detail.Name,
			Tags:     detail.Tags,
			Configs:  detail.Configs,
			CreateBy: me.Username,
			UpdateBy: me.Username,
		}
		dangerous(dashboard.AddOnly())

		for _, groupDetail := range detail.ChartGroups {
			group := models.ChartGroup{
				DashboardId: dashboard.Id,
				Name:        groupDetail.Name,
				Weight:      groupDetail.Weight,
			}
			dangerous(group.Add())

			for _, chart := range groupDetail.Charts {
				record := models.Chart{
					GroupId: group.Id,
					Configs: chart.Configs,
					Weight:  chart.Weight,
				}
				dangerous(record.Add())
			}
		}
	}

	renderMessage(c, nil)
}

View File

@ -1,56 +0,0 @@
package http
import (
"time"
"github.com/gin-gonic/gin"
"github.com/didi/nightingale/v5/models"
)
// historyAlertEventGets pages through archived alert events. The time window
// comes from stime/etime; a non-zero "hours" overrides both with "the last N
// hours".
func historyAlertEventGets(c *gin.Context) {
	stime := queryInt64(c, "stime", 0)
	etime := queryInt64(c, "etime", 0)
	hours := queryInt64(c, "hours", 0)

	now := time.Now().Unix()
	if hours != 0 {
		stime = now - 3600*hours
		etime = now + 3600*24
	}
	// an open-ended range defaults to "until tomorrow"
	if stime != 0 && etime == 0 {
		etime = now + 3600*24
	}

	query := queryStr(c, "query", "")
	priority := queryInt(c, "priority", -1)
	status := queryInt(c, "status", -1)
	isRecovery := queryInt(c, "is_recovery", -1)
	limit := queryInt(c, "limit", defaultLimit)

	total, err := models.HistoryAlertEventsTotal(stime, etime, query, status, isRecovery, priority)
	dangerous(err)

	list, err := models.HistoryAlertEventGets(stime, etime, query, status, isRecovery, priority, limit, offset(c, limit))
	dangerous(err)

	for i := range list {
		dangerous(list[i].FillObjs())
	}

	if len(list) == 0 {
		renderZeroPage(c)
		return
	}

	renderData(c, map[string]interface{}{
		"total": total,
		"list":  list,
	}, nil)
}

// historyAlertEventGet returns one archived alert event with its related
// objects resolved.
func historyAlertEventGet(c *gin.Context) {
	event := HistoryAlertEvent(urlParamInt64(c, "id"))
	dangerous(event.FillObjs())
	renderData(c, event, nil)
}

View File

@ -1,82 +0,0 @@
package http
import (
"net/http"
"strings"
"github.com/gin-gonic/gin"
"github.com/didi/nightingale/v5/models"
)
// metricDescriptionGets returns a page of metric descriptions matching the
// optional "query" filter, together with the total match count.
func metricDescriptionGets(c *gin.Context) {
	limit := queryInt(c, "limit", defaultLimit)
	query := queryStr(c, "query", "")

	total, err := models.MetricDescriptionTotal(query)
	dangerous(err)

	list, err := models.MetricDescriptionGets(query, limit, offset(c, limit))
	dangerous(err)

	renderData(c, gin.H{
		"list":  list,
		"total": total,
	}, nil)
}
// metricDescriptionFrom carries a newline-separated list of
// "metric:description" lines for bulk import.
type metricDescriptionFrom struct {
	Data string `json:"data"`
}

// metricDescriptionAdd imports metric descriptions in bulk; there is no
// single-item create endpoint. Each input line must look like
// "metric:description". Fix over the previous version: the line is split at
// the FIRST colon only (SplitN), so descriptions may themselves contain
// colons — strings.Split used to reject such lines as illegal.
func metricDescriptionAdd(c *gin.Context) {
	var f metricDescriptionFrom
	var metricDescriptions []models.MetricDescription

	bind(c, &f)

	lines := strings.Split(f.Data, "\n")
	for _, md := range lines {
		arr := strings.SplitN(md, ":", 2)
		if len(arr) != 2 {
			bomb(200, "metric description %s is illegal", md)
		}
		metricDescriptions = append(metricDescriptions, models.MetricDescription{
			Metric:      arr[0],
			Description: arr[1],
		})
	}

	if len(metricDescriptions) == 0 {
		bomb(http.StatusBadRequest, "Decoded metric description empty")
	}

	loginUser(c).MustPerm("metric_description_create")

	renderMessage(c, models.MetricDescriptionUpdate(metricDescriptions))
}
// metricDescriptionDel deletes a batch of metric descriptions by id.
func metricDescriptionDel(c *gin.Context) {
	var f idsForm
	bind(c, &f)

	loginUser(c).MustPerm("metric_description_delete")

	renderMessage(c, models.MetricDescriptionDel(f.Ids))
}

// metricDescriptionForm is the payload for editing one description.
type metricDescriptionForm struct {
	Description string `json:"description"`
}

// metricDescriptionPut overwrites the description text of one metric.
func metricDescriptionPut(c *gin.Context) {
	var f metricDescriptionForm
	bind(c, &f)

	loginUser(c).MustPerm("metric_description_modify")

	md := MetricDescription(urlParamInt64(c, "id"))
	md.Description = f.Description
	renderMessage(c, md.Update("description"))
}

View File

@ -1,62 +0,0 @@
package http
import (
"github.com/gin-gonic/gin"
"github.com/didi/nightingale/v5/models"
)
// muteGets returns a page of mute rules matching the optional "query"
// filter, together with the total match count.
func muteGets(c *gin.Context) {
	limit := queryInt(c, "limit", defaultLimit)
	query := queryStr(c, "query", "")

	total, err := models.MuteTotal(query)
	dangerous(err)

	list, err := models.MuteGets(query, limit, offset(c, limit))
	dangerous(err)

	renderData(c, gin.H{
		"list":  list,
		"total": total,
	}, nil)
}
type muteForm struct {
ClasspathPrefix string `json:"classpath_prefix "`
Metric string `json:"metric"`
ResFilters string `json:"res_filters"`
TagFilters string `json:"tags_filters"`
Cause string `json:"cause"`
Btime int64 `json:"btime"`
Etime int64 `json:"etime"`
}
// muteAdd creates a mute rule recorded as created by the current user.
func muteAdd(c *gin.Context) {
	var f muteForm
	bind(c, &f)

	me := loginUser(c).MustPerm("mute_create")

	mute := models.Mute{
		ClasspathPrefix: f.ClasspathPrefix,
		Metric:          f.Metric,
		ResFilters:      f.ResFilters,
		TagFilters:      f.TagFilters,
		Cause:           f.Cause,
		Btime:           f.Btime,
		Etime:           f.Etime,
		CreateBy:        me.Username,
	}
	renderMessage(c, mute.Add())
}

// muteGet returns one mute rule by the "id" URL parameter.
func muteGet(c *gin.Context) {
	renderData(c, Mute(urlParamInt64(c, "id")), nil)
}

// muteDel deletes one mute rule by the "id" URL parameter.
func muteDel(c *gin.Context) {
	loginUser(c).MustPerm("mute_delete")
	renderMessage(c, Mute(urlParamInt64(c, "id")).Del())
}

View File

@ -1,24 +0,0 @@
package http
import (
"github.com/gin-gonic/gin"
"github.com/prometheus/prometheus/promql/parser"
"github.com/didi/nightingale/v5/vos"
)
// checkPromeQl reports whether the "promql" query parameter parses as a
// valid PromQL expression; on failure the parse error text is included.
func checkPromeQl(c *gin.Context) {
	_, err := parser.ParseExpr(c.Query("promql"))

	resp := &vos.PromQlCheckResp{QlCorrect: err == nil}
	if err != nil {
		resp.ParseError = err.Error()
	}

	renderData(c, resp, nil)
}

View File

@ -1,190 +0,0 @@
package http
import (
"net/http"
"strings"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/str"
"github.com/didi/nightingale/v5/models"
)
// classpathGetsResources pages the resources bound to one classpath; with
// prefix=1 the resources of every classpath sharing the same path prefix
// are included as well.
func classpathGetsResources(c *gin.Context) {
	limit := queryInt(c, "limit", defaultLimit)
	prefix := queryInt(c, "prefix", 0)
	query := queryStr(c, "query", "")

	cp := Classpath(urlParamInt64(c, "id"))

	var classpathIds []int64
	if prefix == 1 {
		cps, err := models.ClasspathGetsByPrefix(cp.Path)
		dangerous(err)
		for i := range cps {
			classpathIds = append(classpathIds, cps[i].Id)
		}
	} else {
		classpathIds = []int64{cp.Id}
	}

	total, err := models.ResourceTotalByClasspathId(classpathIds, query)
	dangerous(err)

	reses, err := models.ResourceGetsByClasspathId(classpathIds, query, limit, offset(c, limit))
	dangerous(err)

	renderData(c, gin.H{
		"classpath": cp,
		"list":      reses,
		"total":     total,
	}, nil)
}
// resourcesQuery pages resources filtered by a resource query (qres) and an
// optional comma-separated list of classpath ids (qpaths).
func resourcesQuery(c *gin.Context) {
	limit := queryInt(c, "limit", defaultLimit)
	qres := queryStr(c, "qres", "")
	// qpaths may select several classpaths at once
	qpaths := str.IdsInt64(queryStr(c, "qpaths", ""))

	total, err := models.ResourceTotalByClasspathQuery(qpaths, qres)
	dangerous(err)

	reses, err := models.ResourceGetsByClasspathQuery(qpaths, qres, limit, offset(c, limit))
	dangerous(err)

	if len(reses) == 0 {
		renderZeroPage(c)
		return
	}

	renderData(c, gin.H{
		"list":  reses,
		"total": total,
	}, nil)
}

// resourceGet returns one resource by the "id" URL parameter.
func resourceGet(c *gin.Context) {
	renderData(c, Resource(urlParamInt64(c, "id")), nil)
}

// resourceDel deletes one resource. The deletion result is rendered as the
// data payload, mirroring the endpoint's existing contract.
func resourceDel(c *gin.Context) {
	loginUser(c).MustPerm("resource_modify")
	renderData(c, Resource(urlParamInt64(c, "id")).Del(), nil)
}
// resourceNoteForm carries a note to apply to a batch of resources.
type resourceNoteForm struct {
	Ids  []int64 `json:"ids"`
	Note string  `json:"note"`
}

// resourceNotePut sets the note on a batch of resources (hosts/devices).
func resourceNotePut(c *gin.Context) {
	var f resourceNoteForm
	bind(c, &f)

	if len(f.Ids) == 0 {
		bomb(http.StatusBadRequest, "ids is empty")
	}

	loginUser(c).MustPerm("resource_modify")

	renderMessage(c, models.ResourceUpdateNote(f.Ids, f.Note))
}

// resourceTagsForm carries tags to apply to a batch of resources.
type resourceTagsForm struct {
	Ids  []int64 `json:"ids"`
	Tags string  `json:"tags"`
}

// resourceTagsPut sets the tags on a batch of resources.
func resourceTagsPut(c *gin.Context) {
	var f resourceTagsForm
	bind(c, &f)

	if len(f.Ids) == 0 {
		bomb(http.StatusBadRequest, "ids is empty")
	}

	loginUser(c).MustPerm("resource_modify")

	renderMessage(c, models.ResourceUpdateTags(f.Ids, f.Tags))
}

// resourceMuteForm carries a mute window to apply to a batch of resources.
type resourceMuteForm struct {
	Ids   []int64 `json:"ids"`
	Btime int64   `json:"btime"`
	Etime int64   `json:"etime"`
}

// resourceMutePut sets the mute window [Btime, Etime] on a batch of resources.
func resourceMutePut(c *gin.Context) {
	var f resourceMuteForm
	bind(c, &f)

	if len(f.Ids) == 0 {
		bomb(http.StatusBadRequest, "ids is empty")
	}

	loginUser(c).MustPerm("resource_modify")

	renderMessage(c, models.ResourceUpdateMute(f.Ids, f.Btime, f.Etime))
}
// resourceClasspathsForm rebinds a set of resources (by ident) to exactly
// the given classpath ids.
type resourceClasspathsForm struct {
	ResIdents    []string `json:"res_idents"`
	ClasspathIds []int64  `json:"classpath_ids"`
}

// resourceClasspathsPut reconciles classpath membership for the given
// resources: bindings present in the request but missing from the database
// are added, stale ones are removed (except the preset classpath, id 1).
func resourceClasspathsPut(c *gin.Context) {
	var f resourceClasspathsForm
	existing := make(map[string]map[int64]struct{}) // current bindings loaded from the database
	toAdd := make(map[string][]int64)

	bind(c, &f)
	loginUser(c).MustPerm("resource_modify")

	// Fixed: the idents were previously joined with a bare comma, producing
	// res_ident in ("a,b") — a single bogus quoted value — instead of one
	// quoted value per ident: res_ident in ("a","b").
	sql := "res_ident in (\"" + strings.Join(f.ResIdents, "\",\"") + "\")"
	oldClasspathResources, err := models.ClasspathResourceGets(sql)
	dangerous(err)

	for _, obj := range oldClasspathResources {
		if _, ok := existing[obj.ResIdent]; !ok {
			existing[obj.ResIdent] = make(map[int64]struct{})
		}
		existing[obj.ResIdent][obj.ClasspathId] = struct{}{}
	}

	for _, ident := range f.ResIdents {
		toAdd[ident] = []int64{}
		if _, ok := existing[ident]; !ok {
			// nothing bound yet: every requested classpath must be added
			toAdd[ident] = f.ClasspathIds
			continue
		}
		for _, classpathId := range f.ClasspathIds {
			if _, ok := existing[ident][classpathId]; ok {
				// already bound: keep it, and drop it from the stale set so
				// the deletion pass below leaves it alone
				delete(existing[ident], classpathId)
			} else {
				toAdd[ident] = append(toAdd[ident], classpathId)
			}
		}
	}

	// whatever remains in `existing` is stale — delete it, but never unbind
	// the preset classpath (id 1)
	for ident := range existing {
		for classpathId := range existing[ident] {
			if classpathId == 1 {
				continue
			}
			dangerous(models.ClasspathResourceDel(classpathId, []string{ident}))
		}
	}

	// add the bindings the database was missing
	for ident, cids := range toAdd {
		for _, cid := range cids {
			dangerous(models.ClasspathResourceAdd(cid, ident))
		}
	}

	renderMessage(c, nil)
}

View File

@ -1,12 +0,0 @@
package http
import (
"github.com/gin-gonic/gin"
"github.com/didi/nightingale/v5/models"
)
// rolesGet lists every role defined in the system.
func rolesGet(c *gin.Context) {
	roles, err := models.RoleGetsAll()
	renderData(c, roles, err)
}

View File

@ -1,58 +0,0 @@
package http
import (
"encoding/json"
"time"
"github.com/gin-gonic/gin"
)
// selfProfileGet returns the profile of the logged-in user.
func selfProfileGet(c *gin.Context) {
	renderData(c, loginUser(c), nil)
}

// selfProfileForm is the payload for editing one's own profile.
type selfProfileForm struct {
	Nickname string          `json:"nickname"`
	Phone    string          `json:"phone"`
	Email    string          `json:"email"`
	Portrait string          `json:"portrait"`
	Contacts json.RawMessage `json:"contacts"`
}

// selfProfilePut overwrites the editable profile fields of the current user
// and stamps the audit columns.
func selfProfilePut(c *gin.Context) {
	var f selfProfileForm
	bind(c, &f)

	user := loginUser(c)
	user.Nickname = f.Nickname
	user.Phone = f.Phone
	user.Email = f.Email
	user.Portrait = f.Portrait
	user.Contacts = f.Contacts
	user.UpdateAt = time.Now().Unix()
	user.UpdateBy = user.Username

	err := user.Update(
		"nickname",
		"phone",
		"email",
		"portrait",
		"contacts",
		"update_at",
		"update_by",
	)
	renderMessage(c, err)
}

// selfPasswordForm carries the old and new password for a self-service change.
type selfPasswordForm struct {
	OldPass string `json:"oldpass" binding:"required"`
	NewPass string `json:"newpass" binding:"required"`
}

// selfPasswordPut changes the current user's password after verifying the
// old one.
func selfPasswordPut(c *gin.Context) {
	var f selfPasswordForm
	bind(c, &f)
	renderMessage(c, loginUser(c).ChangePassword(f.OldPass, f.NewPass))
}

View File

@ -1,42 +0,0 @@
package http
import (
"time"
"github.com/didi/nightingale/v5/models"
"github.com/gin-gonic/gin"
)
// Status reports site-wide totals — users, user groups, resources, alert
// rules, dashboards — plus alert-event counts for the last day/week/month.
func Status(c *gin.Context) {
	var err error
	data := make(map[string]int64)

	data["user_total"], err = models.UserTotal("")
	dangerous(err)

	data["user_group_total"], err = models.UserGroupTotal("")
	dangerous(err)

	data["resource_total"], err = models.ResourceTotal("")
	dangerous(err)

	data["alert_rule_total"], err = models.AlertRuleTotal("")
	dangerous(err)

	data["dashboard_total"], err = models.DashboardCount("")
	dangerous(err)

	// event counts over three trailing windows ending now
	now := time.Now().Unix()
	windows := []struct {
		key     string
		seconds int64
	}{
		{"event_total_day", 24 * 3600},
		{"event_total_week", 7 * 24 * 3600},
		{"event_total_month", 30 * 24 * 3600},
	}
	for _, w := range windows {
		data[w.key], err = models.AlertEventTotal(now-w.seconds, now, "", -1, -1)
		dangerous(err)
	}

	renderData(c, data, nil)
}

View File

@ -1,32 +0,0 @@
package http
import (
"github.com/gin-gonic/gin"
"github.com/didi/nightingale/v5/models"
)
// selfTokenGets lists the API tokens of the current user.
func selfTokenGets(c *gin.Context) {
	tokens, err := models.UserTokenGets("user_id=?", loginUser(c).Id)
	renderData(c, tokens, err)
}

// selfTokenPost creates a fresh API token for the current user.
func selfTokenPost(c *gin.Context) {
	user := loginUser(c)
	token, err := models.UserTokenNew(user.Id, user.Username)
	renderData(c, token, err)
}

// selfTokenForm names the token to operate on.
type selfTokenForm struct {
	Token string `json:"token"`
}

// selfTokenPut regenerates (resets) one of the current user's tokens.
func selfTokenPut(c *gin.Context) {
	user := loginUser(c)

	var f selfTokenForm
	bind(c, &f)

	token, err := models.UserTokenReset(user.Id, f.Token)
	renderData(c, token, err)
}

View File

@ -1,58 +0,0 @@
package http
import (
"encoding/json"
"io/ioutil"
"net/http"
"path"
"github.com/didi/nightingale/v5/config"
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/file"
)
// tplNameGets lists template file names of the requested type
// ("alert_rule" or "dashboard"); any other type is a 400.
func tplNameGets(c *gin.Context) {
	var (
		files []string
		err   error
	)

	switch queryStr(c, "tpl_type") {
	case "alert_rule":
		files, err = file.FilesUnder(config.Config.Tpl.AlertRulePath)
		dangerous(err)
	case "dashboard":
		files, err = file.FilesUnder(config.Config.Tpl.DashboardPath)
		dangerous(err)
	default:
		bomb(http.StatusBadRequest, "tpl type not found")
	}

	renderData(c, files, err)
}

// tplGet returns the JSON content of one template file. path.Base strips
// any directory components from the requested name, which blocks path
// traversal out of the template directories.
func tplGet(c *gin.Context) {
	tplName := path.Base(queryStr(c, "tpl_name"))

	var filePath string
	switch queryStr(c, "tpl_type") {
	case "alert_rule":
		filePath = config.Config.Tpl.AlertRulePath + "/" + tplName
	case "dashboard":
		filePath = config.Config.Tpl.DashboardPath + "/" + tplName
	default:
		bomb(http.StatusBadRequest, "tpl type not found")
	}

	if !file.IsExist(filePath) {
		bomb(http.StatusBadRequest, "tpl not found")
	}

	raw, err := ioutil.ReadFile(filePath)
	dangerous(err)

	var content interface{}
	err = json.Unmarshal(raw, &content)
	renderData(c, content, err)
}

View File

@ -1,221 +0,0 @@
package http
import (
"compress/gzip"
"compress/zlib"
"errors"
"fmt"
"io/ioutil"
"github.com/didi/nightingale/v5/backend"
"github.com/didi/nightingale/v5/cache"
"github.com/didi/nightingale/v5/trans"
"github.com/didi/nightingale/v5/vos"
"github.com/gin-gonic/gin"
agentpayload "github.com/n9e/agent-payload/gogen"
"github.com/toolkits/pkg/logger"
)
// PushSeries ingests agent metric payloads (protobuf, optionally gzip or
// deflate compressed). By convention with the client, errors are also
// answered with HTTP 200: the agent retries whenever the response code is
// not 200, so error details travel in the body instead.
func PushSeries(c *gin.Context) {
	req := agentpayload.N9EMetricsPayload{}

	r := c.Request
	reader := r.Body
	var err error

	// transparently decompress gzip/deflate request bodies
	if encoding := r.Header.Get("Content-Encoding"); encoding == "gzip" {
		if reader, err = gzip.NewReader(r.Body); err != nil {
			message := fmt.Sprintf("error: get gzip reader occur error: %v", err)
			logger.Warning(message)
			c.String(200, message)
			return
		}
		defer reader.Close()
	} else if encoding == "deflate" {
		if reader, err = zlib.NewReader(r.Body); err != nil {
			message := fmt.Sprintf("error: get zlib reader occur error: %v", err)
			logger.Warning(message)
			c.String(200, message)
			return
		}
		defer reader.Close()
	}

	b, err := ioutil.ReadAll(reader)
	if err != nil {
		message := fmt.Sprintf("error: ioutil occur error: %v", err)
		logger.Warning(message)
		c.String(200, message)
		return
	}

	contentType := r.Header.Get("Content-Type")
	if contentType != "application/x-protobuf" {
		// Fixed: both the log line and the response used a %s verb without a
		// matching argument, emitting "%!s(MISSING)" instead of the actual
		// Content-Type value.
		logger.Warningf("error: trans.push %+v Content-Type(%s) not equals application/x-protobuf", req.Samples, contentType)
		c.String(200, "error: Content-Type(%s) not equals application/x-protobuf", contentType)
		return
	}

	if err := req.Unmarshal(b); err != nil {
		message := fmt.Sprintf("error: decode protobuf body occur error: %v", err)
		logger.Warning(message)
		c.String(200, message)
		return
	}

	count := len(req.Samples)
	if count == 0 {
		c.String(200, "error: samples is empty")
		return
	}

	metricPoints := make([]*vos.MetricPoint, 0, count)
	for i := 0; i < count; i++ {
		logger.Debugf("recv %v", req.Samples[i])
		metricPoints = append(metricPoints, convertAgentdPoint(req.Samples[i]))
	}

	if err = trans.Push(metricPoints); err != nil {
		logger.Warningf("error: trans.push %+v err:%v", req.Samples, err)
		c.String(200, "error: "+err.Error())
		return
	}

	c.String(200, "success: received %d points", len(metricPoints))
}
// convertAgentdPoint maps one protobuf sample from the agent payload onto
// the internal vos.MetricPoint representation (field-for-field copy, no
// validation).
func convertAgentdPoint(obj *agentpayload.N9EMetricsPayload_Sample) *vos.MetricPoint {
	p := new(vos.MetricPoint)
	p.Metric = obj.Metric
	p.Ident = obj.Ident
	p.Alias = obj.Alias
	p.TagsMap = obj.Tags
	p.Time = obj.Time
	p.ValueUntyped = obj.Value
	return p
}
// PushData ingests metric points posted as a JSON array and forwards them
// to trans.Push. Errors are reported with HTTP 200 by contract; the client
// retries on any non-200 code.
func PushData(c *gin.Context) {
	var points []*vos.MetricPoint
	if err := c.ShouldBindJSON(&points); err != nil {
		message := fmt.Sprintf("error: decode json body occur error: %v", err)
		logger.Warning(message)
		c.String(200, message)
		return
	}

	if err := trans.Push(points); err != nil {
		c.String(200, "error: "+err.Error())
		return
	}
	c.String(200, "success")
}
// GetTagKeys returns the tag keys known to the default datasource for the
// bound query parameters.
func GetTagKeys(c *gin.Context) {
	var recv vos.CommonTagQueryParam
	dangerous(c.ShouldBindJSON(&recv))

	dataSource, err := backend.GetDataSourceFor("")
	if err != nil {
		logger.Warningf("could not find datasource")
		renderMessage(c, err)
		return
	}

	renderData(c, dataSource.QueryTagKeys(recv), nil)
}
// GetTagValues returns the values of one tag key from the default
// datasource. tag_key is required; the check now runs before the datasource
// lookup so malformed requests are rejected without touching the backend.
func GetTagValues(c *gin.Context) {
	recv := vos.CommonTagQueryParam{}
	dangerous(c.ShouldBindJSON(&recv))

	if recv.TagKey == "" {
		renderMessage(c, errors.New("missing tag_key"))
		return
	}

	dataSource, err := backend.GetDataSourceFor("")
	if err != nil {
		logger.Warningf("could not find datasource")
		renderMessage(c, err)
		return
	}

	resp := dataSource.QueryTagValues(recv)
	renderData(c, resp, nil)
}
// GetMetrics lists metric names matching the query and decorates each with
// its cached description (empty string when none is registered).
func GetMetrics(c *gin.Context) {
	var recv vos.MetricQueryParam
	dangerous(c.ShouldBindJSON(&recv))

	dataSource, err := backend.GetDataSourceFor("")
	if err != nil {
		logger.Warningf("could not find datasource")
		renderMessage(c, err)
		return
	}

	resp := dataSource.QueryMetrics(recv)
	logger.Debugf("[GetMetrics][recv:%+v][resp:%+v]", recv, resp)

	res := &vos.MetricDesQueryResp{
		Metrics: make([]vos.MetricsWithDescription, 0, len(resp.Metrics)),
	}
	for _, name := range resp.Metrics {
		item := vos.MetricsWithDescription{Name: name}
		if desc, ok := cache.MetricDescMapper.Get(name); ok {
			item.Description = desc.(string)
		}
		res.Metrics = append(res.Metrics, item)
	}

	renderData(c, res, nil)
}
// GetTagPairs returns tag key/value pairs from the default datasource for
// the bound query parameters.
func GetTagPairs(c *gin.Context) {
	var recv vos.CommonTagQueryParam
	dangerous(c.ShouldBindJSON(&recv))

	dataSource, err := backend.GetDataSourceFor("")
	if err != nil {
		logger.Warningf("could not find datasource")
		renderMessage(c, err)
		return
	}

	renderData(c, dataSource.QueryTagPairs(recv), nil)
}
// GetData runs a range query against the default datasource.
func GetData(c *gin.Context) {
	dataSource, err := backend.GetDataSourceFor("")
	if err != nil {
		logger.Warningf("could not find datasource")
		renderMessage(c, err)
		return
	}

	var input vos.DataQueryParam
	dangerous(c.ShouldBindJSON(&input))

	renderData(c, dataSource.QueryData(input), nil)
}
// GetDataInstant runs an instant query (single PromQL expression) against
// the default datasource.
func GetDataInstant(c *gin.Context) {
	dataSource, err := backend.GetDataSourceFor("")
	if err != nil {
		logger.Warningf("could not find datasource")
		renderMessage(c, err)
		return
	}

	var input vos.DataQueryInstantParam
	dangerous(c.ShouldBindJSON(&input))

	renderData(c, dataSource.QueryDataInstant(input.PromeQl), nil)
}

View File

@ -1,197 +0,0 @@
package http
import (
"encoding/json"
"strings"
"time"
"github.com/gin-gonic/gin"
"github.com/didi/nightingale/v5/config"
"github.com/didi/nightingale/v5/models"
)
// userGets lists users matching an optional query string, paginated, and
// also reports whether the requesting user holds the Admin role.
func userGets(c *gin.Context) {
	limit := queryInt(c, "limit", defaultLimit)
	query := queryStr(c, "query", "")

	total, err := models.UserTotal(query)
	dangerous(err)

	list, err := models.UserGets(query, limit, offset(c, limit))
	dangerous(err)

	// RolesForDB stores roles space-separated; scan for "Admin".
	admin := false
	for _, role := range strings.Fields(loginUser(c).RolesForDB) {
		if role == "Admin" {
			admin = true
			break
		}
	}

	renderData(c, gin.H{
		"list":  list,
		"total": total,
		"admin": admin,
	}, nil)
}
// userAddForm is the JSON payload accepted by userAddPost for creating a user.
type userAddForm struct {
	Username string `json:"username" binding:"required"` // login name
	Password string `json:"password" binding:"required"` // plaintext; hashed via models.CryptoPass before storage
	Nickname string `json:"nickname"`
	Phone    string `json:"phone"`
	Email    string `json:"email"`
	Portrait string `json:"portrait"` // presumably an avatar URL/path — TODO confirm
	Roles []string `json:"roles"` // must be non-empty; stored space-joined in User.RolesForDB
	Contacts json.RawMessage `json:"contacts"` // opaque contact settings, stored as-is
}
// userAddPost creates a new user from the posted form. The password is
// stored hashed (models.CryptoPass) and roles are serialized
// space-separated into RolesForDB.
func userAddPost(c *gin.Context) {
	var f userAddForm
	bind(c, &f)

	// Validate before doing any work: a user must carry at least one role
	// (the original hashed the password first, wasting the crypto work on
	// requests that were going to be rejected anyway).
	if len(f.Roles) == 0 {
		bomb(200, "roles empty")
	}

	password, err := models.CryptoPass(f.Password)
	dangerous(err)

	now := time.Now().Unix()
	username := loginUsername(c)

	u := models.User{
		Username:   f.Username,
		Password:   password,
		Nickname:   f.Nickname,
		Phone:      f.Phone,
		Email:      f.Email,
		Portrait:   f.Portrait,
		RolesForDB: strings.Join(f.Roles, " "),
		Contacts:   f.Contacts,
		CreateAt:   now,
		UpdateAt:   now,
		CreateBy:   username,
		UpdateBy:   username,
	}

	renderMessage(c, u.Add())
}
// userProfileGet returns the profile of the user addressed by the :id URL param.
func userProfileGet(c *gin.Context) {
	renderData(c, User(urlParamInt64(c, "id")), nil)
}
// userProfileForm is the JSON payload accepted by userProfilePut for
// updating a user's profile (the password has its own endpoint).
type userProfileForm struct {
	Nickname string `json:"nickname"`
	Phone    string `json:"phone"`
	Email    string `json:"email"`
	Portrait string `json:"portrait"` // presumably an avatar URL/path — TODO confirm
	Roles []string `json:"roles"` // must be non-empty; stored space-joined in User.RolesForDB
	Status int `json:"status"` // account status flag; semantics defined by models.User
	Contacts json.RawMessage `json:"contacts"` // opaque contact settings, stored as-is
}
// userProfilePut overwrites the profile fields of the user addressed by the
// :id URL param with the posted form. Roles must be non-empty.
func userProfilePut(c *gin.Context) {
	var f userProfileForm
	bind(c, &f)

	if len(f.Roles) == 0 {
		bomb(200, "roles empty")
	}

	target := User(urlParamInt64(c, "id"))

	target.Nickname = f.Nickname
	target.Phone = f.Phone
	target.Email = f.Email
	target.Portrait = f.Portrait
	target.RolesForDB = strings.Join(f.Roles, " ")
	target.Status = f.Status
	target.Contacts = f.Contacts
	target.UpdateAt = time.Now().Unix()
	target.UpdateBy = loginUsername(c)

	// Persist exactly the columns touched above.
	err := target.Update(
		"nickname",
		"phone",
		"email",
		"portrait",
		"roles",
		"status",
		"contacts",
		"update_at",
		"update_by",
	)
	renderMessage(c, err)
}
// userPasswordForm is the JSON payload accepted by userPasswordPut.
type userPasswordForm struct {
	Password string `json:"password" binding:"required"` // new plaintext password; hashed before storage
}
// userPasswordPut resets the password of the user addressed by the :id URL
// param, storing it hashed.
func userPasswordPut(c *gin.Context) {
	var f userPasswordForm
	bind(c, &f)

	target := User(urlParamInt64(c, "id"))

	cryptoPass, err := models.CryptoPass(f.Password)
	dangerous(err)

	target.Password = cryptoPass
	target.UpdateAt = time.Now().Unix()
	target.UpdateBy = loginUsername(c)

	renderMessage(c, target.Update("password", "update_at", "update_by"))
}
// userStatusForm is the JSON payload accepted by userStatusPut.
type userStatusForm struct {
	Status int `json:"status"` // new account status flag; semantics defined by models.User
}
// userStatusPut updates only the status flag of the user addressed by the
// :id URL param.
func userStatusPut(c *gin.Context) {
	var f userStatusForm
	bind(c, &f)

	target := User(urlParamInt64(c, "id"))
	target.Status = f.Status
	target.UpdateAt = time.Now().Unix()
	target.UpdateBy = loginUsername(c)

	renderMessage(c, target.Update("status", "update_at", "update_by"))
}
// userDel deletes the user addressed by the :id URL param. Deleting an
// already-absent user succeeds (idempotent).
func userDel(c *gin.Context) {
	target, err := models.UserGet("id=?", urlParamInt64(c, "id"))
	dangerous(err)

	if target == nil {
		renderMessage(c, nil)
		return
	}

	renderMessage(c, target.Del())
}
// contactChannelsGet returns the contact channel keys configured in
// config.Config.ContactKeys.
func contactChannelsGet(c *gin.Context) {
	renderData(c, config.Config.ContactKeys, nil)
}
// getUserByName looks a user up by the "name" query parameter (username).
func getUserByName(c *gin.Context) {
	user, err := models.UserGetByUsername(queryStr(c, "name"))
	renderData(c, user, err)
}
// getUserByToken resolves an API token (query param "token") to its owning
// user. An unknown token yields an empty success response.
func getUserByToken(c *gin.Context) {
	userToken, err := models.UserTokenGet("token=?", queryStr(c, "token"))
	dangerous(err)

	if userToken == nil {
		// Unknown token: not an error, just no user.
		renderMessage(c, nil)
		return
	}

	user, err := models.UserGetByUsername(userToken.Username)
	renderData(c, user, err)
}

View File

@ -1,173 +0,0 @@
package http
import (
"net/http"
"time"
"github.com/gin-gonic/gin"
"github.com/didi/nightingale/v5/models"
)
// userGroupListGet lists user groups matching an optional query, paginated.
func userGroupListGet(c *gin.Context) {
	limit := queryInt(c, "limit", defaultLimit)
	query := queryStr(c, "query", "")

	total, err := models.UserGroupTotal(query)
	dangerous(err)

	list, err := models.UserGroupGets(query, limit, offset(c, limit))
	dangerous(err)

	renderData(c, gin.H{
		"list":  list,
		"total": total,
	}, nil)
}
// Groups related to me: ones I created or ones I am a member of.
// The volume is small, so searching and paging are left to the frontend;
// doing that in the backend would be more trouble than it is worth.
func userGroupMineGet(c *gin.Context) {
	list, err := loginUser(c).MyUserGroups()
	renderData(c, list, err)
}
// userGroupForm is the JSON payload shared by userGroupAdd and userGroupPut.
type userGroupForm struct {
	Name string `json:"name"` // group display name; must be unique (checked on rename in userGroupPut)
	Note string `json:"note"` // free-form description
}
// userGroupAdd creates a user group owned by the requester and adds the
// creator as its first member.
func userGroupAdd(c *gin.Context) {
	var f userGroupForm
	bind(c, &f)

	me := loginUser(c)

	ug := models.UserGroup{
		Name:     f.Name,
		Note:     f.Note,
		CreateBy: me.Username,
		UpdateBy: me.Username,
	}
	dangerous(ug.Add())

	// Best effort: if adding the creator as a member fails it is fine,
	// the user can re-add members later.
	models.UserGroupMemberAdd(ug.Id, me.Id)

	renderData(c, ug.Id, nil)
}
// userGroupPut renames/re-notes a user group. Only users allowed to modify
// the group may do so, and a rename must not collide with another group.
func userGroupPut(c *gin.Context) {
	var f userGroupForm
	bind(c, &f)

	me := loginUser(c)
	ug := UserGroup(urlParamInt64(c, "id"))

	can, err := me.CanModifyUserGroup(ug)
	dangerous(err)
	if !can {
		bomb(http.StatusForbidden, "forbidden")
	}

	if ug.Name != f.Name {
		// Name changed: refuse a name already used by a different group.
		num, err := models.UserGroupCount("name=? and id<>?", f.Name, ug.Id)
		dangerous(err)
		if num > 0 {
			bomb(200, "UserGroup %s already exists", f.Name)
		}
	}

	ug.Name = f.Name
	ug.Note = f.Note
	ug.UpdateBy = me.Username
	ug.UpdateAt = time.Now().Unix()

	renderMessage(c, ug.Update("name", "note", "update_at", "update_by"))
}
// userGroupGet returns the group itself plus all of its members. Member
// counts are small, so pagination and filtering are left to the frontend.
func userGroupGet(c *gin.Context) {
	ug := UserGroup(urlParamInt64(c, "id"))

	ids, err := ug.MemberIds()
	dangerous(err)

	users, err := models.UserGetsByIds(ids)

	renderData(c, gin.H{
		"users":      users,
		"user_group": ug,
	}, err)
}
// userGroupMemberAdd adds the posted user ids to the group. Only users
// allowed to modify the group may do so.
func userGroupMemberAdd(c *gin.Context) {
	var f idsForm
	bind(c, &f)
	f.Validate()

	me := loginUser(c)
	ug := UserGroup(urlParamInt64(c, "id"))

	can, err := me.CanModifyUserGroup(ug)
	dangerous(err)
	if !can {
		bomb(http.StatusForbidden, "forbidden")
	}

	dangerous(ug.AddMembers(f.Ids))

	// Membership changed, which counts as updating the group; a failure of
	// this bookkeeping write is deliberately ignored (not critical).
	ug.UpdateAt = time.Now().Unix()
	ug.UpdateBy = me.Username
	ug.Update("update_at", "update_by")

	renderMessage(c, nil)
}
// userGroupMemberDel removes the posted user ids from the group. Only users
// allowed to modify the group may do so.
func userGroupMemberDel(c *gin.Context) {
	var f idsForm
	bind(c, &f)
	f.Validate()

	me := loginUser(c)
	ug := UserGroup(urlParamInt64(c, "id"))

	can, err := me.CanModifyUserGroup(ug)
	dangerous(err)
	if !can {
		bomb(http.StatusForbidden, "forbidden")
	}

	dangerous(ug.DelMembers(f.Ids))

	// Membership changed, which counts as updating the group; a failure of
	// this bookkeeping write is deliberately ignored (not critical).
	ug.UpdateAt = time.Now().Unix()
	ug.UpdateBy = me.Username
	ug.Update("update_at", "update_by")

	renderMessage(c, nil)
}
// userGroupDel deletes the group addressed by the :id URL param, provided
// the requester is allowed to modify it.
func userGroupDel(c *gin.Context) {
	me := loginUser(c)
	ug := UserGroup(urlParamInt64(c, "id"))

	can, err := me.CanModifyUserGroup(ug)
	dangerous(err)
	if !can {
		bomb(http.StatusForbidden, "forbidden")
	}

	renderMessage(c, ug.Del())
}

View File

@ -1,426 +0,0 @@
// Copyright 2017 Xiaomi, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package judge
import (
"fmt"
"math"
"github.com/didi/nightingale/v5/vos"
)
// Function evaluates a window of history points against an alert condition.
// Compute returns the value that was compared (leftValue) and whether the
// condition fired (isTriggered).
type Function interface {
	Compute(vs []*vos.HPoint) (leftValue vos.JsonFloat, isTriggered bool)
}
// MaxFunction triggers when the maximum of the window satisfies
// Operator/RightValue.
type MaxFunction struct {
	Function
	Limit      int
	Operator   string
	RightValue float64
}

// Compute returns the window maximum and whether it crosses the threshold.
func (f MaxFunction) Compute(vs []*vos.HPoint) (leftValue vos.JsonFloat, isTriggered bool) {
	if len(vs) == 0 {
		return
	}

	maxVal := vs[0].Value
	for _, p := range vs[1:] {
		if p.Value > maxVal {
			maxVal = p.Value
		}
	}

	leftValue = maxVal
	isTriggered = checkIsTriggered(maxVal, f.Operator, f.RightValue)
	return
}
// MinFunction triggers when the minimum of the window satisfies
// Operator/RightValue.
type MinFunction struct {
	Function
	Limit      int
	Operator   string
	RightValue float64
}

// Compute returns the window minimum and whether it crosses the threshold.
func (f MinFunction) Compute(vs []*vos.HPoint) (leftValue vos.JsonFloat, isTriggered bool) {
	if len(vs) == 0 {
		return
	}

	minVal := vs[0].Value
	for _, p := range vs[1:] {
		if p.Value < minVal {
			minVal = p.Value
		}
	}

	leftValue = minVal
	isTriggered = checkIsTriggered(minVal, f.Operator, f.RightValue)
	return
}
// AllFunction triggers only when every point in the window satisfies
// Operator/RightValue.
type AllFunction struct {
	Function
	Limit      int
	Operator   string
	RightValue float64
}

// Compute reports whether all points trigger; leftValue is the newest point.
func (f AllFunction) Compute(vs []*vos.HPoint) (leftValue vos.JsonFloat, isTriggered bool) {
	if len(vs) == 0 {
		return
	}

	isTriggered = true
	for _, p := range vs {
		if !checkIsTriggered(p.Value, f.Operator, f.RightValue) {
			isTriggered = false
			break
		}
	}

	leftValue = vs[0].Value
	return
}
// SumFunction triggers when the sum over the window satisfies
// Operator/RightValue.
type SumFunction struct {
	Function
	Limit      int
	Operator   string
	RightValue float64
}

// Compute returns the window sum and whether it crosses the threshold.
func (f SumFunction) Compute(vs []*vos.HPoint) (leftValue vos.JsonFloat, isTriggered bool) {
	if len(vs) == 0 {
		return
	}

	var total vos.JsonFloat
	for _, p := range vs {
		total += p.Value
	}

	leftValue = total
	isTriggered = checkIsTriggered(total, f.Operator, f.RightValue)
	return
}
// AvgFunction triggers when the mean over the window satisfies
// Operator/RightValue.
type AvgFunction struct {
	Function
	Limit      int
	Operator   string
	RightValue float64
}

// Compute returns the window mean and whether it crosses the threshold.
func (f AvgFunction) Compute(vs []*vos.HPoint) (leftValue vos.JsonFloat, isTriggered bool) {
	n := len(vs)
	if n == 0 {
		return
	}

	var total vos.JsonFloat
	for _, p := range vs {
		total += p.Value
	}

	leftValue = total / vos.JsonFloat(n)
	isTriggered = checkIsTriggered(leftValue, f.Operator, f.RightValue)
	return
}
// StddevFunction triggers when the newest point falls outside
// mean ± Num population standard deviations of the window.
type StddevFunction struct {
	Function
	Num   int
	Limit int
}

// Compute returns the newest point and whether it is an outlier.
func (f StddevFunction) Compute(vs []*vos.HPoint) (leftValue vos.JsonFloat, isTriggered bool) {
	n := len(vs)
	if n == 0 {
		return
	}

	var sum float64
	for _, p := range vs {
		sum += float64(p.Value)
	}
	mean := sum / float64(n)

	var sqDiff float64
	for _, p := range vs {
		sqDiff += math.Pow(float64(p.Value)-mean, 2)
	}
	std := math.Sqrt(sqDiff / float64(n))

	leftValue = vs[0].Value
	isTriggered = checkIsTriggered(leftValue, "<", mean-std*float64(f.Num)) ||
		checkIsTriggered(leftValue, ">", mean+std*float64(f.Num))
	return
}
// DiffFunction triggers as soon as any single difference between the newest
// point and an older point crosses the threshold.
type DiffFunction struct {
	Function
	Limit      int
	Operator   string
	RightValue float64
}

// Compute: diff is the current (newest) value minus each historical value;
// the first diff that crosses the threshold fires the alert.
func (f DiffFunction) Compute(vs []*vos.HPoint) (leftValue vos.JsonFloat, isTriggered bool) {
	if len(vs) == 0 {
		return
	}

	newest := vs[0].Value
	for _, p := range vs[1:] {
		leftValue = newest - p.Value
		if checkIsTriggered(leftValue, f.Operator, f.RightValue) {
			isTriggered = true
			return
		}
	}
	return
}
// PDiffFunction — pdiff(#3): triggers as soon as any percentage change of
// the newest point relative to an older point crosses the threshold.
type PDiffFunction struct {
	Function
	Limit      int
	Operator   string
	RightValue float64
}

// Compute evaluates (newest - old) / old * 100 for each historical point;
// zero-valued historical points are skipped (no percentage against zero).
func (f PDiffFunction) Compute(vs []*vos.HPoint) (leftValue vos.JsonFloat, isTriggered bool) {
	if len(vs) == 0 {
		return
	}

	newest := vs[0].Value
	for _, p := range vs[1:] {
		if p.Value == 0 {
			continue
		}
		leftValue = (newest - p.Value) / p.Value * 100.0
		if checkIsTriggered(leftValue, f.Operator, f.RightValue) {
			isTriggered = true
			return
		}
	}
	return
}
// HappenFunction triggers once Num points in the window satisfy
// Operator/RightValue.
type HappenFunction struct {
	Function
	Num        int
	Limit      int
	Operator   string
	RightValue float64
}

// Compute counts matching points; leftValue is the Num-th matching point.
func (f HappenFunction) Compute(vs []*vos.HPoint) (leftValue vos.JsonFloat, isTriggered bool) {
	matched := 0
	for _, p := range vs {
		if !checkIsTriggered(p.Value, f.Operator, f.RightValue) {
			continue
		}
		matched++
		if matched == f.Num {
			return p.Value, true
		}
	}
	return
}
// CAvgAbsFunction compares |window mean - CompareValue| against the threshold.
type CAvgAbsFunction struct {
	Function
	Limit        int
	Operator     string
	RightValue   float64
	CompareValue float64
}

// Compute returns the absolute deviation of the mean from CompareValue.
func (f CAvgAbsFunction) Compute(vs []*vos.HPoint) (leftValue vos.JsonFloat, isTriggered bool) {
	n := len(vs)
	if n == 0 {
		return
	}

	var total vos.JsonFloat
	for _, p := range vs {
		total += p.Value
	}
	avg := total / vos.JsonFloat(n)

	leftValue = vos.JsonFloat(math.Abs(float64(avg) - f.CompareValue))
	isTriggered = checkIsTriggered(leftValue, f.Operator, f.RightValue)
	return
}
// CAvgFunction compares (window mean - CompareValue) against the threshold.
type CAvgFunction struct {
	Function
	Limit        int
	Operator     string
	RightValue   float64
	CompareValue float64
}

// Compute returns the signed deviation of the mean from CompareValue.
func (f CAvgFunction) Compute(vs []*vos.HPoint) (leftValue vos.JsonFloat, isTriggered bool) {
	n := len(vs)
	if n == 0 {
		return
	}

	var total vos.JsonFloat
	for _, p := range vs {
		total += p.Value
	}

	leftValue = total/vos.JsonFloat(n) - vos.JsonFloat(f.CompareValue)
	isTriggered = checkIsTriggered(leftValue, f.Operator, f.RightValue)
	return
}
// CAvgRateAbsFunction compares the absolute percentage deviation of the
// window mean from CompareValue against the threshold.
type CAvgRateAbsFunction struct {
	Function
	Limit        int
	Operator     string
	RightValue   float64
	CompareValue float64
}

// Compute returns |mean - CompareValue| / CompareValue * 100.
func (f CAvgRateAbsFunction) Compute(vs []*vos.HPoint) (leftValue vos.JsonFloat, isTriggered bool) {
	n := len(vs)
	if n == 0 {
		return
	}

	var total vos.JsonFloat
	for _, p := range vs {
		total += p.Value
	}
	avg := total / vos.JsonFloat(n)

	leftValue = vos.JsonFloat(math.Abs((float64(avg)-f.CompareValue)/f.CompareValue)) * 100.00
	isTriggered = checkIsTriggered(leftValue, f.Operator, f.RightValue)
	return
}
// CAvgRateFunction compares the signed percentage deviation of the window
// mean from CompareValue against the threshold.
type CAvgRateFunction struct {
	Function
	Limit        int
	Operator     string
	RightValue   float64
	CompareValue float64
}

// Compute returns (mean - CompareValue) / |CompareValue| * 100.
func (f CAvgRateFunction) Compute(vs []*vos.HPoint) (leftValue vos.JsonFloat, isTriggered bool) {
	n := len(vs)
	if n == 0 {
		return
	}

	var total vos.JsonFloat
	for _, p := range vs {
		total += p.Value
	}
	avg := total / vos.JsonFloat(n)

	leftValue = (avg - vos.JsonFloat(f.CompareValue)) / vos.JsonFloat(math.Abs(f.CompareValue)) * 100.00
	isTriggered = checkIsTriggered(leftValue, f.Operator, f.RightValue)
	return
}
// ParseFuncFromString builds the judge Function named by str.
//
// span carries positional arguments: span[0] is always the window limit
// (int); "happen" and "stddev" additionally expect an int at span[1], and
// the c_avg* family expects a float64 baseline at span[1].
//
// BUGFIX: the original used unchecked type assertions on span and panicked
// on a malformed strategy; malformed input now yields an error instead.
func ParseFuncFromString(str string, span []interface{}, operator string, rightValue float64) (fn Function, err error) {
	if str == "" {
		return nil, fmt.Errorf("func can not be null")
	}
	if len(span) < 1 {
		return nil, fmt.Errorf("func %s: span is empty", str)
	}
	limit, ok := span[0].(int)
	if !ok {
		return nil, fmt.Errorf("func %s: span[0] (limit) is not an int: %v", str, span[0])
	}

	// intArg / floatArg fetch and type-check span[1] for the funcs that need it.
	intArg := func() (int, error) {
		if len(span) < 2 {
			return 0, fmt.Errorf("func %s: span[1] missing", str)
		}
		v, ok := span[1].(int)
		if !ok {
			return 0, fmt.Errorf("func %s: span[1] is not an int: %v", str, span[1])
		}
		return v, nil
	}
	floatArg := func() (float64, error) {
		if len(span) < 2 {
			return 0, fmt.Errorf("func %s: span[1] missing", str)
		}
		v, ok := span[1].(float64)
		if !ok {
			return 0, fmt.Errorf("func %s: span[1] is not a float64: %v", str, span[1])
		}
		return v, nil
	}

	switch str {
	case "max":
		fn = &MaxFunction{Limit: limit, Operator: operator, RightValue: rightValue}
	case "min":
		fn = &MinFunction{Limit: limit, Operator: operator, RightValue: rightValue}
	case "all":
		fn = &AllFunction{Limit: limit, Operator: operator, RightValue: rightValue}
	case "sum":
		fn = &SumFunction{Limit: limit, Operator: operator, RightValue: rightValue}
	case "avg":
		fn = &AvgFunction{Limit: limit, Operator: operator, RightValue: rightValue}
	case "stddev":
		num, e := intArg()
		if e != nil {
			return nil, e
		}
		fn = &StddevFunction{Limit: limit, Num: num}
	case "diff":
		fn = &DiffFunction{Limit: limit, Operator: operator, RightValue: rightValue}
	case "pdiff":
		fn = &PDiffFunction{Limit: limit, Operator: operator, RightValue: rightValue}
	case "happen":
		num, e := intArg()
		if e != nil {
			return nil, e
		}
		fn = &HappenFunction{Limit: limit, Num: num, Operator: operator, RightValue: rightValue}
	case "c_avg":
		cv, e := floatArg()
		if e != nil {
			return nil, e
		}
		fn = &CAvgFunction{Limit: limit, CompareValue: cv, Operator: operator, RightValue: rightValue}
	case "c_avg_abs":
		cv, e := floatArg()
		if e != nil {
			return nil, e
		}
		fn = &CAvgAbsFunction{Limit: limit, CompareValue: cv, Operator: operator, RightValue: rightValue}
	case "c_avg_rate":
		cv, e := floatArg()
		if e != nil {
			return nil, e
		}
		fn = &CAvgRateFunction{Limit: limit, CompareValue: cv, Operator: operator, RightValue: rightValue}
	case "c_avg_rate_abs":
		cv, e := floatArg()
		if e != nil {
			return nil, e
		}
		fn = &CAvgRateAbsFunction{Limit: limit, CompareValue: cv, Operator: operator, RightValue: rightValue}
	default:
		err = fmt.Errorf("not_supported_method")
	}
	return
}
// checkIsTriggered evaluates `leftValue <operator> rightValue`. Equality and
// inequality use an epsilon of 1e-4; an unknown operator never triggers.
func checkIsTriggered(leftValue vos.JsonFloat, operator string, rightValue float64) bool {
	lv := float64(leftValue)
	switch operator {
	case "=", "==":
		return math.Abs(lv-rightValue) < 0.0001
	case "!=":
		return math.Abs(lv-rightValue) > 0.0001
	case "<":
		return lv < rightValue
	case "<=":
		return lv <= rightValue
	case ">":
		return lv > rightValue
	case ">=":
		return lv >= rightValue
	default:
		return false
	}
}

View File

@ -1,545 +0,0 @@
// Copyright 2017 Xiaomi, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package judge
import (
"bytes"
"encoding/json"
"fmt"
"math"
"regexp"
"strconv"
"strings"
"sync"
"time"
"github.com/toolkits/pkg/logger"
"github.com/toolkits/pkg/str"
"github.com/didi/nightingale/v5/cache"
"github.com/didi/nightingale/v5/models"
"github.com/didi/nightingale/v5/vos"
)
var (
bufferPool = sync.Pool{New: func() interface{} { return new(bytes.Buffer) }}
EVENT_ALERT = "alert"
EVENT_RECOVER = "recovery"
)
// Send matches each incoming point against the push-type alert rules that
// reference its metric, caches the point's history, and spawns one judge
// goroutine per matching rule.
func Send(points []*vos.MetricPoint) {
	for i := range points {
		alertRules := getMatchAlertRules(points[i])
		rulesCount := len(alertRules)
		if rulesCount == 0 {
			// This data point is not referenced by any alert rule; nothing to do.
			continue
		}
		logger.Debugf("[point_match_alertRules][point:%+v][alertRuleNum:%+v]", points[i], rulesCount)
		// Rules differ in alert_duration; keep history long enough for the
		// largest window among the matching rules.
		var maxAliveDuration = 0
		for j := range alertRules {
			if maxAliveDuration < alertRules[j].AlertDuration {
				maxAliveDuration = alertRules[j].AlertDuration
			}
		}
		// Shard by the first two chars of the point PK (see PointCaches).
		ll := PointCaches[points[i].PK[0:2]].PutPoint(points[i], int64(maxAliveDuration))
		for j := range alertRules {
			// NOTE(review): one goroutine per (point, rule) — unbounded fan-out;
			// assumes ToJudge is fast. Confirm this is acceptable under load.
			go ToJudge(ll, alertRules[j], points[i])
		}
	}
}
// getMatchAlertRules returns the push-type alert rules whose filters match
// the given point. PULL-type rules are evaluated elsewhere and skipped here.
func getMatchAlertRules(point *vos.MetricPoint) []*models.AlertRule {
	candidates := cache.AlertRulesByMetric.GetBy(point.Metric)

	matched := make([]*models.AlertRule, 0, len(candidates))
	for _, rule := range candidates {
		if rule.Type == models.PULL {
			continue
		}
		if matchAlertRule(point, rule) {
			matched = append(matched, rule)
		}
	}
	return matched
}
// matchAlertRule reports whether the point passes every resource filter and
// every tag filter of the rule; a tag filter fails when its key is absent
// from the point's tags.
// TODO: the filtering approach could be optimized.
func matchAlertRule(item *vos.MetricPoint, alertRule *models.AlertRule) bool {
	for _, rf := range alertRule.PushExpr.ResFilters {
		if !valueMatch(item.Ident, rf.Func, rf.Params) {
			return false
		}
	}

	for _, tf := range alertRule.PushExpr.TagFilters {
		v, ok := item.TagsMap[tf.Key]
		if !ok || !valueMatch(v, tf.Func, tf.Params) {
			return false
		}
	}

	return true
}
// valueMatch applies the filter function named f, with parameters params, to
// value. Each "In*/Has*/Contains*/Match*" case succeeds when ANY parameter
// matches; each "Not*/No*" case is the exact complement. An unknown f never
// matches.
//
// BUGFIX: regexp compilation errors are no longer ignored — an invalid
// pattern used to yield a nil *Regexp and panic on MatchString; invalid
// patterns are now logged and skipped (treated as non-matching).
func valueMatch(value, f string, params []string) bool {
	// anyParam reports whether pred holds for at least one parameter.
	anyParam := func(pred func(string) bool) bool {
		for i := range params {
			if pred(params[i]) {
				return true
			}
		}
		return false
	}

	inClasspath := func() bool {
		return anyParam(func(p string) bool { return cache.ResClasspath.Exists(value, p) })
	}

	inClasspathPrefix := func() bool {
		for _, classpath := range cache.ResClasspath.GetValues(value) {
			for i := range params {
				if strings.HasPrefix(classpath, params[i]) {
					return true
				}
			}
		}
		return false
	}

	matchRegexp := func() bool {
		for i := range params {
			r, err := regexp.Compile(params[i])
			if err != nil {
				logger.Warningf("invalid regexp %q in filter %s: %v", params[i], f, err)
				continue
			}
			if r.MatchString(value) {
				return true
			}
		}
		return false
	}

	switch f {
	case "InClasspath":
		return inClasspath()
	case "NotInClasspath":
		return !inClasspath()
	case "InClasspathPrefix":
		return inClasspathPrefix()
	case "NotInClasspathPrefix":
		return !inClasspathPrefix()
	case "InList", "InResourceList":
		return anyParam(func(p string) bool { return value == p })
	case "NotInList", "NotInResourceList":
		return !anyParam(func(p string) bool { return value == p })
	case "HasPrefixString":
		return anyParam(func(p string) bool { return strings.HasPrefix(value, p) })
	case "NoPrefixString":
		return !anyParam(func(p string) bool { return strings.HasPrefix(value, p) })
	case "HasSuffixString":
		return anyParam(func(p string) bool { return strings.HasSuffix(value, p) })
	case "NoSuffixString":
		return !anyParam(func(p string) bool { return strings.HasSuffix(value, p) })
	case "ContainsString":
		return anyParam(func(p string) bool { return strings.Contains(value, p) })
	case "NotContainsString":
		return !anyParam(func(p string) bool { return strings.Contains(value, p) })
	case "MatchRegexp":
		return matchRegexp()
	case "NotMatchRegexp":
		return !matchRegexp()
	}
	return false
}
// ToJudge evaluates one push-mode alert rule against the history of a
// just-arrived point and, when the rule's expressions resolve, builds an
// AlertEvent and hands it to sendEventIfNeed.
func ToJudge(linkedList *SafeLinkedList, stra *models.AlertRule, val *vos.MetricPoint) {
	logger.Debugf("[ToJudge.start][stra:%+v][val:%+v]", stra, val)
	now := val.Time
	// Points inside the rule's window: everything newer than now - AlertDuration.
	hps := linkedList.HistoryPoints(now - int64(stra.AlertDuration))
	if len(hps) == 0 {
		return
	}
	historyArr := []vos.HistoryPoints{}
	statusArr := []bool{}
	eventInfo := ""
	value := ""
	if len(stra.PushExpr.Exps) == 1 {
		// Single condition: judge directly on the cached history of this point.
		for _, expr := range stra.PushExpr.Exps {
			history, info, lastValue, status := Judge(stra, expr, hps, val, now)
			statusArr = append(statusArr, status)
			if value == "" {
				value = fmt.Sprintf("%s: %s", expr.Metric, lastValue)
			} else {
				value += fmt.Sprintf("; %s: %s", expr.Metric, lastValue)
			}
			historyArr = append(historyArr, history)
			eventInfo += info
		}
	} else { // multiple conditions
		// Each condition may target a different metric, so the history for
		// every condition is fetched from the datasource instead.
		for _, expr := range stra.PushExpr.Exps {
			respData, err := GetData(stra, expr, val, now)
			if err != nil {
				logger.Errorf("stra:%+v get query data err:%v", stra, err)
				return
			}
			if len(respData) <= 0 {
				logger.Errorf("stra:%+v get query data respData:%v err", stra, respData)
				return
			}
			history, info, lastValue, status := Judge(stra, expr, respData, val, now)
			statusArr = append(statusArr, status)
			if value == "" {
				value = fmt.Sprintf("%s: %s", expr.Metric, lastValue)
			} else {
				value += fmt.Sprintf("; %s: %s", expr.Metric, lastValue)
			}
			historyArr = append(historyArr, history)
			if eventInfo == "" {
				eventInfo = info
			} else {
				// TogetherOrAny: 0 = all conditions must hold (AND), 1 = any (OR).
				if stra.PushExpr.TogetherOrAny == 0 {
					eventInfo += fmt.Sprintf(" & %s", info)
				} else if stra.PushExpr.TogetherOrAny == 1 {
					eventInfo += fmt.Sprintf(" || %s", info)
				}
			}
		}
	}
	// Serialize the judged history into the event for display/debugging.
	bs, err := json.Marshal(historyArr)
	if err != nil {
		logger.Errorf("Marshal history:%+v err:%v", historyArr, err)
	}
	event := &models.AlertEvent{
		RuleId:             stra.Id,
		RuleName:           stra.Name,
		RuleNote:           stra.Note,
		HashId:             str.MD5(fmt.Sprintf("%d_%s", stra.Id, val.PK)),
		ResIdent:           val.Ident,
		Priority:           stra.Priority,
		HistoryPoints:      bs,
		TriggerTime:        now,
		Values:             value,
		NotifyChannels:     stra.NotifyChannels,
		NotifyGroups:       stra.NotifyGroups,
		NotifyUsers:        stra.NotifyUsers,
		RunbookUrl:         stra.RunbookUrl,
		ReadableExpression: eventInfo,
		TagMap:             val.TagsMap,
	}
	logger.Debugf("[ToJudge.event.create][statusArr:%v][type=push][stra:%+v][val:%+v][event:%+v]", statusArr, stra, val, event)
	sendEventIfNeed(statusArr, event, stra)
}
// Judge evaluates a single expression over historyData. It returns the
// history snapshot used, a human-readable description of the condition
// (info), the last computed value as a string ("null" when NaN), and
// whether the condition triggered.
func Judge(stra *models.AlertRule, exp models.Exp, historyData []*vos.HPoint, firstItem *vos.MetricPoint, now int64) (history vos.HistoryPoints, info string, lastValue string, status bool) {
	var leftValue vos.JsonFloat
	// stddev and happen carry extra params, so their descriptions differ.
	if exp.Func == "stddev" {
		info = fmt.Sprintf(" %s (%s,%ds) %v", exp.Metric, exp.Func, stra.AlertDuration, exp.Params)
	} else if exp.Func == "happen" {
		info = fmt.Sprintf(" %s (%s,%ds) %v %s %v", exp.Metric, exp.Func, stra.AlertDuration, exp.Params, exp.Optr, exp.Threshold)
	} else {
		info = fmt.Sprintf(" %s(%s,%ds) %s %v", exp.Metric, exp.Func, stra.AlertDuration, exp.Optr, exp.Threshold)
	}
	leftValue, status = judgeItemWithStrategy(stra, historyData, exp, firstItem, now)
	lastValue = "null"
	if !math.IsNaN(float64(leftValue)) {
		lastValue = strconv.FormatFloat(float64(leftValue), 'f', -1, 64)
	}
	history = vos.HistoryPoints{
		Metric: exp.Metric,
		Tags:   firstItem.TagsMap,
		Points: historyData,
	}
	return
}
// judgeItemWithStrategy builds the judge Function for exp (via
// ParseFuncFromString) and runs it over historyData. For the comparison
// (c_avg*) family it first fetches the baseline window from the datasource.
func judgeItemWithStrategy(stra *models.AlertRule, historyData []*vos.HPoint, exp models.Exp, firstItem *vos.MetricPoint, now int64) (leftValue vos.JsonFloat, isTriggered bool) {
	straFunc := exp.Func
	// span[0] is always the window length (AlertDuration).
	var straParam []interface{}
	straParam = append(straParam, stra.AlertDuration)
	switch straFunc {
	case "happen", "stddev":
		if len(exp.Params) < 1 {
			logger.Errorf("stra:%d exp:%+v stra param is null", stra.Id, exp)
			return
		}
		straParam = append(straParam, exp.Params[0])
	case "c_avg", "c_avg_abs", "c_avg_rate", "c_avg_rate_abs":
		if len(exp.Params) < 1 {
			logger.Errorf("stra:%d exp:%+v stra param is null", stra.Id, exp)
			return
		}
		// exp.Params[0] is used as a time offset here — presumably seconds
		// back for the comparison window; TODO confirm against models.Exp.
		hisD, err := GetData(stra, exp, firstItem, now-int64(exp.Params[0]))
		if err != nil {
			logger.Errorf("stra:%v %+v get compare data err:%v", stra.Id, exp, err)
			return
		}
		if len(hisD) != 1 {
			logger.Errorf("stra:%d %+v get compare data err, respItems:%v", stra.Id, exp, hisD)
			return
		}
		var sum float64
		for _, i := range hisD {
			sum += float64(i.Value)
		}
		// Mean of the comparison (period-over-period) window.
		straParam = append(straParam, sum/float64(len(hisD)))
	}
	fn, err := ParseFuncFromString(straFunc, straParam, exp.Optr, exp.Threshold)
	if err != nil {
		logger.Errorf("stra:%d %+v parse func fail: %v", stra.Id, exp, err)
		return
	}
	return fn.Compute(historyData)
}
// GetData queries history for exp.Metric over the rule's alert window,
// scoped to the resource/tags of firstItem.
func GetData(stra *models.AlertRule, exp models.Exp, firstItem *vos.MetricPoint, now int64) ([]*vos.HPoint, error) {
	// Query slightly more than AlertDuration so a missing latest sample
	// doesn't leave the judge short of points.
	start := now - int64(stra.AlertDuration) - 2

	// There is guaranteed to be exactly one query parameter here.
	queryParam, err := NewQueryRequest(firstItem.Ident, exp.Metric, firstItem.TagsMap, start, now)
	if err != nil {
		return nil, err
	}

	respData := Query(queryParam)
	logger.Debugf("[exp:%+v][queryParam:%+v][respData:%+v]\n", exp, queryParam, respData)
	return respData, nil
}
// Recent data may well produce events (and does so frequently), but not
// every event must be sent: a notification goes out only when the
// alert/recovery state actually changes.
func sendEventIfNeed(status []bool, event *models.AlertEvent, stra *models.AlertRule) {
	isTriggered := true
	if stra.Type == 0 {
		// Only push-type rules aggregate multiple sub-condition statuses.
		switch stra.PushExpr.TogetherOrAny {
		case 0:
			// All conditions must have triggered (AND).
			for _, s := range status {
				isTriggered = isTriggered && s
			}
		case 1:
			// Any one triggering condition suffices (OR).
			isTriggered = false
			for _, s := range status {
				if s == true {
					isTriggered = true
					break
				}
			}
		}
	}
	now := time.Now().Unix()
	lastEvent, exists := LastEvents.Get(event.RuleId, event.HashId)
	switch event.IsPromePull {
	case 0:
		// Push-type (and AND-combined) events.
		if exists && lastEvent.IsPromePull == 1 {
			// The in-memory event was pull-type; clear it first.
			LastEvents.Del(event.RuleId, event.HashId)
		}
		if isTriggered {
			// A brand-new alert, or the previous event was a recovery:
			// either way, send immediately.
			if !exists || lastEvent.IsRecov() {
				event.MarkAlert()
				SendEvent(event)
			}
		} else {
			// Previous event was an alert and it has now recovered: notify.
			if exists && lastEvent.IsAlert() {
				event.MarkRecov()
				SendEvent(event)
			}
		}
	case 1:
		// Pull-type events always represent a crossed threshold; there is no
		// recovery scenario in this case — recovery is handled by the
		// resolve_timeout cron.
		if exists && lastEvent.IsPromePull == 0 {
			// The in-memory event was push-type; clear it first.
			LastEvents.Del(event.RuleId, event.HashId)
		}
		// 1. First occurrence with AlertDuration == 0: send immediately.
		// 2. Otherwise send once the trigger has persisted for AlertDuration.
		if !exists {
			// A new event never seen before.
			if stra.AlertDuration == 0 {
				// Prometheus rule "for" configured as 0: send right away.
				event.LastSend = true
				event.MarkAlert()
				SendEvent(event)
			} else {
				// A single event cannot yet satisfy "for AlertDuration";
				// park it in memory and wait.
				LastEvents.Set(event)
			}
			return
		}
		// An event exists in memory; AlertDuration is 0 but the last one was
		// never sent (AlertDuration may have been changed from >0 to 0).
		if stra.AlertDuration == 0 && !lastEvent.LastSend {
			event.LastSend = true
			event.MarkAlert()
			SendEvent(event)
			return
		}
		// Event in memory and AlertDuration > 0: apply Prometheus "for" logic.
		if now-lastEvent.TriggerTime < int64(stra.AlertDuration) {
			// Not yet triggered for a full window; no notification.
			return
		}
		logger.Debugf("[lastEvent.LastSend:%+v][event.LastSend:%+v][now:%+v][lastEvent.TriggerTime:%+v][stra.AlertDuration:%+v][now-lastEvent.TriggerTime:%+v]\n",
			lastEvent.LastSend,
			event.LastSend,
			now,
			lastEvent.TriggerTime,
			stra.AlertDuration,
			now-lastEvent.TriggerTime,
		)
		// "for" is satisfied; send only if the last event was unsent or was
		// a recovery, guaranteeing a single notification.
		if !lastEvent.LastSend || lastEvent.IsRecov() {
			event.LastSend = true
			event.MarkAlert()
			SendEvent(event)
		}
	}
}
// SendEvent records event as the latest for its (rule, hash) pair and
// enqueues it for the alert consumer. Alert events already present in the
// database are dropped to avoid duplicate notifications; recovery events
// always pass straight through to the alert handler.
func SendEvent(event *models.AlertEvent) {
	// update last event
	LastEvents.Set(event)

	if event.IsAlert() {
		if ae, err := models.AlertEventGet("hash_id = ?", event.HashId); err == nil && ae != nil {
			logger.Debugf("[event exists do not send again][type:%+v][event:%+v]", event.IsPromePull, event)
			return
		}
	}

	if ok := EventQueue.PushFront(event); !ok {
		logger.Errorf("push event:%v err", event)
	}

	logger.Debugf("[SendEvent.event.success][type:%+v][event:%+v]", event.IsPromePull, event)
}

View File

@ -1,122 +0,0 @@
// Copyright 2017 Xiaomi, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package judge
import (
"container/list"
"sync"
"time"
"github.com/didi/nightingale/v5/vos"
)
// PointCache is a mutex-guarded map from a series' primary key to the
// linked list holding its recent history points.
type PointCache struct {
	sync.RWMutex
	M map[string]*SafeLinkedList // keyed by vos.MetricPoint.PK
}
// NewPointCache returns an empty, ready-to-use cache.
func NewPointCache() *PointCache {
	return &PointCache{M: make(map[string]*SafeLinkedList)}
}
// Get returns the list stored under key and whether it exists.
func (pc *PointCache) Get(key string) (*SafeLinkedList, bool) {
	pc.RLock()
	val, ok := pc.M[key]
	pc.RUnlock()
	return val, ok
}
// Set stores val under key, replacing any previous entry.
func (pc *PointCache) Set(key string, val *SafeLinkedList) {
	pc.Lock()
	pc.M[key] = val
	pc.Unlock()
}
// Len reports how many series are currently cached.
func (pc *PointCache) Len() int {
	pc.RLock()
	n := len(pc.M)
	pc.RUnlock()
	return n
}
// CleanStale drops every series whose newest point is older than the
// given unix timestamp.
func (pc *PointCache) CleanStale(before int64) {
	var stale []string

	pc.RLock()
	for key, ll := range pc.M {
		head := ll.Front()
		if head == nil {
			continue
		}
		// the front element is the newest point of the series
		if head.Value.(*vos.MetricPoint).Time < before {
			stale = append(stale, key)
		}
	}
	pc.RUnlock()

	pc.BatchDelete(stale)
}
// BatchDelete removes all the given keys in one critical section.
func (pc *PointCache) BatchDelete(keys []string) {
	if len(keys) == 0 {
		return
	}
	pc.Lock()
	defer pc.Unlock()
	for _, key := range keys {
		delete(pc.M, key)
	}
}
// PutPoint appends p to the linked list of its series (keyed by p.PK),
// creating the list when it does not exist yet, and returns that list.
// Points older than maxAliveDuration relative to p are evicted.
//
// Lookup and creation happen under a single write lock: the original
// Get-then-Set sequence had a TOCTOU window in which two concurrent
// producers of the same new series could each create a list, losing one
// of the two points.
func (pc *PointCache) PutPoint(p *vos.MetricPoint, maxAliveDuration int64) *SafeLinkedList {
	pc.Lock()
	linkedList, exists := pc.M[p.PK]
	if !exists {
		nl := list.New()
		nl.PushFront(p)
		linkedList = &SafeLinkedList{L: nl}
		pc.M[p.PK] = linkedList
		pc.Unlock()
		return linkedList
	}
	pc.Unlock()

	// the list has its own lock; do not hold pc's lock while pushing
	linkedList.PushFrontAndMaintain(p, maxAliveDuration)
	return linkedList
}
// PointCaches shards the point data into 256 buckets keyed by the first
// two hex characters of the series key, keeping lock contention low.
// NOTE: this top-level map itself is NOT thread-safe and must be fully
// populated via initPointCaches before any concurrent access.
var PointCaches = make(map[string]*PointCache)

// pointChars are the 16 hex digits used to build bucket keys.
var pointChars = []string{"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "a", "b", "c", "d", "e", "f"}

// pointHeadKeys holds the 256 two-character bucket keys ("00".."ff").
var pointHeadKeys = make([]string, 0, 256)
func initPointCaches() {
for i := 0; i < 16; i++ {
for j := 0; j < 16; j++ {
pointHeadKeys = append(pointHeadKeys, pointChars[i]+pointChars[j])
}
}
for i := 0; i < 256; i++ {
PointCaches[pointHeadKeys[i]] = NewPointCache()
}
}
func CleanStalePoints() {
// 监控数据2天都没关联到任何告警策略说明对应的告警策略已经删除了
before := time.Now().Unix() - 3600*24*2
for i := 0; i < 256; i++ {
PointCaches[pointHeadKeys[i]].CleanStale(before)
}
}

View File

@ -1,118 +0,0 @@
package judge
import (
"context"
"fmt"
"os"
"time"
"github.com/didi/nightingale/v5/cache"
"github.com/didi/nightingale/v5/config"
"github.com/didi/nightingale/v5/models"
"github.com/didi/nightingale/v5/naming"
"github.com/toolkits/pkg/container/list"
"github.com/toolkits/pkg/logger"
)
var (
	// EventQueue buffers generated alert events between the judge engine
	// and the alert consumer; its core purpose is peak shaving — absorbing
	// sudden bursts of events. It could live in either the judge or the
	// alert package; keeping it here treats alerting as a small appendage
	// of the judge.
	EventQueue *list.SafeListLimited

	// lastSyncTime is when alert rules were last fully synced; until a
	// full sync has happened there are no PULL rules to process.
	lastSyncTime int64
)
// Start boots the judge engine: it warms the in-memory caches, registers
// this instance via heartbeat, and launches the background loops for
// PULL-rule syncing and stale-point cleanup. The process exits if the
// first heartbeat fails.
func Start(ctx context.Context) {
	// The PUSH-style engine judges against points cached in memory;
	// two map levels keep lock granularity small.
	initPointCaches()

	// Mirror unrecovered alerts from the DB into memory so we can later
	// decide whether an event should be (re)sent.
	LastEvents.Init()

	// Default capacity of 10 million queued events should be plenty.
	EventQueue = list.NewSafeListLimited(10000000)

	// Heartbeat first: once registered, PUSH data can be processed.
	if err := heartbeat(config.Config.Heartbeat.LocalAddr); err != nil {
		fmt.Println(err)
		logger.Close()
		os.Exit(1)
	}

	// Keep heartbeating so peers notice promptly if we die.
	go loopHeartbeat()

	// PULL rules are in no hurry: wait until the hash ring is stable
	// before starting periodic work.
	go syncPullRules(ctx)

	// Points cached for since-deleted alert rules must be purged.
	go loopCleanStalePoints()
}
// syncPullRules periodically reshards PULL-style alert rules across the
// judge cluster. It first waits ~33s so most judges have reported a
// heartbeat and the hash ring has stabilized, then resyncs every 9s.
//
// Unlike the original sleep loop, this honors ctx cancellation, so the
// goroutine does not leak when the caller shuts down.
func syncPullRules(ctx context.Context) {
	// let the hash ring settle before doing any sharding
	select {
	case <-ctx.Done():
		return
	case <-time.After(33 * time.Second):
	}

	ticker := time.NewTicker(9 * time.Second)
	defer ticker.Stop()

	for {
		syncPullRulesOnce(ctx)
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
		}
	}
}
// syncPullRulesOnce shards all PULL rules over the hash ring and hands
// the ones mapped to this instance to the pull rule manager. It is a
// no-op when the rule cache has not changed since the previous sync.
func syncPullRulesOnce(ctx context.Context) {
	if cache.AlertRulesByMetric.LastSync == lastSyncTime {
		return
	}

	// this instance's identity on the hash ring
	ident := config.Config.Heartbeat.LocalAddr

	pulls := cache.AlertRules.Pulls()
	mine := make([]models.AlertRule, 0, len(pulls))
	logger.Debugf("[got_one_pull_rule_for_all][ruleNum:%v]", len(pulls))

	for _, rule := range pulls {
		node, err := naming.HashRing.GetNode(fmt.Sprint(rule.Id))
		if err != nil {
			logger.Warningf("hashring: sharding pull rule(%d) fail: %v", rule.Id, err)
			continue
		}
		logger.Debugf("[got_one_pull_rule_hash_result][instance:%v][ident:%v][rule:%v]", node, ident, rule)
		if node == ident {
			// this rule is sharded to me
			mine = append(mine, *rule)
			logger.Debugf("[got_one_pull_rule_for_me][rule:%v]", rule)
		}
	}

	pullRuleManager.SyncRules(ctx, mine)
	lastSyncTime = cache.AlertRulesByMetric.LastSync
}
// loopHeartbeat re-registers this instance at the configured interval so
// peers can detect liveness. The interval is read once; changing the
// config requires a restart. Uses a time.Ticker (the idiomatic construct
// for periodic work) rather than sleeping in a loop.
func loopHeartbeat() {
	interval := time.Duration(config.Config.Heartbeat.Interval) * time.Millisecond
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for range ticker.C {
		if err := heartbeat(config.Config.Heartbeat.LocalAddr); err != nil {
			logger.Warning(err)
		}
	}
}
// heartbeat writes one liveness record for this endpoint into MySQL.
// The underlying error is wrapped with %w so callers can inspect it via
// errors.Is / errors.As (the original %v discarded the error chain).
func heartbeat(endpoint string) error {
	if err := models.InstanceHeartbeat(config.EndpointName, endpoint); err != nil {
		return fmt.Errorf("mysql.error: instance(service=%s, endpoint=%s) heartbeat fail: %w", config.EndpointName, endpoint, err)
	}
	return nil
}
// loopCleanStalePoints purges stale cached points once per hour, forever.
// Uses a time.Ticker instead of a sleep loop for idiomatic periodic work.
func loopCleanStalePoints() {
	ticker := time.NewTicker(time.Hour)
	defer ticker.Stop()
	for range ticker.C {
		CleanStalePoints()
	}
}

View File

@ -1,119 +0,0 @@
package judge
import (
"fmt"
"os"
"sync"
"time"
"github.com/didi/nightingale/v5/models"
"github.com/toolkits/pkg/logger"
)
// SafeEventMap is a concurrency-safe two-level index of the most recent
// alert events: rule_id -> hash_id -> *models.AlertEvent.
type SafeEventMap struct {
	sync.RWMutex
	M map[int64]map[string]*models.AlertEvent
}

var (
	// LastEvents caches the latest event per (rule, hash) so the engine
	// can decide whether to send, suppress, or recover an alert.
	LastEvents = &SafeEventMap{M: make(map[int64]map[string]*models.AlertEvent)}
)
// Get returns the cached event for (ruleId, hashId) and whether it exists.
func (s *SafeEventMap) Get(ruleId int64, hashId string) (*models.AlertEvent, bool) {
	s.RLock()
	defer s.RUnlock()
	inner, ok := s.M[ruleId]
	if !ok {
		return nil, false
	}
	ev, ok := inner[hashId]
	return ev, ok
}
// Set stores event under its rule id and hash id, lazily creating the
// inner map on first use for that rule.
func (s *SafeEventMap) Set(event *models.AlertEvent) {
	s.Lock()
	defer s.Unlock()
	inner, ok := s.M[event.RuleId]
	if !ok {
		inner = make(map[string]*models.AlertEvent)
		s.M[event.RuleId] = inner
	}
	inner[event.HashId] = event
}
// Init loads every unrecovered alert event from the database and swaps a
// freshly built index into the map. It terminates the process when the
// load fails, because the engine cannot judge correctly without this
// state.
func (s *SafeEventMap) Init() {
	aes, err := models.AlertEventGetAll()
	if err != nil {
		fmt.Println("load all alert_event fail:", err)
		os.Exit(1)
	}

	if len(aes) == 0 {
		return
	}

	fresh := make(map[int64]map[string]*models.AlertEvent)
	for _, event := range aes {
		inner, ok := fresh[event.RuleId]
		if !ok {
			inner = make(map[string]*models.AlertEvent)
			fresh[event.RuleId] = inner
		}
		inner[event.HashId] = event
	}

	s.Lock()
	s.M = fresh
	s.Unlock()
}
// Del removes the cached event for (ruleId, hashId) if present.
func (s *SafeEventMap) Del(ruleId int64, hashId string) {
	s.Lock()
	defer s.Unlock()
	if inner, ok := s.M[ruleId]; ok {
		delete(inner, hashId)
	}
}
// DeleteOrSendRecovery walks all cached events of ruleId and, for every
// hash NOT present in toKeepKeys (i.e. the promql no longer returns a
// series for it), marks the event recovered, enqueues the recovery
// notification, and drops it from the cache.
//
// To avoid sending a recovery right after an alert just because a single
// point went missing, recovery is only sent for events that are alerting,
// were already sent, and have been firing longer than their AlertDuration.
// An alert turning into a recovery because the promql was edited is
// acceptable too.
func (s *SafeEventMap) DeleteOrSendRecovery(ruleId int64, toKeepKeys map[string]struct{}) {
	s.Lock()
	defer s.Unlock()

	m, has := s.M[ruleId]
	if !has {
		return
	}

	// one timestamp for the whole sweep; no need to re-read the clock per event
	now := time.Now().Unix()

	for k, ev := range m {
		if _, loaded := toKeepKeys[k]; loaded {
			continue
		}

		logger.Debugf("[to_del][ev.IsRecovery:%+v][ev.LastSend:%+v]", ev.IsRecovery, ev.LastSend)

		if ev.IsAlert() && ev.LastSend && now-ev.TriggerTime > ev.AlertDuration {
			logger.Debugf("[prom.alert.MarkRecov][ev.RuleName:%v]", ev.RuleName)
			ev.MarkRecov()
			EventQueue.PushFront(ev)
			delete(m, k)
		}
	}
}

View File

@ -1,164 +0,0 @@
// Copyright 2017 Xiaomi, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package judge
import (
"container/list"
"sync"
"github.com/didi/nightingale/v5/vos"
)
// SafeLinkedList is a mutex-guarded doubly linked list of metric points,
// with the newest point at the front.
type SafeLinkedList struct {
	sync.RWMutex
	L *list.List
}
// Front returns the newest element, or nil when the list is empty.
func (ll *SafeLinkedList) Front() *list.Element {
	ll.RLock()
	front := ll.L.Front()
	ll.RUnlock()
	return front
}
// Len reports the number of points currently in the list.
func (ll *SafeLinkedList) Len() int {
	ll.RLock()
	n := ll.L.Len()
	ll.RUnlock()
	return n
}
// PushFrontAndMaintain pushes v to the front of the list (newest first)
// and evicts every point older than v.Time - maintainDuration. Points
// that are duplicates or out of order (not strictly newer than the
// current front) are silently dropped.
//
// Bug fix: the original read ll.L.Front().Value BEFORE checking sz > 0,
// which panics with a nil-pointer dereference on an empty list; the read
// now happens only when the list is non-empty.
func (ll *SafeLinkedList) PushFrontAndMaintain(v *vos.MetricPoint, maintainDuration int64) {
	ll.Lock()
	defer ll.Unlock()

	sz := ll.L.Len()
	if sz > 0 {
		// drop duplicated or out-of-order points
		lastPointTs := ll.L.Front().Value.(*vos.MetricPoint).Time
		if v.Time <= lastPointTs {
			return
		}
	}

	ll.L.PushFront(v)
	sz++

	// evict points that fell out of the rule's evaluation window
	earliestTs := v.Time - maintainDuration
	for i := 0; i < sz; i++ {
		back := ll.L.Back()
		if back.Value.(*vos.MetricPoint).Time >= earliestTs {
			break
		}
		ll.L.Remove(back)
	}
}
// HistoryPoints returns the cached points whose timestamp is >=
// smallestTime, ordered newest first. Because the list is time-ordered
// with the newest point at the front, traversal stops at the first point
// older than smallestTime.
//
// NOTE(review): Len and Front each take the lock separately and the
// element walk below holds no lock at all, so a concurrent
// PushFrontAndMaintain could mutate the list mid-walk — presumably each
// series is judged by a single goroutine at a time; verify against callers.
func (ll *SafeLinkedList) HistoryPoints(smallestTime int64) []*vos.HPoint {
	size := ll.Len()
	if size == 0 {
		return []*vos.HPoint{}
	}

	firstElement := ll.Front()
	firstItem := firstElement.Value.(*vos.MetricPoint)

	vs := make([]*vos.HPoint, 0)

	// even the newest point is too old: nothing qualifies
	if firstItem.Time < smallestTime {
		return vs
	}

	v := &vos.HPoint{
		Timestamp: firstItem.Time,
		Value:     vos.JsonFloat(firstItem.Value),
	}
	vs = append(vs, v)

	currentElement := firstElement
	for i := 1; i < size; i++ {
		nextElement := currentElement.Next()
		if nextElement == nil {
			return vs
		}

		item := nextElement.Value.(*vos.MetricPoint)
		// list is ordered: first too-old point ends the scan
		if item.Time < smallestTime {
			return vs
		}

		v := &vos.HPoint{
			Timestamp: item.Time,
			Value:     vos.JsonFloat(item.Value),
		}
		vs = append(vs, v)
		currentElement = nextElement
	}

	return vs
}
// func (ll *SafeLinkedList) QueryDataByTS(start, end int64) []*vos.HPoint {
// size := ll.Len()
// if size == 0 {
// return []*vos.HPoint{}
// }
// firstElement := ll.Front()
// firstItem := firstElement.Value.(*vos.MetricPoint)
// var vs []*vos.HPoint
// if firstItem.Time < start {
// //最新的点也比起始时间旧,直接返回
// return vs
// }
// v := &vos.HPoint{
// Timestamp: firstItem.Time,
// Value: vos.JsonFloat(firstItem.Value),
// }
// vs = append(vs, v)
// currentElement := firstElement
// for {
// nextElement := currentElement.Next()
// if nextElement == nil {
// return vs
// }
// if nextElement.Value.(*vos.MetricPoint).Time < start {
// return vs
// }
// if nextElement.Value.(*vos.MetricPoint).Time > end {
// currentElement = nextElement
// continue
// }
// v := &vos.HPoint{
// Timestamp: nextElement.Value.(*vos.MetricPoint).Time,
// Value: vos.JsonFloat(nextElement.Value.(*vos.MetricPoint).Value),
// }
// vs = append(vs, v)
// currentElement = nextElement
// }
// return vs
// }

Some files were not shown because too many files have changed in this diff Show More