Merge branch 'master' of github.com:didi/nightingale

Ulric Qin 2020-11-07 08:18:52 +08:00
commit 6d02d8876a
68 changed files with 5931 additions and 166 deletions

View File

@ -20,4 +20,8 @@
3.1.6
Affected modules: n9e-ams, etc/gop.yml
Changes: host devices gain extended-field management, used to maintain information such as location and warranty expiry; a new SQL script is added: sql/n9e_ams_3.1.6.sql
3.2.0
Affected modules: n9e-agent, etc/agent.yml
Changes: the agent now supports metrics collection

View File

@ -1,7 +1,7 @@
#!/bin/bash
# release version
version=3.1.6
version=3.2.0
CWD=$(cd $(dirname $0)/; pwd)
cd $CWD

View File

@ -7,6 +7,18 @@ enable:
mon: true
job: true
report: true
metrics: true
udp:
enable: true
listen: :788
metrics:
maxProcs: 1
reportIntervalMs: 10
reportTimeoutMs: 2000
reportPacketSize: 100
sendToInfoFile: false
job:
metadir: ./meta
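With metrics: true the agent also starts its statsd module and a UDP listener on the configured listen address (see the main.go hunk below). A minimal sketch of sending a datagram to that listener from Go, assuming a local agent on :788; the payload string is purely illustrative, since the real wire format is defined by the statsd module added in this commit:

package main

import (
	"log"
	"net"
)

func main() {
	// assumes the UDP listener configured above (listen: :788)
	conn, err := net.Dial("udp", "127.0.0.1:788")
	if err != nil {
		log.Fatal(err)
	}
	defer conn.Close()

	// hypothetical payload; the actual format expected by the
	// agent's statsd module is not shown in this diff
	if _, err := conn.Write([]byte("example.metric:1|c")); err != nil {
		log.Print(err)
	}
}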

etc/login-code-email.tpl (new file, 1 line)
View File

@ -0,0 +1 @@
Hello, your login verification code is {{.Code}}

etc/login-code-sms.tpl (new file, 1 line)
View File

@ -0,0 +1 @@
Hello, your login verification code is {{.Code}}

View File

@ -1,12 +1,22 @@
---
tokens:
- monapi-internal-third-module-pass-fjsdi
logger:
dir: logs/monapi
level: INFO
keepHours: 24
region:
- default
# clean history event
cleaner:
# retention days
days: 100
# number of events deleted per time
batch: 100
# read alert from redis
redis:
addr: 127.0.0.1:6379

View File

@ -24,6 +24,8 @@ sso:
coverAttributes: false
stateExpiresIn: 300
captcha: true
tokens:
- rdb-builtin-token
@ -90,3 +92,5 @@ wechat:
corp_id: "xxxxxxxxxxxxx"
agent_id: 1000000
secret: "xxxxxxxxxxxxxxxxx"
captcha: false

go.mod (6 changed lines)
View File

@ -5,13 +5,12 @@ go 1.12
require (
github.com/Shopify/sarama v1.19.0
github.com/StackExchange/wmi v0.0.0-20190523213315-cbe66965904d // indirect
github.com/caio/go-tdigest v3.1.0+incompatible
github.com/cespare/xxhash v1.1.0
github.com/codegangsta/negroni v1.0.0
github.com/coreos/go-oidc v2.2.1+incompatible
github.com/dgryski/go-tsz v0.0.0-20180227144327-03b7d791f4fe
github.com/eapache/go-resiliency v1.2.0 // indirect
github.com/eapache/go-xerial-snappy v0.0.0-20180814174437-776d5712da21 // indirect
github.com/eapache/queue v1.1.0 // indirect
github.com/garyburd/redigo v1.6.2
github.com/gin-contrib/pprof v1.3.0
github.com/gin-gonic/gin v1.6.3
@ -19,12 +18,13 @@ require (
github.com/go-sql-driver/mysql v1.5.0
github.com/google/go-cmp v0.5.1 // indirect
github.com/google/uuid v1.1.2
github.com/gorilla/context v1.1.1 // indirect
github.com/gorilla/mux v1.6.2
github.com/hashicorp/golang-lru v0.5.1
github.com/hpcloud/tail v1.0.0
github.com/influxdata/influxdb v1.8.0
github.com/mattn/go-isatty v0.0.12
github.com/mattn/go-sqlite3 v1.14.0 // indirect
github.com/mojocn/base64Captcha v1.3.1
github.com/onsi/ginkgo v1.7.0 // indirect
github.com/onsi/gomega v1.4.3 // indirect
github.com/open-falcon/rrdlite v0.0.0-20200214140804-bf5829f786ad

go.sum (10 changed lines)
View File

@ -51,6 +51,9 @@ github.com/bketelsen/crypt v0.0.3-0.20200106085610-5cbc8cc4026c/go.mod h1:MKsuJm
github.com/bmizerany/pat v0.0.0-20170815010413-6226ea591a40/go.mod h1:8rLXio+WjiTceGBHIoTvn60HIbs7Hm7bcHjyrSqYB9c=
github.com/boltdb/bolt v1.3.1/go.mod h1:clJnj/oiGkjum5o1McbSZDSLxVThjynRyGBgiAx27Ps=
github.com/c-bata/go-prompt v0.2.2/go.mod h1:VzqtzE2ksDBcdln8G7mk2RX9QyGjH+OVqOCSiVIqS34=
github.com/caio/go-tdigest v1.1.3 h1:dwSirEYz3a9cPJox2HCszM6TcE+7keac+spVV7LNWfw=
github.com/caio/go-tdigest v3.1.0+incompatible h1:uoVMJ3Q5lXmVLCCqaMGHLBWnbGoN6Lpu7OAUPR60cds=
github.com/caio/go-tdigest v3.1.0+incompatible/go.mod h1:sHQM/ubZStBUmF1WbB8FAm8q9GjDajLC5T7ydxE3JHI=
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
github.com/cespare/xxhash v1.1.0 h1:a6HrQnmkObjyL+Gs60czilIUGqrzKutQD6XZog3p+ko=
github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc=
@ -135,6 +138,7 @@ github.com/gogo/protobuf v0.0.0-20171007142547-342cbe0a0415/go.mod h1:r8qH/GZQm5
github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ=
github.com/gogo/protobuf v1.2.0/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ=
github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4=
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g=
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
github.com/golang/geo v0.0.0-20190916061304-5b978397cfec/go.mod h1:QZ0nwyI2jOfgRAoBvP+ab5aRr7c9x7lhGEJrKvBwjWI=
github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q=
@ -279,6 +283,8 @@ github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJ
github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0=
github.com/modern-go/reflect2 v1.0.1 h1:9f412s+6RmYXLWZSEzVVgPGK7C2PphHj5RJrvfx9AWI=
github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0=
github.com/mojocn/base64Captcha v1.3.1 h1:2Wbkt8Oc8qjmNJ5GyOfSo4tgVQPsbKMftqASnq8GlT0=
github.com/mojocn/base64Captcha v1.3.1/go.mod h1:wAQCKEc5bDujxKRmbT6/vTnTt5CjStQ8bRfPWUuz/iY=
github.com/mschoch/smat v0.0.0-20160514031455-90eadee771ae/go.mod h1:qAyveg+e4CE+eKJXWVjKXM4ck2QobLqTDytGJbLLhJg=
github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw=
@ -377,8 +383,6 @@ github.com/subosito/gotenv v1.2.0 h1:Slr1R9HxAlEKefgq5jn9U+DnETlIUa6HfgEzj0g5d7s
github.com/subosito/gotenv v1.2.0/go.mod h1:N0PQaV/YGNqwC0u51sEeR/aUtSLEXKX9iv69rRypqCw=
github.com/tinylib/msgp v1.0.2/go.mod h1:+d+yLhGm8mzTaHzB+wgMYrodPfmZrzkirds8fDWklFE=
github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U=
github.com/toolkits/pkg v1.1.2 h1:BygBwfbL+kiYBH6Rlrx6hKC3WTvNQCsDDOy8keYFNCM=
github.com/toolkits/pkg v1.1.2/go.mod h1:ge83E8FQqUnFk+2wtVtZ8kvbmoSjE1l8FP3f+qmR0fY=
github.com/toolkits/pkg v1.1.3 h1:cjZMz9hmuTv4v7ivYERA9mWJCLKyr8JMd4S+CL/YzMM=
github.com/toolkits/pkg v1.1.3/go.mod h1:ge83E8FQqUnFk+2wtVtZ8kvbmoSjE1l8FP3f+qmR0fY=
github.com/ugorji/go v1.1.7 h1:/68gy2h+1mWMrwZFeD1kQialdSzAb432dtpeJ42ovdo=
@ -426,6 +430,8 @@ golang.org/x/exp v0.0.0-20191129062945-2f5052295587/go.mod h1:2RIsYlXP63K8oxa1u0
golang.org/x/exp v0.0.0-20191227195350-da58074b4299/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4=
golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs=
golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js=
golang.org/x/image v0.0.0-20190501045829-6d32002ffd75/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js=
golang.org/x/image v0.0.0-20190802002840-cff245a6509b h1:+qEpEAPhDZ1o0x3tHzZTQDArnOixOzGD9HUJfcg0mb4=
golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU=

View File

@ -282,5 +282,33 @@ CREATE TABLE `operation_log`
PRIMARY KEY (`id`),
KEY (`clock`),
KEY (`res_cl`, `res_id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8;
CREATE TABLE `login_code`
(
`username` varchar(64) not null comment 'login name, cannot rename',
`code` varchar(32) not null,
`login_type` varchar(32) not null,
`created_at` bigint not null comment 'created at',
KEY (`code`),
KEY (`created_at`),
UNIQUE KEY (`username`)
) ENGINE = InnoDB
DEFAULT CHARSET = utf8;
CREATE TABLE `auth_state` (
`state` varchar(128) DEFAULT '' NOT NULL,
`typ` varchar(32) DEFAULT '' NOT NULL COMMENT 'response_type',
`redirect` varchar(1024) DEFAULT '' NOT NULL,
`expires_at` bigint DEFAULT '0' NOT NULL,
PRIMARY KEY (`state`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8;
CREATE TABLE `captcha` (
`captcha_id` varchar(128) NOT NULL,
`answer` varchar(128) DEFAULT '' NOT NULL,
`created_at` bigint DEFAULT '0' NOT NULL,
KEY (`captcha_id`, `answer`),
KEY (`created_at`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8;

src/models/auth_state.go (new file, 42 lines)
View File

@ -0,0 +1,42 @@
package models
import (
"errors"
"time"
)
type AuthState struct {
State string `json:"state"`
Typ string `json:"typ"`
Redirect string `json:"redirect"`
ExpiresAt int64 `json:"expiresAt"`
}
func AuthStateGet(where string, args ...interface{}) (*AuthState, error) {
var obj AuthState
has, err := DB["rdb"].Where(where, args...).Get(&obj)
if err != nil {
return nil, err
}
if !has {
return nil, errors.New("auth state not found")
}
return &obj, nil
}
func (p *AuthState) Save() error {
_, err := DB["rdb"].Insert(p)
return err
}
func (p *AuthState) Del() error {
_, err := DB["rdb"].Where("state=?", p.State).Delete(new(AuthState))
return err
}
func (p AuthState) CleanUp() error {
_, err := DB["rdb"].Exec("delete from auth_state where expires_at < ?", time.Now().Unix())
return err
}
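A hedged usage sketch of the new AuthState model, assuming the models package is initialized so that DB["rdb"] is ready; the state and redirect values are hypothetical:

package main

import (
	"time"

	"github.com/didi/nightingale/src/models" // assumed import path
)

func demoAuthState() error {
	st := &models.AuthState{
		State:     "random-state-string", // hypothetical value
		Typ:       "code",
		Redirect:  "https://example.com/callback",
		ExpiresAt: time.Now().Unix() + 300, // matches stateExpiresIn: 300 in etc/rdb.yml
	}
	if err := st.Save(); err != nil {
		return err
	}
	// on the SSO callback: look the state up, then delete it once consumed
	got, err := models.AuthStateGet("state=?", st.State)
	if err != nil {
		return err
	}
	if err := got.Del(); err != nil {
		return err
	}
	// periodically purge expired states
	return models.AuthState{}.CleanUp()
}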

src/models/captcha.go (new file, 44 lines)
View File

@ -0,0 +1,44 @@
package models
import (
"errors"
"time"
)
type Captcha struct {
CaptchaId string `json:"captchaId"`
Answer string `json:"-"`
Image string `xorm:"-" json:"image"`
CreatedAt int64 `json:"createdAt"`
}
func CaptchaGet(where string, args ...interface{}) (*Captcha, error) {
var obj Captcha
has, err := DB["rdb"].Where(where, args...).Get(&obj)
if err != nil {
return nil, err
}
if !has {
return nil, errors.New("captcha not found")
}
return &obj, nil
}
func (p *Captcha) Save() error {
_, err := DB["rdb"].Insert(p)
return err
}
func (p *Captcha) Del() error {
_, err := DB["rdb"].Where("captcha_id=?", p.CaptchaId).Delete(new(Captcha))
return err
}
const captchaExpiresIn = 600
func (p Captcha) CleanUp() error {
_, err := DB["rdb"].Exec("delete from captcha where created_at < ?", time.Now().Unix()-captchaExpiresIn)
return err
}

View File

@ -6,6 +6,8 @@ import (
"gopkg.in/ldap.v3"
"github.com/toolkits/pkg/logger"
"github.com/didi/nightingale/src/modules/rdb/config"
)
@ -73,7 +75,8 @@ func ldapReq(user, pass string) (*ldap.SearchResult, error) {
}
if len(sr.Entries) == 0 {
return nil, fmt.Errorf("cannot find such user: %v", user)
logger.Infof("ldap auth fail, no such user: %s", user)
return nil, fmt.Errorf("login fail, check your username and password")
}
if len(sr.Entries) > 1 {
@ -81,7 +84,8 @@ func ldapReq(user, pass string) (*ldap.SearchResult, error) {
}
if err := conn.Bind(sr.Entries[0].DN, pass); err != nil {
return nil, fmt.Errorf("password error")
logger.Infof("ldap auth fail, password error, user: %s", user)
return nil, fmt.Errorf("login fail, check your username and password")
}
return sr, nil
}

src/models/login_code.go (new file, 39 lines)
View File

@ -0,0 +1,39 @@
package models
import "errors"
type LoginCode struct {
Username string `json:"username"`
Code string `json:"code"`
LoginType string `json:"login_type"`
CreatedAt int64 `json:"created_at"`
}
var (
errLoginCode = errors.New("invalid login code")
)
func LoginCodeGet(where string, args ...interface{}) (*LoginCode, error) {
var obj LoginCode
has, err := DB["rdb"].Where(where, args...).Get(&obj)
if err != nil {
return nil, err
}
if !has {
return nil, errLoginCode
}
return &obj, nil
}
func (p *LoginCode) Save() error {
p.Del()
_, err := DB["rdb"].Insert(p)
return err
}
func (p *LoginCode) Del() error {
_, err := DB["rdb"].Where("username=?", p.Username).Delete(new(LoginCode))
return err
}

View File

@ -18,7 +18,7 @@ type Stra struct {
ExclNidStr string `xorm:"excl_nid" json:"-"` //excluded leaf nodes
AlertDur int `json:"alert_dur"` //in seconds; e.g. 10 minutes of continuous anomaly produces an alert event
RecoveryDur int `json:"recovery_dur"` //in seconds; e.g. 2 minutes of continuous normality produces a recovery event; 0 means the recovery event is produced immediately
RecoveryNotify int `json:"recovery_notify"` //0: send recovery notification, 1: do not send
RecoveryNotify int `json:"recovery_notify"` //1: send recovery notification, 0: do not send
ExprsStr string `xorm:"exprs" json:"-"` //multiple conditions must hit the same monitored instance and all be satisfied at the same time to produce an event
TagsStr string `xorm:"tags" json:"-"` //tag filter conditions
EnableStime string `json:"enable_stime"` //time of day when the strategy starts to take effect

View File

@ -183,14 +183,22 @@ func (n *Node) CreateChild(ident, name, note, cate, creator string, leaf, proxy
return nil, fmt.Errorf("tenant node should be root node only")
}
if cate == "project" && (n.Cate != "tenant" && n.Cate != "organization") {
return nil, fmt.Errorf("project node should be under tenant or organization")
}
if ident == "" {
return nil, fmt.Errorf("ident is blank")
}
if !str.IsMatch(ident, "^[a-zA-Z0-9\\-\\_]+$") {
if !str.IsMatch(ident, "^[a-z0-9\\-\\_]+$") {
return nil, fmt.Errorf("ident invalid")
}
if len(ident) >= 32 {
return nil, fmt.Errorf("ident length should be less than 32")
}
if creator != "system" {
// some reserved idents cannot be used for manually created nodes; they are reserved for PaaS subsystems to register resources
if (n.Path == "inner" || n.Cate == "project") && slice.ContainsString(protectedNodeIdents, ident) {

View File

@ -5,6 +5,12 @@ type RoleGlobalUser struct {
UserId int64 `json:"user_id" xorm:"'user_id'"`
}
func RoleGlobalUserAll() ([]RoleGlobalUser, error) {
var objs []RoleGlobalUser
err := DB["rdb"].Find(&objs)
return objs, err
}
// UserHasGlobalRole checks whether a user has any of the given global roles
func UserHasGlobalRole(userId int64, roleIds []int64) (bool, error) {
cnt, err := DB["rdb"].Where("user_id=?", userId).In("role_id", roleIds).Count(new(RoleGlobalUser))

View File

@ -10,6 +10,12 @@ type RoleOperation struct {
Operation string `json:"operation"`
}
func RoleOperationAll() ([]RoleOperation, error) {
var objs []RoleOperation
err := DB["rdb"].OrderBy("id").Find(&objs)
return objs, err
}
func OperationsOfRoles(rids []int64) ([]string, error) {
if len(rids) == 0 {
return []string{}, nil

View File

@ -18,6 +18,15 @@ import (
"github.com/didi/nightingale/src/modules/rdb/config"
)
const (
LOGIN_T_SMS = "sms-code"
LOGIN_T_EMAIL = "email-code"
LOGIN_T_RST = "rst-code"
LOGIN_T_PWD = "password"
LOGIN_T_LDAP = "ldap"
LOGIN_EXPIRES_IN = 300
)
type User struct {
Id int64 `json:"id"`
UUID string `json:"uuid" xorm:"'uuid'"`
@ -82,18 +91,16 @@ func InitRooter() {
log.Println("user root init done")
}
func LdapLogin(user, pass, clientIP string) error {
func LdapLogin(user, pass string) (*User, error) {
sr, err := ldapReq(user, pass)
if err != nil {
return err
return nil, err
}
go LoginLogNew(user, clientIP, "in")
var u User
has, err := DB["rdb"].Where("username=?", user).Get(&u)
if err != nil {
return err
return nil, err
}
u.CopyLdapAttr(sr)
@ -101,9 +108,9 @@ func LdapLogin(user, pass, clientIP string) error {
if has {
if config.Config.LDAP.CoverAttributes {
_, err := DB["rdb"].Where("id=?", u.Id).Update(u)
return err
return nil, err
} else {
return nil
return &u, err
}
}
@ -111,32 +118,76 @@ func LdapLogin(user, pass, clientIP string) error {
u.Password = "******"
u.UUID = GenUUIDForUser(user)
_, err = DB["rdb"].Insert(u)
return err
return &u, nil
}
func PassLogin(user, pass, clientIP string) error {
func PassLogin(user, pass string) (*User, error) {
var u User
has, err := DB["rdb"].Where("username=?", user).Cols("password").Get(&u)
has, err := DB["rdb"].Where("username=?", user).Get(&u)
if err != nil {
return err
return nil, err
}
if !has {
return fmt.Errorf("user[%s] not found", user)
logger.Infof("password auth fail, no such user: %s", user)
return nil, fmt.Errorf("login fail, check your username and password")
}
loginPass, err := CryptoPass(pass)
if err != nil {
return err
return nil, err
}
if loginPass != u.Password {
return fmt.Errorf("password error")
logger.Infof("password auth fail, password error, user: %s", user)
return nil, fmt.Errorf("login fail, check your username and password")
}
go LoginLogNew(user, clientIP, "in")
return &u, nil
}
return nil
func SmsCodeLogin(phone, code string) (*User, error) {
user, _ := UserGet("phone=?", phone)
if user == nil {
return nil, fmt.Errorf("phone %s dose not exist", phone)
}
lc, err := LoginCodeGet("username=? and code=? and login_type=?", user.Username, code, LOGIN_T_SMS)
if err != nil {
logger.Infof("sms-code auth fail, user: %s", user.Username)
return nil, fmt.Errorf("login fail, check your sms-code")
}
if time.Now().Unix()-lc.CreatedAt > LOGIN_EXPIRES_IN {
logger.Infof("sms-code auth expired, user: %s", user.Username)
return nil, fmt.Errorf("login fail, the code has expired")
}
lc.Del()
return user, nil
}
func EmailCodeLogin(email, code string) (*User, error) {
user, _ := UserGet("email=?", email)
if user == nil {
return nil, fmt.Errorf("email %s dose not exist", email)
}
lc, err := LoginCodeGet("username=? and code=? and login_type=?", user.Username, code, LOGIN_T_EMAIL)
if err != nil {
logger.Infof("email-code auth fail, user: %s", user.Username)
return nil, fmt.Errorf("login fail, check your email-code")
}
if time.Now().Unix()-lc.CreatedAt > LOGIN_EXPIRES_IN {
logger.Infof("email-code auth expired, user: %s", user.Username)
return nil, fmt.Errorf("login fail, the code has expired")
}
lc.Del()
return user, nil
}
func UserGet(where string, args ...interface{}) (*User, error) {
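Putting the new login-code pieces together, a hedged sketch of the sms-code flow: store a LoginCode for the user (the SMS itself would be rendered from etc/login-code-sms.tpl), then verify it with SmsCodeLogin; the helper names here are hypothetical:

package main

import (
	"time"

	"github.com/didi/nightingale/src/models" // assumed import path
)

// issueSmsCode stores a fresh code for the user; delivering the SMS is out of scope here.
func issueSmsCode(username, code string) error {
	lc := &models.LoginCode{
		Username:  username,
		Code:      code,
		LoginType: models.LOGIN_T_SMS,
		CreatedAt: time.Now().Unix(),
	}
	return lc.Save() // Save first deletes any previous code for this user
}

func verifySmsCode(phone, code string) (*models.User, error) {
	// SmsCodeLogin rejects the code after LOGIN_EXPIRES_IN (300s)
	// and deletes it on success
	return models.SmsCodeLogin(phone, code)
}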

View File

@ -14,6 +14,7 @@ import (
"github.com/didi/nightingale/src/modules/agent/http"
"github.com/didi/nightingale/src/modules/agent/log/worker"
"github.com/didi/nightingale/src/modules/agent/report"
"github.com/didi/nightingale/src/modules/agent/statsd"
"github.com/didi/nightingale/src/modules/agent/stra"
"github.com/didi/nightingale/src/modules/agent/sys"
"github.com/didi/nightingale/src/modules/agent/sys/funcs"
@ -21,6 +22,8 @@ import (
"github.com/didi/nightingale/src/modules/agent/sys/ports"
"github.com/didi/nightingale/src/modules/agent/sys/procs"
"github.com/didi/nightingale/src/modules/agent/timer"
"github.com/didi/nightingale/src/modules/agent/udp"
"github.com/didi/nightingale/src/toolkits/stats"
"github.com/toolkits/pkg/logger"
"github.com/toolkits/pkg/runner"
@ -59,6 +62,7 @@ func main() {
parseConf()
loggeri.Init(config.Config.Logger)
stats.Init("agent")
if config.Config.Enable.Mon {
monStart()
@ -72,6 +76,16 @@ func main() {
reportStart()
}
if config.Config.Enable.Metrics {
// initialize the statsd service
statsd.Start()
// start the UDP listener and the UDP packet processing goroutines
udp.Start()
}
core.InitRpcClients()
http.Start()
endingProc()
@ -94,7 +108,6 @@ func monStart() {
sys.Init(config.Config.Sys)
stra.Init()
core.InitRpcClients()
funcs.BuildMappers()
funcs.Collect()

View File

@ -4,6 +4,7 @@ import (
"bytes"
"fmt"
"strings"
"time"
"github.com/spf13/viper"
"github.com/toolkits/pkg/file"
@ -14,19 +15,35 @@ import (
)
type ConfigT struct {
Logger loggeri.Config `yaml:"logger"`
Stra straSection `yaml:"stra"`
Worker workerSection `yaml:"worker"`
Sys sys.SysSection `yaml:"sys"`
Enable enableSection `yaml:"enable"`
Job jobSection `yaml:"job"`
Report reportSection `yaml:"report"`
Logger loggeri.Config `yaml:"logger"`
Stra straSection `yaml:"stra"`
Worker workerSection `yaml:"worker"`
Sys sys.SysSection `yaml:"sys"`
Enable enableSection `yaml:"enable"`
Job jobSection `yaml:"job"`
Report reportSection `yaml:"report"`
Udp UdpSection `yaml:"udp"`
Metrics MetricsSection `yaml:"metrics"`
}
type UdpSection struct {
Enable bool `yaml:"enable"`
Listen string `yaml:"listen"`
}
type MetricsSection struct {
MaxProcs int `yaml:"maxProcs"`
ReportIntervalMs int `yaml:"reportIntervalMs"`
ReportTimeoutMs int `yaml:"reportTimeoutMs"`
ReportPacketSize int `yaml:"reportPacketSize"`
SendToInfoFile bool `yaml:"sendToInfoFile"`
Interval time.Duration
}
type enableSection struct {
Mon bool `yaml:"mon"`
Job bool `yaml:"job"`
Report bool `yaml:"report"`
Mon bool `yaml:"mon"`
Job bool `yaml:"job"`
Report bool `yaml:"report"`
Metrics bool `yaml:"metrics"`
}
type reportSection struct {
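The new udp and metrics blocks in etc/agent.yml map one-to-one onto UdpSection and MetricsSection above. A self-contained sketch of that binding using gopkg.in/yaml.v2 (the agent itself loads config through viper; this only illustrates the field mapping):

package main

import (
	"fmt"

	yaml "gopkg.in/yaml.v2"
)

// MetricsSection mirrors the struct above, minus the derived Interval field.
type MetricsSection struct {
	MaxProcs         int  `yaml:"maxProcs"`
	ReportIntervalMs int  `yaml:"reportIntervalMs"`
	ReportTimeoutMs  int  `yaml:"reportTimeoutMs"`
	ReportPacketSize int  `yaml:"reportPacketSize"`
	SendToInfoFile   bool `yaml:"sendToInfoFile"`
}

func main() {
	raw := []byte(`
maxProcs: 1
reportIntervalMs: 10
reportTimeoutMs: 2000
reportPacketSize: 100
sendToInfoFile: false
`)
	var m MetricsSection
	if err := yaml.Unmarshal(raw, &m); err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", m) // {MaxProcs:1 ReportIntervalMs:10 ...}
}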

View File

@ -25,7 +25,7 @@ func Push(metricItems []*dataobj.MetricValue) error {
now := time.Now().Unix()
for _, item := range metricItems {
logger.Debug("->recv: ", item)
logger.Debugf("->recv:%+v", item)
if item.Endpoint == "" {
item.Endpoint = config.Endpoint
}
@ -48,7 +48,7 @@ func Push(metricItems []*dataobj.MetricValue) error {
continue
}
}
logger.Debug("push item: ", item)
logger.Debugf("push item: %+v", item)
items = append(items, item)
}

View File

@ -0,0 +1,178 @@
package statsd
/*
// raw configs
type MetricAgentConfig struct {
Updated int64 `json:"updated"` // timestamp when the config was generated
Version string `json:"version"` // config version
Hostname string `json:"hostname"`
Ip string `json:"ip"`
Aggr map[string]*AggrConfigItem `json:"aggr"` // ns --> x
}
type AggrConfigItem struct {
Ns string `json:"ns"`
Type string `json:"type"`
MetricTagks map[string]*AggrMetricTagks `json:"metric_tagks"`
}
type AggrMetricTagks struct {
Metric string `json:"metric"`
Tagks [][]string `json:"tagks"`
}
func (this MetricAgentConfig) UpdateLoop() {
if sconfig.Config.Cfg.Disable {
logger.Debugf("config update loop disabled")
return
}
for {
nc, err := this.getMetricAgentConfigFromRemote()
if err != nil {
logger.Debugf("get metric agent config error, [error: %s]", err.Error())
} else if nc == nil {
// this host has no local metrics aggregation configured
} else {
lac, err1 := nc.transToLocalAggrConfig()
if err1 != nil {
logger.Debugf("trans to local aggr config error, [error: %s]", err1.Error())
} else {
localAggrConfig.Update(lac, nc.Version, nc.Updated)
logger.Debugf("localAggrConfig updated at:%d", nc.Updated)
}
}
time.Sleep(time.Duration(sconfig.Config.Cfg.UdpateIntervalMs) * time.Millisecond)
}
}
func (this *MetricAgentConfig) transToLocalAggrConfig() (map[string]*NsAggrConfig, error) {
if len(this.Aggr) == 0 && this.Updated == 0 && this.Version == "" {
return nil, fmt.Errorf("bad aggr configs")
}
ret := make(map[string]*NsAggrConfig, 0)
for _, v := range this.Aggr {
if !(LocalAggrConfig{}.CheckType(v.Type)) {
logger.Debugf("bad aggr config type, [type: %s]", v.Type)
continue
}
// metric_tagks
mtks := make(map[string][][]string, 0)
for _, mtk := range v.MetricTagks {
if mtk == nil || len(mtk.Metric) == 0 || len(mtk.Tagks) == 0 {
continue
}
ttagks := make([][]string, 0)
for i := 0; i < len(mtk.Tagks); i++ {
mtksTagksMap := make(map[string]bool, 0)
for _, tk := range mtk.Tagks[i] {
mtksTagksMap[tk] = true
}
mktsTagsList := make([]string, 0)
for k, _ := range mtksTagksMap {
mktsTagsList = append(mktsTagsList, k)
}
sort.Strings(mktsTagsList)
ttagks = append(ttagks, mktsTagsList)
}
if (Func{}).HasSameSortedArray(ttagks) {
logger.Debugf("bad aggr config tagks, has same tagks: [ns: %s][metric: %s][tagks: %#v]",
v.Ns, mtk.Metric, mtk.Tagks)
logger.Debugf("drop aggr config of metric, [ns: %s][metric: %s]", v.Ns, mtk.Metric)
continue
}
mtks[mtk.Metric] = ttagks
}
if attks, ok := mtks[Const_AllMetrics]; ok && len(attks) > 0 {
for k, v := range mtks {
if k == Const_AllMetrics {
continue
}
mtks[k] = (Func{}).MergeSortedArrays(attks, v)
}
}
// metric_tagks
ret[v.Ns] = &NsAggrConfig{
Ns: v.Ns,
Type: v.Type,
MetricTagks: mtks,
}
}
return ret, nil
}
// local transfered configs
var (
localAggrConfig = &LocalAggrConfig{NsConfig: map[string]*NsAggrConfig{}, Updated: 0, Version: "init"}
)
func (this LocalAggrConfig) GetLocalAggrConfig() *LocalAggrConfig {
return localAggrConfig.Clone()
}
const (
// Type is three-part: ${metrics}:${aggregation dimensions}:${aggregate or not}
Const_AggrType_AllAnyNoaggr = "all:any:noaggr"
Const_AggrType_SomeSomeAggr = "some:some:aggr"
// all metrics
Const_AllMetrics = ".*"
)
var (
// no-aggregation constant
Const_NoAggrConfig = &NsAggrConfig{Ns: ".*", Type: Const_AggrType_AllAnyNoaggr}
)
type LocalAggrConfig struct {
sync.RWMutex
NsConfig map[string]*NsAggrConfig `json:"ns_config"`
Version string `json:"version"`
Updated int64 `json:"updated"`
}
type NsAggrConfig struct {
Ns string `json:"ns"`
Type string `json:"type"`
MetricTagks map[string][][]string `json:"metric_tagks"`
}
func (this *LocalAggrConfig) GetByNs(ns string) (nsAggrConfig *NsAggrConfig, found bool) {
// TODO: the daijia product line does its own aggregation, so metrics are not aggregated again here
if strings.HasSuffix(ns, ".daijia.n9e.com") {
nsAggrConfig = Const_NoAggrConfig
found = true
return
}
this.RLock()
nsAggrConfig, found = this.NsConfig[ns]
this.RUnlock()
return
}
func (this *LocalAggrConfig) Update(nac map[string]*NsAggrConfig, version string, updated int64) {
this.Lock()
this.NsConfig = nac
this.Version = version
this.Updated = updated
this.Unlock()
}
func (this *LocalAggrConfig) Clone() *LocalAggrConfig {
ret := &LocalAggrConfig{}
this.RLock()
ret.Updated = this.Updated
ret.NsConfig = this.NsConfig
this.RUnlock()
return ret
}
func (this LocalAggrConfig) CheckType(t string) bool {
switch t {
case Const_AggrType_AllAnyNoaggr, Const_AggrType_SomeSomeAggr:
return true
}
return false
}
*/

View File

@ -0,0 +1,171 @@
package statsd
import (
"fmt"
"sort"
"strconv"
)
type counterAggregator struct {
Counter float64
}
func (self *counterAggregator) new(aggregatorNames []string) (aggregator, error) {
if len(aggregatorNames) < 1 || aggregatorNames[0] != "c" {
return nil, BadAggregatorNameError
}
return &counterAggregator{}, nil
}
// the counter type accepts one or more values (in batched mode); there is no statusCode field, and the SDK does not batch it
// e.g. 10{"\u2318"}1{"\u2318"}20
func (self *counterAggregator) collect(values []string, metric string, argLines string) error {
if len(values) < 1 {
return fmt.Errorf("bad values")
}
for i := range values {
delta := float64(0.0)
parsed, err := strconv.ParseFloat(values[i], 64)
if err != nil {
return err
}
delta = parsed
self.Counter += delta
}
return nil
}
func (self *counterAggregator) dump(points []*Point, timestamp int64,
tags map[string]string, metric, argLines string) ([]*Point, error) {
points = append(points, &Point{
Name: metric + ".counter",
Timestamp: timestamp,
Tags: tags,
Value: self.Counter,
})
return points, nil
}
func (self *counterAggregator) summarize(nsmetric, argLines string, newAggrs map[string]aggregator) {
// prepare: ns/metric
//items, _ := Func{}.TranslateMetricLine(nsmetric)
//ns := items[0]
//metric := items[1]
// blacklist
// prepare: tags
tags, _, err := Func{}.TranslateArgLines(argLines)
if err != nil {
return
}
self.doAggr(tags, newAggrs)
// local aggregation
return
}
func (self *counterAggregator) merge(toMerge aggregator) (aggregator, error) {
that := toMerge.(*counterAggregator)
self.Counter += that.Counter
return self, nil
}
func (self *counterAggregator) toMap() (map[string]interface{}, error) {
return map[string]interface{}{
"__aggregator__": "counter",
"counter": self.Counter,
}, nil
}
func (self counterAggregator) fromMap(serialized map[string]interface{}) (aggregator, error) {
return &counterAggregator{Counter: serialized["counter"].(float64)}, nil
}
// internals
func (self counterAggregator) addSummarizeAggregator(argLines string, toMerge *counterAggregator, newAggrs map[string]aggregator) {
aggr, ok := newAggrs[argLines]
if !(ok && aggr != nil) {
nAggr, err := toMerge.clone()
if err == nil {
newAggrs[argLines] = nAggr
}
} else {
aggr.merge(toMerge)
}
}
func (self *counterAggregator) clone() (aggregator, error) {
maps, err := self.toMap()
if err != nil {
return nil, err
}
aggr, err := counterAggregator{}.fromMap(maps)
if err != nil {
return nil, err
}
return aggr, nil
}
func (self *counterAggregator) doAggr(tags map[string]string, newAggrs map[string]aggregator, aggrTagksList ...[][]string) {
tagks := make([]string, 0)
for k, _ := range tags {
tagks = append(tagks, k)
}
tagkNum := len(tagks)
if tagkNum == 0 {
return
}
sort.Strings(tagks)
// get formator
formator := ""
for i := 0; i < tagkNum; i++ {
formator += tagks[i] + "=%s\n"
}
formator += "c"
// aggregate across all dimensions
ntagvs_all := make([]interface{}, tagkNum)
for i := 0; i < tagkNum; i++ {
ntagvs_all[i] = "<all>"
}
summarizedTags := fmt.Sprintf(formator, ntagvs_all...)
counterAggregator{}.addSummarizeAggregator(summarizedTags, self, newAggrs)
// aggregate across the specified dimensions
if len(aggrTagksList) > 0 {
for i := 0; i < len(aggrTagksList[0]); i++ {
aggrTagks := aggrTagksList[0][i]
// validity check
if !(len(aggrTagks) > 0 && len(aggrTagks) < tagkNum && // == tagkNum would duplicate the all-dimensions aggregation
(Func{}).IsSubKeys(aggrTagks, tags)) { // the monitoring data must contain the specified aggregation dimensions
continue
}
// aggregate
sometagks := make([]interface{}, tagkNum)
for i, tk := range tagks {
sometagks[i] = tags[tk]
}
for _, tk := range aggrTagks {
for i := 0; i < tagkNum; i++ {
if tk == tagks[i] {
sometagks[i] = "<all>"
break
}
}
}
summarizedTags := fmt.Sprintf(formator, sometagks...)
counterAggregator{}.addSummarizeAggregator(summarizedTags, self, newAggrs)
}
}
}
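doAggr builds one map key per aggregated series: tag keys sorted, one k=v per line, aggregated tag values replaced by "<all>", and the aggregator name ("c") as the final line. A self-contained sketch of that key construction:

package main

import (
	"fmt"
	"sort"
)

// buildSummarizedKey mirrors the formator logic in doAggr above:
// sorted tag keys, one "k=v" per line, aggregated values as "<all>", suffix "c".
func buildSummarizedKey(tags map[string]string, aggrOver map[string]bool) string {
	tagks := make([]string, 0, len(tags))
	for k := range tags {
		tagks = append(tagks, k)
	}
	sort.Strings(tagks)

	key := ""
	for _, k := range tagks {
		v := tags[k]
		if aggrOver[k] {
			v = "<all>"
		}
		key += fmt.Sprintf("%s=%s\n", k, v)
	}
	return key + "c"
}

func main() {
	tags := map[string]string{"host": "n9e-01", "module": "agent"}
	// aggregate the host dimension away
	fmt.Printf("%q\n", buildSummarizedKey(tags, map[string]bool{"host": true}))
	// prints "host=<all>\nmodule=agent\nc"
}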

View File

@ -0,0 +1,267 @@
package statsd
import (
"fmt"
"sort"
"strconv"
)
// maxAggregator
// counter enhance, aggr="ce"
type counterEAggregator struct {
Counter float64
Stats map[int64]float64 // no locking needed, single-threaded
lastTimestamp int64
delta float64
raw bool // raw stats (true) or aggregated stats (false); the bool zero value is false
}
func (self *counterEAggregator) new(aggregatorNames []string) (aggregator, error) {
if len(aggregatorNames) < 1 || aggregatorNames[0] != "ce" {
return nil, BadAggregatorNameError
}
return &counterEAggregator{
Stats: make(map[int64]float64),
lastTimestamp: GetTimestamp(),
delta: 0,
raw: true,
}, nil
}
// the counterE type accepts one or more values (in batched mode); there is no statusCode field, and the SDK does not batch it
// e.g. 10{"\u2318"}1{"\u2318"}20
func (self *counterEAggregator) collect(values []string, metric string, argLines string) error {
if len(values) < 1 {
return fmt.Errorf("bad values")
}
ts := GetTimestamp()
for i := range values {
delta := float64(0.0)
parsed, err := strconv.ParseFloat(values[i], 64)
if nil != err {
return err
}
delta = parsed
self.Counter += delta
if ts > self.lastTimestamp {
self.Stats[self.lastTimestamp] = self.delta
self.delta = delta
self.lastTimestamp = ts
} else {
self.delta += delta
}
}
return nil
}
func (self *counterEAggregator) dump(points []*Point, timestamp int64,
tags map[string]string, metric, argLines string) ([]*Point, error) {
points = append(points, &Point{
Name: metric + ".counter",
Timestamp: timestamp,
Tags: tags,
Value: self.Counter,
})
// only raw stats emit max/min values; aggregated results do not
if self.raw {
max := float64(0.0)
min := float64(0.0)
sum := float64(0.0)
cnt := len(self.Stats)
if cnt > 0 {
flag := true
for _, value := range self.Stats {
sum += value
if flag {
max = value
min = value
flag = false
continue
}
if value > max {
max = value
}
if value < min {
min = value
}
}
} else {
cnt = 1
}
points = append(points, &Point{
Name: metric + ".counter.max",
Timestamp: timestamp,
Tags: tags,
Value: max,
})
points = append(points, &Point{
Name: metric + ".counter.min",
Timestamp: timestamp,
Tags: tags,
Value: min,
})
points = append(points, &Point{
Name: metric + ".counter.avg",
Timestamp: timestamp,
Tags: tags,
Value: sum / float64(cnt),
})
}
return points, nil
}
func (self *counterEAggregator) summarize(nsmetric, argLines string, newAggrs map[string]aggregator) {
// prepare: ns/metric
//items, _ := Func{}.TranslateMetricLine(nsmetric)
//ns := items[0]
//metric := items[1]
// blacklist
// prepare: tags
tags, _, err := Func{}.TranslateArgLines(argLines)
if err != nil {
return
}
// flush the not-yet-counted delta into stats
if self.raw && self.delta > 0 {
self.Stats[self.lastTimestamp] = self.delta
}
// default aggregation only
self.doAggr(tags, newAggrs)
// local aggregation
return
}
func (self *counterEAggregator) merge(toMerge aggregator) (aggregator, error) {
that := toMerge.(*counterEAggregator)
self.Counter += that.Counter
for ts, value := range that.Stats {
if _, found := self.Stats[ts]; found {
self.Stats[ts] += value
} else {
self.Stats[ts] = value
}
}
return self, nil
}
func (self *counterEAggregator) toMap() (map[string]interface{}, error) {
stats := map[int64]interface{}{}
for k, v := range self.Stats {
stats[k] = v
}
return map[string]interface{}{
"__aggregator__": "counterE",
"counter": self.Counter,
"stats": stats,
}, nil
}
func (self counterEAggregator) fromMap(serialized map[string]interface{}) (aggregator, error) {
// the raw field defaults to false
aggregator := &counterEAggregator{Counter: serialized["counter"].(float64), Stats: map[int64]float64{}}
stats := (serialized["stats"]).(map[int64]interface{})
for k, v := range stats {
aggregator.Stats[k] = v.(float64)
}
return aggregator, nil
}
// internals
func (self counterEAggregator) addSummarizeAggregator(argLines string, toMerge *counterEAggregator, newAggrs map[string]aggregator) {
aggr, ok := newAggrs[argLines]
if !(ok && aggr != nil) {
nAggr, err := toMerge.clone()
if err == nil {
newAggrs[argLines] = nAggr
}
} else {
aggr.merge(toMerge)
}
}
func (self *counterEAggregator) clone() (aggregator, error) {
maps, err := self.toMap()
if err != nil {
return nil, err
}
aggr, err := counterEAggregator{}.fromMap(maps)
if err != nil {
return nil, err
}
return aggr, nil
}
func (self *counterEAggregator) doAggr(tags map[string]string, newAggrs map[string]aggregator, aggrTagksList ...[][]string) {
tagks := make([]string, 0)
for k, _ := range tags {
tagks = append(tagks, k)
}
tagkNum := len(tagks)
if tagkNum == 0 {
return
}
sort.Strings(tagks)
// get formator
formator := ""
for i := 0; i < tagkNum; i++ {
formator += tagks[i] + "=%s\n"
}
formator += "ce"
// aggregate across all dimensions
ntagvs_all := make([]interface{}, tagkNum)
for i := 0; i < tagkNum; i++ {
ntagvs_all[i] = "<all>"
}
summarizedTags := fmt.Sprintf(formator, ntagvs_all...)
counterEAggregator{}.addSummarizeAggregator(summarizedTags, self, newAggrs)
// aggregate across the specified dimensions
if len(aggrTagksList) > 0 {
for i := 0; i < len(aggrTagksList[0]); i++ {
aggrTagks := aggrTagksList[0][i]
// validity check
if !(len(aggrTagks) > 0 && len(aggrTagks) < tagkNum && // == tagkNum would duplicate the all-dimensions aggregation
(Func{}).IsSubKeys(aggrTagks, tags)) { // the monitoring data must contain the specified aggregation dimensions
continue
}
// aggregate
sometagks := make([]interface{}, tagkNum)
for i, tk := range tagks {
sometagks[i] = tags[tk]
}
for _, tk := range aggrTagks {
for i := 0; i < tagkNum; i++ {
if tk == tagks[i] {
sometagks[i] = "<all>"
break
}
}
}
summarizedTags := fmt.Sprintf(formator, sometagks...)
counterEAggregator{}.addSummarizeAggregator(summarizedTags, self, newAggrs)
}
}
}

View File

@ -0,0 +1,69 @@
package statsd
import (
"fmt"
"strconv"
)
type gaugeAggregator struct {
Gauge float64
}
func (self *gaugeAggregator) new(aggregatorNames []string) (aggregator, error) {
if len(aggregatorNames) < 1 || aggregatorNames[0] != "g" {
return nil, BadAggregatorNameError
}
return &gaugeAggregator{}, nil
}
// the gauge type accepts one or more values (in batched mode); there is no statusCode field, and the SDK does not batch it
// e.g. 10{"\u2318"}1{"\u2318"}20
func (self *gaugeAggregator) collect(values []string, metric string, argLines string) error {
if len(values) < 1 {
return fmt.Errorf("bad values")
}
for i := range values {
delta := float64(0.0)
parsed, err := strconv.ParseFloat(values[i], 64)
if err != nil {
return err
}
delta = parsed
self.Gauge = delta
}
return nil
}
func (self *gaugeAggregator) dump(points []*Point, timestamp int64,
tags map[string]string, metric, argLines string) ([]*Point, error) {
points = append(points, &Point{
Name: metric + ".gauge",
Timestamp: timestamp,
Tags: tags,
Value: self.Gauge,
})
return points, nil
}
// aggregation is not supported
func (self *gaugeAggregator) summarize(nsmetric, argLines string, newAggrs map[string]aggregator) {
return
}
func (self *gaugeAggregator) merge(toMerge aggregator) (aggregator, error) {
return self, nil
}
func (self *gaugeAggregator) toMap() (map[string]interface{}, error) {
return map[string]interface{}{
"__aggregator__": "gauge",
"gauge": self.Gauge,
}, nil
}
func (self gaugeAggregator) fromMap(serialized map[string]interface{}) (aggregator, error) {
return &gaugeAggregator{Gauge: serialized["gauge"].(float64)}, nil
}

View File

@ -0,0 +1,187 @@
package statsd
import (
"bytes"
"encoding/base64"
"fmt"
"strconv"
tdigest "github.com/didi/nightingale/src/toolkits/go-tdigest"
)
type histogramAggregator struct {
AggregatorNames []string
digest *tdigest.TDigest
max float64
min float64
sum float64
cnt int
}
func (self *histogramAggregator) new(aggregatorNames []string) (aggregator, error) {
if len(aggregatorNames) < 1 {
return nil, BadAggregatorNameError
}
ni := self.newInstence(aggregatorNames)
return &ni, nil
}
// the histogram type accepts one or more values (in batched mode); there is no statusCode field
// e.g. 10.1{"\u2318"}10.2{"\u2318"}20.8
func (self *histogramAggregator) collect(values []string, metric string, argLines string) error {
if len(values) < 1 {
return fmt.Errorf("bad values")
}
for i := range values {
parsed, err := strconv.ParseFloat(values[i], 64)
if nil != err {
return err
}
self.sum += parsed
self.cnt += 1
if self.max < parsed {
self.max = parsed
}
if self.min > parsed {
self.min = parsed
}
err = self.digest.Add(parsed, 1)
if err != nil {
return err
}
}
return nil
}
func (self *histogramAggregator) dump(points []*Point, timestamp int64,
tags map[string]string, metric, argLines string) ([]*Point, error) {
for _, aggregatorName := range self.AggregatorNames {
value := 0.0
percentile := ""
switch aggregatorName {
case "p99":
value = self.digest.Quantile(0.99)
case "p95":
value = self.digest.Quantile(0.95)
case "p90":
value = self.digest.Quantile(0.90)
case "p75":
value = self.digest.Quantile(0.75)
case "p50":
value = self.digest.Quantile(0.5)
case "p25":
value = self.digest.Quantile(0.25)
case "p10":
value = self.digest.Quantile(0.10)
case "p5":
value = self.digest.Quantile(0.05)
case "p1":
value = self.digest.Quantile(0.01)
case "max":
value = self.max
percentile = "max"
case "min":
value = self.min
percentile = "min"
case "sum":
value = self.sum
percentile = "sum"
case "cnt":
value = float64(self.cnt)
percentile = "cnt"
case "avg":
if self.cnt > 0 {
value = self.sum / float64(self.cnt)
}
percentile = "avg"
default:
continue
}
// TODO: why aren't negative values supported? Keep the status quo for now, otherwise rpc latency metrics might be affected
if value < 0 {
value = 0
}
myTags := map[string]string{}
for k, v := range tags {
myTags[k] = v
}
if percentile == "" {
myTags["percentile"] = aggregatorName[1:]
} else {
myTags["percentile"] = percentile
}
points = append(points, &Point{
Name: metric,
Timestamp: timestamp,
Tags: myTags,
Value: value,
})
}
return points, nil
}
// this statistic provides no aggregation, so the functions below do not handle max/min/sum/cnt
func (self *histogramAggregator) summarize(nsmetric, argLines string, newAggrs map[string]aggregator) {
return
}
// used by the aggr_rpc structs when merging
func (self *histogramAggregator) merge(toMerge aggregator) (aggregator, error) {
that, ok := toMerge.(*histogramAggregator)
if !ok {
return nil, BadSummarizeAggregatorError
}
self.digest.Merge(that.digest)
return self, nil
}
func (self *histogramAggregator) toMap() (map[string]interface{}, error) {
digest, err := self.digest.AsBytes()
if nil != err {
return nil, err
}
aggregatorNames := make([]interface{}, 0)
for _, aggregatorName := range self.AggregatorNames {
aggregatorNames = append(aggregatorNames, aggregatorName)
}
return map[string]interface{}{
"__aggregator__": "histogram",
"aggregatorNames": aggregatorNames,
"digest": base64.StdEncoding.EncodeToString(digest),
}, nil
}
func (self *histogramAggregator) fromMap(serialized map[string]interface{}) (aggregator, error) {
b, err := base64.StdEncoding.DecodeString(serialized["digest"].(string))
if nil != err {
return nil, fmt.Errorf("failed to deserialize: %v", serialized)
}
digest, err := tdigest.FromBytes(bytes.NewReader(b))
if nil != err {
return nil, fmt.Errorf("failed to deserialize: %v", serialized)
}
aggregator := &histogramAggregator{AggregatorNames: make([]string, 0), digest: digest}
aggregatorNames := (serialized["aggregatorNames"]).([]interface{})
for _, aggregatorName := range aggregatorNames {
aggregator.AggregatorNames = append(aggregator.AggregatorNames, aggregatorName.(string))
}
return aggregator, nil
}
// internal functions
func (self histogramAggregator) newInstence(aggregatorNames []string) histogramAggregator {
return histogramAggregator{
AggregatorNames: aggregatorNames,
digest: tdigest.New(100),
max: float64(0.0),
min: float64(0.0),
sum: float64(0.0),
cnt: int(0),
}
}
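A hedged usage sketch of the histogram aggregator; everything here is unexported, so this would have to compile inside the statsd package, with fmt and time assumed imported:

// a minimal sketch, assuming the statsd package context
func demoHistogram() {
	h := histogramAggregator{}.newInstence([]string{"p99", "p50", "avg"})
	// values arrive as strings, possibly several per packet
	if err := h.collect([]string{"10.1", "10.2", "20.8"}, "api.latency", ""); err != nil {
		return
	}
	points, _ := h.dump(nil, time.Now().Unix(),
		map[string]string{"callee": "svc"}, "api.latency", "")
	for _, p := range points {
		// each point carries a percentile tag: p99, p50 or avg
		fmt.Println(p.Name, p.Tags["percentile"], p.Value)
	}
}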

View File

@ -0,0 +1,12 @@
package statsd
// interface aggregator
type aggregator interface {
new(aggregatorNames []string) (aggregator, error)
collect(values []string, metric string, argLines string) error
dump(points []*Point, timestamp int64, tags map[string]string, metric string, argLines string) ([]*Point, error)
summarize(nsmetric, argLines string, newAggrs map[string]aggregator)
merge(toMerge aggregator) (aggregator, error)
toMap() (map[string]interface{}, error)
fromMap(map[string]interface{}) (aggregator, error)
}
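The interface captures each metric type's lifecycle: new builds an instance from the aggregator name, collect ingests raw string values, summarize/merge combine instances across dimensions, dump emits Points, and toMap/fromMap (de)serialize. A hedged in-package sketch using the counter aggregator from above:

// a minimal sketch, assuming the statsd package context
func demoAggregatorLifecycle() error {
	a, err := (&counterAggregator{}).new([]string{"c"})
	if err != nil {
		return err
	}
	if err := a.collect([]string{"10", "1", "20"}, "req.count", ""); err != nil {
		return err
	}
	// round-trip through the map form, as done when aggregators are handed around
	m, err := a.toMap()
	if err != nil {
		return err
	}
	b, err := counterAggregator{}.fromMap(m)
	if err != nil {
		return err
	}
	merged, err := a.merge(b) // 31 + 31
	if err != nil {
		return err
	}
	// one point named "req.count.counter" with value 62
	points, err := merged.dump(nil, time.Now().Unix(),
		map[string]string{"host": "n9e-01"}, "req.count", "")
	_ = points
	return err
}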

View File

@ -0,0 +1,200 @@
package statsd
import (
"fmt"
"strconv"
"strings"
)
type ratioAggregator struct {
Counters map[string]float64
}
func (self *ratioAggregator) new(aggregatorNames []string) (aggregator, error) {
if len(aggregatorNames) < 1 || aggregatorNames[0] != "r" {
return nil, BadAggregatorNameError
}
return &ratioAggregator{Counters: map[string]float64{}}, nil
}
// the ratio type accepts one or more values (in batched mode) and has a statusCode field
// old protocol, e.g.: ok{"\u2318"}error{"\u2318"}ok
// new protocol, e.g.: 1,ok{"\u2318"}1,error{"\u2318"}0,ok
func (self *ratioAggregator) collect(values []string, metric string, argLines string) error {
if len(values) < 1 {
return fmt.Errorf("bad values")
}
for i := range values {
/*
old protocol: "error" counts as 1; for "error,none" the code is error (values[0] here; none is truncated)
new protocol: "2,error" counts as 2; for "2,error,none" the code is error (values[1] here; none is truncated)
for backward compatibility:
1. a bare "error" without a "," counts directly as 1
2. with a ",", if values[0] cannot be parsed as a number, count as 1 and take values[0] as the code
3. with a ",", data like "2,error" reported via the old protocol is handled as the new protocol: the code changes from 2 to error
*/
cvalues := strings.Split(values[i], CodeDelimiter)
if len(cvalues) == 0 {
continue
}
if len(cvalues) == 1 {
code := values[0]
self.Counters[code] += 1
continue
}
code := cvalues[1]
value, err := strconv.ParseFloat(cvalues[0], 64)
if err != nil {
value = float64(1) // backward compatibility with the old protocol: "error,something" is handled as 1,error
code = values[0]
}
self.Counters[code] += value
}
return nil
}
func (self *ratioAggregator) dump(points []*Point, timestamp int64,
tags map[string]string, metric, argLines string) ([]*Point, error) {
return self._dump(false, points, timestamp, tags, metric, argLines)
}
func (self *ratioAggregator) summarize(nsmetric, argLines string, newAggrs map[string]aggregator) {
return
}
func (self *ratioAggregator) merge(toMerge aggregator) (aggregator, error) {
that := toMerge.(*ratioAggregator)
for k, v2 := range that.Counters {
_, found := self.Counters[k]
if found {
self.Counters[k] += v2
} else {
self.Counters[k] = v2
}
}
return self, nil
}
func (self *ratioAggregator) toMap() (map[string]interface{}, error) {
counters := map[string]float64{}
for k, v := range self.Counters {
counters[k] = v
}
return map[string]interface{}{
"__aggregator__": "ratio",
"counters": counters,
}, nil
}
func (self *ratioAggregator) fromMap(serialized map[string]interface{}) (aggregator, error) {
aggr := &ratioAggregator{Counters: map[string]float64{}}
counters := (serialized["counters"]).(map[string]interface{})
for k, v := range counters {
aggr.Counters[k] = v.(float64)
}
return aggr, nil
}
func (self *ratioAggregator) _dump(
asTags bool, points []*Point, timestamp int64, tags map[string]string,
metric string, argLines string) ([]*Point, error) {
// nothing collected, nothing to dump
if len(self.Counters) == 0 {
return points, nil
}
convertedCounters := map[string]float64{}
total := float64(0)
for code, byCodeCount := range self.Counters {
counter := byCodeCount
convertedCounters[code] = counter
total += counter
}
if total > 0 {
for code := range self.Counters {
myMetric := metric
myTags := tags
if asTags {
myTags = map[string]string{}
for tagk, tagv := range tags {
myTags[tagk] = tagv
}
myTags["code"] = code
myMetric = metric + ".ratio"
} else {
myMetric = metric + "." + code + ".ratio"
}
points = append(points, &Point{
Name: myMetric,
Timestamp: timestamp,
Tags: myTags,
Value: convertedCounters[code] / total * 100,
})
}
}
points = append(points, &Point{
Name: metric + ".counter",
Timestamp: timestamp,
Tags: tags,
Value: total,
})
return points, nil
}
////////////////////////////////////////////////////////////
// struct ratioAsTagsAggregator
////////////////////////////////////////////////////////////
type ratioAsTagsAggregator struct {
ratioAggregator
}
func (self *ratioAsTagsAggregator) new(aggregatorNames []string) (aggregator, error) {
if len(aggregatorNames) < 1 || aggregatorNames[0] != "rt" {
return nil, BadAggregatorNameError
}
return &ratioAsTagsAggregator{ratioAggregator: ratioAggregator{Counters: map[string]float64{}}}, nil
}
func (self *ratioAsTagsAggregator) dump(points []*Point, timestamp int64,
tags map[string]string, metric, argLines string) ([]*Point, error) {
return self._dump(true, points, timestamp, tags, metric, argLines)
}
func (self *ratioAsTagsAggregator) merge(toMerge aggregator) (aggregator, error) {
that := toMerge.(*ratioAsTagsAggregator)
merged, err := self.ratioAggregator.merge(&that.ratioAggregator)
if err != nil {
return self, err
}
self.ratioAggregator = *(merged.(*ratioAggregator))
return self, nil
}
func (self *ratioAsTagsAggregator) toMap() (map[string]interface{}, error) {
counters := map[string]float64{}
for k, v := range self.Counters {
counters[k] = v
}
return map[string]interface{}{
"__aggregator__": "ratioAsTags",
"counters": counters,
}, nil
}
func (self *ratioAsTagsAggregator) fromMap(serialized map[string]interface{}) (aggregator, error) {
aggr, err := self.ratioAggregator.fromMap(serialized)
if err != nil {
return nil, err
}
raggr := aggr.(*ratioAggregator)
return &ratioAsTagsAggregator{ratioAggregator: *raggr}, nil
}
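The protocol-compatibility rules in collect above reduce to one small parsing decision. A self-contained sketch (CodeDelimiter is the package's delimiter constant; judging from the comments it is a comma, which is assumed here):

package main

import (
	"fmt"
	"strconv"
	"strings"
)

const codeDelimiter = "," // assumption: mirrors the package's CodeDelimiter

// parseRatioValue returns (count, code) for one reported value,
// following the compatibility rules documented in collect.
func parseRatioValue(v string) (float64, string) {
	parts := strings.Split(v, codeDelimiter)
	if len(parts) == 1 {
		return 1, parts[0] // old protocol: a bare code counts as 1
	}
	count, err := strconv.ParseFloat(parts[0], 64)
	if err != nil {
		return 1, parts[0] // "error,something" is handled as 1,error
	}
	return count, parts[1] // new protocol: "2,error" counts error twice
}

func main() {
	for _, v := range []string{"ok", "error", "2,error", "0,ok"} {
		c, code := parseRatioValue(v)
		fmt.Printf("%-8s -> count=%v code=%s\n", v, c, code)
	}
}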

View File

@ -0,0 +1,441 @@
package statsd
import (
"fmt"
"sort"
"strconv"
"strings"
)
type rpcAggregator struct {
histogramAggregator
Counters map[string]float64
Latencys map[string]float64
}
func (self *rpcAggregator) new(aggregatorNames []string) (aggregator, error) {
if len(aggregatorNames) < 1 || aggregatorNames[0] != "rpc" {
return nil, BadAggregatorNameError
}
histogramAggregatorNames := []string{"p99", "p95", "p75", "p50"}
return &rpcAggregator{
histogramAggregator: histogramAggregator{}.newInstence(histogramAggregatorNames),
Counters: map[string]float64{},
Latencys: map[string]float64{},
}, nil
}
// the rpc type accepts one or more values (in batched mode) and has a statusCode field
// e.g. 10.1,ok{"\u2318"}10.2,error{"\u2318"}20.8,ok
func (self *rpcAggregator) collect(values []string, metric string, argLines string) error {
if len(values) < 1 {
return fmt.Errorf("bad values")
}
for i := range values {
cvalues := strings.Split(values[i], CodeDelimiter)
if len(cvalues) < 2 {
// bad values
continue
}
err := self.histogramAggregator.collect(cvalues[:1], metric, argLines)
if err != nil {
return err
}
latency, err := strconv.ParseFloat(cvalues[0], 64)
if err != nil {
return err
}
code := cvalues[1]
self.Counters[code] += 1
self.Latencys[code] += latency
}
return nil
}
// @input
// metric: $metric_name (without the ns)
func (self *rpcAggregator) dump(points []*Point, timestamp int64,
tags map[string]string, metric, argLines string) ([]*Point, error) {
var (
err error
)
// no data, no points to dump
if len(self.Counters) == 0 {
return points, nil
}
// validate tags: callee and caller must both be present
if _, ok := tags["caller"]; !ok {
return points, nil
}
callee, ok := tags["callee"]
if !ok {
return points, nil
}
tags["callee"] = Func{}.TrimRpcCallee(callee) // 修改callee字段
// 带tag的rpc统计, 指标名称调整为 by_tags.$metric
//if len(tags) > 2 {
// metric = fmt.Sprintf("by_tags.%s", metric)
//}
totalCount := float64(0)
totalErrorCount := float64(0)
for code, count := range self.Counters {
if !(Func{}.IsOk(code)) {
myTags := map[string]string{}
for k, v := range tags {
myTags[k] = v
}
myTags["code"] = code
points = append(points, &Point{
Name: metric + ".error.counter",
Timestamp: timestamp,
Tags: myTags,
Value: count,
})
totalErrorCount += count
}
totalCount += count
}
points = append(points, &Point{
Name: metric + ".counter",
Timestamp: timestamp,
Tags: tags,
Value: totalCount,
})
if totalCount > 0 {
points = append(points, &Point{
Name: metric + ".error.ratio",
Timestamp: timestamp,
Tags: tags,
Value: totalErrorCount / totalCount * 100,
})
myTags := map[string]string{}
for k, v := range tags {
myTags[k] = v
}
myTags["code"] = "<all>"
points = append(points, &Point{
Name: metric + ".error.counter",
Timestamp: timestamp,
Tags: myTags,
Value: totalErrorCount,
})
}
// latency
latencyMetric := fmt.Sprintf("%s.latency", metric)
{ // avg
totalLatency := float64(0)
for _, latency := range self.Latencys {
totalLatency += latency
}
avgLatency := float64(0)
if totalCount > 0 && totalLatency > 0 {
avgLatency = totalLatency / totalCount
}
myTags := map[string]string{}
for k, v := range tags {
myTags[k] = v
}
myTags["percentile"] = "avg"
points = append(points, &Point{
Name: latencyMetric,
Timestamp: timestamp,
Tags: myTags,
Value: avgLatency,
})
}
points, err = self.histogramAggregator.dump(points, timestamp, tags, latencyMetric, argLines) // percentile
return points, err
}
func (self *rpcAggregator) summarize(nsmetric, argLines string, newAggrs map[string]aggregator) {
items, _ := Func{}.TranslateMetricLine(nsmetric)
//ns := items[0]
metric := items[1]
tags, _, err := Func{}.TranslateArgLines(argLines)
if err != nil {
return
}
// rpc_dirpc_call & rpc_dirpc_called
if metric == MetricToBeSummarized_DirpcCallConst || metric == MetricToBeSummarized_DirpcCalledConst {
if len(tags) != 5 {
return
}
callee, _ := tags["callee"]
calleef, _ := tags["callee-func"]
caller, _ := tags["caller"]
callerf, _ := tags["caller-func"]
su, _ := tags["su"]
if !(caller != "" && callerf != "" && callee != "" && calleef != "" && su != "") {
return
}
formator := "callee=%s\ncallee-func=%s\ncaller=%s\ncaller-func=%s\nsu=%s\nrpc"
if calleef != "<all>" {
summarizedCalleef := fmt.Sprintf(formator, callee, "<all>", caller, callerf, su)
rpcAggregator{}.addSummarizeAggregator(summarizedCalleef, self, newAggrs)
}
if callerf != "<all>" {
summarizedCallerf := fmt.Sprintf(formator, callee, calleef, caller, "<all>", su)
rpcAggregator{}.addSummarizeAggregator(summarizedCallerf, self, newAggrs)
}
if calleef != "<all>" && callerf != "<all>" {
summarizedCalleefCallerf := fmt.Sprintf(formator, callee, "<all>", caller, "<all>", su)
rpcAggregator{}.addSummarizeAggregator(summarizedCalleefCallerf, self, newAggrs)
}
return
}
// rpcdisf
if metric == MetricToBeSummarized_RpcdisfConst {
if len(tags) != 7 {
return
}
callee, _ := tags["callee"]
calleec, _ := tags["callee-cluster"]
calleef, _ := tags["callee-func"]
caller, _ := tags["caller"]
callerc, _ := tags["caller-cluster"]
callerf, _ := tags["caller-func"]
su, _ := tags["su"]
if !(caller != "" && callerc != "" && callerf != "" &&
callee != "" && calleec != "" && calleef != "" && su != "") {
return
}
formator := "callee=%s\ncallee-cluster=%s\ncallee-func=%s\ncaller=%s\ncaller-cluster=%s\ncaller-func=%s\nsu=%s\nrpc"
if calleef != "<all>" {
summarizedCalleef := fmt.Sprintf(formator, callee, calleec, "<all>", caller, callerc, callerf, su)
rpcAggregator{}.addSummarizeAggregator(summarizedCalleef, self, newAggrs)
}
if callerf != "<all>" {
summarizedCallerf := fmt.Sprintf(formator, callee, calleec, calleef, caller, callerc, "<all>", su)
rpcAggregator{}.addSummarizeAggregator(summarizedCallerf, self, newAggrs)
}
summarizedCalleefCallerf := fmt.Sprintf(formator, callee, calleec, "<all>", caller, callerc, "<all>", su)
rpcAggregator{}.addSummarizeAggregator(summarizedCalleefCallerf, self, newAggrs)
return
}
// rpcdfe
if metric == MetricToBeSummarized_RpcdfeConst {
if len(tags) != 5 {
return
}
callee, _ := tags["callee"]
caller, _ := tags["caller"]
domain, _ := tags["domain"]
scheme, _ := tags["scheme"]
upstream, _ := tags["upstream"]
if !(callee != "" && caller != "" && domain != "" &&
scheme != "" && upstream != "") {
return
}
formator := "callee=%s\ncaller=%s\ndomain=%s\nscheme=%s\nupstream=%s\nrpc"
if domain != "<all>" {
summarizedDomain := fmt.Sprintf(formator, callee, caller, "<all>", scheme, upstream)
rpcAggregator{}.addSummarizeAggregator(summarizedDomain, self, newAggrs)
}
if scheme != "<all>" {
summarizedScheme := fmt.Sprintf(formator, callee, caller, domain, "<all>", upstream)
rpcAggregator{}.addSummarizeAggregator(summarizedScheme, self, newAggrs)
}
if upstream != "<all>" {
summarizedUpstream := fmt.Sprintf(formator, callee, caller, domain, scheme, "<all>")
rpcAggregator{}.addSummarizeAggregator(summarizedUpstream, self, newAggrs)
}
summarizedDomainSchemeUp := fmt.Sprintf(formator, callee, caller, "<all>", "<all>", "<all>")
rpcAggregator{}.addSummarizeAggregator(summarizedDomainSchemeUp, self, newAggrs)
return
}
// blacklist
// default aggregation only
self.doAggr(tags, newAggrs)
// local aggregation
return
}
func (self *rpcAggregator) merge(toMerge aggregator) (aggregator, error) {
that, ok := toMerge.(*rpcAggregator)
if !ok {
return nil, BadSummarizeAggregatorError
}
_, err := self.histogramAggregator.merge(&that.histogramAggregator)
if err != nil {
return nil, err
}
for k, v2 := range that.Counters {
_, found := self.Counters[k]
if found {
self.Counters[k] += v2
} else {
self.Counters[k] = v2
}
}
for k, v2 := range that.Latencys {
_, found := self.Latencys[k]
if found {
self.Latencys[k] += v2
} else {
self.Latencys[k] = v2
}
}
return self, nil
}
func (self *rpcAggregator) toMap() (map[string]interface{}, error) {
counters := map[string]interface{}{}
for k, v := range self.Counters {
counters[k] = v
}
latencys := map[string]interface{}{}
for k, v := range self.Latencys {
latencys[k] = v
}
hm, err := self.histogramAggregator.toMap()
if err != nil {
return nil, err
}
return map[string]interface{}{
"__aggregator__": "rpc",
"counters": counters,
"latencys": latencys,
"histogram": hm,
}, nil
}
func (self rpcAggregator) fromMap(serialized map[string]interface{}) (aggregator, error) {
aggregator := &rpcAggregator{Counters: map[string]float64{}, Latencys: map[string]float64{}}
counters := (serialized["counters"]).(map[string]interface{})
for k, v := range counters {
aggregator.Counters[k] = v.(float64)
}
latencys := (serialized["latencys"]).(map[string]interface{})
for k, v := range latencys {
aggregator.Latencys[k] = v.(float64)
}
histogram := (serialized["histogram"]).(map[string]interface{})
hm, err := self.histogramAggregator.fromMap(histogram)
if err != nil {
return nil, err
}
hmaggr, ok := hm.(*histogramAggregator)
if !ok {
return nil, BadDeserializeError
}
aggregator.histogramAggregator = *hmaggr
return aggregator, nil
}
// internal functions
func (self rpcAggregator) addSummarizeAggregator(argLines string, toMerge *rpcAggregator, newAggrs map[string]aggregator) {
aggr, ok := newAggrs[argLines]
if !(ok && aggr != nil) {
nAggr, err := toMerge.clone()
if err == nil {
newAggrs[argLines] = nAggr
}
} else {
aggr.merge(toMerge)
}
}
func (self *rpcAggregator) clone() (aggregator, error) {
maps, err := self.toMap()
if err != nil {
return nil, err
}
aggr, err := rpcAggregator{}.fromMap(maps)
if err != nil {
return nil, err
}
return aggr, nil
}
func (self *rpcAggregator) doAggr(tags map[string]string, newAggrs map[string]aggregator, aggrTagksList ...[][]string) {
tagks := make([]string, 0)
for k, _ := range tags {
tagks = append(tagks, k)
}
tagkNum := len(tagks)
if tagkNum == 0 {
return
}
sort.Strings(tagks)
// get formator
formator := ""
for i := 0; i < tagkNum; i++ {
formator += tagks[i] + "=%s\n"
}
formator += "rpc"
// aggregate across all dimensions
ntagvs_all := make([]interface{}, tagkNum)
for i := 0; i < tagkNum; i++ {
ntagvs_all[i] = "<all>"
}
summarizedTags := fmt.Sprintf(formator, ntagvs_all...)
rpcAggregator{}.addSummarizeAggregator(summarizedTags, self, newAggrs)
// aggregate across the specified dimensions
if len(aggrTagksList) > 0 {
for i := 0; i < len(aggrTagksList[0]); i++ {
aggrTagks := aggrTagksList[0][i]
// validity check
if !(len(aggrTagks) > 0 && len(aggrTagks) < tagkNum && // == tagkNum would duplicate the all-dimensions aggregation
(Func{}).IsSubKeys(aggrTagks, tags)) { // the monitoring data must contain the specified aggregation dimensions
continue
}
// aggregate
sometagks := make([]interface{}, tagkNum)
for i, tk := range tagks {
sometagks[i] = tags[tk]
}
for _, tk := range aggrTagks {
for i := 0; i < tagkNum; i++ {
if tk == tagks[i] {
sometagks[i] = "<all>"
break
}
}
}
summarizedTags := fmt.Sprintf(formator, sometagks...)
rpcAggregator{}.addSummarizeAggregator(summarizedTags, self, newAggrs)
}
}
}

View File

@ -0,0 +1,470 @@
package statsd
import (
"fmt"
"sort"
"strconv"
"strings"
)
type rpcEAggregator struct {
histogramAggregator
Counters map[string]float64
Latencys map[string]float64
}
func (self *rpcEAggregator) new(aggregatorNames []string) (aggregator, error) {
if len(aggregatorNames) < 1 || aggregatorNames[0] != "rpce" {
return nil, BadAggregatorNameError
}
histogramAggregatorNames := []string{"p99", "p95", "p75", "p50"}
return &rpcEAggregator{
histogramAggregator: histogramAggregator{}.newInstence(histogramAggregatorNames),
Counters: map[string]float64{},
Latencys: map[string]float64{},
}, nil
}
func (self *rpcEAggregator) collect(values []string, metric string, argLines string) error {
if len(values) < 1 {
return fmt.Errorf("bad values")
}
for i := range values {
cvalues := strings.Split(values[i], CodeDelimiter)
if len(cvalues) < 2 {
// bad values
continue
}
err := self.histogramAggregator.collect(cvalues[:1], metric, argLines)
if err != nil {
return err
}
latency, err := strconv.ParseFloat(cvalues[0], 64)
if err != nil {
return err
}
code := cvalues[1]
self.Counters[code] += 1
self.Latencys[code] += latency
}
return nil
}
// @input
// metric: $metric_name (without the ns)
func (self *rpcEAggregator) dump(points []*Point, timestamp int64,
tags map[string]string, metric, argLines string) ([]*Point, error) {
var (
err error
)
// no data, no points to dump
if len(self.Counters) == 0 {
return points, nil
}
// validate tags: callee and caller must both be present
if _, ok := tags["caller"]; !ok {
return points, nil
}
callee, ok := tags["callee"]
if !ok {
return points, nil
}
tags["callee"] = Func{}.TrimRpcCallee(callee) // 修改callee字段
// 带tag的rpc统计, 指标名称调整为 by_tags.$metric
//if len(tags) > 2 {
// metric = fmt.Sprintf("by_tags.%s", metric)
//}
totalCount := float64(0)
totalErrorCount := float64(0)
for code, count := range self.Counters {
if !(Func{}.IsOk(code)) {
myTags := map[string]string{}
for k, v := range tags {
myTags[k] = v
}
myTags["code"] = code
points = append(points, &Point{
Name: metric + ".error.counter",
Timestamp: timestamp,
Tags: myTags,
Value: count,
})
totalErrorCount += count
}
totalCount += count
}
points = append(points, &Point{
Name: metric + ".counter",
Timestamp: timestamp,
Tags: tags,
Value: totalCount,
})
if totalCount > 0 {
for code, count := range self.Counters {
myTags := map[string]string{}
for k, v := range tags {
myTags[k] = v
}
myTags["code"] = code
points = append(points, &Point{
Name: metric + ".code.ratio",
Timestamp: timestamp,
Tags: myTags,
Value: count / totalCount * 100,
})
}
points = append(points, &Point{
Name: metric + ".error.ratio",
Timestamp: timestamp,
Tags: tags,
Value: totalErrorCount / totalCount * 100,
})
myTags := map[string]string{}
for k, v := range tags {
myTags[k] = v
}
myTags["code"] = "<all>"
points = append(points, &Point{
Name: metric + ".error.counter",
Timestamp: timestamp,
Tags: myTags,
Value: totalErrorCount,
})
}
// latency
latencyMetric := fmt.Sprintf("%s.latency", metric)
{ // avg
totalLatency := float64(0)
for _, latency := range self.Latencys {
totalLatency += latency
}
avgLatency := float64(0)
if totalCount > 0 && totalLatency > 0 {
avgLatency = totalLatency / totalCount
}
myTags := map[string]string{}
for k, v := range tags {
myTags[k] = v
}
myTags["percentile"] = "avg"
points = append(points, &Point{
Name: latencyMetric,
Timestamp: timestamp,
Tags: myTags,
Value: avgLatency,
})
}
points, err = self.histogramAggregator.dump(points, timestamp, tags, latencyMetric, argLines) // percentile
return points, err
}
func (self *rpcEAggregator) summarize(nsmetric, argLines string, newAggrs map[string]aggregator) {
items, _ := Func{}.TranslateMetricLine(nsmetric)
//ns := items[0]
metric := items[1]
tags, _, err := Func{}.TranslateArgLines(argLines)
if err != nil {
return
}
// rpc_dirpc_call & rpc_dirpc_called
if metric == MetricToBeSummarized_DirpcCallConst || metric == MetricToBeSummarized_DirpcCalledConst {
if len(tags) != 5 {
return
}
callee, _ := tags["callee"]
calleef, _ := tags["callee-func"]
caller, _ := tags["caller"]
callerf, _ := tags["caller-func"]
su, _ := tags["su"]
if !(caller != "" && callerf != "" && callee != "" && calleef != "" && su != "") {
return
}
formator := "callee=%s\ncallee-func=%s\ncaller=%s\ncaller-func=%s\nsu=%s\nrpce"
if calleef != "<all>" {
summarizedCalleef := fmt.Sprintf(formator, callee, "<all>", caller, callerf, su)
rpcEAggregator{}.addSummarizeAggregator(summarizedCalleef, self, newAggrs)
}
if callerf != "<all>" {
summarizedCallerf := fmt.Sprintf(formator, callee, calleef, caller, "<all>", su)
rpcEAggregator{}.addSummarizeAggregator(summarizedCallerf, self, newAggrs)
}
if calleef != "<all>" && callerf != "<all>" {
summarizedCalleefCallerf := fmt.Sprintf(formator, callee, "<all>", caller, "<all>", su)
rpcEAggregator{}.addSummarizeAggregator(summarizedCalleefCallerf, self, newAggrs)
}
return
}
// rpcdisf
if metric == MetricToBeSummarized_RpcdisfConst {
if len(tags) != 7 {
return
}
callee, _ := tags["callee"]
calleec, _ := tags["callee-cluster"]
calleef, _ := tags["callee-func"]
caller, _ := tags["caller"]
callerc, _ := tags["caller-cluster"]
callerf, _ := tags["caller-func"]
su, _ := tags["su"]
if !(caller != "" && callerc != "" && callerf != "" &&
callee != "" && calleec != "" && calleef != "" && su != "") {
return
}
formator := "callee=%s\ncallee-cluster=%s\ncallee-func=%s\ncaller=%s\ncaller-cluster=%s\ncaller-func=%s\nsu=%s\nrpce"
if calleef != "<all>" {
summarizedCalleef := fmt.Sprintf(formator, callee, calleec, "<all>", caller, callerc, callerf, su)
rpcEAggregator{}.addSummarizeAggregator(summarizedCalleef, self, newAggrs)
}
if callerf != "<all>" {
summarizedCallerf := fmt.Sprintf(formator, callee, calleec, calleef, caller, callerc, "<all>", su)
rpcEAggregator{}.addSummarizeAggregator(summarizedCallerf, self, newAggrs)
}
summarizedCalleefCallerf := fmt.Sprintf(formator, callee, calleec, "<all>", caller, callerc, "<all>", su)
rpcEAggregator{}.addSummarizeAggregator(summarizedCalleefCallerf, self, newAggrs)
return
}
// rpcdfe
if metric == MetricToBeSummarized_RpcdfeConst {
tagks := make([]string, 0)
for k, _ := range tags {
tagks = append(tagks, k)
}
tagkLen := len(tagks)
if tagkLen < 3 {
return
}
sort.Strings(tagks)
callee, _ := tags["callee"]
caller, _ := tags["caller"]
service, _ := tags["service"]
if !(callee != "" && caller != "" && service != "") {
return
}
// aggregate callee, caller, service, and schema individually
for k, v := range tags {
if (k == "callee" && v != "<all>") || (k == "caller" && v != "<all>") ||
(k == "service" && v != "<all>") || (k == "schema" && v != "<all>") {
formator := ""
for i := 0; i < tagkLen; i++ {
formator += tagks[i] + "=%s\n"
}
formator += "rpce"
// build the tag values with this key set to <all>
ntagvs_all := make([]interface{}, tagkLen)
for i := 0; i < tagkLen; i++ {
if tagks[i] == k {
ntagvs_all[i] = "<all>"
} else {
ntagvs_all[i] = tags[tagks[i]]
}
}
summarizedTags := fmt.Sprintf(formator, ntagvs_all...)
rpcEAggregator{}.addSummarizeAggregator(summarizedTags, self, newAggrs)
}
}
// by default, aggregate across all tags
self.doAggr(tags, newAggrs)
return
}
// blacklist
// only the default aggregation is applied
self.doAggr(tags, newAggrs)
// local aggregation
return
}
func (self *rpcEAggregator) merge(toMerge aggregator) (aggregator, error) {
that, ok := toMerge.(*rpcEAggregator)
if !ok {
return nil, BadSummarizeAggregatorError
}
_, err := self.histogramAggregator.merge(&that.histogramAggregator)
if err != nil {
return nil, err
}
for k, v2 := range that.Counters {
_, found := self.Counters[k]
if found {
self.Counters[k] += v2
} else {
self.Counters[k] = v2
}
}
for k, v2 := range that.Latencys {
_, found := self.Latencys[k]
if found {
self.Latencys[k] += v2
} else {
self.Latencys[k] = v2
}
}
return self, nil
}
func (self *rpcEAggregator) toMap() (map[string]interface{}, error) {
counters := map[string]interface{}{}
for k, v := range self.Counters {
counters[k] = v
}
latencys := map[string]interface{}{}
for k, v := range self.Latencys {
latencys[k] = v
}
hm, err := self.histogramAggregator.toMap()
if err != nil {
return nil, err
}
return map[string]interface{}{
"__aggregator__": "rpce",
"counters": counters,
"latencys": latencys,
"histogram": hm,
}, nil
}
func (self rpcEAggregator) fromMap(serialized map[string]interface{}) (aggregator, error) {
aggregator := &rpcEAggregator{Counters: map[string]float64{}, Latencys: map[string]float64{}}
counters := (serialized["counters"]).(map[string]interface{})
for k, v := range counters {
aggregator.Counters[k] = v.(float64)
}
latencys := (serialized["latencys"]).(map[string]interface{})
for k, v := range latencys {
aggregator.Latencys[k] = v.(float64)
}
histogram := (serialized["histogram"]).(map[string]interface{})
hm, err := self.histogramAggregator.fromMap(histogram)
if err != nil {
return nil, err
}
hmaggr, ok := hm.(*histogramAggregator)
if !ok {
return nil, BadDeserializeError
}
aggregator.histogramAggregator = *hmaggr
return aggregator, nil
}
// internal functions
func (self rpcEAggregator) addSummarizeAggregator(argLines string, toMerge *rpcEAggregator, newAggrs map[string]aggregator) {
aggr, ok := newAggrs[argLines]
if !(ok && aggr != nil) {
nAggr, err := toMerge.clone()
if err == nil {
newAggrs[argLines] = nAggr
}
} else {
aggr.merge(toMerge)
}
}
func (self *rpcEAggregator) clone() (aggregator, error) {
maps, err := self.toMap()
if err != nil {
return nil, err
}
aggr, err := rpcEAggregator{}.fromMap(maps)
if err != nil {
return nil, err
}
return aggr, nil
}
func (self *rpcEAggregator) doAggr(tags map[string]string, newAggrs map[string]aggregator, aggrTagksList ...[][]string) {
tagks := make([]string, 0)
for k, _ := range tags {
tagks = append(tagks, k)
}
tagkNum := len(tagks)
if tagkNum == 0 {
return
}
sort.Strings(tagks)
// get formator
formator := ""
for i := 0; i < tagkNum; i++ {
formator += tagks[i] + "=%s\n"
}
formator += "rpce"
// aggregate across all dimensions
ntagvs_all := make([]interface{}, tagkNum)
for i := 0; i < tagkNum; i++ {
ntagvs_all[i] = "<all>"
}
summarizedTags := fmt.Sprintf(formator, ntagvs_all...)
rpcEAggregator{}.addSummarizeAggregator(summarizedTags, self, newAggrs)
// aggregate over the specified dimensions
if len(aggrTagksList) > 0 {
for i := 0; i < len(aggrTagksList[0]); i++ {
aggrTagks := aggrTagksList[0][i]
// validity check
if !(len(aggrTagks) > 0 && len(aggrTagks) < tagkNum && // == tagkNum would repeat the all-dimensions aggregation
(Func{}).IsSubKeys(aggrTagks, tags)) { // the metric data must contain the requested aggregation tag keys
continue
}
// aggregate
sometagks := make([]interface{}, tagkNum)
for i, tk := range tagks {
sometagks[i] = tags[tk]
}
for _, tk := range aggrTagks {
for i := 0; i < tagkNum; i++ {
if tk == tagks[i] {
sometagks[i] = "<all>"
break
}
}
}
summarizedTags := fmt.Sprintf(formator, sometagks...)
rpcEAggregator{}.addSummarizeAggregator(summarizedTags, self, newAggrs)
}
}
}

View File

@ -0,0 +1,41 @@
package statsd
import (
"sync/atomic"
"time"
)
type Clock struct {
start int64
timestamp int64
}
var clock Clock
func init() {
ts := time.Now().Unix()
clock.start = ts
clock.timestamp = ts
go clock.modify()
}
func (t *Clock) modify() {
duration := time.Duration(100) * time.Millisecond
for {
now := time.Now().Unix()
t.set(now)
time.Sleep(duration)
}
}
func (t *Clock) set(ts int64) {
atomic.StoreInt64(&t.timestamp, ts)
}
func (t *Clock) get() int64 {
return atomic.LoadInt64(&t.timestamp)
}
func GetTimestamp() int64 {
return clock.get()
}
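// A usage sketch (illustrative; stampUnixSeconds is a hypothetical helper):
// hot paths stamp points with the cached clock instead of calling
// time.Now() per packet; the value may lag real time by up to the 100ms
// refresh interval, which is acceptable for second-granularity metrics.
func stampUnixSeconds(p *Point) {
p.Timestamp = GetTimestamp() // atomic load, no syscall
}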

View File

@ -0,0 +1,82 @@
package statsd
import (
"sync"
)
var (
// aggregation types supported by metrics
CommonAggregatorsConst = map[string]bool{
"c": true, "ce": true, "rpc": true, "r": true, "rt": true,
"p1": true, "p5": true, "p25": true, "p50": true, "p75": true,
"p90": true, "p95": true, "p99": true, "rpce": true,
"max": true, "min": true, "sum": true, "avg": true, "cnt": true,
"g": true,
}
HistogramAggregatorsConst = map[string]bool{
"p1": true, "p5": true, "p25": true, "p50": true, "p75": true,
"p90": true, "p95": true, "p99": true,
"max": true, "min": true, "sum": true, "avg": true, "cnt": true,
}
Const_CommonAggregator_Rpc = "rpc"
Const_CommonAggregator_RpcE = "rpce"
// rpc status codes treated as success
RpcOkCodesConst = map[string]bool{"ok": true, "0": true,
"200": true, "201": true, "203": true}
// maximum number of tags supported by metrics
MaxTagsCntConst = 12
// ns prefix and suffix
NsPrefixConst = ""
NsSuffixConst = ""
// metrics that need summarization
MetricToBeSummarized_RpcdisfConst = "rpcdisf"
MetricToBeSummarized_RpcdfeConst = "rpcdfe"
MetricToBeSummarized_DirpcCallConst = "rpc_dirpc_call"
MetricToBeSummarized_DirpcCalledConst = "rpc_dirpc_called"
// timeout (ms) for summarize to wait for collect to finish
SummarizeWaitCollectTimeoutMsConst = 2000
// tag key that carries the traceid
TagTraceId = "traceid"
// size of the LRU cache
MaxLRUCacheSize = 10000
// delimiter used in merged-packet mode
MergeDelimiter = "&"
// delimiter between $value and $statusCode; "," is kept for backward compatibility
CodeDelimiter = ","
)
var (
exitLock = &sync.RWMutex{}
isExited = false
)
func Start() {
isExited = false
// periodically pull config from the center
//go MetricAgentConfig{}.UpdateLoop()
// start reporting monitoring data
go StatsdReporter{}.Report()
}
func Exit() {
exitLock.Lock()
isExited = true
exitLock.Unlock()
}
func IsExited() bool {
exitLock.RLock()
r := isExited
exitLock.RUnlock()
return r
}
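// A sketch of how the delimiters compose a value line (illustrative,
// assumes strings is imported): sub-values are packed with MergeDelimiter,
// and each rpce sub-value is "$value,$code" via CodeDelimiter.
// parseValueLine is a hypothetical helper mirroring TranslateValueLine
// plus the per-protocol split in rpcEAggregator.collect.
func parseValueLine(line string) [][2]string {
out := make([][2]string, 0)
for _, v := range strings.Split(line, MergeDelimiter) { // "12.5,200&30,404" -> ["12.5,200", "30,404"]
parts := strings.SplitN(v, CodeDelimiter, 2) // "12.5,200" -> ["12.5", "200"]
if len(parts) == 2 {
out = append(out, [2]string{parts[0], parts[1]})
}
}
return out
}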

View File

@ -0,0 +1,43 @@
package statsd
import (
"strings"
"github.com/didi/nightingale/src/toolkits/stats"
"github.com/toolkits/pkg/logger"
)
type StatsdReceiver struct{}
func (self StatsdReceiver) HandlePacket(packet string) {
lines := strings.SplitN(packet, "\n", 3)
if len(lines) != 3 {
logger.Warningf("invalid packet, [error: missing args][packet: %s]", packet)
return
}
value := lines[0]
//
argLines, aggrs, err := Func{}.FormatArgLines(lines[2], lines[1])
if err != nil {
if err.Error() == "ignore" {
return
}
logger.Warningf("invalid packet, [error: bad tags or aggr][msg: %s][packet: %s]", err.Error(), packet)
return
}
metric, err := Func{}.FormatMetricLine(lines[1], aggrs) // metric = $ns/$metric_name
if err != nil {
logger.Warningf("invalid packet, [error: bad metric line][msg: %s][packet %s]", err.Error(), packet)
return
}
stats.Counter.Set("metric.recv.packet", 1)
err = StatsdState{}.GetState().Collect(value, metric, argLines)
if err != nil {
logger.Warningf("invalid packet, [error: collect packet error][msg: %s][packet: %s]", err.Error(), packet)
return
}
}
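// A minimal sketch of the packet layout HandlePacket expects (illustrative;
// the ns, metric, and tag values below are hypothetical): three sections
// separated by "\n" - the value line, the metric line ($ns/$metric_name),
// and the tag/aggregator lines, which may themselves span multiple lines
// since SplitN keeps the remainder intact.
func examplePacket() string {
value := "12.5,200"                            // $value,$code
metricLine := "demo-ns/dirpc_call"             // $ns/$metric_name
argLines := "callee=svc-a\ncaller=svc-b\nrpce" // tags, then the aggregator
return value + "\n" + metricLine + "\n" + argLines
}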

View File

@ -0,0 +1,255 @@
package statsd
import (
"fmt"
"strings"
"sync"
"time"
"github.com/didi/nightingale/src/common/dataobj"
"github.com/didi/nightingale/src/modules/agent/config"
"github.com/didi/nightingale/src/modules/agent/core"
"github.com/didi/nightingale/src/toolkits/exit"
"github.com/didi/nightingale/src/toolkits/stats"
"github.com/toolkits/pkg/logger"
)
type StatsdReporter struct{}
// point to n9e-agent
type Point struct {
Namespace string `json:"namespace"`
Name string `json:"name"`
Timestamp int64 `json:"timestamp"`
Tags map[string]string `json:"tags"`
Value float64 `json:"value"`
Step int `json:"step"`
}
func (self *Point) String() string {
return fmt.Sprintf("<namespace:%s, name:%s, timestamp:%d, value:%v, step:%d, tags:%v>",
self.Namespace, self.Name, self.Timestamp, self.Value, self.Step, self.Tags)
}
func (self Point) Strings(points []*Point) string {
pointsString := ""
for _, p := range points {
pointsString += p.String() + "\n"
}
return pointsString
}
var (
lastPointLock = &sync.RWMutex{}
lastPoints []*Point
)
var (
isFirstPeriod = true // first reporting period after metrics starts (not thread-safe)
)
func (self StatsdReporter) Report() {
// init schedule
schedule := &schedule{}
schedule.clearStateAt = self.nextTenSeconds(time.Now())
schedule.reportAt = schedule.clearStateAt
// send loop
for !IsExited() {
actions := schedule.listActions(time.Now())
if len(actions) != 0 {
self.handleActions(actions)
}
time.Sleep(time.Duration(config.Config.Metrics.ReportIntervalMs) * time.Millisecond)
}
}
func (self StatsdReporter) LastPoints() []*Point {
lastPointLock.RLock()
ret := lastPoints
lastPointLock.RUnlock()
return ret
}
func (self StatsdReporter) setLastPoints(ps []*Point) {
lastPointLock.Lock()
lastPoints = ps
lastPointLock.Unlock()
}
func (self StatsdReporter) handleActions(actions []action) {
defer func() {
if err := recover(); err != nil {
stack := exit.Stack(3)
logger.Warningf("udp handler exit unexpected, [error: %v],[stack: %s]", err, stack)
}
}()
for _, action := range actions {
switch action.actionType {
case "report":
previousState := StatsdState{}.RollState()
//previousState.Summarize() // aggregate further to produce <all>-style tag values
// the first period is inaccurate; drop it
if isFirstPeriod {
isFirstPeriod = false
break
}
// report cnt
// proc
stats.Counter.Set("metric.cache.size", previousState.Size())
//startTs := time.Now()
cnt := self.translateAndSend(previousState, action.toTime, 10, action.prefix)
stats.Counter.Set("metric.report.cnt", cnt)
// proc
//latencyMs := int64(time.Now().Sub(startTs).Nanoseconds() / 1000000)
default:
logger.Debugf("ignored action %s", action.actionType)
}
}
}
func (self StatsdReporter) nextTenSeconds(t time.Time) time.Time {
nowSec := t.Second()
clearStateSec := ((nowSec / 10) * 10)
diff := 10 - (nowSec - clearStateSec)
t = t.Add(time.Duration(-t.Nanosecond()) * time.Nanosecond)
return t.Add(time.Duration(diff) * time.Second)
}
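// A worked example of the alignment above (hypothetical, assumes fmt and
// time are imported): 12:00:07.3 has nowSec=7, clearStateSec=0, diff=3;
// nanoseconds are stripped and 3s added, yielding 12:00:10.000. An input
// exactly on a boundary moves a full 10s forward.
func exampleNextTenSeconds() {
t := time.Date(2020, 11, 7, 12, 0, 7, 300000000, time.UTC)
next := StatsdReporter{}.nextTenSeconds(t)
fmt.Println(next.Format("15:04:05.000")) // 12:00:10.000
}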
func (self StatsdReporter) translateAndSend(state *state, reportTime time.Time,
frequency int, prefix string) (cnt int) {
cnt = 0
// points reported by business code
oldPoints := self.translateToPoints(state, reportTime)
// points for traceid statistics/filtering
oldTrace := traceHandler.rollHandler()
tracePoints := oldTrace.dumpPoints(reportTime)
if len(tracePoints) > 0 {
oldPoints = append(oldPoints, tracePoints...)
}
self.setLastPoints(oldPoints)
if len(oldPoints) == 0 {
return
}
buffer := make([]*dataobj.MetricValue, 0)
lastNamespace := oldPoints[0].Namespace
for _, point := range oldPoints {
n9ePoint := TranslateToN9EPoint(point)
if len(buffer) >= config.Config.Metrics.ReportPacketSize || point.Namespace != lastNamespace {
core.Push(buffer)
buffer = make([]*dataobj.MetricValue, 0)
}
n9ePoint.Step = int64(frequency)
buffer = append(buffer, n9ePoint)
lastNamespace = point.Namespace
}
core.Push(buffer)
return
}
func (self StatsdReporter) translateToPoints(state *state, reportTime time.Time) []*Point {
ts := reportTime.Unix()
allPoints := make([]*Point, 0)
for rawMetric, metricState := range state.Metrics {
// errors are ignored here: metric lines were strictly validated on ingest
items, _ := Func{}.TranslateMetricLine(rawMetric)
namespace := items[0]
metric := items[1]
for key, aggregator := range metricState.Aggrs {
if nil == aggregator {
continue
}
var (
tags map[string]string
err error
)
// contains the <all> keyword: it is an aggregation result, so it must not be resolved from the cache
if strings.Contains(key, "<all>") {
tags, _, err = Func{}.TranslateArgLines(key, true)
} else {
tags, _, err = Func{}.TranslateArgLines(key)
}
if err != nil {
logger.Warningf("post points to n9e-agent failed, tags/aggr error, "+
"[msg: %s][nid/metric: %s][tags/aggr: %s]", err.Error(), rawMetric, key)
continue
}
points := make([]*Point, 0)
points, err = aggregator.dump(points, ts, tags, metric, key)
if err != nil {
logger.Warningf("post points to n9e-agent failed, generate points error, "+
"[msg: %s][ns/metric: %s][tags/aggr: %s]", err.Error(), rawMetric, key)
continue
}
for _, point := range points {
point.Namespace = namespace
allPoints = append(allPoints, point)
}
}
}
return allPoints
}
func TranslateToN9EPoint(point *Point) *dataobj.MetricValue {
if point.Namespace != "" {
point.Tags["instance"] = config.Endpoint
}
obj := &dataobj.MetricValue{
Nid: point.Namespace,
Metric: point.Name,
Timestamp: point.Timestamp,
Step: int64(point.Step),
ValueUntyped: point.Value,
TagsMap: point.Tags,
}
return obj
}
//
type action struct {
actionType string
fromTime time.Time
toTime time.Time
fromFrequency int // in seconds
toFrequency int // in seconds
prefix string
}
//
type schedule struct {
clearStateAt time.Time
reportAt time.Time
}
func (self *schedule) listActions(now time.Time) []action {
actions := make([]action, 0)
if now.After(self.reportAt) {
actions = append(actions, action{
actionType: "report",
fromTime: self.reportAt.Add(-10 * time.Second),
toTime: self.reportAt,
toFrequency: 10,
prefix: "",
})
self.reportAt = StatsdReporter{}.nextTenSeconds(now)
}
return actions
}

View File

@ -0,0 +1,287 @@
package statsd
import (
"fmt"
"sync"
"time"
"github.com/didi/nightingale/src/toolkits/stats"
"github.com/toolkits/pkg/logger"
)
var (
currentState = &state{Metrics: map[string]*metricState{}, packageCounter: map[string]int{}}
currentStateLock = &sync.RWMutex{}
)
type StatsdState struct{}
func (self StatsdState) GetState() *state {
currentStateLock.RLock()
ptr := currentState
currentStateLock.RUnlock()
return ptr
}
func (self StatsdState) RollState() *state {
currentStateLock.Lock()
oldState := currentState
newState := &state{
Metrics: map[string]*metricState{},
packageCounter: map[string]int{},
}
currentState = newState
currentStateLock.Unlock()
return oldState
}
////////////////////////////////////////////////////////////
// struct state
// aggregators for every tag combination of every metric; there is exactly one instance globally
////////////////////////////////////////////////////////////
type state struct {
isCollecting bool
Metrics map[string]*metricState
packageCounter map[string]int // request count per ns/metric, used for INFO logging
}
// @input
// value: $value or $value,$status, where "," is ${CodeDelimiter};
// in merged-packet mode: $value${MergeDelimiter}$value or $value,$status${MergeDelimiter}$value,$status
// metric: $ns/$metric_name
// argLines: $tagk1=$tagv1\n...$tagkN=$tagvN\n$aggr
func (self *state) Collect(value string, metric string, argLines string) error {
self.isCollecting = true
metricState, err := self.getMetricState(metric)
if err != nil {
self.isCollecting = false
return err
}
// Metrics and packageCounter share the same map keys
if _, found := self.packageCounter[metric]; !found {
self.packageCounter[metric] = 1
} else {
self.packageCounter[metric] += 1
}
err = metricState.Collect(value, metric, argLines)
self.isCollecting = false
return err
}
func (self *state) Size() int {
cnt := 0
for _, ms := range self.Metrics {
cnt += len(ms.Aggrs)
}
return cnt
}
func (self *state) ToMap() (map[string]interface{}, error) {
serialized := map[string]interface{}{}
for k, v := range self.Metrics {
m, err := v.ToMap()
if err != nil {
return nil, err
}
serialized[k] = m
}
return map[string]interface{}{"metrics": serialized}, nil
}
func (self *state) Summarize() {
// wait for the in-flight Collect to finish, avoiding read/write races on the state
var waitMs int
for waitMs = 0; waitMs < SummarizeWaitCollectTimeoutMsConst; waitMs += 5 {
time.Sleep(5 * time.Millisecond)
if !self.isCollecting {
break
}
}
if self.isCollecting {
logger.Warningf("summarize wait collect timeout(%dms), summarize skipped", SummarizeWaitCollectTimeoutMsConst)
return
}
// debug info
if waitMs > 0 {
logger.Debugf("system info: summarize wait collect %dms", waitMs)
}
for nsmetric, ms := range self.Metrics {
ms.Summarize(nsmetric)
}
}
func (self *state) getMetricState(metricName string) (*metricState, error) {
metric, ok := self.Metrics[metricName]
if ok && metric != nil {
return metric, nil
}
metric = &metricState{Aggrs: map[string]aggregator{}}
self.Metrics[metricName] = metric
return metric, nil
}
////////////////////////////////////////////////////////////
// struct metricState
// aggregators for every tag combination of a single metric
////////////////////////////////////////////////////////////
type metricState struct {
Aggrs map[string]aggregator
}
// @input
// value: $value or $value,$status, where "," is ${CodeDelimiter};
// in merged-packet mode: $value${MergeDelimiter}$value or $value,$status${MergeDelimiter}$value,$status
// metric: $ns/$metric_name
// argLines: $tagk1=$tagv1\n...$tagkN=$tagvN\n$aggr
func (self *metricState) Collect(value string, metric string, argLines string) error {
aggregator, err := self.getAggregator(value, metric, argLines)
if err != nil {
return err
}
values, err := Func{}.TranslateValueLine(value)
if err != nil {
return err
}
// record the actual number of datapoint requests
stats.Counter.Set("metric.recv.cnt", len(values))
return aggregator.collect(values, metric, argLines)
}
func (self *metricState) ToMap() (map[string]interface{}, error) {
maps := map[string]interface{}{}
for k, v := range self.Aggrs {
m, err := v.toMap()
if err != nil {
return nil, err
}
maps[k] = m
}
return map[string]interface{}{"aggrs": maps}, nil
}
func (self *metricState) Summarize(nsmetric string) {
if len(self.Aggrs) == 0 {
return
}
newAggrs := make(map[string]aggregator, 0)
// copy
for argLines, aggr := range self.Aggrs {
key := argLines
ptrAggr := aggr
newAggrs[key] = ptrAggr
}
// summarize
for argLines, aggr := range self.Aggrs {
key := argLines
ptrAggr := aggr
if ptrAggr == nil {
continue
}
ptrAggr.summarize(nsmetric, key, newAggrs)
}
self.Aggrs = newAggrs
}
func (self *metricState) getAggregator(value, metric, argLines string) (aggregator, error) {
aggr, ok := self.Aggrs[argLines]
if ok && aggr != nil {
return aggr, nil
}
// create the aggregator
aggregatorNames, err := Func{}.GetAggrsFromArgLines(argLines)
if err != nil {
return nil, err
}
aggr, err = self.createAggregator(aggregatorNames, value, metric, argLines)
if err != nil {
return nil, err
}
self.Aggrs[argLines] = aggr
return aggr, nil
}
func (self *metricState) createAggregator(aggregatorNames []string, value, metric, argLines string) (aggregator, error) {
switch aggregatorNames[0] {
case "c":
return (&counterAggregator{}).new(aggregatorNames)
case "ce":
return (&counterEAggregator{}).new(aggregatorNames)
case "g":
return (&gaugeAggregator{}).new(aggregatorNames)
case "rpc":
return (&rpcAggregator{}).new(aggregatorNames)
case "rpce":
return (&rpcEAggregator{}).new(aggregatorNames)
case "r":
return (&ratioAggregator{}).new(aggregatorNames)
case "rt":
return (&ratioAsTagsAggregator{}).new(aggregatorNames)
case "p1", "p5", "p25", "p50", "p75", "p90", "p95", "p99", "max", "min", "avg", "sum", "cnt":
return (&histogramAggregator{}).new(aggregatorNames)
default:
return nil, fmt.Errorf("unknown aggregator %s", argLines)
}
}
// internals
func (self state) StateFromMap(serialized map[string]interface{}) (*state, error) {
state := &state{Metrics: map[string]*metricState{}}
for k, v := range serialized {
ms, err := (metricState{}.MetricFromMap(v.(map[string]interface{})))
if err != nil {
return nil, err
}
state.Metrics[k] = ms
}
return state, nil
}
func (self metricState) MetricFromMap(serialized map[string]interface{}) (*metricState, error) {
metricState := &metricState{Aggrs: map[string]aggregator{}}
keys := (serialized["aggrs"]).(map[string]interface{})
for k, v := range keys {
ret, err := self.aggregatorFromMap(v.(map[string]interface{}))
if err != nil {
return nil, err
}
metricState.Aggrs[k] = ret
}
return metricState, nil
}
func (self metricState) aggregatorFromMap(serialized map[string]interface{}) (aggregator, error) {
switch serialized["__aggregator__"] {
case "counter":
return (&counterAggregator{}).fromMap(serialized)
case "counterE":
return (&counterEAggregator{}).fromMap(serialized)
case "gauge":
return (&gaugeAggregator{}).fromMap(serialized)
case "ratio":
return (&ratioAggregator{}).fromMap(serialized)
case "ratioAsTags":
return (&ratioAsTagsAggregator{}).fromMap(serialized)
case "histogram":
return (&histogramAggregator{}).fromMap(serialized)
case "rpc":
return (&rpcAggregator{}).fromMap(serialized)
case "rpce":
return (&rpcEAggregator{}).fromMap(serialized)
default:
return nil, fmt.Errorf("unknown aggregator: %v", serialized)
}
}
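// A dispatch sketch (illustrative; the tag and metric values are
// hypothetical): the last line of argLines names the aggregation, and only
// its first element selects the implementation, so "module=api\np99,p95"
// yields a histogramAggregator tracking both percentiles, while
// "module=api\nrpce" yields an rpcEAggregator.
func exampleCreate(ms *metricState) (aggregator, error) {
return ms.getAggregator("1.5", "demo-ns/latency", "module=api\np99,p95")
}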

View File

@ -0,0 +1,420 @@
package statsd
import (
"fmt"
"sort"
"strings"
"sync"
"time"
lru "github.com/hashicorp/golang-lru"
"github.com/spaolacci/murmur3"
)
type Func struct{}
var (
BadRpcMetricError = fmt.Errorf("bad rpc metric")
BadSummarizeAggregatorError = fmt.Errorf("bad summarize aggregator")
BadDeserializeError = fmt.Errorf("bad deserialize")
BadAggregatorNameError = fmt.Errorf("bad aggregator name")
cache *lru.Cache
)
func init() {
cache, _ = lru.New(MaxLRUCacheSize)
}
type ArgCacheUnit struct {
Aggrs []string
Tags map[string]string
ArgLine string
Error error
}
func NewArgCacheUnitWithError(err error) *ArgCacheUnit {
return &ArgCacheUnit{
Aggrs: []string{},
Tags: make(map[string]string),
ArgLine: "",
Error: err,
}
}
func NewArgCacheUnit(argline string, aggrs []string,
tags map[string]string) *ArgCacheUnit {
return &ArgCacheUnit{
Aggrs: aggrs,
Tags: tags,
ArgLine: argline,
Error: nil,
}
}
// tags+aggr lines
func (f Func) FormatArgLines(argLines string, metricLines string) (string, []string, error) {
// BUG: hash collisions could misattribute entries; not handled for now
key := murmur3.Sum32([]byte(argLines))
value, found := cache.Get(key)
if found {
unit, ok := value.(*ArgCacheUnit)
if ok {
return unit.ArgLine, unit.Aggrs, unit.Error
}
}
tags, agg, err := f.TranslateArgLines(argLines, true)
if err != nil {
cache.Add(key, NewArgCacheUnitWithError(err))
return "", []string{}, fmt.Errorf("translate to tags error, [lines: %s][error: %s]", argLines, err.Error())
}
// check
if err := f.checkTags(tags); err != nil {
cache.Add(key, NewArgCacheUnitWithError(err))
return "", []string{}, err
}
aggrs, err := f.formatAggr(agg)
if err != nil {
cache.Add(key, NewArgCacheUnitWithError(err))
return "", []string{}, err
}
if len(tags) == 0 {
cache.Add(key, NewArgCacheUnit(argLines, aggrs, tags))
return argLines, aggrs, nil
}
traceExist := false
if traceid, found := tags[TagTraceId]; found {
traceExist = true
delete(tags, TagTraceId)
ignore := traceHandler.collectAndIgnore(metricLines, traceid)
if ignore {
return "", []string{}, fmt.Errorf("ignore")
}
}
newLines := []string{}
var keys []string
for k, _ := range tags {
keys = append(keys, k)
}
sort.Strings(keys)
for _, k := range keys {
v := tags[k]
if v == "<all>" { // <all> is a reserved keyword; rewrite it to avoid collisions
v = "all"
tags[k] = v // update the cached tags to stay consistent
}
newLines = append(newLines, fmt.Sprintf("%s=%s", k, v))
}
newLines = append(newLines, agg)
newArgLines := strings.Join(newLines, "\n")
// contains a traceid: no point caching, it would almost never hit
if !traceExist {
cache.Add(key, NewArgCacheUnit(newArgLines, aggrs, tags))
// the argLine changed after re-sorting (tag map ordering); cache the new argLine as well
if argLines != newArgLines {
newKey := murmur3.Sum32([]byte(newArgLines))
cache.Add(newKey, NewArgCacheUnit(newArgLines, aggrs, tags))
}
}
return newArgLines, aggrs, nil
}
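// A worked example for FormatArgLines (illustrative; the names are
// hypothetical): tags are sorted by key, "<all>" values are rewritten to
// "all", and results are cached under a murmur3 hash of the raw argLines.
//
//	out, aggrs, _ := Func{}.FormatArgLines("caller=b\ncallee=a\nrpc", "demo-ns/rpc_x")
//	// out == "callee=a\ncaller=b\nrpc", aggrs == []string{"rpc"}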
func (f Func) GetAggrsFromArgLines(argLines string) ([]string, error) {
key := murmur3.Sum32([]byte(argLines))
value, found := cache.Get(key)
if found {
unit, ok := value.(*ArgCacheUnit)
if ok {
return unit.Aggrs, unit.Error
}
}
lines := strings.Split(argLines, "\n")
lineSize := len(lines)
if lineSize == 0 {
return nil, fmt.Errorf("empty aggr")
}
return strings.Split(lines[lineSize-1], ","), nil
}
func (f Func) TranslateArgLines(argLines string, aggrNeed ...bool) (map[string]string, string, error) {
// only the tags are needed; try the cache first
if len(aggrNeed) == 0 {
key := murmur3.Sum32([]byte(argLines))
value, found := cache.Get(key)
if found {
unit, ok := value.(*ArgCacheUnit)
if ok {
return unit.Tags, "", unit.Error
}
}
}
// not in the cache, parse it; or cache lookup was not allowed
tags := make(map[string]string)
lines := strings.Split(argLines, "\n")
lineSize := len(lines)
if lineSize == 0 {
return tags, "", fmt.Errorf("empty aggr")
}
agg := lines[lineSize-1]
if lineSize == 1 {
return tags, agg, nil
}
for _, line := range lines[:lineSize-1] {
parts := strings.SplitN(line, "=", 2)
if len(parts) == 2 {
tags[parts[0]] = parts[1]
} else {
return nil, "", fmt.Errorf("bad tag [%s]", line)
}
}
return tags, agg, nil
}
func (f Func) checkTags(tags map[string]string) error {
tcnt := len(tags)
if tcnt > MaxTagsCntConst {
return fmt.Errorf("too many tags %v", tags)
}
return nil
}
func (f Func) TrimRpcCallee(callee string) string {
callee = strings.Replace(callee, "://", "|", -1)
return strings.Replace(callee, ":", "|", -1)
}
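// A worked example (illustrative): both the scheme separator and the port
// colon become "|", keeping the callee usable as a single flat tag value.
//
//	Func{}.TrimRpcCallee("grpc://10.1.1.1:8000") // "grpc|10.1.1.1|8000"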
// metric line: $ns/$raw-metric
func (f Func) FormatMetricLine(metricLine string, aggrs []string) (string, error) {
ret, err := f.TranslateMetricLine(metricLine)
if err != nil {
return "", err
}
if len(ret) != 2 {
return "", fmt.Errorf("bad metric line, missing ns or metric")
}
// ns
ns := ret[0]
if !strings.HasPrefix(ns, NsPrefixConst) {
ns = NsPrefixConst + ns
}
if !strings.HasSuffix(ns, NsSuffixConst) {
ns = ns + NsSuffixConst
}
// metric
metric := ret[1]
if len(aggrs) > 0 &&
(aggrs[0] == Const_CommonAggregator_Rpc || aggrs[0] == Const_CommonAggregator_RpcE) {
// metric: rpc-style stats must start with the rpc prefix
if !strings.HasPrefix(metric, "rpc") {
metric = "rpc_" + metric
}
}
return fmt.Sprintf("%s/%s", ns, metric), nil
}
func (f Func) TranslateMetricLine(metricLine string) ([]string, error) {
return strings.SplitN(metricLine, "/", 2), nil
}
// aggr line
func (f Func) formatAggr(aggr string) ([]string, error) {
aggrNames, err := f.translateAggregator(aggr)
if err != nil {
return []string{}, err
}
if len(aggrNames) == 1 {
aggrName := aggrNames[0]
if _, ok := CommonAggregatorsConst[aggrName]; !ok {
return []string{}, fmt.Errorf("bad aggregator %s", aggrName)
}
} else {
for _, aggrName := range aggrNames {
if _, ok := HistogramAggregatorsConst[aggrName]; !ok {
return []string{}, fmt.Errorf("bad aggregator %s", aggrName)
}
}
}
return aggrNames, nil
}
func (f Func) translateAggregator(aggr string) ([]string, error) {
if len(aggr) == 0 {
return nil, fmt.Errorf("emtpy aggr")
}
return strings.Split(aggr, ","), nil
}
// value line
// split into substrings; each protocol then processes its own substrings separately
func (f Func) TranslateValueLine(valueLine string) ([]string, error) {
if len(valueLine) == 0 {
return nil, fmt.Errorf("empty value line")
}
return strings.Split(valueLine, MergeDelimiter), nil
}
//
func (f Func) IsOk(code string) bool {
if ok, exist := RpcOkCodesConst[code]; exist && ok {
return true
}
return false
}
// check whether a is a subset of b's keys (subKeys)
func (f Func) IsSubKeys(a []string, b map[string]string) bool {
isAllSub := true
for i := 0; i < len(a) && isAllSub; i++ {
isSub := false
for k, _ := range b {
if a[i] == k {
isSub = true
break
}
}
if !isSub {
isAllSub = false
}
}
return isAllSub
}
// check whether the list of sorted string arrays a contains two identical arrays
func (f Func) HasSameSortedArray(a [][]string) bool {
hasSameArray := false
for i := 0; i < len(a) && !hasSameArray; i++ {
for k := i + 1; k < len(a) && !hasSameArray; k++ {
t1 := a[i]
t2 := a[k]
if len(t1) != len(t2) {
continue
}
isEqualArray := true
for j := 0; j < len(t1) && isEqualArray; j++ {
if t1[j] != t2[j] {
isEqualArray = false
}
}
if isEqualArray {
hasSameArray = true
}
}
}
return hasSameArray
}
// consts must not be modified; vars may be modified
func (f Func) MergeSortedArrays(consts, vars [][]string) [][]string {
for i := 0; i < len(consts); i++ {
// check same
hasSame := false
for j := 0; j < len(vars) && !hasSame; j++ {
if len(consts[i]) != len(vars[j]) {
continue
}
isAllItemSame := true
for k := 0; k < len(consts[i]) && isAllItemSame; k++ {
if consts[i][k] != vars[j][k] {
isAllItemSame = false
}
}
if isAllItemSame {
hasSame = true
}
}
if !hasSame {
vars = append(vars, consts[i])
}
}
return vars
}
type TraceHandler struct {
sync.RWMutex
SecurityScanCounter map[string]float64 // map[ns]counter
}
var traceHandler = &TraceHandler{SecurityScanCounter: map[string]float64{}}
func (t *TraceHandler) rollHandler() *TraceHandler {
t.Lock()
defer t.Unlock()
old := &TraceHandler{SecurityScanCounter: map[string]float64{}}
old.SecurityScanCounter = t.SecurityScanCounter
t.SecurityScanCounter = make(map[string]float64)
return old
}
// more could be done here later, e.g. logging or correlating with the tracing system
func (t *TraceHandler) collectAndIgnore(nsMetric string, traceid string) bool {
t.Lock()
defer t.Unlock()
ignore := false
if strings.HasSuffix(traceid, "ff") {
ignore = true
if _, found := t.SecurityScanCounter[nsMetric]; !found {
t.SecurityScanCounter[nsMetric] = 1
} else {
t.SecurityScanCounter[nsMetric] += 1
}
}
return ignore
}
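// A sketch of the rule above (illustrative; the ns/metric is hypothetical):
// traceids ending in "ff" are treated as security-scan traffic, so the
// point is dropped and counted per ns/metric instead of being aggregated.
//
//	traceHandler.collectAndIgnore("demo-ns/rpc_x", "abc1ff") // true: dropped and counted
//	traceHandler.collectAndIgnore("demo-ns/rpc_x", "abc123") // false: kept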
// no locking needed: single-threaded, no concurrency
func (t *TraceHandler) dumpPoints(reportTime time.Time) []*Point {
var ret []*Point
if len(t.SecurityScanCounter) == 0 {
return ret
}
ts := reportTime.Unix()
for nsMetric, counter := range t.SecurityScanCounter {
slice := strings.Split(nsMetric, "/")
if len(slice) != 2 {
continue
}
ns := slice[0]
if !strings.HasPrefix(ns, NsPrefixConst) {
ns = NsPrefixConst + ns
}
ret = append(ret, &Point{
Namespace: ns,
Name: "security.scan.counter",
Timestamp: ts,
Tags: map[string]string{
"metric": slice[1],
},
Value: counter,
})
}
return ret
}

View File

@ -0,0 +1,41 @@
package udp
import (
"sync"
"github.com/didi/nightingale/src/modules/agent/statsd"
"github.com/didi/nightingale/src/toolkits/exit"
"github.com/toolkits/pkg/logger"
)
var ByteSlicePool = sync.Pool{
New: func() interface{} {
return make([]byte, 4096, 4096)
}}
func handleUdpPackets() {
defer func() {
if err := recover(); err != nil {
stack := exit.Stack(3)
logger.Warningf("udp handler exit unexpected, [error: %v],[stack: %s]", err, stack)
panic(err) // udp failure: panic fast so the metrics feature never runs degraded
}
// stop the udp server
stop()
}()
for !statsd.IsExited() {
message := ByteSlicePool.Get().([]byte)
n, _, err := udpConn.ReadFrom(message)
if err != nil {
ByteSlicePool.Put(message)
logger.Warningf("read from udp error, [error: %s]", err.Error())
continue
}
packet := string(message[0:n]) // copy the bytes before returning the buffer
ByteSlicePool.Put(message)
logger.Debugf("recv packet: %v\n", packet)
statsd.StatsdReceiver{}.HandlePacket(packet)
}
}

View File

@ -0,0 +1,42 @@
package udp
import (
"fmt"
"log"
"net"
"github.com/didi/nightingale/src/modules/agent/config"
)
var (
udpConn *net.UDPConn = nil
)
func Start() {
if !config.Config.Udp.Enable {
log.Println("udp server disabled")
return
}
address, _ := net.ResolveUDPAddr("udp4", config.Config.Udp.Listen)
conn, err := net.ListenUDP("udp4", address)
if err != nil {
errMsg := fmt.Sprintf("listen udp error, [addr: %s][error: %s]", config.Config.Udp.Listen, err.Error())
log.Print(errMsg)
panic(errMsg)
}
log.Println("udp start, listening on ", config.Config.Udp.Listen)
// keep the udp connection
udpConn = conn
// start the udp packet handling goroutine
go handleUdpPackets()
}
func stop() error {
if udpConn != nil {
udpConn.Close()
}
return nil
}

View File

@ -36,5 +36,8 @@ func Config(r *gin.Engine) {
userLogin.GET("/task/:id", taskView)
userLogin.PUT("/task/:id/action", taskActionPut)
userLogin.PUT("/task/:id/host", taskHostPut)
// endpoint developed specifically for the ticket system
userLogin.POST("/run/:id", taskRunForTT)
}
}

View File

@ -256,10 +256,5 @@ func cleanHosts(formHosts []string) []string {
arr = append(arr, item)
}
cnt = len(arr)
if cnt == 0 {
bomb("arg[hosts] empty")
}
return arr
}

View File

@ -9,8 +9,10 @@ import (
"github.com/gin-gonic/gin"
"github.com/toolkits/pkg/logger"
"github.com/toolkits/pkg/net/httplib"
"github.com/toolkits/pkg/slice"
"github.com/didi/nightingale/src/common/address"
"github.com/didi/nightingale/src/models"
"github.com/didi/nightingale/src/modules/job/config"
)
@ -34,6 +36,9 @@ func taskPost(c *gin.Context) {
var f taskForm
bind(c, &f)
hosts := cleanHosts(f.Hosts)
if len(hosts) == 0 {
bomb("arg[hosts] empty")
}
checkTaskPerm(hosts, user, f.Account)
@ -557,3 +562,215 @@ func taskCallback(c *gin.Context) {
renderMessage(c, nil)
}
// This is the generic payload the ticket (tt) system posts on callback: it carries both basic ticket info and structured form data; job only needs to parse out the structured data
type ttForm struct {
Id int64 `json:"id" binding:"required"`
RunUser string `json:"runUser" binding:"required"`
Form map[string]interface{} `json:"form" binding:"required"`
Approval int `json:"approval"`
}
// /api/job-ce/run/:id?hosts=10.3.4.5,10.4.5.6
func taskRunForTT(c *gin.Context) {
var f ttForm
bind(c, &f)
action := c.Request.Host + c.Request.URL.Path
if f.Approval == 2 {
renderMessage(c, "该任务未通过审批")
return
}
tpl := TaskTpl(urlParamInt64(c, "id"))
arr, err := tpl.Hosts()
dangerous(err)
// if the QueryString carries a hosts parameter, use that host list;
// otherwise parse hosts from the structured form data;
// failing that, use the template's hosts; if the template has none either, report an error
hosts := queryStr(c, "hosts", "")
if hosts != "" {
// use the hosts passed in the QueryString
tmp := cleanHosts(strings.Split(hosts, ","))
if len(tmp) > 0 {
arr = tmp
}
} else {
if v, ok := f.Form["hosts"]; ok {
hosts = v.(string)
hosts = strings.ReplaceAll(hosts, "\r", ",")
hosts = strings.ReplaceAll(hosts, "\n", ",")
tmp := cleanHosts(strings.Split(hosts, ","))
if len(tmp) > 0 {
arr = tmp
}
}
}
if len(arr) == 0 {
bomb("hosts empty")
}
// check permissions
user := loginUser(c)
checkTaskPerm(arr, user, tpl.Account)
task := &models.TaskMeta{
Title: tpl.Title,
Account: tpl.Account,
Batch: tpl.Batch,
Tolerance: tpl.Tolerance,
Timeout: tpl.Timeout,
Pause: tpl.Pause,
Script: tpl.Script,
Creator: user.Username,
}
task.Args = ""
for k, v := range f.Form {
switch v.(type) {
case string:
if k == "hosts" {
tmp := v.(string)
tmp = strings.ReplaceAll(tmp, "\r", ",")
tmp = strings.ReplaceAll(tmp, "\n", ",")
tmpArray := cleanHosts(strings.Split(tmp, ","))
if len(tmpArray) > 0 {
v = strings.Join(tmpArray, ",")
}
}
if len(v.(string)) < 1600 {
task.Args += fmt.Sprintf("--%s=%s,,", k, v.(string))
}
case int:
task.Args += fmt.Sprintf("--%s=%d,,", k, v.(int))
case int64:
task.Args += fmt.Sprintf("--%s=%d,,", k, v.(int64))
case float64:
// TODO: non-integer values are not supported yet
task.Args += fmt.Sprintf("--%s=%d,,", k, int64(v.(float64)))
}
}
task.Args = strings.TrimSuffix(task.Args, ",,")
dangerous(task.Save(arr, "start"))
go func() {
var arr2Map = map[string]int{}
for _, a := range arr {
arr2Map[a] = 1
}
for {
var (
restHosts = map[string]int{}
)
for h, _ := range arr2Map {
th, err := models.TaskHostGet(task.Id, h)
if err == nil {
if th.Status == "killed" {
reply := fmt.Sprintf("### Job通知推送\n* Job平台任务(ID:%d)在机器%s中执行失败"+
"原因为task被kill掉\n* 执行action接口地址为: %s\n* 标准输出: %s\n* 错误输出: %s\n",
task.Id, h, action, th.Stdout, th.Stderr)
err = TicketSender(f.Id, action, "task has been killed", reply, -1,
nil)
if err != nil {
logger.Errorf("send callback to ticket, err: %v", err)
}
} else if th.Status == "failed" {
reply := fmt.Sprintf("### Job通知推送\n* Job平台任务(ID:%d)在机器%s中执行失败"+
"详情见错误输出\n* 执行action接口地址为: %s\n* 标准输出: %s\n* 错误输出: %s\n",
task.Id, h, action, th.Stdout, th.Stderr)
err = TicketSender(f.Id, action, "run task failed", reply, -1,
nil)
if err != nil {
logger.Errorf("send callback to ticket, err: %v", err)
}
} else if th.Status == "timeout" {
reply := fmt.Sprintf("### Job通知推送\n* Job平台任务(ID:%d)在机器%s中执行超时"+
"\n* 执行action接口地址为: %s\n* 标准输出: %s\n* 错误输出: %s\n",
task.Id, h, action, th.Stdout, th.Stderr)
err = TicketSender(f.Id, action, "run task failed", reply, -1,
nil)
if err != nil {
logger.Errorf("send callback to ticket, err: %v", err)
}
} else if th.Status == "success" {
reply := fmt.Sprintf("### Job通知推送\n* Job平台任务(ID:%d)在机器%s中执行成功"+
"\n* 执行action接口地址为: %s\n* 标准输出: %s\n* 错误输出: %s\n",
task.Id, h, action, th.Stdout, th.Stderr)
err = TicketSender(f.Id, action, "task ", reply, 1,
nil)
if err != nil {
logger.Errorf("send callback to ticket, err: %v", err)
}
} else {
restHosts[h] = 1
}
} else {
logger.Errorf("get task_host err: %v", err)
}
}
arr2Map = restHosts
// every host has reached a terminal state: stop polling
if len(arr2Map) == 0 {
return
}
time.Sleep(time.Second)
}
}()
go func() {
time.Sleep(time.Second)
reply := fmt.Sprintf("[任务详情请关注Job平台任务(ID:%d)详情页地址](%s)", task.Id, fmt.Sprintf("/job/tasks/%d/result", task.Id))
err = TicketSender(f.Id, action, "", reply, -1,
nil)
if err != nil {
logger.Errorf("send callback to ticket, err: %v", err)
}
}()
renderData(c, gin.H{"taskID": task.Id, "detailPage": fmt.Sprintf("/job/tasks/%d/result", task.Id)}, nil)
}
type ticketCallBackForm struct {
TicketId int64 `json:"ticketId" binding:"required"`
ActionApi string `json:"actionApi" binding:"required"`
SystemName string `json:"systemName" binding:"required"`
Success int `json:"success" binding:"required"`
Reason string `json:"reason"`
Info interface{} `json:"info"`
AutoReply string `json:"autoReply"`
}
func TicketSender(id int64, action, reason, reply string, result int, info interface{}) error {
addr := address.GetHTTPListen("ticket")
data := ticketCallBackForm{
TicketId: id,
ActionApi: action,
Success: result,
Reason: reason,
Info: info,
AutoReply: reply,
}
url := fmt.Sprintf("%s/v1/ticket/callback?systemName=job", addr)
if !(strings.HasPrefix(url, "http://") || strings.HasPrefix(url, "https://")) {
url = "http://" + url
}
res, code, err := httplib.PostJSON(url, time.Second*5, data, map[string]string{"x-srv-token": "ticket-builtin-token"})
if err != nil {
logger.Errorf("call sender api failed, server: %v, data: %+v, err: %v, resp:%v, status code:%d", url, data, err, string(res), code)
return err
}
if code != 200 {
logger.Errorf("call sender api failed, server: %v, data: %+v, resp:%v, code:%d", url, data, string(res), code)
return fmt.Errorf("call sender api failed, code: %d", code)
}
logger.Debugf("ticket response %s", string(res))
return nil
}

View File

@ -293,6 +293,9 @@ func taskTplRun(c *gin.Context) {
f.Overwrite(tpl)
hosts := cleanHosts(f.Hosts)
if len(hosts) == 0 {
bomb("arg[hosts] empty")
}
checkTaskPerm(hosts, user, f.Account)

View File

@ -86,7 +86,7 @@ func popEvent(queues []interface{}) (*models.Event, bool) {
var curNodePath string
node, err := models.NodeGet("id=?", stra.Nid)
if err != nil {
if err != nil || node == nil {
logger.Warningf("get node failed, node id: %v, event: %+v, err: %v", stra.Nid, event, err)
} else {
nodePath = node.Path

View File

@ -270,6 +270,10 @@ func HostBindingsForMon(endpointList []string) ([]string, error) {
return list, err
}
if node == nil {
continue
}
list = append(list, node.Path)
}
return list, nil

View File

@ -18,6 +18,7 @@ type ConfigT struct {
Sender map[string]senderSection `yaml:"sender"`
RabbitMQ rabbitmqSection `yaml:"rabbitmq"`
WeChat wechatSection `yaml:"wechat"`
Captcha bool `yaml:"captcha"`
}
type wechatSection struct {
@ -33,7 +34,7 @@ type ssoSection struct {
ClientId string `yaml:"clientId"`
ClientSecret string `yaml:"clientSecret"`
ApiKey string `yaml:"apiKey"`
StateExpiresIn int `yaml:"stateExpiresIn"`
StateExpiresIn int64 `yaml:"stateExpiresIn"`
CoverAttributes bool `yaml:"coverAttributes"`
Attributes struct {
Dispname string `yaml:"dispname"`

View File

@ -0,0 +1,19 @@
package cron
import (
"time"
"github.com/didi/nightingale/src/models"
)
const cleanerInterval = 3600 * time.Second
func CleanerLoop() {
tc := time.Tick(cleanerInterval)
for {
models.AuthState{}.CleanUp()
models.Captcha{}.CleanUp()
<-tc
}
}

View File

@ -18,12 +18,17 @@ func Config(r *gin.Engine) {
notLogin.GET("/roles/local", localRoleGet)
notLogin.POST("/users/invite", userInvitePost)
notLogin.GET("/auth/authorize", authAuthorize)
notLogin.GET("/auth/callback", authCallback)
notLogin.GET("/auth/settings", authSettings)
notLogin.GET("/auth/v2/authorize", authAuthorizeV2)
notLogin.GET("/auth/v2/callback", authCallbackV2)
notLogin.GET("/auth/v2/logout", logoutV2)
notLogin.POST("/auth/send-login-code-by-sms", v1SendLoginCodeBySms)
notLogin.POST("/auth/send-login-code-by-email", v1SendLoginCodeByEmail)
notLogin.POST("/auth/send-rst-code-by-sms", sendRstCodeBySms)
notLogin.POST("/auth/rst-password", rstPassword)
notLogin.GET("/auth/captcha", captchaGet)
notLogin.GET("/v2/nodes", nodeGets)
}
hbs := r.Group("/api/hbs")
@ -111,6 +116,7 @@ func Config(r *gin.Engine) {
userLogin.POST("/node/:id/roles", rolesUnderNodePost)
userLogin.DELETE("/node/:id/roles", rolesUnderNodeDel)
userLogin.GET("/node/:id/resources", resourceUnderNodeGet)
userLogin.GET("/node/:id/resources/cate-count", renderNodeResourcesCountByCate)
userLogin.POST("/node/:id/resources/bind", resourceBindNode)
userLogin.POST("/node/:id/resources/unbind", resourceUnbindNode)
userLogin.PUT("/node/:id/resources/note", resourceUnderNodeNotePut)
@ -170,5 +176,14 @@ func Config(r *gin.Engine) {
v1.GET("/users", userListGet)
v1.POST("/login", v1Login)
v1.POST("/send-login-code-by-sms", v1SendLoginCodeBySms)
v1.POST("/send-login-code-by-email", v1SendLoginCodeByEmail)
// 第三方系统获取某个用户的所有权限点
v1.GET("/perms/global", v1PermGlobalOps)
// 第三方系统同步权限表的数据
v1.GET("/table/sync/role-operation", v1RoleOperationGets)
v1.GET("/table/sync/role-global-user", v1RoleGlobalUserGets)
}
}

View File

@ -1,71 +1,102 @@
package http
import (
"bytes"
"errors"
"fmt"
"html/template"
"log"
"math/rand"
"path"
"strings"
"time"
"github.com/gin-gonic/gin"
"github.com/mojocn/base64Captcha"
"github.com/toolkits/pkg/file"
"github.com/toolkits/pkg/str"
"github.com/didi/nightingale/src/common/dataobj"
"github.com/didi/nightingale/src/models"
"github.com/didi/nightingale/src/modules/rdb/config"
"github.com/didi/nightingale/src/modules/rdb/redisc"
"github.com/didi/nightingale/src/modules/rdb/ssoc"
)
type loginForm struct {
Username string `json:"username" binding:"required"`
Password string `json:"password" binding:"required"`
IsLDAP int `json:"is_ldap"`
RemoteAddr string `json:"remote_addr"`
}
var (
loginCodeSmsTpl *template.Template
loginCodeEmailTpl *template.Template
errUnsupportCaptcha = errors.New("unsupported captcha")
errInvalidAnswer = errors.New("invalid captcha answer")
func (f *loginForm) validate() {
if str.Dangerous(f.Username) {
bomb("%s invalid", f.Username)
// TODO: set false
debug = true
// https://captcha.mojotv.cn
captchaDirver = base64Captcha.DriverString{
Height: 30,
Width: 120,
ShowLineOptions: 0,
Length: 4,
Source: "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789",
//ShowLineOptions: 14,
}
)
func getConfigFile(name, ext string) (string, error) {
if p := path.Join(file.SelfDir(), "etc", name+".local."+ext); file.IsExist(p) {
return p, nil
}
if p := path.Join(file.SelfDir(), "etc", name+"."+ext); file.IsExist(p) {
return p, nil
} else {
return "", fmt.Errorf("file %s not found", p)
}
if len(f.Username) > 64 {
bomb("%s too long", f.Username)
}
func init() {
filename, err := getConfigFile("login-code-sms", "tpl")
if err != nil {
log.Fatal(err)
}
loginCodeSmsTpl, err = template.ParseFiles(filename)
if err != nil {
log.Fatalf("open %s err: %s", filename, err)
}
filename, err = getConfigFile("login-code-email", "tpl")
if err != nil {
log.Fatal(err)
}
loginCodeEmailTpl, err = template.ParseFiles(filename)
if err != nil {
log.Fatalf("open %s err: %s", filename, err)
}
}
func login(c *gin.Context) {
var f loginForm
var f loginInput
bind(c, &f)
f.validate()
if f.IsLDAP == 1 {
dangerous(models.LdapLogin(f.Username, f.Password, c.ClientIP()))
} else {
dangerous(models.PassLogin(f.Username, f.Password, c.ClientIP()))
if config.Config.Captcha {
captcha, err := models.CaptchaGet("captcha_id=?", f.CaptchaId)
dangerous(err)
if !strings.EqualFold(captcha.Answer, f.Answer) {
dangerous(errInvalidAnswer)
}
}
user, err := models.UserGet("username=?", f.Username)
user, err := authLogin(f)
dangerous(err)
writeCookieUser(c, user.UUID)
renderMessage(c, "")
}
// v1Login called by sso.rdb module
func v1Login(c *gin.Context) {
var f loginForm
bind(c, &f)
f.validate()
if f.IsLDAP == 1 {
dangerous(models.LdapLogin(f.Username, f.Password, c.ClientIP()))
} else {
dangerous(models.PassLogin(f.Username, f.Password, c.ClientIP()))
}
user, err := models.UserGet("username=?", f.Username)
dangerous(err)
writeCookieUser(c, user.UUID)
// TODO: implement remote address access control
go models.LoginLogNew(f.Username, f.RemoteAddr, "in")
renderData(c, user, nil)
go models.LoginLogNew(user.Username, c.ClientIP(), "in")
}
func logout(c *gin.Context) {
@ -83,39 +114,24 @@ func logout(c *gin.Context) {
writeCookieUser(c, "")
go models.LoginLogNew(username, c.ClientIP(), "out")
if config.Config.SSO.Enable {
redirect := queryStr(c, "redirect", "/")
c.Redirect(302, ssoc.LogoutLocation(redirect))
} else {
c.String(200, "logout successfully")
}
}
func authAuthorize(c *gin.Context) {
username := cookieUsername(c)
if username != "" { // already logged in
c.String(200, "hi, "+username)
return
}
redirect := queryStr(c, "redirect", "/")
if config.Config.SSO.Enable {
c.Redirect(302, ssoc.Authorize(redirect))
} else {
c.String(200, "sso does not enable")
}
go models.LoginLogNew(username, c.ClientIP(), "out")
}
type authRedirect struct {
Redirect string `json:"redirect"`
Msg string `json:"msg"`
}
func authAuthorizeV2(c *gin.Context) {
redirect := queryStr(c, "redirect", "/")
log.Printf("---> redirect %s", redirect)
ret := &authRedirect{Redirect: redirect}
username := cookieUsername(c)
@ -124,29 +140,13 @@ func authAuthorizeV2(c *gin.Context) {
return
}
var err error
if config.Config.SSO.Enable {
ret.Redirect = ssoc.Authorize(redirect)
ret.Redirect, err = ssoc.Authorize(redirect)
} else {
ret.Redirect = "/login"
}
renderData(c, ret, nil)
}
func authCallback(c *gin.Context) {
code := queryStr(c, "code", "")
state := queryStr(c, "state", "")
if code == "" {
if redirect := queryStr(c, "redirect"); redirect != "" {
c.Redirect(302, redirect)
return
}
}
redirect, user, err := ssoc.Callback(code, state)
dangerous(err)
writeCookieUser(c, user.UUID)
c.Redirect(302, redirect)
renderData(c, ret, err)
}
func authCallbackV2(c *gin.Context) {
@ -172,10 +172,337 @@ func authCallbackV2(c *gin.Context) {
renderData(c, ret, nil)
}
func authSettings(c *gin.Context) {
renderData(c, struct {
Sso bool `json:"sso"`
}{
Sso: config.Config.SSO.Enable,
}, nil)
func logoutV2(c *gin.Context) {
redirect := queryStr(c, "redirect", "")
ret := &authRedirect{Redirect: redirect}
uuid := readCookieUser(c)
if uuid == "" {
renderData(c, ret, nil)
return
}
username := models.UsernameByUUID(uuid)
if username == "" {
renderData(c, ret, nil)
return
}
writeCookieUser(c, "")
ret.Msg = "logout successfully"
if config.Config.SSO.Enable {
if redirect == "" {
redirect = "/"
}
ret.Redirect = ssoc.LogoutLocation(redirect)
}
renderData(c, ret, nil)
go models.LoginLogNew(username, c.ClientIP(), "out")
}
type loginInput struct {
Username string `json:"username"`
Password string `json:"password"`
Phone string `json:"phone"`
Email string `json:"email"`
Code string `json:"code"`
CaptchaId string `json:"captcha_id"`
Answer string `json:"answer" description:"captcha answer"`
Type string `json:"type" description:"sms-code|email-code|password|ldap"`
RemoteAddr string `json:"remote_addr" description:"use for server account(v1)"`
IsLDAP int `json:"is_ldap" description:"deprecated"`
}
func (f *loginInput) validate() {
if f.IsLDAP == 1 {
f.Type = models.LOGIN_T_LDAP
}
if f.Type == "" {
f.Type = models.LOGIN_T_PWD
}
if f.Type == models.LOGIN_T_PWD {
if str.Dangerous(f.Username) {
bomb("%s invalid", f.Username)
}
if len(f.Username) > 64 {
bomb("%s too long", f.Username)
}
}
}
// v1Login called by sso.rdb module
func v1Login(c *gin.Context) {
var f loginInput
bind(c, &f)
user, err := authLogin(f)
if err != nil {
renderData(c, nil, err)
return
}
renderData(c, *user, nil)
go models.LoginLogNew(user.Username, f.RemoteAddr, "in")
}
// authLogin called by /v1/rdb/login, /api/rdb/auth/login
func authLogin(in loginInput) (user *models.User, err error) {
switch strings.ToLower(in.Type) {
case models.LOGIN_T_LDAP:
return models.LdapLogin(in.Username, in.Password)
case models.LOGIN_T_PWD:
return models.PassLogin(in.Username, in.Password)
case models.LOGIN_T_SMS:
return models.SmsCodeLogin(in.Phone, in.Code)
case models.LOGIN_T_EMAIL:
return models.EmailCodeLogin(in.Email, in.Code)
default:
return nil, fmt.Errorf("invalid login type %s", in.Type)
}
}
type v1SendLoginCodeBySmsInput struct {
Phone string `json:"phone"`
}
func v1SendLoginCodeBySms(c *gin.Context) {
var f v1SendLoginCodeBySmsInput
bind(c, &f)
msg, err := func() (string, error) {
if !config.Config.Redis.Enable {
return "", fmt.Errorf("sms sender is disabled")
}
phone := f.Phone
user, _ := models.UserGet("phone=?", phone)
if user == nil {
return "", fmt.Errorf("phone %s dose not exist", phone)
}
// generate a random code and cache it
code := fmt.Sprintf("%06d", rand.Intn(1000000))
loginCode := &models.LoginCode{
Username: user.Username,
Code: code,
LoginType: models.LOGIN_T_SMS,
CreatedAt: time.Now().Unix(),
}
if err := loginCode.Save(); err != nil {
return "", err
}
var buf bytes.Buffer
if err := loginCodeSmsTpl.Execute(&buf, loginCode); err != nil {
return "", err
}
if err := redisc.Write(&dataobj.Message{
Tos: []string{phone},
Content: buf.String(),
}, config.SMS_QUEUE_NAME); err != nil {
return "", err
}
if debug {
return fmt.Sprintf("[debug]: %s", buf.String()), nil
}
return "successed", nil
}()
renderData(c, msg, err)
}
type v1SendLoginCodeByEmailInput struct {
Email string `json:"email"`
}
func v1SendLoginCodeByEmail(c *gin.Context) {
var f v1SendLoginCodeByEmailInput
bind(c, &f)
msg, err := func() (string, error) {
if !config.Config.Redis.Enable {
return "", fmt.Errorf("mail sender is disabled")
}
email := f.Email
user, _ := models.UserGet("email=?", email)
if user == nil {
return "", fmt.Errorf("email %s dose not exist", email)
}
// generate a random code and cache it
code := fmt.Sprintf("%06d", rand.Intn(1000000))
loginCode := &models.LoginCode{
Username: user.Username,
Code: code,
LoginType: models.LOGIN_T_EMAIL,
CreatedAt: time.Now().Unix(),
}
if err := loginCode.Save(); err != nil {
return "", err
}
var buf bytes.Buffer
if err := loginCodeEmailTpl.Execute(&buf, loginCode); err != nil {
return "", err
}
if err := redisc.Write(&dataobj.Message{
Tos: []string{email},
Content: buf.String(),
}, config.SMS_QUEUE_NAME); err != nil {
return "", err
}
if debug {
return fmt.Sprintf("[debug]: %s", buf.String()), nil
}
return "successed", nil
}()
renderData(c, msg, err)
}
type sendRstCodeBySmsInput struct {
Username string `json:"username"`
Phone string `json:"phone"`
}
func sendRstCodeBySms(c *gin.Context) {
var f sendRstCodeBySmsInput
bind(c, &f)
msg, err := func() (string, error) {
if !config.Config.Redis.Enable {
return "", fmt.Errorf("sms sender is disabled")
}
phone := f.Phone
user, _ := models.UserGet("username=? and phone=?", f.Username, phone)
if user == nil {
return "", fmt.Errorf("user %s phone %s dose not exist", f.Username, phone)
}
// generate a random code and cache it
code := fmt.Sprintf("%06d", rand.Intn(1000000))
loginCode := &models.LoginCode{
Username: user.Username,
Code: code,
LoginType: models.LOGIN_T_RST,
CreatedAt: time.Now().Unix(),
}
if err := loginCode.Save(); err != nil {
return "", err
}
var buf bytes.Buffer
if err := loginCodeSmsTpl.Execute(&buf, loginCode); err != nil {
return "", err
}
if err := redisc.Write(&dataobj.Message{
Tos: []string{phone},
Content: buf.String(),
}, config.SMS_QUEUE_NAME); err != nil {
return "", err
}
if debug {
return fmt.Sprintf("[debug] msg: %s", buf.String()), nil
}
return "successed", nil
}()
renderData(c, msg, err)
}
type rstPasswordInput struct {
Username string `json:"username"`
Phone string `json:"phone"`
Code string `json:"code"`
Password string `json:"password"`
Type string `json:"type"`
}
func rstPassword(c *gin.Context) {
var in rstPasswordInput
bind(c, &in)
err := func() error {
user, _ := models.UserGet("username=? and phone=?", in.Username, in.Phone)
if user == nil {
return fmt.Errorf("user's phone not exist")
}
lc, err := models.LoginCodeGet("username=? and code=? and login_type=?",
user.Username, in.Code, models.LOGIN_T_RST)
if err != nil {
return fmt.Errorf("invalid code")
}
if time.Now().Unix()-lc.CreatedAt > models.LOGIN_EXPIRES_IN {
return fmt.Errorf("the code has expired")
}
if in.Type == "verify-code" {
return nil
}
defer lc.Del()
// update password
if user.Password, err = models.CryptoPass(in.Password); err != nil {
return err
}
if err = checkPassword(in.Password); err != nil {
return err
}
if err = user.Update("password"); err != nil {
return err
}
return nil
}()
if err != nil {
renderData(c, nil, err)
} else {
renderData(c, "reset successfully", nil)
}
}
func captchaGet(c *gin.Context) {
ret, err := func() (*models.Captcha, error) {
if !config.Config.Captcha {
return nil, errUnsupportCaptcha
}
driver := captchaDirver.ConvertFonts()
id, content, answer := driver.GenerateIdQuestionAnswer()
item, err := driver.DrawCaptcha(content)
if err != nil {
return nil, err
}
ret := &models.Captcha{
CaptchaId: id,
Answer: answer,
Image: item.EncodeB64string(),
CreatedAt: time.Now().Unix(),
}
if err := ret.Save(); err != nil {
return nil, err
}
return ret, nil
}()
renderData(c, ret, err)
}

View File

@ -1,6 +1,7 @@
package http
import (
"fmt"
"strconv"
"github.com/gin-gonic/gin"
@ -137,6 +138,61 @@ type idsForm struct {
Ids []int64 `json:"ids"`
}
func checkPassword(passwd string) error {
// indNum flags which character classes are present: upper, lower, digit, special
indNum := [4]int{0, 0, 0, 0}
spCode := []byte{'!', '@', '#', '$', '%', '^', '&', '*', '_', '-', '~', '.', ',', '<', '>', '/', ';', ':', '|', '?', '+', '='}
if len(passwd) < 6 {
return fmt.Errorf("password too short")
}
passwdByte := []byte(passwd)
for _, i := range passwdByte {
if i >= 'A' && i <= 'Z' {
indNum[0] = 1
continue
}
if i >= 'a' && i <= 'z' {
indNum[1] = 1
continue
}
if i >= '0' && i <= '9' {
indNum[2] = 1
continue
}
has := false
for _, s := range spCode {
if i == s {
indNum[3] = 1
has = true
break
}
}
if !has {
return fmt.Errorf("character: %s not supported", string(i))
}
}
codeCount := 0
for _, i := range indNum {
codeCount += i
}
if codeCount < 4 {
return fmt.Errorf("password too simple")
}
return nil
}
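// For illustration (not part of the change): checkPassword requires length >= 6
// and at least one character from each of the four classes tracked in indNum.
//
//	checkPassword("abcdef")   // error: "password too simple" (lowercase only)
//	checkPassword("Ab1!x")    // error: "password too short"
//	checkPassword("Ab1!xy")   // nil: upper + lower + digit + special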
// ------------
func loginUsername(c *gin.Context) string {

View File

@ -58,8 +58,12 @@ func (f nodeForm) Validate() {
bomb("arg[pid] invalid")
}
if !str.IsMatch(f.Ident, `^[a-zA-Z0-9\-_]+$`) {
bomb("ident legal characters: [a-zA-Z0-9_-]")
if !str.IsMatch(f.Ident, `^[a-z0-9\-_]+$`) {
bomb("ident legal characters: [a-z0-9_-]")
}
if len(f.Ident) >= 32 {
bomb("ident length should be less than 32")
}
if f.Leaf != 0 && f.Leaf != 1 {

View File

@ -41,3 +41,8 @@ func v1CandoNodeOps(c *gin.Context) {
renderData(c, ret, nil)
}
func v1RoleGlobalUserGets(c *gin.Context) {
objs, err := models.RoleGlobalUserAll()
renderData(c, objs, err)
}

View File

@ -351,3 +351,54 @@ func v1ResourcesUnregisterPost(c *gin.Context) {
dangerous(models.ResourceUnregister(uuids))
renderMessage(c, nil)
}
type nodeResourcesCountResp struct {
Name string `json:"name"`
Count int `json:"count"`
}
func renderNodeResourcesCountByCate(c *gin.Context) {
needSourceList := []string{"physical", "virtual", "redis", "mongo", "mysql", "container", "sw"}
nodeId := urlParamInt64(c, "id")
node := Node(nodeId)
leafIds, err := node.LeafIds()
dangerous(err)
limit := 10000
query := ""
batch := ""
field := "ident"
ress, err := models.ResourceUnderNodeGets(leafIds, query, batch, field, limit, 0)
dangerous(err)
aggDat := make(map[string]int, len(ress))
for _, res := range ress {
cate := res.Cate
if cate != "" {
if _, ok := aggDat[cate]; !ok {
aggDat[cate] = 0
}
aggDat[cate]++
}
}
for _, need := range needSourceList {
if _, ok := aggDat[need]; !ok {
aggDat[need] = 0
}
}
var list []*nodeResourcesCountResp
for n, c := range aggDat {
ns := new(nodeResourcesCountResp)
ns.Name = n
ns.Count = c
list = append(list, ns)
}
renderData(c, list, nil)
}

View File

@ -102,6 +102,10 @@ func roleGlobalUsersGet(c *gin.Context) {
list, err := models.UserSearchListInIds(ids, query, limit, offset(c, limit))
dangerous(err)
for i := 0; i < len(list); i++ {
list[i].UUID = ""
}
renderData(c, gin.H{
"list": list,
"total": total,
@ -138,3 +142,8 @@ func roleGlobalUsersUnbind(c *gin.Context) {
renderMessage(c, obj.UnbindUsers(f.Ids))
}
func v1RoleOperationGets(c *gin.Context) {
objs, err := models.RoleOperationAll()
renderData(c, objs, err)
}

View File

@ -42,6 +42,7 @@ type selfPasswordForm struct {
func selfPasswordPut(c *gin.Context) {
var f selfPasswordForm
bind(c, &f)
dangerous(checkPassword(f.NewPass))
oldpass, err := models.CryptoPass(f.OldPass)
dangerous(err)
@ -112,3 +113,35 @@ func permGlobalOps(c *gin.Context) {
renderData(c, operations, err)
}
func v1PermGlobalOps(c *gin.Context) {
user, err := models.UserGet("username=?", queryStr(c, "username"))
dangerous(err)
operations := make(map[string]struct{})
if user.IsRoot == 1 {
for _, system := range config.GlobalOps {
for _, group := range system.Groups {
for _, op := range group.Ops {
operations[op.En] = struct{}{}
}
}
}
renderData(c, operations, nil)
return
}
roleIds, err := models.RoleIdsGetByUserId(user.Id)
dangerous(err)
ops, err := models.OperationsOfRoles(roleIds)
dangerous(err)
for _, op := range ops {
operations[op] = struct{}{}
}
renderData(c, operations, err)
}

View File

@ -23,6 +23,10 @@ func userListGet(c *gin.Context) {
list, err := models.UserGets(ids, query, limit, offset(c, limit))
dangerous(err)
for i := 0; i < len(list); i++ {
list[i].UUID = ""
}
renderData(c, gin.H{
"list": list,
"total": total,
@ -45,6 +49,7 @@ func userAddPost(c *gin.Context) {
var f userProfileForm
bind(c, &f)
dangerous(checkPassword(f.Password))
pass, err := models.CryptoPass(f.Password)
dangerous(err)
@ -74,7 +79,9 @@ func userAddPost(c *gin.Context) {
}
func userProfileGet(c *gin.Context) {
renderData(c, User(urlParamInt64(c, "id")), nil)
user := User(urlParamInt64(c, "id"))
user.UUID = ""
renderData(c, user, nil)
}
func userProfilePut(c *gin.Context) {
@ -140,6 +147,7 @@ func userPasswordPut(c *gin.Context) {
var f userPasswordForm
bind(c, &f)
dangerous(checkPassword(f.Password))
target := User(urlParamInt64(c, "id"))
@ -259,6 +267,7 @@ type userInviteForm struct {
func userInvitePost(c *gin.Context) {
var f userInviteForm
bind(c, &f)
dangerous(checkPassword(f.Password))
inv, err := models.InviteGet("token=?", f.Token)
dangerous(err)

View File

@ -54,9 +54,9 @@ func dispatchHandler(method string, jsonBytes []byte) error {
switch method {
case "oplog_add":
return oplogAdd(jsonBytes)
case "resource_register":
case "res_create":
return resourceRegister(jsonBytes)
case "resource_unregister":
case "res_delete":
return resourceUnregister(jsonBytes)
default:
logger.Warning("mq_request.method not support")
@ -103,19 +103,18 @@ func resourceRegister(jsonBytes []byte) error {
// Third-party systems (e.g. RDS, Redis) must notify RDB when a resource is destroyed
func resourceUnregister(jsonBytes []byte) error {
var uuids []string
err := json.Unmarshal(jsonBytes, &uuids)
var item models.ResourceRegisterItem
err := json.Unmarshal(jsonBytes, &item)
if err != nil {
logger.Error(err)
// this kind of error will not succeed on retry, so there is no need to return err
logger.Warning(err)
return nil
}
if len(uuids) == 0 {
if item.UUID == "" {
return nil
}
err = models.ResourceUnregister(uuids)
err = models.ResourceUnregister([]string{item.UUID})
if err != nil {
logger.Error(err)
return err

View File

@ -74,6 +74,7 @@ func main() {
go cron.ConsumeSms()
go cron.ConsumeVoice()
go cron.ConsumeIm()
go cron.CleanerLoop()
http.Start()

View File

@ -3,6 +3,7 @@ package ssoc
import (
"context"
"crypto/tls"
"errors"
"fmt"
"io"
"log"
@ -16,15 +17,18 @@ import (
"github.com/didi/nightingale/src/modules/rdb/config"
"github.com/google/uuid"
"golang.org/x/oauth2"
"k8s.io/apimachinery/pkg/util/cache"
)
var (
errState = errors.New("your login session has expired, please go back to the home page and sign in again")
errUser = errors.New("unexpected user information")
)
type ssoClient struct {
verifier *oidc.IDTokenVerifier
config oauth2.Config
apiKey string
cache *cache.LRUExpireCache
stateExpiresIn time.Duration
stateExpiresIn int64
ssoAddr string
callbackAddr string
coverAttributes bool
@ -48,7 +52,6 @@ func InitSSO() {
return
}
cli.cache = cache.NewLRUExpireCache(1000)
cli.ssoAddr = cf.SsoAddr
cli.callbackAddr = cf.RedirectURL
cli.coverAttributes = cf.CoverAttributes
@ -75,18 +78,26 @@ func InitSSO() {
}
cli.apiKey = cf.ApiKey
if cf.StateExpiresIn == 0 {
cli.stateExpiresIn = time.Second * 60
} else {
cli.stateExpiresIn = time.Second * time.Duration(cf.StateExpiresIn)
if cli.stateExpiresIn = cf.StateExpiresIn; cli.stateExpiresIn == 0 {
cli.stateExpiresIn = 60
}
}
// Authorize return the sso authorize location with state
func Authorize(redirect string) string {
state := uuid.New().String()
cli.cache.Add(state, redirect, cli.stateExpiresIn)
return cli.config.AuthCodeURL(state)
func Authorize(redirect string) (string, error) {
state := &models.AuthState{
State: uuid.New().String(),
Typ: "OAuth2.CODE",
Redirect: redirect,
ExpiresAt: time.Now().Unix() + cli.stateExpiresIn,
}
if err := state.Save(); err != nil {
return "", err
}
// log.Printf("add state %s", state)
return cli.config.AuthCodeURL(state.State), nil
}
// LogoutLocation return logout location
@ -99,24 +110,23 @@ func LogoutLocation(redirect string) string {
// Callback exchanges the code for an accessToken and the user info
func Callback(code, state string) (string, *models.User, error) {
s, ok := cli.cache.Get(state)
if !ok {
return "", nil, fmt.Errorf("invalid state %s", state)
s, err := models.AuthStateGet("state=?", state)
if err != nil {
return "", nil, errState
}
cli.cache.Remove(state)
redirect := s.(string)
log.Printf("callback, get state %s redirect %s", state, redirect)
s.Del()
// log.Printf("remove state %s", state)
u, err := exchangeUser(code)
if err != nil {
return "", nil, err
return "", nil, errUser
}
log.Printf("exchange user %v", u)
// log.Printf("exchange user %v", u)
user, err := models.UserGet("username=?", u.Username)
if err != nil {
return "", nil, err
return "", nil, errUser
}
if user == nil {
@ -130,7 +140,7 @@ func Callback(code, state string) (string, *models.User, error) {
err = user.Update("email", "dispname", "phone", "im")
}
return redirect, user, err
return s.Redirect, user, err
}
func exchangeUser(code string) (*models.User, error) {

92
src/toolkits/exit/exit.go Normal file
View File

@ -0,0 +1,92 @@
package exit
import (
"bytes"
"fmt"
"io/ioutil"
"log"
"runtime"
)
var (
dunno = []byte("???")
centerDot = []byte("·")
dot = []byte(".")
slash = []byte("/")
)
func OnExit(onexits ...func()) {
if err := recover(); err != nil {
stack := Stack(3)
log.Println("\napp exit unexpected, \n[error]: %v\n[stack]: %s", err, stack)
}
if len(onexits) != 0 {
for _, f := range onexits {
if f != nil {
f()
}
}
}
}
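// Typical use (an assumed pattern, not shown in this commit): defer OnExit at
// the top of main so a panic is logged with a stack before cleanup runs.
//
//	func main() {
//		defer exit.OnExit(func() { log.Println("cleanup done") })
//		// ... start modules ...
//	}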
// Stack returns a nicely formatted stack frame, skipping skip frames
func Stack(skip int) []byte {
buf := new(bytes.Buffer) // the returned data
// As we loop, we open files and read them. These variables record the currently
// loaded file.
var lines [][]byte
var lastFile string
for i := skip; ; i++ { // Skip the expected number of frames
pc, file, line, ok := runtime.Caller(i)
if !ok {
break
}
// Print this much at least. If we can't find the source, it won't show.
fmt.Fprintf(buf, "%s:%d (0x%x)\n", file, line, pc)
if file != lastFile {
data, err := ioutil.ReadFile(file)
if err != nil {
continue
}
lines = bytes.Split(data, []byte{'\n'})
lastFile = file
}
fmt.Fprintf(buf, "\t%s: %s\n", function(pc), source(lines, line))
}
return buf.Bytes()
}
// source returns a space-trimmed slice of the n'th line.
func source(lines [][]byte, n int) []byte {
n-- // in stack trace, lines are 1-indexed but our array is 0-indexed
if n < 0 || n >= len(lines) {
return dunno
}
return bytes.TrimSpace(lines[n])
}
// function returns, if possible, the name of the function containing the PC.
func function(pc uintptr) []byte {
fn := runtime.FuncForPC(pc)
if fn == nil {
return dunno
}
name := []byte(fn.Name())
// The name includes the path name to the package, which is unnecessary
// since the file name is already included. Plus, it has center dots.
// That is, we see
// runtime/debug.*T·ptrmethod
// and want
// *T.ptrmethod
// Also the package path might contain dots (e.g. code.google.com/...),
// so first eliminate the path prefix
if lastslash := bytes.LastIndex(name, slash); lastslash >= 0 {
name = name[lastslash+1:]
}
if period := bytes.Index(name, dot); period >= 0 {
name = name[period+1:]
}
name = bytes.Replace(name, centerDot, dot, -1)
return name
}

View File

@ -0,0 +1,21 @@
The MIT License (MIT)
Copyright (c) 2015 Caio Romão Costa Nascimento
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -0,0 +1,55 @@
# T-Digest
A map-reduce and parallel streaming friendly data-structure for accurate
quantile approximation.
This package provides a very crude implementation of Ted Dunning's t-digest
data structure in Go.
[![Build Status](https://travis-ci.org/caio/go-tdigest.svg?branch=master)](https://travis-ci.org/caio/go-tdigest)
[![GoDoc](https://godoc.org/github.com/caio/go-tdigest?status.svg)](http://godoc.org/github.com/caio/go-tdigest)
[![Coverage](http://gocover.io/_badge/github.com/caio/go-tdigest)](http://gocover.io/github.com/caio/go-tdigest)
[![Go Report Card](https://goreportcard.com/badge/github.com/caio/go-tdigest)](https://goreportcard.com/report/github.com/caio/go-tdigest)
## Installation
go get github.com/caio/go-tdigest
## Usage
package main
import (
"fmt"
"math/rand"
"github.com/caio/go-tdigest"
)
func main() {
var t = tdigest.New(100)
for i := 0; i < 10000; i++ {
t.Add(rand.Float64(), 1)
}
fmt.Printf("p(.5) = %.6f\n", t.Quantile(0.5))
}
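Digests built on separate goroutines or nodes can be combined with `Merge`
(a sketch based on the API above; error handling elided):

    package main

    import (
        "fmt"
        "math/rand"

        "github.com/caio/go-tdigest"
    )

    func main() {
        a, b := tdigest.New(100), tdigest.New(100)
        for i := 0; i < 5000; i++ {
            a.Add(rand.Float64(), 1)
            b.Add(rand.Float64(), 1)
        }
        a.Merge(b) // a now summarizes both sample sets
        fmt.Printf("p(.99) = %.6f\n", a.Quantile(0.99))
    }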
## Disclaimer
I've written this solely with the purpose of understanding how the
data-structure works; it hasn't been thoroughly verified nor battle-tested
in a production environment.
## References
This is a very simple port of the [reference][1] implementation with some
ideas borrowed from the [python version][2]. If you want a quick grasp of
how it works and why it's useful, [this video and companion article are pretty
helpful][3].
[1]: https://github.com/tdunning/t-digest
[2]: https://github.com/CamDavidsonPilon/tdigest
[3]: https://www.mapr.com/blog/better-anomaly-detection-t-digest-whiteboard-walkthrough

View File

@ -0,0 +1,131 @@
package tdigest
import (
"bytes"
"encoding/binary"
"errors"
"fmt"
)
const smallEncoding int32 = 2
var endianess = binary.BigEndian
// AsBytes serializes the digest into a byte array so it can be
// saved to disk or sent over the wire.
func (t TDigest) AsBytes() ([]byte, error) {
buffer := new(bytes.Buffer)
err := binary.Write(buffer, endianess, smallEncoding)
if err != nil {
return nil, err
}
err = binary.Write(buffer, endianess, t.compression)
if err != nil {
return nil, err
}
err = binary.Write(buffer, endianess, int32(t.summary.Len()))
if err != nil {
return nil, err
}
var x float64
t.summary.Iterate(func(item centroid) bool {
delta := item.mean - x
x = item.mean
err = binary.Write(buffer, endianess, float32(delta))
return err == nil
})
if err != nil {
return nil, err
}
t.summary.Iterate(func(item centroid) bool {
err = encodeUint(buffer, item.count)
return err == nil
})
if err != nil {
return nil, err
}
return buffer.Bytes(), nil
}
// FromBytes reads a byte buffer with a serialized digest (from AsBytes)
// and deserializes it.
func FromBytes(buf *bytes.Reader) (*TDigest, error) {
var encoding int32
err := binary.Read(buf, endianess, &encoding)
if err != nil {
return nil, err
}
if encoding != smallEncoding {
return nil, fmt.Errorf("Unsupported encoding version: %d", encoding)
}
var compression float64
err = binary.Read(buf, endianess, &compression)
if err != nil {
return nil, err
}
t := New(compression)
var numCentroids int32
err = binary.Read(buf, endianess, &numCentroids)
if err != nil {
return nil, err
}
if numCentroids < 0 || numCentroids > 1<<22 {
return nil, errors.New("bad number of centroids in serialization")
}
means := make([]float64, numCentroids)
var delta float32
var x float64
for i := 0; i < int(numCentroids); i++ {
err = binary.Read(buf, endianess, &delta)
if err != nil {
return nil, err
}
x += float64(delta)
means[i] = x
}
for i := 0; i < int(numCentroids); i++ {
decUint, err := decodeUint(buf)
if err != nil {
return nil, err
}
t.Add(means[i], decUint)
}
return t, nil
}
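// Round-trip sketch (illustration only, error checks elided): the output of
// AsBytes feeds straight back into FromBytes.
//
//	t := New(100)
//	_ = t.Add(0.42, 1)
//	raw, _ := t.AsBytes()
//	restored, _ := FromBytes(bytes.NewReader(raw))
//	_ = restored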
func encodeUint(buf *bytes.Buffer, n uint32) error {
var b [binary.MaxVarintLen32]byte
l := binary.PutUvarint(b[:], uint64(n))
buf.Write(b[:l])
return nil
}
func decodeUint(buf *bytes.Reader) (uint32, error) {
v, err := binary.ReadUvarint(buf)
if v > 0xffffffff {
return 0, errors.New("Something wrong, this number looks too big")
}
return uint32(v), err
}

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,202 @@
package tdigest
import (
"fmt"
"math"
"sort"
)
type centroid struct {
mean float64
count uint32
index int
}
func (c centroid) isValid() bool {
return !math.IsNaN(c.mean) && c.count > 0
}
func (c *centroid) Update(x float64, weight uint32) {
c.count += weight
c.mean += float64(weight) * (x - c.mean) / float64(c.count)
}
var invalidCentroid = centroid{mean: math.NaN(), count: 0}
type summary struct {
keys []float64
counts []uint32
}
func newSummary(initialCapacity uint) *summary {
return &summary{
keys: make([]float64, 0, initialCapacity),
counts: make([]uint32, 0, initialCapacity),
}
}
func (s summary) Len() int {
return len(s.keys)
}
func (s *summary) Add(key float64, value uint32) error {
if math.IsNaN(key) {
return fmt.Errorf("Key must not be NaN")
}
if value == 0 {
return fmt.Errorf("Count must be >0")
}
idx := s.FindIndex(key)
if s.meanAtIndexIs(idx, key) {
s.updateAt(idx, key, value)
return nil
}
s.keys = append(s.keys, math.NaN())
s.counts = append(s.counts, 0)
copy(s.keys[idx+1:], s.keys[idx:])
copy(s.counts[idx+1:], s.counts[idx:])
s.keys[idx] = key
s.counts[idx] = value
return nil
}
func (s summary) Find(x float64) centroid {
idx := s.FindIndex(x)
if idx < s.Len() && s.keys[idx] == x {
return centroid{x, s.counts[idx], idx}
}
return invalidCentroid
}
func (s summary) FindIndex(x float64) int {
// FIXME When is linear scan better than binsearch()?
// should I even bother?
if len(s.keys) < 30 {
for i, item := range s.keys {
if item >= x {
return i
}
}
return len(s.keys)
}
return sort.Search(len(s.keys), func(i int) bool {
return s.keys[i] >= x
})
}
func (s summary) At(index int) centroid {
if s.Len()-1 < index || index < 0 {
return invalidCentroid
}
return centroid{s.keys[index], s.counts[index], index}
}
func (s summary) Iterate(f func(c centroid) bool) {
for i := 0; i < s.Len(); i++ {
if !f(centroid{s.keys[i], s.counts[i], i}) {
break
}
}
}
func (s summary) Min() centroid {
return s.At(0)
}
func (s summary) Max() centroid {
return s.At(s.Len() - 1)
}
func (s summary) Data() []centroid {
data := make([]centroid, 0, s.Len())
s.Iterate(func(c centroid) bool {
data = append(data, c)
return true
})
return data
}
func (s summary) successorAndPredecessorItems(mean float64) (centroid, centroid) {
idx := s.FindIndex(mean)
return s.At(idx + 1), s.At(idx - 1)
}
func (s summary) ceilingAndFloorItems(mean float64) (centroid, centroid) {
idx := s.FindIndex(mean)
// Case 1: item is greater than all items in the summary
if idx == s.Len() {
return invalidCentroid, s.Max()
}
item := s.At(idx)
// Case 2: item exists in the summary
if item.isValid() && mean == item.mean {
return item, item
}
// Case 3: item is smaller than all items in the summary
if idx == 0 {
return s.Min(), invalidCentroid
}
return item, s.At(idx - 1)
}
func (s summary) sumUntilMean(mean float64) uint32 {
var cumSum uint32
for i := range s.keys {
if s.keys[i] < mean {
cumSum += s.counts[i]
} else {
break
}
}
return cumSum
}
func (s *summary) updateAt(index int, mean float64, count uint32) {
c := centroid{s.keys[index], s.counts[index], index}
c.Update(mean, count)
oldMean := s.keys[index]
s.keys[index] = c.mean
s.counts[index] = c.count
if c.mean > oldMean {
s.adjustRight(index)
} else if c.mean < oldMean {
s.adjustLeft(index)
}
}
func (s *summary) adjustRight(index int) {
for i := index + 1; i < len(s.keys) && s.keys[i-1] > s.keys[i]; i++ {
s.keys[i-1], s.keys[i] = s.keys[i], s.keys[i-1]
s.counts[i-1], s.counts[i] = s.counts[i], s.counts[i-1]
}
}
func (s *summary) adjustLeft(index int) {
for i := index - 1; i >= 0 && s.keys[i] > s.keys[i+1]; i-- {
s.keys[i], s.keys[i+1] = s.keys[i+1], s.keys[i]
s.counts[i], s.counts[i+1] = s.counts[i+1], s.counts[i]
}
}
func (s summary) meanAtIndexIs(index int, mean float64) bool {
return index < len(s.keys) && s.keys[index] == mean
}

View File

@ -0,0 +1,239 @@
package tdigest
import (
"math"
"math/rand"
"sort"
"testing"
)
func TestBasics(t *testing.T) {
s := newSummary(2)
for _, n := range []float64{12, 13, 14, 15} {
item := s.Find(n)
if item.isValid() {
t.Errorf("Found something for non existing key %.0f: %v", n, item)
}
}
err := s.Add(1, 1)
if err != nil {
t.Errorf("Failed to add simple item")
}
if s.Add(math.NaN(), 1) == nil {
t.Errorf("Adding math.NaN() shouldn't be allowed")
}
if s.Add(1, 0) == nil {
t.Errorf("Adding count=0 shouldn't be allowed")
}
}
func checkSorted(s *summary, t *testing.T) {
if !sort.Float64sAreSorted(s.keys) {
t.Fatalf("Keys are not sorted! %v", s.keys)
}
}
func TestCore(t *testing.T) {
testData := make(map[float64]uint32)
const maxDataSize = 10000
s := newSummary(maxDataSize)
checkSorted(s, t)
if s.Len() != 0 {
t.Errorf("Initial size should be zero regardless of capacity. Got %d", s.Len())
}
for i := 0; i < maxDataSize; i++ {
k := rand.Float64()
v := rand.Uint32()
err := s.Add(k, v)
if err != nil {
_, exists := testData[k]
if !exists {
t.Errorf("Failed to insert %.2f even though it doesn't exist yet", k)
}
}
testData[k] = v
}
checkSorted(s, t)
if s.Len() != len(testData) {
t.Errorf("Got Len() == %d. Expected %d", s.Len(), len(testData))
}
for k, v := range testData {
c := s.Find(k)
if !c.isValid() || c.count != v {
t.Errorf("Find(%.0f) returned %d, expected %d", k, c.count, v)
}
}
}
func TestGetAt(t *testing.T) {
data := make(map[int]uint32)
const maxDataSize = 1000
s := newSummary(maxDataSize)
c := s.At(0)
if c.isValid() {
t.Errorf("At() on an empty structure should give invalid data. Got %v", c)
}
for i := 0; i < maxDataSize; i++ {
data[i] = rand.Uint32()
s.Add(float64(i), data[i])
}
for i, v := range data {
c := s.At(i)
if !c.isValid() || c.count != v {
t.Errorf("At(%d) = %d. Should've been %d", i, c.count, v)
}
}
c = s.At(s.Len())
if c.isValid() {
t.Errorf("At() past the slice length should give invalid data")
}
c = s.At(-10)
if c.isValid() {
t.Errorf("At() with negative index should give invalid data")
}
}
func TestIterate(t *testing.T) {
s := newSummary(10)
for _, i := range []uint32{1, 2, 3, 4, 5, 6} {
s.Add(float64(i), i*10)
}
c := 0
s.Iterate(func(i centroid) bool {
c++
return false
})
if c != 1 {
t.Errorf("Iterate must exit early if the closure returns false")
}
var tot uint32
s.Iterate(func(i centroid) bool {
tot += i.count
return true
})
if tot != 210 {
t.Errorf("Iterate must walk through the whole data if it always returns true")
}
}
func TestCeilingAndFloor(t *testing.T) {
s := newSummary(100)
ceil, floor := s.ceilingAndFloorItems(1)
if ceil.isValid() || floor.isValid() {
t.Errorf("Empty centroids must return invalid ceiling and floor items")
}
s.Add(0.4, 1)
ceil, floor = s.ceilingAndFloorItems(0.3)
if floor.isValid() || ceil.mean != 0.4 {
t.Errorf("Expected to find a ceil and NOT find a floor. ceil=%v, floor=%v", ceil, floor)
}
ceil, floor = s.ceilingAndFloorItems(0.5)
if ceil.isValid() || floor.mean != 0.4 {
t.Errorf("Expected to find a floor and NOT find a ceiling. ceil=%v, floor=%v", ceil, floor)
}
s.Add(0.1, 2)
ceil, floor = s.ceilingAndFloorItems(0.2)
if ceil.mean != 0.4 || floor.mean != 0.1 {
t.Errorf("Expected to find a ceiling and a floor. ceil=%v, floor=%v", ceil, floor)
}
s.Add(0.21, 3)
ceil, floor = s.ceilingAndFloorItems(0.2)
if ceil.mean != 0.21 || floor.mean != 0.1 {
t.Errorf("Ceil should've shrunk. ceil=%v, floor=%v", ceil, floor)
}
s.Add(0.1999, 1)
ceil, floor = s.ceilingAndFloorItems(0.2)
if ceil.mean != 0.21 || floor.mean != 0.1999 {
t.Errorf("Floor should've shrunk. ceil=%v, floor=%v", ceil, floor)
}
ceil, floor = s.ceilingAndFloorItems(10)
if ceil.isValid() {
t.Errorf("Expected an invalid ceil. Got %v", ceil)
}
ceil, floor = s.ceilingAndFloorItems(0.0001)
if floor.isValid() {
t.Errorf("Expected an invalid floor. Got %v", floor)
}
m := float64(0.42)
s.Add(m, 1)
ceil, floor = s.ceilingAndFloorItems(m)
if ceil.mean != m || floor.mean != m {
t.Errorf("ceiling and floor of an existing item should be the item itself")
}
}
func TestAdjustLeftRight(t *testing.T) {
keys := []float64{1, 2, 3, 4, 9, 5, 6, 7, 8}
counts := []uint32{1, 2, 3, 4, 9, 5, 6, 7, 8}
s := summary{keys: keys, counts: counts}
s.adjustRight(4)
if !sort.Float64sAreSorted(s.keys) || s.counts[4] != 5 {
t.Errorf("adjustRight should have fixed the keys/counts state. %v %v", s.keys, s.counts)
}
keys = []float64{1, 2, 3, 4, 0, 5, 6, 7, 8}
counts = []uint32{1, 2, 3, 4, 0, 5, 6, 7, 8}
s = summary{keys: keys, counts: counts}
s.adjustLeft(4)
if !sort.Float64sAreSorted(s.keys) || s.counts[4] != 4 {
t.Errorf("adjustLeft should have fixed the keys/counts state. %v %v", s.keys, s.counts)
}
}

View File

@ -0,0 +1,245 @@
// Package tdigest provides a highly accurate mergeable data-structure
// for quantile estimation.
package tdigest
import (
"fmt"
"math"
"math/rand"
)
// TDigest is a quantile approximation data structure.
// Typical T-Digest use cases involve accumulating metrics on several
// distinct nodes of a cluster and then merging them together to get
// a system-wide quantile overview. Things such as: sensory data from
// IoT devices, quantiles over enormous document datasets (think
// ElasticSearch), performance metrics for distributed systems, etc.
type TDigest struct {
summary *summary
compression float64
count uint32
}
// New creates a new digest.
// The compression parameter rules the threshold in which samples are
// merged together - the more often distinct samples are merged the more
// precision is lost. Compression should be tuned according to your data
// distribution, but a value of 100 is often good enough. A higher
// compression value means holding more centroids in memory (thus: better
// precision), which means a bigger serialization payload and higher
// memory footprint.
// Compression must be a value greater than or equal to 1; New will panic
// otherwise.
func New(compression float64) *TDigest {
if compression < 1 {
panic("Compression must be >= 1.0")
}
return &TDigest{
compression: compression,
summary: newSummary(estimateCapacity(compression)),
count: 0,
}
}
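// Illustration (not part of the file): the compression value passed to New
// trades memory for precision.
//
//	coarse := New(20)  // few centroids: small payload, rougher quantiles
//	fine := New(1000)  // many centroids: better precision, bigger footprint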
// Quantile returns the desired percentile estimation.
// Values of q must be between 0 and 1 (inclusive); it will panic otherwise.
func (t *TDigest) Quantile(q float64) float64 {
if q < 0 || q > 1 {
panic("q must be between 0 and 1 (inclusive)")
}
if t.summary.Len() == 0 {
return math.NaN()
} else if t.summary.Len() == 1 {
return t.summary.Min().mean
}
q *= float64(t.count)
var total float64
i := 0
found := false
var result float64
t.summary.Iterate(func(item centroid) bool {
k := float64(item.count)
if q < total+k {
if i == 0 || i+1 == t.summary.Len() {
result = item.mean
found = true
return false
}
succ, pred := t.summary.successorAndPredecessorItems(item.mean)
delta := (succ.mean - pred.mean) / 2
result = item.mean + ((q-total)/k-0.5)*delta
found = true
return false
}
i++
total += k
return true
})
if found {
return result
}
return t.summary.Max().mean
}
// Add registers a new sample in the digest.
// It's the main entry point for the digest and very likely the only
// method to be used for collecting samples. The count parameter is for
// when you are registering a sample that occurred multiple times - the
// most common value for this is 1.
func (t *TDigest) Add(value float64, count uint32) error {
if count == 0 {
return fmt.Errorf("Illegal datapoint <value: %.4f, count: %d>", value, count)
}
if t.summary.Len() == 0 {
t.summary.Add(value, count)
t.count = count
return nil
}
// Avoid allocation for our slice by using a local array here.
ar := [2]centroid{}
candidates := ar[:]
candidates[0], candidates[1] = t.findNearestCentroids(value)
if !candidates[1].isValid() {
candidates = candidates[:1]
}
for len(candidates) > 0 && count > 0 {
j := 0
if len(candidates) > 1 {
j = rand.Intn(len(candidates))
}
chosen := candidates[j]
quantile := t.computeCentroidQuantile(&chosen)
if float64(chosen.count+count) > t.threshold(quantile) {
candidates = append(candidates[:j], candidates[j+1:]...)
continue
}
t.summary.updateAt(chosen.index, value, uint32(count))
t.count += count
count = 0
}
if count > 0 {
t.summary.Add(value, count)
t.count += count
}
if float64(t.summary.Len()) > 20*t.compression {
t.Compress()
}
return nil
}
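// Illustration (assumed caller, not part of the file): count folds repeated
// observations into one call.
//
//	_ = t.Add(99.5, 1) // a single sample
//	_ = t.Add(10.0, 7) // the same as seven Add(10.0, 1) calls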
// Compress tries to reduce the number of individual centroids stored
// in the digest.
// Compression trades off accuracy for performance and happens
// automatically after a certain amount of distinct samples have been
// stored.
func (t *TDigest) Compress() {
if t.summary.Len() <= 1 {
return
}
oldTree := t.summary
t.summary = newSummary(estimateCapacity(t.compression))
t.count = 0
nodes := oldTree.Data()
shuffle(nodes)
for _, item := range nodes {
t.Add(item.mean, item.count)
}
}
// Merge joins a given digest into itself.
// Merging is useful when you have multiple TDigest instances running
// in separate threads and you want to compute quantiles over all the
// samples. This is particularly important on a scatter-gather/map-reduce
// scenario.
func (t *TDigest) Merge(other *TDigest) {
if other.summary.Len() == 0 {
return
}
nodes := other.summary.Data()
shuffle(nodes)
for _, item := range nodes {
t.Add(item.mean, item.count)
}
}
// Len returns the number of centroids in the TDigest.
func (t *TDigest) Len() int { return t.summary.Len() }
// ForEachCentroid calls the specified function for each centroid.
// Iteration stops when the supplied function returns false, or when all
// centroids have been iterated.
func (t *TDigest) ForEachCentroid(f func(mean float64, count uint32) bool) {
s := t.summary
for i := 0; i < s.Len(); i++ {
if !f(s.keys[i], s.counts[i]) {
break
}
}
}
func shuffle(data []centroid) {
for i := len(data) - 1; i > 1; i-- {
other := rand.Intn(i + 1)
tmp := data[other]
data[other] = data[i]
data[i] = tmp
}
}
func estimateCapacity(compression float64) uint {
return uint(compression) * 10
}
func (t *TDigest) threshold(q float64) float64 {
return (4 * float64(t.count) * q * (1 - q)) / t.compression
}
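// The bound above is Dunning's t-digest size limit: a centroid at quantile q
// may absorb at most 4*n*q*(1-q)/compression samples, so centroids near the
// median can grow larger than those at the extreme tails.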
func (t *TDigest) computeCentroidQuantile(c *centroid) float64 {
cumSum := t.summary.sumUntilMean(c.mean)
return (float64(c.count)/2.0 + float64(cumSum)) / float64(t.count)
}
func (t *TDigest) findNearestCentroids(mean float64) (centroid, centroid) {
ceil, floor := t.summary.ceilingAndFloorItems(mean)
if !ceil.isValid() && !floor.isValid() {
panic("findNearestCentroids called on an empty tree")
}
if !ceil.isValid() {
return floor, invalidCentroid
}
if !floor.isValid() {
return ceil, invalidCentroid
}
if math.Abs(floor.mean-mean) < math.Abs(ceil.mean-mean) {
return floor, invalidCentroid
} else if math.Abs(floor.mean-mean) == math.Abs(ceil.mean-mean) && floor.mean != ceil.mean {
return floor, ceil
} else {
return ceil, invalidCentroid
}
}

View File

@ -0,0 +1,430 @@
package tdigest
import (
"math"
"math/rand"
"sort"
"testing"
)
// Test of tdigest internals and accuracy. Note no t.Parallel():
// during tests the default random seed is consistent, but varying
// concurrency scheduling mixes up the random values used in each test.
// Since there's a random number call inside tdigest this breaks repeatability
// for all tests. So, no test concurrency here.
func TestTInternals(t *testing.T) {
tdigest := New(100)
if !math.IsNaN(tdigest.Quantile(0.1)) {
t.Errorf("Quantile() on an empty digest should return NaN. Got: %.4f", tdigest.Quantile(0.1))
}
tdigest.Add(0.4, 1)
if tdigest.Quantile(0.1) != 0.4 {
t.Errorf("Quantile() on a single-sample digest should return the samples's mean. Got %.4f", tdigest.Quantile(0.1))
}
tdigest.Add(0.5, 1)
if tdigest.summary.Len() != 2 {
t.Errorf("Expected size 2, got %d", tdigest.summary.Len())
}
if tdigest.summary.Min().mean != 0.4 {
t.Errorf("Min() returned an unexpected centroid: %v", tdigest.summary.Min())
}
if tdigest.summary.Max().mean != 0.5 {
t.Errorf("Min() returned an unexpected centroid: %v", tdigest.summary.Min())
}
tdigest.Add(0.4, 2)
tdigest.Add(0.4, 3)
if tdigest.summary.Len() != 2 {
t.Errorf("Adding centroids of same mean shouldn't change size")
}
y := tdigest.summary.Find(0.4)
if y.count != 6 || y.mean != 0.4 {
t.Errorf("Adding centroids with same mean should increment the count only. Got %v", y)
}
err := tdigest.Add(0, 0)
if err == nil {
t.Errorf("Expected Add() to error out with input (0,0)")
}
if tdigest.Quantile(0.9999999) != tdigest.summary.Max().mean {
t.Errorf("High quantiles with little data should give out the MAX recorded mean")
}
if tdigest.Quantile(0.0000001) != tdigest.summary.Min().mean {
t.Errorf("Low quantiles with little data should give out the MIN recorded mean")
}
}
func assertDifferenceSmallerThan(tdigest *TDigest, p float64, m float64, t *testing.T) {
tp := tdigest.Quantile(p)
if math.Abs(tp-p) >= m {
t.Errorf("T-Digest.Quantile(%.4f) = %.4f. Diff (%.4f) >= %.4f", p, tp, math.Abs(tp-p), m)
}
}
func TestUniformDistribution(t *testing.T) {
tdigest := New(100)
for i := 0; i < 10000; i++ {
tdigest.Add(rand.Float64(), 1)
}
assertDifferenceSmallerThan(tdigest, 0.5, 0.02, t)
assertDifferenceSmallerThan(tdigest, 0.1, 0.01, t)
assertDifferenceSmallerThan(tdigest, 0.9, 0.01, t)
assertDifferenceSmallerThan(tdigest, 0.01, 0.005, t)
assertDifferenceSmallerThan(tdigest, 0.99, 0.005, t)
assertDifferenceSmallerThan(tdigest, 0.001, 0.001, t)
assertDifferenceSmallerThan(tdigest, 0.999, 0.001, t)
}
// Asserts quantile p is no greater than absolute m off from "true"
// fractional quantile for supplied data. So m must be scaled
// appropriately for source data range.
func assertDifferenceFromQuantile(data []float64, tdigest *TDigest, p float64, m float64, t *testing.T) {
q := quantile(p, data)
tp := tdigest.Quantile(p)
if math.Abs(tp-q) >= m {
t.Fatalf("T-Digest.Quantile(%.4f) = %.4f vs actual %.4f. Diff (%.4f) >= %.4f", p, tp, q, math.Abs(tp-q), m)
}
}
func TestSequentialInsertion(t *testing.T) {
tdigest := New(10)
data := make([]float64, 10000)
for i := 0; i < len(data); i++ {
data[i] = float64(i)
}
for i := 0; i < len(data); i++ {
tdigest.Add(data[i], 1)
assertDifferenceFromQuantile(data[:i+1], tdigest, 0.001, 1.0+0.001*float64(i), t)
assertDifferenceFromQuantile(data[:i+1], tdigest, 0.01, 1.0+0.005*float64(i), t)
assertDifferenceFromQuantile(data[:i+1], tdigest, 0.05, 1.0+0.01*float64(i), t)
assertDifferenceFromQuantile(data[:i+1], tdigest, 0.25, 1.0+0.03*float64(i), t)
assertDifferenceFromQuantile(data[:i+1], tdigest, 0.5, 1.0+0.03*float64(i), t)
assertDifferenceFromQuantile(data[:i+1], tdigest, 0.75, 1.0+0.03*float64(i), t)
assertDifferenceFromQuantile(data[:i+1], tdigest, 0.95, 1.0+0.01*float64(i), t)
assertDifferenceFromQuantile(data[:i+1], tdigest, 0.99, 1.0+0.005*float64(i), t)
assertDifferenceFromQuantile(data[:i+1], tdigest, 0.999, 1.0+0.001*float64(i), t)
}
}
func TestNonUniformDistribution(t *testing.T) {
tdigest := New(10)
// A decidedly non-uniform distribution: three clusters of different widths.
data := make([]float64, 1000)
for i := 0; i < 500; i++ {
data[i] = 700.0 + rand.Float64()*100.0
}
for i := 500; i < 750; i++ {
data[i] = 100.0 + rand.Float64()*100.0
}
for i := 750; i < 1000; i++ {
data[i] = 600.0 + rand.Float64()*10.0
}
for i := 0; i < len(data); i++ {
tdigest.Add(data[i], 1)
}
max := float64(len(data))
sort.Float64s(data)
assertDifferenceFromQuantile(data, tdigest, 0.001, 1.0+0.001*max, t)
assertDifferenceFromQuantile(data, tdigest, 0.01, 1.0+0.005*max, t)
assertDifferenceFromQuantile(data, tdigest, 0.05, 1.0+0.01*max, t)
assertDifferenceFromQuantile(data, tdigest, 0.25, 1.0+0.01*max, t)
assertDifferenceFromQuantile(data, tdigest, 0.5, 1.0+0.05*max, t)
assertDifferenceFromQuantile(data, tdigest, 0.75, 1.0+0.01*max, t)
assertDifferenceFromQuantile(data, tdigest, 0.95, 1.0+0.01*max, t)
assertDifferenceFromQuantile(data, tdigest, 0.99, 1.0+0.005*max, t)
assertDifferenceFromQuantile(data, tdigest, 0.999, 1.0+0.001*max, t)
}
func TestNonSequentialInsertion(t *testing.T) {
tdigest := New(10)
// Not quite a uniform distribution, but close.
data := make([]float64, 1000)
for i := 0; i < len(data); i++ {
tmp := (i * 1627) % len(data)
data[i] = float64(tmp)
}
sorted := make([]float64, 0, len(data))
for i := 0; i < len(data); i++ {
tdigest.Add(data[i], 1)
sorted = append(sorted, data[i])
// Estimated quantiles are all over the place for low counts, which is
// OK given that something like P99 is not very meaningful when there are
// 25 samples. To account for this, increase the error tolerance for
// smaller counts.
if i == 0 {
continue
}
max := float64(len(data))
fac := 1.0 + max/float64(i)
sort.Float64s(sorted)
assertDifferenceFromQuantile(sorted, tdigest, 0.001, fac+0.001*max, t)
assertDifferenceFromQuantile(sorted, tdigest, 0.01, fac+0.005*max, t)
assertDifferenceFromQuantile(sorted, tdigest, 0.05, fac+0.01*max, t)
assertDifferenceFromQuantile(sorted, tdigest, 0.25, fac+0.01*max, t)
assertDifferenceFromQuantile(sorted, tdigest, 0.5, fac+0.02*max, t)
assertDifferenceFromQuantile(sorted, tdigest, 0.75, fac+0.01*max, t)
assertDifferenceFromQuantile(sorted, tdigest, 0.95, fac+0.01*max, t)
assertDifferenceFromQuantile(sorted, tdigest, 0.99, fac+0.005*max, t)
assertDifferenceFromQuantile(sorted, tdigest, 0.999, fac+0.001*max, t)
}
}
func TestWeights(t *testing.T) {
tdigest := New(10)
// Create data slice with repeats matching weights we gave to tdigest
data := []float64{}
for i := 0; i < 100; i++ {
tdigest.Add(float64(i), uint32(i))
for j := 0; j < i; j++ {
data = append(data, float64(i))
}
}
assertDifferenceFromQuantile(data, tdigest, 0.001, 1.0+0.001*100.0, t)
assertDifferenceFromQuantile(data, tdigest, 0.01, 1.0+0.005*100.0, t)
assertDifferenceFromQuantile(data, tdigest, 0.05, 1.0+0.01*100.0, t)
assertDifferenceFromQuantile(data, tdigest, 0.25, 1.0+0.01*100.0, t)
assertDifferenceFromQuantile(data, tdigest, 0.5, 1.0+0.02*100.0, t)
assertDifferenceFromQuantile(data, tdigest, 0.75, 1.0+0.01*100.0, t)
assertDifferenceFromQuantile(data, tdigest, 0.95, 1.0+0.01*100.0, t)
assertDifferenceFromQuantile(data, tdigest, 0.99, 1.0+0.005*100.0, t)
assertDifferenceFromQuantile(data, tdigest, 0.999, 1.0+0.001*100.0, t)
}
func TestIntegers(t *testing.T) {
tdigest := New(100)
tdigest.Add(1, 1)
tdigest.Add(2, 1)
tdigest.Add(3, 1)
if tdigest.Quantile(0.5) != 2 {
t.Errorf("Expected p(0.5) = 2, Got %.2f instead", tdigest.Quantile(0.5))
}
tdigest = New(100)
for _, i := range []float64{1, 2, 2, 2, 2, 2, 2, 2, 3} {
tdigest.Add(i, 1)
}
if tdigest.Quantile(0.5) != 2 {
t.Errorf("Expected p(0.5) = 2, Got %.2f instead", tdigest.Quantile(0.5))
}
var tot uint32
tdigest.summary.Iterate(func(item centroid) bool {
tot += item.count
return true
})
if tot != 9 {
t.Errorf("Expected the centroid count to be 9, Got %d instead", tot)
}
}
func quantile(q float64, data []float64) float64 {
if len(data) == 0 {
return math.NaN()
}
if q == 1 || len(data) == 1 {
return data[len(data)-1]
}
index := q * (float64(len(data)) - 1)
return data[int(index)+1]*(index-float64(int(index))) + data[int(index)]*(float64(int(index)+1)-index)
}
func TestMerge(t *testing.T) {
if testing.Short() {
t.Skipf("Skipping merge test. Short flag is on")
}
const numItems = 10000
const numSubs = 5
data := make([]float64, numItems)
var subs [numSubs]*TDigest
dist1 := New(10)
for i := 0; i < numSubs; i++ {
subs[i] = New(10)
}
for i := 0; i < numItems; i++ {
num := rand.Float64()
data[i] = num
dist1.Add(num, 1)
for j := 0; j < numSubs; j++ {
subs[j].Add(num, 1)
}
}
dist2 := New(10)
for i := 0; i < numSubs; i++ {
dist2.Merge(subs[i])
}
// Merge empty. Should be no-op
dist2.Merge(New(10))
sort.Float64s(data)
for _, p := range []float64{0.001, 0.01, 0.1, 0.2, 0.3, 0.5} {
q := quantile(p, data)
p1 := dist1.Quantile(p)
p2 := dist2.Quantile(p)
e1 := math.Abs(p1 - q)
e2 := math.Abs(p2 - q)
if e2/p >= 0.3 {
t.Errorf("Relative error for %f above threshold. q=%f p1=%f p2=%f e1=%f e2=%f", p, q, p1, p2, e1, e2)
}
if e2 >= 0.015 {
t.Errorf("Absolute error for %f above threshold. q=%f p1=%f p2=%f e1=%f e2=%f", p, q, p1, p2, e1, e2)
}
}
}
func TestCompressDoesntChangeCount(t *testing.T) {
tdigest := New(100)
for i := 0; i < 1000; i++ {
tdigest.Add(rand.Float64(), 1)
}
initialCount := tdigest.count
tdigest.Compress()
if tdigest.count != initialCount {
t.Errorf("Compress() should not change count. Wanted %d, got %d", initialCount, tdigest.count)
}
}
func shouldPanic(f func(), t *testing.T, message string) {
defer func() {
tryRecover := recover()
if tryRecover == nil {
t.Error(message)
}
}()
f()
}
func TestPanic(t *testing.T) {
shouldPanic(func() {
New(0.5)
}, t, "Compression < 1 should panic!")
tdigest := New(100)
shouldPanic(func() {
tdigest.Quantile(-42)
}, t, "Quantile < 0 should panic!")
shouldPanic(func() {
tdigest.Quantile(42)
}, t, "Quantile > 1 should panic!")
shouldPanic(func() {
tdigest.findNearestCentroids(0.2)
}, t, "findNearestCentroids on empty summary should panic!")
}
func TestForEachCentroid(t *testing.T) {
t.Parallel()
tdigest := New(10)
for i := 0; i < 100; i++ {
tdigest.Add(float64(i), 1)
}
// Iterate limited number.
means := []float64{}
tdigest.ForEachCentroid(func(mean float64, count uint32) bool {
means = append(means, mean)
if len(means) == 3 {
return false
}
return true
})
if len(means) != 3 {
t.Errorf("ForEachCentroid handled incorrect number of data items")
}
// Iterate all datapoints.
means = []float64{}
tdigest.ForEachCentroid(func(mean float64, count uint32) bool {
means = append(means, mean)
return true
})
if len(means) != tdigest.Len() {
t.Errorf("ForEachCentroid did not handle all data")
}
}
func benchmarkAdd(compression float64, b *testing.B) {
t := New(compression)
data := make([]float64, b.N)
for n := 0; n < b.N; n++ {
data[n] = rand.Float64()
}
b.ResetTimer()
for n := 0; n < b.N; n++ {
err := t.Add(data[n], 1)
if err != nil {
b.Error(err)
}
}
b.StopTimer()
}
func BenchmarkAdd1(b *testing.B) {
benchmarkAdd(1, b)
}
func BenchmarkAdd10(b *testing.B) {
benchmarkAdd(10, b)
}
func BenchmarkAdd100(b *testing.B) {
benchmarkAdd(100, b)
}