zk插件新增集群标识
This commit is contained in:
parent
2953890391
commit
dd9af850ba
|
@ -2,7 +2,8 @@
|
|||
# interval = 15
|
||||
|
||||
# [[instances]]
|
||||
# address = "127.0.0.1:2181"
|
||||
# cluster_name = "dev-zk-cluster"
|
||||
# addresses = "127.0.0.1:2181"
|
||||
# timeout = 10
|
||||
|
||||
# important! use global unique string to specify instance
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
# zookeeper
|
||||
**注意: >=3.6.0 zookeeper 版本内置 [prometheus 的支持](https://zookeeper.apache.org/doc/current/zookeeperMonitor.html),即,如果 zookeeper 启用了 prometheus,Categraf 可使用 prometheus 插件从这个 metrics 接口拉取数据即可;**
|
||||
|
||||
移植于 [dabealu/zookeeper-exporter](https://github.com/dabealu/zookeeper-exporter),原理就是利用 Zookeeper 提供的四字命令(The Four Letter Words)获取监控信息;
|
||||
移植于 [dabealu/zookeeper-exporter](https://github.com/dabealu/zookeeper-exporter),适用于 `<3.6.0` 版本的 zookeeper,原理就是利用 Zookeeper 提供的四字命令(The Four Letter Words)获取监控信息;
|
||||
|
||||
需要注意的是,在 zookeeper v3.4.10 以后添加了四字命令白名单,需要在 zookeeper 的配置文件 `zoo.cfg` 中新增白名单配置:
|
||||
```
|
||||
|
@ -9,32 +10,29 @@
|
|||
|
||||
## Configuration
|
||||
|
||||
zookeeper 插件的配置在 `conf/input.zookeeper/zookeeper.toml` 最简单的配置如下:
|
||||
zookeeper 插件的配置在 `conf/input.zookeeper/zookeeper.toml`,同一集群中的多个实例地址请用空格分隔:
|
||||
|
||||
```toml
|
||||
[[instances]]
|
||||
address = "127.0.0.1:2181"
|
||||
labels = { instance="n9e-10.23.25.2:2181" }
|
||||
cluster_name = "dev-zk-cluster"
|
||||
addresses = "127.0.0.1:2181"
|
||||
timeout = 10
|
||||
```
|
||||
|
||||
如果要监控多个 zookeeper 实例 (同一集群的多个实例也请分别添加,可在 `labels` 中添加 `cluster_name="xxx"` 来进行标识),就增加 instances 即可:
|
||||
如果要监控多个 zookeeper 集群,就增加 instances 即可:
|
||||
|
||||
```toml
|
||||
[[instances]]
|
||||
address = "10.23.25.2:2181"
|
||||
username = ""
|
||||
password = ""
|
||||
labels = { instance="n9e-10.23.25.2:2181" }
|
||||
cluster_name = "dev-zk-cluster"
|
||||
addresses = "127.0.0.1:2181"
|
||||
timeout = 10
|
||||
|
||||
[[instances]]
|
||||
address = "10.23.25.3:2181"
|
||||
username = ""
|
||||
password = ""
|
||||
labels = { instance="n9e-10.23.25.3:2181" }
|
||||
cluster_name = "test-zk-cluster"
|
||||
addresses = "127.0.0.1:2181 127.0.0.1:2182 127.0.0.1:2183"
|
||||
timeout = 10
|
||||
```
|
||||
|
||||
建议通过 labels 配置附加一个 instance 标签,便于后面复用监控大盘。
|
||||
|
||||
## 监控大盘和告警规则
|
||||
|
||||
该 README 的同级目录下,提供了 dashboard.json 就是监控大盘的配置,alerts.json 是告警规则,可以导入夜莺使用。
|
||||
|
|
|
@ -1,134 +1,134 @@
|
|||
[
|
||||
{
|
||||
"name": "【Zookeeper】Zookeeper Down",
|
||||
"note": "",
|
||||
"prod": "",
|
||||
"algorithm": "",
|
||||
"algo_params": null,
|
||||
"delay": 0,
|
||||
"severity": 1,
|
||||
"disabled": 0,
|
||||
"prom_for_duration": 0,
|
||||
"prom_ql": "zk_up == 0",
|
||||
"prom_eval_interval": 15,
|
||||
"enable_stime": "00:00",
|
||||
"enable_etime": "23:59",
|
||||
"enable_days_of_week": [
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"0"
|
||||
],
|
||||
"enable_in_bg": 0,
|
||||
"notify_recovered": 1,
|
||||
"notify_channels": [],
|
||||
"notify_users_obj": [],
|
||||
"notify_repeat_step": 60,
|
||||
"recover_duration": 0,
|
||||
"callbacks": [],
|
||||
"runbook_url": "",
|
||||
"append_tags": []
|
||||
},
|
||||
{
|
||||
"name": "【Zookeeper】Zookeeper instance is not ok",
|
||||
"note": "",
|
||||
"prod": "",
|
||||
"algorithm": "",
|
||||
"algo_params": null,
|
||||
"delay": 0,
|
||||
"severity": 2,
|
||||
"disabled": 0,
|
||||
"prom_for_duration": 120,
|
||||
"prom_ql": "zk_ruok == 0",
|
||||
"prom_eval_interval": 15,
|
||||
"enable_stime": "00:00",
|
||||
"enable_etime": "23:59",
|
||||
"enable_days_of_week": [
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"0"
|
||||
],
|
||||
"enable_in_bg": 0,
|
||||
"notify_recovered": 1,
|
||||
"notify_channels": [],
|
||||
"notify_users_obj": [],
|
||||
"notify_repeat_step": 60,
|
||||
"recover_duration": 0,
|
||||
"callbacks": [],
|
||||
"runbook_url": "",
|
||||
"append_tags": []
|
||||
},
|
||||
{
|
||||
"name": "【Zookeeper】Zookeeper 集群不存在 Leader",
|
||||
"note": "",
|
||||
"prod": "",
|
||||
"algorithm": "",
|
||||
"algo_params": null,
|
||||
"delay": 0,
|
||||
"severity": 1,
|
||||
"disabled": 0,
|
||||
"prom_for_duration": 0,
|
||||
"prom_ql": "sum(zk_server_leader) == 0",
|
||||
"prom_eval_interval": 15,
|
||||
"enable_stime": "00:00",
|
||||
"enable_etime": "23:59",
|
||||
"enable_days_of_week": [
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"0"
|
||||
],
|
||||
"enable_in_bg": 0,
|
||||
"notify_recovered": 1,
|
||||
"notify_channels": [],
|
||||
"notify_users_obj": [],
|
||||
"notify_repeat_step": 60,
|
||||
"recover_duration": 0,
|
||||
"callbacks": [],
|
||||
"runbook_url": "",
|
||||
"append_tags": []
|
||||
},
|
||||
{
|
||||
"name": "【Zookeeper】Zookeeper 集群存在多个 Leader",
|
||||
"note": "",
|
||||
"prod": "",
|
||||
"algorithm": "",
|
||||
"algo_params": null,
|
||||
"delay": 0,
|
||||
"severity": 1,
|
||||
"disabled": 0,
|
||||
"prom_for_duration": 0,
|
||||
"prom_ql": "sum(zk_server_leader) > 1",
|
||||
"prom_eval_interval": 15,
|
||||
"enable_stime": "00:00",
|
||||
"enable_etime": "23:59",
|
||||
"enable_days_of_week": [
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"0"
|
||||
],
|
||||
"enable_in_bg": 0,
|
||||
"notify_recovered": 1,
|
||||
"notify_channels": [],
|
||||
"notify_users_obj": [],
|
||||
"notify_repeat_step": 60,
|
||||
"recover_duration": 0,
|
||||
"callbacks": [],
|
||||
"runbook_url": "",
|
||||
"append_tags": []
|
||||
}
|
||||
]
|
||||
{
|
||||
"name": "【Zookeeper】Zookeeper instance is not ok",
|
||||
"note": "",
|
||||
"prod": "",
|
||||
"algorithm": "",
|
||||
"algo_params": null,
|
||||
"delay": 0,
|
||||
"severity": 2,
|
||||
"disabled": 0,
|
||||
"prom_for_duration": 120,
|
||||
"prom_ql": "zk_ruok == 0",
|
||||
"prom_eval_interval": 15,
|
||||
"enable_stime": "00:00",
|
||||
"enable_etime": "23:59",
|
||||
"enable_days_of_week": [
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"0"
|
||||
],
|
||||
"enable_in_bg": 0,
|
||||
"notify_recovered": 1,
|
||||
"notify_channels": [],
|
||||
"notify_users_obj": [],
|
||||
"notify_repeat_step": 60,
|
||||
"recover_duration": 0,
|
||||
"callbacks": [],
|
||||
"runbook_url": "",
|
||||
"append_tags": []
|
||||
},
|
||||
{
|
||||
"name": "【Zookeeper】Zookeeper Down",
|
||||
"note": "",
|
||||
"prod": "",
|
||||
"algorithm": "",
|
||||
"algo_params": null,
|
||||
"delay": 0,
|
||||
"severity": 1,
|
||||
"disabled": 0,
|
||||
"prom_for_duration": 0,
|
||||
"prom_ql": "zk_up == 0",
|
||||
"prom_eval_interval": 15,
|
||||
"enable_stime": "00:00",
|
||||
"enable_etime": "23:59",
|
||||
"enable_days_of_week": [
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"0"
|
||||
],
|
||||
"enable_in_bg": 0,
|
||||
"notify_recovered": 1,
|
||||
"notify_channels": [],
|
||||
"notify_users_obj": [],
|
||||
"notify_repeat_step": 60,
|
||||
"recover_duration": 0,
|
||||
"callbacks": [],
|
||||
"runbook_url": "",
|
||||
"append_tags": []
|
||||
},
|
||||
{
|
||||
"name": "【Zookeeper】Zookeeper 集群不存在 Leader",
|
||||
"note": "",
|
||||
"prod": "",
|
||||
"algorithm": "",
|
||||
"algo_params": null,
|
||||
"delay": 0,
|
||||
"severity": 1,
|
||||
"disabled": 0,
|
||||
"prom_for_duration": 0,
|
||||
"prom_ql": "sum(zk_server_leader) by (zk_cluster) == 0 and count(zk_up) by (zk_cluster) >= 3",
|
||||
"prom_eval_interval": 15,
|
||||
"enable_stime": "00:00",
|
||||
"enable_etime": "23:59",
|
||||
"enable_days_of_week": [
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"0"
|
||||
],
|
||||
"enable_in_bg": 0,
|
||||
"notify_recovered": 1,
|
||||
"notify_channels": [],
|
||||
"notify_users_obj": [],
|
||||
"notify_repeat_step": 60,
|
||||
"recover_duration": 0,
|
||||
"callbacks": [],
|
||||
"runbook_url": "",
|
||||
"append_tags": []
|
||||
},
|
||||
{
|
||||
"name": "【Zookeeper】Zookeeper 集群存在多个 Leader",
|
||||
"note": "",
|
||||
"prod": "",
|
||||
"algorithm": "",
|
||||
"algo_params": null,
|
||||
"delay": 0,
|
||||
"severity": 1,
|
||||
"disabled": 0,
|
||||
"prom_for_duration": 0,
|
||||
"prom_ql": "sum(zk_server_leader) by (zk_cluster) > 1",
|
||||
"prom_eval_interval": 15,
|
||||
"enable_stime": "00:00",
|
||||
"enable_etime": "23:59",
|
||||
"enable_days_of_week": [
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"0"
|
||||
],
|
||||
"enable_in_bg": 0,
|
||||
"notify_recovered": 1,
|
||||
"notify_channels": [],
|
||||
"notify_users_obj": [],
|
||||
"notify_repeat_step": 60,
|
||||
"recover_duration": 0,
|
||||
"callbacks": [],
|
||||
"runbook_url": "",
|
||||
"append_tags": []
|
||||
}
|
||||
]
|
File diff suppressed because it is too large
Load Diff
|
@ -33,17 +33,22 @@ var (
|
|||
)
|
||||
|
||||
type Instance struct {
|
||||
Address string `toml:"address"`
|
||||
Timeout int `toml:"timeout"`
|
||||
Labels map[string]string `toml:"labels"`
|
||||
Addresses string `toml:"addresses"`
|
||||
Timeout int `toml:"timeout"`
|
||||
ClusterName string `toml:"cluster_name"`
|
||||
Labels map[string]string `toml:"labels"`
|
||||
tls.ClientConfig
|
||||
}
|
||||
|
||||
func (i *Instance) ZkConnect() (net.Conn, error) {
|
||||
// ZkHosts returns the entries of the Addresses setting, split on whitespace.
// The result is empty when Addresses is blank.
func (i *Instance) ZkHosts() []string {
|
||||
return strings.Fields(i.Addresses)
|
||||
}
|
||||
|
||||
func (i *Instance) ZkConnect(host string) (net.Conn, error) {
|
||||
dialer := net.Dialer{Timeout: time.Duration(i.Timeout) * time.Second}
|
||||
tcpaddr, err := net.ResolveTCPAddr("tcp", i.Address)
|
||||
tcpaddr, err := net.ResolveTCPAddr("tcp", host)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to resolve zookeeper address: %s: %v", i.Address, err)
|
||||
return nil, fmt.Errorf("failed to resolve zookeeper(cluster: %s) address: %s: %v", i.ClusterName, host, err)
|
||||
}
|
||||
|
||||
if !i.UseTLS {
|
||||
|
@ -87,17 +92,23 @@ func (z *Zookeeper) Gather(slist *list.SafeList) {
|
|||
atomic.AddUint64(&z.Counter, 1)
|
||||
for i := range z.Instances {
|
||||
ins := z.Instances[i]
|
||||
z.wg.Add(1)
|
||||
go z.gatherOnce(slist, ins)
|
||||
zkHosts := ins.ZkHosts()
|
||||
if len(zkHosts) == 0 {
|
||||
log.Printf("E! no target zookeeper cluster %s addresses specified", ins.ClusterName)
|
||||
continue
|
||||
}
|
||||
for _, zkHost := range zkHosts {
|
||||
z.wg.Add(1)
|
||||
go z.gatherOnce(slist, ins, zkHost)
|
||||
}
|
||||
}
|
||||
z.wg.Wait()
|
||||
}
|
||||
|
||||
func (z *Zookeeper) gatherOnce(slist *list.SafeList, ins *Instance) {
|
||||
func (z *Zookeeper) gatherOnce(slist *list.SafeList, ins *Instance, zkHost string) {
|
||||
defer z.wg.Done()
|
||||
|
||||
// metrics labels
|
||||
tags := map[string]string{"address": ins.Address, "zk_host": ins.Address}
|
||||
tags := map[string]string{"zk_host": zkHost, "zk_cluster": ins.ClusterName}
|
||||
for k, v := range ins.Labels {
|
||||
tags[k] = v
|
||||
}
|
||||
|
@ -111,21 +122,29 @@ func (z *Zookeeper) gatherOnce(slist *list.SafeList, ins *Instance) {
|
|||
}(begun)
|
||||
|
||||
// zk_up
|
||||
conn, err := ins.ZkConnect()
|
||||
conn, err := ins.ZkConnect(zkHost)
|
||||
if err != nil {
|
||||
slist.PushFront(inputs.NewSample("zk_up", 0, tags))
|
||||
log.Println("E! failed connect to zookeeper:"+ins.Address, "err:", err)
|
||||
log.Println("E! :"+zkHost, "err:", err)
|
||||
return
|
||||
}
|
||||
|
||||
defer conn.Close()
|
||||
|
||||
z.gatherMntrResult(conn, slist, ins, tags)
|
||||
z.gatherRuokResult(conn, slist, ins, tags)
|
||||
|
||||
// zk_ruok
|
||||
ruokConn, err := ins.ZkConnect(zkHost)
|
||||
if err != nil {
|
||||
slist.PushFront(inputs.NewSample("zk_ruok", 0, tags))
|
||||
log.Println("E! :"+zkHost, "err:", err)
|
||||
return
|
||||
}
|
||||
defer ruokConn.Close()
|
||||
z.gatherRuokResult(ruokConn, slist, ins, tags)
|
||||
}
|
||||
|
||||
func (z *Zookeeper) gatherMntrResult(conn net.Conn, slist *list.SafeList, ins *Instance, globalTags map[string]string) {
|
||||
res := sendZookeeperCmd(conn, ins.Address, "mntr")
|
||||
res := sendZookeeperCmd(conn, "mntr")
|
||||
|
||||
// get slice of strings from response, like 'zk_avg_latency 0'
|
||||
lines := strings.Split(res, "\n")
|
||||
|
@ -133,7 +152,7 @@ func (z *Zookeeper) gatherMntrResult(conn net.Conn, slist *list.SafeList, ins *I
|
|||
// 'mntr' command isn't allowed in zk config, log as warning
|
||||
if strings.Contains(lines[0], cmdNotExecutedSffx) {
|
||||
slist.PushFront(inputs.NewSample("zk_up", 0, globalTags))
|
||||
log.Printf(commandNotAllowedTmpl, "mntr", ins.Address)
|
||||
log.Printf(commandNotAllowedTmpl, "mntr", conn.RemoteAddr().String())
|
||||
return
|
||||
}
|
||||
|
||||
|
@ -151,7 +170,7 @@ func (z *Zookeeper) gatherMntrResult(conn net.Conn, slist *list.SafeList, ins *I
|
|||
continue
|
||||
}
|
||||
|
||||
kv := strings.Split(strings.Replace(l, "\t", " ", -1), " ")
|
||||
kv := strings.Fields(l)
|
||||
key := kv[0]
|
||||
value := kv[1]
|
||||
|
||||
|
@ -172,29 +191,35 @@ func (z *Zookeeper) gatherMntrResult(conn net.Conn, slist *list.SafeList, ins *I
|
|||
|
||||
default:
|
||||
var k string
|
||||
k = metricNameReplacer.Replace(key)
|
||||
|
||||
if !isDigit(value) {
|
||||
log.Printf("warning: skipping metric %q which holds not-digit value: %q", key, value)
|
||||
continue
|
||||
}
|
||||
slist.PushFront(inputs.NewSample(k, value, globalTags))
|
||||
k = metricNameReplacer.Replace(key)
|
||||
if strings.Contains(k, "{") {
|
||||
labels := parseLabels(k)
|
||||
slist.PushFront(inputs.NewSample(k, value, globalTags, labels))
|
||||
} else {
|
||||
slist.PushFront(inputs.NewSample(k, value, globalTags))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (z *Zookeeper) gatherRuokResult(conn net.Conn, slist *list.SafeList, ins *Instance, globalTags map[string]string) {
|
||||
res := sendZookeeperCmd(conn, ins.Address, "ruok")
|
||||
res := sendZookeeperCmd(conn, "ruok")
|
||||
if res == "imok" {
|
||||
slist.PushFront(inputs.NewSample("zk_ruok", 1, globalTags))
|
||||
} else {
|
||||
if strings.Contains(res, cmdNotExecutedSffx) {
|
||||
log.Printf(commandNotAllowedTmpl, "ruok", ins.Address)
|
||||
log.Printf(commandNotAllowedTmpl, "ruok", conn.RemoteAddr().String())
|
||||
}
|
||||
slist.PushFront(inputs.NewSample("zk_ruok", 0, globalTags))
|
||||
}
|
||||
}
|
||||
|
||||
func sendZookeeperCmd(conn net.Conn, host, cmd string) string {
|
||||
func sendZookeeperCmd(conn net.Conn, cmd string) string {
|
||||
_, err := conn.Write([]byte(cmd))
|
||||
if err != nil {
|
||||
log.Println("E! failed to exec Zookeeper command:", cmd)
|
||||
|
@ -202,7 +227,7 @@ func sendZookeeperCmd(conn net.Conn, host, cmd string) string {
|
|||
|
||||
res, err := ioutil.ReadAll(conn)
|
||||
if err != nil {
|
||||
log.Printf("E! failed read Zookeeper command: '%s' response from '%s': %s", cmd, host, err)
|
||||
log.Printf("E! failed read Zookeeper command: '%s' response from '%s': %s", cmd, conn.RemoteAddr().String(), err)
|
||||
}
|
||||
return string(res)
|
||||
}
|
||||
|
@ -217,3 +242,23 @@ func isDigit(in string) bool {
|
|||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// Regular expressions used by parseLabels. They are compiled once at package
// initialisation rather than on every call, because parseLabels is invoked for
// each labelled metric line while gathering.
var (
	// mntrLabelSetRE captures everything between the first '{' and the last '}'.
	mntrLabelSetRE = regexp.MustCompile(`{(.*)}`)
	// mntrLabelPairRE captures one key="value" pair; the value keeps its quotes.
	mntrLabelPairRE = regexp.MustCompile(`(.*)\=(\".*\")`)
)

// parseLabels extracts the label set embedded in a metric key such as
// `name{k1="v1",k2="v2"}` and returns it as a map.
//
// The text between the braces is split on commas, and every piece of the form
// key="value" becomes one entry. Values are stored exactly as matched, that
// is, including the surrounding double quotes. Pieces that do not match are
// ignored; note that a comma inside a quoted value also splits the piece.
// An empty, non-nil map is returned when in has no braces or no valid pair.
func parseLabels(in string) map[string]string {
	labels := map[string]string{}

	matched := mntrLabelSetRE.FindStringSubmatch(in)
	if len(matched) < 2 {
		return labels
	}

	for _, pair := range strings.Split(matched[1], ",") {
		// A successful match yields the full match plus the two groups.
		if m := mntrLabelPairRE.FindStringSubmatch(pair); len(m) == 3 {
			labels[m[1]] = m[2]
		}
	}
	return labels
}
|
||||
|
|
Loading…
Reference in New Issue