zk插件新增集群标识

This commit is contained in:
huangchao 2022-06-20 17:11:08 +08:00
parent 2953890391
commit dd9af850ba
5 changed files with 818 additions and 767 deletions

View File

@ -2,7 +2,8 @@
# interval = 15
# [[instances]]
# address = "127.0.0.1:2181"
# cluster_name = "dev-zk-cluster"
# addresses = "127.0.0.1:2181"
# timeout = 10
# important! use global unique string to specify instance

View File

@ -1,6 +1,7 @@
# zookeeper
**注意: >=3.6.0 zookeeper 版本内置 [prometheus 的支持](https://zookeeper.apache.org/doc/current/zookeeperMonitor.html),即,如果 zookeeper 启用了 prometheus,Categraf 可使用 prometheus 插件从这个 metrics 接口拉取数据即可;**
移植于 [dabealu/zookeeper-exporter](https://github.com/dabealu/zookeeper-exporter),原理就是利用 Zookeeper 提供的四字命令(The Four Letter Words)获取监控信息
移植于 [dabealu/zookeeper-exporter](https://github.com/dabealu/zookeeper-exporter),适用于 `<3.6.0` 版本的 zookeeper, 原理就是利用 Zookeeper 提供的四字命令(The Four Letter Words)获取监控信息
需要注意的是,在 zookeeper v3.4.10 以后添加了四字命令白名单,需要在 zookeeper 的配置文件 `zoo.cfg` 中新增白名单配置:
```
@ -9,32 +10,29 @@
## Configuration
zookeeper 插件的配置在 `conf/input.zookeeper/zookeeper.toml`,最简单的配置如下:
zookeeper 插件的配置在 `conf/input.zookeeper/zookeeper.toml`,集群中的多个实例地址请用空格分隔:
```toml
[[instances]]
address = "127.0.0.1:2181"
labels = { instance="n9e-10.23.25.2:2181" }
cluster_name = "dev-zk-cluster"
addresses = "127.0.0.1:2181"
timeout = 10
```
如果要监控多个 zookeeper 实例 (同一集群的多个实例也请分别添加,可在 `labels` 中添加 `cluster_name="xxx"` 来进行标识),就增加 instances 即可:
如果要监控多个 zookeeper 集群,就增加 instances 即可:
```toml
[[instances]]
address = "10.23.25.2:2181"
username = ""
password = ""
labels = { instance="n9e-10.23.25.2:2181" }
cluster_name = "dev-zk-cluster"
addresses = "127.0.0.1:2181"
timeout = 10
[[instances]]
address = "10.23.25.3:2181"
username = ""
password = ""
labels = { instance="n9e-10.23.25.3:2181" }
cluster_name = "test-zk-cluster"
addresses = "127.0.0.1:2181 127.0.0.1:2182 127.0.0.1:2183"
timeout = 10
```
建议通过 labels 配置附加一个 instance 标签,便于后面复用监控大盘。
## 监控大盘和告警规则
该 README 的同级目录下,提供了 dashboard.json 就是监控大盘的配置,alerts.json 是告警规则,可以导入夜莺使用。

View File

@ -1,134 +1,134 @@
[
{
"name": "【Zookeeper】Zookeeper Down",
"note": "",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "zk_up == 0",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_users_obj": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "【Zookeeper】Zookeeper instance is not ok",
"note": "",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"disabled": 0,
"prom_for_duration": 120,
"prom_ql": "zk_ruok == 0",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_users_obj": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "【Zookeeper】Zookeeper 集群不存在 Leader",
"note": "",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "sum(zk_server_leader) == 0",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_users_obj": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "【Zookeeper】Zookeeper 集群存在多个 Leader",
"note": "",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "sum(zk_server_leader) > 1",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_users_obj": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
}
]
{
"name": "【Zookeeper】Zookeeper instance is not ok",
"note": "",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"disabled": 0,
"prom_for_duration": 120,
"prom_ql": "zk_ruok == 0",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_users_obj": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "【Zookeeper】Zookeeper Down",
"note": "",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "zk_up == 0",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_users_obj": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "【Zookeeper】Zookeeper 集群不存在 Leader",
"note": "",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "sum(zk_server_leader) by (zk_cluster) == 0 and count(zk_up) by (zk_cluster) >= 3",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_users_obj": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "【Zookeeper】Zookeeper 集群存在多个 Leader",
"note": "",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "sum(zk_server_leader) by (zk_cluster) > 1",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_users_obj": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
}
]

File diff suppressed because it is too large Load Diff

View File

@ -33,17 +33,22 @@ var (
)
type Instance struct {
Address string `toml:"address"`
Timeout int `toml:"timeout"`
Labels map[string]string `toml:"labels"`
Addresses string `toml:"addresses"`
Timeout int `toml:"timeout"`
ClusterName string `toml:"cluster_name"`
Labels map[string]string `toml:"labels"`
tls.ClientConfig
}
func (i *Instance) ZkConnect() (net.Conn, error) {
// ZkHosts returns the individual host entries configured in Addresses.
// The Addresses string is split on runs of whitespace, so an empty or
// whitespace-only value yields an empty slice.
func (i *Instance) ZkHosts() []string {
	hosts := strings.Fields(i.Addresses)
	return hosts
}
func (i *Instance) ZkConnect(host string) (net.Conn, error) {
dialer := net.Dialer{Timeout: time.Duration(i.Timeout) * time.Second}
tcpaddr, err := net.ResolveTCPAddr("tcp", i.Address)
tcpaddr, err := net.ResolveTCPAddr("tcp", host)
if err != nil {
return nil, fmt.Errorf("failed to resolve zookeeper address: %s: %v", i.Address, err)
return nil, fmt.Errorf("failed to resolve zookeeper(cluster: %s) address: %s: %v", i.ClusterName, host, err)
}
if !i.UseTLS {
@ -87,17 +92,23 @@ func (z *Zookeeper) Gather(slist *list.SafeList) {
atomic.AddUint64(&z.Counter, 1)
for i := range z.Instances {
ins := z.Instances[i]
z.wg.Add(1)
go z.gatherOnce(slist, ins)
zkHosts := ins.ZkHosts()
if len(zkHosts) == 0 {
log.Printf("E! no target zookeeper cluster %s addresses specified", ins.ClusterName)
continue
}
for _, zkHost := range zkHosts {
z.wg.Add(1)
go z.gatherOnce(slist, ins, zkHost)
}
}
z.wg.Wait()
}
func (z *Zookeeper) gatherOnce(slist *list.SafeList, ins *Instance) {
func (z *Zookeeper) gatherOnce(slist *list.SafeList, ins *Instance, zkHost string) {
defer z.wg.Done()
// metrics labels
tags := map[string]string{"address": ins.Address, "zk_host": ins.Address}
tags := map[string]string{"zk_host": zkHost, "zk_cluster": ins.ClusterName}
for k, v := range ins.Labels {
tags[k] = v
}
@ -111,21 +122,29 @@ func (z *Zookeeper) gatherOnce(slist *list.SafeList, ins *Instance) {
}(begun)
// zk_up
conn, err := ins.ZkConnect()
conn, err := ins.ZkConnect(zkHost)
if err != nil {
slist.PushFront(inputs.NewSample("zk_up", 0, tags))
log.Println("E! failed connect to zookeeper:"+ins.Address, "err:", err)
log.Println("E! :"+zkHost, "err:", err)
return
}
defer conn.Close()
z.gatherMntrResult(conn, slist, ins, tags)
z.gatherRuokResult(conn, slist, ins, tags)
// zk_ruok
ruokConn, err := ins.ZkConnect(zkHost)
if err != nil {
slist.PushFront(inputs.NewSample("zk_ruok", 0, tags))
log.Println("E! :"+zkHost, "err:", err)
return
}
defer ruokConn.Close()
z.gatherRuokResult(ruokConn, slist, ins, tags)
}
func (z *Zookeeper) gatherMntrResult(conn net.Conn, slist *list.SafeList, ins *Instance, globalTags map[string]string) {
res := sendZookeeperCmd(conn, ins.Address, "mntr")
res := sendZookeeperCmd(conn, "mntr")
// get slice of strings from response, like 'zk_avg_latency 0'
lines := strings.Split(res, "\n")
@ -133,7 +152,7 @@ func (z *Zookeeper) gatherMntrResult(conn net.Conn, slist *list.SafeList, ins *I
// 'mntr' command isn't allowed in zk config, log as warning
if strings.Contains(lines[0], cmdNotExecutedSffx) {
slist.PushFront(inputs.NewSample("zk_up", 0, globalTags))
log.Printf(commandNotAllowedTmpl, "mntr", ins.Address)
log.Printf(commandNotAllowedTmpl, "mntr", conn.RemoteAddr().String())
return
}
@ -151,7 +170,7 @@ func (z *Zookeeper) gatherMntrResult(conn net.Conn, slist *list.SafeList, ins *I
continue
}
kv := strings.Split(strings.Replace(l, "\t", " ", -1), " ")
kv := strings.Fields(l)
key := kv[0]
value := kv[1]
@ -172,29 +191,35 @@ func (z *Zookeeper) gatherMntrResult(conn net.Conn, slist *list.SafeList, ins *I
default:
var k string
k = metricNameReplacer.Replace(key)
if !isDigit(value) {
log.Printf("warning: skipping metric %q which holds not-digit value: %q", key, value)
continue
}
slist.PushFront(inputs.NewSample(k, value, globalTags))
k = metricNameReplacer.Replace(key)
if strings.Contains(k, "{") {
labels := parseLabels(k)
slist.PushFront(inputs.NewSample(k, value, globalTags, labels))
} else {
slist.PushFront(inputs.NewSample(k, value, globalTags))
}
}
}
}
func (z *Zookeeper) gatherRuokResult(conn net.Conn, slist *list.SafeList, ins *Instance, globalTags map[string]string) {
res := sendZookeeperCmd(conn, ins.Address, "ruok")
res := sendZookeeperCmd(conn, "ruok")
if res == "imok" {
slist.PushFront(inputs.NewSample("zk_ruok", 1, globalTags))
} else {
if strings.Contains(res, cmdNotExecutedSffx) {
log.Printf(commandNotAllowedTmpl, "ruok", ins.Address)
log.Printf(commandNotAllowedTmpl, "ruok", conn.RemoteAddr().String())
}
slist.PushFront(inputs.NewSample("zk_ruok", 0, globalTags))
}
}
func sendZookeeperCmd(conn net.Conn, host, cmd string) string {
func sendZookeeperCmd(conn net.Conn, cmd string) string {
_, err := conn.Write([]byte(cmd))
if err != nil {
log.Println("E! failed to exec Zookeeper command:", cmd)
@ -202,7 +227,7 @@ func sendZookeeperCmd(conn net.Conn, host, cmd string) string {
res, err := ioutil.ReadAll(conn)
if err != nil {
log.Printf("E! failed read Zookeeper command: '%s' response from '%s': %s", cmd, host, err)
log.Printf("E! failed read Zookeeper command: '%s' response from '%s': %s", cmd, conn.RemoteAddr().String(), err)
}
return string(res)
}
@ -217,3 +242,23 @@ func isDigit(in string) bool {
}
return true
}
// Regular expressions used by parseLabels. They are compiled once at package
// initialisation rather than on every call.
var (
	// zkLabelSetRE captures the text between the first '{' and the last '}'.
	zkLabelSetRE = regexp.MustCompile(`{(.*)}`)
	// zkLabelPairRE captures a name and a double-quoted value separated by '='.
	// The value capture group includes the surrounding double quotes.
	zkLabelPairRE = regexp.MustCompile(`(.*)\=(\".*\")`)
)

// parseLabels extracts the label set embedded in a metric name of the form
// `name{key1="a",key2="b"}` and returns it as a map.
//
// The text inside the braces is split on "," and each piece is matched as
// key="value". Pieces that do not match are skipped. The returned values keep
// their surrounding double quotes, exactly as they appear in the input.
// When the input contains no brace-delimited section, an empty (non-nil) map
// is returned.
func parseLabels(in string) map[string]string {
	labels := map[string]string{}

	matched := zkLabelSetRE.FindStringSubmatch(in)
	if len(matched) < 2 {
		return labels
	}

	for _, pair := range strings.Split(matched[1], ",") {
		m := zkLabelPairRE.FindStringSubmatch(pair)
		if len(m) != 3 {
			// Not a key="value" pair; ignore it.
			continue
		}
		labels[m[1]] = m[2]
	}
	return labels
}