采集zookeeper监控数据

This commit is contained in:
huangchao 2022-06-14 18:09:22 +08:00
parent 80a68edd1a
commit ff879d7d75
6 changed files with 1009 additions and 0 deletions

View File

@ -41,6 +41,7 @@ import (
_ "flashcat.cloud/categraf/inputs/switch_legacy"
_ "flashcat.cloud/categraf/inputs/system"
_ "flashcat.cloud/categraf/inputs/tomcat"
_ "flashcat.cloud/categraf/inputs/zookeeper"
)
const inputFilePrefix = "input."

View File

@ -0,0 +1,18 @@
# # collect interval
# interval = 15
# [[instances]]
# address = "127.0.0.1:2181"
# timeout = 10
# important! use a globally unique string to identify the instance
# labels = { instance="n9e-10.2.3.4:2181" }
## Optional TLS Config
# use_tls = false
# tls_min_version = "1.2"
# tls_ca = "/etc/categraf/ca.pem"
# tls_cert = "/etc/categraf/cert.pem"
# tls_key = "/etc/categraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = true

View File

@ -0,0 +1,41 @@
# zookeeper
移植于 [dabealu/zookeeper-exporter](https://github.com/dabealu/zookeeper-exporter),原理就是利用 Zookeeper 提供的四字命令(The Four Letter Words)获取监控信息。
需要注意的是,在 zookeeper v3.4.10 以后添加了四字命令白名单,需要在 zookeeper 的配置文件 `zoo.cfg` 中新增白名单配置:
```
4lw.commands.whitelist=mntr,ruok
```
## Configuration
zookeeper 插件的配置在 `conf/input.zookeeper/zookeeper.toml`,最简单的配置如下:
```toml
[[instances]]
address = "127.0.0.1:2181"
labels = { instance="n9e-10.23.25.2:2181" }
```
如果要监控多个 zookeeper 实例,就增加 instances 即可:
```toml
[[instances]]
address = "10.23.25.2:2181"
labels = { instance="n9e-10.23.25.2:2181" }
[[instances]]
address = "10.23.25.3:2181"
labels = { instance="n9e-10.23.25.3:2181" }
```
建议通过 labels 配置附加一个 instance 标签,便于后面复用监控大盘。
## 监控大盘和告警规则
该 README 的同级目录下,提供了 dashboard.json 和 alerts.json:dashboard.json 是监控大盘的配置,alerts.json 是告警规则,可以导入夜莺使用。

View File

@ -0,0 +1,134 @@
[
{
"name": "【Zookeeper】Zookeeper Down",
"note": "",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "zk_up == 0",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_users_obj": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "【Zookeeper】Zookeeper instance is not ok",
"note": "",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"disabled": 0,
"prom_for_duration": 120,
"prom_ql": "zk_ruok == 0",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_users_obj": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "【Zookeeper】Zookeeper 集群不存在 Leader",
"note": "",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "sum(zk_server_leader) == 0",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_users_obj": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "【Zookeeper】Zookeeper 集群存在多个 Leader",
"note": "",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 1,
"disabled": 0,
"prom_for_duration": 0,
"prom_ql": "sum(zk_server_leader) > 1",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_users_obj": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
}
]

View File

@ -0,0 +1,596 @@
{
"name": "Zookeeper",
"tags": "",
"configs": {
"var": [
{
"definition": "label_values(zk_up,instance)",
"name": "instance"
}
],
"panels": [
{
"type": "row",
"id": "204ed80c-88a7-4075-90bf-0dce6f319caa",
"name": "分组",
"collapsed": true,
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 0,
"i": "204ed80c-88a7-4075-90bf-0dce6f319caa",
"isResizable": false
}
},
{
"targets": [
{
"refId": "A",
"expr": "zk_up{instance=~\"$instance\"}"
}
],
"name": "Status",
"custom": {
"textMode": "value",
"colorMode": "value",
"calc": "lastNotNull",
"colSpan": 1,
"textSize": {}
},
"options": {
"valueMappings": [
{
"type": "special",
"match": {
"special": 1
},
"result": {
"text": "Up",
"color": "#7ed321"
}
},
{
"type": "special",
"match": {
"special": 0
},
"result": {
"text": "Down",
"color": "#d0021b"
}
}
],
"standardOptions": {}
},
"version": "2.0.0",
"type": "stat",
"layout": {
"h": 4,
"w": 4,
"x": 0,
"y": 1,
"i": "ef4804c3-5eb2-43b6-af4f-35cad5114e7e",
"isResizable": true
},
"id": "ef4804c3-5eb2-43b6-af4f-35cad5114e7e"
},
{
"targets": [
{
"refId": "A",
"expr": "zk_server_leader{instance=~\"$instance\"}"
}
],
"name": "Is Leader",
"custom": {
"textMode": "value",
"colorMode": "value",
"calc": "lastNotNull",
"colSpan": 1,
"textSize": {}
},
"options": {
"valueMappings": [
{
"type": "special",
"match": {
"special": 1
},
"result": {
"text": "Yes",
"color": "#7ed321"
}
},
{
"type": "special",
"match": {
"special": 0
},
"result": {
"text": "No",
"color": "#d0021b"
}
}
],
"standardOptions": {}
},
"version": "2.0.0",
"type": "stat",
"layout": {
"h": 4,
"w": 4,
"x": 4,
"y": 1,
"i": "3f1d1548-fc5b-4188-bf72-d5fea7c682ca",
"isResizable": true
},
"id": "f73b13a6-62d2-4b7d-9448-b0c4cb0d5144"
},
{
"targets": [
{
"refId": "A",
"expr": "zk_ephemerals_count{instance=~\"$instance\"}"
}
],
"name": "Ephemerals Count",
"custom": {
"textMode": "value",
"colorMode": "value",
"calc": "lastNotNull",
"colSpan": 1,
"textSize": {}
},
"options": {
"valueMappings": [],
"standardOptions": {}
},
"version": "2.0.0",
"type": "stat",
"layout": {
"h": 4,
"w": 4,
"x": 8,
"y": 1,
"i": "78ff9d1f-8d3c-440e-9fd7-4040575eddf9",
"isResizable": true
},
"id": "d55c0555-b3fa-466d-a380-4a2a98af3431"
},
{
"targets": [
{
"refId": "A",
"expr": "zk_znode_count{instance=~\"$instance\"}"
}
],
"name": "Znode Count",
"custom": {
"textMode": "value",
"colorMode": "value",
"calc": "lastNotNull",
"colSpan": 1,
"textSize": {}
},
"options": {
"valueMappings": [],
"standardOptions": {}
},
"version": "2.0.0",
"type": "stat",
"layout": {
"h": 4,
"w": 4,
"x": 12,
"y": 1,
"i": "612b69b8-346a-419e-bb22-1d372535bac8",
"isResizable": true
},
"id": "15c3ddc5-a30f-4e32-904f-4590494ee11b"
},
{
"targets": [
{
"refId": "A",
"expr": "zk_watch_count{instance=~\"$instance\"}"
}
],
"name": "Watch Count",
"custom": {
"textMode": "value",
"colorMode": "value",
"calc": "lastNotNull",
"colSpan": 1,
"textSize": {}
},
"options": {
"valueMappings": [],
"standardOptions": {}
},
"version": "2.0.0",
"type": "stat",
"layout": {
"h": 4,
"w": 4,
"x": 16,
"y": 1,
"i": "a2417ea6-e3d7-41cd-9985-d6ea5db43217",
"isResizable": true
},
"id": "3b903da1-c9a4-4a87-b0a0-afd3defe4c15"
},
{
"targets": [
{
"refId": "A",
"expr": "zk_version{instance=~\"$instance\"}"
}
],
"name": "Version",
"custom": {
"showHeader": true,
"calc": "lastNotNull",
"displayMode": "labelsOfSeriesToRows",
"columns": [
"zk_host",
"version"
]
},
"options": {
"valueMappings": [],
"standardOptions": {}
},
"overrides": [
{}
],
"version": "2.0.0",
"type": "table",
"layout": {
"h": 4,
"w": 4,
"x": 20,
"y": 1,
"i": "4cc8bad9-b441-4d10-abb3-7d50bb624967",
"isResizable": true
},
"id": "8a3cf9d0-bb26-4b67-8fe6-c9d76e5eb618"
},
{
"targets": [
{
"refId": "A",
"expr": "rate(zk_packets_sent{instance=~\"$instance\"}[5m])",
"legend": "{{ instance }}"
}
],
"name": "Packets Sent",
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"fillOpacity": 0.5,
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 4,
"w": 12,
"x": 0,
"y": 5,
"i": "9dcf936f-72c1-434b-af93-369c9c991bb2",
"isResizable": true
},
"id": "9dcf936f-72c1-434b-af93-369c9c991bb2"
},
{
"targets": [
{
"refId": "A",
"expr": "rate(zk_packets_received{instance=~\"$instance\"}[5m])",
"legend": "{{ instance }}"
}
],
"name": "Packets Received",
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"fillOpacity": 0.5,
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 4,
"w": 12,
"x": 12,
"y": 5,
"i": "bf87aba7-3d5b-427c-a0cf-426c65fbecae",
"isResizable": true
},
"id": "e085001a-f693-4723-958a-b910843e0339"
},
{
"targets": [
{
"refId": "A",
"expr": "zk_num_alive_connections{instance=~\"$instance\"}",
"legend": "{{ instance }}"
}
],
"name": "Alive Connections",
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"fillOpacity": 0.5,
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 4,
"w": 12,
"x": 0,
"y": 9,
"i": "f7043b0a-a853-4e4c-aec3-46e2dcf52586",
"isResizable": true
},
"id": "5e2183f9-6277-43f9-b9be-6dbaa35cb582"
},
{
"targets": [
{
"refId": "A",
"expr": "zk_open_file_descriptor_count{instance=~\"$instance\"}",
"legend": "{{ instance }}"
}
],
"name": "File Descriptors",
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"fillOpacity": 0.5,
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 4,
"w": 12,
"x": 12,
"y": 9,
"i": "d298a41a-26e9-46d7-b9e4-7497d1f9ef7d",
"isResizable": true
},
"id": "ba3aebdc-5982-4b89-82be-c28d03776c0f"
},
{
"targets": [
{
"refId": "A",
"expr": "zk_avg_latency{instance=~\"$instance\"}",
"legend": "{{ instance }}"
}
],
"name": "Avg Latency",
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "seconds"
},
"thresholds": {}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"fillOpacity": 0.5,
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 4,
"w": 12,
"x": 0,
"y": 13,
"i": "97b6b491-6d71-4e54-8ac5-9c1214f5b42e",
"isResizable": true
},
"id": "a2e8a9bc-6b09-40d9-80c1-1dc0f0cbd5e2"
},
{
"targets": [
{
"refId": "A",
"expr": "zk_min_latency{instance=~\"$instance\"}",
"legend": "{{ instance }}"
}
],
"name": "Min Latency(seconds)",
"custom": {
"textMode": "value",
"colorMode": "value",
"calc": "lastNotNull",
"colSpan": 1,
"textSize": {}
},
"options": {
"standardOptions": {}
},
"version": "2.0.0",
"type": "stat",
"layout": {
"h": 4,
"w": 6,
"x": 12,
"y": 13,
"i": "6abdaef3-9985-4325-a563-929f515ddbbd",
"isResizable": true
},
"id": "7eb7f2f2-7319-42b2-9fa1-2868fa490eaf"
},
{
"targets": [
{
"refId": "A",
"expr": "zk_max_latency{instance=~\"$instance\"}",
"legend": "{{ instance }}"
}
],
"name": "Max Latency(seconds)",
"custom": {
"textMode": "value",
"colorMode": "value",
"calc": "lastNotNull",
"colSpan": 1,
"textSize": {}
},
"options": {
"standardOptions": {}
},
"version": "2.0.0",
"type": "stat",
"layout": {
"h": 4,
"w": 6,
"x": 18,
"y": 13,
"i": "b2c368fa-f61c-4ddd-a7d8-d214ec67182d",
"isResizable": true
},
"id": "524ca86b-2854-4ed4-a3bc-a506ae7763eb"
},
{
"targets": [
{
"refId": "A",
"expr": "zk_outstanding_requests{instance=~\"$instance\"}",
"legend": "{{ instance }}"
}
],
"name": "Outstanding Requests",
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "seconds"
},
"thresholds": {}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"fillOpacity": 0.5,
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 4,
"w": 12,
"x": 0,
"y": 17,
"i": "eb83dd45-98e7-4d76-94c3-24681b4957a8",
"isResizable": true
},
"id": "e24f9e2b-ce0a-4cf2-bf0d-bead1df222b2"
},
{
"targets": [
{
"refId": "A",
"expr": "zk_approximate_data_size{instance=~\"$instance\"}",
"legend": "{{ instance }}"
}
],
"name": "Approx Data Size",
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "bytesIEC"
},
"thresholds": {}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"fillOpacity": 0.5,
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 4,
"w": 12,
"x": 12,
"y": 17,
"i": "a1ea51d8-9b26-4eb8-8be4-97acf92f7ca2",
"isResizable": true
},
"id": "c94ba892-2af9-4a31-9f0f-d7e7786c7530"
}
],
"version": "2.0.0"
}
}

View File

@ -0,0 +1,219 @@
package zookeeper
import (
crypto_tls "crypto/tls"
"fmt"
"io/ioutil"
"log"
"net"
"regexp"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"flashcat.cloud/categraf/config"
"flashcat.cloud/categraf/inputs"
"flashcat.cloud/categraf/pkg/tls"
"flashcat.cloud/categraf/types"
"github.com/toolkits/pkg/container/list"
)
// Constants shared by the zookeeper input.
const (
// inputName is the name under which this input is registered in init.
inputName = "zookeeper"
// commandNotAllowedTmpl is the log format used when a four-letter command is
// rejected by the server; its two %q verbs receive the command name and the
// instance address.
commandNotAllowedTmpl = "warning: %q command isn't allowed at %q, see '4lw.commands.whitelist' ZK config parameter"
// instanceNotServingMessage is compared against the first line of the "mntr"
// response to detect an instance that is not serving requests.
instanceNotServingMessage = "This ZooKeeper instance is not currently serving requests"
// cmdNotExecutedSffx is the substring searched for in a response to detect
// that the command was refused because it is not whitelisted.
cmdNotExecutedSffx = "is not executed because it is not in the whitelist."
)
// Package-level helpers compiled/built once and reused for every scrape.
var (
// versionRE captures a leading "major.minor.patch" number and matches the
// rest of the string, so replacing with "$1" keeps only the version number.
versionRE = regexp.MustCompile(`^([0-9]+\.[0-9]+\.[0-9]+).*$`)
// metricNameReplacer rewrites "-" and "." to "_" in metric keys.
metricNameReplacer = strings.NewReplacer("-", "_", ".", "_")
)
// Instance describes one ZooKeeper server to scrape.
type Instance struct {
// Address is the server address; it is resolved with net.ResolveTCPAddr and
// also attached to every sample as the "address" and "zk_host" tags.
Address string `toml:"address"`
// Timeout is the dial timeout, in seconds.
Timeout int `toml:"timeout"`
// Labels are extra tags merged into the tags of every sample of this instance.
Labels map[string]string `toml:"labels"`
// ClientConfig is embedded; ZkConnect reads its UseTLS flag and calls its
// TLSConfig method when TLS is enabled.
tls.ClientConfig
}
// ZkConnect resolves the instance address and opens a connection to it, using
// Timeout (seconds) as the dial timeout. When UseTLS is set the connection is
// established with the TLS configuration built by TLSConfig; otherwise a plain
// TCP connection is returned.
//
// On failure the returned connection is always a literal nil, so callers may
// safely compare it against nil.
func (i *Instance) ZkConnect() (net.Conn, error) {
	tcpaddr, err := net.ResolveTCPAddr("tcp", i.Address)
	if err != nil {
		return nil, fmt.Errorf("failed to resolve zookeeper address: %s: %w", i.Address, err)
	}

	dialer := net.Dialer{Timeout: time.Duration(i.Timeout) * time.Second}
	if !i.UseTLS {
		return dialer.Dial("tcp", tcpaddr.String())
	}

	tlsConfig, err := i.TLSConfig()
	if err != nil {
		return nil, fmt.Errorf("failed to init tls config: %w", err)
	}

	// DialWithDialer returns a concrete *tls.Conn. Returning it directly on
	// error would wrap a nil pointer in a non-nil net.Conn interface, so the
	// error path returns an explicit nil instead.
	tlsConn, err := crypto_tls.DialWithDialer(&dialer, "tcp", tcpaddr.String(), tlsConfig)
	if err != nil {
		return nil, err
	}
	return tlsConn, nil
}
// Zookeeper is the input plugin type registered under inputName.
type Zookeeper struct {
// Interval is embedded from the config package.
config.Interval
// Instances lists the ZooKeeper servers to scrape; Init rejects an empty list.
Instances []*Instance `toml:"instances"`
// Counter is incremented atomically once per Gather call.
Counter uint64
// wg tracks the per-instance goroutines started by Gather.
wg sync.WaitGroup
}
// init registers the zookeeper input under inputName with a factory that
// returns a fresh, zero-valued Zookeeper.
func init() {
	factory := func() inputs.Input {
		return new(Zookeeper)
	}
	inputs.Add(inputName, factory)
}
// Prefix returns the empty string.
// NOTE(review): presumably the metric-name prefix requested by the inputs
// framework (metric names here already start with "zk_") — confirm against
// the inputs.Input interface.
func (z *Zookeeper) Prefix() string {
return ""
}
// Init validates the configuration: it returns types.ErrInstancesEmpty when no
// instance is configured and nil otherwise.
func (z *Zookeeper) Init() error {
	if len(z.Instances) > 0 {
		return nil
	}
	return types.ErrInstancesEmpty
}
// Drop is a no-op: this input holds nothing that needs releasing.
func (z *Zookeeper) Drop() {}
// Gather bumps the call counter, scrapes every configured instance in its own
// goroutine and returns once all of them have finished pushing samples to
// slist.
func (z *Zookeeper) Gather(slist *list.SafeList) {
	atomic.AddUint64(&z.Counter, 1)

	for _, instance := range z.Instances {
		z.wg.Add(1)
		// Arguments of a go statement are evaluated immediately, so each
		// goroutine receives its own instance pointer.
		go z.gatherOnce(slist, instance)
	}

	z.wg.Wait()
}
// gatherOnce scrapes a single instance and pushes its samples to slist. It
// always emits zk_scrape_use_seconds, and signals z.wg when done.
//
// ZooKeeper closes the socket after answering a four-letter command, so each
// command ("mntr", then "ruok") is sent over its own connection. When the
// instance has a positive Timeout, it is also applied as an I/O deadline on
// every connection so that a stalled server cannot block Gather forever.
func (z *Zookeeper) gatherOnce(slist *list.SafeList, ins *Instance) {
	defer z.wg.Done()

	// metrics labels: address tags first, user labels may override them
	tags := map[string]string{"address": ins.Address, "zk_host": ins.Address}
	for k, v := range ins.Labels {
		tags[k] = v
	}

	// scrape use seconds
	defer func(begun time.Time) {
		use := time.Since(begun).Seconds()
		slist.PushFront(inputs.NewSample("zk_scrape_use_seconds", use, tags))
	}(time.Now())

	// connect dials the instance and bounds all reads/writes on the new
	// connection by the configured timeout.
	connect := func() (net.Conn, error) {
		conn, err := ins.ZkConnect()
		if err != nil {
			return nil, err
		}
		if ins.Timeout > 0 {
			deadline := time.Now().Add(time.Duration(ins.Timeout) * time.Second)
			if err := conn.SetDeadline(deadline); err != nil {
				conn.Close()
				return nil, err
			}
		}
		return conn, nil
	}

	// zk_up and the mntr metrics
	mntrConn, err := connect()
	if err != nil {
		slist.PushFront(inputs.NewSample("zk_up", 0, tags))
		log.Println("E! failed connect to zookeeper:"+ins.Address, "err:", err)
		return
	}
	z.gatherMntrResult(mntrConn, slist, ins, tags)
	mntrConn.Close()

	// zk_ruok needs a fresh connection: the server has closed the previous one
	ruokConn, err := connect()
	if err != nil {
		slist.PushFront(inputs.NewSample("zk_ruok", 0, tags))
		log.Println("E! failed connect to zookeeper:"+ins.Address, "err:", err)
		return
	}
	defer ruokConn.Close()
	z.gatherRuokResult(ruokConn, slist, ins, tags)
}
// gatherMntrResult sends the "mntr" command over conn and converts the reply
// into samples pushed to slist, all tagged with globalTags.
//
// It emits zk_up=0 when the reply is empty or the command is not whitelisted,
// and zk_up=1 otherwise. Each "key value" line becomes a sample named after the
// key (with "-" and "." replaced by "_"), except:
//   - zk_server_state -> zk_server_leader (1 for "leader", 0 otherwise)
//   - zk_version      -> zk_version=1 with a "version" tag
//   - zk_peer_state   -> zk_peer_state=1 with a "state" tag
//
// Lines without a value and non-numeric values are logged and skipped.
func (z *Zookeeper) gatherMntrResult(conn net.Conn, slist *list.SafeList, ins *Instance, globalTags map[string]string) {
	res := sendZookeeperCmd(conn, ins.Address, "mntr")

	// nothing came back (write/read failure or the peer closed early): the
	// instance cannot be considered up
	if strings.TrimSpace(res) == "" {
		slist.PushFront(inputs.NewSample("zk_up", 0, globalTags))
		log.Printf("E! empty response to %q command from %q", "mntr", ins.Address)
		return
	}

	// get slice of strings from response, like 'zk_avg_latency 0'
	lines := strings.Split(res, "\n")

	// 'mntr' command isn't allowed in zk config, log as warning
	if strings.Contains(lines[0], cmdNotExecutedSffx) {
		slist.PushFront(inputs.NewSample("zk_up", 0, globalTags))
		log.Printf(commandNotAllowedTmpl, "mntr", ins.Address)
		return
	}

	slist.PushFront(inputs.NewSample("zk_up", 1, globalTags))

	// skip instance if it is in a leader-only state and isn't serving client requests
	if strings.TrimSpace(lines[0]) == instanceNotServingMessage {
		slist.PushFront(inputs.NewSample("zk_server_leader", 1, globalTags))
		return
	}

	// split each line into key-value pair
	for _, l := range lines {
		// Fields splits on any run of tabs/spaces, so a line without a value
		// yields fewer than two fields instead of causing an out-of-range index
		fields := strings.Fields(l)
		if len(fields) == 0 {
			continue
		}
		if len(fields) < 2 {
			log.Printf("warning: skipping malformed mntr line from %q: %q", ins.Address, l)
			continue
		}
		key, value := fields[0], fields[1]

		switch key {
		case "zk_server_state":
			if value == "leader" {
				slist.PushFront(inputs.NewSample("zk_server_leader", 1, globalTags))
			} else {
				slist.PushFront(inputs.NewSample("zk_server_leader", 0, globalTags))
			}

		case "zk_version":
			// keep only the leading x.y.z part of the version string
			version := versionRE.ReplaceAllString(value, "$1")
			slist.PushFront(inputs.NewSample("zk_version", 1, globalTags, map[string]string{"version": version}))

		case "zk_peer_state":
			slist.PushFront(inputs.NewSample("zk_peer_state", 1, globalTags, map[string]string{"state": value}))

		default:
			if !isDigit(value) {
				log.Printf("warning: skipping metric %q which holds not-digit value: %q", key, value)
				continue
			}
			slist.PushFront(inputs.NewSample(metricNameReplacer.Replace(key), value, globalTags))
		}
	}
}
// gatherRuokResult sends the "ruok" command over conn and pushes zk_ruok to
// slist: 1 when the reply is exactly "imok", 0 for anything else. A reply
// saying the command is not whitelisted is additionally logged as a warning.
func (z *Zookeeper) gatherRuokResult(conn net.Conn, slist *list.SafeList, ins *Instance, globalTags map[string]string) {
	reply := sendZookeeperCmd(conn, ins.Address, "ruok")

	ok := 0
	switch {
	case reply == "imok":
		ok = 1
	case strings.Contains(reply, cmdNotExecutedSffx):
		log.Printf(commandNotAllowedTmpl, "ruok", ins.Address)
	}

	slist.PushFront(inputs.NewSample("zk_ruok", ok, globalTags))
}
// sendZookeeperCmd writes the four-letter command cmd to conn and returns
// everything the peer sends back until it closes the connection.
//
// host is only used in log messages. Errors are logged, not returned: a failed
// write yields an empty string (no read is attempted, since no reply can be
// expected), and a failed read yields whatever was received before the error.
func sendZookeeperCmd(conn net.Conn, host, cmd string) string {
	if _, err := conn.Write([]byte(cmd)); err != nil {
		log.Printf("E! failed to exec Zookeeper command: '%s' on '%s': %s", cmd, host, err)
		return ""
	}

	res, err := ioutil.ReadAll(conn)
	if err != nil {
		log.Printf("E! failed read Zookeeper command: '%s' response from '%s': %s", cmd, host, err)
	}
	return string(res)
}
func isDigit(in string) bool {
// check input is an int
if _, err := strconv.Atoi(in); err != nil {
// not int, try float
if _, err := strconv.ParseFloat(in, 64); err != nil {
return false
}
}
return true
}