add kafka alerts and dashboard conf

This commit is contained in:
yushuangyu 2022-07-04 15:37:31 +08:00
parent f3d496099a
commit 870b6be01a
7 changed files with 536 additions and 17 deletions

View File

@ -101,7 +101,7 @@ Click on the links to see the README of each plugin.
- [ ] mongodb
- [ ] rocketmq
- [ ] activemq
- [ ] kafka
- [x] [kafka](inputs/kafka)
- [x] [elasticsearch](inputs/elasticsearch)
- [x] windows
- [ ] mssql
@ -117,12 +117,12 @@ Click on the links to see the README of each plugin.
- [ ] ipmi
- [ ] smartctl
- [ ] logging
- [ ] trace
- [x] [traces](traces)
## Thanks
Categraf is developed on the basis of Telegraf and Exporters. Thanks to the great open source community.
Categraf is developed on the basis of Telegraf, Exporters and OpenTelemetry. Thanks to the great open source community.
## Community

View File

@ -8,5 +8,78 @@
# interval_times = 1
# append some labels to metrics
labels = { cluster="cloud-n9e-kafka" }
kafka_uris = ["127.0.0.1:9092","127.0.0.1:9092","127.0.0.1:9092"]
# instance is a preferred tag with the cluster name. If none is provided, the first of kafka_uris will be used
labels = { instance="kafka-cluster-01" }
# log level only for kafka exporter
log_level = "error"
# Address (host:port) of Kafka server.
kafka_uris = ["127.0.0.1:9092","127.0.0.1:9092","127.0.0.1:9092"]
# Connect using SASL/PLAIN
# Default is false
# use_sasl = false
# Only set this to false if using a non-Kafka SASL proxy
# Default is true
# use_sasl_handshake = false
# SASL user name
# sasl_username = "username"
# SASL user password
# sasl_password = "password"
# The SASL SCRAM SHA algorithm sha256 or sha512 as mechanism
# sasl_mechanism = ""
# Connect using TLS
# use_tls = false
# The optional certificate authority file for TLS client authentication
# ca_file = ""
# The optional certificate file for TLS client authentication
# cert_file = ""
# The optional key file for TLS client authentication
# key_file = ""
# If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure
# insecure_skip_verify = true
# Kafka broker version
# Default is 2.0.0
# kafka_version = "2.0.0"
# if you need to use a group from zookeeper
# Default is false
# use_zookeeper_lag = false
# Address array (hosts) of zookeeper server.
# zookeeper_uris = []
# Metadata refresh interval
# Default is 1m
# metadata_refresh_interval = "1m"
# If true, every scrape triggers its own Kafka operations; otherwise scrapes share results. WARN: This should be disabled on large clusters
# Default is false
# allow_concurrency = false
# Maximum number of offsets to store in the interpolation table for a partition
# Default is 1000
# max_offsets = 1000
# How frequently should the interpolation table be pruned, in seconds.
# Default is 30
# prune_interval_seconds = 30
# Regex filter for topics to be monitored
# Default is ".*"
# topics_filter_regex = ".*"
# Regex filter for consumer groups to be monitored
# Default is ".*"
# groups_filter_regex = ".*"

View File

@ -1,18 +1,11 @@
# kafka
kafka 监控采集插件,封装kafka-exporterhttps://github.com/davidmparrott/kafka_exporter而来
kafka 监控采集插件,由 [kafka-exporter](https://github.com/davidmparrott/kafka_exporter) 封装而来
## Configuration
```toml
# # collect interval
# interval = 15
# 要监控 MySQL首先要给出要监控的MySQL的连接地址、用户名、密码
[[instances]]
```
请参考配置[示例](../../conf/input.kafka/kafka.toml)
## 监控大盘和告警规则
本 README 的同级目录,大家可以看到 dashboard.json 就是监控大盘导入夜莺就可以使用alerts.json 是告警规则,也是导入夜莺就可以使用。
同级目录下的 dashboard.json、alerts.json 可以直接导入夜莺使用。

View File

@ -0,0 +1,72 @@
[
{
"name": "数据有丢失风险-副本数小于3",
"note": "",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "sum(kafka_topic_partition_in_sync_replica) by (topic) < 3",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"service=kafka"
]
},
{
"name": "消费能力不足-延迟超过5分钟",
"note": "",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "kafka_consumer_lag_millis / 1000 > 300",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"service=kafka"
]
}
]

View File

@ -0,0 +1,360 @@
{
"name": "Kafka - 模板",
"tags": "Kafka Prometheus ",
"configs": {
"var": [
{
"name": "instance",
"definition": "label_values(kafka_brokers, instance)",
"type": "query"
}
],
"panels": [
{
"id": "6fac3216-f9e2-45c5-8037-959ab3c98de5",
"type": "row",
"name": "overview",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 0,
"i": "6fac3216-f9e2-45c5-8037-959ab3c98de5",
"isResizable": false
},
"collapsed": true,
"panels": []
},
{
"targets": [
{
"refId": "A",
"expr": "kafka_brokers{instance=\"$instance\"}"
}
],
"name": "brokers",
"custom": {
"textMode": "value",
"colorMode": "value",
"calc": "lastNotNull",
"colSpan": 1,
"textSize": {
"value": 50
}
},
"options": {
"standardOptions": {}
},
"version": "2.0.0",
"type": "stat",
"layout": {
"h": 3,
"w": 6,
"x": 0,
"y": 1,
"i": "b10ab025-2795-4ea1-b537-d03948324ea8",
"isResizable": true
},
"id": "b10ab025-2795-4ea1-b537-d03948324ea8"
},
{
"targets": [
{
"refId": "A",
"expr": "count(count by (topic) (kafka_topic_partitions{instance=\"$instance\"}))"
}
],
"name": "topics",
"custom": {
"textMode": "value",
"colorMode": "value",
"calc": "lastNotNull",
"colSpan": 1,
"textSize": {
"value": 50
}
},
"options": {
"standardOptions": {}
},
"version": "2.0.0",
"type": "stat",
"layout": {
"h": 3,
"w": 6,
"x": 6,
"y": 1,
"i": "8845f449-cb6c-4fa6-9930-351f106f9e52",
"isResizable": true
},
"id": "8845f449-cb6c-4fa6-9930-351f106f9e52"
},
{
"targets": [
{
"refId": "A",
"expr": "sum(kafka_topic_partitions{instance=\"$instance\"})",
"legend": ""
}
],
"name": "partitions",
"custom": {
"textMode": "value",
"colorMode": "value",
"calc": "lastNotNull",
"colSpan": 1,
"textSize": {
"value": 50
}
},
"options": {
"standardOptions": {}
},
"version": "2.0.0",
"type": "stat",
"layout": {
"h": 3,
"w": 6,
"x": 12,
"y": 1,
"i": "cc26ea7b-8860-45cd-9f62-90f42bd195f5",
"isResizable": true
},
"id": "cc26ea7b-8860-45cd-9f62-90f42bd195f5"
},
{
"targets": [
{
"refId": "A",
"expr": "sum(kafka_topic_partition_replicas{instance=~\"$instance\"})"
}
],
"name": "Replicas",
"custom": {
"textMode": "valueAndName",
"colorMode": "value",
"calc": "lastNotNull",
"colSpan": 1,
"textSize": {}
},
"options": {
"standardOptions": {}
},
"version": "2.0.0",
"type": "stat",
"layout": {
"h": 3,
"w": 6,
"x": 18,
"y": 1,
"i": "98cd9cee-69ad-4533-9eed-e307a24fffa6",
"isResizable": true
},
"id": "98cd9cee-69ad-4533-9eed-e307a24fffa6"
},
{
"id": "79a8e48a-fdf0-4c7e-bae4-478f7b294751",
"type": "row",
"name": "throughput",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 4,
"i": "79a8e48a-fdf0-4c7e-bae4-478f7b294751",
"isResizable": false
}
},
{
"targets": [
{
"expr": "sum(rate(kafka_topic_partition_current_offset{instance=\"$instance\"}[1m])) by (topic)"
}
],
"name": "Messages produced per second",
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"fillOpacity": 0.5,
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 7,
"w": 8,
"x": 0,
"y": 5,
"i": "3ceedd68-54d0-44db-9390-ceb2299619e5",
"isResizable": true
},
"id": "3ceedd68-54d0-44db-9390-ceb2299619e5"
},
{
"targets": [
{
"expr": "sum(rate(kafka_consumergroup_current_offset{instance=\"$instance\"}[1m])) by (topic)"
}
],
"name": "Messages consumed per second",
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"fillOpacity": 0.5,
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 7,
"w": 8,
"x": 8,
"y": 5,
"i": "a43a7752-00e6-41fb-9055-e04d17e22d99",
"isResizable": true
},
"id": "a43a7752-00e6-41fb-9055-e04d17e22d99"
},
{
"targets": [
{
"expr": "sum(kafka_consumer_lag_millis{instance=\"$instance\"}) by (consumergroup, topic) ",
"legend": "{{consumergroup}} (topic: {{topic}})"
}
],
"name": "Latency by Consumer Group",
"options": {
"tooltip": {
"mode": "all",
"sort": "desc"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "milliseconds"
},
"thresholds": {}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"fillOpacity": 0.5,
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 7,
"w": 8,
"x": 16,
"y": 5,
"i": "422193ca-facf-450c-b7cb-4975447f3ffc",
"isResizable": true
},
"id": "422193ca-facf-450c-b7cb-4975447f3ffc"
},
{
"id": "e85fc913-f075-4284-a9bc-75e481039372",
"type": "row",
"name": "partition/replica",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 12,
"i": "e85fc913-f075-4284-a9bc-75e481039372",
"isResizable": false
}
},
{
"targets": [
{
"refId": "A",
"expr": "kafka_topic_partitions{instance=\"$instance\"}",
"legend": "{{topic}}"
}
],
"name": "Partitions per Topic",
"custom": {
"showHeader": true,
"calc": "lastNotNull",
"displayMode": "seriesToRows"
},
"options": {
"standardOptions": {}
},
"overrides": [
{}
],
"version": "2.0.0",
"type": "table",
"layout": {
"h": 7,
"w": 12,
"x": 0,
"y": 13,
"i": "8d50a6ec-9dde-4239-830d-d3568d0e8748",
"isResizable": true
},
"id": "8d50a6ec-9dde-4239-830d-d3568d0e8748"
},
{
"targets": [
{
"refId": "A",
"expr": "kafka_topic_partition_under_replicated_partition",
"legend": "{{topic}}-{{partition}}"
}
],
"name": "Partitions Under Replicated",
"description": "副本不同步预案\n1. Restart the Zookeeper leader.\n2. Restart the broker(s) that are not replicating some of the partitions.",
"custom": {
"showHeader": true,
"colorMode": "value",
"calc": "lastNotNull",
"displayMode": "seriesToRows"
},
"options": {
"standardOptions": {}
},
"overrides": [
{}
],
"version": "2.0.0",
"type": "table",
"layout": {
"h": 7,
"w": 12,
"x": 12,
"y": 13,
"i": "9aaf3255-8281-47f3-996b-8585e0f68c05",
"isResizable": true
},
"id": "9aaf3255-8281-47f3-996b-8585e0f68c05"
}
]
}
}

View File

@ -180,10 +180,10 @@ func (ins *Instance) Init() error {
ins.KafkaVersion = sarama.V2_0_0_0.String()
}
if len(ins.MetadataRefreshInterval) == 0 {
ins.MetadataRefreshInterval = "1s"
ins.MetadataRefreshInterval = "1m"
}
if ins.AllowConcurrent == nil {
flag := true
flag := false
ins.AllowConcurrent = &flag
}
if ins.MaxOffsets <= 0 {
@ -198,6 +198,13 @@ func (ins *Instance) Init() error {
if len(ins.GroupFilter) == 0 {
ins.GroupFilter = ".*"
}
if ins.Labels == nil {
ins.Labels = make(map[string]string)
}
_, ok := ins.Labels["instance"]
if !ok {
ins.Labels["instance"] = ins.KafkaURIs[0]
}
options := exporter.Options{
Uri: ins.KafkaURIs,
@ -224,6 +231,7 @@ func (ins *Instance) Init() error {
for k, v := range ins.Labels {
encLabels = append(encLabels, fmt.Sprintf("%s=%s", k, v))
}
options.Labels = strings.Join(encLabels, ",")
ins.l = level.NewFilter(klog.NewLogfmtLogger(klog.NewSyncWriter(os.Stderr)), levelFilter(ins.LogLevel))

13
traces/README.md Normal file
View File

@ -0,0 +1,13 @@
# traces
Categraf simply wraps the OpenTelemetry Collector, which means you get full support for receiving data from, and exporting data to, popular trace vendors such as Jaeger and Zipkin.
We only support the common [components](../config/traces/components.go) by default. If you want more, simply add the new ones to [components.go](../config/traces/components.go),
and make sure you configure them in the conf.
For more details, see the official docs:
- https://opentelemetry.io/docs/collector/getting-started
- https://github.com/open-telemetry/opentelemetry-collector
## Configuration
Here is an [example](../conf/traces.yaml).