From 870b6be01a548e7396bb949c0b7abc9f39e90795 Mon Sep 17 00:00:00 2001 From: yushuangyu Date: Mon, 4 Jul 2022 15:37:31 +0800 Subject: [PATCH] add kafka alerts and dashboard conf --- README.md | 6 +- conf/input.kafka/kafka.toml | 77 +++++++- inputs/kafka/README.md | 13 +- inputs/kafka/alerts.json | 72 ++++++++ inputs/kafka/dashboard.json | 360 ++++++++++++++++++++++++++++++++++++ inputs/kafka/kafka.go | 12 +- traces/README.md | 13 ++ 7 files changed, 536 insertions(+), 17 deletions(-) create mode 100644 traces/README.md diff --git a/README.md b/README.md index c20660b..b73f202 100644 --- a/README.md +++ b/README.md @@ -101,7 +101,7 @@ Click on the links to see the README of each plugin. - [ ] mongodb - [ ] rocketmq - [ ] activemq -- [ ] kafka +- [x] [kafka](inputs/kafka) - [x] [elasticsearch](inputs/elasticsearch) - [x] windows - [ ] mssql @@ -117,12 +117,12 @@ Click on the links to see the README of each plugin. - [ ] ipmi - [ ] smartctl - [ ] logging -- [ ] trace +- [x] [traces](traces) ## Thanks -Categraf is developed on the basis of Telegraf and Exporters. Thanks to the great open source community. +Categraf is developed on the basis of Telegraf, Exporters and the OpenTelemetry. Thanks to the great open source community. ## Community diff --git a/conf/input.kafka/kafka.toml b/conf/input.kafka/kafka.toml index 8b4e513..02e1620 100644 --- a/conf/input.kafka/kafka.toml +++ b/conf/input.kafka/kafka.toml @@ -8,5 +8,78 @@ # interval_times = 1 # append some labels to metrics -labels = { cluster="cloud-n9e-kafka" } -kafka_uris = ["127.0.0.1:9092","127.0.0.1:9092","127.0.0.1:9092"] \ No newline at end of file +# instance is a preferred tag with the cluster name. If none is provided, the first of kafka_uris will be used +labels = { instance="kafka-cluster-01" } + +# log level only for kafka exporter +log_level = "error" + +# Address (host:port) of Kafka server. 
+kafka_uris = ["127.0.0.1:9092","127.0.0.1:9092","127.0.0.1:9092"] + +# Connect using SASL/PLAIN +# Default is false +# use_sasl = false + +# Only set this to false if using a non-Kafka SASL proxy +# Default is true +# use_sasl_handshake = false + +# SASL user name +# sasl_username = "username" + +# SASL user password +# sasl_password = "password" + +# The SASL SCRAM SHA algorithm sha256 or sha512 as mechanism +# sasl_mechanism = "" + +# Connect using TLS +# use_tls = false + +# The optional certificate authority file for TLS client authentication +# ca_file = "" + +# The optional certificate file for TLS client authentication +# cert_file = "" + +# The optional key file for TLS client authentication +# key_file = "" + +# If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure +# insecure_skip_verify = true + +# Kafka broker version +# Default is 2.0.0 +# kafka_version = "2.0.0" + +# if you need to use a group from zookeeper +# Default is false +# use_zookeeper_lag = false + +# Address array (hosts) of zookeeper server. +# zookeeper_uris = [] + +# Metadata refresh interval +# Default is 1m +# metadata_refresh_interval = "1m" + +# If true, all scrapes will trigger kafka operations; otherwise, they will share results. WARN: This should be disabled on large clusters +# Default is false +# allow_concurrency = false + +# Maximum number of offsets to store in the interpolation table for a partition +# Default is 1000 +# max_offsets = 1000 + +# How frequently should the interpolation table be pruned, in seconds. 
+# Default is 30 +# prune_interval_seconds = 30 + +# Regex filter for topics to be monitored +# Default is ".*" +# topics_filter_regex = ".*" + +# Regex filter for consumer groups to be monitored +# Default is ".*" +# groups_filter_regex = ".*" \ No newline at end of file diff --git a/inputs/kafka/README.md b/inputs/kafka/README.md index 9dfbba2..e8a89d3 100644 --- a/inputs/kafka/README.md +++ b/inputs/kafka/README.md @@ -1,18 +1,11 @@ # kafka -kafka 监控采集插件,封装kafka-exporter(https://github.com/davidmparrott/kafka_exporter)而来 +kafka 监控采集插件,由kafka-exporter(https://github.com/davidmparrott/kafka_exporter)封装而来。 ## Configuration -```toml -# # collect interval -# interval = 15 - -# 要监控 MySQL,首先要给出要监控的MySQL的连接地址、用户名、密码 -[[instances]] - -``` +请参考配置[示例](../../conf/input.kafka/kafka.toml) ## 监控大盘和告警规则 -本 README 的同级目录,大家可以看到 dashboard.json 就是监控大盘,导入夜莺就可以使用,alerts.json 是告警规则,也是导入夜莺就可以使用。 \ No newline at end of file +同级目录下的 dashboard.json、alerts.json 可以直接导入夜莺使用。 \ No newline at end of file diff --git a/inputs/kafka/alerts.json b/inputs/kafka/alerts.json index e69de29..3bd622e 100644 --- a/inputs/kafka/alerts.json +++ b/inputs/kafka/alerts.json @@ -0,0 +1,72 @@ +[ + { + "name": "数据有丢失风险-副本数小于3", + "note": "", + "prod": "", + "algorithm": "", + "algo_params": null, + "delay": 0, + "severity": 2, + "disabled": 1, + "prom_for_duration": 60, + "prom_ql": "sum(kafka_topic_partition_in_sync_replica) by (topic) < 3", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "notify_max_number": 0, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [ + "service=kafka" + ] + }, + { + "name": "消费能力不足-延迟超过5分钟", + "note": "", + "prod": "", + "algorithm": "", + "algo_params": null, + "delay": 0, + "severity": 2, + "disabled": 1, + "prom_for_duration": 60, + 
"prom_ql": "kafka_consumer_lag_millis / 1000 > 300", + "prom_eval_interval": 15, + "enable_stime": "00:00", + "enable_etime": "23:59", + "enable_days_of_week": [ + "1", + "2", + "3", + "4", + "5", + "6", + "0" + ], + "enable_in_bg": 0, + "notify_recovered": 1, + "notify_channels": [], + "notify_repeat_step": 60, + "notify_max_number": 0, + "recover_duration": 0, + "callbacks": [], + "runbook_url": "", + "append_tags": [ + "service=kafka" + ] + } + ] \ No newline at end of file diff --git a/inputs/kafka/dashboard.json b/inputs/kafka/dashboard.json index e69de29..58adf92 100644 --- a/inputs/kafka/dashboard.json +++ b/inputs/kafka/dashboard.json @@ -0,0 +1,360 @@ +{ + "name": "Kafka - 模板", + "tags": "Kafka Prometheus ", + "configs": { + "var": [ + { + "name": "instance", + "definition": "label_values(kafka_brokers, instance)", + "type": "query" + } + ], + "panels": [ + { + "id": "6fac3216-f9e2-45c5-8037-959ab3c98de5", + "type": "row", + "name": "overview", + "layout": { + "h": 1, + "w": 24, + "x": 0, + "y": 0, + "i": "6fac3216-f9e2-45c5-8037-959ab3c98de5", + "isResizable": false + }, + "collapsed": true, + "panels": [] + }, + { + "targets": [ + { + "refId": "A", + "expr": "kafka_brokers{instance=\"$instance\"}" + } + ], + "name": "brokers", + "custom": { + "textMode": "value", + "colorMode": "value", + "calc": "lastNotNull", + "colSpan": 1, + "textSize": { + "value": 50 + } + }, + "options": { + "standardOptions": {} + }, + "version": "2.0.0", + "type": "stat", + "layout": { + "h": 3, + "w": 6, + "x": 0, + "y": 1, + "i": "b10ab025-2795-4ea1-b537-d03948324ea8", + "isResizable": true + }, + "id": "b10ab025-2795-4ea1-b537-d03948324ea8" + }, + { + "targets": [ + { + "refId": "A", + "expr": "count(count by (topic) (kafka_topic_partitions{instance=\"$instance\"}))" + } + ], + "name": "topics", + "custom": { + "textMode": "value", + "colorMode": "value", + "calc": "lastNotNull", + "colSpan": 1, + "textSize": { + "value": 50 + } + }, + "options": { + "standardOptions": {} + 
}, + "version": "2.0.0", + "type": "stat", + "layout": { + "h": 3, + "w": 6, + "x": 6, + "y": 1, + "i": "8845f449-cb6c-4fa6-9930-351f106f9e52", + "isResizable": true + }, + "id": "8845f449-cb6c-4fa6-9930-351f106f9e52" + }, + { + "targets": [ + { + "refId": "A", + "expr": "sum(kafka_topic_partitions{instance=\"$instance\"})", + "legend": "" + } + ], + "name": "partitions", + "custom": { + "textMode": "value", + "colorMode": "value", + "calc": "lastNotNull", + "colSpan": 1, + "textSize": { + "value": 50 + } + }, + "options": { + "standardOptions": {} + }, + "version": "2.0.0", + "type": "stat", + "layout": { + "h": 3, + "w": 6, + "x": 12, + "y": 1, + "i": "cc26ea7b-8860-45cd-9f62-90f42bd195f5", + "isResizable": true + }, + "id": "cc26ea7b-8860-45cd-9f62-90f42bd195f5" + }, + { + "targets": [ + { + "refId": "A", + "expr": "sum(kafka_topic_partition_replicas{instance=~\"$instance\"})" + } + ], + "name": "Replicas", + "custom": { + "textMode": "valueAndName", + "colorMode": "value", + "calc": "lastNotNull", + "colSpan": 1, + "textSize": {} + }, + "options": { + "standardOptions": {} + }, + "version": "2.0.0", + "type": "stat", + "layout": { + "h": 3, + "w": 6, + "x": 18, + "y": 1, + "i": "98cd9cee-69ad-4533-9eed-e307a24fffa6", + "isResizable": true + }, + "id": "98cd9cee-69ad-4533-9eed-e307a24fffa6" + }, + { + "id": "79a8e48a-fdf0-4c7e-bae4-478f7b294751", + "type": "row", + "name": "throughput", + "layout": { + "h": 1, + "w": 24, + "x": 0, + "y": 4, + "i": "79a8e48a-fdf0-4c7e-bae4-478f7b294751", + "isResizable": false + } + }, + { + "targets": [ + { + "expr": "sum(rate(kafka_topic_partition_current_offset{instance=\"$instance\"}[1m])) by (topic)" + } + ], + "name": "Messages produced per second", + "options": { + "tooltip": { + "mode": "all", + "sort": "desc" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {} + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "fillOpacity": 0.5, + "stack": "off" + }, + 
"version": "2.0.0", + "type": "timeseries", + "layout": { + "h": 7, + "w": 8, + "x": 0, + "y": 5, + "i": "3ceedd68-54d0-44db-9390-ceb2299619e5", + "isResizable": true + }, + "id": "3ceedd68-54d0-44db-9390-ceb2299619e5" + }, + { + "targets": [ + { + "expr": "sum(rate(kafka_consumergroup_current_offset{instance=\"$instance\"}[1m])) by (topic)" + } + ], + "name": "Messages consumed per second", + "options": { + "tooltip": { + "mode": "all", + "sort": "desc" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": {}, + "thresholds": {} + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "fillOpacity": 0.5, + "stack": "off" + }, + "version": "2.0.0", + "type": "timeseries", + "layout": { + "h": 7, + "w": 8, + "x": 8, + "y": 5, + "i": "a43a7752-00e6-41fb-9055-e04d17e22d99", + "isResizable": true + }, + "id": "a43a7752-00e6-41fb-9055-e04d17e22d99" + }, + { + "targets": [ + { + "expr": "sum(kafka_consumer_lag_millis{instance=\"$instance\"}) by (consumergroup, topic) ", + "legend": "{{consumergroup}} (topic: {{topic}})" + } + ], + "name": "Latency by Consumer Group", + "options": { + "tooltip": { + "mode": "all", + "sort": "desc" + }, + "legend": { + "displayMode": "hidden" + }, + "standardOptions": { + "util": "milliseconds" + }, + "thresholds": {} + }, + "custom": { + "drawStyle": "lines", + "lineInterpolation": "smooth", + "fillOpacity": 0.5, + "stack": "off" + }, + "version": "2.0.0", + "type": "timeseries", + "layout": { + "h": 7, + "w": 8, + "x": 16, + "y": 5, + "i": "422193ca-facf-450c-b7cb-4975447f3ffc", + "isResizable": true + }, + "id": "422193ca-facf-450c-b7cb-4975447f3ffc" + }, + { + "id": "e85fc913-f075-4284-a9bc-75e481039372", + "type": "row", + "name": "patition/replicate", + "layout": { + "h": 1, + "w": 24, + "x": 0, + "y": 12, + "i": "e85fc913-f075-4284-a9bc-75e481039372", + "isResizable": false + } + }, + { + "targets": [ + { + "refId": "A", + "expr": "kafka_topic_partitions{instance=\"$instance\"}", + "legend": 
"{{topic}}" + } + ], + "name": "Partitions per Topic", + "custom": { + "showHeader": true, + "calc": "lastNotNull", + "displayMode": "seriesToRows" + }, + "options": { + "standardOptions": {} + }, + "overrides": [ + {} + ], + "version": "2.0.0", + "type": "table", + "layout": { + "h": 7, + "w": 12, + "x": 0, + "y": 13, + "i": "8d50a6ec-9dde-4239-830d-d3568d0e8748", + "isResizable": true + }, + "id": "8d50a6ec-9dde-4239-830d-d3568d0e8748" + }, + { + "targets": [ + { + "refId": "A", + "expr": "kafka_topic_partition_under_replicated_partition", + "legend": "{{topic}}-{{partition}}" + } + ], + "name": "Partitions Under Replicated", + "description": "副本不同步预案\n1. Restart the Zookeeper leader.\n2. Restart the broker\\brokers that are not replicating some of the partitions.", + "custom": { + "showHeader": true, + "colorMode": "value", + "calc": "lastNotNull", + "displayMode": "seriesToRows" + }, + "options": { + "standardOptions": {} + }, + "overrides": [ + {} + ], + "version": "2.0.0", + "type": "table", + "layout": { + "h": 7, + "w": 12, + "x": 12, + "y": 13, + "i": "9aaf3255-8281-47f3-996b-8585e0f68c05", + "isResizable": true + }, + "id": "9aaf3255-8281-47f3-996b-8585e0f68c05" + } + ] + } +} \ No newline at end of file diff --git a/inputs/kafka/kafka.go b/inputs/kafka/kafka.go index 8ccc0c4..dd9a425 100644 --- a/inputs/kafka/kafka.go +++ b/inputs/kafka/kafka.go @@ -180,10 +180,10 @@ func (ins *Instance) Init() error { ins.KafkaVersion = sarama.V2_0_0_0.String() } if len(ins.MetadataRefreshInterval) == 0 { - ins.MetadataRefreshInterval = "1s" + ins.MetadataRefreshInterval = "1m" } if ins.AllowConcurrent == nil { - flag := true + flag := false ins.AllowConcurrent = &flag } if ins.MaxOffsets <= 0 { @@ -198,6 +198,13 @@ func (ins *Instance) Init() error { if len(ins.GroupFilter) == 0 { ins.GroupFilter = ".*" } + if ins.Labels == nil { + ins.Labels = make(map[string]string) + } + _, ok := ins.Labels["instance"] + if !ok { + ins.Labels["instance"] = ins.KafkaURIs[0] + } 
options := exporter.Options{ Uri: ins.KafkaURIs, @@ -224,6 +231,7 @@ func (ins *Instance) Init() error { for k, v := range ins.Labels { encLabels = append(encLabels, fmt.Sprintf("%s=%s", k, v)) } + options.Labels = strings.Join(encLabels, ",") ins.l = level.NewFilter(klog.NewLogfmtLogger(klog.NewSyncWriter(os.Stderr)), levelFilter(ins.LogLevel)) diff --git a/traces/README.md b/traces/README.md new file mode 100644 index 0000000..58c96dd --- /dev/null +++ b/traces/README.md @@ -0,0 +1,13 @@ +# traces +Categraf wraps the OpenTelemetry Collector, which means you get full support for receiving data from and exporting to popular trace vendors, such as Jaeger and Zipkin. + +We only support the common [components](../config/traces/components.go) by default. If you want more, simply add the new ones to [components.go](../config/traces/components.go), +and make sure you configure them in the conf. + +For more details, see the official docs: +- https://opentelemetry.io/docs/collector/getting-started +- https://github.com/open-telemetry/opentelemetry-collector + +## Configuration + +Here is an [example](../conf/traces.yaml). \ No newline at end of file