add kafka alerts and dashboard conf
This commit is contained in:
parent
f3d496099a
commit
870b6be01a
|
@ -101,7 +101,7 @@ Click on the links to see the README of each plugin.
|
|||
- [ ] mongodb
|
||||
- [ ] rocketmq
|
||||
- [ ] activemq
|
||||
- [ ] kafka
|
||||
- [x] [kafka](inputs/kafka)
|
||||
- [x] [elasticsearch](inputs/elasticsearch)
|
||||
- [x] windows
|
||||
- [ ] mssql
|
||||
|
@ -117,12 +117,12 @@ Click on the links to see the README of each plugin.
|
|||
- [ ] ipmi
|
||||
- [ ] smartctl
|
||||
- [ ] logging
|
||||
- [ ] trace
|
||||
- [x] [traces](traces)
|
||||
|
||||
|
||||
## Thanks
|
||||
|
||||
Categraf is developed on the basis of Telegraf and Exporters. Thanks to the great open source community.
|
||||
Categraf is developed on the basis of Telegraf, Exporters and OpenTelemetry. Thanks to the great open source community.
|
||||
|
||||
## Community
|
||||
|
||||
|
|
|
@ -8,5 +8,78 @@
|
|||
# interval_times = 1
|
||||
|
||||
# append some labels to metrics
|
||||
labels = { cluster="cloud-n9e-kafka" }
|
||||
kafka_uris = ["127.0.0.1:9092","127.0.0.1:9092","127.0.0.1:9092"]
|
||||
# The instance label identifies the Kafka cluster. If it is not provided, the first entry of kafka_uris is used
|
||||
labels = { instance="kafka-cluster-01" }
|
||||
|
||||
# log level only for kafka exporter
|
||||
log_level = "error"
|
||||
|
||||
# Address (host:port) of Kafka server.
|
||||
kafka_uris = ["127.0.0.1:9092","127.0.0.1:9092","127.0.0.1:9092"]
|
||||
|
||||
# Connect using SASL/PLAIN
|
||||
# Default is false
|
||||
# use_sasl = false
|
||||
|
||||
# Only set this to false if using a non-Kafka SASL proxy
|
||||
# Default is true
|
||||
# use_sasl_handshake = false
|
||||
|
||||
# SASL user name
|
||||
# sasl_username = "username"
|
||||
|
||||
# SASL user password
|
||||
# sasl_password = "password"
|
||||
|
||||
# The SASL SCRAM SHA algorithm to use as the mechanism: sha256 or sha512
|
||||
# sasl_mechanism = ""
|
||||
|
||||
# Connect using TLS
|
||||
# use_tls = false
|
||||
|
||||
# The optional certificate authority file for TLS client authentication
|
||||
# ca_file = ""
|
||||
|
||||
# The optional certificate file for TLS client authentication
|
||||
# cert_file = ""
|
||||
|
||||
# The optional key file for TLS client authentication
|
||||
# key_file = ""
|
||||
|
||||
# If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure
|
||||
# insecure_skip_verify = true
|
||||
|
||||
# Kafka broker version
|
||||
# Default is 2.0.0
|
||||
# kafka_version = "2.0.0"
|
||||
|
||||
# if you need to use a group from zookeeper
|
||||
# Default is false
|
||||
# use_zookeeper_lag = false
|
||||
|
||||
# Address array (hosts) of zookeeper server.
|
||||
# zookeeper_uris = []
|
||||
|
||||
# Metadata refresh interval
|
||||
# Default is 1m
|
||||
# metadata_refresh_interval = "1m"
|
||||
|
||||
# If true, every scrape triggers its own Kafka operations; otherwise, scrapes share results. WARN: This should be disabled on large clusters
|
||||
# Default is false
|
||||
# allow_concurrency = false
|
||||
|
||||
# Maximum number of offsets to store in the interpolation table for a partition
|
||||
# Default is 1000
|
||||
# max_offsets = 1000
|
||||
|
||||
# How frequently should the interpolation table be pruned, in seconds.
|
||||
# Default is 30
|
||||
# prune_interval_seconds = 30
|
||||
|
||||
# Regex filter for topics to be monitored
|
||||
# Default is ".*"
|
||||
# topics_filter_regex = ".*"
|
||||
|
||||
# Regex filter for consumer groups to be monitored
|
||||
# Default is ".*"
|
||||
# groups_filter_regex = ".*"
|
|
@ -1,18 +1,11 @@
|
|||
# kafka
|
||||
|
||||
kafka 监控采集插件,封装kafka-exporter(https://github.com/davidmparrott/kafka_exporter)而来
|
||||
kafka 监控采集插件,由kafka-exporter(https://github.com/davidmparrott/kafka_exporter)封装而来。
|
||||
|
||||
## Configuration
|
||||
|
||||
```toml
|
||||
# # collect interval
|
||||
# interval = 15
|
||||
|
||||
# 要监控 MySQL,首先要给出要监控的MySQL的连接地址、用户名、密码
|
||||
[[instances]]
|
||||
|
||||
```
|
||||
请参考配置[示例](../../conf/input.kafka/kafka.toml)
|
||||
|
||||
## 监控大盘和告警规则
|
||||
|
||||
本 README 的同级目录,大家可以看到 dashboard.json 就是监控大盘,导入夜莺就可以使用,alerts.json 是告警规则,也是导入夜莺就可以使用。
|
||||
同级目录下的 dashboard.json、alerts.json 可以直接导入夜莺使用。
|
|
@ -0,0 +1,72 @@
|
|||
[
|
||||
{
|
||||
"name": "数据有丢失风险-副本数小于3",
|
||||
"note": "",
|
||||
"prod": "",
|
||||
"algorithm": "",
|
||||
"algo_params": null,
|
||||
"delay": 0,
|
||||
"severity": 2,
|
||||
"disabled": 1,
|
||||
"prom_for_duration": 60,
|
||||
"prom_ql": "sum(kafka_topic_partition_in_sync_replica) by (topic) < 3",
|
||||
"prom_eval_interval": 15,
|
||||
"enable_stime": "00:00",
|
||||
"enable_etime": "23:59",
|
||||
"enable_days_of_week": [
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"0"
|
||||
],
|
||||
"enable_in_bg": 0,
|
||||
"notify_recovered": 1,
|
||||
"notify_channels": [],
|
||||
"notify_repeat_step": 60,
|
||||
"notify_max_number": 0,
|
||||
"recover_duration": 0,
|
||||
"callbacks": [],
|
||||
"runbook_url": "",
|
||||
"append_tags": [
|
||||
"service=kafka"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "消费能力不足-延迟超过5分钟",
|
||||
"note": "",
|
||||
"prod": "",
|
||||
"algorithm": "",
|
||||
"algo_params": null,
|
||||
"delay": 0,
|
||||
"severity": 2,
|
||||
"disabled": 1,
|
||||
"prom_for_duration": 60,
|
||||
"prom_ql": "kafka_consumer_lag_millis / 1000 > 300",
|
||||
"prom_eval_interval": 15,
|
||||
"enable_stime": "00:00",
|
||||
"enable_etime": "23:59",
|
||||
"enable_days_of_week": [
|
||||
"1",
|
||||
"2",
|
||||
"3",
|
||||
"4",
|
||||
"5",
|
||||
"6",
|
||||
"0"
|
||||
],
|
||||
"enable_in_bg": 0,
|
||||
"notify_recovered": 1,
|
||||
"notify_channels": [],
|
||||
"notify_repeat_step": 60,
|
||||
"notify_max_number": 0,
|
||||
"recover_duration": 0,
|
||||
"callbacks": [],
|
||||
"runbook_url": "",
|
||||
"append_tags": [
|
||||
"service=kafka"
|
||||
]
|
||||
}
|
||||
]
|
|
@ -0,0 +1,360 @@
|
|||
{
|
||||
"name": "Kafka - 模板",
|
||||
"tags": "Kafka Prometheus ",
|
||||
"configs": {
|
||||
"var": [
|
||||
{
|
||||
"name": "instance",
|
||||
"definition": "label_values(kafka_brokers, instance)",
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"panels": [
|
||||
{
|
||||
"id": "6fac3216-f9e2-45c5-8037-959ab3c98de5",
|
||||
"type": "row",
|
||||
"name": "overview",
|
||||
"layout": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0,
|
||||
"i": "6fac3216-f9e2-45c5-8037-959ab3c98de5",
|
||||
"isResizable": false
|
||||
},
|
||||
"collapsed": true,
|
||||
"panels": []
|
||||
},
|
||||
{
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "kafka_brokers{instance=\"$instance\"}"
|
||||
}
|
||||
],
|
||||
"name": "brokers",
|
||||
"custom": {
|
||||
"textMode": "value",
|
||||
"colorMode": "value",
|
||||
"calc": "lastNotNull",
|
||||
"colSpan": 1,
|
||||
"textSize": {
|
||||
"value": 50
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"standardOptions": {}
|
||||
},
|
||||
"version": "2.0.0",
|
||||
"type": "stat",
|
||||
"layout": {
|
||||
"h": 3,
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 1,
|
||||
"i": "b10ab025-2795-4ea1-b537-d03948324ea8",
|
||||
"isResizable": true
|
||||
},
|
||||
"id": "b10ab025-2795-4ea1-b537-d03948324ea8"
|
||||
},
|
||||
{
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "count(count by (topic) (kafka_topic_partitions{instance=\"$instance\"}))"
|
||||
}
|
||||
],
|
||||
"name": "topics",
|
||||
"custom": {
|
||||
"textMode": "value",
|
||||
"colorMode": "value",
|
||||
"calc": "lastNotNull",
|
||||
"colSpan": 1,
|
||||
"textSize": {
|
||||
"value": 50
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"standardOptions": {}
|
||||
},
|
||||
"version": "2.0.0",
|
||||
"type": "stat",
|
||||
"layout": {
|
||||
"h": 3,
|
||||
"w": 6,
|
||||
"x": 6,
|
||||
"y": 1,
|
||||
"i": "8845f449-cb6c-4fa6-9930-351f106f9e52",
|
||||
"isResizable": true
|
||||
},
|
||||
"id": "8845f449-cb6c-4fa6-9930-351f106f9e52"
|
||||
},
|
||||
{
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "sum(kafka_topic_partitions{instance=\"$instance\"})",
|
||||
"legend": ""
|
||||
}
|
||||
],
|
||||
"name": "partitions",
|
||||
"custom": {
|
||||
"textMode": "value",
|
||||
"colorMode": "value",
|
||||
"calc": "lastNotNull",
|
||||
"colSpan": 1,
|
||||
"textSize": {
|
||||
"value": 50
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"standardOptions": {}
|
||||
},
|
||||
"version": "2.0.0",
|
||||
"type": "stat",
|
||||
"layout": {
|
||||
"h": 3,
|
||||
"w": 6,
|
||||
"x": 12,
|
||||
"y": 1,
|
||||
"i": "cc26ea7b-8860-45cd-9f62-90f42bd195f5",
|
||||
"isResizable": true
|
||||
},
|
||||
"id": "cc26ea7b-8860-45cd-9f62-90f42bd195f5"
|
||||
},
|
||||
{
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "sum(kafka_topic_partition_replicas{instance=~\"$instance\"})"
|
||||
}
|
||||
],
|
||||
"name": "Replicas",
|
||||
"custom": {
|
||||
"textMode": "valueAndName",
|
||||
"colorMode": "value",
|
||||
"calc": "lastNotNull",
|
||||
"colSpan": 1,
|
||||
"textSize": {}
|
||||
},
|
||||
"options": {
|
||||
"standardOptions": {}
|
||||
},
|
||||
"version": "2.0.0",
|
||||
"type": "stat",
|
||||
"layout": {
|
||||
"h": 3,
|
||||
"w": 6,
|
||||
"x": 18,
|
||||
"y": 1,
|
||||
"i": "98cd9cee-69ad-4533-9eed-e307a24fffa6",
|
||||
"isResizable": true
|
||||
},
|
||||
"id": "98cd9cee-69ad-4533-9eed-e307a24fffa6"
|
||||
},
|
||||
{
|
||||
"id": "79a8e48a-fdf0-4c7e-bae4-478f7b294751",
|
||||
"type": "row",
|
||||
"name": "throughput",
|
||||
"layout": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 4,
|
||||
"i": "79a8e48a-fdf0-4c7e-bae4-478f7b294751",
|
||||
"isResizable": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(kafka_topic_partition_current_offset{instance=\"$instance\"}[1m])) by (topic)"
|
||||
}
|
||||
],
|
||||
"name": "Messages produced per second",
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "all",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "hidden"
|
||||
},
|
||||
"standardOptions": {},
|
||||
"thresholds": {}
|
||||
},
|
||||
"custom": {
|
||||
"drawStyle": "lines",
|
||||
"lineInterpolation": "smooth",
|
||||
"fillOpacity": 0.5,
|
||||
"stack": "off"
|
||||
},
|
||||
"version": "2.0.0",
|
||||
"type": "timeseries",
|
||||
"layout": {
|
||||
"h": 7,
|
||||
"w": 8,
|
||||
"x": 0,
|
||||
"y": 5,
|
||||
"i": "3ceedd68-54d0-44db-9390-ceb2299619e5",
|
||||
"isResizable": true
|
||||
},
|
||||
"id": "3ceedd68-54d0-44db-9390-ceb2299619e5"
|
||||
},
|
||||
{
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(kafka_consumergroup_current_offset{instance=\"$instance\"}[1m])) by (topic)"
|
||||
}
|
||||
],
|
||||
"name": "Messages consumed per second",
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "all",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "hidden"
|
||||
},
|
||||
"standardOptions": {},
|
||||
"thresholds": {}
|
||||
},
|
||||
"custom": {
|
||||
"drawStyle": "lines",
|
||||
"lineInterpolation": "smooth",
|
||||
"fillOpacity": 0.5,
|
||||
"stack": "off"
|
||||
},
|
||||
"version": "2.0.0",
|
||||
"type": "timeseries",
|
||||
"layout": {
|
||||
"h": 7,
|
||||
"w": 8,
|
||||
"x": 8,
|
||||
"y": 5,
|
||||
"i": "a43a7752-00e6-41fb-9055-e04d17e22d99",
|
||||
"isResizable": true
|
||||
},
|
||||
"id": "a43a7752-00e6-41fb-9055-e04d17e22d99"
|
||||
},
|
||||
{
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(kafka_consumer_lag_millis{instance=\"$instance\"}) by (consumergroup, topic) ",
|
||||
"legend": "{{consumergroup}} (topic: {{topic}})"
|
||||
}
|
||||
],
|
||||
"name": "Latency by Consumer Group",
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "all",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "hidden"
|
||||
},
|
||||
"standardOptions": {
|
||||
"util": "milliseconds"
|
||||
},
|
||||
"thresholds": {}
|
||||
},
|
||||
"custom": {
|
||||
"drawStyle": "lines",
|
||||
"lineInterpolation": "smooth",
|
||||
"fillOpacity": 0.5,
|
||||
"stack": "off"
|
||||
},
|
||||
"version": "2.0.0",
|
||||
"type": "timeseries",
|
||||
"layout": {
|
||||
"h": 7,
|
||||
"w": 8,
|
||||
"x": 16,
|
||||
"y": 5,
|
||||
"i": "422193ca-facf-450c-b7cb-4975447f3ffc",
|
||||
"isResizable": true
|
||||
},
|
||||
"id": "422193ca-facf-450c-b7cb-4975447f3ffc"
|
||||
},
|
||||
{
|
||||
"id": "e85fc913-f075-4284-a9bc-75e481039372",
|
||||
"type": "row",
|
||||
"name": "patition/replicate",
|
||||
"layout": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 12,
|
||||
"i": "e85fc913-f075-4284-a9bc-75e481039372",
|
||||
"isResizable": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "kafka_topic_partitions{instance=\"$instance\"}",
|
||||
"legend": "{{topic}}"
|
||||
}
|
||||
],
|
||||
"name": "Partitions per Topic",
|
||||
"custom": {
|
||||
"showHeader": true,
|
||||
"calc": "lastNotNull",
|
||||
"displayMode": "seriesToRows"
|
||||
},
|
||||
"options": {
|
||||
"standardOptions": {}
|
||||
},
|
||||
"overrides": [
|
||||
{}
|
||||
],
|
||||
"version": "2.0.0",
|
||||
"type": "table",
|
||||
"layout": {
|
||||
"h": 7,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 13,
|
||||
"i": "8d50a6ec-9dde-4239-830d-d3568d0e8748",
|
||||
"isResizable": true
|
||||
},
|
||||
"id": "8d50a6ec-9dde-4239-830d-d3568d0e8748"
|
||||
},
|
||||
{
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "kafka_topic_partition_under_replicated_partition",
|
||||
"legend": "{{topic}}-{{partition}}"
|
||||
}
|
||||
],
|
||||
"name": "Partitions Under Replicated",
|
||||
"description": "副本不同步预案\n1. Restart the Zookeeper leader.\n2. Restart the broker\\brokers that are not replicating some of the partitions.",
|
||||
"custom": {
|
||||
"showHeader": true,
|
||||
"colorMode": "value",
|
||||
"calc": "lastNotNull",
|
||||
"displayMode": "seriesToRows"
|
||||
},
|
||||
"options": {
|
||||
"standardOptions": {}
|
||||
},
|
||||
"overrides": [
|
||||
{}
|
||||
],
|
||||
"version": "2.0.0",
|
||||
"type": "table",
|
||||
"layout": {
|
||||
"h": 7,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 13,
|
||||
"i": "9aaf3255-8281-47f3-996b-8585e0f68c05",
|
||||
"isResizable": true
|
||||
},
|
||||
"id": "9aaf3255-8281-47f3-996b-8585e0f68c05"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
|
@ -180,10 +180,10 @@ func (ins *Instance) Init() error {
|
|||
ins.KafkaVersion = sarama.V2_0_0_0.String()
|
||||
}
|
||||
if len(ins.MetadataRefreshInterval) == 0 {
|
||||
ins.MetadataRefreshInterval = "1s"
|
||||
ins.MetadataRefreshInterval = "1m"
|
||||
}
|
||||
if ins.AllowConcurrent == nil {
|
||||
flag := true
|
||||
flag := false
|
||||
ins.AllowConcurrent = &flag
|
||||
}
|
||||
if ins.MaxOffsets <= 0 {
|
||||
|
@ -198,6 +198,13 @@ func (ins *Instance) Init() error {
|
|||
if len(ins.GroupFilter) == 0 {
|
||||
ins.GroupFilter = ".*"
|
||||
}
|
||||
if ins.Labels == nil {
|
||||
ins.Labels = make(map[string]string)
|
||||
}
|
||||
_, ok := ins.Labels["instance"]
|
||||
if !ok {
|
||||
ins.Labels["instance"] = ins.KafkaURIs[0]
|
||||
}
|
||||
|
||||
options := exporter.Options{
|
||||
Uri: ins.KafkaURIs,
|
||||
|
@ -224,6 +231,7 @@ func (ins *Instance) Init() error {
|
|||
for k, v := range ins.Labels {
|
||||
encLabels = append(encLabels, fmt.Sprintf("%s=%s", k, v))
|
||||
}
|
||||
|
||||
options.Labels = strings.Join(encLabels, ",")
|
||||
|
||||
ins.l = level.NewFilter(klog.NewLogfmtLogger(klog.NewSyncWriter(os.Stderr)), levelFilter(ins.LogLevel))
|
||||
|
|
|
@ -0,0 +1,13 @@
|
|||
# traces
|
||||
Categraf simply wraps the OpenTelemetry Collector, which means you get full support for receiving data from and exporting to popular trace vendors, such as Jaeger and Zipkin.
|
||||
|
||||
We only support the common [components](../config/traces/components.go) by default. If you want more, simply add the new ones to [components.go](../config/traces/components.go),
|
||||
and make sure you configure that in the conf.
|
||||
|
||||
For more details, see the official docs:
|
||||
- https://opentelemetry.io/docs/collector/getting-started
|
||||
- https://github.com/open-telemetry/opentelemetry-collector
|
||||
|
||||
## Configuration
|
||||
|
||||
Here is an [example](../conf/traces.yaml).
|
Loading…
Reference in New Issue