update kafka alerts and dashboard (#1012)

* update kafka alerts and dashboard

* update kafka dashboard

Co-authored-by: yushuangyu <yushuangyu@flashcat.cloud>
This commit is contained in:
ysyneu 2022-07-04 19:56:51 +08:00 committed by GitHub
parent 45945876d8
commit 64a671ae13
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 422 additions and 109 deletions

View File

@ -1,9 +1,13 @@
[ [
{ {
"name": "数据有丢失风险-同步副本数小于3", "name": "数据有丢失风险-副本数小于3",
"note": "", "note": "",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2, "severity": 2,
"disabled": 0, "disabled": 1,
"prom_for_duration": 60, "prom_for_duration": 60,
"prom_ql": "sum(kafka_topic_partition_in_sync_replica) by (topic) < 3", "prom_ql": "sum(kafka_topic_partition_in_sync_replica) by (topic) < 3",
"prom_eval_interval": 15, "prom_eval_interval": 15,
@ -22,18 +26,25 @@
"notify_recovered": 1, "notify_recovered": 1,
"notify_channels": [], "notify_channels": [],
"notify_repeat_step": 60, "notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0, "recover_duration": 0,
"callbacks": [], "callbacks": [],
"runbook_url": "", "runbook_url": "",
"append_tags": [] "append_tags": [
"service=kafka"
]
}, },
{ {
"name": "消费能力不足-积压消息数超过50条", "name": "消费能力不足-延迟超过5分钟",
"note": "", "note": "",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2, "severity": 2,
"disabled": 0, "disabled": 1,
"prom_for_duration": 60, "prom_for_duration": 60,
"prom_ql": "sum(kafka_topic_partition_current_offset{instance=\"$instance\"}) by (topic) - sum(kafka_consumergroup_current_offset{instance=\"$instance\"}) by (topic) ", "prom_ql": "kafka_consumer_lag_millis / 1000 > 300",
"prom_eval_interval": 15, "prom_eval_interval": 15,
"enable_stime": "00:00", "enable_stime": "00:00",
"enable_etime": "23:59", "enable_etime": "23:59",
@ -50,9 +61,12 @@
"notify_recovered": 1, "notify_recovered": 1,
"notify_channels": [], "notify_channels": [],
"notify_repeat_step": 60, "notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0, "recover_duration": 0,
"callbacks": [], "callbacks": [],
"runbook_url": "", "runbook_url": "",
"append_tags": [] "append_tags": [
"service=kafka"
]
} }
] ]

View File

@ -1,63 +1,362 @@
[
{ {
"name": "Kafka - 模板", "name": "Kafka - 模板",
"tags": "Kafka Prometheus", "tags": "Kafka Prometheus",
"configs": "{\"var\":[{\"name\":\"instance\",\"definition\":\"label_values(kafka_brokers, instance)\"},{\"name\":\"job\",\"definition\":\"label_values(kafka_brokers, job)\"}]}", "configs": {
"chart_groups": [ "var": [
{ {
"name": "cluster",
"definition": "label_values(kafka_brokers, cluster)",
"type": "query"
}
],
"version": "2.0.0",
"panels": [
{
"id": "51502c3a-dd6f-41c7-b8f1-87b88826c96e",
"type": "row",
"name": "overview", "name": "overview",
"weight": 0, "layout": {
"charts": [ "h": 1,
{ "w": 24,
"configs": "{\"targets\":[{\"refId\":\"A\",\"expr\":\"count(count by (topic) (kafka_topic_partitions))\"}],\"name\":\"topics\",\"custom\":{\"textMode\":\"value\",\"colorMode\":\"value\",\"calc\":\"lastNotNull\",\"colSpan\":1,\"textSize\":{\"value\":50}},\"options\":{\"standardOptions\":{}},\"version\":\"2.0.0\",\"type\":\"stat\",\"layout\":{\"h\":1,\"w\":8,\"x\":8,\"y\":0,\"i\":\"0\"}}", "x": 0,
"weight": 0 "y": 0,
"i": "51502c3a-dd6f-41c7-b8f1-87b88826c96e",
"isResizable": false
},
"collapsed": true
}, },
{ {
"configs": "{\"targets\":[{\"refId\":\"A\",\"expr\":\"kafka_brokers\"}],\"name\":\"brokers\",\"custom\":{\"textMode\":\"value\",\"colorMode\":\"value\",\"calc\":\"lastNotNull\",\"colSpan\":1,\"textSize\":{\"value\":50}},\"options\":{\"standardOptions\":{}},\"version\":\"2.0.0\",\"type\":\"stat\",\"layout\":{\"h\":1,\"w\":8,\"x\":0,\"y\":0,\"i\":\"1\"}}", "targets": [
"weight": 0
},
{ {
"configs": "{\"targets\":[{\"refId\":\"A\",\"expr\":\"sum(kafka_topic_partitions)\"}],\"name\":\"partitions\",\"custom\":{\"textMode\":\"value\",\"colorMode\":\"value\",\"calc\":\"lastNotNull\",\"colSpan\":1,\"textSize\":{\"value\":50}},\"options\":{\"standardOptions\":{}},\"version\":\"2.0.0\",\"type\":\"stat\",\"layout\":{\"h\":1,\"w\":8,\"x\":16,\"y\":0,\"i\":\"2\"}}", "refId": "A",
"weight": 0 "expr": "kafka_brokers{cluster=\"$cluster\"}"
} }
] ],
"name": "brokers",
"custom": {
"textMode": "value",
"colorMode": "value",
"calc": "lastNotNull",
"colSpan": 1,
"textSize": {
"value": 50
}
},
"options": {
"standardOptions": {}
},
"version": "2.0.0",
"type": "stat",
"layout": {
"h": 3,
"w": 6,
"x": 0,
"y": 1,
"i": "e2c1d271-ec43-4821-aa19-451e856af755",
"isResizable": true
},
"id": "e2c1d271-ec43-4821-aa19-451e856af755"
}, },
{ {
"targets": [
{
"refId": "A",
"expr": "count(count by (topic) (kafka_topic_partitions{cluster=\"$cluster\"}))"
}
],
"name": "topics",
"custom": {
"textMode": "value",
"colorMode": "value",
"calc": "lastNotNull",
"colSpan": 1,
"textSize": {
"value": 50
}
},
"options": {
"standardOptions": {}
},
"version": "2.0.0",
"type": "stat",
"layout": {
"h": 3,
"w": 6,
"x": 6,
"y": 1,
"i": "fd3a0b9f-fd67-4360-a94c-869fee7b5b98",
"isResizable": true
},
"id": "fd3a0b9f-fd67-4360-a94c-869fee7b5b98"
},
{
"targets": [
{
"refId": "A",
"expr": "sum(kafka_topic_partitions{cluster=\"$cluster\"})"
}
],
"name": "partitions",
"custom": {
"textMode": "value",
"colorMode": "value",
"calc": "lastNotNull",
"colSpan": 1,
"textSize": {
"value": 50
}
},
"options": {
"standardOptions": {}
},
"version": "2.0.0",
"type": "stat",
"layout": {
"h": 3,
"w": 6,
"x": 12,
"y": 1,
"i": "e228d857-746b-41b6-8d2d-0152453c46f4",
"isResizable": true
},
"id": "e228d857-746b-41b6-8d2d-0152453c46f4"
},
{
"targets": [
{
"refId": "A",
"expr": "sum(kafka_topic_partition_replicas{cluster=\"$cluster\"})"
}
],
"name": "Replicas",
"custom": {
"textMode": "valueAndName",
"colorMode": "value",
"calc": "lastNotNull",
"colSpan": 1,
"textSize": {}
},
"options": {
"standardOptions": {}
},
"version": "2.0.0",
"type": "stat",
"layout": {
"h": 3,
"w": 6,
"x": 18,
"y": 1,
"i": "85438099-8d6b-4817-b9b9-1d0ed36029cd",
"isResizable": true
},
"id": "85438099-8d6b-4817-b9b9-1d0ed36029cd"
},
{
"id": "0db4aac4-86cf-44cd-950e-6c6a99be8ff4",
"type": "row",
"name": "throughput", "name": "throughput",
"weight": 1, "layout": {
"charts": [ "h": 1,
{ "w": 24,
"configs": "{\"targets\":[{\"expr\":\"sum(rate(kafka_topic_partition_current_offset{instance=\\\"$instance\\\"}[1m])) by (topic)\"}],\"name\":\"Message in per second\",\"options\":{\"tooltip\":{\"mode\":\"all\",\"sort\":\"none\"},\"legend\":{\"displayMode\":\"hidden\"},\"standardOptions\":{},\"thresholds\":{}},\"custom\":{\"drawStyle\":\"lines\",\"lineInterpolation\":\"smooth\",\"fillOpacity\":0.5,\"stack\":\"off\"},\"version\":\"2.0.0\",\"type\":\"timeseries\",\"layout\":{\"h\":2,\"w\":12,\"x\":0,\"y\":0,\"i\":\"0\"}}", "x": 0,
"weight": 0 "y": 4,
"i": "0db4aac4-86cf-44cd-950e-6c6a99be8ff4",
"isResizable": false
},
"collapsed": true
}, },
{ {
"configs": "{\"targets\":[{\"expr\":\"sum(kafka_consumer_lag_millis{instance=\\\"$instance\\\"}) by (consumergroup, topic) \",\"legend\":\"{{consumergroup}} (topic: {{topic}})\"}],\"name\":\"Latency by Consumer Group\",\"options\":{\"tooltip\":{\"mode\":\"all\",\"sort\":\"none\"},\"legend\":{\"displayMode\":\"hidden\"},\"standardOptions\":{\"util\":\"humantimeMilliseconds\"},\"thresholds\":{}},\"custom\":{\"drawStyle\":\"lines\",\"lineInterpolation\":\"smooth\",\"fillOpacity\":0.5,\"stack\":\"off\"},\"version\":\"2.0.0\",\"type\":\"timeseries\",\"layout\":{\"h\":2,\"w\":12,\"x\":0,\"y\":2,\"i\":\"1\"}}", "targets": [
"weight": 0
},
{ {
"configs": "{\"targets\":[{\"expr\":\"sum(rate(kafka_consumergroup_current_offset{instance=\\\"$instance\\\"}[1m])) by (topic)\"}],\"name\":\"Message consume per second\",\"options\":{\"tooltip\":{\"mode\":\"all\",\"sort\":\"none\"},\"legend\":{\"displayMode\":\"hidden\"},\"standardOptions\":{},\"thresholds\":{}},\"custom\":{\"drawStyle\":\"lines\",\"lineInterpolation\":\"smooth\",\"fillOpacity\":0.5,\"stack\":\"off\"},\"version\":\"2.0.0\",\"type\":\"timeseries\",\"layout\":{\"h\":2,\"w\":12,\"x\":12,\"y\":0,\"i\":\"2\"}}", "expr": "sum(rate(kafka_topic_partition_current_offset{cluster=\"$cluster\"}[1m])) by (topic)"
"weight": 0
},
{
"configs": "{\"targets\":[{\"expr\":\"sum(kafka_topic_partition_current_offset{instance=\\\"$instance\\\"}) by (topic) - sum(kafka_consumergroup_current_offset{instance=\\\"$instance\\\"}) by (topic) \",\"legend\":\"{{consumergroup}} (topic: {{topic}})\"}],\"name\":\"Lag by Consumer Group\",\"options\":{\"tooltip\":{\"mode\":\"all\",\"sort\":\"none\"},\"legend\":{\"displayMode\":\"hidden\"},\"standardOptions\":{},\"thresholds\":{}},\"custom\":{\"drawStyle\":\"lines\",\"lineInterpolation\":\"smooth\",\"fillOpacity\":0.5,\"stack\":\"off\"},\"version\":\"2.0.0\",\"type\":\"timeseries\",\"layout\":{\"h\":2,\"w\":12,\"x\":12,\"y\":2,\"i\":\"3\"}}",
"weight": 0
} }
] ],
"name": "Messages produced per second",
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"fillOpacity": 0.5,
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 7,
"w": 8,
"x": 0,
"y": 5,
"i": "c2ec4036-3081-45cc-b672-024c6df93833",
"isResizable": true
},
"id": "c2ec4036-3081-45cc-b672-024c6df93833"
}, },
{ {
"targets": [
{
"expr": "sum(rate(kafka_consumergroup_current_offset{cluster=\"$cluster\"}[1m])) by (topic)"
}
],
"name": "Messages consumed per second",
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"fillOpacity": 0.5,
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 7,
"w": 8,
"x": 8,
"y": 5,
"i": "7ad651a6-c12c-4d46-8d01-749fa776faef",
"isResizable": true
},
"id": "7ad651a6-c12c-4d46-8d01-749fa776faef"
},
{
"targets": [
{
"expr": "sum(kafka_consumer_lag_millis{cluster=\"$cluster\"}) by (consumergroup, topic)",
"legend": "{{consumergroup}} (topic: {{topic}})"
}
],
"name": "Latency by Consumer Group",
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "humantimeMilliseconds"
},
"thresholds": {}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"fillOpacity": 0.5,
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 7,
"w": 8,
"x": 16,
"y": 5,
"i": "855aa8f5-0c51-42d4-b9a4-5460b7cd0f5a",
"isResizable": true
},
"id": "855aa8f5-0c51-42d4-b9a4-5460b7cd0f5a"
},
{
"id": "20166830-7f85-4665-8f39-bf904267af29",
"type": "row",
"name": "patition/replicate", "name": "patition/replicate",
"weight": 2, "layout": {
"charts": [ "h": 1,
{ "w": 24,
"configs": "{\"targets\":[{\"refId\":\"A\",\"expr\":\"kafka_topic_partitions{instance=\\\"$instance\\\"}\",\"legend\":\"{{topic}}\"}],\"name\":\"Partitions per Topic\",\"custom\":{\"showHeader\":true,\"calc\":\"lastNotNull\",\"displayMode\":\"seriesToRows\"},\"options\":{\"standardOptions\":{}},\"overrides\":[{}],\"version\":\"2.0.0\",\"type\":\"table\",\"layout\":{\"h\":2,\"w\":12,\"x\":0,\"y\":0,\"i\":\"0\"}}", "x": 0,
"weight": 0 "y": 12,
"i": "20166830-7f85-4665-8f39-bf904267af29",
"isResizable": false
},
"collapsed": true
}, },
{ {
"configs": "{\"targets\":[{\"refId\":\"A\",\"expr\":\"kafka_topic_partition_under_replicated_partition\",\"legend\":\"{{topic}}-{{partition}}\"}],\"name\":\"Under Replicated\",\"description\":\"副本不同步预案\\n1. Restart the Zookeeper leader.\\n2. Restart the broker\\\\brokers that are not replicating some of the partitions.\",\"custom\":{\"showHeader\":true,\"calc\":\"lastNotNull\",\"displayMode\":\"seriesToRows\"},\"options\":{\"standardOptions\":{}},\"overrides\":[{}],\"version\":\"2.0.0\",\"type\":\"table\",\"layout\":{\"h\":2,\"w\":12,\"x\":12,\"y\":0,\"i\":\"1\"}}", "targets": [
"weight": 0 {
"refId": "A",
"expr": "kafka_topic_partitions{cluster=\"$cluster\"}",
"legend": "{{topic}}"
}
],
"name": "Partitions per Topic",
"custom": {
"showHeader": true,
"colorMode": "value",
"calc": "lastNotNull",
"displayMode": "seriesToRows"
},
"options": {
"standardOptions": {}
},
"overrides": [
{}
],
"version": "2.0.0",
"type": "table",
"layout": {
"h": 7,
"w": 12,
"x": 0,
"y": 13,
"i": "8837a52e-c9eb-4afa-acc1-c3a5dac72d3b",
"isResizable": true
},
"id": "8837a52e-c9eb-4afa-acc1-c3a5dac72d3b"
},
{
"targets": [
{
"refId": "A",
"expr": "kafka_topic_partition_under_replicated_partition{cluster=\"$cluster\"}",
"legend": "{{topic}}-{{partition}}"
}
],
"name": "Partitions Under Replicated",
"description": "副本不同步预案\n1. Restart the Zookeeper leader.\n2. Restart the broker\\brokers that are not replicating some of the partitions.",
"custom": {
"showHeader": true,
"colorMode": "value",
"calc": "lastNotNull",
"displayMode": "seriesToRows"
},
"options": {
"standardOptions": {}
},
"overrides": [
{}
],
"version": "2.0.0",
"type": "table",
"layout": {
"h": 7,
"w": 12,
"x": 12,
"y": 13,
"i": "dd615767-dda7-4da6-b37f-0d484553aac6",
"isResizable": true
},
"id": "dd615767-dda7-4da6-b37f-0d484553aac6"
} }
] ]
} }
]
} }
]