update kafka alerts and dashboard (#1012)

* update kafka alerts and dashboard

* update kafka dashboard

Co-authored-by: yushuangyu <yushuangyu@flashcat.cloud>
This commit is contained in:
ysyneu 2022-07-04 19:56:51 +08:00 committed by GitHub
parent 45945876d8
commit 64a671ae13
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 422 additions and 109 deletions

View File

@ -1,58 +1,72 @@
[
{
"name": "数据有丢失风险-同步副本数小于3",
"note": "",
"severity": 2,
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "sum(kafka_topic_partition_in_sync_replica) by (topic) < 3",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
},
{
"name": "消费能力不足-积压消息数超过50条",
"note": "",
"severity": 2,
"disabled": 0,
"prom_for_duration": 60,
"prom_ql": "sum(kafka_topic_partition_current_offset{instance=\"$instance\"}) by (topic) - sum(kafka_consumergroup_current_offset{instance=\"$instance\"}) by (topic) ",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": []
}
]
{
"name": "数据有丢失风险-同步副本数小于3",
"note": "",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "sum(kafka_topic_partition_in_sync_replica) by (topic) < 3",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"service=kafka"
]
},
{
"name": "消费能力不足-延迟超过5分钟",
"note": "",
"prod": "",
"algorithm": "",
"algo_params": null,
"delay": 0,
"severity": 2,
"disabled": 1,
"prom_for_duration": 60,
"prom_ql": "kafka_consumer_lag_millis / 1000 > 300",
"prom_eval_interval": 15,
"enable_stime": "00:00",
"enable_etime": "23:59",
"enable_days_of_week": [
"1",
"2",
"3",
"4",
"5",
"6",
"0"
],
"enable_in_bg": 0,
"notify_recovered": 1,
"notify_channels": [],
"notify_repeat_step": 60,
"notify_max_number": 0,
"recover_duration": 0,
"callbacks": [],
"runbook_url": "",
"append_tags": [
"service=kafka"
]
}
]

View File

@ -1,63 +1,362 @@
[
{
"name": "Kafka - 模板",
"tags": "Kafka Prometheus ",
"configs": "{\"var\":[{\"name\":\"instance\",\"definition\":\"label_values(kafka_brokers, instance)\"},{\"name\":\"job\",\"definition\":\"label_values(kafka_brokers, job)\"}]}",
"chart_groups": [
{
"name": "overview",
"weight": 0,
"charts": [
{
"name": "Kafka - 模板",
"tags": "Kafka Prometheus",
"configs": {
"var": [
{
"configs": "{\"targets\":[{\"refId\":\"A\",\"expr\":\"count(count by (topic) (kafka_topic_partitions))\"}],\"name\":\"topics\",\"custom\":{\"textMode\":\"value\",\"colorMode\":\"value\",\"calc\":\"lastNotNull\",\"colSpan\":1,\"textSize\":{\"value\":50}},\"options\":{\"standardOptions\":{}},\"version\":\"2.0.0\",\"type\":\"stat\",\"layout\":{\"h\":1,\"w\":8,\"x\":8,\"y\":0,\"i\":\"0\"}}",
"weight": 0
},
{
"configs": "{\"targets\":[{\"refId\":\"A\",\"expr\":\"kafka_brokers\"}],\"name\":\"brokers\",\"custom\":{\"textMode\":\"value\",\"colorMode\":\"value\",\"calc\":\"lastNotNull\",\"colSpan\":1,\"textSize\":{\"value\":50}},\"options\":{\"standardOptions\":{}},\"version\":\"2.0.0\",\"type\":\"stat\",\"layout\":{\"h\":1,\"w\":8,\"x\":0,\"y\":0,\"i\":\"1\"}}",
"weight": 0
},
{
"configs": "{\"targets\":[{\"refId\":\"A\",\"expr\":\"sum(kafka_topic_partitions)\"}],\"name\":\"partitions\",\"custom\":{\"textMode\":\"value\",\"colorMode\":\"value\",\"calc\":\"lastNotNull\",\"colSpan\":1,\"textSize\":{\"value\":50}},\"options\":{\"standardOptions\":{}},\"version\":\"2.0.0\",\"type\":\"stat\",\"layout\":{\"h\":1,\"w\":8,\"x\":16,\"y\":0,\"i\":\"2\"}}",
"weight": 0
"name": "cluster",
"definition": "label_values(kafka_brokers, cluster)",
"type": "query"
}
]
},
{
"name": "throughput",
"weight": 1,
"charts": [
],
"version": "2.0.0",
"panels": [
{
"configs": "{\"targets\":[{\"expr\":\"sum(rate(kafka_topic_partition_current_offset{instance=\\\"$instance\\\"}[1m])) by (topic)\"}],\"name\":\"Message in per second\",\"options\":{\"tooltip\":{\"mode\":\"all\",\"sort\":\"none\"},\"legend\":{\"displayMode\":\"hidden\"},\"standardOptions\":{},\"thresholds\":{}},\"custom\":{\"drawStyle\":\"lines\",\"lineInterpolation\":\"smooth\",\"fillOpacity\":0.5,\"stack\":\"off\"},\"version\":\"2.0.0\",\"type\":\"timeseries\",\"layout\":{\"h\":2,\"w\":12,\"x\":0,\"y\":0,\"i\":\"0\"}}",
"weight": 0
"id": "51502c3a-dd6f-41c7-b8f1-87b88826c96e",
"type": "row",
"name": "overview",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 0,
"i": "51502c3a-dd6f-41c7-b8f1-87b88826c96e",
"isResizable": false
},
"collapsed": true
},
{
"configs": "{\"targets\":[{\"expr\":\"sum(kafka_consumer_lag_millis{instance=\\\"$instance\\\"}) by (consumergroup, topic) \",\"legend\":\"{{consumergroup}} (topic: {{topic}})\"}],\"name\":\"Latency by Consumer Group\",\"options\":{\"tooltip\":{\"mode\":\"all\",\"sort\":\"none\"},\"legend\":{\"displayMode\":\"hidden\"},\"standardOptions\":{\"util\":\"humantimeMilliseconds\"},\"thresholds\":{}},\"custom\":{\"drawStyle\":\"lines\",\"lineInterpolation\":\"smooth\",\"fillOpacity\":0.5,\"stack\":\"off\"},\"version\":\"2.0.0\",\"type\":\"timeseries\",\"layout\":{\"h\":2,\"w\":12,\"x\":0,\"y\":2,\"i\":\"1\"}}",
"weight": 0
"targets": [
{
"refId": "A",
"expr": "kafka_brokers{cluster=\"$cluster\"}"
}
],
"name": "brokers",
"custom": {
"textMode": "value",
"colorMode": "value",
"calc": "lastNotNull",
"colSpan": 1,
"textSize": {
"value": 50
}
},
"options": {
"standardOptions": {}
},
"version": "2.0.0",
"type": "stat",
"layout": {
"h": 3,
"w": 6,
"x": 0,
"y": 1,
"i": "e2c1d271-ec43-4821-aa19-451e856af755",
"isResizable": true
},
"id": "e2c1d271-ec43-4821-aa19-451e856af755"
},
{
"configs": "{\"targets\":[{\"expr\":\"sum(rate(kafka_consumergroup_current_offset{instance=\\\"$instance\\\"}[1m])) by (topic)\"}],\"name\":\"Message consume per second\",\"options\":{\"tooltip\":{\"mode\":\"all\",\"sort\":\"none\"},\"legend\":{\"displayMode\":\"hidden\"},\"standardOptions\":{},\"thresholds\":{}},\"custom\":{\"drawStyle\":\"lines\",\"lineInterpolation\":\"smooth\",\"fillOpacity\":0.5,\"stack\":\"off\"},\"version\":\"2.0.0\",\"type\":\"timeseries\",\"layout\":{\"h\":2,\"w\":12,\"x\":12,\"y\":0,\"i\":\"2\"}}",
"weight": 0
"targets": [
{
"refId": "A",
"expr": "count(count by (topic) (kafka_topic_partitions{cluster=\"$cluster\"}))"
}
],
"name": "topics",
"custom": {
"textMode": "value",
"colorMode": "value",
"calc": "lastNotNull",
"colSpan": 1,
"textSize": {
"value": 50
}
},
"options": {
"standardOptions": {}
},
"version": "2.0.0",
"type": "stat",
"layout": {
"h": 3,
"w": 6,
"x": 6,
"y": 1,
"i": "fd3a0b9f-fd67-4360-a94c-869fee7b5b98",
"isResizable": true
},
"id": "fd3a0b9f-fd67-4360-a94c-869fee7b5b98"
},
{
"configs": "{\"targets\":[{\"expr\":\"sum(kafka_topic_partition_current_offset{instance=\\\"$instance\\\"}) by (topic) - sum(kafka_consumergroup_current_offset{instance=\\\"$instance\\\"}) by (topic) \",\"legend\":\"{{consumergroup}} (topic: {{topic}})\"}],\"name\":\"Lag by Consumer Group\",\"options\":{\"tooltip\":{\"mode\":\"all\",\"sort\":\"none\"},\"legend\":{\"displayMode\":\"hidden\"},\"standardOptions\":{},\"thresholds\":{}},\"custom\":{\"drawStyle\":\"lines\",\"lineInterpolation\":\"smooth\",\"fillOpacity\":0.5,\"stack\":\"off\"},\"version\":\"2.0.0\",\"type\":\"timeseries\",\"layout\":{\"h\":2,\"w\":12,\"x\":12,\"y\":2,\"i\":\"3\"}}",
"weight": 0
"targets": [
{
"refId": "A",
"expr": "sum(kafka_topic_partitions{cluster=\"$cluster\"})"
}
],
"name": "partitions",
"custom": {
"textMode": "value",
"colorMode": "value",
"calc": "lastNotNull",
"colSpan": 1,
"textSize": {
"value": 50
}
},
"options": {
"standardOptions": {}
},
"version": "2.0.0",
"type": "stat",
"layout": {
"h": 3,
"w": 6,
"x": 12,
"y": 1,
"i": "e228d857-746b-41b6-8d2d-0152453c46f4",
"isResizable": true
},
"id": "e228d857-746b-41b6-8d2d-0152453c46f4"
},
{
"targets": [
{
"refId": "A",
"expr": "sum(kafka_topic_partition_replicas{cluster=\"$cluster\"})"
}
],
"name": "Replicas",
"custom": {
"textMode": "valueAndName",
"colorMode": "value",
"calc": "lastNotNull",
"colSpan": 1,
"textSize": {}
},
"options": {
"standardOptions": {}
},
"version": "2.0.0",
"type": "stat",
"layout": {
"h": 3,
"w": 6,
"x": 18,
"y": 1,
"i": "85438099-8d6b-4817-b9b9-1d0ed36029cd",
"isResizable": true
},
"id": "85438099-8d6b-4817-b9b9-1d0ed36029cd"
},
{
"id": "0db4aac4-86cf-44cd-950e-6c6a99be8ff4",
"type": "row",
"name": "throughput",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 4,
"i": "0db4aac4-86cf-44cd-950e-6c6a99be8ff4",
"isResizable": false
},
"collapsed": true
},
{
"targets": [
{
"expr": "sum(rate(kafka_topic_partition_current_offset{cluster=\"$cluster\"}[1m])) by (topic)"
}
],
"name": "Messages produced per second",
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"fillOpacity": 0.5,
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 7,
"w": 8,
"x": 0,
"y": 5,
"i": "c2ec4036-3081-45cc-b672-024c6df93833",
"isResizable": true
},
"id": "c2ec4036-3081-45cc-b672-024c6df93833"
},
{
"targets": [
{
"expr": "sum(rate(kafka_consumergroup_current_offset{cluster=\"$cluster\"}[1m])) by (topic)"
}
],
"name": "Messages consumed per second",
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {},
"thresholds": {}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"fillOpacity": 0.5,
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 7,
"w": 8,
"x": 8,
"y": 5,
"i": "7ad651a6-c12c-4d46-8d01-749fa776faef",
"isResizable": true
},
"id": "7ad651a6-c12c-4d46-8d01-749fa776faef"
},
{
"targets": [
{
"expr": "sum(kafka_consumer_lag_millis{cluster=\"$cluster\"}) by (consumergroup, topic)",
"legend": "{{consumergroup}} (topic: {{topic}})"
}
],
"name": "Latency by Consumer Group",
"options": {
"tooltip": {
"mode": "all",
"sort": "none"
},
"legend": {
"displayMode": "hidden"
},
"standardOptions": {
"util": "humantimeMilliseconds"
},
"thresholds": {}
},
"custom": {
"drawStyle": "lines",
"lineInterpolation": "smooth",
"fillOpacity": 0.5,
"stack": "off"
},
"version": "2.0.0",
"type": "timeseries",
"layout": {
"h": 7,
"w": 8,
"x": 16,
"y": 5,
"i": "855aa8f5-0c51-42d4-b9a4-5460b7cd0f5a",
"isResizable": true
},
"id": "855aa8f5-0c51-42d4-b9a4-5460b7cd0f5a"
},
{
"id": "20166830-7f85-4665-8f39-bf904267af29",
"type": "row",
"name": "partition/replicate",
"layout": {
"h": 1,
"w": 24,
"x": 0,
"y": 12,
"i": "20166830-7f85-4665-8f39-bf904267af29",
"isResizable": false
},
"collapsed": true
},
{
"targets": [
{
"refId": "A",
"expr": "kafka_topic_partitions{cluster=\"$cluster\"}",
"legend": "{{topic}}"
}
],
"name": "Partitions per Topic",
"custom": {
"showHeader": true,
"colorMode": "value",
"calc": "lastNotNull",
"displayMode": "seriesToRows"
},
"options": {
"standardOptions": {}
},
"overrides": [
{}
],
"version": "2.0.0",
"type": "table",
"layout": {
"h": 7,
"w": 12,
"x": 0,
"y": 13,
"i": "8837a52e-c9eb-4afa-acc1-c3a5dac72d3b",
"isResizable": true
},
"id": "8837a52e-c9eb-4afa-acc1-c3a5dac72d3b"
},
{
"targets": [
{
"refId": "A",
"expr": "kafka_topic_partition_under_replicated_partition{cluster=\"$cluster\"}",
"legend": "{{topic}}-{{partition}}"
}
],
"name": "Partitions Under Replicated",
"description": "副本不同步预案\n1. Restart the Zookeeper leader.\n2. Restart the broker/brokers that are not replicating some of the partitions.",
"custom": {
"showHeader": true,
"colorMode": "value",
"calc": "lastNotNull",
"displayMode": "seriesToRows"
},
"options": {
"standardOptions": {}
},
"overrides": [
{}
],
"version": "2.0.0",
"type": "table",
"layout": {
"h": 7,
"w": 12,
"x": 12,
"y": 13,
"i": "dd615767-dda7-4da6-b37f-0d484553aac6",
"isResizable": true
},
"id": "dd615767-dda7-4da6-b37f-0d484553aac6"
}
]
},
{
"name": "partition/replicate",
"weight": 2,
"charts": [
{
"configs": "{\"targets\":[{\"refId\":\"A\",\"expr\":\"kafka_topic_partitions{instance=\\\"$instance\\\"}\",\"legend\":\"{{topic}}\"}],\"name\":\"Partitions per Topic\",\"custom\":{\"showHeader\":true,\"calc\":\"lastNotNull\",\"displayMode\":\"seriesToRows\"},\"options\":{\"standardOptions\":{}},\"overrides\":[{}],\"version\":\"2.0.0\",\"type\":\"table\",\"layout\":{\"h\":2,\"w\":12,\"x\":0,\"y\":0,\"i\":\"0\"}}",
"weight": 0
},
{
"configs": "{\"targets\":[{\"refId\":\"A\",\"expr\":\"kafka_topic_partition_under_replicated_partition\",\"legend\":\"{{topic}}-{{partition}}\"}],\"name\":\"Under Replicated\",\"description\":\"副本不同步预案\\n1. Restart the Zookeeper leader.\\n2. Restart the broker\\\\brokers that are not replicating some of the partitions.\",\"custom\":{\"showHeader\":true,\"calc\":\"lastNotNull\",\"displayMode\":\"seriesToRows\"},\"options\":{\"standardOptions\":{}},\"overrides\":[{}],\"version\":\"2.0.0\",\"type\":\"table\",\"layout\":{\"h\":2,\"w\":12,\"x\":12,\"y\":0,\"i\":\"1\"}}",
"weight": 0
}
]
}
]
]
}
]
}