ConsulManager/flask-consul/units/gen_config.py

149 lines
5.7 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from config import consul_token,consul_url
def ecs_config(services_list,ostype_list):
consul_server = consul_url.split("/")[2]
job_dict = {'linux':'node_exporter','windows':'windows_exporter'}
configs = ''
for ostype in ostype_list:
job_name = job_dict[ostype]
config_str = f"""
- job_name: {job_name}
scrape_interval: 15s
scrape_timeout: 5s
consul_sd_configs:
- server: '{consul_server}'
token: '{consul_token}'
refresh_interval: 30s
services: {services_list}
tags: ['{ostype}']
relabel_configs:
- source_labels: ['__meta_consul_service']
target_label: cservice
- source_labels: ['__meta_consul_service_metadata_vendor']
target_label: vendor
- source_labels: ['__meta_consul_service_metadata_region']
target_label: region
- source_labels: ['__meta_consul_service_metadata_group']
target_label: group
- source_labels: ['__meta_consul_service_metadata_account']
target_label: account
- source_labels: ['__meta_consul_service_metadata_name']
target_label: name
- source_labels: ['__meta_consul_service_metadata_iid']
target_label: iid
- source_labels: ['__meta_consul_service_metadata_exp']
target_label: exp
- source_labels: ['__meta_consul_service_metadata_instance']
target_label: instance
- source_labels: [instance]
target_label: __address__
"""
configs = configs + config_str
return {'code': 20000,'configs': configs }
def get_rules():
rules = """
groups:
- name: node_usage_record_rules
interval: 1m
rules:
- record: cpu:usage:rate1m
expr: (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[1m])) by (instance,vendor,account,group,name)) * 100
- record: mem:usage:rate1m
expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100
- name: node-exporter
rules:
- alert: 内存使用率
expr: 100 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90
for: 5m
labels:
alertype: system
severity: warning
annotations:
description: "{{ $labels.name }}:内存使用率{{ $value | humanize }}%\\n> {{ $labels.group }}-{{ $labels.instance }}"
- alert: CPU使用率
expr: 100 - (avg by(instance,name,group,account) (irate(node_cpu_seconds_total[5m])) * 100) > 92
for: 5m
labels:
alertype: system
severity: warning
annotations:
description: "{{ $labels.name }}CPU使用率{{ $value | humanize }}%\\n> {{ $labels.group }}-{{ $labels.instance }}"
- alert: 系统负载
expr: node_load5 / on (instance,name,group,account) sum(count(node_cpu_seconds_total{mode='system'}) by (cpu,instance,name,group,account)) by(instance,name,group,account) > 1.7
for: 10m
labels:
alertype: system
severity: warning
annotations:
description: "{{ $labels.name }}:系统负载{{ $value | humanize }}倍\\n> {{ $labels.group }}-{{ $labels.instance }}"
- alert: 磁盘使用率
expr: |
100 - (node_filesystem_avail_bytes/node_filesystem_size_bytes{fstype=~"ext.?|xfs",mountpoint!~".*pods.*|/var/lib/docker/devicemapper/mnt/.*"} * 100) > 85
for: 5m
labels:
alertype: system
severity: warning
annotations:
description: "{{ $labels.name }}_{{ $labels.mountpoint }}:磁盘使用率{{ $value | humanize }}%\\n> {{ $labels.group }}-{{ $labels.instance }}"
- alert: K8S节点POD磁盘使用率
expr: 100 - (node_filesystem_avail_bytes/node_filesystem_size_bytes{mountpoint=~"/var/lib/docker/devicemapper/mnt/.*"} * 100) > 85
for: 5m
labels:
alertype: system
severity: warning
annotations:
description: "{{ $labels.name }}_{{ $labels.mountpoint }}:磁盘使用率{{ $value | humanize }}%\\n> {{ $labels.group }}-{{ $labels.instance }}"
- alert: NFS磁盘使用率
expr: 100 - (node_filesystem_avail_bytes/node_filesystem_size_bytes{fstype="nfs"} * 100) > 90
for: 5m
labels:
alertype: system
severity: warning
annotations:
description: "{{ $labels.name }}_{{ $labels.mountpoint }}:磁盘使用率{{ $value | humanize }}%\\n> {{ $labels.group }}-{{ $labels.instance }}"
- alert: 磁盘读写容量
expr: (irate(node_disk_read_bytes_total[5m]) ) /1024 /1024 > 80 or (irate(node_disk_written_bytes_total[5m]) ) /1024 /1024 > 80
for: 8m
labels:
alertype: disk
severity: warning
annotations:
description: "{{ $labels.name }}_{{ $labels.device }}当前IO为{{ $value | humanize }}MB/s\\n> {{ $labels.group }}-{{ $labels.instance }}"
- alert: 网络流入(下载)数据过多
expr: sum by(device,instance, name, group, account) (irate(node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo.*|cni.*'}[5m])) / 1024 / 1024 > 70
for: 5m
labels:
alertype: network
severity: warning
annotations:
description: "{{ $labels.name }}:流入数据为{{ $value | humanize }}MB/s\\n> {{ $labels.group }}-{{ $labels.instance }}"
- alert: 网络流出(上传)数据过多
expr: sum by(device,instance, name, group, account) (irate(node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo.*|cni.*'}[5m])) / 1024 / 1024 > 70
for: 5m
labels:
alertype: network
severity: warning
annotations:
description: "{{ $labels.name }}:流出数据为{{ $value | humanize }}MB/s\\n> {{ $labels.group }}-{{ $labels.instance }}"
- name: Itself
rules:
- alert: Exporter状态
expr: up == 0
for: 3m
labels:
alertype: itself
severity: critical
annotations:
description: "{{ $labels.job }}:异常\\n> {{ $labels.group }}-{{ $labels.name }}-{{ $labels.instance }}"
"""
return {"code": 20000, "rules": rules}