custom alerts for CI VMs

It's expected for some VMs to have high read/write rates for some time,
so this adds custom alerts for our CI VMs.
This commit is contained in:
christian 2024-11-10 17:06:41 +01:00
parent 3284fae62a
commit a386f9e2eb
Signed by: c6ristian
SSH key fingerprint: SHA256:B3m+yzpaxGXSEcDBpPHfvza/DNC0wuX+CKMeGq8wgak

View file

@ -46,7 +46,7 @@ groups:
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}" description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}"
# Have different disk read and write rate alerts for VMs and physical machines. # Have different disk read and write rate alerts for VMs and physical machines.
- alert: VirtualHostUnusualDiskReadRate - alert: VirtualHostUnusualDiskReadRate
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{ype="virtual_machine", nodename=~".+"} expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{ype="virtual_machine", nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker"}
for: 5m for: 5m
labels: labels:
severity: warning severity: warning
@ -54,13 +54,30 @@ groups:
summary: Virtual host unusual disk read rate (instance {{ $labels.instance }}) summary: Virtual host unusual disk read rate (instance {{ $labels.instance }})
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}" description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}"
- alert: VirtualHostUnusualDiskWriteRate - alert: VirtualHostUnusualDiskWriteRate
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"} expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker"}
for: 2m for: 2m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Virtual host unusual disk write rate (instance {{ $labels.instance }}) summary: Virtual host unusual disk write rate (instance {{ $labels.instance }})
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}" description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}"
# Some VMs are expected to have high read/write rates, e.g. CI servers.
# Disk read-rate alert for the CI VMs (forgejo-actions-runner, woodpecker).
# Same 50 MB/s threshold as the generic virtual-machine rule, but the rate
# must stay high for 10m before firing, because CI jobs legitimately read
# a lot for a while.
- alert: VirtualHostUnusualDiskReadRate
  # Per-instance read throughput in MiB/s, joined with node_uname_info to
  # attach the nodename label and restrict the result to the CI VMs.
  # Label matchers are ANDed, so the hosts must be listed in ONE regex
  # matcher; two `nodename="..."` equality matchers with different values
  # can never match any series. The label key is `type` (not `ype`).
  expr: >-
    (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50)
    * on(instance) group_left (nodename)
    node_uname_info{type="virtual_machine", nodename=~"forgejo-actions-runner|woodpecker"}
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: Virtual host unusual disk read rate for 10 min (instance {{ $labels.instance }})
    description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}"
# Disk write-rate alert for the CI VMs (forgejo-actions-runner, woodpecker).
# Same 50 MB/s threshold as the generic virtual-machine rule, but the rate
# must stay high for 4m before firing, because CI jobs legitimately write
# a lot for a while.
- alert: VirtualHostUnusualDiskWriteRate
  # Per-instance write throughput in MiB/s, joined with node_uname_info to
  # attach the nodename label and restrict the result to the CI VMs.
  # Label matchers are ANDed, so the hosts must be listed in ONE regex
  # matcher; two `nodename="..."` equality matchers with different values
  # can never match any series.
  expr: >-
    (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50)
    * on(instance) group_left (nodename)
    node_uname_info{type="virtual_machine", nodename=~"forgejo-actions-runner|woodpecker"}
  for: 4m
  labels:
    severity: warning
  annotations:
    summary: Virtual host unusual disk write rate for 4 min (instance {{ $labels.instance }})
    description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}"
- alert: PhysicalHostUnusualDiskReadRate - alert: PhysicalHostUnusualDiskReadRate
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"} expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
for: 20m for: 20m