custom alerts for CI VMs
It's expected for some VMs to have high read/write rates for some time, so this adds custom alerts for our CI VMs.
This commit is contained in:
parent
3284fae62a
commit
a386f9e2eb
|
@ -46,7 +46,7 @@ groups:
|
||||||
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}"
|
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}"
|
||||||
# Have different disk read and write rate alerts for VMs and physical machines.
|
# Have different disk read and write rate alerts for VMs and physical machines.
|
||||||
- alert: VirtualHostUnusualDiskReadRate
|
- alert: VirtualHostUnusualDiskReadRate
|
||||||
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{ype="virtual_machine", nodename=~".+"}
|
# Per-instance disk read throughput in MiB/s over 2m, joined with node_uname_info to keep only
# virtual machines; the CI VMs (forgejo-actions-runner, woodpecker) are excluded here.
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker"}
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
@ -54,13 +54,30 @@ groups:
|
||||||
summary: Virtual host unusual disk read rate (instance {{ $labels.instance }})
|
summary: Virtual host unusual disk read rate (instance {{ $labels.instance }})
|
||||||
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}"
|
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}"
|
||||||
- alert: VirtualHostUnusualDiskWriteRate
|
- alert: VirtualHostUnusualDiskWriteRate
|
||||||
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"}
|
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker"}
|
||||||
for: 2m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Virtual host unusual disk write rate (instance {{ $labels.instance }})
|
summary: Virtual host unusual disk write rate (instance {{ $labels.instance }})
|
||||||
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}"
|
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}"
|
||||||
|
# Some VMs are expected to have high read/write rates, e.g. CI servers.
|
||||||
|
- alert: VirtualHostUnusualDiskReadRate
|
||||||
|
# Per-instance disk read throughput in MiB/s over 2m, restricted to the CI VMs.
# A single regex matcher is required: two equality matchers on the same label are ANDed
# and can never both be true, so the selector would match nothing.
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~"forgejo-actions-runner|woodpecker"}
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Virtual host unusual disk read rate for 10 min (instance {{ $labels.instance }})
|
||||||
|
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}"
|
||||||
|
- alert: VirtualHostUnusualDiskWriteRate
|
||||||
|
# Per-instance disk write throughput in MiB/s over 2m, restricted to the CI VMs.
# A single regex matcher is required: two equality matchers on the same label are ANDed
# and can never both be true, so the selector would match nothing.
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~"forgejo-actions-runner|woodpecker"}
|
||||||
|
for: 4m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Virtual host unusual disk write rate for 4 min (instance {{ $labels.instance }})
|
||||||
|
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}"
|
||||||
- alert: PhysicalHostUnusualDiskReadRate
|
- alert: PhysicalHostUnusualDiskReadRate
|
||||||
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
|
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
|
||||||
for: 20m
|
for: 20m
|
||||||
|
|
Loading…
Reference in a new issue