Make Prometheus Alert Rules Work Better and Organize Them Better #19

Merged
june merged 5 commits from adjust_prometheus_alert_rules into main 2025-02-07 13:57:48 +01:00
Showing only changes of commit ee66631c2d - Show all commits

View file

@ -230,30 +230,35 @@ groups:
annotations:
summary: Host unusual disk write latency (instance {{ $labels.instance }})
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}"
# Have different disk IO alerts for VMs and physical machines and for physical machines different ones for hard and other disks.
- alert: PhysicalHostUnusualHardDiskIo
expr: (rate(node_disk_io_time_seconds_total{device=~"s.+"}[1m]) > 0.75) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
# General unusual disk io alerts.
# Excluding: hypervisor hosts
- alert: HostUnusualDiskIo
expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+", nodename!="chaosknoten"}
for: 5m
labels:
severity: warning
annotations:
summary: Physical host unusual hard disk IO (instance {{ $labels.instance }})
summary: Host unusual disk IO (instance {{ $labels.instance }})
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}"
- alert: PhysicalHostUnusualOtherDiskIo
expr: (rate(node_disk_io_time_seconds_total{device!~"s.+"}[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
for: 5m
# Hypervisor host unusual hard disk io alerts.
# Since hard disks on the hypervisor can easily have their IO saturated by hypervisor tasks (backups, moving VMs, etc.), alert when the IO is above the regular threshold for a very long time.
- alert: HypervisorHostUnusualHardDiskIo
expr: (rate(node_disk_io_time_seconds_total{device=~"s.+"}[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename="chaosknoten"}
for: 50m
labels:
severity: warning
annotations:
summary: Physical host unusual other (non-hard) disk IO (instance {{ $labels.instance }})
summary: Hypervisor host unusual hard disk IO (instance {{ $labels.instance }})
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}"
- alert: VirtualHostUnusualDiskIo
expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"}
# Hypervisor host unusual other (non-hard) disk io alerts.
# This is the same as the regular unsual disk io alerts.
- alert: HypervisorHostUnusualOtherDiskIo
expr: (rate(node_disk_io_time_seconds_total{device!~"s.+"}[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename="chaosknoten"}
for: 5m
labels:
severity: warning
annotations:
summary: Virtual host unusual disk IO (instance {{ $labels.instance }})
summary: Hypervisor host unusual other (non-hard) disk IO (instance {{ $labels.instance }})
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}"
- alert: HostRaidArrayGotInactive
expr: (node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}