grafana: differentiate prometheus disk io alerts by host task and disk type

Differentiate by host task (hypervisor or not) and disk type (hard disk
or not), rather than by whether the host is physical or virtual and then
by disk type.
This is in line with the disk rate alerts changes and allows for
fine-grained adjustments based on the host task type, which actually
matters for these alerts.
This commit is contained in:
June 2025-02-06 01:13:10 +01:00
parent 9e77a41e3c
commit ee66631c2d
Signed by: june
SSH key fingerprint: SHA256:o9EAq4Y9N9K0pBQeBTqhSDrND5E7oB+60ZNx0U1yPe0

View file

@ -230,30 +230,35 @@ groups:
annotations:
summary: Host unusual disk write latency (instance {{ $labels.instance }})
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}"
# Have different disk IO alerts for VMs and physical machines and for physical machines different ones for hard and other disks.
- alert: PhysicalHostUnusualHardDiskIo
expr: (rate(node_disk_io_time_seconds_total{device=~"s.+"}[1m]) > 0.75) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
# General unusual disk io alerts.
# Excluding: hypervisor hosts
- alert: HostUnusualDiskIo
expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+", nodename!="chaosknoten"}
for: 5m
labels:
severity: warning
annotations:
summary: Physical host unusual hard disk IO (instance {{ $labels.instance }})
summary: Host unusual disk IO (instance {{ $labels.instance }})
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}"
- alert: PhysicalHostUnusualOtherDiskIo
expr: (rate(node_disk_io_time_seconds_total{device!~"s.+"}[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
for: 5m
# Hypervisor host unusual hard disk io alerts.
# Since hard disks on the hypervisor can easily have their IO saturated by hypervisor tasks (backups, moving VMs, etc.), alert when the IO is above the regular threshold for a very long time.
- alert: HypervisorHostUnusualHardDiskIo
expr: (rate(node_disk_io_time_seconds_total{device=~"s.+"}[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename="chaosknoten"}
for: 50m
labels:
severity: warning
annotations:
summary: Physical host unusual other (non-hard) disk IO (instance {{ $labels.instance }})
summary: Hypervisor host unusual hard disk IO (instance {{ $labels.instance }})
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}"
- alert: VirtualHostUnusualDiskIo
expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"}
# Hypervisor host unusual other (non-hard) disk io alerts.
# This is the same as the regular unusual disk io alerts.
- alert: HypervisorHostUnusualOtherDiskIo
expr: (rate(node_disk_io_time_seconds_total{device!~"s.+"}[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename="chaosknoten"}
for: 5m
labels:
severity: warning
annotations:
summary: Virtual host unusual disk IO (instance {{ $labels.instance }})
summary: Hypervisor host unusual other (non-hard) disk IO (instance {{ $labels.instance }})
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}"
- alert: HostRaidArrayGotInactive
expr: (node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}