From ee66631c2d348ba7e924353f7314c90dc8195a0b Mon Sep 17 00:00:00 2001 From: June Date: Thu, 6 Feb 2025 01:13:10 +0100 Subject: [PATCH] grafana: diff. prometheus disk io alerts by host task and disk type Differentiate by host task (hypervisor or not) and disk (hard disk or not) type, not by whether or not the host is physical or virtual and then by disk type. This is in line with the disk rate alerts changes and allows for fine-grained adjustments based on the host task type, which actually matters for these alerts. --- .../prometheus_alerts.rules.yaml | 27 +++++++++++-------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml b/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml index 8cfd99a..8c8f374 100644 --- a/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml +++ b/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml @@ -230,30 +230,35 @@ groups: annotations: summary: Host unusual disk write latency (instance {{ $labels.instance }}) description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}" - # Have different disk IO alerts for VMs and physical machines and for physical machines different ones for hard and other disks. - - alert: PhysicalHostUnusualHardDiskIo - expr: (rate(node_disk_io_time_seconds_total{device=~"s.+"}[1m]) > 0.75) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"} + # General unusual disk io alerts. 
+ # Excluding: hypervisor hosts + - alert: HostUnusualDiskIo + expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+", nodename!="chaosknoten"} for: 5m labels: severity: warning annotations: - summary: Physical host unusual hard disk IO (instance {{ $labels.instance }}) + summary: Host unusual disk IO (instance {{ $labels.instance }}) description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}" - - alert: PhysicalHostUnusualOtherDiskIo - expr: (rate(node_disk_io_time_seconds_total{device!~"s.+"}[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"} - for: 5m + # Hypervisor host unusual hard disk io alerts. + # Since hard disks on the hypervisor can easily have their IO saturated by hypervisor tasks (backups, moving VMs, etc.), alert when the IO is above the regular threshold for a very long time. + - alert: HypervisorHostUnusualHardDiskIo + expr: (rate(node_disk_io_time_seconds_total{device=~"s.+"}[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename="chaosknoten"} + for: 50m labels: severity: warning annotations: - summary: Physical host unusual other (non-hard) disk IO (instance {{ $labels.instance }}) + summary: Hypervisor host unusual hard disk IO (instance {{ $labels.instance }}) description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}" - - alert: VirtualHostUnusualDiskIo - expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"} + # Hypervisor host unusual other (non-hard) disk io alerts. + # This is the same as the regular unusual disk io alerts. 
+ - alert: HypervisorHostUnusualOtherDiskIo + expr: (rate(node_disk_io_time_seconds_total{device!~"s.+"}[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename="chaosknoten"} for: 5m labels: severity: warning annotations: - summary: Virtual host unusual disk IO (instance {{ $labels.instance }}) + summary: Hypervisor host unusual other (non-hard) disk IO (instance {{ $labels.instance }}) description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}" - alert: HostRaidArrayGotInactive expr: (node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}