grafana: differentiate prometheus disk io alerts by host task and disk type

Differentiate by host task type (hypervisor or not) and disk type (hard disk or not), instead of first by whether the host is physical or virtual and then by disk type. This is in line with the disk rate alert changes and allows for fine-grained adjustments based on the host task type, which is what actually matters for these alerts.
2025-02-06 01:13:10 +01:00 · 2025-02-06 01:13:10 +01:00 · ee66631c2d
commit ee66631c2d
parent 9e77a41e3c
1 changed files with 16 additions and 11 deletions
--- a/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml
+++ b/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml
@ -230,30 +230,35 @@ groups:
        annotations:
          summary: Host unusual disk write latency (instance {{ $labels.instance }})
          description: "Disk latency is growing (write operations > 100ms)\n  VALUE = {{ $value }}"
-      # Have different disk IO alerts for VMs and physical machines and for physical machines different ones for hard and other disks.
-      - alert: PhysicalHostUnusualHardDiskIo
-        expr: (rate(node_disk_io_time_seconds_total{device=~"s.+"}[1m]) > 0.75) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
+      # General unusual disk io alerts.
+      # Excluding: hypervisor hosts
+      - alert: HostUnusualDiskIo
+        expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+", nodename!="chaosknoten"}
        for: 5m
        labels:
          severity: warning
        annotations:
-          summary: Physical host unusual hard disk IO (instance {{ $labels.instance }})
+          summary: Host unusual disk IO (instance {{ $labels.instance }})
          description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n  VALUE = {{ $value }}"
-      - alert: PhysicalHostUnusualOtherDiskIo
-        expr: (rate(node_disk_io_time_seconds_total{device!~"s.+"}[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
-        for: 5m
+      # Hypervisor host unusual hard disk io alerts.
+      # Since hard disks on the hypervisor can easily have their IO saturated by hypervisor tasks (backups, moving VMs, etc.), alert when the IO is above the regular threshold for a very long time.
+      - alert: HypervisorHostUnusualHardDiskIo
+        expr: (rate(node_disk_io_time_seconds_total{device=~"s.+"}[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename="chaosknoten"}
+        for: 50m
        labels:
          severity: warning
        annotations:
-          summary: Physical host unusual other (non-hard) disk IO (instance {{ $labels.instance }})
+          summary: Hypervisor host unusual hard disk IO (instance {{ $labels.instance }})
          description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n  VALUE = {{ $value }}"
-      - alert: VirtualHostUnusualDiskIo
-        expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"}
+      # Hypervisor host unusual other (non-hard) disk io alerts.
+      # This is the same as the regular unusual disk io alerts.
+      - alert: HypervisorHostUnusualOtherDiskIo
+        expr: (rate(node_disk_io_time_seconds_total{device!~"s.+"}[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename="chaosknoten"}
        for: 5m
        labels:
          severity: warning
        annotations:
-          summary: Virtual host unusual disk IO (instance {{ $labels.instance }})
+          summary: Hypervisor host unusual other (non-hard) disk IO (instance {{ $labels.instance }})
          description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n  VALUE = {{ $value }}"
      - alert: HostRaidArrayGotInactive
        expr: (node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}