From ee66631c2d348ba7e924353f7314c90dc8195a0b Mon Sep 17 00:00:00 2001
From: June <june@jsts.xyz>
Date: Thu, 6 Feb 2025 01:13:10 +0100
Subject: [PATCH] grafana: diff. prometheus disk io alerts by host task and
 disk type

Differentiate by host task (hypervisor or not) and disk type (hard disk
or not), not by whether the host is physical or virtual and then by disk
type.
This is in line with the disk rate alerts changes and allows for
fine-grained adjustments based on the host task type, which actually
matters for these alerts.
---
 .../prometheus_alerts.rules.yaml              | 27 +++++++++++--------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml b/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml
index 8cfd99a..8c8f374 100644
--- a/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml
+++ b/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml
@@ -230,30 +230,35 @@ groups:
         annotations:
           summary: Host unusual disk write latency (instance {{ $labels.instance }})
           description: "Disk latency is growing (write operations > 100ms)\n  VALUE = {{ $value }}"
-      # Have different disk IO alerts for VMs and physical machines and for physical machines different ones for hard and other disks.
-      - alert: PhysicalHostUnusualHardDiskIo
-        expr: (rate(node_disk_io_time_seconds_total{device=~"s.+"}[1m]) > 0.75) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
+      # General unusual disk io alerts.
+      # Excluding: hypervisor hosts
+      - alert: HostUnusualDiskIo
+        expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+", nodename!="chaosknoten"}
         for: 5m
         labels:
           severity: warning
         annotations:
-          summary: Physical host unusual hard disk IO (instance {{ $labels.instance }})
+          summary: Host unusual disk IO (instance {{ $labels.instance }})
           description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n  VALUE = {{ $value }}"
-      - alert: PhysicalHostUnusualOtherDiskIo
-        expr: (rate(node_disk_io_time_seconds_total{device!~"s.+"}[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
-        for: 5m
+      # Hypervisor host unusual hard disk io alerts.
+      # Since hard disks on the hypervisor can easily have their IO saturated by hypervisor tasks (backups, moving VMs, etc.), alert when the IO is above the regular threshold for a very long time.
+      - alert: HypervisorHostUnusualHardDiskIo
+        expr: (rate(node_disk_io_time_seconds_total{device=~"s.+"}[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename="chaosknoten"}
+        for: 50m
         labels:
           severity: warning
         annotations:
-          summary: Physical host unusual other (non-hard) disk IO (instance {{ $labels.instance }})
+          summary: Hypervisor host unusual hard disk IO (instance {{ $labels.instance }})
           description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n  VALUE = {{ $value }}"
-      - alert: VirtualHostUnusualDiskIo
-        expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"}
+      # Hypervisor host unusual other (non-hard) disk io alerts.
+      # This is the same as the regular unusual disk io alerts.
+      - alert: HypervisorHostUnusualOtherDiskIo
+        expr: (rate(node_disk_io_time_seconds_total{device!~"s.+"}[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename="chaosknoten"}
         for: 5m
         labels:
           severity: warning
         annotations:
-          summary: Virtual host unusual disk IO (instance {{ $labels.instance }})
+          summary: Hypervisor host unusual other (non-hard) disk IO (instance {{ $labels.instance }})
           description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n  VALUE = {{ $value }}"
       - alert: HostRaidArrayGotInactive
         expr: (node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}