From a9a5c422bf87b6d857807cfb0be3847e76345d7a Mon Sep 17 00:00:00 2001
From: June <june@jsts.xyz>
Date: Thu, 6 Feb 2025 00:58:55 +0100
Subject: [PATCH] grafana: differentiate prometheus disk rate alerts by host
 type

Also differentiate only on the alert duration, not on the rate threshold,
so that slow hard disks are not accidentally excluded.
---
 .../prometheus_alerts.rules.yaml              | 54 ++++++++++---------
 1 file changed, 29 insertions(+), 25 deletions(-)

diff --git a/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml b/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml
index 9b1ee26..8cfd99a 100644
--- a/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml
+++ b/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml
@@ -114,56 +114,60 @@ groups:
 
   - name: node-exporter-disk
     rules:
-      # Have different disk read and write rate alerts for VMs and physical machines.
-      - alert: VirtualHostUnusualDiskReadRate
-        expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{ype="virtual_machine", nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker"}
+      # General high disk read and write rate alerts (> 50 MB/s summed per instance).
+      # Excluding: hypervisor host (chaosknoten), CI hosts (forgejo-actions-runner, woodpecker)
+      - alert: HostUnusualDiskReadRate
+        expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker", nodename!="chaosknoten"}
         for: 5m
         labels:
           severity: warning
         annotations:
-          summary: Virtual host unusual disk read rate (instance {{ $labels.instance }})
+          summary: Host unusual disk read rate (instance {{ $labels.instance }})
           description: "Disk is probably reading too much data (> 50 MB/s)\n  VALUE = {{ $value }}"
-      - alert: VirtualHostUnusualDiskWriteRate
-        expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker"}
+      - alert: HostUnusualDiskWriteRate
+        expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker", nodename!="chaosknoten"}
         for: 2m
         labels:
           severity: warning
         annotations:
-          summary: Virtual host unusual disk write rate (instance {{ $labels.instance }})
+          summary: Host unusual disk write rate (instance {{ $labels.instance }})
           description: "Disk is probably writing too much data (> 50 MB/s)\n  VALUE = {{ $value }}"
-      # Some VMs are expected to have high Read / Write rates z.B. CI servers
-      - alert: VirtualHostUnusualDiskReadRate
-        expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{ype="virtual_machine", nodename="forgejo-actions-runner", nodename="woodpecker"}
+      # CI hosts high disk read and write alerts (one regex matcher: two "=" matchers on nodename can never both match).
+      # Longer intervals to account for disk intensive CI tasks.
+      - alert: CIHostUnusualDiskReadRate
+        expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~"forgejo-actions-runner|woodpecker"}
         for: 10m
         labels:
           severity: warning
         annotations:
-          summary: Virtual host unusual disk read rate for 10 min (instance {{ $labels.instance }})
+          summary: CI host unusual disk read rate for 10 min (instance {{ $labels.instance }})
           description: "Disk is probably reading too much data (> 50 MB/s)\n  VALUE = {{ $value }}"
-      - alert: VirtualHostUnusualDiskWriteRate
-        expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename="forgejo-actions-runner", nodename="woodpecker"}
+      - alert: CIHostUnusualDiskWriteRate
+        expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~"forgejo-actions-runner|woodpecker"}
         for: 4m
         labels:
           severity: warning
         annotations:
-          summary: Virtual host unusual disk write rate for 4 min (instance {{ $labels.instance }})
+          summary: CI host unusual disk write rate for 4 min (instance {{ $labels.instance }})
           description: "Disk is probably writing too much data (> 50 MB/s)\n  VALUE = {{ $value }}"
-      - alert: PhysicalHostUnusualDiskReadRate
-        expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
+      # Hypervisor host (chaosknoten) high disk read and write alerts.
+      # Longer intervals (read 30m, write 20m) to account for disk intensive hypervisor tasks (backups, moving VMs, etc.).
+      - alert: HypervisorHostUnusualDiskReadRate
+        expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename="chaosknoten"}
+        for: 30m
+        labels:
+          severity: warning
+        annotations:
+          summary: Hypervisor host unusual disk read rate (instance {{ $labels.instance }})
+          description: "Disk is probably reading too much data (> 50 MB/s)\n  VALUE = {{ $value }}"
+      - alert: HypervisorHostUnusualDiskWriteRate
+        expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename="chaosknoten"}
         for: 20m
         labels:
           severity: warning
         annotations:
-          summary: Physical host unusual disk read rate (instance {{ $labels.instance }})
-          description: "Disk is probably reading too much data (> 100 MB/s)\n  VALUE = {{ $value }}"
-      - alert: PhysicalHostUnusualDiskWriteRate
-        expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
-        for: 15m
-        labels:
-          severity: warning
-        annotations:
-          summary: Physical host unusual disk write rate (instance {{ $labels.instance }})
-          description: "Disk is probably writing too much data (> 100 MB/s)\n  VALUE = {{ $value }}"
+          summary: Hypervisor host unusual disk write rate (instance {{ $labels.instance }})
+          description: "Disk is probably writing too much data (> 50 MB/s)\n  VALUE = {{ $value }}"
       # Please add ignored mountpoints in node_exporter parameters like
       # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
       # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.