From a9a5c422bf87b6d857807cfb0be3847e76345d7a Mon Sep 17 00:00:00 2001 From: June <june@jsts.xyz> Date: Thu, 6 Feb 2025 00:58:55 +0100 Subject: [PATCH] grafana: differentiate prometheus disk rate alerts by host type Also only differentiate on the duration not the rate, to not accidentally exclude slow hard disks. --- .../prometheus_alerts.rules.yaml | 54 ++++++++++--------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml b/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml index 9b1ee26..8cfd99a 100644 --- a/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml +++ b/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml @@ -114,56 +114,60 @@ groups: - name: node-exporter-disk rules: - # Have different disk read and write rate alerts for VMs and physical machines. - - alert: VirtualHostUnusualDiskReadRate - expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{ype="virtual_machine", nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker"} + # General high disk read and write rate alerts. + # Excluding: hypervisor hosts, CI hosts + - alert: HostUnusualDiskReadRate + expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker", nodename!="chaosknoten"} for: 5m labels: severity: warning annotations: - summary: Virtual host unusual disk read rate (instance {{ $labels.instance }}) + summary: Host unusual disk read rate (instance {{ $labels.instance }}) description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}" - - alert: VirtualHostUnusualDiskWriteRate - expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker"} + - alert: HostUnusualDiskWriteRate + expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker", nodename!="chaosknoten"} for: 2m labels: severity: warning annotations: - summary: Virtual host unusual disk write rate (instance {{ $labels.instance }}) + summary: Host unusual disk write rate (instance {{ $labels.instance }}) description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}" - # Some VMs are expected to have high Read / Write rates z.B. CI servers - - alert: VirtualHostUnusualDiskReadRate - expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{ype="virtual_machine", nodename="forgejo-actions-runner", nodename="woodpecker"} + # CI hosts high disk read and write alerts. + # Longer intervals to account for disk intensive CI tasks. + - alert: CIHostUnusualDiskReadRate + expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename="forgejo-actions-runner", nodename="woodpecker"} for: 10m labels: severity: warning annotations: - summary: Virtual host unusual disk read rate for 10 min (instance {{ $labels.instance }}) + summary: CI host unusual disk read rate for 10 min (instance {{ $labels.instance }}) description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}" - alert: VirtualHostUnusualDiskWriteRate - expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename="forgejo-actions-runner", nodename="woodpecker"} + expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename="forgejo-actions-runner", nodename="woodpecker"} for: 4m labels: severity: warning annotations: - summary: Virtual host unusual disk write rate for 4 min (instance {{ $labels.instance }}) + summary: CI host unusual disk write rate for 4 min (instance {{ $labels.instance }}) description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}" - - alert: PhysicalHostUnusualDiskReadRate - expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"} + # Hypervisor host high disk read and write alerts. + # Longer intervals to account for disk intensive hypervisor tasks (backups, moving VMs, etc.). + - alert: HypervisorHostUnusualDiskReadRate + expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename="chaosknoten"} + for: 30m + labels: + severity: warning + annotations: + summary: Hypervisor host unusual disk read rate (instance {{ $labels.instance }}) + description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}" + - alert: HypervisorHostUnusualDiskWriteRate + expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename="chaosknoten"} for: 20m labels: severity: warning annotations: - summary: Physical host unusual disk read rate (instance {{ $labels.instance }}) - description: "Disk is probably reading too much data (> 100 MB/s)\n VALUE = {{ $value }}" - - alert: PhysicalHostUnusualDiskWriteRate - expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"} - for: 15m - labels: - severity: warning - annotations: - summary: Physical host unusual disk write rate (instance {{ $labels.instance }}) - description: "Disk is probably writing too much data (> 100 MB/s)\n VALUE = {{ $value }}" + summary: Hypervisor host unusual disk write rate (instance {{ $labels.instance }}) + description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}" # Please add ignored mountpoints in node_exporter parameters like # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.