grafana: differentiate prometheus disk rate alerts by host task type

Previously they were differentiated by a mix of host task type (CI server
or not) and whether the host is virtual or physical.

Also differentiate only on the alert duration, not on the rate threshold,
so that hosts with slow hard disks are not accidentally excluded.
This commit is contained in:
June 2025-02-06 01:05:05 +01:00
parent 5016407cef
commit 9e77a41e3c
Signed by: june
SSH key fingerprint: SHA256:o9EAq4Y9N9K0pBQeBTqhSDrND5E7oB+60ZNx0U1yPe0

View file

@ -114,56 +114,60 @@ groups:
- name: node-exporter-disk - name: node-exporter-disk
rules: rules:
# Have different disk read and write rate alerts for VMs and physical machines. # General high disk read and write rate alerts.
- alert: VirtualHostUnusualDiskReadRate # Excluding: hypervisor hosts, CI hosts
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{ype="virtual_machine", nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker"} - alert: HostUnusualDiskReadRate
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker", nodename!="chaosknoten"}
for: 5m for: 5m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Virtual host unusual disk read rate (instance {{ $labels.instance }}) summary: Host unusual disk read rate (instance {{ $labels.instance }})
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}" description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}"
- alert: VirtualHostUnusualDiskWriteRate - alert: HostUnusualDiskWriteRate
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker"} expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker", nodename!="chaosknoten"}
for: 2m for: 2m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Virtual host unusual disk write rate (instance {{ $labels.instance }}) summary: Host unusual disk write rate (instance {{ $labels.instance }})
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}" description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}"
# Some VMs are expected to have high Read / Write rates z.B. CI servers # CI hosts high disk read and write alerts.
- alert: VirtualHostUnusualDiskReadRate # Longer intervals to account for disk intensive CI tasks.
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{ype="virtual_machine", nodename="forgejo-actions-runner", nodename="woodpecker"} - alert: CIHostUnusualDiskReadRate
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename="forgejo-actions-runner", nodename="woodpecker"}
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Virtual host unusual disk read rate for 10 min (instance {{ $labels.instance }}) summary: CI host unusual disk read rate for 10 min (instance {{ $labels.instance }})
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}" description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}"
- alert: VirtualHostUnusualDiskWriteRate - alert: VirtualHostUnusualDiskWriteRate
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename="forgejo-actions-runner", nodename="woodpecker"} expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename="forgejo-actions-runner", nodename="woodpecker"}
for: 4m for: 4m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Virtual host unusual disk write rate for 4 min (instance {{ $labels.instance }}) summary: CI host unusual disk write rate for 4 min (instance {{ $labels.instance }})
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}" description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}"
- alert: PhysicalHostUnusualDiskReadRate # Hypervisor host high disk read and write alerts.
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"} # Longer intervals to account for disk intensive hypervisor tasks (backups, moving VMs, etc.).
- alert: HypervisorHostUnusualDiskReadRate
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename="chaosknoten"}
for: 30m
labels:
severity: warning
annotations:
summary: Hypervisor host unusual disk read rate (instance {{ $labels.instance }})
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}"
- alert: HypervisorHostUnusualDiskWriteRate
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename="chaosknoten"}
for: 20m for: 20m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Physical host unusual disk read rate (instance {{ $labels.instance }}) summary: Hypervisor host unusual disk write rate (instance {{ $labels.instance }})
description: "Disk is probably reading too much data (> 100 MB/s)\n VALUE = {{ $value }}" description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}"
- alert: PhysicalHostUnusualDiskWriteRate
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
for: 15m
labels:
severity: warning
annotations:
summary: Physical host unusual disk write rate (instance {{ $labels.instance }})
description: "Disk is probably writing too much data (> 100 MB/s)\n VALUE = {{ $value }}"
# Please add ignored mountpoints in node_exporter parameters like # Please add ignored mountpoints in node_exporter parameters like
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.