From 4cac84e7ec6f7d9f8cde3a5a0677d744545c97b2 Mon Sep 17 00:00:00 2001 From: June Date: Sat, 5 Oct 2024 17:17:35 +0200 Subject: [PATCH] prometheus: have different disk alerts for physical and virtual hosts Have more relaxed read/write alerts for physical hosts as they are probably hypervisors and regular high read/writes are more common. Also differentiate between physical and virtual hosts for IO alerts and allow for hard disks to spend more time in IO. --- .../prometheus_alerts.rules.yaml | 52 +++++++++++++++---- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml index 1c06485..284c7ec 100644 --- a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml +++ b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml @@ -44,22 +44,39 @@ groups: annotations: summary: Host unusual network throughput out (instance {{ $labels.instance }}) description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}" - - alert: HostUnusualDiskReadRate - expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + # Have different disk read and write rate alerts for VMs and physical machines. + - alert: VirtualHostUnusualDiskReadRate + expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{ype="virtual_machine", nodename=~".+"} for: 5m labels: severity: warning annotations: - summary: Host unusual disk read rate (instance {{ $labels.instance }}) + summary: Virtual host unusual disk read rate (instance {{ $labels.instance }}) description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}" - - alert: HostUnusualDiskWriteRate - expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + - alert: VirtualHostUnusualDiskWriteRate + expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"} for: 2m labels: severity: warning annotations: - summary: Host unusual disk write rate (instance {{ $labels.instance }}) + summary: Virtual host unusual disk write rate (instance {{ $labels.instance }}) description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}" + - alert: PhysicalHostUnusualDiskReadRate + expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"} + for: 20m + labels: + severity: warning + annotations: + summary: Physical host unusual disk read rate (instance {{ $labels.instance }}) + description: "Disk is probably reading too much data (> 100 MB/s)\n VALUE = {{ $value }}" + - alert: PhysicalHostUnusualDiskWriteRate + expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"} + for: 15m + labels: + severity: warning + annotations: + summary: Physical host unusual disk write rate (instance {{ $labels.instance }}) + description: "Disk is probably writing too much data (> 100 MB/s)\n VALUE = {{ $value }}" # Please add ignored mountpoints in node_exporter parameters like # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. @@ -156,13 +173,30 @@ groups: annotations: summary: Host CPU high iowait (instance {{ $labels.instance }}) description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}" - - alert: HostUnusualDiskIo - expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + # Have different disk IO alerts for VMs and physical machines and for physical machines different ones for hard and other disks. + - alert: PhysicalHostUnusualHardDiskIo + expr: (rate(node_disk_io_time_seconds_total{device=~"s.+"}[1m]) > 0.75) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"} for: 5m labels: severity: warning annotations: - summary: Host unusual disk IO (instance {{ $labels.instance }}) + summary: Physical host unusual hard disk IO (instance {{ $labels.instance }}) + description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}" + - alert: PhysicalHostUnusualOtherDiskIo + expr: (rate(node_disk_io_time_seconds_total{device!~"s.+"}[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"} + for: 5m + labels: + severity: warning + annotations: + summary: Physical host unusual other (non-hard) disk IO (instance {{ $labels.instance }}) + description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}" + - alert: VirtualHostUnusualDiskIo + expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"} + for: 5m + labels: + severity: warning + annotations: + summary: Virtual host unusual disk IO (instance {{ $labels.instance }}) description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}" # # x2 context switches is an arbitrary number. # # The alert threshold depends on the nature of the application.