diff --git a/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml b/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml index 8cfd99a..8c8f374 100644 --- a/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml +++ b/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml @@ -230,30 +230,35 @@ groups: annotations: summary: Host unusual disk write latency (instance {{ $labels.instance }}) description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}" - # Have different disk IO alerts for VMs and physical machines and for physical machines different ones for hard and other disks. - - alert: PhysicalHostUnusualHardDiskIo - expr: (rate(node_disk_io_time_seconds_total{device=~"s.+"}[1m]) > 0.75) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"} + # General unusual disk io alerts. + # Excluding: hypervisor hosts + - alert: HostUnusualDiskIo + expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+", nodename!="chaosknoten"} for: 5m labels: severity: warning annotations: - summary: Physical host unusual hard disk IO (instance {{ $labels.instance }}) + summary: Host unusual disk IO (instance {{ $labels.instance }}) description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}" - - alert: PhysicalHostUnusualOtherDiskIo - expr: (rate(node_disk_io_time_seconds_total{device!~"s.+"}[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"} - for: 5m + # Hypervisor host unusual hard disk io alerts. + # Since hard disks on the hypervisor can easily have their IO saturated by hypervisor tasks (backups, moving VMs, etc.), alert when the IO is above the regular threshold for a very long time. + - alert: HypervisorHostUnusualHardDiskIo + expr: (rate(node_disk_io_time_seconds_total{device=~"s.+"}[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename="chaosknoten"} + for: 50m labels: severity: warning annotations: - summary: Physical host unusual other (non-hard) disk IO (instance {{ $labels.instance }}) + summary: Hypervisor host unusual hard disk IO (instance {{ $labels.instance }}) description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}" - - alert: VirtualHostUnusualDiskIo - expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"} + # Hypervisor host unusual other (non-hard) disk io alerts. + # This is the same as the regular unsual disk io alerts. + - alert: HypervisorHostUnusualOtherDiskIo + expr: (rate(node_disk_io_time_seconds_total{device!~"s.+"}[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename="chaosknoten"} for: 5m labels: severity: warning annotations: - summary: Virtual host unusual disk IO (instance {{ $labels.instance }}) + summary: Hypervisor host unusual other (non-hard) disk IO (instance {{ $labels.instance }}) description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}" - alert: HostRaidArrayGotInactive expr: (node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}