Make Prometheus Alert Rules Work Better and Organize Them Better #19
1 changed file with 16 additions and 11 deletions
|
@ -230,30 +230,35 @@ groups:
|
|||
annotations:
|
||||
summary: Host unusual disk write latency (instance {{ $labels.instance }})
|
||||
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}"
|
||||
# Use separate disk IO alerts for VMs and physical machines; for physical machines, use separate alerts for hard disks and for other disks.
|
||||
- alert: PhysicalHostUnusualHardDiskIo
|
||||
expr: (rate(node_disk_io_time_seconds_total{device=~"s.+"}[1m]) > 0.75) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
|
||||
# General unusual disk io alerts.
|
||||
# Excluding: hypervisor hosts
|
||||
- alert: HostUnusualDiskIo
|
||||
expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+", nodename!="chaosknoten"}
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Physical host unusual hard disk IO (instance {{ $labels.instance }})
|
||||
summary: Host unusual disk IO (instance {{ $labels.instance }})
|
||||
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}"
|
||||
- alert: PhysicalHostUnusualOtherDiskIo
|
||||
expr: (rate(node_disk_io_time_seconds_total{device!~"s.+"}[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
|
||||
for: 5m
|
||||
# Hypervisor host unusual hard disk io alerts.
|
||||
# Since hard disks on the hypervisor can easily have their IO saturated by hypervisor tasks (backups, moving VMs, etc.), alert when the IO is above the regular threshold for a very long time.
|
||||
- alert: HypervisorHostUnusualHardDiskIo
|
||||
expr: (rate(node_disk_io_time_seconds_total{device=~"s.+"}[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename="chaosknoten"}
|
||||
for: 50m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Physical host unusual other (non-hard) disk IO (instance {{ $labels.instance }})
|
||||
summary: Hypervisor host unusual hard disk IO (instance {{ $labels.instance }})
|
||||
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}"
|
||||
- alert: VirtualHostUnusualDiskIo
|
||||
expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"}
|
||||
# Hypervisor host unusual other (non-hard) disk io alerts.
|
||||
# This is the same as the regular unusual disk io alerts.
|
||||
- alert: HypervisorHostUnusualOtherDiskIo
|
||||
expr: (rate(node_disk_io_time_seconds_total{device!~"s.+"}[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename="chaosknoten"}
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Virtual host unusual disk IO (instance {{ $labels.instance }})
|
||||
summary: Hypervisor host unusual other (non-hard) disk IO (instance {{ $labels.instance }})
|
||||
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}"
|
||||
- alert: HostRaidArrayGotInactive
|
||||
expr: (node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||
|
|
Loading…
Add table
Reference in a new issue