grafana: differentiate prometheus disk rate alerts by host type
Also only differentiate on the duration not the rate, to not accidentally exclude slow hard disks.
This commit is contained in:
parent
5016407cef
commit
a9a5c422bf
1 changed files with 29 additions and 25 deletions
|
@ -114,56 +114,60 @@ groups:
|
|||
|
||||
- name: node-exporter-disk
|
||||
rules:
|
||||
# Have different disk read and write rate alerts for VMs and physical machines.
|
||||
- alert: VirtualHostUnusualDiskReadRate
|
||||
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{ype="virtual_machine", nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker"}
|
||||
# General high disk read and write rate alerts.
|
||||
# Excluding: hypervisor hosts, CI hosts
|
||||
- alert: HostUnusualDiskReadRate
|
||||
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker", nodename!="chaosknoten"}
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Virtual host unusual disk read rate (instance {{ $labels.instance }})
|
||||
summary: Host unusual disk read rate (instance {{ $labels.instance }})
|
||||
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}"
|
||||
- alert: VirtualHostUnusualDiskWriteRate
|
||||
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker"}
|
||||
- alert: HostUnusualDiskWriteRate
|
||||
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker", nodename!="chaosknoten"}
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Virtual host unusual disk write rate (instance {{ $labels.instance }})
|
||||
summary: Host unusual disk write rate (instance {{ $labels.instance }})
|
||||
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}"
|
||||
# Some VMs are expected to have high Read / Write rates z.B. CI servers
|
||||
- alert: VirtualHostUnusualDiskReadRate
|
||||
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{ype="virtual_machine", nodename="forgejo-actions-runner", nodename="woodpecker"}
|
||||
# CI hosts high disk read and write alerts.
|
||||
# Longer intervals to account for disk intensive CI tasks.
|
||||
- alert: CIHostUnusualDiskReadRate
|
||||
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename="forgejo-actions-runner", nodename="woodpecker"}
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Virtual host unusual disk read rate for 10 min (instance {{ $labels.instance }})
|
||||
summary: CI host unusual disk read rate for 10 min (instance {{ $labels.instance }})
|
||||
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}"
|
||||
- alert: VirtualHostUnusualDiskWriteRate
|
||||
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename="forgejo-actions-runner", nodename="woodpecker"}
|
||||
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename="forgejo-actions-runner", nodename="woodpecker"}
|
||||
for: 4m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Virtual host unusual disk write rate for 4 min (instance {{ $labels.instance }})
|
||||
summary: CI host unusual disk write rate for 4 min (instance {{ $labels.instance }})
|
||||
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}"
|
||||
- alert: PhysicalHostUnusualDiskReadRate
|
||||
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
|
||||
# Hypervisor host high disk read and write alerts.
|
||||
# Longer intervals to account for disk intensive hypervisor tasks (backups, moving VMs, etc.).
|
||||
- alert: HypervisorHostUnusualDiskReadRate
|
||||
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename="chaosknoten"}
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Hypervisor host unusual disk read rate (instance {{ $labels.instance }})
|
||||
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}"
|
||||
- alert: HypervisorHostUnusualDiskWriteRate
|
||||
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename="chaosknoten"}
|
||||
for: 20m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Physical host unusual disk read rate (instance {{ $labels.instance }})
|
||||
description: "Disk is probably reading too much data (> 100 MB/s)\n VALUE = {{ $value }}"
|
||||
- alert: PhysicalHostUnusualDiskWriteRate
|
||||
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Physical host unusual disk write rate (instance {{ $labels.instance }})
|
||||
description: "Disk is probably writing too much data (> 100 MB/s)\n VALUE = {{ $value }}"
|
||||
summary: Hypervisor host unusual disk write rate (instance {{ $labels.instance }})
|
||||
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}"
|
||||
# Please add ignored mountpoints in node_exporter parameters like
|
||||
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
|
||||
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
|
||||
|
|
Loading…
Add table
Reference in a new issue