grafana: differentiate prometheus disk rate alerts by host task type
Not by a mix of host task type (CI server or not) and whether or not the host is virtual or physical. Also only differentiate on the duration not the rate, to not accidentally exclude slow hard disks.
This commit is contained in:
parent
5016407cef
commit
9e77a41e3c
1 changed files with 29 additions and 25 deletions
|
@ -114,56 +114,60 @@ groups:
|
|||
|
||||
- name: node-exporter-disk
|
||||
rules:
|
||||
# Have different disk read and write rate alerts for VMs and physical machines.
|
||||
- alert: VirtualHostUnusualDiskReadRate
|
||||
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{ype="virtual_machine", nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker"}
|
||||
# General high disk read and write rate alerts.
|
||||
# Excluding: hypervisor hosts, CI hosts
|
||||
- alert: HostUnusualDiskReadRate
|
||||
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker", nodename!="chaosknoten"}
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Virtual host unusual disk read rate (instance {{ $labels.instance }})
|
||||
summary: Host unusual disk read rate (instance {{ $labels.instance }})
|
||||
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}"
|
||||
- alert: VirtualHostUnusualDiskWriteRate
|
||||
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker"}
|
||||
- alert: HostUnusualDiskWriteRate
|
||||
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker", nodename!="chaosknoten"}
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Virtual host unusual disk write rate (instance {{ $labels.instance }})
|
||||
summary: Host unusual disk write rate (instance {{ $labels.instance }})
|
||||
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}"
|
||||
# Some VMs are expected to have high Read / Write rates z.B. CI servers
|
||||
- alert: VirtualHostUnusualDiskReadRate
|
||||
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{ype="virtual_machine", nodename="forgejo-actions-runner", nodename="woodpecker"}
|
||||
# CI hosts high disk read and write alerts.
|
||||
# Longer intervals to account for disk intensive CI tasks.
|
||||
- alert: CIHostUnusualDiskReadRate
|
||||
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename="forgejo-actions-runner", nodename="woodpecker"}
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Virtual host unusual disk read rate for 10 min (instance {{ $labels.instance }})
|
||||
summary: CI host unusual disk read rate for 10 min (instance {{ $labels.instance }})
|
||||
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}"
|
||||
- alert: VirtualHostUnusualDiskWriteRate
|
||||
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename="forgejo-actions-runner", nodename="woodpecker"}
|
||||
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename="forgejo-actions-runner", nodename="woodpecker"}
|
||||
for: 4m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Virtual host unusual disk write rate for 4 min (instance {{ $labels.instance }})
|
||||
summary: CI host unusual disk write rate for 4 min (instance {{ $labels.instance }})
|
||||
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}"
|
||||
- alert: PhysicalHostUnusualDiskReadRate
|
||||
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
|
||||
# Hypervisor host high disk read and write alerts.
|
||||
# Longer intervals to account for disk intensive hypervisor tasks (backups, moving VMs, etc.).
|
||||
- alert: HypervisorHostUnusualDiskReadRate
|
||||
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename="chaosknoten"}
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Hypervisor host unusual disk read rate (instance {{ $labels.instance }})
|
||||
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}"
|
||||
- alert: HypervisorHostUnusualDiskWriteRate
|
||||
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename="chaosknoten"}
|
||||
for: 20m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Physical host unusual disk read rate (instance {{ $labels.instance }})
|
||||
description: "Disk is probably reading too much data (> 100 MB/s)\n VALUE = {{ $value }}"
|
||||
- alert: PhysicalHostUnusualDiskWriteRate
|
||||
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Physical host unusual disk write rate (instance {{ $labels.instance }})
|
||||
description: "Disk is probably writing too much data (> 100 MB/s)\n VALUE = {{ $value }}"
|
||||
summary: Hypervisor host unusual disk write rate (instance {{ $labels.instance }})
|
||||
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}"
|
||||
# Please add ignored mountpoints in node_exporter parameters like
|
||||
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
|
||||
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
|
||||
|
|
Loading…
Add table
Reference in a new issue