grafana: differentiate prometheus disk rate alerts by host task type
Differentiate only by host task type, rather than by a mix of host task type (CI server or not) and whether the host is virtual or physical. Also, differentiate only on the alert duration, not on the rate threshold, so that hosts with slow hard disks are not accidentally excluded.
This commit is contained in:
parent
5016407cef
commit
9e77a41e3c
1 changed file with 29 additions and 25 deletions
|
@@ -114,56 +114,60 @@ groups:
|
||||||
|
|
||||||
- name: node-exporter-disk
|
- name: node-exporter-disk
|
||||||
rules:
|
rules:
|
||||||
# Have different disk read and write rate alerts for VMs and physical machines.
|
# General high disk read and write rate alerts.
|
||||||
- alert: VirtualHostUnusualDiskReadRate
|
# Excluding: hypervisor hosts, CI hosts
|
||||||
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{ype="virtual_machine", nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker"}
|
- alert: HostUnusualDiskReadRate
|
||||||
|
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker", nodename!="chaosknoten"}
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Virtual host unusual disk read rate (instance {{ $labels.instance }})
|
summary: Host unusual disk read rate (instance {{ $labels.instance }})
|
||||||
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}"
|
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}"
|
||||||
- alert: VirtualHostUnusualDiskWriteRate
|
- alert: HostUnusualDiskWriteRate
|
||||||
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker"}
|
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker", nodename!="chaosknoten"}
|
||||||
for: 2m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Virtual host unusual disk write rate (instance {{ $labels.instance }})
|
summary: Host unusual disk write rate (instance {{ $labels.instance }})
|
||||||
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}"
|
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}"
|
||||||
# Some VMs are expected to have high Read / Write rates z.B. CI servers
|
# CI hosts high disk read and write alerts.
|
||||||
- alert: VirtualHostUnusualDiskReadRate
|
# Longer intervals to account for disk intensive CI tasks.
|
||||||
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{ype="virtual_machine", nodename="forgejo-actions-runner", nodename="woodpecker"}
|
- alert: CIHostUnusualDiskReadRate
|
||||||
|
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename="forgejo-actions-runner", nodename="woodpecker"}
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Virtual host unusual disk read rate for 10 min (instance {{ $labels.instance }})
|
summary: CI host unusual disk read rate for 10 min (instance {{ $labels.instance }})
|
||||||
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}"
|
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}"
|
||||||
- alert: VirtualHostUnusualDiskWriteRate
|
- alert: VirtualHostUnusualDiskWriteRate
|
||||||
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename="forgejo-actions-runner", nodename="woodpecker"}
|
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename="forgejo-actions-runner", nodename="woodpecker"}
|
||||||
for: 4m
|
for: 4m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Virtual host unusual disk write rate for 4 min (instance {{ $labels.instance }})
|
summary: CI host unusual disk write rate for 4 min (instance {{ $labels.instance }})
|
||||||
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}"
|
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}"
|
||||||
- alert: PhysicalHostUnusualDiskReadRate
|
# Hypervisor host high disk read and write alerts.
|
||||||
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
|
# Longer intervals to account for disk intensive hypervisor tasks (backups, moving VMs, etc.).
|
||||||
|
- alert: HypervisorHostUnusualDiskReadRate
|
||||||
|
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename="chaosknoten"}
|
||||||
|
for: 30m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Hypervisor host unusual disk read rate (instance {{ $labels.instance }})
|
||||||
|
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}"
|
||||||
|
- alert: HypervisorHostUnusualDiskWriteRate
|
||||||
|
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename="chaosknoten"}
|
||||||
for: 20m
|
for: 20m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Physical host unusual disk read rate (instance {{ $labels.instance }})
|
summary: Hypervisor host unusual disk write rate (instance {{ $labels.instance }})
|
||||||
description: "Disk is probably reading too much data (> 100 MB/s)\n VALUE = {{ $value }}"
|
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}"
|
||||||
- alert: PhysicalHostUnusualDiskWriteRate
|
|
||||||
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
|
|
||||||
for: 15m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: Physical host unusual disk write rate (instance {{ $labels.instance }})
|
|
||||||
description: "Disk is probably writing too much data (> 100 MB/s)\n VALUE = {{ $value }}"
|
|
||||||
# Please add ignored mountpoints in node_exporter parameters like
|
# Please add ignored mountpoints in node_exporter parameters like
|
||||||
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
|
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
|
||||||
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
|
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue