From a386f9e2ebfa59cfbabbb4255064b5e7a511f844 Mon Sep 17 00:00:00 2001 From: c6ristian Date: Sun, 10 Nov 2024 17:06:41 +0100 Subject: [PATCH] custom alerts for CI VMs its expected for some VMs to have high Read / Write rates for some time so this is a custom alerts for ours CI VMs --- .../prometheus_alerts.rules.yaml | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml index e638248..fe12bfd 100644 --- a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml +++ b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml @@ -46,7 +46,7 @@ groups: description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}" # Have different disk read and write rate alerts for VMs and physical machines. - alert: VirtualHostUnusualDiskReadRate - expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{ype="virtual_machine", nodename=~".+"} + expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{ype="virtual_machine", nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker"} for: 5m labels: severity: warning @@ -54,13 +54,30 @@ groups: summary: Virtual host unusual disk read rate (instance {{ $labels.instance }}) description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}" - alert: VirtualHostUnusualDiskWriteRate - expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"} + expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker"} for: 2m labels: severity: warning annotations: summary: Virtual host unusual disk write rate (instance {{ $labels.instance }}) description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}" + # Some VMs are expected to have high Read / Write rates z.B. CI servers + - alert: VirtualHostUnusualDiskReadRate + expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{ype="virtual_machine", nodename="forgejo-actions-runner", nodename="woodpecker"} + for: 10m + labels: + severity: warning + annotations: + summary: Virtual host unusual disk read rate for 10 min (instance {{ $labels.instance }}) + description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}" + - alert: VirtualHostUnusualDiskWriteRate + expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename="forgejo-actions-runner", nodename="woodpecker"} + for: 4m + labels: + severity: warning + annotations: + summary: Virtual host unusual disk write rate for 4 min (instance {{ $labels.instance }}) + description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}" - alert: PhysicalHostUnusualDiskReadRate expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"} for: 20m