grafana: make alerts better for fux
Some checks failed
/ Ansible Lint (push) Failing after 51s

This commit is contained in:
chris 2025-09-05 15:04:04 +02:00
commit 7f9d282155
Signed by: c6ristian
SSH key fingerprint: SHA256:B3m+yzpaxGXSEcDBpPHfvza/DNC0wuX+CKMeGq8wgak

View file

@@ -1,4 +1,14 @@
groups:
- name: Generic
rules:
- alert: HostJobFlaky
  # Informational alert for any (host, job) pair in org "fux" whose `up`
  # series changed value more than 5 times within the last 24 hours. The
  # "integrations/unix" job is excluded by the selector.
  #
  # `max by(host, job)` collapses all matching series to one result per
  # host/job pair and keeps the highest change count as the sample value, so
  # $value in the description shows the real number of changes. (`group`
  # would set every value to 1.) Only the `host` and `job` labels survive
  # the aggregation, so the annotations must not reference other labels
  # such as `instance`.
  expr: max by(host, job) (changes(up{org="fux", job!="integrations/unix"}[24h])) > 5
  # No pending period: the 24h window in the expression is the smoothing.
  for: 0m
  labels:
    severity: info
  annotations:
    summary: "Job {{ $labels.job }} flaky on (host {{ $labels.host }})"
    description: "The job {{ $labels.job }} on target: {{ $labels.host }} has been flaky over the last 24 hours.\n VALUE = {{ $value }}"
- name: SNMP
rules:
- alert: SnmpTargetMissing
@@ -8,15 +18,7 @@ groups:
severity: critical
annotations:
summary: SNMP target missing (instance {{ $labels.instance }})
description: "A SNMP target has disappeared for more the 30 min.\n VALUE = {{ $value }}"
- alert: SnmpTargetFalky
  # Informational alert: the `up` series of a target whose job matches
  # "snmp" changed value more than 5 times within the last 24 hours.
  # NOTE(review): the alert name looks like a misspelling of "Flaky". It is
  # left unchanged because routing or silences may reference the name;
  # confirm before renaming.
  expr: changes(up{job=~"snmp"}[24h]) > 5
  # No pending period: the 24h window in the expression is the smoothing.
  for: 0m
  labels:
    severity: info
  annotations:
    summary: SNMP target flaky (instance {{ $labels.instance }})
    description: "An SNMP target has had a flaky response over the last 24 hours.\n VALUE = {{ $value }}"
description: "A SNMP target has disappeared for more the 15 min.\n VALUE = {{ $value }}"
- name: DHCP
rules:
- alert: DhcpFuxSharedFailed