grafana: make alerts better for fux
Some checks failed
/ Ansible Lint (push) Failing after 46s

This commit is contained in:
chris 2025-09-01 01:25:11 +02:00
commit 1355d4d834
Signed by: c6ristian
SSH key fingerprint: SHA256:B3m+yzpaxGXSEcDBpPHfvza/DNC0wuX+CKMeGq8wgak
6 changed files with 47 additions and 4 deletions

View file

@ -10,6 +10,8 @@ docker_compose__configuration_files:
content: "{{ lookup('ansible.builtin.template', 'resources/chaosknoten/grafana/docker_compose/alertmanager.yaml.j2') }}" content: "{{ lookup('ansible.builtin.template', 'resources/chaosknoten/grafana/docker_compose/alertmanager.yaml.j2') }}"
- name: prometheus_alerts.rules.yaml - name: prometheus_alerts.rules.yaml
content: "{{ lookup('ansible.builtin.file', 'resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml') }}" content: "{{ lookup('ansible.builtin.file', 'resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml') }}"
- name: prometheus_alerts-fux.rules.yaml
content: "{{ lookup('ansible.builtin.file', 'resources/chaosknoten/grafana/docker_compose/prometheus_alerts-fux.rules.yaml') }}"
- name: alertmanager_alert_templates.tmpl - name: alertmanager_alert_templates.tmpl
content: "{{ lookup('ansible.builtin.file', 'resources/chaosknoten/grafana/docker_compose/alertmanager_alert_templates.tmpl') }}" content: "{{ lookup('ansible.builtin.file', 'resources/chaosknoten/grafana/docker_compose/alertmanager_alert_templates.tmpl') }}"
- name: loki.yaml - name: loki.yaml

View file

@ -79,7 +79,7 @@ receivers:
- name: "email-fux-critical" - name: "email-fux-critical"
email_configs: email_configs:
- send_resolved: true - send_resolved: true
to: "stb@lassitu.de" to: "stb@lassitu.de,fux@zimdahl.org"
from: "alert-manager@hamburg.ccc.de" from: "alert-manager@hamburg.ccc.de"
smarthost: "cow.hamburg.ccc.de:587" smarthost: "cow.hamburg.ccc.de:587"
auth_username: "alert-manager@hamburg.ccc.de" auth_username: "alert-manager@hamburg.ccc.de"

View file

@ -14,6 +14,7 @@ services:
volumes: volumes:
- ./configs/prometheus.yml:/etc/prometheus/prometheus.yml - ./configs/prometheus.yml:/etc/prometheus/prometheus.yml
- ./configs/prometheus_alerts.rules.yaml:/etc/prometheus/rules/alerts.rules.yaml - ./configs/prometheus_alerts.rules.yaml:/etc/prometheus/rules/alerts.rules.yaml
- ./configs/prometheus_alerts-fux.rules.yaml:/etc/prometheus/rules/alerts-fux.rules.yaml
- prom_data:/prometheus - prom_data:/prometheus
alertmanager: alertmanager:

View file

@ -18,4 +18,3 @@ datasources:
httpHeaderName1: "X-Scope-OrgID" httpHeaderName1: "X-Scope-OrgID"
secureJsonData: secureJsonData:
httpHeaderValue1: "chaos" httpHeaderValue1: "chaos"

View file

@ -0,0 +1,41 @@
# Prometheus alerting rules for the "fux" organisation.
# Every alert carries the label org=fux, matching the org="fux" selectors
# used in the Generic and SNMP expressions below.
groups:
  - name: Fux-Generic
    rules:
      # Fires when the `up` series of a fux target changed value more than
      # 7 times within the last 24 hours, i.e. the job keeps going up and down.
      - alert: HostJobFlaky
        expr: group by(instance, job) (changes(up{org="fux"}[24h]) > 7)
        for: 0m
        labels:
          severity: info
          org: fux
        annotations:
          summary: "Job {{ $labels.job }} flaky on (instance {{ $labels.instance }})"
          description: "The job {{ $labels.job }} on target: {{ $labels.instance }} has been flaky over the last 24 hours."

  - name: Fux-SNMP
    rules:
      # Fires when a fux target whose job name contains "snmp" has been
      # down (up == 0) for 15 minutes without interruption.
      - alert: SnmpTargetMissing
        expr: up{job=~".*snmp.*", org="fux"} == 0
        for: 15m
        labels:
          severity: critical
          org: fux
        annotations:
          summary: "SNMP target missing (instance {{ $labels.instance }})"
          description: "SNMP target: {{ $labels.instance }} has disappeared for more than 15 minutes."

  - name: Fux-DHCP
    rules:
      # Both DHCP alerts fire immediately (for: 0m) when the matching
      # script check reports script_success == 0.
      # NOTE(review): org=fux was added here for consistency with the other
      # rules in this file; confirm it matches the Alertmanager routing.
      - alert: DhcpFuxSharedFailed
        expr: script_success{script="check_dhcp_fux_shared"} == 0
        for: 0m
        labels:
          severity: critical
          org: fux
        annotations:
          summary: "DHCP for Fux Shared stopped working"
          description: "No DHCP lease for the Fux Shared range was received"
      # Selects on the `script` label, same as DhcpFuxSharedFailed above.
      - alert: DhcpFuxAdminFailed
        expr: script_success{script="check_dhcp_fux_admin"} == 0
        for: 0m
        labels:
          severity: critical
          org: fux
        annotations:
          summary: "DHCP for Fux Admin stopped working"
          description: "No DHCP lease for the Fux Admin range was received"

View file

@ -410,7 +410,7 @@ groups:
summary: Prometheus job missing (instance {{ $labels.instance }}) summary: Prometheus job missing (instance {{ $labels.instance }})
description: "A Prometheus job has disappeared\n VALUE = {{ $value }}" description: "A Prometheus job has disappeared\n VALUE = {{ $value }}"
- alert: PrometheusTargetMissing - alert: PrometheusTargetMissing
expr: up == 0 expr: up{job!~"snmp|noc_room_temp"} == 0
for: 0m for: 0m
labels: labels:
severity: critical severity: critical
@ -418,7 +418,7 @@ groups:
summary: Prometheus target missing (instance {{ $labels.instance }}) summary: Prometheus target missing (instance {{ $labels.instance }})
description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}" description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}"
- alert: PrometheusAllTargetsMissing - alert: PrometheusAllTargetsMissing
expr: sum by (job) (up) == 0 expr: sum by (job) (up{job!~"snmp|noc_room_temp"}) == 0
for: 0m for: 0m
labels: labels:
severity: critical severity: critical