From 5af9a0989496e7f61ef97267e738384637acdfbe Mon Sep 17 00:00:00 2001
From: c6ristian
Date: Mon, 1 Sep 2025 01:25:11 +0200
Subject: [PATCH] grafana: make alerts better for fux

---
 .../chaosknoten/host_vars/grafana.yaml        |  2 +
 .../docker_compose/alertmanager.yaml.j2       |  2 +-
 .../grafana/docker_compose/compose.yaml.j2    |  1 +
 .../prometheus_alerts-fux.rules.yaml          | 43 +++++++++++++++++++
 .../prometheus_alerts.rules.yaml              |  4 +-
 5 files changed, 49 insertions(+), 3 deletions(-)
 create mode 100644 resources/chaosknoten/grafana/docker_compose/prometheus_alerts-fux.rules.yaml

diff --git a/inventories/chaosknoten/host_vars/grafana.yaml b/inventories/chaosknoten/host_vars/grafana.yaml
index 1ca6b1b..2e3672e 100644
--- a/inventories/chaosknoten/host_vars/grafana.yaml
+++ b/inventories/chaosknoten/host_vars/grafana.yaml
@@ -10,6 +10,8 @@ docker_compose__configuration_files:
     content: "{{ lookup('ansible.builtin.template', 'resources/chaosknoten/grafana/docker_compose/alertmanager.yaml.j2') }}"
   - name: prometheus_alerts.rules.yaml
     content: "{{ lookup('ansible.builtin.file', 'resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml') }}"
+  - name: prometheus_alerts-fux.rules.yaml
+    content: "{{ lookup('ansible.builtin.file', 'resources/chaosknoten/grafana/docker_compose/prometheus_alerts-fux.rules.yaml') }}"
   - name: alertmanager_alert_templates.tmpl
     content: "{{ lookup('ansible.builtin.file', 'resources/chaosknoten/grafana/docker_compose/alertmanager_alert_templates.tmpl') }}"
   - name: loki.yaml
diff --git a/resources/chaosknoten/grafana/docker_compose/alertmanager.yaml.j2 b/resources/chaosknoten/grafana/docker_compose/alertmanager.yaml.j2
index 3e51e55..51aeb63 100644
--- a/resources/chaosknoten/grafana/docker_compose/alertmanager.yaml.j2
+++ b/resources/chaosknoten/grafana/docker_compose/alertmanager.yaml.j2
@@ -79,7 +79,7 @@ receivers:
   - name: "email-fux-critical"
     email_configs:
       - send_resolved: true
-        to: "stb@lassitu.de"
+        to: "stb@lassitu.de,fux@zimdahl.org"
         from: "alert-manager@hamburg.ccc.de"
         smarthost: "cow.hamburg.ccc.de:587"
         auth_username: "alert-manager@hamburg.ccc.de"
diff --git a/resources/chaosknoten/grafana/docker_compose/compose.yaml.j2 b/resources/chaosknoten/grafana/docker_compose/compose.yaml.j2
index c9c4cca..1683b79 100644
--- a/resources/chaosknoten/grafana/docker_compose/compose.yaml.j2
+++ b/resources/chaosknoten/grafana/docker_compose/compose.yaml.j2
@@ -14,6 +14,7 @@ services:
     volumes:
       - ./configs/prometheus.yml:/etc/prometheus/prometheus.yml
       - ./configs/prometheus_alerts.rules.yaml:/etc/prometheus/rules/alerts.rules.yaml
+      - ./configs/prometheus_alerts-fux.rules.yaml:/etc/prometheus/rules/alerts-fux.rules.yaml
       - prom_data:/prometheus
 
   alertmanager:
diff --git a/resources/chaosknoten/grafana/docker_compose/prometheus_alerts-fux.rules.yaml b/resources/chaosknoten/grafana/docker_compose/prometheus_alerts-fux.rules.yaml
new file mode 100644
index 0000000..d6210c7
--- /dev/null
+++ b/resources/chaosknoten/grafana/docker_compose/prometheus_alerts-fux.rules.yaml
@@ -0,0 +1,43 @@
+# Prometheus alerting rules for Fux targets.
+groups:
+  - name: Fux-Generic
+    rules:
+      - alert: HostJobFlaky
+        # "group by(host, job)" keeps only those two labels, so the
+        # annotations below can reference nothing but host and job.
+        expr: group by(host, job) (changes(up{org="fux"}[24h]) > 7)
+        for: 0m
+        labels:
+          severity: info
+        annotations:
+          summary: Job {{ $labels.job }} flaky (host {{ $labels.host }})
+          description: "The job {{ $labels.job }} on target: {{ $labels.host }} has been flaky over the last 24 hours."
+  - name: Fux-SNMP
+    rules:
+      - alert: SnmpTargetMissing
+        expr: up{job=~".*snmp|SNMP.*"} == 0
+        for: 15m
+        labels:
+          severity: critical
+        annotations:
+          summary: SNMP target missing (instance {{ $labels.instance }})
+          description: "SNMP target: {{ $labels.instance }} has disappeared for more than 15 min."
+  - name: Fux-DHCP
+    rules:
+      - alert: DhcpFuxSharedFailed
+        expr: script_success{script="check_dhcp_fux_shared"} == 0
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: DHCP for Fux Shared stopped working
+          description: "No DHCP lease for the Fux Shared range was received"
+      - alert: DhcpFuxAdminFailed
+        # Select on the "script" label, same as DhcpFuxSharedFailed.
+        expr: script_success{script="check_dhcp_fux_admin"} == 0
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: DHCP for Fux Admin stopped working
+          description: "No DHCP lease for the Fux Admin range was received"
diff --git a/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml b/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml
index aa20a42..4a2bc6f 100644
--- a/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml
+++ b/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml
@@ -410,7 +410,7 @@ groups:
           summary: Prometheus job missing (instance {{ $labels.instance }})
           description: "A Prometheus job has disappeared\n VALUE = {{ $value }}"
       - alert: PrometheusTargetMissing
-        expr: up == 0
+        expr: up{job!~"snmp|noc_room_temp"} == 0
         for: 0m
         labels:
           severity: critical
@@ -418,7 +418,7 @@ groups:
           summary: Prometheus target missing (instance {{ $labels.instance }})
           description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}"
       - alert: PrometheusAllTargetsMissing
-        expr: sum by (job) (up) == 0
+        expr: sum by (job) (up{job!~"snmp|noc_room_temp"}) == 0
         for: 0m
         labels:
           severity: critical