From 1355d4d83419ee0964323ea49961c3e4642b721d Mon Sep 17 00:00:00 2001 From: c6ristian Date: Mon, 1 Sep 2025 01:25:11 +0200 Subject: [PATCH] grafana: make alerts better for fux --- .../chaosknoten/host_vars/grafana.yaml | 2 + .../docker_compose/alertmanager.yaml.j2 | 2 +- .../grafana/docker_compose/compose.yaml.j2 | 1 + .../docker_compose/grafana-datasource.yml | 1 - .../prometheus_alerts-fux.rules.yaml | 41 +++++++++++++++++++ .../prometheus_alerts.rules.yaml | 4 +- 6 files changed, 47 insertions(+), 4 deletions(-) create mode 100644 resources/chaosknoten/grafana/docker_compose/prometheus_alerts-fux.rules.yaml diff --git a/inventories/chaosknoten/host_vars/grafana.yaml b/inventories/chaosknoten/host_vars/grafana.yaml index 1ca6b1b..2e3672e 100644 --- a/inventories/chaosknoten/host_vars/grafana.yaml +++ b/inventories/chaosknoten/host_vars/grafana.yaml @@ -10,6 +10,8 @@ docker_compose__configuration_files: content: "{{ lookup('ansible.builtin.template', 'resources/chaosknoten/grafana/docker_compose/alertmanager.yaml.j2') }}" - name: prometheus_alerts.rules.yaml content: "{{ lookup('ansible.builtin.file', 'resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml') }}" + - name: prometheus_alerts-fux.rules.yaml + content: "{{ lookup('ansible.builtin.file', 'resources/chaosknoten/grafana/docker_compose/prometheus_alerts-fux.rules.yaml') }}" - name: alertmanager_alert_templates.tmpl content: "{{ lookup('ansible.builtin.file', 'resources/chaosknoten/grafana/docker_compose/alertmanager_alert_templates.tmpl') }}" - name: loki.yaml diff --git a/resources/chaosknoten/grafana/docker_compose/alertmanager.yaml.j2 b/resources/chaosknoten/grafana/docker_compose/alertmanager.yaml.j2 index 3e51e55..51aeb63 100644 --- a/resources/chaosknoten/grafana/docker_compose/alertmanager.yaml.j2 +++ b/resources/chaosknoten/grafana/docker_compose/alertmanager.yaml.j2 @@ -79,7 +79,7 @@ receivers: - name: "email-fux-critical" email_configs: - send_resolved: true - to: 
"stb@lassitu.de" + to: "stb@lassitu.de,fux@zimdahl.org" from: "alert-manager@hamburg.ccc.de" smarthost: "cow.hamburg.ccc.de:587" auth_username: "alert-manager@hamburg.ccc.de" diff --git a/resources/chaosknoten/grafana/docker_compose/compose.yaml.j2 b/resources/chaosknoten/grafana/docker_compose/compose.yaml.j2 index c9c4cca..1683b79 100644 --- a/resources/chaosknoten/grafana/docker_compose/compose.yaml.j2 +++ b/resources/chaosknoten/grafana/docker_compose/compose.yaml.j2 @@ -14,6 +14,7 @@ services: volumes: - ./configs/prometheus.yml:/etc/prometheus/prometheus.yml - ./configs/prometheus_alerts.rules.yaml:/etc/prometheus/rules/alerts.rules.yaml + - ./configs/prometheus_alerts-fux.rules.yaml:/etc/prometheus/rules/alerts-fux.rules.yaml - prom_data:/prometheus alertmanager: diff --git a/resources/chaosknoten/grafana/docker_compose/grafana-datasource.yml b/resources/chaosknoten/grafana/docker_compose/grafana-datasource.yml index 632ad1c..3cb6995 100644 --- a/resources/chaosknoten/grafana/docker_compose/grafana-datasource.yml +++ b/resources/chaosknoten/grafana/docker_compose/grafana-datasource.yml @@ -18,4 +18,3 @@ datasources: httpHeaderName1: "X-Scope-OrgID" secureJsonData: httpHeaderValue1: "chaos" - diff --git a/resources/chaosknoten/grafana/docker_compose/prometheus_alerts-fux.rules.yaml b/resources/chaosknoten/grafana/docker_compose/prometheus_alerts-fux.rules.yaml new file mode 100644 index 0000000..6d1187c --- /dev/null +++ b/resources/chaosknoten/grafana/docker_compose/prometheus_alerts-fux.rules.yaml @@ -0,0 +1,41 @@ +groups: + - name: Fux-Generic + rules: + - alert: HostJobFlaky + expr: group by(instance, job) (changes(up{org="fux"}[24h]) > 7) + for: 0m + labels: + severity: info + org: fux + annotations: + summary: Job {{ $labels.job }} flaky on (instance {{ $labels.instance }}) + description: "The job {{ $labels.job }} on target: {{ $labels.instance }} has been flaky over the last 24 hours." 
+ - name: Fux-SNMP + rules: + - alert: SnmpTargetMissing + expr: up{job=~".*snmp.*", org="fux"} == 0 + for: 15m + labels: + severity: critical + org: fux + annotations: + summary: SNMP target missing (instance {{ $labels.instance }}) + description: "SNMP target: {{ $labels.instance }} has disappeared for more than 15 min." + - name: Fux-DHCP + rules: + - alert: DhcpFuxSharedFailed + expr: script_success{script="check_dhcp_fux_shared"} == 0 + for: 0m + labels: + severity: critical + annotations: + summary: DHCP for Fux Shared stopped working + description: "No DHCP lease for the Fux Shared range was received" + - alert: DhcpFuxAdminFailed + expr: script_success{script="check_dhcp_fux_admin"} == 0 + for: 0m + labels: + severity: critical + annotations: + summary: DHCP for Fux Admin stopped working + description: "No DHCP lease for the Fux Admin range was received" diff --git a/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml b/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml index aa20a42..4a2bc6f 100644 --- a/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml +++ b/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml @@ -410,7 +410,7 @@ groups: summary: Prometheus job missing (instance {{ $labels.instance }}) description: "A Prometheus job has disappeared\n VALUE = {{ $value }}" - alert: PrometheusTargetMissing - expr: up == 0 + expr: up{job!~"snmp|noc_room_temp"} == 0 for: 0m labels: severity: critical @@ -418,7 +418,7 @@ groups: summary: Prometheus target missing (instance {{ $labels.instance }}) description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}" - alert: PrometheusAllTargetsMissing - expr: sum by (job) (up) == 0 + expr: sum by (job) (up{job!~"snmp|noc_room_temp"}) == 0 for: 0m labels: severity: critical