From 0a50ee470a4bfca38a9b817c537c5aeeae90ad66 Mon Sep 17 00:00:00 2001
From: c6ristian
Date: Tue, 10 Jun 2025 21:22:53 +0200
Subject: [PATCH] grafana: add email alerts, and fix some rules

Alertmanager:
- Add the receiver "email-fux-critical" (SMTP via cow.hamburg.ccc.de:587)
  and a route sending org="fux", severity="critical" alerts to it.
- List `receiver` first in every route and set `continue: true` on all
  routes except the last one.
- Drop the explicit `repeat_interval: 6h` from the
  ccchh-infrastructure-alerts route; the enclosing route already sets 6h.

Prometheus:
- Raise the global scrape_interval from 15s to 30s.
- Raise the PrometheusTimeseriesCardinality threshold from 10000 to 15000.

Secrets:
- Add secret__alert_manager_email_password to grafana.sops.yaml.
---
Review notes (this section is ignored by `git am`):
- auth_password is rendered as a quoted scalar so the generated YAML stays
  a string when the secret is empty or contains YAML indicators. The
  post-image blob id on the alertmanager `index` line predates that edit.
- NOTE(review): indentation inside the hunks was reconstructed; verify it
  against the target files before applying (or use --ignore-whitespace).
- NOTE(review): email-fux-critical repeats every 36h while
  ntfy-fux-critical repeats every 18h -- confirm this is intended.

 .../chaosknoten/host_vars/grafana.sops.yaml   |  5 +-
 .../docker_compose/alertmanager.yaml.j2       | 56 ++++++++++++-------
 .../grafana/docker_compose/prometheus.yml     |  2 +-
 .../prometheus_alerts.rules.yaml              |  2 +-
 4 files changed, 42 insertions(+), 23 deletions(-)

diff --git a/inventories/chaosknoten/host_vars/grafana.sops.yaml b/inventories/chaosknoten/host_vars/grafana.sops.yaml
index 8d5e665..aa5ccbf 100644
--- a/inventories/chaosknoten/host_vars/grafana.sops.yaml
+++ b/inventories/chaosknoten/host_vars/grafana.sops.yaml
@@ -9,9 +9,10 @@ secret__metrics_chaos_basic_auth: ENC[AES256_GCM,data:eT39ijCsheJZP3D335EIRdeVR4
 secret__metrics_fux: ENC[AES256_GCM,data:aV6zeZ/XsVlA3QepSfVd/cOr+tqFVhlAxRO9SHx7,iv:fxo0o9amrh5ivPTxRVkvymB3fr5dLFVE7EqIpBlNZBk=,tag:41dm29mrV/jmqj5IkuNAaw==,type:str]
 secret__metrics_fux_basic_auth: ENC[AES256_GCM,data:YL+QLzZyyObzDcz+FcefViMrvdkVSwRhDsBx/AwoDX3RLHCDjg==,iv:GADdMa7FHMM1FnyPp8DUHElpXsJeqD+gN5Slw0R9bgs=,tag:KGCoEud2JLU5s1gurrbywg==,type:str]
 secret__ntfy_token: ENC[AES256_GCM,data:0tuPJVmxHcdDWOMIo0QQXgIEkJo+p9A5emH+kc+U5tw=,iv:NZcfiz3UFw2fMcMf+q1GRp4Fsxpxbptsx9n8wPR54z0=,tag:SJYFtXccCbPrXjECiKUOUA==,type:str]
+secret__alert_manager_email_password: ENC[AES256_GCM,data:AsBzn9KJEoMjcrUWiIhR7I/1jaaFEa+cl3gImOQVKrg=,iv:mtQnZqT0taap3+z/L/nMfUvQF3JlTKIdoljmzVr1R3c=,tag:mZrCB597p8LyB61I7ZvHNA==,type:str]
 sops:
-    lastmodified: "2025-06-02T20:28:07Z"
-    mac: ENC[AES256_GCM,data:mrA/ytnxpotGkGLCLRAGEEEiQmhcVtsCcSguZ1hnF9Qw+sIt/QULImP0yTVpQIfn3nVYBKn06+ZfRab7hTO48YuF+w1l/hkqYIcfoiikREtO9IO+Z4LBRoh59SpfQuAFAfmdegu5iTp6cXCWrEg5LElQQP3yg930kNN/HIEpZhM=,iv:3MdudOS5QaEaRQUyFANXBga8gyrTkD/CTM6qrcH8nL4=,tag:AvxWzNVLD8gOF93LXoSavA==,type:str]
+    lastmodified: "2025-06-10T19:17:41Z"
+    mac: ENC[AES256_GCM,data:8GGZFGSRXAaLoWUowbxd3RVv7NPMVsbkDttDxC1Aeuwjy6678ddioHTiOWn04noWSPXhVnnpaTHWNW9dT5EcbLHvTl9Vb/ydKq5EnjDi3vAI2hQZ5bJ29rwSIW2YBMwpceqh+2GqDuzebhOKxJ0ZFYsPzbfTGPt8blqOQ1abVR0=,iv:aDbIiH7H72jsBRe0rSDXHMQy6zc1QFrI6ZakJj8zxZ4=,tag:+ARO2ST+1I9gOB/f9V/OjQ==,type:str]
     pgp:
         - created_at: "2025-05-04T13:15:49Z"
           enc: |-
diff --git a/resources/chaosknoten/grafana/docker_compose/alertmanager.yaml.j2 b/resources/chaosknoten/grafana/docker_compose/alertmanager.yaml.j2
index 799701a..8d29b43 100644
--- a/resources/chaosknoten/grafana/docker_compose/alertmanager.yaml.j2
+++ b/resources/chaosknoten/grafana/docker_compose/alertmanager.yaml.j2
@@ -9,31 +9,40 @@ route:
   group_interval: 5m
   repeat_interval: 6h
   routes:
-    - matchers:
-      - org = "ccchh"
-      - severity = "critical",
-      receiver: ntfy-ccchh-critical
+    - receiver: ntfy-ccchh-critical
+      matchers:
+        - org = "ccchh"
+        - severity = "critical",
       repeat_interval: 18h
-    - matchers:
+      continue: true
+    - receiver: ntfy-ccchh
+      matchers:
         - org = "ccchh"
         - severity =~ "info|warning",
-      receiver: ntfy-ccchh
       repeat_interval: 36h
-    - matchers:
-      - org = "fux"
-      - severity = "critical",
-      receiver: ntfy-fux-critical
+      continue: true
+    - receiver: ntfy-fux-critical
+      matchers:
+        - org = "fux"
+        - severity = "critical",
       repeat_interval: 18h
-    - matchers:
-      - org = "fux"
-      - severity =~ "info|warning",
-      receiver: ntfy-fux
+      continue: true
+    - receiver: email-fux-critical
+      matchers:
+        - org = "fux"
+        - severity = "critical",
       repeat_interval: 36h
-    - matchers:
-      - org = "ccchh"
-      - severity =~ "info|warning|critical"
-      receiver: ccchh-infrastructure-alerts
-      repeat_interval: 6h
+      continue: true
+    - receiver: ntfy-fux
+      matchers:
+        - org = "fux"
+        - severity =~ "info|warning",
+      repeat_interval: 36h
+      continue: true
+    - receiver: ccchh-infrastructure-alerts
+      matchers:
+        - org = "ccchh"
+        - severity =~ "info|warning|critical"
 
 templates:
   - "/etc/alertmanager/templates/*.tmpl"
@@ -62,3 +71,12 @@ receivers:
   - name: "ntfy-fux"
     webhook_configs:
       - url: "http://ntfy-alertmanager-fux:8011"
+
+  - name: "email-fux-critical"
+    email_configs:
+      - send_resolved: true
+        to: "fux@zimdahl.org,stb@lassitu.de"
+        from: "alert-manager@hamburg.ccc.de"
+        smarthost: "cow.hamburg.ccc.de:587"
+        auth_username: "alert-manager@hamburg.ccc.de"
+        auth_password: "{{ secret__alert_manager_email_password }}"
diff --git a/resources/chaosknoten/grafana/docker_compose/prometheus.yml b/resources/chaosknoten/grafana/docker_compose/prometheus.yml
index 769cdc8..87ee5ae 100644
--- a/resources/chaosknoten/grafana/docker_compose/prometheus.yml
+++ b/resources/chaosknoten/grafana/docker_compose/prometheus.yml
@@ -1,5 +1,5 @@
 global:
-  scrape_interval: 15s
+  scrape_interval: 30s
   scrape_timeout: 10s
   evaluation_interval: 15s
 
diff --git a/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml b/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml
index bd0a400..a9d3e48 100644
--- a/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml
+++ b/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml
@@ -627,7 +627,7 @@ groups:
           summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
           description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n  VALUE = {{ $value }}"
       - alert: PrometheusTimeseriesCardinality
-        expr: label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000
+        expr: label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 15000
         for: 0m
         labels:
           severity: warning