grafana: add email alerts, and fix some rules

commit 0a50ee470a
Author: chris
Date: 2025-06-10 21:22:53 +02:00
Signed by: c6ristian (SSH key fingerprint: SHA256:B3m+yzpaxGXSEcDBpPHfvza/DNC0wuX+CKMeGq8wgak)
4 changed files with 42 additions and 23 deletions

File: Alertmanager configuration template

@@ -9,31 +9,40 @@ route:
   group_interval: 5m
   repeat_interval: 6h
   routes:
-    - matchers:
-        - org = "ccchh"
-        - severity = "critical",
-      receiver: ntfy-ccchh-critical
+    - receiver: ntfy-ccchh-critical
+      matchers:
+        - org = "ccchh"
+        - severity = "critical",
       repeat_interval: 18h
-    - matchers:
+      continue: true
+    - receiver: ntfy-ccchh
+      matchers:
         - org = "ccchh"
         - severity =~ "info|warning",
-      receiver: ntfy-ccchh
       repeat_interval: 36h
-    - matchers:
-        - org = "fux"
-        - severity = "critical",
-      receiver: ntfy-fux-critical
+      continue: true
+    - receiver: ntfy-fux-critical
+      matchers:
+        - org = "fux"
+        - severity = "critical",
+      repeat_interval: 18h
       continue: true
-    - matchers:
-        - org = "fux"
-        - severity =~ "info|warning",
-      receiver: ntfy-fux
+    - receiver: email-fux-critical
+      matchers:
+        - org = "fux"
+        - severity = "critical",
+      repeat_interval: 36h
       continue: true
-    - matchers:
-        - org = "ccchh"
-        - severity =~ "info|warning|critical"
-      receiver: ccchh-infrastructure-alerts
-      repeat_interval: 6h
+    - receiver: ntfy-fux
+      matchers:
+        - org = "fux"
+        - severity =~ "info|warning",
+      repeat_interval: 36h
+      continue: true
+    - receiver: ccchh-infrastructure-alerts
+      matchers:
+        - org = "ccchh"
+        - severity =~ "info|warning|critical"
 templates:
   - "/etc/alertmanager/templates/*.tmpl"
@@ -62,3 +71,12 @@ receivers:
   - name: "ntfy-fux"
     webhook_configs:
       - url: "http://ntfy-alertmanager-fux:8011"
+  - name: "email-fux-critical"
+    email_configs:
+      - send_resolved: true
+        to: "fux@zimdahl.org,stb@lassitu.de"
+        from: "alert-manager@hamburg.ccc.de"
+        smarthost: "cow.hamburg.ccc.de:587"
+        auth_username: "alert-manager@hamburg.ccc.de"
+        auth_password: {{ secret__alert_manager_email_password }}
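The new receiver sends mail via authenticated SMTP on port 587; Alertmanager's email_configs default require_tls to true, so this implies STARTTLS against cow.hamburg.ccc.de. As a sketch of an alternative layout (not what this commit does), the connection settings could be set once under global: if more email receivers are added later, leaving each receiver with just its recipient list:

global:
  smtp_smarthost: "cow.hamburg.ccc.de:587"
  smtp_from: "alert-manager@hamburg.ccc.de"
  smtp_auth_username: "alert-manager@hamburg.ccc.de"
  smtp_auth_password: {{ secret__alert_manager_email_password }}

receivers:
  - name: "email-fux-critical"
    email_configs:
      - send_resolved: true
        to: "fux@zimdahl.org,stb@lassitu.de"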

File: Prometheus configuration

@@ -1,5 +1,5 @@
 global:
-  scrape_interval: 15s
+  scrape_interval: 30s
   scrape_timeout: 10s
   evaluation_interval: 15s
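Doubling the global scrape interval halves ingestion load, presumably the motivation here, and 30s is still well inside Prometheus's 5m staleness window. Individual jobs can keep a finer resolution via a per-job override; a sketch with a hypothetical job (not part of this commit):

scrape_configs:
  - job_name: "node"            # hypothetical job for illustration
    scrape_interval: 15s        # per-job setting overrides the 30s global default
    static_configs:
      - targets: ["node-exporter:9100"]

One caveat: range functions like rate() need at least two samples in their window, so with 30s scraping, any rule or dashboard using windows shorter than about 1m will return no data.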

File: Prometheus alert rules

@@ -627,7 +627,7 @@ groups:
         summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
         description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}"
     - alert: PrometheusTimeseriesCardinality
-      expr: label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000
+      expr: label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 15000
       for: 0m
       labels:
         severity: warning
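The raised threshold belongs to the cardinality guard rule: count by(__name__) ({__name__=~".+"}) counts the time series behind each metric name, and label_replace(..., "name", "$1", "__name__", "(.+)") copies the reserved __name__ label into a plain "name" label so the offending metric stays addressable in alert labels. A sketch of how that label could then be surfaced in an annotation (an assumed extension, not part of this commit):

    - alert: PrometheusTimeseriesCardinality
      expr: label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 15000
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: "Metric {{ $labels.name }} has more than 15000 time series"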