grafana: add email alerts, and fix some rules
Some checks failed
/ Ansible Lint (push) Failing after 2m0s
Some checks failed
/ Ansible Lint (push) Failing after 2m0s
This commit is contained in:
parent
db99b153e4
commit
0a50ee470a
4 changed files with 42 additions and 23 deletions
|
@ -9,31 +9,40 @@ route:
|
|||
group_interval: 5m
|
||||
repeat_interval: 6h
|
||||
routes:
|
||||
- matchers:
|
||||
- org = "ccchh"
|
||||
- severity = "critical",
|
||||
receiver: ntfy-ccchh-critical
|
||||
- receiver: ntfy-ccchh-critical
|
||||
matchers:
|
||||
- org = "ccchh"
|
||||
- severity = "critical",
|
||||
repeat_interval: 18h
|
||||
- matchers:
|
||||
continue: true
|
||||
- receiver: ntfy-ccchh
|
||||
matchers:
|
||||
- org = "ccchh"
|
||||
- severity =~ "info|warning",
|
||||
receiver: ntfy-ccchh
|
||||
repeat_interval: 36h
|
||||
- matchers:
|
||||
- org = "fux"
|
||||
- severity = "critical",
|
||||
receiver: ntfy-fux-critical
|
||||
continue: true
|
||||
- receiver: ntfy-fux-critical
|
||||
matchers:
|
||||
- org = "fux"
|
||||
- severity = "critical",
|
||||
repeat_interval: 18h
|
||||
- matchers:
|
||||
- org = "fux"
|
||||
- severity =~ "info|warning",
|
||||
receiver: ntfy-fux
|
||||
continue: true
|
||||
- receiver: email-fux-critical
|
||||
matchers:
|
||||
- org = "fux"
|
||||
- severity = "critical",
|
||||
repeat_interval: 36h
|
||||
- matchers:
|
||||
- org = "ccchh"
|
||||
- severity =~ "info|warning|critical"
|
||||
receiver: ccchh-infrastructure-alerts
|
||||
repeat_interval: 6h
|
||||
continue: true
|
||||
- receiver: ntfy-fux
|
||||
matchers:
|
||||
- org = "fux"
|
||||
- severity =~ "info|warning",
|
||||
repeat_interval: 36h
|
||||
continue: true
|
||||
- receiver: ccchh-infrastructure-alerts
|
||||
matchers:
|
||||
- org = "ccchh"
|
||||
- severity =~ "info|warning|critical"
|
||||
|
||||
templates:
|
||||
- "/etc/alertmanager/templates/*.tmpl"
|
||||
|
@ -62,3 +71,12 @@ receivers:
|
|||
- name: "ntfy-fux"
|
||||
webhook_configs:
|
||||
- url: "http://ntfy-alertmanager-fux:8011"
|
||||
|
||||
- name: "email-fux-critical"
|
||||
email_configs:
|
||||
- send_resolved: true
|
||||
to: "fux@zimdahl.org,stb@lassitu.de"
|
||||
from: "alert-manager@hamburg.ccc.de"
|
||||
smarthost: "cow.hamburg.ccc.de:587"
|
||||
auth_username: "alert-manager@hamburg.ccc.de"
|
||||
# Quote the templated secret: an unquoted value starting with "{{" reads as a
# YAML flow mapping, and a rendered password that is empty, numeric-looking,
# or contains " #" / ": " would otherwise be mis-typed or truncated.
auth_password: "{{ secret__alert_manager_email_password }}"
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
global:
|
||||
scrape_interval: 15s
|
||||
scrape_interval: 30s
|
||||
scrape_timeout: 10s
|
||||
evaluation_interval: 15s
|
||||
|
||||
|
|
|
@ -627,7 +627,7 @@ groups:
|
|||
summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
|
||||
description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}"
|
||||
- alert: PrometheusTimeseriesCardinality
|
||||
expr: label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000
|
||||
expr: label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 15000
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue