forked from CCCHH/ansible-infra
		
	grafana: add email alerts, and fix some rules
This commit is contained in:
		
					parent
					
						
							
								db99b153e4
							
						
					
				
			
			
				commit
				
					
						0a50ee470a
					
				
			
		
					 4 changed files with 42 additions and 23 deletions
				
			
		| 
						 | 
				
			
			@ -9,9 +9,10 @@ secret__metrics_chaos_basic_auth: ENC[AES256_GCM,data:eT39ijCsheJZP3D335EIRdeVR4
 | 
			
		|||
secret__metrics_fux: ENC[AES256_GCM,data:aV6zeZ/XsVlA3QepSfVd/cOr+tqFVhlAxRO9SHx7,iv:fxo0o9amrh5ivPTxRVkvymB3fr5dLFVE7EqIpBlNZBk=,tag:41dm29mrV/jmqj5IkuNAaw==,type:str]
 | 
			
		||||
secret__metrics_fux_basic_auth: ENC[AES256_GCM,data:YL+QLzZyyObzDcz+FcefViMrvdkVSwRhDsBx/AwoDX3RLHCDjg==,iv:GADdMa7FHMM1FnyPp8DUHElpXsJeqD+gN5Slw0R9bgs=,tag:KGCoEud2JLU5s1gurrbywg==,type:str]
 | 
			
		||||
secret__ntfy_token: ENC[AES256_GCM,data:0tuPJVmxHcdDWOMIo0QQXgIEkJo+p9A5emH+kc+U5tw=,iv:NZcfiz3UFw2fMcMf+q1GRp4Fsxpxbptsx9n8wPR54z0=,tag:SJYFtXccCbPrXjECiKUOUA==,type:str]
 | 
			
		||||
secret__alert_manager_email_password: ENC[AES256_GCM,data:AsBzn9KJEoMjcrUWiIhR7I/1jaaFEa+cl3gImOQVKrg=,iv:mtQnZqT0taap3+z/L/nMfUvQF3JlTKIdoljmzVr1R3c=,tag:mZrCB597p8LyB61I7ZvHNA==,type:str]
 | 
			
		||||
sops:
 | 
			
		||||
  lastmodified: "2025-06-02T20:28:07Z"
 | 
			
		||||
  mac: ENC[AES256_GCM,data:mrA/ytnxpotGkGLCLRAGEEEiQmhcVtsCcSguZ1hnF9Qw+sIt/QULImP0yTVpQIfn3nVYBKn06+ZfRab7hTO48YuF+w1l/hkqYIcfoiikREtO9IO+Z4LBRoh59SpfQuAFAfmdegu5iTp6cXCWrEg5LElQQP3yg930kNN/HIEpZhM=,iv:3MdudOS5QaEaRQUyFANXBga8gyrTkD/CTM6qrcH8nL4=,tag:AvxWzNVLD8gOF93LXoSavA==,type:str]
 | 
			
		||||
  lastmodified: "2025-06-10T19:17:41Z"
 | 
			
		||||
  mac: ENC[AES256_GCM,data:8GGZFGSRXAaLoWUowbxd3RVv7NPMVsbkDttDxC1Aeuwjy6678ddioHTiOWn04noWSPXhVnnpaTHWNW9dT5EcbLHvTl9Vb/ydKq5EnjDi3vAI2hQZ5bJ29rwSIW2YBMwpceqh+2GqDuzebhOKxJ0ZFYsPzbfTGPt8blqOQ1abVR0=,iv:aDbIiH7H72jsBRe0rSDXHMQy6zc1QFrI6ZakJj8zxZ4=,tag:+ARO2ST+1I9gOB/f9V/OjQ==,type:str]
 | 
			
		||||
  pgp:
 | 
			
		||||
    - created_at: "2025-05-04T13:15:49Z"
 | 
			
		||||
      enc: |-
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -9,31 +9,40 @@ route:
 | 
			
		|||
  group_interval: 5m
 | 
			
		||||
  repeat_interval: 6h
 | 
			
		||||
  routes:
 | 
			
		||||
    - matchers:
 | 
			
		||||
      - org = "ccchh"
 | 
			
		||||
      - severity = "critical",
 | 
			
		||||
      receiver: ntfy-ccchh-critical
 | 
			
		||||
    - receiver: ntfy-ccchh-critical
 | 
			
		||||
      matchers:
 | 
			
		||||
        - org = "ccchh"
 | 
			
		||||
        - severity = "critical",
 | 
			
		||||
      repeat_interval: 18h
 | 
			
		||||
    - matchers:
 | 
			
		||||
      continue: true
 | 
			
		||||
    - receiver: ntfy-ccchh
 | 
			
		||||
      matchers:
 | 
			
		||||
      - org = "ccchh"
 | 
			
		||||
      - severity =~ "info|warning",
 | 
			
		||||
      receiver: ntfy-ccchh
 | 
			
		||||
      repeat_interval: 36h
 | 
			
		||||
    - matchers:
 | 
			
		||||
      - org = "fux"
 | 
			
		||||
      - severity = "critical",
 | 
			
		||||
      receiver: ntfy-fux-critical
 | 
			
		||||
      continue: true
 | 
			
		||||
    - receiver: ntfy-fux-critical
 | 
			
		||||
      matchers:
 | 
			
		||||
        - org = "fux"
 | 
			
		||||
        - severity = "critical",
 | 
			
		||||
      repeat_interval: 18h
 | 
			
		||||
    - matchers:
 | 
			
		||||
      - org = "fux"
 | 
			
		||||
      - severity =~ "info|warning",
 | 
			
		||||
      receiver: ntfy-fux
 | 
			
		||||
      continue: true
 | 
			
		||||
    - receiver: email-fux-critical
 | 
			
		||||
      matchers:
 | 
			
		||||
        - org = "fux"
 | 
			
		||||
        - severity = "critical",
 | 
			
		||||
      repeat_interval: 36h
 | 
			
		||||
    - matchers:
 | 
			
		||||
      - org = "ccchh"
 | 
			
		||||
      - severity =~ "info|warning|critical"
 | 
			
		||||
      receiver: ccchh-infrastructure-alerts
 | 
			
		||||
      repeat_interval: 6h
 | 
			
		||||
      continue: true
 | 
			
		||||
    - receiver: ntfy-fux
 | 
			
		||||
      matchers:
 | 
			
		||||
        - org = "fux"
 | 
			
		||||
        - severity =~ "info|warning",
 | 
			
		||||
      repeat_interval: 36h
 | 
			
		||||
      continue: true
 | 
			
		||||
    - receiver: ccchh-infrastructure-alerts
 | 
			
		||||
      matchers:
 | 
			
		||||
        - org = "ccchh"
 | 
			
		||||
        - severity =~ "info|warning|critical"
 | 
			
		||||
 | 
			
		||||
templates:
 | 
			
		||||
  - "/etc/alertmanager/templates/*.tmpl"
 | 
			
		||||
| 
						 | 
				
			
			@ -62,3 +71,12 @@ receivers:
 | 
			
		|||
  - name: "ntfy-fux"
 | 
			
		||||
    webhook_configs:
 | 
			
		||||
      - url: "http://ntfy-alertmanager-fux:8011"
 | 
			
		||||
 | 
			
		||||
  - name: "email-fux-critical"
 | 
			
		||||
    email_configs:
 | 
			
		||||
      - send_resolved: true
 | 
			
		||||
        to: "fux@zimdahl.org,stb@lassitu.de"
 | 
			
		||||
        from: "alert-manager@hamburg.ccc.de"
 | 
			
		||||
        smarthost: "cow.hamburg.ccc.de:587"
 | 
			
		||||
        auth_username: "alert-manager@hamburg.ccc.de"
 | 
			
		||||
        auth_password: {{ secret__alert_manager_email_password }}
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,5 +1,5 @@
 | 
			
		|||
global:
 | 
			
		||||
  scrape_interval: 15s
 | 
			
		||||
  scrape_interval: 30s
 | 
			
		||||
  scrape_timeout: 10s
 | 
			
		||||
  evaluation_interval: 15s
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -627,7 +627,7 @@ groups:
 | 
			
		|||
          summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
 | 
			
		||||
          description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n  VALUE = {{ $value }}"
 | 
			
		||||
      - alert: PrometheusTimeseriesCardinality
 | 
			
		||||
        expr: label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000
 | 
			
		||||
        expr: label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 15000
 | 
			
		||||
        for: 0m
 | 
			
		||||
        labels:
 | 
			
		||||
          severity: warning
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue