grafana: add email alerts, and fix some rules
	
		
			
	
		
	
	
		
	
		
			Some checks failed
		
		
	
	
		
			
				
	
				/ Ansible Lint (push) Failing after 2m0s
				
			
		
		
	
	
		
	
		
			Some checks failed
		
		
	
	/ Ansible Lint (push) Failing after 2m0s
				
			This commit is contained in:
		
					parent
					
						
							
								db99b153e4
							
						
					
				
			
			
				commit
				
					
						0a50ee470a
					
				
			
		
					 4 changed files with 42 additions and 23 deletions
				
			
		|  | @@ -9,31 +9,40 @@ route: | |||
|   group_interval: 5m | ||||
|   repeat_interval: 6h | ||||
|   routes: | ||||
|     - matchers: | ||||
|       - org = "ccchh" | ||||
|       - severity = "critical", | ||||
|       receiver: ntfy-ccchh-critical | ||||
|     - receiver: ntfy-ccchh-critical | ||||
|       matchers: | ||||
|         - org = "ccchh" | ||||
|         - severity = "critical", | ||||
|       repeat_interval: 18h | ||||
|     - matchers: | ||||
|       continue: true | ||||
|     - receiver: ntfy-ccchh | ||||
|       matchers: | ||||
|       - org = "ccchh" | ||||
|       - severity =~ "info|warning", | ||||
|       receiver: ntfy-ccchh | ||||
|       repeat_interval: 36h | ||||
|     - matchers: | ||||
|       - org = "fux" | ||||
|       - severity = "critical", | ||||
|       receiver: ntfy-fux-critical | ||||
|       continue: true | ||||
|     - receiver: ntfy-fux-critical | ||||
|       matchers: | ||||
|         - org = "fux" | ||||
|         - severity = "critical", | ||||
|       repeat_interval: 18h | ||||
|     - matchers: | ||||
|       - org = "fux" | ||||
|       - severity =~ "info|warning", | ||||
|       receiver: ntfy-fux | ||||
|       continue: true | ||||
|     - receiver: email-fux-critical | ||||
|       matchers: | ||||
|         - org = "fux" | ||||
|         - severity = "critical", | ||||
|       repeat_interval: 36h | ||||
|     - matchers: | ||||
|       - org = "ccchh" | ||||
|       - severity =~ "info|warning|critical" | ||||
|       receiver: ccchh-infrastructure-alerts | ||||
|       repeat_interval: 6h | ||||
|       continue: true | ||||
|     - receiver: ntfy-fux | ||||
|       matchers: | ||||
|         - org = "fux" | ||||
|         - severity =~ "info|warning", | ||||
|       repeat_interval: 36h | ||||
|       continue: true | ||||
|     - receiver: ccchh-infrastructure-alerts | ||||
|       matchers: | ||||
|         - org = "ccchh" | ||||
|         - severity =~ "info|warning|critical" | ||||
| 
 | ||||
| templates: | ||||
|   - "/etc/alertmanager/templates/*.tmpl" | ||||
|  | @@ -62,3 +71,12 @@ receivers: | |||
|   - name: "ntfy-fux" | ||||
|     webhook_configs: | ||||
|       - url: "http://ntfy-alertmanager-fux:8011" | ||||
| 
 | ||||
|   - name: "email-fux-critical" | ||||
|     email_configs: | ||||
|       - send_resolved: true | ||||
|         to: "fux@zimdahl.org,stb@lassitu.de" | ||||
|         from: "alert-manager@hamburg.ccc.de" | ||||
|         smarthost: "cow.hamburg.ccc.de:587" | ||||
|         auth_username: "alert-manager@hamburg.ccc.de" | ||||
|         auth_password: {{ secret__alert_manager_email_password }} | ||||
|  |  | |||
|  | @@ -1,5 +1,5 @@ | |||
| global: | ||||
|   scrape_interval: 15s | ||||
|   scrape_interval: 30s | ||||
|   scrape_timeout: 10s | ||||
|   evaluation_interval: 15s | ||||
| 
 | ||||
|  |  | |||
|  | @@ -627,7 +627,7 @@ groups: | |||
|           summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}) | ||||
|           description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n  VALUE = {{ $value }}" | ||||
|       - alert: PrometheusTimeseriesCardinality | ||||
|         expr: label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000 | ||||
|         expr: label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 15000 | ||||
|         for: 0m | ||||
|         labels: | ||||
|           severity: warning | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue