This commit is contained in:
		
					parent
					
						
							
								7900e458aa
							
						
					
				
			
			
				commit
				
					
						00bcd45111
					
				
			
		
					 2 changed files with 20 additions and 0 deletions
				
			
		|  | @ -13,10 +13,12 @@ route: | |||
|       - org = "ccchh" | ||||
|       - severity = "critical", | ||||
|       receiver: ntfy-ccchh-critical | ||||
|       repeat_interval: 18h | ||||
|     - matchers: | ||||
|       - org = "ccchh" | ||||
|       - severity =~ "info|warning", | ||||
|       receiver: ntfy-ccchh | ||||
|       repeat_interval: 36h | ||||
|     - matchers: | ||||
|       - org = "fux" | ||||
|       - severity = "critical", | ||||
|  | @ -31,6 +33,7 @@ route: | |||
|       - org = "ccchh" | ||||
|       - severity =~ "info|warning|critical" | ||||
|       receiver: ccchh-infrastructure-alerts | ||||
|       repeat_interval: 6h | ||||
| 
 | ||||
| templates: | ||||
|   - "/etc/alertmanager/templates/*.tmpl" | ||||
|  |  | |||
|  | @ -438,6 +438,7 @@ groups: | |||
|         for: 0m | ||||
|         labels: | ||||
|           severity: warning | ||||
|           org: ccchh | ||||
|         annotations: | ||||
|           summary: Prometheus too many restarts (instance {{ $labels.instance }}) | ||||
|           description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n  VALUE = {{ $value }}" | ||||
|  | @ -446,6 +447,7 @@ groups: | |||
|         for: 0m | ||||
|         labels: | ||||
|           severity: warning | ||||
|           org: ccchh | ||||
|         annotations: | ||||
|           summary: Prometheus AlertManager job missing (instance {{ $labels.instance }}) | ||||
|           description: "A Prometheus AlertManager job has disappeared\n  VALUE = {{ $value }}" | ||||
|  | @ -454,6 +456,7 @@ groups: | |||
|         for: 0m | ||||
|         labels: | ||||
|           severity: warning | ||||
|           org: ccchh | ||||
|         annotations: | ||||
|           summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}) | ||||
|           description: "AlertManager configuration reload error\n  VALUE = {{ $value }}" | ||||
|  | @ -462,6 +465,7 @@ groups: | |||
|         for: 0m | ||||
|         labels: | ||||
|           severity: warning | ||||
|           org: ccchh | ||||
|         annotations: | ||||
|           summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }}) | ||||
|           description: "Configurations of AlertManager cluster instances are out of sync\n  VALUE = {{ $value }}" | ||||
|  | @ -479,6 +483,7 @@ groups: | |||
|         for: 0m | ||||
|         labels: | ||||
|           severity: critical | ||||
|           org: ccchh | ||||
|         annotations: | ||||
|           summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }}) | ||||
|           description: "Prometheus cannot connect the alertmanager\n  VALUE = {{ $value }}" | ||||
|  | @ -487,6 +492,7 @@ groups: | |||
|         for: 0m | ||||
|         labels: | ||||
|           severity: critical | ||||
|           org: ccchh | ||||
|         annotations: | ||||
|           summary: Prometheus rule evaluation failures (instance {{ $labels.instance }}) | ||||
|           description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n  VALUE = {{ $value }}" | ||||
|  | @ -495,6 +501,7 @@ groups: | |||
|         for: 0m | ||||
|         labels: | ||||
|           severity: critical | ||||
|           org: ccchh | ||||
|         annotations: | ||||
|           summary: Prometheus template text expansion failures (instance {{ $labels.instance }}) | ||||
|           description: "Prometheus encountered {{ $value }} template text expansion failures\n  VALUE = {{ $value }}" | ||||
|  | @ -503,6 +510,7 @@ groups: | |||
|         for: 5m | ||||
|         labels: | ||||
|           severity: warning | ||||
|           org: ccchh | ||||
|         annotations: | ||||
|           summary: Prometheus rule evaluation slow (instance {{ $labels.instance }}) | ||||
|           description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n  VALUE = {{ $value }}" | ||||
|  | @ -519,6 +527,7 @@ groups: | |||
|         for: 0m | ||||
|         labels: | ||||
|           severity: critical | ||||
|           org: ccchh | ||||
|         annotations: | ||||
|           summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }}) | ||||
|           description: "Alertmanager is failing sending notifications\n  VALUE = {{ $value }}" | ||||
|  | @ -527,6 +536,7 @@ groups: | |||
|         for: 0m | ||||
|         labels: | ||||
|           severity: critical | ||||
|           org: ccchh | ||||
|         annotations: | ||||
|           summary: Prometheus target empty (instance {{ $labels.instance }}) | ||||
|           description: "Prometheus has no target in service discovery\n  VALUE = {{ $value }}" | ||||
|  | @ -535,6 +545,7 @@ groups: | |||
|         for: 5m | ||||
|         labels: | ||||
|           severity: warning | ||||
|           org: ccchh | ||||
|         annotations: | ||||
|           summary: Prometheus target scraping slow (instance {{ $labels.instance }}) | ||||
|           description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned.\n  VALUE = {{ $value }}" | ||||
|  | @ -575,6 +586,7 @@ groups: | |||
|         for: 0m | ||||
|         labels: | ||||
|           severity: critical | ||||
|           org: ccchh | ||||
|         annotations: | ||||
|           summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }}) | ||||
|           description: "Prometheus encountered {{ $value }} TSDB compactions failures\n  VALUE = {{ $value }}" | ||||
|  | @ -583,6 +595,7 @@ groups: | |||
|         for: 0m | ||||
|         labels: | ||||
|           severity: critical | ||||
|           org: ccchh | ||||
|         annotations: | ||||
|           summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }}) | ||||
|           description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n  VALUE = {{ $value }}" | ||||
|  | @ -591,6 +604,7 @@ groups: | |||
|         for: 0m | ||||
|         labels: | ||||
|           severity: critical | ||||
|           org: ccchh | ||||
|         annotations: | ||||
|           summary: Prometheus TSDB reload failures (instance {{ $labels.instance }}) | ||||
|           description: "Prometheus encountered {{ $value }} TSDB reload failures\n  VALUE = {{ $value }}" | ||||
|  | @ -599,6 +613,7 @@ groups: | |||
|         for: 0m | ||||
|         labels: | ||||
|           severity: critical | ||||
|           org: ccchh | ||||
|         annotations: | ||||
|           summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}) | ||||
|           description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n  VALUE = {{ $value }}" | ||||
|  | @ -607,6 +622,7 @@ groups: | |||
|         for: 0m | ||||
|         labels: | ||||
|           severity: critical | ||||
|           org: ccchh | ||||
|         annotations: | ||||
|           summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}) | ||||
|           description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n  VALUE = {{ $value }}" | ||||
|  | @ -615,6 +631,7 @@ groups: | |||
|         for: 0m | ||||
|         labels: | ||||
|           severity: warning | ||||
|           org: ccchh | ||||
|         annotations: | ||||
|           summary: Prometheus timeseries cardinality (instance {{ $labels.instance }}) | ||||
|           description: "The \"{{ $labels.name }}\" timeseries cardinality is getting very high: {{ $value }}\n  VALUE = {{ $value }}" | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue