This commit is contained in:
parent
7900e458aa
commit
00bcd45111
2 changed files with 20 additions and 0 deletions
|
@ -13,10 +13,12 @@ route:
|
||||||
- org = "ccchh"
|
- org = "ccchh"
|
||||||
- severity = "critical",
|
- severity = "critical",
|
||||||
receiver: ntfy-ccchh-critical
|
receiver: ntfy-ccchh-critical
|
||||||
|
repeat_interval: 18h
|
||||||
- matchers:
|
- matchers:
|
||||||
- org = "ccchh"
|
- org = "ccchh"
|
||||||
- severity =~ "info|warning",
|
- severity =~ "info|warning",
|
||||||
receiver: ntfy-ccchh
|
receiver: ntfy-ccchh
|
||||||
|
repeat_interval: 36h
|
||||||
- matchers:
|
- matchers:
|
||||||
- org = "fux"
|
- org = "fux"
|
||||||
- severity = "critical",
|
- severity = "critical",
|
||||||
|
@ -31,6 +33,7 @@ route:
|
||||||
- org = "ccchh"
|
- org = "ccchh"
|
||||||
- severity =~ "info|warning|critical"
|
- severity =~ "info|warning|critical"
|
||||||
receiver: ccchh-infrastructure-alerts
|
receiver: ccchh-infrastructure-alerts
|
||||||
|
repeat_interval: 6h
|
||||||
|
|
||||||
templates:
|
templates:
|
||||||
- "/etc/alertmanager/templates/*.tmpl"
|
- "/etc/alertmanager/templates/*.tmpl"
|
||||||
|
|
|
@ -438,6 +438,7 @@ groups:
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
org: ccchh
|
||||||
annotations:
|
annotations:
|
||||||
summary: Prometheus too many restarts (instance {{ $labels.instance }})
|
summary: Prometheus too many restarts (instance {{ $labels.instance }})
|
||||||
description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}"
|
description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}"
|
||||||
|
@ -446,6 +447,7 @@ groups:
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
org: ccchh
|
||||||
annotations:
|
annotations:
|
||||||
summary: Prometheus AlertManager job missing (instance {{ $labels.instance }})
|
summary: Prometheus AlertManager job missing (instance {{ $labels.instance }})
|
||||||
description: "A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}"
|
description: "A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}"
|
||||||
|
@ -454,6 +456,7 @@ groups:
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
org: ccchh
|
||||||
annotations:
|
annotations:
|
||||||
summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
|
summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
|
||||||
description: "AlertManager configuration reload error\n VALUE = {{ $value }}"
|
description: "AlertManager configuration reload error\n VALUE = {{ $value }}"
|
||||||
|
@ -462,6 +465,7 @@ groups:
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
org: ccchh
|
||||||
annotations:
|
annotations:
|
||||||
summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }})
|
summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }})
|
||||||
description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}"
|
description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}"
|
||||||
|
@ -479,6 +483,7 @@ groups:
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
|
org: ccchh
|
||||||
annotations:
|
annotations:
|
||||||
summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
|
summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
|
||||||
description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}"
|
description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}"
|
||||||
|
@ -487,6 +492,7 @@ groups:
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
|
org: ccchh
|
||||||
annotations:
|
annotations:
|
||||||
summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
|
summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
|
||||||
description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}"
|
description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}"
|
||||||
|
@ -495,6 +501,7 @@ groups:
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
|
org: ccchh
|
||||||
annotations:
|
annotations:
|
||||||
summary: Prometheus template text expansion failures (instance {{ $labels.instance }})
|
summary: Prometheus template text expansion failures (instance {{ $labels.instance }})
|
||||||
description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}"
|
description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}"
|
||||||
|
@ -503,6 +510,7 @@ groups:
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
org: ccchh
|
||||||
annotations:
|
annotations:
|
||||||
summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
|
summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
|
||||||
description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}"
|
description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}"
|
||||||
|
@ -519,6 +527,7 @@ groups:
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
|
org: ccchh
|
||||||
annotations:
|
annotations:
|
||||||
summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
|
summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
|
||||||
description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}"
|
description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}"
|
||||||
|
@ -527,6 +536,7 @@ groups:
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
|
org: ccchh
|
||||||
annotations:
|
annotations:
|
||||||
summary: Prometheus target empty (instance {{ $labels.instance }})
|
summary: Prometheus target empty (instance {{ $labels.instance }})
|
||||||
description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}"
|
description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}"
|
||||||
|
@ -535,6 +545,7 @@ groups:
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
org: ccchh
|
||||||
annotations:
|
annotations:
|
||||||
summary: Prometheus target scraping slow (instance {{ $labels.instance }})
|
summary: Prometheus target scraping slow (instance {{ $labels.instance }})
|
||||||
description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned.\n VALUE = {{ $value }}"
|
description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned.\n VALUE = {{ $value }}"
|
||||||
|
@ -575,6 +586,7 @@ groups:
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
|
org: ccchh
|
||||||
annotations:
|
annotations:
|
||||||
summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
|
summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
|
||||||
description: "Prometheus encountered {{ $value }} TSDB compactions failures\n VALUE = {{ $value }}"
|
description: "Prometheus encountered {{ $value }} TSDB compactions failures\n VALUE = {{ $value }}"
|
||||||
|
@ -583,6 +595,7 @@ groups:
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
|
org: ccchh
|
||||||
annotations:
|
annotations:
|
||||||
summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
|
summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
|
||||||
description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}"
|
description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}"
|
||||||
|
@ -591,6 +604,7 @@ groups:
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
|
org: ccchh
|
||||||
annotations:
|
annotations:
|
||||||
summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
|
summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
|
||||||
description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}"
|
description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}"
|
||||||
|
@ -599,6 +613,7 @@ groups:
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
|
org: ccchh
|
||||||
annotations:
|
annotations:
|
||||||
summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
|
summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
|
||||||
description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}"
|
description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}"
|
||||||
|
@ -607,6 +622,7 @@ groups:
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
|
org: ccchh
|
||||||
annotations:
|
annotations:
|
||||||
summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
|
summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
|
||||||
description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}"
|
description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}"
|
||||||
|
@ -615,6 +631,7 @@ groups:
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
org: ccchh
|
||||||
annotations:
|
annotations:
|
||||||
summary: Prometheus timeseries cardinality (instance {{ $labels.instance }})
|
summary: Prometheus timeseries cardinality (instance {{ $labels.instance }})
|
||||||
description: "The \"{{ $labels.name }}\" timeseries cardinality is getting very high: {{ $value }}\n VALUE = {{ $value }}"
|
description: "The \"{{ $labels.name }}\" timeseries cardinality is getting very high: {{ $value }}\n VALUE = {{ $value }}"
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue