From 0a05cad0a1adef0ec1bb0496e6076d9365b3f37f Mon Sep 17 00:00:00 2001
From: June
Date: Wed, 2 Oct 2024 04:13:37 +0200
Subject: [PATCH] prometheus & alertmanager: add self-alerting

Add self-alerting for Prometheus and Alertmanager using rules from
https://samber.github.io/awesome-prometheus-alerts/rules
---
 .../grafana/docker_compose/prometheus.yml     |  23 +-
 .../prometheus_alerts.rules.yaml              | 219 ++++++++++++++++++
 2 files changed, 233 insertions(+), 9 deletions(-)

diff --git a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml
index b50b0df..dce71c2 100644
--- a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml
+++ b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml
@@ -22,6 +22,13 @@ scrape_configs:
   static_configs:
   - targets:
     - localhost:9090
+- job_name: alertmanager
+  honor_timestamps: true
+  metrics_path: /metrics
+  scheme: http
+  static_configs:
+  - targets:
+    - alertmanager:9093
 - job_name: c3lingo
   honor_timestamps: true
   scrape_interval: 5s
@@ -58,15 +65,6 @@ scrape_configs:
   static_configs:
   - targets:
     - jitsi.hamburg.ccc.de:9888 # Jitsi Video Bridge
-- job_name: chaosknoten
-  honor_timestamps: true
-  scrape_interval: 5s
-  scrape_timeout: 1s
-  metrics_path: /metrics
-  scheme: http
-  static_configs:
-  - targets:
-    - chaosknoten.hamburg.ccc.de:9100 # Node Exporter
 - job_name: 'pve'
   static_configs:
   - targets:
@@ -85,6 +83,7 @@ scrape_configs:
     replacement: pve-exporter:9221
 - job_name: hosts
   static_configs:
+  # Wieske Chaosknoten VMs
   - labels:
       site: wieske
       type: virtual_machine
@@ -101,3 +100,9 @@ scrape_configs:
     - woodpecker-intern.hamburg.ccc.de:9100
     - penpot-intern.hamburg.ccc.de:9100
     - jitsi.hamburg.ccc.de:9100
+  # Wieske Physical Machines
+  - labels:
+      site: wieske
+      type: physical_machine
+    targets:
+    - chaosknoten.hamburg.ccc.de:9100
diff --git a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml
index ebc1748..1c06485 100644
--- a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml
+++ b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml
@@ -311,3 +311,222 @@ groups:
     annotations:
       summary: Host requires reboot (instance {{ $labels.instance }})
       description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}"
+- name: prometheus
+  rules:
+  - alert: PrometheusJobMissing
+    expr: absent(up{job="prometheus"})
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus job missing (instance {{ $labels.instance }})
+      description: "A Prometheus job has disappeared\n VALUE = {{ $value }}"
+  - alert: PrometheusTargetMissing
+    expr: up == 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus target missing (instance {{ $labels.instance }})
+      description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}"
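+  # Note: `up == 0` matches every down scrape target in every job, so the
+  # alert above also covers the node exporters from the hosts job, not only
+  # the prometheus and alertmanager jobs.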
+  - alert: PrometheusAllTargetsMissing
+    expr: sum by (job) (up) == 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus all targets missing (instance {{ $labels.instance }})
+      description: "A Prometheus job does not have any living targets anymore.\n VALUE = {{ $value }}"
+  - alert: PrometheusConfigurationReloadFailure
+    expr: prometheus_config_last_reload_successful != 1
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
+      description: "Prometheus configuration reload error\n VALUE = {{ $value }}"
+  - alert: PrometheusTooManyRestarts
+    expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus too many restarts (instance {{ $labels.instance }})
+      description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}"
+  - alert: PrometheusAlertmanagerJobMissing
+    expr: absent(up{job="alertmanager"})
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus AlertManager job missing (instance {{ $labels.instance }})
+      description: "A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}"
+  - alert: PrometheusAlertmanagerConfigurationReloadFailure
+    expr: alertmanager_config_last_reload_successful != 1
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
+      description: "AlertManager configuration reload error\n VALUE = {{ $value }}"
+  - alert: PrometheusAlertmanagerConfigNotSynced
+    expr: count(count_values("config_hash", alertmanager_config_hash)) > 1
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }})
+      description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}"
+  # For testing.
+  # - alert: PrometheusAlertmanagerE2eDeadManSwitch
+  #   expr: vector(1)
+  #   for: 0m
+  #   labels:
+  #     severity: critical
+  #   annotations:
+  #     summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }})
+  #     description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.\n VALUE = {{ $value }}"
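+  # Enabling the dead man's switch above only makes sense together with a
+  # dedicated Alertmanager route/receiver (e.g. an external heartbeat service)
+  # that pages when this always-firing alert stops arriving.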
+  - alert: PrometheusNotConnectedToAlertmanager
+    expr: prometheus_notifications_alertmanagers_discovered < 1
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
+      description: "Prometheus cannot connect to the alertmanager\n VALUE = {{ $value }}"
+  - alert: PrometheusRuleEvaluationFailures
+    expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}"
+  - alert: PrometheusTemplateTextExpansionFailures
+    expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus template text expansion failures (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}"
+  - alert: PrometheusRuleEvaluationSlow
+    expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
+      description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates slow storage backend access or a too complex query.\n VALUE = {{ $value }}"
+  - alert: PrometheusNotificationsBacklog
+    expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus notifications backlog (instance {{ $labels.instance }})
+      description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}"
+  - alert: PrometheusAlertmanagerNotificationFailing
+    expr: rate(alertmanager_notifications_failed_total[1m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
+      description: "Alertmanager is failing to send notifications\n VALUE = {{ $value }}"
+  - alert: PrometheusTargetEmpty
+    expr: prometheus_sd_discovered_targets == 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus target empty (instance {{ $labels.instance }})
+      description: "Prometheus has no targets in service discovery\n VALUE = {{ $value }}"
+  - alert: PrometheusTargetScrapingSlow
+    expr: prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus target scraping slow (instance {{ $labels.instance }})
+      description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned.\n VALUE = {{ $value }}"
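+  # The 1.05 ratio above fires when the 90th-percentile actual scrape interval
+  # exceeds the median interval by more than 5%.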
+  - alert: PrometheusLargeScrape
+    expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus large scrape (instance {{ $labels.instance }})
+      description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}"
+  - alert: PrometheusTargetScrapeDuplicate
+    expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
+      description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}"
+  - alert: PrometheusTsdbCheckpointCreationFailures
+    expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}"
+  - alert: PrometheusTsdbCheckpointDeletionFailures
+    expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}"
+  - alert: PrometheusTsdbCompactionsFailed
+    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} TSDB compaction failures\n VALUE = {{ $value }}"
+  - alert: PrometheusTsdbHeadTruncationsFailed
+    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}"
+  - alert: PrometheusTsdbReloadFailures
+    expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}"
+  - alert: PrometheusTsdbWalCorruptions
+    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}"
+  - alert: PrometheusTsdbWalTruncationsFailed
+    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}"
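+  # The 10000 threshold in the cardinality alert below applies per metric name
+  # (series sharing one __name__), not to the TSDB as a whole.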
+  - alert: PrometheusTimeseriesCardinality
+    expr: label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus timeseries cardinality (instance {{ $labels.instance }})
+      description: "The \"{{ $labels.name }}\" timeseries cardinality is getting very high: {{ $value }}\n VALUE = {{ $value }}"
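
Note: the added rules and the updated scrape config can be sanity-checked with
promtool, which ships with Prometheus (including the official container image).
The paths below assume a checkout of this repository:

  promtool check rules playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml
  promtool check config playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml

promtool check config also tries to load the rule_files referenced by the
config, so it may be easiest to run inside the Prometheus container where those
paths resolve.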