prometheus, alertmanager: use Prometheus alerts with Alertmanager

For now, introduce node-exporter/host alert rules, which were taken from
https://samber.github.io/awesome-prometheus-alerts/rules
However, the labels were removed from the descriptions, since they don't
render correctly (at least in Telegram) and don't seem to provide much
value, as we render the labels in the notification anyway.

Also, only have Telegram as the notification channel for now, as it was
the easiest to set up.
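
For reference, the upstream rules carry the labels in the description
roughly like this (HostOutOfMemory as an example; upstream wording may
differ slightly), while the second line shows the form used here:

# upstream (awesome-prometheus-alerts), approximately:
description: "Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
# as imported in this commit:
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}"
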
June 2024-10-02 03:36:30 +02:00
parent 803b19de0a
commit 30876f821c
Signed by: june
SSH key fingerprint: SHA256:o9EAq4Y9N9K0pBQeBTqhSDrND5E7oB+60ZNx0U1yPe0
6 changed files with 416 additions and 4 deletions

View file

@@ -6,6 +6,12 @@ docker_compose__configuration_files:
     content: "{{ lookup('ansible.builtin.file', 'files/chaosknoten/configs/grafana/docker_compose/grafana-datasource.yml') }}"
   - name: prometheus.yml
     content: "{{ lookup('ansible.builtin.file', 'files/chaosknoten/configs/grafana/docker_compose/prometheus.yml') }}"
+  - name: alertmanager.yaml
+    content: "{{ lookup('ansible.builtin.template', 'templates/chaosknoten/configs/grafana/docker_compose/alertmanager.yaml.j2') }}"
+  - name: prometheus_alerts.rules.yaml
+    content: "{{ lookup('ansible.builtin.file', 'files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml') }}"
+  - name: alertmanager_alert_templates.tmpl
+    content: "{{ lookup('ansible.builtin.file', 'files/chaosknoten/configs/grafana/docker_compose/alertmanager_alert_templates.tmpl') }}"
 certbot__version_spec: ""
 certbot__acme_account_email_address: le-admin@hamburg.ccc.de

View file

@@ -0,0 +1,35 @@
{{/*
Links & Resources
- https://prometheus.io/blog/2016/03/03/custom-alertmanager-templates/
- https://prometheus.io/docs/alerting/latest/notifications/
- https://gist.github.com/jidckii/5ac5f8f20368b56de72af70222509b7b
*/}}
{{ define "alert-item.telegram.ccchh.internal" }}
<b>[{{ .Labels.alertname }}] {{ .Labels.nodename }}</b>
{{- if .Annotations.summary }}
<i>Summary</i>: {{ .Annotations.summary }}
{{- end }}
{{- if .Annotations.description }}
<i>Description</i>: {{ .Annotations.description }}
{{- end }}
<i>Labels</i>:
{{ range .Labels.SortedPairs -}}
• <i>{{ .Name }}</i>: <code>{{ .Value }}</code>
{{ end }}
{{- end }}
{{ define "alert-message.telegram.ccchh" }}
{{- if .Alerts.Firing }}
<u>🔥{{ len .Alerts.Firing }} Alert(/s) Firing 🔥</u>
{{ range .Alerts.Firing -}}
{{ template "alert-item.telegram.ccchh.internal" . }}
{{- end }}
{{- end }}
{{- if .Alerts.Resolved }}
<u>✅{{ len .Alerts.Resolved }} Alert(/s) Resolved ✅</u>
{{ range .Alerts.Resolved -}}
{{ template "alert-item.telegram.ccchh.internal" . }}
{{- end }}
{{- end }}
{{- end }}
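
For illustration, a single firing alert rendered through these templates
ends up roughly as the following Telegram message (HTML as sent to the
bot API; the alert values are made up):

<u>🔥1 Alert(/s) Firing 🔥</u>
<b>[HostOutOfMemory] node1</b>
<i>Summary</i>: Host out of memory (instance node1:9100)
<i>Description</i>: Node memory is filling up (< 10% left)
 VALUE = 5
<i>Labels</i>:
 • <i>alertname</i>: <code>HostOutOfMemory</code>
 • <i>instance</i>: <code>node1:9100</code>
 • <i>job</i>: <code>node-exporter</code>
 • <i>nodename</i>: <code>node1</code>
 • <i>severity</i>: <code>warning</code>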

View file

@@ -2,13 +2,18 @@ global:
   scrape_interval: 15s
   scrape_timeout: 10s
   evaluation_interval: 15s
 alerting:
   alertmanagers:
-    - static_configs:
-        - targets: []
-      scheme: http
+    - scheme: http
       timeout: 10s
-      api_version: v1
+      static_configs:
+        - targets:
+            - "alertmanager:9093"
+rule_files:
+  - "/etc/prometheus/rules/*.rules.yaml"
 scrape_configs:
   - job_name: prometheus
     honor_timestamps: true
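
Both the Prometheus configuration and the new rule file can be checked for
syntax and template errors with promtool before deployment; the host-side
paths below are illustrative:

# check the alert rules:
promtool check rules configs/prometheus_alerts.rules.yaml
# check the overall Prometheus configuration:
promtool check config configs/prometheus.yml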

View file

@@ -0,0 +1,313 @@
# Links & Resources:
# - https://samber.github.io/awesome-prometheus-alerts/rules
groups:
- name: node-exporter
rules:
- alert: HostOutOfMemory
expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host out of memory (instance {{ $labels.instance }})
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}"
- alert: HostMemoryUnderMemoryPressure
expr: (rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host memory under memory pressure (instance {{ $labels.instance }})
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}"
# You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
- alert: HostMemoryIsUnderutilized
expr: (100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 1w
labels:
severity: info
annotations:
summary: Host Memory is underutilized (instance {{ $labels.instance }})
description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}"
- alert: HostUnusualNetworkThroughputIn
expr: (sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual network throughput in (instance {{ $labels.instance }})
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}"
- alert: HostUnusualNetworkThroughputOut
expr: (sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual network throughput out (instance {{ $labels.instance }})
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}"
- alert: HostUnusualDiskReadRate
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual disk read rate (instance {{ $labels.instance }})
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}"
- alert: HostUnusualDiskWriteRate
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk write rate (instance {{ $labels.instance }})
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}"
# Please add ignored mountpoints in node_exporter parameters like
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
- alert: HostOutOfDiskSpace
expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host out of disk space (instance {{ $labels.instance }})
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}"
# Please add ignored mountpoints in node_exporter parameters like
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
- alert: HostDiskWillFillIn24Hours
expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}"
- alert: HostOutOfInodes
expr: (node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host out of inodes (instance {{ $labels.instance }})
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}"
- alert: HostInodesWillFillIn24Hours
expr: (node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}"
- alert: HostFilesystemDeviceError
expr: node_filesystem_device_error == 1
for: 2m
labels:
severity: critical
annotations:
summary: Host filesystem device error (instance {{ $labels.instance }})
description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}"
- alert: HostUnusualDiskReadLatency
expr: (rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk read latency (instance {{ $labels.instance }})
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}"
- alert: HostUnusualDiskWriteLatency
expr: (rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk write latency (instance {{ $labels.instance }})
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}"
- alert: HostHighCpuLoad
expr: (sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 10m
labels:
severity: warning
annotations:
summary: Host high CPU load (instance {{ $labels.instance }})
description: "CPU load is > 80%\n VALUE = {{ $value }}"
# We might want to introduce that later, tho maybe excluding hosts with one core, if possible and only for VMs?
# # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
# - alert: HostCpuIsUnderutilized
# expr: (100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
# for: 1w
# labels:
# severity: info
# annotations:
# summary: Host CPU is underutilized (instance {{ $labels.instance }})
# description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}"
- alert: HostCpuStealNoisyNeighbor
expr: (avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}"
- alert: HostCpuHighIowait
expr: (avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU high iowait (instance {{ $labels.instance }})
description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}"
- alert: HostUnusualDiskIo
expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual disk IO (instance {{ $labels.instance }})
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}"
# # x2 context switches is an arbitrary number.
# # The alert threshold depends on the nature of the application.
# # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
# - alert: HostContextSwitchingHigh
# expr: (rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2
# for: 0m
# labels:
# severity: warning
# annotations:
# summary: Host context switching high (instance {{ $labels.instance }})
# description: "Context switching is growing on the node (twice the daily average during the last 15m)\n VALUE = {{ $value }}"
- alert: HostSwapIsFillingUp
expr: ((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host swap is filling up (instance {{ $labels.instance }})
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}"
- alert: HostSystemdServiceCrashed
expr: (node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 0m
labels:
severity: warning
annotations:
summary: Host systemd service crashed (instance {{ $labels.instance }})
description: "systemd service crashed\n VALUE = {{ $value }}"
- alert: HostPhysicalComponentTooHot
expr: ((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 5m
labels:
severity: warning
annotations:
summary: Host physical component too hot (instance {{ $labels.instance }})
description: "Physical hardware component too hot\n VALUE = {{ $value }}"
- alert: HostNodeOvertemperatureAlarm
expr: ((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 0m
labels:
severity: critical
annotations:
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}"
- alert: HostRaidArrayGotInactive
expr: (node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 0m
labels:
severity: critical
annotations:
summary: Host RAID array got inactive (instance {{ $labels.instance }})
description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}"
- alert: HostRaidDiskFailure
expr: (node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host RAID disk failure (instance {{ $labels.instance }})
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}"
- alert: HostKernelVersionDeviations
expr: (count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 6h
labels:
severity: warning
annotations:
summary: Host kernel version deviations (instance {{ $labels.instance }})
description: "Different kernel versions are running\n VALUE = {{ $value }}"
- alert: HostOomKillDetected
expr: (increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 0m
labels:
severity: warning
annotations:
summary: Host OOM kill detected (instance {{ $labels.instance }})
description: "OOM kill detected\n VALUE = {{ $value }}"
- alert: HostEdacCorrectableErrorsDetected
expr: (increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 0m
labels:
severity: info
annotations:
summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}"
- alert: HostEdacUncorrectableErrorsDetected
expr: (node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 0m
labels:
severity: warning
annotations:
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}"
- alert: HostNetworkReceiveErrors
expr: (rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Receive Errors (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}"
- alert: HostNetworkTransmitErrors
expr: (rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Transmit Errors (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}"
- alert: HostNetworkBondDegraded
expr: ((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Bond Degraded (instance {{ $labels.instance }})
description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}"
- alert: HostConntrackLimit
expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 5m
labels:
severity: warning
annotations:
summary: Host conntrack limit (instance {{ $labels.instance }})
description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}"
- alert: HostClockSkew
expr: ((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 10m
labels:
severity: warning
annotations:
summary: Host clock skew (instance {{ $labels.instance }})
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}"
- alert: HostClockNotSynchronising
expr: (min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host clock not synchronising (instance {{ $labels.instance }})
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}"
- alert: HostRequiresReboot
expr: (node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 4h
labels:
severity: info
annotations:
summary: Host requires reboot (instance {{ $labels.instance }})
description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}"

View file

@@ -11,8 +11,21 @@ services:
     restart: unless-stopped
     volumes:
       - ./configs/prometheus.yml:/etc/prometheus/prometheus.yml
+      - ./configs/prometheus_alerts.rules.yaml:/etc/prometheus/rules/alerts.rules.yaml
       - prom_data:/prometheus
+  alertmanager:
+    image: prom/alertmanager
+    container_name: alertmanager
+    command:
+      - '--config.file=/etc/alertmanager/alertmanager.yaml'
+    ports:
+      - 9093:9093
+    restart: unless-stopped
+    volumes:
+      - ./configs/alertmanager.yaml:/etc/alertmanager/alertmanager.yaml
+      - ./configs/alertmanager_alert_templates.tmpl:/etc/alertmanager/templates/alert_templates.tmpl
   grafana:
     image: grafana/grafana
     container_name: grafana
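
Once the stack is up, it is worth confirming that Prometheus has picked up
the new Alertmanager. Two illustrative checks from the Docker host,
assuming the usual 9090 mapping for Prometheus and the 9093 mapping above:

# Alertmanager liveness endpoint:
curl -s http://localhost:9093/-/healthy
# Prometheus' view of its configured Alertmanagers:
curl -s http://localhost:9090/api/v1/alertmanagers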

View file

@@ -0,0 +1,40 @@
# Links & References:
# - https://prometheus.io/docs/alerting/latest/configuration/
# - https://github.com/prometheus/alertmanager/blob/48a99764a1fc9279fc828de83e7a03ae2219abc7/doc/examples/simple.yml
route:
group_by: ["alertname", "site", "type", "hypervisor"]
group_wait: 30s
group_interval: 5m
repeat_interval: 3h
receiver: ccchh-infrastructure-alerts
{# Disable these for now, but might be interesting in the future.
# Inhibition rules allow to mute a set of alerts given that another alert is
# firing.
# We use this to mute any warning-level notifications if the same alert is
# already critical.
inhibit_rules:
- source_matchers: [severity="critical"]
target_matchers: [severity="warning"]
# Apply inhibition if the alertname is the same.
# CAUTION:
# If all label names listed in `equal` are missing
# from both the source and target alerts,
# the inhibition rule will apply!
equal: [alertname, cluster, service] #}
templates:
- "/etc/alertmanager/templates/*.tmpl"
receivers:
- name: "ccchh-infrastructure-alerts"
telegram_configs:
- send_resolved: true
bot_token: {{ lookup("community.general.passwordstore", "noc/vm-secrets/chaosknoten/grafana/alertmanager_telegram_bot_token", create=false, missing="error") }}
chat_id: -1002434372415
parse_mode: HTML
message: {{ "'{{ template \"alert-message.telegram.ccchh\" . }}'" }}