From e81ae5165fbd3a9c1f8ca029ddc334826e57fc23 Mon Sep 17 00:00:00 2001 From: June Date: Sat, 28 Sep 2024 05:04:01 +0200 Subject: [PATCH 01/20] public-reverse-proxy: config for eh20 static website deploy --- .../configs/public-reverse-proxy/nginx/acme_challenge.conf | 3 +++ .../chaosknoten/configs/public-reverse-proxy/nginx/nginx.conf | 3 +++ 2 files changed, 6 insertions(+) diff --git a/playbooks/files/chaosknoten/configs/public-reverse-proxy/nginx/acme_challenge.conf b/playbooks/files/chaosknoten/configs/public-reverse-proxy/nginx/acme_challenge.conf index 01dbe31..edc1d6e 100644 --- a/playbooks/files/chaosknoten/configs/public-reverse-proxy/nginx/acme_challenge.conf +++ b/playbooks/files/chaosknoten/configs/public-reverse-proxy/nginx/acme_challenge.conf @@ -29,6 +29,8 @@ map $host $upstream_acme_challenge_host { eh07.easterhegg.eu 172.31.17.151:31820; eh09.easterhegg.eu 172.31.17.151:31820; eh11.easterhegg.eu 172.31.17.151:31820; + eh20.easterhegg.eu 172.31.17.151:31820; + www.eh20.easterhegg.eu 172.31.17.151:31820; eh22.easterhegg.eu 172.31.17.159:31820; easterheggxxxx.hamburg.ccc.de 172.31.17.151:31820; eh2003.hamburg.ccc.de 172.31.17.151:31820; @@ -57,6 +59,7 @@ map $host $upstream_acme_challenge_host { www.eh11.hamburg.ccc.de 172.31.17.151:31820; easterhegg2011.hamburg.ccc.de 172.31.17.151:31820; www.easterhegg2011.hamburg.ccc.de 172.31.17.151:31820; + eh20.hamburg.ccc.de 172.31.17.151:31820; hacker.tours 172.31.17.151:31820; staging.hacker.tours 172.31.17.151:31820; woodpecker.hamburg.ccc.de 172.31.17.160:31820; diff --git a/playbooks/files/chaosknoten/configs/public-reverse-proxy/nginx/nginx.conf b/playbooks/files/chaosknoten/configs/public-reverse-proxy/nginx/nginx.conf index ee4c432..72f475c 100644 --- a/playbooks/files/chaosknoten/configs/public-reverse-proxy/nginx/nginx.conf +++ b/playbooks/files/chaosknoten/configs/public-reverse-proxy/nginx/nginx.conf @@ -47,6 +47,8 @@ stream { eh07.easterhegg.eu 172.31.17.151:8443; eh09.easterhegg.eu 172.31.17.151:8443; eh11.easterhegg.eu 172.31.17.151:8443; + eh20.easterhegg.eu 172.31.17.151:8443; + www.eh20.easterhegg.eu 172.31.17.151:8443; eh22.easterhegg.eu 172.31.17.159:8443; easterheggxxxx.hamburg.ccc.de 172.31.17.151:8443; eh2003.hamburg.ccc.de 172.31.17.151:8443; @@ -75,6 +77,7 @@ stream { www.eh11.hamburg.ccc.de 172.31.17.151:8443; easterhegg2011.hamburg.ccc.de 172.31.17.151:8443; www.easterhegg2011.hamburg.ccc.de 172.31.17.151:8443; + eh20.hamburg.ccc.de 172.31.17.151:8443; hacker.tours 172.31.17.151:8443; staging.hacker.tours 172.31.17.151:8443; woodpecker.hamburg.ccc.de 172.31.17.160:8443; From 29d2d2926f187f7b90b932b576c299e0c24aba51 Mon Sep 17 00:00:00 2001 From: June Date: Tue, 1 Oct 2024 01:59:33 +0200 Subject: [PATCH 02/20] prometheus: don't duplicate scrape interval and timeout --- .../chaosknoten/configs/grafana/docker_compose/prometheus.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml index c7905b9..69d734c 100644 --- a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml +++ b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml @@ -12,8 +12,6 @@ alerting: scrape_configs: - job_name: prometheus honor_timestamps: true - scrape_interval: 15s - scrape_timeout: 10s metrics_path: /metrics scheme: http static_configs: From 803b19de0a24bd4cd8c209b51cbb25fb772bb0ce Mon Sep 17 00:00:00 2001 From: June Date: Tue, 1 Oct 2024 20:09:42 
+0200 Subject: [PATCH 03/20] prometheus: add job for node exporter (for the NixOS VMs for now) --- .../grafana/docker_compose/prometheus.yml | 17 +++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml index 69d734c..8a5faa7 100644 --- a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml +++ b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml @@ -79,3 +79,20 @@ scrape_configs: target_label: instance - target_label: __address__ replacement: pve-exporter:9221 +- job_name: hosts + static_configs: + - labels: + site: wieske + type: virtual_machine + hypervisor: chaosknoten + targets: + - netbox-intern.hamburg.ccc.de:9100 + - matrix-intern.hamburg.ccc.de:9100 + - public-web-static-intern.hamburg.ccc.de:9100 + - git-intern.hamburg.ccc.de:9100 + - forgejo-actions-runner-intern.hamburg.ccc.de:9100 + - eh22-wiki-intern.hamburg.ccc.de:9100 + - nix-box-june-intern.hamburg.ccc.de:9100 + - mjolnir-intern.hamburg.ccc.de:9100 + - woodpecker-intern.hamburg.ccc.de:9100 + - penpot-intern.hamburg.ccc.de:9100 From 30876f821c4b090a678ff35ff6b9d5dac5b5f183 Mon Sep 17 00:00:00 2001 From: June Date: Wed, 2 Oct 2024 03:36:30 +0200 Subject: [PATCH 04/20] prometheus, alertmanager: use Prometheus alerts with Alertmanager For now, introduce node-exporter/hosts alert rules, which were taken from https://samber.github.io/awesome-prometheus-alerts/rules. However, the labels were removed from the descriptions, since they don't render correctly (at least in Telegram) and don't seem to provide much value, as we render the labels in the notification anyway. Also, Telegram is the only notification channel for now, as it was the easiest to set up.
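These vendored rules can be unit-tested with promtool before deploying. A minimal sketch exercising the HostOutOfMemory rule introduced below; the test file name, instance and nodename values are made up, and the test assumes it is run from the directory containing prometheus_alerts.rules.yaml:

# hostoutofmemory_test.yaml (hypothetical; run with: promtool test rules hostoutofmemory_test.yaml)
rule_files:
  - prometheus_alerts.rules.yaml
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # 5% of memory available for the whole test window; node_uname_info
      # provides the nodename joined in via group_left.
      - series: 'node_memory_MemAvailable_bytes{instance="vm-a:9100"}'
        values: '5x10'
      - series: 'node_memory_MemTotal_bytes{instance="vm-a:9100"}'
        values: '100x10'
      - series: 'node_uname_info{instance="vm-a:9100",nodename="vm-a"}'
        values: '1x10'
    alert_rule_test:
      # The "for: 2m" hold time has elapsed by eval_time, so the alert should be firing.
      - eval_time: 5m
        alertname: HostOutOfMemory
        exp_alerts:
          - exp_labels:
              severity: warning
              instance: vm-a:9100
              nodename: vm-a
            exp_annotations:
              summary: "Host out of memory (instance vm-a:9100)"
              description: "Node memory is filling up (< 10% left)\n VALUE = 5"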
--- .../chaosknoten/host_vars/grafana.yaml | 6 + .../alertmanager_alert_templates.tmpl | 35 ++ .../grafana/docker_compose/prometheus.yml | 13 +- .../prometheus_alerts.rules.yaml | 313 ++++++++++++++++++ .../configs/grafana/compose.yaml.j2 | 13 + .../docker_compose/alertmanager.yaml.j2 | 40 +++ 6 files changed, 416 insertions(+), 4 deletions(-) create mode 100644 playbooks/files/chaosknoten/configs/grafana/docker_compose/alertmanager_alert_templates.tmpl create mode 100644 playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml create mode 100644 playbooks/templates/chaosknoten/configs/grafana/docker_compose/alertmanager.yaml.j2 diff --git a/inventories/chaosknoten/host_vars/grafana.yaml b/inventories/chaosknoten/host_vars/grafana.yaml index efa34aa..300bda6 100644 --- a/inventories/chaosknoten/host_vars/grafana.yaml +++ b/inventories/chaosknoten/host_vars/grafana.yaml @@ -6,6 +6,12 @@ docker_compose__configuration_files: content: "{{ lookup('ansible.builtin.file', 'files/chaosknoten/configs/grafana/docker_compose/grafana-datasource.yml') }}" - name: prometheus.yml content: "{{ lookup('ansible.builtin.file', 'files/chaosknoten/configs/grafana/docker_compose/prometheus.yml') }}" + - name: alertmanager.yaml + content: "{{ lookup('ansible.builtin.template', 'templates/chaosknoten/configs/grafana/docker_compose/alertmanager.yaml.j2') }}" + - name: prometheus_alerts.rules.yaml + content: "{{ lookup('ansible.builtin.file', 'files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml') }}" + - name: alertmanager_alert_templates.tmpl + content: "{{ lookup('ansible.builtin.file', 'files/chaosknoten/configs/grafana/docker_compose/alertmanager_alert_templates.tmpl') }}" certbot__version_spec: "" certbot__acme_account_email_address: le-admin@hamburg.ccc.de diff --git a/playbooks/files/chaosknoten/configs/grafana/docker_compose/alertmanager_alert_templates.tmpl b/playbooks/files/chaosknoten/configs/grafana/docker_compose/alertmanager_alert_templates.tmpl new file mode 100644 index 0000000..5318fb0 --- /dev/null +++ b/playbooks/files/chaosknoten/configs/grafana/docker_compose/alertmanager_alert_templates.tmpl @@ -0,0 +1,35 @@ +{{/* +Links & Resources +- https://prometheus.io/blog/2016/03/03/custom-alertmanager-templates/ +- https://prometheus.io/docs/alerting/latest/notifications/ +- https://gist.github.com/jidckii/5ac5f8f20368b56de72af70222509b7b +*/}} +{{ define "alert-item.telegram.ccchh.internal" }} +[{{ .Labels.alertname }}] {{ .Labels.nodename }} +{{- if .Annotations.summary }} +Summary: {{ .Annotations.summary }} +{{- end }} +{{- if .Annotations.description }} +Description: {{ .Annotations.description }} +{{- end }} +Labels: +{{ range .Labels.SortedPairs -}} +• {{ .Name }}: {{ .Value }} +{{ end }} +{{- end }} + + +{{ define "alert-message.telegram.ccchh" }} +{{- if .Alerts.Firing }} +🔥{{ len .Alerts.Firing }} Alert(/s) Firing 🔥 +{{ range .Alerts.Firing -}} +{{ template "alert-item.telegram.ccchh.internal" . }} +{{- end }} +{{- end }} +{{- if .Alerts.Resolved }} +✅{{ len .Alerts.Resolved }} Alert(/s) Resolved ✅ +{{ range .Alerts.Resolved -}} +{{ template "alert-item.telegram.ccchh.internal" . 
}} +{{- end }} +{{- end }} +{{- end }} diff --git a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml index 8a5faa7..bdb9a20 100644 --- a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml +++ b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml @@ -2,13 +2,18 @@ global: scrape_interval: 15s scrape_timeout: 10s evaluation_interval: 15s + alerting: alertmanagers: - - static_configs: - - targets: [] - scheme: http + - scheme: http timeout: 10s - api_version: v1 + static_configs: + - targets: + - "alertmanager:9093" + +rule_files: + - "/etc/prometheus/rules/*.rules.yaml" + scrape_configs: - job_name: prometheus honor_timestamps: true diff --git a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml new file mode 100644 index 0000000..ebc1748 --- /dev/null +++ b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml @@ -0,0 +1,313 @@ +# Links & Resources: +# - https://samber.github.io/awesome-prometheus-alerts/rules +groups: +- name: node-exporter + rules: + - alert: HostOutOfMemory + expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host out of memory (instance {{ $labels.instance }}) + description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}" + - alert: HostMemoryUnderMemoryPressure + expr: (rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host memory under memory pressure (instance {{ $labels.instance }}) + description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}" + # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly + - alert: HostMemoryIsUnderutilized + expr: (100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 1w + labels: + severity: info + annotations: + summary: Host Memory is underutilized (instance {{ $labels.instance }}) + description: "Node memory is < 20% for 1 week. Consider reducing memory space. 
(instance {{ $labels.instance }})\n VALUE = {{ $value }}" + - alert: HostUnusualNetworkThroughputIn + expr: (sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 5m + labels: + severity: warning + annotations: + summary: Host unusual network throughput in (instance {{ $labels.instance }}) + description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}" + - alert: HostUnusualNetworkThroughputOut + expr: (sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 5m + labels: + severity: warning + annotations: + summary: Host unusual network throughput out (instance {{ $labels.instance }}) + description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}" + - alert: HostUnusualDiskReadRate + expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 5m + labels: + severity: warning + annotations: + summary: Host unusual disk read rate (instance {{ $labels.instance }}) + description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}" + - alert: HostUnusualDiskWriteRate + expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host unusual disk write rate (instance {{ $labels.instance }}) + description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}" + # Please add ignored mountpoints in node_exporter parameters like + # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". + # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. + - alert: HostOutOfDiskSpace + expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host out of disk space (instance {{ $labels.instance }}) + description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}" + # Please add ignored mountpoints in node_exporter parameters like + # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". + # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. 
+ - alert: HostDiskWillFillIn24Hours + expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host disk will fill in 24 hours (instance {{ $labels.instance }}) + description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}" + - alert: HostOutOfInodes + expr: (node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host out of inodes (instance {{ $labels.instance }}) + description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}" + - alert: HostInodesWillFillIn24Hours + expr: (node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }}) + description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}" + - alert: HostFilesystemDeviceError + expr: node_filesystem_device_error == 1 + for: 2m + labels: + severity: critical + annotations: + summary: Host filesystem device error (instance {{ $labels.instance }}) + description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}" + - alert: HostUnusualDiskReadLatency + expr: (rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host unusual disk read latency (instance {{ $labels.instance }}) + description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}" + - alert: HostUnusualDiskWriteLatency + expr: (rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host unusual disk write latency (instance {{ $labels.instance }}) + description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}" + - alert: HostHighCpuLoad + expr: (sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 10m + labels: + severity: warning + annotations: + summary: Host high CPU load (instance {{ $labels.instance }}) + description: "CPU load is > 80%\n VALUE = {{ $value }}" + # We might want to introduce that later, tho maybe excluding hosts with one core, if 
possible and only for VMs? + # # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly + # - alert: HostCpuIsUnderutilized + # expr: (100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + # for: 1w + # labels: + # severity: info + # annotations: + # summary: Host CPU is underutilized (instance {{ $labels.instance }}) + # description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}" + - alert: HostCpuStealNoisyNeighbor + expr: (avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 0m + labels: + severity: warning + annotations: + summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }}) + description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}" + - alert: HostCpuHighIowait + expr: (avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 0m + labels: + severity: warning + annotations: + summary: Host CPU high iowait (instance {{ $labels.instance }}) + description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}" + - alert: HostUnusualDiskIo + expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 5m + labels: + severity: warning + annotations: + summary: Host unusual disk IO (instance {{ $labels.instance }}) + description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}" + # # x2 context switches is an arbitrary number. + # # The alert threshold depends on the nature of the application. 
+ # # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58 + # - alert: HostContextSwitchingHigh + # expr: (rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2 + # for: 0m + # labels: + # severity: warning + # annotations: + # summary: Host context switching high (instance {{ $labels.instance }}) + # description: "Context switching is growing on the node (twice the daily average during the last 15m)\n VALUE = {{ $value }}" + - alert: HostSwapIsFillingUp + expr: ((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host swap is filling up (instance {{ $labels.instance }}) + description: "Swap is filling up (>80%)\n VALUE = {{ $value }}" + - alert: HostSystemdServiceCrashed + expr: (node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 0m + labels: + severity: warning + annotations: + summary: Host systemd service crashed (instance {{ $labels.instance }}) + description: "systemd service crashed\n VALUE = {{ $value }}" + - alert: HostPhysicalComponentTooHot + expr: ((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 5m + labels: + severity: warning + annotations: + summary: Host physical component too hot (instance {{ $labels.instance }}) + description: "Physical hardware component too hot\n VALUE = {{ $value }}" + - alert: HostNodeOvertemperatureAlarm + expr: ((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 0m + labels: + severity: critical + annotations: + summary: Host node overtemperature alarm (instance {{ $labels.instance }}) + description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}" + - alert: HostRaidArrayGotInactive + expr: (node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 0m + labels: + severity: critical + annotations: + summary: Host RAID array got inactive (instance {{ $labels.instance }}) + description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}" + - alert: HostRaidDiskFailure + expr: (node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host RAID disk failure (instance {{ $labels.instance }}) + description: "At least one device in RAID array on {{ $labels.instance }} failed. 
Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}" + - alert: HostKernelVersionDeviations + expr: (count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 6h + labels: + severity: warning + annotations: + summary: Host kernel version deviations (instance {{ $labels.instance }}) + description: "Different kernel versions are running\n VALUE = {{ $value }}" + - alert: HostOomKillDetected + expr: (increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 0m + labels: + severity: warning + annotations: + summary: Host OOM kill detected (instance {{ $labels.instance }}) + description: "OOM kill detected\n VALUE = {{ $value }}" + - alert: HostEdacCorrectableErrorsDetected + expr: (increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 0m + labels: + severity: info + annotations: + summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}" + - alert: HostEdacUncorrectableErrorsDetected + expr: (node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 0m + labels: + severity: warning + annotations: + summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}" + - alert: HostNetworkReceiveErrors + expr: (rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Receive Errors (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}" + - alert: HostNetworkTransmitErrors + expr: (rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Transmit Errors (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}" + - alert: HostNetworkBondDegraded + expr: ((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Bond Degraded (instance {{ $labels.instance }}) + description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}" + - alert: HostConntrackLimit + expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 5m + labels: + severity: warning + annotations: + summary: Host conntrack limit (instance {{ 
$labels.instance }}) + description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}" + - alert: HostClockSkew + expr: ((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 10m + labels: + severity: warning + annotations: + summary: Host clock skew (instance {{ $labels.instance }}) + description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}" + - alert: HostClockNotSynchronising + expr: (min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host clock not synchronising (instance {{ $labels.instance }}) + description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}" + - alert: HostRequiresReboot + expr: (node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 4h + labels: + severity: info + annotations: + summary: Host requires reboot (instance {{ $labels.instance }}) + description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}" diff --git a/playbooks/templates/chaosknoten/configs/grafana/compose.yaml.j2 b/playbooks/templates/chaosknoten/configs/grafana/compose.yaml.j2 index e235380..212a9f9 100644 --- a/playbooks/templates/chaosknoten/configs/grafana/compose.yaml.j2 +++ b/playbooks/templates/chaosknoten/configs/grafana/compose.yaml.j2 @@ -11,7 +11,20 @@ services: restart: unless-stopped volumes: - ./configs/prometheus.yml:/etc/prometheus/prometheus.yml + - ./configs/prometheus_alerts.rules.yaml:/etc/prometheus/rules/alerts.rules.yaml - prom_data:/prometheus + + alertmanager: + image: prom/alertmanager + container_name: alertmanager + command: + - '--config.file=/etc/alertmanager/alertmanager.yaml' + ports: + - 9093:9093 + restart: unless-stopped + volumes: + - ./configs/alertmanager.yaml:/etc/alertmanager/alertmanager.yaml + - ./configs/alertmanager_alert_templates.tmpl:/etc/alertmanager/templates/alert_templates.tmpl grafana: image: grafana/grafana diff --git a/playbooks/templates/chaosknoten/configs/grafana/docker_compose/alertmanager.yaml.j2 b/playbooks/templates/chaosknoten/configs/grafana/docker_compose/alertmanager.yaml.j2 new file mode 100644 index 0000000..83aeaad --- /dev/null +++ b/playbooks/templates/chaosknoten/configs/grafana/docker_compose/alertmanager.yaml.j2 @@ -0,0 +1,40 @@ +# Links & References: +# - https://prometheus.io/docs/alerting/latest/configuration/ +# - https://github.com/prometheus/alertmanager/blob/48a99764a1fc9279fc828de83e7a03ae2219abc7/doc/examples/simple.yml + +route: + group_by: ["alertname", "site", "type", "hypervisor"] + + group_wait: 30s + group_interval: 5m + repeat_interval: 3h + + receiver: ccchh-infrastructure-alerts + + +{# Disable these for now, but might be interesting in the future. +# Inhibition rules allow to mute a set of alerts given that another alert is +# firing. +# We use this to mute any warning-level notifications if the same alert is +# already critical. +inhibit_rules: + - source_matchers: [severity="critical"] + target_matchers: [severity="warning"] + # Apply inhibition if the alertname is the same. 
+ # CAUTION: + # If all label names listed in `equal` are missing + # from both the source and target alerts, + # the inhibition rule will apply! + equal: [alertname, cluster, service] #} + +templates: + - "/etc/alertmanager/templates/*.tmpl" + +receivers: + - name: "ccchh-infrastructure-alerts" + telegram_configs: + - send_resolved: true + bot_token: {{ lookup("community.general.passwordstore", "noc/vm-secrets/chaosknoten/grafana/alertmanager_telegram_bot_token", create=false, missing="error") }} + chat_id: -1002434372415 + parse_mode: HTML + message: {{ "'{{ template \"alert-message.telegram.ccchh\" . }}'" }} From 61edc3587f65995b8236d2f71e4e7a098e2a9a18 Mon Sep 17 00:00:00 2001 From: June Date: Wed, 2 Oct 2024 03:43:22 +0200 Subject: [PATCH 05/20] alertmanager: give Alertmanager a persistent storage directory --- playbooks/templates/chaosknoten/configs/grafana/compose.yaml.j2 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/playbooks/templates/chaosknoten/configs/grafana/compose.yaml.j2 b/playbooks/templates/chaosknoten/configs/grafana/compose.yaml.j2 index 212a9f9..3e994dc 100644 --- a/playbooks/templates/chaosknoten/configs/grafana/compose.yaml.j2 +++ b/playbooks/templates/chaosknoten/configs/grafana/compose.yaml.j2 @@ -25,6 +25,7 @@ services: volumes: - ./configs/alertmanager.yaml:/etc/alertmanager/alertmanager.yaml - ./configs/alertmanager_alert_templates.tmpl:/etc/alertmanager/templates/alert_templates.tmpl + - alertmanager_data:/alertmanager grafana: image: grafana/grafana @@ -57,3 +58,4 @@ services: volumes: graf_data: {} prom_data: {} + alertmanager_data: {} From 2e29b78f6ac3a782becbff80c8621f7b2bea50e0 Mon Sep 17 00:00:00 2001 From: June Date: Wed, 2 Oct 2024 03:45:56 +0200 Subject: [PATCH 06/20] prometheus: move Jitsi's node exporter target to hosts job --- .../chaosknoten/configs/grafana/docker_compose/prometheus.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml index bdb9a20..b50b0df 100644 --- a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml +++ b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml @@ -57,7 +57,6 @@ scrape_configs: scheme: http static_configs: - targets: - - jitsi.hamburg.ccc.de:9100 # Node Exporter - jitsi.hamburg.ccc.de:9888 # Jitsi Video Bridge - job_name: chaosknoten honor_timestamps: true @@ -101,3 +100,4 @@ scrape_configs: - mjolnir-intern.hamburg.ccc.de:9100 - woodpecker-intern.hamburg.ccc.de:9100 - penpot-intern.hamburg.ccc.de:9100 + - jitsi.hamburg.ccc.de:9100 From 0a05cad0a1adef0ec1bb0496e6076d9365b3f37f Mon Sep 17 00:00:00 2001 From: June Date: Wed, 2 Oct 2024 04:13:37 +0200 Subject: [PATCH 07/20] prometheus & alertmanager: add self-alerting Add self-alerting for Prometheus and Alertmanager using rules from https://samber.github.io/awesome-prometheus-alerts/rules --- .../grafana/docker_compose/prometheus.yml | 23 +- .../prometheus_alerts.rules.yaml | 219 ++++++++++++++++++ 2 files changed, 233 insertions(+), 9 deletions(-) diff --git a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml index b50b0df..dce71c2 100644 --- a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml +++ b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml @@ -22,6 +22,13 @@ scrape_configs: static_configs: - targets:
- localhost:9090 +- job_name: alertmanager + honor_timestamps: true + metrics_path: /metrics + scheme: http + static_configs: + - targets: + - alertmanager:9093 - job_name: c3lingo honor_timestamps: true scrape_interval: 5s @@ -58,15 +65,6 @@ scrape_configs: static_configs: - targets: - jitsi.hamburg.ccc.de:9888 # Jitsi Video Bridge -- job_name: chaosknoten - honor_timestamps: true - scrape_interval: 5s - scrape_timeout: 1s - metrics_path: /metrics - scheme: http - static_configs: - - targets: - - chaosknoten.hamburg.ccc.de:9100 # Node Exporter - job_name: 'pve' static_configs: - targets: @@ -85,6 +83,7 @@ scrape_configs: replacement: pve-exporter:9221 - job_name: hosts static_configs: + # Wieske Chaosknoten VMs - labels: site: wieske type: virtual_machine @@ -101,3 +100,9 @@ scrape_configs: - woodpecker-intern.hamburg.ccc.de:9100 - penpot-intern.hamburg.ccc.de:9100 - jitsi.hamburg.ccc.de:9100 + # Wieske Physical Machines + - labels: + site: wieske + type: physical_machine + targets: + - chaosknoten.hamburg.ccc.de:9100 diff --git a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml index ebc1748..1c06485 100644 --- a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml +++ b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml @@ -311,3 +311,222 @@ groups: annotations: summary: Host requires reboot (instance {{ $labels.instance }}) description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}" +- name: prometheus + rules: + - alert: PrometheusJobMissing + expr: absent(up{job="prometheus"}) + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus job missing (instance {{ $labels.instance }}) + description: "A Prometheus job has disappeared\n VALUE = {{ $value }}" + - alert: PrometheusTargetMissing + expr: up == 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus target missing (instance {{ $labels.instance }}) + description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}" + - alert: PrometheusAllTargetsMissing + expr: sum by (job) (up) == 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus all targets missing (instance {{ $labels.instance }}) + description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}" + - alert: PrometheusConfigurationReloadFailure + expr: prometheus_config_last_reload_successful != 1 + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus configuration reload failure (instance {{ $labels.instance }}) + description: "Prometheus configuration reload error\n VALUE = {{ $value }}" + - alert: PrometheusTooManyRestarts + expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2 + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus too many restarts (instance {{ $labels.instance }}) + description: "Prometheus has restarted more than twice in the last 15 minutes. 
It might be crashlooping.\n VALUE = {{ $value }}" + - alert: PrometheusAlertmanagerJobMissing + expr: absent(up{job="alertmanager"}) + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus AlertManager job missing (instance {{ $labels.instance }}) + description: "A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}" + - alert: PrometheusAlertmanagerConfigurationReloadFailure + expr: alertmanager_config_last_reload_successful != 1 + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}) + description: "AlertManager configuration reload error\n VALUE = {{ $value }}" + - alert: PrometheusAlertmanagerConfigNotSynced + expr: count(count_values("config_hash", alertmanager_config_hash)) > 1 + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }}) + description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}" + # For testing. + # - alert: PrometheusAlertmanagerE2eDeadManSwitch + # expr: vector(1) + # for: 0m + # labels: + # severity: critical + # annotations: + # summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }}) + # description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.\n VALUE = {{ $value }}" + - alert: PrometheusNotConnectedToAlertmanager + expr: prometheus_notifications_alertmanagers_discovered < 1 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }}) + description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}" + - alert: PrometheusRuleEvaluationFailures + expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus rule evaluation failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}" + - alert: PrometheusTemplateTextExpansionFailures + expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus template text expansion failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}" + - alert: PrometheusRuleEvaluationSlow + expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds + for: 5m + labels: + severity: warning + annotations: + summary: Prometheus rule evaluation slow (instance {{ $labels.instance }}) + description: "Prometheus rule evaluation took more time than the scheduled interval. 
It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}" + - alert: PrometheusNotificationsBacklog + expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0 + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus notifications backlog (instance {{ $labels.instance }}) + description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}" + - alert: PrometheusAlertmanagerNotificationFailing + expr: rate(alertmanager_notifications_failed_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }}) + description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}" + - alert: PrometheusTargetEmpty + expr: prometheus_sd_discovered_targets == 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus target empty (instance {{ $labels.instance }}) + description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}" + - alert: PrometheusTargetScrapingSlow + expr: prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05 + for: 5m + labels: + severity: warning + annotations: + summary: Prometheus target scraping slow (instance {{ $labels.instance }}) + description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned.\n VALUE = {{ $value }}" + - alert: PrometheusLargeScrape + expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10 + for: 5m + labels: + severity: warning + annotations: + summary: Prometheus large scrape (instance {{ $labels.instance }}) + description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}" + - alert: PrometheusTargetScrapeDuplicate + expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0 + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus target scrape duplicate (instance {{ $labels.instance }}) + description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}" + - alert: PrometheusTsdbCheckpointCreationFailures + expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}" + - alert: PrometheusTsdbCheckpointDeletionFailures + expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}" + - alert: PrometheusTsdbCompactionsFailed + expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB compactions failures\n VALUE = {{ $value }}" + - alert: PrometheusTsdbHeadTruncationsFailed + expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0 + for: 0m + 
labels: + severity: critical + annotations: + summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}" + - alert: PrometheusTsdbReloadFailures + expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB reload failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}" + - alert: PrometheusTsdbWalCorruptions + expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}" + - alert: PrometheusTsdbWalTruncationsFailed + expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}" + - alert: PrometheusTimeseriesCardinality + expr: label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000 + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus timeseries cardinality (instance {{ $labels.instance }}) + description: "The \"{{ $labels.name }}\" timeseries cardinality is getting very high: {{ $value }}\n VALUE = {{ $value }}" From 229daa72fc7f104166b207f8086ed3dc81aeeb19 Mon Sep 17 00:00:00 2001 From: Stefan Bethke Date: Thu, 3 Oct 2024 19:44:15 +0200 Subject: [PATCH 08/20] Redirect plain URL to hash for ticket deep links --- .../configs/zammad/nginx/zammad.hamburg.ccc.de.conf | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/playbooks/files/chaosknoten/configs/zammad/nginx/zammad.hamburg.ccc.de.conf b/playbooks/files/chaosknoten/configs/zammad/nginx/zammad.hamburg.ccc.de.conf index 39a7856..f008634 100644 --- a/playbooks/files/chaosknoten/configs/zammad/nginx/zammad.hamburg.ccc.de.conf +++ b/playbooks/files/chaosknoten/configs/zammad/nginx/zammad.hamburg.ccc.de.conf @@ -36,6 +36,10 @@ server { # Also provide "_hidden" for by, since it's not relevant. 
proxy_set_header Forwarded "for=$remote_addr;proto=https;host=$host;by=_hidden"; + location ~/(ticket/zoom/.*) { + return 302 https://zammad.hamburg.ccc.de/#$1; + } + location / { proxy_pass http://127.0.0.1:8080/; } From 43ca24b5e2f70e4aa68f50f021119d42cc8665c3 Mon Sep 17 00:00:00 2001 From: Stefan Bethke Date: Thu, 3 Oct 2024 19:44:43 +0200 Subject: [PATCH 09/20] Take website image from Forgejo --- .../templates/chaosknoten/configs/hackertours/compose.yaml.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playbooks/templates/chaosknoten/configs/hackertours/compose.yaml.j2 b/playbooks/templates/chaosknoten/configs/hackertours/compose.yaml.j2 index 88b215b..7566bab 100644 --- a/playbooks/templates/chaosknoten/configs/hackertours/compose.yaml.j2 +++ b/playbooks/templates/chaosknoten/configs/hackertours/compose.yaml.j2 @@ -38,7 +38,7 @@ services: frontend: web: - image: gitlab-cr.hamburg.ccc.de/ccchh/hackertours/hackertours:latest + image: git.hamburg.ccc.de/ccchh/hackertours/hackertours:latest ports: - "8888:80" restart: unless-stopped From d8188d192b90fae79954c685cf0c388af18b88a3 Mon Sep 17 00:00:00 2001 From: c6ristian Date: Fri, 4 Oct 2024 17:07:49 +0200 Subject: [PATCH 10/20] Use keycloak version 26 --- .../templates/chaosknoten/configs/keycloak/compose.yaml.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playbooks/templates/chaosknoten/configs/keycloak/compose.yaml.j2 b/playbooks/templates/chaosknoten/configs/keycloak/compose.yaml.j2 index f6b293d..9509654 100644 --- a/playbooks/templates/chaosknoten/configs/keycloak/compose.yaml.j2 +++ b/playbooks/templates/chaosknoten/configs/keycloak/compose.yaml.j2 @@ -22,7 +22,7 @@ services: keycloak: - image: git.hamburg.ccc.de/ccchh/oci-images/keycloak:25.0 + image: git.hamburg.ccc.de/ccchh/oci-images/keycloak:26.0 pull_policy: always restart: unless-stopped command: start --optimized From f721dd9feaba061ae8c32641b0d7078ca760c7b9 Mon Sep 17 00:00:00 2001 From: June Date: Wed, 2 Oct 2024 04:19:37 +0200 Subject: [PATCH 11/20] prometheus: make opnsense-ccchh job not fail half the time The scrape seems to take around a second to complete and with the configured timeout of 1s that failed half the time. Therefore use the default, more relaxed scrape interval and timeout and have it be reliable. --- .../chaosknoten/configs/grafana/docker_compose/prometheus.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml index dce71c2..7936fd5 100644 --- a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml +++ b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml @@ -49,8 +49,6 @@ scrape_configs: - mumble.hamburg.ccc.de:443 - job_name: opnsense-ccchh honor_timestamps: true - scrape_interval: 5s - scrape_timeout: 1s metrics_path: /metrics scheme: http static_configs: From 4cac84e7ec6f7d9f8cde3a5a0677d744545c97b2 Mon Sep 17 00:00:00 2001 From: June Date: Sat, 5 Oct 2024 17:17:35 +0200 Subject: [PATCH 12/20] prometheus: have different disk alerts for physical and virtual hosts Have more relaxed read/write alerts for physical hosts as they are probably hypervisors and regular high read/writes are more common. Also differentiate between physical and virtual hosts for IO alerts and allow for hard disks to spend more time in IO. 
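Since the physical/virtual split hinges entirely on the type target label attached by the hosts scrape job, it can be checked with another promtool unit test: a constant 64 MB/s read rate on a target labeled type="virtual_machine" should fire the virtual-machine alert while the physical-machine variant stays silent. A sketch along the lines of the test above, again with made-up names:

# disk_alert_split_test.yaml (hypothetical; run with: promtool test rules disk_alert_split_test.yaml)
rule_files:
  - prometheus_alerts.rules.yaml
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # 4026531840 bytes per minute works out to exactly 64 MB/s,
      # above the 50 MB/s threshold for virtual machines.
      - series: 'node_disk_read_bytes_total{instance="vm-a:9100",device="vda"}'
        values: '0+4026531840x12'
      - series: 'node_uname_info{instance="vm-a:9100",nodename="vm-a",type="virtual_machine"}'
        values: '1x12'
    alert_rule_test:
      - eval_time: 10m
        alertname: VirtualHostUnusualDiskReadRate
        exp_alerts:
          - exp_labels:
              severity: warning
              instance: vm-a:9100
              nodename: vm-a
            exp_annotations:
              summary: "Virtual host unusual disk read rate (instance vm-a:9100)"
              description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = 64"
      # No target carries type="physical_machine", so the physical variant must not fire.
      - eval_time: 10m
        alertname: PhysicalHostUnusualDiskReadRate
        exp_alerts: []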
--- .../prometheus_alerts.rules.yaml | 52 +++++++++++++++---- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml index 1c06485..284c7ec 100644 --- a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml +++ b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml @@ -44,22 +44,39 @@ groups: annotations: summary: Host unusual network throughput out (instance {{ $labels.instance }}) description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}" - - alert: HostUnusualDiskReadRate - expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + # Have different disk read and write rate alerts for VMs and physical machines. + - alert: VirtualHostUnusualDiskReadRate + expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"} for: 5m labels: severity: warning annotations: - summary: Host unusual disk read rate (instance {{ $labels.instance }}) + summary: Virtual host unusual disk read rate (instance {{ $labels.instance }}) description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}" - - alert: HostUnusualDiskWriteRate - expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + - alert: VirtualHostUnusualDiskWriteRate + expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Virtual host unusual disk write rate (instance {{ $labels.instance }}) + description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}" + - alert: PhysicalHostUnusualDiskReadRate + expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"} + for: 20m + labels: + severity: warning + annotations: + summary: Physical host unusual disk read rate (instance {{ $labels.instance }}) + description: "Disk is probably reading too much data (> 100 MB/s)\n VALUE = {{ $value }}" + - alert: PhysicalHostUnusualDiskWriteRate + expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"} + for: 15m + labels: + severity: warning + annotations: + summary: Physical host unusual disk write rate (instance {{ $labels.instance }}) + description: "Disk is probably writing too much data (> 100 MB/s)\n VALUE = {{ $value }}" # Please add ignored mountpoints in node_exporter parameters like # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. @@ -156,13 +173,30 @@ groups: annotations: summary: Host CPU high iowait (instance {{ $labels.instance }}) description: "CPU iowait > 10%.
A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}" - - alert: HostUnusualDiskIo - expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + # Have different disk IO alerts for VMs and physical machines and for physical machines different ones for hard and other disks. + - alert: PhysicalHostUnusualHardDiskIo + expr: (rate(node_disk_io_time_seconds_total{device=~"s.+"}[1m]) > 0.75) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"} for: 5m labels: severity: warning annotations: - summary: Host unusual disk IO (instance {{ $labels.instance }}) + summary: Physical host unusual hard disk IO (instance {{ $labels.instance }}) + description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}" + - alert: PhysicalHostUnusualOtherDiskIo + expr: (rate(node_disk_io_time_seconds_total{device!~"s.+"}[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"} + for: 5m + labels: + severity: warning + annotations: + summary: Physical host unusual other (non-hard) disk IO (instance {{ $labels.instance }}) + description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}" + - alert: VirtualHostUnusualDiskIo + expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"} + for: 5m + labels: + severity: warning + annotations: + summary: Virtual host unusual disk IO (instance {{ $labels.instance }}) description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}" # # x2 context switches is an arbitrary number. # # The alert threshold depends on the nature of the application. From 2fc54f5a83abd0fbcd07eb402fe0ae29f72c353b Mon Sep 17 00:00:00 2001 From: Stefan Bethke Date: Tue, 8 Oct 2024 20:28:56 +0200 Subject: [PATCH 13/20] Add missing headers to avoid CSRF errors --- .../configs/zammad/nginx/zammad.hamburg.ccc.de.conf | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/playbooks/files/chaosknoten/configs/zammad/nginx/zammad.hamburg.ccc.de.conf b/playbooks/files/chaosknoten/configs/zammad/nginx/zammad.hamburg.ccc.de.conf index f008634..c1f9182 100644 --- a/playbooks/files/chaosknoten/configs/zammad/nginx/zammad.hamburg.ccc.de.conf +++ b/playbooks/files/chaosknoten/configs/zammad/nginx/zammad.hamburg.ccc.de.conf @@ -35,6 +35,10 @@ server { # is transparent). # Also provide "_hidden" for by, since it's not relevant. proxy_set_header Forwarded "for=$remote_addr;proto=https;host=$host;by=_hidden"; + proxy_read_timeout 86400; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "Upgrade"; + proxy_set_header CLIENT_IP $remote_addr; location ~/(ticket/zoom/.*) { return 302 https://zammad.hamburg.ccc.de/#$1; From d7a9534eeb40c52101a16d1bec015a300aa0435e Mon Sep 17 00:00:00 2001 From: June Date: Sat, 12 Oct 2024 22:00:14 +0200 Subject: [PATCH 14/20] public-reverse-proxy: use public-web-static as host for hackert. 
ccchh --- .../configs/public-reverse-proxy/nginx/acme_challenge.conf | 2 +- .../chaosknoten/configs/public-reverse-proxy/nginx/nginx.conf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/playbooks/files/chaosknoten/configs/public-reverse-proxy/nginx/acme_challenge.conf b/playbooks/files/chaosknoten/configs/public-reverse-proxy/nginx/acme_challenge.conf index edc1d6e..32d7337 100644 --- a/playbooks/files/chaosknoten/configs/public-reverse-proxy/nginx/acme_challenge.conf +++ b/playbooks/files/chaosknoten/configs/public-reverse-proxy/nginx/acme_challenge.conf @@ -8,7 +8,7 @@ map $host $upstream_acme_challenge_host { element.hamburg.ccc.de 172.31.17.151:31820; git.hamburg.ccc.de 172.31.17.154:31820; grafana.hamburg.ccc.de 172.31.17.145:31820; - hackertours.hamburg.ccc.de 172.31.17.148:31820; + hackertours.hamburg.ccc.de 172.31.17.151:31820; hamburg.ccc.de 172.31.17.151:31820; id.hamburg.ccc.de 172.31.17.144:31820; invite.hamburg.ccc.de 172.31.17.144:31820; diff --git a/playbooks/files/chaosknoten/configs/public-reverse-proxy/nginx/nginx.conf b/playbooks/files/chaosknoten/configs/public-reverse-proxy/nginx/nginx.conf index 72f475c..2a83e4a 100644 --- a/playbooks/files/chaosknoten/configs/public-reverse-proxy/nginx/nginx.conf +++ b/playbooks/files/chaosknoten/configs/public-reverse-proxy/nginx/nginx.conf @@ -30,7 +30,7 @@ stream { wiki.ccchh.net 172.31.17.146:8443; wiki.hamburg.ccc.de 172.31.17.146:8443; onlyoffice.hamburg.ccc.de 172.31.17.147:8443; - hackertours.hamburg.ccc.de 172.31.17.148:8443; + hackertours.hamburg.ccc.de 172.31.17.151:8443; netbox.hamburg.ccc.de 172.31.17.149:8443; matrix.hamburg.ccc.de 172.31.17.150:8443; element.hamburg.ccc.de 172.31.17.151:8443; From 7cd4a9a7230dac9d8124d319f58b3d04a9b81669 Mon Sep 17 00:00:00 2001 From: June Date: Sat, 12 Oct 2024 22:08:28 +0200 Subject: [PATCH 15/20] public-reverse-proxy: add config for staging.hackertours.hamburg.ccc.de --- .../configs/public-reverse-proxy/nginx/acme_challenge.conf | 1 + .../chaosknoten/configs/public-reverse-proxy/nginx/nginx.conf | 1 + 2 files changed, 2 insertions(+) diff --git a/playbooks/files/chaosknoten/configs/public-reverse-proxy/nginx/acme_challenge.conf b/playbooks/files/chaosknoten/configs/public-reverse-proxy/nginx/acme_challenge.conf index 32d7337..925e4f3 100644 --- a/playbooks/files/chaosknoten/configs/public-reverse-proxy/nginx/acme_challenge.conf +++ b/playbooks/files/chaosknoten/configs/public-reverse-proxy/nginx/acme_challenge.conf @@ -9,6 +9,7 @@ map $host $upstream_acme_challenge_host { git.hamburg.ccc.de 172.31.17.154:31820; grafana.hamburg.ccc.de 172.31.17.145:31820; hackertours.hamburg.ccc.de 172.31.17.151:31820; + staging.hackertours.hamburg.ccc.de 172.31.17.151:31820; hamburg.ccc.de 172.31.17.151:31820; id.hamburg.ccc.de 172.31.17.144:31820; invite.hamburg.ccc.de 172.31.17.144:31820; diff --git a/playbooks/files/chaosknoten/configs/public-reverse-proxy/nginx/nginx.conf b/playbooks/files/chaosknoten/configs/public-reverse-proxy/nginx/nginx.conf index 2a83e4a..26fab2c 100644 --- a/playbooks/files/chaosknoten/configs/public-reverse-proxy/nginx/nginx.conf +++ b/playbooks/files/chaosknoten/configs/public-reverse-proxy/nginx/nginx.conf @@ -31,6 +31,7 @@ stream { wiki.hamburg.ccc.de 172.31.17.146:8443; onlyoffice.hamburg.ccc.de 172.31.17.147:8443; hackertours.hamburg.ccc.de 172.31.17.151:8443; + staging.hackertours.hamburg.ccc.de 172.31.17.151:8443; netbox.hamburg.ccc.de 172.31.17.149:8443; matrix.hamburg.ccc.de 172.31.17.150:8443; element.hamburg.ccc.de 172.31.17.151:8443; From 
From 235e6e514f817ec8e97d54c6e6067e582af4ac33 Mon Sep 17 00:00:00 2001
From: Stefan Bethke
Date: Sun, 13 Oct 2024 09:10:10 +0200
Subject: [PATCH 16/20] Move Pretix from hackertours to tickets

---
 .../host_vars/{hackertours.yaml => tickets.yaml} | 10 +++++-----
 inventories/chaosknoten/hosts.yaml | 12 ++++++------
 .../nginx/acme_challenge.conf | 1 +
 .../configs/public-reverse-proxy/nginx/nginx.conf | 1 +
 .../nginx/tickets.hamburg.ccc.de.conf} | 15 ++++-----------
 .../{hackertours => tickets}/compose.yaml.j2 | 10 +---------
 .../{hackertours => tickets}/pretix.cfg.j2 | 8 ++++----
 7 files changed, 22 insertions(+), 35 deletions(-)
 rename inventories/chaosknoten/host_vars/{hackertours.yaml => tickets.yaml} (65%)
 rename playbooks/files/chaosknoten/configs/{hackertours/nginx/hackertours.hamburg.ccc.de.conf => tickets/nginx/tickets.hamburg.ccc.de.conf} (76%)
 rename playbooks/templates/chaosknoten/configs/{hackertours => tickets}/compose.yaml.j2 (79%)
 rename playbooks/templates/chaosknoten/configs/{hackertours => tickets}/pretix.cfg.j2 (66%)

diff --git a/inventories/chaosknoten/host_vars/hackertours.yaml b/inventories/chaosknoten/host_vars/tickets.yaml
similarity index 65%
rename from inventories/chaosknoten/host_vars/hackertours.yaml
rename to inventories/chaosknoten/host_vars/tickets.yaml
index 6b6a0e5..e160a55 100644
--- a/inventories/chaosknoten/host_vars/hackertours.yaml
+++ b/inventories/chaosknoten/host_vars/tickets.yaml
@@ -1,16 +1,16 @@
-docker_compose__compose_file_content: "{{ lookup('ansible.builtin.template', 'chaosknoten/configs/hackertours/compose.yaml.j2') }}"
+docker_compose__compose_file_content: "{{ lookup('ansible.builtin.template', 'chaosknoten/configs/tickets/compose.yaml.j2') }}"
 docker_compose__configuration_files:
   - name: pretix.cfg
-    content: "{{ lookup('ansible.builtin.template', 'templates/chaosknoten/configs/hackertours/pretix.cfg.j2') }}"
+    content: "{{ lookup('ansible.builtin.template', 'templates/chaosknoten/configs/tickets/pretix.cfg.j2') }}"
 
 certbot__version_spec: ""
 certbot__acme_account_email_address: le-admin@hamburg.ccc.de
 certbot__certificate_domains:
-  - "hackertours.hamburg.ccc.de"
+  - "tickets.hamburg.ccc.de"
 certbot__new_cert_commands:
   - "systemctl reload nginx.service"
 
 nginx__version_spec: ""
 nginx__configurations:
-  - name: hackertours.hamburg.ccc.de
-    content: "{{ lookup('ansible.builtin.file', 'chaosknoten/configs/hackertours/nginx/hackertours.hamburg.ccc.de.conf') }}"
+  - name: tickets.hamburg.ccc.de
+    content: "{{ lookup('ansible.builtin.file', 'chaosknoten/configs/tickets/nginx/tickets.hamburg.ccc.de.conf') }}"
diff --git a/inventories/chaosknoten/hosts.yaml b/inventories/chaosknoten/hosts.yaml
index d6f2dcc..06fb108 100644
--- a/inventories/chaosknoten/hosts.yaml
+++ b/inventories/chaosknoten/hosts.yaml
@@ -16,8 +16,8 @@ all:
       ansible_port: 42666
       ansible_user: chaos
       ansible_ssh_common_args: -J ssh://chaos@public-reverse-proxy.hamburg.ccc.de:42666
-    hackertours:
-      ansible_host: hackertours-intern.hamburg.ccc.de
+    tickets:
+      ansible_host: tickets-intern.hamburg.ccc.de
       ansible_port: 42666
       ansible_user: chaos
       ansible_ssh_common_args: -J ssh://chaos@public-reverse-proxy.hamburg.ccc.de:42666
@@ -64,7 +64,7 @@
       hosts:
         ccchoir:
         grafana:
-        hackertours:
+        tickets:
         keycloak:
         lists:
         onlyoffice:
@@ -78,7 +78,7 @@
       hosts:
         ccchoir:
         grafana:
-        hackertours:
+        tickets:
         keycloak:
         lists:
         mumble:
@@ -99,7 +99,7 @@
       hosts:
         ccchoir:
         grafana:
-        hackertours:
+        tickets:
         keycloak:
         lists:
         mumble:
@@ -112,7 +112,7 @@
       hosts:
         ccchoir:
         grafana:
-        hackertours:
+        tickets:
         cloud:
         keycloak:
         onlyoffice:
diff --git a/playbooks/files/chaosknoten/configs/public-reverse-proxy/nginx/acme_challenge.conf b/playbooks/files/chaosknoten/configs/public-reverse-proxy/nginx/acme_challenge.conf
index 925e4f3..f27b0bb 100644
--- a/playbooks/files/chaosknoten/configs/public-reverse-proxy/nginx/acme_challenge.conf
+++ b/playbooks/files/chaosknoten/configs/public-reverse-proxy/nginx/acme_challenge.conf
@@ -24,6 +24,7 @@ map $host $upstream_acme_challenge_host {
     wiki.ccchh.net 172.31.17.146:31820;
     wiki.hamburg.ccc.de 172.31.17.146:31820;
     www.hamburg.ccc.de 172.31.17.151:31820;
+    tickets.hamburg.ccc.de 172.31.17.148:31820;
     zammad.hamburg.ccc.de 172.31.17.152:31820;
     eh03.easterhegg.eu 172.31.17.151:31820;
     eh05.easterhegg.eu 172.31.17.151:31820;
diff --git a/playbooks/files/chaosknoten/configs/public-reverse-proxy/nginx/nginx.conf b/playbooks/files/chaosknoten/configs/public-reverse-proxy/nginx/nginx.conf
index 26fab2c..7980d05 100644
--- a/playbooks/files/chaosknoten/configs/public-reverse-proxy/nginx/nginx.conf
+++ b/playbooks/files/chaosknoten/configs/public-reverse-proxy/nginx/nginx.conf
@@ -40,6 +40,7 @@ stream {
     hamburg.ccc.de 172.31.17.151:8443;
     staging.hamburg.ccc.de 172.31.17.151:8443;
     spaceapi.hamburg.ccc.de 172.31.17.151:8443;
+    tickets.hamburg.ccc.de 172.31.17.148:8443;
     zammad.hamburg.ccc.de 172.31.17.152:8443;
     c3cat.de 172.31.17.151:8443;
     git.hamburg.ccc.de 172.31.17.154:8443;
diff --git a/playbooks/files/chaosknoten/configs/hackertours/nginx/hackertours.hamburg.ccc.de.conf b/playbooks/files/chaosknoten/configs/tickets/nginx/tickets.hamburg.ccc.de.conf
similarity index 76%
rename from playbooks/files/chaosknoten/configs/hackertours/nginx/hackertours.hamburg.ccc.de.conf
rename to playbooks/files/chaosknoten/configs/tickets/nginx/tickets.hamburg.ccc.de.conf
index 7f64e64..eb3b3a7 100644
--- a/playbooks/files/chaosknoten/configs/hackertours/nginx/hackertours.hamburg.ccc.de.conf
+++ b/playbooks/files/chaosknoten/configs/tickets/nginx/tickets.hamburg.ccc.de.conf
@@ -12,12 +12,12 @@
     # header.
     real_ip_header proxy_protocol;
 
-    server_name hackertours.hamburg.ccc.de;
+    server_name tickets.hamburg.ccc.de;
 
-    ssl_certificate /etc/letsencrypt/live/hackertours.hamburg.ccc.de/fullchain.pem;
-    ssl_certificate_key /etc/letsencrypt/live/hackertours.hamburg.ccc.de/privkey.pem;
+    ssl_certificate /etc/letsencrypt/live/tickets.hamburg.ccc.de/fullchain.pem;
+    ssl_certificate_key /etc/letsencrypt/live/tickets.hamburg.ccc.de/privkey.pem;
     # verify chain of trust of OCSP response using Root CA and Intermediate certs
-    ssl_trusted_certificate /etc/letsencrypt/live/hackertours.hamburg.ccc.de/chain.pem;
+    ssl_trusted_certificate /etc/letsencrypt/live/tickets.hamburg.ccc.de/chain.pem;
 
     # HSTS (ngx_http_headers_module is required) (63072000 seconds)
     add_header Strict-Transport-Security "max-age=63072000" always;
@@ -36,13 +36,6 @@
     # Also provide "_hidden" for by, since it's not relevant.
     proxy_set_header Forwarded "for=$remote_addr;proto=https;host=$host;by=_hidden";
 
-    location = / {
-        proxy_pass http://127.0.0.1:8888/;
-    }
-    location ~ ^/(apple-touch-icon.png|assets|css|de|en|js|posts|tours)(.*)$ {
-        proxy_pass http://127.0.0.1:8888/$1$2;
-    }
-
     location / {
         proxy_pass http://127.0.0.1:8345/;
     }
diff --git a/playbooks/templates/chaosknoten/configs/hackertours/compose.yaml.j2 b/playbooks/templates/chaosknoten/configs/tickets/compose.yaml.j2
similarity index 79%
rename from playbooks/templates/chaosknoten/configs/hackertours/compose.yaml.j2
rename to playbooks/templates/chaosknoten/configs/tickets/compose.yaml.j2
index 7566bab..1f9d99d 100644
--- a/playbooks/templates/chaosknoten/configs/hackertours/compose.yaml.j2
+++ b/playbooks/templates/chaosknoten/configs/tickets/compose.yaml.j2
@@ -4,7 +4,7 @@ services:
     image: docker.io/library/postgres:15-alpine
     environment:
       - "POSTGRES_USER=pretix"
-      - "POSTGRES_PASSWORD={{ lookup("community.general.passwordstore", "noc/vm-secrets/chaosknoten/hackertours/DB_PASSWORD", create=false, missing="error") }}"
+      - "POSTGRES_PASSWORD={{ lookup("community.general.passwordstore", "noc/vm-secrets/chaosknoten/tickets/DB_PASSWORD", create=false, missing="error") }}"
       - "POSTGRES_DB=pretix"
     volumes:
       - database:/var/lib/postgresql/data
@@ -37,14 +37,6 @@ services:
       backend:
       frontend:
 
-  web:
-    image: git.hamburg.ccc.de/ccchh/hackertours/hackertours:latest
-    ports:
-      - "8888:80"
-    restart: unless-stopped
-    networks:
-      frontend:
-
 volumes:
   database: {}
   pretix: {}
diff --git a/playbooks/templates/chaosknoten/configs/hackertours/pretix.cfg.j2 b/playbooks/templates/chaosknoten/configs/tickets/pretix.cfg.j2
similarity index 66%
rename from playbooks/templates/chaosknoten/configs/hackertours/pretix.cfg.j2
rename to playbooks/templates/chaosknoten/configs/tickets/pretix.cfg.j2
index a585cb9..3f4af83 100644
--- a/playbooks/templates/chaosknoten/configs/hackertours/pretix.cfg.j2
+++ b/playbooks/templates/chaosknoten/configs/tickets/pretix.cfg.j2
@@ -1,6 +1,6 @@
 [pretix]
-instance_name=CCCHH Hackertours
-url=https://hackertours.hamburg.ccc.de
+instance_name=CCCHH Tickets
+url=https://tickets.hamburg.ccc.de
 currency=EUR
 datadir=/data
 trust_x_forwarded_for=on
@@ -10,11 +10,11 @@ trust_x_forwarded_proto=on
 backend=postgresql
 name=pretix
 user=pretix
-password={{ lookup("community.general.passwordstore", "noc/vm-secrets/chaosknoten/hackertours/DB_PASSWORD", create=false, missing="error") }}
+password={{ lookup("community.general.passwordstore", "noc/vm-secrets/chaosknoten/tickets/DB_PASSWORD", create=false, missing="error") }}
 host=database
 
 [mail]
-from=ticket@hackertours.hamburg.ccc.de
+from=tickets@hamburg.ccc.de
 host=cow-intern.hamburg.ccc.de
 
 [redis]
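One operational detail worth noting alongside this rename: the passwordstore lookups above move from a hackertours/ to a tickets/ path, and they are written to fail rather than to auto-create a secret. A standalone sketch of the same pattern (the variable name is illustrative, not from the repo):

    # Resolves noc/vm-secrets/chaosknoten/tickets/DB_PASSWORD from pass.
    # create=false together with missing="error" aborts the play when the
    # entry is absent, so the secret has to be moved in the password store
    # before this patch can deploy cleanly.
    db_password: "{{ lookup('community.general.passwordstore', 'noc/vm-secrets/chaosknoten/tickets/DB_PASSWORD', create=false, missing='error') }}"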
From 2f00d2182163755f88948b22500e3297b192f084 Mon Sep 17 00:00:00 2001
From: Stefan Bethke
Date: Sun, 13 Oct 2024 13:50:50 +0200
Subject: [PATCH 17/20] Redirect home page to wiki

---
 .../configs/tickets/nginx/tickets.hamburg.ccc.de.conf | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/playbooks/files/chaosknoten/configs/tickets/nginx/tickets.hamburg.ccc.de.conf b/playbooks/files/chaosknoten/configs/tickets/nginx/tickets.hamburg.ccc.de.conf
index eb3b3a7..0eb5485 100644
--- a/playbooks/files/chaosknoten/configs/tickets/nginx/tickets.hamburg.ccc.de.conf
+++ b/playbooks/files/chaosknoten/configs/tickets/nginx/tickets.hamburg.ccc.de.conf
@@ -36,6 +36,10 @@ server {
     # Also provide "_hidden" for by, since it's not relevant.
     proxy_set_header Forwarded "for=$remote_addr;proto=https;host=$host;by=_hidden";
 
+    location = / {
+        return 302 https://wiki.hamburg.ccc.de/infrastructure:service-overview#tickets_pretix;
+    }
+
     location / {
         proxy_pass http://127.0.0.1:8345/;
     }
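Taken together with the rename in patch 16, the vhost ends up with exactly two routes. Assembled from the hunks above (nothing here beyond what the two patches produce), the relevant part of tickets.hamburg.ccc.de.conf reads:

    # Exact match: visitors of the bare domain land on the wiki page.
    location = / {
        return 302 https://wiki.hamburg.ccc.de/infrastructure:service-overview#tickets_pretix;
    }

    # Everything else is proxied to the local Pretix container.
    location / {
        proxy_pass http://127.0.0.1:8345/;
    }

Using a 302 keeps the redirect target easy to change later, at the cost of clients re-asking each time; a 301 would be cached by browsers.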
From b660d937dc01d85aca4e1b7417ef2e7aca1fe643 Mon Sep 17 00:00:00 2001
From: Stefan Bethke
Date: Fri, 18 Oct 2024 12:40:24 +0200
Subject: [PATCH 18/20] Allow GPG keys as uploads

---
 playbooks/roles/dokuwiki/files/mime.local.conf | 1 +
 1 file changed, 1 insertion(+)

diff --git a/playbooks/roles/dokuwiki/files/mime.local.conf b/playbooks/roles/dokuwiki/files/mime.local.conf
index 66d6072..592c299 100644
--- a/playbooks/roles/dokuwiki/files/mime.local.conf
+++ b/playbooks/roles/dokuwiki/files/mime.local.conf
@@ -2,3 +2,4 @@
 
 # Allow stl files.
 stl !model/stl
+asc application/pgp-keys

From 34dc6d9a84b0b89a7c687493a9b21f52d8d745c8 Mon Sep 17 00:00:00 2001
From: c6ristian
Date: Fri, 18 Oct 2024 21:15:20 +0200
Subject: [PATCH 19/20] Reduce HostMemoryIsUnderutilized threshold to 10%

---
 .../grafana/docker_compose/prometheus_alerts.rules.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml
index 284c7ec..e638248 100644
--- a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml
+++ b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml
@@ -21,13 +21,13 @@ groups:
         description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}"
     # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
     - alert: HostMemoryIsUnderutilized
-      expr: (100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+      expr: (100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
       for: 1w
       labels:
        severity: info
      annotations:
        summary: Host Memory is underutilized (instance {{ $labels.instance }})
-       description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}"
+       description: "Node memory is < 10% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}"
    - alert: HostUnusualNetworkThroughputIn
      expr: (sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
      for: 5m
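For review, the threshold expression decomposed (informal commentary, not part of the rules file):

    # avail% = avg_over_time(node_memory_MemAvailable_bytes[30m])
    #          / node_memory_MemTotal_bytes * 100
    # used%  = 100 - avail%
    # The alert fires once used% stays below the bound for a whole week
    # ("for: 1w"). The trailing "* on(instance) group_left (nodename)
    # node_uname_info{...}" changes no values (node_uname_info is always 1);
    # it only joins in the nodename label for nicer notifications.

Dropping the bound from 20 to 10 therefore silences the info-level noise for hosts that idle between 10% and 20% memory use.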
From 735fe0ca9b6dae82804b0747b4df08f96c019c99 Mon Sep 17 00:00:00 2001
From: Herr-Dante
Date: Tue, 24 Sep 2024 23:23:22 +0200
Subject: [PATCH 20/20] Add local port forwarding for debug sessions

---
 playbooks/roles/nextcloud/templates/compose.yaml.j2 | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/playbooks/roles/nextcloud/templates/compose.yaml.j2 b/playbooks/roles/nextcloud/templates/compose.yaml.j2
index 21bf15c..520be37 100644
--- a/playbooks/roles/nextcloud/templates/compose.yaml.j2
+++ b/playbooks/roles/nextcloud/templates/compose.yaml.j2
@@ -36,6 +36,8 @@ services:
   db:
     image: postgres:{{ nextcloud__postgres_version }}
     restart: unless-stopped
+    #ports:
+    #  - 127.0.0.1:5432:5432
     networks:
       - nextcloud
     volumes:
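Why the mapping ships commented out: by default the Postgres port is not published on the host at all, and other containers reach it over the compose network. For a debug session one would uncomment the two lines, which binds the port to the VM's loopback only:

    ports:
      - 127.0.0.1:5432:5432   # reachable from the VM itself, not from the network

and then reach the database from a workstation through a tunnel, e.g. something like ssh -L 5432:127.0.0.1:5432 into the Nextcloud VM (command illustrative; the actual access path goes through the jump host configured in the inventory). The forward never exposes Postgres beyond localhost.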