Add local port forwarding for debug sessions

Reduce "Host Memory is underutilized" alert threshold to 10%
Allow GPG keys as uploads
2024-10-27 22:27:07 +01:00 · 2024-10-18 21:15:20 +02:00 · 2024-10-18 12:40:24 +02:00 · 2024-10-13 13:50:50 +02:00 · 2024-10-13 09:10:10 +02:00 · 2024-10-12 22:08:28 +02:00
16 changed files with 746 additions and 53 deletions
--- a/inventories/chaosknoten/host_vars/grafana.yaml
+++ b/inventories/chaosknoten/host_vars/grafana.yaml
@ -6,6 +6,12 @@ docker_compose__configuration_files:
    content: "{{ lookup('ansible.builtin.file', 'files/chaosknoten/configs/grafana/docker_compose/grafana-datasource.yml') }}"
  - name: prometheus.yml
    content: "{{ lookup('ansible.builtin.file', 'files/chaosknoten/configs/grafana/docker_compose/prometheus.yml') }}"
+  - name: alertmanager.yaml
+    content: "{{ lookup('ansible.builtin.template', 'templates/chaosknoten/configs/grafana/docker_compose/alertmanager.yaml.j2') }}"
+  - name: prometheus_alerts.rules.yaml
+    content: "{{ lookup('ansible.builtin.file', 'files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml') }}"
+  - name: alertmanager_alert_templates.tmpl
+    content: "{{ lookup('ansible.builtin.file', 'files/chaosknoten/configs/grafana/docker_compose/alertmanager_alert_templates.tmpl') }}"

 certbot__version_spec: ""
 certbot__acme_account_email_address: le-admin@hamburg.ccc.de
--- a/inventories/chaosknoten/host_vars/hackertours.yaml
+++ b/inventories/chaosknoten/host_vars/hackertours.yaml
@ -1,16 +1,16 @@
-docker_compose__compose_file_content: "{{ lookup('ansible.builtin.template', 'chaosknoten/configs/hackertours/compose.yaml.j2') }}"
+docker_compose__compose_file_content: "{{ lookup('ansible.builtin.template', 'chaosknoten/configs/tickets/compose.yaml.j2') }}"
 docker_compose__configuration_files:
  - name: pretix.cfg
-    content: "{{ lookup('ansible.builtin.template', 'templates/chaosknoten/configs/hackertours/pretix.cfg.j2') }}"
+    content: "{{ lookup('ansible.builtin.template', 'templates/chaosknoten/configs/tickets/pretix.cfg.j2') }}"

 certbot__version_spec: ""
 certbot__acme_account_email_address: le-admin@hamburg.ccc.de
 certbot__certificate_domains:
-  - "hackertours.hamburg.ccc.de"
+  - "tickets.hamburg.ccc.de"
 certbot__new_cert_commands:
  - "systemctl reload nginx.service"

 nginx__version_spec: ""
 nginx__configurations:
-  - name: hackertours.hamburg.ccc.de
-    content: "{{ lookup('ansible.builtin.file', 'chaosknoten/configs/hackertours/nginx/hackertours.hamburg.ccc.de.conf') }}"
+  - name: tickets.hamburg.ccc.de
+    content: "{{ lookup('ansible.builtin.file', 'chaosknoten/configs/tickets/nginx/tickets.hamburg.ccc.de.conf') }}"
--- a/inventories/chaosknoten/hosts.yaml
+++ b/inventories/chaosknoten/hosts.yaml
@ -16,8 +16,8 @@ all:
          ansible_port: 42666
          ansible_user: chaos
          ansible_ssh_common_args: -J ssh://chaos@public-reverse-proxy.hamburg.ccc.de:42666
-        hackertours:
-          ansible_host: hackertours-intern.hamburg.ccc.de
+        tickets:
+          ansible_host: tickets-intern.hamburg.ccc.de
          ansible_port: 42666
          ansible_user: chaos
          ansible_ssh_common_args: -J ssh://chaos@public-reverse-proxy.hamburg.ccc.de:42666
@ -64,7 +64,7 @@ all:
      hosts:
        ccchoir:
        grafana:
-        hackertours:
+        tickets:
        keycloak:
        lists:
        onlyoffice:
@ -78,7 +78,7 @@ all:
      hosts:
        ccchoir:
        grafana:
-        hackertours:
+        tickets:
        keycloak:
        lists:
        mumble:
@ -99,7 +99,7 @@ all:
      hosts:
        ccchoir:
        grafana:
-        hackertours:
+        tickets:
        keycloak:
        lists:
        mumble:
@ -112,7 +112,7 @@ all:
      hosts:
        ccchoir:
        grafana:
-        hackertours:
+        tickets:
        cloud:
        keycloak:
        onlyoffice:
--- a/playbooks/files/chaosknoten/configs/grafana/docker_compose/alertmanager_alert_templates.tmpl
+++ b/playbooks/files/chaosknoten/configs/grafana/docker_compose/alertmanager_alert_templates.tmpl
@ -0,0 +1,35 @@
+{{/*
+Links & Resources
+- https://prometheus.io/blog/2016/03/03/custom-alertmanager-templates/
+- https://prometheus.io/docs/alerting/latest/notifications/
+- https://gist.github.com/jidckii/5ac5f8f20368b56de72af70222509b7b
+*/}}
+{{ define "alert-item.telegram.ccchh.internal" }}
+<b>[{{ .Labels.alertname }}] {{ .Labels.nodename }}</b>
+{{- if .Annotations.summary }}
+<i>Summary</i>: {{ .Annotations.summary }}
+{{- end }}
+{{- if .Annotations.description }}
+<i>Description</i>: {{ .Annotations.description }}
+{{- end }}
+<i>Labels</i>:
+{{ range .Labels.SortedPairs -}}
+• <i>{{ .Name }}</i>: <code>{{ .Value }}</code>
+{{ end }}
+{{- end }}
+
+
+{{ define "alert-message.telegram.ccchh" }}
+{{- if .Alerts.Firing }}
+<u>🔥{{ len .Alerts.Firing }} Alert(/s) Firing 🔥</u>
+{{ range .Alerts.Firing -}}
+{{ template "alert-item.telegram.ccchh.internal" . }}
+{{- end }}
+{{- end }}
+{{- if .Alerts.Resolved }}
+<u>✅{{ len .Alerts.Resolved }} Alert(/s) Resolved ✅</u>
+{{ range .Alerts.Resolved -}}
+{{ template "alert-item.telegram.ccchh.internal" . }}
+{{- end }}
+{{- end }}
+{{- end }}
--- a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml
+++ b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml
@ -2,23 +2,33 @@ global:
  scrape_interval: 15s
  scrape_timeout: 10s
  evaluation_interval: 15s
+
 alerting:
  alertmanagers:
-  - static_configs:
-    - targets: []
-    scheme: http
+  - scheme: http
    timeout: 10s
-    api_version: v1
+    static_configs:
+    - targets:
+      - "alertmanager:9093"
+
+rule_files:
+  - "/etc/prometheus/rules/*.rules.yaml"
+
 scrape_configs:
 - job_name: prometheus
  honor_timestamps: true
-  scrape_interval: 15s
-  scrape_timeout: 10s
  metrics_path: /metrics
  scheme: http
  static_configs:
  - targets:
    - localhost:9090
+- job_name: alertmanager
+  honor_timestamps: true
+  metrics_path: /metrics
+  scheme: http
+  static_configs:
+  - targets:
+    - alertmanager:9093
 - job_name: c3lingo
  honor_timestamps: true
  scrape_interval: 5s
@ -39,8 +49,6 @@ scrape_configs:
    - mumble.hamburg.ccc.de:443
 - job_name: opnsense-ccchh
  honor_timestamps: true
-  scrape_interval: 5s
-  scrape_timeout: 1s
  metrics_path: /metrics
  scheme: http
  static_configs:
@ -54,17 +62,7 @@ scrape_configs:
  scheme: http
  static_configs:
  - targets:
-    - jitsi.hamburg.ccc.de:9100 # Node Exporter
    - jitsi.hamburg.ccc.de:9888 # Jitsi Video Bridge
- job_name: chaosknoten
-  honor_timestamps: true
-  scrape_interval: 5s
-  scrape_timeout: 1s
-  metrics_path: /metrics
-  scheme: http
-  static_configs:
-  - targets:
-    - chaosknoten.hamburg.ccc.de:9100 # Node Exporter
 - job_name: 'pve'
  static_configs:
    - targets:
@ -81,3 +79,28 @@ scrape_configs:
      target_label: instance
    - target_label: __address__
      replacement: pve-exporter:9221
+- job_name: hosts
+  static_configs:
+    # Wieske Chaosknoten VMs
+    - labels:
+        site: wieske
+        type: virtual_machine
+        hypervisor: chaosknoten
+      targets:
+        - netbox-intern.hamburg.ccc.de:9100
+        - matrix-intern.hamburg.ccc.de:9100
+        - public-web-static-intern.hamburg.ccc.de:9100
+        - git-intern.hamburg.ccc.de:9100
+        - forgejo-actions-runner-intern.hamburg.ccc.de:9100
+        - eh22-wiki-intern.hamburg.ccc.de:9100
+        - nix-box-june-intern.hamburg.ccc.de:9100
+        - mjolnir-intern.hamburg.ccc.de:9100
+        - woodpecker-intern.hamburg.ccc.de:9100
+        - penpot-intern.hamburg.ccc.de:9100
+        - jitsi.hamburg.ccc.de:9100
+    # Wieske Physical Machines
+    - labels:
+        site: wieske
+        type: physical_machine
+      targets:
+        - chaosknoten.hamburg.ccc.de:9100
--- a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml
+++ b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml
@ -0,0 +1,566 @@
+# Links & Resources:
+# - https://samber.github.io/awesome-prometheus-alerts/rules
+groups:
+- name: node-exporter
+  rules:
+  - alert: HostOutOfMemory
+    expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host out of memory (instance {{ $labels.instance }})
+      description: "Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}"
+  - alert: HostMemoryUnderMemoryPressure
+    expr: (rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host memory under memory pressure (instance {{ $labels.instance }})
+      description: "The node is under heavy memory pressure. High rate of major page faults\n  VALUE = {{ $value }}"
+  # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
+  - alert: HostMemoryIsUnderutilized
+    expr: (100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+    for: 1w
+    labels:
+      severity: info
+    annotations:
+      summary: Host Memory is underutilized (instance {{ $labels.instance }})
+      description: "Node memory is < 10% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}"
+  - alert: HostUnusualNetworkThroughputIn
+    expr: (sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host unusual network throughput in (instance {{ $labels.instance }})
+      description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n  VALUE = {{ $value }}"
+  - alert: HostUnusualNetworkThroughputOut
+    expr: (sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host unusual network throughput out (instance {{ $labels.instance }})
+      description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n  VALUE = {{ $value }}"
+  # Disk read/write rate alerts differ for VMs and physical machines; the "type" label they filter on is set per target group in the "hosts" scrape job.
+  - alert: VirtualHostUnusualDiskReadRate
+    expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"}
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Virtual host unusual disk read rate (instance {{ $labels.instance }})
+      description: "Disk is probably reading too much data (> 50 MB/s)\n  VALUE = {{ $value }}"
+  - alert: VirtualHostUnusualDiskWriteRate
+    expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"}
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Virtual host unusual disk write rate (instance {{ $labels.instance }})
+      description: "Disk is probably writing too much data (> 50 MB/s)\n  VALUE = {{ $value }}"
+  - alert: PhysicalHostUnusualDiskReadRate
+    expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
+    for: 20m
+    labels:
+      severity: warning
+    annotations:
+      summary: Physical host unusual disk read rate (instance {{ $labels.instance }})
+      description: "Disk is probably reading too much data (> 100 MB/s)\n  VALUE = {{ $value }}"
+  - alert: PhysicalHostUnusualDiskWriteRate
+    expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
+    for: 15m
+    labels:
+      severity: warning
+    annotations:
+      summary: Physical host unusual disk write rate (instance {{ $labels.instance }})
+      description: "Disk is probably writing too much data (> 100 MB/s)\n  VALUE = {{ $value }}"
+  # Please add ignored mountpoints in node_exporter parameters like
+  # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
+  # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
+  - alert: HostOutOfDiskSpace
+    expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host out of disk space (instance {{ $labels.instance }})
+      description: "Disk is almost full (< 10% left)\n  VALUE = {{ $value }}"
+  # Please add ignored mountpoints in node_exporter parameters like
+  # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
+  # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
+  - alert: HostDiskWillFillIn24Hours
+    expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
+      description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n  VALUE = {{ $value }}"
+  - alert: HostOutOfInodes
+    expr: (node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host out of inodes (instance {{ $labels.instance }})
+      description: "Disk is almost running out of available inodes (< 10% left)\n  VALUE = {{ $value }}"
+  - alert: HostInodesWillFillIn24Hours
+    expr: (node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
+      description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n  VALUE = {{ $value }}"
+  - alert: HostFilesystemDeviceError
+    expr: node_filesystem_device_error == 1
+    for: 2m
+    labels:
+      severity: critical
+    annotations:
+      summary: Host filesystem device error (instance {{ $labels.instance }})
+      description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n  VALUE = {{ $value }}"
+  - alert: HostUnusualDiskReadLatency
+    expr: (rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host unusual disk read latency (instance {{ $labels.instance }})
+      description: "Disk latency is growing (read operations > 100ms)\n  VALUE = {{ $value }}"
+  - alert: HostUnusualDiskWriteLatency
+    expr: (rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host unusual disk write latency (instance {{ $labels.instance }})
+      description: "Disk latency is growing (write operations > 100ms)\n  VALUE = {{ $value }}"
+  - alert: HostHighCpuLoad
+    expr: (sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host high CPU load (instance {{ $labels.instance }})
+      description: "CPU load is > 80%\n  VALUE = {{ $value }}"
+  # We might want to introduce that later, tho maybe excluding hosts with one core, if possible and only for VMs?
+  # # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
+  # - alert: HostCpuIsUnderutilized
+  #   expr: (100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+  #   for: 1w
+  #   labels:
+  #     severity: info
+  #   annotations:
+  #     summary: Host CPU is underutilized (instance {{ $labels.instance }})
+  #     description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n  VALUE = {{ $value }}"
+  - alert: HostCpuStealNoisyNeighbor
+    expr: (avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
+      description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n  VALUE = {{ $value }}"
+  - alert: HostCpuHighIowait
+    expr: (avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host CPU high iowait (instance {{ $labels.instance }})
+      description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n  VALUE = {{ $value }}"
+  # Have different disk IO alerts for VMs and physical machines and for physical machines different ones for hard and other disks.
+  - alert: PhysicalHostUnusualHardDiskIo
+    expr: (rate(node_disk_io_time_seconds_total{device=~"s.+"}[1m]) > 0.75) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Physical host unusual hard disk IO (instance {{ $labels.instance }})
+      description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n  VALUE = {{ $value }}"
+  - alert: PhysicalHostUnusualOtherDiskIo
+    expr: (rate(node_disk_io_time_seconds_total{device!~"s.+"}[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Physical host unusual other (non-hard) disk IO (instance {{ $labels.instance }})
+      description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n  VALUE = {{ $value }}"
+  - alert: VirtualHostUnusualDiskIo
+    expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"}
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Virtual host unusual disk IO (instance {{ $labels.instance }})
+      description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n  VALUE = {{ $value }}"
+  # # x2 context switches is an arbitrary number.
+  # # The alert threshold depends on the nature of the application.
+  # # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
+  # - alert: HostContextSwitchingHigh
+  #   expr: (rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2
+  #   for: 0m
+  #   labels:
+  #     severity: warning
+  #   annotations:
+  #     summary: Host context switching high (instance {{ $labels.instance }})
+  #     description: "Context switching is growing on the node (twice the daily average during the last 15m)\n  VALUE = {{ $value }}"
+  - alert: HostSwapIsFillingUp
+    expr: ((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host swap is filling up (instance {{ $labels.instance }})
+      description: "Swap is filling up (>80%)\n  VALUE = {{ $value }}"
+  - alert: HostSystemdServiceCrashed
+    expr: (node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host systemd service crashed (instance {{ $labels.instance }})
+      description: "systemd service crashed\n  VALUE = {{ $value }}"
+  - alert: HostPhysicalComponentTooHot
+    expr: ((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host physical component too hot (instance {{ $labels.instance }})
+      description: "Physical hardware component too hot\n  VALUE = {{ $value }}"
+  - alert: HostNodeOvertemperatureAlarm
+    expr: ((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Host node overtemperature alarm (instance {{ $labels.instance }})
+      description: "Physical node temperature alarm triggered\n  VALUE = {{ $value }}"
+  - alert: HostRaidArrayGotInactive
+    expr: (node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Host RAID array got inactive (instance {{ $labels.instance }})
+      description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n  VALUE = {{ $value }}"
+  - alert: HostRaidDiskFailure
+    expr: (node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host RAID disk failure (instance {{ $labels.instance }})
+      description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n  VALUE = {{ $value }}"
+  # Fleet-wide check: count() aggregates away every label (including "instance"), so the result
+  # cannot be joined with node_uname_info on(instance) - such a join matches nothing and the alert never fires.
+  - alert: HostKernelVersionDeviations
+    expr: count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1
+    for: 6h
+    labels:
+      severity: warning
+    annotations:
+      summary: Host kernel version deviations
+      description: "Different kernel versions are running\n  VALUE = {{ $value }}"
+  - alert: HostOomKillDetected
+    expr: (increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host OOM kill detected (instance {{ $labels.instance }})
+      description: "OOM kill detected\n  VALUE = {{ $value }}"
+  # The time span named in the description must match the increase() window ([1m]).
+  - alert: HostEdacCorrectableErrorsDetected
+    expr: (increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+    for: 0m
+    labels:
+      severity: info
+    annotations:
+      summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
+      description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last minute.\n  VALUE = {{ $value }}"
+  # Compares the raw counter (no rate/increase), so the reported value is a cumulative total, not a per-interval count.
+  - alert: HostEdacUncorrectableErrorsDetected
+    expr: (node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
+      description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC so far (cumulative counter value).\n  VALUE = {{ $value }}"
+  # $value is the ratio of errored to total received packets (threshold 1%), not an error count.
+  - alert: HostNetworkReceiveErrors
+    expr: (rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host Network Receive Errors (instance {{ $labels.instance }})
+      description: "Host {{ $labels.instance }} interface {{ $labels.device }} has had a receive error ratio of {{ $value | humanizePercentage }} over the last two minutes.\n  VALUE = {{ $value }}"
+  # $value is the ratio of errored to total transmitted packets (threshold 1%), not an error count.
+  - alert: HostNetworkTransmitErrors
+    expr: (rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host Network Transmit Errors (instance {{ $labels.instance }})
+      description: "Host {{ $labels.instance }} interface {{ $labels.device }} has had a transmit error ratio of {{ $value | humanizePercentage }} over the last two minutes.\n  VALUE = {{ $value }}"
+  - alert: HostNetworkBondDegraded
+    expr: ((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host Network Bond Degraded (instance {{ $labels.instance }})
+      description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n  VALUE = {{ $value }}"
+  - alert: HostConntrackLimit
+    expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host conntrack limit (instance {{ $labels.instance }})
+      description: "The number of conntrack is approaching limit\n  VALUE = {{ $value }}"
+  - alert: HostClockSkew
+    expr: ((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host clock skew (instance {{ $labels.instance }})
+      description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n  VALUE = {{ $value }}"
+  - alert: HostClockNotSynchronising
+    expr: (min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host clock not synchronising (instance {{ $labels.instance }})
+      description: "Clock not synchronising. Ensure NTP is configured on this host.\n  VALUE = {{ $value }}"
+  - alert: HostRequiresReboot
+    expr: (node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+    for: 4h
+    labels:
+      severity: info
+    annotations:
+      summary: Host requires reboot (instance {{ $labels.instance }})
+      description: "{{ $labels.instance }} requires a reboot.\n  VALUE = {{ $value }}"
+- name: prometheus
+  rules:
+  # absent() only carries over the labels from equality matchers (here: job), so there is no "instance" label to show.
+  - alert: PrometheusJobMissing
+    expr: absent(up{job="prometheus"})
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus job missing (job {{ $labels.job }})
+      description: "A Prometheus job has disappeared\n  VALUE = {{ $value }}"
+  - alert: PrometheusTargetMissing
+    expr: up == 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus target missing (instance {{ $labels.instance }})
+      description: "A Prometheus target has disappeared. An exporter might be crashed.\n  VALUE = {{ $value }}"
+  # Aggregated by job, so the resulting series only has a "job" label (no "instance").
+  - alert: PrometheusAllTargetsMissing
+    expr: sum by (job) (up) == 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus all targets missing (job {{ $labels.job }})
+      description: "A Prometheus job does not have living target anymore.\n  VALUE = {{ $value }}"
+  - alert: PrometheusConfigurationReloadFailure
+    expr: prometheus_config_last_reload_successful != 1
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
+      description: "Prometheus configuration reload error\n  VALUE = {{ $value }}"
+  - alert: PrometheusTooManyRestarts
+    expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus too many restarts (instance {{ $labels.instance }})
+      description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n  VALUE = {{ $value }}"
+  # absent() only carries over the labels from equality matchers (here: job), so there is no "instance" label to show.
+  - alert: PrometheusAlertmanagerJobMissing
+    expr: absent(up{job="alertmanager"})
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus AlertManager job missing (job {{ $labels.job }})
+      description: "A Prometheus AlertManager job has disappeared\n  VALUE = {{ $value }}"
+  - alert: PrometheusAlertmanagerConfigurationReloadFailure
+    expr: alertmanager_config_last_reload_successful != 1
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
+      description: "AlertManager configuration reload error\n  VALUE = {{ $value }}"
+  - alert: PrometheusAlertmanagerConfigNotSynced
+    expr: count(count_values("config_hash", alertmanager_config_hash)) > 1
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }})
+      description: "Configurations of AlertManager cluster instances are out of sync\n  VALUE = {{ $value }}"
+  # For testing.
+  # - alert: PrometheusAlertmanagerE2eDeadManSwitch
+  #   expr: vector(1)
+  #   for: 0m
+  #   labels:
+  #     severity: critical
+  #   annotations:
+  #     summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }})
+  #     description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.\n  VALUE = {{ $value }}"
+  - alert: PrometheusNotConnectedToAlertmanager
+    expr: prometheus_notifications_alertmanagers_discovered < 1
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
+      description: "Prometheus cannot connect the alertmanager\n  VALUE = {{ $value }}"
+  - alert: PrometheusRuleEvaluationFailures
+    expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n  VALUE = {{ $value }}"
+  - alert: PrometheusTemplateTextExpansionFailures
+    expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus template text expansion failures (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} template text expansion failures\n  VALUE = {{ $value }}"
+  - alert: PrometheusRuleEvaluationSlow
+    expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
+      description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n  VALUE = {{ $value }}"
+  - alert: PrometheusNotificationsBacklog
+    expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus notifications backlog (instance {{ $labels.instance }})
+      description: "The Prometheus notification queue has not been empty for 10 minutes\n  VALUE = {{ $value }}"
+  - alert: PrometheusAlertmanagerNotificationFailing
+    expr: rate(alertmanager_notifications_failed_total[1m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
+      description: "Alertmanager is failing sending notifications\n  VALUE = {{ $value }}"
+  - alert: PrometheusTargetEmpty
+    expr: prometheus_sd_discovered_targets == 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus target empty (instance {{ $labels.instance }})
+      description: "Prometheus has no target in service discovery\n  VALUE = {{ $value }}"
+  - alert: PrometheusTargetScrapingSlow
+    expr: prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus target scraping slow (instance {{ $labels.instance }})
+      description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned.\n  VALUE = {{ $value }}"
+  - alert: PrometheusLargeScrape
+    expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus large scrape (instance {{ $labels.instance }})
+      description: "Prometheus has many scrapes that exceed the sample limit\n  VALUE = {{ $value }}"
+  - alert: PrometheusTargetScrapeDuplicate
+    expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
+      description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n  VALUE = {{ $value }}"
+  - alert: PrometheusTsdbCheckpointCreationFailures
+    expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} checkpoint creation failures\n  VALUE = {{ $value }}"
+  - alert: PrometheusTsdbCheckpointDeletionFailures
+    expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n  VALUE = {{ $value }}"
+  - alert: PrometheusTsdbCompactionsFailed
+    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} TSDB compactions failures\n  VALUE = {{ $value }}"
+  - alert: PrometheusTsdbHeadTruncationsFailed
+    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n  VALUE = {{ $value }}"
+  - alert: PrometheusTsdbReloadFailures
+    expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} TSDB reload failures\n  VALUE = {{ $value }}"
+  - alert: PrometheusTsdbWalCorruptions
+    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n  VALUE = {{ $value }}"
+  - alert: PrometheusTsdbWalTruncationsFailed
+    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
+      description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n  VALUE = {{ $value }}"
+  - alert: PrometheusTimeseriesCardinality
+    expr: label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000  # grouped by metric name only; copied into the "name" label
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Prometheus timeseries cardinality (metric {{ $labels.name }})
+      description: "The \"{{ $labels.name }}\" timeseries cardinality is getting very high: {{ $value }}\n  VALUE = {{ $value }}"
--- a/playbooks/files/chaosknoten/configs/public-reverse-proxy/nginx/acme_challenge.conf
+++ b/playbooks/files/chaosknoten/configs/public-reverse-proxy/nginx/acme_challenge.conf
@ -8,7 +8,8 @@ map $host $upstream_acme_challenge_host {
    element.hamburg.ccc.de 172.31.17.151:31820;
    git.hamburg.ccc.de 172.31.17.154:31820;
    grafana.hamburg.ccc.de 172.31.17.145:31820;
-    hackertours.hamburg.ccc.de 172.31.17.148:31820;
+    hackertours.hamburg.ccc.de 172.31.17.151:31820;
+    staging.hackertours.hamburg.ccc.de 172.31.17.151:31820;
    hamburg.ccc.de 172.31.17.151:31820;
    id.hamburg.ccc.de 172.31.17.144:31820;
    invite.hamburg.ccc.de 172.31.17.144:31820;
@ -23,12 +24,15 @@ map $host $upstream_acme_challenge_host {
    wiki.ccchh.net 172.31.17.146:31820;
    wiki.hamburg.ccc.de 172.31.17.146:31820;
    www.hamburg.ccc.de 172.31.17.151:31820;
+    tickets.hamburg.ccc.de 172.31.17.148:31820;
    zammad.hamburg.ccc.de 172.31.17.152:31820;
    eh03.easterhegg.eu 172.31.17.151:31820;
    eh05.easterhegg.eu 172.31.17.151:31820;
    eh07.easterhegg.eu 172.31.17.151:31820;
    eh09.easterhegg.eu 172.31.17.151:31820;
    eh11.easterhegg.eu 172.31.17.151:31820;
+    eh20.easterhegg.eu 172.31.17.151:31820;
+    www.eh20.easterhegg.eu 172.31.17.151:31820;
    eh22.easterhegg.eu 172.31.17.159:31820;
    easterheggxxxx.hamburg.ccc.de 172.31.17.151:31820;
    eh2003.hamburg.ccc.de 172.31.17.151:31820;
@ -57,6 +61,7 @@ map $host $upstream_acme_challenge_host {
    www.eh11.hamburg.ccc.de 172.31.17.151:31820;
    easterhegg2011.hamburg.ccc.de 172.31.17.151:31820;
    www.easterhegg2011.hamburg.ccc.de 172.31.17.151:31820;
+    eh20.hamburg.ccc.de 172.31.17.151:31820;
    hacker.tours 172.31.17.151:31820;
    staging.hacker.tours 172.31.17.151:31820;
    woodpecker.hamburg.ccc.de 172.31.17.160:31820;
--- a/playbooks/files/chaosknoten/configs/public-reverse-proxy/nginx/nginx.conf
+++ b/playbooks/files/chaosknoten/configs/public-reverse-proxy/nginx/nginx.conf
@ -30,7 +30,8 @@ stream {
        wiki.ccchh.net 172.31.17.146:8443;
        wiki.hamburg.ccc.de 172.31.17.146:8443;
        onlyoffice.hamburg.ccc.de 172.31.17.147:8443;
-        hackertours.hamburg.ccc.de 172.31.17.148:8443;
+        hackertours.hamburg.ccc.de 172.31.17.151:8443;
+        staging.hackertours.hamburg.ccc.de 172.31.17.151:8443;
        netbox.hamburg.ccc.de 172.31.17.149:8443;
        matrix.hamburg.ccc.de 172.31.17.150:8443;
        element.hamburg.ccc.de 172.31.17.151:8443;
@ -39,6 +40,7 @@ stream {
        hamburg.ccc.de 172.31.17.151:8443;
        staging.hamburg.ccc.de 172.31.17.151:8443;
        spaceapi.hamburg.ccc.de 172.31.17.151:8443;
+        tickets.hamburg.ccc.de 172.31.17.148:8443;
        zammad.hamburg.ccc.de 172.31.17.152:8443;
        c3cat.de 172.31.17.151:8443;
        git.hamburg.ccc.de 172.31.17.154:8443;
@ -47,6 +49,8 @@ stream {
        eh07.easterhegg.eu 172.31.17.151:8443;
        eh09.easterhegg.eu 172.31.17.151:8443;
        eh11.easterhegg.eu 172.31.17.151:8443;
+        eh20.easterhegg.eu 172.31.17.151:8443;
+        www.eh20.easterhegg.eu 172.31.17.151:8443;
        eh22.easterhegg.eu 172.31.17.159:8443;
        easterheggxxxx.hamburg.ccc.de 172.31.17.151:8443;
        eh2003.hamburg.ccc.de 172.31.17.151:8443;
@ -75,6 +79,7 @@ stream {
        www.eh11.hamburg.ccc.de 172.31.17.151:8443;
        easterhegg2011.hamburg.ccc.de 172.31.17.151:8443;
        www.easterhegg2011.hamburg.ccc.de 172.31.17.151:8443;
+        eh20.hamburg.ccc.de 172.31.17.151:8443;
        hacker.tours 172.31.17.151:8443;
        staging.hacker.tours 172.31.17.151:8443;
        woodpecker.hamburg.ccc.de 172.31.17.160:8443;
--- a/playbooks/files/chaosknoten/configs/hackertours/nginx/hackertours.hamburg.ccc.de.conf
+++ b/playbooks/files/chaosknoten/configs/hackertours/nginx/hackertours.hamburg.ccc.de.conf
@ -12,12 +12,12 @@ server {
    # header.
    real_ip_header proxy_protocol;

-    server_name hackertours.hamburg.ccc.de;
+    server_name tickets.hamburg.ccc.de;

-    ssl_certificate /etc/letsencrypt/live/hackertours.hamburg.ccc.de/fullchain.pem;
-    ssl_certificate_key /etc/letsencrypt/live/hackertours.hamburg.ccc.de/privkey.pem;
+    ssl_certificate /etc/letsencrypt/live/tickets.hamburg.ccc.de/fullchain.pem;
+    ssl_certificate_key /etc/letsencrypt/live/tickets.hamburg.ccc.de/privkey.pem;
    # verify chain of trust of OCSP response using Root CA and Intermediate certs
-    ssl_trusted_certificate /etc/letsencrypt/live/hackertours.hamburg.ccc.de/chain.pem;
+    ssl_trusted_certificate /etc/letsencrypt/live/tickets.hamburg.ccc.de/chain.pem;

    # HSTS (ngx_http_headers_module is required) (63072000 seconds)
    add_header Strict-Transport-Security "max-age=63072000" always;
@ -37,10 +37,7 @@ server {
    proxy_set_header Forwarded "for=$remote_addr;proto=https;host=$host;by=_hidden";

    location = / {
-        proxy_pass http://127.0.0.1:8888/;
-    }
-    location ~ ^/(apple-touch-icon.png|assets|css|de|en|js|posts|tours)(.*)$ {
-        proxy_pass http://127.0.0.1:8888/$1$2;
+        return 302 https://wiki.hamburg.ccc.de/infrastructure:service-overview#tickets_pretix;
    }

    location / {
--- a/playbooks/files/chaosknoten/configs/zammad/nginx/zammad.hamburg.ccc.de.conf
+++ b/playbooks/files/chaosknoten/configs/zammad/nginx/zammad.hamburg.ccc.de.conf
@ -35,6 +35,14 @@ server {
    # is transparent).
    # Also provide "_hidden" for by, since it's not relevant.
    proxy_set_header Forwarded "for=$remote_addr;proto=https;host=$host;by=_hidden";
+    proxy_read_timeout 86400;
+    proxy_set_header Upgrade $http_upgrade;
+    proxy_set_header Connection "Upgrade";
+    proxy_set_header CLIENT_IP $remote_addr;
+
+    # Redirect plain ticket deep links (/ticket/zoom/...) to the equivalent
+    # fragment URL (/#ticket/zoom/...). The pattern is anchored with ^ so only
+    # URIs that start with /ticket/zoom/ match, not URIs merely containing it.
+    location ~ ^/(ticket/zoom/.*) {
+        return 302 https://zammad.hamburg.ccc.de/#$1;
+    }

    location / {
        proxy_pass http://127.0.0.1:8080/;
--- a/playbooks/roles/dokuwiki/files/mime.local.conf
+++ b/playbooks/roles/dokuwiki/files/mime.local.conf
@ -2,3 +2,4 @@

 # Allow stl files.
 stl     !model/stl
+# Allow asc files (mapped to the application/pgp-keys mime type).
+asc     application/pgp-keys
--- a/playbooks/templates/chaosknoten/configs/grafana/compose.yaml.j2
+++ b/playbooks/templates/chaosknoten/configs/grafana/compose.yaml.j2
@ -11,7 +11,21 @@ services:
    restart: unless-stopped
    volumes:
      - ./configs/prometheus.yml:/etc/prometheus/prometheus.yml
+      - ./configs/prometheus_alerts.rules.yaml:/etc/prometheus/rules/alerts.rules.yaml
      - prom_data:/prometheus
+  
+  alertmanager:
+    image: prom/alertmanager
+    container_name: alertmanager
+    command:
+      - '--config.file=/etc/alertmanager/alertmanager.yaml'
+    ports:
+      - "9093:9093"  # quoted so the HOST:CONTAINER mapping is always read as a string
+    restart: unless-stopped
+    volumes:
+      - ./configs/alertmanager.yaml:/etc/alertmanager/alertmanager.yaml
+      - ./configs/alertmanager_alert_templates.tmpl:/etc/alertmanager/templates/alert_templates.tmpl
+      - alertmanager_data:/alertmanager

  grafana:
    image: grafana/grafana
@ -44,3 +58,4 @@ services:
 volumes:
  graf_data: {}
  prom_data: {}
+  alertmanager_data: {}
--- a/playbooks/templates/chaosknoten/configs/grafana/docker_compose/alertmanager.yaml.j2
+++ b/playbooks/templates/chaosknoten/configs/grafana/docker_compose/alertmanager.yaml.j2
@ -0,0 +1,40 @@
+# Links & References:
+# - https://prometheus.io/docs/alerting/latest/configuration/
+# - https://github.com/prometheus/alertmanager/blob/48a99764a1fc9279fc828de83e7a03ae2219abc7/doc/examples/simple.yml
+
+route:
+  group_by: ["alertname", "site", "type", "hypervisor"]
+
+  group_wait: 30s
+  group_interval: 5m
+  repeat_interval: 3h
+
+  receiver: ccchh-infrastructure-alerts
+
+
+{# Disable these for now, but might be interesting in the future.
+# Inhibition rules allow to mute a set of alerts given that another alert is
+# firing.
+# We use this to mute any warning-level notifications if the same alert is
+# already critical.
+inhibit_rules:
+  - source_matchers: [severity="critical"]
+    target_matchers: [severity="warning"]
+    # Apply inhibition if the alertname is the same.
+    # CAUTION:
+    #   If all label names listed in `equal` are missing
+    #   from both the source and target alerts,
+    #   the inhibition rule will apply!
+    equal: [alertname, cluster, service] #}
+
+templates:
+  - "/etc/alertmanager/templates/*.tmpl"
+
+receivers:
+  - name: "ccchh-infrastructure-alerts"
+    telegram_configs:
+      - send_resolved: true
+        # Quoted so the rendered secret is always parsed as a YAML string.
+        bot_token: "{{ lookup("community.general.passwordstore", "noc/vm-secrets/chaosknoten/grafana/alertmanager_telegram_bot_token", create=false, missing="error") }}"
+        chat_id: -1002434372415
+        parse_mode: HTML
+        # The outer Jinja string literal emits the inner Go template verbatim for Alertmanager.
+        message: {{ "'{{ template \"alert-message.telegram.ccchh\" . }}'" }}
--- a/playbooks/templates/chaosknoten/configs/keycloak/compose.yaml.j2
+++ b/playbooks/templates/chaosknoten/configs/keycloak/compose.yaml.j2
@ -22,7 +22,7 @@

 services:
  keycloak:
-    image: git.hamburg.ccc.de/ccchh/oci-images/keycloak:25.0
+    image: git.hamburg.ccc.de/ccchh/oci-images/keycloak:26.0
    pull_policy: always
    restart: unless-stopped
    command: start --optimized
--- a/playbooks/templates/chaosknoten/configs/hackertours/compose.yaml.j2
+++ b/playbooks/templates/chaosknoten/configs/hackertours/compose.yaml.j2
@ -4,7 +4,7 @@ services:
    image: docker.io/library/postgres:15-alpine
    environment:
      - "POSTGRES_USER=pretix"
-      - "POSTGRES_PASSWORD={{ lookup("community.general.passwordstore", "noc/vm-secrets/chaosknoten/hackertours/DB_PASSWORD", create=false, missing="error") }}"
+      - "POSTGRES_PASSWORD={{ lookup("community.general.passwordstore", "noc/vm-secrets/chaosknoten/tickets/DB_PASSWORD", create=false, missing="error") }}"
      - "POSTGRES_DB=pretix"
    volumes:
      - database:/var/lib/postgresql/data
@ -37,14 +37,6 @@ services:
      backend:
      frontend:

-  web:
-    image: gitlab-cr.hamburg.ccc.de/ccchh/hackertours/hackertours:latest
-    ports: 
-      - "8888:80"
-    restart: unless-stopped
-    networks:
-      frontend:
-
 volumes:
  database: {}
  pretix: {}
--- a/playbooks/templates/chaosknoten/configs/hackertours/pretix.cfg.j2
+++ b/playbooks/templates/chaosknoten/configs/hackertours/pretix.cfg.j2
@ -1,6 +1,6 @@
 [pretix]
-instance_name=CCCHH Hackertours
-url=https://hackertours.hamburg.ccc.de
+instance_name=CCCHH Tickets
+url=https://tickets.hamburg.ccc.de
 currency=EUR
 datadir=/data
 trust_x_forwarded_for=on
@ -10,11 +10,11 @@ trust_x_forwarded_proto=on
 backend=postgresql
 name=pretix
 user=pretix
-password={{ lookup("community.general.passwordstore", "noc/vm-secrets/chaosknoten/hackertours/DB_PASSWORD", create=false, missing="error") }}
+password={{ lookup("community.general.passwordstore", "noc/vm-secrets/chaosknoten/tickets/DB_PASSWORD", create=false, missing="error") }}
 host=database

 [mail]
-from=ticket@hackertours.hamburg.ccc.de
+from=tickets@hamburg.ccc.de
 host=cow-intern.hamburg.ccc.de

 [redis]
Author	SHA1	Message	Date
Herr-Dante	735fe0ca9b	Add local port forwarding for debug sessions	2024-10-27 22:27:07 +01:00
c6ristian	34dc6d9a84	Reduce Host Memory is underutilized to 10%	2024-10-18 21:15:20 +02:00
Stefan Bethke	b660d937dc	Allow GPG keys as uploads	2024-10-18 12:40:24 +02:00
Stefan Bethke	2f00d21821	Redirect home page to wiki	2024-10-13 13:50:50 +02:00
Stefan Bethke	235e6e514f	Move Pretix from hackertours to tickets	2024-10-13 09:10:10 +02:00
June	7cd4a9a723	public-reverse-proxy: add config for staging.hackertours.hamburg.ccc.de	2024-10-12 22:08:28 +02:00
June	d7a9534eeb	public-reverse-proxy: use public-web-static as host for hackert. ccchh	2024-10-12 22:00:14 +02:00
Stefan Bethke	a35fcc13cf	Merge branch 'main' of git.hamburg.ccc.de:CCCHH/ansible-infra	2024-10-08 20:28:57 +02:00
Stefan Bethke	2fc54f5a83	Add missing headers to avoid CSRF errors	2024-10-08 20:28:56 +02:00
June	4cac84e7ec	prometheus: have different disk alerts for physical and virtual hosts Have more relaxed read/write alerts for physical hosts as they are probably hypervisors and regular high read/writes are more common. Also differentiate between physical and virtual hosts for IO alerts and allow for hard disks to spend more time in IO.	2024-10-05 17:22:45 +02:00
June	f721dd9fea	prometheus: make opnsense-ccchh job not fail half the time The scrape seems to take around a second to complete and with the configured timeout of 1s that failed half the time. Therefore use the default, more relaxed scrape interval and timeout and have it be reliable.	2024-10-05 17:22:45 +02:00
c6ristian	d8188d192b	Use keycloak version 26	2024-10-04 17:07:49 +02:00
Stefan Bethke	43ca24b5e2	Take website image from Forgejo	2024-10-03 19:44:43 +02:00
Stefan Bethke	229daa72fc	Redirect plain URL to hash for ticket deep links	2024-10-03 19:44:15 +02:00
June	0a05cad0a1	prometheus & alertmanager: add self-alerting Add self-alerting for Prometheus and Alertmanager using rules from https://samber.github.io/awesome-prometheus-alerts/rules	2024-10-02 04:13:37 +02:00
June	2e29b78f6a	prometheus: move Jitsis node exporter target to hosts job	2024-10-02 03:45:56 +02:00
June	61edc3587f	alertmanager: give Alertmanager a persistent storage directory	2024-10-02 03:43:22 +02:00
June	30876f821c	prometheus, alertmanager: use Prometheus alerts with Alertmanager For now introduce node-exporter/hosts alert rules, which got taken from https://samber.github.io/awesome-prometheus-alerts/rules However with the labels removed from the description, since they don't render correctly (at least in Telegram) and don't seem to provide much value, as we render the labels in the notification anyway. Also only have Telegram as the notification channel for now, as it was the easiest to set up.	2024-10-02 03:36:30 +02:00
June	803b19de0a	prometheus: add job for node exporter (for the NixOS VMs for now)	2024-10-01 20:09:42 +02:00
June	29d2d2926f	prometheus: don't duplicate scrape interval and timeout	2024-10-01 01:59:33 +02:00
June	e81ae5165f	public-reverse-proxy: config for eh20 static website deploy	2024-09-28 05:04:01 +02:00