From d8188d192b90fae79954c685cf0c388af18b88a3 Mon Sep 17 00:00:00 2001
From: c6ristian <c6ristian@christian.moe>
Date: Fri, 4 Oct 2024 17:07:49 +0200
Subject: [PATCH 1/3] Use keycloak version 26

---
 .../templates/chaosknoten/configs/keycloak/compose.yaml.j2      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/playbooks/templates/chaosknoten/configs/keycloak/compose.yaml.j2 b/playbooks/templates/chaosknoten/configs/keycloak/compose.yaml.j2
index f6b293d..9509654 100644
--- a/playbooks/templates/chaosknoten/configs/keycloak/compose.yaml.j2
+++ b/playbooks/templates/chaosknoten/configs/keycloak/compose.yaml.j2
@@ -22,7 +22,7 @@
 
 services:
   keycloak:
-    image: git.hamburg.ccc.de/ccchh/oci-images/keycloak:25.0
+    image: git.hamburg.ccc.de/ccchh/oci-images/keycloak:26.0
     pull_policy: always
     restart: unless-stopped
     command: start --optimized

From f721dd9feaba061ae8c32641b0d7078ca760c7b9 Mon Sep 17 00:00:00 2001
From: June <june@jsts.xyz>
Date: Wed, 2 Oct 2024 04:19:37 +0200
Subject: [PATCH 2/3] prometheus: make opnsense-ccchh job not fail half the
 time

The scrape seems to take around a second to complete and with the
configured timeout of 1s that failed half the time. Therefore use the
default, more relaxed scrape interval and timeout and have it be
reliable.
---
 .../chaosknoten/configs/grafana/docker_compose/prometheus.yml   | 2 --
 1 file changed, 2 deletions(-)

diff --git a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml
index dce71c2..7936fd5 100644
--- a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml
+++ b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml
@@ -49,8 +49,6 @@ scrape_configs:
     - mumble.hamburg.ccc.de:443
 - job_name: opnsense-ccchh
   honor_timestamps: true
-  scrape_interval: 5s
-  scrape_timeout: 1s
   metrics_path: /metrics
   scheme: http
   static_configs:

From 4cac84e7ec6f7d9f8cde3a5a0677d744545c97b2 Mon Sep 17 00:00:00 2001
From: June <june@jsts.xyz>
Date: Sat, 5 Oct 2024 17:17:35 +0200
Subject: [PATCH 3/3] prometheus: have different disk alerts for physical and
 virtual hosts

Have more relaxed read/write alerts for physical hosts as they are
probably hypervisors and regular high read/writes are more common.
Also differentiate between physical and virtual hosts for IO alerts and
allow for hard disks to spend more time in IO.
---
 .../prometheus_alerts.rules.yaml              | 52 +++++++++++++++----
 1 file changed, 43 insertions(+), 9 deletions(-)

diff --git a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml
index 1c06485..284c7ec 100644
--- a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml
+++ b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml
@@ -44,22 +44,39 @@ groups:
     annotations:
       summary: Host unusual network throughput out (instance {{ $labels.instance }})
       description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n  VALUE = {{ $value }}"
-  - alert: HostUnusualDiskReadRate
-    expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+  # Have different disk read and write rate alerts for VMs and physical machines.
+  - alert: VirtualHostUnusualDiskReadRate
+    expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"}
     for: 5m
     labels:
       severity: warning
     annotations:
-      summary: Host unusual disk read rate (instance {{ $labels.instance }})
+      summary: Virtual host unusual disk read rate (instance {{ $labels.instance }})
       description: "Disk is probably reading too much data (> 50 MB/s)\n  VALUE = {{ $value }}"
-  - alert: HostUnusualDiskWriteRate
-    expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+  - alert: VirtualHostUnusualDiskWriteRate
+    expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"}
     for: 2m
     labels:
       severity: warning
     annotations:
-      summary: Host unusual disk write rate (instance {{ $labels.instance }})
+      summary: Virtual host unusual disk write rate (instance {{ $labels.instance }})
       description: "Disk is probably writing too much data (> 50 MB/s)\n  VALUE = {{ $value }}"
+  - alert: PhysicalHostUnusualDiskReadRate
+    expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
+    for: 20m
+    labels:
+      severity: warning
+    annotations:
+      summary: Physical host unusual disk read rate (instance {{ $labels.instance }})
+      description: "Disk is probably reading too much data (> 100 MB/s)\n  VALUE = {{ $value }}"
+  - alert: PhysicalHostUnusualDiskWriteRate
+    expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
+    for: 15m
+    labels:
+      severity: warning
+    annotations:
+      summary: Physical host unusual disk write rate (instance {{ $labels.instance }})
+      description: "Disk is probably writing too much data (> 100 MB/s)\n  VALUE = {{ $value }}"
   # Please add ignored mountpoints in node_exporter parameters like
   # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
   # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
@@ -156,13 +173,30 @@ groups:
     annotations:
       summary: Host CPU high iowait (instance {{ $labels.instance }})
       description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n  VALUE = {{ $value }}"
-  - alert: HostUnusualDiskIo
-    expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+  # Have different disk IO alerts for VMs and physical machines; for physical machines, additionally distinguish hard disks from other disks.
+  - alert: PhysicalHostUnusualHardDiskIo
+    expr: (rate(node_disk_io_time_seconds_total{device=~"s.+"}[1m]) > 0.75) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
     for: 5m
     labels:
       severity: warning
     annotations:
-      summary: Host unusual disk IO (instance {{ $labels.instance }})
+      summary: Physical host unusual hard disk IO (instance {{ $labels.instance }})
+      description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n  VALUE = {{ $value }}"
+  - alert: PhysicalHostUnusualOtherDiskIo
+    expr: (rate(node_disk_io_time_seconds_total{device!~"s.+"}[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Physical host unusual other (non-hard) disk IO (instance {{ $labels.instance }})
+      description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n  VALUE = {{ $value }}"
+  - alert: VirtualHostUnusualDiskIo
+    expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"}
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Virtual host unusual disk IO (instance {{ $labels.instance }})
       description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n  VALUE = {{ $value }}"
   # # x2 context switches is an arbitrary number.
   # # The alert threshold depends on the nature of the application.