From 4060dbbe21386abf6a02d8fafd00cd501297ba03 Mon Sep 17 00:00:00 2001
From: June <june@jsts.xyz>
Date: Sat, 23 Nov 2024 02:49:23 +0100
Subject: [PATCH] fix all ansible-lint yaml errors (except for line-length)

---
 .../chaosknoten/host_vars/ccchoir.yaml        |    2 +-
 inventories/chaosknoten/host_vars/pad.yaml    |    2 +-
 .../chaosknoten/host_vars/pretalx.yaml        |    2 +-
 inventories/chaosknoten/host_vars/zammad.yaml |    2 +-
 .../docker_compose/grafana-datasource.yml     |   13 +-
 .../grafana/docker_compose/prometheus.yml     |  204 +--
 .../prometheus_alerts.rules.yaml              | 1160 ++++++++---------
 .../configs/lists/compose/compose.yaml        |   60 +-
 .../apt_update_and_upgrade/tasks/main.yaml    |   20 +-
 .../deploy_ssh_server_config/tasks/main.yaml  |   32 +-
 .../tasks/main.yaml                           |    1 -
 playbooks/roles/nextcloud/meta/main.yaml      |    4 +-
 playbooks/roles/nginx/defaults/main.yaml      |    2 +-
 .../roles/nginx/tasks/main/config_deploy.yaml |   14 +-
 requirements.yml                              |    2 +-
 15 files changed, 759 insertions(+), 761 deletions(-)

diff --git a/inventories/chaosknoten/host_vars/ccchoir.yaml b/inventories/chaosknoten/host_vars/ccchoir.yaml
index 87e6696..cd59ea1 100644
--- a/inventories/chaosknoten/host_vars/ccchoir.yaml
+++ b/inventories/chaosknoten/host_vars/ccchoir.yaml
@@ -1,5 +1,5 @@
 docker_compose__compose_file_content: "{{ lookup('ansible.builtin.template', 'chaosknoten/configs/ccchoir/compose.yaml.j2') }}"
-docker_compose__configuration_files: []
+docker_compose__configuration_files: [ ]
 
 certbot__version_spec: ""
 certbot__acme_account_email_address: le-admin@hamburg.ccc.de
diff --git a/inventories/chaosknoten/host_vars/pad.yaml b/inventories/chaosknoten/host_vars/pad.yaml
index ea420a9..01a0d75 100644
--- a/inventories/chaosknoten/host_vars/pad.yaml
+++ b/inventories/chaosknoten/host_vars/pad.yaml
@@ -1,5 +1,5 @@
 docker_compose__compose_file_content: "{{ lookup('ansible.builtin.template', 'chaosknoten/configs/pad/compose.yaml.j2') }}"
-docker_compose__configuration_files: []
+docker_compose__configuration_files: [ ]
 
 certbot__version_spec: ""
 certbot__acme_account_email_address: le-admin@hamburg.ccc.de
diff --git a/inventories/chaosknoten/host_vars/pretalx.yaml b/inventories/chaosknoten/host_vars/pretalx.yaml
index fbc7c57..cd98387 100644
--- a/inventories/chaosknoten/host_vars/pretalx.yaml
+++ b/inventories/chaosknoten/host_vars/pretalx.yaml
@@ -1,5 +1,5 @@
 docker_compose__compose_file_content: "{{ lookup('ansible.builtin.template', 'chaosknoten/configs/pretalx/compose.yaml.j2') }}"
-docker_compose__configuration_files: []
+docker_compose__configuration_files: [ ]
 
 certbot__version_spec: ""
 certbot__acme_account_email_address: le-admin@hamburg.ccc.de
diff --git a/inventories/chaosknoten/host_vars/zammad.yaml b/inventories/chaosknoten/host_vars/zammad.yaml
index d0e1ea8..962df32 100644
--- a/inventories/chaosknoten/host_vars/zammad.yaml
+++ b/inventories/chaosknoten/host_vars/zammad.yaml
@@ -1,5 +1,5 @@
 docker_compose__compose_file_content: "{{ lookup('ansible.builtin.template', 'chaosknoten/configs/zammad/compose.yaml.j2') }}"
-docker_compose__configuration_files: []
+docker_compose__configuration_files: [ ]
 
 certbot__version_spec: ""
 certbot__acme_account_email_address: le-admin@hamburg.ccc.de
diff --git a/playbooks/files/chaosknoten/configs/grafana/docker_compose/grafana-datasource.yml b/playbooks/files/chaosknoten/configs/grafana/docker_compose/grafana-datasource.yml
index ddb52fc..44999d4 100644
--- a/playbooks/files/chaosknoten/configs/grafana/docker_compose/grafana-datasource.yml
+++ b/playbooks/files/chaosknoten/configs/grafana/docker_compose/grafana-datasource.yml
@@ -1,10 +1,9 @@
 apiVersion: 1
 
 datasources:
-- name: Prometheus
-  type: prometheus
-  url: http://prometheus:9090 
-  isDefault: true
-  access: proxy
-  editable: true
-
+  - name: Prometheus
+    type: prometheus
+    url: http://prometheus:9090
+    isDefault: true
+    access: proxy
+    editable: true
diff --git a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml
index 9ce796f..0dad0a0 100644
--- a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml
+++ b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml
@@ -5,110 +5,110 @@ global:
 
 alerting:
   alertmanagers:
-  - scheme: http
-    timeout: 10s
-    static_configs:
-    - targets:
-      - "alertmanager:9093"
+    - scheme: http
+      timeout: 10s
+      static_configs:
+        - targets:
+            - "alertmanager:9093"
 
 rule_files:
   - "/etc/prometheus/rules/*.rules.yaml"
 
 scrape_configs:
-- job_name: prometheus
-  honor_timestamps: true
-  metrics_path: /metrics
-  scheme: http
-  static_configs:
-  - targets:
-    - localhost:9090
-- job_name: alertmanager
-  honor_timestamps: true
-  metrics_path: /metrics
-  scheme: http
-  static_configs:
-  - targets:
-    - alertmanager:9093
-- job_name: c3lingo
-  honor_timestamps: true
-  scrape_interval: 5s
-  scrape_timeout: 1s
-  metrics_path: /mumblestats/metrics
-  scheme: https
-  static_configs:
-  - targets:
-    - mumble.c3lingo.org:443
-- job_name: mumble
-  honor_timestamps: true
-  scrape_interval: 5s
-  scrape_timeout: 1s
-  metrics_path: /metrics
-  scheme: https
-  static_configs:
-  - targets:
-    - mumble.hamburg.ccc.de:443
-- job_name: opnsense-ccchh
-  honor_timestamps: true
-  metrics_path: /metrics
-  scheme: http
-  static_configs:
-  - targets:
-    - 185.161.129.132:9100
-- job_name: jitsi
-  honor_timestamps: true
-  scrape_interval: 5s
-  scrape_timeout: 1s
-  metrics_path: /metrics
-  scheme: http
-  static_configs:
-  - targets:
-    - jitsi.hamburg.ccc.de:9888 # Jitsi Video Bridge
-- job_name: 'pve'
-  static_configs:
-    - targets:
-      - 212.12.48.126  # chaosknoten
-  metrics_path: /pve
-  params:
-    module: [default]
-    cluster: ['1']
-    node: ['1']
-  relabel_configs:
-    - source_labels: [__address__]
-      target_label: __param_target
-    - source_labels: [__param_target]
-      target_label: instance
-    - target_label: __address__
-      replacement: pve-exporter:9221
-- job_name: hosts
-  static_configs:
-    # Wieske Chaosknoten VMs
-    - labels:
-        site: wieske
-        type: virtual_machine
-        hypervisor: chaosknoten
-      targets:
-        - netbox-intern.hamburg.ccc.de:9100
-        - matrix-intern.hamburg.ccc.de:9100
-        - public-web-static-intern.hamburg.ccc.de:9100
-        - git-intern.hamburg.ccc.de:9100
-        - forgejo-actions-runner-intern.hamburg.ccc.de:9100
-        - eh22-wiki-intern.hamburg.ccc.de:9100
-        - nix-box-june-intern.hamburg.ccc.de:9100
-        - mjolnir-intern.hamburg.ccc.de:9100
-        - woodpecker-intern.hamburg.ccc.de:9100
-        - penpot-intern.hamburg.ccc.de:9100
-        - jitsi.hamburg.ccc.de:9100
-        - onlyoffice-intern.hamburg.ccc.de:9100
-        - ccchoir-intern.hamburg.ccc.de:9100
-        - tickets-intern.hamburg.ccc.de:9100
-        - keycloak-intern.hamburg.ccc.de:9100
-        - onlyoffice-intern.hamburg.ccc.de:9100
-        - pad-intern.hamburg.ccc.de:9100
-        - wiki-intern.hamburg.ccc.de:9100
-        - zammad-intern.hamburg.ccc.de:9100
-        - pretalx-intern.hamburg.ccc.de:9100
-    - labels:
-        site: wieske
-        type: physical_machine
-      targets:
-        - chaosknoten.hamburg.ccc.de:9100
+  - job_name: prometheus
+    honor_timestamps: true
+    metrics_path: /metrics
+    scheme: http
+    static_configs:
+      - targets:
+          - localhost:9090
+  - job_name: alertmanager
+    honor_timestamps: true
+    metrics_path: /metrics
+    scheme: http
+    static_configs:
+      - targets:
+          - alertmanager:9093
+  - job_name: c3lingo
+    honor_timestamps: true
+    scrape_interval: 5s
+    scrape_timeout: 1s
+    metrics_path: /mumblestats/metrics
+    scheme: https
+    static_configs:
+      - targets:
+          - mumble.c3lingo.org:443
+  - job_name: mumble
+    honor_timestamps: true
+    scrape_interval: 5s
+    scrape_timeout: 1s
+    metrics_path: /metrics
+    scheme: https
+    static_configs:
+      - targets:
+          - mumble.hamburg.ccc.de:443
+  - job_name: opnsense-ccchh
+    honor_timestamps: true
+    metrics_path: /metrics
+    scheme: http
+    static_configs:
+      - targets:
+          - 185.161.129.132:9100
+  - job_name: jitsi
+    honor_timestamps: true
+    scrape_interval: 5s
+    scrape_timeout: 1s
+    metrics_path: /metrics
+    scheme: http
+    static_configs:
+      - targets:
+          - jitsi.hamburg.ccc.de:9888  # Jitsi Video Bridge
+  - job_name: 'pve'
+    static_configs:
+      - targets:
+          - 212.12.48.126  # chaosknoten
+    metrics_path: /pve
+    params:
+      module: [ default ]
+      cluster: [ '1' ]
+      node: [ '1' ]
+    relabel_configs:
+      - source_labels: [ __address__ ]
+        target_label: __param_target
+      - source_labels: [ __param_target ]
+        target_label: instance
+      - target_label: __address__
+        replacement: pve-exporter:9221
+  - job_name: hosts
+    static_configs:
+      # Wieske Chaosknoten VMs
+      - labels:
+          site: wieske
+          type: virtual_machine
+          hypervisor: chaosknoten
+        targets:
+          - netbox-intern.hamburg.ccc.de:9100
+          - matrix-intern.hamburg.ccc.de:9100
+          - public-web-static-intern.hamburg.ccc.de:9100
+          - git-intern.hamburg.ccc.de:9100
+          - forgejo-actions-runner-intern.hamburg.ccc.de:9100
+          - eh22-wiki-intern.hamburg.ccc.de:9100
+          - nix-box-june-intern.hamburg.ccc.de:9100
+          - mjolnir-intern.hamburg.ccc.de:9100
+          - woodpecker-intern.hamburg.ccc.de:9100
+          - penpot-intern.hamburg.ccc.de:9100
+          - jitsi.hamburg.ccc.de:9100
+          - onlyoffice-intern.hamburg.ccc.de:9100
+          - ccchoir-intern.hamburg.ccc.de:9100
+          - tickets-intern.hamburg.ccc.de:9100
+          - keycloak-intern.hamburg.ccc.de:9100
+          - onlyoffice-intern.hamburg.ccc.de:9100
+          - pad-intern.hamburg.ccc.de:9100
+          - wiki-intern.hamburg.ccc.de:9100
+          - zammad-intern.hamburg.ccc.de:9100
+          - pretalx-intern.hamburg.ccc.de:9100
+      - labels:
+          site: wieske
+          type: physical_machine
+        targets:
+          - chaosknoten.hamburg.ccc.de:9100
diff --git a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml
index fe12bfd..65b3590 100644
--- a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml
+++ b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml
@@ -1,583 +1,583 @@
 # Links & Resources:
 # - https://samber.github.io/awesome-prometheus-alerts/rules
 groups:
-- name: node-exporter
-  rules:
-  - alert: HostOutOfMemory
-    expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host out of memory (instance {{ $labels.instance }})
-      description: "Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}"
-  - alert: HostMemoryUnderMemoryPressure
-    expr: (rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host memory under memory pressure (instance {{ $labels.instance }})
-      description: "The node is under heavy memory pressure. High rate of major page faults\n  VALUE = {{ $value }}"
-  # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
-  - alert: HostMemoryIsUnderutilized
-    expr: (100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
-    for: 1w
-    labels:
-      severity: info
-    annotations:
-      summary: Host Memory is underutilized (instance {{ $labels.instance }})
-      description: "Node memory is < 10% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}"
-  - alert: HostUnusualNetworkThroughputIn
-    expr: (sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host unusual network throughput in (instance {{ $labels.instance }})
-      description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n  VALUE = {{ $value }}"
-  - alert: HostUnusualNetworkThroughputOut
-    expr: (sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host unusual network throughput out (instance {{ $labels.instance }})
-      description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n  VALUE = {{ $value }}"
-  # Have different disk read and write rate alerts for VMs and physical machines.
-  - alert: VirtualHostUnusualDiskReadRate
-    expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{ype="virtual_machine", nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker"}
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: Virtual host unusual disk read rate (instance {{ $labels.instance }})
-      description: "Disk is probably reading too much data (> 50 MB/s)\n  VALUE = {{ $value }}"
-  - alert: VirtualHostUnusualDiskWriteRate
-    expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker"}
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: Virtual host unusual disk write rate (instance {{ $labels.instance }})
-      description: "Disk is probably writing too much data (> 50 MB/s)\n  VALUE = {{ $value }}"
-  # Some VMs are expected to have high Read / Write rates z.B. CI servers 
-  - alert: VirtualHostUnusualDiskReadRate
-    expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{ype="virtual_machine", nodename="forgejo-actions-runner", nodename="woodpecker"}
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      summary: Virtual host unusual disk read rate for 10 min (instance {{ $labels.instance }})
-      description: "Disk is probably reading too much data (> 50 MB/s)\n  VALUE = {{ $value }}"
-  - alert: VirtualHostUnusualDiskWriteRate
-    expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename="forgejo-actions-runner", nodename="woodpecker"}
-    for: 4m
-    labels:
-      severity: warning
-    annotations:
-      summary: Virtual host unusual disk write rate for 4 min (instance {{ $labels.instance }})
-      description: "Disk is probably writing too much data (> 50 MB/s)\n  VALUE = {{ $value }}"
-  - alert: PhysicalHostUnusualDiskReadRate
-    expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
-    for: 20m
-    labels:
-      severity: warning
-    annotations:
-      summary: Physical host unusual disk read rate (instance {{ $labels.instance }})
-      description: "Disk is probably reading too much data (> 100 MB/s)\n  VALUE = {{ $value }}"
-  - alert: PhysicalHostUnusualDiskWriteRate
-    expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
-    for: 15m
-    labels:
-      severity: warning
-    annotations:
-      summary: Physical host unusual disk write rate (instance {{ $labels.instance }})
-      description: "Disk is probably writing too much data (> 100 MB/s)\n  VALUE = {{ $value }}"
-  # Please add ignored mountpoints in node_exporter parameters like
-  # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
-  # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
-  - alert: HostOutOfDiskSpace
-    expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host out of disk space (instance {{ $labels.instance }})
-      description: "Disk is almost full (< 10% left)\n  VALUE = {{ $value }}"
-  # Please add ignored mountpoints in node_exporter parameters like
-  # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
-  # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
-  - alert: HostDiskWillFillIn24Hours
-    expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
-      description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n  VALUE = {{ $value }}"
-  - alert: HostOutOfInodes
-    expr: (node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host out of inodes (instance {{ $labels.instance }})
-      description: "Disk is almost running out of available inodes (< 10% left)\n  VALUE = {{ $value }}"
-  - alert: HostInodesWillFillIn24Hours
-    expr: (node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
-      description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n  VALUE = {{ $value }}"
-  - alert: HostFilesystemDeviceError
-    expr: node_filesystem_device_error == 1
-    for: 2m
-    labels:
-      severity: critical
-    annotations:
-      summary: Host filesystem device error (instance {{ $labels.instance }})
-      description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n  VALUE = {{ $value }}"
-  - alert: HostUnusualDiskReadLatency
-    expr: (rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host unusual disk read latency (instance {{ $labels.instance }})
-      description: "Disk latency is growing (read operations > 100ms)\n  VALUE = {{ $value }}"
-  - alert: HostUnusualDiskWriteLatency
-    expr: (rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host unusual disk write latency (instance {{ $labels.instance }})
-      description: "Disk latency is growing (write operations > 100ms)\n  VALUE = {{ $value }}"
-  - alert: HostHighCpuLoad
-    expr: (sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host high CPU load (instance {{ $labels.instance }})
-      description: "CPU load is > 80%\n  VALUE = {{ $value }}"
-  # We might want to introduce that later, tho maybe excluding hosts with one core, if possible and only for VMs?
-  # # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
-  # - alert: HostCpuIsUnderutilized
-  #   expr: (100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
-  #   for: 1w
-  #   labels:
-  #     severity: info
-  #   annotations:
-  #     summary: Host CPU is underutilized (instance {{ $labels.instance }})
-  #     description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n  VALUE = {{ $value }}"
-  - alert: HostCpuStealNoisyNeighbor
-    expr: (avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
-    for: 0m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
-      description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n  VALUE = {{ $value }}"
-  - alert: HostCpuHighIowait
-    expr: (avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
-    for: 0m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host CPU high iowait (instance {{ $labels.instance }})
-      description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n  VALUE = {{ $value }}"
-  # Have different disk IO alerts for VMs and physical machines and for physical machines different ones for hard and other disks.
-  - alert: PhysicalHostUnusualHardDiskIo
-    expr: (rate(node_disk_io_time_seconds_total{device=~"s.+"}[1m]) > 0.75) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: Physical host unusual hard disk IO (instance {{ $labels.instance }})
-      description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n  VALUE = {{ $value }}"
-  - alert: PhysicalHostUnusualOtherDiskIo
-    expr: (rate(node_disk_io_time_seconds_total{device!~"s.+"}[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: Physical host unusual other (non-hard) disk IO (instance {{ $labels.instance }})
-      description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n  VALUE = {{ $value }}"
-  - alert: VirtualHostUnusualDiskIo
-    expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"}
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: Virtual host unusual disk IO (instance {{ $labels.instance }})
-      description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n  VALUE = {{ $value }}"
-  # # x2 context switches is an arbitrary number.
-  # # The alert threshold depends on the nature of the application.
-  # # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
-  # - alert: HostContextSwitchingHigh
-  #   expr: (rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2
-  #   for: 0m
-  #   labels:
-  #     severity: warning
-  #   annotations:
-  #     summary: Host context switching high (instance {{ $labels.instance }})
-  #     description: "Context switching is growing on the node (twice the daily average during the last 15m)\n  VALUE = {{ $value }}"
-  - alert: HostSwapIsFillingUp
-    expr: ((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host swap is filling up (instance {{ $labels.instance }})
-      description: "Swap is filling up (>80%)\n  VALUE = {{ $value }}"
-  - alert: HostSystemdServiceCrashed
-    expr: (node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
-    for: 0m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host systemd service crashed (instance {{ $labels.instance }})
-      description: "systemd service crashed\n  VALUE = {{ $value }}"
-  - alert: HostPhysicalComponentTooHot
-    expr: ((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host physical component too hot (instance {{ $labels.instance }})
-      description: "Physical hardware component too hot\n  VALUE = {{ $value }}"
-  - alert: HostNodeOvertemperatureAlarm
-    expr: ((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
-    for: 0m
-    labels:
-      severity: critical
-    annotations:
-      summary: Host node overtemperature alarm (instance {{ $labels.instance }})
-      description: "Physical node temperature alarm triggered\n  VALUE = {{ $value }}"
-  - alert: HostRaidArrayGotInactive
-    expr: (node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
-    for: 0m
-    labels:
-      severity: critical
-    annotations:
-      summary: Host RAID array got inactive (instance {{ $labels.instance }})
-      description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n  VALUE = {{ $value }}"
-  - alert: HostRaidDiskFailure
-    expr: (node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host RAID disk failure (instance {{ $labels.instance }})
-      description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n  VALUE = {{ $value }}"
-  - alert: HostKernelVersionDeviations
-    expr: (count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
-    for: 6h
-    labels:
-      severity: warning
-    annotations:
-      summary: Host kernel version deviations (instance {{ $labels.instance }})
-      description: "Different kernel versions are running\n  VALUE = {{ $value }}"
-  - alert: HostOomKillDetected
-    expr: (increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
-    for: 0m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host OOM kill detected (instance {{ $labels.instance }})
-      description: "OOM kill detected\n  VALUE = {{ $value }}"
-  - alert: HostEdacCorrectableErrorsDetected
-    expr: (increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
-    for: 0m
-    labels:
-      severity: info
-    annotations:
-      summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
-      description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n  VALUE = {{ $value }}"
-  - alert: HostEdacUncorrectableErrorsDetected
-    expr: (node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
-    for: 0m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
-      description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n  VALUE = {{ $value }}"
-  - alert: HostNetworkReceiveErrors
-    expr: (rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host Network Receive Errors (instance {{ $labels.instance }})
-      description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n  VALUE = {{ $value }}"
-  - alert: HostNetworkTransmitErrors
-    expr: (rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host Network Transmit Errors (instance {{ $labels.instance }})
-      description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n  VALUE = {{ $value }}"
-  - alert: HostNetworkBondDegraded
-    expr: ((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host Network Bond Degraded (instance {{ $labels.instance }})
-      description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n  VALUE = {{ $value }}"
-  - alert: HostConntrackLimit
-    expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host conntrack limit (instance {{ $labels.instance }})
-      description: "The number of conntrack is approaching limit\n  VALUE = {{ $value }}"
-  - alert: HostClockSkew
-    expr: ((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
-    for: 10m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host clock skew (instance {{ $labels.instance }})
-      description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n  VALUE = {{ $value }}"
-  - alert: HostClockNotSynchronising
-    expr: (min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
-    for: 2m
-    labels:
-      severity: warning
-    annotations:
-      summary: Host clock not synchronising (instance {{ $labels.instance }})
-      description: "Clock not synchronising. Ensure NTP is configured on this host.\n  VALUE = {{ $value }}"
-  - alert: HostRequiresReboot
-    expr: (node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
-    for: 4h
-    labels:
-      severity: info
-    annotations:
-      summary: Host requires reboot (instance {{ $labels.instance }})
-      description: "{{ $labels.instance }} requires a reboot.\n  VALUE = {{ $value }}"
-- name: prometheus
-  rules:
-  - alert: PrometheusJobMissing
-    expr: absent(up{job="prometheus"})
-    for: 0m
-    labels:
-      severity: warning
-    annotations:
-      summary: Prometheus job missing (instance {{ $labels.instance }})
-      description: "A Prometheus job has disappeared\n  VALUE = {{ $value }}"
-  - alert: PrometheusTargetMissing
-    expr: up == 0
-    for: 0m
-    labels:
-      severity: critical
-    annotations:
-      summary: Prometheus target missing (instance {{ $labels.instance }})
-      description: "A Prometheus target has disappeared. An exporter might be crashed.\n  VALUE = {{ $value }}"
-  - alert: PrometheusAllTargetsMissing
-    expr: sum by (job) (up) == 0
-    for: 0m
-    labels:
-      severity: critical
-    annotations:
-      summary: Prometheus all targets missing (instance {{ $labels.instance }})
-      description: "A Prometheus job does not have living target anymore.\n  VALUE = {{ $value }}"
-  - alert: PrometheusConfigurationReloadFailure
-    expr: prometheus_config_last_reload_successful != 1
-    for: 0m
-    labels:
-      severity: warning
-    annotations:
-      summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
-      description: "Prometheus configuration reload error\n  VALUE = {{ $value }}"
-  - alert: PrometheusTooManyRestarts
-    expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
-    for: 0m
-    labels:
-      severity: warning
-    annotations:
-      summary: Prometheus too many restarts (instance {{ $labels.instance }})
-      description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n  VALUE = {{ $value }}"
-  - alert: PrometheusAlertmanagerJobMissing
-    expr: absent(up{job="alertmanager"})
-    for: 0m
-    labels:
-      severity: warning
-    annotations:
-      summary: Prometheus AlertManager job missing (instance {{ $labels.instance }})
-      description: "A Prometheus AlertManager job has disappeared\n  VALUE = {{ $value }}"
-  - alert: PrometheusAlertmanagerConfigurationReloadFailure
-    expr: alertmanager_config_last_reload_successful != 1
-    for: 0m
-    labels:
-      severity: warning
-    annotations:
-      summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
-      description: "AlertManager configuration reload error\n  VALUE = {{ $value }}"
-  - alert: PrometheusAlertmanagerConfigNotSynced
-    expr: count(count_values("config_hash", alertmanager_config_hash)) > 1
-    for: 0m
-    labels:
-      severity: warning
-    annotations:
-      summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }})
-      description: "Configurations of AlertManager cluster instances are out of sync\n  VALUE = {{ $value }}"
-  # For testing.
-  # - alert: PrometheusAlertmanagerE2eDeadManSwitch
-  #   expr: vector(1)
-  #   for: 0m
-  #   labels:
-  #     severity: critical
-  #   annotations:
-  #     summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }})
-  #     description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.\n  VALUE = {{ $value }}"
-  - alert: PrometheusNotConnectedToAlertmanager
-    expr: prometheus_notifications_alertmanagers_discovered < 1
-    for: 0m
-    labels:
-      severity: critical
-    annotations:
-      summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
-      description: "Prometheus cannot connect the alertmanager\n  VALUE = {{ $value }}"
-  - alert: PrometheusRuleEvaluationFailures
-    expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
-    for: 0m
-    labels:
-      severity: critical
-    annotations:
-      summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
-      description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n  VALUE = {{ $value }}"
-  - alert: PrometheusTemplateTextExpansionFailures
-    expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
-    for: 0m
-    labels:
-      severity: critical
-    annotations:
-      summary: Prometheus template text expansion failures (instance {{ $labels.instance }})
-      description: "Prometheus encountered {{ $value }} template text expansion failures\n  VALUE = {{ $value }}"
-  - alert: PrometheusRuleEvaluationSlow
-    expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
-      description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n  VALUE = {{ $value }}"
-  - alert: PrometheusNotificationsBacklog
-    expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
-    for: 0m
-    labels:
-      severity: warning
-    annotations:
-      summary: Prometheus notifications backlog (instance {{ $labels.instance }})
-      description: "The Prometheus notification queue has not been empty for 10 minutes\n  VALUE = {{ $value }}"
-  - alert: PrometheusAlertmanagerNotificationFailing
-    expr: rate(alertmanager_notifications_failed_total[1m]) > 0
-    for: 0m
-    labels:
-      severity: critical
-    annotations:
-      summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
-      description: "Alertmanager is failing sending notifications\n  VALUE = {{ $value }}"
-  - alert: PrometheusTargetEmpty
-    expr: prometheus_sd_discovered_targets == 0
-    for: 0m
-    labels:
-      severity: critical
-    annotations:
-      summary: Prometheus target empty (instance {{ $labels.instance }})
-      description: "Prometheus has no target in service discovery\n  VALUE = {{ $value }}"
-  - alert: PrometheusTargetScrapingSlow
-    expr: prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: Prometheus target scraping slow (instance {{ $labels.instance }})
-      description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned.\n  VALUE = {{ $value }}"
-  - alert: PrometheusLargeScrape
-    expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
-    for: 5m
-    labels:
-      severity: warning
-    annotations:
-      summary: Prometheus large scrape (instance {{ $labels.instance }})
-      description: "Prometheus has many scrapes that exceed the sample limit\n  VALUE = {{ $value }}"
-  - alert: PrometheusTargetScrapeDuplicate
-    expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
-    for: 0m
-    labels:
-      severity: warning
-    annotations:
-      summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
-      description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n  VALUE = {{ $value }}"
-  - alert: PrometheusTsdbCheckpointCreationFailures
-    expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0
-    for: 0m
-    labels:
-      severity: critical
-    annotations:
-      summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})
-      description: "Prometheus encountered {{ $value }} checkpoint creation failures\n  VALUE = {{ $value }}"
-  - alert: PrometheusTsdbCheckpointDeletionFailures
-    expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0
-    for: 0m
-    labels:
-      severity: critical
-    annotations:
-      summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})
-      description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n  VALUE = {{ $value }}"
-  - alert: PrometheusTsdbCompactionsFailed
-    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
-    for: 0m
-    labels:
-      severity: critical
-    annotations:
-      summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
-      description: "Prometheus encountered {{ $value }} TSDB compactions failures\n  VALUE = {{ $value }}"
-  - alert: PrometheusTsdbHeadTruncationsFailed
-    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
-    for: 0m
-    labels:
-      severity: critical
-    annotations:
-      summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
-      description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n  VALUE = {{ $value }}"
-  - alert: PrometheusTsdbReloadFailures
-    expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0
-    for: 0m
-    labels:
-      severity: critical
-    annotations:
-      summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
-      description: "Prometheus encountered {{ $value }} TSDB reload failures\n  VALUE = {{ $value }}"
-  - alert: PrometheusTsdbWalCorruptions
-    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
-    for: 0m
-    labels:
-      severity: critical
-    annotations:
-      summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
-      description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n  VALUE = {{ $value }}"
-  - alert: PrometheusTsdbWalTruncationsFailed
-    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
-    for: 0m
-    labels:
-      severity: critical
-    annotations:
-      summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
-      description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n  VALUE = {{ $value }}"
-  - alert: PrometheusTimeseriesCardinality
-    expr: label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000
-    for: 0m
-    labels:
-      severity: warning
-    annotations:
-      summary: Prometheus timeseries cardinality (instance {{ $labels.instance }})
-      description: "The \"{{ $labels.name }}\" timeseries cardinality is getting very high: {{ $value }}\n  VALUE = {{ $value }}"
+  - name: node-exporter
+    rules:
+      - alert: HostOutOfMemory
+        expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host out of memory (instance {{ $labels.instance }})
+          description: "Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}"
+      - alert: HostMemoryUnderMemoryPressure
+        expr: (rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host memory under memory pressure (instance {{ $labels.instance }})
+          description: "The node is under heavy memory pressure. High rate of major page faults\n  VALUE = {{ $value }}"
+      # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
+      - alert: HostMemoryIsUnderutilized
+        expr: (100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 1w
+        labels:
+          severity: info
+        annotations:
+          summary: Host Memory is underutilized (instance {{ $labels.instance }})
+          description: "Node memory is < 10% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}"
+      - alert: HostUnusualNetworkThroughputIn
+        expr: (sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host unusual network throughput in (instance {{ $labels.instance }})
+          description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n  VALUE = {{ $value }}"
+      - alert: HostUnusualNetworkThroughputOut
+        expr: (sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host unusual network throughput out (instance {{ $labels.instance }})
+          description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n  VALUE = {{ $value }}"
+      # Have different disk read and write rate alerts for VMs and physical machines.
+      - alert: VirtualHostUnusualDiskReadRate
+        expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker"}
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Virtual host unusual disk read rate (instance {{ $labels.instance }})
+          description: "Disk is probably reading too much data (> 50 MB/s)\n  VALUE = {{ $value }}"
+      - alert: VirtualHostUnusualDiskWriteRate
+        expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker"}
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Virtual host unusual disk write rate (instance {{ $labels.instance }})
+          description: "Disk is probably writing too much data (> 50 MB/s)\n  VALUE = {{ $value }}"
+      # Some VMs are expected to have high read/write rates, e.g. CI servers.
+      - alert: VirtualHostUnusualDiskReadRate
+        expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~"forgejo-actions-runner|woodpecker"}
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: Virtual host unusual disk read rate for 10 min (instance {{ $labels.instance }})
+          description: "Disk is probably reading too much data (> 50 MB/s)\n  VALUE = {{ $value }}"
+      - alert: VirtualHostUnusualDiskWriteRate
+        expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~"forgejo-actions-runner|woodpecker"}
+        for: 4m
+        labels:
+          severity: warning
+        annotations:
+          summary: Virtual host unusual disk write rate for 4 min (instance {{ $labels.instance }})
+          description: "Disk is probably writing too much data (> 50 MB/s)\n  VALUE = {{ $value }}"
+      - alert: PhysicalHostUnusualDiskReadRate
+        expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
+        for: 20m
+        labels:
+          severity: warning
+        annotations:
+          summary: Physical host unusual disk read rate (instance {{ $labels.instance }})
+          description: "Disk is probably reading too much data (> 100 MB/s)\n  VALUE = {{ $value }}"
+      - alert: PhysicalHostUnusualDiskWriteRate
+        expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          summary: Physical host unusual disk write rate (instance {{ $labels.instance }})
+          description: "Disk is probably writing too much data (> 100 MB/s)\n  VALUE = {{ $value }}"
+      # Please add ignored mountpoints in node_exporter parameters like
+      # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
+      # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
+      - alert: HostOutOfDiskSpace
+        expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host out of disk space (instance {{ $labels.instance }})
+          description: "Disk is almost full (< 10% left)\n  VALUE = {{ $value }}"
+      # Please add ignored mountpoints in node_exporter parameters like
+      # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
+      # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
+      - alert: HostDiskWillFillIn24Hours
+        expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
+          description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n  VALUE = {{ $value }}"
+      - alert: HostOutOfInodes
+        expr: (node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host out of inodes (instance {{ $labels.instance }})
+          description: "Disk is almost running out of available inodes (< 10% left)\n  VALUE = {{ $value }}"
+      - alert: HostInodesWillFillIn24Hours
+        expr: (node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
+          description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n  VALUE = {{ $value }}"
+      - alert: HostFilesystemDeviceError
+        expr: node_filesystem_device_error == 1
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: Host filesystem device error (instance {{ $labels.instance }})
+          description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n  VALUE = {{ $value }}"
+      - alert: HostUnusualDiskReadLatency
+        expr: (rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host unusual disk read latency (instance {{ $labels.instance }})
+          description: "Disk latency is growing (read operations > 100ms)\n  VALUE = {{ $value }}"
+      - alert: HostUnusualDiskWriteLatency
+        expr: (rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host unusual disk write latency (instance {{ $labels.instance }})
+          description: "Disk latency is growing (write operations > 100ms)\n  VALUE = {{ $value }}"
+      - alert: HostHighCpuLoad
+        expr: (sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host high CPU load (instance {{ $labels.instance }})
+          description: "CPU load is > 80%\n  VALUE = {{ $value }}"
+      # We might want to introduce this later, though maybe excluding single-core hosts if possible, and only for VMs?
+      # # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
+      # - alert: HostCpuIsUnderutilized
+      #   expr: (100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+      #   for: 1w
+      #   labels:
+      #     severity: info
+      #   annotations:
+      #     summary: Host CPU is underutilized (instance {{ $labels.instance }})
+      #     description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n  VALUE = {{ $value }}"
+      - alert: HostCpuStealNoisyNeighbor
+        expr: (avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
+          description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n  VALUE = {{ $value }}"
+      - alert: HostCpuHighIowait
+        expr: (avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host CPU high iowait (instance {{ $labels.instance }})
+          description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n  VALUE = {{ $value }}"
+      # Have different disk IO alerts for VMs and physical machines and for physical machines different ones for hard and other disks.
+      - alert: PhysicalHostUnusualHardDiskIo
+        expr: (rate(node_disk_io_time_seconds_total{device=~"s.+"}[1m]) > 0.75) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Physical host unusual hard disk IO (instance {{ $labels.instance }})
+          description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n  VALUE = {{ $value }}"
+      - alert: PhysicalHostUnusualOtherDiskIo
+        expr: (rate(node_disk_io_time_seconds_total{device!~"s.+"}[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Physical host unusual other (non-hard) disk IO (instance {{ $labels.instance }})
+          description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n  VALUE = {{ $value }}"
+      - alert: VirtualHostUnusualDiskIo
+        expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"}
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Virtual host unusual disk IO (instance {{ $labels.instance }})
+          description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n  VALUE = {{ $value }}"
+      # # x2 context switches is an arbitrary number.
+      # # The alert threshold depends on the nature of the application.
+      # # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
+      # - alert: HostContextSwitchingHigh
+      #   expr: (rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2
+      #   for: 0m
+      #   labels:
+      #     severity: warning
+      #   annotations:
+      #     summary: Host context switching high (instance {{ $labels.instance }})
+      #     description: "Context switching is growing on the node (twice the daily average during the last 15m)\n  VALUE = {{ $value }}"
+      - alert: HostSwapIsFillingUp
+        expr: ((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host swap is filling up (instance {{ $labels.instance }})
+          description: "Swap is filling up (>80%)\n  VALUE = {{ $value }}"
+      - alert: HostSystemdServiceCrashed
+        expr: (node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host systemd service crashed (instance {{ $labels.instance }})
+          description: "systemd service crashed\n  VALUE = {{ $value }}"
+      - alert: HostPhysicalComponentTooHot
+        expr: ((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host physical component too hot (instance {{ $labels.instance }})
+          description: "Physical hardware component too hot\n  VALUE = {{ $value }}"
+      - alert: HostNodeOvertemperatureAlarm
+        expr: ((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Host node overtemperature alarm (instance {{ $labels.instance }})
+          description: "Physical node temperature alarm triggered\n  VALUE = {{ $value }}"
+      - alert: HostRaidArrayGotInactive
+        expr: (node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Host RAID array got inactive (instance {{ $labels.instance }})
+          description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n  VALUE = {{ $value }}"
+      - alert: HostRaidDiskFailure
+        expr: (node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host RAID disk failure (instance {{ $labels.instance }})
+          description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n  VALUE = {{ $value }}"
+      - alert: HostKernelVersionDeviations
+        expr: (count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 6h
+        labels:
+          severity: warning
+        annotations:
+          summary: Host kernel version deviations (instance {{ $labels.instance }})
+          description: "Different kernel versions are running\n  VALUE = {{ $value }}"
+      - alert: HostOomKillDetected
+        expr: (increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host OOM kill detected (instance {{ $labels.instance }})
+          description: "OOM kill detected\n  VALUE = {{ $value }}"
+      - alert: HostEdacCorrectableErrorsDetected
+        expr: (increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 0m
+        labels:
+          severity: info
+        annotations:
+          summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
+          description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last minute.\n  VALUE = {{ $value }}"
+      - alert: HostEdacUncorrectableErrorsDetected
+        expr: (node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
+          description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC.\n  VALUE = {{ $value }}"
+      - alert: HostNetworkReceiveErrors  # fires when more than 1% of received packets (2m rate) are errors
+        expr: (rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host Network Receive Errors (instance {{ $labels.instance }})
+          description: "Host {{ $labels.instance }} interface {{ $labels.device }} has a receive error ratio of {{ $value | humanizePercentage }} over the last two minutes.\n  VALUE = {{ $value }}"
+      - alert: HostNetworkTransmitErrors  # fires when more than 1% of transmitted packets (2m rate) are errors
+        expr: (rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host Network Transmit Errors (instance {{ $labels.instance }})
+          description: "Host {{ $labels.instance }} interface {{ $labels.device }} has a transmit error ratio of {{ $value | humanizePercentage }} over the last two minutes.\n  VALUE = {{ $value }}"
+      - alert: HostNetworkBondDegraded
+        expr: ((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host Network Bond Degraded (instance {{ $labels.instance }})
+          description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n  VALUE = {{ $value }}"
+      - alert: HostConntrackLimit
+        expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host conntrack limit (instance {{ $labels.instance }})
+          description: "The number of conntrack is approaching limit\n  VALUE = {{ $value }}"
+      - alert: HostClockSkew
+        expr: ((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host clock skew (instance {{ $labels.instance }})
+          description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n  VALUE = {{ $value }}"
+      - alert: HostClockNotSynchronising
+        expr: (min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host clock not synchronising (instance {{ $labels.instance }})
+          description: "Clock not synchronising. Ensure NTP is configured on this host.\n  VALUE = {{ $value }}"
+      - alert: HostRequiresReboot
+        expr: (node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 4h
+        labels:
+          severity: info
+        annotations:
+          summary: Host requires reboot (instance {{ $labels.instance }})
+          description: "{{ $labels.instance }} requires a reboot.\n  VALUE = {{ $value }}"
+  - name: prometheus
+    rules:
+      - alert: PrometheusJobMissing  # absent() only carries the job label from its equality matcher, so the summary reports the job
+        expr: absent(up{job="prometheus"})
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Prometheus job missing (job {{ $labels.job }})
+          description: "A Prometheus job has disappeared\n  VALUE = {{ $value }}"
+      - alert: PrometheusTargetMissing
+        expr: up == 0
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus target missing (instance {{ $labels.instance }})
+          description: "A Prometheus target has disappeared. An exporter might be crashed.\n  VALUE = {{ $value }}"
+      - alert: PrometheusAllTargetsMissing  # aggregated by job, so job is the only label left on the result
+        expr: sum by (job) (up) == 0
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus all targets missing (job {{ $labels.job }})
+          description: "A Prometheus job does not have living target anymore.\n  VALUE = {{ $value }}"
+      - alert: PrometheusConfigurationReloadFailure
+        expr: prometheus_config_last_reload_successful != 1
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
+          description: "Prometheus configuration reload error\n  VALUE = {{ $value }}"
+      - alert: PrometheusTooManyRestarts
+        expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Prometheus too many restarts (instance {{ $labels.instance }})
+          description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n  VALUE = {{ $value }}"
+      - alert: PrometheusAlertmanagerJobMissing  # absent() only carries the job label from its equality matcher, so the summary reports the job
+        expr: absent(up{job="alertmanager"})
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Prometheus AlertManager job missing (job {{ $labels.job }})
+          description: "A Prometheus AlertManager job has disappeared\n  VALUE = {{ $value }}"
+      - alert: PrometheusAlertmanagerConfigurationReloadFailure
+        expr: alertmanager_config_last_reload_successful != 1
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
+          description: "AlertManager configuration reload error\n  VALUE = {{ $value }}"
+      - alert: PrometheusAlertmanagerConfigNotSynced
+        expr: count(count_values("config_hash", alertmanager_config_hash)) > 1
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }})
+          description: "Configurations of AlertManager cluster instances are out of sync\n  VALUE = {{ $value }}"
+      # For testing.
+      # - alert: PrometheusAlertmanagerE2eDeadManSwitch
+      #   expr: vector(1)
+      #   for: 0m
+      #   labels:
+      #     severity: critical
+      #   annotations:
+      #     summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }})
+      #     description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.\n  VALUE = {{ $value }}"
+      - alert: PrometheusNotConnectedToAlertmanager  # fires when fewer than one Alertmanager has been discovered
+        expr: prometheus_notifications_alertmanagers_discovered < 1
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
+          description: "Prometheus cannot connect to the alertmanager\n  VALUE = {{ $value }}"
+      - alert: PrometheusRuleEvaluationFailures
+        expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
+          description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n  VALUE = {{ $value }}"
+      - alert: PrometheusTemplateTextExpansionFailures
+        expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus template text expansion failures (instance {{ $labels.instance }})
+          description: "Prometheus encountered {{ $value }} template text expansion failures\n  VALUE = {{ $value }}"
+      - alert: PrometheusRuleEvaluationSlow
+        expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
+          description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n  VALUE = {{ $value }}"
+      - alert: PrometheusNotificationsBacklog
+        expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Prometheus notifications backlog (instance {{ $labels.instance }})
+          description: "The Prometheus notification queue has not been empty for 10 minutes\n  VALUE = {{ $value }}"
+      - alert: PrometheusAlertmanagerNotificationFailing
+        expr: rate(alertmanager_notifications_failed_total[1m]) > 0
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
+          description: "Alertmanager is failing sending notifications\n  VALUE = {{ $value }}"
+      - alert: PrometheusTargetEmpty
+        expr: prometheus_sd_discovered_targets == 0
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus target empty (instance {{ $labels.instance }})
+          description: "Prometheus has no target in service discovery\n  VALUE = {{ $value }}"
+      - alert: PrometheusTargetScrapingSlow
+        expr: prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Prometheus target scraping slow (instance {{ $labels.instance }})
+          description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned.\n  VALUE = {{ $value }}"
+      - alert: PrometheusLargeScrape
+        expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Prometheus large scrape (instance {{ $labels.instance }})
+          description: "Prometheus has many scrapes that exceed the sample limit\n  VALUE = {{ $value }}"
+      - alert: PrometheusTargetScrapeDuplicate
+        expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
+          description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n  VALUE = {{ $value }}"
+      - alert: PrometheusTsdbCheckpointCreationFailures
+        expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})
+          description: "Prometheus encountered {{ $value }} checkpoint creation failures\n  VALUE = {{ $value }}"
+      - alert: PrometheusTsdbCheckpointDeletionFailures
+        expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})
+          description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n  VALUE = {{ $value }}"
+      - alert: PrometheusTsdbCompactionsFailed
+        expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
+          description: "Prometheus encountered {{ $value }} TSDB compactions failures\n  VALUE = {{ $value }}"
+      - alert: PrometheusTsdbHeadTruncationsFailed
+        expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
+          description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n  VALUE = {{ $value }}"
+      - alert: PrometheusTsdbReloadFailures
+        expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
+          description: "Prometheus encountered {{ $value }} TSDB reload failures\n  VALUE = {{ $value }}"
+      - alert: PrometheusTsdbWalCorruptions
+        expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
+          description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n  VALUE = {{ $value }}"
+      - alert: PrometheusTsdbWalTruncationsFailed
+        expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
+          description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n  VALUE = {{ $value }}"
+      - alert: PrometheusTimeseriesCardinality
+        expr: label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Prometheus timeseries cardinality (instance {{ $labels.instance }})
+          description: "The \"{{ $labels.name }}\" timeseries cardinality is getting very high: {{ $value }}\n  VALUE = {{ $value }}"
diff --git a/playbooks/files/chaosknoten/configs/lists/compose/compose.yaml b/playbooks/files/chaosknoten/configs/lists/compose/compose.yaml
index 2ef87aa..232627a 100644
--- a/playbooks/files/chaosknoten/configs/lists/compose/compose.yaml
+++ b/playbooks/files/chaosknoten/configs/lists/compose/compose.yaml
@@ -5,21 +5,21 @@ services:
     container_name: mailman-core
     hostname: mailman-core
     volumes:
-    - /opt/mailman/core:/opt/mailman/
+      - /opt/mailman/core:/opt/mailman/
     stop_grace_period: 30s
     links:
-    - database:database
+      - database:database
     depends_on:
-    - database
+      - database
     environment:
-    - DATABASE_URL=postgresql://mailman:wvQjbMRnwFuxGEPz@database/mailmandb
-    - DATABASE_TYPE=postgres
-    - DATABASE_CLASS=mailman.database.postgresql.PostgreSQLDatabase
-    - HYPERKITTY_API_KEY=ITfRjushI6FP0TLMnRpZxlfB2e17DN86
-    - MTA=postfix
+      - DATABASE_URL=postgresql://mailman:wvQjbMRnwFuxGEPz@database/mailmandb
+      - DATABASE_TYPE=postgres
+      - DATABASE_CLASS=mailman.database.postgresql.PostgreSQLDatabase
+      - HYPERKITTY_API_KEY=ITfRjushI6FP0TLMnRpZxlfB2e17DN86
+      - MTA=postfix
     ports:
-    - "127.0.0.1:8001:8001" # API
-    - "127.0.0.1:8024:8024" # LMTP - incoming emails
+      - "127.0.0.1:8001:8001" # API
+      - "127.0.0.1:8024:8024" # LMTP - incoming emails
     networks:
       mailman:
 
@@ -29,36 +29,36 @@ services:
     container_name: mailman-web
     hostname: mailman-web
     depends_on:
-    - database
+      - database
     links:
-    - mailman-core:mailman-core
-    - database:database
+      - mailman-core:mailman-core
+      - database:database
     volumes:
-    - /opt/mailman/web:/opt/mailman-web-data
+      - /opt/mailman/web:/opt/mailman-web-data
     environment:
-    - DATABASE_TYPE=postgres
-    - DATABASE_URL=postgresql://mailman:wvQjbMRnwFuxGEPz@database/mailmandb
-    - "DJANGO_ALLOWED_HOSTS=lists.hamburg.ccc.de,lists.c3lingo.org"
-    - HYPERKITTY_API_KEY=ITfRjushI6FP0TLMnRpZxlfB2e17DN86
-    - SERVE_FROM_DOMAIN=lists.hamburg.ccc.de
-    - SECRET_KEY=ugfknEYBaFVc62R1jlIjnkizQaqr7tSt
-    - MAILMAN_ADMIN_USER=ccchh-admin
-    - MAILMAN_ADMIN_EMAIL=tony@cowtest.hamburg.ccc.de
+      - DATABASE_TYPE=postgres
+      - DATABASE_URL=postgresql://mailman:wvQjbMRnwFuxGEPz@database/mailmandb
+      - "DJANGO_ALLOWED_HOSTS=lists.hamburg.ccc.de,lists.c3lingo.org"
+      - HYPERKITTY_API_KEY=ITfRjushI6FP0TLMnRpZxlfB2e17DN86
+      - SERVE_FROM_DOMAIN=lists.hamburg.ccc.de
+      - SECRET_KEY=ugfknEYBaFVc62R1jlIjnkizQaqr7tSt
+      - MAILMAN_ADMIN_USER=ccchh-admin
+      - MAILMAN_ADMIN_EMAIL=tony@cowtest.hamburg.ccc.de
     ports:
-    - "127.0.0.1:8000:8000" # HTTP
-    - "127.0.0.1:8080:8080" # uwsgi
+      - "127.0.0.1:8000:8000" # HTTP
+      - "127.0.0.1:8080:8080" # uwsgi
     networks:
       mailman:
 
   database:
     restart: unless-stopped
     environment:
-    - POSTGRES_DB=mailmandb
-    - POSTGRES_USER=mailman
-    - POSTGRES_PASSWORD=wvQjbMRnwFuxGEPz
+      - POSTGRES_DB=mailmandb
+      - POSTGRES_USER=mailman
+      - POSTGRES_PASSWORD=wvQjbMRnwFuxGEPz
     image: postgres:12-alpine
     volumes:
-    - /opt/mailman/database:/var/lib/postgresql/data
+      - /opt/mailman/database:/var/lib/postgresql/data
     networks:
       mailman:
 
@@ -68,5 +68,5 @@ networks:
     ipam:
       driver: default
       config:
-      -
-        subnet: 172.19.199.0/24
+        -
+          subnet: 172.19.199.0/24
diff --git a/playbooks/roles/apt_update_and_upgrade/tasks/main.yaml b/playbooks/roles/apt_update_and_upgrade/tasks/main.yaml
index cdc9922..f63436b 100644
--- a/playbooks/roles/apt_update_and_upgrade/tasks/main.yaml
+++ b/playbooks/roles/apt_update_and_upgrade/tasks/main.yaml
@@ -1,15 +1,15 @@
 - name: update, upgrade and potentially reboot
   become: true
   block:
-  - name: apt-get update
-    ansible.builtin.apt:
-      update-cache: true
+    - name: apt-get update
+      ansible.builtin.apt:
+        update_cache: true  # canonical parameter name; `update-cache` is only an alias
 
-  - name: apt-get dist-upgrade
-    ansible.builtin.apt:
-      upgrade: dist
-    register: apt_update_and_upgrade__upgrade_result
+    - name: apt-get dist-upgrade
+      ansible.builtin.apt:
+        upgrade: dist
+      register: apt_update_and_upgrade__upgrade_result
 
-  - name: reboot, after package upgrade
-    ansible.builtin.reboot:
-    when: apt_update_and_upgrade__upgrade_result.changed
+    - name: reboot, after package upgrade
+      ansible.builtin.reboot:
+      when: apt_update_and_upgrade__upgrade_result.changed
diff --git a/playbooks/roles/deploy_ssh_server_config/tasks/main.yaml b/playbooks/roles/deploy_ssh_server_config/tasks/main.yaml
index 12676dc..0492a35 100644
--- a/playbooks/roles/deploy_ssh_server_config/tasks/main.yaml
+++ b/playbooks/roles/deploy_ssh_server_config/tasks/main.yaml
@@ -3,21 +3,21 @@
   become: true
 
   block:
-  - name: deploy `sshd_config`
-    ansible.builtin.template:
-      force: true
-      dest: /etc/ssh/sshd_config
-      mode: 0644
-      owner: root
-      group: root
-      src: sshd_config.j2
-    register: deploy_ssh_server_config__ssh_config_copy_result
+    - name: deploy `sshd_config`
+      ansible.builtin.template:
+        force: true
+        dest: /etc/ssh/sshd_config
+        mode: "0644"
+        owner: root
+        group: root
+        src: sshd_config.j2
+      register: deploy_ssh_server_config__ssh_config_copy_result
 
-  - name: deactivate short moduli
-    ansible.builtin.shell:
-      cmd: awk '$5 >= 3071' /etc/ssh/moduli > /etc/ssh/moduli.tmp && mv /etc/ssh/moduli.tmp /etc/ssh/moduli
+    - name: deactivate short moduli
+      ansible.builtin.shell:
+        cmd: awk '$5 >= 3071' /etc/ssh/moduli > /etc/ssh/moduli.tmp && mv /etc/ssh/moduli.tmp /etc/ssh/moduli
 
-  # Rebooting here instead of restarting the ssh service, since I don't know how Ansible reacts, when it restarts the service it probably needs for the connection.
-  - name: reboot, if ssh server config got changed
-    ansible.builtin.reboot:
-    when: deploy_ssh_server_config__ssh_config_copy_result.changed
+    # Rebooting here instead of restarting the ssh service, since it is unclear how Ansible reacts when it restarts the service it probably needs for its own connection.
+    - name: reboot, if ssh server config got changed
+      ansible.builtin.reboot:
+      when: deploy_ssh_server_config__ssh_config_copy_result.changed
diff --git a/playbooks/roles/infrastructure_authorized_keys/tasks/main.yaml b/playbooks/roles/infrastructure_authorized_keys/tasks/main.yaml
index c363ce7..982c7a0 100644
--- a/playbooks/roles/infrastructure_authorized_keys/tasks/main.yaml
+++ b/playbooks/roles/infrastructure_authorized_keys/tasks/main.yaml
@@ -4,4 +4,3 @@
     user: chaos
     exclusive: true
     key: https://git.hamburg.ccc.de/CCCHH/infrastructure-authorized-keys/raw/branch/trunk/authorized_keys
-    
\ No newline at end of file
diff --git a/playbooks/roles/nextcloud/meta/main.yaml b/playbooks/roles/nextcloud/meta/main.yaml
index 9138dfe..34f476a 100644
--- a/playbooks/roles/nextcloud/meta/main.yaml
+++ b/playbooks/roles/nextcloud/meta/main.yaml
@@ -11,10 +11,10 @@ dependencies:
   - role: nginx
     vars:
       nginx__version_spec: "{{ nextcloud__nginx_version_spec }}"
-      nginx__configurations: 
+      nginx__configurations:
         - name: "{{ nextcloud__fqdn }}"
           content: "{{ lookup('ansible.builtin.template', 'nginx_nextcloud.conf.j2') }}"
   - role: docker_compose
     vars:
       docker_compose__compose_file_content: "{{ lookup('ansible.builtin.template', 'compose.yaml.j2') }}"
-      docker_compose__configuration_files: []
+      docker_compose__configuration_files: [ ]
diff --git a/playbooks/roles/nginx/defaults/main.yaml b/playbooks/roles/nginx/defaults/main.yaml
index e20777c..6ccfac4 100644
--- a/playbooks/roles/nginx/defaults/main.yaml
+++ b/playbooks/roles/nginx/defaults/main.yaml
@@ -1,5 +1,5 @@
 nginx__deploy_redirect_conf: true
 nginx__deploy_tls_conf: true
-nginx__configurations: []
+nginx__configurations: [ ]
 nginx__use_custom_nginx_conf: false
 nginx__custom_nginx_conf: ""
diff --git a/playbooks/roles/nginx/tasks/main/config_deploy.yaml b/playbooks/roles/nginx/tasks/main/config_deploy.yaml
index 100696e..c7fac39 100644
--- a/playbooks/roles/nginx/tasks/main/config_deploy.yaml
+++ b/playbooks/roles/nginx/tasks/main/config_deploy.yaml
@@ -11,7 +11,7 @@
       ansible.builtin.copy:
         force: true
         dest: /etc/nginx/nginx.conf.ansiblesave
-        mode: 0644
+        mode: "0644"
         owner: root
         group: root
         remote_src: true
@@ -22,7 +22,7 @@
       ansible.builtin.copy:
         content: "{{ nginx__custom_nginx_conf }}"
         dest: "/etc/nginx/nginx.conf"
-        mode: 0644
+        mode: "0644"
         owner: root
         group: root
       become: true
@@ -36,7 +36,7 @@
       ansible.builtin.copy:
         force: true
         dest: /etc/nginx/nginx.conf
-        mode: 0644
+        mode: "0644"
         owner: root
         group: root
         remote_src: true
@@ -55,7 +55,7 @@
   ansible.builtin.get_url:
     force: true
     dest: /etc/nginx-mozilla-dhparam
-    mode: 0644
+    mode: "0644"
     url: https://ssl-config.mozilla.org/ffdhe2048.txt
   become: true
   notify: Restart `nginx.service`
@@ -71,7 +71,7 @@
       ansible.builtin.copy:
         force: true
         dest: /etc/nginx/conf.d/tls.conf
-        mode: 0644
+        mode: "0644"
         owner: root
         group: root
         src: tls.conf
@@ -89,7 +89,7 @@
       ansible.builtin.copy:
         force: true
         dest: /etc/nginx/conf.d/redirect.conf
-        mode: 0644
+        mode: "0644"
         owner: root
         group: root
         src: redirect.conf
@@ -104,7 +104,7 @@
   ansible.builtin.copy:
     content: "{{ item.content }}"
     dest: "/etc/nginx/conf.d/{{ item.name }}.conf"
-    mode: 0644
+    mode: "0644"
     owner: root
     group: root
   become: true
diff --git a/requirements.yml b/requirements.yml
index e66cdcb..d5ebdfc 100644
--- a/requirements.yml
+++ b/requirements.yml
@@ -2,4 +2,4 @@ collections:
   # Install a collection from Ansible Galaxy.
   - name: debops.debops
     version: ">=3.1.0"
-    source: https://galaxy.ansible.com
\ No newline at end of file
+    source: https://galaxy.ansible.com