From 6dcf254a246a150a3010f440828d7dd1fa8ece87 Mon Sep 17 00:00:00 2001 From: June Date: Sat, 23 Nov 2024 02:11:48 +0100 Subject: [PATCH 01/10] add .editorconfig to ensure some style and format consistency --- .editorconfig | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 .editorconfig diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..4ca5f7c --- /dev/null +++ b/.editorconfig @@ -0,0 +1,15 @@ +root = true + +[*] +end_of_line = lf +insert_final_newline = true +trim_trailing_whitespace = true +indent_style = space +charset = utf-8 + +[*.md] +indent_size = 2 +trim_trailing_whitespace = false + +[*.yaml] +indent_size = 2 -- 2.47.0 From a6453711d8f591593dfa78900ffb713f13d6b310 Mon Sep 17 00:00:00 2001 From: June Date: Sat, 23 Nov 2024 02:31:31 +0100 Subject: [PATCH 02/10] add .yamllint.yaml for some nicer yaml configuration for ansible-lint --- .yamllint.yaml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .yamllint.yaml diff --git a/.yamllint.yaml b/.yamllint.yaml new file mode 100644 index 0000000..7ad950c --- /dev/null +++ b/.yamllint.yaml @@ -0,0 +1,6 @@ +rules: + brackets: + min-spaces-inside: 1 + max-spaces-inside: 1 + min-spaces-inside-empty: 1 + max-spaces-inside-empty: 1 -- 2.47.0 From 4060dbbe21386abf6a02d8fafd00cd501297ba03 Mon Sep 17 00:00:00 2001 From: June Date: Sat, 23 Nov 2024 02:49:23 +0100 Subject: [PATCH 03/10] fix all ansible-lint yaml errors (except for line-length) --- .../chaosknoten/host_vars/ccchoir.yaml | 2 +- inventories/chaosknoten/host_vars/pad.yaml | 2 +- .../chaosknoten/host_vars/pretalx.yaml | 2 +- inventories/chaosknoten/host_vars/zammad.yaml | 2 +- .../docker_compose/grafana-datasource.yml | 13 +- .../grafana/docker_compose/prometheus.yml | 204 +-- .../prometheus_alerts.rules.yaml | 1160 ++++++++--------- .../configs/lists/compose/compose.yaml | 60 +- .../apt_update_and_upgrade/tasks/main.yaml | 20 +- .../deploy_ssh_server_config/tasks/main.yaml | 32 +- 
.../tasks/main.yaml | 1 - playbooks/roles/nextcloud/meta/main.yaml | 4 +- playbooks/roles/nginx/defaults/main.yaml | 2 +- .../roles/nginx/tasks/main/config_deploy.yaml | 14 +- requirements.yml | 2 +- 15 files changed, 759 insertions(+), 761 deletions(-) diff --git a/inventories/chaosknoten/host_vars/ccchoir.yaml b/inventories/chaosknoten/host_vars/ccchoir.yaml index 87e6696..cd59ea1 100644 --- a/inventories/chaosknoten/host_vars/ccchoir.yaml +++ b/inventories/chaosknoten/host_vars/ccchoir.yaml @@ -1,5 +1,5 @@ docker_compose__compose_file_content: "{{ lookup('ansible.builtin.template', 'chaosknoten/configs/ccchoir/compose.yaml.j2') }}" -docker_compose__configuration_files: [] +docker_compose__configuration_files: [ ] certbot__version_spec: "" certbot__acme_account_email_address: le-admin@hamburg.ccc.de diff --git a/inventories/chaosknoten/host_vars/pad.yaml b/inventories/chaosknoten/host_vars/pad.yaml index ea420a9..01a0d75 100644 --- a/inventories/chaosknoten/host_vars/pad.yaml +++ b/inventories/chaosknoten/host_vars/pad.yaml @@ -1,5 +1,5 @@ docker_compose__compose_file_content: "{{ lookup('ansible.builtin.template', 'chaosknoten/configs/pad/compose.yaml.j2') }}" -docker_compose__configuration_files: [] +docker_compose__configuration_files: [ ] certbot__version_spec: "" certbot__acme_account_email_address: le-admin@hamburg.ccc.de diff --git a/inventories/chaosknoten/host_vars/pretalx.yaml b/inventories/chaosknoten/host_vars/pretalx.yaml index fbc7c57..cd98387 100644 --- a/inventories/chaosknoten/host_vars/pretalx.yaml +++ b/inventories/chaosknoten/host_vars/pretalx.yaml @@ -1,5 +1,5 @@ docker_compose__compose_file_content: "{{ lookup('ansible.builtin.template', 'chaosknoten/configs/pretalx/compose.yaml.j2') }}" -docker_compose__configuration_files: [] +docker_compose__configuration_files: [ ] certbot__version_spec: "" certbot__acme_account_email_address: le-admin@hamburg.ccc.de diff --git a/inventories/chaosknoten/host_vars/zammad.yaml 
b/inventories/chaosknoten/host_vars/zammad.yaml index d0e1ea8..962df32 100644 --- a/inventories/chaosknoten/host_vars/zammad.yaml +++ b/inventories/chaosknoten/host_vars/zammad.yaml @@ -1,5 +1,5 @@ docker_compose__compose_file_content: "{{ lookup('ansible.builtin.template', 'chaosknoten/configs/zammad/compose.yaml.j2') }}" -docker_compose__configuration_files: [] +docker_compose__configuration_files: [ ] certbot__version_spec: "" certbot__acme_account_email_address: le-admin@hamburg.ccc.de diff --git a/playbooks/files/chaosknoten/configs/grafana/docker_compose/grafana-datasource.yml b/playbooks/files/chaosknoten/configs/grafana/docker_compose/grafana-datasource.yml index ddb52fc..44999d4 100644 --- a/playbooks/files/chaosknoten/configs/grafana/docker_compose/grafana-datasource.yml +++ b/playbooks/files/chaosknoten/configs/grafana/docker_compose/grafana-datasource.yml @@ -1,10 +1,9 @@ apiVersion: 1 datasources: -- name: Prometheus - type: prometheus - url: http://prometheus:9090 - isDefault: true - access: proxy - editable: true - + - name: Prometheus + type: prometheus + url: http://prometheus:9090 + isDefault: true + access: proxy + editable: true diff --git a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml index 9ce796f..0dad0a0 100644 --- a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml +++ b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus.yml @@ -5,110 +5,110 @@ global: alerting: alertmanagers: - - scheme: http - timeout: 10s - static_configs: - - targets: - - "alertmanager:9093" + - scheme: http + timeout: 10s + static_configs: + - targets: + - "alertmanager:9093" rule_files: - "/etc/prometheus/rules/*.rules.yaml" scrape_configs: -- job_name: prometheus - honor_timestamps: true - metrics_path: /metrics - scheme: http - static_configs: - - targets: - - localhost:9090 -- job_name: alertmanager - honor_timestamps: 
true - metrics_path: /metrics - scheme: http - static_configs: - - targets: - - alertmanager:9093 -- job_name: c3lingo - honor_timestamps: true - scrape_interval: 5s - scrape_timeout: 1s - metrics_path: /mumblestats/metrics - scheme: https - static_configs: - - targets: - - mumble.c3lingo.org:443 -- job_name: mumble - honor_timestamps: true - scrape_interval: 5s - scrape_timeout: 1s - metrics_path: /metrics - scheme: https - static_configs: - - targets: - - mumble.hamburg.ccc.de:443 -- job_name: opnsense-ccchh - honor_timestamps: true - metrics_path: /metrics - scheme: http - static_configs: - - targets: - - 185.161.129.132:9100 -- job_name: jitsi - honor_timestamps: true - scrape_interval: 5s - scrape_timeout: 1s - metrics_path: /metrics - scheme: http - static_configs: - - targets: - - jitsi.hamburg.ccc.de:9888 # Jitsi Video Bridge -- job_name: 'pve' - static_configs: - - targets: - - 212.12.48.126 # chaosknoten - metrics_path: /pve - params: - module: [default] - cluster: ['1'] - node: ['1'] - relabel_configs: - - source_labels: [__address__] - target_label: __param_target - - source_labels: [__param_target] - target_label: instance - - target_label: __address__ - replacement: pve-exporter:9221 -- job_name: hosts - static_configs: - # Wieske Chaosknoten VMs - - labels: - site: wieske - type: virtual_machine - hypervisor: chaosknoten - targets: - - netbox-intern.hamburg.ccc.de:9100 - - matrix-intern.hamburg.ccc.de:9100 - - public-web-static-intern.hamburg.ccc.de:9100 - - git-intern.hamburg.ccc.de:9100 - - forgejo-actions-runner-intern.hamburg.ccc.de:9100 - - eh22-wiki-intern.hamburg.ccc.de:9100 - - nix-box-june-intern.hamburg.ccc.de:9100 - - mjolnir-intern.hamburg.ccc.de:9100 - - woodpecker-intern.hamburg.ccc.de:9100 - - penpot-intern.hamburg.ccc.de:9100 - - jitsi.hamburg.ccc.de:9100 - - onlyoffice-intern.hamburg.ccc.de:9100 - - ccchoir-intern.hamburg.ccc.de:9100 - - tickets-intern.hamburg.ccc.de:9100 - - keycloak-intern.hamburg.ccc.de:9100 - - 
onlyoffice-intern.hamburg.ccc.de:9100 - - pad-intern.hamburg.ccc.de:9100 - - wiki-intern.hamburg.ccc.de:9100 - - zammad-intern.hamburg.ccc.de:9100 - - pretalx-intern.hamburg.ccc.de:9100 - - labels: - site: wieske - type: physical_machine - targets: - - chaosknoten.hamburg.ccc.de:9100 + - job_name: prometheus + honor_timestamps: true + metrics_path: /metrics + scheme: http + static_configs: + - targets: + - localhost:9090 + - job_name: alertmanager + honor_timestamps: true + metrics_path: /metrics + scheme: http + static_configs: + - targets: + - alertmanager:9093 + - job_name: c3lingo + honor_timestamps: true + scrape_interval: 5s + scrape_timeout: 1s + metrics_path: /mumblestats/metrics + scheme: https + static_configs: + - targets: + - mumble.c3lingo.org:443 + - job_name: mumble + honor_timestamps: true + scrape_interval: 5s + scrape_timeout: 1s + metrics_path: /metrics + scheme: https + static_configs: + - targets: + - mumble.hamburg.ccc.de:443 + - job_name: opnsense-ccchh + honor_timestamps: true + metrics_path: /metrics + scheme: http + static_configs: + - targets: + - 185.161.129.132:9100 + - job_name: jitsi + honor_timestamps: true + scrape_interval: 5s + scrape_timeout: 1s + metrics_path: /metrics + scheme: http + static_configs: + - targets: + - jitsi.hamburg.ccc.de:9888 # Jitsi Video Bridge + - job_name: 'pve' + static_configs: + - targets: + - 212.12.48.126 # chaosknoten + metrics_path: /pve + params: + module: [ default ] + cluster: [ '1' ] + node: [ '1' ] + relabel_configs: + - source_labels: [ __address__ ] + target_label: __param_target + - source_labels: [ __param_target ] + target_label: instance + - target_label: __address__ + replacement: pve-exporter:9221 + - job_name: hosts + static_configs: + # Wieske Chaosknoten VMs + - labels: + site: wieske + type: virtual_machine + hypervisor: chaosknoten + targets: + - netbox-intern.hamburg.ccc.de:9100 + - matrix-intern.hamburg.ccc.de:9100 + - public-web-static-intern.hamburg.ccc.de:9100 + - 
git-intern.hamburg.ccc.de:9100 + - forgejo-actions-runner-intern.hamburg.ccc.de:9100 + - eh22-wiki-intern.hamburg.ccc.de:9100 + - nix-box-june-intern.hamburg.ccc.de:9100 + - mjolnir-intern.hamburg.ccc.de:9100 + - woodpecker-intern.hamburg.ccc.de:9100 + - penpot-intern.hamburg.ccc.de:9100 + - jitsi.hamburg.ccc.de:9100 + - onlyoffice-intern.hamburg.ccc.de:9100 + - ccchoir-intern.hamburg.ccc.de:9100 + - tickets-intern.hamburg.ccc.de:9100 + - keycloak-intern.hamburg.ccc.de:9100 + - onlyoffice-intern.hamburg.ccc.de:9100 + - pad-intern.hamburg.ccc.de:9100 + - wiki-intern.hamburg.ccc.de:9100 + - zammad-intern.hamburg.ccc.de:9100 + - pretalx-intern.hamburg.ccc.de:9100 + - labels: + site: wieske + type: physical_machine + targets: + - chaosknoten.hamburg.ccc.de:9100 diff --git a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml index fe12bfd..65b3590 100644 --- a/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml +++ b/playbooks/files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml @@ -1,583 +1,583 @@ # Links & Resources: # - https://samber.github.io/awesome-prometheus-alerts/rules groups: -- name: node-exporter - rules: - - alert: HostOutOfMemory - expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} - for: 2m - labels: - severity: warning - annotations: - summary: Host out of memory (instance {{ $labels.instance }}) - description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}" - - alert: HostMemoryUnderMemoryPressure - expr: (rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} - for: 2m - labels: - severity: warning - annotations: - summary: Host memory under memory pressure (instance {{ $labels.instance }}) - 
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}" - # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly - - alert: HostMemoryIsUnderutilized - expr: (100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} - for: 1w - labels: - severity: info - annotations: - summary: Host Memory is underutilized (instance {{ $labels.instance }}) - description: "Node memory is < 10% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}" - - alert: HostUnusualNetworkThroughputIn - expr: (sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} - for: 5m - labels: - severity: warning - annotations: - summary: Host unusual network throughput in (instance {{ $labels.instance }}) - description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}" - - alert: HostUnusualNetworkThroughputOut - expr: (sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} - for: 5m - labels: - severity: warning - annotations: - summary: Host unusual network throughput out (instance {{ $labels.instance }}) - description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}" - # Have different disk read and write rate alerts for VMs and physical machines. 
- - alert: VirtualHostUnusualDiskReadRate - expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{ype="virtual_machine", nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker"} - for: 5m - labels: - severity: warning - annotations: - summary: Virtual host unusual disk read rate (instance {{ $labels.instance }}) - description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}" - - alert: VirtualHostUnusualDiskWriteRate - expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker"} - for: 2m - labels: - severity: warning - annotations: - summary: Virtual host unusual disk write rate (instance {{ $labels.instance }}) - description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}" - # Some VMs are expected to have high Read / Write rates z.B. 
CI servers - - alert: VirtualHostUnusualDiskReadRate - expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{ype="virtual_machine", nodename="forgejo-actions-runner", nodename="woodpecker"} - for: 10m - labels: - severity: warning - annotations: - summary: Virtual host unusual disk read rate for 10 min (instance {{ $labels.instance }}) - description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}" - - alert: VirtualHostUnusualDiskWriteRate - expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename="forgejo-actions-runner", nodename="woodpecker"} - for: 4m - labels: - severity: warning - annotations: - summary: Virtual host unusual disk write rate for 4 min (instance {{ $labels.instance }}) - description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}" - - alert: PhysicalHostUnusualDiskReadRate - expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"} - for: 20m - labels: - severity: warning - annotations: - summary: Physical host unusual disk read rate (instance {{ $labels.instance }}) - description: "Disk is probably reading too much data (> 100 MB/s)\n VALUE = {{ $value }}" - - alert: PhysicalHostUnusualDiskWriteRate - expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"} - for: 15m - labels: - severity: warning - annotations: - summary: Physical host unusual disk write rate (instance {{ $labels.instance }}) - description: "Disk is probably writing too much data (> 100 MB/s)\n VALUE = {{ $value }}" - # Please add ignored mountpoints in node_exporter parameters like - 
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". - # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. - - alert: HostOutOfDiskSpace - expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} - for: 2m - labels: - severity: warning - annotations: - summary: Host out of disk space (instance {{ $labels.instance }}) - description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}" - # Please add ignored mountpoints in node_exporter parameters like - # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". - # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. - - alert: HostDiskWillFillIn24Hours - expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} - for: 2m - labels: - severity: warning - annotations: - summary: Host disk will fill in 24 hours (instance {{ $labels.instance }}) - description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}" - - alert: HostOutOfInodes - expr: (node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} - for: 2m - labels: - severity: warning - annotations: - summary: Host out of inodes (instance {{ $labels.instance }}) - description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}" - - alert: 
HostInodesWillFillIn24Hours - expr: (node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} - for: 2m - labels: - severity: warning - annotations: - summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }}) - description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}" - - alert: HostFilesystemDeviceError - expr: node_filesystem_device_error == 1 - for: 2m - labels: - severity: critical - annotations: - summary: Host filesystem device error (instance {{ $labels.instance }}) - description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}" - - alert: HostUnusualDiskReadLatency - expr: (rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} - for: 2m - labels: - severity: warning - annotations: - summary: Host unusual disk read latency (instance {{ $labels.instance }}) - description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}" - - alert: HostUnusualDiskWriteLatency - expr: (rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} - for: 2m - labels: - severity: warning - annotations: - summary: Host unusual disk write latency (instance {{ $labels.instance }}) - description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}" - - alert: HostHighCpuLoad - expr: (sum by (instance) 
(avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} - for: 10m - labels: - severity: warning - annotations: - summary: Host high CPU load (instance {{ $labels.instance }}) - description: "CPU load is > 80%\n VALUE = {{ $value }}" - # We might want to introduce that later, tho maybe excluding hosts with one core, if possible and only for VMs? - # # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly - # - alert: HostCpuIsUnderutilized - # expr: (100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} - # for: 1w - # labels: - # severity: info - # annotations: - # summary: Host CPU is underutilized (instance {{ $labels.instance }}) - # description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}" - - alert: HostCpuStealNoisyNeighbor - expr: (avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} - for: 0m - labels: - severity: warning - annotations: - summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }}) - description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}" - - alert: HostCpuHighIowait - expr: (avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} - for: 0m - labels: - severity: warning - annotations: - summary: Host CPU high iowait (instance {{ $labels.instance }}) - description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}" - # Have different disk IO alerts for VMs and physical machines and for physical machines different ones for hard and other disks. 
- - alert: PhysicalHostUnusualHardDiskIo - expr: (rate(node_disk_io_time_seconds_total{device=~"s.+"}[1m]) > 0.75) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"} - for: 5m - labels: - severity: warning - annotations: - summary: Physical host unusual hard disk IO (instance {{ $labels.instance }}) - description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}" - - alert: PhysicalHostUnusualOtherDiskIo - expr: (rate(node_disk_io_time_seconds_total{device!~"s.+"}[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"} - for: 5m - labels: - severity: warning - annotations: - summary: Physical host unusual other (non-hard) disk IO (instance {{ $labels.instance }}) - description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}" - - alert: VirtualHostUnusualDiskIo - expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"} - for: 5m - labels: - severity: warning - annotations: - summary: Virtual host unusual disk IO (instance {{ $labels.instance }}) - description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}" - # # x2 context switches is an arbitrary number. - # # The alert threshold depends on the nature of the application. 
- # # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58 - # - alert: HostContextSwitchingHigh - # expr: (rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2 - # for: 0m - # labels: - # severity: warning - # annotations: - # summary: Host context switching high (instance {{ $labels.instance }}) - # description: "Context switching is growing on the node (twice the daily average during the last 15m)\n VALUE = {{ $value }}" - - alert: HostSwapIsFillingUp - expr: ((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} - for: 2m - labels: - severity: warning - annotations: - summary: Host swap is filling up (instance {{ $labels.instance }}) - description: "Swap is filling up (>80%)\n VALUE = {{ $value }}" - - alert: HostSystemdServiceCrashed - expr: (node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} - for: 0m - labels: - severity: warning - annotations: - summary: Host systemd service crashed (instance {{ $labels.instance }}) - description: "systemd service crashed\n VALUE = {{ $value }}" - - alert: HostPhysicalComponentTooHot - expr: ((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} - for: 5m - labels: - severity: warning - annotations: - summary: Host physical component too hot (instance {{ $labels.instance }}) - description: "Physical hardware component too hot\n VALUE = {{ $value }}" - - alert: HostNodeOvertemperatureAlarm - expr: ((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} - for: 0m - 
labels: - severity: critical - annotations: - summary: Host node overtemperature alarm (instance {{ $labels.instance }}) - description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}" - - alert: HostRaidArrayGotInactive - expr: (node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} - for: 0m - labels: - severity: critical - annotations: - summary: Host RAID array got inactive (instance {{ $labels.instance }}) - description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}" - - alert: HostRaidDiskFailure - expr: (node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} - for: 2m - labels: - severity: warning - annotations: - summary: Host RAID disk failure (instance {{ $labels.instance }}) - description: "At least one device in RAID array on {{ $labels.instance }} failed. 
Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}" - - alert: HostKernelVersionDeviations - expr: (count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} - for: 6h - labels: - severity: warning - annotations: - summary: Host kernel version deviations (instance {{ $labels.instance }}) - description: "Different kernel versions are running\n VALUE = {{ $value }}" - - alert: HostOomKillDetected - expr: (increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} - for: 0m - labels: - severity: warning - annotations: - summary: Host OOM kill detected (instance {{ $labels.instance }}) - description: "OOM kill detected\n VALUE = {{ $value }}" - - alert: HostEdacCorrectableErrorsDetected - expr: (increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} - for: 0m - labels: - severity: info - annotations: - summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }}) - description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}" - - alert: HostEdacUncorrectableErrorsDetected - expr: (node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} - for: 0m - labels: - severity: warning - annotations: - summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}) - description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}" - - alert: HostNetworkReceiveErrors - expr: (rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left 
(nodename) node_uname_info{nodename=~".+"} - for: 2m - labels: - severity: warning - annotations: - summary: Host Network Receive Errors (instance {{ $labels.instance }}) - description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}" - - alert: HostNetworkTransmitErrors - expr: (rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} - for: 2m - labels: - severity: warning - annotations: - summary: Host Network Transmit Errors (instance {{ $labels.instance }}) - description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}" - - alert: HostNetworkBondDegraded - expr: ((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} - for: 2m - labels: - severity: warning - annotations: - summary: Host Network Bond Degraded (instance {{ $labels.instance }}) - description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}" - - alert: HostConntrackLimit - expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} - for: 5m - labels: - severity: warning - annotations: - summary: Host conntrack limit (instance {{ $labels.instance }}) - description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}" - - alert: HostClockSkew - expr: ((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} - for: 10m - labels: - severity: warning - annotations: - 
summary: Host clock skew (instance {{ $labels.instance }}) - description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}" - - alert: HostClockNotSynchronising - expr: (min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} - for: 2m - labels: - severity: warning - annotations: - summary: Host clock not synchronising (instance {{ $labels.instance }}) - description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}" - - alert: HostRequiresReboot - expr: (node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} - for: 4h - labels: - severity: info - annotations: - summary: Host requires reboot (instance {{ $labels.instance }}) - description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}" -- name: prometheus - rules: - - alert: PrometheusJobMissing - expr: absent(up{job="prometheus"}) - for: 0m - labels: - severity: warning - annotations: - summary: Prometheus job missing (instance {{ $labels.instance }}) - description: "A Prometheus job has disappeared\n VALUE = {{ $value }}" - - alert: PrometheusTargetMissing - expr: up == 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus target missing (instance {{ $labels.instance }}) - description: "A Prometheus target has disappeared. 
An exporter might be crashed.\n VALUE = {{ $value }}" - - alert: PrometheusAllTargetsMissing - expr: sum by (job) (up) == 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus all targets missing (instance {{ $labels.instance }}) - description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}" - - alert: PrometheusConfigurationReloadFailure - expr: prometheus_config_last_reload_successful != 1 - for: 0m - labels: - severity: warning - annotations: - summary: Prometheus configuration reload failure (instance {{ $labels.instance }}) - description: "Prometheus configuration reload error\n VALUE = {{ $value }}" - - alert: PrometheusTooManyRestarts - expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2 - for: 0m - labels: - severity: warning - annotations: - summary: Prometheus too many restarts (instance {{ $labels.instance }}) - description: "Prometheus has restarted more than twice in the last 15 minutes. 
It might be crashlooping.\n VALUE = {{ $value }}" - - alert: PrometheusAlertmanagerJobMissing - expr: absent(up{job="alertmanager"}) - for: 0m - labels: - severity: warning - annotations: - summary: Prometheus AlertManager job missing (instance {{ $labels.instance }}) - description: "A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}" - - alert: PrometheusAlertmanagerConfigurationReloadFailure - expr: alertmanager_config_last_reload_successful != 1 - for: 0m - labels: - severity: warning - annotations: - summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}) - description: "AlertManager configuration reload error\n VALUE = {{ $value }}" - - alert: PrometheusAlertmanagerConfigNotSynced - expr: count(count_values("config_hash", alertmanager_config_hash)) > 1 - for: 0m - labels: - severity: warning - annotations: - summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }}) - description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}" - # For testing. - # - alert: PrometheusAlertmanagerE2eDeadManSwitch - # expr: vector(1) - # for: 0m - # labels: - # severity: critical - # annotations: - # summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }}) - # description: "Prometheus DeadManSwitch is an always-firing alert. 
It's used as an end-to-end test of Prometheus through the Alertmanager.\n VALUE = {{ $value }}" - - alert: PrometheusNotConnectedToAlertmanager - expr: prometheus_notifications_alertmanagers_discovered < 1 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }}) - description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}" - - alert: PrometheusRuleEvaluationFailures - expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus rule evaluation failures (instance {{ $labels.instance }}) - description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}" - - alert: PrometheusTemplateTextExpansionFailures - expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus template text expansion failures (instance {{ $labels.instance }}) - description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}" - - alert: PrometheusRuleEvaluationSlow - expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds - for: 5m - labels: - severity: warning - annotations: - summary: Prometheus rule evaluation slow (instance {{ $labels.instance }}) - description: "Prometheus rule evaluation took more time than the scheduled interval. 
It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}" - - alert: PrometheusNotificationsBacklog - expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0 - for: 0m - labels: - severity: warning - annotations: - summary: Prometheus notifications backlog (instance {{ $labels.instance }}) - description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}" - - alert: PrometheusAlertmanagerNotificationFailing - expr: rate(alertmanager_notifications_failed_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }}) - description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}" - - alert: PrometheusTargetEmpty - expr: prometheus_sd_discovered_targets == 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus target empty (instance {{ $labels.instance }}) - description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}" - - alert: PrometheusTargetScrapingSlow - expr: prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05 - for: 5m - labels: - severity: warning - annotations: - summary: Prometheus target scraping slow (instance {{ $labels.instance }}) - description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. 
Your Prometheus server is under-provisioned.\n VALUE = {{ $value }}" - - alert: PrometheusLargeScrape - expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10 - for: 5m - labels: - severity: warning - annotations: - summary: Prometheus large scrape (instance {{ $labels.instance }}) - description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}" - - alert: PrometheusTargetScrapeDuplicate - expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0 - for: 0m - labels: - severity: warning - annotations: - summary: Prometheus target scrape duplicate (instance {{ $labels.instance }}) - description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}" - - alert: PrometheusTsdbCheckpointCreationFailures - expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }}) - description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}" - - alert: PrometheusTsdbCheckpointDeletionFailures - expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }}) - description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}" - - alert: PrometheusTsdbCompactionsFailed - expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }}) - description: "Prometheus encountered {{ $value }} TSDB compactions failures\n VALUE = {{ $value }}" - - alert: PrometheusTsdbHeadTruncationsFailed - expr: 
increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }}) - description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}" - - alert: PrometheusTsdbReloadFailures - expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus TSDB reload failures (instance {{ $labels.instance }}) - description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}" - - alert: PrometheusTsdbWalCorruptions - expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}) - description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}" - - alert: PrometheusTsdbWalTruncationsFailed - expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}) - description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}" - - alert: PrometheusTimeseriesCardinality - expr: label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000 - for: 0m - labels: - severity: warning - annotations: - summary: Prometheus timeseries cardinality (instance {{ $labels.instance }}) - description: "The \"{{ $labels.name }}\" timeseries cardinality is getting very high: {{ $value }}\n VALUE = {{ $value }}" + - name: node-exporter + rules: + - alert: HostOutOfMemory + expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + 
annotations: + summary: Host out of memory (instance {{ $labels.instance }}) + description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}" + - alert: HostMemoryUnderMemoryPressure + expr: (rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host memory under memory pressure (instance {{ $labels.instance }}) + description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}" + # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly + - alert: HostMemoryIsUnderutilized + expr: (100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 1w + labels: + severity: info + annotations: + summary: Host Memory is underutilized (instance {{ $labels.instance }}) + description: "Node memory is < 10% for 1 week. Consider reducing memory space. 
(instance {{ $labels.instance }})\n VALUE = {{ $value }}" + - alert: HostUnusualNetworkThroughputIn + expr: (sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 5m + labels: + severity: warning + annotations: + summary: Host unusual network throughput in (instance {{ $labels.instance }}) + description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}" + - alert: HostUnusualNetworkThroughputOut + expr: (sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 5m + labels: + severity: warning + annotations: + summary: Host unusual network throughput out (instance {{ $labels.instance }}) + description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}" + # Have different disk read and write rate alerts for VMs and physical machines. 
+      - alert: VirtualHostUnusualDiskReadRate
+        expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker"}
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Virtual host unusual disk read rate (instance {{ $labels.instance }})
+          description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}"
+      - alert: VirtualHostUnusualDiskWriteRate
+        expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker"}
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Virtual host unusual disk write rate (instance {{ $labels.instance }})
+          description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}"
+      # Some VMs (e.g. CI servers) are expected to have high read/write rates, so they get the same thresholds with a longer 'for'.
+      - alert: VirtualHostUnusualDiskReadRate
+        expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~"forgejo-actions-runner|woodpecker"}
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: Virtual host unusual disk read rate for 10 min (instance {{ $labels.instance }})
+          description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}"
+      - alert: VirtualHostUnusualDiskWriteRate
+        expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~"forgejo-actions-runner|woodpecker"}
+        for: 4m
+        labels:
+          severity: warning
+        annotations:
+          summary: Virtual host unusual disk write rate for 4 min (instance {{ $labels.instance }})
+          description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}"
+      - alert: PhysicalHostUnusualDiskReadRate
+        expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
+        for: 20m
+        labels:
+          severity: warning
+        annotations:
+          summary: Physical host unusual disk read rate (instance {{ $labels.instance }})
+          description: "Disk is probably reading too much data (> 100 MB/s)\n VALUE = {{ $value }}"
+      - alert: PhysicalHostUnusualDiskWriteRate
+        expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          summary: Physical host unusual disk write rate (instance {{ $labels.instance }})
+          description: "Disk is probably writing too much data (> 100 MB/s)\n VALUE = {{ $value }}"
+      # Please add ignored mountpoints in node_exporter parameters like
+      # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
+      # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
+      - alert: HostOutOfDiskSpace
+        expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host out of disk space (instance {{ $labels.instance }})
+          description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}"
+      # Please add ignored mountpoints in node_exporter parameters like
+      # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
+      # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
+      - alert: HostDiskWillFillIn24Hours
+        expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
+          description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}"
+      - alert: HostOutOfInodes
+        expr: (node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host out of inodes (instance {{ $labels.instance }})
+          description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}"
+      - alert: HostInodesWillFillIn24Hours
+        expr: (node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
+          description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}"
+      - alert: HostFilesystemDeviceError
+        expr: node_filesystem_device_error == 1
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: Host filesystem device error (instance {{ $labels.instance }})
+          description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}"
+      - alert: HostUnusualDiskReadLatency
+        expr: (rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host unusual disk read latency (instance {{ $labels.instance }})
+          description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}"
+      - alert: HostUnusualDiskWriteLatency
+        expr: (rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host unusual disk write latency (instance {{ $labels.instance }})
+          description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}"
+      - alert: HostHighCpuLoad
+        expr: (sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host high CPU load (instance {{ $labels.instance }})
+          description: "CPU load is > 80%\n VALUE = {{ $value }}"
+      # We might want to introduce that later, tho maybe excluding hosts with one core, if possible and only for VMs?
+      # # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
+      # - alert: HostCpuIsUnderutilized
+      #   expr: (100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+      #   for: 1w
+      #   labels:
+      #     severity: info
+      #   annotations:
+      #     summary: Host CPU is underutilized (instance {{ $labels.instance }})
+      #     description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}"
+      - alert: HostCpuStealNoisyNeighbor
+        expr: (avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
+          description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}"
+      - alert: HostCpuHighIowait
+        expr: (avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host CPU high iowait (instance {{ $labels.instance }})
+          description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}"
+      # Have different disk IO alerts for VMs and physical machines and for physical machines different ones for hard and other disks.
+      - alert: PhysicalHostUnusualHardDiskIo
+        expr: (rate(node_disk_io_time_seconds_total{device=~"s.+"}[1m]) > 0.75) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Physical host unusual hard disk IO (instance {{ $labels.instance }})
+          description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}"
+      - alert: PhysicalHostUnusualOtherDiskIo
+        expr: (rate(node_disk_io_time_seconds_total{device!~"s.+"}[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Physical host unusual other (non-hard) disk IO (instance {{ $labels.instance }})
+          description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}"
+      - alert: VirtualHostUnusualDiskIo
+        expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"}
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Virtual host unusual disk IO (instance {{ $labels.instance }})
+          description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}"
+      # # x2 context switches is an arbitrary number.
+      # # The alert threshold depends on the nature of the application.
+ # # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58 + # - alert: HostContextSwitchingHigh + # expr: (rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2 + # for: 0m + # labels: + # severity: warning + # annotations: + # summary: Host context switching high (instance {{ $labels.instance }}) + # description: "Context switching is growing on the node (twice the daily average during the last 15m)\n VALUE = {{ $value }}" + - alert: HostSwapIsFillingUp + expr: ((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host swap is filling up (instance {{ $labels.instance }}) + description: "Swap is filling up (>80%)\n VALUE = {{ $value }}" + - alert: HostSystemdServiceCrashed + expr: (node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 0m + labels: + severity: warning + annotations: + summary: Host systemd service crashed (instance {{ $labels.instance }}) + description: "systemd service crashed\n VALUE = {{ $value }}" + - alert: HostPhysicalComponentTooHot + expr: ((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 5m + labels: + severity: warning + annotations: + summary: Host physical component too hot (instance {{ $labels.instance }}) + description: "Physical hardware component too hot\n VALUE = {{ $value }}" + - alert: HostNodeOvertemperatureAlarm + expr: ((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 0m + 
labels: + severity: critical + annotations: + summary: Host node overtemperature alarm (instance {{ $labels.instance }}) + description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}" + - alert: HostRaidArrayGotInactive + expr: (node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 0m + labels: + severity: critical + annotations: + summary: Host RAID array got inactive (instance {{ $labels.instance }}) + description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}" + - alert: HostRaidDiskFailure + expr: (node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host RAID disk failure (instance {{ $labels.instance }}) + description: "At least one device in RAID array on {{ $labels.instance }} failed. 
Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}" + - alert: HostKernelVersionDeviations + expr: (count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 6h + labels: + severity: warning + annotations: + summary: Host kernel version deviations (instance {{ $labels.instance }}) + description: "Different kernel versions are running\n VALUE = {{ $value }}" + - alert: HostOomKillDetected + expr: (increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 0m + labels: + severity: warning + annotations: + summary: Host OOM kill detected (instance {{ $labels.instance }}) + description: "OOM kill detected\n VALUE = {{ $value }}" + - alert: HostEdacCorrectableErrorsDetected + expr: (increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 0m + labels: + severity: info + annotations: + summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}" + - alert: HostEdacUncorrectableErrorsDetected + expr: (node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 0m + labels: + severity: warning + annotations: + summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}" + - alert: HostNetworkReceiveErrors + expr: (rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left 
(nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Receive Errors (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}" + - alert: HostNetworkTransmitErrors + expr: (rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Transmit Errors (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}" + - alert: HostNetworkBondDegraded + expr: ((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Bond Degraded (instance {{ $labels.instance }}) + description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}" + - alert: HostConntrackLimit + expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 5m + labels: + severity: warning + annotations: + summary: Host conntrack limit (instance {{ $labels.instance }}) + description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}" + - alert: HostClockSkew + expr: ((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 10m + labels: + severity: warning + annotations: + 
summary: Host clock skew (instance {{ $labels.instance }}) + description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}" + - alert: HostClockNotSynchronising + expr: (min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 2m + labels: + severity: warning + annotations: + summary: Host clock not synchronising (instance {{ $labels.instance }}) + description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}" + - alert: HostRequiresReboot + expr: (node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 4h + labels: + severity: info + annotations: + summary: Host requires reboot (instance {{ $labels.instance }}) + description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}" + - name: prometheus + rules: + - alert: PrometheusJobMissing + expr: absent(up{job="prometheus"}) + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus job missing (instance {{ $labels.instance }}) + description: "A Prometheus job has disappeared\n VALUE = {{ $value }}" + - alert: PrometheusTargetMissing + expr: up == 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus target missing (instance {{ $labels.instance }}) + description: "A Prometheus target has disappeared. 
An exporter might be crashed.\n VALUE = {{ $value }}" + - alert: PrometheusAllTargetsMissing + expr: sum by (job) (up) == 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus all targets missing (instance {{ $labels.instance }}) + description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}" + - alert: PrometheusConfigurationReloadFailure + expr: prometheus_config_last_reload_successful != 1 + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus configuration reload failure (instance {{ $labels.instance }}) + description: "Prometheus configuration reload error\n VALUE = {{ $value }}" + - alert: PrometheusTooManyRestarts + expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2 + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus too many restarts (instance {{ $labels.instance }}) + description: "Prometheus has restarted more than twice in the last 15 minutes. 
It might be crashlooping.\n VALUE = {{ $value }}" + - alert: PrometheusAlertmanagerJobMissing + expr: absent(up{job="alertmanager"}) + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus AlertManager job missing (instance {{ $labels.instance }}) + description: "A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}" + - alert: PrometheusAlertmanagerConfigurationReloadFailure + expr: alertmanager_config_last_reload_successful != 1 + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}) + description: "AlertManager configuration reload error\n VALUE = {{ $value }}" + - alert: PrometheusAlertmanagerConfigNotSynced + expr: count(count_values("config_hash", alertmanager_config_hash)) > 1 + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }}) + description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}" + # For testing. + # - alert: PrometheusAlertmanagerE2eDeadManSwitch + # expr: vector(1) + # for: 0m + # labels: + # severity: critical + # annotations: + # summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }}) + # description: "Prometheus DeadManSwitch is an always-firing alert. 
It's used as an end-to-end test of Prometheus through the Alertmanager.\n VALUE = {{ $value }}" + - alert: PrometheusNotConnectedToAlertmanager + expr: prometheus_notifications_alertmanagers_discovered < 1 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }}) + description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}" + - alert: PrometheusRuleEvaluationFailures + expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus rule evaluation failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}" + - alert: PrometheusTemplateTextExpansionFailures + expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus template text expansion failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}" + - alert: PrometheusRuleEvaluationSlow + expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds + for: 5m + labels: + severity: warning + annotations: + summary: Prometheus rule evaluation slow (instance {{ $labels.instance }}) + description: "Prometheus rule evaluation took more time than the scheduled interval. 
It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}" + - alert: PrometheusNotificationsBacklog + expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0 + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus notifications backlog (instance {{ $labels.instance }}) + description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}" + - alert: PrometheusAlertmanagerNotificationFailing + expr: rate(alertmanager_notifications_failed_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }}) + description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}" + - alert: PrometheusTargetEmpty + expr: prometheus_sd_discovered_targets == 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus target empty (instance {{ $labels.instance }}) + description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}" + - alert: PrometheusTargetScrapingSlow + expr: prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05 + for: 5m + labels: + severity: warning + annotations: + summary: Prometheus target scraping slow (instance {{ $labels.instance }}) + description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. 
Your Prometheus server is under-provisioned.\n VALUE = {{ $value }}" + - alert: PrometheusLargeScrape + expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10 + for: 5m + labels: + severity: warning + annotations: + summary: Prometheus large scrape (instance {{ $labels.instance }}) + description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}" + - alert: PrometheusTargetScrapeDuplicate + expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0 + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus target scrape duplicate (instance {{ $labels.instance }}) + description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}" + - alert: PrometheusTsdbCheckpointCreationFailures + expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}" + - alert: PrometheusTsdbCheckpointDeletionFailures + expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}" + - alert: PrometheusTsdbCompactionsFailed + expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB compactions failures\n VALUE = {{ $value }}" + - alert: PrometheusTsdbHeadTruncationsFailed + expr: 
increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}" + - alert: PrometheusTsdbReloadFailures + expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB reload failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}" + - alert: PrometheusTsdbWalCorruptions + expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}" + - alert: PrometheusTsdbWalTruncationsFailed + expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}" + - alert: PrometheusTimeseriesCardinality + expr: label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000 + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus timeseries cardinality (instance {{ $labels.instance }}) + description: "The \"{{ $labels.name }}\" timeseries cardinality is getting very high: {{ $value }}\n VALUE = {{ $value }}" diff --git a/playbooks/files/chaosknoten/configs/lists/compose/compose.yaml b/playbooks/files/chaosknoten/configs/lists/compose/compose.yaml index 2ef87aa..232627a 100644 --- a/playbooks/files/chaosknoten/configs/lists/compose/compose.yaml +++ 
b/playbooks/files/chaosknoten/configs/lists/compose/compose.yaml @@ -5,21 +5,21 @@ services: container_name: mailman-core hostname: mailman-core volumes: - - /opt/mailman/core:/opt/mailman/ + - /opt/mailman/core:/opt/mailman/ stop_grace_period: 30s links: - - database:database + - database:database depends_on: - - database + - database environment: - - DATABASE_URL=postgresql://mailman:wvQjbMRnwFuxGEPz@database/mailmandb - - DATABASE_TYPE=postgres - - DATABASE_CLASS=mailman.database.postgresql.PostgreSQLDatabase - - HYPERKITTY_API_KEY=ITfRjushI6FP0TLMnRpZxlfB2e17DN86 - - MTA=postfix + - DATABASE_URL=postgresql://mailman:wvQjbMRnwFuxGEPz@database/mailmandb + - DATABASE_TYPE=postgres + - DATABASE_CLASS=mailman.database.postgresql.PostgreSQLDatabase + - HYPERKITTY_API_KEY=ITfRjushI6FP0TLMnRpZxlfB2e17DN86 + - MTA=postfix ports: - - "127.0.0.1:8001:8001" # API - - "127.0.0.1:8024:8024" # LMTP - incoming emails + - "127.0.0.1:8001:8001" # API + - "127.0.0.1:8024:8024" # LMTP - incoming emails networks: mailman: @@ -29,36 +29,36 @@ services: container_name: mailman-web hostname: mailman-web depends_on: - - database + - database links: - - mailman-core:mailman-core - - database:database + - mailman-core:mailman-core + - database:database volumes: - - /opt/mailman/web:/opt/mailman-web-data + - /opt/mailman/web:/opt/mailman-web-data environment: - - DATABASE_TYPE=postgres - - DATABASE_URL=postgresql://mailman:wvQjbMRnwFuxGEPz@database/mailmandb - - "DJANGO_ALLOWED_HOSTS=lists.hamburg.ccc.de,lists.c3lingo.org" - - HYPERKITTY_API_KEY=ITfRjushI6FP0TLMnRpZxlfB2e17DN86 - - SERVE_FROM_DOMAIN=lists.hamburg.ccc.de - - SECRET_KEY=ugfknEYBaFVc62R1jlIjnkizQaqr7tSt - - MAILMAN_ADMIN_USER=ccchh-admin - - MAILMAN_ADMIN_EMAIL=tony@cowtest.hamburg.ccc.de + - DATABASE_TYPE=postgres + - DATABASE_URL=postgresql://mailman:wvQjbMRnwFuxGEPz@database/mailmandb + - "DJANGO_ALLOWED_HOSTS=lists.hamburg.ccc.de,lists.c3lingo.org" + - HYPERKITTY_API_KEY=ITfRjushI6FP0TLMnRpZxlfB2e17DN86 + - 
SERVE_FROM_DOMAIN=lists.hamburg.ccc.de + - SECRET_KEY=ugfknEYBaFVc62R1jlIjnkizQaqr7tSt + - MAILMAN_ADMIN_USER=ccchh-admin + - MAILMAN_ADMIN_EMAIL=tony@cowtest.hamburg.ccc.de ports: - - "127.0.0.1:8000:8000" # HTTP - - "127.0.0.1:8080:8080" # uwsgi + - "127.0.0.1:8000:8000" # HTTP + - "127.0.0.1:8080:8080" # uwsgi networks: mailman: database: restart: unless-stopped environment: - - POSTGRES_DB=mailmandb - - POSTGRES_USER=mailman - - POSTGRES_PASSWORD=wvQjbMRnwFuxGEPz + - POSTGRES_DB=mailmandb + - POSTGRES_USER=mailman + - POSTGRES_PASSWORD=wvQjbMRnwFuxGEPz image: postgres:12-alpine volumes: - - /opt/mailman/database:/var/lib/postgresql/data + - /opt/mailman/database:/var/lib/postgresql/data networks: mailman: @@ -68,5 +68,5 @@ networks: ipam: driver: default config: - - - subnet: 172.19.199.0/24 + - + subnet: 172.19.199.0/24 diff --git a/playbooks/roles/apt_update_and_upgrade/tasks/main.yaml b/playbooks/roles/apt_update_and_upgrade/tasks/main.yaml index cdc9922..f63436b 100644 --- a/playbooks/roles/apt_update_and_upgrade/tasks/main.yaml +++ b/playbooks/roles/apt_update_and_upgrade/tasks/main.yaml @@ -1,15 +1,15 @@ - name: update, upgrade and potentially reboot become: true block: - - name: apt-get update - ansible.builtin.apt: - update-cache: true + - name: apt-get update + ansible.builtin.apt: + update-cache: true - - name: apt-get dist-upgrade - ansible.builtin.apt: - upgrade: dist - register: apt_update_and_upgrade__upgrade_result + - name: apt-get dist-upgrade + ansible.builtin.apt: + upgrade: dist + register: apt_update_and_upgrade__upgrade_result - - name: reboot, after package upgrade - ansible.builtin.reboot: - when: apt_update_and_upgrade__upgrade_result.changed + - name: reboot, after package upgrade + ansible.builtin.reboot: + when: apt_update_and_upgrade__upgrade_result.changed diff --git a/playbooks/roles/deploy_ssh_server_config/tasks/main.yaml b/playbooks/roles/deploy_ssh_server_config/tasks/main.yaml index 12676dc..0492a35 100644 --- 
a/playbooks/roles/deploy_ssh_server_config/tasks/main.yaml +++ b/playbooks/roles/deploy_ssh_server_config/tasks/main.yaml @@ -3,21 +3,21 @@ become: true block: - - name: deploy `sshd_config` - ansible.builtin.template: - force: true - dest: /etc/ssh/sshd_config - mode: 0644 - owner: root - group: root - src: sshd_config.j2 - register: deploy_ssh_server_config__ssh_config_copy_result + - name: deploy `sshd_config` + ansible.builtin.template: + force: true + dest: /etc/ssh/sshd_config + mode: "0644" + owner: root + group: root + src: sshd_config.j2 + register: deploy_ssh_server_config__ssh_config_copy_result - - name: deactivate short moduli - ansible.builtin.shell: - cmd: awk '$5 >= 3071' /etc/ssh/moduli > /etc/ssh/moduli.tmp && mv /etc/ssh/moduli.tmp /etc/ssh/moduli + - name: deactivate short moduli + ansible.builtin.shell: + cmd: awk '$5 >= 3071' /etc/ssh/moduli > /etc/ssh/moduli.tmp && mv /etc/ssh/moduli.tmp /etc/ssh/moduli - # Rebooting here instead of restarting the ssh service, since I don't know how Ansible reacts, when it restarts the service it probably needs for the connection. - - name: reboot, if ssh server config got changed - ansible.builtin.reboot: - when: deploy_ssh_server_config__ssh_config_copy_result.changed + # Rebooting here instead of restarting the ssh service, since I don't know how Ansible reacts, when it restarts the service it probably needs for the connection. 
+ - name: reboot, if ssh server config got changed + ansible.builtin.reboot: + when: deploy_ssh_server_config__ssh_config_copy_result.changed diff --git a/playbooks/roles/infrastructure_authorized_keys/tasks/main.yaml b/playbooks/roles/infrastructure_authorized_keys/tasks/main.yaml index c363ce7..982c7a0 100644 --- a/playbooks/roles/infrastructure_authorized_keys/tasks/main.yaml +++ b/playbooks/roles/infrastructure_authorized_keys/tasks/main.yaml @@ -4,4 +4,3 @@ user: chaos exclusive: true key: https://git.hamburg.ccc.de/CCCHH/infrastructure-authorized-keys/raw/branch/trunk/authorized_keys - \ No newline at end of file diff --git a/playbooks/roles/nextcloud/meta/main.yaml b/playbooks/roles/nextcloud/meta/main.yaml index 9138dfe..34f476a 100644 --- a/playbooks/roles/nextcloud/meta/main.yaml +++ b/playbooks/roles/nextcloud/meta/main.yaml @@ -11,10 +11,10 @@ dependencies: - role: nginx vars: nginx__version_spec: "{{ nextcloud__nginx_version_spec }}" - nginx__configurations: + nginx__configurations: - name: "{{ nextcloud__fqdn }}" content: "{{ lookup('ansible.builtin.template', 'nginx_nextcloud.conf.j2') }}" - role: docker_compose vars: docker_compose__compose_file_content: "{{ lookup('ansible.builtin.template', 'compose.yaml.j2') }}" - docker_compose__configuration_files: [] + docker_compose__configuration_files: [ ] diff --git a/playbooks/roles/nginx/defaults/main.yaml b/playbooks/roles/nginx/defaults/main.yaml index e20777c..6ccfac4 100644 --- a/playbooks/roles/nginx/defaults/main.yaml +++ b/playbooks/roles/nginx/defaults/main.yaml @@ -1,5 +1,5 @@ nginx__deploy_redirect_conf: true nginx__deploy_tls_conf: true -nginx__configurations: [] +nginx__configurations: [ ] nginx__use_custom_nginx_conf: false nginx__custom_nginx_conf: "" diff --git a/playbooks/roles/nginx/tasks/main/config_deploy.yaml b/playbooks/roles/nginx/tasks/main/config_deploy.yaml index 100696e..c7fac39 100644 --- a/playbooks/roles/nginx/tasks/main/config_deploy.yaml +++ 
b/playbooks/roles/nginx/tasks/main/config_deploy.yaml @@ -11,7 +11,7 @@ ansible.builtin.copy: force: true dest: /etc/nginx/nginx.conf.ansiblesave - mode: 0644 + mode: "0644" owner: root group: root remote_src: true @@ -22,7 +22,7 @@ ansible.builtin.copy: content: "{{ nginx__custom_nginx_conf }}" dest: "/etc/nginx/nginx.conf" - mode: 0644 + mode: "0644" owner: root group: root become: true @@ -36,7 +36,7 @@ ansible.builtin.copy: force: true dest: /etc/nginx/nginx.conf - mode: 0644 + mode: "0644" owner: root group: root remote_src: true @@ -55,7 +55,7 @@ ansible.builtin.get_url: force: true dest: /etc/nginx-mozilla-dhparam - mode: 0644 + mode: "0644" url: https://ssl-config.mozilla.org/ffdhe2048.txt become: true notify: Restart `nginx.service` @@ -71,7 +71,7 @@ ansible.builtin.copy: force: true dest: /etc/nginx/conf.d/tls.conf - mode: 0644 + mode: "0644" owner: root group: root src: tls.conf @@ -89,7 +89,7 @@ ansible.builtin.copy: force: true dest: /etc/nginx/conf.d/redirect.conf - mode: 0644 + mode: "0644" owner: root group: root src: redirect.conf @@ -104,7 +104,7 @@ ansible.builtin.copy: content: "{{ item.content }}" dest: "/etc/nginx/conf.d/{{ item.name }}.conf" - mode: 0644 + mode: "0644" owner: root group: root become: true diff --git a/requirements.yml b/requirements.yml index e66cdcb..d5ebdfc 100644 --- a/requirements.yml +++ b/requirements.yml @@ -2,4 +2,4 @@ collections: # Install a collection from Ansible Galaxy. 
- name: debops.debops version: ">=3.1.0" - source: https://galaxy.ansible.com \ No newline at end of file + source: https://galaxy.ansible.com -- 2.47.0 From 4ff826e5086e6600f27ddef00733684c316486b2 Mon Sep 17 00:00:00 2001 From: June Date: Sat, 23 Nov 2024 02:50:37 +0100 Subject: [PATCH 04/10] add .ansible-lint config with setting to skip yaml line-length check --- .ansible-lint | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 .ansible-lint diff --git a/.ansible-lint b/.ansible-lint new file mode 100644 index 0000000..e148abb --- /dev/null +++ b/.ansible-lint @@ -0,0 +1,2 @@ +skip_list: + - "yaml[line-length]" -- 2.47.0 From bb24e6fd5a9289e6a5b18eb5c4c22c9d40856efa Mon Sep 17 00:00:00 2001 From: June Date: Sat, 23 Nov 2024 02:53:06 +0100 Subject: [PATCH 05/10] disable name[casing] check in ansible-lint config --- .ansible-lint | 1 + 1 file changed, 1 insertion(+) diff --git a/.ansible-lint b/.ansible-lint index e148abb..03eb219 100644 --- a/.ansible-lint +++ b/.ansible-lint @@ -1,2 +1,3 @@ skip_list: - "yaml[line-length]" + - "name[casing]" -- 2.47.0 From cf5e6c4e1a0235b60321b5e5c4814ed40c3c1e43 Mon Sep 17 00:00:00 2001 From: June Date: Sat, 23 Nov 2024 02:56:16 +0100 Subject: [PATCH 06/10] fix ansible-lint error by not comparing to literal false --- playbooks/roles/nginx/tasks/main/config_deploy.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playbooks/roles/nginx/tasks/main/config_deploy.yaml b/playbooks/roles/nginx/tasks/main/config_deploy.yaml index c7fac39..47bc050 100644 --- a/playbooks/roles/nginx/tasks/main/config_deploy.yaml +++ b/playbooks/roles/nginx/tasks/main/config_deploy.yaml @@ -7,7 +7,7 @@ when: nginx__use_custom_nginx_conf block: - name: when no `nginx.conf.ansiblesave` is present, save the current `nginx.conf` - when: nginx__nginx_conf_ansiblesave_stat_result.stat.exists == false + when: not nginx__nginx_conf_ansiblesave_stat_result.stat.exists ansible.builtin.copy: force: true dest: 
/etc/nginx/nginx.conf.ansiblesave -- 2.47.0 From d3d37e2e4c49d2730c8fd804e99550d122c0ce80 Mon Sep 17 00:00:00 2001 From: June Date: Sun, 24 Nov 2024 01:08:13 +0100 Subject: [PATCH 07/10] exclude .forgejo/ directory from ansible-lint --- .ansible-lint | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.ansible-lint b/.ansible-lint index 03eb219..f68da38 100644 --- a/.ansible-lint +++ b/.ansible-lint @@ -1,3 +1,6 @@ skip_list: - "yaml[line-length]" - "name[casing]" + +exclude_paths: + - .forgejo/ -- 2.47.0 From db02969168069b718d0a1ee2eb0bd12e5e30a793 Mon Sep 17 00:00:00 2001 From: June Date: Sun, 1 Dec 2024 04:16:42 +0100 Subject: [PATCH 08/10] add CI running ansible-lint --- .forgejo/workflows/lint.yaml | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 .forgejo/workflows/lint.yaml diff --git a/.forgejo/workflows/lint.yaml b/.forgejo/workflows/lint.yaml new file mode 100644 index 0000000..47b5a1d --- /dev/null +++ b/.forgejo/workflows/lint.yaml @@ -0,0 +1,32 @@ +# Links & Resources: +# https://github.com/ansible/ansible-lint?tab=readme-ov-file#using-ansible-lint-as-a-github-action +# https://github.com/ansible/ansible-lint/blob/main/action.yml +on: + pull_request: + push: + +jobs: + ansible-lint: + name: Ansible Lint + runs-on: docker + steps: + - uses: actions/checkout@v4 + - name: Install pip + run: | + apt update + apt install -y pip + - name: Install python jmespath + run: | + pip install jmespath + env: + PIP_BREAK_SYSTEM_PACKAGES: 1 + # Don't let it setup python as the then called setup-python action doesn't + # work in our environment. + # Rather manually setup python (pip) before instead.
+ - name: Run ansible-lint + uses: https://github.com/ansible/ansible-lint@main + with: + setup_python: "false" + requirements_file: "requirements.yml" + env: + PIP_BREAK_SYSTEM_PACKAGES: 1 -- 2.47.0 From e3a29c422a09663a1da0131998f9ee2fd8da83f4 Mon Sep 17 00:00:00 2001 From: June Date: Sun, 1 Dec 2024 04:38:07 +0100 Subject: [PATCH 09/10] convert two reboot tasks running on changed to handlers This fixes ansible-lint no-handler complaints. --- .../roles/apt_update_and_upgrade/handlers/main.yaml | 3 +++ playbooks/roles/apt_update_and_upgrade/tasks/main.yaml | 6 ++---- .../roles/deploy_ssh_server_config/handlers/main.yaml | 3 +++ playbooks/roles/deploy_ssh_server_config/tasks/main.yaml | 9 +++------ 4 files changed, 11 insertions(+), 10 deletions(-) create mode 100644 playbooks/roles/apt_update_and_upgrade/handlers/main.yaml create mode 100644 playbooks/roles/deploy_ssh_server_config/handlers/main.yaml diff --git a/playbooks/roles/apt_update_and_upgrade/handlers/main.yaml b/playbooks/roles/apt_update_and_upgrade/handlers/main.yaml new file mode 100644 index 0000000..001bbe4 --- /dev/null +++ b/playbooks/roles/apt_update_and_upgrade/handlers/main.yaml @@ -0,0 +1,3 @@ +- name: reboot the system + become: true + ansible.builtin.reboot: diff --git a/playbooks/roles/apt_update_and_upgrade/tasks/main.yaml b/playbooks/roles/apt_update_and_upgrade/tasks/main.yaml index f63436b..5d9181b 100644 --- a/playbooks/roles/apt_update_and_upgrade/tasks/main.yaml +++ b/playbooks/roles/apt_update_and_upgrade/tasks/main.yaml @@ -9,7 +9,5 @@ ansible.builtin.apt: upgrade: dist register: apt_update_and_upgrade__upgrade_result - - - name: reboot, after package upgrade - ansible.builtin.reboot: - when: apt_update_and_upgrade__upgrade_result.changed + notify: + - reboot the system diff --git a/playbooks/roles/deploy_ssh_server_config/handlers/main.yaml b/playbooks/roles/deploy_ssh_server_config/handlers/main.yaml new file mode 100644 index 0000000..001bbe4 --- /dev/null +++ 
b/playbooks/roles/deploy_ssh_server_config/handlers/main.yaml @@ -0,0 +1,3 @@ +- name: reboot the system + become: true + ansible.builtin.reboot: diff --git a/playbooks/roles/deploy_ssh_server_config/tasks/main.yaml b/playbooks/roles/deploy_ssh_server_config/tasks/main.yaml index 0492a35..714b0ca 100644 --- a/playbooks/roles/deploy_ssh_server_config/tasks/main.yaml +++ b/playbooks/roles/deploy_ssh_server_config/tasks/main.yaml @@ -11,13 +11,10 @@ owner: root group: root src: sshd_config.j2 - register: deploy_ssh_server_config__ssh_config_copy_result + notify: + # Reboot instead of just restarting the ssh service, since I don't know how Ansible reacts, when it restarts the service it probably needs for the connection. + - reboot the system - name: deactivate short moduli ansible.builtin.shell: cmd: awk '$5 >= 3071' /etc/ssh/moduli > /etc/ssh/moduli.tmp && mv /etc/ssh/moduli.tmp /etc/ssh/moduli - - # Rebooting here instead of restarting the ssh service, since I don't know how Ansible reacts, when it restarts the service it probably needs for the connection. - - name: reboot, if ssh server config got changed - ansible.builtin.reboot: - when: deploy_ssh_server_config__ssh_config_copy_result.changed -- 2.47.0 From e6d6d9eed053a0e4c1f11904e77ac24ed2a43129 Mon Sep 17 00:00:00 2001 From: June Date: Sun, 1 Dec 2024 22:20:15 +0100 Subject: [PATCH 10/10] report changed properly for "deactivate short moduli" task This fixes the ansible-lint no-changed-when complaint and also allows to notify the reboot handler. 
--- .../deploy_ssh_server_config/tasks/main.yaml | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/playbooks/roles/deploy_ssh_server_config/tasks/main.yaml b/playbooks/roles/deploy_ssh_server_config/tasks/main.yaml index 714b0ca..f5d00f5 100644 --- a/playbooks/roles/deploy_ssh_server_config/tasks/main.yaml +++ b/playbooks/roles/deploy_ssh_server_config/tasks/main.yaml @@ -17,4 +17,20 @@ - name: deactivate short moduli ansible.builtin.shell: - cmd: awk '$5 >= 3071' /etc/ssh/moduli > /etc/ssh/moduli.tmp && mv /etc/ssh/moduli.tmp /etc/ssh/moduli + executable: /bin/bash + cmd: | + set -eo pipefail + + awk '$5 >= 3071' /etc/ssh/moduli > /etc/ssh/moduli.tmp + if diff /etc/ssh/moduli /etc/ssh/moduli.tmp; then + rm /etc/ssh/moduli.tmp + else + mv /etc/ssh/moduli.tmp /etc/ssh/moduli + echo "ansible-changed: changed /etc/ssh/moduli" + fi + register: result + changed_when: + - '"ansible-changed" in result.stdout' + notify: + # Reboot instead of just restarting the ssh service, since I don't know how Ansible reacts, when it restarts the service it probably needs for the connection. + - reboot the system -- 2.47.0