From 9dc246f825a809c58b00ee9d1a08250f1df25fbf Mon Sep 17 00:00:00 2001
From: c6ristian
Date: Mon, 1 Sep 2025 01:25:11 +0200
Subject: [PATCH 1/2] WIP

---
Review note (this section is not part of the commit message):
DhcpFuxAdminFailed now selects on the `script` label, the same label
DhcpFuxSharedFailed uses; the selector previously read
script_success{script_success="check_dhcp_fux_admin"}.
The blob ids on the `index` lines predate this correction.

 .../chaosknoten/host_vars/grafana.yaml        |  2 +
 .../prometheus_alerts-fux.rules.yaml          | 37 +++++++++++++++++++
 .../prometheus_alerts.rules.yaml              |  4 +-
 3 files changed, 41 insertions(+), 2 deletions(-)
 create mode 100644 resources/chaosknoten/grafana/docker_compose/prometheus_alerts-fux.rules.yaml

diff --git a/inventories/chaosknoten/host_vars/grafana.yaml b/inventories/chaosknoten/host_vars/grafana.yaml
index 1ca6b1b..2e3672e 100644
--- a/inventories/chaosknoten/host_vars/grafana.yaml
+++ b/inventories/chaosknoten/host_vars/grafana.yaml
@@ -10,6 +10,8 @@ docker_compose__configuration_files:
     content: "{{ lookup('ansible.builtin.template', 'resources/chaosknoten/grafana/docker_compose/alertmanager.yaml.j2') }}"
   - name: prometheus_alerts.rules.yaml
     content: "{{ lookup('ansible.builtin.file', 'resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml') }}"
+  - name: prometheus_alerts-fux.rules.yaml
+    content: "{{ lookup('ansible.builtin.file', 'resources/chaosknoten/grafana/docker_compose/prometheus_alerts-fux.rules.yaml') }}"
   - name: alertmanager_alert_templates.tmpl
     content: "{{ lookup('ansible.builtin.file', 'resources/chaosknoten/grafana/docker_compose/alertmanager_alert_templates.tmpl') }}"
   - name: loki.yaml
diff --git a/resources/chaosknoten/grafana/docker_compose/prometheus_alerts-fux.rules.yaml b/resources/chaosknoten/grafana/docker_compose/prometheus_alerts-fux.rules.yaml
new file mode 100644
index 0000000..eb58477
--- /dev/null
+++ b/resources/chaosknoten/grafana/docker_compose/prometheus_alerts-fux.rules.yaml
@@ -0,0 +1,37 @@
+groups:
+  - name: SNMP
+    rules:
+      - alert: SnmpTargetMissing
+        expr: up{job=~".*snmp|SNMP.*"} == 0
+        for: 15m
+        labels:
+          severity: critical
+        annotations:
+          summary: SNMP target missing (instance {{ $labels.instance }})
+          description: "A SNMP target has disappeared for more the 30 min.\n VALUE = {{ $value }}"
+      - alert: SnmpTargetFalky
+        expr: changes(up{job=~"snmp"}[24h]) > 5
+        for: 0m
+        labels:
+          severity: info
+        annotations:
+          summary: SNMP target flaky (instance {{ $labels.instance }})
+          description: "A SNMP target is has a flaky respons over the last 24 hours.\n VALUE = {{ $value }}"
+  - name: DHCP
+    rules:
+      - alert: DhcpFuxSharedFailed
+        expr: script_success{script="check_dhcp_fux_shared"} == 0
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: DHCP for Fux Shared stoped working
+          description: "No DHCP lease for the Fux Shared range was received"
+      - alert: DhcpFuxAdminFailed
+        expr: script_success{script="check_dhcp_fux_admin"} == 0
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: DHCP for Fux Admin stoped working
+          description: "No DHCP lease for the Fux Admin range was received"
diff --git a/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml b/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml
index aa20a42..4a2bc6f 100644
--- a/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml
+++ b/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml
@@ -410,7 +410,7 @@ groups:
           summary: Prometheus job missing (instance {{ $labels.instance }})
           description: "A Prometheus job has disappeared\n VALUE = {{ $value }}"
       - alert: PrometheusTargetMissing
-        expr: up == 0
+        expr: up{job!~"snmp|noc_room_temp"} == 0
         for: 0m
         labels:
           severity: critical
@@ -418,7 +418,7 @@ groups:
           summary: Prometheus target missing (instance {{ $labels.instance }})
           description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}"
       - alert: PrometheusAllTargetsMissing
-        expr: sum by (job) (up) == 0
+        expr: sum by (job) (up{job!~"snmp|noc_room_temp"}) == 0
         for: 0m
         labels:
           severity: critical

From 7f9d282155a1cfea158adac3e5fd1d520ec10905 Mon Sep 17 00:00:00 2001
From: c6ristian
Date: Fri, 5 Sep 2025 15:04:04 +0200
Subject: [PATCH 2/2] grafana: make alerts better for fux

---
Review note (this section is not part of the commit message):
Both HostJobFlaky annotations now reference {{ $labels.host }}.
`group by(host, job)` leaves only the host and job labels on the
result, so the summary's former $labels.instance had nothing to show,
and the description's `labels.host` was missing the leading `$`.
The blob ids on the `index` lines predate this correction.

 .../prometheus_alerts-fux.rules.yaml          | 20 ++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/resources/chaosknoten/grafana/docker_compose/prometheus_alerts-fux.rules.yaml b/resources/chaosknoten/grafana/docker_compose/prometheus_alerts-fux.rules.yaml
index eb58477..39dd928 100644
--- a/resources/chaosknoten/grafana/docker_compose/prometheus_alerts-fux.rules.yaml
+++ b/resources/chaosknoten/grafana/docker_compose/prometheus_alerts-fux.rules.yaml
@@ -1,4 +1,14 @@
 groups:
+  - name: Generic
+    rules:
+      - alert: HostJobFlaky
+        expr: group by(host, job) (changes(up{org="fux", job!="integrations/unix"}[24h]) > 5)
+        for: 0m
+        labels:
+          severity: info
+        annotations:
+          summary: Job {{ $labels.job }} flaky on (host {{ $labels.host }})
+          description: "The job {{ $labels.job }} on target: {{ $labels.host }} has been flaky over the last 24 hours.\n VALUE = {{ $value }}"
   - name: SNMP
     rules:
       - alert: SnmpTargetMissing
@@ -8,15 +18,7 @@ groups:
           severity: critical
         annotations:
           summary: SNMP target missing (instance {{ $labels.instance }})
-          description: "A SNMP target has disappeared for more the 30 min.\n VALUE = {{ $value }}"
-      - alert: SnmpTargetFalky
-        expr: changes(up{job=~"snmp"}[24h]) > 5
-        for: 0m
-        labels:
-          severity: info
-        annotations:
-          summary: SNMP target flaky (instance {{ $labels.instance }})
-          description: "A SNMP target is has a flaky respons over the last 24 hours.\n VALUE = {{ $value }}"
+          description: "A SNMP target has disappeared for more the 15 min.\n VALUE = {{ $value }}"
   - name: DHCP
     rules:
       - alert: DhcpFuxSharedFailed