diff --git a/inventories/chaosknoten/host_vars/grafana.yaml b/inventories/chaosknoten/host_vars/grafana.yaml index 2e3672e..1ca6b1b 100644 --- a/inventories/chaosknoten/host_vars/grafana.yaml +++ b/inventories/chaosknoten/host_vars/grafana.yaml @@ -10,8 +10,6 @@ docker_compose__configuration_files: content: "{{ lookup('ansible.builtin.template', 'resources/chaosknoten/grafana/docker_compose/alertmanager.yaml.j2') }}" - name: prometheus_alerts.rules.yaml content: "{{ lookup('ansible.builtin.file', 'resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml') }}" - - name: prometheus_alerts-fux.rules.yaml - content: "{{ lookup('ansible.builtin.file', 'resources/chaosknoten/grafana/docker_compose/prometheus_alerts-fux.rules.yaml') }}" - name: alertmanager_alert_templates.tmpl content: "{{ lookup('ansible.builtin.file', 'resources/chaosknoten/grafana/docker_compose/alertmanager_alert_templates.tmpl') }}" - name: loki.yaml diff --git a/resources/chaosknoten/grafana/docker_compose/prometheus_alerts-fux.rules.yaml b/resources/chaosknoten/grafana/docker_compose/prometheus_alerts-fux.rules.yaml deleted file mode 100644 index 39dd928..0000000 --- a/resources/chaosknoten/grafana/docker_compose/prometheus_alerts-fux.rules.yaml +++ /dev/null @@ -1,39 +0,0 @@ -groups: - - name: Generic - rules: - - alert: HostJobFlaky - expr: group by(host, job) (changes(up{org="fux", job!="integrations/unix"}[24h]) > 5) - for: 0m - labels: - severity: info - annotations: - summary: Job {{ $labels.job }} flaky on (host {{ $labels.instance }}) - description: "The job {{ $labels.job }} on target: {{ labels.host }} has been flaky over the last 24 hours.\n VALUE = {{ $value }}" - - name: SNMP - rules: - - alert: SnmpTargetMissing - expr: up{job=~".*snmp|SNMP.*"} == 0 - for: 15m - labels: - severity: critical - annotations: - summary: SNMP target missing (instance {{ $labels.instance }}) - description: "A SNMP target has disappeared for more the 15 min.\n VALUE = {{ $value }}" - - name: DHCP - rules: - - alert: DhcpFuxSharedFailed - expr: script_success{script="check_dhcp_fux_shared"} == 0 - for: 0m - labels: - severity: critical - annotations: - summary: DHCP for Fux Shared stoped working - description: "No DHCP lease for the Fux Shared range was received" - - alert: DhcpFuxAdminFailed - expr: script_success{script_success="check_dhcp_fux_admin"} == 0 - for: 0m - labels: - severity: critical - annotations: - summary: DHCP for Fux Admin stoped working - description: "No DHCP lease for the Fux Admin range was received" diff --git a/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml b/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml index 4a2bc6f..aa20a42 100644 --- a/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml +++ b/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml @@ -410,7 +410,7 @@ groups: summary: Prometheus job missing (instance {{ $labels.instance }}) description: "A Prometheus job has disappeared\n VALUE = {{ $value }}" - alert: PrometheusTargetMissing - expr: up{job!~"snmp|noc_room_temp"} == 0 + expr: up == 0 for: 0m labels: severity: critical @@ -418,7 +418,7 @@ groups: summary: Prometheus target missing (instance {{ $labels.instance }}) description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}" - alert: PrometheusAllTargetsMissing - expr: sum by (job) (up{job!~"snmp|noc_room_temp"}) == 0 + expr: sum by (job) (up) == 0 for: 0m labels: severity: critical