grafana: make alerts better for fux
This commit is contained in:
parent
592afdced9
commit
068b261745
6 changed files with 47 additions and 4 deletions
|
@ -10,6 +10,8 @@ docker_compose__configuration_files:
|
|||
content: "{{ lookup('ansible.builtin.template', 'resources/chaosknoten/grafana/docker_compose/alertmanager.yaml.j2') }}"
|
||||
- name: prometheus_alerts.rules.yaml
|
||||
content: "{{ lookup('ansible.builtin.file', 'resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml') }}"
|
||||
- name: prometheus_alerts-fux.rules.yaml
|
||||
content: "{{ lookup('ansible.builtin.file', 'resources/chaosknoten/grafana/docker_compose/prometheus_alerts-fux.rules.yaml') }}"
|
||||
- name: alertmanager_alert_templates.tmpl
|
||||
content: "{{ lookup('ansible.builtin.file', 'resources/chaosknoten/grafana/docker_compose/alertmanager_alert_templates.tmpl') }}"
|
||||
- name: loki.yaml
|
||||
|
|
|
@ -79,7 +79,7 @@ receivers:
|
|||
- name: "email-fux-critical"
|
||||
email_configs:
|
||||
- send_resolved: true
|
||||
to: "stb@lassitu.de"
|
||||
to: "stb@lassitu.de,fux@zimdahl.org"
|
||||
from: "alert-manager@hamburg.ccc.de"
|
||||
smarthost: "cow.hamburg.ccc.de:587"
|
||||
auth_username: "alert-manager@hamburg.ccc.de"
|
||||
|
|
|
@ -14,6 +14,7 @@ services:
|
|||
volumes:
|
||||
- ./configs/prometheus.yml:/etc/prometheus/prometheus.yml
|
||||
- ./configs/prometheus_alerts.rules.yaml:/etc/prometheus/rules/alerts.rules.yaml
|
||||
- ./configs/prometheus_alerts-fux.rules.yaml:/etc/prometheus/rules/alerts-fux.rules.yaml
|
||||
- prom_data:/prometheus
|
||||
|
||||
alertmanager:
|
||||
|
|
|
@ -18,4 +18,3 @@ datasources:
|
|||
httpHeaderName1: "X-Scope-OrgID"
|
||||
secureJsonData:
|
||||
httpHeaderValue1: "chaos"
|
||||
|
||||
|
|
|
@ -0,0 +1,41 @@
|
|||
groups:
|
||||
- name: Fux-Generic
|
||||
rules:
|
||||
# Informational alert for scrape targets labelled org="fux": fires immediately
# (for: 0m) when a job/instance pair's `up` series changed value more than
# 7 times within the last 24 hours.
- alert: HostJobFlaky
|
||||
# `group by(instance, job)` reduces the result to those two labels only.
expr: group by(instance, job) (changes(up{org="fux"}[24h]) > 7)
|
||||
for: 0m
|
||||
labels:
|
||||
severity: info
|
||||
# NOTE(review): presumably used by Alertmanager routing to pick the fux receivers - confirm.
org: fux
|
||||
annotations:
|
||||
summary: Job {{ $labels.job }} flaky on (instance {{ $labels.instance }})
|
||||
description: "The job {{ $labels.job }} on target: {{ $labels.instance }} has been flaky over the last 24 hours."
|
||||
- name: Fux-SNMP
|
||||
rules:
|
||||
# Critical alert: a scrape target labelled org="fux" whose job name contains
# "snmp" has reported up == 0 continuously for 15 minutes.
- alert: SnmpTargetMissing
|
||||
expr: up{job=~".*snmp.*", org="fux"} == 0
|
||||
# Wait 15 minutes before firing so short scrape failures do not page anyone.
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
org: fux
|
||||
annotations:
|
||||
summary: SNMP target missing (instance {{ $labels.instance }})
|
||||
description: "SNMP target: {{ $labels.instance }} has disappeared for more than 15 min."
|
||||
- name: Fux-DHCP
|
||||
rules:
|
||||
# Critical alert: fires immediately (for: 0m) when the script check
# "check_dhcp_fux_shared" reports script_success == 0.
- alert: DhcpFuxSharedFailed
|
||||
expr: script_success{script="check_dhcp_fux_shared"} == 0
|
||||
for: 0m
|
||||
labels:
|
||||
# NOTE(review): the other Fux rules in this file also set `org: fux` here;
# confirm whether this alert needs it to reach the fux receivers.
severity: critical
|
||||
annotations:
|
||||
summary: DHCP for Fux Shared stopped working
|
||||
# The truncated "\n V" template remnant was removed; the value is always 0 when this fires.
description: "No DHCP lease for the Fux Shared range was received"
|
||||
# Critical alert: fires immediately (for: 0m) when the script check
# "check_dhcp_fux_admin" reports script_success == 0.
- alert: DhcpFuxAdminFailed
|
||||
# The check is selected via the `script` label, as in DhcpFuxSharedFailed.
# The previous matcher used `script_success` as the label key and matched no series.
expr: script_success{script="check_dhcp_fux_admin"} == 0
|
||||
for: 0m
|
||||
labels:
|
||||
# NOTE(review): the other Fux rules in this file also set `org: fux` here;
# confirm whether this alert needs it to reach the fux receivers.
severity: critical
|
||||
annotations:
|
||||
summary: DHCP for Fux Admin stopped working
|
||||
description: "No DHCP lease for the Fux Admin range was received"
|
|
@ -410,7 +410,7 @@ groups:
|
|||
summary: Prometheus job missing (instance {{ $labels.instance }})
|
||||
description: "A Prometheus job has disappeared\n VALUE = {{ $value }}"
|
||||
- alert: PrometheusTargetMissing
|
||||
expr: up == 0
|
||||
expr: up{job!~"snmp|noc_room_temp"} == 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
|
@ -418,7 +418,7 @@ groups:
|
|||
summary: Prometheus target missing (instance {{ $labels.instance }})
|
||||
description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}"
|
||||
- alert: PrometheusAllTargetsMissing
|
||||
expr: sum by (job) (up) == 0
|
||||
expr: sum by (job) (up{job!~"snmp|noc_room_temp"}) == 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue