This commit is contained in:
		
					parent
					
						
							
								592afdced9
							
						
					
				
			
			
				commit
				
					
						1355d4d834
					
				
			
		
					 6 changed files with 47 additions and 4 deletions
				
			
		| 
						 | 
					@ -10,6 +10,8 @@ docker_compose__configuration_files:
 | 
				
			||||||
    content: "{{ lookup('ansible.builtin.template', 'resources/chaosknoten/grafana/docker_compose/alertmanager.yaml.j2') }}"
 | 
					    content: "{{ lookup('ansible.builtin.template', 'resources/chaosknoten/grafana/docker_compose/alertmanager.yaml.j2') }}"
 | 
				
			||||||
  - name: prometheus_alerts.rules.yaml
 | 
					  - name: prometheus_alerts.rules.yaml
 | 
				
			||||||
    content: "{{ lookup('ansible.builtin.file', 'resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml') }}"
 | 
					    content: "{{ lookup('ansible.builtin.file', 'resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml') }}"
 | 
				
			||||||
 | 
					  - name: prometheus_alerts-fux.rules.yaml
 | 
				
			||||||
 | 
					    content: "{{ lookup('ansible.builtin.file', 'resources/chaosknoten/grafana/docker_compose/prometheus_alerts-fux.rules.yaml') }}"
 | 
				
			||||||
  - name: alertmanager_alert_templates.tmpl
 | 
					  - name: alertmanager_alert_templates.tmpl
 | 
				
			||||||
    content: "{{ lookup('ansible.builtin.file', 'resources/chaosknoten/grafana/docker_compose/alertmanager_alert_templates.tmpl') }}"
 | 
					    content: "{{ lookup('ansible.builtin.file', 'resources/chaosknoten/grafana/docker_compose/alertmanager_alert_templates.tmpl') }}"
 | 
				
			||||||
  - name: loki.yaml
 | 
					  - name: loki.yaml
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -79,7 +79,7 @@ receivers:
 | 
				
			||||||
  - name: "email-fux-critical"
 | 
					  - name: "email-fux-critical"
 | 
				
			||||||
    email_configs:
 | 
					    email_configs:
 | 
				
			||||||
      - send_resolved: true
 | 
					      - send_resolved: true
 | 
				
			||||||
        to: "stb@lassitu.de"
 | 
					        to: "stb@lassitu.de,fux@zimdahl.org"
 | 
				
			||||||
        from: "alert-manager@hamburg.ccc.de"
 | 
					        from: "alert-manager@hamburg.ccc.de"
 | 
				
			||||||
        smarthost: "cow.hamburg.ccc.de:587"
 | 
					        smarthost: "cow.hamburg.ccc.de:587"
 | 
				
			||||||
        auth_username: "alert-manager@hamburg.ccc.de"
 | 
					        auth_username: "alert-manager@hamburg.ccc.de"
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -14,6 +14,7 @@ services:
 | 
				
			||||||
    volumes:
 | 
					    volumes:
 | 
				
			||||||
      - ./configs/prometheus.yml:/etc/prometheus/prometheus.yml
 | 
					      - ./configs/prometheus.yml:/etc/prometheus/prometheus.yml
 | 
				
			||||||
      - ./configs/prometheus_alerts.rules.yaml:/etc/prometheus/rules/alerts.rules.yaml
 | 
					      - ./configs/prometheus_alerts.rules.yaml:/etc/prometheus/rules/alerts.rules.yaml
 | 
				
			||||||
 | 
					      - ./configs/prometheus_alerts-fux.rules.yaml:/etc/prometheus/rules/alerts-fux.rules.yaml
 | 
				
			||||||
      - prom_data:/prometheus
 | 
					      - prom_data:/prometheus
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  alertmanager:
 | 
					  alertmanager:
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -18,4 +18,3 @@ datasources:
 | 
				
			||||||
      httpHeaderName1: "X-Scope-OrgID"
 | 
					      httpHeaderName1: "X-Scope-OrgID"
 | 
				
			||||||
    secureJsonData:
 | 
					    secureJsonData:
 | 
				
			||||||
      httpHeaderValue1: "chaos"
 | 
					      httpHeaderValue1: "chaos"
 | 
				
			||||||
 | 
					 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -0,0 +1,41 @@
 | 
				
			||||||
 | 
					groups:
 | 
				
			||||||
 | 
					  - name: Fux-Generic
 | 
				
			||||||
 | 
					    rules:
 | 
				
			||||||
 | 
					      - alert: HostJobFlaky
 | 
				
			||||||
 | 
					        expr: group by(instance, job) (changes(up{org="fux"}[24h]) > 7)
 | 
				
			||||||
 | 
					        for: 0m
 | 
				
			||||||
 | 
					        labels:
 | 
				
			||||||
 | 
					          severity: info
 | 
				
			||||||
 | 
					          org: fux
 | 
				
			||||||
 | 
					        annotations:
 | 
				
			||||||
 | 
					          summary: Job {{ $labels.job }} flaky on (instance {{ $labels.instance }})
 | 
				
			||||||
 | 
					          description: "The job {{ $labels.job }} on target: {{ $labels.instance }} has been flaky over the last 24 hours."
 | 
				
			||||||
 | 
					  - name: Fux-SNMP
 | 
				
			||||||
 | 
					    rules:
 | 
				
			||||||
 | 
					      - alert: SnmpTargetMissing
 | 
				
			||||||
 | 
					        expr: up{job=~".*snmp.*", org="fux"} == 0
 | 
				
			||||||
 | 
					        for: 15m
 | 
				
			||||||
 | 
					        labels:
 | 
				
			||||||
 | 
					          severity: critical
 | 
				
			||||||
 | 
					          org: fux
 | 
				
			||||||
 | 
					        annotations:
 | 
				
			||||||
 | 
					          summary: SNMP target missing (instance {{ $labels.instance }})
 | 
				
			||||||
 | 
					          description: "SNMP target: {{ $labels.instance }} has disappeared for more than 15 min."
 | 
				
			||||||
 | 
					  - name: Fux-DHCP
 | 
				
			||||||
 | 
					    rules:
 | 
				
			||||||
 | 
					      - alert: DhcpFuxSharedFailed
 | 
				
			||||||
 | 
					        expr: script_success{script="check_dhcp_fux_shared"} == 0
 | 
				
			||||||
 | 
					        for: 0m
 | 
				
			||||||
 | 
					        labels:
 | 
				
			||||||
 | 
					          severity: critical
 | 
				
			||||||
 | 
					        annotations:
 | 
				
			||||||
 | 
					          summary: DHCP for Fux Shared stopped working
 | 
				
			||||||
 | 
					          description: "No DHCP lease for the Fux Shared range was received\n  VALUE = {{ $value }}"
 | 
				
			||||||
 | 
					      - alert: DhcpFuxAdminFailed
 | 
				
			||||||
 | 
					        expr: script_success{script="check_dhcp_fux_admin"} == 0
 | 
				
			||||||
 | 
					        for: 0m
 | 
				
			||||||
 | 
					        labels:
 | 
				
			||||||
 | 
					          severity: critical
 | 
				
			||||||
 | 
					        annotations:
 | 
				
			||||||
 | 
					          summary: DHCP for Fux Admin stopped working
 | 
				
			||||||
 | 
					          description: "No DHCP lease for the Fux Admin range was received"
 | 
				
			||||||
| 
						 | 
					@ -410,7 +410,7 @@ groups:
 | 
				
			||||||
          summary: Prometheus job missing (instance {{ $labels.instance }})
 | 
					          summary: Prometheus job missing (instance {{ $labels.instance }})
 | 
				
			||||||
          description: "A Prometheus job has disappeared\n  VALUE = {{ $value }}"
 | 
					          description: "A Prometheus job has disappeared\n  VALUE = {{ $value }}"
 | 
				
			||||||
      - alert: PrometheusTargetMissing
 | 
					      - alert: PrometheusTargetMissing
 | 
				
			||||||
        expr: up == 0
 | 
					        expr: up{job!~"snmp|noc_room_temp"} == 0
 | 
				
			||||||
        for: 0m
 | 
					        for: 0m
 | 
				
			||||||
        labels:
 | 
					        labels:
 | 
				
			||||||
          severity: critical
 | 
					          severity: critical
 | 
				
			||||||
| 
						 | 
					@ -418,7 +418,7 @@ groups:
 | 
				
			||||||
          summary: Prometheus target missing (instance {{ $labels.instance }})
 | 
					          summary: Prometheus target missing (instance {{ $labels.instance }})
 | 
				
			||||||
          description: "A Prometheus target has disappeared. An exporter might be crashed.\n  VALUE = {{ $value }}"
 | 
					          description: "A Prometheus target has disappeared. An exporter might be crashed.\n  VALUE = {{ $value }}"
 | 
				
			||||||
      - alert: PrometheusAllTargetsMissing
 | 
					      - alert: PrometheusAllTargetsMissing
 | 
				
			||||||
        expr: sum by (job) (up) == 0
 | 
					        expr: sum by (job) (up{job!~"snmp|noc_room_temp"}) == 0
 | 
				
			||||||
        for: 0m
 | 
					        for: 0m
 | 
				
			||||||
        labels:
 | 
					        labels:
 | 
				
			||||||
          severity: critical
 | 
					          severity: critical
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue