Compare commits

...

2 commits

Author SHA1 Message Date
73f69f34c8
grafana: make alerts better for fux
Some checks failed
/ Ansible Lint (push) Failing after 47s
/ Ansible Lint (pull_request) Failing after 46s
2025-09-09 18:46:41 +02:00
592afdced9 add waybackproxy
Some checks failed
/ Ansible Lint (push) Failing after 50s
2025-09-06 11:39:05 +02:00
9 changed files with 73 additions and 3 deletions

View file

@ -10,6 +10,8 @@ docker_compose__configuration_files:
content: "{{ lookup('ansible.builtin.template', 'resources/chaosknoten/grafana/docker_compose/alertmanager.yaml.j2') }}" content: "{{ lookup('ansible.builtin.template', 'resources/chaosknoten/grafana/docker_compose/alertmanager.yaml.j2') }}"
- name: prometheus_alerts.rules.yaml - name: prometheus_alerts.rules.yaml
content: "{{ lookup('ansible.builtin.file', 'resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml') }}" content: "{{ lookup('ansible.builtin.file', 'resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml') }}"
- name: prometheus_alerts-fux.rules.yaml
content: "{{ lookup('ansible.builtin.file', 'resources/chaosknoten/grafana/docker_compose/prometheus_alerts-fux.rules.yaml') }}"
- name: alertmanager_alert_templates.tmpl - name: alertmanager_alert_templates.tmpl
content: "{{ lookup('ansible.builtin.file', 'resources/chaosknoten/grafana/docker_compose/alertmanager_alert_templates.tmpl') }}" content: "{{ lookup('ansible.builtin.file', 'resources/chaosknoten/grafana/docker_compose/alertmanager_alert_templates.tmpl') }}"
- name: loki.yaml - name: loki.yaml

View file

@ -0,0 +1,7 @@
# Host variables for the waybackproxy host: Docker Compose deployment plus
# the nginx configuration shipped alongside it.

# Compose file rendered from the Jinja2 template in the resources tree.
docker_compose__compose_file_content: "{{ lookup('ansible.builtin.template', 'resources/z9/waybackproxy/docker_compose/compose.yaml.j2') }}"
# No extra configuration files are deployed for this compose project.
# Written as `[]` (no inner space) so yamllint's `brackets` rule passes.
docker_compose__configuration_files: []
# NOTE(review): an empty spec presumably means "no version pin" — confirm
# against the nginx role's documentation.
nginx__version_spec: ""
nginx__configurations:
  - name: waybackproxy.ccchh.net
    content: "{{ lookup('ansible.builtin.file', 'resources/z9/waybackproxy/nginx/waybackproxy.ccchh.net.conf') }}"

View file

@ -11,6 +11,9 @@ all:
ansible_user: chaos ansible_user: chaos
thinkcccore0: thinkcccore0:
ansible_host: thinkcccore0.z9.ccchh.net ansible_host: thinkcccore0.z9.ccchh.net
waybackproxy:
ansible_host: waybackproxy.ccchh.net
ansible_user: chaos
yate: yate:
ansible_host: yate.ccchh.net ansible_host: yate.ccchh.net
ansible_user: chaos ansible_user: chaos
@ -20,6 +23,7 @@ certbot_hosts:
docker_compose_hosts: docker_compose_hosts:
hosts: hosts:
dooris: dooris:
waybackproxy:
yate: yate:
foobazdmx_hosts: foobazdmx_hosts:
hosts: hosts:
@ -32,11 +36,13 @@ infrastructure_authorized_keys_hosts:
dooris: dooris:
light: light:
authoritative-dns: authoritative-dns:
waybackproxy:
yate: yate:
nginx_hosts: nginx_hosts:
hosts: hosts:
dooris: dooris:
light: light:
waybackproxy:
ola_hosts: ola_hosts:
hosts: hosts:
light: light:

View file

@ -79,7 +79,7 @@ receivers:
- name: "email-fux-critical" - name: "email-fux-critical"
email_configs: email_configs:
- send_resolved: true - send_resolved: true
to: "stb@lassitu.de" to: "stb@lassitu.de,fux@zimdahl.org"
from: "alert-manager@hamburg.ccc.de" from: "alert-manager@hamburg.ccc.de"
smarthost: "cow.hamburg.ccc.de:587" smarthost: "cow.hamburg.ccc.de:587"
auth_username: "alert-manager@hamburg.ccc.de" auth_username: "alert-manager@hamburg.ccc.de"

View file

@ -14,6 +14,7 @@ services:
volumes: volumes:
- ./configs/prometheus.yml:/etc/prometheus/prometheus.yml - ./configs/prometheus.yml:/etc/prometheus/prometheus.yml
- ./configs/prometheus_alerts.rules.yaml:/etc/prometheus/rules/alerts.rules.yaml - ./configs/prometheus_alerts.rules.yaml:/etc/prometheus/rules/alerts.rules.yaml
- ./configs/prometheus_alerts-fux.rules.yaml:/etc/prometheus/rules/alerts-fux.rules.yaml
- prom_data:/prometheus - prom_data:/prometheus
alertmanager: alertmanager:

View file

@ -0,0 +1,39 @@
# Prometheus alerting rules for targets belonging to Fux.
groups:
  - name: Fux-Generic
    rules:
      # Fires when the `up` state of a Fux job (other than integrations/unix)
      # changed more than 5 times within the last 24 hours.
      # `max by` (rather than `group by`, which always yields 1) keeps the
      # real number of state changes available as {{ $value }}; the set of
      # firing (host, job) groups is the same.
      - alert: HostJobFlaky
        expr: max by (host, job) (changes(up{org="fux", job!="integrations/unix"}[24h])) > 5
        for: 0m
        labels:
          severity: info
        annotations:
          # Only `host` and `job` survive the aggregation above, so the
          # `instance` label is not available here.
          summary: Job {{ $labels.job }} flaky (host {{ $labels.host }})
          description: "The job {{ $labels.job }} on target: {{ $labels.host }} has been flaky over the last 24 hours.\n VALUE = {{ $value }}"
  - name: Fux-SNMP
    rules:
      # Prometheus anchors regex matchers, so this selects jobs whose name
      # ends in "snmp" or starts with "SNMP".
      # NOTE(review): unlike HostJobFlaky this is not restricted to
      # org="fux" — confirm that is intended.
      - alert: SnmpTargetMissing
        expr: up{job=~".*snmp|SNMP.*"} == 0
        for: 15m
        labels:
          severity: critical
        annotations:
          summary: SNMP target missing (instance {{ $labels.instance }})
          description: "A SNMP target has disappeared for more than 15 min.\n VALUE = {{ $value }}"
  - name: Fux-DHCP
    rules:
      # Both DHCP alerts fire as soon as the corresponding check script
      # reports failure (script_success == 0).
      - alert: DhcpFuxSharedFailed
        expr: script_success{script="check_dhcp_fux_shared"} == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: DHCP for Fux Shared stopped working
          description: "No DHCP lease for the Fux Shared range was received"
      - alert: DhcpFuxAdminFailed
        # The label selecting the script is `script`, as in the rule above.
        expr: script_success{script="check_dhcp_fux_admin"} == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: DHCP for Fux Admin stopped working
          description: "No DHCP lease for the Fux Admin range was received"

View file

@ -410,7 +410,7 @@ groups:
summary: Prometheus job missing (instance {{ $labels.instance }}) summary: Prometheus job missing (instance {{ $labels.instance }})
description: "A Prometheus job has disappeared\n VALUE = {{ $value }}" description: "A Prometheus job has disappeared\n VALUE = {{ $value }}"
- alert: PrometheusTargetMissing - alert: PrometheusTargetMissing
expr: up == 0 expr: up{job!~"snmp|noc_room_temp"} == 0
for: 0m for: 0m
labels: labels:
severity: critical severity: critical
@ -418,7 +418,7 @@ groups:
summary: Prometheus target missing (instance {{ $labels.instance }}) summary: Prometheus target missing (instance {{ $labels.instance }})
description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}" description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}"
- alert: PrometheusAllTargetsMissing - alert: PrometheusAllTargetsMissing
expr: sum by (job) (up) == 0 expr: sum by (job) (up{job!~"snmp|noc_room_temp"}) == 0
for: 0m for: 0m
labels: labels:
severity: critical severity: critical

View file

@ -0,0 +1,10 @@
services:
  # https://github.com/richardg867/WaybackProxy
  waybackproxy:
    # NOTE(review): `latest` is a floating tag, so redeploys may pull a
    # different image; consider pinning a specific tag or digest.
    image: cttynul/waybackproxy:latest
    environment:
      # Environment values are quoted so they are unambiguously strings.
      # NOTE(review): presumably DATE is the YYYYMMDD snapshot date and
      # DATE_TOLERANCE a number of days — confirm against the upstream README.
      DATE: "19990101"
      DATE_TOLERANCE: "730"
    ports:
      # Publishes container port 8888 on host port 1999.
      - "1999:8888"
    restart: unless-stopped

View file

@ -0,0 +1,5 @@
# TODO: set up caching proxy
# server {
# listen 1999;
# }