Compare commits

..

21 commits

Author SHA1 Message Date
Herr-Dante 735fe0ca9b Add local port forwarding for debug sessions 2024-10-27 22:27:07 +01:00
christian 34dc6d9a84
Reduce Host Memory is underutilized to 10% 2024-10-18 21:15:20 +02:00
Stefan Bethke b660d937dc Allow GPG keys as uploads 2024-10-18 12:40:24 +02:00
Stefan Bethke 2f00d21821 Redirect home page to wiki 2024-10-13 13:50:50 +02:00
Stefan Bethke 235e6e514f Move Pretix from hackertours to tickets 2024-10-13 09:10:10 +02:00
June 7cd4a9a723
public-reverse-proxy: add config for staging.hackertours.hamburg.ccc.de 2024-10-12 22:08:28 +02:00
June d7a9534eeb
public-reverse-proxy: use public-web-static as host for hackert. ccchh 2024-10-12 22:00:14 +02:00
Stefan Bethke a35fcc13cf Merge branch 'main' of git.hamburg.ccc.de:CCCHH/ansible-infra 2024-10-08 20:28:57 +02:00
Stefan Bethke 2fc54f5a83 Add missing headers to avoid CSRF errors 2024-10-08 20:28:56 +02:00
June 4cac84e7ec
prometheus: have different disk alerts for physical and virtual hosts
Have more relaxed read/write alerts for physical hosts as they are
probably hypervisors and regular high read/writes are more common.
Also differentiate between physical and virtual hosts for IO alerts and
allow for hard disks to spend more time in IO.
2024-10-05 17:22:45 +02:00
June f721dd9fea
prometheus: make opnsense-ccchh job not fail half the time
The scrape seems to take around a second to complete and with the
configured timeout of 1s that failed half the time. Therefore use the
default, more relaxed scrape interval and timeout and have it be
reliable.
2024-10-05 17:22:45 +02:00
christian d8188d192b
Use keycloak version 26 2024-10-04 17:07:49 +02:00
Stefan Bethke 43ca24b5e2 Take website image from Forgejo 2024-10-03 19:44:43 +02:00
Stefan Bethke 229daa72fc Redirect plain URL to hash for ticket deep links 2024-10-03 19:44:15 +02:00
June 0a05cad0a1
prometheus & alertmanager: add self-alerting
Add self-alerting for Prometheus and Alertmanager using rules from
https://samber.github.io/awesome-prometheus-alerts/rules
2024-10-02 04:13:37 +02:00
June 2e29b78f6a
prometheus: move Jitsis node exporter target to hosts job 2024-10-02 03:45:56 +02:00
June 61edc3587f
alertmanager: give Alertmanager a persistent storage directory 2024-10-02 03:43:22 +02:00
June 30876f821c
prometheus, alertmanager: use Prometheus alerts with Alertmanager
For now introduce node-exporter/hosts alert rules, which got taken from
https://samber.github.io/awesome-prometheus-alerts/rules
However with the labels removed from the description, since they don't
render correctly (at least in Telegram) and don't seem to provide much
value, as we render the labels in the notification anyway.

Also only have Telegram as the notification channel for now, as it was
the easiest to set up.
2024-10-02 03:36:30 +02:00
June 803b19de0a
prometheus: add job for node exporter (for the NixOS VMs for now) 2024-10-01 20:09:42 +02:00
June 29d2d2926f
prometheus: don't duplicate scrape interval and timeout 2024-10-01 01:59:33 +02:00
June e81ae5165f
public-reverse-proxy: config for eh20 static website deploy 2024-09-28 05:04:01 +02:00
16 changed files with 746 additions and 53 deletions

View file

@ -6,6 +6,12 @@ docker_compose__configuration_files:
content: "{{ lookup('ansible.builtin.file', 'files/chaosknoten/configs/grafana/docker_compose/grafana-datasource.yml') }}" content: "{{ lookup('ansible.builtin.file', 'files/chaosknoten/configs/grafana/docker_compose/grafana-datasource.yml') }}"
- name: prometheus.yml - name: prometheus.yml
content: "{{ lookup('ansible.builtin.file', 'files/chaosknoten/configs/grafana/docker_compose/prometheus.yml') }}" content: "{{ lookup('ansible.builtin.file', 'files/chaosknoten/configs/grafana/docker_compose/prometheus.yml') }}"
- name: alertmanager.yaml
content: "{{ lookup('ansible.builtin.template', 'templates/chaosknoten/configs/grafana/docker_compose/alertmanager.yaml.j2') }}"
- name: prometheus_alerts.rules.yaml
content: "{{ lookup('ansible.builtin.file', 'files/chaosknoten/configs/grafana/docker_compose/prometheus_alerts.rules.yaml') }}"
- name: alertmanager_alert_templates.tmpl
content: "{{ lookup('ansible.builtin.file', 'files/chaosknoten/configs/grafana/docker_compose/alertmanager_alert_templates.tmpl') }}"
certbot__version_spec: "" certbot__version_spec: ""
certbot__acme_account_email_address: le-admin@hamburg.ccc.de certbot__acme_account_email_address: le-admin@hamburg.ccc.de

View file

@ -1,16 +1,16 @@
docker_compose__compose_file_content: "{{ lookup('ansible.builtin.template', 'chaosknoten/configs/hackertours/compose.yaml.j2') }}" docker_compose__compose_file_content: "{{ lookup('ansible.builtin.template', 'chaosknoten/configs/tickets/compose.yaml.j2') }}"
docker_compose__configuration_files: docker_compose__configuration_files:
- name: pretix.cfg - name: pretix.cfg
content: "{{ lookup('ansible.builtin.template', 'templates/chaosknoten/configs/hackertours/pretix.cfg.j2') }}" content: "{{ lookup('ansible.builtin.template', 'templates/chaosknoten/configs/tickets/pretix.cfg.j2') }}"
certbot__version_spec: "" certbot__version_spec: ""
certbot__acme_account_email_address: le-admin@hamburg.ccc.de certbot__acme_account_email_address: le-admin@hamburg.ccc.de
certbot__certificate_domains: certbot__certificate_domains:
- "hackertours.hamburg.ccc.de" - "tickets.hamburg.ccc.de"
certbot__new_cert_commands: certbot__new_cert_commands:
- "systemctl reload nginx.service" - "systemctl reload nginx.service"
nginx__version_spec: "" nginx__version_spec: ""
nginx__configurations: nginx__configurations:
- name: hackertours.hamburg.ccc.de - name: tickets.hamburg.ccc.de
content: "{{ lookup('ansible.builtin.file', 'chaosknoten/configs/hackertours/nginx/hackertours.hamburg.ccc.de.conf') }}" content: "{{ lookup('ansible.builtin.file', 'chaosknoten/configs/tickets/nginx/tickets.hamburg.ccc.de.conf') }}"

View file

@ -16,8 +16,8 @@ all:
ansible_port: 42666 ansible_port: 42666
ansible_user: chaos ansible_user: chaos
ansible_ssh_common_args: -J ssh://chaos@public-reverse-proxy.hamburg.ccc.de:42666 ansible_ssh_common_args: -J ssh://chaos@public-reverse-proxy.hamburg.ccc.de:42666
hackertours: tickets:
ansible_host: hackertours-intern.hamburg.ccc.de ansible_host: tickets-intern.hamburg.ccc.de
ansible_port: 42666 ansible_port: 42666
ansible_user: chaos ansible_user: chaos
ansible_ssh_common_args: -J ssh://chaos@public-reverse-proxy.hamburg.ccc.de:42666 ansible_ssh_common_args: -J ssh://chaos@public-reverse-proxy.hamburg.ccc.de:42666
@ -64,7 +64,7 @@ all:
hosts: hosts:
ccchoir: ccchoir:
grafana: grafana:
hackertours: tickets:
keycloak: keycloak:
lists: lists:
onlyoffice: onlyoffice:
@ -78,7 +78,7 @@ all:
hosts: hosts:
ccchoir: ccchoir:
grafana: grafana:
hackertours: tickets:
keycloak: keycloak:
lists: lists:
mumble: mumble:
@ -99,7 +99,7 @@ all:
hosts: hosts:
ccchoir: ccchoir:
grafana: grafana:
hackertours: tickets:
keycloak: keycloak:
lists: lists:
mumble: mumble:
@ -112,7 +112,7 @@ all:
hosts: hosts:
ccchoir: ccchoir:
grafana: grafana:
hackertours: tickets:
cloud: cloud:
keycloak: keycloak:
onlyoffice: onlyoffice:

View file

@ -0,0 +1,35 @@
{{/*
Links & Resources
- https://prometheus.io/blog/2016/03/03/custom-alertmanager-templates/
- https://prometheus.io/docs/alerting/latest/notifications/
- https://gist.github.com/jidckii/5ac5f8f20368b56de72af70222509b7b
*/}}
{{/*
alert-item.telegram.ccchh.internal renders a single alert as Telegram HTML:
a bold "[alertname] host" header, the optional summary and description
annotations, and a bullet list of all labels in sorted order.
The header prefers the nodename label and falls back to the instance label,
because alerts whose expression is not joined against node_uname_info do not
carry a nodename label.
*/}}
{{ define "alert-item.telegram.ccchh.internal" }}
<b>[{{ .Labels.alertname }}] {{ or .Labels.nodename .Labels.instance }}</b>
{{- if .Annotations.summary }}
<i>Summary</i>: {{ .Annotations.summary }}
{{- end }}
{{- if .Annotations.description }}
<i>Description</i>: {{ .Annotations.description }}
{{- end }}
<i>Labels</i>:
{{ range .Labels.SortedPairs -}}
• <i>{{ .Name }}</i>: <code>{{ .Value }}</code>
{{ end }}
{{- end }}
{{/*
alert-message.telegram.ccchh renders the complete notification message.
It emits a "Firing" section and a "Resolved" section, each only when that
group contains alerts. Every section starts with an underlined header showing
the number of alerts, followed by each alert rendered through the
"alert-item.telegram.ccchh.internal" template.
*/}}
{{ define "alert-message.telegram.ccchh" }}
{{- if .Alerts.Firing }}
<u>🔥{{ len .Alerts.Firing }} Alert(/s) Firing 🔥</u>
{{ range .Alerts.Firing -}}
{{ template "alert-item.telegram.ccchh.internal" . }}
{{- end }}
{{- end }}
{{- if .Alerts.Resolved }}
<u>✅{{ len .Alerts.Resolved }} Alert(/s) Resolved ✅</u>
{{ range .Alerts.Resolved -}}
{{ template "alert-item.telegram.ccchh.internal" . }}
{{- end }}
{{- end }}
{{- end }}

View file

@ -2,23 +2,33 @@ global:
scrape_interval: 15s scrape_interval: 15s
scrape_timeout: 10s scrape_timeout: 10s
evaluation_interval: 15s evaluation_interval: 15s
alerting: alerting:
alertmanagers: alertmanagers:
- static_configs: - scheme: http
- targets: []
scheme: http
timeout: 10s timeout: 10s
api_version: v1 static_configs:
- targets:
- "alertmanager:9093"
rule_files:
- "/etc/prometheus/rules/*.rules.yaml"
scrape_configs: scrape_configs:
- job_name: prometheus - job_name: prometheus
honor_timestamps: true honor_timestamps: true
scrape_interval: 15s
scrape_timeout: 10s
metrics_path: /metrics metrics_path: /metrics
scheme: http scheme: http
static_configs: static_configs:
- targets: - targets:
- localhost:9090 - localhost:9090
- job_name: alertmanager
honor_timestamps: true
metrics_path: /metrics
scheme: http
static_configs:
- targets:
- alertmanager:9093
- job_name: c3lingo - job_name: c3lingo
honor_timestamps: true honor_timestamps: true
scrape_interval: 5s scrape_interval: 5s
@ -39,8 +49,6 @@ scrape_configs:
- mumble.hamburg.ccc.de:443 - mumble.hamburg.ccc.de:443
- job_name: opnsense-ccchh - job_name: opnsense-ccchh
honor_timestamps: true honor_timestamps: true
scrape_interval: 5s
scrape_timeout: 1s
metrics_path: /metrics metrics_path: /metrics
scheme: http scheme: http
static_configs: static_configs:
@ -54,17 +62,7 @@ scrape_configs:
scheme: http scheme: http
static_configs: static_configs:
- targets: - targets:
- jitsi.hamburg.ccc.de:9100 # Node Exporter
- jitsi.hamburg.ccc.de:9888 # Jitsi Video Bridge - jitsi.hamburg.ccc.de:9888 # Jitsi Video Bridge
- job_name: chaosknoten
honor_timestamps: true
scrape_interval: 5s
scrape_timeout: 1s
metrics_path: /metrics
scheme: http
static_configs:
- targets:
- chaosknoten.hamburg.ccc.de:9100 # Node Exporter
- job_name: 'pve' - job_name: 'pve'
static_configs: static_configs:
- targets: - targets:
@ -81,3 +79,28 @@ scrape_configs:
target_label: instance target_label: instance
- target_label: __address__ - target_label: __address__
replacement: pve-exporter:9221 replacement: pve-exporter:9221
- job_name: hosts
static_configs:
# Wieske Chaosknoten VMs
- labels:
site: wieske
type: virtual_machine
hypervisor: chaosknoten
targets:
- netbox-intern.hamburg.ccc.de:9100
- matrix-intern.hamburg.ccc.de:9100
- public-web-static-intern.hamburg.ccc.de:9100
- git-intern.hamburg.ccc.de:9100
- forgejo-actions-runner-intern.hamburg.ccc.de:9100
- eh22-wiki-intern.hamburg.ccc.de:9100
- nix-box-june-intern.hamburg.ccc.de:9100
- mjolnir-intern.hamburg.ccc.de:9100
- woodpecker-intern.hamburg.ccc.de:9100
- penpot-intern.hamburg.ccc.de:9100
- jitsi.hamburg.ccc.de:9100
# Wieske Physical Machines
- labels:
site: wieske
type: physical_machine
targets:
- chaosknoten.hamburg.ccc.de:9100

View file

@ -0,0 +1,566 @@
# Links & Resources:
# - https://samber.github.io/awesome-prometheus-alerts/rules
groups:
- name: node-exporter
rules:
- alert: HostOutOfMemory
expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host out of memory (instance {{ $labels.instance }})
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}"
- alert: HostMemoryUnderMemoryPressure
expr: (rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host memory under memory pressure (instance {{ $labels.instance }})
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}"
# You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
- alert: HostMemoryIsUnderutilized
expr: (100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 1w
labels:
severity: info
annotations:
summary: Host Memory is underutilized (instance {{ $labels.instance }})
description: "Node memory is < 10% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}"
- alert: HostUnusualNetworkThroughputIn
expr: (sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual network throughput in (instance {{ $labels.instance }})
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}"
- alert: HostUnusualNetworkThroughputOut
expr: (sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual network throughput out (instance {{ $labels.instance }})
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}"
# Have different disk read and write rate alerts for VMs and physical machines.
# Fires when the summed disk read rate of a host stays above 50 MB/s for 5 minutes.
# The join against node_uname_info restricts the rule to targets labelled
# type="virtual_machine" and attaches the nodename label to the alert.
# The matcher must read `type` (as in the sibling Virtual/Physical rules);
# a misspelled label name matches no series and silently disables the alert.
- alert: VirtualHostUnusualDiskReadRate
  expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"}
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Virtual host unusual disk read rate (instance {{ $labels.instance }})
    description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}"
- alert: VirtualHostUnusualDiskWriteRate
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Virtual host unusual disk write rate (instance {{ $labels.instance }})
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}"
- alert: PhysicalHostUnusualDiskReadRate
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
for: 20m
labels:
severity: warning
annotations:
summary: Physical host unusual disk read rate (instance {{ $labels.instance }})
description: "Disk is probably reading too much data (> 100 MB/s)\n VALUE = {{ $value }}"
- alert: PhysicalHostUnusualDiskWriteRate
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
for: 15m
labels:
severity: warning
annotations:
summary: Physical host unusual disk write rate (instance {{ $labels.instance }})
description: "Disk is probably writing too much data (> 100 MB/s)\n VALUE = {{ $value }}"
# Please add ignored mountpoints in node_exporter parameters like
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
- alert: HostOutOfDiskSpace
expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host out of disk space (instance {{ $labels.instance }})
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}"
# Please add ignored mountpoints in node_exporter parameters like
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
- alert: HostDiskWillFillIn24Hours
expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}"
- alert: HostOutOfInodes
expr: (node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host out of inodes (instance {{ $labels.instance }})
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}"
- alert: HostInodesWillFillIn24Hours
expr: (node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}"
- alert: HostFilesystemDeviceError
expr: node_filesystem_device_error == 1
for: 2m
labels:
severity: critical
annotations:
summary: Host filesystem device error (instance {{ $labels.instance }})
description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}"
- alert: HostUnusualDiskReadLatency
expr: (rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk read latency (instance {{ $labels.instance }})
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}"
- alert: HostUnusualDiskWriteLatency
expr: (rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk write latency (instance {{ $labels.instance }})
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}"
- alert: HostHighCpuLoad
expr: (sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 10m
labels:
severity: warning
annotations:
summary: Host high CPU load (instance {{ $labels.instance }})
description: "CPU load is > 80%\n VALUE = {{ $value }}"
# We might want to introduce this later, though maybe excluding single-core hosts if possible, and only for VMs?
# # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
# - alert: HostCpuIsUnderutilized
# expr: (100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
# for: 1w
# labels:
# severity: info
# annotations:
# summary: Host CPU is underutilized (instance {{ $labels.instance }})
# description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}"
- alert: HostCpuStealNoisyNeighbor
expr: (avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}"
- alert: HostCpuHighIowait
expr: (avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU high iowait (instance {{ $labels.instance }})
description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}"
# Have different disk IO alerts for VMs and physical machines and for physical machines different ones for hard and other disks.
- alert: PhysicalHostUnusualHardDiskIo
expr: (rate(node_disk_io_time_seconds_total{device=~"s.+"}[1m]) > 0.75) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
for: 5m
labels:
severity: warning
annotations:
summary: Physical host unusual hard disk IO (instance {{ $labels.instance }})
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}"
- alert: PhysicalHostUnusualOtherDiskIo
expr: (rate(node_disk_io_time_seconds_total{device!~"s.+"}[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
for: 5m
labels:
severity: warning
annotations:
summary: Physical host unusual other (non-hard) disk IO (instance {{ $labels.instance }})
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}"
- alert: VirtualHostUnusualDiskIo
expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"}
for: 5m
labels:
severity: warning
annotations:
summary: Virtual host unusual disk IO (instance {{ $labels.instance }})
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}"
# # x2 context switches is an arbitrary number.
# # The alert threshold depends on the nature of the application.
# # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
# - alert: HostContextSwitchingHigh
# expr: (rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2
# for: 0m
# labels:
# severity: warning
# annotations:
# summary: Host context switching high (instance {{ $labels.instance }})
# description: "Context switching is growing on the node (twice the daily average during the last 15m)\n VALUE = {{ $value }}"
- alert: HostSwapIsFillingUp
expr: ((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host swap is filling up (instance {{ $labels.instance }})
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}"
- alert: HostSystemdServiceCrashed
expr: (node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 0m
labels:
severity: warning
annotations:
summary: Host systemd service crashed (instance {{ $labels.instance }})
description: "systemd service crashed\n VALUE = {{ $value }}"
- alert: HostPhysicalComponentTooHot
expr: ((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 5m
labels:
severity: warning
annotations:
summary: Host physical component too hot (instance {{ $labels.instance }})
description: "Physical hardware component too hot\n VALUE = {{ $value }}"
- alert: HostNodeOvertemperatureAlarm
expr: ((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 0m
labels:
severity: critical
annotations:
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}"
- alert: HostRaidArrayGotInactive
expr: (node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 0m
labels:
severity: critical
annotations:
summary: Host RAID array got inactive (instance {{ $labels.instance }})
description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}"
- alert: HostRaidDiskFailure
expr: (node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host RAID disk failure (instance {{ $labels.instance }})
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}"
# Intended to fire when more than one distinct kernel version (major.minor.patch,
# extracted from the release label of node_uname_info) is seen for 6 hours.
# NOTE(review): count(...) without a `by` clause yields a single series without an
# `instance` label, so the `* on(instance) group_left (nodename)` join below looks
# like it can never find a match and the rule would never fire — confirm in the
# Prometheus expression browser before relying on this alert.
# NOTE(review): the dots in the regex are unescaped and therefore match any character.
- alert: HostKernelVersionDeviations
expr: (count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 6h
labels:
severity: warning
annotations:
summary: Host kernel version deviations (instance {{ $labels.instance }})
description: "Different kernel versions are running\n VALUE = {{ $value }}"
- alert: HostOomKillDetected
expr: (increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 0m
labels:
severity: warning
annotations:
summary: Host OOM kill detected (instance {{ $labels.instance }})
description: "OOM kill detected\n VALUE = {{ $value }}"
# Fires immediately when the EDAC correctable error counter increased within
# the last minute. The description states the same 1m window the expression uses.
- alert: HostEdacCorrectableErrorsDetected
  expr: (increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
  for: 0m
  labels:
    severity: info
  annotations:
    summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
    description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last minute.\n VALUE = {{ $value }}"
# Fires immediately while the EDAC uncorrectable error counter is non-zero.
# The expression compares the raw counter (no rate/increase window), so the
# reported value is the counter's current total, not a per-interval count.
- alert: HostEdacUncorrectableErrorsDetected
  expr: (node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
    description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in total.\n VALUE = {{ $value }}"
# Fires when more than 1% of received packets on an interface were errors over
# the last two minutes, sustained for 2 minutes.
# $value is the error/packet ratio, so it is rendered as a percentage rather
# than as an absolute error count.
- alert: HostNetworkReceiveErrors
  expr: (rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Host Network Receive Errors (instance {{ $labels.instance }})
    description: "Host {{ $labels.instance }} interface {{ $labels.device }} has a receive error rate of {{ $value | humanizePercentage }} of packets over the last two minutes.\n VALUE = {{ $value }}"
# Fires when more than 1% of transmitted packets on an interface were errors
# over the last two minutes, sustained for 2 minutes.
# $value is the error/packet ratio, so it is rendered as a percentage rather
# than as an absolute error count.
- alert: HostNetworkTransmitErrors
  expr: (rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Host Network Transmit Errors (instance {{ $labels.instance }})
    description: "Host {{ $labels.instance }} interface {{ $labels.device }} has a transmit error rate of {{ $value | humanizePercentage }} of packets over the last two minutes.\n VALUE = {{ $value }}"
- alert: HostNetworkBondDegraded
expr: ((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host Network Bond Degraded (instance {{ $labels.instance }})
description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}"
- alert: HostConntrackLimit
expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 5m
labels:
severity: warning
annotations:
summary: Host conntrack limit (instance {{ $labels.instance }})
description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}"
- alert: HostClockSkew
expr: ((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 10m
labels:
severity: warning
annotations:
summary: Host clock skew (instance {{ $labels.instance }})
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}"
- alert: HostClockNotSynchronising
expr: (min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host clock not synchronising (instance {{ $labels.instance }})
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}"
- alert: HostRequiresReboot
expr: (node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 4h
labels:
severity: info
annotations:
summary: Host requires reboot (instance {{ $labels.instance }})
description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}"
- name: prometheus
rules:
# Fires as soon as no `up` series exists for the "prometheus" scrape job.
# absent() only carries over the labels of its equality matchers (here: job),
# so the summary references the job label instead of a non-existent instance.
- alert: PrometheusJobMissing
  expr: absent(up{job="prometheus"})
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: Prometheus job missing (job {{ $labels.job }})
    description: "A Prometheus job has disappeared\n VALUE = {{ $value }}"
- alert: PrometheusTargetMissing
expr: up == 0
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus target missing (instance {{ $labels.instance }})
description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}"
# Fires as soon as every target of a scrape job is down.
# The expression aggregates by job, which drops the instance label, so the
# summary names the affected job.
- alert: PrometheusAllTargetsMissing
  expr: sum by (job) (up) == 0
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Prometheus all targets missing (job {{ $labels.job }})
    description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}"
- alert: PrometheusConfigurationReloadFailure
expr: prometheus_config_last_reload_successful != 1
for: 0m
labels:
severity: warning
annotations:
summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
description: "Prometheus configuration reload error\n VALUE = {{ $value }}"
- alert: PrometheusTooManyRestarts
expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
for: 0m
labels:
severity: warning
annotations:
summary: Prometheus too many restarts (instance {{ $labels.instance }})
description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}"
# Fires as soon as no `up` series exists for the "alertmanager" scrape job.
# absent() only carries over the labels of its equality matchers (here: job),
# so the summary references the job label instead of a non-existent instance.
- alert: PrometheusAlertmanagerJobMissing
  expr: absent(up{job="alertmanager"})
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: Prometheus AlertManager job missing (job {{ $labels.job }})
    description: "A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}"
# Fires while alertmanager_config_last_reload_successful reports anything but 1.
- alert: PrometheusAlertmanagerConfigurationReloadFailure
  expr: 'alertmanager_config_last_reload_successful != 1'
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: 'Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})'
    description: "AlertManager configuration reload error\n VALUE = {{ $value }}"
# Fires when more than one distinct alertmanager_config_hash value is seen
# across all scraped Alertmanager instances.
# The outer count() has no `by` clause, so the resulting alert carries no
# labels at all; the summary therefore does not reference an instance.
- alert: PrometheusAlertmanagerConfigNotSynced
  expr: 'count(count_values("config_hash", alertmanager_config_hash)) > 1'
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: 'Prometheus AlertManager config not synced'
    description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}"
# For testing.
# - alert: PrometheusAlertmanagerE2eDeadManSwitch
# expr: vector(1)
# for: 0m
# labels:
# severity: critical
# annotations:
# summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }})
# description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.\n VALUE = {{ $value }}"
# Fires when Prometheus reports fewer than one discovered Alertmanager.
- alert: PrometheusNotConnectedToAlertmanager
  expr: 'prometheus_notifications_alertmanagers_discovered < 1'
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: 'Prometheus not connected to alertmanager (instance {{ $labels.instance }})'
    description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}"
# Fires when the rule evaluation failure counter increased within the last
# 3 minutes.
- alert: PrometheusRuleEvaluationFailures
  expr: 'increase(prometheus_rule_evaluation_failures_total[3m]) > 0'
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: 'Prometheus rule evaluation failures (instance {{ $labels.instance }})'
    description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}"
# Fires when the template text expansion failure counter increased within the
# last 3 minutes.
- alert: PrometheusTemplateTextExpansionFailures
  expr: 'increase(prometheus_template_text_expansion_failures_total[3m]) > 0'
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: 'Prometheus template text expansion failures (instance {{ $labels.instance }})'
    description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}"
# Fires when a rule group's last evaluation duration has exceeded its
# configured evaluation interval for 5 minutes.
- alert: PrometheusRuleEvaluationSlow
  expr: 'prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Prometheus rule evaluation slow (instance {{ $labels.instance }})'
    description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}"
# Fires when the notification queue length never dropped to 0 during the last
# 10 minutes (minimum over the window is above 0).
- alert: PrometheusNotificationsBacklog
  expr: 'min_over_time(prometheus_notifications_queue_length[10m]) > 0'
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: 'Prometheus notifications backlog (instance {{ $labels.instance }})'
    description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}"
# Fires when the per-second rate of failed Alertmanager notifications over the
# last minute is above 0.
# NOTE(review): rate() needs at least two samples inside the 1m window, so this
# only works if the alertmanager job is scraped well below 1m - confirm.
- alert: PrometheusAlertmanagerNotificationFailing
  expr: 'rate(alertmanager_notifications_failed_total[1m]) > 0'
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: 'Prometheus AlertManager notification failing (instance {{ $labels.instance }})'
    description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}"
# Fires when a service discovery entry reports 0 discovered targets.
- alert: PrometheusTargetEmpty
  expr: 'prometheus_sd_discovered_targets == 0'
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: 'Prometheus target empty (instance {{ $labels.instance }})'
    description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}"
# Fires when, for 5 minutes, the 0.9 quantile of the actual scrape interval
# length is more than 5% above its 0.5 quantile (matched on interval,
# instance and job).
- alert: PrometheusTargetScrapingSlow
  expr: 'prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Prometheus target scraping slow (instance {{ $labels.instance }})'
    description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned.\n VALUE = {{ $value }}"
# Fires when, for 5 minutes, more than 10 scrapes per 10-minute window
# exceeded the configured sample limit.
- alert: PrometheusLargeScrape
  expr: 'increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Prometheus large scrape (instance {{ $labels.instance }})'
    description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}"
# Fires when the counter of scraped samples with duplicate timestamps
# increased within the last 5 minutes.
- alert: PrometheusTargetScrapeDuplicate
  expr: 'increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0'
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: 'Prometheus target scrape duplicate (instance {{ $labels.instance }})'
    description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}"
# Fires when the TSDB checkpoint creation failure counter increased within the
# last minute.
# NOTE(review): increase() needs at least two samples inside the 1m window;
# confirm the prometheus job is scraped well below 1m.
- alert: PrometheusTsdbCheckpointCreationFailures
  expr: 'increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0'
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: 'Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})'
    description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}"
# Fires when the TSDB checkpoint deletion failure counter increased within the
# last minute.
# NOTE(review): increase() needs at least two samples inside the 1m window;
# confirm the prometheus job is scraped well below 1m.
- alert: PrometheusTsdbCheckpointDeletionFailures
  expr: 'increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0'
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: 'Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})'
    description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}"
# Fires when the TSDB compaction failure counter increased within the last
# minute.
# NOTE(review): increase() needs at least two samples inside the 1m window;
# confirm the prometheus job is scraped well below 1m.
- alert: PrometheusTsdbCompactionsFailed
  expr: 'increase(prometheus_tsdb_compactions_failed_total[1m]) > 0'
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: 'Prometheus TSDB compactions failed (instance {{ $labels.instance }})'
    description: "Prometheus encountered {{ $value }} TSDB compactions failures\n VALUE = {{ $value }}"
# Fires when the TSDB head truncation failure counter increased within the
# last minute.
# NOTE(review): increase() needs at least two samples inside the 1m window;
# confirm the prometheus job is scraped well below 1m.
- alert: PrometheusTsdbHeadTruncationsFailed
  expr: 'increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0'
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: 'Prometheus TSDB head truncations failed (instance {{ $labels.instance }})'
    description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}"
# Fires when the TSDB reload failure counter increased within the last minute.
# NOTE(review): increase() needs at least two samples inside the 1m window;
# confirm the prometheus job is scraped well below 1m.
- alert: PrometheusTsdbReloadFailures
  expr: 'increase(prometheus_tsdb_reloads_failures_total[1m]) > 0'
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: 'Prometheus TSDB reload failures (instance {{ $labels.instance }})'
    description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}"
# Fires when the TSDB WAL corruption counter increased within the last minute.
# NOTE(review): increase() needs at least two samples inside the 1m window;
# confirm the prometheus job is scraped well below 1m.
- alert: PrometheusTsdbWalCorruptions
  expr: 'increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0'
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: 'Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})'
    description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}"
# Fires when the TSDB WAL truncation failure counter increased within the last
# minute.
# NOTE(review): increase() needs at least two samples inside the 1m window;
# confirm the prometheus job is scraped well below 1m.
- alert: PrometheusTsdbWalTruncationsFailed
  expr: 'increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0'
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: 'Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})'
    description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}"
# Fires when a single metric name has more than 10000 series.
# The series are counted per __name__; label_replace copies the metric name
# into a regular `name` label so the annotations can reference it.
# The aggregation keeps no `instance` label, so the summary reports the metric
# name instead of an always-empty instance placeholder.
- alert: PrometheusTimeseriesCardinality
  expr: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000'
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: 'Prometheus timeseries cardinality (metric {{ $labels.name }})'
    description: "The \"{{ $labels.name }}\" timeseries cardinality is getting very high: {{ $value }}\n VALUE = {{ $value }}"

View file

@ -8,7 +8,8 @@ map $host $upstream_acme_challenge_host {
element.hamburg.ccc.de 172.31.17.151:31820; element.hamburg.ccc.de 172.31.17.151:31820;
git.hamburg.ccc.de 172.31.17.154:31820; git.hamburg.ccc.de 172.31.17.154:31820;
grafana.hamburg.ccc.de 172.31.17.145:31820; grafana.hamburg.ccc.de 172.31.17.145:31820;
hackertours.hamburg.ccc.de 172.31.17.148:31820; hackertours.hamburg.ccc.de 172.31.17.151:31820;
staging.hackertours.hamburg.ccc.de 172.31.17.151:31820;
hamburg.ccc.de 172.31.17.151:31820; hamburg.ccc.de 172.31.17.151:31820;
id.hamburg.ccc.de 172.31.17.144:31820; id.hamburg.ccc.de 172.31.17.144:31820;
invite.hamburg.ccc.de 172.31.17.144:31820; invite.hamburg.ccc.de 172.31.17.144:31820;
@ -23,12 +24,15 @@ map $host $upstream_acme_challenge_host {
wiki.ccchh.net 172.31.17.146:31820; wiki.ccchh.net 172.31.17.146:31820;
wiki.hamburg.ccc.de 172.31.17.146:31820; wiki.hamburg.ccc.de 172.31.17.146:31820;
www.hamburg.ccc.de 172.31.17.151:31820; www.hamburg.ccc.de 172.31.17.151:31820;
tickets.hamburg.ccc.de 172.31.17.148:31820;
zammad.hamburg.ccc.de 172.31.17.152:31820; zammad.hamburg.ccc.de 172.31.17.152:31820;
eh03.easterhegg.eu 172.31.17.151:31820; eh03.easterhegg.eu 172.31.17.151:31820;
eh05.easterhegg.eu 172.31.17.151:31820; eh05.easterhegg.eu 172.31.17.151:31820;
eh07.easterhegg.eu 172.31.17.151:31820; eh07.easterhegg.eu 172.31.17.151:31820;
eh09.easterhegg.eu 172.31.17.151:31820; eh09.easterhegg.eu 172.31.17.151:31820;
eh11.easterhegg.eu 172.31.17.151:31820; eh11.easterhegg.eu 172.31.17.151:31820;
eh20.easterhegg.eu 172.31.17.151:31820;
www.eh20.easterhegg.eu 172.31.17.151:31820;
eh22.easterhegg.eu 172.31.17.159:31820; eh22.easterhegg.eu 172.31.17.159:31820;
easterheggxxxx.hamburg.ccc.de 172.31.17.151:31820; easterheggxxxx.hamburg.ccc.de 172.31.17.151:31820;
eh2003.hamburg.ccc.de 172.31.17.151:31820; eh2003.hamburg.ccc.de 172.31.17.151:31820;
@ -57,6 +61,7 @@ map $host $upstream_acme_challenge_host {
www.eh11.hamburg.ccc.de 172.31.17.151:31820; www.eh11.hamburg.ccc.de 172.31.17.151:31820;
easterhegg2011.hamburg.ccc.de 172.31.17.151:31820; easterhegg2011.hamburg.ccc.de 172.31.17.151:31820;
www.easterhegg2011.hamburg.ccc.de 172.31.17.151:31820; www.easterhegg2011.hamburg.ccc.de 172.31.17.151:31820;
eh20.hamburg.ccc.de 172.31.17.151:31820;
hacker.tours 172.31.17.151:31820; hacker.tours 172.31.17.151:31820;
staging.hacker.tours 172.31.17.151:31820; staging.hacker.tours 172.31.17.151:31820;
woodpecker.hamburg.ccc.de 172.31.17.160:31820; woodpecker.hamburg.ccc.de 172.31.17.160:31820;

View file

@ -30,7 +30,8 @@ stream {
wiki.ccchh.net 172.31.17.146:8443; wiki.ccchh.net 172.31.17.146:8443;
wiki.hamburg.ccc.de 172.31.17.146:8443; wiki.hamburg.ccc.de 172.31.17.146:8443;
onlyoffice.hamburg.ccc.de 172.31.17.147:8443; onlyoffice.hamburg.ccc.de 172.31.17.147:8443;
hackertours.hamburg.ccc.de 172.31.17.148:8443; hackertours.hamburg.ccc.de 172.31.17.151:8443;
staging.hackertours.hamburg.ccc.de 172.31.17.151:8443;
netbox.hamburg.ccc.de 172.31.17.149:8443; netbox.hamburg.ccc.de 172.31.17.149:8443;
matrix.hamburg.ccc.de 172.31.17.150:8443; matrix.hamburg.ccc.de 172.31.17.150:8443;
element.hamburg.ccc.de 172.31.17.151:8443; element.hamburg.ccc.de 172.31.17.151:8443;
@ -39,6 +40,7 @@ stream {
hamburg.ccc.de 172.31.17.151:8443; hamburg.ccc.de 172.31.17.151:8443;
staging.hamburg.ccc.de 172.31.17.151:8443; staging.hamburg.ccc.de 172.31.17.151:8443;
spaceapi.hamburg.ccc.de 172.31.17.151:8443; spaceapi.hamburg.ccc.de 172.31.17.151:8443;
tickets.hamburg.ccc.de 172.31.17.148:8443;
zammad.hamburg.ccc.de 172.31.17.152:8443; zammad.hamburg.ccc.de 172.31.17.152:8443;
c3cat.de 172.31.17.151:8443; c3cat.de 172.31.17.151:8443;
git.hamburg.ccc.de 172.31.17.154:8443; git.hamburg.ccc.de 172.31.17.154:8443;
@ -47,6 +49,8 @@ stream {
eh07.easterhegg.eu 172.31.17.151:8443; eh07.easterhegg.eu 172.31.17.151:8443;
eh09.easterhegg.eu 172.31.17.151:8443; eh09.easterhegg.eu 172.31.17.151:8443;
eh11.easterhegg.eu 172.31.17.151:8443; eh11.easterhegg.eu 172.31.17.151:8443;
eh20.easterhegg.eu 172.31.17.151:8443;
www.eh20.easterhegg.eu 172.31.17.151:8443;
eh22.easterhegg.eu 172.31.17.159:8443; eh22.easterhegg.eu 172.31.17.159:8443;
easterheggxxxx.hamburg.ccc.de 172.31.17.151:8443; easterheggxxxx.hamburg.ccc.de 172.31.17.151:8443;
eh2003.hamburg.ccc.de 172.31.17.151:8443; eh2003.hamburg.ccc.de 172.31.17.151:8443;
@ -75,6 +79,7 @@ stream {
www.eh11.hamburg.ccc.de 172.31.17.151:8443; www.eh11.hamburg.ccc.de 172.31.17.151:8443;
easterhegg2011.hamburg.ccc.de 172.31.17.151:8443; easterhegg2011.hamburg.ccc.de 172.31.17.151:8443;
www.easterhegg2011.hamburg.ccc.de 172.31.17.151:8443; www.easterhegg2011.hamburg.ccc.de 172.31.17.151:8443;
eh20.hamburg.ccc.de 172.31.17.151:8443;
hacker.tours 172.31.17.151:8443; hacker.tours 172.31.17.151:8443;
staging.hacker.tours 172.31.17.151:8443; staging.hacker.tours 172.31.17.151:8443;
woodpecker.hamburg.ccc.de 172.31.17.160:8443; woodpecker.hamburg.ccc.de 172.31.17.160:8443;

View file

@ -12,12 +12,12 @@ server {
# header. # header.
real_ip_header proxy_protocol; real_ip_header proxy_protocol;
server_name hackertours.hamburg.ccc.de; server_name tickets.hamburg.ccc.de;
ssl_certificate /etc/letsencrypt/live/hackertours.hamburg.ccc.de/fullchain.pem; ssl_certificate /etc/letsencrypt/live/tickets.hamburg.ccc.de/fullchain.pem;
ssl_certificate_key /etc/letsencrypt/live/hackertours.hamburg.ccc.de/privkey.pem; ssl_certificate_key /etc/letsencrypt/live/tickets.hamburg.ccc.de/privkey.pem;
# verify chain of trust of OCSP response using Root CA and Intermediate certs # verify chain of trust of OCSP response using Root CA and Intermediate certs
ssl_trusted_certificate /etc/letsencrypt/live/hackertours.hamburg.ccc.de/chain.pem; ssl_trusted_certificate /etc/letsencrypt/live/tickets.hamburg.ccc.de/chain.pem;
# HSTS (ngx_http_headers_module is required) (63072000 seconds) # HSTS (ngx_http_headers_module is required) (63072000 seconds)
add_header Strict-Transport-Security "max-age=63072000" always; add_header Strict-Transport-Security "max-age=63072000" always;
@ -37,10 +37,7 @@ server {
proxy_set_header Forwarded "for=$remote_addr;proto=https;host=$host;by=_hidden"; proxy_set_header Forwarded "for=$remote_addr;proto=https;host=$host;by=_hidden";
location = / { location = / {
proxy_pass http://127.0.0.1:8888/; return 302 https://wiki.hamburg.ccc.de/infrastructure:service-overview#tickets_pretix;
}
location ~ ^/(apple-touch-icon.png|assets|css|de|en|js|posts|tours)(.*)$ {
proxy_pass http://127.0.0.1:8888/$1$2;
} }
location / { location / {

View file

@ -35,6 +35,14 @@ server {
# is transparent). # is transparent).
# Also provide "_hidden" for by, since it's not relevant. # Also provide "_hidden" for by, since it's not relevant.
proxy_set_header Forwarded "for=$remote_addr;proto=https;host=$host;by=_hidden"; proxy_set_header Forwarded "for=$remote_addr;proto=https;host=$host;by=_hidden";
proxy_read_timeout 86400;
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection "Upgrade";
proxy_set_header CLIENT_IP $remote_addr;
# Redirect plain ticket deep links (/ticket/zoom/...) to their hash-based
# equivalent (/#ticket/zoom/...).
# The regex is anchored with ^ and $ so that only URIs that start with
# /ticket/zoom/ are redirected; unanchored, it would also match any other URI
# that merely contains "/ticket/zoom/" somewhere in its path.
location ~ ^/(ticket/zoom/.*)$ {
    return 302 https://zammad.hamburg.ccc.de/#$1;
}
location / { location / {
proxy_pass http://127.0.0.1:8080/; proxy_pass http://127.0.0.1:8080/;

View file

@ -2,3 +2,4 @@
# Allow stl files. # Allow stl files.
stl !model/stl stl !model/stl
asc application/pgp-keys

View file

@ -11,7 +11,21 @@ services:
restart: unless-stopped restart: unless-stopped
volumes: volumes:
- ./configs/prometheus.yml:/etc/prometheus/prometheus.yml - ./configs/prometheus.yml:/etc/prometheus/prometheus.yml
- ./configs/prometheus_alerts.rules.yaml:/etc/prometheus/rules/alerts.rules.yaml
- prom_data:/prometheus - prom_data:/prometheus
# Alertmanager service: reads its configuration and notification templates
# from ./configs and keeps its state in the alertmanager_data volume.
alertmanager:
  image: prom/alertmanager
  container_name: alertmanager
  command:
    - '--config.file=/etc/alertmanager/alertmanager.yaml'
  ports:
    # Quoted so the mapping is always read as a string, never as a number.
    - "9093:9093"
  restart: unless-stopped
  volumes:
    - ./configs/alertmanager.yaml:/etc/alertmanager/alertmanager.yaml
    - ./configs/alertmanager_alert_templates.tmpl:/etc/alertmanager/templates/alert_templates.tmpl
    - alertmanager_data:/alertmanager
grafana: grafana:
image: grafana/grafana image: grafana/grafana
@ -44,3 +58,4 @@ services:
volumes: volumes:
graf_data: {} graf_data: {}
prom_data: {} prom_data: {}
alertmanager_data: {}

View file

@ -0,0 +1,40 @@
# Links & References:
# - https://prometheus.io/docs/alerting/latest/configuration/
# - https://github.com/prometheus/alertmanager/blob/48a99764a1fc9279fc828de83e7a03ae2219abc7/doc/examples/simple.yml
# Root route: every alert is grouped by the labels below and delivered to the
# single ccchh-infrastructure-alerts receiver.
route:
  group_by:
    - "alertname"
    - "site"
    - "type"
    - "hypervisor"
  # Wait before sending the first notification for a new group.
  group_wait: 30s
  # Wait before notifying about new alerts added to an existing group.
  group_interval: 5m
  # Wait before re-sending a notification that was already sent.
  repeat_interval: 3h
  receiver: ccchh-infrastructure-alerts
{# Disable these for now, but might be interesting in the future.
# Inhibition rules allow to mute a set of alerts given that another alert is
# firing.
# We use this to mute any warning-level notifications if the same alert is
# already critical.
inhibit_rules:
- source_matchers: [severity="critical"]
target_matchers: [severity="warning"]
# Apply inhibition if the alertname is the same.
# CAUTION:
# If all label names listed in `equal` are missing
# from both the source and target alerts,
# the inhibition rule will apply!
equal: [alertname, cluster, service] #}
# Load every notification template file found in the templates directory.
templates:
  - '/etc/alertmanager/templates/*.tmpl'
# Single receiver that posts alerts (and their resolution) to a Telegram chat.
receivers:
  - name: "ccchh-infrastructure-alerts"
    telegram_configs:
      - send_resolved: true
        # Quoted so the rendered secret is always parsed as a YAML string,
        # whatever characters the token contains.
        bot_token: "{{ lookup("community.general.passwordstore", "noc/vm-secrets/chaosknoten/grafana/alertmanager_telegram_bot_token", create=false, missing="error") }}"
        chat_id: -1002434372415
        parse_mode: HTML
        # Emitted as a Jinja string literal so the Go template expression
        # reaches Alertmanager unrendered.
        message: {{ "'{{ template \"alert-message.telegram.ccchh\" . }}'" }}

View file

@ -22,7 +22,7 @@
services: services:
keycloak: keycloak:
image: git.hamburg.ccc.de/ccchh/oci-images/keycloak:25.0 image: git.hamburg.ccc.de/ccchh/oci-images/keycloak:26.0
pull_policy: always pull_policy: always
restart: unless-stopped restart: unless-stopped
command: start --optimized command: start --optimized

View file

@ -4,7 +4,7 @@ services:
image: docker.io/library/postgres:15-alpine image: docker.io/library/postgres:15-alpine
environment: environment:
- "POSTGRES_USER=pretix" - "POSTGRES_USER=pretix"
- "POSTGRES_PASSWORD={{ lookup("community.general.passwordstore", "noc/vm-secrets/chaosknoten/hackertours/DB_PASSWORD", create=false, missing="error") }}" - "POSTGRES_PASSWORD={{ lookup("community.general.passwordstore", "noc/vm-secrets/chaosknoten/tickets/DB_PASSWORD", create=false, missing="error") }}"
- "POSTGRES_DB=pretix" - "POSTGRES_DB=pretix"
volumes: volumes:
- database:/var/lib/postgresql/data - database:/var/lib/postgresql/data
@ -37,14 +37,6 @@ services:
backend: backend:
frontend: frontend:
web:
image: gitlab-cr.hamburg.ccc.de/ccchh/hackertours/hackertours:latest
ports:
- "8888:80"
restart: unless-stopped
networks:
frontend:
volumes: volumes:
database: {} database: {}
pretix: {} pretix: {}

View file

@ -1,6 +1,6 @@
[pretix] [pretix]
instance_name=CCCHH Hackertours instance_name=CCCHH Tickets
url=https://hackertours.hamburg.ccc.de url=https://tickets.hamburg.ccc.de
currency=EUR currency=EUR
datadir=/data datadir=/data
trust_x_forwarded_for=on trust_x_forwarded_for=on
@ -10,11 +10,11 @@ trust_x_forwarded_proto=on
backend=postgresql backend=postgresql
name=pretix name=pretix
user=pretix user=pretix
password={{ lookup("community.general.passwordstore", "noc/vm-secrets/chaosknoten/hackertours/DB_PASSWORD", create=false, missing="error") }} password={{ lookup("community.general.passwordstore", "noc/vm-secrets/chaosknoten/tickets/DB_PASSWORD", create=false, missing="error") }}
host=database host=database
[mail] [mail]
from=ticket@hackertours.hamburg.ccc.de from=tickets@hamburg.ccc.de
host=cow-intern.hamburg.ccc.de host=cow-intern.hamburg.ccc.de
[redis] [redis]