Merge branch 'main' of git.hamburg.ccc.de:CCCHH/ansible-infra

This commit is contained in:
Stefan Bethke 2024-10-08 20:28:57 +02:00
commit a35fcc13cf
3 changed files with 44 additions and 12 deletions

View file

@ -49,8 +49,6 @@ scrape_configs:
- mumble.hamburg.ccc.de:443 - mumble.hamburg.ccc.de:443
- job_name: opnsense-ccchh - job_name: opnsense-ccchh
honor_timestamps: true honor_timestamps: true
scrape_interval: 5s
scrape_timeout: 1s
metrics_path: /metrics metrics_path: /metrics
scheme: http scheme: http
static_configs: static_configs:

View file

@ -44,22 +44,39 @@ groups:
annotations: annotations:
summary: Host unusual network throughput out (instance {{ $labels.instance }}) summary: Host unusual network throughput out (instance {{ $labels.instance }})
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}" description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}"
- alert: HostUnusualDiskReadRate # Have different disk read and write rate alerts for VMs and physical machines.
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} - alert: VirtualHostUnusualDiskReadRate
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{ype="virtual_machine", nodename=~".+"}
for: 5m for: 5m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Host unusual disk read rate (instance {{ $labels.instance }}) summary: Virtual host unusual disk read rate (instance {{ $labels.instance }})
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}" description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}"
- alert: HostUnusualDiskWriteRate - alert: VirtualHostUnusualDiskWriteRate
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"}
for: 2m for: 2m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Host unusual disk write rate (instance {{ $labels.instance }}) summary: Virtual host unusual disk write rate (instance {{ $labels.instance }})
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}" description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}"
- alert: PhysicalHostUnusualDiskReadRate
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
for: 20m
labels:
severity: warning
annotations:
summary: Physical host unusual disk read rate (instance {{ $labels.instance }})
description: "Disk is probably reading too much data (> 100 MB/s)\n VALUE = {{ $value }}"
- alert: PhysicalHostUnusualDiskWriteRate
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
for: 15m
labels:
severity: warning
annotations:
summary: Physical host unusual disk write rate (instance {{ $labels.instance }})
description: "Disk is probably writing too much data (> 100 MB/s)\n VALUE = {{ $value }}"
# Please add ignored mountpoints in node_exporter parameters like # Please add ignored mountpoints in node_exporter parameters like
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
@ -156,13 +173,30 @@ groups:
annotations: annotations:
summary: Host CPU high iowait (instance {{ $labels.instance }}) summary: Host CPU high iowait (instance {{ $labels.instance }})
description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}" description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}"
- alert: HostUnusualDiskIo # Have different disk IO alerts for VMs and physical machines and for physical machines different ones for hard and other disks.
expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} - alert: PhysicalHostUnusualHardDiskIo
expr: (rate(node_disk_io_time_seconds_total{device=~"s.+"}[1m]) > 0.75) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
for: 5m for: 5m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Host unusual disk IO (instance {{ $labels.instance }}) summary: Physical host unusual hard disk IO (instance {{ $labels.instance }})
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}"
- alert: PhysicalHostUnusualOtherDiskIo
expr: (rate(node_disk_io_time_seconds_total{device!~"s.+"}[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
for: 5m
labels:
severity: warning
annotations:
summary: Physical host unusual other (non-hard) disk IO (instance {{ $labels.instance }})
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}"
- alert: VirtualHostUnusualDiskIo
expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"}
for: 5m
labels:
severity: warning
annotations:
summary: Virtual host unusual disk IO (instance {{ $labels.instance }})
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}" description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}"
# # x2 context switches is an arbitrary number. # # x2 context switches is an arbitrary number.
# # The alert threshold depends on the nature of the application. # # The alert threshold depends on the nature of the application.

View file

@ -22,7 +22,7 @@
services: services:
keycloak: keycloak:
image: git.hamburg.ccc.de/ccchh/oci-images/keycloak:25.0 image: git.hamburg.ccc.de/ccchh/oci-images/keycloak:26.0
pull_policy: always pull_policy: always
restart: unless-stopped restart: unless-stopped
command: start --optimized command: start --optimized