Compare commits

...

2 commits

Author SHA1 Message Date
June 4cac84e7ec
prometheus: have different disk alerts for physical and virtual hosts
Have more relaxed read/write alerts for physical hosts as they are
probably hypervisors and regular high read/writes are more common.
Also differentiate between physical and virtual hosts for IO alerts and
allow for hard disks to spend more time in IO.
2024-10-05 17:22:45 +02:00
June f721dd9fea
prometheus: make opnsense-ccchh job not fail half the time
The scrape seems to take around a second to complete and with the
configured timeout of 1s that failed half the time. Therefore use the
default, more relaxed scrape interval and timeout and have it be
reliable.
2024-10-05 17:22:45 +02:00
2 changed files with 43 additions and 11 deletions

View file

@ -49,8 +49,6 @@ scrape_configs:
- mumble.hamburg.ccc.de:443 - mumble.hamburg.ccc.de:443
- job_name: opnsense-ccchh - job_name: opnsense-ccchh
honor_timestamps: true honor_timestamps: true
scrape_interval: 5s
scrape_timeout: 1s
metrics_path: /metrics metrics_path: /metrics
scheme: http scheme: http
static_configs: static_configs:

View file

@ -44,22 +44,39 @@ groups:
annotations: annotations:
summary: Host unusual network throughput out (instance {{ $labels.instance }}) summary: Host unusual network throughput out (instance {{ $labels.instance }})
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}" description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}"
- alert: HostUnusualDiskReadRate # Have different disk read and write rate alerts for VMs and physical machines.
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} - alert: VirtualHostUnusualDiskReadRate
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{ype="virtual_machine", nodename=~".+"}
for: 5m for: 5m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Host unusual disk read rate (instance {{ $labels.instance }}) summary: Virtual host unusual disk read rate (instance {{ $labels.instance }})
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}" description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}"
- alert: HostUnusualDiskWriteRate - alert: VirtualHostUnusualDiskWriteRate
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"}
for: 2m for: 2m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Host unusual disk write rate (instance {{ $labels.instance }}) summary: Virtual host unusual disk write rate (instance {{ $labels.instance }})
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}" description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}"
- alert: PhysicalHostUnusualDiskReadRate
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
for: 20m
labels:
severity: warning
annotations:
summary: Physical host unusual disk read rate (instance {{ $labels.instance }})
description: "Disk is probably reading too much data (> 100 MB/s)\n VALUE = {{ $value }}"
- alert: PhysicalHostUnusualDiskWriteRate
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
for: 15m
labels:
severity: warning
annotations:
summary: Physical host unusual disk write rate (instance {{ $labels.instance }})
description: "Disk is probably writing too much data (> 100 MB/s)\n VALUE = {{ $value }}"
# Please add ignored mountpoints in node_exporter parameters like # Please add ignored mountpoints in node_exporter parameters like
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
@ -156,13 +173,30 @@ groups:
annotations: annotations:
summary: Host CPU high iowait (instance {{ $labels.instance }}) summary: Host CPU high iowait (instance {{ $labels.instance }})
description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}" description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}"
- alert: HostUnusualDiskIo # Have different disk IO alerts for VMs and physical machines and for physical machines different ones for hard and other disks.
expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} - alert: PhysicalHostUnusualHardDiskIo
expr: (rate(node_disk_io_time_seconds_total{device=~"s.+"}[1m]) > 0.75) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
for: 5m for: 5m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Host unusual disk IO (instance {{ $labels.instance }}) summary: Physical host unusual hard disk IO (instance {{ $labels.instance }})
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}"
- alert: PhysicalHostUnusualOtherDiskIo
expr: (rate(node_disk_io_time_seconds_total{device!~"s.+"}[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
for: 5m
labels:
severity: warning
annotations:
summary: Physical host unusual other (non-hard) disk IO (instance {{ $labels.instance }})
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}"
- alert: VirtualHostUnusualDiskIo
expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"}
for: 5m
labels:
severity: warning
annotations:
summary: Virtual host unusual disk IO (instance {{ $labels.instance }})
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}" description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}"
# # x2 context switches is an arbitrary number. # # x2 context switches is an arbitrary number.
# # The alert threshold depends on the nature of the application. # # The alert threshold depends on the nature of the application.