Compare commits
No commits in common. "4cac84e7ec6f7d9f8cde3a5a0677d744545c97b2" and "d8188d192b90fae79954c685cf0c388af18b88a3" have entirely different histories.
4cac84e7ec
...
d8188d192b
|
@ -49,6 +49,8 @@ scrape_configs:
|
||||||
- mumble.hamburg.ccc.de:443
|
- mumble.hamburg.ccc.de:443
|
||||||
- job_name: opnsense-ccchh
|
- job_name: opnsense-ccchh
|
||||||
honor_timestamps: true
|
honor_timestamps: true
|
||||||
|
scrape_interval: 5s
|
||||||
|
scrape_timeout: 1s
|
||||||
metrics_path: /metrics
|
metrics_path: /metrics
|
||||||
scheme: http
|
scheme: http
|
||||||
static_configs:
|
static_configs:
|
||||||
|
|
|
@ -44,39 +44,22 @@ groups:
|
||||||
annotations:
|
annotations:
|
||||||
summary: Host unusual network throughput out (instance {{ $labels.instance }})
|
summary: Host unusual network throughput out (instance {{ $labels.instance }})
|
||||||
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}"
|
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}"
|
||||||
# Have different disk read and write rate alerts for VMs and physical machines.
|
- alert: HostUnusualDiskReadRate
|
||||||
- alert: VirtualHostUnusualDiskReadRate
|
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||||
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"}
|
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Virtual host unusual disk read rate (instance {{ $labels.instance }})
|
summary: Host unusual disk read rate (instance {{ $labels.instance }})
|
||||||
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}"
|
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}"
|
||||||
- alert: VirtualHostUnusualDiskWriteRate
|
- alert: HostUnusualDiskWriteRate
|
||||||
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"}
|
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||||
for: 2m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Virtual host unusual disk write rate (instance {{ $labels.instance }})
|
summary: Host unusual disk write rate (instance {{ $labels.instance }})
|
||||||
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}"
|
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}"
|
||||||
- alert: PhysicalHostUnusualDiskReadRate
|
|
||||||
expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
|
|
||||||
for: 20m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: Physical host unusual disk read rate (instance {{ $labels.instance }})
|
|
||||||
description: "Disk is probably reading too much data (> 100 MB/s)\n VALUE = {{ $value }}"
|
|
||||||
- alert: PhysicalHostUnusualDiskWriteRate
|
|
||||||
expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
|
|
||||||
for: 15m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: Physical host unusual disk write rate (instance {{ $labels.instance }})
|
|
||||||
description: "Disk is probably writing too much data (> 100 MB/s)\n VALUE = {{ $value }}"
|
|
||||||
# Please add ignored mountpoints in node_exporter parameters like
|
# Please add ignored mountpoints in node_exporter parameters like
|
||||||
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
|
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
|
||||||
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
|
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
|
||||||
|
@ -173,30 +156,13 @@ groups:
|
||||||
annotations:
|
annotations:
|
||||||
summary: Host CPU high iowait (instance {{ $labels.instance }})
|
summary: Host CPU high iowait (instance {{ $labels.instance }})
|
||||||
description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}"
|
description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}"
|
||||||
# Use separate disk IO alerts for VMs and physical machines; on physical machines, additionally distinguish hard disks from other disks.
|
- alert: HostUnusualDiskIo
|
||||||
- alert: PhysicalHostUnusualHardDiskIo
|
expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||||
expr: (rate(node_disk_io_time_seconds_total{device=~"s.+"}[1m]) > 0.75) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
|
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Physical host unusual hard disk IO (instance {{ $labels.instance }})
|
summary: Host unusual disk IO (instance {{ $labels.instance }})
|
||||||
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}"
|
|
||||||
- alert: PhysicalHostUnusualOtherDiskIo
|
|
||||||
expr: (rate(node_disk_io_time_seconds_total{device!~"s.+"}[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: Physical host unusual other (non-hard) disk IO (instance {{ $labels.instance }})
|
|
||||||
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}"
|
|
||||||
- alert: VirtualHostUnusualDiskIo
|
|
||||||
expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"}
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: Virtual host unusual disk IO (instance {{ $labels.instance }})
|
|
||||||
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}"
|
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}"
|
||||||
# # x2 context switches is an arbitrary number.
|
# # x2 context switches is an arbitrary number.
|
||||||
# # The alert threshold depends on the nature of the application.
|
# # The alert threshold depends on the nature of the application.
|
||||||
|
|
Loading…
Reference in a new issue