Make Prometheus Alert Rules Work Better and Organize Them Better #19
1 changed files with 13 additions and 1 deletions
|
@ -79,14 +79,26 @@ groups:
|
|||
annotations:
|
||||
summary: Host unusual network throughput out (instance {{ $labels.instance }})
|
||||
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}"
|
||||
# General network receive error alerts.
|
||||
# Excluding: OPNsense hosts
|
||||
- alert: HostNetworkReceiveErrors
|
||||
expr: (rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||
expr: (rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+", nodename!="OPNsense"}
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host Network Receive Errors (instance {{ $labels.instance }})
|
||||
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}"
|
||||
# OPNsense network receive error alerts.
|
||||
# This is the same as the regular network receive error alerts, but excluding the WireGuard interfaces as they like to throw errors, but which aren't of importance.
|
||||
- alert: OPNsenseHostNetworkReceiveErrors
|
||||
expr: (rate(node_network_receive_errs_total{device!~"wg.+"}[2m]) / rate(node_network_receive_packets_total{device!~"wg.+"}[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename="OPNsense"}
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: OPNsense host Network Receive Errors (instance {{ $labels.instance }})
|
||||
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}"
|
||||
- alert: HostNetworkTransmitErrors
|
||||
expr: (rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||
for: 2m
|
||||
|
|
Loading…
Add table
Reference in a new issue