grafana: split out Prometheus network receive error alerts for OPNsense to exclude WireGuard interfaces
Split the Prometheus network receive error alert into a general rule and an OPNsense-specific rule, so that OPNsense's WireGuard interfaces can be excluded; they tend to throw receive errors that are not of importance.
This commit is contained in:
parent
ee66631c2d
commit
c4e35c1adf
1 changed files with 13 additions and 1 deletions
|
@ -79,14 +79,26 @@ groups:
|
|||
annotations:
|
||||
summary: Host unusual network throughput out (instance {{ $labels.instance }})
|
||||
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}"
|
||||
# General network receive error alerts.
|
||||
# Excluding: OPNsense hosts
|
||||
- alert: HostNetworkReceiveErrors
|
||||
expr: (rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||
expr: (rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+", nodename!="OPNsense"}
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host Network Receive Errors (instance {{ $labels.instance }})
|
||||
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}"
|
||||
# OPNsense network receive error alerts.
|
||||
# This is the same as the general network receive error alert, except that it excludes the WireGuard interfaces, which tend to throw errors that are not of importance.
|
||||
- alert: OPNsenseHostNetworkReceiveErrors
|
||||
expr: (rate(node_network_receive_errs_total{device!~"wg.+"}[2m]) / rate(node_network_receive_packets_total{device!~"wg.+"}[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename="OPNsense"}
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: OPNsense host Network Receive Errors (instance {{ $labels.instance }})
|
||||
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}"
|
||||
- alert: HostNetworkTransmitErrors
|
||||
expr: (rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||
for: 2m
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue