diff --git a/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml b/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml index 8c8f374..3d9d7a1 100644 --- a/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml +++ b/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml @@ -79,14 +79,26 @@ groups: annotations: summary: Host unusual network throughput out (instance {{ $labels.instance }}) description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}" + # General network receive error alerts. + # Excluding: OPNsense hosts - alert: HostNetworkReceiveErrors - expr: (rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + expr: (rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+", nodename!="OPNsense"} for: 2m labels: severity: warning annotations: summary: Host Network Receive Errors (instance {{ $labels.instance }}) description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}" + # OPNsense network receive error alerts. + # This is the same as the regular network receive error alerts, but excluding the WireGuard interfaces as they like to throw errors, but which aren't of importance. + - alert: OPNsenseHostNetworkReceiveErrors + expr: (rate(node_network_receive_errs_total{device!~"wg.+"}[2m]) / rate(node_network_receive_packets_total{device!~"wg.+"}[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename="OPNsense"} + for: 2m + labels: + severity: warning + annotations: + summary: OPNsense host Network Receive Errors (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}" - alert: HostNetworkTransmitErrors expr: (rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} for: 2m