From c4e35c1adfcd700061f208458586d9f2c5f36a38 Mon Sep 17 00:00:00 2001
From: June <june@jsts.xyz>
Date: Thu, 6 Feb 2025 01:34:45 +0100
Subject: [PATCH] grafana: pull out prom. net. rec. err. alerts for OPNs. to
 ex. wg int.

Pull out prometheus network receive error alerts for OPNsense to exclude
its WireGuard interfaces, which like to throw errors, but which aren't
of importance.
---
 .../docker_compose/prometheus_alerts.rules.yaml    | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml b/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml
index 8c8f374..3d9d7a1 100644
--- a/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml
+++ b/resources/chaosknoten/grafana/docker_compose/prometheus_alerts.rules.yaml
@@ -79,14 +79,26 @@ groups:
         annotations:
           summary: Host unusual network throughput out (instance {{ $labels.instance }})
           description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n  VALUE = {{ $value }}"
+      # Receive error alert for every host except OPNsense: fires when an interface's receive error rate exceeds 1% of its received packet rate.
+      # OPNsense hosts are excluded here; they are covered by the OPNsenseHostNetworkReceiveErrors alert.
       - alert: HostNetworkReceiveErrors
-        expr: (rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
+        expr: (rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+", nodename!="OPNsense"}
         for: 2m
         labels:
           severity: warning
         annotations:
           summary: Host Network Receive Errors (instance {{ $labels.instance }})
           description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n  VALUE = {{ $value }}"
+      # OPNsense receive error alert: same >1% error/packet ratio check as HostNetworkReceiveErrors, restricted to hosts whose nodename is "OPNsense".
+      # WireGuard interfaces (device names matching "wg.+") are excluded because they routinely report receive errors that are not of importance.
+      - alert: OPNsenseHostNetworkReceiveErrors
+        expr: (rate(node_network_receive_errs_total{device!~"wg.+"}[2m]) / rate(node_network_receive_packets_total{device!~"wg.+"}[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename="OPNsense"}
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: OPNsense host Network Receive Errors (instance {{ $labels.instance }})
+          description: "Host {{ $labels.instance }} interface {{ $labels.device }} has had a receive error ratio of {{ $value | humanizePercentage }} over the last two minutes.\n  VALUE = {{ $value }}"
       - alert: HostNetworkTransmitErrors
         expr: (rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
         for: 2m