forked from CCCHH/ansible-infra
		
	grafana: split out Prometheus network receive error alerts for OPNsense to exclude WireGuard interfaces
Split the Prometheus network receive error alerts for OPNsense into a separate rule that excludes its WireGuard interfaces, which tend to report receive errors that are not important.
This commit is contained in:
		
					parent
					
						
							
								ee66631c2d
							
						
					
				
			
			
				commit
				
					
						c4e35c1adf
					
				
			
		
					 1 changed files with 13 additions and 1 deletions
				
			
		| 
						 | 
				
			
			@ -79,14 +79,26 @@ groups:
 | 
			
		|||
        annotations:
 | 
			
		||||
          summary: Host unusual network throughput out (instance {{ $labels.instance }})
 | 
			
		||||
          description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n  VALUE = {{ $value }}"
 | 
			
		||||
      # General network receive error alerts.
 | 
			
		||||
      # Excluding: OPNsense hosts
 | 
			
		||||
      - alert: HostNetworkReceiveErrors
 | 
			
		||||
        expr: (rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
 | 
			
		||||
        expr: (rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+", nodename!="OPNsense"}
 | 
			
		||||
        for: 2m
 | 
			
		||||
        labels:
 | 
			
		||||
          severity: warning
 | 
			
		||||
        annotations:
 | 
			
		||||
          summary: Host Network Receive Errors (instance {{ $labels.instance }})
 | 
			
		||||
          description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n  VALUE = {{ $value }}"
 | 
			
		||||
      # OPNsense network receive error alerts.
 | 
			
		||||
      # Same as the general network receive error alert, but excluding the WireGuard interfaces (device names matching wg.+), which tend to report receive errors that are not important.
 | 
			
		||||
      - alert: OPNsenseHostNetworkReceiveErrors
 | 
			
		||||
        expr: (rate(node_network_receive_errs_total{device!~"wg.+"}[2m]) / rate(node_network_receive_packets_total{device!~"wg.+"}[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename="OPNsense"}
 | 
			
		||||
        for: 2m
 | 
			
		||||
        labels:
 | 
			
		||||
          severity: warning
 | 
			
		||||
        annotations:
 | 
			
		||||
          summary: OPNsense host Network Receive Errors (instance {{ $labels.instance }})
 | 
			
		||||
          description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n  VALUE = {{ $value }}"
 | 
			
		||||
      - alert: HostNetworkTransmitErrors
 | 
			
		||||
        expr: (rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
 | 
			
		||||
        for: 2m
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue