forked from CCCHH/ansible-infra
		
	grafana: differentiate prometheus disk rate alerts by host task type
Differentiate the alerts by host task type only, rather than by a mix of host task type (CI server or not) and whether the host is virtual or physical. Also differentiate only on the alert duration, not on the rate threshold, so that hosts with slow hard disks are not accidentally excluded.
This commit is contained in:
		
					parent
					
						
							
								5016407cef
							
						
					
				
			
			
				commit
				
					
						9e77a41e3c
					
				
			
		
					 1 changed files with 29 additions and 25 deletions
				
			
		| 
						 | 
				
			
			@ -114,56 +114,60 @@ groups:
 | 
			
		|||
 | 
			
		||||
  - name: node-exporter-disk
 | 
			
		||||
    rules:
 | 
			
		||||
      # Have different disk read and write rate alerts for VMs and physical machines.
 | 
			
		||||
      - alert: VirtualHostUnusualDiskReadRate
 | 
			
		||||
        expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{ype="virtual_machine", nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker"}
 | 
			
		||||
      # General high disk read and write rate alerts.
 | 
			
		||||
      # Excluding: hypervisor hosts, CI hosts
 | 
			
		||||
      - alert: HostUnusualDiskReadRate
 | 
			
		||||
        expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker", nodename!="chaosknoten"}
 | 
			
		||||
        for: 5m
 | 
			
		||||
        labels:
 | 
			
		||||
          severity: warning
 | 
			
		||||
        annotations:
 | 
			
		||||
          summary: Virtual host unusual disk read rate (instance {{ $labels.instance }})
 | 
			
		||||
          summary: Host unusual disk read rate (instance {{ $labels.instance }})
 | 
			
		||||
          description: "Disk is probably reading too much data (> 50 MB/s)\n  VALUE = {{ $value }}"
 | 
			
		||||
      - alert: VirtualHostUnusualDiskWriteRate
 | 
			
		||||
        expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker"}
 | 
			
		||||
      - alert: HostUnusualDiskWriteRate
 | 
			
		||||
        expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+", nodename!="forgejo-actions-runner", nodename!="woodpecker", nodename!="chaosknoten"}
 | 
			
		||||
        for: 2m
 | 
			
		||||
        labels:
 | 
			
		||||
          severity: warning
 | 
			
		||||
        annotations:
 | 
			
		||||
          summary: Virtual host unusual disk write rate (instance {{ $labels.instance }})
 | 
			
		||||
          summary: Host unusual disk write rate (instance {{ $labels.instance }})
 | 
			
		||||
          description: "Disk is probably writing too much data (> 50 MB/s)\n  VALUE = {{ $value }}"
 | 
			
		||||
      # Some VMs are expected to have high read/write rates, e.g. CI servers
 | 
			
		||||
      - alert: VirtualHostUnusualDiskReadRate
 | 
			
		||||
        expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{ype="virtual_machine", nodename="forgejo-actions-runner", nodename="woodpecker"}
 | 
			
		||||
      # CI hosts high disk read and write alerts.
 | 
			
		||||
      # Longer intervals to account for disk intensive CI tasks.
 | 
			
		||||
      - alert: CIHostUnusualDiskReadRate
 | 
			
		||||
        # Select either CI host with one regex matcher (PromQL regexes are fully anchored).
        # Two equality matchers on the same label can never both hold, so the rule would never fire.
        expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~"forgejo-actions-runner|woodpecker"}
 | 
			
		||||
        for: 10m
 | 
			
		||||
        labels:
 | 
			
		||||
          severity: warning
 | 
			
		||||
        annotations:
 | 
			
		||||
          summary: Virtual host unusual disk read rate for 10 min (instance {{ $labels.instance }})
 | 
			
		||||
          summary: CI host unusual disk read rate for 10 min (instance {{ $labels.instance }})
 | 
			
		||||
          description: "Disk is probably reading too much data (> 50 MB/s)\n  VALUE = {{ $value }}"
 | 
			
		||||
      - alert: VirtualHostUnusualDiskWriteRate
 | 
			
		||||
        expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename="forgejo-actions-runner", nodename="woodpecker"}
 | 
			
		||||
        # Select either CI host with one regex matcher (PromQL regexes are fully anchored).
        # Two equality matchers on the same label can never both hold, so the rule would never fire.
        expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~"forgejo-actions-runner|woodpecker"}
 | 
			
		||||
        for: 4m
 | 
			
		||||
        labels:
 | 
			
		||||
          severity: warning
 | 
			
		||||
        annotations:
 | 
			
		||||
          summary: Virtual host unusual disk write rate for 4 min (instance {{ $labels.instance }})
 | 
			
		||||
          summary: CI host unusual disk write rate for 4 min (instance {{ $labels.instance }})
 | 
			
		||||
          description: "Disk is probably writing too much data (> 50 MB/s)\n  VALUE = {{ $value }}"
 | 
			
		||||
      - alert: PhysicalHostUnusualDiskReadRate
 | 
			
		||||
        expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
 | 
			
		||||
      # Hypervisor host high disk read and write alerts.
 | 
			
		||||
      # Longer intervals to account for disk intensive hypervisor tasks (backups, moving VMs, etc.).
 | 
			
		||||
      - alert: HypervisorHostUnusualDiskReadRate
 | 
			
		||||
        expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename="chaosknoten"}
 | 
			
		||||
        for: 30m
 | 
			
		||||
        labels:
 | 
			
		||||
          severity: warning
 | 
			
		||||
        annotations:
 | 
			
		||||
          summary: Hypervisor host unusual disk read rate (instance {{ $labels.instance }})
 | 
			
		||||
          description: "Disk is probably reading too much data (> 50 MB/s)\n  VALUE = {{ $value }}"
 | 
			
		||||
      - alert: HypervisorHostUnusualDiskWriteRate
 | 
			
		||||
        expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename="chaosknoten"}
 | 
			
		||||
        for: 20m
 | 
			
		||||
        labels:
 | 
			
		||||
          severity: warning
 | 
			
		||||
        annotations:
 | 
			
		||||
          summary: Physical host unusual disk read rate (instance {{ $labels.instance }})
 | 
			
		||||
          description: "Disk is probably reading too much data (> 100 MB/s)\n  VALUE = {{ $value }}"
 | 
			
		||||
      - alert: PhysicalHostUnusualDiskWriteRate
 | 
			
		||||
        expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
 | 
			
		||||
        for: 15m
 | 
			
		||||
        labels:
 | 
			
		||||
          severity: warning
 | 
			
		||||
        annotations:
 | 
			
		||||
          summary: Physical host unusual disk write rate (instance {{ $labels.instance }})
 | 
			
		||||
          description: "Disk is probably writing too much data (> 100 MB/s)\n  VALUE = {{ $value }}"
 | 
			
		||||
          summary: Hypervisor host unusual disk write rate (instance {{ $labels.instance }})
 | 
			
		||||
          description: "Disk is probably writing too much data (> 50 MB/s)\n  VALUE = {{ $value }}"
 | 
			
		||||
      # Please add ignored mountpoints in node_exporter parameters like
 | 
			
		||||
      # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
 | 
			
		||||
      # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue