prometheus: have different disk alerts for physical and virtual hosts
Have more relaxed read/write alerts for physical hosts as they are probably hypervisors and regular high read/writes are more common. Also differentiate between physical and virtual hosts for IO alerts and allow for hard disks to spend more time in IO.
This commit is contained in:
		
					parent
					
						
							
								f721dd9fea
							
						
					
				
			
			
				commit
				
					
						4cac84e7ec
					
				
			
		
					 1 changed files with 43 additions and 9 deletions
				
			
		|  | @ -44,22 +44,39 @@ groups: | |||
|     annotations: | ||||
|       summary: Host unusual network throughput out (instance {{ $labels.instance }}) | ||||
|       description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n  VALUE = {{ $value }}" | ||||
|   - alert: HostUnusualDiskReadRate | ||||
|     expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} | ||||
|   # Have different disk read and write rate alerts for VMs and physical machines. | ||||
|   - alert: VirtualHostUnusualDiskReadRate | ||||
|     expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"} | ||||
|     for: 5m | ||||
|     labels: | ||||
|       severity: warning | ||||
|     annotations: | ||||
|       summary: Host unusual disk read rate (instance {{ $labels.instance }}) | ||||
|       summary: Virtual host unusual disk read rate (instance {{ $labels.instance }}) | ||||
|       description: "Disk is probably reading too much data (> 50 MB/s)\n  VALUE = {{ $value }}" | ||||
|   - alert: HostUnusualDiskWriteRate | ||||
|     expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} | ||||
|   - alert: VirtualHostUnusualDiskWriteRate | ||||
|     expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"} | ||||
|     for: 2m | ||||
|     labels: | ||||
|       severity: warning | ||||
|     annotations: | ||||
|       summary: Host unusual disk write rate (instance {{ $labels.instance }}) | ||||
|       summary: Virtual host unusual disk write rate (instance {{ $labels.instance }}) | ||||
|       description: "Disk is probably writing too much data (> 50 MB/s)\n  VALUE = {{ $value }}" | ||||
|   - alert: PhysicalHostUnusualDiskReadRate | ||||
|     expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"} | ||||
|     for: 20m | ||||
|     labels: | ||||
|       severity: warning | ||||
|     annotations: | ||||
|       summary: Physical host unusual disk read rate (instance {{ $labels.instance }}) | ||||
|       description: "Disk is probably reading too much data (> 100 MB/s)\n  VALUE = {{ $value }}" | ||||
|   - alert: PhysicalHostUnusualDiskWriteRate | ||||
|     expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"} | ||||
|     for: 15m | ||||
|     labels: | ||||
|       severity: warning | ||||
|     annotations: | ||||
|       summary: Physical host unusual disk write rate (instance {{ $labels.instance }}) | ||||
|       description: "Disk is probably writing too much data (> 100 MB/s)\n  VALUE = {{ $value }}" | ||||
|   # Please add ignored mountpoints in node_exporter parameters like | ||||
|   # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". | ||||
|   # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. | ||||
|  | @ -156,13 +173,30 @@ groups: | |||
|     annotations: | ||||
|       summary: Host CPU high iowait (instance {{ $labels.instance }}) | ||||
|       description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n  VALUE = {{ $value }}" | ||||
|   - alert: HostUnusualDiskIo | ||||
|     expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} | ||||
|   # Have different disk IO alerts for VMs and physical machines and for physical machines different ones for hard and other disks. | ||||
|   - alert: PhysicalHostUnusualHardDiskIo | ||||
|     expr: (rate(node_disk_io_time_seconds_total{device=~"s.+"}[1m]) > 0.75) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"} | ||||
|     for: 5m | ||||
|     labels: | ||||
|       severity: warning | ||||
|     annotations: | ||||
|       summary: Host unusual disk IO (instance {{ $labels.instance }}) | ||||
|       summary: Physical host unusual hard disk IO (instance {{ $labels.instance }}) | ||||
|       description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n  VALUE = {{ $value }}" | ||||
|   - alert: PhysicalHostUnusualOtherDiskIo | ||||
|     expr: (rate(node_disk_io_time_seconds_total{device!~"s.+"}[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"} | ||||
|     for: 5m | ||||
|     labels: | ||||
|       severity: warning | ||||
|     annotations: | ||||
|       summary: Physical host unusual other (non-hard) disk IO (instance {{ $labels.instance }}) | ||||
|       description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n  VALUE = {{ $value }}" | ||||
|   - alert: VirtualHostUnusualDiskIo | ||||
|     expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"} | ||||
|     for: 5m | ||||
|     labels: | ||||
|       severity: warning | ||||
|     annotations: | ||||
|       summary: Virtual host unusual disk IO (instance {{ $labels.instance }}) | ||||
|       description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n  VALUE = {{ $value }}" | ||||
|   # # x2 context switches is an arbitrary number. | ||||
|   # # The alert threshold depends on the nature of the application. | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue