forked from CCCHH/ansible-infra
		
	Merge branch 'main' of git.hamburg.ccc.de:CCCHH/ansible-infra
This commit is contained in:
		
				commit
				
					
						a35fcc13cf
					
				
			
		
					 3 changed files with 44 additions and 12 deletions
				
			
		| 
						 | 
				
			
			@ -49,8 +49,6 @@ scrape_configs:
 | 
			
		|||
    - mumble.hamburg.ccc.de:443
 | 
			
		||||
- job_name: opnsense-ccchh
 | 
			
		||||
  honor_timestamps: true
 | 
			
		||||
  scrape_interval: 5s
 | 
			
		||||
  scrape_timeout: 1s
 | 
			
		||||
  metrics_path: /metrics
 | 
			
		||||
  scheme: http
 | 
			
		||||
  static_configs:
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -44,22 +44,39 @@ groups:
 | 
			
		|||
    annotations:
 | 
			
		||||
      summary: Host unusual network throughput out (instance {{ $labels.instance }})
 | 
			
		||||
      description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n  VALUE = {{ $value }}"
 | 
			
		||||
  - alert: HostUnusualDiskReadRate
 | 
			
		||||
    expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
 | 
			
		||||
  # Have different disk read and write rate alerts for VMs and physical machines.
 | 
			
		||||
  - alert: VirtualHostUnusualDiskReadRate
 | 
			
		||||
    expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"}
 | 
			
		||||
    for: 5m
 | 
			
		||||
    labels:
 | 
			
		||||
      severity: warning
 | 
			
		||||
    annotations:
 | 
			
		||||
      summary: Host unusual disk read rate (instance {{ $labels.instance }})
 | 
			
		||||
      summary: Virtual host unusual disk read rate (instance {{ $labels.instance }})
 | 
			
		||||
      description: "Disk is probably reading too much data (> 50 MB/s)\n  VALUE = {{ $value }}"
 | 
			
		||||
  - alert: HostUnusualDiskWriteRate
 | 
			
		||||
    expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
 | 
			
		||||
  - alert: VirtualHostUnusualDiskWriteRate
 | 
			
		||||
    expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"}
 | 
			
		||||
    for: 2m
 | 
			
		||||
    labels:
 | 
			
		||||
      severity: warning
 | 
			
		||||
    annotations:
 | 
			
		||||
      summary: Host unusual disk write rate (instance {{ $labels.instance }})
 | 
			
		||||
      summary: Virtual host unusual disk write rate (instance {{ $labels.instance }})
 | 
			
		||||
      description: "Disk is probably writing too much data (> 50 MB/s)\n  VALUE = {{ $value }}"
 | 
			
		||||
  - alert: PhysicalHostUnusualDiskReadRate
 | 
			
		||||
    expr: (sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
 | 
			
		||||
    for: 20m
 | 
			
		||||
    labels:
 | 
			
		||||
      severity: warning
 | 
			
		||||
    annotations:
 | 
			
		||||
      summary: Physical host unusual disk read rate (instance {{ $labels.instance }})
 | 
			
		||||
      description: "Disk is probably reading too much data (> 100 MB/s)\n  VALUE = {{ $value }}"
 | 
			
		||||
  - alert: PhysicalHostUnusualDiskWriteRate
 | 
			
		||||
    expr: (sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
 | 
			
		||||
    for: 15m
 | 
			
		||||
    labels:
 | 
			
		||||
      severity: warning
 | 
			
		||||
    annotations:
 | 
			
		||||
      summary: Physical host unusual disk write rate (instance {{ $labels.instance }})
 | 
			
		||||
      description: "Disk is probably writing too much data (> 100 MB/s)\n  VALUE = {{ $value }}"
 | 
			
		||||
  # Please add ignored mountpoints in node_exporter parameters like
 | 
			
		||||
  # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
 | 
			
		||||
  # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
 | 
			
		||||
| 
						 | 
				
			
			@ -156,13 +173,30 @@ groups:
 | 
			
		|||
    annotations:
 | 
			
		||||
      summary: Host CPU high iowait (instance {{ $labels.instance }})
 | 
			
		||||
      description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n  VALUE = {{ $value }}"
 | 
			
		||||
  - alert: HostUnusualDiskIo
 | 
			
		||||
    expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
 | 
			
		||||
  # Use separate disk IO alerts for VMs and physical machines; on physical machines, additionally distinguish hard disks from other disks.
 | 
			
		||||
  - alert: PhysicalHostUnusualHardDiskIo
 | 
			
		||||
    expr: (rate(node_disk_io_time_seconds_total{device=~"s.+"}[1m]) > 0.75) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
 | 
			
		||||
    for: 5m
 | 
			
		||||
    labels:
 | 
			
		||||
      severity: warning
 | 
			
		||||
    annotations:
 | 
			
		||||
      summary: Host unusual disk IO (instance {{ $labels.instance }})
 | 
			
		||||
      summary: Physical host unusual hard disk IO (instance {{ $labels.instance }})
 | 
			
		||||
      description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n  VALUE = {{ $value }}"
 | 
			
		||||
  - alert: PhysicalHostUnusualOtherDiskIo
 | 
			
		||||
    expr: (rate(node_disk_io_time_seconds_total{device!~"s.+"}[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="physical_machine", nodename=~".+"}
 | 
			
		||||
    for: 5m
 | 
			
		||||
    labels:
 | 
			
		||||
      severity: warning
 | 
			
		||||
    annotations:
 | 
			
		||||
      summary: Physical host unusual other (non-hard) disk IO (instance {{ $labels.instance }})
 | 
			
		||||
      description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n  VALUE = {{ $value }}"
 | 
			
		||||
  - alert: VirtualHostUnusualDiskIo
 | 
			
		||||
    expr: (rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{type="virtual_machine", nodename=~".+"}
 | 
			
		||||
    for: 5m
 | 
			
		||||
    labels:
 | 
			
		||||
      severity: warning
 | 
			
		||||
    annotations:
 | 
			
		||||
      summary: Virtual host unusual disk IO (instance {{ $labels.instance }})
 | 
			
		||||
      description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n  VALUE = {{ $value }}"
 | 
			
		||||
  # # x2 context switches is an arbitrary number.
 | 
			
		||||
  # # The alert threshold depends on the nature of the application.
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -22,7 +22,7 @@
 | 
			
		|||
 | 
			
		||||
services:
 | 
			
		||||
  keycloak:
 | 
			
		||||
    image: git.hamburg.ccc.de/ccchh/oci-images/keycloak:25.0
 | 
			
		||||
    image: git.hamburg.ccc.de/ccchh/oci-images/keycloak:26.0
 | 
			
		||||
    pull_policy: always
 | 
			
		||||
    restart: unless-stopped
 | 
			
		||||
    command: start --optimized
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue