From e7d531760d4be2e3ffcda83303e34f49a29eaba4 Mon Sep 17 00:00:00 2001
From: c6ristian <c6ristian@christian.moe>
Date: Fri, 14 Mar 2025 23:06:10 +0100
Subject: [PATCH] alertmanager

---
 modules/alloy.nix                             | 16 +++-
 .../alerts/resources.yaml                     | 89 +++++++++++++++++++
 .../monitoring.noc.eh22.intern/grafana.nix    | 13 ++-
 systems/monitoring.noc.eh22.intern/mimir.nix  | 25 ++++++
 systems/monitoring.noc.eh22.intern/system.nix |  6 ++
 5 files changed, 146 insertions(+), 3 deletions(-)
 create mode 100644 systems/monitoring.noc.eh22.intern/alerts/resources.yaml

diff --git a/modules/alloy.nix b/modules/alloy.nix
index d5c3c6d..19d2c52 100644
--- a/modules/alloy.nix
+++ b/modules/alloy.nix
@@ -54,7 +54,19 @@
 
         rule {
           source_labels = ["__journal__systemd_unit"]
-          target_label  = "unit"
+          target_label  = "systemd_unit" // NOTE(review): renamed from unit; queries and dashboards selecting on the old label need updating
+        }
+        rule {
+          source_labels = ["__journal__hostname"]
+          target_label = "systemd_hostname"
+        }
+        rule {
+          source_labels = ["__journal__transport"]
+          target_label = "systemd_transport"
+        }
+        rule {
+          source_labels = ["__journal_syslog_identifier"] // single underscore is presumably intentional: journal field SYSLOG_IDENTIFIER has no leading underscore, unlike _HOSTNAME and _TRANSPORT
+          target_label = "syslog_identifier"
         }
       }
 
@@ -65,7 +77,7 @@
       }
 
       logging {
-        level = "info"
+        level = "warn" // lowered from info: only warnings and errors are emitted by this logging block
         format = "logfmt"
       }
 
diff --git a/systems/monitoring.noc.eh22.intern/alerts/resources.yaml b/systems/monitoring.noc.eh22.intern/alerts/resources.yaml
new file mode 100644
index 0000000..5804cc3
--- /dev/null
+++ b/systems/monitoring.noc.eh22.intern/alerts/resources.yaml
@@ -0,0 +1,89 @@
+groups:
+    - name: Host & hardware # every expr below is joined with node_uname_info via group_left to attach the nodename label
+      rules:
+        - alert: HostOutOfMemory # MemAvailable below 10% of MemTotal
+          expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on (instance) group_left (nodename) node_uname_info{nodename=~".+"}
+          for: 2m # condition must hold for 2 minutes before the alert fires
+          labels:
+            severity: warning
+          annotations:
+            description: |-
+                Node memory is filling up (< 10% left)
+                  VALUE = {{ $value }}
+                  LABELS = {{ $labels }}
+            summary: Host out of memory (instance {{ $labels.instance }})
+        - alert: HostMemoryUnderMemoryPressure # more than 1000 major page faults per second, measured as a 1m rate
+          expr: (rate(node_vmstat_pgmajfault[1m]) > 1000) * on (instance) group_left (nodename) node_uname_info{nodename=~".+"}
+          for: 2m # condition must hold for 2 minutes before the alert fires
+          labels:
+            severity: warning
+          annotations:
+            description: |-
+                The node is under heavy memory pressure. High rate of major page faults
+                  VALUE = {{ $value }}
+                  LABELS = {{ $labels }}
+            summary: Host memory under memory pressure (instance {{ $labels.instance }})
+        - alert: HostOutOfDiskSpace # under 10% space available; only filesystems with node_filesystem_readonly == 0 are considered
+          expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and on (instance, device, mountpoint) node_filesystem_readonly == 0) * on (instance) group_left (nodename) node_uname_info{nodename=~".+"}
+          for: 2m # condition must hold for 2 minutes before the alert fires
+          labels:
+            severity: warning
+          annotations:
+            description: |-
+                Disk is almost full (< 10% left)
+                  VALUE = {{ $value }}
+                  LABELS = {{ $labels }}
+            summary: Host out of disk space (instance {{ $labels.instance }})
+        - alert: HostDiskWillFillIn24Hours # requires BOTH under 10% available now AND a 1h linear trend that goes below zero within 24 * 3600 s; tmpfs is excluded from the trend; read-only filesystems are ignored
+          expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and on (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and on (instance, device, mountpoint) node_filesystem_readonly == 0) * on (instance) group_left (nodename) node_uname_info{nodename=~".+"}
+          for: 2m # condition must hold for 2 minutes before the alert fires
+          labels:
+            severity: warning
+          annotations:
+            description: |-
+                Filesystem is predicted to run out of space within the next 24 hours at current write rate
+                  VALUE = {{ $value }}
+                  LABELS = {{ $labels }}
+            summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
+        - alert: HostCpuIsUnderutilized # non-idle share (30m rate) below 20%; the expr has no aggregation, so it is evaluated per node_cpu_seconds_total series rather than per host
+          expr: (100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on (instance) group_left (nodename) node_uname_info{nodename=~".+"}
+          for: 1w # must hold continuously for one week before firing
+          labels:
+            severity: info
+          annotations:
+            description: |-
+                CPU load is < 20% for 1 week. Consider reducing the number of CPUs.
+                  VALUE = {{ $value }}
+                  LABELS = {{ $labels }}
+            summary: Host CPU is underutilized (instance {{ $labels.instance }})
+        - alert: HostCpuStealNoisyNeighbor # steal-mode rate (5m) averaged per instance above 10%; there is no "for:" clause, so no hold time applies
+          expr: (avg by (instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on (instance) group_left (nodename) node_uname_info{nodename=~".+"}
+          labels:
+            severity: warning
+          annotations:
+            description: |-
+                CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.
+                  VALUE = {{ $value }}
+                  LABELS = {{ $labels }}
+            summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
+        - alert: HostOomKillDetected # any increase of the OOM-kill counter within the last minute; there is no "for:" clause, so no hold time applies
+          expr: (increase(node_vmstat_oom_kill[1m]) > 0) * on (instance) group_left (nodename) node_uname_info{nodename=~".+"}
+          labels:
+            severity: warning
+          annotations:
+            description: |-
+                OOM kill detected
+                  VALUE = {{ $value }}
+                  LABELS = {{ $labels }}
+            summary: Host OOM kill detected (instance {{ $labels.instance }})
+        - alert: HostNetworkInterfaceSaturated # (rx + tx byte rate) / link speed above 0.8; tap, vnet, veth and tun devices are excluded; the extra "< 10000" bound presumably discards bogus ratios from misreported link speeds
+          expr: ((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on (instance) group_left (nodename) node_uname_info{nodename=~".+"}
+          for: 1m # condition must hold for 1 minute before the alert fires
+          labels:
+            severity: warning
+          annotations:
+            description: |-
+                The network interface "{{ $labels.device }}" on "{{ $labels.instance }}" is getting overloaded.
+                  VALUE = {{ $value }}
+                  LABELS = {{ $labels }}
+            summary: Host Network Interface Saturated (instance {{ $labels.instance }})
diff --git a/systems/monitoring.noc.eh22.intern/grafana.nix b/systems/monitoring.noc.eh22.intern/grafana.nix
index 49c2e05..4128294 100644
--- a/systems/monitoring.noc.eh22.intern/grafana.nix
+++ b/systems/monitoring.noc.eh22.intern/grafana.nix
@@ -41,7 +41,18 @@
             name = "Loki";
             type = "loki";
             uid = "loki";
-            url = "http://127.0.0.1:${toString config.services.loki.configuration.server.http_listen_port}";
+            url = "http://127.0.0.1:${toString config.services.loki.configuration.server.http_listen_port}/";
+          }
+          {
+            name = "Mimir Alertmanager"; # Alertmanager datasource backed by the local Mimir instance
+            type = "alertmanager";
+            uid = "mimir-alertmanager";
+            access = "proxy";
+            url = "http://127.0.0.1:${toString config.services.mimir.configuration.server.http_listen_port}/"; # must be Mimir's HTTP port, not Loki's; assumes mimir.nix sets server.http_listen_port - TODO confirm
+            jsonData = {
+              handleGrafanaManagedAlerts = true; # Grafana-managed alerts are also routed to this Alertmanager
+              implementation = "mimir";
+            };
           }
         ];
       };
diff --git a/systems/monitoring.noc.eh22.intern/mimir.nix b/systems/monitoring.noc.eh22.intern/mimir.nix
index cb9ebc0..1939e17 100644
--- a/systems/monitoring.noc.eh22.intern/mimir.nix
+++ b/systems/monitoring.noc.eh22.intern/mimir.nix
@@ -1,8 +1,23 @@
 {
   pkgs,
+  lib,
   config,
   ...
 }:
+let
+  inherit (lib) mkEnableOption mkIf; # NOTE(review): neither name is used in the hunks of this patch - confirm they are needed
+  alerts = # store path holding the rule files from ./alerts after validation
+    pkgs.runCommand "mimir-alerts-checked" # runs promtool over every file in ./alerts, then copies the directory to $out/anonymous/
+      {
+        src = ./alerts;
+        nativeBuildInputs = with pkgs; [ prometheus.cli ]; # presumably the output that ships promtool - TODO confirm
+      }
+      ''
+        promtool check rules $src/*
+        mkdir $out
+        cp -R $src $out/anonymous/
+      '';
+in
 {
   services.mimir = {
     enable = true;
@@ -28,6 +43,16 @@
         max_global_series_per_user = 0; # unlimited
         max_label_value_length = 10000; # we have pgscv queries that are LONG
       };
+
+      alertmanager_storage.backend = "filesystem"; # Alertmanager storage uses the local filesystem backend
+      alertmanager = {
+        sharding_ring.replication_factor = 1; # presumably a single-instance deployment - TODO confirm
+      };
+      ruler_storage = { # NOTE(review): no ruler.alertmanager_url is set in this hunk - confirm the ruler knows where to send firing alerts
+        backend = "local";
+        local.directory = alerts; # the promtool-checked store path from the let-binding at the top of this file
+      };
+
     };
   };
 
diff --git a/systems/monitoring.noc.eh22.intern/system.nix b/systems/monitoring.noc.eh22.intern/system.nix
index 2dd20fd..7c1c8ae 100644
--- a/systems/monitoring.noc.eh22.intern/system.nix
+++ b/systems/monitoring.noc.eh22.intern/system.nix
@@ -48,6 +48,12 @@
     ./loki.nix
   ];
 
+  services.nginx = { # NOTE(review): no virtualHosts are defined in this hunk - presumably declared in another module
+    enable = true;
+    recommendedProxySettings = true;
+    logError = "syslog:server=unix:/dev/log,nohostname"; # error log is sent to the local syslog socket instead of a file
+  };
+
   # DO NOT CHANGE
   # this defines the first version of NixOS that was installed on the machine so that programs with non-migratable data files are kept compatible
   home-manager.users.noc.home.stateVersion = "24.11";