# NixOS module for the "monitoring" host.
#
# Wires together VictoriaMetrics (scraping + storage), vmalert (alert rules),
# Alertmanager (Telegram notifications), Grafana, Loki/promtail (logs),
# healthchecks, uptime-kuma and an nginx reverse proxy with ACME certificates.
{
  config,
  pkgs,
  ...
}: let
  # Relabel rules shared by the blackbox jobs: the scraped target becomes the
  # `target` URL parameter and the `instance` label, while the actual scrape
  # address is rewritten to the blackbox exporter given as `exporterAddress`.
  blackboxRelabel = exporterAddress: [
    {
      source_labels = ["__address__"];
      target_label = "__param_target";
    }
    {
      source_labels = ["__param_target"];
      target_label = "instance";
    }
    {
      source_labels = [];
      target_label = "__address__";
      replacement = exporterAddress;
    }
  ];

  # Prometheus-style scrape configuration. It is rendered to YAML below and
  # passed to VictoriaMetrics through `-promscrape.config`.
  prometheusConfig = {
    global.scrape_interval = "15s";
    scrape_configs = [
      {
        job_name = "proxmox";
        metrics_path = "/pve";
        params = {"target" = ["localhost"];};
        static_configs = [{targets = ["proxmox:9221"];}];
      }
      {
        job_name = "personal_hardware";
        static_configs = [
          {
            targets = [
              "london:9100"
              "vancouver:9100"
              "localhost:9100"
              "proxmox.scorpion-ghost.ts.net:9100"
            ];
          }
        ];
      }
      {
        job_name = "speedtest-exporter";
        scrape_interval = "1h";
        scrape_timeout = "1m";
        static_configs = [{targets = ["vancouver:9798"];}];
      }
      {
        job_name = "syncthing";
        static_configs = [{targets = ["vancouver:8384"];}];
      }
      {
        job_name = "forgejo";
        static_configs = [{targets = ["git.gmem.ca"];}];
      }
      {
        job_name = "healthchecks";
        scrape_interval = "60s";
        metrics_path = "/projects/5f1de50f-a52d-4215-961f-aae7cc6cf6c9/metrics/TbMoU7SUdknzMe-H5Q4HzmKl3itOIrJk";
        static_configs = [{targets = ["localhost:8000"];}];
      }
      {
        job_name = "dnsmasq";
        scrape_interval = "10s";
        static_configs = [{targets = ["100.102.19.124:9153" "100.92.113.87:9153"];}];
      }
      {
        job_name = "blackbox_home";
        metrics_path = "/probe";
        params = {"modules" = ["http_2xx"];};
        static_configs = [
          {
            labels = {
              location = "home";
            };
            targets = [
              "floofy.tech"
              "one.one.one.one"
              "1.1.1.1"
              "[2606:4700:4700::1111]"
              "waterwolf.club"
            ];
          }
        ];
        relabel_configs = blackboxRelabel "vancouver:9115";
      }
      {
        job_name = "blackbox_hetzner";
        metrics_path = "/probe";
        params = {"modules" = ["http_2xx"];};
        static_configs = [
          {
            labels = {
              location = "hetzner";
            };
            targets = [
              "floofy.tech"
              "one.one.one.one"
              "[2606:4700:4700::1111]"
              "waterwolf.club"
            ];
          }
        ];
        relabel_configs = blackboxRelabel "127.0.0.1:9115";
      }
      {
        job_name = "haproxy";
        scrape_interval = "10s";
        static_configs = [{targets = ["100.87.208.14:8404"];}];
      }
      {
        job_name = "tclip";
        scrape_interval = "15s";
        static_configs = [{targets = ["paste"];}];
      }
    ];
  };
in {
  imports = [
    ./hardware.nix
    ./networking.nix # generated at runtime by nixos-infect
  ];

  # --- Secrets (agenix) ------------------------------------------------------
  age.secrets.healthchecks-secret = {
    file = ../../secrets/monitoring-healthchecks-secret.age;
    owner = "healthchecks";
  };
  age.secrets.healthchecks-smtp = {
    file = ../../secrets/fastmail-smtp.age;
    owner = "healthchecks";
  };
  age.secrets.healthchecks-telegram = {
    file = ../../secrets/healthchecks-telegram.age;
    owner = "healthchecks";
  };
  # NOTE(review): the three secrets below stay world-readable, presumably
  # because the consuming services do not run as the declared owner - confirm
  # and tighten further if possible. Write and execute bits are not granted:
  # nothing in this module writes to or executes a decrypted secret.
  age.secrets.prometheus-webconfig-secret = {
    file = ../../secrets/monitoring-prometheus-webconfig.age;
    owner = "prometheus";
    mode = "0444";
  };
  age.secrets.prometheus-password-secret = {
    file = ../../secrets/monitoring-prometheus-password.age;
    owner = "vmalert";
    mode = "0444";
  };
  age.secrets.grafana-client-secret = {
    file = ../../secrets/monitoring-grafana-client-secret.age;
    owner = "grafana";
  };
  age.secrets.telegram = {
    file = ../../secrets/monitoring-telegram.age;
    owner = "prometheus";
    mode = "0444";
  };

  # --- Base system -----------------------------------------------------------
  nix.settings.auto-optimise-store = true;
  boot.tmp.cleanOnBoot = true;
  zramSwap.enable = true;
  networking.hostName = "monitoring";
  networking.domain = "";
  services.openssh.enable = true;
  users.users.root.openssh.authorizedKeys.keys = [
    ''ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBDjEgtIWPA5Ncs/KOcMeT6Q/HACJJetDOLjMvXXwUE+08oTX1EpHrWPpy8J+UHKIyErCNPYq8dgtrbhnMRlxHqI=''
  ];
  # NOTE(review): the host firewall is switched off entirely; presumably access
  # is restricted elsewhere (e.g. provider firewall / tailscale) - confirm.
  networking.firewall.enable = false;

  # --- Grafana (OAuth login via authentik) -----------------------------------
  services.grafana = {
    enable = true;
    settings = {
      feature_toggles = {
        publicDashboards = true;
      };
      log = {
        filters = "oauth.generic_oauth:debug";
      };
      server = {
        domain = "grafana.gmem.ca";
        http_port = 2342;
        http_addr = "127.0.0.1";
        root_url = "https://grafana.gmem.ca";
      };
      auth = {
        signout_redirect_url = "https://authentik.gmem.ca/application/o/grafana/end-session/";
        oauth_auto_login = true;
      };
      "auth.generic_oauth" = {
        name = "authentik";
        client_id = "VbOQzwuf0UK9AUGrWvaVaWWHvX2fJsZChxJNGt61";
        # Grafana reads the secret from the file at startup via $__file{...}.
        client_secret = "$__file{${config.age.secrets.grafana-client-secret.path}}";
        auth_url = "https://authentik.gmem.ca/application/o/authorize/";
        api_url = "https://authentik.gmem.ca/application/o/userinfo/";
        token_url = "https://authentik.gmem.ca/application/o/token/";
        enabled = true;
        scopes = "openid email grafana-user";
        role_attribute_path = "contains(info.groups[*], 'Grafana Admins') && 'Admin' || contains(info.groups[*], 'Grafana Editors') && 'Editor' || 'Viewer'";
        role_attribute_strict = true;
      };
    };
  };

  # --- Metrics storage and scraping ------------------------------------------
  services.victoriametrics = {
    enable = true;
    extraOptions = [
      "-promscrape.config=${(pkgs.formats.yaml {}).generate "scrape.yml" prometheusConfig}"
      "-selfScrapeInterval=10s"
      "-vmalert.proxyURL=http://localhost:8880"
    ];
  };

  # --- Alert rules -----------------------------------------------------------
  services.vmalert = {
    enable = true;
    settings = {
      "datasource.basicAuth.username" = "homelab";
      "datasource.basicAuth.passwordFile" = config.age.secrets.prometheus-password-secret.path;
      # listenAddress is appended directly after the host name, so it is
      # expected to carry its own leading ":".
      "datasource.url" = "http://localhost${builtins.toString config.services.victoriametrics.listenAddress}";
      "notifier.url" = [
        "http://localhost:${builtins.toString config.services.prometheus.alertmanager.port}"
      ];
      "remoteWrite.url" = "http://localhost:8428";
      "remoteRead.url" = "http://localhost:8428";
    };
    rules = {
      groups = [
        {
          name = "internet_connection";
          rules = [
            {
              alert = "LowInternetDownloadLast2Hours";
              expr = ''
                avg_over_time(speedtest_download_bits_per_second{}[2h]) <= 500000000
              '';
              for = "1h";
              labels = {
                severity = "low";
              };
              annotations = {
                summary = "Low Internet Speed in the Last 2 Hours";
                description = "Internet speed has been consistently below 500Mb/s over the last 2 hours.";
                dashboard = "o9mIe_Aik";
              };
            }
            {
              alert = "MetricNotScrapedLast2Hours";
              # absent_over_time() yields 1 when the series has no samples in
              # the window and nothing otherwise, so it is used directly as
              # the alert condition; comparing it against a larger number
              # could never match.
              expr = ''
                absent_over_time(speedtest_download_bits_per_second[2h])
              '';
              for = "2h";
              labels = {
                severity = "high";
              };
              annotations = {
                summary = "Metric Not Scraped in the Last 2 Hours";
                description = "The metric speedtest_download_bits_per_second has not been successfully scraped for over 2 hours.";
                dashboard = "o9mIe_Aik";
              };
            }
          ];
        }
        {
          name = "healthchecks";
          rules = [
            {
              alert = "HealthcheckFailedCheckin";
              expr = ''hc_check_up < 1'';
              for = "5m";
              labels.severity = "medium";
              annotations = {
                summary = "{{ $labels.name }} healthcheck failed";
                description = "The {{ $labels.name }} healthcheck failed to check in.";
                dashboard = "f594ea85-45f2-4019-b988-2d17638b5cf3";
              };
            }
          ];
        }
        {
          name = "kubernetes_node_exporter_memory_alerts";
          rules = [
            {
              alert = "HighKubernetesMemoryUsage";
              expr = ''
                (sum by(instance) (node_memory_MemTotal_bytes{job="node-exporter"}) - sum by(instance) (node_memory_MemAvailable_bytes{job="node-exporter"})) / sum by(instance) (node_memory_MemTotal_bytes{job="node-exporter"}) * 100 > 80
              '';
              for = "5m";
              labels = {
                severity = "medium";
              };
              annotations = {
                summary = "High Memory Usage on Kubernetes Node {{ $labels.instance }}";
                description = "Memory usage is above 80% on instance {{ $labels.instance }}. Current value: {{ $value }}%";
              };
            }
          ];
        }
        {
          name = "kubernetes_node_exporter_cpu_alerts";
          rules = [
            {
              alert = "HighKubernetesCPUUsage";
              expr = ''
                sum by(instance) (100 - avg by(instance) (irate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[5m])) * 100) > 80
              '';
              for = "5m";
              labels = {
                severity = "medium";
              };
              annotations = {
                summary = "High CPU Usage on Kubernetes Node {{ $labels.instance }}";
                description = "CPU usage is above 80% on instance {{ $labels.instance }}.";
              };
            }
          ];
        }
        {
          name = "kubernetes_pod_restart_loop";
          rules = [
            {
              alert = "KubernetesPodRestartLoop";
              expr = ''changes(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) > 3'';
              for = "15m";
              labels = {
                severity = "medium";
              };
              annotations = {
                summary = "Kubernetes Pod Restart Loop: {{ $labels.namespace }}/{{ $labels.pod }}";
                description = "Pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting frequently. Number of restarts: {{ $value }}.";
                dashboard = "k8s_views_pods";
              };
            }
          ];
        }
        {
          name = "postgresql_database_connectivity";
          rules = [
            {
              alert = "PostgreSQLDatabaseConnectivity";
              expr = ''pg_up == 0'';
              for = "5m";
              labels = {
                severity = "high";
              };
              annotations = {
                summary = "PostgreSQL Database Connectivity Issue";
                description = "PostgreSQL exporter cannot connect to database.";
                dashboard = "wGgaPlciz";
              };
            }
          ];
        }
        {
          name = "proxmox_status";
          rules = [
            {
              alert = "ProxmoxOffline";
              expr = ''up{job="proxmox"} == 0'';
              for = "1m";
              labels = {
                severity = "high";
              };
              annotations = {
                summary = "Proxmox offline";
                description = "Proxmox exporter scrapes are failing. Proxmox is likely offline.";
                dashboard = "Dp7Cd57Zza";
              };
            }
          ];
        }
      ];
    };
  };

  # --- Logs: Loki (single node, filesystem storage) --------------------------
  services.loki = {
    enable = true;
    configuration = {
      server.http_listen_port = 3030;
      auth_enabled = false;

      ingester = {
        lifecycler = {
          address = "127.0.0.1";
          ring = {
            kvstore = {
              store = "inmemory";
            };
            replication_factor = 1;
          };
        };
        chunk_idle_period = "1h";
        max_chunk_age = "1h";
        chunk_target_size = 999999;
        chunk_retain_period = "30s";
      };

      # Two schema periods: boltdb-shipper/v11 from 2022-06-06 and tsdb/v13
      # from 2024-05-01.
      schema_config = {
        configs = [
          {
            from = "2022-06-06";
            store = "boltdb-shipper";
            object_store = "filesystem";
            schema = "v11";
            index = {
              prefix = "index_";
              period = "24h";
            };
          }
          {
            from = "2024-05-01";
            store = "tsdb";
            object_store = "filesystem";
            schema = "v13";
            index = {
              prefix = "index_";
              period = "24h";
            };
          }
        ];
      };

      storage_config = {
        boltdb_shipper = {
          active_index_directory = "/var/lib/loki/boltdb-shipper-active";
          cache_location = "/var/lib/loki/boltdb-shipper-cache";
          cache_ttl = "24h";
        };
        tsdb_shipper = {
          active_index_directory = "/var/lib/loki/tsdb-shipper-active";
          cache_location = "/var/lib/loki/tsdb-shipper-cache";
          cache_ttl = "24h";
        };
        filesystem = {
          directory = "/var/lib/loki/chunks";
        };
      };

      limits_config = {
        reject_old_samples = true;
        reject_old_samples_max_age = "168h";
      };

      table_manager = {
        retention_deletes_enabled = false;
        retention_period = "0s";
      };

      compactor = {
        working_directory = "/var/lib/loki";
        compactor_ring = {
          kvstore = {
            store = "inmemory";
          };
        };
      };
    };
  };

  # --- Logs: promtail ships the systemd journal to the local Loki ------------
  services.promtail = {
    enable = true;
    configuration = {
      server = {
        http_listen_port = 3031;
        grpc_listen_port = 0;
      };
      positions = {
        # NOTE(review): /tmp is cleaned on boot (boot.tmp.cleanOnBoot above),
        # so the positions file does not survive a reboot; consider a
        # persistent path if re-reading the journal after boot is a problem.
        filename = "/tmp/positions.yaml";
      };
      clients = [
        {
          url = "http://127.0.0.1:${toString config.services.loki.configuration.server.http_listen_port}/loki/api/v1/push";
        }
      ];
      scrape_configs = [
        {
          job_name = "journal";
          journal = {
            max_age = "12h";
            labels = {
              job = "systemd-journal";
              host = "monitoring";
            };
          };
          relabel_configs = [
            {
              source_labels = ["__journal__systemd_unit"];
              target_label = "unit";
            }
          ];
        }
      ];
    };
  };

  # --- Alert delivery --------------------------------------------------------
  services.alertmanager-ntfy = {
    enable = true;
    settings = {
      http = {
        addr = "127.0.0.1:8111";
      };
      ntfy = {
        baseurl = "https://ntfy.gmem.ca";
        notification = {
          topic = "alerts";
          priority = ''
            status == "firing" ? "high" : "default"
          '';
          templates = {
            title = ''{{ if eq .Status "resolved" }}Resolved: {{ end }}{{ index .Annotations "summary" }}'';
            description = ''{{ index .Annotations "description" }}{{ if ne (index .Annotations "dashboard") "" }} | https://grafana.gmem.ca/d/{{ index .Annotations "dashboard" }}{{ end }}'';
          };
        };
      };
    };
  };

  services.prometheus = {
    alertmanager = {
      enable = true;
      # The message template iterates over firing and resolved alerts
      # separately; inside each `range` the dot is a single alert, so the name
      # and severity are read from its labels. Every `range`/`if` is closed by
      # its own `end`.
      configText = ''
        global: {}

        # The directory from which notification templates are read.
        templates:
          - '/etc/alertmanager/template/*.tmpl'

        # The root route on which each incoming alert enters.
        route:
          group_by: ['alertname', 'cluster', 'service']
          receiver: telegram

        receivers:
          - name: telegram
            telegram_configs:
              - bot_token_file: ${config.age.secrets.telegram.path}
                chat_id: 682019253
                message: |
                  {{ range .Alerts.Firing }}
                  [ FIRING ]
                  Alertname: {{ .Labels.alertname }}
                  Severity: {{ .Labels.severity }}
                  {{ .Annotations.description }}{{ if ne .Annotations.dashboard "" }}
                  https://grafana.gmem.ca/d/{{ .Annotations.dashboard }}{{ end }}
                  {{ end }}
                  {{ range .Alerts.Resolved }}
                  [ RESOLVED ]
                  Alertname: {{ .Labels.alertname }}
                  Severity: {{ .Labels.severity }}
                  {{ .Annotations.description }}{{ if ne .Annotations.dashboard "" }}
                  https://grafana.gmem.ca/d/{{ .Annotations.dashboard }}{{ end }}
                  {{ end }}
      '';
    };
    exporters.node = {
      enable = true;
      listenAddress = "127.0.0.1";
      enabledCollectors = [
        "systemd"
        "processes"
      ];
    };
    exporters.blackbox = {
      enable = true;
      configFile = "/var/lib/blackbox/config.yml";
    };
  };

  services.tailscale.enable = true;

  # --- Healthchecks and uptime-kuma ------------------------------------------
  services.healthchecks = {
    enable = true;
    # package = healthchecks-edge;
    settings = {
      SECRET_KEY_FILE = config.age.secrets.healthchecks-secret.path;
      SITE_ROOT = "https://healthchecks.gmem.ca";
      SITE_NAME = "Archs Healthchecks";
      EMAIL_HOST = "smtp.fastmail.com";
      EMAIL_HOST_PASSWORD_FILE = config.age.secrets.healthchecks-smtp.path;
      EMAIL_HOST_USER = "g@gmem.ca";
      DEFAULT_FROM_EMAIL = "healthchecks@gmem.ca";
      TELEGRAM_BOT_NAME = "arch_healthchecks_bot";
      TELEGRAM_TOKEN_FILE = config.age.secrets.healthchecks-telegram.path;
    };
  };

  services.uptime-kuma = {
    enable = true;
    settings = {
      PORT = "4000";
    };
  };

  # nginx reverse proxy
  services.nginx = {
    enable = true;
    recommendedGzipSettings = true;
    recommendedBrotliSettings = true;
    recommendedZstdSettings = true;
    recommendedOptimisation = true;
    recommendedTlsSettings = true;
    recommendedProxySettings = true;

    # Grafana is the default virtual host.
    virtualHosts.${config.services.grafana.settings.server.domain} = {
      default = true;
      enableACME = true;
      forceSSL = true;
      locations."/" = {
        proxyPass = "http://127.0.0.1:${toString config.services.grafana.settings.server.http_port}";
        proxyWebsockets = true;
      };
    };

    # VictoriaMetrics, protected by HTTP basic auth.
    virtualHosts."metrics.gmem.ca" = {
      enableACME = true;
      forceSSL = true;
      locations."/" = {
        # A value of 0 disables nginx's request body size check.
        extraConfig = ''
          client_max_body_size 0;
        '';
        basicAuthFile = "/var/lib/htpw";
        proxyPass = "http://127.0.0.1${toString config.services.victoriametrics.listenAddress}";
        proxyWebsockets = true;
      };
    };

    virtualHosts."healthchecks.gmem.ca" = {
      enableACME = true;
      forceSSL = true;
      locations."/" = {
        proxyPass = "http://127.0.0.1:8000";
        proxyWebsockets = true;
      };
    };

    # Both names below proxy to the same uptime-kuma instance.
    virtualHosts."uptime.gmem.ca" = {
      enableACME = true;
      forceSSL = true;
      locations."/" = {
        proxyPass = "http://127.0.0.1:${toString config.services.uptime-kuma.settings.PORT}";
        proxyWebsockets = true;
      };
    };
    virtualHosts."status.floofy.tech" = {
      enableACME = true;
      forceSSL = true;
      locations."/" = {
        proxyPass = "http://127.0.0.1:${toString config.services.uptime-kuma.settings.PORT}";
        proxyWebsockets = true;
      };
    };
  };

  security.acme.acceptTerms = true;
  security.acme.defaults.email = "acme@gmem.ca";

  environment.systemPackages = with pkgs; [victoriametrics prometheus htop];

  system.stateVersion = "23.11";
}