infra/nix/monitoring/configuration.nix

649 lines
19 KiB
Nix
Raw Normal View History

{
2024-02-05 13:13:44 +00:00
config,
pkgs,
...
}:
let
# Scrape configuration. It is rendered to YAML and handed to VictoriaMetrics
# through `-promscrape.config` further down in this file.
prometheusConfig = let
  # Builds one blackbox-exporter probe job. Every probe job has the same shape:
  # the listed targets are moved into the `target` query parameter, copied to
  # the `instance` label, and the scrape address is rewritten to the exporter.
  #   name     - job_name of the scrape job
  #   location - value of the static `location` label
  #   exporter - host:port of the blackbox exporter that performs the probe
  #   targets  - endpoints to probe
  blackboxJob = { name, location, exporter, targets }: {
    job_name = name;
    metrics_path = "/probe";
    # blackbox_exporter reads the probe module from the `module` query
    # parameter. The previous key `modules` was ignored, so probes only worked
    # through the exporter's fallback to "http_2xx".
    params = {"module" = ["http_2xx"];};
    static_configs = [
      {
        labels = {
          inherit location;
        };
        inherit targets;
      }
    ];
    relabel_configs = [
      {
        source_labels = ["__address__"];
        target_label = "__param_target";
      }
      {
        source_labels = ["__param_target"];
        target_label = "instance";
      }
      {
        # No source labels: unconditionally point the scrape at the exporter.
        source_labels = [];
        target_label = "__address__";
        replacement = exporter;
      }
    ];
  };
in {
  global.scrape_interval = "15s";
  scrape_configs = [
    {
      job_name = "proxmox";
      metrics_path = "/pve";
      params = {"target" = ["localhost"];};
      static_configs = [{targets = ["proxmox:9221"];}];
    }
    {
      job_name = "personal_hardware";
      static_configs = [{targets = ["london:9100" "vancouver:9100" "localhost:9100" "proxmox.scorpion-ghost.ts.net:9100"];}];
    }
    {
      job_name = "speedtest-exporter";
      scrape_interval = "1h";
      scrape_timeout = "1m";
      static_configs = [{targets = ["vancouver:9798"];}];
    }
    {
      job_name = "syncthing";
      static_configs = [{targets = ["vancouver:8384"];}];
    }
    {
      job_name = "forgejo";
      static_configs = [{targets = ["git.gmem.ca"];}];
    }
    {
      job_name = "healthchecks";
      scrape_interval = "60s";
      # NOTE(review): the last path segment looks like an API key committed in
      # clear text - confirm whether it should live in a secret instead.
      metrics_path = "/projects/5f1de50f-a52d-4215-961f-aae7cc6cf6c9/metrics/TbMoU7SUdknzMe-H5Q4HzmKl3itOIrJk";
      static_configs = [{targets = ["localhost:8000"];}];
    }
    {
      job_name = "dnsmasq";
      scrape_interval = "10s";
      static_configs = [{targets = ["100.102.19.124:9153" "100.92.113.87:9153"];}];
    }
    (blackboxJob {
      name = "blackbox_home";
      location = "home";
      exporter = "vancouver:9115";
      targets = ["floofy.tech" "one.one.one.one" "1.1.1.1" "[2606:4700:4700::1111]" "waterwolf.club"];
    })
    (blackboxJob {
      name = "blackbox_hetzner";
      location = "hetzner";
      exporter = "127.0.0.1:9115";
      targets = ["floofy.tech" "one.one.one.one" "[2606:4700:4700::1111]" "waterwolf.club"];
    })
    {
      job_name = "haproxy";
      scrape_interval = "10s";
      static_configs = [{targets = ["100.87.208.14:8404"];}];
    }
    {
      job_name = "tclip";
      scrape_interval = "15s";
      static_configs = [{targets = ["paste"];}];
    }
  ];
};
in
{
2023-09-05 17:08:43 +01:00
imports = [
./hardware.nix
./networking.nix # generated at runtime by nixos-infect
];
2023-09-13 23:01:51 +01:00
# agenix-managed secrets. Each entry names the encrypted source file and the
# user that owns the decrypted copy.
age.secrets.healthchecks-secret = {
  file = ../../secrets/monitoring-healthchecks-secret.age;
  owner = "healthchecks";
};
age.secrets.healthchecks-smtp = {
  file = ../../secrets/fastmail-smtp.age;
  owner = "healthchecks";
};
age.secrets.healthchecks-telegram = {
  file = ../../secrets/healthchecks-telegram.age;
  owner = "healthchecks";
};

# The three secrets below were previously decrypted with mode 775/777, i.e.
# writable and executable by users other than the owner. Consumers only need
# to read them, so the write and execute bits are dropped.
# NOTE(review): they remain world-readable, presumably because the consuming
# service does not run as `owner` - confirm, and tighten to 0400 if it does.
age.secrets.prometheus-webconfig-secret = {
  file = ../../secrets/monitoring-prometheus-webconfig.age;
  owner = "prometheus";
  mode = "0444";
};
age.secrets.prometheus-password-secret = {
  file = ../../secrets/monitoring-prometheus-password.age;
  owner = "vmalert";
  mode = "0444";
};
age.secrets.grafana-client-secret = {
  file = ../../secrets/monitoring-grafana-client-secret.age;
  owner = "grafana";
};
age.secrets.telegram = {
  file = ../../secrets/monitoring-telegram.age;
  owner = "prometheus";
  mode = "0444";
};
# --- Host housekeeping -------------------------------------------------------
nix.settings.auto-optimise-store = true;
boot.tmp.cleanOnBoot = true;
zramSwap.enable = true;

# --- Network identity --------------------------------------------------------
networking.hostName = "monitoring";
networking.domain = "";
# NOTE(review): with the firewall off, any service in this file that does not
# bind to 127.0.0.1 explicitly is reachable on every interface - confirm that
# this is intended.
networking.firewall.enable = false;

# --- Remote access -----------------------------------------------------------
services.openssh.enable = true;
users.users.root.openssh.authorizedKeys.keys = [
  "ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBDjEgtIWPA5Ncs/KOcMeT6Q/HACJJetDOLjMvXXwUE+08oTX1EpHrWPpy8J+UHKIyErCNPYq8dgtrbhnMRlxHqI="
];
# Grafana, bound to loopback and published through the nginx virtual host
# defined later in this file. Sign-in goes through the authentik OAuth app.
services.grafana = {
  enable = true;
  settings = let
    grafanaDomain = "grafana.gmem.ca";
    # Common prefix of every authentik OAuth endpoint used below.
    authentik = "https://authentik.gmem.ca/application/o";
  in {
    feature_toggles.publicDashboards = true;
    # Debug-level logging for the generic OAuth provider only.
    log.filters = "oauth.generic_oauth:debug";
    server = {
      domain = grafanaDomain;
      root_url = "https://${grafanaDomain}";
      http_addr = "127.0.0.1";
      http_port = 2342;
    };
    auth = {
      oauth_auto_login = true;
      signout_redirect_url = "${authentik}/grafana/end-session/";
    };
    "auth.generic_oauth" = {
      enabled = true;
      name = "authentik";
      client_id = "VbOQzwuf0UK9AUGrWvaVaWWHvX2fJsZChxJNGt61";
      # Grafana's $__file{...} syntax: the value is read from the decrypted
      # secret file instead of being embedded in the configuration.
      client_secret = "$__file{${config.age.secrets.grafana-client-secret.path}}";
      auth_url = "${authentik}/authorize/";
      token_url = "${authentik}/token/";
      api_url = "${authentik}/userinfo/";
      scopes = "openid email grafana-user";
      # Maps authentik group membership to a Grafana role; anyone outside the
      # two named groups becomes a Viewer.
      role_attribute_path = "contains(info.groups[*], 'Grafana Admins') && 'Admin' || contains(info.groups[*], 'Grafana Editors') && 'Editor' || 'Viewer'";
      role_attribute_strict = true;
    };
  };
};
# VictoriaMetrics stores the metrics and also performs the scraping itself,
# using the scrape configuration bound to `prometheusConfig` above.
services.victoriametrics = {
  enable = true;
  extraOptions = let
    # Store path of the scrape configuration rendered as YAML.
    scrapeFile = (pkgs.formats.yaml { }).generate "scrape.yml" prometheusConfig;
  in [
    "-promscrape.config=${scrapeFile}"
    "-selfScrapeInterval=10s"
    "-vmalert.proxyURL=http://localhost:8880"
  ];
};
# vmalert evaluates the alerting rules below against VictoriaMetrics and sends
# firing alerts to the Alertmanager configured under services.prometheus.
services.vmalert = let
  # Base URL of the local VictoriaMetrics instance. listenAddress is appended
  # directly after the host name, so it is presumably of the form ":<port>".
  # The remote read/write URLs previously hard-coded port 8428; deriving all
  # three from one value keeps them in agreement if the address changes.
  victoriaMetricsUrl = "http://localhost${builtins.toString config.services.victoriametrics.listenAddress}";
in {
  enable = true;
  settings = {
    "datasource.basicAuth.username" = "homelab";
    "datasource.basicAuth.passwordFile" = config.age.secrets.prometheus-password-secret.path;
    "datasource.url" = victoriaMetricsUrl;
    "notifier.url" = [ "http://localhost:${builtins.toString config.services.prometheus.alertmanager.port}" ];
    "remoteWrite.url" = victoriaMetricsUrl;
    "remoteRead.url" = victoriaMetricsUrl;
  };
  rules = {
    groups = [
      {
        name = "internet_connection";
        rules = [
          {
            alert = "LowInternetDownloadLast2Hours";
            expr = ''
              avg_over_time(speedtest_download_bits_per_second{}[2h]) <= 500000000
            '';
            for = "1h";
            labels = {
              severity = "low";
            };
            annotations = {
              summary = "Low Internet Speed in the Last 2 Hours";
              description = "Internet speed has been consistently below 500Mb/s over the last 2 hours.";
              dashboard = "o9mIe_Aik";
            };
          }
          {
            alert = "MetricNotScrapedLast2Hours";
            # absent_over_time() yields a single series with value 1 when the
            # range holds no samples, and nothing otherwise. The previous
            # comparison `> 7200` could therefore never match and the alert
            # never fired; the bare call is the firing condition.
            # NOTE(review): a 2h lookback plus `for = "2h"` means roughly four
            # hours without samples before the alert fires - confirm intended.
            expr = ''
              absent_over_time(speedtest_download_bits_per_second[2h])
            '';
            for = "2h";
            labels = {
              severity = "high";
            };
            annotations = {
              summary = "Metric Not Scraped in the Last 2 Hours";
              description = "The metric speedtest_download_bits_per_second has not been successfully scraped for over 2 hours.";
              dashboard = "o9mIe_Aik";
            };
          }
        ];
      }
      {
        name = "healthchecks";
        rules = [
          {
            alert = "HealthcheckFailedCheckin";
            expr = ''hc_check_up < 1'';
            for = "5m";
            labels.severity = "medium";
            annotations = {
              summary = "{{ $labels.name }} healthcheck failed";
              description = "The {{ $labels.name }} healthcheck failed to check in.";
              dashboard = "f594ea85-45f2-4019-b988-2d17638b5cf3";
            };
          }
        ];
      }
      {
        name = "kubernetes_node_exporter_memory_alerts";
        rules = [
          {
            alert = "HighKubernetesMemoryUsage";
            # (total - available) / total, as a percentage, per instance.
            expr = ''
              (sum by(instance) (node_memory_MemTotal_bytes{job="node-exporter"}) - sum by(instance) (node_memory_MemAvailable_bytes{job="node-exporter"})) / sum by(instance) (node_memory_MemTotal_bytes{job="node-exporter"}) * 100 > 80
            '';
            for = "5m";
            labels = {
              severity = "medium";
            };
            annotations = {
              summary = "High Memory Usage on Kubernetes Node {{ $labels.instance }}";
              description = "Memory usage is above 80% on instance {{ $labels.instance }}. Current value: {{ $value }}%";
            };
          }
        ];
      }
      {
        name = "kubernetes_node_exporter_cpu_alerts";
        rules = [
          {
            alert = "HighKubernetesCPUUsage";
            # 100 minus the average idle percentage across CPUs, per instance.
            expr = ''
              sum by(instance) (100 - avg by(instance) (irate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[5m])) * 100) > 80
            '';
            for = "5m";
            labels = {
              severity = "medium";
            };
            annotations = {
              summary = "High CPU Usage on Kubernetes Node {{ $labels.instance }}";
              description = "CPU usage is above 80% on instance {{ $labels.instance }}.";
            };
          }
        ];
      }
      {
        name = "kubernetes_pod_restart_loop";
        rules = [
          {
            alert = "KubernetesPodRestartLoop";
            expr = ''changes(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) > 3'';
            for = "15m";
            labels = {
              severity = "medium";
            };
            annotations = {
              summary = "Kubernetes Pod Restart Loop: {{ $labels.namespace }}/{{ $labels.pod }}";
              description = "Pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting frequently. Number of restarts: {{ $value }}.";
              dashboard = "k8s_views_pods";
            };
          }
        ];
      }
      {
        name = "postgresql_database_connectivity";
        rules = [
          {
            alert = "PostgreSQLDatabaseConnectivity";
            expr = ''pg_up == 0'';
            for = "5m";
            labels = {
              severity = "high";
            };
            annotations = {
              summary = "PostgreSQL Database Connectivity Issue";
              description = "PostgreSQL exporter cannot connect to database.";
              dashboard = "wGgaPlciz";
            };
          }
        ];
      }
      {
        name = "proxmox_status";
        rules = [
          {
            alert = "ProxmoxOffline";
            expr = ''up{job="proxmox"} == 0'';
            for = "1m";
            labels = {
              severity = "high";
            };
            annotations = {
              summary = "Proxmox offline";
              description = "Proxmox exporter scrapes are failing. Proxmox is likely offline.";
              dashboard = "Dp7Cd57Zza";
            };
          }
        ];
      }
    ];
  };
};
2023-10-08 22:55:20 +01:00
# Single-node Loki: in-memory rings, all data on the local filesystem.
services.loki = {
  enable = true;
  configuration = let
    dataDir = "/var/lib/loki";
    # Ring settings shared by the ingester lifecycler and the compactor.
    inMemoryRing = {
      kvstore.store = "inmemory";
    };
    # Index table naming and rotation, identical for both schema periods.
    dailyIndex = {
      prefix = "index_";
      period = "24h";
    };
    # Directory layout for an index shipper; `kind` is "boltdb" or "tsdb".
    shipperDirs = kind: {
      active_index_directory = "${dataDir}/${kind}-shipper-active";
      cache_location = "${dataDir}/${kind}-shipper-cache";
      cache_ttl = "24h";
    };
  in {
    auth_enabled = false;
    # Also read by the promtail client URL elsewhere in this file.
    server.http_listen_port = 3030;

    ingester = {
      lifecycler = {
        address = "127.0.0.1";
        ring = inMemoryRing // {
          replication_factor = 1;
        };
      };
      chunk_idle_period = "1h";
      max_chunk_age = "1h";
      chunk_target_size = 999999;
      chunk_retain_period = "30s";
    };

    # Two schema periods: boltdb-shipper/v11 from 2022-06-06, then tsdb/v13
    # from 2024-05-01.
    schema_config.configs = [
      {
        from = "2022-06-06";
        store = "boltdb-shipper";
        object_store = "filesystem";
        schema = "v11";
        index = dailyIndex;
      }
      {
        from = "2024-05-01";
        store = "tsdb";
        object_store = "filesystem";
        schema = "v13";
        index = dailyIndex;
      }
    ];

    storage_config = {
      boltdb_shipper = shipperDirs "boltdb";
      tsdb_shipper = shipperDirs "tsdb";
      filesystem.directory = "${dataDir}/chunks";
    };

    limits_config = {
      reject_old_samples = true;
      reject_old_samples_max_age = "168h";
    };

    table_manager = {
      retention_deletes_enabled = false;
      retention_period = "0s";
    };

    compactor = {
      working_directory = dataDir;
      compactor_ring = inMemoryRing;
    };
  };
};
# promtail ships this host's systemd journal to the local Loki instance.
services.promtail = {
  enable = true;
  configuration = let
    # Port taken from the Loki configuration so both sides stay in step.
    lokiPort = toString config.services.loki.configuration.server.http_listen_port;
  in {
    server = {
      http_listen_port = 3031;
      grpc_listen_port = 0;
    };
    # NOTE(review): the read position is kept under /tmp, and this file also
    # sets boot.tmp.cleanOnBoot, so it presumably does not survive a reboot -
    # confirm that this is acceptable.
    positions.filename = "/tmp/positions.yaml";
    clients = [
      {
        url = "http://127.0.0.1:${lokiPort}/loki/api/v1/push";
      }
    ];
    scrape_configs = [
      {
        job_name = "journal";
        journal = {
          max_age = "12h";
          labels = {
            job = "systemd-journal";
            host = "monitoring";
          };
        };
        # Copy the journal's systemd unit name into a `unit` label.
        relabel_configs = [
          {
            source_labels = ["__journal__systemd_unit"];
            target_label = "unit";
          }
        ];
      }
    ];
  };
};
# Bridge that turns Alertmanager webhooks into ntfy notifications.
# NOTE(review): the Alertmanager configuration in this file only defines a
# telegram receiver; nothing visible here sends alerts to 127.0.0.1:8111.
services.alertmanager-ntfy = {
  enable = true;
  settings.http.addr = "127.0.0.1:8111";
  settings.ntfy = {
    baseurl = "https://ntfy.gmem.ca";
    notification = {
      topic = "alerts";
      # Expression evaluated per alert: firing alerts are sent as "high",
      # anything else as "default".
      priority = ''
        status == "firing" ? "high" : "default"
      '';
      templates = {
        title = ''{{ if eq .Status "resolved" }}Resolved: {{ end }}{{ index .Annotations "summary" }}'';
        # Appends a Grafana dashboard link when the alert carries a
        # `dashboard` annotation.
        description = ''{{ index .Annotations "description" }}{{ if ne (index .Annotations "dashboard") "" }} | https://grafana.gmem.ca/d/{{ index .Annotations "dashboard" }}{{ end }}'';
      };
    };
  };
};
2023-09-05 17:08:43 +01:00
# Only the Alertmanager and two exporters of the Prometheus module are used
# here; scraping itself is configured under services.victoriametrics.
services.prometheus = {
  alertmanager = {
    enable = true;
    # The message template previously nested `{{ range . }}` inside each
    # per-alert loop without closing it (four `range` and two `if` actions
    # against four `end`), and read `.Name` / `.Severity`, which are not
    # fields of an alert. It now iterates once over the firing and once over
    # the resolved alerts and reads both values from the alert's labels.
    configText = ''
      global: {}

      # The directory from which notification templates are read.
      templates:
        - '/etc/alertmanager/template/*.tmpl'

      # The root route on which each incoming alert enters.
      route:
        group_by: ['alertname', 'cluster', 'service']
        receiver: telegram

      receivers:
        - name: telegram
          telegram_configs:
            - bot_token_file: ${config.age.secrets.telegram.path}
              chat_id: 682019253
              message: |
                {{ range .Alerts.Firing }}
                [ FIRING ]
                Alertname: {{ .Labels.alertname }}
                Severity: {{ .Labels.severity }}
                {{ .Annotations.description }}{{ if ne .Annotations.dashboard "" }}
                https://grafana.gmem.ca/d/{{ .Annotations.dashboard }}{{ end }}
                {{ end }}
                {{ range .Alerts.Resolved }}
                [ RESOLVED ]
                Alertname: {{ .Labels.alertname }}
                Severity: {{ .Labels.severity }}
                {{ .Annotations.description }}{{ if ne .Annotations.dashboard "" }}
                https://grafana.gmem.ca/d/{{ .Annotations.dashboard }}{{ end }}
                {{ end }}
    '';
  };
  # Node exporter, loopback only.
  exporters.node = {
    enable = true;
    listenAddress = "127.0.0.1";
    enabledCollectors = [
      "systemd"
      "processes"
    ];
  };
  # The blackbox exporter reads its module definitions from a file that is
  # managed outside of this configuration.
  exporters.blackbox = {
    enable = true;
    configFile = "/var/lib/blackbox/config.yml";
  };
};
services.tailscale.enable = true;
2023-09-13 23:01:51 +01:00
# Healthchecks instance; every credential is read from an agenix secret file.
services.healthchecks = {
  enable = true;
  # package = healthchecks-edge;
  settings = let
    # Path of the decrypted agenix secret with the given name.
    secretPath = name: config.age.secrets.${name}.path;
  in {
    SITE_NAME = "Archs Healthchecks";
    SITE_ROOT = "https://healthchecks.gmem.ca";
    SECRET_KEY_FILE = secretPath "healthchecks-secret";

    # Outgoing mail.
    EMAIL_HOST = "smtp.fastmail.com";
    EMAIL_HOST_USER = "g@gmem.ca";
    EMAIL_HOST_PASSWORD_FILE = secretPath "healthchecks-smtp";
    DEFAULT_FROM_EMAIL = "healthchecks@gmem.ca";

    # Telegram integration.
    TELEGRAM_BOT_NAME = "arch_healthchecks_bot";
    TELEGRAM_TOKEN_FILE = secretPath "healthchecks-telegram";
  };
};
2024-02-05 13:13:44 +00:00
# Uptime Kuma; the nginx virtual hosts in this file read PORT back from here.
services.uptime-kuma = {
  enable = true;
  settings.PORT = "4000";
};
2023-09-05 17:08:43 +01:00
# nginx reverse proxy
# TLS-terminating reverse proxy in front of the loopback services.
services.nginx = let
  # URL of a service listening on loopback at the given port.
  loopback = port: "http://127.0.0.1:${toString port}";

  # Virtual host with an ACME certificate and forced HTTPS whose root location
  # proxies to `upstream` with websocket support. `locationExtras` is merged
  # into the root location.
  proxiedHost = upstream: locationExtras: {
    enableACME = true;
    forceSSL = true;
    locations."/" =
      {
        proxyPass = upstream;
        proxyWebsockets = true;
      }
      // locationExtras;
  };

  grafanaServer = config.services.grafana.settings.server;
  uptimeKuma = loopback config.services.uptime-kuma.settings.PORT;
in {
  enable = true;
  recommendedGzipSettings = true;
  recommendedBrotliSettings = true;
  recommendedZstdSettings = true;
  recommendedOptimisation = true;
  recommendedTlsSettings = true;
  recommendedProxySettings = true;

  virtualHosts = {
    # Grafana is also the default virtual host.
    ${grafanaServer.domain} =
      (proxiedHost (loopback grafanaServer.http_port) { })
      // {
        default = true;
      };

    # VictoriaMetrics behind HTTP basic auth, with no request body size limit.
    # listenAddress is appended without a separator, as in the vmalert
    # datasource URL.
    "metrics.gmem.ca" = proxiedHost "http://127.0.0.1${toString config.services.victoriametrics.listenAddress}" {
      extraConfig = ''
        client_max_body_size 0;
      '';
      basicAuthFile = "/var/lib/htpw";
    };

    "healthchecks.gmem.ca" = proxiedHost (loopback 8000) { };

    # Both names are served by the same Uptime Kuma instance.
    "uptime.gmem.ca" = proxiedHost uptimeKuma { };
    "status.floofy.tech" = proxiedHost uptimeKuma { };
  };
};
# Let's Encrypt account used by the nginx virtual hosts with enableACME.
security.acme.acceptTerms = true;
security.acme.defaults.email = "acme@gmem.ca";

# Command-line tools available on the host.
environment.systemPackages = [
  pkgs.victoriametrics
  pkgs.prometheus
  pkgs.htop
];

system.stateVersion = "23.11";
}