649 lines
19 KiB
Nix
649 lines
19 KiB
Nix
{
|
|
config,
|
|
pkgs,
|
|
...
|
|
}:
|
|
let
|
|
# Prometheus-style scrape configuration. It is rendered to YAML and handed to
# VictoriaMetrics through `-promscrape.config` further down in this file.
prometheusConfig = let
  # Builds a blackbox-exporter probe job.
  #   name     - scrape job name
  #   location - value of the static `location` label attached to every target
  #   exporter - host:port of the blackbox exporter that performs the probes
  #   targets  - endpoints to probe
  # The relabel rules move the listed target into the `target` query parameter
  # and the `instance` label, then point the actual scrape at the exporter.
  mkBlackboxJob = { name, location, exporter, targets }: {
    job_name = name;
    metrics_path = "/probe";
    # blackbox_exporter selects the prober via the `module` query parameter
    # (singular); `modules` is ignored and only worked through the exporter's
    # built-in default.
    params.module = [ "http_2xx" ];
    static_configs = [
      {
        labels = { inherit location; };
        inherit targets;
      }
    ];
    relabel_configs = [
      {
        source_labels = [ "__address__" ];
        target_label = "__param_target";
      }
      {
        source_labels = [ "__param_target" ];
        target_label = "instance";
      }
      {
        source_labels = [ ];
        target_label = "__address__";
        replacement = exporter;
      }
    ];
  };
in {
  global.scrape_interval = "15s";
  scrape_configs = [
    {
      job_name = "proxmox";
      metrics_path = "/pve";
      params.target = [ "localhost" ];
      static_configs = [ { targets = [ "proxmox:9221" ]; } ];
    }
    {
      job_name = "personal_hardware";
      static_configs = [
        {
          targets = [
            "london:9100"
            "vancouver:9100"
            "localhost:9100"
            "proxmox.scorpion-ghost.ts.net:9100"
          ];
        }
      ];
    }
    {
      job_name = "speedtest-exporter";
      scrape_interval = "1h";
      scrape_timeout = "1m";
      static_configs = [ { targets = [ "vancouver:9798" ]; } ];
    }
    {
      job_name = "syncthing";
      static_configs = [ { targets = [ "vancouver:8384" ]; } ];
    }
    {
      job_name = "forgejo";
      static_configs = [ { targets = [ "git.gmem.ca" ]; } ];
    }
    {
      job_name = "healthchecks";
      scrape_interval = "60s";
      # NOTE(review): the last path segment looks like an API key and ends up
      # in the world-readable Nix store — confirm this is acceptable.
      metrics_path = "/projects/5f1de50f-a52d-4215-961f-aae7cc6cf6c9/metrics/TbMoU7SUdknzMe-H5Q4HzmKl3itOIrJk";
      static_configs = [ { targets = [ "localhost:8000" ]; } ];
    }
    {
      job_name = "dnsmasq";
      scrape_interval = "10s";
      static_configs = [ { targets = [ "100.102.19.124:9153" "100.92.113.87:9153" ]; } ];
    }
    (mkBlackboxJob {
      name = "blackbox_home";
      location = "home";
      exporter = "vancouver:9115";
      targets = [ "floofy.tech" "one.one.one.one" "1.1.1.1" "[2606:4700:4700::1111]" "waterwolf.club" ];
    })
    (mkBlackboxJob {
      name = "blackbox_hetzner";
      location = "hetzner";
      exporter = "127.0.0.1:9115";
      targets = [ "floofy.tech" "one.one.one.one" "[2606:4700:4700::1111]" "waterwolf.club" ];
    })
    {
      job_name = "haproxy";
      scrape_interval = "10s";
      static_configs = [ { targets = [ "100.87.208.14:8404" ]; } ];
    }
    {
      job_name = "tclip";
      scrape_interval = "15s";
      static_configs = [ { targets = [ "paste" ]; } ];
    }
  ];
};
|
|
in
|
|
{
|
|
imports = [
|
|
./hardware.nix
|
|
./networking.nix # generated at runtime by nixos-infect
|
|
];
|
|
|
|
# agenix-managed secrets. Each entry names the encrypted source file and the
# account that owns the decrypted copy; entries without `mode` use the agenix
# default.
age.secrets = {
  healthchecks-secret = {
    file = ../../secrets/monitoring-healthchecks-secret.age;
    owner = "healthchecks";
  };

  healthchecks-smtp = {
    file = ../../secrets/fastmail-smtp.age;
    owner = "healthchecks";
  };

  healthchecks-telegram = {
    file = ../../secrets/healthchecks-telegram.age;
    owner = "healthchecks";
  };

  # The three secrets below set an explicit mode. Nothing needs to write to or
  # execute a decrypted secret, so they are read-only (0444) instead of
  # group/world-writable and executable.
  # NOTE(review): they remain world-readable, presumably because the consuming
  # units do not run as `owner`; tighten to 0400 if they do.
  prometheus-webconfig-secret = {
    file = ../../secrets/monitoring-prometheus-webconfig.age;
    owner = "prometheus";
    mode = "0444";
  };

  prometheus-password-secret = {
    file = ../../secrets/monitoring-prometheus-password.age;
    owner = "vmalert";
    mode = "0444";
  };

  grafana-client-secret = {
    file = ../../secrets/monitoring-grafana-client-secret.age;
    owner = "grafana";
  };

  telegram = {
    file = ../../secrets/monitoring-telegram.age;
    owner = "prometheus";
    mode = "0444";
  };
};
|
|
|
|
# Host-level basics: store optimisation, temp cleanup, swap, identity and SSH.
nix.settings.auto-optimise-store = true;
boot.tmp.cleanOnBoot = true;
zramSwap.enable = true;

networking = {
  hostName = "monitoring";
  domain = "";
  # NOTE(review): the host firewall is switched off entirely — confirm that
  # exposure is controlled elsewhere.
  firewall.enable = false;
};

# Root is reachable over SSH with the single key listed here.
services.openssh.enable = true;
users.users.root.openssh.authorizedKeys.keys = [
  ''ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBDjEgtIWPA5Ncs/KOcMeT6Q/HACJJetDOLjMvXXwUE+08oTX1EpHrWPpy8J+UHKIyErCNPYq8dgtrbhnMRlxHqI=''
];
|
|
|
|
# Grafana, bound to loopback (nginx proxies it below) and logging in through
# the generic OAuth provider configured here.
services.grafana = {
  enable = true;

  settings = {
    feature_toggles.publicDashboards = true;
    log.filters = "oauth.generic_oauth:debug";

    server = {
      domain = "grafana.gmem.ca";
      root_url = "https://grafana.gmem.ca";
      http_addr = "127.0.0.1";
      http_port = 2342;
    };

    auth = {
      oauth_auto_login = true;
      signout_redirect_url = "https://authentik.gmem.ca/application/o/grafana/end-session/";
    };

    "auth.generic_oauth" = {
      enabled = true;
      name = "authentik";
      client_id = "VbOQzwuf0UK9AUGrWvaVaWWHvX2fJsZChxJNGt61";
      # `$__file{...}` makes Grafana read the secret from the agenix path at
      # runtime rather than embedding it in the generated config.
      client_secret = "$__file{${config.age.secrets.grafana-client-secret.path}}";
      scopes = "openid email grafana-user";
      auth_url = "https://authentik.gmem.ca/application/o/authorize/";
      token_url = "https://authentik.gmem.ca/application/o/token/";
      api_url = "https://authentik.gmem.ca/application/o/userinfo/";
      # Group membership decides the Grafana role; everyone else is a Viewer.
      role_attribute_path = "contains(info.groups[*], 'Grafana Admins') && 'Admin' || contains(info.groups[*], 'Grafana Editors') && 'Editor' || 'Viewer'";
      role_attribute_strict = true;
    };
  };
};
|
|
# VictoriaMetrics single-node instance; it scrapes the targets declared in
# `prometheusConfig` itself.
services.victoriametrics = let
  # `prometheusConfig` serialised to a YAML file in the Nix store.
  scrapeConfigFile = (pkgs.formats.yaml { }).generate "scrape.yml" prometheusConfig;
in {
  enable = true;
  extraOptions = [
    "-promscrape.config=${scrapeConfigFile}"
    "-selfScrapeInterval=10s"
    "-vmalert.proxyURL=http://localhost:8880"
  ];
};
|
|
# vmalert evaluates the alerting rules below against VictoriaMetrics and
# forwards firing alerts to the local Alertmanager.
services.vmalert = {
  enable = true;

  settings = {
    "datasource.basicAuth.username" = "homelab";
    "datasource.basicAuth.passwordFile" = config.age.secrets.prometheus-password-secret.path;
    # `listenAddress` is appended directly after the host, so it is expected
    # to have the ":port" form.
    "datasource.url" = "http://localhost${builtins.toString config.services.victoriametrics.listenAddress}";
    "notifier.url" = [ "http://localhost:${builtins.toString config.services.prometheus.alertmanager.port}" ];
    "remoteWrite.url" = "http://localhost:8428";
    "remoteRead.url" = "http://localhost:8428";
  };

  # The `dashboard` annotation is the Grafana dashboard id that the
  # notification templates turn into a https://grafana.gmem.ca/d/<id> link.
  rules = {
    groups = [
      {
        name = "internet_connection";
        rules = [
          {
            alert = "LowInternetDownloadLast2Hours";
            expr = ''
              avg_over_time(speedtest_download_bits_per_second{}[2h]) <= 500000000
            '';
            for = "1h";
            labels = {
              severity = "low";
            };
            annotations = {
              summary = "Low Internet Speed in the Last 2 Hours";
              description = "Internet speed has been consistently below 500Mb/s over the last 2 hours.";
              dashboard = "o9mIe_Aik";
            };
          }
          {
            alert = "MetricNotScrapedLast2Hours";
            # absent_over_time() yields 1 when the series had no samples in the
            # window and nothing otherwise, so comparing it with `> 7200` could
            # never match. The bare expression is the firing condition.
            expr = ''
              absent_over_time(speedtest_download_bits_per_second[2h])
            '';
            for = "2h";
            labels = {
              severity = "high";
            };
            annotations = {
              summary = "Metric Not Scraped in the Last 2 Hours";
              description = "The metric speedtest_download_bits_per_second has not been successfully scraped for over 2 hours.";
              dashboard = "o9mIe_Aik";
            };
          }
        ];
      }
      {
        name = "healthchecks";
        rules = [
          {
            alert = "HealthcheckFailedCheckin";
            expr = ''hc_check_up < 1'';
            for = "5m";
            labels.severity = "medium";
            annotations = {
              summary = "{{ $labels.name }} healthcheck failed";
              description = "The {{ $labels.name }} healthcheck failed to check in.";
              dashboard = "f594ea85-45f2-4019-b988-2d17638b5cf3";
            };
          }
        ];
      }
      {
        name = "kubernetes_node_exporter_memory_alerts";
        rules = [
          {
            alert = "HighKubernetesMemoryUsage";
            expr = ''
              (sum by(instance) (node_memory_MemTotal_bytes{job="node-exporter"}) - sum by(instance) (node_memory_MemAvailable_bytes{job="node-exporter"})) / sum by(instance) (node_memory_MemTotal_bytes{job="node-exporter"}) * 100 > 80
            '';
            for = "5m";
            labels = {
              severity = "medium";
            };
            annotations = {
              summary = "High Memory Usage on Kubernetes Node {{ $labels.instance }}";
              description = "Memory usage is above 80% on instance {{ $labels.instance }}. Current value: {{ $value }}%";
            };
          }
        ];
      }
      {
        name = "kubernetes_node_exporter_cpu_alerts";
        rules = [
          {
            alert = "HighKubernetesCPUUsage";
            expr = ''
              sum by(instance) (100 - avg by(instance) (irate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[5m])) * 100) > 80
            '';
            for = "5m";
            labels = {
              severity = "medium";
            };
            annotations = {
              summary = "High CPU Usage on Kubernetes Node {{ $labels.instance }}";
              description = "CPU usage is above 80% on instance {{ $labels.instance }}.";
            };
          }
        ];
      }
      {
        name = "kubernetes_pod_restart_loop";
        rules = [
          {
            alert = "KubernetesPodRestartLoop";
            expr = ''changes(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) > 3'';
            for = "15m";
            labels = {
              severity = "medium";
            };
            annotations = {
              summary = "Kubernetes Pod Restart Loop: {{ $labels.namespace }}/{{ $labels.pod }}";
              description = "Pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting frequently. Number of restarts: {{ $value }}.";
              dashboard = "k8s_views_pods";
            };
          }
        ];
      }
      {
        name = "postgresql_database_connectivity";
        rules = [
          {
            alert = "PostgreSQLDatabaseConnectivity";
            expr = ''pg_up == 0'';
            for = "5m";
            labels = {
              severity = "high";
            };
            annotations = {
              summary = "PostgreSQL Database Connectivity Issue";
              description = "PostgreSQL exporter cannot connect to database.";
              dashboard = "wGgaPlciz";
            };
          }
        ];
      }
      {
        name = "proxmox_status";
        rules = [
          {
            alert = "ProxmoxOffline";
            expr = ''up{job="proxmox"} == 0'';
            for = "1m";
            labels = {
              severity = "high";
            };
            annotations = {
              summary = "Proxmox offline";
              description = "Proxmox exporter scrapes are failing. Proxmox is likely offline.";
              dashboard = "Dp7Cd57Zza";
            };
          }
        ];
      }
    ];
  };
};
|
|
|
|
# Single-process Loki with in-memory rings and all data under /var/lib/loki.
services.loki = let
  # Both schema periods use the same daily index layout.
  dailyIndex = {
    prefix = "index_";
    period = "24h";
  };
in {
  enable = true;

  configuration = {
    auth_enabled = false;
    server.http_listen_port = 3030;

    ingester = {
      chunk_idle_period = "1h";
      max_chunk_age = "1h";
      chunk_target_size = 999999;
      chunk_retain_period = "30s";
      lifecycler = {
        address = "127.0.0.1";
        ring = {
          replication_factor = 1;
          kvstore.store = "inmemory";
        };
      };
    };

    # Two schema periods: boltdb-shipper/v11 from 2022-06-06, then tsdb/v13
    # from 2024-05-01.
    schema_config.configs = [
      {
        from = "2022-06-06";
        store = "boltdb-shipper";
        object_store = "filesystem";
        schema = "v11";
        index = dailyIndex;
      }
      {
        from = "2024-05-01";
        store = "tsdb";
        object_store = "filesystem";
        schema = "v13";
        index = dailyIndex;
      }
    ];

    storage_config = {
      filesystem.directory = "/var/lib/loki/chunks";
      boltdb_shipper = {
        active_index_directory = "/var/lib/loki/boltdb-shipper-active";
        cache_location = "/var/lib/loki/boltdb-shipper-cache";
        cache_ttl = "24h";
      };
      tsdb_shipper = {
        active_index_directory = "/var/lib/loki/tsdb-shipper-active";
        cache_location = "/var/lib/loki/tsdb-shipper-cache";
        cache_ttl = "24h";
      };
    };

    limits_config = {
      reject_old_samples = true;
      reject_old_samples_max_age = "168h";
    };

    table_manager = {
      retention_deletes_enabled = false;
      retention_period = "0s";
    };

    compactor = {
      working_directory = "/var/lib/loki";
      compactor_ring.kvstore.store = "inmemory";
    };
  };
};
|
|
# Promtail ships this host's systemd journal to the local Loki instance.
services.promtail = {
  enable = true;

  configuration = {
    server = {
      http_listen_port = 3031;
      grpc_listen_port = 0;
    };

    # NOTE(review): read positions live under /tmp while `boot.tmp.cleanOnBoot`
    # is enabled, so they presumably do not survive a reboot — confirm intended.
    positions.filename = "/tmp/positions.yaml";

    # Push endpoint derived from the Loki port configured in this file.
    clients = [
      { url = "http://127.0.0.1:${toString config.services.loki.configuration.server.http_listen_port}/loki/api/v1/push"; }
    ];

    scrape_configs = [
      {
        job_name = "journal";
        journal = {
          max_age = "12h";
          labels = {
            job = "systemd-journal";
            host = "monitoring";
          };
        };
        # Copy the journal's systemd unit field into a `unit` label.
        relabel_configs = [
          {
            source_labels = [ "__journal__systemd_unit" ];
            target_label = "unit";
          }
        ];
      }
    ];
  };
};
|
|
# Alertmanager -> ntfy bridge listening on loopback.
# NOTE(review): the Alertmanager config in this file routes only to the
# telegram receiver; no receiver pointing at this bridge is visible here.
services.alertmanager-ntfy = {
  enable = true;

  settings = {
    http.addr = "127.0.0.1:8111";

    ntfy = {
      baseurl = "https://ntfy.gmem.ca";

      notification = {
        topic = "alerts";
        # Expression choosing the ntfy priority: "high" while firing.
        priority = ''
          status == "firing" ? "high" : "default"
        '';
        templates = {
          # Prefix resolved alerts; the body appends a Grafana link when the
          # alert carries a `dashboard` annotation.
          title = ''{{ if eq .Status "resolved" }}Resolved: {{ end }}{{ index .Annotations "summary" }}'';
          description = ''{{ index .Annotations "description" }}{{ if ne (index .Annotations "dashboard") "" }} | https://grafana.gmem.ca/d/{{ index .Annotations "dashboard" }}{{ end }}'';
        };
      };
    };
  };
};
|
|
# Alertmanager plus the node and blackbox exporters from the Prometheus
# module namespace.
services.prometheus = {
  alertmanager = {
    enable = true;
    # Every alert is routed to a single Telegram receiver. The message template
    # iterates the firing and the resolved alerts once each; every `range` is
    # closed by its own `end`, and the alert name and severity are read from
    # the alert's labels (an alert exposes `.Labels` and `.Annotations`, not
    # `.Name` / `.Severity`).
    configText = ''
      global: {}

      # The directory from which notification templates are read.
      templates:
        - '/etc/alertmanager/template/*.tmpl'

      # The root route on which each incoming alert enters.
      route:
        group_by: ['alertname', 'cluster', 'service']
        receiver: telegram

      receivers:
        - name: telegram
          telegram_configs:
            - bot_token_file: ${config.age.secrets.telegram.path}
              chat_id: 682019253
              message: |
                {{ range .Alerts.Firing }}
                [ FIRING ]
                Alertname: {{ .Labels.alertname }}
                Severity: {{ .Labels.severity }}
                {{ .Annotations.description }}{{ if ne .Annotations.dashboard "" }}
                https://grafana.gmem.ca/d/{{ .Annotations.dashboard }}{{ end }}
                {{ end }}
                {{ range .Alerts.Resolved }}
                [ RESOLVED ]
                Alertname: {{ .Labels.alertname }}
                Severity: {{ .Labels.severity }}
                {{ .Annotations.description }}{{ if ne .Annotations.dashboard "" }}
                https://grafana.gmem.ca/d/{{ .Annotations.dashboard }}{{ end }}
                {{ end }}
    '';
  };

  # Node exporter on loopback only, with two extra collectors enabled.
  exporters.node = {
    enable = true;
    listenAddress = "127.0.0.1";
    enabledCollectors = [
      "systemd"
      "processes"
    ];
  };

  # The blackbox exporter reads its module definitions from a file that is
  # not managed by this configuration.
  exporters.blackbox = {
    enable = true;
    configFile = "/var/lib/blackbox/config.yml";
  };
};
|
|
services.tailscale.enable = true;

# Healthchecks instance; every credential is read from an agenix secret file.
services.healthchecks = {
  enable = true;
  # package = healthchecks-edge;
  settings = {
    SITE_NAME = "Archs Healthchecks";
    SITE_ROOT = "https://healthchecks.gmem.ca";
    SECRET_KEY_FILE = config.age.secrets.healthchecks-secret.path;

    # Outgoing mail.
    DEFAULT_FROM_EMAIL = "healthchecks@gmem.ca";
    EMAIL_HOST = "smtp.fastmail.com";
    EMAIL_HOST_USER = "g@gmem.ca";
    EMAIL_HOST_PASSWORD_FILE = config.age.secrets.healthchecks-smtp.path;

    # Telegram integration.
    TELEGRAM_BOT_NAME = "arch_healthchecks_bot";
    TELEGRAM_TOKEN_FILE = config.age.secrets.healthchecks-telegram.path;
  };
};
|
|
|
|
# Uptime Kuma; the nginx vhosts below proxy to this PORT.
services.uptime-kuma = {
  enable = true;
  settings.PORT = "4000";
};
|
|
|
|
# nginx reverse proxy
services.nginx = let
  # Virtual host with an ACME certificate and forced HTTPS whose "/" location
  # is the given attribute set.
  tlsVhost = rootLocation: {
    enableACME = true;
    forceSSL = true;
    locations."/" = rootLocation;
  };

  # Location that proxies to `upstream` with websocket support.
  websocketProxy = upstream: {
    proxyPass = upstream;
    proxyWebsockets = true;
  };

  grafanaUpstream = "http://127.0.0.1:${toString config.services.grafana.settings.server.http_port}";
  victoriaMetricsUpstream = "http://127.0.0.1${toString config.services.victoriametrics.listenAddress}";
  uptimeKumaUpstream = "http://127.0.0.1:${toString config.services.uptime-kuma.settings.PORT}";
in {
  enable = true;
  recommendedGzipSettings = true;
  recommendedBrotliSettings = true;
  recommendedZstdSettings = true;
  recommendedOptimisation = true;
  recommendedTlsSettings = true;
  recommendedProxySettings = true;

  virtualHosts = {
    # Grafana is also the default server.
    ${config.services.grafana.settings.server.domain} =
      tlsVhost (websocketProxy grafanaUpstream) // { default = true; };

    # VictoriaMetrics behind HTTP basic auth, with the request body size limit
    # disabled.
    "metrics.gmem.ca" = tlsVhost (websocketProxy victoriaMetricsUpstream // {
      extraConfig = ''
        client_max_body_size 0;
      '';
      basicAuthFile = "/var/lib/htpw";
    });

    "healthchecks.gmem.ca" = tlsVhost (websocketProxy "http://127.0.0.1:8000");

    # Both names serve the same Uptime Kuma instance.
    "uptime.gmem.ca" = tlsVhost (websocketProxy uptimeKumaUpstream);
    "status.floofy.tech" = tlsVhost (websocketProxy uptimeKumaUpstream);
  };
};
|
|
# ACME account used for the certificates requested by the nginx vhosts.
security.acme = {
  acceptTerms = true;
  defaults.email = "acme@gmem.ca";
};

# Tools available on the host's PATH.
environment.systemPackages = [
  pkgs.victoriametrics
  pkgs.prometheus
  pkgs.htop
];

system.stateVersion = "23.11";
|
|
}
|