diff --git a/cmd/server/main.go b/cmd/server/main.go index b79d201..45f8f15 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -20,6 +20,7 @@ import ( "gitea.dcglab.co.uk/steve/restic-manager/internal/server/fleetupdate" rmhttp "gitea.dcglab.co.uk/steve/restic-manager/internal/server/http" "gitea.dcglab.co.uk/steve/restic-manager/internal/server/maintenance" + "gitea.dcglab.co.uk/steve/restic-manager/internal/server/metrics" "gitea.dcglab.co.uk/steve/restic-manager/internal/server/oidc" "gitea.dcglab.co.uk/steve/restic-manager/internal/server/ui" "gitea.dcglab.co.uk/steve/restic-manager/internal/server/ws" @@ -89,6 +90,7 @@ func run() error { hub := ws.NewHub() jobHub := ws.NewJobHub() + metricsRegistry := metrics.NewRegistry() notifHub := notification.NewHub(st, aead, cfg.BaseURL) alertEngine := alert.NewEngine(st, notifHub) @@ -122,6 +124,7 @@ func run() error { UI: renderer, Version: version, OIDC: oidcClient, + Metrics: metricsRegistry, } // First-run bootstrap: if the users table is empty, mint a one-time diff --git a/deploy/grafana/restic-manager-dashboard.json b/deploy/grafana/restic-manager-dashboard.json new file mode 100644 index 0000000..7f5d690 --- /dev/null +++ b/deploy/grafana/restic-manager-dashboard.json @@ -0,0 +1,325 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { "type": "grafana", "uid": "-- Grafana --" }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "restic-manager fleet overview. Imports against any Prometheus data source.", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "id": 1, + "title": "Fleet status", + "type": "stat", + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "gridPos": { "h": 6, "w": 6, "x": 0, "y": 0 }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "rm_hosts_online", + "legendFormat": "online", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "rm_hosts_total", + "legendFormat": "total", + "refId": "B" + } + ] + }, + { + "id": 2, + "title": "Open alerts", + "type": "stat", + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "gridPos": { "h": 6, "w": 6, "x": 6, "y": 0 }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 5 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "none", + "orientation": "horizontal", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "sum by (severity) (rm_active_alerts)", + "legendFormat": "{{severity}}", + "refId": "A" + } + ] + }, + { + "id": 3, + "title": "Backups failing (last reported run)", + "type": "stat", + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "gridPos": { "h": 6, "w": 6, "x": 12, "y": 0 }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "count(rm_host_last_backup_success == 0)", + "legendFormat": "failing", + "refId": "A" + } + ] + }, + { + "id": 4, + "title": "Hosts", + "type": "table", + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "gridPos": { "h": 10, "w": 24, "x": 0, "y": 6 }, + "fieldConfig": { + "defaults": { + "custom": { "align": "auto", "displayMode": "auto" } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Value #B" }, + "properties": [ + { "id": "displayName", "value": "Last backup (s ago)" }, + { "id": "unit", "value": "s" } + ] + }, + { + "matcher": { "id": "byName", "options": "Value #C" }, + "properties": [ + { "id": "displayName", "value": "Repo size" }, + { "id": "unit", "value": "bytes" } + ] + }, + { + "matcher": { "id": "byName", "options": "Value #D" }, + "properties": [ + { "id": "displayName", "value": "Snapshots" } + ] + }, + { + "matcher": { "id": "byName", "options": "Value #A" }, + "properties": [ + { "id": "displayName", "value": "Online" } + ] + }, + { + "matcher": { "id": "byName", "options": "Value #E" }, + "properties": [ + { "id": "displayName", "value": "Open alerts" } + ] + } + ] + }, + "options": { "showHeader": true }, + "transformations": [ + { + "id": "merge", + "options": {} + } + ], + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "rm_host_agent_online", + "format": "table", + "instant": true, + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "time() - rm_host_last_backup_timestamp_seconds", + "format": "table", + "instant": true, + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "rm_host_repo_size_bytes", + "format": "table", + "instant": true, + "refId": "C" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "rm_host_snapshot_count", + "format": "table", + "instant": true, + "refId": "D" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "rm_host_open_alerts", + "format": "table", + "instant": true, + "refId": "E" + } + ] + }, + { + "id": 5, + "title": "Repo size over time", + "type": "timeseries", + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisLabel": "", + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never" + }, + "unit": "bytes" + }, + "overrides": [] + }, + "options": { + "legend": { "calcs": ["last"], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "rm_host_repo_size_bytes", + "legendFormat": "{{host}}", + "refId": "A" + } + ] + }, + { + "id": 6, + "title": "Job duration p95 (last 1h, by kind)", + "type": "timeseries", + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "drawStyle": "line", + "fillOpacity": 5, + "lineWidth": 1, + "pointSize": 4, + "showPoints": "never" + }, + "unit": "s" + }, + "overrides": [] + }, + "options": { + "legend": { "calcs": ["last"], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "expr": "histogram_quantile(0.95, sum by (kind, le) (rate(rm_job_duration_seconds_bucket[1h])))", + "legendFormat": "{{kind}}", + "refId": "A" + } + ] + } + ], + "refresh": "30s", + "schemaVersion": 39, + "style": "dark", + "tags": ["restic-manager", "backups"], + "templating": { + "list": [ + { + "current": {}, + "hide": 0, + "includeAll": false, + "label": "Prometheus", + "multi": false, + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + } + ] + }, + "time": { "from": "now-6h", "to": "now" }, + "timepicker": {}, + "timezone": "", + "title": "restic-manager — fleet", + "uid": "rm-fleet-overview", + "version": 1, + "weekStart": "" +} diff --git a/docs/prometheus.md b/docs/prometheus.md new file mode 100644 index 0000000..ebd83e1 --- /dev/null +++ b/docs/prometheus.md @@ -0,0 +1,139 @@ +# Prometheus + Grafana + +restic-manager exposes a Prometheus scrape endpoint at `GET /metrics`. +The endpoint is **opt-in** — it is not mounted at all unless you set +at least one of the auth gates below. Once enabled, it serves the +standard `text/plain` exposition format that every Prometheus +release since 2.x parses without configuration. + +A sample Grafana dashboard lives at +`deploy/grafana/restic-manager-dashboard.json`. + +## Enable the endpoint + +Two switches, both off by default. If both are set, both must pass +(token AND source-IP); if only one is set, that gate alone +authorises a scrape. + +| Env var | YAML key | Effect | +|----------------------------|------------------------|--------| +| `RM_METRICS_TOKEN` | `metrics_token` | Requires `Authorization: Bearer `. Compared in constant time. | +| `RM_METRICS_TRUSTED_CIDR` | `metrics_trusted_cidrs` (list) | Restricts the source IP to one of the listed CIDRs. Comma-separated in env, list in YAML. Honours `X-Forwarded-For` only when the immediate hop matches `RM_TRUSTED_PROXY`. | + +When neither is set, `GET /metrics` returns 404 — the route is not +registered with the chi router so a forgotten config can't +accidentally publish fleet state. + +### Example: Docker + +```yaml +services: + restic-manager: + image: gitea.dcglab.co.uk/steve/restic-manager:latest + environment: + RM_METRICS_TOKEN_FILE: /run/secrets/rm_metrics_token + RM_METRICS_TRUSTED_CIDR: "10.0.0.0/8" + secrets: + - rm_metrics_token +``` + +(`RM_METRICS_TOKEN_FILE` is not currently supported — set +`RM_METRICS_TOKEN` directly. The `_FILE` convention is on the +roadmap.) + +## Prometheus scrape config + +Drop into your `prometheus.yml`: + +```yaml +scrape_configs: + - job_name: restic-manager + metrics_path: /metrics + scheme: https # via your reverse proxy + static_configs: + - targets: ['restic.example.com'] + authorization: + type: Bearer + credentials_file: /etc/prometheus/secrets/rm_metrics_token +``` + +If you don't run a TLS-terminating proxy in front, drop `scheme: +https` (the server is HTTP-only — see `docs/reverse-proxy.md`). + +## Metric reference + +All names are `rm_`-prefixed. Per-host metrics carry a `host_id` +label (the stable ULID, immune to renames) and a `host` label +(the human-readable name). + +### Server gauges + +| Name | Labels | Description | +|-----------------------|------------------------------------|-------------| +| `rm_hosts_total` | — | Total number of enrolled hosts (excludes pending announces). | +| `rm_hosts_online` | — | Number of hosts with `status='online'`. | +| `rm_active_alerts` | `severity` ∈ {info, warning, critical} | Open alerts by severity. | +| `rm_build_info` | `version, commit, go_version` | Always 1; pure label-bag for joining. | + +### Per-host gauges + +| Name | Description | +|--------------------------------------------|-------------| +| `rm_host_agent_online` | 1 if the agent is currently online, 0 otherwise. | +| `rm_host_last_backup_timestamp_seconds` | Unix timestamp of the host's most recent backup. **Omitted** for hosts with no backup yet. | +| `rm_host_last_backup_success` | 1 if the most recent backup succeeded, 0 otherwise. **Omitted** for hosts with no backup yet. | +| `rm_host_repo_size_bytes` | Latest reported repo size from `restic stats --mode raw-data`. **Omitted** when unknown. | +| `rm_host_snapshot_count` | Number of restic snapshots known on the host's repo. | +| `rm_host_open_alerts` | Number of currently open alerts attached to this host. | +| `rm_host_repo_status` | Always 1; the `status` label carries `unknown` / `ready` / `init_failed`. | + +### Job duration histogram + +``` +rm_job_duration_seconds_bucket{kind, status, le} +rm_job_duration_seconds_sum{kind, status} +rm_job_duration_seconds_count{kind, status} +``` + +`kind` ∈ {backup, forget, prune, check, unlock, restore, diff, init, update}. +`status` ∈ {succeeded, failed, cancelled}. + +Buckets (seconds): + +``` +1, 5, 30, 60, 300, 1800, 3600, 21600, 86400, +Inf +1s 5s 30s 1m 5m 30m 1h 6h 24h +``` + +The histogram is in-memory only — values reset on process restart. +Operators who want durable history should let Prometheus persist +the scrapes; restic-manager itself is a control plane, not a +metrics database. + +## Grafana dashboard + +Import `deploy/grafana/restic-manager-dashboard.json`: + +1. In Grafana, **+ → Import → Upload JSON file**. +2. Pick the Prometheus data source you scrape with. +3. The dashboard's six panels populate from the metrics above: + * **Fleet status** — online/total stat panel. + * **Open alerts** — by severity. + * **Hosts** — per-host table (last backup, repo size, snapshots, alerts). + * **Repo size over time** — one line per host. + * **Backups failing** — count of hosts whose last backup didn't succeed. + * **Job duration p95** — `histogram_quantile(0.95, …)` over a 1h window per kind. + +Alerting is intentionally not configured in the dashboard — the +control plane already has alerts (P3-05) with native channels for +webhook, ntfy, and SMTP. Re-implementing them in Prometheus would +just duplicate state. If you do want Prom-side alerts, copy the +recording rules into your usual location. + +## Cardinality + +Per scrape: O(hosts) gauge rows + O(kinds × statuses × buckets) +histogram rows. A 100-host fleet emits roughly 700 host rows + 270 +histogram rows — well below any practical limit. There are no +`job_id` labels (cardinality bomb avoidance) and no per-source-group +labels. diff --git a/internal/server/config/config.go b/internal/server/config/config.go index ffb6363..2793913 100644 --- a/internal/server/config/config.go +++ b/internal/server/config/config.go @@ -41,6 +41,24 @@ type Config struct { // DataDir. Source-build deployments can override via // RM_BUNDLED_ASSETS_DIR. BundledAssetsDir string `yaml:"bundled_assets_dir"` + + // MetricsToken, if set, gates the /metrics scrape endpoint + // behind a `Authorization: Bearer ` check (constant-time + // compare). When neither this nor MetricsTrustedCIDRs is set, + // the route is not mounted at all (the endpoint is opt-in). + MetricsToken string `yaml:"metrics_token"` + + // MetricsTrustedCIDRs, if non-empty, gates /metrics so only + // callers from these networks may scrape. ANDed with + // MetricsToken when both are set. + MetricsTrustedCIDRs []string `yaml:"metrics_trusted_cidrs"` +} + +// MetricsAuthEnabled reports whether the operator has opted into +// exposing the Prometheus scrape endpoint by configuring at least +// one auth gate. +func (c Config) MetricsAuthEnabled() bool { + return c.MetricsToken != "" || len(c.MetricsTrustedCIDRs) > 0 } // Load resolves config in this order: @@ -93,6 +111,19 @@ func Load(yamlPath string) (Config, error) { if v, ok := os.LookupEnv("RM_BUNDLED_ASSETS_DIR"); ok { c.BundledAssetsDir = v } + if v, ok := os.LookupEnv("RM_METRICS_TOKEN"); ok { + c.MetricsToken = v + } + if v, ok := os.LookupEnv("RM_METRICS_TRUSTED_CIDR"); ok { + parts := strings.Split(v, ",") + c.MetricsTrustedCIDRs = c.MetricsTrustedCIDRs[:0] + for _, p := range parts { + p = strings.TrimSpace(p) + if p != "" { + c.MetricsTrustedCIDRs = append(c.MetricsTrustedCIDRs, p) + } + } + } if v, ok := os.LookupEnv("RM_TRUSTED_PROXY"); ok { // Comma-separated CIDRs; allow whitespace for readability. parts := strings.Split(v, ",") @@ -137,5 +168,10 @@ func (c *Config) validate() error { return fmt.Errorf("config: RM_TRUSTED_PROXY entry %q is not a valid CIDR: %w", cidr, err) } } + for _, cidr := range c.MetricsTrustedCIDRs { + if _, err := netip.ParsePrefix(cidr); err != nil { + return fmt.Errorf("config: RM_METRICS_TRUSTED_CIDR entry %q is not a valid CIDR: %w", cidr, err) + } + } return nil } diff --git a/internal/server/config/config_test.go b/internal/server/config/config_test.go index ba264f5..044af50 100644 --- a/internal/server/config/config_test.go +++ b/internal/server/config/config_test.go @@ -98,6 +98,45 @@ func TestCookieSecureDefaultAndOverride(t *testing.T) { } } +func TestMetricsAuthGates(t *testing.T) { + t.Setenv("RM_LISTEN", ":8080") + t.Setenv("RM_DATA_DIR", "/tmp/x") + + c, err := Load("") + if err != nil { + t.Fatalf("load: %v", err) + } + if c.MetricsAuthEnabled() { + t.Errorf("metrics endpoint should be off by default") + } + + t.Setenv("RM_METRICS_TOKEN", "s3cr3t-token-with-enough-bytes") + t.Setenv("RM_METRICS_TRUSTED_CIDR", "10.0.0.0/8, 192.168.1.0/24") + c, err = Load("") + if err != nil { + t.Fatalf("load: %v", err) + } + if c.MetricsToken != "s3cr3t-token-with-enough-bytes" { + t.Errorf("token: %q", c.MetricsToken) + } + if got := c.MetricsTrustedCIDRs; len(got) != 2 || got[0] != "10.0.0.0/8" || got[1] != "192.168.1.0/24" { + t.Errorf("cidrs: %v", got) + } + if !c.MetricsAuthEnabled() { + t.Errorf("MetricsAuthEnabled should be true") + } +} + +func TestMetricsTrustedCIDRRejectsGarbage(t *testing.T) { + t.Setenv("RM_LISTEN", ":8080") + t.Setenv("RM_DATA_DIR", "/tmp/x") + t.Setenv("RM_METRICS_TRUSTED_CIDR", "garbage") + + if _, err := Load(""); err == nil { + t.Fatal("expected validation error, got nil") + } +} + func writeFile(path string, body []byte) error { return writeFileImpl(path, body) } diff --git a/internal/server/http/metrics.go b/internal/server/http/metrics.go new file mode 100644 index 0000000..2c65ca8 --- /dev/null +++ b/internal/server/http/metrics.go @@ -0,0 +1,185 @@ +package http + +import ( + "context" + "crypto/subtle" + "net" + "net/http" + "net/netip" + "runtime" + "strings" + + "gitea.dcglab.co.uk/steve/restic-manager/internal/server/config" + "gitea.dcglab.co.uk/steve/restic-manager/internal/server/metrics" + "gitea.dcglab.co.uk/steve/restic-manager/internal/store" + "gitea.dcglab.co.uk/steve/restic-manager/internal/version" +) + +// handleMetrics serves the Prometheus exposition body. The route is +// only mounted when the operator has opted in via RM_METRICS_TOKEN +// or RM_METRICS_TRUSTED_CIDR (see Server.New + Cfg.MetricsAuthEnabled). +func (s *Server) handleMetrics(w http.ResponseWriter, r *http.Request) { + if !authoriseMetricsScrape(r, s.deps.Cfg) { + // 401 with no body; Prom respects this and surfaces the failed + // scrape. WWW-Authenticate hints at bearer when the operator + // actually configured a token. + if s.deps.Cfg.MetricsToken != "" { + w.Header().Set("WWW-Authenticate", `Bearer realm="restic-manager metrics"`) + } + w.WriteHeader(http.StatusUnauthorized) + return + } + + snap, err := s.gatherMetricsSnapshot(r.Context()) + if err != nil { + http.Error(w, "snapshot: "+err.Error(), http.StatusInternalServerError) + return + } + + // 0.0.4 is the long-stable text-format version Prometheus accepts + // without negotiation; OpenMetrics is intentionally not used here. + w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8") + if err := metrics.Render(w, snap); err != nil { + // Body is partially written; nothing useful we can do beyond + // dropping the connection (chi's recoverer will log). + return + } +} + +// authoriseMetricsScrape applies bearer + CIDR gates per the spec. +// AND semantics when both are configured; either alone is sufficient +// when only it is configured. +func authoriseMetricsScrape(r *http.Request, cfg config.Config) bool { + tokenOK := true + if cfg.MetricsToken != "" { + tokenOK = false + hdr := r.Header.Get("Authorization") + const prefix = "Bearer " + if strings.HasPrefix(hdr, prefix) { + got := []byte(strings.TrimPrefix(hdr, prefix)) + want := []byte(cfg.MetricsToken) + if subtle.ConstantTimeCompare(got, want) == 1 { + tokenOK = true + } + } + } + + cidrOK := true + if len(cfg.MetricsTrustedCIDRs) > 0 { + cidrOK = false + ip := callerIP(r, cfg.TrustedProxies) + if ip.IsValid() { + for _, c := range cfg.MetricsTrustedCIDRs { + prefix, err := netip.ParsePrefix(c) + if err != nil { + continue + } + if prefix.Contains(ip) { + cidrOK = true + break + } + } + } + } + return tokenOK && cidrOK +} + +// callerIP resolves the client IP. When the request hit the server +// directly we use RemoteAddr; when the immediate hop is a trusted +// proxy we honour the right-most untrusted X-Forwarded-For entry +// (mirrors how realIP middlewares typically resolve). +func callerIP(r *http.Request, trustedProxies []string) netip.Addr { + host, _, err := net.SplitHostPort(r.RemoteAddr) + if err != nil { + host = r.RemoteAddr + } + directAddr, err := netip.ParseAddr(host) + if err != nil { + return netip.Addr{} + } + + if !addrInAnyCIDR(directAddr, trustedProxies) { + return directAddr + } + + xff := r.Header.Get("X-Forwarded-For") + if xff == "" { + return directAddr + } + parts := strings.Split(xff, ",") + // Walk right→left, skipping trusted proxies, until we land on the + // first untrusted hop — that's the genuine client. + for i := len(parts) - 1; i >= 0; i-- { + p := strings.TrimSpace(parts[i]) + a, err := netip.ParseAddr(p) + if err != nil { + continue + } + if addrInAnyCIDR(a, trustedProxies) { + continue + } + return a + } + return directAddr +} + +func addrInAnyCIDR(a netip.Addr, cidrs []string) bool { + for _, c := range cidrs { + pre, err := netip.ParsePrefix(c) + if err != nil { + continue + } + if pre.Contains(a) { + return true + } + } + return false +} + +// gatherMetricsSnapshot pulls the data the renderer needs. One +// indexed query per per-host or fleet-wide read; no N+1. +func (s *Server) gatherMetricsSnapshot(ctx context.Context) (metrics.Snapshot, error) { + hosts, err := s.deps.Store.ListHosts(ctx) + if err != nil { + return metrics.Snapshot{}, err + } + hostRows := make([]metrics.HostRow, 0, len(hosts)) + for _, h := range hosts { + row := metrics.HostRow{ + ID: h.ID, + Name: h.Name, + Online: h.Status == "online", + SnapshotCount: h.SnapshotCount, + OpenAlertCount: h.OpenAlertCount, + RepoStatus: h.RepoStatus, + } + if h.LastBackupAt != nil { + ts := h.LastBackupAt.Unix() + row.LastBackupUnix = &ts + } + if h.LastBackupStatus != nil { + ok := *h.LastBackupStatus == "succeeded" + row.LastBackupSucceeded = &ok + } + if h.RepoSizeBytes > 0 { + sz := h.RepoSizeBytes + row.RepoSizeBytes = &sz + } + hostRows = append(hostRows, row) + } + + open, err := s.deps.Store.ListAlerts(ctx, store.AlertFilter{Status: "open"}) + if err != nil { + return metrics.Snapshot{}, err + } + bySeverity := map[string]int{"info": 0, "warning": 0, "critical": 0} + for _, a := range open { + bySeverity[a.Severity]++ + } + + reg := s.deps.Metrics + if reg == nil { + reg = metrics.NewRegistry() // empty histogram block + } + return reg.SnapshotWith(hostRows, bySeverity, version.Version, version.Commit, runtime.Version()), nil +} diff --git a/internal/server/http/metrics_test.go b/internal/server/http/metrics_test.go new file mode 100644 index 0000000..ef443e1 --- /dev/null +++ b/internal/server/http/metrics_test.go @@ -0,0 +1,209 @@ +package http + +import ( + "context" + "io" + stdhttp "net/http" + "net/http/httptest" + "path/filepath" + "strings" + "testing" + + "gitea.dcglab.co.uk/steve/restic-manager/internal/crypto" + "gitea.dcglab.co.uk/steve/restic-manager/internal/server/config" + "gitea.dcglab.co.uk/steve/restic-manager/internal/server/metrics" + "gitea.dcglab.co.uk/steve/restic-manager/internal/store" +) + +// newMetricsServer builds a Server with metrics enabled per cfg. +// Returns (URL, registry) so tests can both observe job durations +// directly and exercise the HTTP gate. +func newMetricsServer(t *testing.T, cfg config.Config) (string, *metrics.Registry, *store.Store) { + t.Helper() + dir := t.TempDir() + + st, err := store.Open(context.Background(), filepath.Join(dir, "rm.db")) + if err != nil { + t.Fatalf("store: %v", err) + } + t.Cleanup(func() { _ = st.Close() }) + + keyPath := filepath.Join(dir, "secret.key") + if err := crypto.GenerateKeyFile(keyPath); err != nil { + t.Fatalf("genkey: %v", err) + } + key, _ := crypto.LoadKeyFromFile(keyPath) + aead, _ := crypto.NewAEAD(key) + + cfg.Listen = ":0" + cfg.DataDir = dir + cfg.SecretKeyFile = keyPath + + reg := metrics.NewRegistry() + deps := Deps{ + Cfg: cfg, + Store: st, + AEAD: aead, + Metrics: reg, + } + s := New(deps) + ts := httptest.NewServer(s.srv.Handler) + t.Cleanup(ts.Close) + return ts.URL, reg, st +} + +func TestMetricsRouteNotMountedByDefault(t *testing.T) { + t.Parallel() + url, _, _ := newMetricsServer(t, config.Config{}) + res, err := stdhttp.Get(url + "/metrics") + if err != nil { + t.Fatalf("GET: %v", err) + } + defer res.Body.Close() + if res.StatusCode != stdhttp.StatusNotFound { + t.Errorf("status: got %d, want 404 (route should not be mounted)", res.StatusCode) + } +} + +func TestMetricsTokenRequired(t *testing.T) { + t.Parallel() + url, _, _ := newMetricsServer(t, config.Config{ + MetricsToken: "the-token", + }) + + // Missing token. + res, err := stdhttp.Get(url + "/metrics") + if err != nil { + t.Fatalf("GET: %v", err) + } + defer res.Body.Close() + if res.StatusCode != stdhttp.StatusUnauthorized { + t.Errorf("no token: got %d", res.StatusCode) + } + if !strings.Contains(res.Header.Get("WWW-Authenticate"), "Bearer") { + t.Errorf("WWW-Authenticate hint missing: %q", res.Header.Get("WWW-Authenticate")) + } + + // Wrong token. + req, _ := stdhttp.NewRequest(stdhttp.MethodGet, url+"/metrics", nil) + req.Header.Set("Authorization", "Bearer not-the-token") + res2, err := stdhttp.DefaultClient.Do(req) + if err != nil { + t.Fatalf("GET: %v", err) + } + defer res2.Body.Close() + if res2.StatusCode != stdhttp.StatusUnauthorized { + t.Errorf("wrong token: got %d", res2.StatusCode) + } + + // Right token. + req3, _ := stdhttp.NewRequest(stdhttp.MethodGet, url+"/metrics", nil) + req3.Header.Set("Authorization", "Bearer the-token") + res3, err3 := stdhttp.DefaultClient.Do(req3) + if err3 != nil { + t.Fatalf("GET: %v", err3) + } + defer res3.Body.Close() + if res3.StatusCode != stdhttp.StatusOK { + t.Errorf("right token: got %d", res3.StatusCode) + } + if ct := res3.Header.Get("Content-Type"); !strings.HasPrefix(ct, "text/plain") { + t.Errorf("content-type: %q", ct) + } +} + +func TestMetricsCIDRGate(t *testing.T) { + t.Parallel() + // 127.0.0.1 is what httptest hits with; pick a CIDR that excludes it + // to assert the "wrong source" branch. + url, _, _ := newMetricsServer(t, config.Config{ + MetricsTrustedCIDRs: []string{"10.0.0.0/8"}, + }) + res, err := stdhttp.Get(url + "/metrics") + if err != nil { + t.Fatalf("GET: %v", err) + } + defer res.Body.Close() + if res.StatusCode != stdhttp.StatusUnauthorized { + t.Errorf("loopback hitting non-matching CIDR: got %d, want 401", res.StatusCode) + } + + // Now allow loopback. + url2, _, _ := newMetricsServer(t, config.Config{ + MetricsTrustedCIDRs: []string{"127.0.0.0/8"}, + }) + res2, err := stdhttp.Get(url2 + "/metrics") + if err != nil { + t.Fatalf("GET: %v", err) + } + defer res2.Body.Close() + if res2.StatusCode != stdhttp.StatusOK { + t.Errorf("loopback in allow CIDR: got %d, want 200", res2.StatusCode) + } +} + +func TestMetricsTokenAndCIDRBothRequired(t *testing.T) { + t.Parallel() + url, _, _ := newMetricsServer(t, config.Config{ + MetricsToken: "the-token", + MetricsTrustedCIDRs: []string{"127.0.0.0/8"}, + }) + // Token only — CIDR ok (loopback) but token missing. + res, err := stdhttp.Get(url + "/metrics") + if err != nil { + t.Fatalf("GET: %v", err) + } + defer res.Body.Close() + if res.StatusCode != stdhttp.StatusUnauthorized { + t.Errorf("missing token but in CIDR: got %d", res.StatusCode) + } + + // Both right. + req, _ := stdhttp.NewRequest(stdhttp.MethodGet, url+"/metrics", nil) + req.Header.Set("Authorization", "Bearer the-token") + res2, err := stdhttp.DefaultClient.Do(req) + if err != nil { + t.Fatalf("GET: %v", err) + } + defer res2.Body.Close() + if res2.StatusCode != stdhttp.StatusOK { + t.Errorf("both right: got %d", res2.StatusCode) + } +} + +func readAll(t *testing.T, r io.Reader) string { + t.Helper() + b, err := io.ReadAll(r) + if err != nil { + t.Fatalf("read: %v", err) + } + return string(b) +} + +func TestMetricsBodyContainsExpectedLines(t *testing.T) { + t.Parallel() + url, reg, _ := newMetricsServer(t, config.Config{ + MetricsToken: "the-token", + }) + reg.ObserveJob("backup", "succeeded", 0) // produce one histogram row + + req, _ := stdhttp.NewRequest(stdhttp.MethodGet, url+"/metrics", nil) + req.Header.Set("Authorization", "Bearer the-token") + res, err := stdhttp.DefaultClient.Do(req) + if err != nil { + t.Fatalf("GET: %v", err) + } + defer res.Body.Close() + body := readAll(t, res.Body) + for _, want := range []string{ + "rm_hosts_total", + "rm_hosts_online", + `rm_active_alerts{severity="critical"}`, + "rm_build_info{", + "rm_job_duration_seconds_count{kind=\"backup\",status=\"succeeded\"}", + } { + if !strings.Contains(body, want) { + t.Errorf("body missing %q\n--- body ---\n%s", want, body) + } + } +} diff --git a/internal/server/http/server.go b/internal/server/http/server.go index c2d90c3..7d79cbf 100644 --- a/internal/server/http/server.go +++ b/internal/server/http/server.go @@ -17,6 +17,7 @@ import ( "gitea.dcglab.co.uk/steve/restic-manager/internal/crypto" "gitea.dcglab.co.uk/steve/restic-manager/internal/notification" "gitea.dcglab.co.uk/steve/restic-manager/internal/server/config" + "gitea.dcglab.co.uk/steve/restic-manager/internal/server/metrics" "gitea.dcglab.co.uk/steve/restic-manager/internal/server/oidc" "gitea.dcglab.co.uk/steve/restic-manager/internal/server/ui" "gitea.dcglab.co.uk/steve/restic-manager/internal/server/ws" @@ -56,6 +57,12 @@ type Deps struct { // OIDC (optional). Non-nil when the operator has configured an // IdP — handlers under /auth/oidc/* are mounted only when set. OIDC *oidc.Client + // Metrics (optional). When non-nil the WS job-finished branch + // records job durations and the /metrics handler can pull a + // histogram snapshot. Independent of MetricsAuthEnabled — the + // recorder runs even if the scrape endpoint is gated off, so a + // later config flip doesn't lose the running window. + Metrics *metrics.Registry } // Server is the running HTTP server. @@ -131,12 +138,16 @@ func (s *Server) routes(r chi.Router) { r.Get("/agent/binary", s.handleAgentBinary) r.Get("/install/*", s.handleInstallAsset) r.Get("/api/version", s.handleVersion) + if s.deps.Cfg.MetricsAuthEnabled() { + r.Get("/metrics", s.handleMetrics) + } if s.deps.Hub != nil { hd := ws.HandlerDeps{ Hub: s.deps.Hub, Store: s.deps.Store, JobHub: s.deps.JobHub, AlertEngine: s.deps.AlertEngine, + Metrics: s.deps.Metrics, OnHello: s.onAgentHello, OnScheduleAck: s.applyScheduleAck, OnScheduleFire: s.dispatchScheduledJob, diff --git a/internal/server/metrics/metrics.go b/internal/server/metrics/metrics.go new file mode 100644 index 0000000..588d796 --- /dev/null +++ b/internal/server/metrics/metrics.go @@ -0,0 +1,301 @@ +// Package metrics owns the in-process Prometheus exposition for +// the control plane. It deliberately avoids prometheus/client_golang +// — the legacy text format is small and stable, and the repo's house +// style is to keep dependency surface minimal. +// +// Two halves: +// +// - Registry holds a job-duration histogram. Server hooks call +// Registry.ObserveJob from the WS job-finished branch. +// +// - Render emits a complete /metrics body from a Snapshot. The +// Snapshot is a plain value bag; the HTTP handler assembles it +// from store reads + Registry.Snapshot at scrape time. This +// keeps the package free of any database or HTTP dependency. +package metrics + +import ( + "fmt" + "io" + "sort" + "strings" + "sync" + "time" +) + +// JobDurationBuckets is the upper-bound ladder for the job duration +// histogram, in seconds. Covers admin commands (unlock/init/check +// finishing in seconds) up through hours-long backups; +Inf is +// implicit. +var JobDurationBuckets = []float64{1, 5, 30, 60, 300, 1800, 3600, 21600, 86400} + +// Registry is the in-memory store for the job-duration histogram. +// Concurrent observers and a single periodic snapshotter is the +// expected access pattern; both are guarded by a mutex. +type Registry struct { + mu sync.Mutex + jobs map[jobKey]*histogramState + clock func() time.Time +} + +type jobKey struct{ kind, status string } + +type histogramState struct { + // counts[i] = number of observations <= JobDurationBuckets[i]. + // counts[len(JobDurationBuckets)] is the implicit +Inf bucket + // (== total count, kept here for symmetry with the rendered + // _bucket{le="+Inf"} line and as a sanity check). + counts []uint64 + sum float64 + count uint64 +} + +// NewRegistry builds an empty registry. +func NewRegistry() *Registry { + return &Registry{ + jobs: make(map[jobKey]*histogramState), + clock: time.Now, + } +} + +// ObserveJob records one job-duration sample. Negative durations +// (clock-skew artefacts) are clamped to zero. Empty kind/status +// strings are tolerated but degrade the dashboard — callers should +// pass meaningful values. +func (r *Registry) ObserveJob(kind, status string, dur time.Duration) { + if r == nil { + return + } + if dur < 0 { + dur = 0 + } + secs := dur.Seconds() + + r.mu.Lock() + defer r.mu.Unlock() + k := jobKey{kind: kind, status: status} + hs, ok := r.jobs[k] + if !ok { + hs = &histogramState{counts: make([]uint64, len(JobDurationBuckets)+1)} + r.jobs[k] = hs + } + for i, ub := range JobDurationBuckets { + if secs <= ub { + hs.counts[i]++ + } + } + hs.counts[len(JobDurationBuckets)]++ // +Inf + hs.sum += secs + hs.count++ +} + +// HistogramRow is one (kind,status) row in a Snapshot. Buckets is +// the cumulative count per upper bound (matching JobDurationBuckets, +// last element is the +Inf total). +type HistogramRow struct { + Kind string + Status string + Buckets []uint64 + Sum float64 + Count uint64 +} + +// snapshotJobs returns a deterministic, sorted copy of the +// histogram state. Sort order: kind asc, status asc. +func (r *Registry) snapshotJobs() []HistogramRow { + if r == nil { + return nil + } + r.mu.Lock() + defer r.mu.Unlock() + rows := make([]HistogramRow, 0, len(r.jobs)) + for k, hs := range r.jobs { + buckets := make([]uint64, len(hs.counts)) + copy(buckets, hs.counts) + rows = append(rows, HistogramRow{ + Kind: k.kind, + Status: k.status, + Buckets: buckets, + Sum: hs.sum, + Count: hs.count, + }) + } + sort.Slice(rows, func(i, j int) bool { + if rows[i].Kind != rows[j].Kind { + return rows[i].Kind < rows[j].Kind + } + return rows[i].Status < rows[j].Status + }) + return rows +} + +// HostRow is one host's projection for the per-host gauges. +// Pointers carry "no value" semantics so we can omit a metric line +// when, e.g., a host has never run a backup. +type HostRow struct { + ID string + Name string + Online bool + LastBackupUnix *int64 // nil = no backup yet + LastBackupSucceeded *bool // nil = no backup yet + RepoSizeBytes *int64 // nil = no stats yet + SnapshotCount int + OpenAlertCount int + RepoStatus string // "unknown" | "ready" | "init_failed" +} + +// Snapshot is a frozen view of the data needed to render /metrics. +// Constructed by the HTTP handler from Store reads + Registry.snapshotJobs. +type Snapshot struct { + Hosts []HostRow + HostsTotal int + HostsOnline int + AlertsBySeverity map[string]int // severity → count + BuildVersion string + BuildCommit string + GoVersion string + JobDurationRows []HistogramRow +} + +// SnapshotWith builds a Snapshot from raw inputs and the registry's +// current job-duration state. Convenience for the HTTP handler. +func (r *Registry) SnapshotWith(hosts []HostRow, alerts map[string]int, buildVer, commit, goVer string) Snapshot { + online := 0 + for _, h := range hosts { + if h.Online { + online++ + } + } + return Snapshot{ + Hosts: hosts, + HostsTotal: len(hosts), + HostsOnline: online, + AlertsBySeverity: alerts, + BuildVersion: buildVer, + BuildCommit: commit, + GoVersion: goVer, + JobDurationRows: r.snapshotJobs(), + } +} + +// Render emits a complete Prometheus text-exposition body for s. +// Output is deterministic: metric names appear in a fixed order and +// labels within a metric are sorted by their first label value. +func Render(w io.Writer, s Snapshot) error { + var b strings.Builder + + // --- Server gauges --------------------------------------------------- + b.WriteString("# HELP rm_hosts_total Total number of enrolled hosts (excludes pending announces).\n") + b.WriteString("# TYPE rm_hosts_total gauge\n") + fmt.Fprintf(&b, "rm_hosts_total %d\n", s.HostsTotal) + + b.WriteString("# HELP rm_hosts_online Number of hosts currently online (status='online').\n") + b.WriteString("# TYPE rm_hosts_online gauge\n") + fmt.Fprintf(&b, "rm_hosts_online %d\n", s.HostsOnline) + + b.WriteString("# HELP rm_active_alerts Open alerts grouped by severity.\n") + b.WriteString("# TYPE rm_active_alerts gauge\n") + severities := []string{"info", "warning", "critical"} + for _, sev := range severities { + fmt.Fprintf(&b, "rm_active_alerts{severity=%q} %d\n", sev, s.AlertsBySeverity[sev]) + } + + b.WriteString("# HELP rm_build_info Build identifying labels; value is always 1.\n") + b.WriteString("# TYPE rm_build_info gauge\n") + fmt.Fprintf(&b, "rm_build_info{version=%q,commit=%q,go_version=%q} 1\n", + s.BuildVersion, s.BuildCommit, s.GoVersion) + + // --- Per-host gauges ------------------------------------------------- + // Stable order: by host id. + hosts := append([]HostRow(nil), s.Hosts...) + sort.Slice(hosts, func(i, j int) bool { return hosts[i].ID < hosts[j].ID }) + + b.WriteString("# HELP rm_host_agent_online 1 if the agent is currently online, 0 otherwise.\n") + b.WriteString("# TYPE rm_host_agent_online gauge\n") + for _, h := range hosts { + v := 0 + if h.Online { + v = 1 + } + fmt.Fprintf(&b, "rm_host_agent_online{host_id=%q,host=%q} %d\n", + h.ID, h.Name, v) + } + + b.WriteString("# HELP rm_host_last_backup_timestamp_seconds Unix timestamp of the host's most recent backup. Omitted for hosts with no backup yet.\n") + b.WriteString("# TYPE rm_host_last_backup_timestamp_seconds gauge\n") + for _, h := range hosts { + if h.LastBackupUnix == nil { + continue + } + fmt.Fprintf(&b, "rm_host_last_backup_timestamp_seconds{host_id=%q,host=%q} %d\n", + h.ID, h.Name, *h.LastBackupUnix) + } + + b.WriteString("# HELP rm_host_last_backup_success 1 if the host's most recent backup succeeded, 0 otherwise. Omitted for hosts with no backup yet.\n") + b.WriteString("# TYPE rm_host_last_backup_success gauge\n") + for _, h := range hosts { + if h.LastBackupSucceeded == nil { + continue + } + v := 0 + if *h.LastBackupSucceeded { + v = 1 + } + fmt.Fprintf(&b, "rm_host_last_backup_success{host_id=%q,host=%q} %d\n", + h.ID, h.Name, v) + } + + b.WriteString("# HELP rm_host_repo_size_bytes Latest reported repo size from `restic stats --mode raw-data`. Omitted for hosts with no stats yet.\n") + b.WriteString("# TYPE rm_host_repo_size_bytes gauge\n") + for _, h := range hosts { + if h.RepoSizeBytes == nil { + continue + } + fmt.Fprintf(&b, "rm_host_repo_size_bytes{host_id=%q,host=%q} %d\n", + h.ID, h.Name, *h.RepoSizeBytes) + } + + b.WriteString("# HELP rm_host_snapshot_count Number of restic snapshots known on the host's repo.\n") + b.WriteString("# TYPE rm_host_snapshot_count gauge\n") + for _, h := range hosts { + fmt.Fprintf(&b, "rm_host_snapshot_count{host_id=%q,host=%q} %d\n", + h.ID, h.Name, h.SnapshotCount) + } + + b.WriteString("# HELP rm_host_open_alerts Number of currently open alerts attached to this host.\n") + b.WriteString("# TYPE rm_host_open_alerts gauge\n") + for _, h := range hosts { + fmt.Fprintf(&b, "rm_host_open_alerts{host_id=%q,host=%q} %d\n", + h.ID, h.Name, h.OpenAlertCount) + } + + b.WriteString("# HELP rm_host_repo_status Repo readiness state for the host. Exactly one row per host with status label set.\n") + b.WriteString("# TYPE rm_host_repo_status gauge\n") + for _, h := range hosts { + st := h.RepoStatus + if st == "" { + st = "unknown" + } + fmt.Fprintf(&b, "rm_host_repo_status{host_id=%q,host=%q,status=%q} 1\n", + h.ID, h.Name, st) + } + + // --- Histogram ------------------------------------------------------- + b.WriteString("# HELP rm_job_duration_seconds End-to-end duration of completed jobs, by kind and terminal status.\n") + b.WriteString("# TYPE rm_job_duration_seconds histogram\n") + for _, row := range s.JobDurationRows { + for i, ub := range JobDurationBuckets { + fmt.Fprintf(&b, "rm_job_duration_seconds_bucket{kind=%q,status=%q,le=\"%g\"} %d\n", + row.Kind, row.Status, ub, row.Buckets[i]) + } + fmt.Fprintf(&b, "rm_job_duration_seconds_bucket{kind=%q,status=%q,le=\"+Inf\"} %d\n", + row.Kind, row.Status, row.Buckets[len(JobDurationBuckets)]) + fmt.Fprintf(&b, "rm_job_duration_seconds_sum{kind=%q,status=%q} %g\n", + row.Kind, row.Status, row.Sum) + fmt.Fprintf(&b, "rm_job_duration_seconds_count{kind=%q,status=%q} %d\n", + row.Kind, row.Status, row.Count) + } + + _, err := io.WriteString(w, b.String()) + return err +} diff --git a/internal/server/metrics/metrics_test.go b/internal/server/metrics/metrics_test.go new file mode 100644 index 0000000..70c5ed7 --- /dev/null +++ b/internal/server/metrics/metrics_test.go @@ -0,0 +1,182 @@ +package metrics + +import ( + "bytes" + "strings" + "sync" + "testing" + "time" +) + +func TestObserveJobBuckets(t *testing.T) { + r := NewRegistry() + // Bucket boundaries: 1, 5, 30, 60, 300, 1800, 3600, 21600, 86400 + r.ObserveJob("backup", "succeeded", 500*time.Millisecond) // <= 1 + r.ObserveJob("backup", "succeeded", 30*time.Second) // == 30 (boundary) + r.ObserveJob("backup", "succeeded", 90*time.Second) // > 60, <= 300 + r.ObserveJob("backup", "succeeded", 2*time.Hour) // > 3600 → 21600 bucket + rows := r.snapshotJobs() + if len(rows) != 1 { + t.Fatalf("rows: %d", len(rows)) + } + row := rows[0] + if row.Count != 4 { + t.Errorf("count: %d", row.Count) + } + wantSum := 0.5 + 30 + 90 + 7200.0 + if row.Sum != wantSum { + t.Errorf("sum: got %v want %v", row.Sum, wantSum) + } + // Cumulative buckets: + // le=1 → 1 (the 0.5s) + // le=5 → 1 + // le=30 → 2 (boundary inclusive: 30s included) + // le=60 → 2 + // le=300 → 3 + // le=1800 → 3 + // le=3600 → 3 + // le=21600 → 4 + // le=86400 → 4 + // le=+Inf → 4 + want := []uint64{1, 1, 2, 2, 3, 3, 3, 4, 4, 4} + for i, w := range want { + if row.Buckets[i] != w { + t.Errorf("bucket[%d]=%d want %d", i, row.Buckets[i], w) + } + } +} + +func TestObserveJobNegativeClampedToZero(t *testing.T) { + r := NewRegistry() + r.ObserveJob("backup", "succeeded", -5*time.Second) + rows := r.snapshotJobs() + if len(rows) != 1 || rows[0].Sum != 0 || rows[0].Count != 1 { + t.Errorf("expected one zero-second observation, got %+v", rows) + } +} + +func TestObserveJobConcurrent(t *testing.T) { + r := NewRegistry() + const goroutines = 16 + const each = 200 + var wg sync.WaitGroup + for g := 0; g < goroutines; g++ { + wg.Add(1) + go func() { + defer wg.Done() + for i := 0; i < each; i++ { + r.ObserveJob("backup", "succeeded", time.Second) + } + }() + } + wg.Wait() + rows := r.snapshotJobs() + if len(rows) != 1 { + t.Fatalf("rows: %d", len(rows)) + } + if rows[0].Count != uint64(goroutines*each) { + t.Errorf("count: got %d want %d", rows[0].Count, goroutines*each) + } +} + +func TestObserveJobNilRegistryNoop(t *testing.T) { + var r *Registry // nil + r.ObserveJob("backup", "succeeded", time.Second) +} + +func TestRenderGolden(t *testing.T) { + r := NewRegistry() + r.ObserveJob("backup", "succeeded", 5*time.Second) + r.ObserveJob("forget", "succeeded", 100*time.Millisecond) + + pi64 := func(v int64) *int64 { return &v } + pbool := func(v bool) *bool { return &v } + + hosts := []HostRow{ + { + ID: "01H0001", Name: "alpha", + Online: true, + LastBackupUnix: pi64(1700000000), + LastBackupSucceeded: pbool(true), + RepoSizeBytes: pi64(123456789), + SnapshotCount: 42, + OpenAlertCount: 0, + RepoStatus: "ready", + }, + { + ID: "01H0002", Name: "bravo", + Online: false, + SnapshotCount: 0, + OpenAlertCount: 1, + RepoStatus: "init_failed", + }, + } + snap := r.SnapshotWith(hosts, + map[string]int{"info": 0, "warning": 1, "critical": 0}, + "v1.2.3", "deadbeef", "go1.25.0") + + var buf bytes.Buffer + if err := Render(&buf, snap); err != nil { + t.Fatalf("render: %v", err) + } + out := buf.String() + + for _, want := range []string{ + "# HELP rm_hosts_total ", + "rm_hosts_total 2\n", + "rm_hosts_online 1\n", + `rm_active_alerts{severity="warning"} 1`, + `rm_active_alerts{severity="info"} 0`, + `rm_active_alerts{severity="critical"} 0`, + `rm_build_info{version="v1.2.3",commit="deadbeef",go_version="go1.25.0"} 1`, + `rm_host_agent_online{host_id="01H0001",host="alpha"} 1`, + `rm_host_agent_online{host_id="01H0002",host="bravo"} 0`, + `rm_host_last_backup_timestamp_seconds{host_id="01H0001",host="alpha"} 1700000000`, + `rm_host_last_backup_success{host_id="01H0001",host="alpha"} 1`, + `rm_host_repo_size_bytes{host_id="01H0001",host="alpha"} 123456789`, + `rm_host_snapshot_count{host_id="01H0001",host="alpha"} 42`, + `rm_host_snapshot_count{host_id="01H0002",host="bravo"} 0`, + `rm_host_open_alerts{host_id="01H0002",host="bravo"} 1`, + `rm_host_repo_status{host_id="01H0001",host="alpha",status="ready"} 1`, + `rm_host_repo_status{host_id="01H0002",host="bravo",status="init_failed"} 1`, + `rm_job_duration_seconds_bucket{kind="backup",status="succeeded",le="1"} 0`, + `rm_job_duration_seconds_bucket{kind="backup",status="succeeded",le="5"} 1`, + `rm_job_duration_seconds_bucket{kind="backup",status="succeeded",le="+Inf"} 1`, + `rm_job_duration_seconds_sum{kind="backup",status="succeeded"} 5`, + `rm_job_duration_seconds_count{kind="backup",status="succeeded"} 1`, + `rm_job_duration_seconds_bucket{kind="forget",status="succeeded",le="1"} 1`, + } { + if !strings.Contains(out, want) { + t.Errorf("missing line:\n %s\n--- full output ---\n%s", want, out) + } + } + + // bravo had no last backup → those metric lines must be absent for it. + for _, ban := range []string{ + `rm_host_last_backup_timestamp_seconds{host_id="01H0002"`, + `rm_host_last_backup_success{host_id="01H0002"`, + `rm_host_repo_size_bytes{host_id="01H0002"`, + } { + if strings.Contains(out, ban) { + t.Errorf("unexpected line for bravo: %q", ban) + } + } +} + +func TestRenderEmptySnapshot(t *testing.T) { + r := NewRegistry() + snap := r.SnapshotWith(nil, nil, "dev", "", "go1.25.0") + var buf bytes.Buffer + if err := Render(&buf, snap); err != nil { + t.Fatalf("render: %v", err) + } + out := buf.String() + if !strings.Contains(out, "rm_hosts_total 0\n") { + t.Errorf("missing zero-host gauge:\n%s", out) + } + // Histogram block has its HELP/TYPE but no rows. The HELP/TYPE + // presence is correct and helps Prometheus pre-register the metric. + if !strings.Contains(out, "# TYPE rm_job_duration_seconds histogram") { + t.Errorf("histogram HELP/TYPE missing") + } +} diff --git a/internal/server/ws/handler.go b/internal/server/ws/handler.go index 4fd0e4c..6c54b81 100644 --- a/internal/server/ws/handler.go +++ b/internal/server/ws/handler.go @@ -15,6 +15,7 @@ import ( "gitea.dcglab.co.uk/steve/restic-manager/internal/alert" "gitea.dcglab.co.uk/steve/restic-manager/internal/api" "gitea.dcglab.co.uk/steve/restic-manager/internal/auth" + "gitea.dcglab.co.uk/steve/restic-manager/internal/server/metrics" "gitea.dcglab.co.uk/steve/restic-manager/internal/store" "gitea.dcglab.co.uk/steve/restic-manager/internal/version" ) @@ -27,6 +28,9 @@ type HandlerDeps struct { // AlertEngine receives job-finished and host-online events so the // alert engine can evaluate its rules. Optional; nil = no-op. AlertEngine *alert.Engine + // Metrics records job-duration observations on every terminal + // status. Optional; nil = no-op (test fixtures pass nil). + Metrics *metrics.Registry // UpdateWatcher reconciles in-flight agent-update dispatches against // hello envelopes. Optional; nil = no-op. UpdateWatcher *UpdateWatcher @@ -239,6 +243,13 @@ func dispatchAgentMessage(ctx context.Context, c *Conn, hostID string, env api.E slog.Warn("ws: set host last backup", "host_id", hostID, "err", err) } } + // Job-duration histogram (P6-04). Skip when StartedAt is + // missing (race: agent shipped finished without a started, + // or the row predates this code). + if deps.Metrics != nil && job.StartedAt != nil { + deps.Metrics.ObserveJob(job.Kind, string(p.Status), + p.FinishedAt.Sub(*job.StartedAt)) + } } if deps.JobHub != nil { deps.JobHub.Broadcast(p.JobID, env) diff --git a/tasks.md b/tasks.md index 9aaf4e8..0f016b6 100644 --- a/tasks.md +++ b/tasks.md @@ -390,8 +390,45 @@ Sizes: **S** = under a day, **M** = 1–3 days, **L** = 3–7 days. > swap, helper `buildRepoTrendView` shared between page-load and > fragment endpoint). No new dependencies, no client JS, no agent > change. CI green; in-browser smoke walk-through pending operator. -- [ ] **P6-04** (M) Prometheus `/metrics` endpoint: per-host gauges (last backup timestamp, last backup status, repo size, snapshot count, agent online), server gauges (active alerts, build info), job duration histograms; protected by bearer token or IP allow-list. _(Was P4-08.)_ -- [ ] **P6-05** (S) Document Prometheus integration + sample Grafana dashboard JSON. _(Was P4-09.)_ +- [x] **P6-04** (M) Prometheus `/metrics` endpoint: per-host gauges (last backup timestamp, last backup status, repo size, snapshot count, agent online), server gauges (active alerts, build info), job duration histograms; protected by bearer token or IP allow-list. _(Was P4-08.)_ +- [x] **P6-05** (S) Document Prometheus integration + sample Grafana dashboard JSON. _(Was P4-09.)_ + +> **As shipped (2026-05-07, branch `p6-04-05-prometheus-metrics`):** +> Spec `docs/superpowers/specs/2026-05-07-p6-04-05-prometheus-metrics-design.md`, +> plan `docs/superpowers/plans/2026-05-07-p6-04-05-prometheus-metrics.md`. +> New `internal/server/metrics` package emits the legacy +> `text/plain; version=0.0.4` exposition format directly — no +> `prometheus/client_golang` dependency, matching the repo's +> "no Tailwind, no Node" minimal-deps style. `/metrics` is **opt-in**: +> `RM_METRICS_TOKEN` and/or `RM_METRICS_TRUSTED_CIDR` must be set or +> the route isn't mounted at all (404). When both are set, both must +> pass; either alone gates access. Token compare is constant-time. +> CIDR check honours `X-Forwarded-For` only when the immediate hop +> is a configured `RM_TRUSTED_PROXY` (mirrors the existing realIP +> resolution). +> +> **Metrics:** per-host gauges (`rm_host_agent_online`, +> `rm_host_last_backup_timestamp_seconds`, `rm_host_last_backup_success`, +> `rm_host_repo_size_bytes`, `rm_host_snapshot_count`, +> `rm_host_open_alerts`, `rm_host_repo_status`); server gauges +> (`rm_hosts_total`, `rm_hosts_online`, `rm_active_alerts{severity}`, +> `rm_build_info{version,commit,go_version}`); histogram +> `rm_job_duration_seconds_bucket{kind,status,le}` with buckets +> `1, 5, 30, 60, 300, 1800, 3600, 21600, 86400, +Inf`. +> Histogram is in-memory; observations come from the existing +> `MsgJobFinished` branch in `internal/server/ws/handler.go`. +> +> **Docs:** `docs/prometheus.md` covers enable + scrape config + +> metric reference + dashboard import. **Dashboard:** +> `deploy/grafana/restic-manager-dashboard.json` — six panels +> (fleet status, open alerts, backups failing, hosts table, repo +> size over time, job-duration p95). Schema 39, single Prometheus +> datasource variable. +> +> **Tests:** golden-render + concurrent-observe + bucket-boundary +> in the metrics package; auth matrix (no auth → 404; token +> missing/wrong/right; CIDR matching/non-matching; token AND CIDR) +> in the HTTP layer. ### Phase 6 acceptance