P6-04+05: Prometheus /metrics endpoint + Grafana dashboard

New internal/server/metrics package emits the legacy text/plain
exposition format directly, so we don't pull in
prometheus/client_golang. Endpoint is opt-in via RM_METRICS_TOKEN
and/or RM_METRICS_TRUSTED_CIDR; route is not mounted at all if
neither gate is set. Both gates are ANDed when both are configured.

Per-host gauges (online, last_backup_*, repo_size_bytes,
snapshot_count, open_alerts, repo_status), server gauges
(hosts_total/online, active_alerts by severity, build_info), and
an in-memory job-duration histogram observed from the existing
MsgJobFinished branch in the WS handler.

Docs in docs/prometheus.md (enable + scrape config + metric
reference + dashboard import). Sample dashboard at
deploy/grafana/restic-manager-dashboard.json - six panels,
Grafana schema 39, single Prometheus datasource variable.

Tests: golden render, concurrent observe, bucket boundaries in
the metrics package; auth matrix (no auth -> 404, token gate,
CIDR gate, both required) in the HTTP layer.
This commit is contained in:
2026-05-07 23:17:15 +01:00
parent 07bce16c84
commit ccd14f7cee
12 changed files with 1480 additions and 2 deletions
+36
View File
@@ -41,6 +41,24 @@ type Config struct {
// DataDir. Source-build deployments can override via
// RM_BUNDLED_ASSETS_DIR.
BundledAssetsDir string `yaml:"bundled_assets_dir"`
// MetricsToken, if set, gates the /metrics scrape endpoint
// behind an `Authorization: Bearer <token>` check (constant-time
// compare). When neither this nor MetricsTrustedCIDRs is set,
// the route is not mounted at all (the endpoint is opt-in).
MetricsToken string `yaml:"metrics_token"`
// MetricsTrustedCIDRs, if non-empty, gates /metrics so only
// callers from these networks may scrape. ANDed with
// MetricsToken when both are set.
MetricsTrustedCIDRs []string `yaml:"metrics_trusted_cidrs"`
}
// MetricsAuthEnabled returns true when at least one /metrics auth
// gate is configured: a non-empty MetricsToken, or one or more
// entries in MetricsTrustedCIDRs. It returns false when neither is
// set.
func (c Config) MetricsAuthEnabled() bool {
	// A bearer token alone is enough to count as opted in.
	if c.MetricsToken != "" {
		return true
	}
	// Otherwise the CIDR allow-list decides.
	return len(c.MetricsTrustedCIDRs) != 0
}
// Load resolves config in this order:
@@ -93,6 +111,19 @@ func Load(yamlPath string) (Config, error) {
if v, ok := os.LookupEnv("RM_BUNDLED_ASSETS_DIR"); ok {
c.BundledAssetsDir = v
}
if v, ok := os.LookupEnv("RM_METRICS_TOKEN"); ok {
c.MetricsToken = v
}
if v, ok := os.LookupEnv("RM_METRICS_TRUSTED_CIDR"); ok {
parts := strings.Split(v, ",")
c.MetricsTrustedCIDRs = c.MetricsTrustedCIDRs[:0]
for _, p := range parts {
p = strings.TrimSpace(p)
if p != "" {
c.MetricsTrustedCIDRs = append(c.MetricsTrustedCIDRs, p)
}
}
}
if v, ok := os.LookupEnv("RM_TRUSTED_PROXY"); ok {
// Comma-separated CIDRs; allow whitespace for readability.
parts := strings.Split(v, ",")
@@ -137,5 +168,10 @@ func (c *Config) validate() error {
return fmt.Errorf("config: RM_TRUSTED_PROXY entry %q is not a valid CIDR: %w", cidr, err)
}
}
for _, cidr := range c.MetricsTrustedCIDRs {
if _, err := netip.ParsePrefix(cidr); err != nil {
return fmt.Errorf("config: RM_METRICS_TRUSTED_CIDR entry %q is not a valid CIDR: %w", cidr, err)
}
}
return nil
}
+39
View File
@@ -98,6 +98,45 @@ func TestCookieSecureDefaultAndOverride(t *testing.T) {
}
}
// TestMetricsAuthGates loads the config twice: first with only the
// listen address and data dir set, expecting MetricsAuthEnabled to be
// false; then with RM_METRICS_TOKEN and RM_METRICS_TRUSTED_CIDR set,
// expecting the token to be copied verbatim, the comma-separated CIDR
// list to be split and whitespace-trimmed, and MetricsAuthEnabled to
// be true.
func TestMetricsAuthGates(t *testing.T) {
	const wantToken = "s3cr3t-token-with-enough-bytes"
	wantCIDRs := []string{"10.0.0.0/8", "192.168.1.0/24"}

	t.Setenv("RM_LISTEN", ":8080")
	t.Setenv("RM_DATA_DIR", "/tmp/x")

	// Phase 1: no metrics env vars set by this test.
	cfg, loadErr := Load("")
	if loadErr != nil {
		t.Fatalf("load: %v", loadErr)
	}
	if cfg.MetricsAuthEnabled() {
		t.Errorf("metrics endpoint should be off by default")
	}

	// Phase 2: both gates supplied via the environment. The CIDR
	// value deliberately carries a space after the comma.
	t.Setenv("RM_METRICS_TOKEN", wantToken)
	t.Setenv("RM_METRICS_TRUSTED_CIDR", "10.0.0.0/8, 192.168.1.0/24")

	cfg, loadErr = Load("")
	if loadErr != nil {
		t.Fatalf("load: %v", loadErr)
	}

	if cfg.MetricsToken != wantToken {
		t.Errorf("token: %q", cfg.MetricsToken)
	}

	gotCIDRs := cfg.MetricsTrustedCIDRs
	same := len(gotCIDRs) == len(wantCIDRs)
	for i := 0; same && i < len(wantCIDRs); i++ {
		same = gotCIDRs[i] == wantCIDRs[i]
	}
	if !same {
		t.Errorf("cidrs: %v", gotCIDRs)
	}

	if !cfg.MetricsAuthEnabled() {
		t.Errorf("MetricsAuthEnabled should be true")
	}
}
// TestMetricsTrustedCIDRRejectsGarbage sets RM_METRICS_TRUSTED_CIDR to
// a value that is not a CIDR and expects Load to return an error.
func TestMetricsTrustedCIDRRejectsGarbage(t *testing.T) {
	env := [][2]string{
		{"RM_LISTEN", ":8080"},
		{"RM_DATA_DIR", "/tmp/x"},
		{"RM_METRICS_TRUSTED_CIDR", "garbage"},
	}
	for _, kv := range env {
		t.Setenv(kv[0], kv[1])
	}

	_, loadErr := Load("")
	if loadErr != nil {
		// Any error counts as the expected rejection.
		return
	}
	t.Fatal("expected validation error, got nil")
}
// writeFile passes path and body to writeFileImpl and returns its
// error unchanged.
func writeFile(path string, body []byte) error {
	err := writeFileImpl(path, body)
	return err
}