Files
steve ccd14f7cee P6-04+05: Prometheus /metrics endpoint + Grafana dashboard
New internal/server/metrics package emits the legacy text/plain
exposition format directly, so we don't pull in
prometheus/client_golang. Endpoint is opt-in via RM_METRICS_TOKEN
and/or RM_METRICS_TRUSTED_CIDR; route is not mounted at all if
neither gate is set. Both gates ANDed when both configured.

Per-host gauges (online, last_backup_*, repo_size_bytes,
snapshot_count, open_alerts, repo_status), server gauges
(hosts_total/online, active_alerts by severity, build_info), and
an in-memory job-duration histogram observed from the existing
MsgJobFinished branch in the WS handler.

Docs in docs/prometheus.md (enable + scrape config + metric
reference + dashboard import). Sample dashboard at
deploy/grafana/restic-manager-dashboard.json - six panels,
Grafana schema 39, single Prometheus datasource variable.

Tests: golden render, concurrent observe, bucket boundaries in
the metrics package; auth matrix (no auth -> 404, token gate,
CIDR gate, both required) in the HTTP layer.
2026-05-07 23:17:15 +01:00

178 lines
5.3 KiB
Go

// Package config loads server configuration from environment variables
// (the canonical source), layered over built-in defaults and an
// optional YAML file; env values override both. Documented vars are
// listed in spec.md §4.1.
package config
import (
	"fmt"
	"net"
	"net/netip"
	"os"
	"path/filepath"
	"strings"

	"gopkg.in/yaml.v3"
)
// Config holds runtime parameters resolved from env + (optionally) a
// YAML file. Env wins over YAML so operators can tweak a single var
// without rewriting the file.
//
// The server is HTTP-only by design: the expected deployment fronts it
// with a TLS-terminating reverse proxy (Caddy/Traefik/nginx). See
// spec.md §11 for the rationale.
type Config struct {
// Listen is the address the server listens on (RM_LISTEN). Load
// defaults it to ":8080"; validation rejects an empty value or one
// that net.SplitHostPort cannot parse.
Listen string `yaml:"listen"`
// DataDir is the data directory (RM_DATA_DIR). Load defaults it to
// "/data"; validation rejects an empty value.
DataDir string `yaml:"data_dir"`
// BaseURL is taken as-is from RM_BASE_URL or the YAML file; it has
// no default and is not validated in this package.
// NOTE(review): presumably the externally visible URL of the
// server — confirm against its consumers.
BaseURL string `yaml:"base_url"`
// SecretKeyFile is the path of the secret key file
// (RM_SECRET_KEY_FILE). When left empty, validation defaults it to
// a file named secret.key inside DataDir.
SecretKeyFile string `yaml:"secret_key_file"`
// TrustedProxies is a list of CIDRs (RM_TRUSTED_PROXY,
// comma-separated in the environment). Validation requires every
// entry to parse as a CIDR prefix.
// NOTE(review): presumably the reverse proxies whose forwarded
// client address is honoured — confirm against the HTTP layer.
TrustedProxies []string `yaml:"trusted_proxies"`
// CookieSecure controls the Secure attribute on session cookies.
// Defaults to true. Set RM_COOKIE_SECURE=false (or "0") only for
// local HTTP testing — production deployments are always behind a
// TLS proxy and the cookie must be Secure. Any other value of the
// variable turns the attribute on.
CookieSecure bool `yaml:"cookie_secure"`
// OIDCRaw is the `oidc` section as decoded from the YAML file; it
// stays nil when the file has no such section. Load hands a copy
// of it to loadOIDC and stores the result in OIDC.
OIDCRaw *OIDCConfig `yaml:"oidc"`
// OIDC is the value returned by loadOIDC during Load. It is never
// read from YAML (tag "-").
OIDC *OIDCConfig `yaml:"-"`
// BundledAssetsDir is the read-only path inside the image that
// holds agent binaries (under agent-binaries/) and install
// scripts (under install/). The /agent/binary and /install/*
// handlers fall back here when the file is not present in
// DataDir. Defaults to "/opt/restic-manager/dist"; source-build
// deployments can override via RM_BUNDLED_ASSETS_DIR.
BundledAssetsDir string `yaml:"bundled_assets_dir"`
// MetricsToken (RM_METRICS_TOKEN), if set, gates the /metrics
// scrape endpoint behind an `Authorization: Bearer <token>` check
// (constant-time compare). When neither this nor
// MetricsTrustedCIDRs is set, the route is not mounted at all (the
// endpoint is opt-in).
MetricsToken string `yaml:"metrics_token"`
// MetricsTrustedCIDRs (RM_METRICS_TRUSTED_CIDR, comma-separated in
// the environment), if non-empty, gates /metrics so only callers
// from these networks may scrape. ANDed with MetricsToken when
// both are set. Validation requires every entry to parse as a CIDR
// prefix.
MetricsTrustedCIDRs []string `yaml:"metrics_trusted_cidrs"`
}
// MetricsAuthEnabled tells the caller whether the Prometheus scrape
// endpoint has been opted into, i.e. whether a bearer token, a
// trusted-CIDR list, or both have been configured.
func (c Config) MetricsAuthEnabled() bool {
	if c.MetricsToken != "" {
		return true
	}
	return len(c.MetricsTrustedCIDRs) != 0
}
// Load resolves config in this order:
//  1. defaults
//  2. YAML at the given path (if non-empty and exists)
//  3. environment variables (RM_LISTEN, RM_DATA_DIR, …)
//
// A missing YAML file is not an error; an unreadable or unparsable one
// is. The list-valued variables RM_TRUSTED_PROXY and
// RM_METRICS_TRUSTED_CIDR are comma-separated, tolerate whitespace
// around entries, and replace any list read from YAML when set.
//
// The result is validated; a nil error means the server is safe to
// start. On a non-nil error the returned Config is only partially
// resolved and should not be used.
func Load(yamlPath string) (Config, error) {
	c := Config{
		Listen:           ":8080",
		DataDir:          "/data",
		CookieSecure:     true,
		BundledAssetsDir: "/opt/restic-manager/dist",
	}

	if yamlPath != "" {
		body, err := os.ReadFile(yamlPath)
		if err != nil && !os.IsNotExist(err) {
			return c, fmt.Errorf("config: read %q: %w", yamlPath, err)
		}
		if err == nil {
			if err := yaml.Unmarshal(body, &c); err != nil {
				return c, fmt.Errorf("config: parse %q: %w", yamlPath, err)
			}
		}
	}

	// envList splits a comma-separated value into trimmed, non-empty
	// entries. Kept local to Load so it cannot clash with other
	// package-level helpers.
	envList := func(v string) []string {
		var out []string
		for _, p := range strings.Split(v, ",") {
			if p = strings.TrimSpace(p); p != "" {
				out = append(out, p)
			}
		}
		return out
	}

	if v, ok := os.LookupEnv("RM_LISTEN"); ok {
		c.Listen = v
	}
	if v, ok := os.LookupEnv("RM_DATA_DIR"); ok {
		c.DataDir = v
	}
	if v, ok := os.LookupEnv("RM_BASE_URL"); ok {
		c.BaseURL = v
	}
	if v, ok := os.LookupEnv("RM_SECRET_KEY_FILE"); ok {
		c.SecretKeyFile = v
	}
	if v, ok := os.LookupEnv("RM_COOKIE_SECURE"); ok {
		// Only the exact values "false" and "0" switch the Secure
		// attribute off. Any other value (typos included) forces it
		// on, which also overrides a `cookie_secure: false` from YAML.
		c.CookieSecure = v != "false" && v != "0"
	}
	if v, ok := os.LookupEnv("RM_BUNDLED_ASSETS_DIR"); ok {
		c.BundledAssetsDir = v
	}
	if v, ok := os.LookupEnv("RM_METRICS_TOKEN"); ok {
		c.MetricsToken = v
	}
	if v, ok := os.LookupEnv("RM_METRICS_TRUSTED_CIDR"); ok {
		c.MetricsTrustedCIDRs = envList(v)
	}
	if v, ok := os.LookupEnv("RM_TRUSTED_PROXY"); ok {
		c.TrustedProxies = envList(v)
	}

	// loadOIDC receives an environment snapshot plus the YAML `oidc`
	// section by value (the zero OIDCConfig when the section is absent).
	var rawOIDC OIDCConfig
	if c.OIDCRaw != nil {
		rawOIDC = *c.OIDCRaw
	}
	oidc, err := loadOIDC(envSnapshot(), rawOIDC)
	if err != nil {
		return c, err
	}
	c.OIDC = oidc

	// validate fills in defaults on c through its pointer receiver, so
	// it must finish before c is copied into the return value. In
	// `return c, c.validate()` the order between reading c and making
	// the call is unspecified by the language.
	if err := c.validate(); err != nil {
		return c, err
	}
	return c, nil
}
// validate checks the resolved configuration and fills in defaults
// that depend on other fields. It mutates the receiver: an empty
// SecretKeyFile becomes "secret.key" inside DataDir.
//
// It returns an error when Listen is empty or is not a host:port pair
// accepted by net.SplitHostPort, when DataDir is empty, or when any
// TrustedProxies or MetricsTrustedCIDRs entry is not a valid CIDR
// prefix.
func (c *Config) validate() error {
	if c.Listen == "" {
		return fmt.Errorf("config: RM_LISTEN must be set")
	}
	if _, _, err := net.SplitHostPort(c.Listen); err != nil {
		return fmt.Errorf("config: RM_LISTEN %q invalid: %w", c.Listen, err)
	}
	if c.DataDir == "" {
		return fmt.Errorf("config: RM_DATA_DIR must be set")
	}
	if c.SecretKeyFile == "" {
		// Default to the data dir. filepath.Join avoids a doubled
		// separator when DataDir already ends in one.
		c.SecretKeyFile = filepath.Join(c.DataDir, "secret.key")
	}
	for _, cidr := range c.TrustedProxies {
		if _, err := netip.ParsePrefix(cidr); err != nil {
			return fmt.Errorf("config: RM_TRUSTED_PROXY entry %q is not a valid CIDR: %w", cidr, err)
		}
	}
	for _, cidr := range c.MetricsTrustedCIDRs {
		if _, err := netip.ParsePrefix(cidr); err != nil {
			return fmt.Errorf("config: RM_METRICS_TRUSTED_CIDR entry %q is not a valid CIDR: %w", cidr, err)
		}
	}
	return nil
}