P6-04+05: Prometheus /metrics endpoint + Grafana dashboard
The new internal/server/metrics package emits the legacy text/plain exposition format directly, so we don't pull in prometheus/client_golang. The endpoint is opt-in via RM_METRICS_TOKEN and/or RM_METRICS_TRUSTED_CIDR; the route is not mounted at all if neither gate is set, and the two gates are ANDed when both are configured. Exposed metrics: per-host gauges (online, last_backup_*, repo_size_bytes, snapshot_count, open_alerts, repo_status), server gauges (hosts_total/online, active_alerts by severity, build_info), and an in-memory job-duration histogram observed from the existing MsgJobFinished branch in the WS handler. Docs are in docs/prometheus.md (enable + scrape config + metric reference + dashboard import). A sample dashboard is at deploy/grafana/restic-manager-dashboard.json - six panels, Grafana schema 39, single Prometheus datasource variable. Tests: golden render, concurrent observe, and bucket boundaries in the metrics package; auth matrix (no auth -> 404, token gate, CIDR gate, both required) in the HTTP layer.
This commit is contained in:
@@ -41,6 +41,24 @@ type Config struct {
|
||||
// DataDir. Source-build deployments can override via
|
||||
// RM_BUNDLED_ASSETS_DIR.
|
||||
BundledAssetsDir string `yaml:"bundled_assets_dir"`
|
||||
|
||||
// MetricsToken, if set, gates the /metrics scrape endpoint
|
||||
// behind an `Authorization: Bearer <token>` check (constant-time
|
||||
// compare). When neither this nor MetricsTrustedCIDRs is set,
|
||||
// the route is not mounted at all (the endpoint is opt-in).
|
||||
MetricsToken string `yaml:"metrics_token"`
|
||||
|
||||
// MetricsTrustedCIDRs, if non-empty, gates /metrics so only
|
||||
// callers from these networks may scrape. ANDed with
|
||||
// MetricsToken when both are set.
|
||||
MetricsTrustedCIDRs []string `yaml:"metrics_trusted_cidrs"`
|
||||
}
|
||||
|
||||
// MetricsAuthEnabled reports whether the operator has opted into
|
||||
// exposing the Prometheus scrape endpoint by configuring at least
|
||||
// one auth gate.
|
||||
func (c Config) MetricsAuthEnabled() bool {
|
||||
return c.MetricsToken != "" || len(c.MetricsTrustedCIDRs) > 0
|
||||
}
|
||||
|
||||
// Load resolves config in this order:
|
||||
@@ -93,6 +111,19 @@ func Load(yamlPath string) (Config, error) {
|
||||
if v, ok := os.LookupEnv("RM_BUNDLED_ASSETS_DIR"); ok {
|
||||
c.BundledAssetsDir = v
|
||||
}
|
||||
if v, ok := os.LookupEnv("RM_METRICS_TOKEN"); ok {
|
||||
c.MetricsToken = v
|
||||
}
|
||||
if v, ok := os.LookupEnv("RM_METRICS_TRUSTED_CIDR"); ok {
|
||||
parts := strings.Split(v, ",")
|
||||
c.MetricsTrustedCIDRs = c.MetricsTrustedCIDRs[:0]
|
||||
for _, p := range parts {
|
||||
p = strings.TrimSpace(p)
|
||||
if p != "" {
|
||||
c.MetricsTrustedCIDRs = append(c.MetricsTrustedCIDRs, p)
|
||||
}
|
||||
}
|
||||
}
|
||||
if v, ok := os.LookupEnv("RM_TRUSTED_PROXY"); ok {
|
||||
// Comma-separated CIDRs; allow whitespace for readability.
|
||||
parts := strings.Split(v, ",")
|
||||
@@ -137,5 +168,10 @@ func (c *Config) validate() error {
|
||||
return fmt.Errorf("config: RM_TRUSTED_PROXY entry %q is not a valid CIDR: %w", cidr, err)
|
||||
}
|
||||
}
|
||||
for _, cidr := range c.MetricsTrustedCIDRs {
|
||||
if _, err := netip.ParsePrefix(cidr); err != nil {
|
||||
return fmt.Errorf("config: RM_METRICS_TRUSTED_CIDR entry %q is not a valid CIDR: %w", cidr, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -98,6 +98,45 @@ func TestCookieSecureDefaultAndOverride(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestMetricsAuthGates(t *testing.T) {
|
||||
t.Setenv("RM_LISTEN", ":8080")
|
||||
t.Setenv("RM_DATA_DIR", "/tmp/x")
|
||||
|
||||
c, err := Load("")
|
||||
if err != nil {
|
||||
t.Fatalf("load: %v", err)
|
||||
}
|
||||
if c.MetricsAuthEnabled() {
|
||||
t.Errorf("metrics endpoint should be off by default")
|
||||
}
|
||||
|
||||
t.Setenv("RM_METRICS_TOKEN", "s3cr3t-token-with-enough-bytes")
|
||||
t.Setenv("RM_METRICS_TRUSTED_CIDR", "10.0.0.0/8, 192.168.1.0/24")
|
||||
c, err = Load("")
|
||||
if err != nil {
|
||||
t.Fatalf("load: %v", err)
|
||||
}
|
||||
if c.MetricsToken != "s3cr3t-token-with-enough-bytes" {
|
||||
t.Errorf("token: %q", c.MetricsToken)
|
||||
}
|
||||
if got := c.MetricsTrustedCIDRs; len(got) != 2 || got[0] != "10.0.0.0/8" || got[1] != "192.168.1.0/24" {
|
||||
t.Errorf("cidrs: %v", got)
|
||||
}
|
||||
if !c.MetricsAuthEnabled() {
|
||||
t.Errorf("MetricsAuthEnabled should be true")
|
||||
}
|
||||
}
|
||||
|
||||
func TestMetricsTrustedCIDRRejectsGarbage(t *testing.T) {
|
||||
t.Setenv("RM_LISTEN", ":8080")
|
||||
t.Setenv("RM_DATA_DIR", "/tmp/x")
|
||||
t.Setenv("RM_METRICS_TRUSTED_CIDR", "garbage")
|
||||
|
||||
if _, err := Load(""); err == nil {
|
||||
t.Fatal("expected validation error, got nil")
|
||||
}
|
||||
}
|
||||
|
||||
func writeFile(path string, body []byte) error {
|
||||
return writeFileImpl(path, body)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user