P6-04+05: Prometheus /metrics endpoint + Grafana dashboard

New internal/server/metrics package emits the legacy text/plain
exposition format directly, so we don't pull in
prometheus/client_golang. Endpoint is opt-in via RM_METRICS_TOKEN
and/or RM_METRICS_TRUSTED_CIDR; route is not mounted at all if
neither gate is set. Both gates are ANDed when both are configured.

Per-host gauges (online, last_backup_*, repo_size_bytes,
snapshot_count, open_alerts, repo_status), server gauges
(hosts_total/online, active_alerts by severity, build_info), and
an in-memory job-duration histogram observed from the existing
MsgJobFinished branch in the WS handler.

Docs in docs/prometheus.md (enable + scrape config + metric
reference + dashboard import). Sample dashboard at
deploy/grafana/restic-manager-dashboard.json - six panels,
Grafana schema 39, single Prometheus datasource variable.

Tests: golden render, concurrent observe, bucket boundaries in
the metrics package; auth matrix (no auth -> 404, token gate,
CIDR gate, both required) in the HTTP layer.
This commit is contained in:
2026-05-07 23:17:15 +01:00
parent 07bce16c84
commit ccd14f7cee
12 changed files with 1480 additions and 2 deletions
+301
View File
@@ -0,0 +1,301 @@
// Package metrics owns the in-process Prometheus exposition for
// the control plane. It deliberately avoids prometheus/client_golang
// — the legacy text format is small and stable, and the repo's house
// style is to keep dependency surface minimal.
//
// Two halves:
//
// - Registry holds a job-duration histogram. Server hooks call
// Registry.ObserveJob from the WS job-finished branch.
//
// - Render emits a complete /metrics body from a Snapshot. The
// Snapshot is a plain value bag; the HTTP handler assembles it
// from store reads + Registry.SnapshotWith at scrape time. This
// keeps the package free of any database or HTTP dependency.
package metrics
import (
"fmt"
"io"
"sort"
"strings"
"sync"
"time"
)
// JobDurationBuckets lists the finite upper bounds, in seconds, of the
// job-duration histogram in ascending order. The ladder runs from
// second-scale admin commands (unlock/init/check) up to backups that
// take hours. The +Inf bucket is implicit and is not listed here.
var JobDurationBuckets = []float64{
	1,     // 1 second
	5,     // 5 seconds
	30,    // 30 seconds
	60,    // 1 minute
	300,   // 5 minutes
	1800,  // 30 minutes
	3600,  // 1 hour
	21600, // 6 hours
	86400, // 24 hours
}
// Registry is the in-memory store for the job-duration histogram.
// The expected access pattern is many concurrent observers plus a
// single periodic snapshotter; ObserveJob and snapshotJobs both take
// mu for the whole of their work.
type Registry struct {
// mu guards jobs and the histogramState values it points to.
mu sync.Mutex
// jobs maps each (kind, status) pair to its accumulated histogram.
jobs map[jobKey]*histogramState
// clock is set to time.Now by NewRegistry.
// NOTE(review): nothing in the visible source reads it — presumably a
// test seam for injecting time; confirm it is still needed.
clock func() time.Time
}
// jobKey identifies one histogram series: a job kind paired with the
// job's terminal status. It is comparable and is used as a map key.
type jobKey struct {
	kind   string
	status string
}
// histogramState accumulates the observations recorded for a single
// (kind, status) series.
type histogramState struct {
// counts[i] = number of observations <= JobDurationBuckets[i].
// The counts are cumulative: ObserveJob increments every bucket
// whose upper bound is at or above the sample.
// counts[len(JobDurationBuckets)] is the implicit +Inf bucket
// (== total count, kept here for symmetry with the rendered
// _bucket{le="+Inf"} line and as a sanity check).
counts []uint64
// sum is the running total of all observed durations, in seconds.
sum float64
// count is the number of observations recorded.
count uint64
}
// NewRegistry returns a Registry with no recorded observations. Its
// series map is allocated up front and its clock is time.Now.
func NewRegistry() *Registry {
	reg := new(Registry)
	reg.jobs = map[jobKey]*histogramState{}
	reg.clock = time.Now
	return reg
}
// ObserveJob records one job-duration sample in the (kind, status)
// series, creating the series on first use.
//
// A nil receiver is a no-op, so callers may hold an optional *Registry.
// A zero-value Registry (one not built by NewRegistry) is accepted too:
// the series map is allocated lazily instead of panicking on a nil-map
// write. Negative durations (clock-skew artefacts) are clamped to zero.
// Empty kind/status strings are tolerated but degrade the dashboard —
// callers should pass meaningful values.
func (r *Registry) ObserveJob(kind, status string, dur time.Duration) {
	if r == nil {
		return
	}
	if dur < 0 {
		dur = 0
	}
	secs := dur.Seconds()

	r.mu.Lock()
	defer r.mu.Unlock()

	// Allocate under the lock so concurrent first observers on a
	// zero-value Registry cannot race on the map itself.
	if r.jobs == nil {
		r.jobs = make(map[jobKey]*histogramState)
	}

	k := jobKey{kind: kind, status: status}
	hs, ok := r.jobs[k]
	if !ok {
		// One slot per finite bucket plus a trailing slot for +Inf.
		hs = &histogramState{counts: make([]uint64, len(JobDurationBuckets)+1)}
		r.jobs[k] = hs
	}

	// Buckets are cumulative: the sample counts towards every bucket
	// whose upper bound is at or above it (boundaries are inclusive).
	for i, ub := range JobDurationBuckets {
		if secs <= ub {
			hs.counts[i]++
		}
	}
	hs.counts[len(JobDurationBuckets)]++ // +Inf
	hs.sum += secs
	hs.count++
}
// HistogramRow is the exported, copied-out form of one (kind, status)
// histogram series as it appears in a Snapshot.
type HistogramRow struct {
	// Kind and Status are the series labels.
	Kind, Status string

	// Buckets holds the cumulative count per upper bound, index-aligned
	// with JobDurationBuckets; the final element is the +Inf total.
	Buckets []uint64

	// Sum is the total observed duration in seconds; Count is the
	// number of observations.
	Sum   float64
	Count uint64
}
// snapshotJobs copies the current histogram state out of the registry
// under the lock and returns it ordered by kind, then status, both
// ascending. Each row owns its own Buckets slice, so later ObserveJob
// calls do not alter a returned snapshot. A nil receiver yields nil.
func (r *Registry) snapshotJobs() []HistogramRow {
	if r == nil {
		return nil
	}

	r.mu.Lock()
	defer r.mu.Unlock()

	out := make([]HistogramRow, 0, len(r.jobs))
	for key, state := range r.jobs {
		row := HistogramRow{
			Kind:    key.kind,
			Status:  key.status,
			Buckets: make([]uint64, len(state.counts)),
			Sum:     state.sum,
			Count:   state.count,
		}
		copy(row.Buckets, state.counts)
		out = append(out, row)
	}

	// Map iteration order is random; sort so the output is stable.
	byKindThenStatus := func(a, b int) bool {
		if out[a].Kind == out[b].Kind {
			return out[a].Status < out[b].Status
		}
		return out[a].Kind < out[b].Kind
	}
	sort.Slice(out, byKindThenStatus)
	return out
}
// HostRow carries the values behind one host's per-host gauges.
// Pointer fields use nil to mean "no value", which lets Render leave
// the corresponding metric line out entirely — for example when a host
// has never run a backup.
type HostRow struct {
	// ID and Name are rendered as the host_id and host labels.
	ID, Name string

	// Online drives the rm_host_agent_online gauge.
	Online bool

	LastBackupUnix      *int64 // nil = no backup yet
	LastBackupSucceeded *bool  // nil = no backup yet
	RepoSizeBytes       *int64 // nil = no stats yet

	// These counts are always rendered, including when zero.
	SnapshotCount, OpenAlertCount int

	// RepoStatus is "unknown", "ready" or "init_failed"; Render treats
	// the empty string as "unknown".
	RepoStatus string
}
// Snapshot is the complete, frozen input to Render: everything needed
// to produce one /metrics body. Registry.SnapshotWith builds one from
// caller-supplied host and alert data plus the registry's histogram
// state.
type Snapshot struct {
	// Hosts holds one row per host for the per-host gauges.
	Hosts []HostRow

	// Fleet-wide host counts.
	HostsTotal, HostsOnline int

	// AlertsBySeverity maps severity → count.
	AlertsBySeverity map[string]int

	// Build identity, rendered as the labels of rm_build_info.
	BuildVersion, BuildCommit, GoVersion string

	// JobDurationRows holds one row per job-duration histogram series.
	JobDurationRows []HistogramRow
}
// SnapshotWith assembles a Snapshot from caller-supplied host rows,
// alert counts and build identity, plus the registry's current
// job-duration state. HostsTotal is len(hosts) and HostsOnline counts
// the rows with Online set. The hosts slice and alerts map are stored
// as given, not copied.
func (r *Registry) SnapshotWith(hosts []HostRow, alerts map[string]int, buildVer, commit, goVer string) Snapshot {
	snap := Snapshot{
		Hosts:            hosts,
		HostsTotal:       len(hosts),
		AlertsBySeverity: alerts,
		BuildVersion:     buildVer,
		BuildCommit:      commit,
		GoVersion:        goVer,
	}
	for i := range hosts {
		if hosts[i].Online {
			snap.HostsOnline++
		}
	}
	snap.JobDurationRows = r.snapshotJobs()
	return snap
}
// labelValueEscaper applies the only three escapes the Prometheus text
// exposition format defines inside a label value: backslash, double
// quote and line feed.
var labelValueEscaper = strings.NewReplacer(`\`, `\\`, `"`, `\"`, "\n", `\n`)

// quoteLabelValue returns v as a double-quoted Prometheus label value.
//
// fmt's %q is not a substitute: it emits Go escapes such as \t, \x00 or
// \u00a0, which a Prometheus parser rejects as invalid escape
// sequences, failing the whole scrape. Invalid UTF-8 is replaced with
// U+FFFD so that a single malformed name cannot do the same.
func quoteLabelValue(v string) string {
	return `"` + labelValueEscaper.Replace(strings.ToValidUTF8(v, "\uFFFD")) + `"`
}

// writeMetricHeader writes the "# HELP" and "# TYPE" lines for one
// metric family.
func writeMetricHeader(b *strings.Builder, name, typ, help string) {
	b.WriteString("# HELP " + name + " " + help + "\n")
	b.WriteString("# TYPE " + name + " " + typ + "\n")
}

// boolGaugeValue maps a boolean onto the 0/1 convention used by gauges.
func boolGaugeValue(v bool) int {
	if v {
		return 1
	}
	return 0
}

// Render emits a complete Prometheus text-exposition body for s.
//
// Output is deterministic: metric families appear in a fixed order,
// per-host samples are ordered by host ID, and histogram rows are
// emitted in the order they appear in s.JobDurationRows. Label values
// are escaped as the text format requires, so host names containing
// quotes, backslashes, newlines or control characters cannot corrupt
// the exposition.
//
// The body is built in memory and written to w in a single call.
// Render returns the error from that write, or an error — with nothing
// written — when a histogram row has fewer buckets than
// len(JobDurationBuckets)+1.
func Render(w io.Writer, s Snapshot) error {
	var b strings.Builder

	// --- Server gauges ---------------------------------------------------
	writeMetricHeader(&b, "rm_hosts_total", "gauge",
		"Total number of enrolled hosts (excludes pending announces).")
	fmt.Fprintf(&b, "rm_hosts_total %d\n", s.HostsTotal)

	writeMetricHeader(&b, "rm_hosts_online", "gauge",
		"Number of hosts currently online (status='online').")
	fmt.Fprintf(&b, "rm_hosts_online %d\n", s.HostsOnline)

	writeMetricHeader(&b, "rm_active_alerts", "gauge",
		"Open alerts grouped by severity.")
	// Fixed severity list: every series is always present (0 when the
	// map has no entry), and reading a nil map is safe.
	for _, sev := range []string{"info", "warning", "critical"} {
		fmt.Fprintf(&b, "rm_active_alerts{severity=%s} %d\n",
			quoteLabelValue(sev), s.AlertsBySeverity[sev])
	}

	writeMetricHeader(&b, "rm_build_info", "gauge",
		"Build identifying labels; value is always 1.")
	fmt.Fprintf(&b, "rm_build_info{version=%s,commit=%s,go_version=%s} 1\n",
		quoteLabelValue(s.BuildVersion), quoteLabelValue(s.BuildCommit), quoteLabelValue(s.GoVersion))

	// --- Per-host gauges -------------------------------------------------
	// Stable order: by host id. Sort a copy so the caller's slice is
	// left untouched.
	hosts := append([]HostRow(nil), s.Hosts...)
	sort.Slice(hosts, func(i, j int) bool { return hosts[i].ID < hosts[j].ID })

	// Every per-host family shares the same two labels; build them once.
	labels := make([]string, len(hosts))
	for i := range hosts {
		labels[i] = "host_id=" + quoteLabelValue(hosts[i].ID) +
			",host=" + quoteLabelValue(hosts[i].Name)
	}

	writeMetricHeader(&b, "rm_host_agent_online", "gauge",
		"1 if the agent is currently online, 0 otherwise.")
	for i := range hosts {
		fmt.Fprintf(&b, "rm_host_agent_online{%s} %d\n",
			labels[i], boolGaugeValue(hosts[i].Online))
	}

	writeMetricHeader(&b, "rm_host_last_backup_timestamp_seconds", "gauge",
		"Unix timestamp of the host's most recent backup. Omitted for hosts with no backup yet.")
	for i := range hosts {
		if ts := hosts[i].LastBackupUnix; ts != nil {
			fmt.Fprintf(&b, "rm_host_last_backup_timestamp_seconds{%s} %d\n", labels[i], *ts)
		}
	}

	writeMetricHeader(&b, "rm_host_last_backup_success", "gauge",
		"1 if the host's most recent backup succeeded, 0 otherwise. Omitted for hosts with no backup yet.")
	for i := range hosts {
		if ok := hosts[i].LastBackupSucceeded; ok != nil {
			fmt.Fprintf(&b, "rm_host_last_backup_success{%s} %d\n", labels[i], boolGaugeValue(*ok))
		}
	}

	writeMetricHeader(&b, "rm_host_repo_size_bytes", "gauge",
		"Latest reported repo size from `restic stats --mode raw-data`. Omitted for hosts with no stats yet.")
	for i := range hosts {
		if size := hosts[i].RepoSizeBytes; size != nil {
			fmt.Fprintf(&b, "rm_host_repo_size_bytes{%s} %d\n", labels[i], *size)
		}
	}

	writeMetricHeader(&b, "rm_host_snapshot_count", "gauge",
		"Number of restic snapshots known on the host's repo.")
	for i := range hosts {
		fmt.Fprintf(&b, "rm_host_snapshot_count{%s} %d\n", labels[i], hosts[i].SnapshotCount)
	}

	writeMetricHeader(&b, "rm_host_open_alerts", "gauge",
		"Number of currently open alerts attached to this host.")
	for i := range hosts {
		fmt.Fprintf(&b, "rm_host_open_alerts{%s} %d\n", labels[i], hosts[i].OpenAlertCount)
	}

	writeMetricHeader(&b, "rm_host_repo_status", "gauge",
		"Repo readiness state for the host. Exactly one row per host with status label set.")
	for i := range hosts {
		st := hosts[i].RepoStatus
		if st == "" {
			st = "unknown"
		}
		fmt.Fprintf(&b, "rm_host_repo_status{%s,status=%s} 1\n", labels[i], quoteLabelValue(st))
	}

	// --- Histogram -------------------------------------------------------
	writeMetricHeader(&b, "rm_job_duration_seconds", "histogram",
		"End-to-end duration of completed jobs, by kind and terminal status.")
	infIdx := len(JobDurationBuckets)
	for _, row := range s.JobDurationRows {
		// A hand-built Snapshot may carry a short Buckets slice; report
		// it instead of panicking on the index below.
		if len(row.Buckets) <= infIdx {
			return fmt.Errorf("metrics: job duration row kind=%q status=%q has %d buckets, want %d",
				row.Kind, row.Status, len(row.Buckets), infIdx+1)
		}
		series := "kind=" + quoteLabelValue(row.Kind) + ",status=" + quoteLabelValue(row.Status)
		for i, ub := range JobDurationBuckets {
			fmt.Fprintf(&b, "rm_job_duration_seconds_bucket{%s,le=\"%g\"} %d\n",
				series, ub, row.Buckets[i])
		}
		fmt.Fprintf(&b, "rm_job_duration_seconds_bucket{%s,le=\"+Inf\"} %d\n",
			series, row.Buckets[infIdx])
		fmt.Fprintf(&b, "rm_job_duration_seconds_sum{%s} %g\n", series, row.Sum)
		fmt.Fprintf(&b, "rm_job_duration_seconds_count{%s} %d\n", series, row.Count)
	}

	_, err := io.WriteString(w, b.String())
	return err
}
+182
View File
@@ -0,0 +1,182 @@
package metrics
import (
"bytes"
"strings"
"sync"
"testing"
"time"
)
// TestObserveJobBuckets feeds four samples into a single series and
// checks the count, the sum and every cumulative bucket, including a
// sample that lands exactly on a bucket boundary.
func TestObserveJobBuckets(t *testing.T) {
	reg := NewRegistry()

	// Bucket ladder: 1, 5, 30, 60, 300, 1800, 3600, 21600, 86400.
	samples := []time.Duration{
		500 * time.Millisecond, // <= 1
		30 * time.Second,       // == 30 (boundary)
		90 * time.Second,       // > 60, <= 300
		2 * time.Hour,          // > 3600 → 21600 bucket
	}
	for _, d := range samples {
		reg.ObserveJob("backup", "succeeded", d)
	}

	got := reg.snapshotJobs()
	if n := len(got); n != 1 {
		t.Fatalf("rows: %d", n)
	}
	row := got[0]

	if row.Count != 4 {
		t.Errorf("count: %d", row.Count)
	}
	if wantSum := 0.5 + 30 + 90 + 7200.0; row.Sum != wantSum {
		t.Errorf("sum: got %v want %v", row.Sum, wantSum)
	}

	// Expected cumulative counts, in ladder order with +Inf last:
	//   le=1 → 1 (the 0.5s sample), le=5 → 1,
	//   le=30 → 2 (boundary is inclusive), le=60 → 2,
	//   le=300 → 3, le=1800 → 3, le=3600 → 3,
	//   le=21600 → 4, le=86400 → 4, le=+Inf → 4.
	wantBuckets := [...]uint64{1, 1, 2, 2, 3, 3, 3, 4, 4, 4}
	for idx := range wantBuckets {
		if have := row.Buckets[idx]; have != wantBuckets[idx] {
			t.Errorf("bucket[%d]=%d want %d", idx, have, wantBuckets[idx])
		}
	}
}
// TestObserveJobNegativeClampedToZero checks that a negative duration
// is still recorded as one observation, contributing zero to the sum.
func TestObserveJobNegativeClampedToZero(t *testing.T) {
	reg := NewRegistry()
	reg.ObserveJob("backup", "succeeded", -5*time.Second)

	got := reg.snapshotJobs()
	clamped := len(got) == 1 && got[0].Sum == 0 && got[0].Count == 1
	if !clamped {
		t.Errorf("expected one zero-second observation, got %+v", got)
	}
}
// TestObserveJobConcurrent observes the same series from many
// goroutines at once and checks that no observation is lost.
func TestObserveJobConcurrent(t *testing.T) {
	const (
		goroutines = 16
		each       = 200
	)
	reg := NewRegistry()

	var wg sync.WaitGroup
	wg.Add(goroutines)
	worker := func() {
		defer wg.Done()
		for left := each; left > 0; left-- {
			reg.ObserveJob("backup", "succeeded", time.Second)
		}
	}
	for g := 0; g < goroutines; g++ {
		go worker()
	}
	wg.Wait()

	rows := reg.snapshotJobs()
	if n := len(rows); n != 1 {
		t.Fatalf("rows: %d", n)
	}
	if want := uint64(goroutines * each); rows[0].Count != want {
		t.Errorf("count: got %d want %d", rows[0].Count, goroutines*each)
	}
}
// TestObserveJobNilRegistryNoop passes as long as observing on a nil
// *Registry returns without panicking.
func TestObserveJobNilRegistryNoop(t *testing.T) {
	(*Registry)(nil).ObserveJob("backup", "succeeded", time.Second)
}
// TestRenderGolden renders a snapshot with two hosts and two histogram
// series. It checks that a set of expected sample lines is present and
// that the optional per-host metrics are omitted for the host that has
// no backup or stats data.
func TestRenderGolden(t *testing.T) {
	reg := NewRegistry()
	reg.ObserveJob("backup", "succeeded", 5*time.Second)
	reg.ObserveJob("forget", "succeeded", 100*time.Millisecond)

	// alpha is fully populated; bravo leaves every optional field nil.
	alphaBackupAt := int64(1700000000)
	alphaRepoSize := int64(123456789)
	alphaBackupOK := true
	fleet := []HostRow{
		{
			ID:                  "01H0001",
			Name:                "alpha",
			Online:              true,
			LastBackupUnix:      &alphaBackupAt,
			LastBackupSucceeded: &alphaBackupOK,
			RepoSizeBytes:       &alphaRepoSize,
			SnapshotCount:       42,
			OpenAlertCount:      0,
			RepoStatus:          "ready",
		},
		{
			ID:             "01H0002",
			Name:           "bravo",
			Online:         false,
			SnapshotCount:  0,
			OpenAlertCount: 1,
			RepoStatus:     "init_failed",
		},
	}
	openAlerts := map[string]int{"info": 0, "warning": 1, "critical": 0}
	snap := reg.SnapshotWith(fleet, openAlerts, "v1.2.3", "deadbeef", "go1.25.0")

	var rendered bytes.Buffer
	if err := Render(&rendered, snap); err != nil {
		t.Fatalf("render: %v", err)
	}
	out := rendered.String()

	mustContain := []string{
		"# HELP rm_hosts_total ",
		"rm_hosts_total 2\n",
		"rm_hosts_online 1\n",
		`rm_active_alerts{severity="warning"} 1`,
		`rm_active_alerts{severity="info"} 0`,
		`rm_active_alerts{severity="critical"} 0`,
		`rm_build_info{version="v1.2.3",commit="deadbeef",go_version="go1.25.0"} 1`,
		`rm_host_agent_online{host_id="01H0001",host="alpha"} 1`,
		`rm_host_agent_online{host_id="01H0002",host="bravo"} 0`,
		`rm_host_last_backup_timestamp_seconds{host_id="01H0001",host="alpha"} 1700000000`,
		`rm_host_last_backup_success{host_id="01H0001",host="alpha"} 1`,
		`rm_host_repo_size_bytes{host_id="01H0001",host="alpha"} 123456789`,
		`rm_host_snapshot_count{host_id="01H0001",host="alpha"} 42`,
		`rm_host_snapshot_count{host_id="01H0002",host="bravo"} 0`,
		`rm_host_open_alerts{host_id="01H0002",host="bravo"} 1`,
		`rm_host_repo_status{host_id="01H0001",host="alpha",status="ready"} 1`,
		`rm_host_repo_status{host_id="01H0002",host="bravo",status="init_failed"} 1`,
		`rm_job_duration_seconds_bucket{kind="backup",status="succeeded",le="1"} 0`,
		`rm_job_duration_seconds_bucket{kind="backup",status="succeeded",le="5"} 1`,
		`rm_job_duration_seconds_bucket{kind="backup",status="succeeded",le="+Inf"} 1`,
		`rm_job_duration_seconds_sum{kind="backup",status="succeeded"} 5`,
		`rm_job_duration_seconds_count{kind="backup",status="succeeded"} 1`,
		`rm_job_duration_seconds_bucket{kind="forget",status="succeeded",le="1"} 1`,
	}
	for i := range mustContain {
		if line := mustContain[i]; !strings.Contains(out, line) {
			t.Errorf("missing line:\n %s\n--- full output ---\n%s", line, out)
		}
	}

	// bravo had no last backup → those metric lines must be absent for it.
	mustNotContain := []string{
		`rm_host_last_backup_timestamp_seconds{host_id="01H0002"`,
		`rm_host_last_backup_success{host_id="01H0002"`,
		`rm_host_repo_size_bytes{host_id="01H0002"`,
	}
	for _, prefix := range mustNotContain {
		if strings.Contains(out, prefix) {
			t.Errorf("unexpected line for bravo: %q", prefix)
		}
	}
}
// TestRenderEmptySnapshot renders a snapshot with no hosts, no alerts
// and no observations. The zero-valued server gauge and the histogram
// metadata must still be emitted.
func TestRenderEmptySnapshot(t *testing.T) {
	snap := NewRegistry().SnapshotWith(nil, nil, "dev", "", "go1.25.0")

	var rendered bytes.Buffer
	if err := Render(&rendered, snap); err != nil {
		t.Fatalf("render: %v", err)
	}
	out := rendered.String()

	if hasZeroGauge := strings.Contains(out, "rm_hosts_total 0\n"); !hasZeroGauge {
		t.Errorf("missing zero-host gauge:\n%s", out)
	}

	// The histogram block keeps its HELP/TYPE lines even with no rows;
	// that is intended, and lets Prometheus pre-register the metric.
	if hasHistogramType := strings.Contains(out, "# TYPE rm_job_duration_seconds histogram"); !hasHistogramType {
		t.Errorf("histogram HELP/TYPE missing")
	}
}