// Package metrics owns the in-process Prometheus exposition for // the control plane. It deliberately avoids prometheus/client_golang // — the legacy text format is small and stable, and the repo's house // style is to keep dependency surface minimal. // // Two halves: // // - Registry holds a job-duration histogram. Server hooks call // Registry.ObserveJob from the WS job-finished branch. // // - Render emits a complete /metrics body from a Snapshot. The // Snapshot is a plain value bag; the HTTP handler assembles it // from store reads + Registry.Snapshot at scrape time. This // keeps the package free of any database or HTTP dependency. package metrics import ( "fmt" "io" "sort" "strings" "sync" "time" ) // JobDurationBuckets is the upper-bound ladder for the job duration // histogram, in seconds. Covers admin commands (unlock/init/check // finishing in seconds) up through hours-long backups; +Inf is // implicit. var JobDurationBuckets = []float64{1, 5, 30, 60, 300, 1800, 3600, 21600, 86400} // Registry is the in-memory store for the job-duration histogram. // Concurrent observers and a single periodic snapshotter is the // expected access pattern; both are guarded by a mutex. type Registry struct { mu sync.Mutex jobs map[jobKey]*histogramState clock func() time.Time } type jobKey struct{ kind, status string } type histogramState struct { // counts[i] = number of observations <= JobDurationBuckets[i]. // counts[len(JobDurationBuckets)] is the implicit +Inf bucket // (== total count, kept here for symmetry with the rendered // _bucket{le="+Inf"} line and as a sanity check). counts []uint64 sum float64 count uint64 } // NewRegistry builds an empty registry. func NewRegistry() *Registry { return &Registry{ jobs: make(map[jobKey]*histogramState), clock: time.Now, } } // ObserveJob records one job-duration sample. Negative durations // (clock-skew artefacts) are clamped to zero. Empty kind/status // strings are tolerated but degrade the dashboard — callers should // pass meaningful values. func (r *Registry) ObserveJob(kind, status string, dur time.Duration) { if r == nil { return } if dur < 0 { dur = 0 } secs := dur.Seconds() r.mu.Lock() defer r.mu.Unlock() k := jobKey{kind: kind, status: status} hs, ok := r.jobs[k] if !ok { hs = &histogramState{counts: make([]uint64, len(JobDurationBuckets)+1)} r.jobs[k] = hs } for i, ub := range JobDurationBuckets { if secs <= ub { hs.counts[i]++ } } hs.counts[len(JobDurationBuckets)]++ // +Inf hs.sum += secs hs.count++ } // HistogramRow is one (kind,status) row in a Snapshot. Buckets is // the cumulative count per upper bound (matching JobDurationBuckets, // last element is the +Inf total). type HistogramRow struct { Kind string Status string Buckets []uint64 Sum float64 Count uint64 } // snapshotJobs returns a deterministic, sorted copy of the // histogram state. Sort order: kind asc, status asc. func (r *Registry) snapshotJobs() []HistogramRow { if r == nil { return nil } r.mu.Lock() defer r.mu.Unlock() rows := make([]HistogramRow, 0, len(r.jobs)) for k, hs := range r.jobs { buckets := make([]uint64, len(hs.counts)) copy(buckets, hs.counts) rows = append(rows, HistogramRow{ Kind: k.kind, Status: k.status, Buckets: buckets, Sum: hs.sum, Count: hs.count, }) } sort.Slice(rows, func(i, j int) bool { if rows[i].Kind != rows[j].Kind { return rows[i].Kind < rows[j].Kind } return rows[i].Status < rows[j].Status }) return rows } // HostRow is one host's projection for the per-host gauges. // Pointers carry "no value" semantics so we can omit a metric line // when, e.g., a host has never run a backup. type HostRow struct { ID string Name string Online bool LastBackupUnix *int64 // nil = no backup yet LastBackupSucceeded *bool // nil = no backup yet RepoSizeBytes *int64 // nil = no stats yet SnapshotCount int OpenAlertCount int RepoStatus string // "unknown" | "ready" | "init_failed" } // Snapshot is a frozen view of the data needed to render /metrics. // Constructed by the HTTP handler from Store reads + Registry.snapshotJobs. type Snapshot struct { Hosts []HostRow HostsTotal int HostsOnline int AlertsBySeverity map[string]int // severity → count BuildVersion string BuildCommit string GoVersion string JobDurationRows []HistogramRow } // SnapshotWith builds a Snapshot from raw inputs and the registry's // current job-duration state. Convenience for the HTTP handler. func (r *Registry) SnapshotWith(hosts []HostRow, alerts map[string]int, buildVer, commit, goVer string) Snapshot { online := 0 for _, h := range hosts { if h.Online { online++ } } return Snapshot{ Hosts: hosts, HostsTotal: len(hosts), HostsOnline: online, AlertsBySeverity: alerts, BuildVersion: buildVer, BuildCommit: commit, GoVersion: goVer, JobDurationRows: r.snapshotJobs(), } } // Render emits a complete Prometheus text-exposition body for s. // Output is deterministic: metric names appear in a fixed order and // labels within a metric are sorted by their first label value. func Render(w io.Writer, s Snapshot) error { var b strings.Builder // --- Server gauges --------------------------------------------------- b.WriteString("# HELP rm_hosts_total Total number of enrolled hosts (excludes pending announces).\n") b.WriteString("# TYPE rm_hosts_total gauge\n") fmt.Fprintf(&b, "rm_hosts_total %d\n", s.HostsTotal) b.WriteString("# HELP rm_hosts_online Number of hosts currently online (status='online').\n") b.WriteString("# TYPE rm_hosts_online gauge\n") fmt.Fprintf(&b, "rm_hosts_online %d\n", s.HostsOnline) b.WriteString("# HELP rm_active_alerts Open alerts grouped by severity.\n") b.WriteString("# TYPE rm_active_alerts gauge\n") severities := []string{"info", "warning", "critical"} for _, sev := range severities { fmt.Fprintf(&b, "rm_active_alerts{severity=%q} %d\n", sev, s.AlertsBySeverity[sev]) } b.WriteString("# HELP rm_build_info Build identifying labels; value is always 1.\n") b.WriteString("# TYPE rm_build_info gauge\n") fmt.Fprintf(&b, "rm_build_info{version=%q,commit=%q,go_version=%q} 1\n", s.BuildVersion, s.BuildCommit, s.GoVersion) // --- Per-host gauges ------------------------------------------------- // Stable order: by host id. hosts := append([]HostRow(nil), s.Hosts...) sort.Slice(hosts, func(i, j int) bool { return hosts[i].ID < hosts[j].ID }) b.WriteString("# HELP rm_host_agent_online 1 if the agent is currently online, 0 otherwise.\n") b.WriteString("# TYPE rm_host_agent_online gauge\n") for _, h := range hosts { v := 0 if h.Online { v = 1 } fmt.Fprintf(&b, "rm_host_agent_online{host_id=%q,host=%q} %d\n", h.ID, h.Name, v) } b.WriteString("# HELP rm_host_last_backup_timestamp_seconds Unix timestamp of the host's most recent backup. Omitted for hosts with no backup yet.\n") b.WriteString("# TYPE rm_host_last_backup_timestamp_seconds gauge\n") for _, h := range hosts { if h.LastBackupUnix == nil { continue } fmt.Fprintf(&b, "rm_host_last_backup_timestamp_seconds{host_id=%q,host=%q} %d\n", h.ID, h.Name, *h.LastBackupUnix) } b.WriteString("# HELP rm_host_last_backup_success 1 if the host's most recent backup succeeded, 0 otherwise. Omitted for hosts with no backup yet.\n") b.WriteString("# TYPE rm_host_last_backup_success gauge\n") for _, h := range hosts { if h.LastBackupSucceeded == nil { continue } v := 0 if *h.LastBackupSucceeded { v = 1 } fmt.Fprintf(&b, "rm_host_last_backup_success{host_id=%q,host=%q} %d\n", h.ID, h.Name, v) } b.WriteString("# HELP rm_host_repo_size_bytes Latest reported repo size from `restic stats --mode raw-data`. Omitted for hosts with no stats yet.\n") b.WriteString("# TYPE rm_host_repo_size_bytes gauge\n") for _, h := range hosts { if h.RepoSizeBytes == nil { continue } fmt.Fprintf(&b, "rm_host_repo_size_bytes{host_id=%q,host=%q} %d\n", h.ID, h.Name, *h.RepoSizeBytes) } b.WriteString("# HELP rm_host_snapshot_count Number of restic snapshots known on the host's repo.\n") b.WriteString("# TYPE rm_host_snapshot_count gauge\n") for _, h := range hosts { fmt.Fprintf(&b, "rm_host_snapshot_count{host_id=%q,host=%q} %d\n", h.ID, h.Name, h.SnapshotCount) } b.WriteString("# HELP rm_host_open_alerts Number of currently open alerts attached to this host.\n") b.WriteString("# TYPE rm_host_open_alerts gauge\n") for _, h := range hosts { fmt.Fprintf(&b, "rm_host_open_alerts{host_id=%q,host=%q} %d\n", h.ID, h.Name, h.OpenAlertCount) } b.WriteString("# HELP rm_host_repo_status Repo readiness state for the host. Exactly one row per host with status label set.\n") b.WriteString("# TYPE rm_host_repo_status gauge\n") for _, h := range hosts { st := h.RepoStatus if st == "" { st = "unknown" } fmt.Fprintf(&b, "rm_host_repo_status{host_id=%q,host=%q,status=%q} 1\n", h.ID, h.Name, st) } // --- Histogram ------------------------------------------------------- b.WriteString("# HELP rm_job_duration_seconds End-to-end duration of completed jobs, by kind and terminal status.\n") b.WriteString("# TYPE rm_job_duration_seconds histogram\n") for _, row := range s.JobDurationRows { for i, ub := range JobDurationBuckets { fmt.Fprintf(&b, "rm_job_duration_seconds_bucket{kind=%q,status=%q,le=\"%g\"} %d\n", row.Kind, row.Status, ub, row.Buckets[i]) } fmt.Fprintf(&b, "rm_job_duration_seconds_bucket{kind=%q,status=%q,le=\"+Inf\"} %d\n", row.Kind, row.Status, row.Buckets[len(JobDurationBuckets)]) fmt.Fprintf(&b, "rm_job_duration_seconds_sum{kind=%q,status=%q} %g\n", row.Kind, row.Status, row.Sum) fmt.Fprintf(&b, "rm_job_duration_seconds_count{kind=%q,status=%q} %d\n", row.Kind, row.Status, row.Count) } _, err := io.WriteString(w, b.String()) return err }