ccd14f7cee
New internal/server/metrics package emits the legacy text/plain exposition format directly, so we don't pull in prometheus/client_golang. Endpoint is opt-in via RM_METRICS_TOKEN and/or RM_METRICS_TRUSTED_CIDR; route is not mounted at all if neither gate is set. Both gates ANDed when both configured. Per-host gauges (online, last_backup_*, repo_size_bytes, snapshot_count, open_alerts, repo_status), server gauges (hosts_total/online, active_alerts by severity, build_info), and an in-memory job-duration histogram observed from the existing MsgJobFinished branch in the WS handler. Docs in docs/prometheus.md (enable + scrape config + metric reference + dashboard import). Sample dashboard at deploy/grafana/restic-manager-dashboard.json - six panels, Grafana schema 39, single Prometheus datasource variable. Tests: golden render, concurrent observe, bucket boundaries in the metrics package; auth matrix (no auth -> 404, token gate, CIDR gate, both required) in the HTTP layer.
302 lines
9.8 KiB
Go
302 lines
9.8 KiB
Go
// Package metrics owns the in-process Prometheus exposition for
|
|
// the control plane. It deliberately avoids prometheus/client_golang
|
|
// — the legacy text format is small and stable, and the repo's house
|
|
// style is to keep dependency surface minimal.
|
|
//
|
|
// Two halves:
|
|
//
|
|
// - Registry holds a job-duration histogram. Server hooks call
|
|
// Registry.ObserveJob from the WS job-finished branch.
|
|
//
|
|
// - Render emits a complete /metrics body from a Snapshot. The
|
|
// Snapshot is a plain value bag; the HTTP handler assembles it
|
|
// from store reads + Registry.Snapshot at scrape time. This
|
|
// keeps the package free of any database or HTTP dependency.
|
|
package metrics
|
|
|
|
import (
|
|
"fmt"
|
|
"io"
|
|
"sort"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
)
|
|
|
|
// JobDurationBuckets lists the finite upper bounds, in seconds, of the
// job-duration histogram. The ladder runs from quick admin commands
// (unlock/init/check finishing in seconds) up through hours-long
// backups. The +Inf bucket is not listed here; it is implicit.
var JobDurationBuckets = []float64{
	1,     // 1s
	5,     // 5s
	30,    // 30s
	60,    // 1m
	300,   // 5m
	1800,  // 30m
	3600,  // 1h
	21600, // 6h
	86400, // 24h
}
|
|
|
|
// Registry is the in-memory store for the job-duration histogram.
|
|
// Concurrent observers and a single periodic snapshotter is the
|
|
// expected access pattern; both are guarded by a mutex.
|
|
type Registry struct {
|
|
mu sync.Mutex
|
|
jobs map[jobKey]*histogramState
|
|
clock func() time.Time
|
|
}
|
|
|
|
// jobKey identifies one histogram series: a job kind paired with its
// status. Both fields are strings, so the struct is comparable and
// can be used directly as a map key.
type jobKey struct {
	kind   string
	status string
}
|
|
|
|
// histogramState is the running tally for a single (kind, status)
// series.
type histogramState struct {
	// counts holds cumulative bucket counters: counts[i] is the
	// number of observations <= JobDurationBuckets[i]. The extra
	// final slot, counts[len(JobDurationBuckets)], is the implicit
	// +Inf bucket and so equals the total count; it is stored to
	// mirror the rendered _bucket{le="+Inf"} line and to serve as a
	// sanity check.
	counts []uint64

	sum   float64 // total of all observed durations, in seconds
	count uint64  // number of observations recorded
}
|
|
|
|
// NewRegistry builds an empty registry.
|
|
func NewRegistry() *Registry {
|
|
return &Registry{
|
|
jobs: make(map[jobKey]*histogramState),
|
|
clock: time.Now,
|
|
}
|
|
}
|
|
|
|
// ObserveJob records one job-duration sample. Negative durations
|
|
// (clock-skew artefacts) are clamped to zero. Empty kind/status
|
|
// strings are tolerated but degrade the dashboard — callers should
|
|
// pass meaningful values.
|
|
func (r *Registry) ObserveJob(kind, status string, dur time.Duration) {
|
|
if r == nil {
|
|
return
|
|
}
|
|
if dur < 0 {
|
|
dur = 0
|
|
}
|
|
secs := dur.Seconds()
|
|
|
|
r.mu.Lock()
|
|
defer r.mu.Unlock()
|
|
k := jobKey{kind: kind, status: status}
|
|
hs, ok := r.jobs[k]
|
|
if !ok {
|
|
hs = &histogramState{counts: make([]uint64, len(JobDurationBuckets)+1)}
|
|
r.jobs[k] = hs
|
|
}
|
|
for i, ub := range JobDurationBuckets {
|
|
if secs <= ub {
|
|
hs.counts[i]++
|
|
}
|
|
}
|
|
hs.counts[len(JobDurationBuckets)]++ // +Inf
|
|
hs.sum += secs
|
|
hs.count++
|
|
}
|
|
|
|
// HistogramRow is one (kind, status) histogram series as carried in
// a Snapshot.
type HistogramRow struct {
	Kind   string // job kind label value
	Status string // job status label value

	// Buckets holds the cumulative count per upper bound, in the
	// same order as JobDurationBuckets, followed by one final
	// element for the +Inf bucket (the total).
	Buckets []uint64

	Sum   float64 // sum of all observed durations, in seconds
	Count uint64  // number of observations
}
|
|
|
|
// snapshotJobs returns a deterministic, sorted copy of the
|
|
// histogram state. Sort order: kind asc, status asc.
|
|
func (r *Registry) snapshotJobs() []HistogramRow {
|
|
if r == nil {
|
|
return nil
|
|
}
|
|
r.mu.Lock()
|
|
defer r.mu.Unlock()
|
|
rows := make([]HistogramRow, 0, len(r.jobs))
|
|
for k, hs := range r.jobs {
|
|
buckets := make([]uint64, len(hs.counts))
|
|
copy(buckets, hs.counts)
|
|
rows = append(rows, HistogramRow{
|
|
Kind: k.kind,
|
|
Status: k.status,
|
|
Buckets: buckets,
|
|
Sum: hs.sum,
|
|
Count: hs.count,
|
|
})
|
|
}
|
|
sort.Slice(rows, func(i, j int) bool {
|
|
if rows[i].Kind != rows[j].Kind {
|
|
return rows[i].Kind < rows[j].Kind
|
|
}
|
|
return rows[i].Status < rows[j].Status
|
|
})
|
|
return rows
|
|
}
|
|
|
|
// HostRow carries the values behind one host's per-host gauges.
// Optional values are pointers so that "no value" can be told apart
// from zero: Render leaves out the corresponding metric line when the
// pointer is nil (for example, a host that has never run a backup).
type HostRow struct {
	ID     string // rendered as the host_id label
	Name   string // rendered as the host label
	Online bool

	LastBackupUnix      *int64 // nil = no backup yet
	LastBackupSucceeded *bool  // nil = no backup yet
	RepoSizeBytes       *int64 // nil = no stats yet

	SnapshotCount  int
	OpenAlertCount int
	RepoStatus     string // "unknown" | "ready" | "init_failed"; Render treats "" as "unknown"
}
|
|
|
|
// Snapshot is a frozen view of the data needed to render /metrics.
|
|
// Constructed by the HTTP handler from Store reads + Registry.snapshotJobs.
|
|
type Snapshot struct {
|
|
Hosts []HostRow
|
|
HostsTotal int
|
|
HostsOnline int
|
|
AlertsBySeverity map[string]int // severity → count
|
|
BuildVersion string
|
|
BuildCommit string
|
|
GoVersion string
|
|
JobDurationRows []HistogramRow
|
|
}
|
|
|
|
// SnapshotWith builds a Snapshot from raw inputs and the registry's
|
|
// current job-duration state. Convenience for the HTTP handler.
|
|
func (r *Registry) SnapshotWith(hosts []HostRow, alerts map[string]int, buildVer, commit, goVer string) Snapshot {
|
|
online := 0
|
|
for _, h := range hosts {
|
|
if h.Online {
|
|
online++
|
|
}
|
|
}
|
|
return Snapshot{
|
|
Hosts: hosts,
|
|
HostsTotal: len(hosts),
|
|
HostsOnline: online,
|
|
AlertsBySeverity: alerts,
|
|
BuildVersion: buildVer,
|
|
BuildCommit: commit,
|
|
GoVersion: goVer,
|
|
JobDurationRows: r.snapshotJobs(),
|
|
}
|
|
}
|
|
|
|
// Render emits a complete Prometheus text-exposition body for s.
|
|
// Output is deterministic: metric names appear in a fixed order and
|
|
// labels within a metric are sorted by their first label value.
|
|
func Render(w io.Writer, s Snapshot) error {
|
|
var b strings.Builder
|
|
|
|
// --- Server gauges ---------------------------------------------------
|
|
b.WriteString("# HELP rm_hosts_total Total number of enrolled hosts (excludes pending announces).\n")
|
|
b.WriteString("# TYPE rm_hosts_total gauge\n")
|
|
fmt.Fprintf(&b, "rm_hosts_total %d\n", s.HostsTotal)
|
|
|
|
b.WriteString("# HELP rm_hosts_online Number of hosts currently online (status='online').\n")
|
|
b.WriteString("# TYPE rm_hosts_online gauge\n")
|
|
fmt.Fprintf(&b, "rm_hosts_online %d\n", s.HostsOnline)
|
|
|
|
b.WriteString("# HELP rm_active_alerts Open alerts grouped by severity.\n")
|
|
b.WriteString("# TYPE rm_active_alerts gauge\n")
|
|
severities := []string{"info", "warning", "critical"}
|
|
for _, sev := range severities {
|
|
fmt.Fprintf(&b, "rm_active_alerts{severity=%q} %d\n", sev, s.AlertsBySeverity[sev])
|
|
}
|
|
|
|
b.WriteString("# HELP rm_build_info Build identifying labels; value is always 1.\n")
|
|
b.WriteString("# TYPE rm_build_info gauge\n")
|
|
fmt.Fprintf(&b, "rm_build_info{version=%q,commit=%q,go_version=%q} 1\n",
|
|
s.BuildVersion, s.BuildCommit, s.GoVersion)
|
|
|
|
// --- Per-host gauges -------------------------------------------------
|
|
// Stable order: by host id.
|
|
hosts := append([]HostRow(nil), s.Hosts...)
|
|
sort.Slice(hosts, func(i, j int) bool { return hosts[i].ID < hosts[j].ID })
|
|
|
|
b.WriteString("# HELP rm_host_agent_online 1 if the agent is currently online, 0 otherwise.\n")
|
|
b.WriteString("# TYPE rm_host_agent_online gauge\n")
|
|
for _, h := range hosts {
|
|
v := 0
|
|
if h.Online {
|
|
v = 1
|
|
}
|
|
fmt.Fprintf(&b, "rm_host_agent_online{host_id=%q,host=%q} %d\n",
|
|
h.ID, h.Name, v)
|
|
}
|
|
|
|
b.WriteString("# HELP rm_host_last_backup_timestamp_seconds Unix timestamp of the host's most recent backup. Omitted for hosts with no backup yet.\n")
|
|
b.WriteString("# TYPE rm_host_last_backup_timestamp_seconds gauge\n")
|
|
for _, h := range hosts {
|
|
if h.LastBackupUnix == nil {
|
|
continue
|
|
}
|
|
fmt.Fprintf(&b, "rm_host_last_backup_timestamp_seconds{host_id=%q,host=%q} %d\n",
|
|
h.ID, h.Name, *h.LastBackupUnix)
|
|
}
|
|
|
|
b.WriteString("# HELP rm_host_last_backup_success 1 if the host's most recent backup succeeded, 0 otherwise. Omitted for hosts with no backup yet.\n")
|
|
b.WriteString("# TYPE rm_host_last_backup_success gauge\n")
|
|
for _, h := range hosts {
|
|
if h.LastBackupSucceeded == nil {
|
|
continue
|
|
}
|
|
v := 0
|
|
if *h.LastBackupSucceeded {
|
|
v = 1
|
|
}
|
|
fmt.Fprintf(&b, "rm_host_last_backup_success{host_id=%q,host=%q} %d\n",
|
|
h.ID, h.Name, v)
|
|
}
|
|
|
|
b.WriteString("# HELP rm_host_repo_size_bytes Latest reported repo size from `restic stats --mode raw-data`. Omitted for hosts with no stats yet.\n")
|
|
b.WriteString("# TYPE rm_host_repo_size_bytes gauge\n")
|
|
for _, h := range hosts {
|
|
if h.RepoSizeBytes == nil {
|
|
continue
|
|
}
|
|
fmt.Fprintf(&b, "rm_host_repo_size_bytes{host_id=%q,host=%q} %d\n",
|
|
h.ID, h.Name, *h.RepoSizeBytes)
|
|
}
|
|
|
|
b.WriteString("# HELP rm_host_snapshot_count Number of restic snapshots known on the host's repo.\n")
|
|
b.WriteString("# TYPE rm_host_snapshot_count gauge\n")
|
|
for _, h := range hosts {
|
|
fmt.Fprintf(&b, "rm_host_snapshot_count{host_id=%q,host=%q} %d\n",
|
|
h.ID, h.Name, h.SnapshotCount)
|
|
}
|
|
|
|
b.WriteString("# HELP rm_host_open_alerts Number of currently open alerts attached to this host.\n")
|
|
b.WriteString("# TYPE rm_host_open_alerts gauge\n")
|
|
for _, h := range hosts {
|
|
fmt.Fprintf(&b, "rm_host_open_alerts{host_id=%q,host=%q} %d\n",
|
|
h.ID, h.Name, h.OpenAlertCount)
|
|
}
|
|
|
|
b.WriteString("# HELP rm_host_repo_status Repo readiness state for the host. Exactly one row per host with status label set.\n")
|
|
b.WriteString("# TYPE rm_host_repo_status gauge\n")
|
|
for _, h := range hosts {
|
|
st := h.RepoStatus
|
|
if st == "" {
|
|
st = "unknown"
|
|
}
|
|
fmt.Fprintf(&b, "rm_host_repo_status{host_id=%q,host=%q,status=%q} 1\n",
|
|
h.ID, h.Name, st)
|
|
}
|
|
|
|
// --- Histogram -------------------------------------------------------
|
|
b.WriteString("# HELP rm_job_duration_seconds End-to-end duration of completed jobs, by kind and terminal status.\n")
|
|
b.WriteString("# TYPE rm_job_duration_seconds histogram\n")
|
|
for _, row := range s.JobDurationRows {
|
|
for i, ub := range JobDurationBuckets {
|
|
fmt.Fprintf(&b, "rm_job_duration_seconds_bucket{kind=%q,status=%q,le=\"%g\"} %d\n",
|
|
row.Kind, row.Status, ub, row.Buckets[i])
|
|
}
|
|
fmt.Fprintf(&b, "rm_job_duration_seconds_bucket{kind=%q,status=%q,le=\"+Inf\"} %d\n",
|
|
row.Kind, row.Status, row.Buckets[len(JobDurationBuckets)])
|
|
fmt.Fprintf(&b, "rm_job_duration_seconds_sum{kind=%q,status=%q} %g\n",
|
|
row.Kind, row.Status, row.Sum)
|
|
fmt.Fprintf(&b, "rm_job_duration_seconds_count{kind=%q,status=%q} %d\n",
|
|
row.Kind, row.Status, row.Count)
|
|
}
|
|
|
|
_, err := io.WriteString(w, b.String())
|
|
return err
|
|
}
|