P6-04+05: Prometheus /metrics endpoint + Grafana dashboard

New internal/server/metrics package emits the legacy text/plain
exposition format directly, so we don't pull in
prometheus/client_golang. Endpoint is opt-in via RM_METRICS_TOKEN
and/or RM_METRICS_TRUSTED_CIDR; route is not mounted at all if
neither gate is set. Both gates ANDed when both configured.

Per-host gauges (online, last_backup_*, repo_size_bytes,
snapshot_count, open_alerts, repo_status), server gauges
(hosts_total/online, active_alerts by severity, build_info), and
an in-memory job-duration histogram observed from the existing
MsgJobFinished branch in the WS handler.

Docs in docs/prometheus.md (enable + scrape config + metric
reference + dashboard import). Sample dashboard at
deploy/grafana/restic-manager-dashboard.json - six panels,
Grafana schema 39, single Prometheus datasource variable.

Tests: golden render, concurrent observe, bucket boundaries in
the metrics package; auth matrix (no auth -> 404, token gate,
CIDR gate, both required) in the HTTP layer.
This commit is contained in:
2026-05-07 23:17:15 +01:00
parent 07bce16c84
commit ccd14f7cee
12 changed files with 1480 additions and 2 deletions
+185
View File
@@ -0,0 +1,185 @@
package http
import (
"context"
"crypto/subtle"
"net"
"net/http"
"net/netip"
"runtime"
"strings"
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/config"
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/metrics"
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
)
// handleMetrics serves the Prometheus exposition body. The route is
// only mounted when the operator has opted in via RM_METRICS_TOKEN
// or RM_METRICS_TRUSTED_CIDR (see Server.New + Cfg.MetricsAuthEnabled).
func (s *Server) handleMetrics(w http.ResponseWriter, r *http.Request) {
if !authoriseMetricsScrape(r, s.deps.Cfg) {
// 401 with no body; Prom respects this and surfaces the failed
// scrape. WWW-Authenticate hints at bearer when the operator
// actually configured a token.
if s.deps.Cfg.MetricsToken != "" {
w.Header().Set("WWW-Authenticate", `Bearer realm="restic-manager metrics"`)
}
w.WriteHeader(http.StatusUnauthorized)
return
}
snap, err := s.gatherMetricsSnapshot(r.Context())
if err != nil {
http.Error(w, "snapshot: "+err.Error(), http.StatusInternalServerError)
return
}
// 0.0.4 is the long-stable text-format version Prometheus accepts
// without negotiation; OpenMetrics is intentionally not used here.
w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
if err := metrics.Render(w, snap); err != nil {
// Body is partially written; nothing useful we can do beyond
// dropping the connection (chi's recoverer will log).
return
}
}
// authoriseMetricsScrape applies bearer + CIDR gates per the spec.
// AND semantics when both are configured; either alone is sufficient
// when only it is configured.
func authoriseMetricsScrape(r *http.Request, cfg config.Config) bool {
tokenOK := true
if cfg.MetricsToken != "" {
tokenOK = false
hdr := r.Header.Get("Authorization")
const prefix = "Bearer "
if strings.HasPrefix(hdr, prefix) {
got := []byte(strings.TrimPrefix(hdr, prefix))
want := []byte(cfg.MetricsToken)
if subtle.ConstantTimeCompare(got, want) == 1 {
tokenOK = true
}
}
}
cidrOK := true
if len(cfg.MetricsTrustedCIDRs) > 0 {
cidrOK = false
ip := callerIP(r, cfg.TrustedProxies)
if ip.IsValid() {
for _, c := range cfg.MetricsTrustedCIDRs {
prefix, err := netip.ParsePrefix(c)
if err != nil {
continue
}
if prefix.Contains(ip) {
cidrOK = true
break
}
}
}
}
return tokenOK && cidrOK
}
// callerIP resolves the client IP. When the request hit the server
// directly we use RemoteAddr; when the immediate hop is a trusted
// proxy we honour the right-most untrusted X-Forwarded-For entry
// (mirrors how realIP middlewares typically resolve).
func callerIP(r *http.Request, trustedProxies []string) netip.Addr {
host, _, err := net.SplitHostPort(r.RemoteAddr)
if err != nil {
host = r.RemoteAddr
}
directAddr, err := netip.ParseAddr(host)
if err != nil {
return netip.Addr{}
}
if !addrInAnyCIDR(directAddr, trustedProxies) {
return directAddr
}
xff := r.Header.Get("X-Forwarded-For")
if xff == "" {
return directAddr
}
parts := strings.Split(xff, ",")
// Walk right→left, skipping trusted proxies, until we land on the
// first untrusted hop — that's the genuine client.
for i := len(parts) - 1; i >= 0; i-- {
p := strings.TrimSpace(parts[i])
a, err := netip.ParseAddr(p)
if err != nil {
continue
}
if addrInAnyCIDR(a, trustedProxies) {
continue
}
return a
}
return directAddr
}
func addrInAnyCIDR(a netip.Addr, cidrs []string) bool {
for _, c := range cidrs {
pre, err := netip.ParsePrefix(c)
if err != nil {
continue
}
if pre.Contains(a) {
return true
}
}
return false
}
// gatherMetricsSnapshot pulls the data the renderer needs. One
// indexed query per per-host or fleet-wide read; no N+1.
func (s *Server) gatherMetricsSnapshot(ctx context.Context) (metrics.Snapshot, error) {
hosts, err := s.deps.Store.ListHosts(ctx)
if err != nil {
return metrics.Snapshot{}, err
}
hostRows := make([]metrics.HostRow, 0, len(hosts))
for _, h := range hosts {
row := metrics.HostRow{
ID: h.ID,
Name: h.Name,
Online: h.Status == "online",
SnapshotCount: h.SnapshotCount,
OpenAlertCount: h.OpenAlertCount,
RepoStatus: h.RepoStatus,
}
if h.LastBackupAt != nil {
ts := h.LastBackupAt.Unix()
row.LastBackupUnix = &ts
}
if h.LastBackupStatus != nil {
ok := *h.LastBackupStatus == "succeeded"
row.LastBackupSucceeded = &ok
}
if h.RepoSizeBytes > 0 {
sz := h.RepoSizeBytes
row.RepoSizeBytes = &sz
}
hostRows = append(hostRows, row)
}
open, err := s.deps.Store.ListAlerts(ctx, store.AlertFilter{Status: "open"})
if err != nil {
return metrics.Snapshot{}, err
}
bySeverity := map[string]int{"info": 0, "warning": 0, "critical": 0}
for _, a := range open {
bySeverity[a.Severity]++
}
reg := s.deps.Metrics
if reg == nil {
reg = metrics.NewRegistry() // empty histogram block
}
return reg.SnapshotWith(hostRows, bySeverity, version.Version, version.Commit, runtime.Version()), nil
}
+209
View File
@@ -0,0 +1,209 @@
package http
import (
"context"
"io"
stdhttp "net/http"
"net/http/httptest"
"path/filepath"
"strings"
"testing"
"gitea.dcglab.co.uk/steve/restic-manager/internal/crypto"
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/config"
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/metrics"
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
)
// newMetricsServer builds a Server with metrics enabled per cfg.
// Returns (URL, registry) so tests can both observe job durations
// directly and exercise the HTTP gate.
func newMetricsServer(t *testing.T, cfg config.Config) (string, *metrics.Registry, *store.Store) {
t.Helper()
dir := t.TempDir()
st, err := store.Open(context.Background(), filepath.Join(dir, "rm.db"))
if err != nil {
t.Fatalf("store: %v", err)
}
t.Cleanup(func() { _ = st.Close() })
keyPath := filepath.Join(dir, "secret.key")
if err := crypto.GenerateKeyFile(keyPath); err != nil {
t.Fatalf("genkey: %v", err)
}
key, _ := crypto.LoadKeyFromFile(keyPath)
aead, _ := crypto.NewAEAD(key)
cfg.Listen = ":0"
cfg.DataDir = dir
cfg.SecretKeyFile = keyPath
reg := metrics.NewRegistry()
deps := Deps{
Cfg: cfg,
Store: st,
AEAD: aead,
Metrics: reg,
}
s := New(deps)
ts := httptest.NewServer(s.srv.Handler)
t.Cleanup(ts.Close)
return ts.URL, reg, st
}
func TestMetricsRouteNotMountedByDefault(t *testing.T) {
t.Parallel()
url, _, _ := newMetricsServer(t, config.Config{})
res, err := stdhttp.Get(url + "/metrics")
if err != nil {
t.Fatalf("GET: %v", err)
}
defer res.Body.Close()
if res.StatusCode != stdhttp.StatusNotFound {
t.Errorf("status: got %d, want 404 (route should not be mounted)", res.StatusCode)
}
}
func TestMetricsTokenRequired(t *testing.T) {
t.Parallel()
url, _, _ := newMetricsServer(t, config.Config{
MetricsToken: "the-token",
})
// Missing token.
res, err := stdhttp.Get(url + "/metrics")
if err != nil {
t.Fatalf("GET: %v", err)
}
defer res.Body.Close()
if res.StatusCode != stdhttp.StatusUnauthorized {
t.Errorf("no token: got %d", res.StatusCode)
}
if !strings.Contains(res.Header.Get("WWW-Authenticate"), "Bearer") {
t.Errorf("WWW-Authenticate hint missing: %q", res.Header.Get("WWW-Authenticate"))
}
// Wrong token.
req, _ := stdhttp.NewRequest(stdhttp.MethodGet, url+"/metrics", nil)
req.Header.Set("Authorization", "Bearer not-the-token")
res2, err := stdhttp.DefaultClient.Do(req)
if err != nil {
t.Fatalf("GET: %v", err)
}
defer res2.Body.Close()
if res2.StatusCode != stdhttp.StatusUnauthorized {
t.Errorf("wrong token: got %d", res2.StatusCode)
}
// Right token.
req3, _ := stdhttp.NewRequest(stdhttp.MethodGet, url+"/metrics", nil)
req3.Header.Set("Authorization", "Bearer the-token")
res3, err3 := stdhttp.DefaultClient.Do(req3)
if err3 != nil {
t.Fatalf("GET: %v", err3)
}
defer res3.Body.Close()
if res3.StatusCode != stdhttp.StatusOK {
t.Errorf("right token: got %d", res3.StatusCode)
}
if ct := res3.Header.Get("Content-Type"); !strings.HasPrefix(ct, "text/plain") {
t.Errorf("content-type: %q", ct)
}
}
func TestMetricsCIDRGate(t *testing.T) {
t.Parallel()
// 127.0.0.1 is what httptest hits with; pick a CIDR that excludes it
// to assert the "wrong source" branch.
url, _, _ := newMetricsServer(t, config.Config{
MetricsTrustedCIDRs: []string{"10.0.0.0/8"},
})
res, err := stdhttp.Get(url + "/metrics")
if err != nil {
t.Fatalf("GET: %v", err)
}
defer res.Body.Close()
if res.StatusCode != stdhttp.StatusUnauthorized {
t.Errorf("loopback hitting non-matching CIDR: got %d, want 401", res.StatusCode)
}
// Now allow loopback.
url2, _, _ := newMetricsServer(t, config.Config{
MetricsTrustedCIDRs: []string{"127.0.0.0/8"},
})
res2, err := stdhttp.Get(url2 + "/metrics")
if err != nil {
t.Fatalf("GET: %v", err)
}
defer res2.Body.Close()
if res2.StatusCode != stdhttp.StatusOK {
t.Errorf("loopback in allow CIDR: got %d, want 200", res2.StatusCode)
}
}
func TestMetricsTokenAndCIDRBothRequired(t *testing.T) {
t.Parallel()
url, _, _ := newMetricsServer(t, config.Config{
MetricsToken: "the-token",
MetricsTrustedCIDRs: []string{"127.0.0.0/8"},
})
// Token only — CIDR ok (loopback) but token missing.
res, err := stdhttp.Get(url + "/metrics")
if err != nil {
t.Fatalf("GET: %v", err)
}
defer res.Body.Close()
if res.StatusCode != stdhttp.StatusUnauthorized {
t.Errorf("missing token but in CIDR: got %d", res.StatusCode)
}
// Both right.
req, _ := stdhttp.NewRequest(stdhttp.MethodGet, url+"/metrics", nil)
req.Header.Set("Authorization", "Bearer the-token")
res2, err := stdhttp.DefaultClient.Do(req)
if err != nil {
t.Fatalf("GET: %v", err)
}
defer res2.Body.Close()
if res2.StatusCode != stdhttp.StatusOK {
t.Errorf("both right: got %d", res2.StatusCode)
}
}
func readAll(t *testing.T, r io.Reader) string {
t.Helper()
b, err := io.ReadAll(r)
if err != nil {
t.Fatalf("read: %v", err)
}
return string(b)
}
func TestMetricsBodyContainsExpectedLines(t *testing.T) {
t.Parallel()
url, reg, _ := newMetricsServer(t, config.Config{
MetricsToken: "the-token",
})
reg.ObserveJob("backup", "succeeded", 0) // produce one histogram row
req, _ := stdhttp.NewRequest(stdhttp.MethodGet, url+"/metrics", nil)
req.Header.Set("Authorization", "Bearer the-token")
res, err := stdhttp.DefaultClient.Do(req)
if err != nil {
t.Fatalf("GET: %v", err)
}
defer res.Body.Close()
body := readAll(t, res.Body)
for _, want := range []string{
"rm_hosts_total",
"rm_hosts_online",
`rm_active_alerts{severity="critical"}`,
"rm_build_info{",
"rm_job_duration_seconds_count{kind=\"backup\",status=\"succeeded\"}",
} {
if !strings.Contains(body, want) {
t.Errorf("body missing %q\n--- body ---\n%s", want, body)
}
}
}
+11
View File
@@ -17,6 +17,7 @@ import (
"gitea.dcglab.co.uk/steve/restic-manager/internal/crypto"
"gitea.dcglab.co.uk/steve/restic-manager/internal/notification"
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/config"
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/metrics"
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/oidc"
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ui"
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ws"
@@ -56,6 +57,12 @@ type Deps struct {
// OIDC (optional). Non-nil when the operator has configured an
// IdP — handlers under /auth/oidc/* are mounted only when set.
OIDC *oidc.Client
// Metrics (optional). When non-nil the WS job-finished branch
// records job durations and the /metrics handler can pull a
// histogram snapshot. Independent of MetricsAuthEnabled — the
// recorder runs even if the scrape endpoint is gated off, so a
// later config flip doesn't lose the running window.
Metrics *metrics.Registry
}
// Server is the running HTTP server.
@@ -131,12 +138,16 @@ func (s *Server) routes(r chi.Router) {
r.Get("/agent/binary", s.handleAgentBinary)
r.Get("/install/*", s.handleInstallAsset)
r.Get("/api/version", s.handleVersion)
if s.deps.Cfg.MetricsAuthEnabled() {
r.Get("/metrics", s.handleMetrics)
}
if s.deps.Hub != nil {
hd := ws.HandlerDeps{
Hub: s.deps.Hub,
Store: s.deps.Store,
JobHub: s.deps.JobHub,
AlertEngine: s.deps.AlertEngine,
Metrics: s.deps.Metrics,
OnHello: s.onAgentHello,
OnScheduleAck: s.applyScheduleAck,
OnScheduleFire: s.dispatchScheduledJob,