P6-04+05: Prometheus /metrics endpoint + Grafana dashboard
New internal/server/metrics package emits the legacy text/plain exposition format directly, so we don't pull in prometheus/client_golang. Endpoint is opt-in via RM_METRICS_TOKEN and/or RM_METRICS_TRUSTED_CIDR; route is not mounted at all if neither gate is set. Both gates are ANDed when both are configured. Per-host gauges (online, last_backup_*, repo_size_bytes, snapshot_count, open_alerts, repo_status), server gauges (hosts_total/online, active_alerts by severity, build_info), and an in-memory job-duration histogram observed from the existing MsgJobFinished branch in the WS handler. Docs in docs/prometheus.md (enable + scrape config + metric reference + dashboard import). Sample dashboard at deploy/grafana/restic-manager-dashboard.json - six panels, Grafana schema 39, single Prometheus datasource variable. Tests: golden render, concurrent observe, bucket boundaries in the metrics package; auth matrix (no auth -> 404, token gate, CIDR gate, both required) in the HTTP layer.
This commit is contained in:
@@ -15,6 +15,7 @@ import (
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/alert"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/auth"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/metrics"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
|
||||
)
|
||||
@@ -27,6 +28,9 @@ type HandlerDeps struct {
|
||||
// AlertEngine receives job-finished and host-online events so the
|
||||
// alert engine can evaluate its rules. Optional; nil = no-op.
|
||||
AlertEngine *alert.Engine
|
||||
// Metrics records job-duration observations on every terminal
|
||||
// status. Optional; nil = no-op (test fixtures pass nil).
|
||||
Metrics *metrics.Registry
|
||||
// UpdateWatcher reconciles in-flight agent-update dispatches against
|
||||
// hello envelopes. Optional; nil = no-op.
|
||||
UpdateWatcher *UpdateWatcher
|
||||
@@ -239,6 +243,13 @@ func dispatchAgentMessage(ctx context.Context, c *Conn, hostID string, env api.E
|
||||
slog.Warn("ws: set host last backup", "host_id", hostID, "err", err)
|
||||
}
|
||||
}
|
||||
// Job-duration histogram (P6-04). Skip when StartedAt is
|
||||
// missing (race: agent shipped finished without a started,
|
||||
// or the row predates this code).
|
||||
if deps.Metrics != nil && job.StartedAt != nil {
|
||||
deps.Metrics.ObserveJob(job.Kind, string(p.Status),
|
||||
p.FinishedAt.Sub(*job.StartedAt))
|
||||
}
|
||||
}
|
||||
if deps.JobHub != nil {
|
||||
deps.JobHub.Broadcast(p.JobID, env)
|
||||
|
||||
Reference in New Issue
Block a user