ccd14f7cee
New internal/server/metrics package emits the legacy text/plain exposition format directly, so we don't pull in prometheus/client_golang. Endpoint is opt-in via RM_METRICS_TOKEN and/or RM_METRICS_TRUSTED_CIDR; route is not mounted at all if neither gate is set. Both gates ANDed when both configured. Per-host gauges (online, last_backup_*, repo_size_bytes, snapshot_count, open_alerts, repo_status), server gauges (hosts_total/online, active_alerts by severity, build_info), and an in-memory job-duration histogram observed from the existing MsgJobFinished branch in the WS handler. Docs in docs/prometheus.md (enable + scrape config + metric reference + dashboard import). Sample dashboard at deploy/grafana/restic-manager-dashboard.json - six panels, Grafana schema 39, single Prometheus datasource variable. Tests: golden render, concurrent observe, bucket boundaries in the metrics package; auth matrix (no auth -> 404, token gate, CIDR gate, both required) in the HTTP layer.
186 lines
5.1 KiB
Go
186 lines
5.1 KiB
Go
package http
|
|
|
|
import (
|
|
"context"
|
|
"crypto/subtle"
|
|
"net"
|
|
"net/http"
|
|
"net/netip"
|
|
"runtime"
|
|
"strings"
|
|
|
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/config"
|
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/metrics"
|
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
|
|
)
|
|
|
|
// handleMetrics serves the Prometheus exposition body. The route is
|
|
// only mounted when the operator has opted in via RM_METRICS_TOKEN
|
|
// or RM_METRICS_TRUSTED_CIDR (see Server.New + Cfg.MetricsAuthEnabled).
|
|
func (s *Server) handleMetrics(w http.ResponseWriter, r *http.Request) {
|
|
if !authoriseMetricsScrape(r, s.deps.Cfg) {
|
|
// 401 with no body; Prom respects this and surfaces the failed
|
|
// scrape. WWW-Authenticate hints at bearer when the operator
|
|
// actually configured a token.
|
|
if s.deps.Cfg.MetricsToken != "" {
|
|
w.Header().Set("WWW-Authenticate", `Bearer realm="restic-manager metrics"`)
|
|
}
|
|
w.WriteHeader(http.StatusUnauthorized)
|
|
return
|
|
}
|
|
|
|
snap, err := s.gatherMetricsSnapshot(r.Context())
|
|
if err != nil {
|
|
http.Error(w, "snapshot: "+err.Error(), http.StatusInternalServerError)
|
|
return
|
|
}
|
|
|
|
// 0.0.4 is the long-stable text-format version Prometheus accepts
|
|
// without negotiation; OpenMetrics is intentionally not used here.
|
|
w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
|
|
if err := metrics.Render(w, snap); err != nil {
|
|
// Body is partially written; nothing useful we can do beyond
|
|
// dropping the connection (chi's recoverer will log).
|
|
return
|
|
}
|
|
}
|
|
|
|
// authoriseMetricsScrape applies bearer + CIDR gates per the spec.
|
|
// AND semantics when both are configured; either alone is sufficient
|
|
// when only it is configured.
|
|
func authoriseMetricsScrape(r *http.Request, cfg config.Config) bool {
|
|
tokenOK := true
|
|
if cfg.MetricsToken != "" {
|
|
tokenOK = false
|
|
hdr := r.Header.Get("Authorization")
|
|
const prefix = "Bearer "
|
|
if strings.HasPrefix(hdr, prefix) {
|
|
got := []byte(strings.TrimPrefix(hdr, prefix))
|
|
want := []byte(cfg.MetricsToken)
|
|
if subtle.ConstantTimeCompare(got, want) == 1 {
|
|
tokenOK = true
|
|
}
|
|
}
|
|
}
|
|
|
|
cidrOK := true
|
|
if len(cfg.MetricsTrustedCIDRs) > 0 {
|
|
cidrOK = false
|
|
ip := callerIP(r, cfg.TrustedProxies)
|
|
if ip.IsValid() {
|
|
for _, c := range cfg.MetricsTrustedCIDRs {
|
|
prefix, err := netip.ParsePrefix(c)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
if prefix.Contains(ip) {
|
|
cidrOK = true
|
|
break
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return tokenOK && cidrOK
|
|
}
|
|
|
|
// callerIP resolves the client IP. When the request hit the server
|
|
// directly we use RemoteAddr; when the immediate hop is a trusted
|
|
// proxy we honour the right-most untrusted X-Forwarded-For entry
|
|
// (mirrors how realIP middlewares typically resolve).
|
|
func callerIP(r *http.Request, trustedProxies []string) netip.Addr {
|
|
host, _, err := net.SplitHostPort(r.RemoteAddr)
|
|
if err != nil {
|
|
host = r.RemoteAddr
|
|
}
|
|
directAddr, err := netip.ParseAddr(host)
|
|
if err != nil {
|
|
return netip.Addr{}
|
|
}
|
|
|
|
if !addrInAnyCIDR(directAddr, trustedProxies) {
|
|
return directAddr
|
|
}
|
|
|
|
xff := r.Header.Get("X-Forwarded-For")
|
|
if xff == "" {
|
|
return directAddr
|
|
}
|
|
parts := strings.Split(xff, ",")
|
|
// Walk right→left, skipping trusted proxies, until we land on the
|
|
// first untrusted hop — that's the genuine client.
|
|
for i := len(parts) - 1; i >= 0; i-- {
|
|
p := strings.TrimSpace(parts[i])
|
|
a, err := netip.ParseAddr(p)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
if addrInAnyCIDR(a, trustedProxies) {
|
|
continue
|
|
}
|
|
return a
|
|
}
|
|
return directAddr
|
|
}
|
|
|
|
// addrInAnyCIDR reports whether a falls inside at least one of the
// CIDR prefixes in cidrs.
//
// Entries that do not parse as a prefix are skipped. An IPv4-mapped
// IPv6 address (::ffff:a.b.c.d) is additionally tested in its plain
// IPv4 form, because netip.Prefix.Contains never matches a mapped
// address against an IPv4 prefix. The zero Addr and an empty list
// yield false.
func addrInAnyCIDR(a netip.Addr, cidrs []string) bool {
	// For anything other than a mapped address Unmap returns a unchanged.
	unmapped := a.Unmap()
	for _, c := range cidrs {
		pre, err := netip.ParsePrefix(c)
		if err != nil {
			continue
		}
		if pre.Contains(a) || pre.Contains(unmapped) {
			return true
		}
	}
	return false
}
|
|
|
|
// gatherMetricsSnapshot pulls the data the renderer needs. One
|
|
// indexed query per per-host or fleet-wide read; no N+1.
|
|
func (s *Server) gatherMetricsSnapshot(ctx context.Context) (metrics.Snapshot, error) {
|
|
hosts, err := s.deps.Store.ListHosts(ctx)
|
|
if err != nil {
|
|
return metrics.Snapshot{}, err
|
|
}
|
|
hostRows := make([]metrics.HostRow, 0, len(hosts))
|
|
for _, h := range hosts {
|
|
row := metrics.HostRow{
|
|
ID: h.ID,
|
|
Name: h.Name,
|
|
Online: h.Status == "online",
|
|
SnapshotCount: h.SnapshotCount,
|
|
OpenAlertCount: h.OpenAlertCount,
|
|
RepoStatus: h.RepoStatus,
|
|
}
|
|
if h.LastBackupAt != nil {
|
|
ts := h.LastBackupAt.Unix()
|
|
row.LastBackupUnix = &ts
|
|
}
|
|
if h.LastBackupStatus != nil {
|
|
ok := *h.LastBackupStatus == "succeeded"
|
|
row.LastBackupSucceeded = &ok
|
|
}
|
|
if h.RepoSizeBytes > 0 {
|
|
sz := h.RepoSizeBytes
|
|
row.RepoSizeBytes = &sz
|
|
}
|
|
hostRows = append(hostRows, row)
|
|
}
|
|
|
|
open, err := s.deps.Store.ListAlerts(ctx, store.AlertFilter{Status: "open"})
|
|
if err != nil {
|
|
return metrics.Snapshot{}, err
|
|
}
|
|
bySeverity := map[string]int{"info": 0, "warning": 0, "critical": 0}
|
|
for _, a := range open {
|
|
bySeverity[a.Severity]++
|
|
}
|
|
|
|
reg := s.deps.Metrics
|
|
if reg == nil {
|
|
reg = metrics.NewRegistry() // empty histogram block
|
|
}
|
|
return reg.SnapshotWith(hostRows, bySeverity, version.Version, version.Commit, runtime.Version()), nil
|
|
}
|