P6-04+05: Prometheus /metrics endpoint + Grafana dashboard
New internal/server/metrics package emits the legacy text/plain exposition format directly, so we don't pull in prometheus/client_golang. Endpoint is opt-in via RM_METRICS_TOKEN and/or RM_METRICS_TRUSTED_CIDR; route is not mounted at all if neither gate is set. Both gates ANDed when both configured. Per-host gauges (online, last_backup_*, repo_size_bytes, snapshot_count, open_alerts, repo_status), server gauges (hosts_total/online, active_alerts by severity, build_info), and an in-memory job-duration histogram observed from the existing MsgJobFinished branch in the WS handler. Docs in docs/prometheus.md (enable + scrape config + metric reference + dashboard import). Sample dashboard at deploy/grafana/restic-manager-dashboard.json - six panels, Grafana schema 39, single Prometheus datasource variable. Tests: golden render, concurrent observe, bucket boundaries in the metrics package; auth matrix (no auth -> 404, token gate, CIDR gate, both required) in the HTTP layer.
This commit is contained in:
@@ -20,6 +20,7 @@ import (
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/fleetupdate"
|
||||
rmhttp "gitea.dcglab.co.uk/steve/restic-manager/internal/server/http"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/maintenance"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/metrics"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/oidc"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ui"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ws"
|
||||
@@ -89,6 +90,7 @@ func run() error {
|
||||
|
||||
hub := ws.NewHub()
|
||||
jobHub := ws.NewJobHub()
|
||||
metricsRegistry := metrics.NewRegistry()
|
||||
|
||||
notifHub := notification.NewHub(st, aead, cfg.BaseURL)
|
||||
alertEngine := alert.NewEngine(st, notifHub)
|
||||
@@ -122,6 +124,7 @@ func run() error {
|
||||
UI: renderer,
|
||||
Version: version,
|
||||
OIDC: oidcClient,
|
||||
Metrics: metricsRegistry,
|
||||
}
|
||||
|
||||
// First-run bootstrap: if the users table is empty, mint a one-time
|
||||
|
||||
@@ -0,0 +1,325 @@
|
||||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": { "type": "grafana", "uid": "-- Grafana --" },
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"description": "restic-manager fleet overview. Imports against any Prometheus data source.",
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"liveNow": false,
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "Fleet status",
|
||||
"type": "stat",
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"gridPos": { "h": 6, "w": 6, "x": 0, "y": 0 },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"expr": "rm_hosts_online",
|
||||
"legendFormat": "online",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"expr": "rm_hosts_total",
|
||||
"legendFormat": "total",
|
||||
"refId": "B"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "Open alerts",
|
||||
"type": "stat",
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"gridPos": { "h": 6, "w": 6, "x": 6, "y": 0 },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"expr": "sum by (severity) (rm_active_alerts)",
|
||||
"legendFormat": "{{severity}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "Backups failing (last reported run)",
|
||||
"type": "stat",
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"gridPos": { "h": 6, "w": 6, "x": 12, "y": 0 },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"expr": "count(rm_host_last_backup_success == 0)",
|
||||
"legendFormat": "failing",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"title": "Hosts",
|
||||
"type": "table",
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"gridPos": { "h": 10, "w": 24, "x": 0, "y": 6 },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "align": "auto", "displayMode": "auto" }
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Value #B" },
|
||||
"properties": [
|
||||
{ "id": "displayName", "value": "Last backup (s ago)" },
|
||||
{ "id": "unit", "value": "s" }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Value #C" },
|
||||
"properties": [
|
||||
{ "id": "displayName", "value": "Repo size" },
|
||||
{ "id": "unit", "value": "bytes" }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Value #D" },
|
||||
"properties": [
|
||||
{ "id": "displayName", "value": "Snapshots" }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Value #A" },
|
||||
"properties": [
|
||||
{ "id": "displayName", "value": "Online" }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Value #E" },
|
||||
"properties": [
|
||||
{ "id": "displayName", "value": "Open alerts" }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": { "showHeader": true },
|
||||
"transformations": [
|
||||
{
|
||||
"id": "merge",
|
||||
"options": {}
|
||||
}
|
||||
],
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"expr": "rm_host_agent_online",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"expr": "time() - rm_host_last_backup_timestamp_seconds",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"expr": "rm_host_repo_size_bytes",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"refId": "C"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"expr": "rm_host_snapshot_count",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"refId": "D"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"expr": "rm_host_open_alerts",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"refId": "E"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"title": "Repo size over time",
|
||||
"type": "timeseries",
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisLabel": "",
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"showPoints": "never"
|
||||
},
|
||||
"unit": "bytes"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": { "calcs": ["last"], "displayMode": "list", "placement": "bottom", "showLegend": true },
|
||||
"tooltip": { "mode": "multi", "sort": "desc" }
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"expr": "rm_host_repo_size_bytes",
|
||||
"legendFormat": "{{host}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"title": "Job duration p95 (last 1h, by kind)",
|
||||
"type": "timeseries",
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 5,
|
||||
"lineWidth": 1,
|
||||
"pointSize": 4,
|
||||
"showPoints": "never"
|
||||
},
|
||||
"unit": "s"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": { "calcs": ["last"], "displayMode": "list", "placement": "bottom", "showLegend": true },
|
||||
"tooltip": { "mode": "multi", "sort": "desc" }
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"expr": "histogram_quantile(0.95, sum by (kind, le) (rate(rm_job_duration_seconds_bucket[1h])))",
|
||||
"legendFormat": "{{kind}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 39,
|
||||
"style": "dark",
|
||||
"tags": ["restic-manager", "backups"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {},
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "Prometheus",
|
||||
"multi": false,
|
||||
"name": "DS_PROMETHEUS",
|
||||
"options": [],
|
||||
"query": "prometheus",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"type": "datasource"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": { "from": "now-6h", "to": "now" },
|
||||
"timepicker": {},
|
||||
"timezone": "",
|
||||
"title": "restic-manager — fleet",
|
||||
"uid": "rm-fleet-overview",
|
||||
"version": 1,
|
||||
"weekStart": ""
|
||||
}
|
||||
@@ -0,0 +1,139 @@
|
||||
# Prometheus + Grafana
|
||||
|
||||
restic-manager exposes a Prometheus scrape endpoint at `GET /metrics`.
|
||||
The endpoint is **opt-in** — it is not mounted at all unless you set
|
||||
at least one of the auth gates below. Once enabled, it serves the
|
||||
standard `text/plain` exposition format that every Prometheus
|
||||
release since 2.x parses without configuration.
|
||||
|
||||
A sample Grafana dashboard lives at
|
||||
`deploy/grafana/restic-manager-dashboard.json`.
|
||||
|
||||
## Enable the endpoint
|
||||
|
||||
Two switches, both off by default. If both are set, both must pass
|
||||
(token AND source-IP); if only one is set, that gate alone
|
||||
authorises a scrape.
|
||||
|
||||
| Env var | YAML key | Effect |
|
||||
|----------------------------|------------------------|--------|
|
||||
| `RM_METRICS_TOKEN` | `metrics_token` | Requires `Authorization: Bearer <token>`. Compared in constant time. |
|
||||
| `RM_METRICS_TRUSTED_CIDR` | `metrics_trusted_cidrs` (list) | Restricts the source IP to one of the listed CIDRs. Comma-separated in env, list in YAML. Honours `X-Forwarded-For` only when the immediate hop matches `RM_TRUSTED_PROXY`. |
|
||||
|
||||
When neither is set, `GET /metrics` returns 404 — the route is not
|
||||
registered with the chi router so a forgotten config can't
|
||||
accidentally publish fleet state.
|
||||
|
||||
### Example: Docker
|
||||
|
||||
```yaml
|
||||
services:
|
||||
restic-manager:
|
||||
image: gitea.dcglab.co.uk/steve/restic-manager:latest
|
||||
environment:
|
||||
RM_METRICS_TOKEN_FILE: /run/secrets/rm_metrics_token
|
||||
RM_METRICS_TRUSTED_CIDR: "10.0.0.0/8"
|
||||
secrets:
|
||||
- rm_metrics_token
|
||||
```
|
||||
|
||||
(`RM_METRICS_TOKEN_FILE` is not currently supported — set
|
||||
`RM_METRICS_TOKEN` directly. The `_FILE` convention is on the
|
||||
roadmap.)
|
||||
|
||||
## Prometheus scrape config
|
||||
|
||||
Drop into your `prometheus.yml`:
|
||||
|
||||
```yaml
|
||||
scrape_configs:
|
||||
- job_name: restic-manager
|
||||
metrics_path: /metrics
|
||||
scheme: https # via your reverse proxy
|
||||
static_configs:
|
||||
- targets: ['restic.example.com']
|
||||
authorization:
|
||||
type: Bearer
|
||||
credentials_file: /etc/prometheus/secrets/rm_metrics_token
|
||||
```
|
||||
|
||||
If you don't run a TLS-terminating proxy in front, drop `scheme:
|
||||
https` (the server is HTTP-only — see `docs/reverse-proxy.md`).
|
||||
|
||||
## Metric reference
|
||||
|
||||
All names are `rm_`-prefixed. Per-host metrics carry a `host_id`
|
||||
label (the stable ULID, immune to renames) and a `host` label
|
||||
(the human-readable name).
|
||||
|
||||
### Server gauges
|
||||
|
||||
| Name | Labels | Description |
|
||||
|-----------------------|------------------------------------|-------------|
|
||||
| `rm_hosts_total` | — | Total number of enrolled hosts (excludes pending announces). |
|
||||
| `rm_hosts_online` | — | Number of hosts with `status='online'`. |
|
||||
| `rm_active_alerts` | `severity` ∈ {info, warning, critical} | Open alerts by severity. |
|
||||
| `rm_build_info` | `version, commit, go_version` | Always 1; pure label-bag for joining. |
|
||||
|
||||
### Per-host gauges
|
||||
|
||||
| Name | Description |
|
||||
|--------------------------------------------|-------------|
|
||||
| `rm_host_agent_online` | 1 if the agent is currently online, 0 otherwise. |
|
||||
| `rm_host_last_backup_timestamp_seconds` | Unix timestamp of the host's most recent backup. **Omitted** for hosts with no backup yet. |
|
||||
| `rm_host_last_backup_success` | 1 if the most recent backup succeeded, 0 otherwise. **Omitted** for hosts with no backup yet. |
|
||||
| `rm_host_repo_size_bytes` | Latest reported repo size from `restic stats --mode raw-data`. **Omitted** when unknown. |
|
||||
| `rm_host_snapshot_count` | Number of restic snapshots known on the host's repo. |
|
||||
| `rm_host_open_alerts` | Number of currently open alerts attached to this host. |
|
||||
| `rm_host_repo_status` | Always 1; the `status` label carries `unknown` / `ready` / `init_failed`. |
|
||||
|
||||
### Job duration histogram
|
||||
|
||||
```
|
||||
rm_job_duration_seconds_bucket{kind, status, le}
|
||||
rm_job_duration_seconds_sum{kind, status}
|
||||
rm_job_duration_seconds_count{kind, status}
|
||||
```
|
||||
|
||||
`kind` ∈ {backup, forget, prune, check, unlock, restore, diff, init, update}.
|
||||
`status` ∈ {succeeded, failed, cancelled}.
|
||||
|
||||
Buckets (seconds):
|
||||
|
||||
```
|
||||
1, 5, 30, 60, 300, 1800, 3600, 21600, 86400, +Inf
|
||||
1s 5s 30s 1m 5m 30m 1h 6h 24h
|
||||
```
|
||||
|
||||
The histogram is in-memory only — values reset on process restart.
|
||||
Operators who want durable history should let Prometheus persist
|
||||
the scrapes; restic-manager itself is a control plane, not a
|
||||
metrics database.
|
||||
|
||||
## Grafana dashboard
|
||||
|
||||
Import `deploy/grafana/restic-manager-dashboard.json`:
|
||||
|
||||
1. In Grafana, **+ → Import → Upload JSON file**.
|
||||
2. Pick the Prometheus data source you scrape with.
|
||||
3. The dashboard's six panels populate from the metrics above:
|
||||
* **Fleet status** — online/total stat panel.
|
||||
* **Open alerts** — by severity.
|
||||
* **Hosts** — per-host table (last backup, repo size, snapshots, alerts).
|
||||
* **Repo size over time** — one line per host.
|
||||
* **Backups failing** — count of hosts whose last backup didn't succeed.
|
||||
* **Job duration p95** — `histogram_quantile(0.95, …)` over a 1h window per kind.
|
||||
|
||||
Alerting is intentionally not configured in the dashboard — the
|
||||
control plane already has alerts (P3-05) with native channels for
|
||||
webhook, ntfy, and SMTP. Re-implementing them in Prometheus would
|
||||
just duplicate state. If you do want Prom-side alerts, copy the
|
||||
recording rules into your usual location.
|
||||
|
||||
## Cardinality
|
||||
|
||||
Per scrape: O(hosts) gauge rows + O(kinds × statuses × buckets)
|
||||
histogram rows. A 100-host fleet emits roughly 700 host rows + 270
|
||||
histogram rows — well below any practical limit. There are no
|
||||
`job_id` labels (cardinality bomb avoidance) and no per-source-group
|
||||
labels.
|
||||
@@ -41,6 +41,24 @@ type Config struct {
|
||||
// DataDir. Source-build deployments can override via
|
||||
// RM_BUNDLED_ASSETS_DIR.
|
||||
BundledAssetsDir string `yaml:"bundled_assets_dir"`
|
||||
|
||||
// MetricsToken, if set, gates the /metrics scrape endpoint
|
||||
// behind a `Authorization: Bearer <token>` check (constant-time
|
||||
// compare). When neither this nor MetricsTrustedCIDRs is set,
|
||||
// the route is not mounted at all (the endpoint is opt-in).
|
||||
MetricsToken string `yaml:"metrics_token"`
|
||||
|
||||
// MetricsTrustedCIDRs, if non-empty, gates /metrics so only
|
||||
// callers from these networks may scrape. ANDed with
|
||||
// MetricsToken when both are set.
|
||||
MetricsTrustedCIDRs []string `yaml:"metrics_trusted_cidrs"`
|
||||
}
|
||||
|
||||
// MetricsAuthEnabled reports whether the operator has opted into
|
||||
// exposing the Prometheus scrape endpoint by configuring at least
|
||||
// one auth gate.
|
||||
func (c Config) MetricsAuthEnabled() bool {
|
||||
return c.MetricsToken != "" || len(c.MetricsTrustedCIDRs) > 0
|
||||
}
|
||||
|
||||
// Load resolves config in this order:
|
||||
@@ -93,6 +111,19 @@ func Load(yamlPath string) (Config, error) {
|
||||
if v, ok := os.LookupEnv("RM_BUNDLED_ASSETS_DIR"); ok {
|
||||
c.BundledAssetsDir = v
|
||||
}
|
||||
if v, ok := os.LookupEnv("RM_METRICS_TOKEN"); ok {
|
||||
c.MetricsToken = v
|
||||
}
|
||||
if v, ok := os.LookupEnv("RM_METRICS_TRUSTED_CIDR"); ok {
|
||||
parts := strings.Split(v, ",")
|
||||
c.MetricsTrustedCIDRs = c.MetricsTrustedCIDRs[:0]
|
||||
for _, p := range parts {
|
||||
p = strings.TrimSpace(p)
|
||||
if p != "" {
|
||||
c.MetricsTrustedCIDRs = append(c.MetricsTrustedCIDRs, p)
|
||||
}
|
||||
}
|
||||
}
|
||||
if v, ok := os.LookupEnv("RM_TRUSTED_PROXY"); ok {
|
||||
// Comma-separated CIDRs; allow whitespace for readability.
|
||||
parts := strings.Split(v, ",")
|
||||
@@ -137,5 +168,10 @@ func (c *Config) validate() error {
|
||||
return fmt.Errorf("config: RM_TRUSTED_PROXY entry %q is not a valid CIDR: %w", cidr, err)
|
||||
}
|
||||
}
|
||||
for _, cidr := range c.MetricsTrustedCIDRs {
|
||||
if _, err := netip.ParsePrefix(cidr); err != nil {
|
||||
return fmt.Errorf("config: RM_METRICS_TRUSTED_CIDR entry %q is not a valid CIDR: %w", cidr, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -98,6 +98,45 @@ func TestCookieSecureDefaultAndOverride(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestMetricsAuthGates(t *testing.T) {
|
||||
t.Setenv("RM_LISTEN", ":8080")
|
||||
t.Setenv("RM_DATA_DIR", "/tmp/x")
|
||||
|
||||
c, err := Load("")
|
||||
if err != nil {
|
||||
t.Fatalf("load: %v", err)
|
||||
}
|
||||
if c.MetricsAuthEnabled() {
|
||||
t.Errorf("metrics endpoint should be off by default")
|
||||
}
|
||||
|
||||
t.Setenv("RM_METRICS_TOKEN", "s3cr3t-token-with-enough-bytes")
|
||||
t.Setenv("RM_METRICS_TRUSTED_CIDR", "10.0.0.0/8, 192.168.1.0/24")
|
||||
c, err = Load("")
|
||||
if err != nil {
|
||||
t.Fatalf("load: %v", err)
|
||||
}
|
||||
if c.MetricsToken != "s3cr3t-token-with-enough-bytes" {
|
||||
t.Errorf("token: %q", c.MetricsToken)
|
||||
}
|
||||
if got := c.MetricsTrustedCIDRs; len(got) != 2 || got[0] != "10.0.0.0/8" || got[1] != "192.168.1.0/24" {
|
||||
t.Errorf("cidrs: %v", got)
|
||||
}
|
||||
if !c.MetricsAuthEnabled() {
|
||||
t.Errorf("MetricsAuthEnabled should be true")
|
||||
}
|
||||
}
|
||||
|
||||
func TestMetricsTrustedCIDRRejectsGarbage(t *testing.T) {
|
||||
t.Setenv("RM_LISTEN", ":8080")
|
||||
t.Setenv("RM_DATA_DIR", "/tmp/x")
|
||||
t.Setenv("RM_METRICS_TRUSTED_CIDR", "garbage")
|
||||
|
||||
if _, err := Load(""); err == nil {
|
||||
t.Fatal("expected validation error, got nil")
|
||||
}
|
||||
}
|
||||
|
||||
func writeFile(path string, body []byte) error {
|
||||
return writeFileImpl(path, body)
|
||||
}
|
||||
|
||||
@@ -0,0 +1,185 @@
|
||||
package http
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/subtle"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/netip"
|
||||
"runtime"
|
||||
"strings"
|
||||
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/config"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/metrics"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
|
||||
)
|
||||
|
||||
// handleMetrics serves the Prometheus exposition body. The route is
|
||||
// only mounted when the operator has opted in via RM_METRICS_TOKEN
|
||||
// or RM_METRICS_TRUSTED_CIDR (see Server.New + Cfg.MetricsAuthEnabled).
|
||||
func (s *Server) handleMetrics(w http.ResponseWriter, r *http.Request) {
|
||||
if !authoriseMetricsScrape(r, s.deps.Cfg) {
|
||||
// 401 with no body; Prom respects this and surfaces the failed
|
||||
// scrape. WWW-Authenticate hints at bearer when the operator
|
||||
// actually configured a token.
|
||||
if s.deps.Cfg.MetricsToken != "" {
|
||||
w.Header().Set("WWW-Authenticate", `Bearer realm="restic-manager metrics"`)
|
||||
}
|
||||
w.WriteHeader(http.StatusUnauthorized)
|
||||
return
|
||||
}
|
||||
|
||||
snap, err := s.gatherMetricsSnapshot(r.Context())
|
||||
if err != nil {
|
||||
http.Error(w, "snapshot: "+err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
// 0.0.4 is the long-stable text-format version Prometheus accepts
|
||||
// without negotiation; OpenMetrics is intentionally not used here.
|
||||
w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
|
||||
if err := metrics.Render(w, snap); err != nil {
|
||||
// Body is partially written; nothing useful we can do beyond
|
||||
// dropping the connection (chi's recoverer will log).
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// authoriseMetricsScrape applies bearer + CIDR gates per the spec.
|
||||
// AND semantics when both are configured; either alone is sufficient
|
||||
// when only it is configured.
|
||||
func authoriseMetricsScrape(r *http.Request, cfg config.Config) bool {
|
||||
tokenOK := true
|
||||
if cfg.MetricsToken != "" {
|
||||
tokenOK = false
|
||||
hdr := r.Header.Get("Authorization")
|
||||
const prefix = "Bearer "
|
||||
if strings.HasPrefix(hdr, prefix) {
|
||||
got := []byte(strings.TrimPrefix(hdr, prefix))
|
||||
want := []byte(cfg.MetricsToken)
|
||||
if subtle.ConstantTimeCompare(got, want) == 1 {
|
||||
tokenOK = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cidrOK := true
|
||||
if len(cfg.MetricsTrustedCIDRs) > 0 {
|
||||
cidrOK = false
|
||||
ip := callerIP(r, cfg.TrustedProxies)
|
||||
if ip.IsValid() {
|
||||
for _, c := range cfg.MetricsTrustedCIDRs {
|
||||
prefix, err := netip.ParsePrefix(c)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if prefix.Contains(ip) {
|
||||
cidrOK = true
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return tokenOK && cidrOK
|
||||
}
|
||||
|
||||
// callerIP resolves the client IP. When the request hit the server
|
||||
// directly we use RemoteAddr; when the immediate hop is a trusted
|
||||
// proxy we honour the right-most untrusted X-Forwarded-For entry
|
||||
// (mirrors how realIP middlewares typically resolve).
|
||||
func callerIP(r *http.Request, trustedProxies []string) netip.Addr {
|
||||
host, _, err := net.SplitHostPort(r.RemoteAddr)
|
||||
if err != nil {
|
||||
host = r.RemoteAddr
|
||||
}
|
||||
directAddr, err := netip.ParseAddr(host)
|
||||
if err != nil {
|
||||
return netip.Addr{}
|
||||
}
|
||||
|
||||
if !addrInAnyCIDR(directAddr, trustedProxies) {
|
||||
return directAddr
|
||||
}
|
||||
|
||||
xff := r.Header.Get("X-Forwarded-For")
|
||||
if xff == "" {
|
||||
return directAddr
|
||||
}
|
||||
parts := strings.Split(xff, ",")
|
||||
// Walk right→left, skipping trusted proxies, until we land on the
|
||||
// first untrusted hop — that's the genuine client.
|
||||
for i := len(parts) - 1; i >= 0; i-- {
|
||||
p := strings.TrimSpace(parts[i])
|
||||
a, err := netip.ParseAddr(p)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if addrInAnyCIDR(a, trustedProxies) {
|
||||
continue
|
||||
}
|
||||
return a
|
||||
}
|
||||
return directAddr
|
||||
}
|
||||
|
||||
func addrInAnyCIDR(a netip.Addr, cidrs []string) bool {
|
||||
for _, c := range cidrs {
|
||||
pre, err := netip.ParsePrefix(c)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if pre.Contains(a) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// gatherMetricsSnapshot pulls the data the renderer needs. One
|
||||
// indexed query per per-host or fleet-wide read; no N+1.
|
||||
func (s *Server) gatherMetricsSnapshot(ctx context.Context) (metrics.Snapshot, error) {
|
||||
hosts, err := s.deps.Store.ListHosts(ctx)
|
||||
if err != nil {
|
||||
return metrics.Snapshot{}, err
|
||||
}
|
||||
hostRows := make([]metrics.HostRow, 0, len(hosts))
|
||||
for _, h := range hosts {
|
||||
row := metrics.HostRow{
|
||||
ID: h.ID,
|
||||
Name: h.Name,
|
||||
Online: h.Status == "online",
|
||||
SnapshotCount: h.SnapshotCount,
|
||||
OpenAlertCount: h.OpenAlertCount,
|
||||
RepoStatus: h.RepoStatus,
|
||||
}
|
||||
if h.LastBackupAt != nil {
|
||||
ts := h.LastBackupAt.Unix()
|
||||
row.LastBackupUnix = &ts
|
||||
}
|
||||
if h.LastBackupStatus != nil {
|
||||
ok := *h.LastBackupStatus == "succeeded"
|
||||
row.LastBackupSucceeded = &ok
|
||||
}
|
||||
if h.RepoSizeBytes > 0 {
|
||||
sz := h.RepoSizeBytes
|
||||
row.RepoSizeBytes = &sz
|
||||
}
|
||||
hostRows = append(hostRows, row)
|
||||
}
|
||||
|
||||
open, err := s.deps.Store.ListAlerts(ctx, store.AlertFilter{Status: "open"})
|
||||
if err != nil {
|
||||
return metrics.Snapshot{}, err
|
||||
}
|
||||
bySeverity := map[string]int{"info": 0, "warning": 0, "critical": 0}
|
||||
for _, a := range open {
|
||||
bySeverity[a.Severity]++
|
||||
}
|
||||
|
||||
reg := s.deps.Metrics
|
||||
if reg == nil {
|
||||
reg = metrics.NewRegistry() // empty histogram block
|
||||
}
|
||||
return reg.SnapshotWith(hostRows, bySeverity, version.Version, version.Commit, runtime.Version()), nil
|
||||
}
|
||||
@@ -0,0 +1,209 @@
|
||||
package http
|
||||
|
||||
import (
|
||||
"context"
|
||||
"io"
|
||||
stdhttp "net/http"
|
||||
"net/http/httptest"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/crypto"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/config"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/metrics"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||
)
|
||||
|
||||
// newMetricsServer builds a Server with metrics enabled per cfg.
|
||||
// Returns (URL, registry) so tests can both observe job durations
|
||||
// directly and exercise the HTTP gate.
|
||||
func newMetricsServer(t *testing.T, cfg config.Config) (string, *metrics.Registry, *store.Store) {
|
||||
t.Helper()
|
||||
dir := t.TempDir()
|
||||
|
||||
st, err := store.Open(context.Background(), filepath.Join(dir, "rm.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("store: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { _ = st.Close() })
|
||||
|
||||
keyPath := filepath.Join(dir, "secret.key")
|
||||
if err := crypto.GenerateKeyFile(keyPath); err != nil {
|
||||
t.Fatalf("genkey: %v", err)
|
||||
}
|
||||
key, _ := crypto.LoadKeyFromFile(keyPath)
|
||||
aead, _ := crypto.NewAEAD(key)
|
||||
|
||||
cfg.Listen = ":0"
|
||||
cfg.DataDir = dir
|
||||
cfg.SecretKeyFile = keyPath
|
||||
|
||||
reg := metrics.NewRegistry()
|
||||
deps := Deps{
|
||||
Cfg: cfg,
|
||||
Store: st,
|
||||
AEAD: aead,
|
||||
Metrics: reg,
|
||||
}
|
||||
s := New(deps)
|
||||
ts := httptest.NewServer(s.srv.Handler)
|
||||
t.Cleanup(ts.Close)
|
||||
return ts.URL, reg, st
|
||||
}
|
||||
|
||||
func TestMetricsRouteNotMountedByDefault(t *testing.T) {
|
||||
t.Parallel()
|
||||
url, _, _ := newMetricsServer(t, config.Config{})
|
||||
res, err := stdhttp.Get(url + "/metrics")
|
||||
if err != nil {
|
||||
t.Fatalf("GET: %v", err)
|
||||
}
|
||||
defer res.Body.Close()
|
||||
if res.StatusCode != stdhttp.StatusNotFound {
|
||||
t.Errorf("status: got %d, want 404 (route should not be mounted)", res.StatusCode)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMetricsTokenRequired(t *testing.T) {
|
||||
t.Parallel()
|
||||
url, _, _ := newMetricsServer(t, config.Config{
|
||||
MetricsToken: "the-token",
|
||||
})
|
||||
|
||||
// Missing token.
|
||||
res, err := stdhttp.Get(url + "/metrics")
|
||||
if err != nil {
|
||||
t.Fatalf("GET: %v", err)
|
||||
}
|
||||
defer res.Body.Close()
|
||||
if res.StatusCode != stdhttp.StatusUnauthorized {
|
||||
t.Errorf("no token: got %d", res.StatusCode)
|
||||
}
|
||||
if !strings.Contains(res.Header.Get("WWW-Authenticate"), "Bearer") {
|
||||
t.Errorf("WWW-Authenticate hint missing: %q", res.Header.Get("WWW-Authenticate"))
|
||||
}
|
||||
|
||||
// Wrong token.
|
||||
req, _ := stdhttp.NewRequest(stdhttp.MethodGet, url+"/metrics", nil)
|
||||
req.Header.Set("Authorization", "Bearer not-the-token")
|
||||
res2, err := stdhttp.DefaultClient.Do(req)
|
||||
if err != nil {
|
||||
t.Fatalf("GET: %v", err)
|
||||
}
|
||||
defer res2.Body.Close()
|
||||
if res2.StatusCode != stdhttp.StatusUnauthorized {
|
||||
t.Errorf("wrong token: got %d", res2.StatusCode)
|
||||
}
|
||||
|
||||
// Right token.
|
||||
req3, _ := stdhttp.NewRequest(stdhttp.MethodGet, url+"/metrics", nil)
|
||||
req3.Header.Set("Authorization", "Bearer the-token")
|
||||
res3, err3 := stdhttp.DefaultClient.Do(req3)
|
||||
if err3 != nil {
|
||||
t.Fatalf("GET: %v", err3)
|
||||
}
|
||||
defer res3.Body.Close()
|
||||
if res3.StatusCode != stdhttp.StatusOK {
|
||||
t.Errorf("right token: got %d", res3.StatusCode)
|
||||
}
|
||||
if ct := res3.Header.Get("Content-Type"); !strings.HasPrefix(ct, "text/plain") {
|
||||
t.Errorf("content-type: %q", ct)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMetricsCIDRGate(t *testing.T) {
|
||||
t.Parallel()
|
||||
// 127.0.0.1 is what httptest hits with; pick a CIDR that excludes it
|
||||
// to assert the "wrong source" branch.
|
||||
url, _, _ := newMetricsServer(t, config.Config{
|
||||
MetricsTrustedCIDRs: []string{"10.0.0.0/8"},
|
||||
})
|
||||
res, err := stdhttp.Get(url + "/metrics")
|
||||
if err != nil {
|
||||
t.Fatalf("GET: %v", err)
|
||||
}
|
||||
defer res.Body.Close()
|
||||
if res.StatusCode != stdhttp.StatusUnauthorized {
|
||||
t.Errorf("loopback hitting non-matching CIDR: got %d, want 401", res.StatusCode)
|
||||
}
|
||||
|
||||
// Now allow loopback.
|
||||
url2, _, _ := newMetricsServer(t, config.Config{
|
||||
MetricsTrustedCIDRs: []string{"127.0.0.0/8"},
|
||||
})
|
||||
res2, err := stdhttp.Get(url2 + "/metrics")
|
||||
if err != nil {
|
||||
t.Fatalf("GET: %v", err)
|
||||
}
|
||||
defer res2.Body.Close()
|
||||
if res2.StatusCode != stdhttp.StatusOK {
|
||||
t.Errorf("loopback in allow CIDR: got %d, want 200", res2.StatusCode)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMetricsTokenAndCIDRBothRequired(t *testing.T) {
|
||||
t.Parallel()
|
||||
url, _, _ := newMetricsServer(t, config.Config{
|
||||
MetricsToken: "the-token",
|
||||
MetricsTrustedCIDRs: []string{"127.0.0.0/8"},
|
||||
})
|
||||
// Token only — CIDR ok (loopback) but token missing.
|
||||
res, err := stdhttp.Get(url + "/metrics")
|
||||
if err != nil {
|
||||
t.Fatalf("GET: %v", err)
|
||||
}
|
||||
defer res.Body.Close()
|
||||
if res.StatusCode != stdhttp.StatusUnauthorized {
|
||||
t.Errorf("missing token but in CIDR: got %d", res.StatusCode)
|
||||
}
|
||||
|
||||
// Both right.
|
||||
req, _ := stdhttp.NewRequest(stdhttp.MethodGet, url+"/metrics", nil)
|
||||
req.Header.Set("Authorization", "Bearer the-token")
|
||||
res2, err := stdhttp.DefaultClient.Do(req)
|
||||
if err != nil {
|
||||
t.Fatalf("GET: %v", err)
|
||||
}
|
||||
defer res2.Body.Close()
|
||||
if res2.StatusCode != stdhttp.StatusOK {
|
||||
t.Errorf("both right: got %d", res2.StatusCode)
|
||||
}
|
||||
}
|
||||
|
||||
func readAll(t *testing.T, r io.Reader) string {
|
||||
t.Helper()
|
||||
b, err := io.ReadAll(r)
|
||||
if err != nil {
|
||||
t.Fatalf("read: %v", err)
|
||||
}
|
||||
return string(b)
|
||||
}
|
||||
|
||||
func TestMetricsBodyContainsExpectedLines(t *testing.T) {
|
||||
t.Parallel()
|
||||
url, reg, _ := newMetricsServer(t, config.Config{
|
||||
MetricsToken: "the-token",
|
||||
})
|
||||
reg.ObserveJob("backup", "succeeded", 0) // produce one histogram row
|
||||
|
||||
req, _ := stdhttp.NewRequest(stdhttp.MethodGet, url+"/metrics", nil)
|
||||
req.Header.Set("Authorization", "Bearer the-token")
|
||||
res, err := stdhttp.DefaultClient.Do(req)
|
||||
if err != nil {
|
||||
t.Fatalf("GET: %v", err)
|
||||
}
|
||||
defer res.Body.Close()
|
||||
body := readAll(t, res.Body)
|
||||
for _, want := range []string{
|
||||
"rm_hosts_total",
|
||||
"rm_hosts_online",
|
||||
`rm_active_alerts{severity="critical"}`,
|
||||
"rm_build_info{",
|
||||
"rm_job_duration_seconds_count{kind=\"backup\",status=\"succeeded\"}",
|
||||
} {
|
||||
if !strings.Contains(body, want) {
|
||||
t.Errorf("body missing %q\n--- body ---\n%s", want, body)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -17,6 +17,7 @@ import (
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/crypto"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/notification"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/config"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/metrics"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/oidc"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ui"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ws"
|
||||
@@ -56,6 +57,12 @@ type Deps struct {
|
||||
// OIDC (optional). Non-nil when the operator has configured an
|
||||
// IdP — handlers under /auth/oidc/* are mounted only when set.
|
||||
OIDC *oidc.Client
|
||||
// Metrics (optional). When non-nil the WS job-finished branch
|
||||
// records job durations and the /metrics handler can pull a
|
||||
// histogram snapshot. Independent of MetricsAuthEnabled — the
|
||||
// recorder runs even if the scrape endpoint is gated off, so a
|
||||
// later config flip doesn't lose the running window.
|
||||
Metrics *metrics.Registry
|
||||
}
|
||||
|
||||
// Server is the running HTTP server.
|
||||
@@ -131,12 +138,16 @@ func (s *Server) routes(r chi.Router) {
|
||||
r.Get("/agent/binary", s.handleAgentBinary)
|
||||
r.Get("/install/*", s.handleInstallAsset)
|
||||
r.Get("/api/version", s.handleVersion)
|
||||
if s.deps.Cfg.MetricsAuthEnabled() {
|
||||
r.Get("/metrics", s.handleMetrics)
|
||||
}
|
||||
if s.deps.Hub != nil {
|
||||
hd := ws.HandlerDeps{
|
||||
Hub: s.deps.Hub,
|
||||
Store: s.deps.Store,
|
||||
JobHub: s.deps.JobHub,
|
||||
AlertEngine: s.deps.AlertEngine,
|
||||
Metrics: s.deps.Metrics,
|
||||
OnHello: s.onAgentHello,
|
||||
OnScheduleAck: s.applyScheduleAck,
|
||||
OnScheduleFire: s.dispatchScheduledJob,
|
||||
|
||||
@@ -0,0 +1,301 @@
|
||||
// Package metrics owns the in-process Prometheus exposition for
|
||||
// the control plane. It deliberately avoids prometheus/client_golang
|
||||
// — the legacy text format is small and stable, and the repo's house
|
||||
// style is to keep dependency surface minimal.
|
||||
//
|
||||
// Two halves:
|
||||
//
|
||||
// - Registry holds a job-duration histogram. Server hooks call
|
||||
// Registry.ObserveJob from the WS job-finished branch.
|
||||
//
|
||||
// - Render emits a complete /metrics body from a Snapshot. The
|
||||
// Snapshot is a plain value bag; the HTTP handler assembles it
|
||||
// from store reads + Registry.Snapshot at scrape time. This
|
||||
// keeps the package free of any database or HTTP dependency.
|
||||
package metrics
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// JobDurationBuckets is the upper-bound ladder for the job duration
|
||||
// histogram, in seconds. Covers admin commands (unlock/init/check
|
||||
// finishing in seconds) up through hours-long backups; +Inf is
|
||||
// implicit.
|
||||
var JobDurationBuckets = []float64{1, 5, 30, 60, 300, 1800, 3600, 21600, 86400}
|
||||
|
||||
// Registry is the in-memory store for the job-duration histogram.
|
||||
// Concurrent observers and a single periodic snapshotter is the
|
||||
// expected access pattern; both are guarded by a mutex.
|
||||
type Registry struct {
|
||||
mu sync.Mutex
|
||||
jobs map[jobKey]*histogramState
|
||||
clock func() time.Time
|
||||
}
|
||||
|
||||
type jobKey struct{ kind, status string }
|
||||
|
||||
type histogramState struct {
|
||||
// counts[i] = number of observations <= JobDurationBuckets[i].
|
||||
// counts[len(JobDurationBuckets)] is the implicit +Inf bucket
|
||||
// (== total count, kept here for symmetry with the rendered
|
||||
// _bucket{le="+Inf"} line and as a sanity check).
|
||||
counts []uint64
|
||||
sum float64
|
||||
count uint64
|
||||
}
|
||||
|
||||
// NewRegistry builds an empty registry.
|
||||
func NewRegistry() *Registry {
|
||||
return &Registry{
|
||||
jobs: make(map[jobKey]*histogramState),
|
||||
clock: time.Now,
|
||||
}
|
||||
}
|
||||
|
||||
// ObserveJob records one job-duration sample. Negative durations
|
||||
// (clock-skew artefacts) are clamped to zero. Empty kind/status
|
||||
// strings are tolerated but degrade the dashboard — callers should
|
||||
// pass meaningful values.
|
||||
func (r *Registry) ObserveJob(kind, status string, dur time.Duration) {
|
||||
if r == nil {
|
||||
return
|
||||
}
|
||||
if dur < 0 {
|
||||
dur = 0
|
||||
}
|
||||
secs := dur.Seconds()
|
||||
|
||||
r.mu.Lock()
|
||||
defer r.mu.Unlock()
|
||||
k := jobKey{kind: kind, status: status}
|
||||
hs, ok := r.jobs[k]
|
||||
if !ok {
|
||||
hs = &histogramState{counts: make([]uint64, len(JobDurationBuckets)+1)}
|
||||
r.jobs[k] = hs
|
||||
}
|
||||
for i, ub := range JobDurationBuckets {
|
||||
if secs <= ub {
|
||||
hs.counts[i]++
|
||||
}
|
||||
}
|
||||
hs.counts[len(JobDurationBuckets)]++ // +Inf
|
||||
hs.sum += secs
|
||||
hs.count++
|
||||
}
|
||||
|
||||
// HistogramRow is one (kind,status) row in a Snapshot. Buckets is
|
||||
// the cumulative count per upper bound (matching JobDurationBuckets,
|
||||
// last element is the +Inf total).
|
||||
type HistogramRow struct {
|
||||
Kind string
|
||||
Status string
|
||||
Buckets []uint64
|
||||
Sum float64
|
||||
Count uint64
|
||||
}
|
||||
|
||||
// snapshotJobs returns a deterministic, sorted copy of the
|
||||
// histogram state. Sort order: kind asc, status asc.
|
||||
func (r *Registry) snapshotJobs() []HistogramRow {
|
||||
if r == nil {
|
||||
return nil
|
||||
}
|
||||
r.mu.Lock()
|
||||
defer r.mu.Unlock()
|
||||
rows := make([]HistogramRow, 0, len(r.jobs))
|
||||
for k, hs := range r.jobs {
|
||||
buckets := make([]uint64, len(hs.counts))
|
||||
copy(buckets, hs.counts)
|
||||
rows = append(rows, HistogramRow{
|
||||
Kind: k.kind,
|
||||
Status: k.status,
|
||||
Buckets: buckets,
|
||||
Sum: hs.sum,
|
||||
Count: hs.count,
|
||||
})
|
||||
}
|
||||
sort.Slice(rows, func(i, j int) bool {
|
||||
if rows[i].Kind != rows[j].Kind {
|
||||
return rows[i].Kind < rows[j].Kind
|
||||
}
|
||||
return rows[i].Status < rows[j].Status
|
||||
})
|
||||
return rows
|
||||
}
|
||||
|
||||
// HostRow is one host's projection for the per-host gauges.
|
||||
// Pointers carry "no value" semantics so we can omit a metric line
|
||||
// when, e.g., a host has never run a backup.
|
||||
type HostRow struct {
|
||||
ID string
|
||||
Name string
|
||||
Online bool
|
||||
LastBackupUnix *int64 // nil = no backup yet
|
||||
LastBackupSucceeded *bool // nil = no backup yet
|
||||
RepoSizeBytes *int64 // nil = no stats yet
|
||||
SnapshotCount int
|
||||
OpenAlertCount int
|
||||
RepoStatus string // "unknown" | "ready" | "init_failed"
|
||||
}
|
||||
|
||||
// Snapshot is a frozen view of the data needed to render /metrics.
|
||||
// Constructed by the HTTP handler from Store reads + Registry.snapshotJobs.
|
||||
type Snapshot struct {
|
||||
Hosts []HostRow
|
||||
HostsTotal int
|
||||
HostsOnline int
|
||||
AlertsBySeverity map[string]int // severity → count
|
||||
BuildVersion string
|
||||
BuildCommit string
|
||||
GoVersion string
|
||||
JobDurationRows []HistogramRow
|
||||
}
|
||||
|
||||
// SnapshotWith builds a Snapshot from raw inputs and the registry's
|
||||
// current job-duration state. Convenience for the HTTP handler.
|
||||
func (r *Registry) SnapshotWith(hosts []HostRow, alerts map[string]int, buildVer, commit, goVer string) Snapshot {
|
||||
online := 0
|
||||
for _, h := range hosts {
|
||||
if h.Online {
|
||||
online++
|
||||
}
|
||||
}
|
||||
return Snapshot{
|
||||
Hosts: hosts,
|
||||
HostsTotal: len(hosts),
|
||||
HostsOnline: online,
|
||||
AlertsBySeverity: alerts,
|
||||
BuildVersion: buildVer,
|
||||
BuildCommit: commit,
|
||||
GoVersion: goVer,
|
||||
JobDurationRows: r.snapshotJobs(),
|
||||
}
|
||||
}
|
||||
|
||||
// Render emits a complete Prometheus text-exposition body for s.
|
||||
// Output is deterministic: metric names appear in a fixed order and
|
||||
// labels within a metric are sorted by their first label value.
|
||||
func Render(w io.Writer, s Snapshot) error {
|
||||
var b strings.Builder
|
||||
|
||||
// --- Server gauges ---------------------------------------------------
|
||||
b.WriteString("# HELP rm_hosts_total Total number of enrolled hosts (excludes pending announces).\n")
|
||||
b.WriteString("# TYPE rm_hosts_total gauge\n")
|
||||
fmt.Fprintf(&b, "rm_hosts_total %d\n", s.HostsTotal)
|
||||
|
||||
b.WriteString("# HELP rm_hosts_online Number of hosts currently online (status='online').\n")
|
||||
b.WriteString("# TYPE rm_hosts_online gauge\n")
|
||||
fmt.Fprintf(&b, "rm_hosts_online %d\n", s.HostsOnline)
|
||||
|
||||
b.WriteString("# HELP rm_active_alerts Open alerts grouped by severity.\n")
|
||||
b.WriteString("# TYPE rm_active_alerts gauge\n")
|
||||
severities := []string{"info", "warning", "critical"}
|
||||
for _, sev := range severities {
|
||||
fmt.Fprintf(&b, "rm_active_alerts{severity=%q} %d\n", sev, s.AlertsBySeverity[sev])
|
||||
}
|
||||
|
||||
b.WriteString("# HELP rm_build_info Build identifying labels; value is always 1.\n")
|
||||
b.WriteString("# TYPE rm_build_info gauge\n")
|
||||
fmt.Fprintf(&b, "rm_build_info{version=%q,commit=%q,go_version=%q} 1\n",
|
||||
s.BuildVersion, s.BuildCommit, s.GoVersion)
|
||||
|
||||
// --- Per-host gauges -------------------------------------------------
|
||||
// Stable order: by host id.
|
||||
hosts := append([]HostRow(nil), s.Hosts...)
|
||||
sort.Slice(hosts, func(i, j int) bool { return hosts[i].ID < hosts[j].ID })
|
||||
|
||||
b.WriteString("# HELP rm_host_agent_online 1 if the agent is currently online, 0 otherwise.\n")
|
||||
b.WriteString("# TYPE rm_host_agent_online gauge\n")
|
||||
for _, h := range hosts {
|
||||
v := 0
|
||||
if h.Online {
|
||||
v = 1
|
||||
}
|
||||
fmt.Fprintf(&b, "rm_host_agent_online{host_id=%q,host=%q} %d\n",
|
||||
h.ID, h.Name, v)
|
||||
}
|
||||
|
||||
b.WriteString("# HELP rm_host_last_backup_timestamp_seconds Unix timestamp of the host's most recent backup. Omitted for hosts with no backup yet.\n")
|
||||
b.WriteString("# TYPE rm_host_last_backup_timestamp_seconds gauge\n")
|
||||
for _, h := range hosts {
|
||||
if h.LastBackupUnix == nil {
|
||||
continue
|
||||
}
|
||||
fmt.Fprintf(&b, "rm_host_last_backup_timestamp_seconds{host_id=%q,host=%q} %d\n",
|
||||
h.ID, h.Name, *h.LastBackupUnix)
|
||||
}
|
||||
|
||||
b.WriteString("# HELP rm_host_last_backup_success 1 if the host's most recent backup succeeded, 0 otherwise. Omitted for hosts with no backup yet.\n")
|
||||
b.WriteString("# TYPE rm_host_last_backup_success gauge\n")
|
||||
for _, h := range hosts {
|
||||
if h.LastBackupSucceeded == nil {
|
||||
continue
|
||||
}
|
||||
v := 0
|
||||
if *h.LastBackupSucceeded {
|
||||
v = 1
|
||||
}
|
||||
fmt.Fprintf(&b, "rm_host_last_backup_success{host_id=%q,host=%q} %d\n",
|
||||
h.ID, h.Name, v)
|
||||
}
|
||||
|
||||
b.WriteString("# HELP rm_host_repo_size_bytes Latest reported repo size from `restic stats --mode raw-data`. Omitted for hosts with no stats yet.\n")
|
||||
b.WriteString("# TYPE rm_host_repo_size_bytes gauge\n")
|
||||
for _, h := range hosts {
|
||||
if h.RepoSizeBytes == nil {
|
||||
continue
|
||||
}
|
||||
fmt.Fprintf(&b, "rm_host_repo_size_bytes{host_id=%q,host=%q} %d\n",
|
||||
h.ID, h.Name, *h.RepoSizeBytes)
|
||||
}
|
||||
|
||||
b.WriteString("# HELP rm_host_snapshot_count Number of restic snapshots known on the host's repo.\n")
|
||||
b.WriteString("# TYPE rm_host_snapshot_count gauge\n")
|
||||
for _, h := range hosts {
|
||||
fmt.Fprintf(&b, "rm_host_snapshot_count{host_id=%q,host=%q} %d\n",
|
||||
h.ID, h.Name, h.SnapshotCount)
|
||||
}
|
||||
|
||||
b.WriteString("# HELP rm_host_open_alerts Number of currently open alerts attached to this host.\n")
|
||||
b.WriteString("# TYPE rm_host_open_alerts gauge\n")
|
||||
for _, h := range hosts {
|
||||
fmt.Fprintf(&b, "rm_host_open_alerts{host_id=%q,host=%q} %d\n",
|
||||
h.ID, h.Name, h.OpenAlertCount)
|
||||
}
|
||||
|
||||
b.WriteString("# HELP rm_host_repo_status Repo readiness state for the host. Exactly one row per host with status label set.\n")
|
||||
b.WriteString("# TYPE rm_host_repo_status gauge\n")
|
||||
for _, h := range hosts {
|
||||
st := h.RepoStatus
|
||||
if st == "" {
|
||||
st = "unknown"
|
||||
}
|
||||
fmt.Fprintf(&b, "rm_host_repo_status{host_id=%q,host=%q,status=%q} 1\n",
|
||||
h.ID, h.Name, st)
|
||||
}
|
||||
|
||||
// --- Histogram -------------------------------------------------------
|
||||
b.WriteString("# HELP rm_job_duration_seconds End-to-end duration of completed jobs, by kind and terminal status.\n")
|
||||
b.WriteString("# TYPE rm_job_duration_seconds histogram\n")
|
||||
for _, row := range s.JobDurationRows {
|
||||
for i, ub := range JobDurationBuckets {
|
||||
fmt.Fprintf(&b, "rm_job_duration_seconds_bucket{kind=%q,status=%q,le=\"%g\"} %d\n",
|
||||
row.Kind, row.Status, ub, row.Buckets[i])
|
||||
}
|
||||
fmt.Fprintf(&b, "rm_job_duration_seconds_bucket{kind=%q,status=%q,le=\"+Inf\"} %d\n",
|
||||
row.Kind, row.Status, row.Buckets[len(JobDurationBuckets)])
|
||||
fmt.Fprintf(&b, "rm_job_duration_seconds_sum{kind=%q,status=%q} %g\n",
|
||||
row.Kind, row.Status, row.Sum)
|
||||
fmt.Fprintf(&b, "rm_job_duration_seconds_count{kind=%q,status=%q} %d\n",
|
||||
row.Kind, row.Status, row.Count)
|
||||
}
|
||||
|
||||
_, err := io.WriteString(w, b.String())
|
||||
return err
|
||||
}
|
||||
@@ -0,0 +1,182 @@
|
||||
package metrics
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"strings"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestObserveJobBuckets(t *testing.T) {
|
||||
r := NewRegistry()
|
||||
// Bucket boundaries: 1, 5, 30, 60, 300, 1800, 3600, 21600, 86400
|
||||
r.ObserveJob("backup", "succeeded", 500*time.Millisecond) // <= 1
|
||||
r.ObserveJob("backup", "succeeded", 30*time.Second) // == 30 (boundary)
|
||||
r.ObserveJob("backup", "succeeded", 90*time.Second) // > 60, <= 300
|
||||
r.ObserveJob("backup", "succeeded", 2*time.Hour) // > 3600 → 21600 bucket
|
||||
rows := r.snapshotJobs()
|
||||
if len(rows) != 1 {
|
||||
t.Fatalf("rows: %d", len(rows))
|
||||
}
|
||||
row := rows[0]
|
||||
if row.Count != 4 {
|
||||
t.Errorf("count: %d", row.Count)
|
||||
}
|
||||
wantSum := 0.5 + 30 + 90 + 7200.0
|
||||
if row.Sum != wantSum {
|
||||
t.Errorf("sum: got %v want %v", row.Sum, wantSum)
|
||||
}
|
||||
// Cumulative buckets:
|
||||
// le=1 → 1 (the 0.5s)
|
||||
// le=5 → 1
|
||||
// le=30 → 2 (boundary inclusive: 30s included)
|
||||
// le=60 → 2
|
||||
// le=300 → 3
|
||||
// le=1800 → 3
|
||||
// le=3600 → 3
|
||||
// le=21600 → 4
|
||||
// le=86400 → 4
|
||||
// le=+Inf → 4
|
||||
want := []uint64{1, 1, 2, 2, 3, 3, 3, 4, 4, 4}
|
||||
for i, w := range want {
|
||||
if row.Buckets[i] != w {
|
||||
t.Errorf("bucket[%d]=%d want %d", i, row.Buckets[i], w)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestObserveJobNegativeClampedToZero(t *testing.T) {
|
||||
r := NewRegistry()
|
||||
r.ObserveJob("backup", "succeeded", -5*time.Second)
|
||||
rows := r.snapshotJobs()
|
||||
if len(rows) != 1 || rows[0].Sum != 0 || rows[0].Count != 1 {
|
||||
t.Errorf("expected one zero-second observation, got %+v", rows)
|
||||
}
|
||||
}
|
||||
|
||||
func TestObserveJobConcurrent(t *testing.T) {
|
||||
r := NewRegistry()
|
||||
const goroutines = 16
|
||||
const each = 200
|
||||
var wg sync.WaitGroup
|
||||
for g := 0; g < goroutines; g++ {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for i := 0; i < each; i++ {
|
||||
r.ObserveJob("backup", "succeeded", time.Second)
|
||||
}
|
||||
}()
|
||||
}
|
||||
wg.Wait()
|
||||
rows := r.snapshotJobs()
|
||||
if len(rows) != 1 {
|
||||
t.Fatalf("rows: %d", len(rows))
|
||||
}
|
||||
if rows[0].Count != uint64(goroutines*each) {
|
||||
t.Errorf("count: got %d want %d", rows[0].Count, goroutines*each)
|
||||
}
|
||||
}
|
||||
|
||||
func TestObserveJobNilRegistryNoop(t *testing.T) {
|
||||
var r *Registry // nil
|
||||
r.ObserveJob("backup", "succeeded", time.Second)
|
||||
}
|
||||
|
||||
func TestRenderGolden(t *testing.T) {
|
||||
r := NewRegistry()
|
||||
r.ObserveJob("backup", "succeeded", 5*time.Second)
|
||||
r.ObserveJob("forget", "succeeded", 100*time.Millisecond)
|
||||
|
||||
pi64 := func(v int64) *int64 { return &v }
|
||||
pbool := func(v bool) *bool { return &v }
|
||||
|
||||
hosts := []HostRow{
|
||||
{
|
||||
ID: "01H0001", Name: "alpha",
|
||||
Online: true,
|
||||
LastBackupUnix: pi64(1700000000),
|
||||
LastBackupSucceeded: pbool(true),
|
||||
RepoSizeBytes: pi64(123456789),
|
||||
SnapshotCount: 42,
|
||||
OpenAlertCount: 0,
|
||||
RepoStatus: "ready",
|
||||
},
|
||||
{
|
||||
ID: "01H0002", Name: "bravo",
|
||||
Online: false,
|
||||
SnapshotCount: 0,
|
||||
OpenAlertCount: 1,
|
||||
RepoStatus: "init_failed",
|
||||
},
|
||||
}
|
||||
snap := r.SnapshotWith(hosts,
|
||||
map[string]int{"info": 0, "warning": 1, "critical": 0},
|
||||
"v1.2.3", "deadbeef", "go1.25.0")
|
||||
|
||||
var buf bytes.Buffer
|
||||
if err := Render(&buf, snap); err != nil {
|
||||
t.Fatalf("render: %v", err)
|
||||
}
|
||||
out := buf.String()
|
||||
|
||||
for _, want := range []string{
|
||||
"# HELP rm_hosts_total ",
|
||||
"rm_hosts_total 2\n",
|
||||
"rm_hosts_online 1\n",
|
||||
`rm_active_alerts{severity="warning"} 1`,
|
||||
`rm_active_alerts{severity="info"} 0`,
|
||||
`rm_active_alerts{severity="critical"} 0`,
|
||||
`rm_build_info{version="v1.2.3",commit="deadbeef",go_version="go1.25.0"} 1`,
|
||||
`rm_host_agent_online{host_id="01H0001",host="alpha"} 1`,
|
||||
`rm_host_agent_online{host_id="01H0002",host="bravo"} 0`,
|
||||
`rm_host_last_backup_timestamp_seconds{host_id="01H0001",host="alpha"} 1700000000`,
|
||||
`rm_host_last_backup_success{host_id="01H0001",host="alpha"} 1`,
|
||||
`rm_host_repo_size_bytes{host_id="01H0001",host="alpha"} 123456789`,
|
||||
`rm_host_snapshot_count{host_id="01H0001",host="alpha"} 42`,
|
||||
`rm_host_snapshot_count{host_id="01H0002",host="bravo"} 0`,
|
||||
`rm_host_open_alerts{host_id="01H0002",host="bravo"} 1`,
|
||||
`rm_host_repo_status{host_id="01H0001",host="alpha",status="ready"} 1`,
|
||||
`rm_host_repo_status{host_id="01H0002",host="bravo",status="init_failed"} 1`,
|
||||
`rm_job_duration_seconds_bucket{kind="backup",status="succeeded",le="1"} 0`,
|
||||
`rm_job_duration_seconds_bucket{kind="backup",status="succeeded",le="5"} 1`,
|
||||
`rm_job_duration_seconds_bucket{kind="backup",status="succeeded",le="+Inf"} 1`,
|
||||
`rm_job_duration_seconds_sum{kind="backup",status="succeeded"} 5`,
|
||||
`rm_job_duration_seconds_count{kind="backup",status="succeeded"} 1`,
|
||||
`rm_job_duration_seconds_bucket{kind="forget",status="succeeded",le="1"} 1`,
|
||||
} {
|
||||
if !strings.Contains(out, want) {
|
||||
t.Errorf("missing line:\n %s\n--- full output ---\n%s", want, out)
|
||||
}
|
||||
}
|
||||
|
||||
// bravo had no last backup → those metric lines must be absent for it.
|
||||
for _, ban := range []string{
|
||||
`rm_host_last_backup_timestamp_seconds{host_id="01H0002"`,
|
||||
`rm_host_last_backup_success{host_id="01H0002"`,
|
||||
`rm_host_repo_size_bytes{host_id="01H0002"`,
|
||||
} {
|
||||
if strings.Contains(out, ban) {
|
||||
t.Errorf("unexpected line for bravo: %q", ban)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestRenderEmptySnapshot(t *testing.T) {
|
||||
r := NewRegistry()
|
||||
snap := r.SnapshotWith(nil, nil, "dev", "", "go1.25.0")
|
||||
var buf bytes.Buffer
|
||||
if err := Render(&buf, snap); err != nil {
|
||||
t.Fatalf("render: %v", err)
|
||||
}
|
||||
out := buf.String()
|
||||
if !strings.Contains(out, "rm_hosts_total 0\n") {
|
||||
t.Errorf("missing zero-host gauge:\n%s", out)
|
||||
}
|
||||
// Histogram block has its HELP/TYPE but no rows. The HELP/TYPE
|
||||
// presence is correct and helps Prometheus pre-register the metric.
|
||||
if !strings.Contains(out, "# TYPE rm_job_duration_seconds histogram") {
|
||||
t.Errorf("histogram HELP/TYPE missing")
|
||||
}
|
||||
}
|
||||
@@ -15,6 +15,7 @@ import (
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/alert"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/auth"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/metrics"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
|
||||
)
|
||||
@@ -27,6 +28,9 @@ type HandlerDeps struct {
|
||||
// AlertEngine receives job-finished and host-online events so the
|
||||
// alert engine can evaluate its rules. Optional; nil = no-op.
|
||||
AlertEngine *alert.Engine
|
||||
// Metrics records job-duration observations on every terminal
|
||||
// status. Optional; nil = no-op (test fixtures pass nil).
|
||||
Metrics *metrics.Registry
|
||||
// UpdateWatcher reconciles in-flight agent-update dispatches against
|
||||
// hello envelopes. Optional; nil = no-op.
|
||||
UpdateWatcher *UpdateWatcher
|
||||
@@ -239,6 +243,13 @@ func dispatchAgentMessage(ctx context.Context, c *Conn, hostID string, env api.E
|
||||
slog.Warn("ws: set host last backup", "host_id", hostID, "err", err)
|
||||
}
|
||||
}
|
||||
// Job-duration histogram (P6-04). Skip when StartedAt is
|
||||
// missing (race: agent shipped finished without a started,
|
||||
// or the row predates this code).
|
||||
if deps.Metrics != nil && job.StartedAt != nil {
|
||||
deps.Metrics.ObserveJob(job.Kind, string(p.Status),
|
||||
p.FinishedAt.Sub(*job.StartedAt))
|
||||
}
|
||||
}
|
||||
if deps.JobHub != nil {
|
||||
deps.JobHub.Broadcast(p.JobID, env)
|
||||
|
||||
@@ -390,8 +390,45 @@ Sizes: **S** = under a day, **M** = 1–3 days, **L** = 3–7 days.
|
||||
> swap, helper `buildRepoTrendView` shared between page-load and
|
||||
> fragment endpoint). No new dependencies, no client JS, no agent
|
||||
> change. CI green; in-browser smoke walk-through pending operator.
|
||||
- [ ] **P6-04** (M) Prometheus `/metrics` endpoint: per-host gauges (last backup timestamp, last backup status, repo size, snapshot count, agent online), server gauges (active alerts, build info), job duration histograms; protected by bearer token or IP allow-list. _(Was P4-08.)_
|
||||
- [ ] **P6-05** (S) Document Prometheus integration + sample Grafana dashboard JSON. _(Was P4-09.)_
|
||||
- [x] **P6-04** (M) Prometheus `/metrics` endpoint: per-host gauges (last backup timestamp, last backup status, repo size, snapshot count, agent online), server gauges (active alerts, build info), job duration histograms; protected by bearer token or IP allow-list. _(Was P4-08.)_
|
||||
- [x] **P6-05** (S) Document Prometheus integration + sample Grafana dashboard JSON. _(Was P4-09.)_
|
||||
|
||||
> **As shipped (2026-05-07, branch `p6-04-05-prometheus-metrics`):**
|
||||
> Spec `docs/superpowers/specs/2026-05-07-p6-04-05-prometheus-metrics-design.md`,
|
||||
> plan `docs/superpowers/plans/2026-05-07-p6-04-05-prometheus-metrics.md`.
|
||||
> New `internal/server/metrics` package emits the legacy
|
||||
> `text/plain; version=0.0.4` exposition format directly — no
|
||||
> `prometheus/client_golang` dependency, matching the repo's
|
||||
> "no Tailwind, no Node" minimal-deps style. `/metrics` is **opt-in**:
|
||||
> `RM_METRICS_TOKEN` and/or `RM_METRICS_TRUSTED_CIDR` must be set or
|
||||
> the route isn't mounted at all (404). When both are set, both must
|
||||
> pass; either alone gates access. Token compare is constant-time.
|
||||
> CIDR check honours `X-Forwarded-For` only when the immediate hop
|
||||
> is a configured `RM_TRUSTED_PROXY` (mirrors the existing realIP
|
||||
> resolution).
|
||||
>
|
||||
> **Metrics:** per-host gauges (`rm_host_agent_online`,
|
||||
> `rm_host_last_backup_timestamp_seconds`, `rm_host_last_backup_success`,
|
||||
> `rm_host_repo_size_bytes`, `rm_host_snapshot_count`,
|
||||
> `rm_host_open_alerts`, `rm_host_repo_status`); server gauges
|
||||
> (`rm_hosts_total`, `rm_hosts_online`, `rm_active_alerts{severity}`,
|
||||
> `rm_build_info{version,commit,go_version}`); histogram
|
||||
> `rm_job_duration_seconds_bucket{kind,status,le}` with buckets
|
||||
> `1, 5, 30, 60, 300, 1800, 3600, 21600, 86400, +Inf`.
|
||||
> Histogram is in-memory; observations come from the existing
|
||||
> `MsgJobFinished` branch in `internal/server/ws/handler.go`.
|
||||
>
|
||||
> **Docs:** `docs/prometheus.md` covers enable + scrape config +
|
||||
> metric reference + dashboard import. **Dashboard:**
|
||||
> `deploy/grafana/restic-manager-dashboard.json` — six panels
|
||||
> (fleet status, open alerts, backups failing, hosts table, repo
|
||||
> size over time, job-duration p95). Schema 39, single Prometheus
|
||||
> datasource variable.
|
||||
>
|
||||
> **Tests:** golden-render + concurrent-observe + bucket-boundary
|
||||
> in the metrics package; auth matrix (no auth → 404; token
|
||||
> missing/wrong/right; CIDR matching/non-matching; token AND CIDR)
|
||||
> in the HTTP layer.
|
||||
|
||||
### Phase 6 acceptance
|
||||
|
||||
|
||||
Reference in New Issue
Block a user