73e733be61
CI / Test (rest) (pull_request) Successful in 41s
CI / Test (store) (pull_request) Successful in 43s
CI / Lint (pull_request) Successful in 29s
CI / Build (windows/amd64) (pull_request) Successful in 44s
CI / Test (server-http) (pull_request) Successful in 1m47s
CI / Build (linux/arm64) (pull_request) Successful in 43s
CI / Build (linux/amd64) (pull_request) Successful in 2m1s
New internal/server/metrics package emits the legacy text/plain exposition format directly, so we don't pull in prometheus/client_golang. Endpoint is opt-in via RM_METRICS_TOKEN and/or RM_METRICS_TRUSTED_CIDR; route is not mounted at all if neither gate is set. Both gates ANDed when both configured. Per-host gauges (online, last_backup_*, repo_size_bytes, snapshot_count, open_alerts, repo_status), server gauges (hosts_total/online, active_alerts by severity, build_info), and an in-memory job-duration histogram observed from the existing MsgJobFinished branch in the WS handler. Docs in docs/prometheus.md (enable + scrape config + metric reference + dashboard import). Sample dashboard at deploy/grafana/restic-manager-dashboard.json - six panels, Grafana schema 39, single Prometheus datasource variable. Tests: golden render, concurrent observe, bucket boundaries in the metrics package; auth matrix (no auth -> 404, token gate, CIDR gate, both required) in the HTTP layer.
183 lines
5.5 KiB
Go
183 lines
5.5 KiB
Go
package metrics
|
|
|
|
import (
|
|
"bytes"
|
|
"strings"
|
|
"sync"
|
|
"testing"
|
|
"time"
|
|
)
|
|
|
|
func TestObserveJobBuckets(t *testing.T) {
|
|
r := NewRegistry()
|
|
// Bucket boundaries: 1, 5, 30, 60, 300, 1800, 3600, 21600, 86400
|
|
r.ObserveJob("backup", "succeeded", 500*time.Millisecond) // <= 1
|
|
r.ObserveJob("backup", "succeeded", 30*time.Second) // == 30 (boundary)
|
|
r.ObserveJob("backup", "succeeded", 90*time.Second) // > 60, <= 300
|
|
r.ObserveJob("backup", "succeeded", 2*time.Hour) // > 3600 → 21600 bucket
|
|
rows := r.snapshotJobs()
|
|
if len(rows) != 1 {
|
|
t.Fatalf("rows: %d", len(rows))
|
|
}
|
|
row := rows[0]
|
|
if row.Count != 4 {
|
|
t.Errorf("count: %d", row.Count)
|
|
}
|
|
wantSum := 0.5 + 30 + 90 + 7200.0
|
|
if row.Sum != wantSum {
|
|
t.Errorf("sum: got %v want %v", row.Sum, wantSum)
|
|
}
|
|
// Cumulative buckets:
|
|
// le=1 → 1 (the 0.5s)
|
|
// le=5 → 1
|
|
// le=30 → 2 (boundary inclusive: 30s included)
|
|
// le=60 → 2
|
|
// le=300 → 3
|
|
// le=1800 → 3
|
|
// le=3600 → 3
|
|
// le=21600 → 4
|
|
// le=86400 → 4
|
|
// le=+Inf → 4
|
|
want := []uint64{1, 1, 2, 2, 3, 3, 3, 4, 4, 4}
|
|
for i, w := range want {
|
|
if row.Buckets[i] != w {
|
|
t.Errorf("bucket[%d]=%d want %d", i, row.Buckets[i], w)
|
|
}
|
|
}
|
|
}
|
|
|
|
func TestObserveJobNegativeClampedToZero(t *testing.T) {
|
|
r := NewRegistry()
|
|
r.ObserveJob("backup", "succeeded", -5*time.Second)
|
|
rows := r.snapshotJobs()
|
|
if len(rows) != 1 || rows[0].Sum != 0 || rows[0].Count != 1 {
|
|
t.Errorf("expected one zero-second observation, got %+v", rows)
|
|
}
|
|
}
|
|
|
|
func TestObserveJobConcurrent(t *testing.T) {
|
|
r := NewRegistry()
|
|
const goroutines = 16
|
|
const each = 200
|
|
var wg sync.WaitGroup
|
|
for g := 0; g < goroutines; g++ {
|
|
wg.Add(1)
|
|
go func() {
|
|
defer wg.Done()
|
|
for i := 0; i < each; i++ {
|
|
r.ObserveJob("backup", "succeeded", time.Second)
|
|
}
|
|
}()
|
|
}
|
|
wg.Wait()
|
|
rows := r.snapshotJobs()
|
|
if len(rows) != 1 {
|
|
t.Fatalf("rows: %d", len(rows))
|
|
}
|
|
if rows[0].Count != uint64(goroutines*each) {
|
|
t.Errorf("count: got %d want %d", rows[0].Count, goroutines*each)
|
|
}
|
|
}
|
|
|
|
func TestObserveJobNilRegistryNoop(t *testing.T) {
|
|
var r *Registry // nil
|
|
r.ObserveJob("backup", "succeeded", time.Second)
|
|
}
|
|
|
|
func TestRenderGolden(t *testing.T) {
|
|
r := NewRegistry()
|
|
r.ObserveJob("backup", "succeeded", 5*time.Second)
|
|
r.ObserveJob("forget", "succeeded", 100*time.Millisecond)
|
|
|
|
pi64 := func(v int64) *int64 { return &v }
|
|
pbool := func(v bool) *bool { return &v }
|
|
|
|
hosts := []HostRow{
|
|
{
|
|
ID: "01H0001", Name: "alpha",
|
|
Online: true,
|
|
LastBackupUnix: pi64(1700000000),
|
|
LastBackupSucceeded: pbool(true),
|
|
RepoSizeBytes: pi64(123456789),
|
|
SnapshotCount: 42,
|
|
OpenAlertCount: 0,
|
|
RepoStatus: "ready",
|
|
},
|
|
{
|
|
ID: "01H0002", Name: "bravo",
|
|
Online: false,
|
|
SnapshotCount: 0,
|
|
OpenAlertCount: 1,
|
|
RepoStatus: "init_failed",
|
|
},
|
|
}
|
|
snap := r.SnapshotWith(hosts,
|
|
map[string]int{"info": 0, "warning": 1, "critical": 0},
|
|
"v1.2.3", "deadbeef", "go1.25.0")
|
|
|
|
var buf bytes.Buffer
|
|
if err := Render(&buf, snap); err != nil {
|
|
t.Fatalf("render: %v", err)
|
|
}
|
|
out := buf.String()
|
|
|
|
for _, want := range []string{
|
|
"# HELP rm_hosts_total ",
|
|
"rm_hosts_total 2\n",
|
|
"rm_hosts_online 1\n",
|
|
`rm_active_alerts{severity="warning"} 1`,
|
|
`rm_active_alerts{severity="info"} 0`,
|
|
`rm_active_alerts{severity="critical"} 0`,
|
|
`rm_build_info{version="v1.2.3",commit="deadbeef",go_version="go1.25.0"} 1`,
|
|
`rm_host_agent_online{host_id="01H0001",host="alpha"} 1`,
|
|
`rm_host_agent_online{host_id="01H0002",host="bravo"} 0`,
|
|
`rm_host_last_backup_timestamp_seconds{host_id="01H0001",host="alpha"} 1700000000`,
|
|
`rm_host_last_backup_success{host_id="01H0001",host="alpha"} 1`,
|
|
`rm_host_repo_size_bytes{host_id="01H0001",host="alpha"} 123456789`,
|
|
`rm_host_snapshot_count{host_id="01H0001",host="alpha"} 42`,
|
|
`rm_host_snapshot_count{host_id="01H0002",host="bravo"} 0`,
|
|
`rm_host_open_alerts{host_id="01H0002",host="bravo"} 1`,
|
|
`rm_host_repo_status{host_id="01H0001",host="alpha",status="ready"} 1`,
|
|
`rm_host_repo_status{host_id="01H0002",host="bravo",status="init_failed"} 1`,
|
|
`rm_job_duration_seconds_bucket{kind="backup",status="succeeded",le="1"} 0`,
|
|
`rm_job_duration_seconds_bucket{kind="backup",status="succeeded",le="5"} 1`,
|
|
`rm_job_duration_seconds_bucket{kind="backup",status="succeeded",le="+Inf"} 1`,
|
|
`rm_job_duration_seconds_sum{kind="backup",status="succeeded"} 5`,
|
|
`rm_job_duration_seconds_count{kind="backup",status="succeeded"} 1`,
|
|
`rm_job_duration_seconds_bucket{kind="forget",status="succeeded",le="1"} 1`,
|
|
} {
|
|
if !strings.Contains(out, want) {
|
|
t.Errorf("missing line:\n %s\n--- full output ---\n%s", want, out)
|
|
}
|
|
}
|
|
|
|
// bravo had no last backup → those metric lines must be absent for it.
|
|
for _, ban := range []string{
|
|
`rm_host_last_backup_timestamp_seconds{host_id="01H0002"`,
|
|
`rm_host_last_backup_success{host_id="01H0002"`,
|
|
`rm_host_repo_size_bytes{host_id="01H0002"`,
|
|
} {
|
|
if strings.Contains(out, ban) {
|
|
t.Errorf("unexpected line for bravo: %q", ban)
|
|
}
|
|
}
|
|
}
|
|
|
|
func TestRenderEmptySnapshot(t *testing.T) {
|
|
r := NewRegistry()
|
|
snap := r.SnapshotWith(nil, nil, "dev", "", "go1.25.0")
|
|
var buf bytes.Buffer
|
|
if err := Render(&buf, snap); err != nil {
|
|
t.Fatalf("render: %v", err)
|
|
}
|
|
out := buf.String()
|
|
if !strings.Contains(out, "rm_hosts_total 0\n") {
|
|
t.Errorf("missing zero-host gauge:\n%s", out)
|
|
}
|
|
// Histogram block has its HELP/TYPE but no rows. The HELP/TYPE
|
|
// presence is correct and helps Prometheus pre-register the metric.
|
|
if !strings.Contains(out, "# TYPE rm_job_duration_seconds histogram") {
|
|
t.Errorf("histogram HELP/TYPE missing")
|
|
}
|
|
}
|