Merge pull request 'P2 redesign Phase 5 — prune/check/unlock + maintenance ticker + repo stats + pending-runs queue' (#3) from p2r-phase5-maintenance into main

Reviewed-on: #3
This commit is contained in:
2026-05-04 09:25:00 +00:00
51 changed files with 6247 additions and 328 deletions
Binary file not shown.

After

Width:  |  Height:  |  Size: 213 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 67 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 67 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 220 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 31 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 35 KiB

+136 -44
View File
@@ -2,13 +2,13 @@ package main
import (
"context"
"encoding/json"
"errors"
"flag"
"fmt"
"log/slog"
"os"
"os/signal"
"strconv"
"syscall"
"time"
@@ -199,32 +199,68 @@ func (d *dispatcher) handle(ctx context.Context, env api.Envelope, tx wsclient.S
case api.MsgConfigUpdate:
var p api.ConfigUpdatePayload
_ = env.UnmarshalPayload(&p)
// Merge with whatever's already in secrets.enc — empty fields
// in the push mean "leave alone." Atomic write underneath.
cur, err := d.secrets.Load()
if err != nil {
slog.Error("ws agent: load secrets for merge", "err", err)
return nil
slot := p.Slot
if slot == "" {
slot = "repo"
}
changed := false
if p.RepoURL != "" && p.RepoURL != cur.URL {
cur.URL = p.RepoURL
changed = true
}
if p.RepoUsername != "" && p.RepoUsername != cur.Username {
cur.Username = p.RepoUsername
changed = true
}
if p.RepoPassword != "" && p.RepoPassword != cur.Password {
cur.Password = p.RepoPassword
changed = true
}
if changed {
if err := d.secrets.Save(cur); err != nil {
slog.Error("ws agent: persist secrets", "err", err)
switch slot {
case "repo":
// Merge with whatever's already in secrets.enc — empty fields
// in the push mean "leave alone." Atomic write underneath.
cur, err := d.secrets.Load()
if err != nil {
slog.Error("ws agent: load secrets for merge", "err", err)
return nil
}
slog.Info("ws agent: repo credentials updated via config.update")
changed := false
if p.RepoURL != "" && p.RepoURL != cur.URL {
cur.URL = p.RepoURL
changed = true
}
if p.RepoUsername != "" && p.RepoUsername != cur.Username {
cur.Username = p.RepoUsername
changed = true
}
if p.RepoPassword != "" && p.RepoPassword != cur.Password {
cur.Password = p.RepoPassword
changed = true
}
if changed {
if err := d.secrets.Save(cur); err != nil {
slog.Error("ws agent: persist secrets", "err", err)
return nil
}
slog.Info("ws agent: repo credentials updated via config.update")
}
case "admin":
cur, err := d.secrets.LoadAdmin()
if err != nil && !errors.Is(err, secrets.ErrNoAdmin) {
slog.Error("ws agent: load admin secrets", "err", err)
return nil
}
// ErrNoAdmin is not an error here — we are creating the slot.
changed := false
if p.RepoURL != "" && p.RepoURL != cur.URL {
cur.URL = p.RepoURL
changed = true
}
if p.RepoUsername != "" && p.RepoUsername != cur.Username {
cur.Username = p.RepoUsername
changed = true
}
if p.RepoPassword != "" && p.RepoPassword != cur.Password {
cur.Password = p.RepoPassword
changed = true
}
if changed {
if err := d.secrets.SaveAdmin(cur); err != nil {
slog.Error("ws agent: persist admin secrets", "err", err)
return nil
}
slog.Info("ws agent: admin credentials updated via config.update")
}
default:
slog.Warn("ws agent: unknown config.update slot, ignoring", "slot", p.Slot)
}
case api.MsgAgentUpdateAvail:
@@ -251,6 +287,14 @@ func (d *dispatcher) runJob(ctx context.Context, p api.CommandRunPayload, tx wsc
if creds.Empty() {
return fmt.Errorf("repo credentials not configured (waiting for server config.update push)")
}
// r is the everyday runner — bound to the host's repo
// (append-only) credentials. Reused by every kind except
// JobPrune, which builds its own runner against the
// admin-credentials slot when p.RequiresAdminCreds is set
// (admin creds are not loaded for any other kind, so they're
// not on r). If you find yourself adding a new JobKind that
// needs delete authority, mirror the JobPrune pattern below
// — don't try to overload r.
r := runner.New(runner.Config{
ResticBin: d.resticBin,
RepoURL: creds.URL,
@@ -291,33 +335,81 @@ func (d *dispatcher) runJob(ctx context.Context, p api.CommandRunPayload, tx wsc
slog.Info("agent: init job complete", "job_id", p.JobID)
}()
case api.JobForget:
var policy restic.ForgetPolicy
if len(p.RetentionPolicy) > 0 {
var raw struct {
KeepLast *int `json:"keep_last,omitempty"`
KeepHourly *int `json:"keep_hourly,omitempty"`
KeepDaily *int `json:"keep_daily,omitempty"`
KeepWeekly *int `json:"keep_weekly,omitempty"`
KeepMonthly *int `json:"keep_monthly,omitempty"`
KeepYearly *int `json:"keep_yearly,omitempty"`
}
if err := json.Unmarshal(p.RetentionPolicy, &raw); err != nil {
return fmt.Errorf("forget: decode retention_policy: %w", err)
}
policy = restic.ForgetPolicy{
KeepLast: raw.KeepLast, KeepHourly: raw.KeepHourly,
KeepDaily: raw.KeepDaily, KeepWeekly: raw.KeepWeekly,
KeepMonthly: raw.KeepMonthly, KeepYearly: raw.KeepYearly,
}
if len(p.ForgetGroups) == 0 {
// Hard-error rather than fall back to a single-policy form:
// the server-side dispatch path (maintenance ticker) is the
// only writer of forget command.run today, and it always
// populates ForgetGroups. A backwards-compatible single-
// policy fallback was specced but skipped — see the
// Phase 5 plan rationale and version.go's lockstep-deploy
// note for why.
return fmt.Errorf("forget: command.run carried no forget_groups (server didn't populate them)")
}
slog.Info("agent: accepting forget job", "job_id", p.JobID, "policy", p.RetentionPolicy)
groups := make([]restic.ForgetGroup, 0, len(p.ForgetGroups))
for _, g := range p.ForgetGroups {
groups = append(groups, restic.ForgetGroup{
Tag: g.Tag,
Policy: restic.ForgetPolicy{
KeepLast: g.Policy.KeepLast,
KeepHourly: g.Policy.KeepHourly,
KeepDaily: g.Policy.KeepDaily,
KeepWeekly: g.Policy.KeepWeekly,
KeepMonthly: g.Policy.KeepMonthly,
KeepYearly: g.Policy.KeepYearly,
},
})
}
slog.Info("agent: accepting forget job", "job_id", p.JobID, "groups", len(groups))
go func() {
if err := r.RunForget(ctx, p.JobID, policy); err != nil {
if err := r.RunForget(ctx, p.JobID, groups); err != nil {
slog.Warn("agent: forget job failed", "job_id", p.JobID, "err", err)
return
}
slog.Info("agent: forget job complete", "job_id", p.JobID)
}()
case api.JobPrune:
// Prune may require admin creds (delete authority on rest-server).
runCreds := creds
if p.RequiresAdminCreds {
ac, err := d.secrets.LoadAdmin()
if err != nil {
return fmt.Errorf("prune: admin creds not configured (server didn't push them): %w", err)
}
if ac.Empty() {
return fmt.Errorf("prune: admin creds incomplete")
}
runCreds = ac
}
prr := runner.New(runner.Config{
ResticBin: d.resticBin,
RepoURL: runCreds.URL,
RepoUsername: runCreds.Username,
RepoPassword: runCreds.Password,
}, tx, time.Second)
slog.Info("agent: accepting prune job", "job_id", p.JobID, "admin_creds", p.RequiresAdminCreds)
go func() {
if err := prr.RunPrune(ctx, p.JobID); err != nil {
slog.Warn("agent: prune job failed", "job_id", p.JobID, "err", err)
}
}()
case api.JobCheck:
subset := 0
if len(p.Args) > 0 {
subset, _ = strconv.Atoi(p.Args[0])
}
slog.Info("agent: accepting check job", "job_id", p.JobID, "subset_pct", subset)
go func() {
if err := r.RunCheck(ctx, p.JobID, subset); err != nil {
slog.Warn("agent: check job failed", "job_id", p.JobID, "err", err)
}
}()
case api.JobUnlock:
slog.Info("agent: accepting unlock job", "job_id", p.JobID)
go func() {
if err := r.RunUnlock(ctx, p.JobID); err != nil {
slog.Warn("agent: unlock job failed", "job_id", p.JobID, "err", err)
}
}()
default:
return fmt.Errorf("kind %q not implemented yet (Phase 2 lands the rest)", p.Kind)
}
+30
View File
@@ -16,6 +16,7 @@ import (
"gitea.dcglab.co.uk/steve/restic-manager/internal/crypto"
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/config"
rmhttp "gitea.dcglab.co.uk/steve/restic-manager/internal/server/http"
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/maintenance"
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ui"
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ws"
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
@@ -139,6 +140,23 @@ func run() error {
defer purgeTick.Stop()
offlineTick := time.NewTicker(30 * time.Second)
defer offlineTick.Stop()
// Maintenance ticker: drives forget/prune/check on the cadences
// operators set per-host. Independent of the agent's local cron
// (which only handles backup schedules). 60s cadence — the cron
// expressions are minute-grained, so anything finer is wasted
// work.
maintenanceTick := time.NewTicker(60 * time.Second)
defer maintenanceTick.Stop()
// Pending-runs drain ticker: 30s cadence sweeps every host with
// pending_runs rows whose next_attempt_at <= now (rows accumulate
// when a schedule.fire's command.run send fails because the agent
// dropped offline mid-flight). The on-reconnect path in
// onAgentHello handles the common case; this ticker is the
// safety-net for hosts that come back without a fresh hello (they
// shouldn't, but the queue exists either way).
pendingDrainTick := time.NewTicker(30 * time.Second)
defer pendingDrainTick.Stop()
mt := maintenance.New(st)
go func() {
for {
select {
@@ -156,6 +174,18 @@ func run() error {
if n, err := st.MarkHostsOfflineStale(ctx, cutoff); err == nil && n > 0 {
slog.Info("marked hosts offline (stale heartbeat)", "n", n)
}
case <-pendingDrainTick.C:
srv.DrainAllDue(ctx)
case <-maintenanceTick.C:
decisions, err := mt.Decide(ctx, time.Now().UTC())
if err != nil {
slog.Warn("maintenance ticker: decide", "err", err)
continue
}
if len(decisions) > 0 {
slog.Info("maintenance ticker: dispatching", "n", len(decisions))
srv.DispatchMaintenance(ctx, decisions)
}
}
}
}()
+202 -122
View File
@@ -51,24 +51,70 @@ func New(cfg Config, tx Sender, progressMinPeriod time.Duration) *Runner {
return &Runner{cfg: cfg, tx: tx, progressMinPeriod: progressMinPeriod}
}
// RunBackup executes a backup job and reports back via the sender.
// Returns nil on a clean (or "incomplete-but-snapshot-created") finish.
func (r *Runner) RunBackup(ctx context.Context, jobID string, paths, excludes, tags []string) error {
startedAt := time.Now().UTC()
startEnv, _ := api.Marshal(api.MsgJobStarted, jobID, api.JobStartedPayload{
JobID: jobID, Kind: api.JobBackup, StartedAt: startedAt,
})
if err := r.tx.Send(startEnv); err != nil {
slog.Warn("runner: send job.started", "err", err)
}
env := restic.Env{
// resticEnv assembles the restic.Env every Run* method executes
// against. All four fields are copied straight from the Runner's
// Config; nothing else on the Env is set here.
func (r *Runner) resticEnv() restic.Env {
	c := &r.cfg

	var env restic.Env
	env.Bin = c.ResticBin
	env.RepoURL = c.RepoURL
	env.RepoUsername = c.RepoUsername
	env.RepoPassword = c.RepoPassword
	return env
}
// sendStarted announces the beginning of a job by shipping a
// job.started envelope carrying the job's ID, kind and start time.
// A failed Send is logged at warn level and otherwise ignored, so the
// job still runs when the announcement cannot be delivered.
func (r *Runner) sendStarted(jobID string, kind api.JobKind, startedAt time.Time) {
	payload := api.JobStartedPayload{
		JobID:     jobID,
		Kind:      kind,
		StartedAt: startedAt,
	}
	// The Marshal error is discarded here; only the Send result is inspected.
	started, _ := api.Marshal(api.MsgJobStarted, jobID, payload)

	sendErr := r.tx.Send(started)
	if sendErr == nil {
		return
	}
	slog.Warn("runner: send job.started", "job_id", jobID, "kind", kind, "err", sendErr)
}
// streamHandler builds the restic.LineHandler used to forward process
// output to the server. Every line becomes one log.stream envelope
// stamped with the current UTC time and the next value of seq, which
// the caller owns and shares across the whole job. The third handler
// argument is ignored. Delivery is best-effort: Marshal and Send
// errors are dropped.
func (r *Runner) streamHandler(jobID string, seq *atomic.Int64) restic.LineHandler {
	forward := func(stream, line string, _ any) {
		ts := time.Now().UTC()
		n := seq.Add(1)

		entry := api.LogStreamLine{
			JobID:   jobID,
			Seq:     n,
			TS:      ts,
			Stream:  api.LogStream(stream),
			Payload: line,
		}
		out, _ := api.Marshal(api.MsgLogStream, "", entry)
		_ = r.tx.Send(out)
	}
	return forward
}
// sendFinished ships the terminal job.finished envelope for jobID.
//
// err == nil is reported as api.JobSucceeded with exit code 0; any
// non-nil err is reported as api.JobFailed with exit code -1 and
// err.Error() as the error message. statsBlob is forwarded unchanged
// as JobFinishedPayload.Stats and may be nil.
//
// Delivery stays best-effort (there is no return value), but failures
// are no longer dropped silently: a Marshal or Send error is logged at
// warn level, in line with sendStarted. When marshalling fails nothing
// is sent, matching reportStats, rather than shipping whatever
// envelope value Marshal returned alongside its error.
func (r *Runner) sendFinished(jobID string, finishedAt time.Time, err error, statsBlob json.RawMessage) {
	payload := api.JobFinishedPayload{
		JobID:      jobID,
		Status:     api.JobSucceeded,
		ExitCode:   0,
		FinishedAt: finishedAt,
		Stats:      statsBlob,
	}
	if err != nil {
		payload.Status = api.JobFailed
		payload.ExitCode = -1
		payload.Error = err.Error()
	}

	finEnv, merr := api.Marshal(api.MsgJobFinished, jobID, payload)
	if merr != nil {
		slog.Warn("runner: marshal job.finished", "job_id", jobID, "err", merr)
		return
	}
	if serr := r.tx.Send(finEnv); serr != nil {
		// Without this envelope the receiver never learns the job ended,
		// so make the loss visible in the agent log.
		slog.Warn("runner: send job.finished", "job_id", jobID, "status", payload.Status, "err", serr)
	}
}
// RunBackup executes a backup job and reports back via the sender.
// Returns nil on a clean (or "incomplete-but-snapshot-created") finish.
func (r *Runner) RunBackup(ctx context.Context, jobID string, paths, excludes, tags []string) error {
startedAt := time.Now().UTC()
r.sendStarted(jobID, api.JobBackup, startedAt)
env := r.resticEnv()
var seq atomic.Int64
lastProgress := time.Now()
@@ -115,27 +161,11 @@ func (r *Runner) RunBackup(ctx context.Context, jobID string, paths, excludes, t
summary, err := env.RunBackup(ctx, paths, excludes, tags, handle)
finishedAt := time.Now().UTC()
status := api.JobSucceeded
exit := 0
errMsg := ""
if err != nil {
status = api.JobFailed
exit = -1
errMsg = err.Error()
}
var statsBlob json.RawMessage
if summary != nil {
statsBlob, _ = json.Marshal(summary)
}
finEnv, _ := api.Marshal(api.MsgJobFinished, jobID, api.JobFinishedPayload{
JobID: jobID,
Status: status,
ExitCode: exit,
FinishedAt: finishedAt,
Stats: statsBlob,
Error: errMsg,
})
_ = r.tx.Send(finEnv)
r.sendFinished(jobID, finishedAt, err, statsBlob)
// On a successful backup, refresh the server's snapshot projection.
// We do this *after* job.finished so the UI sees the job land first;
@@ -147,6 +177,9 @@ func (r *Runner) RunBackup(ctx context.Context, jobID string, paths, excludes, t
if rerr := r.reportSnapshots(ctx, env); rerr != nil {
slog.Warn("runner: snapshots.report failed", "job_id", jobID, "err", rerr)
}
if rerr := r.reportStats(ctx, env, api.RepoStatsPayload{}); rerr != nil {
slog.Warn("runner: stats.report after backup failed", "job_id", jobID, "err", rerr)
}
}
if err != nil {
@@ -160,111 +193,35 @@ func (r *Runner) RunBackup(ctx context.Context, jobID string, paths, excludes, t
// browser-side log viewer just works.
func (r *Runner) RunInit(ctx context.Context, jobID string) error {
startedAt := time.Now().UTC()
startEnv, _ := api.Marshal(api.MsgJobStarted, jobID, api.JobStartedPayload{
JobID: jobID, Kind: api.JobInit, StartedAt: startedAt,
})
if err := r.tx.Send(startEnv); err != nil {
slog.Warn("runner: send job.started (init)", "err", err)
}
env := restic.Env{
Bin: r.cfg.ResticBin,
RepoURL: r.cfg.RepoURL,
RepoUsername: r.cfg.RepoUsername,
RepoPassword: r.cfg.RepoPassword,
}
r.sendStarted(jobID, api.JobInit, startedAt)
env := r.resticEnv()
var seq atomic.Int64
handle := func(stream string, line string, _ any) {
now := time.Now().UTC()
logEnv, _ := api.Marshal(api.MsgLogStream, "", api.LogStreamLine{
JobID: jobID,
Seq: seq.Add(1),
TS: now,
Stream: api.LogStream(stream),
Payload: line,
})
_ = r.tx.Send(logEnv)
}
err := env.RunInit(ctx, handle)
err := env.RunInit(ctx, r.streamHandler(jobID, &seq))
finishedAt := time.Now().UTC()
status := api.JobSucceeded
exit := 0
errMsg := ""
if err != nil {
status = api.JobFailed
exit = -1
errMsg = err.Error()
}
finEnv, _ := api.Marshal(api.MsgJobFinished, jobID, api.JobFinishedPayload{
JobID: jobID,
Status: status,
ExitCode: exit,
FinishedAt: finishedAt,
Error: errMsg,
})
_ = r.tx.Send(finEnv)
r.sendFinished(jobID, finishedAt, err, nil)
if err != nil {
return fmt.Errorf("runner init: %w", err)
}
return nil
}
// RunForget executes a forget job against the configured repo with
// the given retention policy. Same envelope shape as RunBackup so
// the live log viewer + job lifecycle work without special-casing.
// On success refreshes the snapshot projection (forget rewrites the
// snapshot index — the host's snapshot list shrinks).
func (r *Runner) RunForget(ctx context.Context, jobID string, policy restic.ForgetPolicy) error {
// RunForget executes a forget job against the configured repo by
// invoking `restic forget --tag <Tag> --keep-* …` once per group.
// Same envelope shape as RunBackup so the live log viewer + job
// lifecycle work without special-casing. On success refreshes the
// snapshot projection (forget rewrites the snapshot index — the
// host's snapshot list shrinks). Snapshot refresh runs once after
// every group completes, not per-group.
func (r *Runner) RunForget(ctx context.Context, jobID string, groups []restic.ForgetGroup) error {
startedAt := time.Now().UTC()
startEnv, _ := api.Marshal(api.MsgJobStarted, jobID, api.JobStartedPayload{
JobID: jobID, Kind: api.JobForget, StartedAt: startedAt,
})
if err := r.tx.Send(startEnv); err != nil {
slog.Warn("runner: send job.started (forget)", "err", err)
}
env := restic.Env{
Bin: r.cfg.ResticBin,
RepoURL: r.cfg.RepoURL,
RepoUsername: r.cfg.RepoUsername,
RepoPassword: r.cfg.RepoPassword,
}
r.sendStarted(jobID, api.JobForget, startedAt)
env := r.resticEnv()
var seq atomic.Int64
handle := func(stream string, line string, _ any) {
now := time.Now().UTC()
logEnv, _ := api.Marshal(api.MsgLogStream, "", api.LogStreamLine{
JobID: jobID,
Seq: seq.Add(1),
TS: now,
Stream: api.LogStream(stream),
Payload: line,
})
_ = r.tx.Send(logEnv)
}
err := env.RunForget(ctx, policy, handle)
err := env.RunForget(ctx, groups, r.streamHandler(jobID, &seq))
finishedAt := time.Now().UTC()
status := api.JobSucceeded
exit := 0
errMsg := ""
if err != nil {
status = api.JobFailed
exit = -1
errMsg = err.Error()
}
finEnv, _ := api.Marshal(api.MsgJobFinished, jobID, api.JobFinishedPayload{
JobID: jobID,
Status: status,
ExitCode: exit,
FinishedAt: finishedAt,
Error: errMsg,
})
_ = r.tx.Send(finEnv)
r.sendFinished(jobID, finishedAt, err, nil)
// Refresh the server's snapshot projection — forget rewrites the
// index so the host's snapshot list almost certainly shrunk.
@@ -281,6 +238,129 @@ func (r *Runner) RunForget(ctx context.Context, jobID string, policy restic.Forg
return nil
}
// RunPrune runs a prune against the configured repo and reports the
// job lifecycle through the sender.
//
// Envelope order on success: job.started, any log.stream lines, a
// repo.stats envelope carrying LastPruneAt (reportStats also tries to
// fill in the size fields), then job.finished. On failure the stats
// refresh is skipped and job.finished is sent with the error.
//
// Returns nil on success, otherwise the prune error wrapped with a
// "runner prune" prefix. A failed stats report is only logged.
func (r *Runner) RunPrune(ctx context.Context, jobID string) error {
	r.sendStarted(jobID, api.JobPrune, time.Now().UTC())

	var seq atomic.Int64
	env := r.resticEnv()
	pruneErr := env.RunPrune(ctx, r.streamHandler(jobID, &seq))
	finishedAt := time.Now().UTC()

	if pruneErr != nil {
		r.sendFinished(jobID, finishedAt, pruneErr, nil)
		return fmt.Errorf("runner prune: %w", pruneErr)
	}

	// Stats go out before job.finished so the size figures are already
	// in flight when the job is reported complete.
	prunedAt := finishedAt
	if rerr := r.reportStats(ctx, env, api.RepoStatsPayload{LastPruneAt: &prunedAt}); rerr != nil {
		slog.Warn("runner: stats.report after prune failed", "job_id", jobID, "err", rerr)
	}
	r.sendFinished(jobID, finishedAt, nil, nil)
	return nil
}
// RunCheck runs `restic check` and reports the outcome.
//
// A repo.stats envelope is shipped on every run, success or failure,
// carrying LastCheckAt, LastCheckStatus and LockPresent; job.finished
// follows it. LastCheckStatus is "failed" when the check call itself
// returned an error, "errors_found" when it returned no error but the
// result flags errors, and "ok" otherwise.
//
// subsetPct is passed through to the underlying check call unchanged.
// Returns nil unless the check call returned an error, in which case
// that error is wrapped with a "runner check" prefix.
func (r *Runner) RunCheck(ctx context.Context, jobID string, subsetPct int) error {
	r.sendStarted(jobID, api.JobCheck, time.Now().UTC())

	var seq atomic.Int64
	env := r.resticEnv()
	res, checkErr := env.RunCheck(ctx, subsetPct, r.streamHandler(jobID, &seq))
	finishedAt := time.Now().UTC()

	var verdict string
	switch {
	case checkErr != nil:
		verdict = "failed"
	case res.ErrorsFound:
		verdict = "errors_found"
	default:
		verdict = "ok"
	}

	// NOTE(review): res is read even when checkErr != nil — confirm the
	// underlying RunCheck always returns a usable result value alongside
	// an error.
	locked := res.LockPresent
	checkedAt := finishedAt
	patch := api.RepoStatsPayload{
		LastCheckAt:     &checkedAt,
		LastCheckStatus: verdict,
		LockPresent:     &locked,
	}
	if rerr := r.reportStats(ctx, env, patch); rerr != nil {
		slog.Warn("runner: stats.report after check failed", "job_id", jobID, "err", rerr)
	}

	r.sendFinished(jobID, finishedAt, checkErr, nil)
	if checkErr == nil {
		return nil
	}
	return fmt.Errorf("runner check: %w", checkErr)
}
// RunUnlock runs `restic unlock` and reports the job lifecycle.
//
// On success a repo.stats envelope with LockPresent=false is shipped
// ahead of job.finished so the lock indicator can be cleared. On
// failure no stats envelope is sent. Returns nil on success, otherwise
// the unlock error wrapped with a "runner unlock" prefix; a failed
// stats report is only logged.
func (r *Runner) RunUnlock(ctx context.Context, jobID string) error {
	r.sendStarted(jobID, api.JobUnlock, time.Now().UTC())

	var seq atomic.Int64
	env := r.resticEnv()
	unlockErr := env.RunUnlock(ctx, r.streamHandler(jobID, &seq))
	finishedAt := time.Now().UTC()

	if unlockErr != nil {
		r.sendFinished(jobID, finishedAt, unlockErr, nil)
		return fmt.Errorf("runner unlock: %w", unlockErr)
	}

	cleared := false
	if rerr := r.reportStats(ctx, env, api.RepoStatsPayload{LockPresent: &cleared}); rerr != nil {
		slog.Warn("runner: stats.report after unlock failed", "job_id", jobID, "err", rerr)
	}
	r.sendFinished(jobID, finishedAt, nil, nil)
	return nil
}
// reportStats ships patch as a repo.stats envelope.
//
// When patch.TotalSizeBytes is nil the size fields (TotalSizeBytes,
// RawSizeBytes, UniqueFiles, SnapshotCount) are filled from
// env.RunStats first, bounded by a 60-second timeout derived from ctx.
// A RunStats failure is logged at debug level and is not fatal: the
// patch still goes out with whatever the caller populated.
//
// Returns the Marshal error or the Send error, if any.
func (r *Runner) reportStats(ctx context.Context, env restic.Env, patch api.RepoStatsPayload) error {
	if patch.TotalSizeBytes == nil {
		statsCtx, cancel := context.WithTimeout(ctx, 60*time.Second)
		defer cancel()

		s, statsErr := env.RunStats(statsCtx, nil)
		if statsErr != nil {
			slog.Debug("runner: stats refresh failed (non-fatal)", "err", statsErr)
		} else {
			// Copy into locals so the payload pointers do not alias s.
			total, raw := s.TotalSize, s.TotalUncompressed
			files, snaps := s.TotalFileCount, s.SnapshotsCount
			patch.TotalSizeBytes = &total
			patch.RawSizeBytes = &raw
			patch.UniqueFiles = &files
			patch.SnapshotCount = &snaps
		}
	}

	out, err := api.Marshal(api.MsgRepoStats, "", patch)
	if err != nil {
		return err
	}
	return r.tx.Send(out)
}
// reportSnapshots calls `restic snapshots --json`, translates the
// payload into the wire shape, and ships it as a snapshots.report
// envelope. Bounded by a separate timeout so a sluggish repo doesn't
+357
View File
@@ -0,0 +1,357 @@
package runner
import (
"context"
"os"
"path/filepath"
"testing"
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
"gitea.dcglab.co.uk/steve/restic-manager/internal/restic"
)
// fakeSender is an in-memory stand-in for the runner's sender: it
// records every envelope it is handed, in arrival order, so tests can
// assert on what was shipped. It does no locking of its own.
type fakeSender struct {
	envs []api.Envelope
}

// Send appends e to the recorded envelopes and always reports success.
func (s *fakeSender) Send(e api.Envelope) error {
	s.envs = append(s.envs, e)
	return nil
}
// setupScript installs a fake "restic" executable in a fresh temp dir
// and returns its path. body is the shell script text without a
// shebang line; "#!/bin/sh" is prepended and a trailing newline is
// appended here.
//
// The script is written to "<path>.tmp" and then renamed into place.
// The rename is what makes this race-free: under -race with many
// t.Parallel tests, a fork from another goroutine can inherit the
// writable fd from os.WriteFile before close completes, and exec'ing
// the file then fails with ETXTBSY ("text file busy"). After the
// rename the final path is a fresh dirent pointing at an inode with no
// writable fd open anywhere, so exec is safe.
func setupScript(t *testing.T, body string) string {
	t.Helper()

	target := filepath.Join(t.TempDir(), "restic")
	staging := target + ".tmp"
	script := []byte("#!/bin/sh\n" + body + "\n")

	if err := os.WriteFile(staging, script, 0o755); err != nil {
		t.Fatalf("setupScript: write tmp: %v", err)
	}
	if err := os.Rename(staging, target); err != nil {
		t.Fatalf("setupScript: rename: %v", err)
	}
	return target
}
// firstEnvOfType scans envs in order and returns the first envelope
// whose Type equals mt. If there is none the test is failed
// immediately via t.Fatalf.
func firstEnvOfType(t *testing.T, envs []api.Envelope, mt api.MessageType) api.Envelope {
	t.Helper()
	for i := range envs {
		if envs[i].Type != mt {
			continue
		}
		return envs[i]
	}
	t.Fatalf("no envelope of type %q found in %d envelopes", mt, len(envs))
	return api.Envelope{}
}
// envelopeOrder projects envs onto their message types, preserving
// order. The result always has the same length as envs.
func envelopeOrder(envs []api.Envelope) []api.MessageType {
	types := make([]api.MessageType, 0, len(envs))
	for i := range envs {
		types = append(types, envs[i].Type)
	}
	return types
}
// TestRunPruneShipsExpectedEnvelopes drives RunPrune with a fake
// binary that prints "prune" on stdout (for the log.stream envelope)
// and emits valid stats JSON so reportStats can populate size fields.
// Expected sequence: job.started → log.stream → repo.stats → job.finished.
func TestRunPruneShipsExpectedEnvelopes(t *testing.T) {
t.Parallel()
// The fake "restic" handles both "prune" and "stats --json" calls.
statsJSON := `{"total_size":1000,"total_uncompressed_size":2000,"snapshots_count":3,"total_file_count":10}`
bin := setupScript(t, `
case "$1" in
prune) echo "prune" ;;
stats) echo '`+statsJSON+`' ;;
*) echo "unknown: $*" ;;
esac
`)
tx := &fakeSender{}
r := New(Config{ResticBin: bin}, tx, 0)
if err := r.RunPrune(context.Background(), "job-1"); err != nil {
t.Fatalf("RunPrune: %v", err)
}
order := envelopeOrder(tx.envs)
// Confirm landmark envelope types appear in the required order.
wantTypes := []api.MessageType{api.MsgJobStarted, api.MsgLogStream, api.MsgRepoStats, api.MsgJobFinished}
positions := map[api.MessageType]int{}
for i, mt := range order {
if _, seen := positions[mt]; !seen {
positions[mt] = i
}
}
for i := 0; i < len(wantTypes)-1; i++ {
a, b := wantTypes[i], wantTypes[i+1]
pa, aOK := positions[a]
pb, bOK := positions[b]
if !aOK {
t.Errorf("envelope type %q not found in output %v", a, order)
continue
}
if !bOK {
t.Errorf("envelope type %q not found in output %v", b, order)
continue
}
if pa >= pb {
t.Errorf("expected %q before %q but positions are %d >= %d (order: %v)", a, b, pa, pb, order)
}
}
// The repo.stats payload must have LastPruneAt set.
statsEnv := firstEnvOfType(t, tx.envs, api.MsgRepoStats)
var statsPayload api.RepoStatsPayload
if err := statsEnv.UnmarshalPayload(&statsPayload); err != nil {
t.Fatalf("unmarshal repo.stats payload: %v", err)
}
if statsPayload.LastPruneAt == nil {
t.Error("expected LastPruneAt to be set in repo.stats after prune")
}
// The job.finished payload must indicate success.
finEnv := firstEnvOfType(t, tx.envs, api.MsgJobFinished)
var finPayload api.JobFinishedPayload
if err := finEnv.UnmarshalPayload(&finPayload); err != nil {
t.Fatalf("unmarshal job.finished payload: %v", err)
}
if finPayload.Status != api.JobSucceeded {
t.Errorf("expected job.finished status=%q, got %q", api.JobSucceeded, finPayload.Status)
}
}
// TestRunCheckShipsCheckStatus verifies that a check run which emits
// a stale-lock line on stderr (exit 0) reports LastCheckStatus="ok"
// and LockPresent=true.
func TestRunCheckShipsCheckStatus(t *testing.T) {
t.Parallel()
statsJSON := `{"total_size":500,"total_uncompressed_size":600,"snapshots_count":1,"total_file_count":5}`
bin := setupScript(t, `
case "$1" in
check) echo "Found stale lock" >&2; exit 0 ;;
stats) echo '`+statsJSON+`' ;;
*) exit 0 ;;
esac
`)
tx := &fakeSender{}
r := New(Config{ResticBin: bin}, tx, 0)
if err := r.RunCheck(context.Background(), "job-2", 0); err != nil {
t.Fatalf("RunCheck: %v", err)
}
// Assert envelope ordering: job.started → log.stream → repo.stats → job.finished.
order := envelopeOrder(tx.envs)
wantTypes := []api.MessageType{api.MsgJobStarted, api.MsgLogStream, api.MsgRepoStats, api.MsgJobFinished}
positions := map[api.MessageType]int{}
for i, mt := range order {
if _, seen := positions[mt]; !seen {
positions[mt] = i
}
}
for i := 0; i < len(wantTypes)-1; i++ {
a, b := wantTypes[i], wantTypes[i+1]
pa, aOK := positions[a]
pb, bOK := positions[b]
if !aOK {
t.Errorf("envelope type %q not found in output %v", a, order)
continue
}
if !bOK {
t.Errorf("envelope type %q not found in output %v", b, order)
continue
}
if pa >= pb {
t.Errorf("expected %q before %q but positions are %d >= %d (order: %v)", a, b, pa, pb, order)
}
}
statsEnv := firstEnvOfType(t, tx.envs, api.MsgRepoStats)
var p api.RepoStatsPayload
if err := statsEnv.UnmarshalPayload(&p); err != nil {
t.Fatalf("unmarshal: %v", err)
}
if p.LastCheckStatus != "ok" {
t.Errorf("LastCheckStatus: got %q, want %q", p.LastCheckStatus, "ok")
}
if p.LockPresent == nil || !*p.LockPresent {
t.Errorf("expected LockPresent=true, got %v", p.LockPresent)
}
if p.LastCheckAt == nil {
t.Error("expected LastCheckAt to be set")
}
}
// TestRunCheckErrorsFoundShipsErrorsStatus verifies that a check run
// that exits 1 (errors found) reports LastCheckStatus="errors_found".
func TestRunCheckErrorsFoundShipsErrorsStatus(t *testing.T) {
t.Parallel()
statsJSON := `{"total_size":500,"total_uncompressed_size":600,"snapshots_count":1,"total_file_count":5}`
bin := setupScript(t, `
case "$1" in
check) exit 1 ;;
stats) echo '`+statsJSON+`' ;;
*) exit 0 ;;
esac
`)
tx := &fakeSender{}
r := New(Config{ResticBin: bin}, tx, 0)
// RunCheck returns nil for exit 1 (errors_found is not a wrapper failure).
if err := r.RunCheck(context.Background(), "job-3", 0); err != nil {
t.Fatalf("RunCheck: %v", err)
}
// Assert envelope ordering: job.started → repo.stats → job.finished.
// (No log.stream expected because the fake script produces no
// output before exit 1 — a real restic check would emit log lines
// before exiting non-zero.)
order := envelopeOrder(tx.envs)
wantTypes := []api.MessageType{api.MsgJobStarted, api.MsgRepoStats, api.MsgJobFinished}
positions := map[api.MessageType]int{}
for i, mt := range order {
if _, seen := positions[mt]; !seen {
positions[mt] = i
}
}
for i := 0; i < len(wantTypes)-1; i++ {
a, b := wantTypes[i], wantTypes[i+1]
pa, aOK := positions[a]
pb, bOK := positions[b]
if !aOK {
t.Errorf("envelope type %q not found in output %v", a, order)
continue
}
if !bOK {
t.Errorf("envelope type %q not found in output %v", b, order)
continue
}
if pa >= pb {
t.Errorf("expected %q before %q but positions are %d >= %d (order: %v)", a, b, pa, pb, order)
}
}
statsEnv := firstEnvOfType(t, tx.envs, api.MsgRepoStats)
var p api.RepoStatsPayload
if err := statsEnv.UnmarshalPayload(&p); err != nil {
t.Fatalf("unmarshal: %v", err)
}
if p.LastCheckStatus != "errors_found" {
t.Errorf("LastCheckStatus: got %q, want %q", p.LastCheckStatus, "errors_found")
}
}
// TestRunUnlockClearsLock verifies that a successful unlock ships a
// repo.stats envelope with LockPresent=false.
func TestRunUnlockClearsLock(t *testing.T) {
t.Parallel()
statsJSON := `{"total_size":100,"total_uncompressed_size":150,"snapshots_count":2,"total_file_count":8}`
bin := setupScript(t, `
case "$1" in
unlock) echo "removed 1 locks" ;;
stats) echo '`+statsJSON+`' ;;
*) exit 0 ;;
esac
`)
tx := &fakeSender{}
r := New(Config{ResticBin: bin}, tx, 0)
if err := r.RunUnlock(context.Background(), "job-4"); err != nil {
t.Fatalf("RunUnlock: %v", err)
}
// Assert envelope ordering: job.started → log.stream → repo.stats → job.finished.
order := envelopeOrder(tx.envs)
wantTypes := []api.MessageType{api.MsgJobStarted, api.MsgLogStream, api.MsgRepoStats, api.MsgJobFinished}
positions := map[api.MessageType]int{}
for i, mt := range order {
if _, seen := positions[mt]; !seen {
positions[mt] = i
}
}
for i := 0; i < len(wantTypes)-1; i++ {
a, b := wantTypes[i], wantTypes[i+1]
pa, aOK := positions[a]
pb, bOK := positions[b]
if !aOK {
t.Errorf("envelope type %q not found in output %v", a, order)
continue
}
if !bOK {
t.Errorf("envelope type %q not found in output %v", b, order)
continue
}
if pa >= pb {
t.Errorf("expected %q before %q but positions are %d >= %d (order: %v)", a, b, pa, pb, order)
}
}
statsEnv := firstEnvOfType(t, tx.envs, api.MsgRepoStats)
var p api.RepoStatsPayload
if err := statsEnv.UnmarshalPayload(&p); err != nil {
t.Fatalf("unmarshal: %v", err)
}
if p.LockPresent == nil {
t.Fatal("expected LockPresent to be set (non-nil)")
}
if *p.LockPresent {
t.Errorf("expected LockPresent=false after unlock, got true")
}
}
// TestRunInitShipsStartedAndFinished confirms the refactored RunInit
// still produces job.started and job.finished envelopes.
func TestRunInitShipsStartedAndFinished(t *testing.T) {
t.Parallel()
bin := setupScript(t, `echo "initialized repository"`)
tx := &fakeSender{}
r := New(Config{ResticBin: bin}, tx, 0)
if err := r.RunInit(context.Background(), "job-init"); err != nil {
t.Fatalf("RunInit: %v", err)
}
_ = firstEnvOfType(t, tx.envs, api.MsgJobStarted)
_ = firstEnvOfType(t, tx.envs, api.MsgJobFinished)
}
// TestRunForgetShipsStartedAndFinished runs RunForget with a single
// tagged group against a fake binary and requires both a job.started
// and a job.finished envelope among those shipped.
func TestRunForgetShipsStartedAndFinished(t *testing.T) {
	t.Parallel()

	// Script handles both "forget --json ..." and "snapshots --json" calls.
	fakeRestic := setupScript(t, `
case "$1" in
forget) echo "[]" ;;
snapshots) echo "[]" ;;
*) exit 0 ;;
esac
`)
	sender := &fakeSender{}
	r := New(Config{ResticBin: fakeRestic}, sender, 0)

	one := 1
	var policy restic.ForgetPolicy
	policy.KeepLast = &one
	groups := []restic.ForgetGroup{{Tag: "documents", Policy: policy}}

	if err := r.RunForget(context.Background(), "job-forget", groups); err != nil {
		t.Fatalf("RunForget: %v", err)
	}

	// firstEnvOfType fails the test when the type is absent; the
	// returned envelope itself is not inspected.
	for _, mt := range []api.MessageType{api.MsgJobStarted, api.MsgJobFinished} {
		_ = firstEnvOfType(t, sender.envs, mt)
	}
}
+92 -15
View File
@@ -9,6 +9,7 @@
package secrets
import (
"bytes"
"encoding/json"
"errors"
"fmt"
@@ -24,6 +25,11 @@ import (
// depth — the key is per-host today, but cheap to be careful.)
const additionalData = "rm-agent-repo-creds-v1"
// ErrNoAdmin is returned by LoadAdmin when no admin slot has been
// written yet. Callers must distinguish this from a hard error: the
// agent simply hasn't received an admin config.update push yet.
// It is a sentinel value; match it with errors.Is rather than by
// comparing the message text.
var ErrNoAdmin = errors.New("secrets: admin slot not configured")
// Repo is the plaintext shape persisted inside the AEAD blob.
type Repo struct {
URL string `json:"repo_url,omitempty"`
@@ -35,6 +41,15 @@ type Repo struct {
// minimum (URL + password) needed to run a backup.
func (r Repo) Empty() bool { return r.URL == "" || r.Password == "" }
// bundle is the on-disk JSON shape as of secrets v2. It holds the
// everyday repo slot and an optional admin slot (prune / unlock).
// Legacy files (pre-v2) contain a flat Repo object; loadBundle
// transparently upgrades those on the next Save.
type bundle struct {
	// Repo is the everyday repo slot: the value Load returns and Save
	// replaces.
	// NOTE(review): encoding/json's omitempty never omits a struct
	// value, so the "repo" key is written even for the zero Repo.
	Repo Repo `json:"repo,omitempty"`
	// Admin is the optional admin slot. A nil pointer means the slot
	// has never been written; LoadAdmin reports that as ErrNoAdmin.
	Admin *Repo `json:"admin,omitempty"`
}
// Store reads and writes the encrypted secrets file at Path, sealed
// under the 32-byte key Key.
type Store struct {
@@ -55,32 +70,47 @@ func New(path string, key []byte) (*Store, error) {
return &Store{path: path, a: a}, nil
}
// Load returns the persisted Repo, or a zero-value Repo (with no
// error) if the file does not exist yet — first-run agents have
// nothing on disk until the server pushes a config.update.
func (s *Store) Load() (Repo, error) {
// loadBundle reads and decrypts the on-disk blob, returning a bundle.
// It handles back-compat decode: legacy flat Repo blobs are detected
// by the presence of a top-level "repo_url" key and re-wrapped into
// the bundle shape transparently. Returns an empty bundle when the
// file does not exist yet.
func (s *Store) loadBundle() (bundle, error) {
body, err := os.ReadFile(s.path)
if err != nil {
if errors.Is(err, os.ErrNotExist) {
return Repo{}, nil
return bundle{}, nil
}
return Repo{}, fmt.Errorf("secrets: read %q: %w", s.path, err)
return bundle{}, fmt.Errorf("secrets: read %q: %w", s.path, err)
}
plain, err := s.a.Decrypt(string(body), []byte(additionalData))
if err != nil {
return Repo{}, fmt.Errorf("secrets: decrypt %q: %w", s.path, err)
return bundle{}, fmt.Errorf("secrets: decrypt %q: %w", s.path, err)
}
var r Repo
if err := json.Unmarshal(plain, &r); err != nil {
return Repo{}, fmt.Errorf("secrets: parse %q: %w", s.path, err)
// Try the new bundle shape first.
var b bundle
if err := json.Unmarshal(plain, &b); err != nil {
return bundle{}, fmt.Errorf("secrets: parse %q: %w", s.path, err)
}
return r, nil
// If the bundle has an empty Repo slot but the raw JSON contains
// a top-level "repo_url" key, this is a legacy flat blob —
// re-unmarshal it as a Repo and slot it in.
if b.Repo == (Repo{}) && bytes.Contains(plain, []byte(`"repo_url"`)) {
var legacy Repo
if err := json.Unmarshal(plain, &legacy); err == nil {
b.Repo = legacy
}
}
return b, nil
}
// Save replaces the on-disk blob atomically. Mode is 0600. Parent
// directory must already exist (the install script lays it down).
func (s *Store) Save(r Repo) error {
body, err := json.Marshal(r)
// saveBundle marshals b, encrypts it and writes it atomically at
// mode 0600. Parent directory must already exist.
func (s *Store) saveBundle(b bundle) error {
body, err := json.Marshal(b)
if err != nil {
return fmt.Errorf("secrets: marshal: %w", err)
}
@@ -115,3 +145,50 @@ func (s *Store) Save(r Repo) error {
}
return nil
}
// Load reads the secrets file and hands back its everyday repo slot.
// A missing file is not an error: loadBundle yields an empty bundle
// in that case, so first-run agents get a zero-value Repo until the
// server pushes a config.update. Any other failure from loadBundle is
// returned unchanged alongside a zero-value Repo.
func (s *Store) Load() (Repo, error) {
	var none Repo
	stored, loadErr := s.loadBundle()
	if loadErr != nil {
		return none, loadErr
	}
	return stored.Repo, nil
}
// Save overwrites the repo slot with r and rewrites the file through
// saveBundle (atomic, mode 0600), leaving whatever admin slot is
// currently on disk in place. The existing file is loaded first; if
// that load fails nothing is written and the error is returned
// wrapped. The parent directory must already exist.
func (s *Store) Save(r Repo) error {
	current, loadErr := s.loadBundle()
	if loadErr != nil {
		return fmt.Errorf("secrets: load before save: %w", loadErr)
	}
	// Only the repo slot changes; current.Admin rides along untouched.
	current.Repo = r
	return s.saveBundle(current)
}
// LoadAdmin reads the secrets file and hands back its admin slot.
// When the slot has never been written it returns (Repo{}, ErrNoAdmin);
// every other error comes straight from loadBundle and is a hard
// failure.
func (s *Store) LoadAdmin() (Repo, error) {
	stored, loadErr := s.loadBundle()
	switch {
	case loadErr != nil:
		return Repo{}, loadErr
	case stored.Admin == nil:
		return Repo{}, ErrNoAdmin
	default:
		return *stored.Admin, nil
	}
}
// SaveAdmin overwrites the admin slot with r and rewrites the file
// through saveBundle (atomic, mode 0600), leaving the repo slot that
// is currently on disk in place. The existing file is loaded first;
// if that load fails nothing is written and the error is returned
// wrapped.
func (s *Store) SaveAdmin(r Repo) error {
	current, loadErr := s.loadBundle()
	if loadErr != nil {
		return fmt.Errorf("secrets: load before save: %w", loadErr)
	}
	// Store a pointer to a local copy so the bundle owns its value.
	admin := r
	current.Admin = &admin
	return s.saveBundle(current)
}
+210
View File
@@ -2,6 +2,8 @@ package secrets
import (
"crypto/rand"
"encoding/json"
"errors"
"io"
"os"
"path/filepath"
@@ -97,3 +99,211 @@ func TestSaveIsAtomic(t *testing.T) {
t.Errorf("dir should hold one file post-save, got %v", names)
}
}
// TestSecretsLoadAdminEmpty checks the first-run case: with no
// secrets file on disk, LoadAdmin must report the ErrNoAdmin sentinel
// rather than a hard error.
func TestSecretsLoadAdminEmpty(t *testing.T) {
	t.Parallel()

	// The path lives in a fresh temp dir and is never written to.
	secretsPath := filepath.Join(t.TempDir(), "secrets.enc")
	store, err := New(secretsPath, freshKey(t))
	if err != nil {
		t.Fatalf("new: %v", err)
	}

	if _, got := store.LoadAdmin(); !errors.Is(got, ErrNoAdmin) {
		t.Errorf("expected ErrNoAdmin, got %v", got)
	}
}
// TestSecretsAdminSlotIndependent verifies that the repo and admin
// slots are stored side by side: writing one must never disturb the
// other, and each loader must return exactly what was last saved to
// its own slot.
func TestSecretsAdminSlotIndependent(t *testing.T) {
	t.Parallel()

	store, err := New(filepath.Join(t.TempDir(), "secrets.enc"), freshKey(t))
	if err != nil {
		t.Fatalf("new: %v", err)
	}

	var (
		repo   = Repo{URL: "rest:https://repo/host", Username: "user", Password: "pw"}
		admin  = Repo{URL: "rest:https://repo/host", Username: "admin", Password: "adminpw"}
		admin2 = Repo{URL: "rest:https://repo/host", Username: "admin2", Password: "pw2"}
	)

	// Phase 1: populate both slots.
	if err := store.Save(repo); err != nil {
		t.Fatalf("save repo: %v", err)
	}
	if err := store.SaveAdmin(admin); err != nil {
		t.Fatalf("save admin: %v", err)
	}

	// Each loader must see its own slot exactly as written.
	loadedRepo, err := store.Load()
	if err != nil {
		t.Fatalf("load: %v", err)
	}
	if loadedRepo != repo {
		t.Errorf("repo slot mismatch: got %+v want %+v", loadedRepo, repo)
	}
	loadedAdmin, err := store.LoadAdmin()
	if err != nil {
		t.Fatalf("load admin: %v", err)
	}
	if loadedAdmin != admin {
		t.Errorf("admin slot mismatch: got %+v want %+v", loadedAdmin, admin)
	}

	// Phase 2: replacing the admin slot must leave the repo slot alone.
	if err := store.SaveAdmin(admin2); err != nil {
		t.Fatalf("save admin2: %v", err)
	}
	repoAfter, err := store.Load()
	if err != nil {
		t.Fatalf("load after admin2 save: %v", err)
	}
	if repoAfter != repo {
		t.Errorf("repo slot changed unexpectedly: got %+v want %+v", repoAfter, repo)
	}
	adminAfter, err := store.LoadAdmin()
	if err != nil {
		t.Fatalf("load admin2: %v", err)
	}
	if adminAfter != admin2 {
		t.Errorf("admin2 slot mismatch: got %+v want %+v", adminAfter, admin2)
	}
}
// TestSecretsSaveRefusesCorruptFile verifies that Save fails, and
// leaves the file byte-for-byte untouched, when the existing secrets
// file cannot be decrypted.
func TestSecretsSaveRefusesCorruptFile(t *testing.T) {
	t.Parallel()

	secretsPath := filepath.Join(t.TempDir(), "secrets.enc")
	store, err := New(secretsPath, freshKey(t))
	if err != nil {
		t.Fatalf("new: %v", err)
	}

	// Start from a valid file, then clobber it with plaintext junk.
	if err := store.Save(Repo{URL: "rest:https://r/host", Password: "pw"}); err != nil {
		t.Fatalf("initial save: %v", err)
	}
	junk := []byte("not encrypted")
	if err := os.WriteFile(secretsPath, junk, 0o600); err != nil {
		t.Fatalf("write garbage: %v", err)
	}

	// Save loads before writing, so the undecryptable file must make
	// it bail out with an error.
	if err := store.Save(Repo{URL: "rest:https://r/host", Password: "new"}); err == nil {
		t.Fatal("Save over corrupt file must return an error; got nil")
	}

	// The junk bytes must still be on disk, unchanged.
	onDisk, err := os.ReadFile(secretsPath)
	if err != nil {
		t.Fatalf("re-read: %v", err)
	}
	if string(onDisk) != string(junk) {
		t.Errorf("corrupt file was overwritten; file size now %d (was %d)", len(onDisk), len(junk))
	}
}
// TestSecretsSaveAdminRefusesCorruptFile is the admin-slot twin of
// the Save corruption test: SaveAdmin must fail, and leave the file
// byte-for-byte untouched, when the existing secrets file cannot be
// decrypted.
func TestSecretsSaveAdminRefusesCorruptFile(t *testing.T) {
	t.Parallel()

	secretsPath := filepath.Join(t.TempDir(), "secrets.enc")
	store, err := New(secretsPath, freshKey(t))
	if err != nil {
		t.Fatalf("new: %v", err)
	}

	// Start from a valid file, then clobber it with plaintext junk.
	if err := store.SaveAdmin(Repo{URL: "rest:https://r/host", Password: "adminpw"}); err != nil {
		t.Fatalf("initial save admin: %v", err)
	}
	junk := []byte("not encrypted admin")
	if err := os.WriteFile(secretsPath, junk, 0o600); err != nil {
		t.Fatalf("write garbage: %v", err)
	}

	// SaveAdmin loads before writing, so the undecryptable file must
	// make it bail out with an error.
	if err := store.SaveAdmin(Repo{URL: "rest:https://r/host", Password: "new"}); err == nil {
		t.Fatal("SaveAdmin over corrupt file must return an error; got nil")
	}

	// The junk bytes must still be on disk, unchanged.
	onDisk, err := os.ReadFile(secretsPath)
	if err != nil {
		t.Fatalf("re-read: %v", err)
	}
	if string(onDisk) != string(junk) {
		t.Errorf("corrupt file was overwritten; file size now %d (was %d)", len(onDisk), len(junk))
	}
}
// TestSecretsLegacyFlatBlobMigrates covers the pre-v2 upgrade path: a
// file holding a flat encrypted Repo object must load as the repo
// slot, and a later SaveAdmin must leave both slots readable from a
// freshly opened Store.
func TestSecretsLegacyFlatBlobMigrates(t *testing.T) {
	t.Parallel()

	secretsPath := filepath.Join(t.TempDir(), "secrets.enc")
	key := freshKey(t)

	// Hand-craft a legacy file: marshal a bare Repo (no bundle wrapper)
	// and seal it with the same key and additional data the Store uses.
	legacy := Repo{URL: "rest:https://legacy/host", Username: "legacyuser", Password: "legacypw"}
	flatJSON, err := json.Marshal(legacy)
	if err != nil {
		t.Fatalf("marshal legacy: %v", err)
	}
	aead, err := crypto.NewAEAD(key)
	if err != nil {
		t.Fatalf("aead: %v", err)
	}
	sealed, err := aead.Encrypt(flatJSON, []byte(additionalData))
	if err != nil {
		t.Fatalf("encrypt legacy: %v", err)
	}
	if err := os.WriteFile(secretsPath, []byte(sealed), 0o600); err != nil {
		t.Fatalf("write legacy file: %v", err)
	}

	// A plain Load through the public API must surface the legacy Repo.
	store, err := New(secretsPath, key)
	if err != nil {
		t.Fatalf("new: %v", err)
	}
	loaded, err := store.Load()
	if err != nil {
		t.Fatalf("load legacy: %v", err)
	}
	if loaded != legacy {
		t.Errorf("legacy decode mismatch: got %+v want %+v", loaded, legacy)
	}

	// Writing the admin slot rewrites the file; a second Store opened
	// on the same path must then see both slots.
	admin := Repo{URL: "rest:https://legacy/host", Username: "admin", Password: "adminpw"}
	if err := store.SaveAdmin(admin); err != nil {
		t.Fatalf("save admin after legacy: %v", err)
	}
	reopened, err := New(secretsPath, key)
	if err != nil {
		t.Fatalf("reopen: %v", err)
	}
	repoAfter, err := reopened.Load()
	if err != nil {
		t.Fatalf("load repo after migration: %v", err)
	}
	if repoAfter != legacy {
		t.Errorf("repo after migration: got %+v want %+v", repoAfter, legacy)
	}
	adminAfter, err := reopened.LoadAdmin()
	if err != nil {
		t.Fatalf("load admin after migration: %v", err)
	}
	if adminAfter != admin {
		t.Errorf("admin after migration: got %+v want %+v", adminAfter, admin)
	}
}
+66 -18
View File
@@ -77,6 +77,30 @@ const (
JobCancelled JobStatus = "cancelled" //nolint:misspell // wire format
)
// ForgetPolicyJSON is the wire shape of a per-group retention policy
// shipped with a forget command.run. Mirrors store.RetentionPolicy
// JSON tags exactly so a future caller could json-roundtrip between
// the two without reshaping. All fields nullable; an empty struct is
// rejected by the agent (restic refuses to forget without --keep-*).
//
// Every field is a pointer with omitempty, so an unset dimension is
// left out of the JSON entirely and decodes back to nil.
// NOTE(review): each field presumably feeds the restic --keep-* flag
// of the same name (keep_last -> --keep-last, …) — confirm against
// the agent-side conversion.
type ForgetPolicyJSON struct {
	KeepLast    *int `json:"keep_last,omitempty"`
	KeepHourly  *int `json:"keep_hourly,omitempty"`
	KeepDaily   *int `json:"keep_daily,omitempty"`
	KeepWeekly  *int `json:"keep_weekly,omitempty"`
	KeepMonthly *int `json:"keep_monthly,omitempty"`
	KeepYearly  *int `json:"keep_yearly,omitempty"`
}
// ForgetGroup is one (tag, retention) pair shipped to the agent in a
// forget command.run. The agent invokes
// `restic forget --tag <Tag> --keep-* …` once per group, with each
// group's own policy. The Tag is the source-group name (which is
// also the snapshot tag carried at backup time).
//
// Neither field carries omitempty, so both keys are always present on
// the wire, even when Tag is "" or Policy has no dimension set.
type ForgetGroup struct {
	Tag    string           `json:"tag"`
	Policy ForgetPolicyJSON `json:"policy"`
}
// CommandRunPayload is the server → agent dispatch for a run-now job.
//
// For kind=backup, Includes/Excludes/Tag are populated from the source
@@ -85,19 +109,27 @@ const (
// the source group's name) so retention can target it later via
// `restic forget --tag`.
//
// For kind=forget, RetentionPolicy is the typed keep-* set as raw JSON
// (the agent doesn't share the store package's typed struct).
// For kind=forget, ForgetGroups carries one entry per source-group on
// the host that has a non-empty retention policy. The agent walks the
// list and runs `restic forget --tag <Tag> --keep-* …` per group.
//
// Args is preserved as a generic free-form slice for kinds that don't
// fit the structured fields (e.g. unlock takes none; init takes none).
// fit the structured fields (e.g. unlock takes none; init takes none;
// check carries the subset% as Args[0]).
//
// RequiresAdminCreds tells the agent to load the admin slot of its
// secrets store rather than the everyday repo slot. Set by the server
// only for prune (the only kind that needs delete authority on a
// rest-server repo today).
type CommandRunPayload struct {
JobID string `json:"job_id"`
Kind JobKind `json:"kind"`
Args []string `json:"args,omitempty"`
Includes []string `json:"includes,omitempty"`
Excludes []string `json:"excludes,omitempty"`
Tag string `json:"tag,omitempty"`
RetentionPolicy json.RawMessage `json:"retention_policy,omitempty"`
JobID string `json:"job_id"`
Kind JobKind `json:"kind"`
Args []string `json:"args,omitempty"`
Includes []string `json:"includes,omitempty"`
Excludes []string `json:"excludes,omitempty"`
Tag string `json:"tag,omitempty"`
ForgetGroups []ForgetGroup `json:"forget_groups,omitempty"`
RequiresAdminCreds bool `json:"requires_admin_creds,omitempty"`
}
// CommandCancelPayload is the server → agent cancel signal.
@@ -186,15 +218,24 @@ type Snapshot struct {
FileCount int64 `json:"file_count,omitempty"`
}
// RepoStatsPayload — agent reports periodic repo health facts derived
// from `restic stats` and lock-file inspection.
// RepoStatsPayload carries a partial-update snapshot of repo health
// facts, shipped by the agent after prune/check/unlock or a periodic
// stats refresh. Pointer fields follow omitempty semantics: a nil
// pointer means "no update for this field" and is omitted on the
// wire; the server merges only the non-nil fields into its
// host_repo_stats row (matching UpsertHostRepoStats partial-update
// semantics). Non-pointer fields (LastCheckStatus) use the empty
// string as the "no update" sentinel.
type RepoStatsPayload struct {
SizeBytes int64 `json:"size_bytes"`
SnapshotCount int `json:"snapshot_count"`
DedupRatio float64 `json:"dedup_ratio"`
LastCheckAt time.Time `json:"last_check_at,omitempty"`
LastCheckStatus string `json:"last_check_status,omitempty"`
LockState string `json:"lock_state"` // locked|unlocked
TotalSizeBytes *int64 `json:"total_size_bytes,omitempty"`
RawSizeBytes *int64 `json:"raw_size_bytes,omitempty"`
UniqueFiles *int64 `json:"unique_files,omitempty"`
SnapshotCount *int64 `json:"snapshot_count,omitempty"`
LastCheckAt *time.Time `json:"last_check_at,omitempty"`
LastCheckStatus string `json:"last_check_status,omitempty"`
LockPresent *bool `json:"lock_present,omitempty"`
LastPruneAt *time.Time `json:"last_prune_at,omitempty"`
LastPruneFreedBytes *int64 `json:"last_prune_freed_bytes,omitempty"`
}
// Schedule is the agent-facing view of a slim Schedule row plus its
@@ -252,12 +293,19 @@ type ScheduleFirePayload struct {
// ConfigUpdatePayload — server pushes per-host config (currently just
// repo connection details). Empty fields mean "leave existing alone";
// to clear something, send an explicit zero value.
// NOTE(review): every field is a string tagged omitempty, so the zero
// value ("") is dropped from the wire and is indistinguishable from
// "leave alone" — as written there is no way to clear a field.
// Confirm the intended clearing mechanism.
//
// Slot picks which secrets-store slot the agent writes the creds to.
// Empty / "repo" = everyday repo creds (default). "admin" = the
// prune-capable admin user (separate slot — not loaded for backups).
// Forwards-compatible: an agent that ignores Slot simply writes to the
// repo slot and admin pushes become no-ops.
// NOTE(review): the sentence above looks self-contradictory — an
// agent that ignores Slot and writes to the repo slot would store the
// admin credentials as its everyday repo credentials, which is not a
// no-op. Verify against the agent's config.update handler.
type ConfigUpdatePayload struct {
	RepoURL        string `json:"repo_url,omitempty"`
	RepoPassword   string `json:"repo_password,omitempty"` // sensitive
	RepoUsername   string `json:"repo_username,omitempty"`
	RepoCredential string `json:"repo_credential,omitempty"` // sensitive (for rest server basic auth)
	HookShell      string `json:"hook_shell,omitempty"`
	Slot           string `json:"slot,omitempty"`
}
// AgentUpdateAvailablePayload — informational only; the agent does
+12
View File
@@ -12,3 +12,15 @@ const CurrentProtocolVersion = 1
// server accepts in a hello. Agents below this are disconnected with
// a structured error pointing at the upgrade docs.
const MinAgentProtocolVersion = 1
// Phase 5 (P2R-03..P2R-08, branch p2r-phase5-maintenance, 2026-05) reshaped
// CommandRunPayload (RetentionPolicy removed, ForgetGroups added, RequiresAdminCreds added),
// ConfigUpdatePayload (Slot added), and RepoStatsPayload (full reshape).
// The protocol version was deliberately NOT bumped because:
// 1. This project deploys agent and server in lockstep from the same release.
// 2. There is no supported "rolling upgrade" path with mixed agent/server versions.
// 3. The smoke env restage block in CLAUDE.md restages the agent binary on
// every server build for exactly this reason.
//
// If a multi-version protocol path is ever introduced, every Phase 5 wire
// change is a breaking change and the version must bump to 2 at that time.
+79
View File
@@ -138,6 +138,85 @@ func TestJobProgressShapeStable(t *testing.T) {
}
}
// TestRepoStatsPayloadRoundTrip pins the wire behaviour of
// RepoStatsPayload in three steps:
//  1. a zero-value payload marshals to "{}" (every field is omitted);
//  2. a fully populated payload survives a marshal/unmarshal round trip;
//  3. a payload with only LockPresent=false set marshals to exactly
//     that one key, because a non-nil pointer to false is not "empty".
func TestRepoStatsPayloadRoundTrip(t *testing.T) {
	t.Parallel()

	// Nil pointer fields must be omitted from JSON output.
	empty := RepoStatsPayload{}
	raw, err := json.Marshal(empty)
	if err != nil {
		t.Fatalf("marshal empty: %v", err)
	}
	if string(raw) != "{}" {
		t.Errorf("empty payload should marshal to {}, got %s", raw)
	}

	// Populated fields must survive a round trip.
	total := int64(123456)
	rawSize := int64(200000)
	files := int64(42)
	snaps := int64(7)
	lockPresent := true
	now := time.Date(2026, 1, 2, 3, 4, 5, 0, time.UTC)
	pruneAt := time.Date(2026, 1, 3, 0, 0, 0, 0, time.UTC)
	freed := int64(8192)

	p := RepoStatsPayload{
		TotalSizeBytes:      &total,
		RawSizeBytes:        &rawSize,
		UniqueFiles:         &files,
		SnapshotCount:       &snaps,
		LastCheckAt:         &now,
		LastCheckStatus:     "ok",
		LockPresent:         &lockPresent,
		LastPruneAt:         &pruneAt,
		LastPruneFreedBytes: &freed,
	}
	raw2, err := json.Marshal(p)
	if err != nil {
		t.Fatalf("marshal full: %v", err)
	}
	var got RepoStatsPayload
	if err := json.Unmarshal(raw2, &got); err != nil {
		t.Fatalf("unmarshal: %v", err)
	}

	if got.TotalSizeBytes == nil || *got.TotalSizeBytes != total {
		t.Errorf("TotalSizeBytes: got %v, want %d", got.TotalSizeBytes, total)
	}
	if got.RawSizeBytes == nil || *got.RawSizeBytes != rawSize {
		t.Errorf("RawSizeBytes: got %v, want %d", got.RawSizeBytes, rawSize)
	}
	if got.UniqueFiles == nil || *got.UniqueFiles != files {
		t.Errorf("UniqueFiles: got %v, want %d", got.UniqueFiles, files)
	}
	if got.SnapshotCount == nil || *got.SnapshotCount != snaps {
		t.Errorf("SnapshotCount: got %v, want %d", got.SnapshotCount, snaps)
	}
	// Timestamps are compared with Equal, not ==, so only the instant
	// has to match.
	if got.LastCheckAt == nil || !got.LastCheckAt.Equal(now) {
		t.Errorf("LastCheckAt: got %v, want %v", got.LastCheckAt, now)
	}
	if got.LastCheckStatus != "ok" {
		t.Errorf("LastCheckStatus: got %q, want %q", got.LastCheckStatus, "ok")
	}
	if got.LockPresent == nil || *got.LockPresent != lockPresent {
		t.Errorf("LockPresent: got %v, want %v", got.LockPresent, lockPresent)
	}
	if got.LastPruneAt == nil || !got.LastPruneAt.Equal(pruneAt) {
		t.Errorf("LastPruneAt: got %v, want %v", got.LastPruneAt, pruneAt)
	}
	if got.LastPruneFreedBytes == nil || *got.LastPruneFreedBytes != freed {
		t.Errorf("LastPruneFreedBytes: got %v, want %d", got.LastPruneFreedBytes, freed)
	}

	// Partial update: only set LockPresent.
	lockFalse := false
	partial := RepoStatsPayload{LockPresent: &lockFalse}
	// Check the marshal error instead of discarding it, so a failure
	// is reported as such rather than as a confusing string mismatch.
	rawPartial, err := json.Marshal(partial)
	if err != nil {
		t.Fatalf("marshal partial: %v", err)
	}
	if string(rawPartial) != `{"lock_present":false}` {
		t.Errorf("partial marshal: got %s, want {\"lock_present\":false}", rawPartial)
	}
}
// touch time so the import is used by other tests in this file when
// they grow over time.
var _ = time.Now
+183 -63
View File
@@ -151,8 +151,7 @@ func (e Env) RunBackup(ctx context.Context, paths, excludes, tags []string, hand
}
// ForgetPolicy mirrors restic forget's --keep-* flags. All optional;
// nil/zero means "don't pass that flag." Caller passes whatever the
// schedule's RetentionPolicy carries.
// nil/zero means "don't pass that flag."
type ForgetPolicy struct {
KeepLast *int
KeepHourly *int
@@ -181,53 +180,47 @@ func (p ForgetPolicy) args() []string {
return out
}
// Empty reports whether no retention dimensions are set. restic
// forget refuses to run without at least one keep-* flag (it would
// delete every snapshot), so the agent rejects empty policies before
// invoking restic.
// Empty reports whether no retention dimensions are set.
func (p ForgetPolicy) Empty() bool {
	// One set keep-* dimension is enough to make the policy non-empty.
	switch {
	case p.KeepLast != nil,
		p.KeepHourly != nil,
		p.KeepDaily != nil,
		p.KeepWeekly != nil,
		p.KeepMonthly != nil,
		p.KeepYearly != nil:
		return false
	}
	return true
}
// RunForget executes `restic forget --keep-* … --json` against the
// configured repo. Does NOT pass --prune — pruning lives behind a
// separate, admin-only credential (see spec §4.3 / P2-06). Restic
// just rewrites the snapshot index; the actual data deletion waits
// for the next prune. Returns nil on a clean exit.
func (e Env) RunForget(ctx context.Context, policy ForgetPolicy, handle LineHandler) error {
if policy.Empty() {
return fmt.Errorf("restic forget: refusing to run with empty retention policy (would delete every snapshot)")
}
args := append([]string{"forget", "--json"}, policy.args()...)
cmd := exec.CommandContext(ctx, e.Bin, args...)
cmd.Env = e.envSlice()
cmd.Dir = e.WorkDir
// ForgetGroup is one (tag, retention-policy) pair fed to RunForget.
// The wrapper invokes `restic forget --tag <Tag> --keep-* …` per
// group so retention can be targeted at a single source-group's
// snapshots without disturbing snapshots tagged for other groups.
type ForgetGroup struct {
Tag string
Policy ForgetPolicy
}
stdout, err := cmd.StdoutPipe()
if err != nil {
return fmt.Errorf("restic forget: stdout pipe: %w", err)
// RunForget executes one `restic forget --tag <Tag> --keep-* …`
// invocation per group. Does NOT pass --prune — pruning lives behind
// a separate admin-only credential (see spec §4.3 / P2-06). Restic
// rewrites the snapshot index; the actual data deletion waits for
// the next prune. Empty groups slice is rejected (would be a no-op);
// any group with an empty policy is rejected (restic forget without
// any keep-* would delete every snapshot in the tagged set).
// Returns the first error encountered, or nil when every group runs
// to a clean exit.
func (e Env) RunForget(ctx context.Context, groups []ForgetGroup, handle LineHandler) error {
if len(groups) == 0 {
return fmt.Errorf("restic forget: refusing to run with no groups (would be a no-op)")
}
stderr, err := cmd.StderrPipe()
if err != nil {
return fmt.Errorf("restic forget: stderr pipe: %w", err)
}
if err := cmd.Start(); err != nil {
return fmt.Errorf("restic forget: start: %w", err)
}
done := make(chan error, 2)
go func() { done <- pumpPlain(stdout, "stdout", handle) }()
go func() { done <- pumpPlain(stderr, "stderr", handle) }()
for i := 0; i < 2; i++ {
if err := <-done; err != nil && handle != nil {
handle("event", fmt.Sprintf("pump error: %v", err), nil)
for _, g := range groups {
if g.Policy.Empty() {
return fmt.Errorf("restic forget: group %q has empty retention policy (would delete every snapshot)", g.Tag)
}
args := []string{"forget", "--json", "--tag", g.Tag}
args = append(args, g.Policy.args()...)
cmd := exec.CommandContext(ctx, e.Bin, args...)
cmd.Env = e.envSlice()
cmd.Dir = e.WorkDir
if err := runWithPump(cmd, handle); err != nil {
return err
}
}
if werr := cmd.Wait(); werr != nil {
return fmt.Errorf("restic forget: %w", werr)
}
return nil
}
@@ -243,19 +236,6 @@ func (e Env) RunInit(ctx context.Context, handle LineHandler) error {
cmd.Env = e.envSlice()
cmd.Dir = e.WorkDir
stdout, err := cmd.StdoutPipe()
if err != nil {
return fmt.Errorf("restic init: stdout pipe: %w", err)
}
stderr, err := cmd.StderrPipe()
if err != nil {
return fmt.Errorf("restic init: stderr pipe: %w", err)
}
if err := cmd.Start(); err != nil {
return fmt.Errorf("restic init: start: %w", err)
}
// Sniff for "config file already exists" on stderr; if we see it
// we'll treat the non-zero exit as a soft success — running init
// against an already-initialized repo is a no-op semantically,
@@ -271,26 +251,166 @@ func (e Env) RunInit(ctx context.Context, handle LineHandler) error {
}
}
done := make(chan error, 2)
go func() { done <- pumpPlain(stdout, "stdout", sniff) }()
go func() { done <- pumpPlain(stderr, "stderr", sniff) }()
for i := 0; i < 2; i++ {
if err := <-done; err != nil && handle != nil {
handle("event", fmt.Sprintf("pump error: %v", err), nil)
}
}
if werr := cmd.Wait(); werr != nil {
if err := runWithPump(cmd, sniff); err != nil {
if alreadyInited {
if handle != nil {
handle("event", "repo already initialized — treating as success", nil)
}
return nil
}
return fmt.Errorf("restic init: %w", werr)
return err
}
return nil
}
// RunPrune runs `restic prune` against the configured repo and
// streams its output to handle via runWithPump.
//
// Prune needs the *admin* credentials (delete access on the
// rest-server repo). This method does not look them up: the caller
// must have put the admin pair into Env.RepoUsername and
// Env.RepoPassword before calling.
//
// No --json flag is passed; prune's human-readable stdout/stderr is
// forwarded as-is so the live log doubles as the operator's progress
// bar. Returns whatever runWithPump returns.
func (e Env) RunPrune(ctx context.Context, handle LineHandler) error {
	prune := exec.CommandContext(ctx, e.Bin, "prune")
	prune.Dir = e.WorkDir
	prune.Env = e.envSlice()
	return runWithPump(prune, handle)
}
// runWithPump runs an already-configured command to completion while
// streaming its output: stdout and stderr are each fed through
// pumpPlain into handle on their own goroutine, then the command is
// waited on. Every error is prefixed with "restic <verb>" (taken from
// cmd.Args[1], e.g. "restic prune"), or plain "restic" when no verb
// is present. A pump failure is not fatal: it is reported to handle
// as an "event" line when handle is non-nil, and dropped otherwise.
func runWithPump(cmd *exec.Cmd, handle LineHandler) error {
	verb := "restic"
	if len(cmd.Args) > 1 {
		verb = "restic " + cmd.Args[1]
	}

	outPipe, err := cmd.StdoutPipe()
	if err != nil {
		return fmt.Errorf("%s: stdout pipe: %w", verb, err)
	}
	errPipe, err := cmd.StderrPipe()
	if err != nil {
		return fmt.Errorf("%s: stderr pipe: %w", verb, err)
	}
	if err = cmd.Start(); err != nil {
		return fmt.Errorf("%s: start: %w", verb, err)
	}

	// Collect both pump results before calling Wait: os/exec closes
	// the pipes once Wait sees the process exit, so reads must be
	// finished first.
	pumped := make(chan error, 2)
	go func() { pumped <- pumpPlain(outPipe, "stdout", handle) }()
	go func() { pumped <- pumpPlain(errPipe, "stderr", handle) }()
	for pending := cap(pumped); pending > 0; pending-- {
		pumpErr := <-pumped
		if pumpErr == nil || handle == nil {
			continue
		}
		handle("event", fmt.Sprintf("pump error: %v", pumpErr), nil)
	}

	if err = cmd.Wait(); err != nil {
		return fmt.Errorf("%s: %w", verb, err)
	}
	return nil
}
// RunUnlock runs `restic unlock` with this Env's binary, environment
// and working directory, streaming output to handle via runWithPump.
// It returns nil on a clean exit and runWithPump's wrapped error
// otherwise.
func (e Env) RunUnlock(ctx context.Context, handle LineHandler) error {
	unlock := exec.CommandContext(ctx, e.Bin, "unlock")
	unlock.Dir = e.WorkDir
	unlock.Env = e.envSlice()
	return runWithPump(unlock, handle)
}
// RepoStats mirrors `restic stats --json --mode raw-data` output.
// Only the keys named in the tags below are decoded; any other key in
// restic's JSON is ignored by encoding/json, and a key that is absent
// leaves its field at zero.
// NOTE(review): total_file_count may not be emitted by restic in
// raw-data mode, in which case TotalFileCount is always 0 here —
// confirm against the restic stats documentation before relying on it.
type RepoStats struct {
	TotalSize         int64 `json:"total_size"`
	TotalUncompressed int64 `json:"total_uncompressed_size"`
	SnapshotsCount    int64 `json:"snapshots_count"`
	TotalFileCount    int64 `json:"total_file_count"`
	TotalBlobCount    int64 `json:"total_blob_count"`
}
// RunStats runs `restic stats --json --mode raw-data` and decodes the
// JSON object restic prints on stdout into a RepoStats.
//
// Every output line is still forwarded to handle (when non-nil), so
// the caller can log the raw output. A stdout line is treated as the
// result when it starts with "{" and decodes cleanly; if several
// lines qualify, the last one wins. Returns runWithPump's error if
// the command fails, or an error when no JSON-shaped stdout line was
// seen.
func (e Env) RunStats(ctx context.Context, handle LineHandler) (*RepoStats, error) {
	cmd := exec.CommandContext(ctx, e.Bin, "stats", "--json", "--mode", "raw-data")
	cmd.Dir = e.WorkDir
	cmd.Env = e.envSlice()

	var parsed *RepoStats
	tee := func(stream, line string, ev any) {
		looksLikeJSON := stream == "stdout" && strings.HasPrefix(line, "{")
		if looksLikeJSON {
			// Decode into a fresh value each time so a failed parse
			// can never leak partial data into the result.
			decoded := new(RepoStats)
			if err := json.Unmarshal([]byte(line), decoded); err == nil {
				parsed = decoded
			}
		}
		if handle != nil {
			handle(stream, line, ev)
		}
	}

	if err := runWithPump(cmd, tee); err != nil {
		return nil, err
	}
	if parsed == nil {
		return nil, fmt.Errorf("restic stats: no JSON in output")
	}
	return parsed, nil
}
// CheckResult summarizes a `restic check` invocation. LockPresent is
// true if the stderr stream contained a stale-lock signal (caller is
// expected to surface this in the UI so the operator can run unlock).
// ErrorsFound is true if check exited with a non-zero status (errors
// detected in repo metadata).
//
// The two flags are set independently by RunCheck: LockPresent comes
// from matching "stale lock" / "already locked" in stderr lines, and
// ErrorsFound from the process exit status alone. A run that exits
// non-zero after printing a lock message therefore reports both.
type CheckResult struct {
	LockPresent bool
	ErrorsFound bool
}
// RunCheck executes `restic check`, optionally verifying a sample of
// the pack data.
//
// subsetPct <= 0 passes no extra flag, so restic performs its default
// check and does not read back pack data; subsetPct > 0 passes
// `--read-data-subset N%`.
//
// Output is forwarded to handle (when non-nil). Stderr lines are also
// sniffed for "stale lock" / "already locked" to set LockPresent.
//
// Return contract:
//   - clean exit: (result, nil);
//   - non-zero exit: (result with ErrorsFound=true, nil), so the
//     caller can persist last_check_status='errors_found';
//   - ctx cancelled or timed out: (result, error wrapping ctx.Err()).
//     The killed process is NOT reported as ErrorsFound, since an
//     interrupted check says nothing about the repo's health;
//   - any other failure (binary missing, pipe setup, …): (result, err).
//
// The result is populated with whatever was sniffed even when an
// error is returned.
func (e Env) RunCheck(ctx context.Context, subsetPct int, handle LineHandler) (CheckResult, error) {
	args := []string{"check"}
	if subsetPct > 0 {
		args = append(args, "--read-data-subset", fmt.Sprintf("%d%%", subsetPct))
	}
	cmd := exec.CommandContext(ctx, e.Bin, args...)
	cmd.Env = e.envSlice()
	cmd.Dir = e.WorkDir

	var res CheckResult
	sniff := func(stream, line string, ev any) {
		if stream == "stderr" {
			if strings.Contains(line, "stale lock") || strings.Contains(line, "already locked") {
				res.LockPresent = true
			}
		}
		if handle != nil {
			handle(stream, line, ev)
		}
	}

	err := runWithPump(cmd, sniff)
	if err == nil {
		return res, nil
	}

	// A cancelled context makes os/exec kill the child, which surfaces
	// as an *exec.ExitError just like a real check failure. Detect it
	// first so an aborted run is never recorded as repo corruption.
	if cerr := ctx.Err(); cerr != nil {
		if errors.Is(err, cerr) {
			return res, err
		}
		return res, fmt.Errorf("%v: %w", err, cerr)
	}

	// restic check exits non-zero when it finds problems; that is a
	// CheckResult, not a wrapper failure. The error return is reserved
	// for invocations that are actually broken (binary missing, etc).
	var ee *exec.ExitError
	if errors.As(err, &ee) {
		res.ErrorsFound = true
		return res, nil
	}
	return res, err
}
func pumpPlain(r io.Reader, stream string, handle LineHandler) error {
scanner := bufio.NewScanner(r)
scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)
+193
View File
@@ -0,0 +1,193 @@
package restic
import (
"context"
"fmt"
"os"
"path/filepath"
"strings"
"testing"
)
// setupScriptBin creates an executable stand-in for the restic binary
// and returns its path. The file is named "restic", lives in a
// per-test temp directory, and contains a "#!/bin/sh" line followed
// by scriptBody and a trailing newline (callers pass the body only).
//
// The script is first written to "<path>.tmp" and then renamed into
// place rather than written directly — see the matching helper in
// internal/agent/runner/runner_test.go for the ETXTBSY race rationale.
// Any filesystem error fails the test immediately.
func setupScriptBin(t *testing.T, scriptBody string) string {
	t.Helper()

	target := filepath.Join(t.TempDir(), "restic")
	staging := target + ".tmp"
	script := []byte("#!/bin/sh\n" + scriptBody + "\n")

	if err := os.WriteFile(staging, script, 0o755); err != nil {
		t.Fatalf("setupScriptBin: write tmp: %v", err)
	}
	if err := os.Rename(staging, target); err != nil {
		t.Fatalf("setupScriptBin: rename: %v", err)
	}
	return target
}
// captureLines builds a recording LineHandler for tests. It returns a
// pointer to the slice being filled and the handler itself; each call
// to the handler appends one "stream:line" entry. Appends are not
// synchronized, so this is only safe for single-goroutine test use.
func captureLines() (*[]string, LineHandler) {
	collected := new([]string)
	record := func(stream, line string, _ any) {
		*collected = append(*collected, stream+":"+line)
	}
	return collected, record
}
// --- B1: RunPrune + B2: RunCheck ---
func TestRunPruneInvokesPrune(t *testing.T) {
// Shell script that echoes its args; "prune" should appear in output.
bin := setupScriptBin(t, `echo "$@"`)
env := Env{Bin: bin}
lines, h := captureLines()
if err := env.RunPrune(context.Background(), h); err != nil {
t.Fatalf("RunPrune returned error: %v", err)
}
for _, l := range *lines {
if strings.Contains(l, "prune") {
return
}
}
t.Fatalf("expected 'prune' in captured output; got: %v", *lines)
}
// --- B2: RunCheck ---
func TestRunCheckLockSniff(t *testing.T) {
cases := []struct {
name string
stderrLine string
wantLocked bool
}{
{"stale lock", "Found stale lock from PID 1234", true},
{"already locked", "repository is already locked exclusively", true},
{"benign mention", "subdir/locked-file ok", false},
{"empty", "", false},
}
for _, c := range cases {
t.Run(c.name, func(t *testing.T) {
// Script emits the line on stderr, then exits 0.
script := fmt.Sprintf(`printf '%%s\n' %q >&2`, c.stderrLine)
bin := setupScriptBin(t, script)
env := Env{Bin: bin}
res, err := env.RunCheck(context.Background(), 0, nil)
if err != nil {
t.Fatalf("RunCheck returned unexpected error: %v", err)
}
if res.LockPresent != c.wantLocked {
t.Fatalf("LockPresent: got %v, want %v (line: %q)", res.LockPresent, c.wantLocked, c.stderrLine)
}
if res.ErrorsFound {
t.Fatal("expected ErrorsFound=false")
}
})
}
}
func TestRunCheckErrorsFoundOnExit1(t *testing.T) {
bin := setupScriptBin(t, `exit 1`)
env := Env{Bin: bin}
res, err := env.RunCheck(context.Background(), 0, nil)
if err != nil {
t.Fatalf("RunCheck returned unexpected error (should have absorbed exit 1): %v", err)
}
if !res.ErrorsFound {
t.Fatal("expected ErrorsFound=true for exit 1")
}
}
func TestRunCheckSubsetArg(t *testing.T) {
bin := setupScriptBin(t, `echo "$@"`)
env := Env{Bin: bin}
lines, h := captureLines()
if _, err := env.RunCheck(context.Background(), 25, h); err != nil {
t.Fatalf("RunCheck: %v", err)
}
want := "--read-data-subset 25%"
for _, l := range *lines {
if strings.Contains(l, want) {
return
}
}
t.Fatalf("expected %q in captured output; got: %v", want, *lines)
}
// --- B3: RunUnlock + RunStats ---
func TestRunUnlockInvokesUnlock(t *testing.T) {
bin := setupScriptBin(t, `echo "$@"`)
env := Env{Bin: bin}
lines, h := captureLines()
if err := env.RunUnlock(context.Background(), h); err != nil {
t.Fatalf("RunUnlock: %v", err)
}
for _, l := range *lines {
if strings.Contains(l, "unlock") {
return
}
}
t.Fatalf("expected 'unlock' in captured output; got: %v", *lines)
}
func TestRunStatsParsesJSON(t *testing.T) {
bin := setupScriptBin(t, `echo '{"total_size":1234,"total_uncompressed_size":5678,"snapshots_count":3,"total_file_count":100,"total_blob_count":50}'`)
env := Env{Bin: bin}
stats, err := env.RunStats(context.Background(), nil)
if err != nil {
t.Fatalf("RunStats: %v", err)
}
if stats.TotalSize != 1234 {
t.Fatalf("TotalSize: got %d, want 1234", stats.TotalSize)
}
if stats.TotalUncompressed != 5678 {
t.Fatalf("TotalUncompressed: got %d, want 5678", stats.TotalUncompressed)
}
if stats.SnapshotsCount != 3 {
t.Fatalf("SnapshotsCount: got %d, want 3", stats.SnapshotsCount)
}
if stats.TotalFileCount != 100 {
t.Fatalf("TotalFileCount: got %d, want 100", stats.TotalFileCount)
}
if stats.TotalBlobCount != 50 {
t.Fatalf("TotalBlobCount: got %d, want 50", stats.TotalBlobCount)
}
}
func TestRunStatsErrorsWithoutJSON(t *testing.T) {
bin := setupScriptBin(t, `echo "no json here"`)
env := Env{Bin: bin}
_, err := env.RunStats(context.Background(), nil)
if err == nil {
t.Fatal("expected error when no JSON in output")
}
if !strings.Contains(err.Error(), "no JSON in output") {
t.Fatalf("unexpected error: %v", err)
}
}
func TestRunStatsZeroSnapshots(t *testing.T) {
// Confirms RunStats succeeds and returns a valid *RepoStats when the
// repo has no snapshots (snapshots_count=0). A regression that
// re-added a "SnapshotsCount > 0" guard would return an error here.
bin := setupScriptBin(t, `echo '{"total_size":0,"total_uncompressed_size":0,"snapshots_count":0,"total_file_count":0,"total_blob_count":0}'`)
env := Env{Bin: bin}
stats, err := env.RunStats(context.Background(), nil)
if err != nil {
t.Fatalf("RunStats with zero snapshots returned unexpected error: %v", err)
}
if stats == nil {
t.Fatal("expected non-nil *RepoStats, got nil")
}
if stats.SnapshotsCount != 0 {
t.Fatalf("SnapshotsCount: got %d, want 0", stats.SnapshotsCount)
}
}
+1 -1
View File
@@ -167,7 +167,7 @@ func (s *Server) handleAgentEnroll(w stdhttp.ResponseWriter, r *stdhttp.Request)
// /api/hosts/{id}/repo-credentials. Failing the whole enrolment
// here would leave a half-burned token + an orphan host.
if encForHost != "" {
if err := s.deps.Store.SetHostCredentials(r.Context(), hostID, encForHost); err != nil {
if err := s.deps.Store.SetHostCredentials(r.Context(), hostID, store.CredKindRepo, encForHost); err != nil {
slog.Error("enrollment: set host credentials failed",
"host_id", hostID, "err", err)
}
+217 -6
View File
@@ -4,6 +4,7 @@ import (
"context"
"encoding/json"
"errors"
"fmt"
"log/slog"
stdhttp "net/http"
"time"
@@ -39,7 +40,7 @@ func (s *Server) handleGetHostCredentials(w stdhttp.ResponseWriter, r *stdhttp.R
writeJSONError(w, stdhttp.StatusBadRequest, "missing_id", "")
return
}
enc, err := s.deps.Store.GetHostCredentials(r.Context(), hostID)
enc, err := s.deps.Store.GetHostCredentials(r.Context(), hostID, store.CredKindRepo)
if err != nil {
if errors.Is(err, store.ErrNotFound) {
writeJSONError(w, stdhttp.StatusNotFound, "not_set", "")
@@ -85,7 +86,8 @@ type hostRepoCredsRequest struct {
// preserved. Re-encrypts under host_id and pushes a config.update
// over the WS if the agent is connected.
func (s *Server) handleSetHostCredentials(w stdhttp.ResponseWriter, r *stdhttp.Request) {
if !s.authedUser(r) {
user, ok := s.requireUser(r)
if !ok {
writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorized", "")
return
}
@@ -107,7 +109,7 @@ func (s *Server) handleSetHostCredentials(w stdhttp.ResponseWriter, r *stdhttp.R
// Merge with the existing row, if any.
existing := repoCredsBlob{}
if cur, err := s.deps.Store.GetHostCredentials(r.Context(), hostID); err == nil {
if cur, err := s.deps.Store.GetHostCredentials(r.Context(), hostID, store.CredKindRepo); err == nil {
plain, err := s.deps.AEAD.Decrypt(cur, []byte("host:"+hostID))
if err != nil {
writeJSONError(w, stdhttp.StatusInternalServerError, "decrypt_failed", "")
@@ -139,13 +141,14 @@ func (s *Server) handleSetHostCredentials(w stdhttp.ResponseWriter, r *stdhttp.R
writeJSONError(w, stdhttp.StatusInternalServerError, "internal", "")
return
}
if err := s.deps.Store.SetHostCredentials(r.Context(), hostID, enc); err != nil {
if err := s.deps.Store.SetHostCredentials(r.Context(), hostID, store.CredKindRepo, enc); err != nil {
writeJSONError(w, stdhttp.StatusInternalServerError, "internal", "")
return
}
_ = s.deps.Store.AppendAudit(r.Context(), store.AuditEntry{
ID: ulid.Make().String(),
UserID: &user.ID,
Actor: "user",
Action: "host.repo_credentials_set",
TargetKind: ptr("host"),
@@ -184,6 +187,209 @@ func (s *Server) pushRepoCredsToAgent(ctx context.Context, hostID string, blob r
return nil
}
// handleGetAdminCredentials serves a redacted view of the host's
// admin-slot credentials: the repo URL and username are returned as
// stored, while the password is reduced to a HasPassword flag.
//
// Responses:
//   - 401 "unauthorized" when the request has no authenticated user.
//   - 400 "missing_id" when the {id} URL parameter is empty.
//   - 404 "not_set" when no admin credentials row exists for the host.
//   - 500 "decrypt_failed" when the stored blob does not decrypt under
//     the admin AAD ("host:<id>:admin").
//   - 500 "internal" for store errors or an unparseable blob.
//   - 200 with the redacted view otherwise.
func (s *Server) handleGetAdminCredentials(w stdhttp.ResponseWriter, r *stdhttp.Request) {
	if !s.authedUser(r) {
		writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorized", "")
		return
	}
	hostID := chi.URLParam(r, "id")
	if hostID == "" {
		writeJSONError(w, stdhttp.StatusBadRequest, "missing_id", "")
		return
	}

	sealed, err := s.deps.Store.GetHostCredentials(r.Context(), hostID, store.CredKindAdmin)
	switch {
	case errors.Is(err, store.ErrNotFound):
		writeJSONError(w, stdhttp.StatusNotFound, "not_set", "")
		return
	case err != nil:
		writeJSONError(w, stdhttp.StatusInternalServerError, "internal", "")
		return
	}

	// The admin slot is sealed under its own AAD, distinct from the
	// repo slot's "host:<id>".
	adminAAD := []byte("host:" + hostID + ":admin")
	plain, err := s.deps.AEAD.Decrypt(sealed, adminAAD)
	if err != nil {
		writeJSONError(w, stdhttp.StatusInternalServerError, "decrypt_failed", "")
		return
	}

	var creds repoCredsBlob
	if json.Unmarshal(plain, &creds) != nil {
		writeJSONError(w, stdhttp.StatusInternalServerError, "internal", "")
		return
	}

	view := hostRepoCredsView{
		RepoURL:      creds.RepoURL,
		RepoUsername: creds.RepoUsername,
		HasPassword:  creds.RepoPassword != "",
	}
	writeJSON(w, stdhttp.StatusOK, view)
}
// handleSetAdminCredentials creates or updates a host's admin
// credentials (the prune-capable slot, store.CredKindAdmin). It uses
// merge-then-validate semantics: request fields that are present
// overwrite the stored value, absent fields keep what was stored, and
// the merged result must end up with a non-empty repo URL and
// password.
//
// If a stored blob decrypts but cannot be parsed, it is logged and
// treated as absent so that no half-decoded data is merged into the
// new row; the request must then supply every required field itself.
//
// After persisting and writing an audit entry, the new credentials are
// pushed to the agent as a config.update with Slot:"admin" when the
// agent is connected. A failed push is logged but does not fail the
// request — the prune path pushes again on demand.
//
// Responses:
//   - 401 "unauthorized" without an authenticated user.
//   - 400 "missing_id", "invalid_json" or "missing_field".
//   - 404 "host_not_found" when the host lookup fails.
//   - 500 "decrypt_failed" or "internal".
//   - 204 on success.
func (s *Server) handleSetAdminCredentials(w stdhttp.ResponseWriter, r *stdhttp.Request) {
	user, ok := s.requireUser(r)
	if !ok {
		writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorized", "")
		return
	}
	hostID := chi.URLParam(r, "id")
	if hostID == "" {
		writeJSONError(w, stdhttp.StatusBadRequest, "missing_id", "")
		return
	}
	if _, err := s.deps.Store.GetHost(r.Context(), hostID); err != nil {
		writeJSONError(w, stdhttp.StatusNotFound, "host_not_found", "")
		return
	}

	var req hostRepoCredsRequest
	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
		writeJSONError(w, stdhttp.StatusBadRequest, "invalid_json", err.Error())
		return
	}

	// Merge with the existing admin row, if any.
	existing := repoCredsBlob{}
	aad := []byte("host:" + hostID + ":admin")
	if cur, err := s.deps.Store.GetHostCredentials(r.Context(), hostID, store.CredKindAdmin); err == nil {
		plain, err := s.deps.AEAD.Decrypt(cur, aad)
		if err != nil {
			writeJSONError(w, stdhttp.StatusInternalServerError, "decrypt_failed", "")
			return
		}
		if err := json.Unmarshal(plain, &existing); err != nil {
			// json.Unmarshal may have filled some fields before it
			// failed; start from a clean slate instead of merging a
			// partially decoded blob.
			slog.Warn("admin credentials: stored blob unparseable, ignoring it for merge",
				"host_id", hostID, "err", err)
			existing = repoCredsBlob{}
		}
	} else if !errors.Is(err, store.ErrNotFound) {
		writeJSONError(w, stdhttp.StatusInternalServerError, "internal", "")
		return
	}

	if req.RepoURL != nil {
		existing.RepoURL = *req.RepoURL
	}
	if req.RepoUsername != nil {
		existing.RepoUsername = *req.RepoUsername
	}
	if req.RepoPassword != nil {
		existing.RepoPassword = *req.RepoPassword
	}
	if existing.RepoURL == "" || existing.RepoPassword == "" {
		writeJSONError(w, stdhttp.StatusBadRequest, "missing_field",
			"repo_url and repo_password must end up non-empty")
		return
	}

	enc, err := s.encryptRepoCreds(existing, aad)
	if err != nil {
		writeJSONError(w, stdhttp.StatusInternalServerError, "internal", "")
		return
	}
	if err := s.deps.Store.SetHostCredentials(r.Context(), hostID, store.CredKindAdmin, enc); err != nil {
		writeJSONError(w, stdhttp.StatusInternalServerError, "internal", "")
		return
	}

	// Audit write is best-effort, matching the other credential
	// handlers: the credentials are already persisted at this point.
	_ = s.deps.Store.AppendAudit(r.Context(), store.AuditEntry{
		ID:         ulid.Make().String(),
		UserID:     &user.ID,
		Actor:      "user",
		Action:     "host.admin_credentials_set",
		TargetKind: ptr("host"),
		TargetID:   &hostID,
		TS:         nowUTC(),
	})

	// Push to the agent if it's connected. Non-fatal: the next
	// handleRunRepoPrune call will push on-demand. Log the failure so
	// a stale agent-side admin slot can be diagnosed.
	if s.deps.Hub != nil && s.deps.Hub.Connected(hostID) {
		if err := s.pushAdminCredsToAgent(r.Context(), hostID); err != nil {
			slog.Warn("admin credentials: push to agent failed",
				"host_id", hostID, "err", err)
		}
	}

	w.WriteHeader(stdhttp.StatusNoContent)
}
// handleDeleteAdminCredentials drops the host's admin-slot credentials
// row and records an audit entry for the acting user.
//
// Responses:
//   - 401 "unauthorized" without an authenticated user.
//   - 400 "missing_id" when the {id} URL parameter is empty.
//   - 404 "not_set" when there is no admin row to delete.
//   - 500 "internal" on store errors.
//   - 204 on success.
//
// Nothing is sent to the agent from here, so whatever the agent holds
// locally for its admin slot is not touched by this call.
func (s *Server) handleDeleteAdminCredentials(w stdhttp.ResponseWriter, r *stdhttp.Request) {
	user, ok := s.requireUser(r)
	if !ok {
		writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorized", "")
		return
	}
	hostID := chi.URLParam(r, "id")
	if hostID == "" {
		writeJSONError(w, stdhttp.StatusBadRequest, "missing_id", "")
		return
	}

	ctx := r.Context()

	// Probe for the row up front so a missing row yields a clean 404
	// rather than whatever the delete itself would report.
	_, err := s.deps.Store.GetHostCredentials(ctx, hostID, store.CredKindAdmin)
	switch {
	case errors.Is(err, store.ErrNotFound):
		writeJSONError(w, stdhttp.StatusNotFound, "not_set", "")
		return
	case err != nil:
		writeJSONError(w, stdhttp.StatusInternalServerError, "internal", "")
		return
	}

	if err := s.deps.Store.DeleteHostCredentials(ctx, hostID, store.CredKindAdmin); err != nil {
		writeJSONError(w, stdhttp.StatusInternalServerError, "internal", "")
		return
	}

	// Audit write is best-effort: its error is deliberately dropped
	// because the row has already been removed.
	entry := store.AuditEntry{
		ID:         ulid.Make().String(),
		UserID:     &user.ID,
		Actor:      "user",
		Action:     "host.admin_credentials_deleted",
		TargetKind: ptr("host"),
		TargetID:   &hostID,
		TS:         nowUTC(),
	}
	_ = s.deps.Store.AppendAudit(ctx, entry)

	w.WriteHeader(stdhttp.StatusNoContent)
}
// pushAdminCredsToAgent loads the host's admin-slot credentials,
// decrypts them under the admin AAD ("host:<id>:admin") and sends them
// to the agent as a config.update with Slot:"admin". The send is
// bounded by a 5-second timeout derived from ctx.
//
// Callers named in this file:
//   - handleSetAdminCredentials (immediate push when operator saves).
//   - handleRunRepoPrune (on-demand push right before a prune dispatch).
//
// The store error is returned as-is, so a missing admin row surfaces
// as store.ErrNotFound; decrypt and parse failures are wrapped with a
// "push admin creds:" prefix. The Hub is used unconditionally —
// callers are expected to have one configured.
func (s *Server) pushAdminCredsToAgent(ctx context.Context, hostID string) error {
	sealed, err := s.deps.Store.GetHostCredentials(ctx, hostID, store.CredKindAdmin)
	if err != nil {
		return err // ErrNotFound bubbles
	}

	adminAAD := []byte("host:" + hostID + ":admin")
	plain, err := s.deps.AEAD.Decrypt(sealed, adminAAD)
	if err != nil {
		return fmt.Errorf("push admin creds: decrypt: %w", err)
	}

	creds := repoCredsBlob{}
	if err = json.Unmarshal(plain, &creds); err != nil {
		return fmt.Errorf("push admin creds: parse: %w", err)
	}

	payload := api.ConfigUpdatePayload{
		Slot:         "admin",
		RepoURL:      creds.RepoURL,
		RepoUsername: creds.RepoUsername,
		RepoPassword: creds.RepoPassword,
	}
	env, err := api.Marshal(api.MsgConfigUpdate, "", payload)
	if err != nil {
		return err
	}

	sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
	defer cancel()
	return s.deps.Hub.Send(sendCtx, hostID, env)
}
// onAgentHello runs synchronously inside the WS handler immediately
// after a successful hello. It loads the host's encrypted creds (if
// any), decrypts, and ships them down the conn as a config.update so
@@ -205,6 +411,11 @@ func (s *Server) onAgentHello(ctx context.Context, hostID string, conn *ws.Conn)
// just no-ops. Skipped silently when the host has no creds yet —
// the next hello after the operator binds creds will dispatch.
s.maybeAutoInit(ctx, hostID, conn)
// Drain any pending runs that accumulated while this host was
// offline. Use a fresh context — the hello-bound ctx is short-lived,
// and the drain may take seconds across many rows. A non-blocking
// goroutine keeps the hello path snappy.
go s.DrainPending(context.Background(), hostID)
}
// maybeAutoInit dispatches a `restic init` job iff the host has no
@@ -212,7 +423,7 @@ func (s *Server) onAgentHello(ctx context.Context, hostID string, conn *ws.Conn)
// them the runner can't talk to the repo). We rely on Restic's
// idempotent init for re-runs.
func (s *Server) maybeAutoInit(ctx context.Context, hostID string, conn *ws.Conn) {
if _, err := s.deps.Store.GetHostCredentials(ctx, hostID); err != nil {
if _, err := s.deps.Store.GetHostCredentials(ctx, hostID, store.CredKindRepo); err != nil {
// No creds bound yet — operator hasn't supplied them. The next
// hello after creds land will pick this up.
return
@@ -266,7 +477,7 @@ func (s *Server) maybeAutoInit(ctx context.Context, hostID string, conn *ws.Conn
// credentials. Silent no-op when the host has nothing on file
// (the operator hasn't bound creds to it yet).
func (s *Server) pushRepoCredsOnHello(ctx context.Context, hostID string, conn *ws.Conn) {
enc, err := s.deps.Store.GetHostCredentials(ctx, hostID)
enc, err := s.deps.Store.GetHostCredentials(ctx, hostID, store.CredKindRepo)
if err != nil {
if !errors.Is(err, store.ErrNotFound) {
slog.Warn("on-hello: load host creds", "host_id", hostID, "err", err)
+265 -2
View File
@@ -5,6 +5,9 @@ import (
"encoding/json"
"testing"
"time"
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
)
// TestEnrollmentTransfersRepoCreds verifies the round-trip:
@@ -57,12 +60,12 @@ func TestEnrollmentTransfersRepoCreds(t *testing.T) {
hostID, "host42", "linux", "amd64", "2026-01-01T00:00:00Z"); err != nil {
t.Fatalf("insert host: %v", err)
}
if err := st.SetHostCredentials(ctx, hostID, encForHost); err != nil {
if err := st.SetHostCredentials(ctx, hostID, store.CredKindRepo, encForHost); err != nil {
t.Fatalf("set host credentials: %v", err)
}
// host_credentials row should now hold the host-bound ciphertext.
got, err := st.GetHostCredentials(ctx, hostID)
got, err := st.GetHostCredentials(ctx, hostID, store.CredKindRepo)
if err != nil {
t.Fatalf("get host creds: %v", err)
}
@@ -105,3 +108,263 @@ func TestEnrollmentTokenWithoutCreds(t *testing.T) {
t.Errorf("token without creds should return empty blob; got %q", att.EncRepoCreds)
}
}
// ----- admin credentials tests ----------------------------------------
// TestAdminCredentialsRoundTrip walks the admin-credentials endpoints
// end to end: GET before anything is set (404), PUT (204), GET showing
// the redacted view (200), DELETE (204), and a final GET (404).
func TestAdminCredentialsRoundTrip(t *testing.T) {
	t.Parallel()
	// The server handle itself is not needed here; only its URL and
	// backing store are.
	_, url, st := newTestServerWithHub(t)
	cookie := loginAsAdmin(t, st)
	hostID := makeHost(t, st, "admin-creds-host")

	// Mark init done so auto-init doesn't interfere. A failure here
	// would silently invalidate that precondition, so abort instead of
	// ignoring it.
	if err := st.CreateJob(context.Background(), store.Job{
		ID:        "init-" + hostID,
		HostID:    hostID,
		Kind:      string(api.JobInit),
		ActorKind: "system",
		CreatedAt: time.Now().UTC(),
	}); err != nil {
		t.Fatalf("seed init job: %v", err)
	}

	// GET before set → 404.
	status, body := doJSON(t, url, "GET", "/api/hosts/"+hostID+"/admin-credentials", nil, cookie)
	if status != 404 {
		t.Fatalf("before set: want 404, got %d body=%+v", status, body)
	}

	// PUT — set admin creds.
	status, body = doJSON(t, url, "PUT", "/api/hosts/"+hostID+"/admin-credentials",
		map[string]any{
			"repo_url":      "rest:http://admin.example/host",
			"repo_username": "admin",
			"repo_password": "s3cur3",
		}, cookie)
	if status != 204 {
		t.Fatalf("set: want 204, got %d body=%+v", status, body)
	}

	// GET — should return redacted view.
	status, body = doJSON(t, url, "GET", "/api/hosts/"+hostID+"/admin-credentials", nil, cookie)
	if status != 200 {
		t.Fatalf("get after set: want 200, got %d body=%+v", status, body)
	}
	if body["repo_url"] != "rest:http://admin.example/host" {
		t.Errorf("repo_url: %+v", body)
	}
	if body["repo_username"] != "admin" {
		t.Errorf("repo_username: %+v", body)
	}
	if body["has_password"] != true {
		t.Errorf("has_password: %+v", body)
	}

	// DELETE.
	status, _ = doJSON(t, url, "DELETE", "/api/hosts/"+hostID+"/admin-credentials", nil, cookie)
	if status != 204 {
		t.Fatalf("delete: want 204, got %d", status)
	}

	// GET after delete → 404.
	status, _ = doJSON(t, url, "GET", "/api/hosts/"+hostID+"/admin-credentials", nil, cookie)
	if status != 404 {
		t.Fatalf("after delete: want 404, got %d", status)
	}
}
// TestAdminCredsAADIsolatedFromRepo demonstrates that the admin slot
// is bound to its own AAD. A blob sealed under the repo AAD
// ("host:<id>") is planted directly in the admin kind slot; the GET
// handler then tries to open it with the admin AAD and must answer
// 500 with code "decrypt_failed".
func TestAdminCredsAADIsolatedFromRepo(t *testing.T) {
	t.Parallel()
	srv, url, st := newTestServerWithHub(t)
	cookie := loginAsAdmin(t, st)
	hostID := makeHost(t, st, "aad-isolation-host")

	// Seal with the repo-slot AAD — deliberately the wrong one for an
	// admin row.
	repoAAD := []byte("host:" + hostID)
	blob := repoCredsBlob{
		RepoURL:      "rest:http://r/x",
		RepoPassword: "p",
	}
	sealed, err := srv.encryptRepoCreds(blob, repoAAD)
	if err != nil {
		t.Fatalf("encrypt: %v", err)
	}

	// Bypass the HTTP layer and store it straight into the admin slot.
	err = st.SetHostCredentials(context.Background(), hostID, store.CredKindAdmin, sealed)
	if err != nil {
		t.Fatalf("set host credentials: %v", err)
	}

	status, body := doJSON(t, url, "GET", "/api/hosts/"+hostID+"/admin-credentials", nil, cookie)
	if status != 500 {
		t.Fatalf("want 500 (decrypt_failed), got %d body=%+v", status, body)
	}
	code, _ := body["code"].(string)
	if code != "decrypt_failed" {
		t.Errorf("want code=decrypt_failed, got %+v", body)
	}
}
// TestAdminCredsPushOnSet attaches a fake agent over the WebSocket,
// saves admin credentials through the PUT endpoint, and then reads
// envelopes from the agent side until a config.update carrying
// Slot:"admin" and the expected repo URL shows up (or three seconds
// elapse).
func TestAdminCredsPushOnSet(t *testing.T) {
	t.Parallel()
	srv, ts, st := rawTestServer(t)
	hostID, token := enrolHostForWS(t, srv, st, "admin-push-host")
	cookie := loginAsAdmin(t, st)

	conn := agentDial(t, srv, ts, hostID, token)
	sendHello(t, conn, "admin-push-host")

	// Swallow the on-hello burst (config.update for repo + schedule.set
	// + possibly command.run(init)) before triggering the push.
	_ = drainUntil(t, conn, api.MsgScheduleSet)

	// Saving admin creds should push them to the connected agent.
	putBody := map[string]any{
		"repo_url":      "rest:http://admin.example/h",
		"repo_username": "admin",
		"repo_password": "prune-pass",
	}
	status, body := doJSON(t, ts.URL, "PUT", "/api/hosts/"+hostID+"/admin-credentials", putBody, cookie)
	if status != 204 {
		t.Fatalf("set admin creds: want 204, got %d body=%+v", status, body)
	}

	// Skip everything that is not the admin-slot config.update.
	sawAdminPush := false
	for stopAt := time.Now().Add(3 * time.Second); !sawAdminPush && time.Now().Before(stopAt); {
		env := readEnvelope(t, conn)
		if env.Type != api.MsgConfigUpdate {
			continue
		}
		var update api.ConfigUpdatePayload
		if err := env.UnmarshalPayload(&update); err != nil {
			t.Fatalf("unmarshal config.update: %v", err)
		}
		if update.Slot != "admin" {
			continue
		}
		sawAdminPush = true
		if update.RepoURL != "rest:http://admin.example/h" {
			t.Errorf("admin push: wrong URL %q", update.RepoURL)
		}
	}
	if !sawAdminPush {
		t.Fatal("timed out waiting for config.update(slot=admin)")
	}
}
// TestDeleteAdminCredentialsAuditLogged sets and then deletes admin
// credentials over HTTP, and verifies that audit_log contains a
// 'host.admin_credentials_deleted' row for the host whose user_id is
// the ID of the session that performed the delete.
func TestDeleteAdminCredentialsAuditLogged(t *testing.T) {
	t.Parallel()
	_, url, st := newTestServerWithHub(t)
	cookie, userID := loginAsAdminWithID(t, st)
	hostID := makeHost(t, st, "audit-del-host")
	endpoint := "/api/hosts/" + hostID + "/admin-credentials"

	// There has to be a row before it can be deleted.
	status, body := doJSON(t, url, "PUT", endpoint,
		map[string]any{
			"repo_url":      "rest:http://x/h",
			"repo_password": "p",
		}, cookie)
	if status != 204 {
		t.Fatalf("set: want 204, got %d body=%+v", status, body)
	}

	status, _ = doJSON(t, url, "DELETE", endpoint, nil, cookie)
	if status != 204 {
		t.Fatalf("delete: want 204, got %d", status)
	}

	// Look up the delete's audit rows and check who they are attributed to.
	rows, err := st.DB().QueryContext(context.Background(),
		`SELECT action, user_id FROM audit_log WHERE target_id = ? AND target_kind = 'host' AND action = 'host.admin_credentials_deleted'`,
		hostID)
	if err != nil {
		t.Fatalf("query audit: %v", err)
	}
	defer rows.Close()

	matched := 0
	for rows.Next() {
		var (
			action    string
			gotUserID *string // nullable column
		)
		if err := rows.Scan(&action, &gotUserID); err != nil {
			t.Fatalf("scan: %v", err)
		}
		matched++
		switch {
		case gotUserID == nil:
			t.Error("audit row: user_id is NULL, want non-nil")
		case *gotUserID != userID:
			t.Errorf("audit row: user_id=%q, want %q", *gotUserID, userID)
		}
	}
	if err := rows.Err(); err != nil {
		t.Fatalf("rows: %v", err)
	}
	if matched == 0 {
		t.Error("audit row with action='host.admin_credentials_deleted' not found")
	}
}
// TestSetAdminCredentialsAuditCarriesUserID saves admin credentials via
// PUT /api/hosts/{id}/admin-credentials and verifies that audit_log
// contains a 'host.admin_credentials_set' row for the host whose
// user_id is the ID of the session that made the request.
func TestSetAdminCredentialsAuditCarriesUserID(t *testing.T) {
	t.Parallel()
	_, url, st := newTestServerWithHub(t)
	cookie, userID := loginAsAdminWithID(t, st)
	hostID := makeHost(t, st, "audit-set-admin-host")

	putBody := map[string]any{
		"repo_url":      "rest:http://admin.example/h",
		"repo_password": "s3cr3t",
	}
	status, body := doJSON(t, url, "PUT", "/api/hosts/"+hostID+"/admin-credentials", putBody, cookie)
	if status != 204 {
		t.Fatalf("set: want 204, got %d body=%+v", status, body)
	}

	rows, err := st.DB().QueryContext(context.Background(),
		`SELECT action, user_id FROM audit_log WHERE target_id = ? AND target_kind = 'host' AND action = 'host.admin_credentials_set'`,
		hostID)
	if err != nil {
		t.Fatalf("query audit: %v", err)
	}
	defer rows.Close()

	matched := 0
	for rows.Next() {
		var (
			action    string
			gotUserID *string // nullable column
		)
		if err := rows.Scan(&action, &gotUserID); err != nil {
			t.Fatalf("scan: %v", err)
		}
		matched++
		switch {
		case gotUserID == nil:
			t.Error("audit row: user_id is NULL, want non-nil")
		case *gotUserID != userID:
			t.Errorf("audit row: user_id=%q, want %q", *gotUserID, userID)
		}
	}
	if err := rows.Err(); err != nil {
		t.Fatalf("rows: %v", err)
	}
	if matched == 0 {
		t.Error("audit row with action='host.admin_credentials_set' not found")
	}
}
+1 -1
View File
@@ -72,7 +72,7 @@ func (s *Server) dispatchJob(ctx context.Context, user *store.User,
}
// dispatchJobWithPayload is dispatchJob's variant that lets callers
// fill in structured fields (Includes/Excludes/Tag/RetentionPolicy)
// fill in structured fields (Includes/Excludes/Tag/ForgetGroups/RequiresAdminCreds)
// — used by the per-source-group Run-now path. JobID is filled in
// here; callers leave it zero on the input payload.
func (s *Server) dispatchJobWithPayload(ctx context.Context, user *store.User,
@@ -0,0 +1,132 @@
// maintenance_dispatch.go bridges the pure-logic maintenance.Ticker
// (internal/server/maintenance) to the side-effecting world: checks
// online state, builds the per-kind command.run payload, and calls
// dispatchJobWithPayload — the same path operator-triggered Run-now
// uses. Cadence-driven jobs are persisted with actor_kind="system"
// (dispatchJobWithPayload tags it that way when user==nil).
//
// Maintenance fires deliberately do NOT queue to pending_runs when
// the host is offline — five missed prunes on a laptop returning
// from a week away is not what the operator wants. Skip + log; the
// next 60s tick will re-evaluate.
package http
import (
"context"
"errors"
"log/slog"
"strconv"
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/maintenance"
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
)
// DispatchMaintenance acts on each Decision from the ticker, in order.
// Every skip and every failure is logged; nothing is returned to the
// caller.
//
//   - If no Hub is configured, the whole batch is skipped: without
//     one there is no way to tell whether a host is online or to reach
//     its agent.
//   - If ctx is cancelled, the remaining decisions are dropped.
//   - Offline hosts are skipped.
//   - "forget" is dispatched with the host's retention-bearing source
//     groups, and skipped when there are none.
//   - "prune" is skipped when the host has no admin creds (the
//     operator hasn't completed the admin-creds setup yet) or when
//     pushing them to the agent fails. Operator-triggered prune via
//     the run-now endpoint returns a clear error instead — different
//     path, different UX.
//   - "check" is dispatched with the decision's SubsetPct as its only
//     argument.
//
// Jobs are dispatched through dispatchJobWithPayload with a nil user.
func (s *Server) DispatchMaintenance(ctx context.Context, decisions []maintenance.Decision) {
	// Other code in this package treats the Hub as optional; guard
	// here too rather than panicking on a nil dereference.
	if s.deps.Hub == nil {
		slog.Warn("maintenance: no hub configured, skipping dispatch",
			"decisions", len(decisions))
		return
	}
	for _, d := range decisions {
		// Stop early on shutdown/cancellation instead of issuing store
		// and WS calls that are bound to fail.
		if err := ctx.Err(); err != nil {
			slog.Info("maintenance: context done, dropping remaining decisions",
				"err", err)
			return
		}
		if !s.deps.Hub.Connected(d.HostID) {
			slog.Info("maintenance: host offline, skipping",
				"host_id", d.HostID, "kind", d.Kind)
			continue
		}
		switch d.Kind {
		case "forget":
			payload, ok := s.buildForgetPayloadForHost(ctx, d.HostID)
			if !ok {
				slog.Info("maintenance: forget skipped — no source groups with retention",
					"host_id", d.HostID)
				continue
			}
			_, _, code, msg := s.dispatchJobWithPayload(ctx, nil, d.HostID, api.JobForget, payload)
			if code != "" {
				slog.Warn("maintenance: forget dispatch failed",
					"host_id", d.HostID, "code", code, "msg", msg)
			}
		case "prune":
			// Probe for the admin row first so the "not configured
			// yet" case is logged at Info rather than as a failure.
			if _, err := s.deps.Store.GetHostCredentials(ctx, d.HostID, store.CredKindAdmin); err != nil {
				if errors.Is(err, store.ErrNotFound) {
					slog.Info("maintenance: prune skipped — no admin creds",
						"host_id", d.HostID)
					continue
				}
				slog.Warn("maintenance: prune skipped — admin creds error",
					"host_id", d.HostID, "err", err)
				continue
			}
			// Push the admin slot to the agent right before the prune
			// is dispatched.
			if err := s.pushAdminCredsToAgent(ctx, d.HostID); err != nil {
				slog.Warn("maintenance: prune push admin creds failed",
					"host_id", d.HostID, "err", err)
				continue
			}
			payload := api.CommandRunPayload{RequiresAdminCreds: true}
			_, _, code, msg := s.dispatchJobWithPayload(ctx, nil, d.HostID, api.JobPrune, payload)
			if code != "" {
				slog.Warn("maintenance: prune dispatch failed",
					"host_id", d.HostID, "code", code, "msg", msg)
			}
		case "check":
			payload := api.CommandRunPayload{Args: []string{strconv.Itoa(d.SubsetPct)}}
			_, _, code, msg := s.dispatchJobWithPayload(ctx, nil, d.HostID, api.JobCheck, payload)
			if code != "" {
				slog.Warn("maintenance: check dispatch failed",
					"host_id", d.HostID, "code", code, "msg", msg)
			}
		default:
			slog.Warn("maintenance: unknown decision kind",
				"host_id", d.HostID, "kind", d.Kind)
		}
	}
}
// buildForgetPayloadForHost assembles the command.run payload for a
// maintenance forget on one host. Each of the host's source groups
// that has at least one retention rule contributes a ForgetGroup whose
// Tag is the group's name and whose Policy mirrors its retention
// policy.
//
// The boolean is false — and the payload empty — when listing the
// groups fails (logged as a warning) or when no group has any
// retention rule; the caller skips the forget in either case.
func (s *Server) buildForgetPayloadForHost(ctx context.Context, hostID string) (api.CommandRunPayload, bool) {
	var none api.CommandRunPayload

	groups, err := s.deps.Store.ListSourceGroupsByHost(ctx, hostID)
	if err != nil {
		slog.Warn("maintenance: list source groups failed", "host_id", hostID, "err", err)
		return none, false
	}

	retained := make([]api.ForgetGroup, 0, len(groups))
	for _, g := range groups {
		if !isEmptyRetention(g.RetentionPolicy) {
			retained = append(retained, api.ForgetGroup{
				Tag:    g.Name,
				Policy: forgetPolicyJSONFromStore(g.RetentionPolicy),
			})
		}
	}

	if len(retained) > 0 {
		return api.CommandRunPayload{ForgetGroups: retained}, true
	}
	return none, false
}
// isEmptyRetention reports whether p carries no retention rule at
// all, i.e. every Keep* field is nil.
func isEmptyRetention(p store.RetentionPolicy) bool {
	if p.KeepLast != nil || p.KeepHourly != nil ||
		p.KeepDaily != nil || p.KeepWeekly != nil ||
		p.KeepMonthly != nil || p.KeepYearly != nil {
		return false
	}
	return true
}
// forgetPolicyJSONFromStore translates a stored retention policy into
// its wire representation by copying each Keep* pointer across
// unchanged. The conversion lives on the server side so that
// internal/api does not have to import store, which would invert the
// dependency direction.
func forgetPolicyJSONFromStore(p store.RetentionPolicy) api.ForgetPolicyJSON {
	var wire api.ForgetPolicyJSON
	wire.KeepLast = p.KeepLast
	wire.KeepHourly = p.KeepHourly
	wire.KeepDaily = p.KeepDaily
	wire.KeepWeekly = p.KeepWeekly
	wire.KeepMonthly = p.KeepMonthly
	wire.KeepYearly = p.KeepYearly
	return wire
}
@@ -0,0 +1,304 @@
// maintenance_dispatch_test.go — exercises Server.DispatchMaintenance
// directly (one Decision at a time). Reuses the same fake-agent
// harness as p2r01_ws_test / repo_ops_test: a real Server with a
// real Hub, plus a websocket connected as the host. We then push
// Decisions through DispatchMaintenance and assert the envelopes
// the agent receives + the job rows that land.
package http
import (
"context"
"encoding/json"
"testing"
"time"
"github.com/coder/websocket"
"github.com/oklog/ulid/v2"
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/maintenance"
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
)
// readNextCommandRun reads from the agent-side connection until it
// sees a command.run envelope and returns that envelope's decoded
// payload. Non-text frames, undecodable envelopes, other message
// types and undecodable payloads are skipped.
//
// It returns nil when the deadline passes, and also as soon as any
// single read fails — each read is given its own 600ms timeout, so a
// connection that stays quiet that long ends the wait early.
func readNextCommandRun(t *testing.T, c *websocket.Conn, deadline time.Time) *api.CommandRunPayload {
	t.Helper()
	for time.Now().Before(deadline) {
		readCtx, stop := context.WithTimeout(context.Background(), 600*time.Millisecond)
		kind, frame, err := c.Read(readCtx)
		stop()
		if err != nil {
			return nil
		}

		var env api.Envelope
		if kind != websocket.MessageText ||
			json.Unmarshal(frame, &env) != nil ||
			env.Type != api.MsgCommandRun {
			continue
		}

		payload := new(api.CommandRunPayload)
		if env.UnmarshalPayload(payload) != nil {
			continue
		}
		return payload
	}
	return nil
}
// TestDispatchMaintenanceSkipsOfflineHosts enrols a host but never
// connects it, feeds a "check" decision for it through
// DispatchMaintenance, and verifies that no job row was created for
// the host.
func TestDispatchMaintenanceSkipsOfflineHosts(t *testing.T) {
	t.Parallel()
	srv, _, st := rawTestServer(t)
	hostID, _ := enrolHostForWS(t, srv, st, "offline-host")

	decision := maintenance.Decision{HostID: hostID, Kind: "check", SubsetPct: 10}
	srv.DispatchMaintenance(context.Background(), []maintenance.Decision{decision})

	var jobCount int
	row := st.DB().QueryRow(`SELECT COUNT(*) FROM jobs WHERE host_id = ?`, hostID)
	if err := row.Scan(&jobCount); err != nil {
		t.Fatalf("count: %v", err)
	}
	if jobCount != 0 {
		t.Errorf("offline host produced %d job rows; want 0", jobCount)
	}
}
// TestDispatchMaintenanceForgetShipsForgetGroups connects a host that
// has two source groups — "documents" with KeepLast=7 and "ephemeral"
// with no retention — and dispatches a "forget" decision. The agent
// must receive a command.run of kind forget whose ForgetGroups holds
// only the "documents" group with its policy, and the persisted job
// row must carry actor_kind "system".
func TestDispatchMaintenanceForgetShipsForgetGroups(t *testing.T) {
	t.Parallel()
	srv, ts, st := rawTestServer(t)
	hostID, token := enrolHostForWS(t, srv, st, "forget-host")
	seedInitJob(t, st, hostID)

	ctx := context.Background()
	keep := 7

	docs := &store.SourceGroup{
		ID: ulid.Make().String(), HostID: hostID, Name: "documents",
		Includes:        []string{"/home/documents"},
		RetentionPolicy: store.RetentionPolicy{KeepLast: &keep},
	}
	if err := st.CreateSourceGroup(ctx, docs); err != nil {
		t.Fatalf("group docs: %v", err)
	}
	eph := &store.SourceGroup{
		ID: ulid.Make().String(), HostID: hostID, Name: "ephemeral",
		Includes: []string{"/tmp"},
	}
	if err := st.CreateSourceGroup(ctx, eph); err != nil {
		t.Fatalf("group eph: %v", err)
	}

	// Bring the host online and swallow the on-hello traffic.
	conn := agentDial(t, srv, ts, hostID, token)
	sendHello(t, conn, "forget-host")
	_ = drainUntil(t, conn, api.MsgScheduleSet)

	srv.DispatchMaintenance(ctx, []maintenance.Decision{
		{HostID: hostID, Kind: "forget"},
	})

	got := readNextCommandRun(t, conn, time.Now().Add(2*time.Second))
	if got == nil {
		t.Fatal("no command.run received")
	}
	if got.Kind != api.JobForget {
		t.Errorf("kind: got %q, want %q", got.Kind, api.JobForget)
	}
	if n := len(got.ForgetGroups); n != 1 {
		t.Fatalf("ForgetGroups: got %d entries (%+v), want 1", n, got.ForgetGroups)
	}
	only := got.ForgetGroups[0]
	if only.Tag != "documents" {
		t.Errorf("forget group tag: got %q, want %q", only.Tag, "documents")
	}
	if kl := only.Policy.KeepLast; kl == nil || *kl != 7 {
		t.Errorf("forget group policy: got %+v", only.Policy)
	}

	// Job row must be persisted with actor_kind=system.
	var actor string
	row := st.DB().QueryRow(
		`SELECT actor_kind FROM jobs WHERE host_id = ? AND kind = 'forget'`, hostID)
	if err := row.Scan(&actor); err != nil {
		t.Fatalf("query actor_kind: %v", err)
	}
	if actor != "system" {
		t.Errorf("actor_kind: got %q, want system", actor)
	}
}
// TestDispatchMaintenanceForgetSkipsHostWithNoRetention connects a host
// whose only source group has no retention policy, dispatches a
// kind=forget Decision, and asserts that no command.run arrives and no
// forget job row is written.
func TestDispatchMaintenanceForgetSkipsHostWithNoRetention(t *testing.T) {
	t.Parallel()
	bg := context.Background()
	srv, ts, st := rawTestServer(t)
	hostID, token := enrolHostForWS(t, srv, st, "no-ret-host")
	seedInitJob(t, st, hostID)

	group := &store.SourceGroup{
		ID: ulid.Make().String(), HostID: hostID, Name: "ephemeral",
		Includes: []string{"/tmp"},
	}
	if err := st.CreateSourceGroup(bg, group); err != nil {
		t.Fatalf("group: %v", err)
	}

	c := agentDial(t, srv, ts, hostID, token)
	sendHello(t, c, "no-ret-host")
	_ = drainUntil(t, c, api.MsgScheduleSet)

	srv.DispatchMaintenance(bg, []maintenance.Decision{
		{HostID: hostID, Kind: "forget"},
	})

	unexpected := readNextCommandRun(t, c, time.Now().Add(800*time.Millisecond))
	if unexpected != nil {
		t.Errorf("unexpected command.run: %+v", unexpected)
	}

	var forgetRows int
	row := st.DB().QueryRow(`SELECT COUNT(*) FROM jobs WHERE host_id = ? AND kind = 'forget'`, hostID)
	if err := row.Scan(&forgetRows); err != nil {
		t.Fatalf("count: %v", err)
	}
	if forgetRows != 0 {
		t.Errorf("forget job rows: got %d, want 0", forgetRows)
	}
}
// TestDispatchMaintenancePruneSkipsWithoutAdminCreds connects a host
// that has no admin credentials configured, dispatches a kind=prune
// Decision, and asserts the skip is silent: no command.run reaches the
// agent and no prune job row is written.
func TestDispatchMaintenancePruneSkipsWithoutAdminCreds(t *testing.T) {
	t.Parallel()
	srv, ts, st := rawTestServer(t)
	hostID, token := enrolHostForWS(t, srv, st, "no-admin-host")
	seedInitJob(t, st, hostID)

	c := agentDial(t, srv, ts, hostID, token)
	sendHello(t, c, "no-admin-host")
	_ = drainUntil(t, c, api.MsgScheduleSet)

	decisions := []maintenance.Decision{{HostID: hostID, Kind: "prune"}}
	srv.DispatchMaintenance(context.Background(), decisions)

	unexpected := readNextCommandRun(t, c, time.Now().Add(800*time.Millisecond))
	if unexpected != nil {
		t.Errorf("unexpected command.run: %+v", unexpected)
	}

	var pruneRows int
	row := st.DB().QueryRow(`SELECT COUNT(*) FROM jobs WHERE host_id = ? AND kind = 'prune'`, hostID)
	if err := row.Scan(&pruneRows); err != nil {
		t.Fatalf("count: %v", err)
	}
	if pruneRows != 0 {
		t.Errorf("prune job rows: got %d, want 0", pruneRows)
	}
}
// TestDispatchMaintenancePruneShipsConfigUpdateThenCommandRun: with
// admin creds set, prune dispatch must push config.update(slot=admin)
// first and then command.run(prune, RequiresAdminCreds=true). The
// persisted prune job must be recorded with actor_kind=system.
//
// The read loop stops at the first prune command.run, so sawAdminPush
// can only be true if the admin push arrived before it — that is what
// enforces the ordering.
func TestDispatchMaintenancePruneShipsConfigUpdateThenCommandRun(t *testing.T) {
	t.Parallel()
	srv, ts, st := rawTestServer(t)
	hostID, token := enrolHostForWS(t, srv, st, "prune-mt-host")
	setAdminCreds(t, srv, st, hostID)
	seedInitJob(t, st, hostID)

	c := agentDial(t, srv, ts, hostID, token)
	sendHello(t, c, "prune-mt-host")
	_ = drainUntil(t, c, api.MsgScheduleSet)

	srv.DispatchMaintenance(context.Background(), []maintenance.Decision{
		{HostID: hostID, Kind: "prune"},
	})

	// Read until we've seen both config.update(slot=admin) and the
	// prune command.run. Each read is bounded by the overall deadline:
	// a read error ends the loop, so a shorter per-read timeout would
	// turn a brief quiet spell on the socket into a premature failure.
	deadline := time.Now().Add(3 * time.Second)
	var sawAdminPush bool
	var prunePayload *api.CommandRunPayload
	for prunePayload == nil && time.Now().Before(deadline) {
		ctx, cancel := context.WithDeadline(context.Background(), deadline)
		mt, raw, err := c.Read(ctx)
		cancel()
		if err != nil {
			break
		}
		if mt != websocket.MessageText {
			continue
		}
		var env api.Envelope
		if err := json.Unmarshal(raw, &env); err != nil {
			continue
		}
		switch env.Type {
		case api.MsgConfigUpdate:
			var p api.ConfigUpdatePayload
			if err := env.UnmarshalPayload(&p); err == nil && p.Slot == "admin" {
				sawAdminPush = true
			}
		case api.MsgCommandRun:
			var p api.CommandRunPayload
			if err := env.UnmarshalPayload(&p); err == nil && p.Kind == api.JobPrune {
				prunePayload = &p
			}
		}
	}
	if !sawAdminPush {
		t.Error("expected config.update(slot=admin) before prune dispatch")
	}
	if prunePayload == nil {
		t.Fatal("timed out waiting for command.run(prune)")
	}
	if !prunePayload.RequiresAdminCreds {
		t.Error("prune command.run must have RequiresAdminCreds=true")
	}

	// Persisted job must be system actor.
	var actor string
	if err := st.DB().QueryRow(
		`SELECT actor_kind FROM jobs WHERE host_id = ? AND kind = 'prune'`, hostID).Scan(&actor); err != nil {
		t.Fatalf("query actor_kind: %v", err)
	}
	if actor != "system" {
		t.Errorf("actor_kind: got %q, want system", actor)
	}
}
// TestDispatchMaintenanceCheckCarriesSubset dispatches a check Decision
// with SubsetPct=15 to a connected host and asserts the command.run has
// kind=check with Args == ["15"], and that the persisted check job is
// recorded with actor_kind=system.
func TestDispatchMaintenanceCheckCarriesSubset(t *testing.T) {
	t.Parallel()
	srv, ts, st := rawTestServer(t)
	hostID, token := enrolHostForWS(t, srv, st, "check-mt-host")
	seedInitJob(t, st, hostID)

	c := agentDial(t, srv, ts, hostID, token)
	sendHello(t, c, "check-mt-host")
	_ = drainUntil(t, c, api.MsgScheduleSet)

	decisions := []maintenance.Decision{{HostID: hostID, Kind: "check", SubsetPct: 15}}
	srv.DispatchMaintenance(context.Background(), decisions)

	got := readNextCommandRun(t, c, time.Now().Add(2*time.Second))
	if got == nil {
		t.Fatal("no command.run received")
	}
	if got.Kind != api.JobCheck {
		t.Errorf("kind: got %q, want %q", got.Kind, api.JobCheck)
	}
	if !(len(got.Args) == 1 && got.Args[0] == "15") {
		t.Errorf("Args: got %+v, want [15]", got.Args)
	}

	var actorKind string
	row := st.DB().QueryRow(
		`SELECT actor_kind FROM jobs WHERE host_id = ? AND kind = 'check'`, hostID)
	if err := row.Scan(&actorKind); err != nil {
		t.Fatalf("query actor_kind: %v", err)
	}
	if actorKind != "system" {
		t.Errorf("actor_kind: got %q, want system", actorKind)
	}
}
+26
View File
@@ -47,6 +47,32 @@ func loginAsAdmin(t *testing.T, st *store.Store) *stdhttp.Cookie {
return &stdhttp.Cookie{Name: sessionCookieName, Value: tok}
}
// loginAsAdminWithID is like loginAsAdmin but also returns the user ID.
// Use this when tests need to assert that the user ID was recorded
// (e.g. on audit entries).
//
// It creates an admin user with a fresh ULID, opens a one-hour session
// for it, and returns the session cookie together with the user ID.
// Any failure along the way aborts the test.
func loginAsAdminWithID(t *testing.T, st *store.Store) (*stdhttp.Cookie, string) {
	t.Helper()
	ctx := context.Background()
	uid := ulid.Make().String()

	hash, err := auth.HashPassword("very-long-test-password")
	if err != nil {
		t.Fatalf("hash password: %v", err)
	}
	// A ULID starts with its millisecond timestamp, so the leading
	// characters are the same for every ID minted within several
	// minutes of each other. Take the suffix (the random part) so the
	// username really is distinct per call.
	username := "tester-" + uid[len(uid)-6:]
	if err := st.CreateUser(ctx, store.User{
		ID: uid, Username: username,
		PasswordHash: hash, Role: store.RoleAdmin,
		CreatedAt: time.Now().UTC(),
	}); err != nil {
		t.Fatalf("create user: %v", err)
	}

	tok, err := auth.NewToken()
	if err != nil {
		t.Fatalf("new token: %v", err)
	}
	if err := st.CreateSession(ctx, store.Session{
		UserID:    uid,
		CreatedAt: time.Now().UTC(),
		ExpiresAt: time.Now().Add(time.Hour).UTC(),
	}, auth.HashToken(tok)); err != nil {
		t.Fatalf("create session: %v", err)
	}
	return &stdhttp.Cookie{Name: sessionCookieName, Value: tok}, uid
}
// makeHost inserts a minimal Host row directly via the store. Used by
// HTTP-level tests that don't want to go through the full enrollment
// path. Returns the host id.
+1 -1
View File
@@ -99,7 +99,7 @@ func enrolHostForWS(t *testing.T, srv *Server, st *store.Store, name string) (ho
if err != nil {
t.Fatalf("encrypt: %v", err)
}
if err := st.SetHostCredentials(context.Background(), hostID, enc); err != nil {
if err := st.SetHostCredentials(context.Background(), hostID, store.CredKindRepo, enc); err != nil {
t.Fatalf("set creds: %v", err)
}
return hostID, token
+209
View File
@@ -0,0 +1,209 @@
// pending_drain.go — drains pending_runs rows that are due (or, on
// agent reconnect, every row for that host).
//
// Two trigger paths:
// 1. The 30s tick in cmd/server (DrainAllDue) — sweeps every host
// with rows whose next_attempt_at <= now.
// 2. onAgentHello (DrainPending(hostID)) — when a host comes back,
// walk all of its pending rows synchronously so the operator
// sees the queue drain promptly.
package http
import (
"context"
"errors"
"log/slog"
"sync"
"time"
"github.com/oklog/ulid/v2"
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ws"
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
)
const (
// pendingDrainBatchLimit is the row limit DrainAllDue passes to
// DuePendingRuns for each sweep.
pendingDrainBatchLimit = 100
// pendingDrainBackoffMax is the ceiling drainOne applies while
// doubling a row's retry backoff.
pendingDrainBackoffMax = 30 * time.Minute
)
// DrainPending re-dispatches every pending_runs row for hostID. The
// host must already be connected (caller's responsibility — typically
// onAgentHello). Each row's source group + schedule are loaded; if
// either is gone the row is dropped (audit-logged as abandoned). If
// the row's attempt count meets/exceeds the group's retry_max, the
// row is dropped (audit-logged as abandoned). Otherwise we attempt
// dispatch; success deletes the row, failure bumps the attempt and
// reschedules with exponential backoff.
//
// A per-host mutex (hostDrainMutex) ensures that the on-hello goroutine
// and the 30s tick cannot process the same host concurrently. If a drain
// is already in-flight for this host, the call returns immediately
// without doing any work; rows the in-flight drain did not list stay
// queued for a later DrainPending call.
//
// The call is a no-op when no Hub is configured (matching DrainAllDue),
// and the row walk stops as soon as ctx is cancelled so a shutdown does
// not produce one failed store lookup per remaining row.
func (s *Server) DrainPending(ctx context.Context, hostID string) {
	if s.deps.Hub == nil {
		return
	}
	mu := s.hostDrainMutex(hostID)
	if !mu.TryLock() {
		return
	}
	defer mu.Unlock()

	runs, err := s.deps.Store.ListPendingRunsForHost(ctx, hostID)
	if err != nil {
		slog.Warn("drain pending: list", "host_id", hostID, "err", err)
		return
	}
	if len(runs) == 0 {
		return
	}
	conn := s.deps.Hub.Conn(hostID)
	if conn == nil {
		// Host went offline between the connectedness check and now.
		// Skip — next tick or next reconnect will retry.
		return
	}
	for _, p := range runs {
		if ctx.Err() != nil {
			// Cancelled mid-walk: leave the remaining rows untouched.
			return
		}
		s.drainOne(ctx, conn, p)
	}
}
// drainOne handles a single pending row. Refactored out so DrainPending
// reads cleanly. Side-effects: delete, bump, audit, dispatch — all
// per-row.
//
// Outcomes, in the order they are checked:
//   - schedule not found or disabled → row abandoned;
//   - source group not found → row abandoned;
//   - any other load error → logged, row left untouched;
//   - group RetryMax > 0 and p.Attempt >= RetryMax → row abandoned;
//   - dispatch yields an empty job ID → attempt bumped, next attempt
//     scheduled after pendingDrainBackoff;
//   - otherwise → row deleted.
func (s *Server) drainOne(ctx context.Context, conn *ws.Conn, p store.PendingRun) {
	sc, err := s.deps.Store.GetSchedule(ctx, p.HostID, p.ScheduleID)
	if err != nil {
		if errors.Is(err, store.ErrNotFound) {
			s.abandonPending(ctx, p, "schedule gone")
			return
		}
		slog.Warn("drain pending: load schedule",
			"host_id", p.HostID, "schedule_id", p.ScheduleID, "err", err)
		return
	}
	if !sc.Enabled {
		s.abandonPending(ctx, p, "schedule disabled")
		return
	}
	g, err := s.deps.Store.GetSourceGroup(ctx, p.HostID, p.SourceGroupID)
	if err != nil {
		if errors.Is(err, store.ErrNotFound) {
			s.abandonPending(ctx, p, "source group gone")
			return
		}
		slog.Warn("drain pending: load source group",
			"host_id", p.HostID, "group_id", p.SourceGroupID, "err", err)
		return
	}
	if g.RetryMax > 0 && p.Attempt >= g.RetryMax {
		s.abandonPending(ctx, p, "retry_max exceeded")
		return
	}

	// Calls dispatchBackupForGroupCore (not dispatchBackupForGroup) so a
	// failed Send doesn't double-enqueue: dispatchBackupForGroup's
	// enqueue-on-failure path would create a NEW pending_runs row while
	// this function already bumps the EXISTING row via
	// BumpPendingRunAttempt, producing geometric duplicates on repeated
	// failures.
	//
	// NOTE(review): the second return value is discarded here, so
	// last_error below is always the fixed string — confirm whether the
	// underlying cause should be recorded instead.
	jobID, _ := s.dispatchBackupForGroupCore(ctx, conn, p.HostID, p.ScheduleID, g, p.ScheduledAt)
	if jobID == "" {
		// Send failed again. Bump attempt with exponential backoff.
		// With the tests' settings (60s base, row enqueued at
		// attempt=1) the delays run 120s, 240s, … up to the cap.
		base := time.Duration(g.RetryBackoffSeconds) * time.Second
		next := time.Now().UTC().Add(pendingDrainBackoff(base, p.Attempt))
		if err := s.deps.Store.BumpPendingRunAttempt(ctx, p.ID, next, "drain dispatch failed"); err != nil {
			slog.Warn("drain pending: bump", "host_id", p.HostID, "id", p.ID, "err", err)
		}
		return
	}

	// Success — drop the pending row.
	if err := s.deps.Store.DeletePendingRun(ctx, p.ID); err != nil {
		slog.Warn("drain pending: delete after dispatch", "host_id", p.HostID, "id", p.ID, "err", err)
	}
	slog.Info("drain pending: dispatched",
		"host_id", p.HostID, "schedule_id", p.ScheduleID, "group", g.Name,
		"attempt", p.Attempt, "job_id", jobID)
}

// pendingDrainBackoff returns the delay before the next drain attempt:
// base doubled once per attempt already made, clamped to
// pendingDrainBackoffMax. A non-positive base falls back to 60s. The
// clamp is applied before the first doubling as well, so the result
// never exceeds the cap even when attempt is zero or base is already
// above it.
func pendingDrainBackoff(base time.Duration, attempt int) time.Duration {
	if base <= 0 {
		base = 60 * time.Second
	}
	backoff := min(base, pendingDrainBackoffMax)
	// backoff stays below the cap inside the loop, so doubling cannot
	// overflow.
	for i := 0; i < attempt && backoff < pendingDrainBackoffMax; i++ {
		backoff = min(backoff*2, pendingDrainBackoffMax)
	}
	return backoff
}
// abandonPending gives up on a pending row: it logs the reason, appends
// a "pending_run.abandoned" audit entry targeting the row's schedule,
// and then deletes the row. The reason string goes to the log line
// only; the audit entry built here carries the action and schedule ID.
// Failures of the audit write or the delete are logged and otherwise
// ignored.
func (s *Server) abandonPending(ctx context.Context, p store.PendingRun, reason string) {
	slog.Info("drain pending: abandoning",
		"host_id", p.HostID, "schedule_id", p.ScheduleID,
		"attempt", p.Attempt, "reason", reason)

	// p is this function's own copy, so pointing at its field is
	// equivalent to pointing at a fresh local.
	entry := store.AuditEntry{
		ID:         ulid.Make().String(),
		Actor:      "system",
		Action:     "pending_run.abandoned",
		TargetKind: ptr("schedule"),
		TargetID:   &p.ScheduleID,
		TS:         time.Now().UTC(),
	}
	if auditErr := s.deps.Store.AppendAudit(ctx, entry); auditErr != nil {
		slog.Warn("drain pending: audit on abandon", "id", p.ID, "err", auditErr)
	}

	if delErr := s.deps.Store.DeletePendingRun(ctx, p.ID); delErr != nil {
		slog.Warn("drain pending: delete on abandon", "id", p.ID, "err", delErr)
	}
}
// hostDrainMutex hands out the mutex that serialises DrainPending for
// hostID, allocating both the map and the entry lazily. Access to the
// map is guarded by drainLocksMu. Entries are never removed, so the map
// holds one mutex per host ID ever requested.
func (s *Server) hostDrainMutex(hostID string) *sync.Mutex {
	s.drainLocksMu.Lock()
	defer s.drainLocksMu.Unlock()

	if s.drainLocks == nil {
		s.drainLocks = map[string]*sync.Mutex{}
	}
	if existing, found := s.drainLocks[hostID]; found {
		return existing
	}
	created := new(sync.Mutex)
	s.drainLocks[hostID] = created
	return created
}
// DrainAllDue is the 30s-ticker entrypoint. It fetches up to
// pendingDrainBatchLimit rows whose next_attempt_at <= now
// (DuePendingRuns), visits each distinct host once, and calls
// DrainPending for every host the Hub reports as connected.
// DrainPending then re-lists that host's rows itself, which keeps the
// per-host concurrency model simple. It does nothing when no Hub is
// configured.
func (s *Server) DrainAllDue(ctx context.Context) {
	if s.deps.Hub == nil {
		return
	}

	now := time.Now().UTC()
	due, err := s.deps.Store.DuePendingRuns(ctx, now, pendingDrainBatchLimit)
	if err != nil {
		slog.Warn("drain all due: list", "err", err)
		return
	}

	visited := make(map[string]struct{}, len(due))
	for _, run := range due {
		host := run.HostID
		if _, dup := visited[host]; dup {
			continue
		}
		// Mark before the connectivity check so an offline host is
		// only probed once per sweep.
		visited[host] = struct{}{}
		if s.deps.Hub.Connected(host) {
			s.DrainPending(ctx, host)
		}
	}
}
+572
View File
@@ -0,0 +1,572 @@
// pending_drain_test.go — covers DrainPending / DrainAllDue and the
// onAgentHello goroutine spawn that drains a freshly-reconnected
// host's queue.
package http
import (
"context"
"encoding/json"
"sync"
"testing"
"time"
"github.com/coder/websocket"
"github.com/oklog/ulid/v2"
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
)
// seedSchedAndGroup gives hostID one source group ("default", including
// /etc, with the supplied retryMax and a 60s retry backoff) and one
// enabled schedule that references it, then inserts an init job row so
// auto-init doesn't pollute later reads. Returns (groupID, scheduleID).
func seedSchedAndGroup(t *testing.T, st *store.Store, hostID string, retryMax int) (string, string) {
	t.Helper()
	bg := context.Background()

	groupID := ulid.Make().String()
	group := &store.SourceGroup{
		ID: groupID, HostID: hostID, Name: "default",
		Includes: []string{"/etc"},
		RetryMax: retryMax, RetryBackoffSeconds: 60,
	}
	if err := st.CreateSourceGroup(bg, group); err != nil {
		t.Fatalf("create group: %v", err)
	}

	scheduleID := ulid.Make().String()
	schedule := &store.Schedule{
		ID: scheduleID, HostID: hostID,
		CronExpr: "0 3 * * *", Enabled: true,
		SourceGroupIDs: []string{groupID},
	}
	if err := st.CreateSchedule(bg, schedule); err != nil {
		t.Fatalf("create schedule: %v", err)
	}

	initJob := store.Job{
		ID: ulid.Make().String(), HostID: hostID, Kind: "init",
		ActorKind: "system", CreatedAt: time.Now().UTC(),
	}
	if err := st.CreateJob(bg, initJob); err != nil {
		t.Fatalf("seed init: %v", err)
	}
	return groupID, scheduleID
}
// countPendingForHost reports how many pending_runs rows exist for
// hostID, failing the test if the query errors.
func countPendingForHost(t *testing.T, st *store.Store, hostID string) int {
	t.Helper()
	var total int
	row := st.DB().QueryRow(
		`SELECT COUNT(*) FROM pending_runs WHERE host_id = ?`, hostID)
	if err := row.Scan(&total); err != nil {
		t.Fatalf("count pending: %v", err)
	}
	return total
}
// waitForPendingCount polls the pending_runs count for hostID every
// 20ms until it equals wantN, and reports a test error if timeout
// elapses first. Prefer this over a synchronous DrainPending call when
// the rows are processed by the on-hello goroutine, which holds the
// per-host drain mutex while it runs.
func waitForPendingCount(t *testing.T, st *store.Store, hostID string, wantN int, timeout time.Duration) {
	t.Helper()
	for stop := time.Now().Add(timeout); time.Now().Before(stop); time.Sleep(20 * time.Millisecond) {
		if countPendingForHost(t, st, hostID) == wantN {
			return
		}
	}
	final := countPendingForHost(t, st, hostID)
	t.Errorf("pending count for host %s: want %d after %v, got %d",
		hostID, wantN, timeout, final)
}
// countAuditAction reports how many audit_log rows carry the given
// action, failing the test if the query errors.
func countAuditAction(t *testing.T, st *store.Store, action string) int {
	t.Helper()
	var total int
	row := st.DB().QueryRow(
		`SELECT COUNT(*) FROM audit_log WHERE action = ?`, action)
	if err := row.Scan(&total); err != nil {
		t.Fatalf("count audit: %v", err)
	}
	return total
}
// TestDrainPendingDispatchesOnReconnect enqueues an already-due pending
// row for a host, then connects the agent and sends hello. The on-hello
// drain must dispatch a backup command.run carrying the group's
// includes and tag, delete the pending row, and leave exactly one
// schedule-actor backup job row behind.
func TestDrainPendingDispatchesOnReconnect(t *testing.T) {
	t.Parallel()
	srv, ts, st := rawTestServer(t)
	hostID, token := enrolHostForWS(t, srv, st, "drain-host")
	gid, sid := seedSchedAndGroup(t, st, hostID, 5)

	// Pre-insert a pending row that's already due. The on-hello
	// goroutine should drain it after we connect.
	pendingID := ulid.Make().String()
	now := time.Now().UTC()
	if err := st.EnqueuePendingRun(context.Background(), &store.PendingRun{
		ID: pendingID, ScheduleID: sid, SourceGroupID: gid, HostID: hostID,
		Attempt: 1, NextAttemptAt: now.Add(-time.Second),
		ScheduledAt: now.Add(-time.Minute),
	}); err != nil {
		t.Fatalf("enqueue: %v", err)
	}

	c := agentDial(t, srv, ts, hostID, token)
	sendHello(t, c, "drain-host")

	// Walk envelopes looking for a backup command.run carrying the
	// group's includes. Each read is bounded by the overall deadline:
	// a read error ends the loop, so a shorter per-read timeout would
	// turn a brief quiet spell on the socket into a premature failure.
	var got *api.CommandRunPayload
	deadline := time.Now().Add(3 * time.Second)
	for got == nil && time.Now().Before(deadline) {
		ctx, cancel := context.WithDeadline(context.Background(), deadline)
		mt, raw, err := c.Read(ctx)
		cancel()
		if err != nil {
			break
		}
		if mt != websocket.MessageText {
			continue
		}
		var env api.Envelope
		if err := json.Unmarshal(raw, &env); err != nil {
			continue
		}
		if env.Type != api.MsgCommandRun {
			continue
		}
		var p api.CommandRunPayload
		if err := env.UnmarshalPayload(&p); err != nil {
			t.Fatalf("decode command.run payload: %v", err)
		}
		if p.Kind == api.JobBackup {
			got = &p
		}
	}
	if got == nil {
		t.Fatalf("no backup command.run dispatched after reconnect drain")
	}
	if !equalStrings(got.Includes, []string{"/etc"}) {
		t.Errorf("backup includes: %v", got.Includes)
	}
	if got.Tag != "default" {
		t.Errorf("backup tag: %q", got.Tag)
	}

	// Pending row should be gone. Poll briefly: the drain goroutine
	// sends command.run via conn.Send and only then calls
	// DeletePendingRun. Reading the envelope off the wire above proves
	// the send happened, but the delete runs after that on the drain
	// goroutine — small window where the count is still 1.
	// waitForPendingCount reports the failure itself on timeout.
	waitForPendingCount(t, st, hostID, 0, 2*time.Second)

	// One backup job row landed (in addition to the seeded init).
	var n int
	if err := st.DB().QueryRow(
		`SELECT COUNT(*) FROM jobs WHERE host_id = ? AND kind = 'backup' AND actor_kind = 'schedule'`,
		hostID).Scan(&n); err != nil {
		t.Fatalf("count backup jobs: %v", err)
	}
	if n != 1 {
		t.Errorf("backup job rows: got %d, want 1", n)
	}
}
// TestDrainPendingAbandonsOnRetryMax enqueues a pending row whose
// attempt count (2) already equals the group's retry_max (2), connects
// the agent, and asserts the on-hello drain abandons it: the row is
// deleted, exactly one pending_run.abandoned audit entry is added, no
// backup command.run is sent, and no backup job row is created.
func TestDrainPendingAbandonsOnRetryMax(t *testing.T) {
	t.Parallel()
	srv, ts, st := rawTestServer(t)
	hostID, token := enrolHostForWS(t, srv, st, "abandon-retry-host")
	gid, sid := seedSchedAndGroup(t, st, hostID, 2)

	pendingID := ulid.Make().String()
	now := time.Now().UTC()
	if err := st.EnqueuePendingRun(context.Background(), &store.PendingRun{
		ID: pendingID, ScheduleID: sid, SourceGroupID: gid, HostID: hostID,
		Attempt: 2, NextAttemptAt: now.Add(-time.Second),
		ScheduledAt: now.Add(-time.Minute),
	}); err != nil {
		t.Fatalf("enqueue: %v", err)
	}
	auditBefore := countAuditAction(t, st, "pending_run.abandoned")

	c := agentDial(t, srv, ts, hostID, token)
	sendHello(t, c, "abandon-retry-host")
	_ = drainUntil(t, c, api.MsgScheduleSet)

	// The on-hello goroutine processes the row (retry_max exceeded → abandon).
	// Wait for it to finish rather than calling DrainPending directly, which
	// would be a no-op while the goroutine holds the per-host drain mutex.
	_ = connFromHub(t, srv, hostID) // ensure hub registration
	waitForPendingCount(t, st, hostID, 0, 2*time.Second)

	if n := countPendingForHost(t, st, hostID); n != 0 {
		t.Errorf("pending rows after abandon: got %d, want 0", n)
	}
	if d := countAuditAction(t, st, "pending_run.abandoned") - auditBefore; d != 1 {
		t.Errorf("audit pending_run.abandoned delta: got %d, want 1", d)
	}

	// No backup command.run should have been sent.
	deadline := time.Now().Add(400 * time.Millisecond)
	for time.Now().Before(deadline) {
		ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
		mt, raw, err := c.Read(ctx)
		cancel()
		if err != nil {
			break
		}
		if mt != websocket.MessageText {
			continue
		}
		var env api.Envelope
		if err := json.Unmarshal(raw, &env); err != nil {
			continue
		}
		if env.Type != api.MsgCommandRun {
			continue
		}
		var p api.CommandRunPayload
		if err := env.UnmarshalPayload(&p); err != nil {
			t.Fatalf("decode command.run payload: %v", err)
		}
		if p.Kind == api.JobBackup {
			t.Fatalf("abandoned row still dispatched a backup: %+v", p)
		}
	}

	// No backup job row. The Scan error is checked so a failing query
	// cannot leave n at zero and make this assertion pass vacuously.
	var n int
	if err := st.DB().QueryRow(
		`SELECT COUNT(*) FROM jobs WHERE host_id = ? AND kind = 'backup'`,
		hostID).Scan(&n); err != nil {
		t.Fatalf("count backup jobs: %v", err)
	}
	if n != 0 {
		t.Errorf("abandon path created a backup job: %d rows", n)
	}
}
// TestDrainPendingBumpsOnSendFailure exercises drainOne's dispatch-failure
// branch. It connects an agent, captures the server-side conn from the
// hub, closes the client socket, and only then enqueues a pending row
// (attempt=1). drainOne is called directly with the captured conn; the
// test asserts the host still has exactly one pending row, that the
// row's attempt is now 2, and that its last_error is non-empty.
//
// The ordering of close → settle sleep → enqueue → drainOne is
// deliberate (see the inline comments); do not reorder.
func TestDrainPendingBumpsOnSendFailure(t *testing.T) {
t.Parallel()
srv, ts, st := rawTestServer(t)
hostID, token := enrolHostForWS(t, srv, st, "bump-host")
gid, sid := seedSchedAndGroup(t, st, hostID, 5)
c := agentDial(t, srv, ts, hostID, token)
sendHello(t, c, "bump-host")
_ = drainUntil(t, c, api.MsgScheduleSet)
// Capture the conn before closing the client side. Hub.Conn still
// returns it after the client-side close — the server's Unregister
// fires when its read loop sees the close, but the conn ptr remains
// valid; subsequent Sends just fail.
conn := connFromHub(t, srv, hostID)
if conn == nil {
t.Fatal("conn never registered")
}
// Insert the pending row AFTER the on-hello drain goroutine has
// already scanned (an empty list) — otherwise we race the on-hello
// drain dispatching the row over the still-live socket.
pendingID := ulid.Make().String()
now := time.Now().UTC()
if err := c.Close(websocket.StatusNormalClosure, "test"); err != nil {
t.Fatalf("close: %v", err)
}
// Brief settle so the close is observed by the server's read loop.
time.Sleep(150 * time.Millisecond)
if err := st.EnqueuePendingRun(context.Background(), &store.PendingRun{
ID: pendingID, ScheduleID: sid, SourceGroupID: gid, HostID: hostID,
Attempt: 1, NextAttemptAt: now.Add(-time.Second),
ScheduledAt: now.Add(-time.Minute),
}); err != nil {
t.Fatalf("enqueue: %v", err)
}
// DrainPending uses Hub.Conn(hostID); after the client close the
// server may have unregistered already. Call drainOne directly
// against the captured conn so we deterministically exercise the
// "Send fails" branch rather than the "host gone" branch.
srv.drainOne(context.Background(), conn, store.PendingRun{
ID: pendingID, ScheduleID: sid, SourceGroupID: gid, HostID: hostID,
Attempt: 1, NextAttemptAt: now.Add(-time.Second), ScheduledAt: now.Add(-time.Minute),
})
// The original row must be bumped to attempt=2 with a non-empty
// last_error. Critically, NO duplicate row should have been created:
// drainOne calls dispatchBackupForGroupCore (not dispatchBackupForGroup)
// so the enqueue-on-failure path is bypassed and the count stays at 1.
if n := countPendingForHost(t, st, hostID); n != 1 {
t.Errorf("pending rows after send failure: got %d, want 1 (no duplicate enqueue)", n)
}
var attempt int
var lastErr string
if err := st.DB().QueryRow(
`SELECT attempt, COALESCE(last_error,'') FROM pending_runs WHERE id = ?`,
pendingID).Scan(&attempt, &lastErr); err != nil {
t.Fatalf("scan original row: %v", err)
}
if attempt != 2 {
t.Errorf("attempt after bump: got %d, want 2", attempt)
}
if lastErr == "" {
t.Errorf("last_error empty after bump")
}
}
// TestDrainPendingDropsRowsForGoneSchedule enqueues a due pending row,
// disables its schedule, then connects the agent. The on-hello drain
// must abandon the row: the row is deleted, exactly one
// pending_run.abandoned audit entry is added, and no backup command.run
// reaches the agent.
func TestDrainPendingDropsRowsForGoneSchedule(t *testing.T) {
	t.Parallel()
	bg := context.Background()
	srv, ts, st := rawTestServer(t)
	hostID, token := enrolHostForWS(t, srv, st, "gone-sched-host")
	gid, sid := seedSchedAndGroup(t, st, hostID, 5)

	enqueuedAt := time.Now().UTC()
	pending := &store.PendingRun{
		ID: ulid.Make().String(), ScheduleID: sid, SourceGroupID: gid, HostID: hostID,
		Attempt: 1, NextAttemptAt: enqueuedAt.Add(-time.Second),
		ScheduledAt: enqueuedAt.Add(-time.Minute),
	}
	if err := st.EnqueuePendingRun(bg, pending); err != nil {
		t.Fatalf("enqueue: %v", err)
	}

	// Disable the schedule. (Deleting it would FK-cascade-delete the
	// pending_runs row out from under the drainer, which is fine for
	// production but defeats the point of the test. The
	// disabled-schedule path goes through the same abandonPending code,
	// so it's an equivalent assertion.)
	_, err := st.DB().Exec(`UPDATE schedules SET enabled = 0 WHERE id = ?`, sid)
	if err != nil {
		t.Fatalf("disable schedule: %v", err)
	}
	auditBefore := countAuditAction(t, st, "pending_run.abandoned")

	c := agentDial(t, srv, ts, hostID, token)
	sendHello(t, c, "gone-sched-host")
	_ = drainUntil(t, c, api.MsgScheduleSet)

	// The on-hello goroutine processes the row (disabled schedule → abandon).
	// Poll for completion instead of calling DrainPending, which would return
	// immediately while the goroutine holds the per-host drain mutex.
	waitForPendingCount(t, st, hostID, 0, 2*time.Second)

	if remaining := countPendingForHost(t, st, hostID); remaining != 0 {
		t.Errorf("pending rows after schedule-gone abandon: got %d, want 0", remaining)
	}
	if delta := countAuditAction(t, st, "pending_run.abandoned") - auditBefore; delta != 1 {
		t.Errorf("audit delta: got %d, want 1", delta)
	}

	// Drain produced no backup envelope.
	for stop := time.Now().Add(400 * time.Millisecond); time.Now().Before(stop); {
		readCtx, cancel := context.WithTimeout(bg, 200*time.Millisecond)
		mt, raw, readErr := c.Read(readCtx)
		cancel()
		if readErr != nil {
			break
		}
		if mt != websocket.MessageText {
			continue
		}
		var env api.Envelope
		_ = json.Unmarshal(raw, &env)
		if env.Type != api.MsgCommandRun {
			continue
		}
		var payload api.CommandRunPayload
		_ = env.UnmarshalPayload(&payload)
		if payload.Kind == api.JobBackup {
			t.Fatalf("gone-schedule abandon still dispatched: %+v", payload)
		}
	}
}
// TestDrainPendingDropsRowsForGoneSourceGroup verifies that when a
// pending row references a source group that does not exist, the
// on-hello drain abandons it: the row is deleted and exactly one
// pending_run.abandoned audit entry is written.
//
// The dangling row is inserted with raw SQL on a pinned connection with
// foreign-key enforcement temporarily switched off (see the inline
// comment); the OFF → INSERT → ON sequence must stay on that one
// connection and in that order.
//
// Transient-error paths (SQLITE_BUSY, context cancellation) are not
// covered here because the real *Store doesn't expose a
// fault-injection seam; that path is left to code review.
func TestDrainPendingDropsRowsForGoneSourceGroup(t *testing.T) {
t.Parallel()
srv, ts, st := rawTestServer(t)
hostID, token := enrolHostForWS(t, srv, st, "gone-group-host")
_, sid := seedSchedAndGroup(t, st, hostID, 5)
// Use a source_group_id that never existed. pending_runs carries a
// FK to source_groups, so we must bypass FK enforcement for this
// insert. PRAGMA foreign_keys is connection-scoped and can only be
// changed outside a transaction; DB().Exec runs on an arbitrary
// pooled connection, so we pin it with a dedicated *sql.Conn.
fakeGroupID := ulid.Make().String()
pendingID := ulid.Make().String()
now := time.Now().UTC()
conn, err := st.DB().Conn(context.Background())
if err != nil {
t.Fatalf("db conn: %v", err)
}
// NOTE(review): the pinned connection is held until the test returns;
// the count queries below need a second pooled connection — confirm
// the store's pool allows more than one.
defer conn.Close()
if _, err := conn.ExecContext(context.Background(), `PRAGMA foreign_keys = OFF`); err != nil {
t.Fatalf("fk off: %v", err)
}
if _, err := conn.ExecContext(context.Background(),
`INSERT INTO pending_runs (id, schedule_id, source_group_id, host_id, attempt, next_attempt_at, scheduled_at)
VALUES (?, ?, ?, ?, 1, ?, ?)`,
pendingID, sid, fakeGroupID, hostID,
now.Add(-time.Second), now.Add(-time.Minute),
); err != nil {
t.Fatalf("insert pending: %v", err)
}
if _, err := conn.ExecContext(context.Background(), `PRAGMA foreign_keys = ON`); err != nil {
t.Fatalf("fk on: %v", err)
}
auditBefore := countAuditAction(t, st, "pending_run.abandoned")
c := agentDial(t, srv, ts, hostID, token)
sendHello(t, c, "gone-group-host")
_ = drainUntil(t, c, api.MsgScheduleSet)
// The on-hello goroutine processes the row (source group gone → abandon).
// Poll for completion instead of calling DrainPending, which would return
// immediately while the goroutine holds the per-host drain mutex.
waitForPendingCount(t, st, hostID, 0, 2*time.Second)
if n := countPendingForHost(t, st, hostID); n != 0 {
t.Errorf("pending rows after source-group-gone abandon: got %d, want 0", n)
}
if d := countAuditAction(t, st, "pending_run.abandoned") - auditBefore; d != 1 {
t.Errorf("audit delta: got %d, want 1", d)
}
}
// TestDrainAllDueSkipsOfflineHosts enqueues a due pending row for a host
// that is enrolled but never connected, runs DrainAllDue, and asserts
// the row is left in place and no pending_run.abandoned audit entry is
// written.
func TestDrainAllDueSkipsOfflineHosts(t *testing.T) {
	t.Parallel()
	bg := context.Background()
	srv, _, st := rawTestServer(t)
	// Don't dial — host is enrolled but never connected.
	hostID, _ := enrolHostForWS(t, srv, st, "offline-host")
	gid, sid := seedSchedAndGroup(t, st, hostID, 5)

	enqueuedAt := time.Now().UTC()
	pending := &store.PendingRun{
		ID: ulid.Make().String(), ScheduleID: sid, SourceGroupID: gid, HostID: hostID,
		Attempt: 1, NextAttemptAt: enqueuedAt.Add(-time.Second),
		ScheduledAt: enqueuedAt.Add(-time.Minute),
	}
	if err := st.EnqueuePendingRun(bg, pending); err != nil {
		t.Fatalf("enqueue: %v", err)
	}
	auditBefore := countAuditAction(t, st, "pending_run.abandoned")

	srv.DrainAllDue(bg)

	// Row still there (host offline, drainer skips).
	if remaining := countPendingForHost(t, st, hostID); remaining != 1 {
		t.Errorf("pending rows after DrainAllDue against offline host: got %d, want 1", remaining)
	}
	if delta := countAuditAction(t, st, "pending_run.abandoned") - auditBefore; delta != 0 {
		t.Errorf("audit unexpectedly changed: delta %d", delta)
	}
}
// TestEnqueueOnDispatchFailure connects an agent, captures the
// server-side conn, closes the client socket, and then calls
// dispatchScheduledJob with the captured conn. It asserts that exactly
// one pending row is enqueued for the host, with attempt=1, the
// original scheduled_at preserved, and a non-empty last_error.
func TestEnqueueOnDispatchFailure(t *testing.T) {
	t.Parallel()
	srv, ts, st := rawTestServer(t)
	hostID, token := enrolHostForWS(t, srv, st, "enqueue-host")
	_, sid := seedSchedAndGroup(t, st, hostID, 5)

	c := agentDial(t, srv, ts, hostID, token)
	sendHello(t, c, "enqueue-host")
	_ = drainUntil(t, c, api.MsgScheduleSet)

	// Capture the conn while the agent is still connected; it is handed
	// to dispatchScheduledJob below. Guard against nil the same way
	// TestDrainPendingBumpsOnSendFailure does, so a missing registration
	// fails here with a clear message.
	conn := connFromHub(t, srv, hostID)
	if conn == nil {
		t.Fatal("conn never registered")
	}

	// Close the client side so the server's next Send errors.
	if err := c.Close(websocket.StatusNormalClosure, "test"); err != nil {
		t.Fatalf("close: %v", err)
	}
	time.Sleep(100 * time.Millisecond)

	scheduledAt := time.Now().UTC().Add(-30 * time.Second)
	srv.dispatchScheduledJob(context.Background(), hostID, conn, sid, scheduledAt)

	// One pending row should have been enqueued (attempt=1) with the
	// scheduled_at preserved.
	rows, err := st.ListPendingRunsForHost(context.Background(), hostID)
	if err != nil {
		t.Fatalf("list: %v", err)
	}
	if len(rows) != 1 {
		t.Fatalf("pending rows: got %d, want 1", len(rows))
	}
	if rows[0].Attempt != 1 {
		t.Errorf("attempt: got %d, want 1", rows[0].Attempt)
	}
	// scheduled_at preserved (within RFC3339Nano round-trip tolerance).
	if rows[0].ScheduledAt.Sub(scheduledAt).Abs() > time.Microsecond {
		t.Errorf("scheduled_at drift: %v vs %v", rows[0].ScheduledAt, scheduledAt)
	}
	if rows[0].LastError == "" {
		t.Errorf("last_error empty")
	}
}
// TestDrainPendingSerializesPerHost verifies that concurrent DrainPending
// calls for the same host do not double-dispatch pending rows. The per-host
// mutex (TryLock semantics) means exactly one drain processes each row.
//
// The test connects an agent, seeds five due pending rows, fires ten
// DrainPending calls in parallel, and then asserts that no pending rows
// remain and that exactly five schedule-driven backup job rows exist.
func TestDrainPendingSerializesPerHost(t *testing.T) {
	t.Parallel()
	srv, ts, st := rawTestServer(t)
	hostID, token := enrolHostForWS(t, srv, st, "serialize-host")
	gid, sid := seedSchedAndGroup(t, st, hostID, 10)

	// Connect the agent so DrainPending can dispatch.
	c := agentDial(t, srv, ts, hostID, token)
	sendHello(t, c, "serialize-host")
	// Drain the on-hello goroutine's pass first (no pending rows yet),
	// then wait for the schedule.set so the connection is fully settled.
	_ = drainUntil(t, c, api.MsgScheduleSet)

	// Insert 5 pending rows now that the on-hello drain has already run.
	now := time.Now().UTC()
	for i := range 5 {
		pid := ulid.Make().String()
		if err := st.EnqueuePendingRun(context.Background(), &store.PendingRun{
			ID:            pid,
			ScheduleID:    sid,
			SourceGroupID: gid,
			HostID:        hostID,
			Attempt:       1,
			NextAttemptAt: now.Add(-time.Second),
			ScheduledAt:   now.Add(-time.Duration(i+1) * time.Minute),
		}); err != nil {
			t.Fatalf("enqueue row %d: %v", i, err)
		}
	}

	// Spawn 10 goroutines all calling DrainPending concurrently.
	var wg sync.WaitGroup
	for range 10 {
		wg.Add(1)
		go func() {
			defer wg.Done()
			srv.DrainPending(context.Background(), hostID)
		}()
	}
	wg.Wait()

	// Drain any envelopes the agent received so we don't block below.
	// We read with short timeouts and stop when the connection goes quiet.
	drainDeadline := time.Now().Add(500 * time.Millisecond)
	for time.Now().Before(drainDeadline) {
		ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
		_, _, err := c.Read(ctx)
		cancel()
		if err != nil {
			break
		}
	}

	// All 5 pending rows must be gone.
	if n := countPendingForHost(t, st, hostID); n != 0 {
		t.Errorf("pending rows after concurrent drain: got %d, want 0", n)
	}

	// Exactly 5 backup job rows (one per pending row), not 10+ from a race.
	// The Scan error is checked so a broken query is reported as such
	// instead of masquerading as "got 0" and being blamed on the mutex.
	var n int
	if err := st.DB().QueryRow(
		`SELECT COUNT(*) FROM jobs WHERE host_id = ? AND kind = 'backup' AND actor_kind = 'schedule'`,
		hostID).Scan(&n); err != nil {
		t.Fatalf("count backup jobs: %v", err)
	}
	if n != 5 {
		t.Errorf("backup job rows: got %d, want 5 (per-host mutex must prevent double-dispatch)", n)
	}
}
+165
View File
@@ -0,0 +1,165 @@
// repo_ops.go — operator-triggered Run-now for repo-level operations:
// prune, check, unlock. Backed by the same dispatchJobWithPayload
// pipeline as backup, with an extra step for prune: push admin creds
// first if they're set, refuse loudly if they aren't.
package http
import (
"errors"
"log/slog"
stdhttp "net/http"
"strconv"
"github.com/go-chi/chi/v5"
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
)
// handleRunRepoPrune serves POST /api/hosts/{id}/repo/prune and its HTMX
// twin mounted outside /api.
//
// Flow: authenticate, read the host ID from the URL, push the host's
// admin credentials to the agent, then dispatch a prune command.run with
// RequiresAdminCreds=true. A store.ErrNotFound from the credential push
// is reported as 400 admin_creds_required; any other push error is
// logged and reported as 503 host_offline.
func (s *Server) handleRunRepoPrune(w stdhttp.ResponseWriter, r *stdhttp.Request) {
	user, authed := s.requireUser(r)
	if !authed {
		if !wantsHTML(r) {
			writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorized", "")
			return
		}
		stdhttp.Redirect(w, r, "/login", stdhttp.StatusSeeOther)
		return
	}

	hostID := chi.URLParam(r, "id")
	if hostID == "" {
		s.runOpError(w, r, stdhttp.StatusBadRequest, "missing_id", "")
		return
	}

	// Admin creds go down first; prune is refused without them.
	switch pushErr := s.pushAdminCredsToAgent(r.Context(), hostID); {
	case pushErr == nil:
		// creds delivered — carry on to the dispatch below
	case errors.Is(pushErr, store.ErrNotFound):
		// The operator has not configured the admin slot yet.
		s.runOpError(w, r, stdhttp.StatusBadRequest, "admin_creds_required",
			"set admin credentials on the Repo page before running prune")
		return
	default:
		// Send failure (agent offline) or decrypt failure: both are
		// surfaced as a generic "offline" so the operator retries later.
		slog.Warn("prune: push admin creds failed", "host_id", hostID, "err", pushErr)
		s.runOpError(w, r, stdhttp.StatusServiceUnavailable, "host_offline",
			"agent is not currently connected; try again when it reconnects")
		return
	}

	payload := api.CommandRunPayload{RequiresAdminCreds: true}
	res, status, code, msg := s.dispatchJobWithPayload(r.Context(), user, hostID, api.JobPrune, payload)
	if code == "" {
		s.runOpRedirect(w, r, res)
		return
	}
	s.runOpError(w, r, status, code, msg)
}
// handleRunRepoCheck serves POST /api/hosts/{id}/repo/check.
//
// The subset percentage defaults to check_subset_pct from the host's
// repo-maintenance row. A numeric ?subset=N query parameter overrides it
// after clamping to 0..100; a non-numeric value is ignored and the stored
// value is used. The chosen percentage is sent as
// CommandRunPayload.Args[0].
func (s *Server) handleRunRepoCheck(w stdhttp.ResponseWriter, r *stdhttp.Request) {
	user, authed := s.requireUser(r)
	if !authed {
		if !wantsHTML(r) {
			writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorized", "")
			return
		}
		stdhttp.Redirect(w, r, "/login", stdhttp.StatusSeeOther)
		return
	}

	hostID := chi.URLParam(r, "id")
	if hostID == "" {
		s.runOpError(w, r, stdhttp.StatusBadRequest, "missing_id", "")
		return
	}

	maint, err := s.deps.Store.GetRepoMaintenance(r.Context(), hostID)
	switch {
	case err == nil:
		// row found
	case errors.Is(err, store.ErrNotFound):
		// The maintenance row should auto-seed at enrollment. When it is
		// absent, fail loudly instead of guessing 0%.
		s.runOpError(w, r, stdhttp.StatusInternalServerError, "no_maintenance_row",
			"host has no repo-maintenance config; was the host fully enrolled?")
		return
	default:
		s.runOpError(w, r, stdhttp.StatusInternalServerError, "internal", "")
		return
	}

	pct := maint.CheckSubsetPct
	if raw := r.URL.Query().Get("subset"); raw != "" {
		// A parse failure leaves pct at the stored value.
		if parsed, perr := strconv.Atoi(raw); perr == nil {
			switch {
			case parsed < 0:
				pct = 0
			case parsed > 100:
				pct = 100
			default:
				pct = parsed
			}
		}
	}

	payload := api.CommandRunPayload{Args: []string{strconv.Itoa(pct)}}
	res, status, code, msg := s.dispatchJobWithPayload(r.Context(), user, hostID, api.JobCheck, payload)
	if code == "" {
		s.runOpRedirect(w, r, res)
		return
	}
	s.runOpError(w, r, status, code, msg)
}
// handleRunRepoUnlock serves POST /api/hosts/{id}/repo/unlock. Unlike
// prune it pushes no admin credentials — restic unlock works with the
// everyday user — and dispatches an empty CommandRunPayload.
func (s *Server) handleRunRepoUnlock(w stdhttp.ResponseWriter, r *stdhttp.Request) {
	user, authed := s.requireUser(r)
	if !authed {
		if !wantsHTML(r) {
			writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorized", "")
			return
		}
		stdhttp.Redirect(w, r, "/login", stdhttp.StatusSeeOther)
		return
	}

	hostID := chi.URLParam(r, "id")
	if hostID == "" {
		s.runOpError(w, r, stdhttp.StatusBadRequest, "missing_id", "")
		return
	}

	var payload api.CommandRunPayload
	res, status, code, msg := s.dispatchJobWithPayload(r.Context(), user, hostID, api.JobUnlock, payload)
	if code == "" {
		s.runOpRedirect(w, r, res)
		return
	}
	s.runOpError(w, r, status, code, msg)
}
// runOpRedirect writes the success response for a repo run-now request.
// JSON callers get 202 Accepted with res as the body; HTML (HTMX) callers
// get 204 No Content plus an HX-Redirect header pointing at /jobs/{id}.
// Mirrors handleRunSourceGroup's tail.
func (s *Server) runOpRedirect(w stdhttp.ResponseWriter, r *stdhttp.Request, res runNowResponse) {
	if !wantsHTML(r) {
		writeJSON(w, stdhttp.StatusAccepted, res)
		return
	}
	jobURL := "/jobs/" + res.JobID
	w.Header().Set("HX-Redirect", jobURL)
	w.WriteHeader(stdhttp.StatusNoContent)
}
// runOpError writes the failure response for a repo run-now request.
//
// JSON callers get the standard error envelope via writeJSONError with
// status, code and msg unchanged. HTML (HTMX) callers get a plain-text
// body with the given status. Several call sites pass an empty msg
// (e.g. "missing_id", "internal"); on the HTML path that would produce an
// empty response body, so the machine-readable code is used as the body
// instead. Mirrors runGroupError.
func (s *Server) runOpError(w stdhttp.ResponseWriter, r *stdhttp.Request, status int, code, msg string) {
	if wantsHTML(r) {
		body := msg
		if body == "" {
			// Never hand the operator a blank error.
			body = code
		}
		stdhttp.Error(w, body, status)
		return
	}
	writeJSONError(w, status, code, msg)
}
+362
View File
@@ -0,0 +1,362 @@
// repo_ops_test.go — integration tests for the repo run-now endpoints:
// prune, check, unlock.
package http
import (
"context"
"encoding/json"
stdhttp "net/http"
"strconv"
"testing"
"time"
"github.com/coder/websocket"
"github.com/oklog/ulid/v2"
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
)
// ----- helpers -------------------------------------------------------
// seedInitJob inserts a system-initiated "init" job row for the host so
// the auto-init path doesn't fire and add extra envelopes to the sequence
// the tests are measuring. Fails the test if the insert errors.
func seedInitJob(t *testing.T, st *store.Store, hostID string) {
	t.Helper()
	initJob := store.Job{
		ID:        ulid.Make().String(),
		HostID:    hostID,
		Kind:      "init",
		ActorKind: "system",
		CreatedAt: time.Now().UTC(),
	}
	err := st.CreateJob(context.Background(), initJob)
	if err != nil {
		t.Fatalf("seed init job: %v", err)
	}
}
// setAdminCreds encrypts a fixed admin credential blob with the
// "host:<id>:admin" associated data and stores it in the host's admin
// credential slot, bypassing the HTTP layer.
func setAdminCreds(t *testing.T, srv *Server, st *store.Store, hostID string) {
	t.Helper()
	blob := repoCredsBlob{
		RepoURL:      "rest:http://admin.example/h",
		RepoUsername: "admin",
		RepoPassword: "prune-pass",
	}
	aad := []byte("host:" + hostID + ":admin")
	sealed, err := srv.encryptRepoCreds(blob, aad)
	if err != nil {
		t.Fatalf("encrypt admin creds: %v", err)
	}
	err = st.SetHostCredentials(context.Background(), hostID, store.CredKindAdmin, sealed)
	if err != nil {
		t.Fatalf("set admin creds: %v", err)
	}
}
// setMaintenanceSubset makes sure the host has a repo-maintenance row and
// then overwrites its check_subset_pct with pct, going straight through
// the store.
func setMaintenanceSubset(t *testing.T, st *store.Store, hostID string, pct int) {
	t.Helper()
	bg := context.Background()

	// Seed the default row first so the read below has something to find.
	if seedErr := st.CreateDefaultRepoMaintenance(bg, hostID); seedErr != nil {
		t.Fatalf("seed maintenance: %v", seedErr)
	}
	row, getErr := st.GetRepoMaintenance(bg, hostID)
	if getErr != nil {
		t.Fatalf("get maintenance: %v", getErr)
	}
	row.CheckSubsetPct = pct
	if updErr := st.UpdateRepoMaintenance(bg, row); updErr != nil {
		t.Fatalf("update maintenance: %v", updErr)
	}
}
// drainCommandRun consumes envelopes from c until a command.run shows up
// and returns its decoded payload. A payload that fails to unmarshal
// fails the test.
func drainCommandRun(t *testing.T, c *websocket.Conn) api.CommandRunPayload {
	t.Helper()
	var payload api.CommandRunPayload
	runEnv := drainUntil(t, c, api.MsgCommandRun)
	err := runEnv.UnmarshalPayload(&payload)
	if err != nil {
		t.Fatalf("unmarshal command.run: %v", err)
	}
	return payload
}
// ----- prune tests ---------------------------------------------------
// TestRunPruneRefusesWithoutAdminCreds posts a prune for a connected host
// that has no admin credentials stored. Expected: HTTP 400 with
// code=admin_creds_required, and no prune job row for the host.
func TestRunPruneRefusesWithoutAdminCreds(t *testing.T) {
	t.Parallel()
	const hostName = "prune-no-admin"
	srv, ts, st := rawTestServer(t)
	hostID, token := enrolHostForWS(t, srv, st, hostName)
	cookie := loginAsAdmin(t, st)
	seedInitJob(t, st, hostID)

	agent := agentDial(t, srv, ts, hostID, token)
	sendHello(t, agent, hostName)
	_ = drainUntil(t, agent, api.MsgScheduleSet)

	status, body := doJSON(t, ts.URL, "POST", "/api/hosts/"+hostID+"/repo/prune", nil, cookie)
	if status != stdhttp.StatusBadRequest {
		t.Fatalf("want 400, got %d body=%+v", status, body)
	}
	gotCode, _ := body["code"].(string)
	if gotCode != "admin_creds_required" {
		t.Errorf("want code=admin_creds_required, got %+v", body)
	}

	// The refusal must not leave a prune job row behind.
	var pruneRows int
	row := st.DB().QueryRow(
		`SELECT COUNT(*) FROM jobs WHERE host_id = ? AND kind = 'prune'`, hostID)
	if err := row.Scan(&pruneRows); err != nil {
		t.Fatalf("count: %v", err)
	}
	if pruneRows != 0 {
		t.Errorf("unexpected prune job rows: %d", pruneRows)
	}
}
// TestRunPruneShipsConfigUpdateThenCommandRun stores admin creds, connects
// the host, and posts a prune. It then reads envelopes until a
// command.run(prune) arrives (or a 3s deadline / read error ends the
// loop) and asserts that a config.update with slot=admin was seen before
// it, that the command carries RequiresAdminCreds=true and the job_id
// from the HTTP response, and that the prune job row was persisted.
func TestRunPruneShipsConfigUpdateThenCommandRun(t *testing.T) {
	t.Parallel()
	const hostName = "prune-happy"
	srv, ts, st := rawTestServer(t)
	hostID, token := enrolHostForWS(t, srv, st, hostName)
	cookie := loginAsAdmin(t, st)
	setAdminCreds(t, srv, st, hostID)
	seedInitJob(t, st, hostID)

	c := agentDial(t, srv, ts, hostID, token)
	sendHello(t, c, hostName)
	// Swallow the on-hello burst (repo config.update + schedule.set).
	_ = drainUntil(t, c, api.MsgScheduleSet)

	status, body := doJSON(t, ts.URL, "POST", "/api/hosts/"+hostID+"/repo/prune", nil, cookie)
	if status != stdhttp.StatusAccepted {
		t.Fatalf("want 202, got %d body=%+v", status, body)
	}
	jobID, _ := body["job_id"].(string)
	if jobID == "" {
		t.Fatalf("no job_id in response: %+v", body)
	}

	// Collect envelopes until the prune command.run shows up. Because the
	// loop stops at that point, sawAdminPush can only be true if the admin
	// config.update was delivered earlier.
	deadline := time.Now().Add(3 * time.Second)
	var (
		sawAdminPush bool
		gotPrune     bool
		prune        api.CommandRunPayload
	)
	for !gotPrune && time.Now().Before(deadline) {
		readCtx, cancel := context.WithTimeout(context.Background(), 800*time.Millisecond)
		msgType, raw, readErr := c.Read(readCtx)
		cancel()
		if readErr != nil {
			break
		}
		if msgType != websocket.MessageText {
			continue
		}
		var env api.Envelope
		if json.Unmarshal(raw, &env) != nil {
			continue
		}
		if env.Type == api.MsgConfigUpdate {
			var cfg api.ConfigUpdatePayload
			if env.UnmarshalPayload(&cfg) == nil && cfg.Slot == "admin" {
				sawAdminPush = true
			}
		} else if env.Type == api.MsgCommandRun {
			var run api.CommandRunPayload
			if env.UnmarshalPayload(&run) == nil && run.Kind == api.JobPrune {
				prune = run
				gotPrune = true
			}
		}
	}

	if !sawAdminPush {
		t.Error("expected config.update(slot=admin) before prune dispatch")
	}
	if !gotPrune {
		t.Fatal("timed out waiting for command.run(prune)")
	}
	if !prune.RequiresAdminCreds {
		t.Error("prune command.run must have RequiresAdminCreds=true")
	}
	if prune.JobID != jobID {
		t.Errorf("job_id mismatch: dispatch=%s run=%s", jobID, prune.JobID)
	}

	// The job row must exist for this host with kind=prune.
	var rows int
	countRow := st.DB().QueryRow(
		`SELECT COUNT(*) FROM jobs WHERE id = ? AND host_id = ? AND kind = 'prune'`,
		jobID, hostID)
	if err := countRow.Scan(&rows); err != nil {
		t.Fatalf("count: %v", err)
	}
	if rows != 1 {
		t.Errorf("prune job row count: want 1, got %d", rows)
	}
}
// ----- check tests ---------------------------------------------------
// TestRunCheckUsesMaintenanceSubset stores check_subset_pct=25 for the
// host, posts a check with no query override, and expects the resulting
// command.run to be kind=check with Args == ["25"].
func TestRunCheckUsesMaintenanceSubset(t *testing.T) {
	t.Parallel()
	const hostName = "check-subset"
	srv, ts, st := rawTestServer(t)
	hostID, token := enrolHostForWS(t, srv, st, hostName)
	cookie := loginAsAdmin(t, st)
	setMaintenanceSubset(t, st, hostID, 25)
	seedInitJob(t, st, hostID)

	agent := agentDial(t, srv, ts, hostID, token)
	sendHello(t, agent, hostName)
	_ = drainUntil(t, agent, api.MsgScheduleSet)

	endpoint := "/api/hosts/" + hostID + "/repo/check"
	status, body := doJSON(t, ts.URL, "POST", endpoint, nil, cookie)
	if status != stdhttp.StatusAccepted {
		t.Fatalf("want 202, got %d body=%+v", status, body)
	}

	run := drainCommandRun(t, agent)
	if run.Kind != api.JobCheck {
		t.Fatalf("kind: want check, got %s", run.Kind)
	}
	if argsOK := len(run.Args) == 1 && run.Args[0] == "25"; !argsOK {
		t.Errorf("args: want [25], got %v", run.Args)
	}
}
// TestRunCheckHonorsSubsetOverride stores check_subset_pct=25 but posts
// the check with ?subset=10; the dispatched command.run must carry
// Args == ["10"].
func TestRunCheckHonorsSubsetOverride(t *testing.T) {
	t.Parallel()
	const hostName = "check-override"
	srv, ts, st := rawTestServer(t)
	hostID, token := enrolHostForWS(t, srv, st, hostName)
	cookie := loginAsAdmin(t, st)
	setMaintenanceSubset(t, st, hostID, 25)
	seedInitJob(t, st, hostID)

	agent := agentDial(t, srv, ts, hostID, token)
	sendHello(t, agent, hostName)
	_ = drainUntil(t, agent, api.MsgScheduleSet)

	endpoint := "/api/hosts/" + hostID + "/repo/check?subset=10"
	status, body := doJSON(t, ts.URL, "POST", endpoint, nil, cookie)
	if status != stdhttp.StatusAccepted {
		t.Fatalf("want 202, got %d body=%+v", status, body)
	}

	run := drainCommandRun(t, agent)
	if argsOK := len(run.Args) == 1 && run.Args[0] == "10"; !argsOK {
		t.Errorf("args: want [10], got %v", run.Args)
	}
}
// TestRunCheckRejectsBadSubsetGracefully posts a check with the
// non-numeric override ?subset=abc. The request must still succeed with
// 202 and the command.run must fall back to the stored value (30).
func TestRunCheckRejectsBadSubsetGracefully(t *testing.T) {
	t.Parallel()
	const hostName = "check-badsubset"
	srv, ts, st := rawTestServer(t)
	hostID, token := enrolHostForWS(t, srv, st, hostName)
	cookie := loginAsAdmin(t, st)
	setMaintenanceSubset(t, st, hostID, 30)
	seedInitJob(t, st, hostID)

	agent := agentDial(t, srv, ts, hostID, token)
	sendHello(t, agent, hostName)
	_ = drainUntil(t, agent, api.MsgScheduleSet)

	endpoint := "/api/hosts/" + hostID + "/repo/check?subset=abc"
	status, body := doJSON(t, ts.URL, "POST", endpoint, nil, cookie)
	if status != stdhttp.StatusAccepted {
		t.Fatalf("want 202 (bad subset falls back), got %d body=%+v", status, body)
	}

	want := strconv.Itoa(30)
	run := drainCommandRun(t, agent)
	if argsOK := len(run.Args) == 1 && run.Args[0] == want; !argsOK {
		t.Errorf("args: want [30], got %v", run.Args)
	}
}
// ----- unlock tests --------------------------------------------------
// TestRunUnlockNeedsNoAdminCreds posts an unlock for a connected host
// with no admin credentials stored. Expected: 202, and a command.run of
// kind=unlock whose RequiresAdminCreds flag is false.
func TestRunUnlockNeedsNoAdminCreds(t *testing.T) {
	t.Parallel()
	const hostName = "unlock-no-admin"
	srv, ts, st := rawTestServer(t)
	hostID, token := enrolHostForWS(t, srv, st, hostName)
	cookie := loginAsAdmin(t, st)
	seedInitJob(t, st, hostID)

	agent := agentDial(t, srv, ts, hostID, token)
	sendHello(t, agent, hostName)
	_ = drainUntil(t, agent, api.MsgScheduleSet)

	endpoint := "/api/hosts/" + hostID + "/repo/unlock"
	status, body := doJSON(t, ts.URL, "POST", endpoint, nil, cookie)
	if status != stdhttp.StatusAccepted {
		t.Fatalf("want 202, got %d body=%+v", status, body)
	}

	run := drainCommandRun(t, agent)
	if run.Kind != api.JobUnlock {
		t.Fatalf("kind: want unlock, got %s", run.Kind)
	}
	// Unlock runs with the everyday user, so the admin flag stays off.
	if run.RequiresAdminCreds {
		t.Error("unlock must not set RequiresAdminCreds")
	}
}
// ----- auth tests ----------------------------------------------------
// TestRunOpsRequireAuth checks the unauthenticated behavior of the three
// repo run-now endpoints (prune, check, unlock):
//
//   - JSON path (/api/hosts/{id}/repo/...): POST without a session → 401.
//   - HTMX path (/hosts/{id}/repo/..., HX-Request: true): POST without a
//     session → 303 with Location=/login.
//
// Request-construction errors are checked explicitly so a malformed URL
// fails the subtest with a clear message instead of a nil-pointer panic.
func TestRunOpsRequireAuth(t *testing.T) {
	t.Parallel()
	_, url, st := newTestServerWithHub(t)
	hostID := makeHost(t, st, "auth-host")

	for _, path := range []string{
		"/api/hosts/" + hostID + "/repo/prune",
		"/api/hosts/" + hostID + "/repo/check",
		"/api/hosts/" + hostID + "/repo/unlock",
	} {
		path := path
		t.Run(path, func(t *testing.T) {
			t.Parallel()
			req, err := stdhttp.NewRequest("POST", url+path, nil)
			if err != nil {
				t.Fatalf("new request: %v", err)
			}
			res, err := stdhttp.DefaultClient.Do(req)
			if err != nil {
				t.Fatalf("do: %v", err)
			}
			defer res.Body.Close()
			if res.StatusCode != stdhttp.StatusUnauthorized {
				t.Errorf("want 401, got %d", res.StatusCode)
			}
		})
	}

	// HTMX path: unauthenticated POST with HX-Request: true → 303 to /login.
	// Auth check fires before host lookup so the host ID doesn't need to exist.
	for _, path := range []string{
		"/hosts/" + hostID + "/repo/prune",
		"/hosts/" + hostID + "/repo/check",
		"/hosts/" + hostID + "/repo/unlock",
	} {
		path := path
		t.Run("htmx"+path, func(t *testing.T) {
			t.Parallel()
			// Do not follow the redirect; the 303 itself is what we assert on.
			client := &stdhttp.Client{
				CheckRedirect: func(_ *stdhttp.Request, _ []*stdhttp.Request) error {
					return stdhttp.ErrUseLastResponse
				},
			}
			req, err := stdhttp.NewRequest("POST", url+path, nil)
			if err != nil {
				t.Fatalf("new request: %v", err)
			}
			req.Header.Set("HX-Request", "true")
			res, err := client.Do(req)
			if err != nil {
				t.Fatalf("do: %v", err)
			}
			defer res.Body.Close()
			if res.StatusCode != stdhttp.StatusSeeOther {
				t.Errorf("want 303, got %d", res.StatusCode)
			}
			if loc := res.Header.Get("Location"); loc != "/login" {
				t.Errorf("want Location=/login, got %q", loc)
			}
		})
	}
}
+54 -15
View File
@@ -164,15 +164,19 @@ func (s *Server) dispatchScheduledJob(ctx context.Context, hostID string, conn *
}
}
// dispatchBackupForGroup builds and sends a single backup command.run
// envelope on conn for the given group. Persists the job row first so
// the live log viewer can subscribe to it.
// dispatchBackupForGroup persists a backup job row, sends the
// command.run envelope to the agent, and audit-logs the dispatch.
// Returns the persisted job ID on success, or "" on any failure
// (failures are slog.Warn-ed). Callers may use the returned ID to,
// e.g., redirect the UI to the live job log.
func (s *Server) dispatchBackupForGroup(ctx context.Context, conn *ws.Conn, hostID, scheduleID string, g *store.SourceGroup, scheduledAt time.Time) string {
// dispatchBackupForGroupCore persists a backup job row, marshals and
// sends the command.run envelope, and audit-logs the dispatch. It does
// NOT enqueue a PendingRun on failure — that responsibility belongs to
// the caller when appropriate.
//
// Returns (jobID, nil) on success. Returns ("", err) on any failure;
// the error is also slog.Warn-ed inside this function so callers don't
// need to log it again.
//
// Used by both dispatchBackupForGroup (schedule.fire path, which adds
// enqueue-on-failure) and drainOne (which handles failure via
// BumpPendingRunAttempt on the existing row, avoiding double-enqueue).
func (s *Server) dispatchBackupForGroupCore(ctx context.Context, conn *ws.Conn, hostID, scheduleID string, g *store.SourceGroup, scheduledAt time.Time) (string, error) {
jobID := ulid.Make().String()
now := time.Now().UTC()
scheduleRef := scheduleID
@@ -186,7 +190,7 @@ func (s *Server) dispatchBackupForGroup(ctx context.Context, conn *ws.Conn, host
}); err != nil {
slog.Warn("schedule.fire: persist job", "host_id", hostID,
"schedule_id", scheduleID, "group", g.Name, "err", err)
return ""
return "", err
}
// Backup ignores RetentionPolicy — the forget cadence lives on
// host_repo_maintenance and is driven by the server-side ticker
@@ -201,14 +205,17 @@ func (s *Server) dispatchBackupForGroup(ctx context.Context, conn *ws.Conn, host
if err != nil {
slog.Warn("schedule.fire: marshal command.run",
"host_id", hostID, "schedule_id", scheduleID, "err", err)
return ""
return "", err
}
sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
defer cancel()
if err := conn.Send(sendCtx, env); err != nil {
slog.Warn("schedule.fire: send command.run",
"host_id", hostID, "schedule_id", scheduleID, "err", err)
return ""
slog.Warn("schedule.fire: send command.run failed",
"host_id", hostID, "schedule_id", scheduleID, "group", g.Name, "err", err)
// The job row was already persisted — leave it in `queued` status.
// The drainer will re-dispatch (creating a new job row) and the
// orphaned queued row stays for forensic visibility.
return "", err
}
_ = s.deps.Store.AppendAudit(ctx, store.AuditEntry{
ID: ulid.Make().String(),
@@ -221,5 +228,37 @@ func (s *Server) dispatchBackupForGroup(ctx context.Context, conn *ws.Conn, host
slog.Info("schedule.fire: dispatched backup",
"host_id", hostID, "schedule_id", scheduleID,
"group", g.Name, "job_id", jobID, "scheduled_at", scheduledAt)
return jobID
return jobID, nil
}
// dispatchBackupForGroup is the schedule.fire entry point. It delegates
// the actual dispatch to dispatchBackupForGroupCore and, when that
// returns an error, enqueues a fresh PendingRun (attempt 1) so the
// drainer can retry later. The retry delay is the group's
// RetryBackoffSeconds, or 60s when that is not positive.
//
// Returns the persisted job ID on success, or "" on any failure.
func (s *Server) dispatchBackupForGroup(ctx context.Context, conn *ws.Conn, hostID, scheduleID string, g *store.SourceGroup, scheduledAt time.Time) string {
	jobID, dispatchErr := s.dispatchBackupForGroupCore(ctx, conn, hostID, scheduleID, g, scheduledAt)
	if dispatchErr != nil {
		// The core already logged dispatchErr; here we only park the run
		// for the drainer to pick up.
		backoff := time.Duration(g.RetryBackoffSeconds) * time.Second
		if backoff <= 0 {
			backoff = 60 * time.Second
		}
		retry := &store.PendingRun{
			ID:            ulid.Make().String(),
			ScheduleID:    scheduleID,
			SourceGroupID: g.ID,
			HostID:        hostID,
			Attempt:       1,
			NextAttemptAt: time.Now().UTC().Add(backoff),
			ScheduledAt:   scheduledAt,
			LastError:     dispatchErr.Error(),
		}
		if err := s.deps.Store.EnqueuePendingRun(ctx, retry); err != nil {
			slog.Warn("schedule.fire: enqueue pending run failed",
				"host_id", hostID, "schedule_id", scheduleID, "group", g.Name, "err", err)
		}
		return ""
	}
	return jobID
}
+30 -1
View File
@@ -7,6 +7,7 @@ import (
"context"
"errors"
stdhttp "net/http"
"sync"
"time"
"github.com/go-chi/chi/v5"
@@ -41,6 +42,13 @@ type Deps struct {
type Server struct {
// srv is the underlying net/http server built in New.
srv *stdhttp.Server
// deps holds the dependencies handed to New.
deps Deps
// drainLocks serializes DrainPending per host. The on-hello
// goroutine and the 30s ticker can otherwise race for the same
// host, double-dispatching every pending row. Map of hostID →
// *sync.Mutex; checked-and-locked atomically via drainLocksMu.
// drainLocksMu guards the drainLocks map itself.
drainLocksMu sync.Mutex
drainLocks map[string]*sync.Mutex
}
// New builds a configured but not-yet-started server.
@@ -59,7 +67,7 @@ func New(deps Deps) *Server {
w.WriteHeader(stdhttp.StatusNoContent)
})
s := &Server{deps: deps}
s := &Server{deps: deps, drainLocks: make(map[string]*sync.Mutex)}
s.routes(r)
s.srv = &stdhttp.Server{
@@ -105,6 +113,13 @@ func (s *Server) routes(r chi.Router) {
r.Get("/hosts/{id}/repo-credentials", s.handleGetHostCredentials)
r.Put("/hosts/{id}/repo-credentials", s.handleSetHostCredentials)
// Admin credentials — the prune-capable slot (separate from the
// everyday repo creds). Optional: hosts that don't prune against
// a rest-server repo with a separate admin user never need this.
r.Get("/hosts/{id}/admin-credentials", s.handleGetAdminCredentials)
r.Put("/hosts/{id}/admin-credentials", s.handleSetAdminCredentials)
r.Delete("/hosts/{id}/admin-credentials", s.handleDeleteAdminCredentials)
// Per-host schedule CRUD. Mutations bump host_schedule_version
// and async-push to a connected agent (see schedule_push.go).
r.Get("/hosts/{id}/schedules", s.handleListSchedules)
@@ -134,12 +149,23 @@ func (s *Server) routes(r chi.Router) {
// mounted at the equivalent path outside /api below — both
// resolve to the same handler, which sniffs HX-Request.
r.Post("/hosts/{id}/source-groups/{gid}/run", s.handleRunSourceGroup)
// Repo-level run-now: prune (needs admin creds), check, unlock.
// HTMX forms are also mounted outside /api below.
r.Post("/hosts/{id}/repo/prune", s.handleRunRepoPrune)
r.Post("/hosts/{id}/repo/check", s.handleRunRepoCheck)
r.Post("/hosts/{id}/repo/unlock", s.handleRunRepoUnlock)
})
// Per-source-group Run-now (HTMX form action). Available even
// when the server is started without UI templates so REST callers
// against the non-/api path also work.
r.Post("/hosts/{id}/source-groups/{gid}/run", s.handleRunSourceGroup)
// Repo-level run-now (HTMX form actions). Same handlers as the /api
// variants — wantsHTML sniff distinguishes JSON vs HTMX response.
r.Post("/hosts/{id}/repo/prune", s.handleRunRepoPrune)
r.Post("/hosts/{id}/repo/check", s.handleRunRepoCheck)
r.Post("/hosts/{id}/repo/unlock", s.handleRunRepoUnlock)
// Retired routes — see ui_handlers.go for the messages. Mounted
// outside the UI gate so cached browser tabs get a clear 410
// even if the server runs without templates.
@@ -202,6 +228,9 @@ func (s *Server) routes(r chi.Router) {
r.Post("/hosts/{id}/repo/credentials", s.handleUIRepoCredentialsSave)
r.Post("/hosts/{id}/repo/bandwidth", s.handleUIRepoBandwidthSave)
r.Post("/hosts/{id}/repo/maintenance", s.handleUIRepoMaintenanceSave)
// Admin credentials form (separate slot for prune-capable user).
r.Post("/hosts/{id}/admin-credentials", s.handleUIAdminCredentialsSave)
r.Post("/hosts/{id}/admin-credentials/delete", s.handleUIAdminCredentialsDelete)
// Schedules tab + create/edit/delete forms.
r.Get("/hosts/{id}/schedules", s.handleUISchedulesList)
r.Get("/hosts/{id}/schedules/new", s.handleUIScheduleNewGet)
+251 -18
View File
@@ -7,6 +7,9 @@ import (
stdhttp "net/http"
"strconv"
"strings"
"time"
"github.com/oklog/ulid/v2"
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ui"
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
@@ -17,10 +20,31 @@ import (
// the page into three independent forms so saving one section
// doesn't disturb the others.
//
// GET /hosts/{id}/repo — render
// POST /hosts/{id}/repo/credentials — connection
// POST /hosts/{id}/repo/bandwidth — host-wide bw caps
// POST /hosts/{id}/repo/maintenance — forget/prune/check cadences
// GET /hosts/{id}/repo — render
// POST /hosts/{id}/repo/credentials — connection
// POST /hosts/{id}/repo/bandwidth — host-wide bw caps
// POST /hosts/{id}/repo/maintenance — forget/prune/check cadences
// POST /hosts/{id}/admin-credentials — admin (prune) creds
// POST /hosts/{id}/admin-credentials/delete — clear admin creds
// repoStatsView is a flat, pre-dereferenced projection of
// store.HostRepoStats for use in templates. Nil pointer fields are
// collapsed to zero/false; the size and timestamp fields are accompanied
// by a Has* sentinel so the template can distinguish "zero" from "not yet
// known." LockPresent has no sentinel: an unknown lock state reads as
// false.
type repoStatsView struct {
// HasTotalSize reports whether TotalSizeBytes was present in the stats row.
HasTotalSize bool
TotalSizeBytes int64
// HasRawSize reports whether RawSizeBytes was present in the stats row.
HasRawSize bool
RawSizeBytes int64
// HasLastCheck reports whether a last-check timestamp was present.
// LastCheckAgo is the relTimeAgo rendering of LastCheckAt.
HasLastCheck bool
LastCheckAt time.Time
LastCheckAgo string
// LastCheckStatus is copied as-is from the stats row.
LastCheckStatus string
// LockPresent is false both when the row says so and when it is unknown.
LockPresent bool
// HasLastPrune reports whether a last-prune timestamp was present.
// LastPruneAgo is the relTimeAgo rendering of LastPruneAt.
HasLastPrune bool
LastPruneAt time.Time
LastPruneAgo string
}
type hostRepoPage struct {
hostChromeData
@@ -30,6 +54,11 @@ type hostRepoPage struct {
RepoUsername string
HasPassword bool
// Admin credentials (optional, prune-only — separate slot).
AdminURL string
AdminUsername string
HasAdminPassword bool
// Bandwidth (form values, blank means "no cap")
BandwidthUp string
BandwidthDown string
@@ -37,6 +66,14 @@ type hostRepoPage struct {
// Maintenance row
Maintenance store.HostRepoMaintenance
// Online mirrors Hub.Connected so Run-now button disabled state is
// accurate at render time.
Online bool
// StatsView is a pre-dereferenced projection of host_repo_stats.
// Nil when no row exists yet (fresh hosts).
StatsView *repoStatsView
// Snapshots-by-tag — map[group_name]count, plus an "untagged" row.
SnapshotsByTag map[string]int
UntaggedSnapshots int
@@ -44,6 +81,7 @@ type hostRepoPage struct {
// Inline form-error banners. Empty when no error for that section.
CredentialsError string
AdminCredsError string
BandwidthError string
MaintenanceError string
@@ -61,7 +99,7 @@ func (s *Server) loadHostRepoPage(r *stdhttp.Request, host store.Host) (*hostRep
}
// Credentials (redacted).
enc, err := s.deps.Store.GetHostCredentials(r.Context(), host.ID)
enc, err := s.deps.Store.GetHostCredentials(r.Context(), host.ID, store.CredKindRepo)
switch {
case err == nil:
plain, derr := s.deps.AEAD.Decrypt(enc, []byte("host:"+host.ID))
@@ -79,6 +117,60 @@ func (s *Server) loadHostRepoPage(r *stdhttp.Request, host store.Host) (*hostRep
return nil, err
}
// Admin credentials (optional — prune-only slot).
adminEnc, aerr := s.deps.Store.GetHostCredentials(r.Context(), host.ID, store.CredKindAdmin)
switch {
case aerr == nil:
plain, derr := s.deps.AEAD.Decrypt(adminEnc, []byte("host:"+host.ID+":admin"))
if derr == nil {
var blob repoCredsBlob
if jerr := json.Unmarshal(plain, &blob); jerr == nil {
p.AdminURL = blob.RepoURL
p.AdminUsername = blob.RepoUsername
p.HasAdminPassword = blob.RepoPassword != ""
}
}
case errors.Is(aerr, store.ErrNotFound):
// admin slot not configured — fine
default:
return nil, aerr
}
// Online status.
if s.deps.Hub != nil {
p.Online = s.deps.Hub.Connected(host.ID)
}
// Repo stats (tolerate ErrNotFound — fresh hosts have no row yet).
if stats, serr := s.deps.Store.GetHostRepoStats(r.Context(), host.ID); serr == nil {
sv := &repoStatsView{}
if stats.TotalSizeBytes != nil {
sv.HasTotalSize = true
sv.TotalSizeBytes = *stats.TotalSizeBytes
}
if stats.RawSizeBytes != nil {
sv.HasRawSize = true
sv.RawSizeBytes = *stats.RawSizeBytes
}
if stats.LastCheckAt != nil {
sv.HasLastCheck = true
sv.LastCheckAt = *stats.LastCheckAt
sv.LastCheckAgo = relTimeAgo(*stats.LastCheckAt)
}
sv.LastCheckStatus = stats.LastCheckStatus
if stats.LockPresent != nil {
sv.LockPresent = *stats.LockPresent
}
if stats.LastPruneAt != nil {
sv.HasLastPrune = true
sv.LastPruneAt = *stats.LastPruneAt
sv.LastPruneAgo = relTimeAgo(*stats.LastPruneAt)
}
p.StatsView = sv
} else if !errors.Is(serr, store.ErrNotFound) {
return nil, serr
}
// Bandwidth.
if host.BandwidthUpKBps != nil {
p.BandwidthUp = strconv.Itoa(*host.BandwidthUpKBps)
@@ -152,11 +244,11 @@ func (s *Server) handleUIHostRepo(w stdhttp.ResponseWriter, r *stdhttp.Request)
}
}
// renderRepoFormError loads the page state, overlays the section's
// error banner, and renders with a 422. Save-success goes through a
// 303 redirect with `?saved=<section>` instead, so this path is for
// validation failures only.
func (s *Server) renderRepoPage(w stdhttp.ResponseWriter, r *stdhttp.Request, u *ui.User, host *store.Host, credErr, bwErr, mntErr string) {
// renderRepoPage loads the page state, overlays section error banners,
// and renders with a 422. Save-success goes through a 303 redirect
// with `?saved=<section>` instead, so this path is for validation
// failures only.
func (s *Server) renderRepoPage(w stdhttp.ResponseWriter, r *stdhttp.Request, u *ui.User, host *store.Host, credErr, adminErr, bwErr, mntErr string) {
page, err := s.loadHostRepoPage(r, *host)
if err != nil {
slog.Error("ui repo: reload after save", "host_id", host.ID, "err", err)
@@ -164,6 +256,7 @@ func (s *Server) renderRepoPage(w stdhttp.ResponseWriter, r *stdhttp.Request, u
return
}
page.CredentialsError = credErr
page.AdminCredsError = adminErr
page.BandwidthError = bwErr
page.MaintenanceError = mntErr
view := s.baseView(u)
@@ -198,13 +291,13 @@ func (s *Server) handleUIRepoCredentialsSave(w stdhttp.ResponseWriter, r *stdhtt
repoPass := r.PostForm.Get("repo_password") // do NOT trim — operators may use trailing space deliberately
if repoURL == "" {
s.renderRepoPage(w, r, u, host, "Repo URL is required.", "", "")
s.renderRepoPage(w, r, u, host, "Repo URL is required.", "", "", "")
return
}
// Merge with existing blob — same semantics as the JSON PUT.
existing := repoCredsBlob{}
if cur, err := s.deps.Store.GetHostCredentials(r.Context(), host.ID); err == nil {
if cur, err := s.deps.Store.GetHostCredentials(r.Context(), host.ID, store.CredKindRepo); err == nil {
if plain, derr := s.deps.AEAD.Decrypt(cur, []byte("host:"+host.ID)); derr == nil {
_ = json.Unmarshal(plain, &existing)
}
@@ -217,7 +310,7 @@ func (s *Server) handleUIRepoCredentialsSave(w stdhttp.ResponseWriter, r *stdhtt
if existing.RepoPassword == "" {
s.renderRepoPage(w, r, u, host,
"No password on file yet — set one before saving the URL/username.",
"", "")
"", "", "")
return
}
@@ -227,7 +320,7 @@ func (s *Server) handleUIRepoCredentialsSave(w stdhttp.ResponseWriter, r *stdhtt
stdhttp.Error(w, "internal", stdhttp.StatusInternalServerError)
return
}
if err := s.deps.Store.SetHostCredentials(r.Context(), host.ID, enc); err != nil {
if err := s.deps.Store.SetHostCredentials(r.Context(), host.ID, store.CredKindRepo, enc); err != nil {
slog.Error("ui repo creds: persist", "err", err)
stdhttp.Error(w, "internal", stdhttp.StatusInternalServerError)
return
@@ -256,7 +349,7 @@ func (s *Server) handleUIRepoBandwidthSave(w stdhttp.ResponseWriter, r *stdhttp.
up, upErr := parseOptionalNonNegInt(r.PostForm.Get("bandwidth_up"))
down, downErr := parseOptionalNonNegInt(r.PostForm.Get("bandwidth_down"))
if upErr != nil || downErr != nil {
s.renderRepoPage(w, r, u, host, "",
s.renderRepoPage(w, r, u, host, "", "",
"Bandwidth caps must be non-negative whole numbers (or blank for no cap).",
"")
return
@@ -294,19 +387,19 @@ func (s *Server) handleUIRepoMaintenanceSave(w stdhttp.ResponseWriter, r *stdhtt
"forget": forgetCron, "prune": pruneCron, "check": checkCron,
} {
if expr == "" {
s.renderRepoPage(w, r, u, host, "", "",
s.renderRepoPage(w, r, u, host, "", "", "",
label+" cadence is required.")
return
}
if _, err := cronParser.Parse(expr); err != nil {
s.renderRepoPage(w, r, u, host, "", "",
s.renderRepoPage(w, r, u, host, "", "", "",
label+" cadence didn't parse: "+err.Error())
return
}
}
subset, err := strconv.Atoi(subsetStr)
if err != nil || subset < 0 || subset > 100 {
s.renderRepoPage(w, r, u, host, "", "",
s.renderRepoPage(w, r, u, host, "", "", "",
"check subset % must be between 0 and 100.")
return
}
@@ -348,3 +441,143 @@ func parseOptionalNonNegInt(s string) (*int, error) {
}
return &n, nil
}
// relTimeAgo renders how long ago t happened as a compact label for
// stats panels:
//
//   - under one minute (including timestamps in the future): "just now"
//   - under one hour: whole minutes, e.g. "5m ago"
//   - under one day: whole hours, e.g. "3h ago"
//   - under thirty days: whole days, e.g. "2d ago"
//   - otherwise: the calendar date of t as YYYY-MM-DD
//
// It is an ordinary Go function rather than a template func so that
// handlers can call it directly while building view structs.
func relTimeAgo(t time.Time) string {
	elapsed := time.Since(t)
	if elapsed < time.Minute {
		// Negative durations (t in the future) land here as well.
		return "just now"
	}
	if elapsed < time.Hour {
		return strconv.Itoa(int(elapsed.Minutes())) + "m ago"
	}
	if elapsed < 24*time.Hour {
		return strconv.Itoa(int(elapsed.Hours())) + "h ago"
	}
	if elapsed < 30*24*time.Hour {
		return strconv.Itoa(int(elapsed.Hours()/24)) + "d ago"
	}
	return t.Format("2006-01-02")
}
// handleUIAdminCredentialsSave serves the HTML form POST to
// /hosts/{id}/admin-credentials. It is the admin-slot counterpart of
// handleUIRepoCredentialsSave: the blob is stored under
// store.CredKindAdmin and sealed with the AAD "host:<id>:admin".
//
// Flow:
//   - an entirely blank submission is a no-op and redirects back to the
//     repo page without a ?saved= marker;
//   - otherwise the submitted URL and username replace the stored ones,
//     while a blank password keeps whatever password is already stored;
//   - a missing URL or password after merging re-renders the page with
//     an inline admin-credentials error;
//   - on success the blob is persisted, an audit entry is appended
//     (best effort), the creds are pushed to the agent if it is
//     connected, and the browser is redirected with
//     ?saved=admin_credentials.
func (s *Server) handleUIAdminCredentialsSave(w stdhttp.ResponseWriter, r *stdhttp.Request) {
	u := s.requireUIUser(w, r)
	if u == nil {
		// Presumably requireUIUser has already written the response.
		return
	}
	host, ok := s.loadHostForUI(w, r)
	if !ok {
		return
	}
	if err := r.ParseForm(); err != nil {
		stdhttp.Error(w, "bad request", stdhttp.StatusBadRequest)
		return
	}

	ctx := r.Context()
	repoPagePath := "/hosts/" + host.ID + "/repo"

	// URL and username are trimmed; the password is taken verbatim.
	submittedURL := strings.TrimSpace(r.PostForm.Get("repo_url"))
	submittedUser := strings.TrimSpace(r.PostForm.Get("repo_username"))
	submittedPass := r.PostForm.Get("repo_password")

	// Nothing filled in at all: treat Save as harmless and bounce back.
	// Clearing the slot is the job of the separate delete route.
	if submittedURL == "" && submittedUser == "" && submittedPass == "" {
		stdhttp.Redirect(w, r, repoPagePath, stdhttp.StatusSeeOther)
		return
	}

	adminAAD := []byte("host:" + host.ID + ":admin")

	// Start from the stored admin blob when one exists and decrypts;
	// any lookup/decrypt/decode failure just leaves the blob empty.
	var merged repoCredsBlob
	if stored, gerr := s.deps.Store.GetHostCredentials(ctx, host.ID, store.CredKindAdmin); gerr == nil {
		if plain, derr := s.deps.AEAD.Decrypt(stored, adminAAD); derr == nil {
			_ = json.Unmarshal(plain, &merged)
		}
	}
	merged.RepoURL = submittedURL
	merged.RepoUsername = submittedUser
	if submittedPass != "" {
		merged.RepoPassword = submittedPass
	}

	switch {
	case merged.RepoURL == "":
		s.renderRepoPage(w, r, u, host, "", "Repo URL is required.", "", "")
		return
	case merged.RepoPassword == "":
		s.renderRepoPage(w, r, u, host, "",
			"No password on file yet — set one before saving the URL/username.",
			"", "")
		return
	}

	sealed, err := s.encryptRepoCreds(merged, adminAAD)
	if err != nil {
		slog.Error("ui admin creds: encrypt", "err", err)
		stdhttp.Error(w, "internal", stdhttp.StatusInternalServerError)
		return
	}
	if err := s.deps.Store.SetHostCredentials(ctx, host.ID, store.CredKindAdmin, sealed); err != nil {
		slog.Error("ui admin creds: persist", "err", err)
		stdhttp.Error(w, "internal", stdhttp.StatusInternalServerError)
		return
	}

	// Audit is best effort: a failed append does not fail the save.
	_ = s.deps.Store.AppendAudit(ctx, store.AuditEntry{
		ID:         ulid.Make().String(),
		UserID:     &u.ID,
		Actor:      "user",
		Action:     "host.admin_credentials_set",
		TargetKind: ptr("host"),
		TargetID:   &host.ID,
		TS:         nowUTC(),
	})

	// Push to the agent only when it is currently connected; a failed
	// push is logged and does not affect the redirect.
	if hub := s.deps.Hub; hub != nil && hub.Connected(host.ID) {
		if perr := s.pushAdminCredsToAgent(ctx, host.ID); perr != nil {
			slog.Warn("ui admin creds: push to agent", "host_id", host.ID, "err", perr)
		}
	}
	stdhttp.Redirect(w, r, repoPagePath+"?saved=admin_credentials", stdhttp.StatusSeeOther)
}
// handleUIAdminCredentialsDelete serves the HTML form POST to
// /hosts/{id}/admin-credentials/delete. It removes the host's admin
// credential slot and redirects back to the repo page with
// ?saved=admin_credentials.
//
// The delete is idempotent from the operator's point of view: a
// store.ErrNotFound result still redirects, but no audit entry is
// written because nothing was removed. Any other store error yields a
// 500.
func (s *Server) handleUIAdminCredentialsDelete(w stdhttp.ResponseWriter, r *stdhttp.Request) {
	u := s.requireUIUser(w, r)
	if u == nil {
		return
	}
	host, ok := s.loadHostForUI(w, r)
	if !ok {
		return
	}

	delErr := s.deps.Store.DeleteHostCredentials(r.Context(), host.ID, store.CredKindAdmin)
	switch {
	case delErr == nil:
		// A row was actually removed — record it (best effort).
		_ = s.deps.Store.AppendAudit(r.Context(), store.AuditEntry{
			ID:         ulid.Make().String(),
			UserID:     &u.ID,
			Actor:      "user",
			Action:     "host.admin_credentials_deleted",
			TargetKind: ptr("host"),
			TargetID:   &host.ID,
			TS:         nowUTC(),
		})
	case errors.Is(delErr, store.ErrNotFound):
		// Nothing to delete; fall through to the redirect.
	default:
		slog.Error("ui admin creds: delete", "host_id", host.ID, "err", delErr)
		stdhttp.Error(w, "internal", stdhttp.StatusInternalServerError)
		return
	}
	stdhttp.Redirect(w, r, "/hosts/"+host.ID+"/repo?saved=admin_credentials", stdhttp.StatusSeeOther)
}
+400
View File
@@ -0,0 +1,400 @@
// ui_repo_test.go — integration tests for the Repo page HTML UI.
// Covers: admin-creds form rendering, stats panel, lock banner,
// run-now button disabled states, admin-creds form save/delete.
package http
import (
"context"
"io"
stdhttp "net/http"
"net/http/httptest"
"net/url"
"path/filepath"
"strings"
"testing"
"time"
"gitea.dcglab.co.uk/steve/restic-manager/internal/crypto"
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/config"
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ui"
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ws"
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
)
// newTestServerWithUI builds a Server wired with a real UI renderer so
// HTML page tests can render and inspect full template output.
//
// It opens a file-backed store in t.TempDir, generates and loads a
// secret key, constructs the AEAD and a fresh Hub, and serves the
// resulting handler from an httptest.Server. The store and the test
// server are closed through t.Cleanup.
//
// Any setup failure — including key generation, key loading and AEAD
// construction — aborts the test immediately, so later assertions
// never run against a half-initialised server.
//
// Returns the Server, the base URL of the test HTTP server, and the
// backing store.
func newTestServerWithUI(t *testing.T) (*Server, string, *store.Store) {
	t.Helper()
	dir := t.TempDir()
	st, err := store.Open(context.Background(), filepath.Join(dir, "rm.db"))
	if err != nil {
		t.Fatalf("store: %v", err)
	}
	t.Cleanup(func() { _ = st.Close() })

	keyPath := filepath.Join(dir, "secret.key")
	if err := crypto.GenerateKeyFile(keyPath); err != nil {
		t.Fatalf("generate key file: %v", err)
	}
	key, err := crypto.LoadKeyFromFile(keyPath)
	if err != nil {
		t.Fatalf("load key: %v", err)
	}
	aead, err := crypto.NewAEAD(key)
	if err != nil {
		t.Fatalf("new aead: %v", err)
	}

	renderer, err := ui.New()
	if err != nil {
		t.Fatalf("ui.New: %v", err)
	}
	deps := Deps{
		Cfg:   config.Config{Listen: ":0", DataDir: dir, SecretKeyFile: keyPath},
		Store: st,
		AEAD:  aead,
		Hub:   ws.NewHub(),
		UI:    renderer,
	}
	s := New(deps)
	ts := httptest.NewServer(s.srv.Handler)
	t.Cleanup(ts.Close)
	return s, ts.URL, st
}
// getRepoPage issues GET {baseURL}/hosts/{hostID}/repo and returns the
// response body as a string.
//
// Redirects are not followed, so a redirect response shows up as a
// non-200 status. The cookie is attached when non-nil; passing nil
// sends an unauthenticated request. The test is aborted if the request
// cannot be built or sent, if the status is not 200, or if the body
// cannot be read.
func getRepoPage(t *testing.T, baseURL, hostID string, cookie *stdhttp.Cookie) string {
	t.Helper()
	client := &stdhttp.Client{
		CheckRedirect: func(_ *stdhttp.Request, _ []*stdhttp.Request) error {
			return stdhttp.ErrUseLastResponse
		},
	}
	req, err := stdhttp.NewRequest("GET", baseURL+"/hosts/"+hostID+"/repo", nil)
	if err != nil {
		t.Fatalf("new request: %v", err)
	}
	// AddCookie dereferences its argument, so guard against nil the
	// same way postForm does.
	if cookie != nil {
		req.AddCookie(cookie)
	}
	res, err := client.Do(req)
	if err != nil {
		t.Fatalf("GET /hosts/%s/repo: %v", hostID, err)
	}
	defer res.Body.Close()
	if res.StatusCode != stdhttp.StatusOK {
		t.Fatalf("GET /hosts/%s/repo: want 200, got %d", hostID, res.StatusCode)
	}
	raw, err := io.ReadAll(res.Body)
	if err != nil {
		t.Fatalf("GET /hosts/%s/repo: read body: %v", hostID, err)
	}
	return string(raw)
}
// postForm sends data as an application/x-www-form-urlencoded POST to
// baseURL+path and reports the response status code together with the
// Location header. Redirects are never followed, so a 303 from the
// handler is returned to the caller as-is. The cookie is optional:
// nil sends the request without one. Request construction or transport
// failures abort the test.
func postForm(t *testing.T, baseURL, path string, data url.Values, cookie *stdhttp.Cookie) (int, string) {
	t.Helper()

	body := strings.NewReader(data.Encode())
	req, err := stdhttp.NewRequest(stdhttp.MethodPost, baseURL+path, body)
	if err != nil {
		t.Fatalf("new request: %v", err)
	}
	req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
	if cookie != nil {
		req.AddCookie(cookie)
	}

	// Surface the first response instead of chasing redirects.
	stopAtFirst := func(_ *stdhttp.Request, _ []*stdhttp.Request) error {
		return stdhttp.ErrUseLastResponse
	}
	httpClient := stdhttp.Client{CheckRedirect: stopAtFirst}

	resp, err := httpClient.Do(req)
	if err != nil {
		t.Fatalf("POST %s: %v", path, err)
	}
	defer resp.Body.Close()

	location := resp.Header.Get("Location")
	return resp.StatusCode, location
}
// ----- rendering tests ------------------------------------------------
// TestUIRepoPageRendersAdminCredsForm renders /hosts/{id}/repo for a
// freshly created host that has no admin credentials and checks that
// both the "Admin credentials" heading and the "— not yet set —"
// password placeholder are present in the HTML.
func TestUIRepoPageRendersAdminCredsForm(t *testing.T) {
	t.Parallel()
	_, baseURL, st := newTestServerWithUI(t)
	cookie := loginAsAdmin(t, st)
	hostID := makeHost(t, st, "repo-page-admin-form")

	page := getRepoPage(t, baseURL, hostID, cookie)

	expectations := []struct {
		needle  string
		failMsg string
	}{
		{"Admin credentials", "page missing 'Admin credentials' heading"},
		{"— not yet set —", "page missing '— not yet set —' placeholder for admin password"},
	}
	for _, exp := range expectations {
		if !strings.Contains(page, exp.needle) {
			t.Error(exp.failMsg)
		}
	}
}
// TestUIRepoPageRendersStatsPanel seeds a host_repo_stats row (total
// size 5 GB, last check two hours ago with status "ok"), renders the
// repo page, and checks that the "Repo health" panel plus the seeded
// values show up in the output.
func TestUIRepoPageRendersStatsPanel(t *testing.T) {
	t.Parallel()
	_, baseURL, st := newTestServerWithUI(t)
	cookie := loginAsAdmin(t, st)
	hostID := makeHost(t, st, "repo-page-stats")

	var (
		sizeBytes   = int64(5_000_000_000) // 5 GB
		lastChecked = time.Now().Add(-2 * time.Hour).UTC()
	)
	seed := store.HostRepoStats{
		TotalSizeBytes:  &sizeBytes,
		LastCheckAt:     &lastChecked,
		LastCheckStatus: "ok",
	}
	if err := st.UpsertHostRepoStats(context.Background(), hostID, seed); err != nil {
		t.Fatalf("upsert stats: %v", err)
	}

	page := getRepoPage(t, baseURL, hostID, cookie)

	expectations := []struct {
		needle  string
		failMsg string
	}{
		{"Repo health", "page missing 'Repo health' heading"},
		// The bytes helper renders 5 GB as "5.0 GB" (with a <span> unit suffix).
		{"5.0", "page missing '5.0' (total size formatted bytes)"},
		{"ok", "page missing 'ok' check status"},
	}
	for _, exp := range expectations {
		if !strings.Contains(page, exp.needle) {
			t.Error(exp.failMsg)
		}
	}
}
// TestUIRepoPageRendersLockBanner seeds repo stats whose only field is
// LockPresent=true, renders the repo page, and checks that the
// "Stale lock detected" warning is shown.
func TestUIRepoPageRendersLockBanner(t *testing.T) {
	t.Parallel()
	_, baseURL, st := newTestServerWithUI(t)
	cookie := loginAsAdmin(t, st)
	hostID := makeHost(t, st, "repo-page-lock")

	locked := true
	seed := store.HostRepoStats{LockPresent: &locked}
	if err := st.UpsertHostRepoStats(context.Background(), hostID, seed); err != nil {
		t.Fatalf("upsert stats: %v", err)
	}

	if page := getRepoPage(t, baseURL, hostID, cookie); !strings.Contains(page, "Stale lock detected") {
		t.Error("page missing stale lock warning")
	}
}
// TestUIRepoRunNowButtonsDisabledWhenOffline renders the repo page for
// a host that has no WebSocket connection registered in the Hub and
// checks that the Run-now section carries at least three occurrences
// of "disabled" (one per button).
//
// The Run-now section is taken as the text between the
// "Run now · one-time" marker and the first "Danger zone" marker that
// follows it. Searching for the end marker relative to the start keeps
// the slice bounds valid even if "Danger zone" also appears earlier on
// the page.
func TestUIRepoRunNowButtonsDisabledWhenOffline(t *testing.T) {
	t.Parallel()
	_, baseURL, st := newTestServerWithUI(t)
	cookie := loginAsAdmin(t, st)
	hostID := makeHost(t, st, "repo-page-offline")

	// No WS connection → Hub.Connected returns false.
	body := getRepoPage(t, baseURL, hostID, cookie)

	runNowIdx := strings.Index(body, "Run now · one-time")
	if runNowIdx < 0 {
		t.Fatal("page missing 'Run now · one-time' section")
	}
	fromRunNow := body[runNowIdx:]
	dangerOff := strings.Index(fromRunNow, "Danger zone")
	if dangerOff < 0 {
		t.Fatal("page missing 'Danger zone' section")
	}
	runNowSection := fromRunNow[:dangerOff]

	// The disabled attribute is added conditionally per button; a plain
	// substring count is enough to confirm all three carry it.
	disabledCount := strings.Count(runNowSection, "disabled")
	if disabledCount < 3 {
		t.Errorf("expected at least 3 disabled attributes in Run-now section (one per button), got %d", disabledCount)
	}
}
// TestUIRepoPruneButtonDisabledWithoutAdminCreds renders the repo page
// for a host with no admin credentials and checks that the page
// carries the "set admin credentials first" hint.
//
// The host is NOT connected to the Hub in this test: faking a
// WebSocket connection is not practical here, so the test relies on
// the hint being rendered regardless of online state. It therefore
// does not prove the button state for an online host.
func TestUIRepoPruneButtonDisabledWithoutAdminCreds(t *testing.T) {
	t.Parallel()
	_, baseURL, st := newTestServerWithUI(t)
	cookie := loginAsAdmin(t, st)
	hostID := makeHost(t, st, "repo-page-prune-no-admin")

	body := getRepoPage(t, baseURL, hostID, cookie)
	if !strings.Contains(body, "set admin credentials first") {
		t.Error("page missing 'set admin credentials first' on prune button")
	}
}
// ----- admin-creds form save/delete tests ----------------------------
// TestUIAdminCredentialsSaveRoundTrip posts a complete admin-credentials
// form, expects a 303 whose Location carries saved=admin_credentials,
// then re-fetches the repo page and checks that the password field now
// shows the "stored, leave blank to keep" placeholder. Finally it
// verifies that a 'host.admin_credentials_set' audit row exists for the
// host and is attributed to the logged-in user.
func TestUIAdminCredentialsSaveRoundTrip(t *testing.T) {
	t.Parallel()
	_, baseURL, st := newTestServerWithUI(t)
	cookie, userID := loginAsAdminWithID(t, st)
	hostID := makeHost(t, st, "admin-save-roundtrip")

	// Submit the admin credentials form.
	form := url.Values{}
	form.Set("repo_url", "rest:http://admin.example/h")
	form.Set("repo_username", "admin-user")
	form.Set("repo_password", "s3cr3t-admin")
	code, location := postForm(t, baseURL, "/hosts/"+hostID+"/admin-credentials", form, cookie)
	if code != stdhttp.StatusSeeOther {
		t.Fatalf("save: want 303, got %d", code)
	}
	if !strings.Contains(location, "saved=admin_credentials") {
		t.Errorf("redirect location should contain saved=admin_credentials, got %q", location)
	}

	// Load the page the redirect points at.
	if page := getRepoPage(t, baseURL, hostID, cookie); !strings.Contains(page, "stored, leave blank to keep") {
		t.Error("after save: page missing 'stored, leave blank to keep' placeholder for admin password")
	}

	// The save must have left an audit trail.
	auditRows, err := st.DB().QueryContext(context.Background(),
		`SELECT action, user_id FROM audit_log WHERE target_id = ? AND action = 'host.admin_credentials_set'`,
		hostID)
	if err != nil {
		t.Fatalf("query audit: %v", err)
	}
	defer auditRows.Close()

	matched := 0
	for auditRows.Next() {
		var (
			action string
			gotUID *string
		)
		if err := auditRows.Scan(&action, &gotUID); err != nil {
			t.Fatalf("scan: %v", err)
		}
		matched++
		if gotUID == nil || *gotUID != userID {
			t.Errorf("audit row user_id: want %q, got %v", userID, gotUID)
		}
	}
	if err := auditRows.Err(); err != nil {
		t.Fatalf("rows.Err: %v", err)
	}
	if matched == 0 {
		t.Error("audit row with action='host.admin_credentials_set' not found")
	}
}
// TestUIAdminCredentialsDelete seeds an encrypted admin-credentials row
// directly in the store, posts to the delete route, and checks three
// things: the response is a 303 carrying saved=admin_credentials, the
// admin row can no longer be fetched, and a
// 'host.admin_credentials_deleted' audit row attributed to the
// logged-in user exists.
func TestUIAdminCredentialsDelete(t *testing.T) {
	t.Parallel()
	srv, baseURL, st := newTestServerWithUI(t)
	cookie, userID := loginAsAdminWithID(t, st)
	hostID := makeHost(t, st, "admin-delete")
	ctx := context.Background()

	// Seed the admin slot without going through the HTTP layer.
	seedBlob := repoCredsBlob{
		RepoURL:      "rest:http://admin.example/h",
		RepoPassword: "pw",
	}
	sealed, err := srv.encryptRepoCreds(seedBlob, []byte("host:"+hostID+":admin"))
	if err != nil {
		t.Fatalf("encrypt: %v", err)
	}
	if err := st.SetHostCredentials(ctx, hostID, store.CredKindAdmin, sealed); err != nil {
		t.Fatalf("set admin creds: %v", err)
	}

	// Hit the delete route.
	code, location := postForm(t, baseURL, "/hosts/"+hostID+"/admin-credentials/delete", url.Values{}, cookie)
	if code != stdhttp.StatusSeeOther {
		t.Fatalf("delete: want 303, got %d", code)
	}
	if !strings.Contains(location, "saved=admin_credentials") {
		t.Errorf("redirect location: want saved=admin_credentials, got %q", location)
	}

	// The slot must be empty afterwards.
	_, getErr := st.GetHostCredentials(ctx, hostID, store.CredKindAdmin)
	if getErr == nil {
		t.Error("admin creds row still present after delete")
	}

	// And the removal must be audited.
	auditRows, err := st.DB().QueryContext(ctx,
		`SELECT action, user_id FROM audit_log WHERE target_id = ? AND action = 'host.admin_credentials_deleted'`,
		hostID)
	if err != nil {
		t.Fatalf("query audit: %v", err)
	}
	defer auditRows.Close()

	matched := 0
	for auditRows.Next() {
		var (
			action string
			gotUID *string
		)
		if err := auditRows.Scan(&action, &gotUID); err != nil {
			t.Fatalf("scan: %v", err)
		}
		matched++
		if gotUID == nil || *gotUID != userID {
			t.Errorf("audit row user_id: want %q, got %v", userID, gotUID)
		}
	}
	if err := auditRows.Err(); err != nil {
		t.Fatalf("rows.Err: %v", err)
	}
	if matched == 0 {
		t.Error("audit row with action='host.admin_credentials_deleted' not found")
	}
}
// TestUIAdminCredentialsDeleteIdempotent posts to the delete route for
// a host that never had admin credentials and expects the usual 303
// redirect rather than a 404 or 500.
func TestUIAdminCredentialsDeleteIdempotent(t *testing.T) {
	t.Parallel()
	_, baseURL, st := newTestServerWithUI(t)
	cookie := loginAsAdmin(t, st)
	hostID := makeHost(t, st, "admin-delete-noop")

	deletePath := "/hosts/" + hostID + "/admin-credentials/delete"
	if code, _ := postForm(t, baseURL, deletePath, url.Values{}, cookie); code != stdhttp.StatusSeeOther {
		t.Fatalf("delete (noop): want 303, got %d", code)
	}
}
// TestUIAdminCredentialsSaveAllBlankIsNoop posts the admin-credentials
// form with every field empty. The handler must answer 303 without a
// ?saved= marker in the Location and must not create an admin
// credentials row.
func TestUIAdminCredentialsSaveAllBlankIsNoop(t *testing.T) {
	t.Parallel()
	_, baseURL, st := newTestServerWithUI(t)
	cookie := loginAsAdmin(t, st)
	hostID := makeHost(t, st, "admin-save-blank")

	blank := url.Values{}
	for _, field := range []string{"repo_url", "repo_username", "repo_password"} {
		blank.Set(field, "")
	}
	code, location := postForm(t, baseURL, "/hosts/"+hostID+"/admin-credentials", blank, cookie)
	if code != stdhttp.StatusSeeOther {
		t.Fatalf("blank save: want 303, got %d", code)
	}
	// A no-op save must not trigger the "saved" banner.
	if strings.Contains(location, "?saved=") {
		t.Errorf("blank save: redirect Location %q must not contain ?saved=", location)
	}
	// Nor may it have written anything to the admin slot.
	_, getErr := st.GetHostCredentials(context.Background(), hostID, store.CredKindAdmin)
	if getErr == nil {
		t.Error("admin creds row created unexpectedly for blank save")
	}
}
+116
View File
@@ -0,0 +1,116 @@
// Package maintenance owns the server-side scheduler that fires
// forget/prune/check on the cadences operators set on
// host_repo_maintenance rows. Independent of the agent's local cron
// (which now only handles backup schedules).
//
// The ticker is intentionally side-effect-free at the package
// boundary: it asks an injected Backend for current state and emits
// a list of Decisions for the caller to act on. Easy to unit-test
// without a running server.
package maintenance
import (
"context"
"errors"
"time"
"github.com/robfig/cron/v3"
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
)
// Decision is one cadence-driven dispatch the ticker recommends.
// It is a plain value: the ticker only produces it, and acting on it
// (persisting a job, sending the command) is left to the caller.
// SubsetPct is populated only when Kind == "check"; ignored for
// "forget" and "prune".
type Decision struct {
	// HostID identifies the host whose maintenance row produced this
	// decision.
	HostID string
	Kind   string // "forget" | "prune" | "check"
	// SubsetPct is copied from the maintenance row's CheckSubsetPct
	// for check decisions and is 0 for the other kinds.
	SubsetPct int
}
// Backend is the subset of *store.Store the ticker depends on.
// Constrained interface so tests can pass a fake.
type Backend interface {
	// ListAllMaintenance returns the maintenance rows the ticker
	// evaluates on each Decide call.
	ListAllMaintenance(ctx context.Context) ([]store.HostRepoMaintenance, error)
	// LatestJobByKind returns the most recent job of the given kind
	// for the host. The ticker treats store.ErrNotFound as "no job
	// has ever run" and any other error as a reason to skip.
	LatestJobByKind(ctx context.Context, hostID, kind string) (*store.Job, error)
}
// Ticker decides which cadence-driven jobs are due to fire at a
// given instant. Stateless — the only state lives in the Backend.
type Ticker struct {
	// backend supplies maintenance rows and latest-job lookups.
	backend Backend
	// parser turns the stored cron expressions into schedules; it is
	// configured once in New.
	parser cron.Parser
}
// New returns a Ticker that reads its state from b. The embedded cron
// parser accepts five-field expressions: minute, hour, day-of-month,
// month and day-of-week.
func New(b Backend) *Ticker {
	const fiveFields = cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow

	tk := new(Ticker)
	tk.backend = b
	tk.parser = cron.NewParser(fiveFields)
	return tk
}
// Decide reports which maintenance jobs the ticker would dispatch at
// `now`. For every maintenance row it evaluates forget, prune and
// check — in that order — and collects one Decision per kind that is
// due.
//
// Acting on the result is the caller's job: checking that the host is
// online, persisting the job row, and shipping command.run.
//
// A backend listing error is returned unchanged. With no maintenance
// rows (the usual state of a fresh install), or when nothing is due,
// the result is a nil slice and a nil error.
func (t *Ticker) Decide(ctx context.Context, now time.Time) ([]Decision, error) {
	rows, err := t.backend.ListAllMaintenance(ctx)
	if err != nil {
		return nil, err
	}

	var due []Decision
	for _, row := range rows {
		// One candidate per maintenance kind; only "check" carries a
		// subset percentage.
		candidates := []struct {
			kind    string
			expr    string
			enabled bool
			subset  int
		}{
			{"forget", row.ForgetCron, row.ForgetEnabled, 0},
			{"prune", row.PruneCron, row.PruneEnabled, 0},
			{"check", row.CheckCron, row.CheckEnabled, row.CheckSubsetPct},
		}
		for _, c := range candidates {
			d, ok := t.dueFor(ctx, now, row.HostID, c.kind, c.expr, c.enabled, c.subset)
			if !ok {
				continue
			}
			due = append(due, d)
		}
	}
	return due, nil
}
// dueFor reports whether the cron expression `expr` has a fire-instant
// strictly after the anchor and at-or-before now, and if so returns
// the Decision to dispatch.
//
// Anchor selection:
//   - When LatestJobByKind returns a job: anchor = j.CreatedAt.
//   - When LatestJobByKind returns ErrNotFound: anchor = now - 24h
//     (first-run case — cap the lookback so a brand-new host doesn't
//     fire 30 days of missed monthly-checks on first tick).
//   - When LatestJobByKind returns a hard error (or a nil job with a
//     nil error): skip this kind for this host on this tick.
//
// Time zone: a cron spec without an explicit TZ= prefix is evaluated
// in the location of the time passed to Schedule.Next. The anchor is
// therefore converted to now's location before asking for the next
// fire-instant, so the expression always means the same wall-clock
// time regardless of which location the stored job timestamp carries.
// This does not move the anchor instant, only the zone the cron
// fields are matched in.
//
// Disabled (`enabled == false`) or empty cron skips silently.
// Cron parse failures skip silently — the schedule/maintenance
// routes already validate cron at write time, so this is defensive.
//
// `subset` is copied into the returned Decision unchanged.
func (t *Ticker) dueFor(ctx context.Context, now time.Time, hostID, kind, expr string, enabled bool, subset int) (Decision, bool) {
	if !enabled || expr == "" {
		return Decision{}, false
	}
	sched, err := t.parser.Parse(expr)
	if err != nil {
		return Decision{}, false
	}
	j, err := t.backend.LatestJobByKind(ctx, hostID, kind)
	var anchor time.Time
	switch {
	case err == nil && j != nil:
		anchor = j.CreatedAt
	case errors.Is(err, store.ErrNotFound):
		anchor = now.Add(-24 * time.Hour)
	default:
		// Hard error — skip this kind on this tick.
		return Decision{}, false
	}
	// Evaluate the schedule in now's location (see doc comment).
	next := sched.Next(anchor.In(now.Location()))
	if next.IsZero() || next.After(now) {
		return Decision{}, false
	}
	return Decision{HostID: hostID, Kind: kind, SubsetPct: subset}, true
}
+315
View File
@@ -0,0 +1,315 @@
package maintenance
import (
"context"
"errors"
"testing"
"time"
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
)
// fakeBackend implements Backend with table-driven canned responses.
// All fields are optional; the zero value lists no rows and reports
// ErrNotFound for every job lookup.
type fakeBackend struct {
	// rows is returned verbatim by ListAllMaintenance.
	rows []store.HostRepoMaintenance
	// jobs[hostID][kind] -> job (if present, returned). If absent,
	// fakeBackend returns ErrNotFound by default.
	jobs map[string]map[string]*store.Job
	// hardErr forces a non-ErrNotFound failure for a given (host, kind).
	// It takes precedence over an entry in jobs for the same key.
	hardErr map[string]map[string]error
	// listErr forces ListAllMaintenance to fail.
	listErr error
}
// ListAllMaintenance returns the canned rows, or listErr when one has
// been configured.
func (f *fakeBackend) ListAllMaintenance(_ context.Context) ([]store.HostRepoMaintenance, error) {
	if err := f.listErr; err != nil {
		return nil, err
	}
	return f.rows, nil
}
// LatestJobByKind resolves a lookup in three steps: a configured hard
// error for (hostID, kind) wins, then a canned job for the same key,
// and otherwise store.ErrNotFound.
func (f *fakeBackend) LatestJobByKind(_ context.Context, hostID, kind string) (*store.Job, error) {
	// Indexing a nil or missing inner map yields the zero value, so no
	// presence checks are needed.
	if forced := f.hardErr[hostID][kind]; forced != nil {
		return nil, forced
	}
	if canned := f.jobs[hostID][kind]; canned != nil {
		return canned, nil
	}
	return nil, store.ErrNotFound
}
// mustTime converts an RFC3339 timestamp into a time.Time and aborts
// the calling test when the string does not parse.
func mustTime(t *testing.T, s string) time.Time {
	t.Helper()
	parsed, perr := time.Parse(time.RFC3339, s)
	if perr == nil {
		return parsed
	}
	t.Fatalf("parse %q: %v", s, perr)
	return parsed // not reached: Fatalf stops the test goroutine
}
func TestTickerSkipsDisabled(t *testing.T) {
t.Parallel()
be := &fakeBackend{
rows: []store.HostRepoMaintenance{{
HostID: "h1",
ForgetCron: "0 3 * * *",
ForgetEnabled: false,
PruneCron: "0 4 * * *",
PruneEnabled: false,
CheckCron: "0 5 * * *",
CheckEnabled: false,
}},
}
tk := New(be)
now := mustTime(t, "2026-05-04T04:00:00Z")
got, err := tk.Decide(context.Background(), now)
if err != nil {
t.Fatalf("Decide: %v", err)
}
if len(got) != 0 {
t.Errorf("expected no decisions, got %+v", got)
}
}
func TestTickerSkipsEmptyCron(t *testing.T) {
t.Parallel()
be := &fakeBackend{
rows: []store.HostRepoMaintenance{{
HostID: "h1",
ForgetCron: "",
ForgetEnabled: true,
PruneCron: "",
PruneEnabled: true,
CheckCron: "",
CheckEnabled: true,
}},
}
tk := New(be)
got, err := tk.Decide(context.Background(), mustTime(t, "2026-05-04T04:00:00Z"))
if err != nil {
t.Fatalf("Decide: %v", err)
}
if len(got) != 0 {
t.Errorf("expected no decisions, got %+v", got)
}
}
// TestTickerFiresWhenOverdue: the latest forget job is 25 hours old and
// the cron fires daily at 03:00, so at 04:00 a fire-instant lies
// between the last job and now. Exactly one forget decision for h1 is
// expected.
func TestTickerFiresWhenOverdue(t *testing.T) {
	t.Parallel()

	at := mustTime(t, "2026-05-04T04:00:00Z")
	// Latest forget job 25h ago.
	lastForget := &store.Job{ID: "j1", HostID: "h1", Kind: "forget", CreatedAt: at.Add(-25 * time.Hour)}

	backend := &fakeBackend{
		rows: []store.HostRepoMaintenance{{
			HostID:        "h1",
			ForgetCron:    "0 3 * * *",
			ForgetEnabled: true,
		}},
		jobs: map[string]map[string]*store.Job{
			"h1": {"forget": lastForget},
		},
	}

	decisions, err := New(backend).Decide(context.Background(), at)
	if err != nil {
		t.Fatalf("Decide: %v", err)
	}
	if len(decisions) != 1 || decisions[0].Kind != "forget" || decisions[0].HostID != "h1" {
		t.Errorf("expected one forget decision, got %+v", decisions)
	}
}
// TestTickerSuppressesWhenRecent: a forget job was already created at
// 03:30, after today's 03:00 fire-instant, so at 04:00 the next
// fire-instant is tomorrow and nothing must be dispatched.
func TestTickerSuppressesWhenRecent(t *testing.T) {
	t.Parallel()

	at := mustTime(t, "2026-05-04T04:00:00Z")
	lastForget := &store.Job{
		ID:        "j1",
		HostID:    "h1",
		Kind:      "forget",
		CreatedAt: mustTime(t, "2026-05-04T03:30:00Z"),
	}

	backend := &fakeBackend{
		rows: []store.HostRepoMaintenance{{
			HostID:        "h1",
			ForgetCron:    "0 3 * * *",
			ForgetEnabled: true,
		}},
		jobs: map[string]map[string]*store.Job{
			"h1": {"forget": lastForget},
		},
	}

	decisions, err := New(backend).Decide(context.Background(), at)
	if err != nil {
		t.Fatalf("Decide: %v", err)
	}
	if len(decisions) != 0 {
		t.Errorf("expected no decisions, got %+v", decisions)
	}
}
// TestTickerFirstRunAnchorBoundedAt24h covers the no-prior-job path,
// where the anchor is now-24h.
//
// Case 1 uses a daily 03:00 cron at 04:00: the anchor is the previous
// day at 04:00, the next fire-instant after it is today 03:00, which
// is within the window, so one decision is expected.
//
// Case 2 uses a weekly cron (Mondays 03:00) evaluated on Tuesday
// 2026-05-05 at 03:00: the anchor is Monday 03:00 exactly, the next
// fire-instant strictly after it is the following Monday, so nothing
// may fire.
func TestTickerFirstRunAnchorBoundedAt24h(t *testing.T) {
	t.Parallel()
	ctx := context.Background()

	// Case 1: daily cron, fire-instant inside the 24h lookback.
	dailyBackend := &fakeBackend{
		rows: []store.HostRepoMaintenance{{
			HostID:        "h1",
			ForgetCron:    "0 3 * * *",
			ForgetEnabled: true,
		}},
	}
	dailyTicker := New(dailyBackend)
	daily, err := dailyTicker.Decide(ctx, mustTime(t, "2026-05-04T04:00:00Z"))
	if err != nil {
		t.Fatalf("Decide: %v", err)
	}
	if len(daily) != 1 {
		t.Errorf("case1: expected 1 decision, got %+v", daily)
	}

	// Case 2: weekly cron whose most recent fire is not strictly inside
	// the lookback window. 2026-05-04 is a Monday, 2026-05-05 a Tuesday.
	weeklyBackend := &fakeBackend{
		rows: []store.HostRepoMaintenance{{
			HostID:        "h2",
			ForgetCron:    "0 3 * * 1",
			ForgetEnabled: true,
		}},
	}
	weeklyTicker := New(weeklyBackend)
	weekly, err := weeklyTicker.Decide(ctx, mustTime(t, "2026-05-05T03:00:00Z"))
	if err != nil {
		t.Fatalf("Decide case2: %v", err)
	}
	if len(weekly) != 0 {
		t.Errorf("case2: expected no decisions (cron fires < once/24h, prior fire was Monday 03:00 which is exactly 24h ago and anchor=now-24h means next-after is next Monday), got %+v", weekly)
	}
}
// TestTickerCheckDecisionCarriesSubset: an overdue check (last job 30
// days ago, daily cron) must yield a single check decision whose
// SubsetPct equals the row's CheckSubsetPct.
func TestTickerCheckDecisionCarriesSubset(t *testing.T) {
	t.Parallel()

	at := mustTime(t, "2026-05-04T04:00:00Z")
	lastCheck := &store.Job{ID: "j1", HostID: "h1", Kind: "check", CreatedAt: at.Add(-30 * 24 * time.Hour)}

	backend := &fakeBackend{
		rows: []store.HostRepoMaintenance{{
			HostID:         "h1",
			CheckCron:      "0 3 * * *",
			CheckEnabled:   true,
			CheckSubsetPct: 25,
		}},
		jobs: map[string]map[string]*store.Job{
			"h1": {"check": lastCheck},
		},
	}

	decisions, err := New(backend).Decide(context.Background(), at)
	if err != nil {
		t.Fatalf("Decide: %v", err)
	}
	if len(decisions) != 1 || decisions[0].Kind != "check" || decisions[0].SubsetPct != 25 {
		t.Errorf("expected check decision with SubsetPct=25, got %+v", decisions)
	}
}
// TestTickerHardJobErrorSkipsKind: when the latest-job lookup for
// forget fails with a non-ErrNotFound error, forget is skipped for
// this tick while check on the same host — whose lookup succeeds and
// is overdue — still fires. Decide itself must not return an error.
func TestTickerHardJobErrorSkipsKind(t *testing.T) {
	t.Parallel()

	at := mustTime(t, "2026-05-04T04:00:00Z")
	dbFailure := errors.New("synthetic db error")
	// check has a normal latest-job; should still fire.
	lastCheck := &store.Job{ID: "jc", HostID: "h1", Kind: "check", CreatedAt: at.Add(-25 * time.Hour)}

	backend := &fakeBackend{
		rows: []store.HostRepoMaintenance{{
			HostID:        "h1",
			ForgetCron:    "0 3 * * *",
			ForgetEnabled: true,
			CheckCron:     "0 3 * * *",
			CheckEnabled:  true,
		}},
		jobs: map[string]map[string]*store.Job{
			"h1": {"check": lastCheck},
		},
		hardErr: map[string]map[string]error{
			"h1": {"forget": dbFailure},
		},
	}

	decisions, err := New(backend).Decide(context.Background(), at)
	if err != nil {
		t.Fatalf("Decide: %v", err)
	}
	// Only the check decision should land — forget is skipped.
	if len(decisions) != 1 || decisions[0].Kind != "check" {
		t.Errorf("expected only check decision, got %+v", decisions)
	}
}
// TestTickerHandlesMultipleHosts: two hosts with different enabled
// kinds, both overdue by 25 hours. Host "ha" must get a forget
// decision and host "hb" a check decision; hb's prune is disabled and
// must not appear, for a total of exactly two decisions.
func TestTickerHandlesMultipleHosts(t *testing.T) {
	t.Parallel()

	at := mustTime(t, "2026-05-04T04:00:00Z")
	stale := at.Add(-25 * time.Hour)

	hostA := store.HostRepoMaintenance{
		HostID:        "ha",
		ForgetCron:    "0 3 * * *",
		ForgetEnabled: true,
	}
	hostB := store.HostRepoMaintenance{
		HostID:       "hb",
		CheckCron:    "0 3 * * *",
		CheckEnabled: true,
		PruneCron:    "0 4 * * *",
		PruneEnabled: false, // disabled — should not fire
	}
	backend := &fakeBackend{
		rows: []store.HostRepoMaintenance{hostA, hostB},
		jobs: map[string]map[string]*store.Job{
			"ha": {"forget": &store.Job{ID: "j1", HostID: "ha", Kind: "forget", CreatedAt: stale}},
			"hb": {"check": &store.Job{ID: "j2", HostID: "hb", Kind: "check", CreatedAt: stale}},
		},
	}

	decisions, err := New(backend).Decide(context.Background(), at)
	if err != nil {
		t.Fatalf("Decide: %v", err)
	}
	if len(decisions) != 2 {
		t.Fatalf("expected 2 decisions, got %d: %+v", len(decisions), decisions)
	}

	// Index the decided kind by host for order-independent checks.
	kindByHost := make(map[string]string, len(decisions))
	for _, d := range decisions {
		kindByHost[d.HostID] = d.Kind
	}
	if got := kindByHost["ha"]; got != "forget" {
		t.Errorf("ha: expected forget, got %q", got)
	}
	if got := kindByHost["hb"]; got != "check" {
		t.Errorf("hb: expected check, got %q", got)
	}
}
func TestTickerInvalidCronSkipsSilently(t *testing.T) {
t.Parallel()
be := &fakeBackend{
rows: []store.HostRepoMaintenance{{
HostID: "h1",
ForgetCron: "not a cron",
ForgetEnabled: true,
}},
}
tk := New(be)
got, err := tk.Decide(context.Background(), mustTime(t, "2026-05-04T04:00:00Z"))
if err != nil {
t.Fatalf("Decide: %v", err)
}
if len(got) != 0 {
t.Errorf("expected no decisions for invalid cron, got %+v", got)
}
}
+28 -2
View File
@@ -267,8 +267,34 @@ func dispatchAgentMessage(ctx context.Context, c *Conn, hostID string, env api.E
deps.OnScheduleFire(ctx, hostID, c, p.ScheduleID, p.ScheduledAt)
}
case api.MsgRepoStats, api.MsgCommandResult:
// TODO(P2): persist these projections.
case api.MsgRepoStats:
var p api.RepoStatsPayload
if err := env.UnmarshalPayload(&p); err != nil {
slog.Warn("ws: bad repo.stats payload", "host_id", hostID, "err", err)
break
}
patch := store.HostRepoStats{
HostID: hostID,
TotalSizeBytes: p.TotalSizeBytes,
RawSizeBytes: p.RawSizeBytes,
UniqueFiles: p.UniqueFiles,
SnapshotCount: p.SnapshotCount,
LastCheckAt: p.LastCheckAt,
LastCheckStatus: p.LastCheckStatus,
LockPresent: p.LockPresent,
LastPruneAt: p.LastPruneAt,
LastPruneFreedBytes: p.LastPruneFreedBytes,
}
if err := deps.Store.UpsertHostRepoStats(ctx, hostID, patch); err != nil {
slog.Warn("ws: upsert host repo stats", "host_id", hostID, "err", err)
} else {
slog.Info("ws: repo stats refreshed", "host_id", hostID)
}
case api.MsgCommandResult:
// TODO(P2): persist command.result acks for "did the agent
// accept the dispatch?" forensics. Currently the job lifecycle
// (job.started → job.finished) is sufficient signal.
slog.Debug("ws msg not yet handled", "type", env.Type, "host_id", hostID)
case api.MsgError:
+135
View File
@@ -0,0 +1,135 @@
package ws
import (
"context"
"path/filepath"
"testing"
"time"
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
)
// openWSTestStore returns a store backed by a fresh "rm.db" file inside
// the test's temporary directory. The store is closed automatically
// when the test finishes; failing to open aborts the test.
func openWSTestStore(t *testing.T) *store.Store {
	t.Helper()

	dbPath := filepath.Join(t.TempDir(), "rm.db")
	st, err := store.Open(context.Background(), dbPath)
	if err != nil {
		t.Fatalf("open: %v", err)
	}
	t.Cleanup(func() { _ = st.Close() })
	return st
}
// seedHostWS writes a bare-bones hosts row straight through the store's
// underlying DB handle. The host's name is set to its id; os, arch and
// enrolled_at are fixed placeholder values. Any insert error aborts the
// test.
func seedHostWS(t *testing.T, s *store.Store, hostID string) {
	t.Helper()

	const insertHost = `INSERT INTO hosts (id, name, os, arch, enrolled_at) VALUES (?,?,?,?,?)`
	if _, err := s.DB().Exec(insertHost,
		hostID, hostID, "linux", "amd64", "2026-01-01T00:00:00Z"); err != nil {
		t.Fatalf("seed host %q: %v", hostID, err)
	}
}
// int64ptrWS returns a pointer to a freshly allocated copy of v, for
// filling optional *int64 payload fields in tests.
func int64ptrWS(v int64) *int64 {
	p := new(int64)
	*p = v
	return p
}
// boolptrWS returns a pointer to a freshly allocated copy of v, for
// filling optional *bool payload fields in tests.
func boolptrWS(v bool) *bool {
	p := new(bool)
	*p = v
	return p
}
// TestRepoStatsReportPersisted sends a fully populated repo-stats
// envelope through dispatchAgentMessage (with a nil connection) and
// checks that every field is readable back from the store for that
// host.
func TestRepoStatsReportPersisted(t *testing.T) {
	t.Parallel()

	const hostID = "h-stats-ws"
	ctx := context.Background()
	st := openWSTestStore(t)
	seedHostWS(t, st, hostID)

	checkedAt := time.Now().UTC().Truncate(time.Second)
	prunedAt := checkedAt.Add(-2 * time.Hour)

	env, err := api.Marshal(api.MsgRepoStats, "", api.RepoStatsPayload{
		TotalSizeBytes:      int64ptrWS(1024),
		RawSizeBytes:        int64ptrWS(2048),
		UniqueFiles:         int64ptrWS(42),
		SnapshotCount:       int64ptrWS(7),
		LastCheckAt:         &checkedAt,
		LastCheckStatus:     "ok",
		LockPresent:         boolptrWS(false),
		LastPruneAt:         &prunedAt,
		LastPruneFreedBytes: int64ptrWS(512),
	})
	if err != nil {
		t.Fatalf("marshal: %v", err)
	}

	dispatchAgentMessage(ctx, nil, hostID, env, HandlerDeps{Store: st})

	stored, err := st.GetHostRepoStats(ctx, hostID)
	if err != nil {
		t.Fatalf("get host repo stats: %v", err)
	}

	// Size and count metrics.
	if p := stored.TotalSizeBytes; p == nil || *p != 1024 {
		t.Errorf("TotalSizeBytes: got %v want 1024", stored.TotalSizeBytes)
	}
	if p := stored.RawSizeBytes; p == nil || *p != 2048 {
		t.Errorf("RawSizeBytes: got %v want 2048", stored.RawSizeBytes)
	}
	if p := stored.UniqueFiles; p == nil || *p != 42 {
		t.Errorf("UniqueFiles: got %v want 42", stored.UniqueFiles)
	}
	if p := stored.SnapshotCount; p == nil || *p != 7 {
		t.Errorf("SnapshotCount: got %v want 7", stored.SnapshotCount)
	}

	// Check result and lock state.
	if stored.LastCheckAt == nil || !stored.LastCheckAt.Equal(checkedAt) {
		t.Errorf("LastCheckAt: got %v want %v", stored.LastCheckAt, checkedAt)
	}
	if stored.LastCheckStatus != "ok" {
		t.Errorf("LastCheckStatus: got %q want %q", stored.LastCheckStatus, "ok")
	}
	if stored.LockPresent == nil || *stored.LockPresent {
		t.Errorf("LockPresent: got %v want false", stored.LockPresent)
	}

	// Prune bookkeeping.
	if stored.LastPruneAt == nil || !stored.LastPruneAt.Equal(prunedAt) {
		t.Errorf("LastPruneAt: got %v want %v", stored.LastPruneAt, prunedAt)
	}
	if p := stored.LastPruneFreedBytes; p == nil || *p != 512 {
		t.Errorf("LastPruneFreedBytes: got %v want 512", stored.LastPruneFreedBytes)
	}
}
// TestRepoStatsReportPartialUpdate pre-seeds a stats row with
// TotalSizeBytes=100, then dispatches a repo-stats envelope carrying
// only LastCheckStatus. The earlier TotalSizeBytes must survive and the
// new status must be stored.
func TestRepoStatsReportPartialUpdate(t *testing.T) {
	t.Parallel()

	const hostID = "h-stats-partial"
	ctx := context.Background()
	st := openWSTestStore(t)
	seedHostWS(t, st, hostID)

	// Pre-seed: TotalSizeBytes = 100.
	seed := store.HostRepoStats{TotalSizeBytes: int64ptrWS(100)}
	if err := st.UpsertHostRepoStats(ctx, hostID, seed); err != nil {
		t.Fatalf("pre-seed upsert: %v", err)
	}

	// Send a repo.stats payload that only sets LastCheckStatus.
	statusOnly := api.RepoStatsPayload{LastCheckStatus: "ok"}
	env, err := api.Marshal(api.MsgRepoStats, "", statusOnly)
	if err != nil {
		t.Fatalf("marshal: %v", err)
	}
	dispatchAgentMessage(ctx, nil, hostID, env, HandlerDeps{Store: st})

	stored, err := st.GetHostRepoStats(ctx, hostID)
	if err != nil {
		t.Fatalf("get: %v", err)
	}
	if p := stored.TotalSizeBytes; p == nil || *p != 100 {
		t.Errorf("TotalSizeBytes lost: got %v want 100", stored.TotalSizeBytes)
	}
	if status := stored.LastCheckStatus; status != "ok" {
		t.Errorf("LastCheckStatus: got %q want ok", status)
	}
}
+34 -11
View File
@@ -8,13 +8,23 @@ import (
"time"
)
// CredentialKind identifies the role of a host_credentials row. The
// string value is what is stored in (and queried against) the
// host_credentials.kind column, so it must stay in step with that
// column's CHECK constraint, which admits only 'repo' and 'admin'.
type CredentialKind string
const (
// CredKindRepo is the append-only credential used for every backup.
// It matches the column default, so rows written before the kind
// column existed are read back as this kind.
CredKindRepo CredentialKind = "repo"
// CredKindAdmin is the delete-capable credential used for prune.
CredKindAdmin CredentialKind = "admin"
)
// GetHostCredentials returns the AEAD-encrypted repo creds blob for
// the host, or ("", ErrNotFound) if no credential has ever been set.
// the host + kind, or ("", ErrNotFound) if no matching row exists.
// The caller decrypts using host_id as AEAD additional data.
func (s *Store) GetHostCredentials(ctx context.Context, hostID string) (string, error) {
func (s *Store) GetHostCredentials(ctx context.Context, hostID string, kind CredentialKind) (string, error) {
row := s.db.QueryRowContext(ctx,
`SELECT enc_repo_creds FROM host_credentials WHERE host_id = ?`,
hostID)
`SELECT enc_repo_creds FROM host_credentials WHERE host_id = ? AND kind = ?`,
hostID, string(kind))
var enc string
if err := row.Scan(&enc); err != nil {
if errors.Is(err, sql.ErrNoRows) {
@@ -25,22 +35,35 @@ func (s *Store) GetHostCredentials(ctx context.Context, hostID string) (string,
return enc, nil
}
// SetHostCredentials replaces the host's encrypted repo creds blob.
// The caller has already encrypted using host_id as additional data.
func (s *Store) SetHostCredentials(ctx context.Context, hostID, encRepoCreds string) error {
// SetHostCredentials replaces the host's encrypted repo creds blob for
// the given kind. The caller has already encrypted using host_id as
// additional data.
func (s *Store) SetHostCredentials(ctx context.Context, hostID string, kind CredentialKind, encRepoCreds string) error {
if encRepoCreds == "" {
return fmt.Errorf("store: empty enc_repo_creds")
}
now := time.Now().UTC().Format(time.RFC3339Nano)
_, err := s.db.ExecContext(ctx,
`INSERT INTO host_credentials (host_id, enc_repo_creds, updated_at)
VALUES (?, ?, ?)
ON CONFLICT(host_id) DO UPDATE SET
`INSERT INTO host_credentials (host_id, kind, enc_repo_creds, updated_at)
VALUES (?, ?, ?, ?)
ON CONFLICT(host_id, kind) DO UPDATE SET
enc_repo_creds = excluded.enc_repo_creds,
updated_at = excluded.updated_at`,
hostID, encRepoCreds, now)
hostID, string(kind), encRepoCreds, now)
if err != nil {
return fmt.Errorf("store: set host credentials: %w", err)
}
return nil
}
// DeleteHostCredentials drops the host_credentials row matching both
// hostID and kind. Deleting a row that is not there is not an error;
// only a failure of the DELETE statement itself is reported.
func (s *Store) DeleteHostCredentials(ctx context.Context, hostID string, kind CredentialKind) error {
	const stmt = `DELETE FROM host_credentials WHERE host_id = ? AND kind = ?`
	if _, err := s.db.ExecContext(ctx, stmt, hostID, string(kind)); err != nil {
		return fmt.Errorf("store: delete host credentials: %w", err)
	}
	return nil
}
+103
View File
@@ -0,0 +1,103 @@
package store
import (
"context"
"errors"
"testing"
)
// seedHost writes a bare-bones hosts row directly through the store's
// DB handle so tests have a parent row to hang per-host data on. The
// name is set to the id; os, arch and enrolled_at are fixed
// placeholders. Any insert error aborts the test.
func seedHost(t *testing.T, s *Store, hostID string) {
	t.Helper()

	const insertHost = `INSERT INTO hosts (id, name, os, arch, enrolled_at) VALUES (?,?,?,?,?)`
	if _, err := s.DB().Exec(insertHost,
		hostID, hostID, "linux", "amd64", "2026-01-01T00:00:00Z"); err != nil {
		t.Fatalf("seed host %q: %v", hostID, err)
	}
}
// TestHostCredentialsAdminRowSeparate stores a repo blob and an admin
// blob for the same host, reads each back by kind, then deletes the
// admin row and confirms the repo row is untouched.
func TestHostCredentialsAdminRowSeparate(t *testing.T) {
	t.Parallel()

	const (
		hostID    = "h-creds-test"
		repoBlob  = "enc-repo-blob"
		adminBlob = "enc-admin-blob"
	)
	ctx := context.Background()
	st := openTestStore(t)
	seedHost(t, st, hostID)

	// Write one row per kind.
	if err := st.SetHostCredentials(ctx, hostID, CredKindRepo, repoBlob); err != nil {
		t.Fatalf("set repo creds: %v", err)
	}
	if err := st.SetHostCredentials(ctx, hostID, CredKindAdmin, adminBlob); err != nil {
		t.Fatalf("set admin creds: %v", err)
	}

	// Read each kind back; they must be stored independently.
	repoGot, err := st.GetHostCredentials(ctx, hostID, CredKindRepo)
	if err != nil {
		t.Fatalf("get repo creds: %v", err)
	}
	adminGot, err := st.GetHostCredentials(ctx, hostID, CredKindAdmin)
	if err != nil {
		t.Fatalf("get admin creds: %v", err)
	}
	if repoGot != repoBlob {
		t.Errorf("repo creds: got %q, want %q", repoGot, repoBlob)
	}
	if adminGot != adminBlob {
		t.Errorf("admin creds: got %q, want %q", adminGot, adminBlob)
	}
	if repoGot == adminGot {
		t.Error("repo and admin blobs must differ")
	}

	// Removing the admin row must leave the repo row alone.
	if err := st.DeleteHostCredentials(ctx, hostID, CredKindAdmin); err != nil {
		t.Fatalf("delete admin creds: %v", err)
	}
	_, err = st.GetHostCredentials(ctx, hostID, CredKindAdmin)
	if !errors.Is(err, ErrNotFound) {
		t.Errorf("after delete, expected ErrNotFound for admin; got %v", err)
	}
	survivor, err := st.GetHostCredentials(ctx, hostID, CredKindRepo)
	if err != nil || survivor != repoBlob {
		t.Errorf("repo creds should survive admin delete; got %q, err %v", survivor, err)
	}
}
// TestHostCredentialsNotFound asserts that looking up credentials for a
// host id that was never stored yields ErrNotFound.
func TestHostCredentialsNotFound(t *testing.T) {
	t.Parallel()

	st := openTestStore(t)
	if _, err := st.GetHostCredentials(context.Background(), "no-such-host", CredKindRepo); !errors.Is(err, ErrNotFound) {
		t.Errorf("expected ErrNotFound, got %v", err)
	}
}
// TestHostCredentialsUpsert writes the repo credential twice for the
// same host and expects the second value to replace the first.
func TestHostCredentialsUpsert(t *testing.T) {
	t.Parallel()

	const hostID = "h-upsert-test"
	ctx := context.Background()
	st := openTestStore(t)
	seedHost(t, st, hostID)

	err := st.SetHostCredentials(ctx, hostID, CredKindRepo, "v1")
	if err != nil {
		t.Fatalf("set v1: %v", err)
	}
	err = st.SetHostCredentials(ctx, hostID, CredKindRepo, "v2")
	if err != nil {
		t.Fatalf("set v2 (upsert): %v", err)
	}

	blob, err := st.GetHostCredentials(ctx, hostID, CredKindRepo)
	if err != nil {
		t.Fatalf("get: %v", err)
	}
	if blob != "v2" {
		t.Errorf("expected v2, got %q", blob)
	}
}
+231
View File
@@ -0,0 +1,231 @@
package store
import (
"context"
"database/sql"
"errors"
"fmt"
"time"
)
// HostRepoStats is the per-host projection of repo-level metrics.
// All pointer fields are nullable; nil means "not yet known." The row
// is created (or replaced) by UpsertHostRepoStats which merges in only
// the non-nil fields from a patch.
//
// The same struct is used in two roles. As a patch passed to
// UpsertHostRepoStats, a nil pointer (or an empty LastCheckStatus)
// means "leave the stored value alone". As a row returned by
// GetHostRepoStats, a nil pointer means the column is NULL.
type HostRepoStats struct {
// HostID is the owning host's id (host_repo_stats.host_id).
HostID string
// TotalSizeBytes, RawSizeBytes, UniqueFiles and SnapshotCount mirror
// the nullable integer columns of the same names.
TotalSizeBytes *int64
RawSizeBytes *int64
UniqueFiles *int64
SnapshotCount *int64
// LastCheckAt is persisted as an RFC3339Nano UTC string.
LastCheckAt *time.Time
// LastCheckStatus is "" when the column is NULL on read.
LastCheckStatus string // "" | "ok" | "errors_found" | "failed"
// LockPresent is always non-nil on a row read back from the store
// (the column is NOT NULL); it is only ever nil in a patch. When a
// row is first created from a patch that leaves it nil, false is
// stored.
LockPresent *bool
// LastPruneAt is persisted as an RFC3339Nano UTC string.
LastPruneAt *time.Time
LastPruneFreedBytes *int64
// UpdatedAt is stamped by UpsertHostRepoStats with the write time;
// any value supplied in a patch is ignored.
UpdatedAt time.Time
}
// GetHostRepoStats loads the host_repo_stats row for hostID through the
// store's database handle. It yields (nil, ErrNotFound) when the host
// has no stats row.
func (s *Store) GetHostRepoStats(ctx context.Context, hostID string) (*HostRepoStats, error) {
	const query = `SELECT host_id, total_size_bytes, raw_size_bytes, unique_files,
		snapshot_count, last_check_at, last_check_status,
		lock_present, last_prune_at, last_prune_freed_bytes, updated_at
		FROM host_repo_stats WHERE host_id = ?`
	return scanHostRepoStats(s.db.QueryRowContext(ctx, query, hostID))
}
// getHostRepoStatsTx is the in-transaction twin of GetHostRepoStats: it
// issues the same SELECT on tx instead of the store's handle, so that
// UpsertHostRepoStats can read and write the row inside one
// transaction. It yields (nil, ErrNotFound) when no row exists.
func getHostRepoStatsTx(ctx context.Context, tx *sql.Tx, hostID string) (*HostRepoStats, error) {
	const query = `SELECT host_id, total_size_bytes, raw_size_bytes, unique_files,
		snapshot_count, last_check_at, last_check_status,
		lock_present, last_prune_at, last_prune_freed_bytes, updated_at
		FROM host_repo_stats WHERE host_id = ?`
	return scanHostRepoStats(tx.QueryRowContext(ctx, query, hostID))
}
// scanHostRepoStats decodes a single host_repo_stats row, in the column
// order used by GetHostRepoStats / getHostRepoStatsTx.
//
// NULL integer and timestamp columns become nil pointers; a NULL
// last_check_status becomes "". lock_present is scanned as a plain
// integer, so LockPresent is always non-nil in the result. Timestamps
// are parsed as RFC3339Nano, and a stored value that fails to parse is
// returned as an error rather than silently zeroed.
//
// Returns (nil, ErrNotFound) when the query matched no row.
func scanHostRepoStats(row *sql.Row) (*HostRepoStats, error) {
	var (
		out                                    HostRepoStats
		sizeTotal, sizeRaw, files, snaps, freed sql.NullInt64
		checkAtRaw, checkStatus, pruneAtRaw    sql.NullString
		lockFlag                               int64
		updatedRaw                             string
	)
	err := row.Scan(
		&out.HostID,
		&sizeTotal, &sizeRaw, &files, &snaps,
		&checkAtRaw, &checkStatus,
		&lockFlag,
		&pruneAtRaw, &freed,
		&updatedRaw,
	)
	switch {
	case err == nil:
		// Row decoded; fall through to field conversion.
	case errors.Is(err, sql.ErrNoRows):
		return nil, ErrNotFound
	default:
		return nil, fmt.Errorf("store: scan host_repo_stats: %w", err)
	}

	// optInt64 maps a nullable integer column onto *int64 (nil for NULL).
	optInt64 := func(n sql.NullInt64) *int64 {
		if !n.Valid {
			return nil
		}
		v := n.Int64
		return &v
	}
	out.TotalSizeBytes = optInt64(sizeTotal)
	out.RawSizeBytes = optInt64(sizeRaw)
	out.UniqueFiles = optInt64(files)
	out.SnapshotCount = optInt64(snaps)
	out.LastPruneFreedBytes = optInt64(freed)

	if checkStatus.Valid {
		out.LastCheckStatus = checkStatus.String
	}

	locked := lockFlag != 0
	out.LockPresent = &locked

	// Timestamps are validated in a fixed order (check, prune, updated)
	// so the first malformed column is the one reported.
	if checkAtRaw.Valid {
		ts, err := time.Parse(time.RFC3339Nano, checkAtRaw.String)
		if err != nil {
			return nil, fmt.Errorf("store: parse last_check_at: %w", err)
		}
		out.LastCheckAt = &ts
	}
	if pruneAtRaw.Valid {
		ts, err := time.Parse(time.RFC3339Nano, pruneAtRaw.String)
		if err != nil {
			return nil, fmt.Errorf("store: parse last_prune_at: %w", err)
		}
		out.LastPruneAt = &ts
	}
	updated, err := time.Parse(time.RFC3339Nano, updatedRaw)
	if err != nil {
		return nil, fmt.Errorf("store: parse host_repo_stats.updated_at: %w", err)
	}
	out.UpdatedAt = updated

	return &out, nil
}
// UpsertHostRepoStats writes a partial update — only non-nil pointer
// fields (and LastCheckStatus when non-empty) overwrite existing
// columns. The read of the current row, the merge, and the write all
// run inside a single transaction.
//
// hostID selects the row; patch.HostID and patch.UpdatedAt are ignored
// (updated_at is always stamped with the current UTC time). If the host
// has no row yet one is created, with lock_present stored as 0 unless
// the patch sets it.
//
// Every failure, including a failed commit, is returned wrapped with a
// "store: …" prefix.
func (s *Store) UpsertHostRepoStats(ctx context.Context, hostID string, patch HostRepoStats) error {
	tx, err := s.db.BeginTx(ctx, nil)
	if err != nil {
		return fmt.Errorf("store: begin host_repo_stats tx: %w", err)
	}
	// Safety net for every early return; after a successful Commit the
	// rollback just reports that the transaction is already done.
	defer func() { _ = tx.Rollback() }()

	// Fetch existing row; start from zero if absent.
	cur, err := getHostRepoStatsTx(ctx, tx, hostID)
	if err != nil && !errors.Is(err, ErrNotFound) {
		return err
	}
	if cur == nil {
		cur = &HostRepoStats{HostID: hostID}
	}

	// Merge: non-nil patch fields overwrite current.
	if patch.TotalSizeBytes != nil {
		cur.TotalSizeBytes = patch.TotalSizeBytes
	}
	if patch.RawSizeBytes != nil {
		cur.RawSizeBytes = patch.RawSizeBytes
	}
	if patch.UniqueFiles != nil {
		cur.UniqueFiles = patch.UniqueFiles
	}
	if patch.SnapshotCount != nil {
		cur.SnapshotCount = patch.SnapshotCount
	}
	if patch.LastCheckAt != nil {
		cur.LastCheckAt = patch.LastCheckAt
	}
	if patch.LastCheckStatus != "" {
		cur.LastCheckStatus = patch.LastCheckStatus
	}
	if patch.LockPresent != nil {
		cur.LockPresent = patch.LockPresent
	}
	if patch.LastPruneAt != nil {
		cur.LastPruneAt = patch.LastPruneAt
	}
	if patch.LastPruneFreedBytes != nil {
		cur.LastPruneFreedBytes = patch.LastPruneFreedBytes
	}

	now := time.Now().UTC().Format(time.RFC3339Nano)

	// Convert *bool → int for lock_present (the column is NOT NULL, so
	// an unknown lock state is written as 0).
	var lockPresentInt int64
	if cur.LockPresent != nil && *cur.LockPresent {
		lockPresentInt = 1
	}

	if _, err := tx.ExecContext(ctx,
		`INSERT INTO host_repo_stats
		(host_id, total_size_bytes, raw_size_bytes, unique_files,
		snapshot_count, last_check_at, last_check_status,
		lock_present, last_prune_at, last_prune_freed_bytes, updated_at)
		VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
		ON CONFLICT(host_id) DO UPDATE SET
		total_size_bytes = excluded.total_size_bytes,
		raw_size_bytes = excluded.raw_size_bytes,
		unique_files = excluded.unique_files,
		snapshot_count = excluded.snapshot_count,
		last_check_at = excluded.last_check_at,
		last_check_status = excluded.last_check_status,
		lock_present = excluded.lock_present,
		last_prune_at = excluded.last_prune_at,
		last_prune_freed_bytes = excluded.last_prune_freed_bytes,
		updated_at = excluded.updated_at`,
		hostID,
		nullableInt64(cur.TotalSizeBytes),
		nullableInt64(cur.RawSizeBytes),
		nullableInt64(cur.UniqueFiles),
		nullableInt64(cur.SnapshotCount),
		nullableTime(cur.LastCheckAt),
		nullableStr(cur.LastCheckStatus),
		lockPresentInt,
		nullableTime(cur.LastPruneAt),
		nullableInt64(cur.LastPruneFreedBytes),
		now,
	); err != nil {
		return fmt.Errorf("store: upsert host_repo_stats: %w", err)
	}

	// Wrap the commit error like every other failure path here so the
	// caller's log line says which store operation failed.
	if err := tx.Commit(); err != nil {
		return fmt.Errorf("store: commit host_repo_stats tx: %w", err)
	}
	return nil
}
// nullableInt64 turns an optional integer into a value suitable as a
// SQL bind argument: the pointed-to int64 when p is set, or an untyped
// nil (bound as NULL) when it is not.
func nullableInt64(p *int64) any {
	if p != nil {
		return *p
	}
	return nil
}
// nullableTime turns an optional timestamp into a SQL bind argument:
// the instant rendered in UTC as an RFC3339Nano string when p is set,
// or an untyped nil (bound as NULL) when it is not.
func nullableTime(p *time.Time) any {
	if p != nil {
		return p.UTC().Format(time.RFC3339Nano)
	}
	return nil
}
+131
View File
@@ -0,0 +1,131 @@
package store
import (
"context"
"errors"
"testing"
"time"
)
// int64ptr returns a pointer to a freshly allocated copy of v, for
// filling optional *int64 fields in tests.
func int64ptr(v int64) *int64 {
	p := new(int64)
	*p = v
	return p
}
// boolptr returns a pointer to a freshly allocated copy of v, for
// filling optional *bool fields in tests.
func boolptr(v bool) *bool {
	p := new(bool)
	*p = v
	return p
}
// TestHostRepoStatsRoundTrip applies four successive partial upserts to
// one host and re-reads the row after each, checking that newly patched
// fields land and that fields absent from a patch keep their earlier
// values. The last step also confirms a lock flag can be cleared by
// patching it to false.
func TestHostRepoStatsRoundTrip(t *testing.T) {
	t.Parallel()

	const hostID = "h-stats-test"
	ctx := context.Background()
	st := openTestStore(t)
	seedHost(t, st, hostID)

	// 1. Initial upsert: set TotalSizeBytes only.
	sizeOnly := HostRepoStats{TotalSizeBytes: int64ptr(100)}
	if err := st.UpsertHostRepoStats(ctx, hostID, sizeOnly); err != nil {
		t.Fatalf("upsert 1: %v", err)
	}
	row, err := st.GetHostRepoStats(ctx, hostID)
	if err != nil {
		t.Fatalf("get after upsert 1: %v", err)
	}
	if p := row.TotalSizeBytes; p == nil || *p != 100 {
		t.Errorf("TotalSizeBytes: want 100, got %v", row.TotalSizeBytes)
	}
	if row.LastCheckStatus != "" {
		t.Errorf("LastCheckStatus should be empty after first upsert, got %q", row.LastCheckStatus)
	}

	// 2. Upsert with LastCheckStatus; TotalSizeBytes must be preserved.
	statusOnly := HostRepoStats{LastCheckStatus: "ok"}
	if err := st.UpsertHostRepoStats(ctx, hostID, statusOnly); err != nil {
		t.Fatalf("upsert 2: %v", err)
	}
	row, err = st.GetHostRepoStats(ctx, hostID)
	if err != nil {
		t.Fatalf("get after upsert 2: %v", err)
	}
	if p := row.TotalSizeBytes; p == nil || *p != 100 {
		t.Errorf("TotalSizeBytes should still be 100 after second upsert, got %v", row.TotalSizeBytes)
	}
	if row.LastCheckStatus != "ok" {
		t.Errorf("LastCheckStatus: want %q, got %q", "ok", row.LastCheckStatus)
	}

	// 3. Upsert with LockPresent=true; all other fields preserved.
	checkedAt := time.Now().UTC().Truncate(time.Second)
	lockAndCheck := HostRepoStats{
		LockPresent: boolptr(true),
		LastCheckAt: &checkedAt,
	}
	if err := st.UpsertHostRepoStats(ctx, hostID, lockAndCheck); err != nil {
		t.Fatalf("upsert 3: %v", err)
	}
	row, err = st.GetHostRepoStats(ctx, hostID)
	if err != nil {
		t.Fatalf("get after upsert 3: %v", err)
	}
	if row.LockPresent == nil || !*row.LockPresent {
		t.Error("LockPresent should be true after upsert 3")
	}
	if p := row.TotalSizeBytes; p == nil || *p != 100 {
		t.Errorf("TotalSizeBytes still 100 expected, got %v", row.TotalSizeBytes)
	}
	if row.LastCheckStatus != "ok" {
		t.Errorf("LastCheckStatus still 'ok' expected, got %q", row.LastCheckStatus)
	}
	switch {
	case row.LastCheckAt == nil:
		t.Error("LastCheckAt should be set")
	case !row.LastCheckAt.UTC().Truncate(time.Second).Equal(checkedAt):
		t.Errorf("LastCheckAt: got %v, want %v", row.LastCheckAt.UTC().Truncate(time.Second), checkedAt)
	}

	// 4. Clear lock (set to false).
	unlock := HostRepoStats{LockPresent: boolptr(false)}
	if err := st.UpsertHostRepoStats(ctx, hostID, unlock); err != nil {
		t.Fatalf("upsert 4: %v", err)
	}
	row, err = st.GetHostRepoStats(ctx, hostID)
	if err != nil {
		t.Fatalf("get after upsert 4: %v", err)
	}
	if row.LockPresent == nil || *row.LockPresent {
		t.Error("LockPresent should be false after upsert 4")
	}
}
// TestHostRepoStatsNotFound asserts that reading stats for a host id
// with no row yields ErrNotFound.
func TestHostRepoStatsNotFound(t *testing.T) {
	t.Parallel()

	st := openTestStore(t)
	if _, err := st.GetHostRepoStats(context.Background(), "no-such-host"); !errors.Is(err, ErrNotFound) {
		t.Errorf("expected ErrNotFound, got %v", err)
	}
}
// TestHostRepoStatsCascadeDelete creates a stats row for a host, deletes
// the host row directly, and expects the stats row to be gone too
// (GetHostRepoStats must report ErrNotFound).
func TestHostRepoStatsCascadeDelete(t *testing.T) {
	t.Parallel()

	const hostID = "h-cascade-test"
	ctx := context.Background()
	st := openTestStore(t)
	seedHost(t, st, hostID)

	seed := HostRepoStats{TotalSizeBytes: int64ptr(999)}
	if err := st.UpsertHostRepoStats(ctx, hostID, seed); err != nil {
		t.Fatalf("upsert: %v", err)
	}

	// Delete the host; stats row should cascade-delete.
	const deleteHost = `DELETE FROM hosts WHERE id = ?`
	if _, err := st.DB().ExecContext(ctx, deleteHost, hostID); err != nil {
		t.Fatalf("delete host: %v", err)
	}

	if _, err := st.GetHostRepoStats(ctx, hostID); !errors.Is(err, ErrNotFound) {
		t.Errorf("after host delete, expected ErrNotFound for stats; got %v", err)
	}
}
+65
View File
@@ -193,6 +193,71 @@ func (s *Store) GetJob(ctx context.Context, id string) (*Job, error) {
return &j, nil
}
// LatestJobByKind fetches the job with the greatest created_at for the
// given host and kind, whatever its status — queued and running jobs
// count. It returns (nil, ErrNotFound) when the host has no job of that
// kind.
//
// The maintenance ticker uses the result as its "last fire" anchor for
// the cron-due check. In-flight jobs MUST be included: otherwise a
// prune that runs longer than one tick (>60s) would be fired again
// while the first is still running.
//
// Timestamp columns are parsed as RFC3339Nano on a best-effort basis: a
// value that fails to parse is left as the zero time rather than
// reported.
//
// NOTE(review): ORDER BY created_at compares the stored strings. If
// created_at is written with a variable-length fractional part, two
// jobs created within the same second may not sort chronologically —
// confirm the stored format.
func (s *Store) LatestJobByKind(ctx context.Context, hostID, kind string) (*Job, error) {
	const query = `SELECT id, host_id, kind, status, scheduled_id, actor_kind, actor_id,
		started_at, finished_at, exit_code, stats, error, created_at
		FROM jobs
		WHERE host_id = ? AND kind = ?
		ORDER BY created_at DESC
		LIMIT 1`

	var (
		job                     Job
		scheduleID, actor       sql.NullString
		startedRaw, finishedRaw sql.NullString
		exit                    sql.NullInt64
		statsRaw, errText       sql.NullString
		createdRaw              string
	)
	err := s.db.QueryRowContext(ctx, query, hostID, kind).Scan(
		&job.ID, &job.HostID, &job.Kind, &job.Status, &scheduleID,
		&job.ActorKind, &actor, &startedRaw, &finishedRaw,
		&exit, &statsRaw, &errText, &createdRaw)
	switch {
	case err == nil:
		// Row decoded; fall through to field conversion.
	case errors.Is(err, sql.ErrNoRows):
		return nil, ErrNotFound
	default:
		return nil, fmt.Errorf("store: scan latest job by kind: %w", err)
	}

	// parseTS deliberately drops the parse error, yielding the zero
	// time for malformed input.
	parseTS := func(raw string) time.Time {
		ts, _ := time.Parse(time.RFC3339Nano, raw)
		return ts
	}

	if scheduleID.Valid {
		id := scheduleID.String
		job.ScheduledID = &id
	}
	if actor.Valid {
		id := actor.String
		job.ActorID = &id
	}
	if startedRaw.Valid {
		started := parseTS(startedRaw.String)
		job.StartedAt = &started
	}
	if finishedRaw.Valid {
		finished := parseTS(finishedRaw.String)
		job.FinishedAt = &finished
	}
	if exit.Valid {
		code := int(exit.Int64)
		job.ExitCode = &code
	}
	if statsRaw.Valid && statsRaw.String != "" {
		job.Stats = json.RawMessage(statsRaw.String)
	}
	if errText.Valid {
		msg := errText.String
		job.Error = &msg
	}
	job.CreatedAt = parseTS(createdRaw)

	return &job, nil
}
// HasJobOfKind reports whether any job of the given kind exists for
// this host, regardless of status. Used by the auto-init path on
// agent hello to decide whether to dispatch a fresh `restic init` —
+136
View File
@@ -0,0 +1,136 @@
package store
import (
"context"
"errors"
"testing"
"time"
)
// TestLatestJobByKind walks LatestJobByKind through its cases on one
// host: no jobs (ErrNotFound), two finished jobs (newer wins), a
// running job created later (returned), a queued job created later
// still (returned), and finally a different kind with no jobs
// (ErrNotFound).
func TestLatestJobByKind(t *testing.T) {
	t.Parallel()

	ctx := context.Background()
	st := openTestStore(t)
	hostID := makeSchedHost(t, st)

	// No jobs yet → ErrNotFound.
	_, err := st.LatestJobByKind(ctx, hostID, "forget")
	if !errors.Is(err, ErrNotFound) {
		t.Fatalf("expected ErrNotFound on empty, got %v", err)
	}

	// Insert two finished jobs of kind=forget; the newer one should win.
	olderAt := time.Now().UTC().Add(-2 * time.Hour)
	newerAt := time.Now().UTC().Add(-1 * time.Hour)

	oldJob := Job{
		ID: "j-old", HostID: hostID, Kind: "forget",
		ActorKind: "system", CreatedAt: olderAt,
	}
	if err := st.CreateJob(ctx, oldJob); err != nil {
		t.Fatalf("create older: %v", err)
	}
	if err := st.MarkJobFinished(ctx, "j-old", "succeeded", 0, nil, "", olderAt.Add(time.Minute)); err != nil {
		t.Fatalf("finish older: %v", err)
	}

	newJob := Job{
		ID: "j-new", HostID: hostID, Kind: "forget",
		ActorKind: "system", CreatedAt: newerAt,
	}
	if err := st.CreateJob(ctx, newJob); err != nil {
		t.Fatalf("create newer: %v", err)
	}
	if err := st.MarkJobFinished(ctx, "j-new", "failed", 1, nil, "boom", newerAt.Add(time.Minute)); err != nil {
		t.Fatalf("finish newer: %v", err)
	}

	latest, err := st.LatestJobByKind(ctx, hostID, "forget")
	if err != nil {
		t.Fatalf("LatestJobByKind: %v", err)
	}
	if latest.ID != "j-new" {
		t.Errorf("want j-new, got %q", latest.ID)
	}

	// An in-flight running job must be returned — long-prune-suppresses-tick
	// scenario: if a prune runs >60s the next tick must not re-fire it.
	runningAt := time.Now().UTC()
	runningJob := Job{
		ID: "j-running", HostID: hostID, Kind: "forget",
		ActorKind: "system", CreatedAt: runningAt,
	}
	if err := st.CreateJob(ctx, runningJob); err != nil {
		t.Fatalf("create running: %v", err)
	}
	if err := st.MarkJobStarted(ctx, "j-running", runningAt); err != nil {
		t.Fatalf("mark started: %v", err)
	}
	latestRunning, err := st.LatestJobByKind(ctx, hostID, "forget")
	if err != nil {
		t.Fatalf("LatestJobByKind 2: %v", err)
	}
	if latestRunning.ID != "j-running" {
		t.Errorf("in-flight running job must be returned; want j-running, got %q", latestRunning.ID)
	}

	// A queued (not-yet-started) job is also returned (it is newer than
	// j-running because CreatedAt is later).
	queuedJob := Job{
		ID: "j-queued", HostID: hostID, Kind: "forget",
		ActorKind: "system", CreatedAt: runningAt.Add(time.Millisecond),
	}
	if err := st.CreateJob(ctx, queuedJob); err != nil {
		t.Fatalf("create queued: %v", err)
	}
	latestQueued, err := st.LatestJobByKind(ctx, hostID, "forget")
	if err != nil {
		t.Fatalf("LatestJobByKind 3: %v", err)
	}
	if latestQueued.ID != "j-queued" {
		t.Errorf("queued job must be returned as newest; want j-queued, got %q", latestQueued.ID)
	}

	// Different kind → ErrNotFound.
	_, err = st.LatestJobByKind(ctx, hostID, "prune")
	if !errors.Is(err, ErrNotFound) {
		t.Fatalf("expected ErrNotFound for prune, got %v", err)
	}
}
// TestListAllMaintenance checks ListAllMaintenance on an empty store
// (no rows, no error) and again after two hosts have each been given a
// default maintenance row (exactly two rows).
func TestListAllMaintenance(t *testing.T) {
	t.Parallel()

	ctx := context.Background()
	st := openTestStore(t)

	// Empty case.
	listed, err := st.ListAllMaintenance(ctx)
	if err != nil {
		t.Fatalf("empty list: %v", err)
	}
	if len(listed) != 0 {
		t.Errorf("want empty, got %+v", listed)
	}

	// Seed two hosts with maintenance rows.
	hostIDs := []string{
		"01HMAINTHOST00000000000A1",
		"01HMAINTHOST00000000000A2",
	}
	for i, id := range hostIDs {
		host := Host{
			ID: id, Name: "maint-host-" + string(rune('a'+i)),
			OS: "linux", Arch: "amd64",
			AgentVersion: "dev", ResticVersion: "0.16.0", ProtocolVersion: 1,
			EnrolledAt: time.Now().UTC(),
		}
		if err := st.CreateHost(ctx, host, "th-"+id, ""); err != nil {
			t.Fatalf("create host %s: %v", id, err)
		}
	}
	if err := st.CreateDefaultRepoMaintenance(ctx, hostIDs[0]); err != nil {
		t.Fatalf("seed h1: %v", err)
	}
	if err := st.CreateDefaultRepoMaintenance(ctx, hostIDs[1]); err != nil {
		t.Fatalf("seed h2: %v", err)
	}

	listed, err = st.ListAllMaintenance(ctx)
	if err != nil {
		t.Fatalf("list: %v", err)
	}
	if n := len(listed); n != 2 {
		t.Errorf("want 2 rows, got %d", n)
	}
}
+34
View File
@@ -50,6 +50,40 @@ func (st *Store) GetRepoMaintenance(ctx context.Context, hostID string) (*HostRe
return &m, nil
}
// ListAllMaintenance loads every row of host_repo_maintenance, for the
// server-side maintenance ticker to walk on each tick. No ORDER BY is
// applied, so callers must not rely on row order. The three *_enabled
// columns are read as integers and mapped to booleans (non-zero is
// true). With no rows the result is a nil slice and a nil error.
func (st *Store) ListAllMaintenance(ctx context.Context) ([]HostRepoMaintenance, error) {
	const query = `SELECT host_id, forget_cron, forget_enabled,
		prune_cron, prune_enabled,
		check_cron, check_enabled, check_subset_pct
		FROM host_repo_maintenance`

	rows, err := st.db.QueryContext(ctx, query)
	if err != nil {
		return nil, fmt.Errorf("store: list all maintenance: %w", err)
	}
	defer func() { _ = rows.Close() }()

	var all []HostRepoMaintenance
	for rows.Next() {
		var (
			rec                        HostRepoMaintenance
			forgetOn, pruneOn, checkOn int
		)
		err := rows.Scan(&rec.HostID,
			&rec.ForgetCron, &forgetOn,
			&rec.PruneCron, &pruneOn,
			&rec.CheckCron, &checkOn, &rec.CheckSubsetPct)
		if err != nil {
			return nil, fmt.Errorf("store: scan maintenance: %w", err)
		}
		rec.ForgetEnabled, rec.PruneEnabled, rec.CheckEnabled = forgetOn != 0, pruneOn != 0, checkOn != 0
		all = append(all, rec)
	}
	return all, rows.Err()
}
// UpdateRepoMaintenance replaces every editable field. Doesn't bump
// the schedule version — these run on the server's own ticker, not
// the agent's local cron, so the agent doesn't need to know.
@@ -0,0 +1,58 @@
-- 0009_admin_creds_and_repo_stats.sql
--
-- Phase 5 of the P2 redesign needs two things in the schema:
--
-- 1. A second credential row per host. Today host_credentials is
-- 1:1 with hosts. For prune (and any future destructive op) we
-- want a rest-server admin user whose password gives delete
-- access — separate from the append-only user used on every
-- backup. Add a `kind` column with default 'repo'; existing rows
-- become kind='repo'. Future admin rows live alongside.
--
-- 2. A small singleton-per-host projection for repo size, snapshot
-- count, last-prune freed bytes, lock state, and last-check
-- result. Backed by `restic stats --json` + sniffed `restic
-- check` stderr.
--
-- Use column-level ALTERs only; host_credentials has no inbound
-- FKs but the rule from CLAUDE.md still applies.
-- Step 1: add the discriminator column. The default backfills every
-- pre-existing row as kind='repo'.
ALTER TABLE host_credentials ADD COLUMN kind TEXT NOT NULL DEFAULT 'repo';
-- The PK on host_credentials is currently (host_id) — we need a
-- composite (host_id, kind). SQLite has no ALTER TABLE …
-- ADD/CHANGE PRIMARY KEY, so this is the one place a rebuild is
-- justified. host_credentials has no inbound FKs, so the cascade
-- trap doesn't apply here. Verified against schema/0002.
-- Step 2: build the replacement table with the composite key and a
-- CHECK limiting kind to the two known roles.
CREATE TABLE host_credentials_new (
host_id TEXT NOT NULL REFERENCES hosts(id) ON DELETE CASCADE,
kind TEXT NOT NULL DEFAULT 'repo'
CHECK (kind IN ('repo', 'admin')),
enc_repo_creds TEXT NOT NULL,
updated_at TEXT NOT NULL,
PRIMARY KEY (host_id, kind)
);
-- Step 3: copy every existing row across, carrying the kind value
-- backfilled in step 1.
INSERT INTO host_credentials_new (host_id, kind, enc_repo_creds, updated_at)
SELECT host_id, kind, enc_repo_creds, updated_at FROM host_credentials;
-- Step 4: swap the rebuilt table in under the original name.
DROP TABLE host_credentials;
ALTER TABLE host_credentials_new RENAME TO host_credentials;
-- Repo stats projection. One row per host, upserted by the agent's
-- stats.report envelope (which fires after every successful backup
-- and after every check / prune). Every metric column is nullable so a
-- freshly enrolled host with no jobs yet is representable; only
-- lock_present (which defaults to 0) and updated_at are NOT NULL.
CREATE TABLE host_repo_stats (
-- One row per host; removed automatically when the host row is deleted.
host_id TEXT PRIMARY KEY REFERENCES hosts(id) ON DELETE CASCADE,
-- Size / count metrics; NULL until first reported.
total_size_bytes INTEGER,
raw_size_bytes INTEGER,
unique_files INTEGER,
snapshot_count INTEGER,
-- Timestamps are stored as RFC3339Nano UTC text by the store layer.
last_check_at TEXT,
last_check_status TEXT CHECK (last_check_status IS NULL OR last_check_status IN ('ok', 'errors_found', 'failed')),
-- Boolean stored as 0/1; never NULL, so "unknown" reads back as 0.
lock_present INTEGER NOT NULL DEFAULT 0,
last_prune_at TEXT,
last_prune_freed_bytes INTEGER,
-- Stamped by the store on every upsert.
updated_at TEXT NOT NULL
);
+37
View File
@@ -72,6 +72,43 @@ func (st *Store) DuePendingRuns(ctx context.Context, now time.Time, limit int) (
return out, rows.Err()
}
// ListPendingRunsForHost returns every pending row for the host
// (regardless of next_attempt_at), ordered by next_attempt_at
// ascending. Used by the on-reconnect drain — when a host comes
// back, we walk every pending row for it, not just the due ones,
// because the host being back makes "due" unimportant: every row
// is dispatchable now.
//
// The result is a non-nil, possibly empty slice on success. On any
// failure — running the query, scanning a row, or iterating — it
// returns nil and an error wrapped with a "store: …" prefix, never a
// partially filled slice. A NULL last_error is read back as "".
func (st *Store) ListPendingRunsForHost(ctx context.Context, hostID string) ([]PendingRun, error) {
	rows, err := st.db.QueryContext(ctx,
		`SELECT id, schedule_id, source_group_id, host_id, attempt,
		next_attempt_at, scheduled_at, COALESCE(last_error, '')
		FROM pending_runs
		WHERE host_id = ?
		ORDER BY next_attempt_at`,
		hostID)
	if err != nil {
		return nil, fmt.Errorf("store: list pending runs for host: %w", err)
	}
	defer func() { _ = rows.Close() }()

	out := []PendingRun{}
	for rows.Next() {
		var p PendingRun
		var nextAt, scheduledAt string
		if err := rows.Scan(&p.ID, &p.ScheduleID, &p.SourceGroupID, &p.HostID,
			&p.Attempt, &nextAt, &scheduledAt, &p.LastError); err != nil {
			return nil, fmt.Errorf("store: scan pending run for host: %w", err)
		}
		// Best-effort timestamps: a value that does not parse as
		// RFC3339Nano leaves the field at the zero time instead of
		// failing the whole listing.
		if t, err := time.Parse(time.RFC3339Nano, nextAt); err == nil {
			p.NextAttemptAt = t
		}
		if t, err := time.Parse(time.RFC3339Nano, scheduledAt); err == nil {
			p.ScheduledAt = t
		}
		out = append(out, p)
	}
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("store: iterate pending runs for host: %w", err)
	}
	return out, nil
}
// DeletePendingRun removes a row by id. Called after successful
// dispatch or after exceeding retry_max.
func (st *Store) DeletePendingRun(ctx context.Context, id string) error {
+75
View File
@@ -219,3 +219,78 @@ func TestPendingRunQueue(t *testing.T) {
t.Fatalf("after delete: %v", due)
}
}
// TestListPendingRunsForHost seeds pending rows for two hosts and checks
// that listing by host returns only that host's rows — due or not —
// sorted by next_attempt_at, and that an unknown host yields no rows.
func TestListPendingRunsForHost(t *testing.T) {
	t.Parallel()
	store := openTestStore(t)
	ctx := context.Background()

	// Host A comes from the shared helper; host B is created by hand so
	// the two hosts are guaranteed to be distinct.
	hostA := makeSchedHost(t, store)
	hostB := "01HPENDLISTHOSTB00000001"
	secondHost := Host{
		ID: hostB, Name: "pending-list-host-b", OS: "linux", Arch: "amd64",
		AgentVersion: "dev", ResticVersion: "0.16.0", ProtocolVersion: 1,
		EnrolledAt: time.Now().UTC(),
	}
	if err := store.CreateHost(ctx, secondHost, "tokenhashB", ""); err != nil {
		t.Fatal(err)
	}

	groupA := makeGroup(t, store, hostA, "default", "01HPENDLISTGRPA000000001")
	groupB := makeGroup(t, store, hostB, "default", "01HPENDLISTGRPB000000001")

	schedA := "01HPENDLISTSCHEDA0000001"
	schedB := "01HPENDLISTSCHEDB0000001"
	schedules := []*Schedule{
		{
			ID: schedA, HostID: hostA, CronExpr: "@hourly", Enabled: true,
			SourceGroupIDs: []string{groupA},
		},
		{
			ID: schedB, HostID: hostB, CronExpr: "@hourly", Enabled: true,
			SourceGroupIDs: []string{groupB},
		},
	}
	for _, sched := range schedules {
		if err := store.CreateSchedule(ctx, sched); err != nil {
			t.Fatal(err)
		}
	}

	// Host A gets a future row (inserted first) and an overdue row; host B
	// gets a single row. Listing host A must yield both of its rows,
	// overdue one first, and never host B's.
	const (
		overdueID = "01HPENDLISTROW0000000A01"
		futureID  = "01HPENDLISTROW0000000A02"
	)
	now := time.Now().UTC()
	seed := []*PendingRun{
		{
			ID: futureID, ScheduleID: schedA, SourceGroupID: groupA, HostID: hostA,
			NextAttemptAt: now.Add(time.Hour), ScheduledAt: now,
		},
		{
			ID: overdueID, ScheduleID: schedA, SourceGroupID: groupA, HostID: hostA,
			NextAttemptAt: now.Add(-time.Minute), ScheduledAt: now.Add(-time.Hour),
		},
		{
			ID: "01HPENDLISTROW0000000B01", ScheduleID: schedB, SourceGroupID: groupB, HostID: hostB,
			NextAttemptAt: now, ScheduledAt: now,
		},
	}
	for _, run := range seed {
		if err := store.EnqueuePendingRun(ctx, run); err != nil {
			t.Fatal(err)
		}
	}

	out, err := store.ListPendingRunsForHost(ctx, hostA)
	if err != nil {
		t.Fatal(err)
	}
	if len(out) != 2 {
		t.Fatalf("len=%d, want 2: %+v", len(out), out)
	}
	// Ascending next_attempt_at puts the -1m row ahead of the +1h row.
	if !(out[0].ID == overdueID && out[1].ID == futureID) {
		t.Fatalf("order: got %s,%s", out[0].ID, out[1].ID)
	}

	// A host with no rows at all returns an empty result, not an error.
	none, err := store.ListPendingRunsForHost(ctx, "non-existent-host")
	if err != nil {
		t.Fatal(err)
	}
	if len(none) != 0 {
		t.Fatalf("non-existent host: got %d rows", len(none))
	}
}
+64
View File
@@ -84,6 +84,70 @@ func TestMigrateIsIdempotent(t *testing.T) {
}
}
// TestMigration0009Schema exercises the schema produced by migration
// 0009 directly through SQL: the composite (host_id, kind) key and kind
// CHECK on host_credentials, and the existence, defaults and status
// CHECK of host_repo_stats.
func TestMigration0009Schema(t *testing.T) {
	t.Parallel()
	store := openTestStore(t)
	ctx := context.Background()
	db := store.DB()

	const (
		hostID        = "h-0009"
		stamp         = "2026-01-01T00:00:00Z"
		insertCredSQL = `INSERT INTO host_credentials (host_id, kind, enc_repo_creds, updated_at) VALUES (?,?,?,?)`
	)

	// insertCred adds one credential row of the given kind for the test host.
	insertCred := func(kind, blob string) error {
		_, err := db.ExecContext(ctx, insertCredSQL, hostID, kind, blob, stamp)
		return err
	}

	// Parent row first: both tables under test are keyed by host id.
	if _, err := db.ExecContext(ctx,
		`INSERT INTO hosts (id, name, os, arch, enrolled_at) VALUES (?,?,?,?,?)`,
		hostID, "test-host", "linux", "amd64", stamp); err != nil {
		t.Fatalf("insert host: %v", err)
	}

	// Two kinds for the same host must coexist under the composite key.
	if err := insertCred("repo", "enc-repo"); err != nil {
		t.Fatalf("insert repo creds: %v", err)
	}
	if err := insertCred("admin", "enc-admin"); err != nil {
		t.Fatalf("insert admin creds: %v", err)
	}

	// A second row with the same (host_id, kind) must be rejected.
	if err := insertCred("repo", "enc-repo-2"); err == nil {
		t.Fatal("expected unique constraint violation on (host_id, kind), got nil")
	}

	// Only 'repo' and 'admin' are acceptable kinds.
	if err := insertCred("other", "enc-other"); err == nil {
		t.Fatal("expected CHECK constraint violation on kind='other', got nil")
	}

	// host_repo_stats accepts a minimal row and round-trips lock_present.
	if _, err := db.ExecContext(ctx,
		`INSERT INTO host_repo_stats (host_id, lock_present, updated_at) VALUES (?,?,?)`,
		hostID, 0, stamp); err != nil {
		t.Fatalf("insert host_repo_stats: %v", err)
	}
	var lockPresent int
	statsRow := db.QueryRowContext(ctx,
		`SELECT lock_present FROM host_repo_stats WHERE host_id = ?`, hostID)
	if err := statsRow.Scan(&lockPresent); err != nil {
		t.Fatalf("select host_repo_stats: %v", err)
	}
	if lockPresent != 0 {
		t.Errorf("expected lock_present=0, got %d", lockPresent)
	}

	// A last_check_status outside ('ok','errors_found','failed') must be
	// refused by the CHECK constraint.
	_, err := db.ExecContext(ctx,
		`UPDATE host_repo_stats SET last_check_status = ? WHERE host_id = ?`,
		"wat", hostID)
	if err == nil {
		t.Fatal("expected CHECK constraint violation on last_check_status='wat', got nil")
	}
}
func TestForeignKeysEnforced(t *testing.T) {
t.Parallel()
s := openTestStore(t)
+17 -7
View File
@@ -166,14 +166,24 @@ Sizes: **S** = under a day, **M** = 1–3 days, **L** = 3–7 days.
- Header "version N · agent in sync / agent at vM" indicator preserved across all tabs (backed by `host_schedule_version` + `applied_schedule_version`).
- Form validation re-renders with the operator's typed input intact (mirror P2-04's behaviour). Each save fires `pushScheduleSetAsync` so an online agent re-arms within seconds.
### P2 redesign — Phase 5 (server-side maintenance ticker) — TODO
### P2 redesign — Phase 5 (server-side maintenance ticker)
- [ ] **P2R-03** (M) `prune` command end-to-end. Restic wrapper (`restic.RunPrune`), agent dispatcher (`case api.JobPrune:`), wire envelope. **Admin-only credential**: a second `host_credentials` row keyed by `host_id` + `kind=admin` carries the non-append-only username/password; server pushes it via `config.update` only when dispatching a prune job, and the agent's secrets store keeps it in a separate slot from the everyday append-only creds. UI: prune row on the Repo page. Operator-triggered Run-now via `POST /hosts/{id}/repo/prune`. Cadence-driven dispatch lands in P2R-04.
- [ ] **P2R-04** (M) `check` command end-to-end (`restic check --read-data-subset N%`). Wrapper + dispatcher + wire. UI: check row on the Repo page (with the subset % slider). Operator Run-now via `POST /hosts/{id}/repo/check`. Cadence-driven dispatch lands in P2R-05.
- [ ] **P2R-05** (S) `unlock` command end-to-end (`restic unlock`). Operator-only — no cadence. `POST /hosts/{id}/repo/unlock`. Repo page surfaces lock state from the most recent `check` (which warns about stale locks).
- [ ] **P2R-06** (M) Server-side maintenance ticker. Cron-style loop on the server reads `host_repo_maintenance` rows, dispatches `forget` / `prune` / `check` jobs against the right host on the configured cadence (last-run timestamps tracked per kind on the maintenance row). Independent of the agent's local cron — the agent's cron only handles backup schedules now. Skips offline hosts (queues to `pending_runs` instead — see P2R-08). Handles ticker restarts cleanly (no-op if a job of the same kind ran inside the cadence window).
- [ ] **P2R-07** (S) Repo stats panel on the Repo page: size, dedup ratio, snapshot count, last-check timestamp + result, lock state, last-prune timestamp + bytes-freed. Backed by parsing `restic stats --json` output that the agent ships periodically (piggyback on the existing snapshots-report path).
- [ ] **P2R-08** (M) Pending-runs queue worker. On agent reconnect, server drains `pending_runs` rows for that host and re-dispatches them in order. Bump backoff per `pending_run.attempt_count`; drop rows that have exceeded the source-group's `retry_max`. Audit-logged. Smoke-tested by stopping the agent, running maintenance ticker so cadence misses, restarting agent, watching the queue drain.
- [x] **P2R-03** (M) `prune` command end-to-end. Restic wrapper (`restic.RunPrune`), agent dispatcher (`case api.JobPrune:`), wire envelope. **Admin-only credential**: a second `host_credentials` row keyed by `host_id` + `kind=admin` carries the non-append-only username/password; server pushes it via `config.update` only when dispatching a prune job, and the agent's secrets store keeps it in a separate slot from the everyday append-only creds. UI: prune row on the Repo page. Operator-triggered Run-now via `POST /hosts/{id}/repo/prune`. Cadence-driven dispatch lands in P2R-06.
- [x] **P2R-04** (M) `check` command end-to-end (`restic check --read-data-subset N%`). Wrapper + dispatcher + wire. UI: check row on the Repo page (with the subset % slider). Operator Run-now via `POST /hosts/{id}/repo/check`. Cadence-driven dispatch lands in P2R-06.
- [x] **P2R-05** (S) `unlock` command end-to-end (`restic unlock`). Operator-only — no cadence. `POST /hosts/{id}/repo/unlock`. Repo page surfaces lock state from the most recent `check` (which warns about stale locks).
- [x] **P2R-06** (M) Server-side maintenance ticker. Cron-style loop on the server reads `host_repo_maintenance` rows, dispatches `forget` / `prune` / `check` jobs against the right host on the configured cadence (last-run timestamps tracked per kind on the maintenance row). Independent of the agent's local cron — the agent's cron only handles backup schedules now. Skips offline hosts (queues to `pending_runs` instead — see P2R-08). Handles ticker restarts cleanly (no-op if a job of the same kind ran inside the cadence window).
- [x] **P2R-07** (S) Repo stats panel on the Repo page: size, dedup ratio, snapshot count, last-check timestamp + result, lock state, last-prune timestamp + bytes-freed. Backed by parsing `restic stats --json` output that the agent ships periodically (piggyback on the existing snapshots-report path).
- [x] **P2R-08** (M) Pending-runs queue worker. On agent reconnect, server drains `pending_runs` rows for that host and re-dispatches them in order. Bump backoff per `pending_run.attempt_count`; drop rows that have exceeded the source-group's `retry_max`. Audit-logged. Smoke-tested by stopping the agent, running maintenance ticker so cadence misses, restarting agent, watching the queue drain.
### P2 redesign — Phase 5 ✅
- Restic-manager Phase 5 lands on branch `p2r-phase5-maintenance`:
prune/check/unlock end-to-end (P2R-03/04/05); server-side
maintenance ticker drives forget/prune/check on cadence (P2R-06);
repo-stats panel surfaces size, lock state, last-check / last-prune
(P2R-07); pending-runs queue worker drains scheduled-backup
fires that raced an agent disconnect (P2R-08). See
`docs/superpowers/plans/2026-05-03-p2-redesign-phase-5.md`.
### P2 redesign — Phase 6 (auto-init follow-up) — TODO
File diff suppressed because one or more lines are too long
+117
View File
@@ -42,6 +42,54 @@
</div>
</form>
{{/* ---------- Admin credentials (optional) ---------- */}}
{{/*
  Saves via POST /hosts/{id}/admin-credentials. URL and username are
  echoed back into the fields; the password input carries no value, so
  a stored password is only signalled through the placeholder text.
  The Clear button and its target form render only when
  $page.HasAdminPassword is true.
*/}}
<h2 class="text-[11.5px] font-semibold uppercase tracking-[0.08em] text-ink-mute mt-7 mb-3.5">
Admin credentials <span class="text-ink-fade normal-case">· prune-only · optional</span>
</h2>
<form method="post" action="/hosts/{{$host.ID}}/admin-credentials" class="panel rounded-[7px] p-5">
{{if $page.AdminCredsError}}
<div class="rounded-[6px] px-3.5 py-3 text-[13px] mb-4"
style="border: 1px solid color-mix(in oklch, var(--bad), transparent 60%); background: color-mix(in oklch, var(--bad), transparent 92%);">
{{$page.AdminCredsError}}
</div>
{{end}}
{{if eq $page.SavedSection "admin_credentials"}}
<div class="text-[12px] text-ok mb-3 mono">✓ saved</div>
{{end}}
<p class="text-[12.5px] text-ink-mid leading-[1.6] mb-4 max-w-[640px]">
Only needed for rest-server repos that distinguish an append-only
user (everyday backups) from a delete-capable user (prune /
forget). For S3 / B2 / SFTP / local, leave this blank — the
everyday repo credentials handle prune too.
</p>
<div class="grid grid-cols-2 gap-4">
<div>
<label class="field-label" for="admin_repo_url">Repo URL <span class="text-ink-fade">· usually same as above</span></label>
<input id="admin_repo_url" name="repo_url" type="text" class="field mono" value="{{$page.AdminURL}}" />
</div>
<div>
<label class="field-label" for="admin_repo_username">Username</label>
<input id="admin_repo_username" name="repo_username" type="text" class="field mono" value="{{$page.AdminUsername}}" />
</div>
<div class="col-span-2">
<label class="field-label" for="admin_repo_password">Password</label>
<input id="admin_repo_password" name="repo_password" type="password" class="field mono"
placeholder="{{if $page.HasAdminPassword}}•••••••••••••••• · stored, leave blank to keep{{else}}— not yet set —{{end}}"
autocomplete="new-password" />
</div>
</div>
<div class="mt-4 pt-4 border-t border-line-soft flex gap-2 items-center">
<button type="submit" class="btn btn-primary">Save admin credentials</button>
{{if $page.HasAdminPassword}}
<button type="submit" form="admin-creds-clear" class="btn btn-secondary"
onclick="return confirm('Clear admin credentials? Prune jobs will be refused until you re-set them.');">Clear</button>
{{end}}
</div>
</form>
{{/*
  Target of the Clear button above (linked through its form attribute).
  It lives outside the save form because HTML forms cannot be nested.
*/}}
{{if $page.HasAdminPassword}}
<form id="admin-creds-clear" method="post" action="/hosts/{{$host.ID}}/admin-credentials/delete"></form>
{{end}}
{{/* ---------- Bandwidth ---------- */}}
<h2 class="text-[11.5px] font-semibold uppercase tracking-[0.08em] text-ink-mute mt-7 mb-3.5">Bandwidth · host-wide</h2>
<form method="post" action="/hosts/{{$host.ID}}/repo/bandwidth" class="panel rounded-[7px] p-5">
@@ -138,6 +186,40 @@
</div>
</form>
{{/* ---------- Run now · one-time ---------- */}}
{{/*
  Three htmx buttons that POST to /hosts/{id}/repo/{check,prune,unlock}.
  hx-swap="none" means the response is not swapped into the page, and
  hx-confirm asks the operator before the request is sent.
  Every button is disabled while $page.Online is false; prune is also
  disabled (with its own tooltip, which takes precedence) when no admin
  password is stored.
*/}}
<h2 class="text-[11.5px] font-semibold uppercase tracking-[0.08em] text-ink-mute mt-7 mb-3.5">Run now · one-time</h2>
<div class="panel rounded-[7px] p-5">
<p class="text-[12.5px] text-ink-mid leading-[1.6] mb-4 max-w-[640px]">
Operator-triggered. Output streams live to the job log. Cadence-driven runs land independently from the server-side ticker.
</p>
<div class="grid grid-cols-3 gap-3">
<button type="button"
hx-post="/hosts/{{$host.ID}}/repo/check"
hx-swap="none"
hx-confirm="Run check now ({{$m.CheckSubsetPct}}% data subset)?"
class="btn btn-secondary"
{{if not $page.Online}}disabled title="agent is offline"{{end}}>
check
</button>
<button type="button"
hx-post="/hosts/{{$host.ID}}/repo/prune"
hx-swap="none"
hx-confirm="Run prune now? Removes data not referenced by any snapshot — heavy operation."
class="btn btn-secondary"
{{if not $page.HasAdminPassword}}disabled title="set admin credentials first"{{else if not $page.Online}}disabled title="agent is offline"{{end}}>
prune
</button>
<button type="button"
hx-post="/hosts/{{$host.ID}}/repo/unlock"
hx-swap="none"
hx-confirm="Clear stale repo locks?"
class="btn btn-secondary"
{{if not $page.Online}}disabled title="agent is offline"{{end}}>
unlock
</button>
</div>
</div>
{{/* ---------- Danger zone ---------- */}}
<h2 class="text-[11.5px] font-semibold uppercase tracking-[0.08em] text-bad mt-9 mb-3.5">Danger zone</h2>
<div class="panel rounded-[7px] p-5"
@@ -179,6 +261,41 @@
</div>
</div>
{{/* ---------- Repo health ---------- */}}
{{/*
  Rendered only when $page.StatsView is set. Each row of the definition
  list is gated on its own Has* flag, so values that are absent are
  omitted rather than shown empty. The lock warning appears when
  $s.LockPresent is true.
*/}}
{{if $page.StatsView}}
{{$s := $page.StatsView}}
<h2 class="text-[11.5px] font-semibold uppercase tracking-[0.08em] text-ink-mute mt-7 mb-3.5">Repo health</h2>
<div class="panel rounded-[7px] p-5 text-[13px]">
{{if $s.LockPresent}}
<div class="rounded-[6px] px-3.5 py-3 text-[12.5px] mb-4"
style="border: 1px solid color-mix(in oklch, var(--warn), transparent 60%); background: color-mix(in oklch, var(--warn), transparent 92%);">
Stale lock detected on the most recent check. Run <span class="mono">unlock</span> above to clear it before the next backup.
</div>
{{end}}
<dl class="grid grid-cols-2 gap-y-2 gap-x-4">
{{if $s.HasTotalSize}}
<dt class="text-ink-fade">Total size</dt>
<dd class="mono text-right">{{bytes $s.TotalSizeBytes}}</dd>
{{end}}
{{if $s.HasRawSize}}
<dt class="text-ink-fade">Raw size <span class="text-ink-fade text-[11px]">· pre-dedup</span></dt>
<dd class="mono text-right">{{bytes $s.RawSizeBytes}}</dd>
{{end}}
{{if $s.HasLastCheck}}
<dt class="text-ink-fade">Last check</dt>
<dd class="mono text-right text-[12px]">
{{$s.LastCheckAgo}}
{{/* Status colour: ok uses text-ok, errors_found uses text-bad, anything else uses text-ink-mid. */}}
{{if $s.LastCheckStatus}} · <span class="{{if eq $s.LastCheckStatus "ok"}}text-ok{{else if eq $s.LastCheckStatus "errors_found"}}text-bad{{else}}text-ink-mid{{end}}">{{$s.LastCheckStatus}}</span>{{end}}
</dd>
{{end}}
{{if $s.HasLastPrune}}
<dt class="text-ink-fade">Last prune</dt>
<dd class="mono text-right text-[12px]">{{$s.LastPruneAgo}}</dd>
{{end}}
</dl>
</div>
{{end}}
{{if gt (len $page.GroupNames) 0}}
<h2 class="text-[11.5px] font-semibold uppercase tracking-[0.08em] text-ink-mute mt-7 mb-3.5">Snapshots by source</h2>
<div class="panel rounded-[7px] p-4">