diff --git a/_diag/p2r-phase5-sweep/01-repo-page.png b/_diag/p2r-phase5-sweep/01-repo-page.png new file mode 100644 index 0000000..0bfee98 Binary files /dev/null and b/_diag/p2r-phase5-sweep/01-repo-page.png differ diff --git a/_diag/p2r-phase5-sweep/02-check-job-running.png b/_diag/p2r-phase5-sweep/02-check-job-running.png new file mode 100644 index 0000000..4f3741e Binary files /dev/null and b/_diag/p2r-phase5-sweep/02-check-job-running.png differ diff --git a/_diag/p2r-phase5-sweep/03-check-job-done.png b/_diag/p2r-phase5-sweep/03-check-job-done.png new file mode 100644 index 0000000..4f3741e Binary files /dev/null and b/_diag/p2r-phase5-sweep/03-check-job-done.png differ diff --git a/_diag/p2r-phase5-sweep/04-repo-after-check.png b/_diag/p2r-phase5-sweep/04-repo-after-check.png new file mode 100644 index 0000000..435cb87 Binary files /dev/null and b/_diag/p2r-phase5-sweep/04-repo-after-check.png differ diff --git a/_diag/p2r-phase5-sweep/05-unlock-job.png b/_diag/p2r-phase5-sweep/05-unlock-job.png new file mode 100644 index 0000000..d0c1e7d Binary files /dev/null and b/_diag/p2r-phase5-sweep/05-unlock-job.png differ diff --git a/_diag/p2r-phase5-sweep/06-dashboard.png b/_diag/p2r-phase5-sweep/06-dashboard.png new file mode 100644 index 0000000..377a278 Binary files /dev/null and b/_diag/p2r-phase5-sweep/06-dashboard.png differ diff --git a/cmd/agent/main.go b/cmd/agent/main.go index cb38457..d401640 100644 --- a/cmd/agent/main.go +++ b/cmd/agent/main.go @@ -2,13 +2,13 @@ package main import ( "context" - "encoding/json" "errors" "flag" "fmt" "log/slog" "os" "os/signal" + "strconv" "syscall" "time" @@ -199,32 +199,68 @@ func (d *dispatcher) handle(ctx context.Context, env api.Envelope, tx wsclient.S case api.MsgConfigUpdate: var p api.ConfigUpdatePayload _ = env.UnmarshalPayload(&p) - // Merge with whatever's already in secrets.enc — empty fields - // in the push mean "leave alone." Atomic write underneath. 
- cur, err := d.secrets.Load() - if err != nil { - slog.Error("ws agent: load secrets for merge", "err", err) - return nil + slot := p.Slot + if slot == "" { + slot = "repo" } - changed := false - if p.RepoURL != "" && p.RepoURL != cur.URL { - cur.URL = p.RepoURL - changed = true - } - if p.RepoUsername != "" && p.RepoUsername != cur.Username { - cur.Username = p.RepoUsername - changed = true - } - if p.RepoPassword != "" && p.RepoPassword != cur.Password { - cur.Password = p.RepoPassword - changed = true - } - if changed { - if err := d.secrets.Save(cur); err != nil { - slog.Error("ws agent: persist secrets", "err", err) + switch slot { + case "repo": + // Merge with whatever's already in secrets.enc — empty fields + // in the push mean "leave alone." Atomic write underneath. + cur, err := d.secrets.Load() + if err != nil { + slog.Error("ws agent: load secrets for merge", "err", err) return nil } - slog.Info("ws agent: repo credentials updated via config.update") + changed := false + if p.RepoURL != "" && p.RepoURL != cur.URL { + cur.URL = p.RepoURL + changed = true + } + if p.RepoUsername != "" && p.RepoUsername != cur.Username { + cur.Username = p.RepoUsername + changed = true + } + if p.RepoPassword != "" && p.RepoPassword != cur.Password { + cur.Password = p.RepoPassword + changed = true + } + if changed { + if err := d.secrets.Save(cur); err != nil { + slog.Error("ws agent: persist secrets", "err", err) + return nil + } + slog.Info("ws agent: repo credentials updated via config.update") + } + case "admin": + cur, err := d.secrets.LoadAdmin() + if err != nil && !errors.Is(err, secrets.ErrNoAdmin) { + slog.Error("ws agent: load admin secrets", "err", err) + return nil + } + // ErrNoAdmin is not an error here — we are creating the slot. 
+ changed := false + if p.RepoURL != "" && p.RepoURL != cur.URL { + cur.URL = p.RepoURL + changed = true + } + if p.RepoUsername != "" && p.RepoUsername != cur.Username { + cur.Username = p.RepoUsername + changed = true + } + if p.RepoPassword != "" && p.RepoPassword != cur.Password { + cur.Password = p.RepoPassword + changed = true + } + if changed { + if err := d.secrets.SaveAdmin(cur); err != nil { + slog.Error("ws agent: persist admin secrets", "err", err) + return nil + } + slog.Info("ws agent: admin credentials updated via config.update") + } + default: + slog.Warn("ws agent: unknown config.update slot, ignoring", "slot", p.Slot) } case api.MsgAgentUpdateAvail: @@ -251,6 +287,14 @@ func (d *dispatcher) runJob(ctx context.Context, p api.CommandRunPayload, tx wsc if creds.Empty() { return fmt.Errorf("repo credentials not configured (waiting for server config.update push)") } + // r is the everyday runner — bound to the host's repo + // (append-only) credentials. Reused by every kind except + // JobPrune, which builds its own runner against the + // admin-credentials slot when p.RequiresAdminCreds is set + // (admin creds are not loaded for any other kind, so they're + // not on r). If you find yourself adding a new JobKind that + // needs delete authority, mirror the JobPrune pattern below + // — don't try to overload r. 
r := runner.New(runner.Config{ ResticBin: d.resticBin, RepoURL: creds.URL, @@ -291,33 +335,81 @@ func (d *dispatcher) runJob(ctx context.Context, p api.CommandRunPayload, tx wsc slog.Info("agent: init job complete", "job_id", p.JobID) }() case api.JobForget: - var policy restic.ForgetPolicy - if len(p.RetentionPolicy) > 0 { - var raw struct { - KeepLast *int `json:"keep_last,omitempty"` - KeepHourly *int `json:"keep_hourly,omitempty"` - KeepDaily *int `json:"keep_daily,omitempty"` - KeepWeekly *int `json:"keep_weekly,omitempty"` - KeepMonthly *int `json:"keep_monthly,omitempty"` - KeepYearly *int `json:"keep_yearly,omitempty"` - } - if err := json.Unmarshal(p.RetentionPolicy, &raw); err != nil { - return fmt.Errorf("forget: decode retention_policy: %w", err) - } - policy = restic.ForgetPolicy{ - KeepLast: raw.KeepLast, KeepHourly: raw.KeepHourly, - KeepDaily: raw.KeepDaily, KeepWeekly: raw.KeepWeekly, - KeepMonthly: raw.KeepMonthly, KeepYearly: raw.KeepYearly, - } + if len(p.ForgetGroups) == 0 { + // Hard-error rather than fall back to a single-policy form: + // the server-side dispatch path (maintenance ticker) is the + // only writer of forget command.run today, and it always + // populates ForgetGroups. A backwards-compatible single- + // policy fallback was specced but skipped — see the + // Phase 5 plan rationale and version.go's lockstep-deploy + // note for why. 
+ return fmt.Errorf("forget: command.run carried no forget_groups (server didn't populate them)") } - slog.Info("agent: accepting forget job", "job_id", p.JobID, "policy", p.RetentionPolicy) + groups := make([]restic.ForgetGroup, 0, len(p.ForgetGroups)) + for _, g := range p.ForgetGroups { + groups = append(groups, restic.ForgetGroup{ + Tag: g.Tag, + Policy: restic.ForgetPolicy{ + KeepLast: g.Policy.KeepLast, + KeepHourly: g.Policy.KeepHourly, + KeepDaily: g.Policy.KeepDaily, + KeepWeekly: g.Policy.KeepWeekly, + KeepMonthly: g.Policy.KeepMonthly, + KeepYearly: g.Policy.KeepYearly, + }, + }) + } + slog.Info("agent: accepting forget job", "job_id", p.JobID, "groups", len(groups)) go func() { - if err := r.RunForget(ctx, p.JobID, policy); err != nil { + if err := r.RunForget(ctx, p.JobID, groups); err != nil { slog.Warn("agent: forget job failed", "job_id", p.JobID, "err", err) return } slog.Info("agent: forget job complete", "job_id", p.JobID) }() + case api.JobPrune: + // Prune may require admin creds (delete authority on rest-server). 
+ runCreds := creds + if p.RequiresAdminCreds { + ac, err := d.secrets.LoadAdmin() + if err != nil { + return fmt.Errorf("prune: admin creds not configured (server didn't push them): %w", err) + } + if ac.Empty() { + return fmt.Errorf("prune: admin creds incomplete") + } + runCreds = ac + } + prr := runner.New(runner.Config{ + ResticBin: d.resticBin, + RepoURL: runCreds.URL, + RepoUsername: runCreds.Username, + RepoPassword: runCreds.Password, + }, tx, time.Second) + slog.Info("agent: accepting prune job", "job_id", p.JobID, "admin_creds", p.RequiresAdminCreds) + go func() { + if err := prr.RunPrune(ctx, p.JobID); err != nil { + slog.Warn("agent: prune job failed", "job_id", p.JobID, "err", err) + } + }() + case api.JobCheck: + subset := 0 + if len(p.Args) > 0 { + subset, _ = strconv.Atoi(p.Args[0]) + } + slog.Info("agent: accepting check job", "job_id", p.JobID, "subset_pct", subset) + go func() { + if err := r.RunCheck(ctx, p.JobID, subset); err != nil { + slog.Warn("agent: check job failed", "job_id", p.JobID, "err", err) + } + }() + case api.JobUnlock: + slog.Info("agent: accepting unlock job", "job_id", p.JobID) + go func() { + if err := r.RunUnlock(ctx, p.JobID); err != nil { + slog.Warn("agent: unlock job failed", "job_id", p.JobID, "err", err) + } + }() default: return fmt.Errorf("kind %q not implemented yet (Phase 2 lands the rest)", p.Kind) } diff --git a/cmd/server/main.go b/cmd/server/main.go index 4059bb2..c97b39d 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -16,6 +16,7 @@ import ( "gitea.dcglab.co.uk/steve/restic-manager/internal/crypto" "gitea.dcglab.co.uk/steve/restic-manager/internal/server/config" rmhttp "gitea.dcglab.co.uk/steve/restic-manager/internal/server/http" + "gitea.dcglab.co.uk/steve/restic-manager/internal/server/maintenance" "gitea.dcglab.co.uk/steve/restic-manager/internal/server/ui" "gitea.dcglab.co.uk/steve/restic-manager/internal/server/ws" "gitea.dcglab.co.uk/steve/restic-manager/internal/store" @@ -139,6 +140,23 @@ 
func run() error { defer purgeTick.Stop() offlineTick := time.NewTicker(30 * time.Second) defer offlineTick.Stop() + // Maintenance ticker: drives forget/prune/check on the cadences + // operators set per-host. Independent of the agent's local cron + // (which only handles backup schedules). 60s cadence — the cron + // expressions are minute-grained, so anything finer is wasted + // work. + maintenanceTick := time.NewTicker(60 * time.Second) + defer maintenanceTick.Stop() + // Pending-runs drain ticker: 30s cadence sweeps every host with + // pending_runs rows whose next_attempt_at <= now (rows accumulate + // when a schedule.fire's command.run send fails because the agent + // dropped offline mid-flight). The on-reconnect path in + // onAgentHello handles the common case; this ticker is the + // safety-net for hosts that come back without a fresh hello (they + // shouldn't, but the queue exists either way). + pendingDrainTick := time.NewTicker(30 * time.Second) + defer pendingDrainTick.Stop() + mt := maintenance.New(st) go func() { for { select { @@ -156,6 +174,18 @@ func run() error { if n, err := st.MarkHostsOfflineStale(ctx, cutoff); err == nil && n > 0 { slog.Info("marked hosts offline (stale heartbeat)", "n", n) } + case <-pendingDrainTick.C: + srv.DrainAllDue(ctx) + case <-maintenanceTick.C: + decisions, err := mt.Decide(ctx, time.Now().UTC()) + if err != nil { + slog.Warn("maintenance ticker: decide", "err", err) + continue + } + if len(decisions) > 0 { + slog.Info("maintenance ticker: dispatching", "n", len(decisions)) + srv.DispatchMaintenance(ctx, decisions) + } } } }() diff --git a/internal/agent/runner/runner.go b/internal/agent/runner/runner.go index dc6763c..985380e 100644 --- a/internal/agent/runner/runner.go +++ b/internal/agent/runner/runner.go @@ -51,24 +51,70 @@ func New(cfg Config, tx Sender, progressMinPeriod time.Duration) *Runner { return &Runner{cfg: cfg, tx: tx, progressMinPeriod: progressMinPeriod} } -// RunBackup executes a backup job 
and reports back via the sender. -// Returns nil on a clean (or "incomplete-but-snapshot-created") finish. -func (r *Runner) RunBackup(ctx context.Context, jobID string, paths, excludes, tags []string) error { - startedAt := time.Now().UTC() - - startEnv, _ := api.Marshal(api.MsgJobStarted, jobID, api.JobStartedPayload{ - JobID: jobID, Kind: api.JobBackup, StartedAt: startedAt, - }) - if err := r.tx.Send(startEnv); err != nil { - slog.Warn("runner: send job.started", "err", err) - } - - env := restic.Env{ +// resticEnv builds the shared restic.Env from r.cfg. +func (r *Runner) resticEnv() restic.Env { + return restic.Env{ Bin: r.cfg.ResticBin, RepoURL: r.cfg.RepoURL, RepoUsername: r.cfg.RepoUsername, RepoPassword: r.cfg.RepoPassword, } +} + +// sendStarted ships a job.started envelope. +func (r *Runner) sendStarted(jobID string, kind api.JobKind, startedAt time.Time) { + env, _ := api.Marshal(api.MsgJobStarted, jobID, api.JobStartedPayload{ + JobID: jobID, Kind: kind, StartedAt: startedAt, + }) + if err := r.tx.Send(env); err != nil { + slog.Warn("runner: send job.started", "job_id", jobID, "kind", kind, "err", err) + } +} + +// streamHandler returns a LineHandler that ships log.stream envelopes. +func (r *Runner) streamHandler(jobID string, seq *atomic.Int64) restic.LineHandler { + return func(stream string, line string, _ any) { + now := time.Now().UTC() + logEnv, _ := api.Marshal(api.MsgLogStream, "", api.LogStreamLine{ + JobID: jobID, + Seq: seq.Add(1), + TS: now, + Stream: api.LogStream(stream), + Payload: line, + }) + _ = r.tx.Send(logEnv) + } +} + +// sendFinished ships a job.finished envelope. err==nil → succeeded; +// otherwise failed. statsBlob is forwarded as JobFinishedPayload.Stats. 
+func (r *Runner) sendFinished(jobID string, finishedAt time.Time, err error, statsBlob json.RawMessage) { + status := api.JobSucceeded + exit := 0 + errMsg := "" + if err != nil { + status = api.JobFailed + exit = -1 + errMsg = err.Error() + } + finEnv, _ := api.Marshal(api.MsgJobFinished, jobID, api.JobFinishedPayload{ + JobID: jobID, + Status: status, + ExitCode: exit, + FinishedAt: finishedAt, + Stats: statsBlob, + Error: errMsg, + }) + _ = r.tx.Send(finEnv) +} + +// RunBackup executes a backup job and reports back via the sender. +// Returns nil on a clean (or "incomplete-but-snapshot-created") finish. +func (r *Runner) RunBackup(ctx context.Context, jobID string, paths, excludes, tags []string) error { + startedAt := time.Now().UTC() + r.sendStarted(jobID, api.JobBackup, startedAt) + + env := r.resticEnv() var seq atomic.Int64 lastProgress := time.Now() @@ -115,27 +161,11 @@ func (r *Runner) RunBackup(ctx context.Context, jobID string, paths, excludes, t summary, err := env.RunBackup(ctx, paths, excludes, tags, handle) finishedAt := time.Now().UTC() - status := api.JobSucceeded - exit := 0 - errMsg := "" - if err != nil { - status = api.JobFailed - exit = -1 - errMsg = err.Error() - } var statsBlob json.RawMessage if summary != nil { statsBlob, _ = json.Marshal(summary) } - finEnv, _ := api.Marshal(api.MsgJobFinished, jobID, api.JobFinishedPayload{ - JobID: jobID, - Status: status, - ExitCode: exit, - FinishedAt: finishedAt, - Stats: statsBlob, - Error: errMsg, - }) - _ = r.tx.Send(finEnv) + r.sendFinished(jobID, finishedAt, err, statsBlob) // On a successful backup, refresh the server's snapshot projection. 
// We do this *after* job.finished so the UI sees the job land first; @@ -147,6 +177,9 @@ func (r *Runner) RunBackup(ctx context.Context, jobID string, paths, excludes, t if rerr := r.reportSnapshots(ctx, env); rerr != nil { slog.Warn("runner: snapshots.report failed", "job_id", jobID, "err", rerr) } + if rerr := r.reportStats(ctx, env, api.RepoStatsPayload{}); rerr != nil { + slog.Warn("runner: stats.report after backup failed", "job_id", jobID, "err", rerr) + } } if err != nil { @@ -160,111 +193,35 @@ func (r *Runner) RunBackup(ctx context.Context, jobID string, paths, excludes, t // browser-side log viewer just works. func (r *Runner) RunInit(ctx context.Context, jobID string) error { startedAt := time.Now().UTC() - startEnv, _ := api.Marshal(api.MsgJobStarted, jobID, api.JobStartedPayload{ - JobID: jobID, Kind: api.JobInit, StartedAt: startedAt, - }) - if err := r.tx.Send(startEnv); err != nil { - slog.Warn("runner: send job.started (init)", "err", err) - } - - env := restic.Env{ - Bin: r.cfg.ResticBin, - RepoURL: r.cfg.RepoURL, - RepoUsername: r.cfg.RepoUsername, - RepoPassword: r.cfg.RepoPassword, - } + r.sendStarted(jobID, api.JobInit, startedAt) + env := r.resticEnv() var seq atomic.Int64 - handle := func(stream string, line string, _ any) { - now := time.Now().UTC() - logEnv, _ := api.Marshal(api.MsgLogStream, "", api.LogStreamLine{ - JobID: jobID, - Seq: seq.Add(1), - TS: now, - Stream: api.LogStream(stream), - Payload: line, - }) - _ = r.tx.Send(logEnv) - } - - err := env.RunInit(ctx, handle) + err := env.RunInit(ctx, r.streamHandler(jobID, &seq)) finishedAt := time.Now().UTC() - - status := api.JobSucceeded - exit := 0 - errMsg := "" - if err != nil { - status = api.JobFailed - exit = -1 - errMsg = err.Error() - } - finEnv, _ := api.Marshal(api.MsgJobFinished, jobID, api.JobFinishedPayload{ - JobID: jobID, - Status: status, - ExitCode: exit, - FinishedAt: finishedAt, - Error: errMsg, - }) - _ = r.tx.Send(finEnv) + r.sendFinished(jobID, finishedAt, err, 
nil) if err != nil { return fmt.Errorf("runner init: %w", err) } return nil } -// RunForget executes a forget job against the configured repo with -// the given retention policy. Same envelope shape as RunBackup so -// the live log viewer + job lifecycle work without special-casing. -// On success refreshes the snapshot projection (forget rewrites the -// snapshot index — the host's snapshot list shrinks). -func (r *Runner) RunForget(ctx context.Context, jobID string, policy restic.ForgetPolicy) error { +// RunForget executes a forget job against the configured repo by +// invoking `restic forget --tag --keep-* …` once per group. +// Same envelope shape as RunBackup so the live log viewer + job +// lifecycle work without special-casing. On success refreshes the +// snapshot projection (forget rewrites the snapshot index — the +// host's snapshot list shrinks). Snapshot refresh runs once after +// every group completes, not per-group. +func (r *Runner) RunForget(ctx context.Context, jobID string, groups []restic.ForgetGroup) error { startedAt := time.Now().UTC() - startEnv, _ := api.Marshal(api.MsgJobStarted, jobID, api.JobStartedPayload{ - JobID: jobID, Kind: api.JobForget, StartedAt: startedAt, - }) - if err := r.tx.Send(startEnv); err != nil { - slog.Warn("runner: send job.started (forget)", "err", err) - } - - env := restic.Env{ - Bin: r.cfg.ResticBin, - RepoURL: r.cfg.RepoURL, - RepoUsername: r.cfg.RepoUsername, - RepoPassword: r.cfg.RepoPassword, - } + r.sendStarted(jobID, api.JobForget, startedAt) + env := r.resticEnv() var seq atomic.Int64 - handle := func(stream string, line string, _ any) { - now := time.Now().UTC() - logEnv, _ := api.Marshal(api.MsgLogStream, "", api.LogStreamLine{ - JobID: jobID, - Seq: seq.Add(1), - TS: now, - Stream: api.LogStream(stream), - Payload: line, - }) - _ = r.tx.Send(logEnv) - } - - err := env.RunForget(ctx, policy, handle) + err := env.RunForget(ctx, groups, r.streamHandler(jobID, &seq)) finishedAt := time.Now().UTC() - - 
status := api.JobSucceeded - exit := 0 - errMsg := "" - if err != nil { - status = api.JobFailed - exit = -1 - errMsg = err.Error() - } - finEnv, _ := api.Marshal(api.MsgJobFinished, jobID, api.JobFinishedPayload{ - JobID: jobID, - Status: status, - ExitCode: exit, - FinishedAt: finishedAt, - Error: errMsg, - }) - _ = r.tx.Send(finEnv) + r.sendFinished(jobID, finishedAt, err, nil) // Refresh the server's snapshot projection — forget rewrites the // index so the host's snapshot list almost certainly shrunk. @@ -281,6 +238,129 @@ func (r *Runner) RunForget(ctx context.Context, jobID string, policy restic.Forg return nil } +// RunPrune executes a prune job against the configured repo. On +// success it ships a repo.stats envelope with LastPruneAt set (plus +// a full size refresh via RunStats) before the job.finished envelope, +// so the UI can display updated size information alongside the +// completed job. On failure no stats refresh is attempted. +func (r *Runner) RunPrune(ctx context.Context, jobID string) error { + startedAt := time.Now().UTC() + r.sendStarted(jobID, api.JobPrune, startedAt) + + env := r.resticEnv() + var seq atomic.Int64 + err := env.RunPrune(ctx, r.streamHandler(jobID, &seq)) + finishedAt := time.Now().UTC() + + if err == nil { + pruneAt := finishedAt + if rerr := r.reportStats(ctx, env, api.RepoStatsPayload{LastPruneAt: &pruneAt}); rerr != nil { + slog.Warn("runner: stats.report after prune failed", "job_id", jobID, "err", rerr) + } + } + + r.sendFinished(jobID, finishedAt, err, nil) + + if err != nil { + return fmt.Errorf("runner prune: %w", err) + } + return nil +} + +// RunCheck executes a `restic check` job. Always ships a repo.stats +// envelope (success or failure) with LastCheckAt, LastCheckStatus, +// and LockPresent populated from the check result. 
+func (r *Runner) RunCheck(ctx context.Context, jobID string, subsetPct int) error { + startedAt := time.Now().UTC() + r.sendStarted(jobID, api.JobCheck, startedAt) + + env := r.resticEnv() + var seq atomic.Int64 + res, err := env.RunCheck(ctx, subsetPct, r.streamHandler(jobID, &seq)) + finishedAt := time.Now().UTC() + + // Determine check status string. + checkStatus := "ok" + if err != nil { + checkStatus = "failed" + } else if res.ErrorsFound { + checkStatus = "errors_found" + } + + lockPresent := res.LockPresent + now := finishedAt + patch := api.RepoStatsPayload{ + LastCheckAt: &now, + LastCheckStatus: checkStatus, + LockPresent: &lockPresent, + } + if rerr := r.reportStats(ctx, env, patch); rerr != nil { + slog.Warn("runner: stats.report after check failed", "job_id", jobID, "err", rerr) + } + + r.sendFinished(jobID, finishedAt, err, nil) + + if err != nil { + return fmt.Errorf("runner check: %w", err) + } + return nil +} + +// RunUnlock executes a `restic unlock` job. On success it ships a +// repo.stats envelope with LockPresent=false so the UI banner clears. +func (r *Runner) RunUnlock(ctx context.Context, jobID string) error { + startedAt := time.Now().UTC() + r.sendStarted(jobID, api.JobUnlock, startedAt) + + env := r.resticEnv() + var seq atomic.Int64 + err := env.RunUnlock(ctx, r.streamHandler(jobID, &seq)) + finishedAt := time.Now().UTC() + + if err == nil { + lockFalse := false + patch := api.RepoStatsPayload{LockPresent: &lockFalse} + if rerr := r.reportStats(ctx, env, patch); rerr != nil { + slog.Warn("runner: stats.report after unlock failed", "job_id", jobID, "err", rerr) + } + } + + r.sendFinished(jobID, finishedAt, err, nil) + + if err != nil { + return fmt.Errorf("runner unlock: %w", err) + } + return nil +} + +// reportStats ships a repo.stats envelope. If the patch doesn't +// already include size fields, fills them in by invoking env.RunStats. 
+// Errors from RunStats are non-fatal — the patch is shipped anyway +// with whatever the caller did populate. +func (r *Runner) reportStats(ctx context.Context, env restic.Env, patch api.RepoStatsPayload) error { + listCtx, cancel := context.WithTimeout(ctx, 60*time.Second) + defer cancel() + if patch.TotalSizeBytes == nil { + if s, err := env.RunStats(listCtx, nil); err == nil { + total := s.TotalSize + raw := s.TotalUncompressed + files := s.TotalFileCount + snaps := s.SnapshotsCount + patch.TotalSizeBytes = &total + patch.RawSizeBytes = &raw + patch.UniqueFiles = &files + patch.SnapshotCount = &snaps + } else { + slog.Debug("runner: stats refresh failed (non-fatal)", "err", err) + } + } + envOut, err := api.Marshal(api.MsgRepoStats, "", patch) + if err != nil { + return err + } + return r.tx.Send(envOut) +} + // reportSnapshots calls `restic snapshots --json`, translates the // payload into the wire shape, and ships it as a snapshots.report // envelope. Bounded by a separate timeout so a sluggish repo doesn't diff --git a/internal/agent/runner/runner_test.go b/internal/agent/runner/runner_test.go new file mode 100644 index 0000000..c9fb042 --- /dev/null +++ b/internal/agent/runner/runner_test.go @@ -0,0 +1,357 @@ +package runner + +import ( + "context" + "os" + "path/filepath" + "testing" + + "gitea.dcglab.co.uk/steve/restic-manager/internal/api" + "gitea.dcglab.co.uk/steve/restic-manager/internal/restic" +) + +// fakeSender collects sent envelopes for assertions. +type fakeSender struct{ envs []api.Envelope } + +func (s *fakeSender) Send(e api.Envelope) error { + s.envs = append(s.envs, e) + return nil +} + +// setupScript writes a shell script (without shebang) to a temp dir, +// names it "restic", makes it executable, and returns the path. +// +// Writes to ".tmp" then renames into place. 
The rename is what +// makes this race-free: under -race + many t.Parallel tests, a +// fork-from-another-goroutine can inherit the writable fd from +// os.WriteFile before close completes, and exec'ing the file then +// returns ETXTBSY ("text file busy"). Once the rename lands, the +// final path is a fresh dirent pointing at an inode that has no +// writable fd open anywhere — exec is safe. +func setupScript(t *testing.T, body string) string { + t.Helper() + dir := t.TempDir() + final := filepath.Join(dir, "restic") + tmp := final + ".tmp" + if err := os.WriteFile(tmp, []byte("#!/bin/sh\n"+body+"\n"), 0o755); err != nil { + t.Fatalf("setupScript: write tmp: %v", err) + } + if err := os.Rename(tmp, final); err != nil { + t.Fatalf("setupScript: rename: %v", err) + } + return final +} + +// firstEnvOfType returns the first envelope with the given type, or +// fails the test if none is found. +func firstEnvOfType(t *testing.T, envs []api.Envelope, mt api.MessageType) api.Envelope { + t.Helper() + for _, e := range envs { + if e.Type == mt { + return e + } + } + t.Fatalf("no envelope of type %q found in %d envelopes", mt, len(envs)) + return api.Envelope{} +} + +// envelopeOrder returns the message types of all sent envelopes. +func envelopeOrder(envs []api.Envelope) []api.MessageType { + out := make([]api.MessageType, len(envs)) + for i, e := range envs { + out[i] = e.Type + } + return out +} + +// TestRunPruneShipsExpectedEnvelopes drives RunPrune with a fake +// binary that prints "prune" on stdout (for the log.stream envelope) +// and emits valid stats JSON so reportStats can populate size fields. +// Expected sequence: job.started → log.stream → repo.stats → job.finished. +func TestRunPruneShipsExpectedEnvelopes(t *testing.T) { + t.Parallel() + + // The fake "restic" handles both "prune" and "stats --json" calls. 
+ statsJSON := `{"total_size":1000,"total_uncompressed_size":2000,"snapshots_count":3,"total_file_count":10}` + bin := setupScript(t, ` +case "$1" in + prune) echo "prune" ;; + stats) echo '`+statsJSON+`' ;; + *) echo "unknown: $*" ;; +esac +`) + + tx := &fakeSender{} + r := New(Config{ResticBin: bin}, tx, 0) + if err := r.RunPrune(context.Background(), "job-1"); err != nil { + t.Fatalf("RunPrune: %v", err) + } + + order := envelopeOrder(tx.envs) + // Confirm landmark envelope types appear in the required order. + wantTypes := []api.MessageType{api.MsgJobStarted, api.MsgLogStream, api.MsgRepoStats, api.MsgJobFinished} + positions := map[api.MessageType]int{} + for i, mt := range order { + if _, seen := positions[mt]; !seen { + positions[mt] = i + } + } + for i := 0; i < len(wantTypes)-1; i++ { + a, b := wantTypes[i], wantTypes[i+1] + pa, aOK := positions[a] + pb, bOK := positions[b] + if !aOK { + t.Errorf("envelope type %q not found in output %v", a, order) + continue + } + if !bOK { + t.Errorf("envelope type %q not found in output %v", b, order) + continue + } + if pa >= pb { + t.Errorf("expected %q before %q but positions are %d >= %d (order: %v)", a, b, pa, pb, order) + } + } + + // The repo.stats payload must have LastPruneAt set. + statsEnv := firstEnvOfType(t, tx.envs, api.MsgRepoStats) + var statsPayload api.RepoStatsPayload + if err := statsEnv.UnmarshalPayload(&statsPayload); err != nil { + t.Fatalf("unmarshal repo.stats payload: %v", err) + } + if statsPayload.LastPruneAt == nil { + t.Error("expected LastPruneAt to be set in repo.stats after prune") + } + + // The job.finished payload must indicate success. 
+ finEnv := firstEnvOfType(t, tx.envs, api.MsgJobFinished) + var finPayload api.JobFinishedPayload + if err := finEnv.UnmarshalPayload(&finPayload); err != nil { + t.Fatalf("unmarshal job.finished payload: %v", err) + } + if finPayload.Status != api.JobSucceeded { + t.Errorf("expected job.finished status=%q, got %q", api.JobSucceeded, finPayload.Status) + } +} + +// TestRunCheckShipsCheckStatus verifies that a check run which emits +// a stale-lock line on stderr (exit 0) reports LastCheckStatus="ok" +// and LockPresent=true. +func TestRunCheckShipsCheckStatus(t *testing.T) { + t.Parallel() + + statsJSON := `{"total_size":500,"total_uncompressed_size":600,"snapshots_count":1,"total_file_count":5}` + bin := setupScript(t, ` +case "$1" in + check) echo "Found stale lock" >&2; exit 0 ;; + stats) echo '`+statsJSON+`' ;; + *) exit 0 ;; +esac +`) + + tx := &fakeSender{} + r := New(Config{ResticBin: bin}, tx, 0) + if err := r.RunCheck(context.Background(), "job-2", 0); err != nil { + t.Fatalf("RunCheck: %v", err) + } + + // Assert envelope ordering: job.started → log.stream → repo.stats → job.finished. 
+ order := envelopeOrder(tx.envs) + wantTypes := []api.MessageType{api.MsgJobStarted, api.MsgLogStream, api.MsgRepoStats, api.MsgJobFinished} + positions := map[api.MessageType]int{} + for i, mt := range order { + if _, seen := positions[mt]; !seen { + positions[mt] = i + } + } + for i := 0; i < len(wantTypes)-1; i++ { + a, b := wantTypes[i], wantTypes[i+1] + pa, aOK := positions[a] + pb, bOK := positions[b] + if !aOK { + t.Errorf("envelope type %q not found in output %v", a, order) + continue + } + if !bOK { + t.Errorf("envelope type %q not found in output %v", b, order) + continue + } + if pa >= pb { + t.Errorf("expected %q before %q but positions are %d >= %d (order: %v)", a, b, pa, pb, order) + } + } + + statsEnv := firstEnvOfType(t, tx.envs, api.MsgRepoStats) + var p api.RepoStatsPayload + if err := statsEnv.UnmarshalPayload(&p); err != nil { + t.Fatalf("unmarshal: %v", err) + } + if p.LastCheckStatus != "ok" { + t.Errorf("LastCheckStatus: got %q, want %q", p.LastCheckStatus, "ok") + } + if p.LockPresent == nil || !*p.LockPresent { + t.Errorf("expected LockPresent=true, got %v", p.LockPresent) + } + if p.LastCheckAt == nil { + t.Error("expected LastCheckAt to be set") + } +} + +// TestRunCheckErrorsFoundShipsErrorsStatus verifies that a check run +// that exits 1 (errors found) reports LastCheckStatus="errors_found". +func TestRunCheckErrorsFoundShipsErrorsStatus(t *testing.T) { + t.Parallel() + + statsJSON := `{"total_size":500,"total_uncompressed_size":600,"snapshots_count":1,"total_file_count":5}` + bin := setupScript(t, ` +case "$1" in + check) exit 1 ;; + stats) echo '`+statsJSON+`' ;; + *) exit 0 ;; +esac +`) + + tx := &fakeSender{} + r := New(Config{ResticBin: bin}, tx, 0) + // RunCheck returns nil for exit 1 (errors_found is not a wrapper failure). + if err := r.RunCheck(context.Background(), "job-3", 0); err != nil { + t.Fatalf("RunCheck: %v", err) + } + + // Assert envelope ordering: job.started → repo.stats → job.finished. 
+ // (No log.stream expected because the fake script produces no + // output before exit 1 — a real restic check would emit log lines + // before exiting non-zero.) + order := envelopeOrder(tx.envs) + wantTypes := []api.MessageType{api.MsgJobStarted, api.MsgRepoStats, api.MsgJobFinished} + positions := map[api.MessageType]int{} + for i, mt := range order { + if _, seen := positions[mt]; !seen { + positions[mt] = i + } + } + for i := 0; i < len(wantTypes)-1; i++ { + a, b := wantTypes[i], wantTypes[i+1] + pa, aOK := positions[a] + pb, bOK := positions[b] + if !aOK { + t.Errorf("envelope type %q not found in output %v", a, order) + continue + } + if !bOK { + t.Errorf("envelope type %q not found in output %v", b, order) + continue + } + if pa >= pb { + t.Errorf("expected %q before %q but positions are %d >= %d (order: %v)", a, b, pa, pb, order) + } + } + + statsEnv := firstEnvOfType(t, tx.envs, api.MsgRepoStats) + var p api.RepoStatsPayload + if err := statsEnv.UnmarshalPayload(&p); err != nil { + t.Fatalf("unmarshal: %v", err) + } + if p.LastCheckStatus != "errors_found" { + t.Errorf("LastCheckStatus: got %q, want %q", p.LastCheckStatus, "errors_found") + } +} + +// TestRunUnlockClearsLock verifies that a successful unlock ships a +// repo.stats envelope with LockPresent=false. +func TestRunUnlockClearsLock(t *testing.T) { + t.Parallel() + + statsJSON := `{"total_size":100,"total_uncompressed_size":150,"snapshots_count":2,"total_file_count":8}` + bin := setupScript(t, ` +case "$1" in + unlock) echo "removed 1 locks" ;; + stats) echo '`+statsJSON+`' ;; + *) exit 0 ;; +esac +`) + + tx := &fakeSender{} + r := New(Config{ResticBin: bin}, tx, 0) + if err := r.RunUnlock(context.Background(), "job-4"); err != nil { + t.Fatalf("RunUnlock: %v", err) + } + + // Assert envelope ordering: job.started → log.stream → repo.stats → job.finished. 
+ order := envelopeOrder(tx.envs) + wantTypes := []api.MessageType{api.MsgJobStarted, api.MsgLogStream, api.MsgRepoStats, api.MsgJobFinished} + positions := map[api.MessageType]int{} + for i, mt := range order { + if _, seen := positions[mt]; !seen { + positions[mt] = i + } + } + for i := 0; i < len(wantTypes)-1; i++ { + a, b := wantTypes[i], wantTypes[i+1] + pa, aOK := positions[a] + pb, bOK := positions[b] + if !aOK { + t.Errorf("envelope type %q not found in output %v", a, order) + continue + } + if !bOK { + t.Errorf("envelope type %q not found in output %v", b, order) + continue + } + if pa >= pb { + t.Errorf("expected %q before %q but positions are %d >= %d (order: %v)", a, b, pa, pb, order) + } + } + + statsEnv := firstEnvOfType(t, tx.envs, api.MsgRepoStats) + var p api.RepoStatsPayload + if err := statsEnv.UnmarshalPayload(&p); err != nil { + t.Fatalf("unmarshal: %v", err) + } + if p.LockPresent == nil { + t.Fatal("expected LockPresent to be set (non-nil)") + } + if *p.LockPresent { + t.Errorf("expected LockPresent=false after unlock, got true") + } +} + +// TestRunInitShipsStartedAndFinished confirms the refactored RunInit +// still produces job.started and job.finished envelopes. +func TestRunInitShipsStartedAndFinished(t *testing.T) { + t.Parallel() + bin := setupScript(t, `echo "initialized repository"`) + tx := &fakeSender{} + r := New(Config{ResticBin: bin}, tx, 0) + if err := r.RunInit(context.Background(), "job-init"); err != nil { + t.Fatalf("RunInit: %v", err) + } + _ = firstEnvOfType(t, tx.envs, api.MsgJobStarted) + _ = firstEnvOfType(t, tx.envs, api.MsgJobFinished) +} + +// TestRunForgetShipsStartedAndFinished confirms the refactored +// RunForget still produces job.started and job.finished envelopes. +func TestRunForgetShipsStartedAndFinished(t *testing.T) { + t.Parallel() + // Script handles both "forget --json ..." and "snapshots --json" calls. 
+ bin := setupScript(t, ` +case "$1" in + forget) echo "[]" ;; + snapshots) echo "[]" ;; + *) exit 0 ;; +esac +`) + tx := &fakeSender{} + r := New(Config{ResticBin: bin}, tx, 0) + keepLast := 1 + groups := []restic.ForgetGroup{{ + Tag: "documents", + Policy: restic.ForgetPolicy{KeepLast: &keepLast}, + }} + if err := r.RunForget(context.Background(), "job-forget", groups); err != nil { + t.Fatalf("RunForget: %v", err) + } + _ = firstEnvOfType(t, tx.envs, api.MsgJobStarted) + _ = firstEnvOfType(t, tx.envs, api.MsgJobFinished) +} diff --git a/internal/agent/secrets/secrets.go b/internal/agent/secrets/secrets.go index f5bbc82..ff285e9 100644 --- a/internal/agent/secrets/secrets.go +++ b/internal/agent/secrets/secrets.go @@ -9,6 +9,7 @@ package secrets import ( + "bytes" "encoding/json" "errors" "fmt" @@ -24,6 +25,11 @@ import ( // depth — the key is per-host today, but cheap to be careful.) const additionalData = "rm-agent-repo-creds-v1" +// ErrNoAdmin is returned by LoadAdmin when no admin slot has been +// written yet. Callers must distinguish this from a hard error: the +// agent simply hasn't received an admin config.update push yet. +var ErrNoAdmin = errors.New("secrets: admin slot not configured") + // Repo is the plaintext shape persisted inside the AEAD blob. type Repo struct { URL string `json:"repo_url,omitempty"` @@ -35,6 +41,15 @@ type Repo struct { // minimum (URL + password) needed to run a backup. func (r Repo) Empty() bool { return r.URL == "" || r.Password == "" } +// bundle is the on-disk JSON shape as of secrets v2. It holds the +// everyday repo slot and an optional admin slot (prune / unlock). +// Legacy files (pre-v2) contain a flat Repo object; loadBundle +// transparently upgrades those on the next Save. +type bundle struct { + Repo Repo `json:"repo,omitempty"` + Admin *Repo `json:"admin,omitempty"` +} + // Store reads and writes the encrypted secrets file at Path, sealed // under the 32-byte key Key. 
type Store struct { @@ -55,32 +70,47 @@ func New(path string, key []byte) (*Store, error) { return &Store{path: path, a: a}, nil } -// Load returns the persisted Repo, or a zero-value Repo (with no -// error) if the file does not exist yet — first-run agents have -// nothing on disk until the server pushes a config.update. -func (s *Store) Load() (Repo, error) { +// loadBundle reads and decrypts the on-disk blob, returning a bundle. +// It handles back-compat decode: legacy flat Repo blobs are detected +// by the presence of a top-level "repo_url" key and re-wrapped into +// the bundle shape transparently. Returns an empty bundle when the +// file does not exist yet. +func (s *Store) loadBundle() (bundle, error) { body, err := os.ReadFile(s.path) if err != nil { if errors.Is(err, os.ErrNotExist) { - return Repo{}, nil + return bundle{}, nil } - return Repo{}, fmt.Errorf("secrets: read %q: %w", s.path, err) + return bundle{}, fmt.Errorf("secrets: read %q: %w", s.path, err) } plain, err := s.a.Decrypt(string(body), []byte(additionalData)) if err != nil { - return Repo{}, fmt.Errorf("secrets: decrypt %q: %w", s.path, err) + return bundle{}, fmt.Errorf("secrets: decrypt %q: %w", s.path, err) } - var r Repo - if err := json.Unmarshal(plain, &r); err != nil { - return Repo{}, fmt.Errorf("secrets: parse %q: %w", s.path, err) + + // Try the new bundle shape first. + var b bundle + if err := json.Unmarshal(plain, &b); err != nil { + return bundle{}, fmt.Errorf("secrets: parse %q: %w", s.path, err) } - return r, nil + + // If the bundle has an empty Repo slot but the raw JSON contains + // a top-level "repo_url" key, this is a legacy flat blob — + // re-unmarshal it as a Repo and slot it in. + if b.Repo == (Repo{}) && bytes.Contains(plain, []byte(`"repo_url"`)) { + var legacy Repo + if err := json.Unmarshal(plain, &legacy); err == nil { + b.Repo = legacy + } + } + + return b, nil } -// Save replaces the on-disk blob atomically. Mode is 0600. 
Parent -// directory must already exist (the install script lays it down). -func (s *Store) Save(r Repo) error { - body, err := json.Marshal(r) +// saveBundle marshals b, encrypts it and writes it atomically at +// mode 0600. Parent directory must already exist. +func (s *Store) saveBundle(b bundle) error { + body, err := json.Marshal(b) if err != nil { return fmt.Errorf("secrets: marshal: %w", err) } @@ -115,3 +145,50 @@ func (s *Store) Save(r Repo) error { } return nil } + +// Load returns the persisted Repo (the everyday repo slot), or a +// zero-value Repo (with no error) if the file does not exist yet — +// first-run agents have nothing on disk until the server pushes a +// config.update. +func (s *Store) Load() (Repo, error) { + b, err := s.loadBundle() + if err != nil { + return Repo{}, err + } + return b.Repo, nil +} + +// Save replaces the repo slot on disk atomically, preserving the +// admin slot. Mode is 0600. Parent directory must already exist. +func (s *Store) Save(r Repo) error { + b, err := s.loadBundle() + if err != nil { + return fmt.Errorf("secrets: load before save: %w", err) + } + b.Repo = r + return s.saveBundle(b) +} + +// LoadAdmin returns the admin slot, or (Repo{}, ErrNoAdmin) when no +// admin slot has been set. All other errors are hard failures. +func (s *Store) LoadAdmin() (Repo, error) { + b, err := s.loadBundle() + if err != nil { + return Repo{}, err + } + if b.Admin == nil { + return Repo{}, ErrNoAdmin + } + return *b.Admin, nil +} + +// SaveAdmin replaces the admin slot on disk atomically, preserving +// the repo slot. Mode is 0600. 
+func (s *Store) SaveAdmin(r Repo) error { + b, err := s.loadBundle() + if err != nil { + return fmt.Errorf("secrets: load before save: %w", err) + } + b.Admin = &r + return s.saveBundle(b) +} diff --git a/internal/agent/secrets/secrets_test.go b/internal/agent/secrets/secrets_test.go index 07fa57b..2737cf5 100644 --- a/internal/agent/secrets/secrets_test.go +++ b/internal/agent/secrets/secrets_test.go @@ -2,6 +2,8 @@ package secrets import ( "crypto/rand" + "encoding/json" + "errors" "io" "os" "path/filepath" @@ -97,3 +99,211 @@ func TestSaveIsAtomic(t *testing.T) { t.Errorf("dir should hold one file post-save, got %v", names) } } + +func TestSecretsLoadAdminEmpty(t *testing.T) { + t.Parallel() + // No file yet: LoadAdmin must return ErrNoAdmin, not a hard error. + dir := t.TempDir() + path := filepath.Join(dir, "secrets.enc") + st, err := New(path, freshKey(t)) + if err != nil { + t.Fatalf("new: %v", err) + } + _, err = st.LoadAdmin() + if !errors.Is(err, ErrNoAdmin) { + t.Errorf("expected ErrNoAdmin, got %v", err) + } +} + +func TestSecretsAdminSlotIndependent(t *testing.T) { + t.Parallel() + dir := t.TempDir() + path := filepath.Join(dir, "secrets.enc") + st, err := New(path, freshKey(t)) + if err != nil { + t.Fatalf("new: %v", err) + } + + repo := Repo{URL: "rest:https://repo/host", Username: "user", Password: "pw"} + admin := Repo{URL: "rest:https://repo/host", Username: "admin", Password: "adminpw"} + + if err := st.Save(repo); err != nil { + t.Fatalf("save repo: %v", err) + } + if err := st.SaveAdmin(admin); err != nil { + t.Fatalf("save admin: %v", err) + } + + // Load returns the repo slot unchanged. + gotRepo, err := st.Load() + if err != nil { + t.Fatalf("load: %v", err) + } + if gotRepo != repo { + t.Errorf("repo slot mismatch: got %+v want %+v", gotRepo, repo) + } + + // LoadAdmin returns the admin slot. 
+ gotAdmin, err := st.LoadAdmin() + if err != nil { + t.Fatalf("load admin: %v", err) + } + if gotAdmin != admin { + t.Errorf("admin slot mismatch: got %+v want %+v", gotAdmin, admin) + } + + // SaveAdmin a second time replaces admin only; repo unchanged. + admin2 := Repo{URL: "rest:https://repo/host", Username: "admin2", Password: "pw2"} + if err := st.SaveAdmin(admin2); err != nil { + t.Fatalf("save admin2: %v", err) + } + gotRepo2, err := st.Load() + if err != nil { + t.Fatalf("load after admin2 save: %v", err) + } + if gotRepo2 != repo { + t.Errorf("repo slot changed unexpectedly: got %+v want %+v", gotRepo2, repo) + } + gotAdmin2, err := st.LoadAdmin() + if err != nil { + t.Fatalf("load admin2: %v", err) + } + if gotAdmin2 != admin2 { + t.Errorf("admin2 slot mismatch: got %+v want %+v", gotAdmin2, admin2) + } +} + +func TestSecretsSaveRefusesCorruptFile(t *testing.T) { + t.Parallel() + dir := t.TempDir() + path := filepath.Join(dir, "secrets.enc") + st, err := New(path, freshKey(t)) + if err != nil { + t.Fatalf("new: %v", err) + } + + // Lay down a valid file first. + if err := st.Save(Repo{URL: "rest:https://r/host", Password: "pw"}); err != nil { + t.Fatalf("initial save: %v", err) + } + + // Corrupt the file. + garbage := []byte("not encrypted") + if err := os.WriteFile(path, garbage, 0o600); err != nil { + t.Fatalf("write garbage: %v", err) + } + + // Save must refuse to overwrite: decrypt will fail. + saveErr := st.Save(Repo{URL: "rest:https://r/host", Password: "new"}) + if saveErr == nil { + t.Fatal("Save over corrupt file must return an error; got nil") + } + + // File must NOT have been replaced — still contains the garbage bytes. 
+ got, err := os.ReadFile(path) + if err != nil { + t.Fatalf("re-read: %v", err) + } + if string(got) != string(garbage) { + t.Errorf("corrupt file was overwritten; file size now %d (was %d)", len(got), len(garbage)) + } +} + +func TestSecretsSaveAdminRefusesCorruptFile(t *testing.T) { + t.Parallel() + dir := t.TempDir() + path := filepath.Join(dir, "secrets.enc") + st, err := New(path, freshKey(t)) + if err != nil { + t.Fatalf("new: %v", err) + } + + // Lay down a valid file first. + if err := st.SaveAdmin(Repo{URL: "rest:https://r/host", Password: "adminpw"}); err != nil { + t.Fatalf("initial save admin: %v", err) + } + + // Corrupt the file. + garbage := []byte("not encrypted admin") + if err := os.WriteFile(path, garbage, 0o600); err != nil { + t.Fatalf("write garbage: %v", err) + } + + // SaveAdmin must refuse to overwrite: decrypt will fail. + saveErr := st.SaveAdmin(Repo{URL: "rest:https://r/host", Password: "new"}) + if saveErr == nil { + t.Fatal("SaveAdmin over corrupt file must return an error; got nil") + } + + // File must NOT have been replaced. + got, err := os.ReadFile(path) + if err != nil { + t.Fatalf("re-read: %v", err) + } + if string(got) != string(garbage) { + t.Errorf("corrupt file was overwritten; file size now %d (was %d)", len(got), len(garbage)) + } +} + +func TestSecretsLegacyFlatBlobMigrates(t *testing.T) { + t.Parallel() + dir := t.TempDir() + path := filepath.Join(dir, "secrets.enc") + key := freshKey(t) + + // Write a legacy flat Repo blob directly — bypassing bundle wrapping. 
+ legacy := Repo{URL: "rest:https://legacy/host", Username: "legacyuser", Password: "legacypw"} + plain, err := json.Marshal(legacy) + if err != nil { + t.Fatalf("marshal legacy: %v", err) + } + a, err := crypto.NewAEAD(key) + if err != nil { + t.Fatalf("aead: %v", err) + } + ct, err := a.Encrypt(plain, []byte(additionalData)) + if err != nil { + t.Fatalf("encrypt legacy: %v", err) + } + if err := os.WriteFile(path, []byte(ct), 0o600); err != nil { + t.Fatalf("write legacy file: %v", err) + } + + // Open via secrets.New + Load — must return the legacy Repo. + st, err := New(path, key) + if err != nil { + t.Fatalf("new: %v", err) + } + got, err := st.Load() + if err != nil { + t.Fatalf("load legacy: %v", err) + } + if got != legacy { + t.Errorf("legacy decode mismatch: got %+v want %+v", got, legacy) + } + + // SaveAdmin should write both slots; re-opening must have both. + admin := Repo{URL: "rest:https://legacy/host", Username: "admin", Password: "adminpw"} + if err := st.SaveAdmin(admin); err != nil { + t.Fatalf("save admin after legacy: %v", err) + } + + st2, err := New(path, key) + if err != nil { + t.Fatalf("reopen: %v", err) + } + gotRepo, err := st2.Load() + if err != nil { + t.Fatalf("load repo after migration: %v", err) + } + if gotRepo != legacy { + t.Errorf("repo after migration: got %+v want %+v", gotRepo, legacy) + } + gotAdmin, err := st2.LoadAdmin() + if err != nil { + t.Fatalf("load admin after migration: %v", err) + } + if gotAdmin != admin { + t.Errorf("admin after migration: got %+v want %+v", gotAdmin, admin) + } +} diff --git a/internal/api/messages.go b/internal/api/messages.go index c93cad6..816d203 100644 --- a/internal/api/messages.go +++ b/internal/api/messages.go @@ -77,6 +77,30 @@ const ( JobCancelled JobStatus = "cancelled" //nolint:misspell // wire format ) +// ForgetPolicyJSON is the wire shape of a per-group retention policy +// shipped with a forget command.run. 
Mirrors store.RetentionPolicy +// JSON tags exactly so a future caller could json-roundtrip between +// the two without reshaping. All fields nullable; an empty struct is +// rejected by the agent (restic refuses to forget without --keep-*). +type ForgetPolicyJSON struct { + KeepLast *int `json:"keep_last,omitempty"` + KeepHourly *int `json:"keep_hourly,omitempty"` + KeepDaily *int `json:"keep_daily,omitempty"` + KeepWeekly *int `json:"keep_weekly,omitempty"` + KeepMonthly *int `json:"keep_monthly,omitempty"` + KeepYearly *int `json:"keep_yearly,omitempty"` +} + +// ForgetGroup is one (tag, retention) pair shipped to the agent in a +// forget command.run. The agent invokes +// `restic forget --tag --keep-* …` once per group, with each +// group's own policy. The Tag is the source-group name (which is +// also the snapshot tag carried at backup time). +type ForgetGroup struct { + Tag string `json:"tag"` + Policy ForgetPolicyJSON `json:"policy"` +} + // CommandRunPayload is the server → agent dispatch for a run-now job. // // For kind=backup, Includes/Excludes/Tag are populated from the source @@ -85,19 +109,27 @@ const ( // the source group's name) so retention can target it later via // `restic forget --tag`. // -// For kind=forget, RetentionPolicy is the typed keep-* set as raw JSON -// (the agent doesn't share the store package's typed struct). +// For kind=forget, ForgetGroups carries one entry per source-group on +// the host that has a non-empty retention policy. The agent walks the +// list and runs `restic forget --tag --keep-* …` per group. // // Args is preserved as a generic free-form slice for kinds that don't -// fit the structured fields (e.g. unlock takes none; init takes none). +// fit the structured fields (e.g. unlock takes none; init takes none; +// check carries the subset% as Args[0]). +// +// RequiresAdminCreds tells the agent to load the admin slot of its +// secrets store rather than the everyday repo slot. 
Set by the server +// only for prune (the only kind that needs delete authority on a +// rest-server repo today). type CommandRunPayload struct { - JobID string `json:"job_id"` - Kind JobKind `json:"kind"` - Args []string `json:"args,omitempty"` - Includes []string `json:"includes,omitempty"` - Excludes []string `json:"excludes,omitempty"` - Tag string `json:"tag,omitempty"` - RetentionPolicy json.RawMessage `json:"retention_policy,omitempty"` + JobID string `json:"job_id"` + Kind JobKind `json:"kind"` + Args []string `json:"args,omitempty"` + Includes []string `json:"includes,omitempty"` + Excludes []string `json:"excludes,omitempty"` + Tag string `json:"tag,omitempty"` + ForgetGroups []ForgetGroup `json:"forget_groups,omitempty"` + RequiresAdminCreds bool `json:"requires_admin_creds,omitempty"` } // CommandCancelPayload is the server → agent cancel signal. @@ -186,15 +218,24 @@ type Snapshot struct { FileCount int64 `json:"file_count,omitempty"` } -// RepoStatsPayload — agent reports periodic repo health facts derived -// from `restic stats` and lock-file inspection. +// RepoStatsPayload carries a partial-update snapshot of repo health +// facts, shipped by the agent after prune/check/unlock or a periodic +// stats refresh. Pointer fields follow omitempty semantics: a nil +// pointer means "no update for this field" and is omitted on the +// wire; the server merges only the non-nil fields into its +// host_repo_stats row (matching UpsertHostRepoStats partial-update +// semantics). Non-pointer fields (LastCheckStatus) use the empty +// string as the "no update" sentinel. 
type RepoStatsPayload struct { - SizeBytes int64 `json:"size_bytes"` - SnapshotCount int `json:"snapshot_count"` - DedupRatio float64 `json:"dedup_ratio"` - LastCheckAt time.Time `json:"last_check_at,omitempty"` - LastCheckStatus string `json:"last_check_status,omitempty"` - LockState string `json:"lock_state"` // locked|unlocked + TotalSizeBytes *int64 `json:"total_size_bytes,omitempty"` + RawSizeBytes *int64 `json:"raw_size_bytes,omitempty"` + UniqueFiles *int64 `json:"unique_files,omitempty"` + SnapshotCount *int64 `json:"snapshot_count,omitempty"` + LastCheckAt *time.Time `json:"last_check_at,omitempty"` + LastCheckStatus string `json:"last_check_status,omitempty"` + LockPresent *bool `json:"lock_present,omitempty"` + LastPruneAt *time.Time `json:"last_prune_at,omitempty"` + LastPruneFreedBytes *int64 `json:"last_prune_freed_bytes,omitempty"` } // Schedule is the agent-facing view of a slim Schedule row plus its @@ -252,12 +293,19 @@ type ScheduleFirePayload struct { // ConfigUpdatePayload — server pushes per-host config (currently just // repo connection details). Empty fields mean "leave existing alone"; // to clear something, send an explicit zero value. +// +// Slot picks which secrets-store slot the agent writes the creds to. +// Empty / "repo" = everyday repo creds (default). "admin" = the +// prune-capable admin user (separate slot — not loaded for backups). +// Forwards-compatible: an agent that ignores Slot simply writes to the +// repo slot and admin pushes become no-ops. 
type ConfigUpdatePayload struct { RepoURL string `json:"repo_url,omitempty"` RepoPassword string `json:"repo_password,omitempty"` // sensitive RepoUsername string `json:"repo_username,omitempty"` RepoCredential string `json:"repo_credential,omitempty"` // sensitive (for rest server basic auth) HookShell string `json:"hook_shell,omitempty"` + Slot string `json:"slot,omitempty"` } // AgentUpdateAvailablePayload — informational only; the agent does diff --git a/internal/api/version.go b/internal/api/version.go index 1e9d1f5..0720162 100644 --- a/internal/api/version.go +++ b/internal/api/version.go @@ -12,3 +12,15 @@ const CurrentProtocolVersion = 1 // server accepts in a hello. Agents below this are disconnected with // a structured error pointing at the upgrade docs. const MinAgentProtocolVersion = 1 + +// Phase 5 (P2R-03..P2R-08, branch p2r-phase5-maintenance, 2026-05) reshaped +// CommandRunPayload (RetentionPolicy removed, ForgetGroups added, RequiresAdminCreds added), +// ConfigUpdatePayload (Slot added), and RepoStatsPayload (full reshape). +// The protocol version was deliberately NOT bumped because: +// 1. This project deploys agent and server in lockstep from the same release. +// 2. There is no supported "rolling upgrade" path with mixed agent/server versions. +// 3. The smoke env restage block in CLAUDE.md restages the agent binary on +// every server build for exactly this reason. +// +// If a multi-version protocol path is ever introduced, every Phase 5 wire +// change is a breaking change and the version must bump to 2 at that time. diff --git a/internal/api/wire_test.go b/internal/api/wire_test.go index 095f2c6..c0b4b96 100644 --- a/internal/api/wire_test.go +++ b/internal/api/wire_test.go @@ -138,6 +138,85 @@ func TestJobProgressShapeStable(t *testing.T) { } } +func TestRepoStatsPayloadRoundTrip(t *testing.T) { + t.Parallel() + + // Nil pointer fields must be omitted from JSON output. 
+ empty := RepoStatsPayload{} + raw, err := json.Marshal(empty) + if err != nil { + t.Fatalf("marshal empty: %v", err) + } + if string(raw) != "{}" { + t.Errorf("empty payload should marshal to {}, got %s", raw) + } + + // Populated fields must survive a round trip. + total := int64(123456) + rawSize := int64(200000) + files := int64(42) + snaps := int64(7) + lockPresent := true + now := time.Date(2026, 1, 2, 3, 4, 5, 0, time.UTC) + pruneAt := time.Date(2026, 1, 3, 0, 0, 0, 0, time.UTC) + freed := int64(8192) + + p := RepoStatsPayload{ + TotalSizeBytes: &total, + RawSizeBytes: &rawSize, + UniqueFiles: &files, + SnapshotCount: &snaps, + LastCheckAt: &now, + LastCheckStatus: "ok", + LockPresent: &lockPresent, + LastPruneAt: &pruneAt, + LastPruneFreedBytes: &freed, + } + raw2, err := json.Marshal(p) + if err != nil { + t.Fatalf("marshal full: %v", err) + } + var got RepoStatsPayload + if err := json.Unmarshal(raw2, &got); err != nil { + t.Fatalf("unmarshal: %v", err) + } + if got.TotalSizeBytes == nil || *got.TotalSizeBytes != total { + t.Errorf("TotalSizeBytes: got %v, want %d", got.TotalSizeBytes, total) + } + if got.RawSizeBytes == nil || *got.RawSizeBytes != rawSize { + t.Errorf("RawSizeBytes: got %v, want %d", got.RawSizeBytes, rawSize) + } + if got.UniqueFiles == nil || *got.UniqueFiles != files { + t.Errorf("UniqueFiles: got %v, want %d", got.UniqueFiles, files) + } + if got.SnapshotCount == nil || *got.SnapshotCount != snaps { + t.Errorf("SnapshotCount: got %v, want %d", got.SnapshotCount, snaps) + } + if got.LastCheckAt == nil || !got.LastCheckAt.Equal(now) { + t.Errorf("LastCheckAt: got %v, want %v", got.LastCheckAt, now) + } + if got.LastCheckStatus != "ok" { + t.Errorf("LastCheckStatus: got %q, want %q", got.LastCheckStatus, "ok") + } + if got.LockPresent == nil || *got.LockPresent != lockPresent { + t.Errorf("LockPresent: got %v, want %v", got.LockPresent, lockPresent) + } + if got.LastPruneAt == nil || !got.LastPruneAt.Equal(pruneAt) { + 
t.Errorf("LastPruneAt: got %v, want %v", got.LastPruneAt, pruneAt) + } + if got.LastPruneFreedBytes == nil || *got.LastPruneFreedBytes != freed { + t.Errorf("LastPruneFreedBytes: got %v, want %d", got.LastPruneFreedBytes, freed) + } + + // Partial update: only set LockPresent. + lockFalse := false + partial := RepoStatsPayload{LockPresent: &lockFalse} + rawPartial, _ := json.Marshal(partial) + if string(rawPartial) != `{"lock_present":false}` { + t.Errorf("partial marshal: got %s, want {\"lock_present\":false}", rawPartial) + } +} + // touch time so the import is used by other tests in this file when // they grow over time. var _ = time.Now diff --git a/internal/restic/runner.go b/internal/restic/runner.go index 05721af..51675ff 100644 --- a/internal/restic/runner.go +++ b/internal/restic/runner.go @@ -151,8 +151,7 @@ func (e Env) RunBackup(ctx context.Context, paths, excludes, tags []string, hand } // ForgetPolicy mirrors restic forget's --keep-* flags. All optional; -// nil/zero means "don't pass that flag." Caller passes whatever the -// schedule's RetentionPolicy carries. +// nil/zero means "don't pass that flag." type ForgetPolicy struct { KeepLast *int KeepHourly *int @@ -181,53 +180,47 @@ func (p ForgetPolicy) args() []string { return out } -// Empty reports whether no retention dimensions are set. restic -// forget refuses to run without at least one keep-* flag (it would -// delete every snapshot), so the agent rejects empty policies before -// invoking restic. +// Empty reports whether no retention dimensions are set. func (p ForgetPolicy) Empty() bool { return p.KeepLast == nil && p.KeepHourly == nil && p.KeepDaily == nil && p.KeepWeekly == nil && p.KeepMonthly == nil && p.KeepYearly == nil } -// RunForget executes `restic forget --keep-* … --json` against the -// configured repo. Does NOT pass --prune — pruning lives behind a -// separate, admin-only credential (see spec §4.3 / P2-06). 
Restic -// just rewrites the snapshot index; the actual data deletion waits -// for the next prune. Returns nil on a clean exit. -func (e Env) RunForget(ctx context.Context, policy ForgetPolicy, handle LineHandler) error { - if policy.Empty() { - return fmt.Errorf("restic forget: refusing to run with empty retention policy (would delete every snapshot)") - } - args := append([]string{"forget", "--json"}, policy.args()...) - cmd := exec.CommandContext(ctx, e.Bin, args...) - cmd.Env = e.envSlice() - cmd.Dir = e.WorkDir +// ForgetGroup is one (tag, retention-policy) pair fed to RunForget. +// The wrapper invokes `restic forget --tag --keep-* …` per +// group so retention can be targeted at a single source-group's +// snapshots without disturbing snapshots tagged for other groups. +type ForgetGroup struct { + Tag string + Policy ForgetPolicy +} - stdout, err := cmd.StdoutPipe() - if err != nil { - return fmt.Errorf("restic forget: stdout pipe: %w", err) +// RunForget executes one `restic forget --tag --keep-* …` +// invocation per group. Does NOT pass --prune — pruning lives behind +// a separate admin-only credential (see spec §4.3 / P2-06). Restic +// rewrites the snapshot index; the actual data deletion waits for +// the next prune. Empty groups slice is rejected (would be a no-op); +// any group with an empty policy is rejected (restic forget without +// any keep-* would delete every snapshot in the tagged set). +// Returns the first error encountered, or nil when every group runs +// to a clean exit. 
+func (e Env) RunForget(ctx context.Context, groups []ForgetGroup, handle LineHandler) error { + if len(groups) == 0 { + return fmt.Errorf("restic forget: refusing to run with no groups (would be a no-op)") } - stderr, err := cmd.StderrPipe() - if err != nil { - return fmt.Errorf("restic forget: stderr pipe: %w", err) - } - - if err := cmd.Start(); err != nil { - return fmt.Errorf("restic forget: start: %w", err) - } - - done := make(chan error, 2) - go func() { done <- pumpPlain(stdout, "stdout", handle) }() - go func() { done <- pumpPlain(stderr, "stderr", handle) }() - for i := 0; i < 2; i++ { - if err := <-done; err != nil && handle != nil { - handle("event", fmt.Sprintf("pump error: %v", err), nil) + for _, g := range groups { + if g.Policy.Empty() { + return fmt.Errorf("restic forget: group %q has empty retention policy (would delete every snapshot)", g.Tag) + } + args := []string{"forget", "--json", "--tag", g.Tag} + args = append(args, g.Policy.args()...) + cmd := exec.CommandContext(ctx, e.Bin, args...) 
+ cmd.Env = e.envSlice() + cmd.Dir = e.WorkDir + if err := runWithPump(cmd, handle); err != nil { + return err } - } - if werr := cmd.Wait(); werr != nil { - return fmt.Errorf("restic forget: %w", werr) } return nil } @@ -243,19 +236,6 @@ func (e Env) RunInit(ctx context.Context, handle LineHandler) error { cmd.Env = e.envSlice() cmd.Dir = e.WorkDir - stdout, err := cmd.StdoutPipe() - if err != nil { - return fmt.Errorf("restic init: stdout pipe: %w", err) - } - stderr, err := cmd.StderrPipe() - if err != nil { - return fmt.Errorf("restic init: stderr pipe: %w", err) - } - - if err := cmd.Start(); err != nil { - return fmt.Errorf("restic init: start: %w", err) - } - // Sniff for "config file already exists" on stderr; if we see it // we'll treat the non-zero exit as a soft success — running init // against an already-initialized repo is a no-op semantically, @@ -271,26 +251,166 @@ func (e Env) RunInit(ctx context.Context, handle LineHandler) error { } } - done := make(chan error, 2) - go func() { done <- pumpPlain(stdout, "stdout", sniff) }() - go func() { done <- pumpPlain(stderr, "stderr", sniff) }() - for i := 0; i < 2; i++ { - if err := <-done; err != nil && handle != nil { - handle("event", fmt.Sprintf("pump error: %v", err), nil) - } - } - if werr := cmd.Wait(); werr != nil { + if err := runWithPump(cmd, sniff); err != nil { if alreadyInited { if handle != nil { handle("event", "repo already initialized — treating as success", nil) } return nil } - return fmt.Errorf("restic init: %w", werr) + return err } return nil } +// RunPrune executes `restic prune` against the configured repo. +// Requires the *admin* credentials (delete access on the rest-server +// repo) — the caller is responsible for populating Env.RepoUsername +// and Env.RepoPassword with the admin pair before calling this. +// +// Prune emits human-readable progress on stdout/stderr (no --json +// support that's useful for our purposes). 
We tee everything to the +// handler so the live log is the operator's progress bar. +func (e Env) RunPrune(ctx context.Context, handle LineHandler) error { + cmd := exec.CommandContext(ctx, e.Bin, "prune") + cmd.Env = e.envSlice() + cmd.Dir = e.WorkDir + return runWithPump(cmd, handle) +} + +// runWithPump starts the configured cmd, fans stdout+stderr into +// pumpPlain via the supplied handler, waits, and wraps any error +// with the cmd's verb (e.g., "restic prune") for context. +func runWithPump(cmd *exec.Cmd, handle LineHandler) error { + label := "restic" + if len(cmd.Args) > 1 { + label = "restic " + cmd.Args[1] + } + stdout, err := cmd.StdoutPipe() + if err != nil { + return fmt.Errorf("%s: stdout pipe: %w", label, err) + } + stderr, err := cmd.StderrPipe() + if err != nil { + return fmt.Errorf("%s: stderr pipe: %w", label, err) + } + if err := cmd.Start(); err != nil { + return fmt.Errorf("%s: start: %w", label, err) + } + done := make(chan error, 2) + go func() { done <- pumpPlain(stdout, "stdout", handle) }() + go func() { done <- pumpPlain(stderr, "stderr", handle) }() + for i := 0; i < 2; i++ { + if err := <-done; err != nil && handle != nil { + handle("event", fmt.Sprintf("pump error: %v", err), nil) + } + } + if werr := cmd.Wait(); werr != nil { + return fmt.Errorf("%s: %w", label, werr) + } + return nil +} + +// RunUnlock executes `restic unlock`. Returns nil on a clean exit. +func (e Env) RunUnlock(ctx context.Context, handle LineHandler) error { + cmd := exec.CommandContext(ctx, e.Bin, "unlock") + cmd.Env = e.envSlice() + cmd.Dir = e.WorkDir + return runWithPump(cmd, handle) +} + +// RepoStats mirrors `restic stats --json --mode raw-data` output. 
+type RepoStats struct { + TotalSize int64 `json:"total_size"` + TotalUncompressed int64 `json:"total_uncompressed_size"` + SnapshotsCount int64 `json:"snapshots_count"` + TotalFileCount int64 `json:"total_file_count"` + TotalBlobCount int64 `json:"total_blob_count"` +} + +// RunStats executes `restic stats --json --mode raw-data` and parses +// the (single-line) JSON response. Tees raw output to handle so the +// caller can still log it. Returns an error if no JSON-shaped line +// arrived on stdout. +func (e Env) RunStats(ctx context.Context, handle LineHandler) (*RepoStats, error) { + cmd := exec.CommandContext(ctx, e.Bin, "stats", "--json", "--mode", "raw-data") + cmd.Env = e.envSlice() + cmd.Dir = e.WorkDir + var out *RepoStats + capture := func(stream, line string, ev any) { + if stream == "stdout" && strings.HasPrefix(line, "{") { + var s RepoStats + if json.Unmarshal([]byte(line), &s) == nil { + cp := s + out = &cp + } + } + if handle != nil { + handle(stream, line, ev) + } + } + if err := runWithPump(cmd, capture); err != nil { + return nil, err + } + if out == nil { + return nil, fmt.Errorf("restic stats: no JSON in output") + } + return out, nil +} + +// CheckResult summarizes a `restic check` invocation. LockPresent is +// true if the stderr stream contained a stale-lock signal (caller is +// expected to surface this in the UI so the operator can run unlock). +// ErrorsFound is true if check exited with a non-zero status (errors +// detected in repo metadata). +type CheckResult struct { + LockPresent bool + ErrorsFound bool +} + +// RunCheck executes `restic check` with optional --read-data-subset. +// subsetPct of 0 omits the flag (full data check); >0 passes +// --read-data-subset N%. Returns a CheckResult summarizing what was +// sniffed from stderr; the result is set even if check itself +// returns an error (so the caller can persist last_check_status). 
+func (e Env) RunCheck(ctx context.Context, subsetPct int, handle LineHandler) (CheckResult, error) { + args := []string{"check"} + if subsetPct > 0 { + args = append(args, "--read-data-subset", fmt.Sprintf("%d%%", subsetPct)) + } + cmd := exec.CommandContext(ctx, e.Bin, args...) + cmd.Env = e.envSlice() + cmd.Dir = e.WorkDir + + var res CheckResult + sniff := func(stream, line string, ev any) { + if stream == "stderr" { + if strings.Contains(line, "stale lock") || strings.Contains(line, "already locked") { + res.LockPresent = true + } + } + if handle != nil { + handle(stream, line, ev) + } + } + + err := runWithPump(cmd, sniff) + if err != nil { + // restic check exits non-zero when corruption is found; that's + // a CheckResult, not a wrapper failure. Treat ExitError as + // "errors found" but still return the result so the caller can + // persist last_check_status='errors_found'. Reserve the error + // return for actually-broken invocations (binary missing, etc). + var ee *exec.ExitError + if errors.As(err, &ee) { + res.ErrorsFound = true + return res, nil + } + return res, err + } + return res, nil +} + func pumpPlain(r io.Reader, stream string, handle LineHandler) error { scanner := bufio.NewScanner(r) scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024) diff --git a/internal/restic/runner_test.go b/internal/restic/runner_test.go new file mode 100644 index 0000000..a2d6708 --- /dev/null +++ b/internal/restic/runner_test.go @@ -0,0 +1,193 @@ +package restic + +import ( + "context" + "fmt" + "os" + "path/filepath" + "strings" + "testing" +) + +// setupScriptBin writes a small shell script to a temp directory, +// makes it executable, and returns its path. scriptBody is the +// complete script content (without the shebang line — that's added +// automatically). +// Writes to ".tmp" then renames into place — see the matching +// helper in internal/agent/runner/runner_test.go for the ETXTBSY +// race rationale. 
Same fix applied here so this helper doesn't lose +// the race the next time CI gets unlucky. +func setupScriptBin(t *testing.T, scriptBody string) string { + t.Helper() + dir := t.TempDir() + final := filepath.Join(dir, "restic") + tmp := final + ".tmp" + content := "#!/bin/sh\n" + scriptBody + "\n" + if err := os.WriteFile(tmp, []byte(content), 0o755); err != nil { + t.Fatalf("setupScriptBin: write tmp: %v", err) + } + if err := os.Rename(tmp, final); err != nil { + t.Fatalf("setupScriptBin: rename: %v", err) + } + return final +} + +// captureLines returns a LineHandler that appends "stream:line" into +// the returned slice pointer (unsynchronized: runWithPump calls the handler from separate stdout and stderr goroutines, so only use it with scripts that write to a single stream). +func captureLines() (*[]string, LineHandler) { + var lines []string + h := func(stream, line string, _ any) { + lines = append(lines, fmt.Sprintf("%s:%s", stream, line)) + } + return &lines, h +} + +// --- B1: RunPrune --- + +func TestRunPruneInvokesPrune(t *testing.T) { + // Shell script that echoes its args; "prune" should appear in output. + bin := setupScriptBin(t, `echo "$@"`) + env := Env{Bin: bin} + lines, h := captureLines() + if err := env.RunPrune(context.Background(), h); err != nil { + t.Fatalf("RunPrune returned error: %v", err) + } + for _, l := range *lines { + if strings.Contains(l, "prune") { + return + } + } + t.Fatalf("expected 'prune' in captured output; got: %v", *lines) +} + +// --- B2: RunCheck --- + +func TestRunCheckLockSniff(t *testing.T) { + cases := []struct { + name string + stderrLine string + wantLocked bool + }{ + {"stale lock", "Found stale lock from PID 1234", true}, + {"already locked", "repository is already locked exclusively", true}, + {"benign mention", "subdir/locked-file ok", false}, + {"empty", "", false}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + // Script emits the line on stderr, then exits 0. 
+ script := fmt.Sprintf(`printf '%%s\n' %q >&2`, c.stderrLine) + bin := setupScriptBin(t, script) + env := Env{Bin: bin} + res, err := env.RunCheck(context.Background(), 0, nil) + if err != nil { + t.Fatalf("RunCheck returned unexpected error: %v", err) + } + if res.LockPresent != c.wantLocked { + t.Fatalf("LockPresent: got %v, want %v (line: %q)", res.LockPresent, c.wantLocked, c.stderrLine) + } + if res.ErrorsFound { + t.Fatal("expected ErrorsFound=false") + } + }) + } +} + +func TestRunCheckErrorsFoundOnExit1(t *testing.T) { + bin := setupScriptBin(t, `exit 1`) + env := Env{Bin: bin} + res, err := env.RunCheck(context.Background(), 0, nil) + if err != nil { + t.Fatalf("RunCheck returned unexpected error (should have absorbed exit 1): %v", err) + } + if !res.ErrorsFound { + t.Fatal("expected ErrorsFound=true for exit 1") + } +} + +func TestRunCheckSubsetArg(t *testing.T) { + bin := setupScriptBin(t, `echo "$@"`) + env := Env{Bin: bin} + lines, h := captureLines() + if _, err := env.RunCheck(context.Background(), 25, h); err != nil { + t.Fatalf("RunCheck: %v", err) + } + want := "--read-data-subset 25%" + for _, l := range *lines { + if strings.Contains(l, want) { + return + } + } + t.Fatalf("expected %q in captured output; got: %v", want, *lines) +} + +// --- B3: RunUnlock + RunStats --- + +func TestRunUnlockInvokesUnlock(t *testing.T) { + bin := setupScriptBin(t, `echo "$@"`) + env := Env{Bin: bin} + lines, h := captureLines() + if err := env.RunUnlock(context.Background(), h); err != nil { + t.Fatalf("RunUnlock: %v", err) + } + for _, l := range *lines { + if strings.Contains(l, "unlock") { + return + } + } + t.Fatalf("expected 'unlock' in captured output; got: %v", *lines) +} + +func TestRunStatsParsesJSON(t *testing.T) { + bin := setupScriptBin(t, `echo '{"total_size":1234,"total_uncompressed_size":5678,"snapshots_count":3,"total_file_count":100,"total_blob_count":50}'`) + env := Env{Bin: bin} + stats, err := env.RunStats(context.Background(), nil) + if err 
!= nil { + t.Fatalf("RunStats: %v", err) + } + if stats.TotalSize != 1234 { + t.Fatalf("TotalSize: got %d, want 1234", stats.TotalSize) + } + if stats.TotalUncompressed != 5678 { + t.Fatalf("TotalUncompressed: got %d, want 5678", stats.TotalUncompressed) + } + if stats.SnapshotsCount != 3 { + t.Fatalf("SnapshotsCount: got %d, want 3", stats.SnapshotsCount) + } + if stats.TotalFileCount != 100 { + t.Fatalf("TotalFileCount: got %d, want 100", stats.TotalFileCount) + } + if stats.TotalBlobCount != 50 { + t.Fatalf("TotalBlobCount: got %d, want 50", stats.TotalBlobCount) + } +} + +func TestRunStatsErrorsWithoutJSON(t *testing.T) { + bin := setupScriptBin(t, `echo "no json here"`) + env := Env{Bin: bin} + _, err := env.RunStats(context.Background(), nil) + if err == nil { + t.Fatal("expected error when no JSON in output") + } + if !strings.Contains(err.Error(), "no JSON in output") { + t.Fatalf("unexpected error: %v", err) + } +} + +func TestRunStatsZeroSnapshots(t *testing.T) { + // Confirms RunStats succeeds and returns a valid *RepoStats when the + // repo has no snapshots (snapshots_count=0). A regression that + // re-added a "SnapshotsCount > 0" guard would return an error here. 
+ bin := setupScriptBin(t, `echo '{"total_size":0,"total_uncompressed_size":0,"snapshots_count":0,"total_file_count":0,"total_blob_count":0}'`) + env := Env{Bin: bin} + stats, err := env.RunStats(context.Background(), nil) + if err != nil { + t.Fatalf("RunStats with zero snapshots returned unexpected error: %v", err) + } + if stats == nil { + t.Fatal("expected non-nil *RepoStats, got nil") + } + if stats.SnapshotsCount != 0 { + t.Fatalf("SnapshotsCount: got %d, want 0", stats.SnapshotsCount) + } +} diff --git a/internal/server/http/enrollment.go b/internal/server/http/enrollment.go index 2706ea5..f1615e0 100644 --- a/internal/server/http/enrollment.go +++ b/internal/server/http/enrollment.go @@ -167,7 +167,7 @@ func (s *Server) handleAgentEnroll(w stdhttp.ResponseWriter, r *stdhttp.Request) // /api/hosts/{id}/repo-credentials. Failing the whole enrolment // here would leave a half-burned token + an orphan host. if encForHost != "" { - if err := s.deps.Store.SetHostCredentials(r.Context(), hostID, encForHost); err != nil { + if err := s.deps.Store.SetHostCredentials(r.Context(), hostID, store.CredKindRepo, encForHost); err != nil { slog.Error("enrollment: set host credentials failed", "host_id", hostID, "err", err) } diff --git a/internal/server/http/host_credentials.go b/internal/server/http/host_credentials.go index 5887a75..0060de3 100644 --- a/internal/server/http/host_credentials.go +++ b/internal/server/http/host_credentials.go @@ -4,6 +4,7 @@ import ( "context" "encoding/json" "errors" + "fmt" "log/slog" stdhttp "net/http" "time" @@ -39,7 +40,7 @@ func (s *Server) handleGetHostCredentials(w stdhttp.ResponseWriter, r *stdhttp.R writeJSONError(w, stdhttp.StatusBadRequest, "missing_id", "") return } - enc, err := s.deps.Store.GetHostCredentials(r.Context(), hostID) + enc, err := s.deps.Store.GetHostCredentials(r.Context(), hostID, store.CredKindRepo) if err != nil { if errors.Is(err, store.ErrNotFound) { writeJSONError(w, stdhttp.StatusNotFound, "not_set", "") 
@@ -85,7 +86,8 @@ type hostRepoCredsRequest struct { // preserved. Re-encrypts under host_id and pushes a config.update // over the WS if the agent is connected. func (s *Server) handleSetHostCredentials(w stdhttp.ResponseWriter, r *stdhttp.Request) { - if !s.authedUser(r) { + user, ok := s.requireUser(r) + if !ok { writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorized", "") return } @@ -107,7 +109,7 @@ func (s *Server) handleSetHostCredentials(w stdhttp.ResponseWriter, r *stdhttp.R // Merge with the existing row, if any. existing := repoCredsBlob{} - if cur, err := s.deps.Store.GetHostCredentials(r.Context(), hostID); err == nil { + if cur, err := s.deps.Store.GetHostCredentials(r.Context(), hostID, store.CredKindRepo); err == nil { plain, err := s.deps.AEAD.Decrypt(cur, []byte("host:"+hostID)) if err != nil { writeJSONError(w, stdhttp.StatusInternalServerError, "decrypt_failed", "") @@ -139,13 +141,14 @@ func (s *Server) handleSetHostCredentials(w stdhttp.ResponseWriter, r *stdhttp.R writeJSONError(w, stdhttp.StatusInternalServerError, "internal", "") return } - if err := s.deps.Store.SetHostCredentials(r.Context(), hostID, enc); err != nil { + if err := s.deps.Store.SetHostCredentials(r.Context(), hostID, store.CredKindRepo, enc); err != nil { writeJSONError(w, stdhttp.StatusInternalServerError, "internal", "") return } _ = s.deps.Store.AppendAudit(r.Context(), store.AuditEntry{ ID: ulid.Make().String(), + UserID: &user.ID, Actor: "user", Action: "host.repo_credentials_set", TargetKind: ptr("host"), @@ -184,6 +187,209 @@ func (s *Server) pushRepoCredsToAgent(ctx context.Context, hostID string, blob r return nil } +// handleGetAdminCredentials returns a redacted view of the host's admin +// creds for UI display. 404 if no admin slot has been set yet. Operator +// uses this to pre-fill the edit form. 
+func (s *Server) handleGetAdminCredentials(w stdhttp.ResponseWriter, r *stdhttp.Request) { + if !s.authedUser(r) { + writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorized", "") + return + } + hostID := chi.URLParam(r, "id") + if hostID == "" { + writeJSONError(w, stdhttp.StatusBadRequest, "missing_id", "") + return + } + enc, err := s.deps.Store.GetHostCredentials(r.Context(), hostID, store.CredKindAdmin) + if err != nil { + if errors.Is(err, store.ErrNotFound) { + writeJSONError(w, stdhttp.StatusNotFound, "not_set", "") + return + } + writeJSONError(w, stdhttp.StatusInternalServerError, "internal", "") + return + } + plain, err := s.deps.AEAD.Decrypt(enc, []byte("host:"+hostID+":admin")) + if err != nil { + writeJSONError(w, stdhttp.StatusInternalServerError, "decrypt_failed", "") + return + } + var blob repoCredsBlob + if err := json.Unmarshal(plain, &blob); err != nil { + writeJSONError(w, stdhttp.StatusInternalServerError, "internal", "") + return + } + writeJSON(w, stdhttp.StatusOK, hostRepoCredsView{ + RepoURL: blob.RepoURL, + RepoUsername: blob.RepoUsername, + HasPassword: blob.RepoPassword != "", + }) +} + +// handleSetAdminCredentials lets an operator/admin update a host's admin +// creds (the prune-capable slot). Same merge-then-validate semantics as +// handleSetHostCredentials but operates on store.CredKindAdmin. After +// persisting, pushes a config.update with Slot:"admin" over the WS if +// the agent is connected. 
+func (s *Server) handleSetAdminCredentials(w stdhttp.ResponseWriter, r *stdhttp.Request) { + user, ok := s.requireUser(r) + if !ok { + writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorized", "") + return + } + hostID := chi.URLParam(r, "id") + if hostID == "" { + writeJSONError(w, stdhttp.StatusBadRequest, "missing_id", "") + return + } + if _, err := s.deps.Store.GetHost(r.Context(), hostID); err != nil { + writeJSONError(w, stdhttp.StatusNotFound, "host_not_found", "") + return + } + + var req hostRepoCredsRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + writeJSONError(w, stdhttp.StatusBadRequest, "invalid_json", err.Error()) + return + } + + // Merge with the existing admin row, if any. + existing := repoCredsBlob{} + aad := []byte("host:" + hostID + ":admin") + if cur, err := s.deps.Store.GetHostCredentials(r.Context(), hostID, store.CredKindAdmin); err == nil { + plain, err := s.deps.AEAD.Decrypt(cur, aad) + if err != nil { + writeJSONError(w, stdhttp.StatusInternalServerError, "decrypt_failed", "") + return + } + _ = json.Unmarshal(plain, &existing) + } else if !errors.Is(err, store.ErrNotFound) { + writeJSONError(w, stdhttp.StatusInternalServerError, "internal", "") + return + } + + if req.RepoURL != nil { + existing.RepoURL = *req.RepoURL + } + if req.RepoUsername != nil { + existing.RepoUsername = *req.RepoUsername + } + if req.RepoPassword != nil { + existing.RepoPassword = *req.RepoPassword + } + if existing.RepoURL == "" || existing.RepoPassword == "" { + writeJSONError(w, stdhttp.StatusBadRequest, "missing_field", + "repo_url and repo_password must end up non-empty") + return + } + + enc, err := s.encryptRepoCreds(existing, aad) + if err != nil { + writeJSONError(w, stdhttp.StatusInternalServerError, "internal", "") + return + } + if err := s.deps.Store.SetHostCredentials(r.Context(), hostID, store.CredKindAdmin, enc); err != nil { + writeJSONError(w, stdhttp.StatusInternalServerError, "internal", "") + return + } + + _ = 
s.deps.Store.AppendAudit(r.Context(), store.AuditEntry{ + ID: ulid.Make().String(), + UserID: &user.ID, + Actor: "user", + Action: "host.admin_credentials_set", + TargetKind: ptr("host"), + TargetID: &hostID, + TS: nowUTC(), + }) + + // Push to the agent if it's connected. Non-fatal: the next + // handleRunRepoPrune call will push on-demand. + if s.deps.Hub != nil && s.deps.Hub.Connected(hostID) { + _ = s.pushAdminCredsToAgent(r.Context(), hostID) + } + + w.WriteHeader(stdhttp.StatusNoContent) +} + +// handleDeleteAdminCredentials removes the admin credentials row for the +// host. Returns 204 on success, 404 if the row wasn't set. Does NOT push +// a deletion to the agent — the agent's local admin slot stays as-is +// until the next deployment/reinstall. +func (s *Server) handleDeleteAdminCredentials(w stdhttp.ResponseWriter, r *stdhttp.Request) { + user, ok := s.requireUser(r) + if !ok { + writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorized", "") + return + } + hostID := chi.URLParam(r, "id") + if hostID == "" { + writeJSONError(w, stdhttp.StatusBadRequest, "missing_id", "") + return + } + + // Check existence first so we can 404 cleanly. 
+ if _, err := s.deps.Store.GetHostCredentials(r.Context(), hostID, store.CredKindAdmin); err != nil { + if errors.Is(err, store.ErrNotFound) { + writeJSONError(w, stdhttp.StatusNotFound, "not_set", "") + return + } + writeJSONError(w, stdhttp.StatusInternalServerError, "internal", "") + return + } + + if err := s.deps.Store.DeleteHostCredentials(r.Context(), hostID, store.CredKindAdmin); err != nil { + writeJSONError(w, stdhttp.StatusInternalServerError, "internal", "") + return + } + + _ = s.deps.Store.AppendAudit(r.Context(), store.AuditEntry{ + ID: ulid.Make().String(), + UserID: &user.ID, + Actor: "user", + Action: "host.admin_credentials_deleted", + TargetKind: ptr("host"), + TargetID: &hostID, + TS: nowUTC(), + }) + + w.WriteHeader(stdhttp.StatusNoContent) +} + +// pushAdminCredsToAgent ships the admin-slot config.update down the +// agent's WS. Used by: +// - handleSetAdminCredentials (immediate push when operator saves). +// - handleRunRepoPrune (on-demand push right before a prune dispatch). +// +// Returns store.ErrNotFound if no admin row exists for the host +// (the prune endpoint uses this to refuse with a clear message). 
+func (s *Server) pushAdminCredsToAgent(ctx context.Context, hostID string) error { + enc, err := s.deps.Store.GetHostCredentials(ctx, hostID, store.CredKindAdmin) + if err != nil { + return err // ErrNotFound bubbles + } + plain, err := s.deps.AEAD.Decrypt(enc, []byte("host:"+hostID+":admin")) + if err != nil { + return fmt.Errorf("push admin creds: decrypt: %w", err) + } + var blob repoCredsBlob + if err := json.Unmarshal(plain, &blob); err != nil { + return fmt.Errorf("push admin creds: parse: %w", err) + } + env, err := api.Marshal(api.MsgConfigUpdate, "", api.ConfigUpdatePayload{ + Slot: "admin", + RepoURL: blob.RepoURL, + RepoUsername: blob.RepoUsername, + RepoPassword: blob.RepoPassword, + }) + if err != nil { + return err + } + sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second) + defer cancel() + return s.deps.Hub.Send(sendCtx, hostID, env) +} + // onAgentHello runs synchronously inside the WS handler immediately // after a successful hello. It loads the host's encrypted creds (if // any), decrypts, and ships them down the conn as a config.update so @@ -205,6 +411,11 @@ func (s *Server) onAgentHello(ctx context.Context, hostID string, conn *ws.Conn) // just no-ops. Skipped silently when the host has no creds yet — // the next hello after the operator binds creds will dispatch. s.maybeAutoInit(ctx, hostID, conn) + // Drain any pending runs that accumulated while this host was + // offline. Use a fresh context — the hello-bound ctx is short-lived, + // and the drain may take seconds across many rows. A non-blocking + // goroutine keeps the hello path snappy. + go s.DrainPending(context.Background(), hostID) } // maybeAutoInit dispatches a `restic init` job iff the host has no @@ -212,7 +423,7 @@ func (s *Server) onAgentHello(ctx context.Context, hostID string, conn *ws.Conn) // them the runner can't talk to the repo). We rely on Restic's // idempotent init for re-runs. 
func (s *Server) maybeAutoInit(ctx context.Context, hostID string, conn *ws.Conn) { - if _, err := s.deps.Store.GetHostCredentials(ctx, hostID); err != nil { + if _, err := s.deps.Store.GetHostCredentials(ctx, hostID, store.CredKindRepo); err != nil { // No creds bound yet — operator hasn't supplied them. The next // hello after creds land will pick this up. return @@ -266,7 +477,7 @@ func (s *Server) maybeAutoInit(ctx context.Context, hostID string, conn *ws.Conn // credentials. Silent no-op when the host has nothing on file // (the operator hasn't bound creds to it yet). func (s *Server) pushRepoCredsOnHello(ctx context.Context, hostID string, conn *ws.Conn) { - enc, err := s.deps.Store.GetHostCredentials(ctx, hostID) + enc, err := s.deps.Store.GetHostCredentials(ctx, hostID, store.CredKindRepo) if err != nil { if !errors.Is(err, store.ErrNotFound) { slog.Warn("on-hello: load host creds", "host_id", hostID, "err", err) diff --git a/internal/server/http/host_credentials_test.go b/internal/server/http/host_credentials_test.go index af2d286..c38e553 100644 --- a/internal/server/http/host_credentials_test.go +++ b/internal/server/http/host_credentials_test.go @@ -5,6 +5,9 @@ import ( "encoding/json" "testing" "time" + + "gitea.dcglab.co.uk/steve/restic-manager/internal/api" + "gitea.dcglab.co.uk/steve/restic-manager/internal/store" ) // TestEnrollmentTransfersRepoCreds verifies the round-trip: @@ -57,12 +60,12 @@ func TestEnrollmentTransfersRepoCreds(t *testing.T) { hostID, "host42", "linux", "amd64", "2026-01-01T00:00:00Z"); err != nil { t.Fatalf("insert host: %v", err) } - if err := st.SetHostCredentials(ctx, hostID, encForHost); err != nil { + if err := st.SetHostCredentials(ctx, hostID, store.CredKindRepo, encForHost); err != nil { t.Fatalf("set host credentials: %v", err) } // host_credentials row should now hold the host-bound ciphertext. 
- got, err := st.GetHostCredentials(ctx, hostID) + got, err := st.GetHostCredentials(ctx, hostID, store.CredKindRepo) if err != nil { t.Fatalf("get host creds: %v", err) } @@ -105,3 +108,263 @@ func TestEnrollmentTokenWithoutCreds(t *testing.T) { t.Errorf("token without creds should return empty blob; got %q", att.EncRepoCreds) } } + +// ----- admin credentials tests ---------------------------------------- + +// TestAdminCredentialsRoundTrip verifies set→get→delete→get (404). +func TestAdminCredentialsRoundTrip(t *testing.T) { + t.Parallel() + srv, url, st := newTestServerWithHub(t) + cookie := loginAsAdmin(t, st) + hostID := makeHost(t, st, "admin-creds-host") + + // Mark init done so auto-init doesn't interfere. + _ = st.CreateJob(context.Background(), store.Job{ + ID: "init-" + hostID, + HostID: hostID, + Kind: string(api.JobInit), + ActorKind: "system", + CreatedAt: time.Now().UTC(), + }) + + // GET before set → 404. + status, body := doJSON(t, url, "GET", "/api/hosts/"+hostID+"/admin-credentials", nil, cookie) + if status != 404 { + t.Fatalf("before set: want 404, got %d body=%+v", status, body) + } + + // PUT — set admin creds. + status, body = doJSON(t, url, "PUT", "/api/hosts/"+hostID+"/admin-credentials", + map[string]any{ + "repo_url": "rest:http://admin.example/host", + "repo_username": "admin", + "repo_password": "s3cur3", + }, cookie) + if status != 204 { + t.Fatalf("set: want 204, got %d body=%+v", status, body) + } + + // GET — should return redacted view. + status, body = doJSON(t, url, "GET", "/api/hosts/"+hostID+"/admin-credentials", nil, cookie) + if status != 200 { + t.Fatalf("get after set: want 200, got %d body=%+v", status, body) + } + if body["repo_url"] != "rest:http://admin.example/host" { + t.Errorf("repo_url: %+v", body) + } + if body["repo_username"] != "admin" { + t.Errorf("repo_username: %+v", body) + } + if body["has_password"] != true { + t.Errorf("has_password: %+v", body) + } + + // DELETE. 
+ status, _ = doJSON(t, url, "DELETE", "/api/hosts/"+hostID+"/admin-credentials", nil, cookie) + if status != 204 { + t.Fatalf("delete: want 204, got %d", status) + } + + // GET after delete → 404. + status, _ = doJSON(t, url, "GET", "/api/hosts/"+hostID+"/admin-credentials", nil, cookie) + if status != 404 { + t.Fatalf("after delete: want 404, got %d", status) + } + + // srv is not otherwise used in this test; the blank assignment keeps the compiler from rejecting it as an unused variable. + _ = srv +} + +// TestAdminCredsAADIsolatedFromRepo writes a blob encrypted with the repo +// AAD ("host:"+hostID) into the admin kind slot, then GETs it — the handler +// should fail to decrypt and return 500 decrypt_failed. This proves the +// AAD scoping is real. +func TestAdminCredsAADIsolatedFromRepo(t *testing.T) { + t.Parallel() + srv, url, st := newTestServerWithHub(t) + cookie := loginAsAdmin(t, st) + hostID := makeHost(t, st, "aad-isolation-host") + + ctx := context.Background() + // Encrypt with the REPO AAD (wrong for admin slot). + enc, err := srv.encryptRepoCreds(repoCredsBlob{ + RepoURL: "rest:http://r/x", + RepoPassword: "p", + }, []byte("host:"+hostID)) // wrong AAD — repo, not admin + if err != nil { + t.Fatalf("encrypt: %v", err) + } + // Write it directly into the admin kind slot. + if err := st.SetHostCredentials(ctx, hostID, store.CredKindAdmin, enc); err != nil { + t.Fatalf("set host credentials: %v", err) + } + + // GET admin-credentials — handler decrypts with admin AAD, which + // is different, so decrypt must fail → 500. + status, body := doJSON(t, url, "GET", "/api/hosts/"+hostID+"/admin-credentials", nil, cookie) + if status != 500 { + t.Fatalf("want 500 (decrypt_failed), got %d body=%+v", status, body) + } + if code, _ := body["code"].(string); code != "decrypt_failed" { + t.Errorf("want code=decrypt_failed, got %+v", body) + } +} + +// TestAdminCredsPushOnSet connects a fake WS host, sets admin creds via +// PUT, drains the conn, and asserts a config.update with Slot:"admin" +// was shipped. 
+func TestAdminCredsPushOnSet(t *testing.T) { + t.Parallel() + srv, ts, st := rawTestServer(t) + hostID, token := enrolHostForWS(t, srv, st, "admin-push-host") + cookie := loginAsAdmin(t, st) + + c := agentDial(t, srv, ts, hostID, token) + sendHello(t, c, "admin-push-host") + + // Drain the on-hello burst (config.update for repo + schedule.set + // + possibly command.run(init)). + _ = drainUntil(t, c, api.MsgScheduleSet) + + // Now PUT admin creds — should trigger an immediate push. + status, body := doJSON(t, ts.URL, "PUT", "/api/hosts/"+hostID+"/admin-credentials", + map[string]any{ + "repo_url": "rest:http://admin.example/h", + "repo_username": "admin", + "repo_password": "prune-pass", + }, cookie) + if status != 204 { + t.Fatalf("set admin creds: want 204, got %d body=%+v", status, body) + } + + // Drain until we see a config.update with Slot=admin. + deadline := time.Now().Add(3 * time.Second) + found := false + for !found && time.Now().Before(deadline) { + env := readEnvelope(t, c) + if env.Type != api.MsgConfigUpdate { + continue + } + var p api.ConfigUpdatePayload + if err := env.UnmarshalPayload(&p); err != nil { + t.Fatalf("unmarshal config.update: %v", err) + } + if p.Slot == "admin" { + found = true + if p.RepoURL != "rest:http://admin.example/h" { + t.Errorf("admin push: wrong URL %q", p.RepoURL) + } + } + } + if !found { + t.Fatal("timed out waiting for config.update(slot=admin)") + } +} + +// TestDeleteAdminCredentialsAuditLogged checks that DELETE appends an +// audit row with action='host.admin_credentials_deleted' and that the +// row carries the acting user's ID. +func TestDeleteAdminCredentialsAuditLogged(t *testing.T) { + t.Parallel() + _, url, st := newTestServerWithHub(t) + cookie, userID := loginAsAdminWithID(t, st) + hostID := makeHost(t, st, "audit-del-host") + + ctx := context.Background() + + // Set admin creds first so there is something to delete. 
+ status, body := doJSON(t, url, "PUT", "/api/hosts/"+hostID+"/admin-credentials", + map[string]any{ + "repo_url": "rest:http://x/h", + "repo_password": "p", + }, cookie) + if status != 204 { + t.Fatalf("set: want 204, got %d body=%+v", status, body) + } + + // Delete. + status, _ = doJSON(t, url, "DELETE", "/api/hosts/"+hostID+"/admin-credentials", nil, cookie) + if status != 204 { + t.Fatalf("delete: want 204, got %d", status) + } + + // Query audit_log for the delete row — action, user_id. + rows, err := st.DB().QueryContext(ctx, + `SELECT action, user_id FROM audit_log WHERE target_id = ? AND target_kind = 'host' AND action = 'host.admin_credentials_deleted'`, + hostID) + if err != nil { + t.Fatalf("query audit: %v", err) + } + defer rows.Close() + + found := false + for rows.Next() { + var action string + var gotUserID *string + if err := rows.Scan(&action, &gotUserID); err != nil { + t.Fatalf("scan: %v", err) + } + found = true + if gotUserID == nil { + t.Error("audit row: user_id is NULL, want non-nil") + } else if *gotUserID != userID { + t.Errorf("audit row: user_id=%q, want %q", *gotUserID, userID) + } + } + if err := rows.Err(); err != nil { + t.Fatalf("rows: %v", err) + } + if !found { + t.Error("audit row with action='host.admin_credentials_deleted' not found") + } +} + +// TestSetAdminCredentialsAuditCarriesUserID checks that PUT +// /api/hosts/{id}/admin-credentials appends an audit row with the +// correct action and a non-nil UserID matching the acting session. 
+func TestSetAdminCredentialsAuditCarriesUserID(t *testing.T) { + t.Parallel() + _, url, st := newTestServerWithHub(t) + cookie, userID := loginAsAdminWithID(t, st) + hostID := makeHost(t, st, "audit-set-admin-host") + + ctx := context.Background() + + status, body := doJSON(t, url, "PUT", "/api/hosts/"+hostID+"/admin-credentials", + map[string]any{ + "repo_url": "rest:http://admin.example/h", + "repo_password": "s3cr3t", + }, cookie) + if status != 204 { + t.Fatalf("set: want 204, got %d body=%+v", status, body) + } + + rows, err := st.DB().QueryContext(ctx, + `SELECT action, user_id FROM audit_log WHERE target_id = ? AND target_kind = 'host' AND action = 'host.admin_credentials_set'`, + hostID) + if err != nil { + t.Fatalf("query audit: %v", err) + } + defer rows.Close() + + found := false + for rows.Next() { + var action string + var gotUserID *string + if err := rows.Scan(&action, &gotUserID); err != nil { + t.Fatalf("scan: %v", err) + } + found = true + if gotUserID == nil { + t.Error("audit row: user_id is NULL, want non-nil") + } else if *gotUserID != userID { + t.Errorf("audit row: user_id=%q, want %q", *gotUserID, userID) + } + } + if err := rows.Err(); err != nil { + t.Fatalf("rows: %v", err) + } + if !found { + t.Error("audit row with action='host.admin_credentials_set' not found") + } +} diff --git a/internal/server/http/jobs.go b/internal/server/http/jobs.go index e6afd7a..d4efa63 100644 --- a/internal/server/http/jobs.go +++ b/internal/server/http/jobs.go @@ -72,7 +72,7 @@ func (s *Server) dispatchJob(ctx context.Context, user *store.User, } // dispatchJobWithPayload is dispatchJob's variant that lets callers -// fill in structured fields (Includes/Excludes/Tag/RetentionPolicy) +// fill in structured fields (Includes/Excludes/Tag/ForgetGroups/RequiresAdminCreds) // — used by the per-source-group Run-now path. JobID is filled in // here; callers leave it zero on the input payload. 
func (s *Server) dispatchJobWithPayload(ctx context.Context, user *store.User,
diff --git a/internal/server/http/maintenance_dispatch.go b/internal/server/http/maintenance_dispatch.go
new file mode 100644
index 0000000..598533f
--- /dev/null
+++ b/internal/server/http/maintenance_dispatch.go
@@ -0,0 +1,132 @@
+// maintenance_dispatch.go bridges the pure-logic maintenance.Ticker
+// (internal/server/maintenance) to the side-effecting world: checks
+// online state, builds the per-kind command.run payload, and calls
+// dispatchJobWithPayload — the same path operator-triggered Run-now
+// uses. Cadence-driven jobs are persisted with actor_kind="system"
+// (dispatchJobWithPayload tags it that way when user==nil).
+//
+// Maintenance fires deliberately do NOT queue to pending_runs when
+// the host is offline — five missed prunes on a laptop returning
+// from a week away is not what the operator wants. Skip + log; the
+// next 60s tick will re-evaluate.
+package http
+
+import (
+	"context"
+	"errors"
+	"log/slog"
+	"strconv"
+
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/maintenance"
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
+)
+
+// DispatchMaintenance acts on each Decision from the ticker. Offline
+// hosts are skipped (logged); prune dispatches without admin creds
+// are skipped with only an info-level log — the operator hasn't
+// completed the admin-creds setup yet, and warning every minute would
+// just spam the logs. (Operator-triggered prune via the run-now
+// endpoint returns a clear error instead — different path, different UX.)
+func (s *Server) DispatchMaintenance(ctx context.Context, decisions []maintenance.Decision) {
+	for _, d := range decisions {
+		// Maintenance is never queued for an offline host; the decision
+		// is dropped and only logged.
+		if !s.deps.Hub.Connected(d.HostID) {
+			slog.Info("maintenance: host offline, skipping",
+				"host_id", d.HostID, "kind", d.Kind)
+			continue
+		}
+		switch d.Kind {
+		case "forget":
+			// ok=false covers both "no group has a retention policy" and
+			// a failed group listing (which the builder logs itself), so
+			// the message below can be misleading in the error case.
+			payload, ok := s.buildForgetPayloadForHost(ctx, d.HostID)
+			if !ok {
+				slog.Info("maintenance: forget skipped — no source groups with retention",
+					"host_id", d.HostID)
+				continue
+			}
+			// user=nil: this is a cadence-driven dispatch, not an operator action.
+			// A non-empty code is treated as a failed dispatch and only logged.
+			_, _, code, msg := s.dispatchJobWithPayload(ctx, nil, d.HostID, api.JobForget, payload)
+			if code != "" {
+				slog.Warn("maintenance: forget dispatch failed",
+					"host_id", d.HostID, "code", code, "msg", msg)
+			}
+		case "prune":
+			// Only the existence of admin creds is checked here; the
+			// returned value is discarded. A missing row is an expected
+			// state (info), any other store error is a warning.
+			if _, err := s.deps.Store.GetHostCredentials(ctx, d.HostID, store.CredKindAdmin); err != nil {
+				if errors.Is(err, store.ErrNotFound) {
+					slog.Info("maintenance: prune skipped — no admin creds",
+						"host_id", d.HostID)
+					continue
+				}
+				slog.Warn("maintenance: prune skipped — admin creds error",
+					"host_id", d.HostID, "err", err)
+				continue
+			}
+			// Push the admin creds to the agent before sending the prune
+			// command that is flagged as requiring them.
+			if err := s.pushAdminCredsToAgent(ctx, d.HostID); err != nil {
+				slog.Warn("maintenance: prune push admin creds failed",
+					"host_id", d.HostID, "err", err)
+				continue
+			}
+			payload := api.CommandRunPayload{RequiresAdminCreds: true}
+			_, _, code, msg := s.dispatchJobWithPayload(ctx, nil, d.HostID, api.JobPrune, payload)
+			if code != "" {
+				slog.Warn("maintenance: prune dispatch failed",
+					"host_id", d.HostID, "code", code, "msg", msg)
+			}
+		case "check":
+			// The subset percentage travels as the single command argument,
+			// formatted as a decimal string.
+			payload := api.CommandRunPayload{Args: []string{strconv.Itoa(d.SubsetPct)}}
+			_, _, code, msg := s.dispatchJobWithPayload(ctx, nil, d.HostID, api.JobCheck, payload)
+			if code != "" {
+				slog.Warn("maintenance: check dispatch failed",
+					"host_id", d.HostID, "code", code, "msg", msg)
+			}
+		default:
+			slog.Warn("maintenance: unknown decision kind",
+				"host_id", d.HostID, "kind", d.Kind)
+		}
+	}
+}
+
+// buildForgetPayloadForHost collects every source group on the host
+// that has a non-empty retention policy and builds a CommandRunPayload
+// with ForgetGroups populated. Returns ok=false if the host has no
+// such groups (the dispatcher then skips this kind). A store error
+// while listing groups is logged here and also reported as ok=false.
+func (s *Server) buildForgetPayloadForHost(ctx context.Context, hostID string) (api.CommandRunPayload, bool) {
+	groups, err := s.deps.Store.ListSourceGroupsByHost(ctx, hostID)
+	if err != nil {
+		slog.Warn("maintenance: list source groups failed", "host_id", hostID, "err", err)
+		return api.CommandRunPayload{}, false
+	}
+	fg := make([]api.ForgetGroup, 0, len(groups))
+	for _, g := range groups {
+		// Groups without any keep-* rule are left out of the payload.
+		if isEmptyRetention(g.RetentionPolicy) {
+			continue
+		}
+		// The group name is used as the forget tag.
+		fg = append(fg, api.ForgetGroup{
+			Tag:    g.Name,
+			Policy: forgetPolicyJSONFromStore(g.RetentionPolicy),
+		})
+	}
+	if len(fg) == 0 {
+		return api.CommandRunPayload{}, false
+	}
+	return api.CommandRunPayload{ForgetGroups: fg}, true
+}
+
+// isEmptyRetention reports whether none of the six keep-* fields of p
+// (last, hourly, daily, weekly, monthly, yearly) is set. Only nil-ness
+// is checked; a non-nil pointer counts as "set" whatever it points at.
+func isEmptyRetention(p store.RetentionPolicy) bool {
+	return p.KeepLast == nil && p.KeepHourly == nil &&
+		p.KeepDaily == nil && p.KeepWeekly == nil &&
+		p.KeepMonthly == nil && p.KeepYearly == nil
+}
+
+// forgetPolicyJSONFromStore copies retention pointers from the store
+// view to the wire view. Both shapes are field-for-field identical;
+// this avoids importing store from internal/api (which would invert
+// the dependency direction). The pointers are copied as-is, so the
+// result shares its pointees with p.
+func forgetPolicyJSONFromStore(p store.RetentionPolicy) api.ForgetPolicyJSON {
+	return api.ForgetPolicyJSON{
+		KeepLast:    p.KeepLast,
+		KeepHourly:  p.KeepHourly,
+		KeepDaily:   p.KeepDaily,
+		KeepWeekly:  p.KeepWeekly,
+		KeepMonthly: p.KeepMonthly,
+		KeepYearly:  p.KeepYearly,
+	}
+}
diff --git a/internal/server/http/maintenance_dispatch_test.go b/internal/server/http/maintenance_dispatch_test.go
new file mode 100644
index 0000000..9702cf5
--- /dev/null
+++ b/internal/server/http/maintenance_dispatch_test.go
@@ -0,0 +1,304 @@
+// maintenance_dispatch_test.go — exercises Server.DispatchMaintenance
+// directly (one Decision at a time).
Reuses the same fake-agent
+// harness as p2r01_ws_test / repo_ops_test: a real Server with a
+// real Hub, plus a websocket connected as the host. We then push
+// Decisions through DispatchMaintenance and assert the envelopes
+// the agent receives + the job rows that land.
+package http
+
+import (
+	"context"
+	"encoding/json"
+	"testing"
+	"time"
+
+	"github.com/coder/websocket"
+	"github.com/oklog/ulid/v2"
+
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/maintenance"
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
+)
+
+// readNextCommandRun pulls envelopes until a command.run lands or the
+// deadline passes. Returns nil if the deadline is hit or if any read
+// fails — each read has its own 600ms timeout, so a socket that stays
+// quiet for 600ms ends the wait before the deadline. Non-text frames,
+// undecodable envelopes, other message types, and command.run
+// envelopes whose payload does not decode are skipped.
+func readNextCommandRun(t *testing.T, c *websocket.Conn, deadline time.Time) *api.CommandRunPayload {
+	t.Helper()
+	for time.Now().Before(deadline) {
+		ctx, cancel := context.WithTimeout(context.Background(), 600*time.Millisecond)
+		mt, raw, err := c.Read(ctx)
+		cancel()
+		if err != nil {
+			// Includes the per-read timeout; the caller sees "nothing arrived".
+			return nil
+		}
+		if mt != websocket.MessageText {
+			continue
+		}
+		var env api.Envelope
+		if err := json.Unmarshal(raw, &env); err != nil {
+			continue
+		}
+		if env.Type != api.MsgCommandRun {
+			continue
+		}
+		var p api.CommandRunPayload
+		if err := env.UnmarshalPayload(&p); err != nil {
+			continue
+		}
+		return &p
+	}
+	return nil
+}
+
+// TestDispatchMaintenanceSkipsOfflineHosts: host not connected → no
+// envelope, no job row.
+func TestDispatchMaintenanceSkipsOfflineHosts(t *testing.T) {
+	t.Parallel()
+	srv, _, st := rawTestServer(t)
+	// Enrolled but never dialled, so the hub has no connection for it.
+	hostID, _ := enrolHostForWS(t, srv, st, "offline-host")
+
+	srv.DispatchMaintenance(context.Background(), []maintenance.Decision{
+		{HostID: hostID, Kind: "check", SubsetPct: 10},
+	})
+
+	var n int
+	if err := st.DB().QueryRow(
+		`SELECT COUNT(*) FROM jobs WHERE host_id = ?`, hostID).Scan(&n); err != nil {
+		t.Fatalf("count: %v", err)
+	}
+	if n != 0 {
+		t.Errorf("offline host produced %d job rows; want 0", n)
+	}
+}
+
+// TestDispatchMaintenanceForgetShipsForgetGroups: connected host with
+// two source groups (one with retention, one without). Decision of
+// kind=forget → command.run with ForgetGroups containing only the
+// group that had retention.
+func TestDispatchMaintenanceForgetShipsForgetGroups(t *testing.T) {
+	t.Parallel()
+	srv, ts, st := rawTestServer(t)
+	hostID, token := enrolHostForWS(t, srv, st, "forget-host")
+	seedInitJob(t, st, hostID)
+
+	// "documents" has KeepLast=7; "ephemeral" has no retention at all.
+	keep := 7
+	if err := st.CreateSourceGroup(context.Background(), &store.SourceGroup{
+		ID: ulid.Make().String(), HostID: hostID, Name: "documents",
+		Includes:        []string{"/home/documents"},
+		RetentionPolicy: store.RetentionPolicy{KeepLast: &keep},
+	}); err != nil {
+		t.Fatalf("group docs: %v", err)
+	}
+	if err := st.CreateSourceGroup(context.Background(), &store.SourceGroup{
+		ID: ulid.Make().String(), HostID: hostID, Name: "ephemeral",
+		Includes: []string{"/tmp"},
+	}); err != nil {
+		t.Fatalf("group eph: %v", err)
+	}
+
+	// Connect as the agent and read past the hello-time traffic up to
+	// the schedule.set envelope before triggering the dispatch.
+	c := agentDial(t, srv, ts, hostID, token)
+	sendHello(t, c, "forget-host")
+	_ = drainUntil(t, c, api.MsgScheduleSet)
+
+	srv.DispatchMaintenance(context.Background(), []maintenance.Decision{
+		{HostID: hostID, Kind: "forget"},
+	})
+
+	got := readNextCommandRun(t, c, time.Now().Add(2*time.Second))
+	if got == nil {
+		t.Fatal("no command.run received")
+	}
+	if got.Kind != api.JobForget {
+		t.Errorf("kind: got %q, want %q", got.Kind, api.JobForget)
+	}
+	if len(got.ForgetGroups) != 1 {
+		t.Fatalf("ForgetGroups: got %d entries (%+v), want 1", len(got.ForgetGroups), got.ForgetGroups)
+	}
+	if got.ForgetGroups[0].Tag != "documents" {
+		t.Errorf("forget group tag: got %q, want %q", got.ForgetGroups[0].Tag, "documents")
+	}
+	if got.ForgetGroups[0].Policy.KeepLast == nil || *got.ForgetGroups[0].Policy.KeepLast != 7 {
+		t.Errorf("forget group policy: got %+v", got.ForgetGroups[0].Policy)
+	}
+
+	// Job row must be persisted with actor_kind=system.
+	var actor string
+	if err := st.DB().QueryRow(
+		`SELECT actor_kind FROM jobs WHERE host_id = ? AND kind = 'forget'`, hostID).Scan(&actor); err != nil {
+		t.Fatalf("query actor_kind: %v", err)
+	}
+	if actor != "system" {
+		t.Errorf("actor_kind: got %q, want system", actor)
+	}
+}
+
+// TestDispatchMaintenanceForgetSkipsHostWithNoRetention: connected
+// host, but every source group has empty retention → no envelope.
+func TestDispatchMaintenanceForgetSkipsHostWithNoRetention(t *testing.T) {
+	t.Parallel()
+	srv, ts, st := rawTestServer(t)
+	hostID, token := enrolHostForWS(t, srv, st, "no-ret-host")
+	seedInitJob(t, st, hostID)
+	if err := st.CreateSourceGroup(context.Background(), &store.SourceGroup{
+		ID: ulid.Make().String(), HostID: hostID, Name: "ephemeral",
+		Includes: []string{"/tmp"},
+	}); err != nil {
+		t.Fatalf("group: %v", err)
+	}
+
+	c := agentDial(t, srv, ts, hostID, token)
+	sendHello(t, c, "no-ret-host")
+	_ = drainUntil(t, c, api.MsgScheduleSet)
+
+	srv.DispatchMaintenance(context.Background(), []maintenance.Decision{
+		{HostID: hostID, Kind: "forget"},
+	})
+
+	// Negative check: nothing should arrive within the short window,
+	// and no forget job row should have been written.
+	if got := readNextCommandRun(t, c, time.Now().Add(800*time.Millisecond)); got != nil {
+		t.Errorf("unexpected command.run: %+v", got)
+	}
+	var n int
+	if err := st.DB().QueryRow(`SELECT COUNT(*) FROM jobs WHERE host_id = ? AND kind = 'forget'`, hostID).Scan(&n); err != nil {
+		t.Fatalf("count: %v", err)
+	}
+	if n != 0 {
+		t.Errorf("forget job rows: got %d, want 0", n)
+	}
+}
+
+// TestDispatchMaintenancePruneSkipsWithoutAdminCreds: no admin creds
+// row → no envelope, no job row, silent skip.
+func TestDispatchMaintenancePruneSkipsWithoutAdminCreds(t *testing.T) {
+	t.Parallel()
+	srv, ts, st := rawTestServer(t)
+	hostID, token := enrolHostForWS(t, srv, st, "no-admin-host")
+	seedInitJob(t, st, hostID)
+
+	c := agentDial(t, srv, ts, hostID, token)
+	sendHello(t, c, "no-admin-host")
+	_ = drainUntil(t, c, api.MsgScheduleSet)
+
+	srv.DispatchMaintenance(context.Background(), []maintenance.Decision{
+		{HostID: hostID, Kind: "prune"},
+	})
+
+	if got := readNextCommandRun(t, c, time.Now().Add(800*time.Millisecond)); got != nil {
+		t.Errorf("unexpected command.run: %+v", got)
+	}
+	var n int
+	if err := st.DB().QueryRow(`SELECT COUNT(*) FROM jobs WHERE host_id = ? AND kind = 'prune'`, hostID).Scan(&n); err != nil {
+		t.Fatalf("count: %v", err)
+	}
+	if n != 0 {
+		t.Errorf("prune job rows: got %d, want 0", n)
+	}
+}
+
+// TestDispatchMaintenancePruneShipsConfigUpdateThenCommandRun: with
+// admin creds set, prune dispatch must push admin config.update first
+// then command.run(prune, RequiresAdminCreds=true).
+func TestDispatchMaintenancePruneShipsConfigUpdateThenCommandRun(t *testing.T) {
+	t.Parallel()
+	srv, ts, st := rawTestServer(t)
+	hostID, token := enrolHostForWS(t, srv, st, "prune-mt-host")
+	setAdminCreds(t, srv, st, hostID)
+	seedInitJob(t, st, hostID)
+
+	c := agentDial(t, srv, ts, hostID, token)
+	sendHello(t, c, "prune-mt-host")
+	_ = drainUntil(t, c, api.MsgScheduleSet)
+
+	srv.DispatchMaintenance(context.Background(), []maintenance.Decision{
+		{HostID: hostID, Kind: "prune"},
+	})
+
+	// Read until we've seen both config.update(slot=admin) and the
+	// prune command.run. The loop stops at the prune command.run, so
+	// sawAdminPush can only be true if the push arrived before it.
+	deadline := time.Now().Add(3 * time.Second)
+	var sawAdminPush bool
+	var prunePayload *api.CommandRunPayload
+	for prunePayload == nil && time.Now().Before(deadline) {
+		ctx, cancel := context.WithTimeout(context.Background(), 600*time.Millisecond)
+		mt, raw, err := c.Read(ctx)
+		cancel()
+		if err != nil {
+			break
+		}
+		if mt != websocket.MessageText {
+			continue
+		}
+		var env api.Envelope
+		if err := json.Unmarshal(raw, &env); err != nil {
+			continue
+		}
+		switch env.Type {
+		case api.MsgConfigUpdate:
+			var p api.ConfigUpdatePayload
+			if err := env.UnmarshalPayload(&p); err == nil && p.Slot == "admin" {
+				sawAdminPush = true
+			}
+		case api.MsgCommandRun:
+			var p api.CommandRunPayload
+			if err := env.UnmarshalPayload(&p); err == nil && p.Kind == api.JobPrune {
+				cp := p
+				prunePayload = &cp
+			}
+		}
+	}
+	if !sawAdminPush {
+		t.Error("expected config.update(slot=admin) before prune dispatch")
+	}
+	if prunePayload == nil {
+		t.Fatal("timed out waiting for command.run(prune)")
+	}
+	if !prunePayload.RequiresAdminCreds {
+		t.Error("prune command.run must have RequiresAdminCreds=true")
+	}
+
+	// Persisted job must be system actor.
+	var actor string
+	if err := st.DB().QueryRow(
+		`SELECT actor_kind FROM jobs WHERE host_id = ? AND kind = 'prune'`, hostID).Scan(&actor); err != nil {
+		t.Fatalf("query actor_kind: %v", err)
+	}
+	if actor != "system" {
+		t.Errorf("actor_kind: got %q, want system", actor)
+	}
+}
+
+// TestDispatchMaintenanceCheckCarriesSubset: Decision SubsetPct=15 →
+// command.run.Args == ["15"]. Job row actor_kind=system.
+func TestDispatchMaintenanceCheckCarriesSubset(t *testing.T) {
+	t.Parallel()
+	srv, ts, st := rawTestServer(t)
+	hostID, token := enrolHostForWS(t, srv, st, "check-mt-host")
+	seedInitJob(t, st, hostID)
+
+	c := agentDial(t, srv, ts, hostID, token)
+	sendHello(t, c, "check-mt-host")
+	_ = drainUntil(t, c, api.MsgScheduleSet)
+
+	srv.DispatchMaintenance(context.Background(), []maintenance.Decision{
+		{HostID: hostID, Kind: "check", SubsetPct: 15},
+	})
+
+	got := readNextCommandRun(t, c, time.Now().Add(2*time.Second))
+	if got == nil {
+		t.Fatal("no command.run received")
+	}
+	if got.Kind != api.JobCheck {
+		t.Errorf("kind: got %q, want %q", got.Kind, api.JobCheck)
+	}
+	// The percentage is expected as one decimal-string argument.
+	if len(got.Args) != 1 || got.Args[0] != "15" {
+		t.Errorf("Args: got %+v, want [15]", got.Args)
+	}
+
+	// The persisted job row must be attributed to the system actor.
+	var actor string
+	if err := st.DB().QueryRow(
+		`SELECT actor_kind FROM jobs WHERE host_id = ? AND kind = 'check'`, hostID).Scan(&actor); err != nil {
+		t.Fatalf("query actor_kind: %v", err)
+	}
+	if actor != "system" {
+		t.Errorf("actor_kind: got %q, want system", actor)
+	}
+}
diff --git a/internal/server/http/p2r01_test.go b/internal/server/http/p2r01_test.go
index 6863e87..07451db 100644
--- a/internal/server/http/p2r01_test.go
+++ b/internal/server/http/p2r01_test.go
@@ -47,6 +47,32 @@ func loginAsAdmin(t *testing.T, st *store.Store) *stdhttp.Cookie {
 	return &stdhttp.Cookie{Name: sessionCookieName, Value: tok}
 }
 
+// loginAsAdminWithID is like loginAsAdmin but also returns the user ID.
+// Use this when tests need to assert that the user ID was recorded
+// (e.g. on audit entries).
+func loginAsAdminWithID(t *testing.T, st *store.Store) (*stdhttp.Cookie, string) {
+	t.Helper()
+	ctx := context.Background()
+	uid := ulid.Make().String()
+	// NOTE(review): the HashPassword error is discarded; a failure here
+	// would surface later as a confusing login/DB error.
+	hash, _ := auth.HashPassword("very-long-test-password")
+	// The username embeds the first six characters of the new ID.
+	if err := st.CreateUser(ctx, store.User{
+		ID: uid, Username: "tester-" + uid[:6],
+		PasswordHash: hash, Role: store.RoleAdmin,
+		CreatedAt: time.Now().UTC(),
+	}); err != nil {
+		t.Fatalf("create user: %v", err)
+	}
+	// NOTE(review): the NewToken error is discarded as well.
+	tok, _ := auth.NewToken()
+	// The session is stored under the token's hash and expires in one
+	// hour; the raw token goes into the returned cookie.
+	if err := st.CreateSession(ctx, store.Session{
+		UserID:    uid,
+		CreatedAt: time.Now().UTC(),
+		ExpiresAt: time.Now().Add(time.Hour).UTC(),
+	}, auth.HashToken(tok)); err != nil {
+		t.Fatalf("create session: %v", err)
+	}
+	return &stdhttp.Cookie{Name: sessionCookieName, Value: tok}, uid
+}
+
 // makeHost inserts a minimal Host row directly via the store. Used by
 // HTTP-level tests that don't want to go through the full enrollment
 // path. Returns the host id.
diff --git a/internal/server/http/p2r01_ws_test.go b/internal/server/http/p2r01_ws_test.go
index bc3c57a..23bb9a0 100644
--- a/internal/server/http/p2r01_ws_test.go
+++ b/internal/server/http/p2r01_ws_test.go
@@ -99,7 +99,7 @@ func enrolHostForWS(t *testing.T, srv *Server, st *store.Store, name string) (ho
 	if err != nil {
 		t.Fatalf("encrypt: %v", err)
 	}
-	if err := st.SetHostCredentials(context.Background(), hostID, enc); err != nil {
+	if err := st.SetHostCredentials(context.Background(), hostID, store.CredKindRepo, enc); err != nil {
 		t.Fatalf("set creds: %v", err)
 	}
 	return hostID, token
diff --git a/internal/server/http/pending_drain.go b/internal/server/http/pending_drain.go
new file mode 100644
index 0000000..a69116d
--- /dev/null
+++ b/internal/server/http/pending_drain.go
@@ -0,0 +1,209 @@
+// pending_drain.go — drains pending_runs rows that are due (or, on
+// agent reconnect, every row for that host).
+//
+// Two trigger paths:
+//  1. The 30s tick in cmd/server (DrainAllDue) — sweeps every host
+//     with rows whose next_attempt_at <= now.
+//  2.
onAgentHello (DrainPending(hostID)) — when a host comes back,
+//     walk all of its pending rows synchronously so the operator
+//     sees the queue drain promptly.
+package http
+
+import (
+	"context"
+	"errors"
+	"log/slog"
+	"sync"
+	"time"
+
+	"github.com/oklog/ulid/v2"
+
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ws"
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
+)
+
+const (
+	// pendingDrainBatchLimit is the row limit passed to DuePendingRuns
+	// by DrainAllDue on each sweep.
+	pendingDrainBatchLimit = 100
+	// pendingDrainBackoffMax is the ceiling drainOne applies while
+	// doubling the retry backoff.
+	pendingDrainBackoffMax = 30 * time.Minute
+)
+
+// DrainPending re-dispatches every pending_runs row for hostID. The
+// host must already be connected (caller's responsibility — typically
+// onAgentHello). Each row's source group + schedule are loaded; if
+// either is gone the row is dropped (audit-logged as abandoned). If
+// the row's attempt count meets/exceeds the group's retry_max, the
+// row is dropped (audit-logged as abandoned). Otherwise we attempt
+// dispatch; success deletes the row, failure bumps the attempt and
+// reschedules with exponential backoff.
+//
+// A per-host mutex (hostDrainMutex) ensures that the on-hello goroutine
+// and the 30s tick cannot process the same host concurrently. If a drain
+// is already in-flight for this host, the call returns immediately — the
+// running drain will see any rows we'd have processed.
+//
+// NOTE(review): the running drain lists rows once, up front; a row
+// enqueued after that list call is not seen by it and waits for a
+// later tick or reconnect.
+func (s *Server) DrainPending(ctx context.Context, hostID string) {
+	mu := s.hostDrainMutex(hostID)
+	// Non-blocking: give up rather than queue behind a running drain.
+	if !mu.TryLock() {
+		return
+	}
+	defer mu.Unlock()
+
+	runs, err := s.deps.Store.ListPendingRunsForHost(ctx, hostID)
+	if err != nil {
+		slog.Warn("drain pending: list", "host_id", hostID, "err", err)
+		return
+	}
+	if len(runs) == 0 {
+		return
+	}
+	conn := s.deps.Hub.Conn(hostID)
+	if conn == nil {
+		// Host went offline between the connectedness check and now.
+		// Skip — next tick or next reconnect will retry.
+		return
+	}
+	// The same connection is used for every row in this pass.
+	for _, p := range runs {
+		s.drainOne(ctx, conn, p)
+	}
+}
+
+// drainOne handles a single pending row. Refactored out so DrainPending
+// reads cleanly.
Side-effects: delete, bump, audit, dispatch — all
+// per-row.
+//
+// Order of checks: schedule missing or disabled → abandon; source
+// group missing → abandon; attempt >= retry_max (when retry_max > 0)
+// → abandon; otherwise dispatch. Store errors other than ErrNotFound
+// are logged and leave the row untouched.
+func (s *Server) drainOne(ctx context.Context, conn *ws.Conn, p store.PendingRun) {
+	sc, err := s.deps.Store.GetSchedule(ctx, p.HostID, p.ScheduleID)
+	if err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			s.abandonPending(ctx, p, "schedule gone")
+			return
+		}
+		slog.Warn("drain pending: load schedule",
+			"host_id", p.HostID, "schedule_id", p.ScheduleID, "err", err)
+		return
+	}
+	if !sc.Enabled {
+		s.abandonPending(ctx, p, "schedule disabled")
+		return
+	}
+	g, err := s.deps.Store.GetSourceGroup(ctx, p.HostID, p.SourceGroupID)
+	if err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			s.abandonPending(ctx, p, "source group gone")
+		} else {
+			slog.Warn("drain pending: load source group",
+				"host_id", p.HostID, "group_id", p.SourceGroupID, "err", err)
+		}
+		return
+	}
+	// RetryMax <= 0 disables this limit: such rows are never abandoned
+	// for attempt count.
+	if g.RetryMax > 0 && p.Attempt >= g.RetryMax {
+		s.abandonPending(ctx, p, "retry_max exceeded")
+		return
+	}
+	// Calls dispatchBackupForGroupCore (not dispatchBackupForGroup) so a
+	// failed Send doesn't double-enqueue: dispatchBackupForGroup's
+	// enqueue-on-failure path would create a NEW pending_runs row while
+	// this function already bumps the EXISTING row via
+	// BumpPendingRunAttempt, producing geometric duplicates on repeated
+	// failures.
+	jobID, _ := s.dispatchBackupForGroupCore(ctx, conn, p.HostID, p.ScheduleID, g, p.ScheduledAt)
+	if jobID == "" {
+		// Send failed again. Bump attempt with exponential backoff.
+		// Exponential backoff doubles immediately on the first drain
+		// retry: enqueue at base, attempt=1 → drain → 2*base, attempt=2 →
+		// drain → 4*base, etc. Capped at pendingDrainBackoffMax. With
+		// defaults (60s base, retry_max=3) the schedule is 60→120→240s.
+		//
+		// NOTE(review): the cap is only applied inside the doubling
+		// loop. With Attempt == 0 the loop does not run, so a base
+		// above pendingDrainBackoffMax is used uncapped — confirm
+		// rows are always enqueued with Attempt >= 1.
+		baseBackoff := time.Duration(g.RetryBackoffSeconds) * time.Second
+		if baseBackoff <= 0 {
+			baseBackoff = 60 * time.Second
+		}
+		backoff := baseBackoff
+		for i := 0; i < p.Attempt; i++ {
+			backoff *= 2
+			if backoff >= pendingDrainBackoffMax {
+				backoff = pendingDrainBackoffMax
+				break
+			}
+		}
+		next := time.Now().UTC().Add(backoff)
+		if err := s.deps.Store.BumpPendingRunAttempt(ctx, p.ID, next, "drain dispatch failed"); err != nil {
+			slog.Warn("drain pending: bump", "host_id", p.HostID, "id", p.ID, "err", err)
+		}
+		return
+	}
+	// Success — drop the pending row.
+	// NOTE(review): if this delete fails the row stays queued although
+	// the job was dispatched, so a later drain would dispatch it again.
+	if err := s.deps.Store.DeletePendingRun(ctx, p.ID); err != nil {
+		slog.Warn("drain pending: delete after dispatch", "host_id", p.HostID, "id", p.ID, "err", err)
+	}
+	slog.Info("drain pending: dispatched",
+		"host_id", p.HostID, "schedule_id", p.ScheduleID, "group", g.Name,
+		"attempt", p.Attempt, "job_id", jobID)
+}
+
+// abandonPending deletes the row and records an audit entry
+// (action "pending_run.abandoned", targeting the row's schedule).
+// The reason is written to the server log only; the audit entry built
+// here does not carry it. Audit and delete failures are logged and do
+// not stop the other step.
+func (s *Server) abandonPending(ctx context.Context, p store.PendingRun, reason string) {
+	slog.Info("drain pending: abandoning",
+		"host_id", p.HostID, "schedule_id", p.ScheduleID,
+		"attempt", p.Attempt, "reason", reason)
+	// Local copy so the audit entry can take its address.
+	scheduleID := p.ScheduleID
+	if err := s.deps.Store.AppendAudit(ctx, store.AuditEntry{
+		ID:         ulid.Make().String(),
+		Actor:      "system",
+		Action:     "pending_run.abandoned",
+		TargetKind: ptr("schedule"),
+		TargetID:   &scheduleID,
+		TS:         time.Now().UTC(),
+	}); err != nil {
+		slog.Warn("drain pending: audit on abandon", "id", p.ID, "err", err)
+	}
+	if err := s.deps.Store.DeletePendingRun(ctx, p.ID); err != nil {
+		slog.Warn("drain pending: delete on abandon", "id", p.ID, "err", err)
+	}
+}
+
+// hostDrainMutex returns the per-host mutex for DrainPending,
+// creating it on first request. The map is guarded by drainLocksMu.
+// Mutex objects are never deleted from the map — there are at most
+// len(hosts) entries, which is bounded by the fleet size.
+func (s *Server) hostDrainMutex(hostID string) *sync.Mutex {
+	s.drainLocksMu.Lock()
+	defer s.drainLocksMu.Unlock()
+	// The map itself is created lazily on first use.
+	if s.drainLocks == nil {
+		s.drainLocks = make(map[string]*sync.Mutex)
+	}
+	mu, ok := s.drainLocks[hostID]
+	if !ok {
+		mu = &sync.Mutex{}
+		s.drainLocks[hostID] = mu
+	}
+	return mu
+}
+
+// DrainAllDue is the 30s-ticker entrypoint. Walks rows whose
+// next_attempt_at <= now (DuePendingRuns), dedupes by host, and calls
+// DrainPending per host. The DrainPending then re-walks the host's
+// rows (same DB hit as the dedupe iteration would have done — keeps
+// the per-host concurrency model simple).
+//
+// It is a no-op when no Hub is configured, and hosts that are not
+// currently connected are skipped.
+//
+// NOTE(review): DrainPending lists rows via ListPendingRunsForHost,
+// which presumably returns every row for the host, not just the due
+// ones — if so, one due row makes this tick retry the host's rows
+// whose backoff has not yet elapsed. Confirm this is intended.
+func (s *Server) DrainAllDue(ctx context.Context) {
+	if s.deps.Hub == nil {
+		return
+	}
+	due, err := s.deps.Store.DuePendingRuns(ctx, time.Now().UTC(), pendingDrainBatchLimit)
+	if err != nil {
+		slog.Warn("drain all due: list", "err", err)
+		return
+	}
+	if len(due) == 0 {
+		return
+	}
+	// seen ensures each host is considered once per sweep, whether or
+	// not it turned out to be connected.
+	seen := make(map[string]struct{}, len(due))
+	for _, p := range due {
+		if _, ok := seen[p.HostID]; ok {
+			continue
+		}
+		seen[p.HostID] = struct{}{}
+		if !s.deps.Hub.Connected(p.HostID) {
+			continue
+		}
+		s.DrainPending(ctx, p.HostID)
+	}
+}
diff --git a/internal/server/http/pending_drain_test.go b/internal/server/http/pending_drain_test.go
new file mode 100644
index 0000000..0cec822
--- /dev/null
+++ b/internal/server/http/pending_drain_test.go
@@ -0,0 +1,572 @@
+// pending_drain_test.go — covers DrainPending / DrainAllDue and the
+// onAgentHello goroutine spawn that drains a freshly-reconnected
+// host's queue.
+package http
+
+import (
+	"context"
+	"encoding/json"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/coder/websocket"
+	"github.com/oklog/ulid/v2"
+
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
+)
+
+// seedSchedAndGroup wires up a host with one source group + one
+// schedule pointing at it. Returns (groupID, scheduleID).
+//
+// The group is named "default", includes "/etc", and uses the given
+// retryMax with a 60-second retry backoff. The schedule is enabled
+// with cron "0 3 * * *". An init job row is also inserted for the host.
+func seedSchedAndGroup(t *testing.T, st *store.Store, hostID string, retryMax int) (string, string) {
+	t.Helper()
+	gid := ulid.Make().String()
+	if err := st.CreateSourceGroup(context.Background(), &store.SourceGroup{
+		ID: gid, HostID: hostID, Name: "default",
+		Includes: []string{"/etc"},
+		RetryMax: retryMax, RetryBackoffSeconds: 60,
+	}); err != nil {
+		t.Fatalf("create group: %v", err)
+	}
+	sid := ulid.Make().String()
+	if err := st.CreateSchedule(context.Background(), &store.Schedule{
+		ID: sid, HostID: hostID,
+		CronExpr: "0 3 * * *", Enabled: true,
+		SourceGroupIDs: []string{gid},
+	}); err != nil {
+		t.Fatalf("create schedule: %v", err)
+	}
+	// Mark a successful init job so auto-init doesn't pollute reads.
+	if err := st.CreateJob(context.Background(), store.Job{
+		ID: ulid.Make().String(), HostID: hostID, Kind: "init",
+		ActorKind: "system", CreatedAt: time.Now().UTC(),
+	}); err != nil {
+		t.Fatalf("seed init: %v", err)
+	}
+	return gid, sid
+}
+
+// countPendingForHost returns the number of pending_runs rows for hostID.
+// A query failure aborts the test.
+func countPendingForHost(t *testing.T, st *store.Store, hostID string) int {
+	t.Helper()
+	var n int
+	if err := st.DB().QueryRow(
+		`SELECT COUNT(*) FROM pending_runs WHERE host_id = ?`, hostID).Scan(&n); err != nil {
+		t.Fatalf("count pending: %v", err)
+	}
+	return n
+}
+
+// waitForPendingCount polls until the pending_runs count for hostID
+// reaches wantN or the deadline expires.
Use this instead of calling
+// DrainPending synchronously when the test relies on the on-hello
+// goroutine (which holds the per-host drain mutex) to process rows.
+//
+// Polls every 20ms. On timeout it reports a non-fatal error; the count
+// in that message is re-read after the deadline, so it may differ from
+// the last polled value.
+func waitForPendingCount(t *testing.T, st *store.Store, hostID string, wantN int, timeout time.Duration) {
+	t.Helper()
+	deadline := time.Now().Add(timeout)
+	for time.Now().Before(deadline) {
+		if countPendingForHost(t, st, hostID) == wantN {
+			return
+		}
+		time.Sleep(20 * time.Millisecond)
+	}
+	t.Errorf("pending count for host %s: want %d after %v, got %d",
+		hostID, wantN, timeout, countPendingForHost(t, st, hostID))
+}
+
+// countAuditAction returns the number of audit_log rows with the given action.
+// The count is not scoped to a host, so callers compare before/after deltas.
+func countAuditAction(t *testing.T, st *store.Store, action string) int {
+	t.Helper()
+	var n int
+	if err := st.DB().QueryRow(
+		`SELECT COUNT(*) FROM audit_log WHERE action = ?`, action).Scan(&n); err != nil {
+		t.Fatalf("count audit: %v", err)
+	}
+	return n
+}
+
+// TestDrainPendingDispatchesOnReconnect: a due pending row that exists
+// before the agent connects is dispatched as a backup command.run after
+// hello, the row is removed, and one schedule-attributed backup job lands.
+func TestDrainPendingDispatchesOnReconnect(t *testing.T) {
+	t.Parallel()
+	srv, ts, st := rawTestServer(t)
+	hostID, token := enrolHostForWS(t, srv, st, "drain-host")
+	gid, sid := seedSchedAndGroup(t, st, hostID, 5)
+
+	// Pre-insert a pending row that's already due. The on-hello
+	// goroutine should drain it after we connect.
+	pendingID := ulid.Make().String()
+	now := time.Now().UTC()
+	if err := st.EnqueuePendingRun(context.Background(), &store.PendingRun{
+		ID: pendingID, ScheduleID: sid, SourceGroupID: gid, HostID: hostID,
+		Attempt: 1, NextAttemptAt: now.Add(-time.Second),
+		ScheduledAt: now.Add(-time.Minute),
+	}); err != nil {
+		t.Fatalf("enqueue: %v", err)
+	}
+
+	c := agentDial(t, srv, ts, hostID, token)
+	sendHello(t, c, "drain-host")
+
+	// Walk envelopes looking for a backup command.run carrying the
+	// group's includes.
+	var got *api.CommandRunPayload
+	deadline := time.Now().Add(3 * time.Second)
+	for time.Now().Before(deadline) {
+		ctx, cancel := context.WithTimeout(context.Background(), 800*time.Millisecond)
+		mt, raw, err := c.Read(ctx)
+		cancel()
+		if err != nil {
+			break
+		}
+		if mt != websocket.MessageText {
+			continue
+		}
+		var env api.Envelope
+		if err := json.Unmarshal(raw, &env); err != nil {
+			continue
+		}
+		if env.Type != api.MsgCommandRun {
+			continue
+		}
+		var p api.CommandRunPayload
+		_ = env.UnmarshalPayload(&p)
+		if p.Kind == api.JobBackup {
+			got = &p
+			break
+		}
+	}
+	if got == nil {
+		t.Fatalf("no backup command.run dispatched after reconnect drain")
+	}
+	if !equalStrings(got.Includes, []string{"/etc"}) {
+		t.Errorf("backup includes: %v", got.Includes)
+	}
+	if got.Tag != "default" {
+		t.Errorf("backup tag: %q", got.Tag)
+	}
+
+	// Pending row should be gone. Poll briefly: the drain goroutine
+	// sends command.run via conn.Send and only then calls
+	// DeletePendingRun. Reading the envelope off the wire above proves
+	// the send happened, but the delete runs after that on the drain
+	// goroutine — small window where the count is still 1.
+	waitForPendingCount(t, st, hostID, 0, 2*time.Second)
+	if n := countPendingForHost(t, st, hostID); n != 0 {
+		t.Errorf("pending rows after drain: got %d, want 0", n)
+	}
+
+	// One backup job row landed (in addition to the seeded init).
+	// The Scan error is ignored; a failed query leaves n at 0 and
+	// trips the assertion below.
+	var n int
+	_ = st.DB().QueryRow(
+		`SELECT COUNT(*) FROM jobs WHERE host_id = ? AND kind = 'backup' AND actor_kind = 'schedule'`,
+		hostID).Scan(&n)
+	if n != 1 {
+		t.Errorf("backup job rows: got %d, want 1", n)
+	}
+}
+
+// TestDrainPendingAbandonsOnRetryMax: a row whose attempt (2) already
+// equals the group's retry_max (2) is abandoned on reconnect — one
+// audit row, no backup command.run, no backup job row.
+func TestDrainPendingAbandonsOnRetryMax(t *testing.T) {
+	t.Parallel()
+	srv, ts, st := rawTestServer(t)
+	hostID, token := enrolHostForWS(t, srv, st, "abandon-retry-host")
+	gid, sid := seedSchedAndGroup(t, st, hostID, 2)
+
+	pendingID := ulid.Make().String()
+	now := time.Now().UTC()
+	if err := st.EnqueuePendingRun(context.Background(), &store.PendingRun{
+		ID: pendingID, ScheduleID: sid, SourceGroupID: gid, HostID: hostID,
+		Attempt: 2, NextAttemptAt: now.Add(-time.Second),
+		ScheduledAt: now.Add(-time.Minute),
+	}); err != nil {
+		t.Fatalf("enqueue: %v", err)
+	}
+
+	auditBefore := countAuditAction(t, st, "pending_run.abandoned")
+
+	c := agentDial(t, srv, ts, hostID, token)
+	sendHello(t, c, "abandon-retry-host")
+	_ = drainUntil(t, c, api.MsgScheduleSet)
+
+	// The on-hello goroutine processes the row (retry_max exceeded → abandon).
+	// Wait for it to finish rather than calling DrainPending directly, which
+	// would be a no-op while the goroutine holds the per-host drain mutex.
+	_ = connFromHub(t, srv, hostID) // ensure hub registration
+	waitForPendingCount(t, st, hostID, 0, 2*time.Second)
+
+	if n := countPendingForHost(t, st, hostID); n != 0 {
+		t.Errorf("pending rows after abandon: got %d, want 0", n)
+	}
+	if d := countAuditAction(t, st, "pending_run.abandoned") - auditBefore; d != 1 {
+		t.Errorf("audit pending_run.abandoned delta: got %d, want 1", d)
+	}
+	// No backup command.run should have been sent.
+	deadline := time.Now().Add(400 * time.Millisecond)
+	for time.Now().Before(deadline) {
+		ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
+		mt, raw, err := c.Read(ctx)
+		cancel()
+		if err != nil {
+			break
+		}
+		if mt != websocket.MessageText {
+			continue
+		}
+		var env api.Envelope
+		_ = json.Unmarshal(raw, &env)
+		if env.Type == api.MsgCommandRun {
+			var p api.CommandRunPayload
+			_ = env.UnmarshalPayload(&p)
+			if p.Kind == api.JobBackup {
+				t.Fatalf("abandoned row still dispatched a backup: %+v", p)
+			}
+		}
+	}
+	// No backup job row.
+	var n int
+	_ = st.DB().QueryRow(
+		`SELECT COUNT(*) FROM jobs WHERE host_id = ? AND kind = 'backup'`,
+		hostID).Scan(&n)
+	if n != 0 {
+		t.Errorf("abandon path created a backup job: %d rows", n)
+	}
+}
+
+// TestDrainPendingBumpsOnSendFailure: drainOne against a connection
+// whose client side has been closed must bump the existing row to
+// attempt=2 with a last_error, without enqueueing a duplicate row.
+func TestDrainPendingBumpsOnSendFailure(t *testing.T) {
+	t.Parallel()
+	srv, ts, st := rawTestServer(t)
+	hostID, token := enrolHostForWS(t, srv, st, "bump-host")
+	gid, sid := seedSchedAndGroup(t, st, hostID, 5)
+
+	c := agentDial(t, srv, ts, hostID, token)
+	sendHello(t, c, "bump-host")
+	_ = drainUntil(t, c, api.MsgScheduleSet)
+
+	// Capture the conn before closing the client side. Hub.Conn still
+	// returns it after the client-side close — the server's Unregister
+	// fires when its read loop sees the close, but the conn ptr remains
+	// valid; subsequent Sends just fail.
+	conn := connFromHub(t, srv, hostID)
+	if conn == nil {
+		t.Fatal("conn never registered")
+	}
+
+	// Insert the pending row AFTER the on-hello drain goroutine has
+	// already scanned (an empty list) — otherwise we race the on-hello
+	// drain dispatching the row over the still-live socket.
+	pendingID := ulid.Make().String()
+	now := time.Now().UTC()
+
+	if err := c.Close(websocket.StatusNormalClosure, "test"); err != nil {
+		t.Fatalf("close: %v", err)
+	}
+	// Brief settle so the close is observed by the server's read loop.
+	time.Sleep(150 * time.Millisecond)
+
+	if err := st.EnqueuePendingRun(context.Background(), &store.PendingRun{
+		ID: pendingID, ScheduleID: sid, SourceGroupID: gid, HostID: hostID,
+		Attempt: 1, NextAttemptAt: now.Add(-time.Second),
+		ScheduledAt: now.Add(-time.Minute),
+	}); err != nil {
+		t.Fatalf("enqueue: %v", err)
+	}
+
+	// DrainPending uses Hub.Conn(hostID); after the client close the
+	// server may have unregistered already. Call drainOne directly
+	// against the captured conn so we deterministically exercise the
+	// "Send fails" branch rather than the "host gone" branch.
+	srv.drainOne(context.Background(), conn, store.PendingRun{
+		ID: pendingID, ScheduleID: sid, SourceGroupID: gid, HostID: hostID,
+		Attempt: 1, NextAttemptAt: now.Add(-time.Second), ScheduledAt: now.Add(-time.Minute),
+	})
+
+	// The original row must be bumped to attempt=2 with a non-empty
+	// last_error. Critically, NO duplicate row should have been created:
+	// drainOne calls dispatchBackupForGroupCore (not dispatchBackupForGroup)
+	// so the enqueue-on-failure path is bypassed and the count stays at 1.
+	if n := countPendingForHost(t, st, hostID); n != 1 {
+		t.Errorf("pending rows after send failure: got %d, want 1 (no duplicate enqueue)", n)
+	}
+	var attempt int
+	var lastErr string
+	if err := st.DB().QueryRow(
+		`SELECT attempt, COALESCE(last_error,'') FROM pending_runs WHERE id = ?`,
+		pendingID).Scan(&attempt, &lastErr); err != nil {
+		t.Fatalf("scan original row: %v", err)
+	}
+	if attempt != 2 {
+		t.Errorf("attempt after bump: got %d, want 2", attempt)
+	}
+	if lastErr == "" {
+		t.Errorf("last_error empty after bump")
+	}
+}
+
+func TestDrainPendingDropsRowsForGoneSchedule(t *testing.T) {
+	t.Parallel()
+	srv, ts, st := rawTestServer(t)
+	hostID, token := enrolHostForWS(t, srv, st, "gone-sched-host")
+	gid, sid := seedSchedAndGroup(t, st, hostID, 5)
+
+	pendingID := ulid.Make().String()
+	now := time.Now().UTC()
+	if err := st.EnqueuePendingRun(context.Background(), &store.PendingRun{
+		ID: pendingID, ScheduleID: sid, SourceGroupID: gid, HostID: hostID,
+		Attempt: 1, NextAttemptAt: now.Add(-time.Second),
+		ScheduledAt: now.Add(-time.Minute),
+	}); err != nil {
+		t.Fatalf("enqueue: %v", err)
+	}
+
+	// Disable the schedule. (Deleting it would FK-cascade-delete the
+	// pending_runs row out from under the drainer, which is fine for
+	// production but defeats the point of the test. The
+	// disabled-schedule path goes through the same abandonPending code,
+	// so it's an equivalent assertion.)
+	if _, err := st.DB().Exec(
+		`UPDATE schedules SET enabled = 0 WHERE id = ?`, sid); err != nil {
+		t.Fatalf("disable schedule: %v", err)
+	}
+
+	auditBefore := countAuditAction(t, st, "pending_run.abandoned")
+
+	c := agentDial(t, srv, ts, hostID, token)
+	sendHello(t, c, "gone-sched-host")
+	_ = drainUntil(t, c, api.MsgScheduleSet)
+
+	// The on-hello goroutine processes the row (disabled schedule → abandon).
+	// Poll for completion instead of calling DrainPending, which would return
+	// immediately while the goroutine holds the per-host drain mutex.
+ waitForPendingCount(t, st, hostID, 0, 2*time.Second) + + if n := countPendingForHost(t, st, hostID); n != 0 { + t.Errorf("pending rows after schedule-gone abandon: got %d, want 0", n) + } + if d := countAuditAction(t, st, "pending_run.abandoned") - auditBefore; d != 1 { + t.Errorf("audit delta: got %d, want 1", d) + } + // Drain produced no backup envelope. + deadline := time.Now().Add(400 * time.Millisecond) + for time.Now().Before(deadline) { + ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond) + mt, raw, err := c.Read(ctx) + cancel() + if err != nil { + break + } + if mt != websocket.MessageText { + continue + } + var env api.Envelope + _ = json.Unmarshal(raw, &env) + if env.Type == api.MsgCommandRun { + var p api.CommandRunPayload + _ = env.UnmarshalPayload(&p) + if p.Kind == api.JobBackup { + t.Fatalf("gone-schedule abandon still dispatched: %+v", p) + } + } + } +} + +// TestDrainPendingDropsRowsForGoneSourceGroup verifies that when a +// source group is gone (ErrNotFound) the pending row is abandoned and +// an audit entry is written. Transient-error paths (SQLITE_BUSY, +// context cancellation) are not covered here because the real *Store +// doesn't expose a fault-injection seam; the code-review check above +// is the gate for that path. +func TestDrainPendingDropsRowsForGoneSourceGroup(t *testing.T) { + t.Parallel() + srv, ts, st := rawTestServer(t) + hostID, token := enrolHostForWS(t, srv, st, "gone-group-host") + _, sid := seedSchedAndGroup(t, st, hostID, 5) + + // Use a source_group_id that never existed. pending_runs carries a + // FK to source_groups, so we must bypass FK enforcement for this + // insert. PRAGMA foreign_keys is connection-scoped and can only be + // changed outside a transaction; DB().Exec runs on an arbitrary + // pooled connection, so we pin it with a dedicated *sql.Conn. 
+ fakeGroupID := ulid.Make().String() + pendingID := ulid.Make().String() + now := time.Now().UTC() + conn, err := st.DB().Conn(context.Background()) + if err != nil { + t.Fatalf("db conn: %v", err) + } + defer conn.Close() + if _, err := conn.ExecContext(context.Background(), `PRAGMA foreign_keys = OFF`); err != nil { + t.Fatalf("fk off: %v", err) + } + if _, err := conn.ExecContext(context.Background(), + `INSERT INTO pending_runs (id, schedule_id, source_group_id, host_id, attempt, next_attempt_at, scheduled_at) + VALUES (?, ?, ?, ?, 1, ?, ?)`, + pendingID, sid, fakeGroupID, hostID, + now.Add(-time.Second), now.Add(-time.Minute), + ); err != nil { + t.Fatalf("insert pending: %v", err) + } + if _, err := conn.ExecContext(context.Background(), `PRAGMA foreign_keys = ON`); err != nil { + t.Fatalf("fk on: %v", err) + } + + auditBefore := countAuditAction(t, st, "pending_run.abandoned") + + c := agentDial(t, srv, ts, hostID, token) + sendHello(t, c, "gone-group-host") + _ = drainUntil(t, c, api.MsgScheduleSet) + + // The on-hello goroutine processes the row (source group gone → abandon). + // Poll for completion instead of calling DrainPending, which would return + // immediately while the goroutine holds the per-host drain mutex. + waitForPendingCount(t, st, hostID, 0, 2*time.Second) + + if n := countPendingForHost(t, st, hostID); n != 0 { + t.Errorf("pending rows after source-group-gone abandon: got %d, want 0", n) + } + if d := countAuditAction(t, st, "pending_run.abandoned") - auditBefore; d != 1 { + t.Errorf("audit delta: got %d, want 1", d) + } +} + +func TestDrainAllDueSkipsOfflineHosts(t *testing.T) { + t.Parallel() + srv, _, st := rawTestServer(t) + // Don't dial — host is enrolled but never connected. 
+ hostID, _ := enrolHostForWS(t, srv, st, "offline-host") + gid, sid := seedSchedAndGroup(t, st, hostID, 5) + + pendingID := ulid.Make().String() + now := time.Now().UTC() + if err := st.EnqueuePendingRun(context.Background(), &store.PendingRun{ + ID: pendingID, ScheduleID: sid, SourceGroupID: gid, HostID: hostID, + Attempt: 1, NextAttemptAt: now.Add(-time.Second), + ScheduledAt: now.Add(-time.Minute), + }); err != nil { + t.Fatalf("enqueue: %v", err) + } + + auditBefore := countAuditAction(t, st, "pending_run.abandoned") + + srv.DrainAllDue(context.Background()) + + // Row still there (host offline, drainer skips). + if n := countPendingForHost(t, st, hostID); n != 1 { + t.Errorf("pending rows after DrainAllDue against offline host: got %d, want 1", n) + } + if d := countAuditAction(t, st, "pending_run.abandoned") - auditBefore; d != 0 { + t.Errorf("audit unexpectedly changed: delta %d", d) + } +} + +func TestEnqueueOnDispatchFailure(t *testing.T) { + t.Parallel() + srv, ts, st := rawTestServer(t) + hostID, token := enrolHostForWS(t, srv, st, "enqueue-host") + _, sid := seedSchedAndGroup(t, st, hostID, 5) + + c := agentDial(t, srv, ts, hostID, token) + sendHello(t, c, "enqueue-host") + _ = drainUntil(t, c, api.MsgScheduleSet) + + conn := connFromHub(t, srv, hostID) + _ = conn + + // Close the client side so the server's next Send errors. + if err := c.Close(websocket.StatusNormalClosure, "test"); err != nil { + t.Fatalf("close: %v", err) + } + time.Sleep(100 * time.Millisecond) + + scheduledAt := time.Now().UTC().Add(-30 * time.Second) + srv.dispatchScheduledJob(context.Background(), hostID, conn, sid, scheduledAt) + + // One pending row should have been enqueued (attempt=1) with the + // scheduled_at preserved. 
+ rows, err := st.ListPendingRunsForHost(context.Background(), hostID) + if err != nil { + t.Fatalf("list: %v", err) + } + if len(rows) != 1 { + t.Fatalf("pending rows: got %d, want 1", len(rows)) + } + if rows[0].Attempt != 1 { + t.Errorf("attempt: got %d, want 1", rows[0].Attempt) + } + // scheduled_at preserved (within RFC3339Nano round-trip tolerance). + if rows[0].ScheduledAt.Sub(scheduledAt).Abs() > time.Microsecond { + t.Errorf("scheduled_at drift: %v vs %v", rows[0].ScheduledAt, scheduledAt) + } + if rows[0].LastError == "" { + t.Errorf("last_error empty") + } +} + +// TestDrainPendingSerializesPerHost verifies that concurrent DrainPending +// calls for the same host do not double-dispatch pending rows. The per-host +// mutex (TryLock semantics) means exactly one drain processes each row. +func TestDrainPendingSerializesPerHost(t *testing.T) { + t.Parallel() + srv, ts, st := rawTestServer(t) + hostID, token := enrolHostForWS(t, srv, st, "serialize-host") + gid, sid := seedSchedAndGroup(t, st, hostID, 10) + + // Connect the agent so DrainPending can dispatch. + c := agentDial(t, srv, ts, hostID, token) + sendHello(t, c, "serialize-host") + // Drain the on-hello goroutine's pass first (no pending rows yet), + // then wait for the schedule.set so the connection is fully settled. + _ = drainUntil(t, c, api.MsgScheduleSet) + + // Insert 5 pending rows now that the on-hello drain has already run. + now := time.Now().UTC() + for i := range 5 { + pid := ulid.Make().String() + if err := st.EnqueuePendingRun(context.Background(), &store.PendingRun{ + ID: pid, + ScheduleID: sid, + SourceGroupID: gid, + HostID: hostID, + Attempt: 1, + NextAttemptAt: now.Add(-time.Second), + ScheduledAt: now.Add(-time.Duration(i+1) * time.Minute), + }); err != nil { + t.Fatalf("enqueue row %d: %v", i, err) + } + } + + // Spawn 10 goroutines all calling DrainPending concurrently. 
+ var wg sync.WaitGroup + for range 10 { + wg.Add(1) + go func() { + defer wg.Done() + srv.DrainPending(context.Background(), hostID) + }() + } + wg.Wait() + + // Drain any envelopes the agent received so we don't block below. + // We read with short timeouts and stop when the connection goes quiet. + drainDeadline := time.Now().Add(500 * time.Millisecond) + for time.Now().Before(drainDeadline) { + ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond) + _, _, err := c.Read(ctx) + cancel() + if err != nil { + break + } + } + + // All 5 pending rows must be gone. + if n := countPendingForHost(t, st, hostID); n != 0 { + t.Errorf("pending rows after concurrent drain: got %d, want 0", n) + } + + // Exactly 5 backup job rows (one per pending row), not 10+ from a race. + var n int + _ = st.DB().QueryRow( + `SELECT COUNT(*) FROM jobs WHERE host_id = ? AND kind = 'backup' AND actor_kind = 'schedule'`, + hostID).Scan(&n) + if n != 5 { + t.Errorf("backup job rows: got %d, want 5 (per-host mutex must prevent double-dispatch)", n) + } +} diff --git a/internal/server/http/repo_ops.go b/internal/server/http/repo_ops.go new file mode 100644 index 0000000..920677d --- /dev/null +++ b/internal/server/http/repo_ops.go @@ -0,0 +1,165 @@ +// repo_ops.go — operator-triggered Run-now for repo-level operations: +// prune, check, unlock. Backed by the same dispatchJobWithPayload +// pipeline as backup, with an extra step for prune: push admin creds +// first if they're set, refuse loudly if they aren't. +package http + +import ( + "errors" + "log/slog" + stdhttp "net/http" + "strconv" + + "github.com/go-chi/chi/v5" + + "gitea.dcglab.co.uk/steve/restic-manager/internal/api" + "gitea.dcglab.co.uk/steve/restic-manager/internal/store" +) + +// handleRunRepoPrune — POST /api/hosts/{id}/repo/prune (and the HTMX +// twin outside /api). Pushes the host's admin credentials down the WS, +// then dispatches a prune command.run with RequiresAdminCreds=true. 
+func (s *Server) handleRunRepoPrune(w stdhttp.ResponseWriter, r *stdhttp.Request) {
+	// Unauthenticated callers: HTML requests are redirected to /login,
+	// everything else receives the JSON 401 envelope.
+	user, authed := s.requireUser(r)
+	if !authed {
+		if !wantsHTML(r) {
+			writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorized", "")
+			return
+		}
+		stdhttp.Redirect(w, r, "/login", stdhttp.StatusSeeOther)
+		return
+	}
+
+	id := chi.URLParam(r, "id")
+	if id == "" {
+		s.runOpError(w, r, stdhttp.StatusBadRequest, "missing_id", "")
+		return
+	}
+	ctx := r.Context()
+
+	// The admin credentials are pushed to the agent before the prune
+	// itself is dispatched; a failed push aborts the request.
+	pushErr := s.pushAdminCredsToAgent(ctx, id)
+	switch {
+	case pushErr == nil:
+		// Pushed; carry on to the dispatch below.
+	case errors.Is(pushErr, store.ErrNotFound):
+		// The operator has not stored admin credentials for this host.
+		s.runOpError(w, r, stdhttp.StatusBadRequest, "admin_creds_required",
+			"set admin credentials on the Repo page before running prune")
+		return
+	default:
+		// Any other failure (likely an offline agent or a decrypt
+		// error) is logged with its detail and reported to the caller
+		// as a generic "host offline" so the operator retries later.
+		slog.Warn("prune: push admin creds failed", "host_id", id, "err", pushErr)
+		s.runOpError(w, r, stdhttp.StatusServiceUnavailable, "host_offline",
+			"agent is not currently connected; try again when it reconnects")
+		return
+	}
+
+	// A non-empty code from the dispatcher signals failure.
+	job, status, code, msg := s.dispatchJobWithPayload(ctx, user, id, api.JobPrune,
+		api.CommandRunPayload{RequiresAdminCreds: true})
+	if code == "" {
+		s.runOpRedirect(w, r, job)
+		return
+	}
+	s.runOpError(w, r, status, code, msg)
+}
+
+// handleRunRepoCheck — POST /api/hosts/{id}/repo/check. Pulls
+// check_subset_pct from host_repo_maintenance for the host (operator
+// can override via ?subset=N query param, clamped 0..100). Dispatches
+// with the chosen subset in CommandRunPayload.Args[0].
+// A store failure other than ErrNotFound is logged and answered with a
+// generic 500; a non-numeric ?subset is ignored in favour of the stored
+// percentage.
+func (s *Server) handleRunRepoCheck(w stdhttp.ResponseWriter, r *stdhttp.Request) {
+	// Unauthenticated callers: HTML requests are redirected to /login,
+	// everything else receives the JSON 401 envelope.
+	user, ok := s.requireUser(r)
+	if !ok {
+		if wantsHTML(r) {
+			stdhttp.Redirect(w, r, "/login", stdhttp.StatusSeeOther)
+			return
+		}
+		writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorized", "")
+		return
+	}
+	hostID := chi.URLParam(r, "id")
+	if hostID == "" {
+		s.runOpError(w, r, stdhttp.StatusBadRequest, "missing_id", "")
+		return
+	}
+
+	m, err := s.deps.Store.GetRepoMaintenance(r.Context(), hostID)
+	if err != nil {
+		if errors.Is(err, store.ErrNotFound) {
+			// Maintenance row should auto-seed at enrollment. If it's
+			// missing, surface a clear error rather than guessing 0%.
+			s.runOpError(w, r, stdhttp.StatusInternalServerError, "no_maintenance_row",
+				"host has no repo-maintenance config; was the host fully enrolled?")
+			return
+		}
+		// The response below carries no detail, so log the underlying
+		// store failure here; otherwise it would be lost entirely.
+		slog.Error("check: load repo maintenance", "host_id", hostID, "err", err)
+		s.runOpError(w, r, stdhttp.StatusInternalServerError, "internal", "")
+		return
+	}
+
+	// Start from the stored percentage; a numeric ?subset=N replaces it
+	// after being clamped into 0..100.
+	subset := m.CheckSubsetPct
+	if q := r.URL.Query().Get("subset"); q != "" {
+		if n, convErr := strconv.Atoi(q); convErr == nil {
+			subset = min(max(n, 0), 100)
+		}
+		// Non-numeric ?subset silently falls back to DB value.
+	}
+
+	// A non-empty code from the dispatcher signals failure.
+	res, status, code, msg := s.dispatchJobWithPayload(r.Context(), user, hostID, api.JobCheck,
+		api.CommandRunPayload{Args: []string{strconv.Itoa(subset)}})
+	if code != "" {
+		s.runOpError(w, r, status, code, msg)
+		return
+	}
+	s.runOpRedirect(w, r, res)
+}
+
+// handleRunRepoUnlock — POST /api/hosts/{id}/repo/unlock. No admin
+// creds required — restic unlock works with the everyday user.
+func (s *Server) handleRunRepoUnlock(w stdhttp.ResponseWriter, r *stdhttp.Request) {
+	// Unauthenticated callers: HTML requests are redirected to /login,
+	// everything else receives the JSON 401 envelope.
+	user, authed := s.requireUser(r)
+	if !authed {
+		if !wantsHTML(r) {
+			writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorized", "")
+			return
+		}
+		stdhttp.Redirect(w, r, "/login", stdhttp.StatusSeeOther)
+		return
+	}
+
+	id := chi.URLParam(r, "id")
+	if id == "" {
+		s.runOpError(w, r, stdhttp.StatusBadRequest, "missing_id", "")
+		return
+	}
+
+	// Unlock is dispatched with a zero-value payload; a non-empty code
+	// from the dispatcher signals failure.
+	job, status, code, msg := s.dispatchJobWithPayload(r.Context(), user, id, api.JobUnlock,
+		api.CommandRunPayload{})
+	if code == "" {
+		s.runOpRedirect(w, r, job)
+		return
+	}
+	s.runOpError(w, r, status, code, msg)
+}
+
+// runOpRedirect reports a dispatched job. HTMX requests get a 204 whose
+// HX-Redirect header points at /jobs/{id}; everything else gets a 202
+// with res as the JSON body. Mirrors handleRunSourceGroup's tail.
+func (s *Server) runOpRedirect(w stdhttp.ResponseWriter, r *stdhttp.Request, res runNowResponse) {
+	if !wantsHTML(r) {
+		writeJSON(w, stdhttp.StatusAccepted, res)
+		return
+	}
+	w.Header().Set("HX-Redirect", "/jobs/"+res.JobID)
+	w.WriteHeader(stdhttp.StatusNoContent)
+}
+
+// runOpError reports a failed run-now request. HTMX requests get msg as
+// a plain-text body under the given status; everything else gets the
+// standard JSON error envelope. Mirrors runGroupError.
+func (s *Server) runOpError(w stdhttp.ResponseWriter, r *stdhttp.Request, status int, code, msg string) {
+	if !wantsHTML(r) {
+		writeJSONError(w, status, code, msg)
+		return
+	}
+	stdhttp.Error(w, msg, status)
+}
diff --git a/internal/server/http/repo_ops_test.go b/internal/server/http/repo_ops_test.go
new file mode 100644
index 0000000..65ec9bb
--- /dev/null
+++ b/internal/server/http/repo_ops_test.go
@@ -0,0 +1,362 @@
+// repo_ops_test.go — integration tests for the repo run-now endpoints:
+// prune, check, unlock.
+package http
+
+import (
+	"context"
+	"encoding/json"
+	stdhttp "net/http"
+	"strconv"
+	"testing"
+	"time"
+
+	"github.com/coder/websocket"
+	"github.com/oklog/ulid/v2"
+
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
+)
+
+// ----- helpers -------------------------------------------------------
+
+// seedInitJob marks a fake init job done for the host so the auto-init
+// path doesn't fire and pollute the envelope sequence we're measuring.
+func seedInitJob(t *testing.T, st *store.Store, hostID string) {
+	t.Helper()
+	if err := st.CreateJob(context.Background(), store.Job{
+		ID: ulid.Make().String(), HostID: hostID, Kind: "init",
+		ActorKind: "system", CreatedAt: time.Now().UTC(),
+	}); err != nil {
+		t.Fatalf("seed init job: %v", err)
+	}
+}
+
+// setAdminCreds writes admin credentials for a host via the store directly.
+// The blob is encrypted with srv.encryptRepoCreds using
+// "host:<hostID>:admin" as the second argument.
+func setAdminCreds(t *testing.T, srv *Server, st *store.Store, hostID string) {
+	t.Helper()
+	enc, err := srv.encryptRepoCreds(repoCredsBlob{
+		RepoURL:      "rest:http://admin.example/h",
+		RepoUsername: "admin",
+		RepoPassword: "prune-pass",
+	}, []byte("host:"+hostID+":admin"))
+	if err != nil {
+		t.Fatalf("encrypt admin creds: %v", err)
+	}
+	if err := st.SetHostCredentials(context.Background(), hostID, store.CredKindAdmin, enc); err != nil {
+		t.Fatalf("set admin creds: %v", err)
+	}
+}
+
+// setMaintenanceSubset sets check_subset_pct for the host via the store.
+func setMaintenanceSubset(t *testing.T, st *store.Store, hostID string, pct int) {
+	t.Helper()
+	// Ensure the row exists first.
+	if err := st.CreateDefaultRepoMaintenance(context.Background(), hostID); err != nil {
+		t.Fatalf("seed maintenance: %v", err)
+	}
+	m, err := st.GetRepoMaintenance(context.Background(), hostID)
+	if err != nil {
+		t.Fatalf("get maintenance: %v", err)
+	}
+	m.CheckSubsetPct = pct
+	if err := st.UpdateRepoMaintenance(context.Background(), m); err != nil {
+		t.Fatalf("update maintenance: %v", err)
+	}
+}
+
+// drainCommandRun reads envelopes until a command.run arrives, then
+// unmarshals and returns the payload.
+func drainCommandRun(t *testing.T, c *websocket.Conn) api.CommandRunPayload {
+	t.Helper()
+	env := drainUntil(t, c, api.MsgCommandRun)
+	var p api.CommandRunPayload
+	if err := env.UnmarshalPayload(&p); err != nil {
+		t.Fatalf("unmarshal command.run: %v", err)
+	}
+	return p
+}
+
+// ----- prune tests ---------------------------------------------------
+
+// TestRunPruneRefusesWithoutAdminCreds: POST prune with no admin creds
+// set → 400, code admin_creds_required, no job row created.
+func TestRunPruneRefusesWithoutAdminCreds(t *testing.T) {
+	t.Parallel()
+	srv, ts, st := rawTestServer(t)
+	hostID, token := enrolHostForWS(t, srv, st, "prune-no-admin")
+	cookie := loginAsAdmin(t, st)
+	seedInitJob(t, st, hostID)
+
+	c := agentDial(t, srv, ts, hostID, token)
+	sendHello(t, c, "prune-no-admin")
+	_ = drainUntil(t, c, api.MsgScheduleSet)
+
+	status, body := doJSON(t, ts.URL, "POST", "/api/hosts/"+hostID+"/repo/prune", nil, cookie)
+	if status != stdhttp.StatusBadRequest {
+		t.Fatalf("want 400, got %d body=%+v", status, body)
+	}
+	if code, _ := body["code"].(string); code != "admin_creds_required" {
+		t.Errorf("want code=admin_creds_required, got %+v", body)
+	}
+
+	// No prune job row should have been persisted.
+	var n int
+	if err := st.DB().QueryRow(
+		`SELECT COUNT(*) FROM jobs WHERE host_id = ? AND kind = 'prune'`, hostID).Scan(&n); err != nil {
+		t.Fatalf("count: %v", err)
+	}
+	if n != 0 {
+		t.Errorf("unexpected prune job rows: %d", n)
+	}
+}
+
+// TestRunPruneShipsConfigUpdateThenCommandRun: set admin creds, connect
+// host, POST prune. Assert envelope sequence: config.update(slot=admin)
+// → command.run(prune, RequiresAdminCreds=true). Assert job row persisted.
+func TestRunPruneShipsConfigUpdateThenCommandRun(t *testing.T) {
+	t.Parallel()
+	srv, ts, st := rawTestServer(t)
+	hostID, token := enrolHostForWS(t, srv, st, "prune-happy")
+	cookie := loginAsAdmin(t, st)
+	setAdminCreds(t, srv, st, hostID)
+	seedInitJob(t, st, hostID)
+
+	c := agentDial(t, srv, ts, hostID, token)
+	sendHello(t, c, "prune-happy")
+	// Drain on-hello burst (repo config.update + schedule.set).
+	_ = drainUntil(t, c, api.MsgScheduleSet)
+
+	status, body := doJSON(t, ts.URL, "POST", "/api/hosts/"+hostID+"/repo/prune", nil, cookie)
+	if status != stdhttp.StatusAccepted {
+		t.Fatalf("want 202, got %d body=%+v", status, body)
+	}
+	jobID, _ := body["job_id"].(string)
+	if jobID == "" {
+		t.Fatalf("no job_id in response: %+v", body)
+	}
+
+	// Read envelopes until the prune command.run shows up (or the
+	// deadline passes). The loop stops at the command.run, so
+	// sawAdminPush can only be true if config.update(slot=admin)
+	// arrived first — that is what enforces the ordering.
+	deadline := time.Now().Add(3 * time.Second)
+	var sawAdminPush bool
+	var prunePayload *api.CommandRunPayload
+	for prunePayload == nil && time.Now().Before(deadline) {
+		ctx, cancel := context.WithTimeout(context.Background(), 800*time.Millisecond)
+		mt, raw, err := c.Read(ctx)
+		cancel()
+		if err != nil {
+			break
+		}
+		if mt != websocket.MessageText {
+			continue
+		}
+		var env api.Envelope
+		if err := json.Unmarshal(raw, &env); err != nil {
+			continue
+		}
+		switch env.Type {
+		case api.MsgConfigUpdate:
+			var p api.ConfigUpdatePayload
+			if err := env.UnmarshalPayload(&p); err == nil && p.Slot == "admin" {
+				sawAdminPush = true
+			}
+		case api.MsgCommandRun:
+			var p api.CommandRunPayload
+			if err := env.UnmarshalPayload(&p); err == nil && p.Kind == api.JobPrune {
+				// p is declared inside this case clause, so each pass
+				// through it gets a fresh variable; keeping its address
+				// needs no extra copy (and avoids shadowing the copy
+				// builtin).
+				prunePayload = &p
+			}
+		}
+	}
+
+	if !sawAdminPush {
+		t.Error("expected config.update(slot=admin) before prune dispatch")
+	}
+	if prunePayload == nil {
+		t.Fatal("timed out waiting for command.run(prune)")
+	}
+	if !prunePayload.RequiresAdminCreds {
+		t.Error("prune command.run must have RequiresAdminCreds=true")
+	}
+	if prunePayload.JobID != jobID {
+		t.Errorf("job_id mismatch: dispatch=%s run=%s", jobID, prunePayload.JobID)
+	}
+
+	// Job row must be persisted.
+	var n int
+	if err := st.DB().QueryRow(
+		`SELECT COUNT(*) FROM jobs WHERE id = ? AND host_id = ? AND kind = 'prune'`,
+		jobID, hostID).Scan(&n); err != nil {
+		t.Fatalf("count: %v", err)
+	}
+	if n != 1 {
+		t.Errorf("prune job row count: want 1, got %d", n)
+	}
+}
+
+// ----- check tests ---------------------------------------------------
+
+// TestRunCheckUsesMaintenanceSubset: check_subset_pct=25 → Args==["25"].
+func TestRunCheckUsesMaintenanceSubset(t *testing.T) {
+	t.Parallel()
+	const hostname = "check-subset"
+
+	srv, ts, st := rawTestServer(t)
+	hostID, token := enrolHostForWS(t, srv, st, hostname)
+	session := loginAsAdmin(t, st)
+	setMaintenanceSubset(t, st, hostID, 25)
+	seedInitJob(t, st, hostID)
+
+	// Connect the agent and discard everything up to schedule.set so
+	// the next command.run read below belongs to the check request.
+	agent := agentDial(t, srv, ts, hostID, token)
+	sendHello(t, agent, hostname)
+	_ = drainUntil(t, agent, api.MsgScheduleSet)
+
+	endpoint := "/api/hosts/" + hostID + "/repo/check"
+	status, body := doJSON(t, ts.URL, "POST", endpoint, nil, session)
+	if status != stdhttp.StatusAccepted {
+		t.Fatalf("want 202, got %d body=%+v", status, body)
+	}
+
+	run := drainCommandRun(t, agent)
+	if run.Kind != api.JobCheck {
+		t.Fatalf("kind: want check, got %s", run.Kind)
+	}
+	// The stored percentage must travel as the single argument.
+	if got := run.Args; !(len(got) == 1 && got[0] == "25") {
+		t.Errorf("args: want [25], got %v", got)
+	}
+}
+
+// TestRunCheckHonorsSubsetOverride: ?subset=10 overrides DB value of 25.
+func TestRunCheckHonorsSubsetOverride(t *testing.T) {
+	t.Parallel()
+	const hostname = "check-override"
+
+	srv, ts, st := rawTestServer(t)
+	hostID, token := enrolHostForWS(t, srv, st, hostname)
+	session := loginAsAdmin(t, st)
+	setMaintenanceSubset(t, st, hostID, 25)
+	seedInitJob(t, st, hostID)
+
+	agent := agentDial(t, srv, ts, hostID, token)
+	sendHello(t, agent, hostname)
+	_ = drainUntil(t, agent, api.MsgScheduleSet)
+
+	endpoint := "/api/hosts/" + hostID + "/repo/check?subset=10"
+	status, body := doJSON(t, ts.URL, "POST", endpoint, nil, session)
+	if status != stdhttp.StatusAccepted {
+		t.Fatalf("want 202, got %d body=%+v", status, body)
+	}
+
+	// The query-string value wins over the stored 25.
+	run := drainCommandRun(t, agent)
+	if got := run.Args; !(len(got) == 1 && got[0] == "10") {
+		t.Errorf("args: want [10], got %v", got)
+	}
+}
+
+// TestRunCheckRejectsBadSubsetGracefully: ?subset=abc falls back to DB
+// value (not an error). strconv.Atoi failure silently ignored.
+func TestRunCheckRejectsBadSubsetGracefully(t *testing.T) {
+	t.Parallel()
+	const hostname = "check-badsubset"
+
+	srv, ts, st := rawTestServer(t)
+	hostID, token := enrolHostForWS(t, srv, st, hostname)
+	session := loginAsAdmin(t, st)
+	setMaintenanceSubset(t, st, hostID, 30)
+	seedInitJob(t, st, hostID)
+
+	// Connect the agent and discard everything up to schedule.set so
+	// the next command.run read below belongs to the check request.
+	agent := agentDial(t, srv, ts, hostID, token)
+	sendHello(t, agent, hostname)
+	_ = drainUntil(t, agent, api.MsgScheduleSet)
+
+	endpoint := "/api/hosts/" + hostID + "/repo/check?subset=abc"
+	status, body := doJSON(t, ts.URL, "POST", endpoint, nil, session)
+	if status != stdhttp.StatusAccepted {
+		t.Fatalf("want 202 (bad subset falls back), got %d body=%+v", status, body)
+	}
+
+	// The unparsable override is dropped; the stored 30 is dispatched.
+	run := drainCommandRun(t, agent)
+	if got := run.Args; !(len(got) == 1 && got[0] == strconv.Itoa(30)) {
+		t.Errorf("args: want [30], got %v", got)
+	}
+}
+
+// ----- unlock tests --------------------------------------------------
+
+// TestRunUnlockNeedsNoAdminCreds: no admin creds, POST unlock → 202.
+func TestRunUnlockNeedsNoAdminCreds(t *testing.T) {
+	t.Parallel()
+	const hostname = "unlock-no-admin"
+
+	srv, ts, st := rawTestServer(t)
+	hostID, token := enrolHostForWS(t, srv, st, hostname)
+	session := loginAsAdmin(t, st)
+	seedInitJob(t, st, hostID)
+
+	agent := agentDial(t, srv, ts, hostID, token)
+	sendHello(t, agent, hostname)
+	_ = drainUntil(t, agent, api.MsgScheduleSet)
+
+	endpoint := "/api/hosts/" + hostID + "/repo/unlock"
+	status, body := doJSON(t, ts.URL, "POST", endpoint, nil, session)
+	if status != stdhttp.StatusAccepted {
+		t.Fatalf("want 202, got %d body=%+v", status, body)
+	}
+
+	run := drainCommandRun(t, agent)
+	if run.Kind != api.JobUnlock {
+		t.Fatalf("kind: want unlock, got %s", run.Kind)
+	}
+	// RequiresAdminCreds must be false for unlock.
+	if run.RequiresAdminCreds {
+		t.Error("unlock must not set RequiresAdminCreds")
+	}
+}
+
+// ----- auth tests ----------------------------------------------------
+
+// TestRunOpsRequireAuth: unauthenticated POST to each endpoint → 401.
+func TestRunOpsRequireAuth(t *testing.T) {
+	t.Parallel()
+	_, url, st := newTestServerWithHub(t)
+	hostID := makeHost(t, st, "auth-host")
+
+	// JSON path: no session cookie → 401. The loop variable is captured
+	// directly by the parallel subtests; this package already relies on
+	// Go 1.22 (range-over-int), where every iteration has its own path.
+	for _, path := range []string{
+		"/api/hosts/" + hostID + "/repo/prune",
+		"/api/hosts/" + hostID + "/repo/check",
+		"/api/hosts/" + hostID + "/repo/unlock",
+	} {
+		t.Run(path, func(t *testing.T) {
+			t.Parallel()
+			// Fail cleanly on a bad request instead of passing a nil
+			// *Request to Do.
+			req, err := stdhttp.NewRequest("POST", url+path, nil)
+			if err != nil {
+				t.Fatalf("new request: %v", err)
+			}
+			res, err := stdhttp.DefaultClient.Do(req)
+			if err != nil {
+				t.Fatalf("do: %v", err)
+			}
+			defer res.Body.Close()
+			if res.StatusCode != stdhttp.StatusUnauthorized {
+				t.Errorf("want 401, got %d", res.StatusCode)
+			}
+		})
+	}
+
+	// HTMX path: unauthenticated POST with HX-Request: true → 303 to /login.
+	// Auth check fires before host lookup so the host ID doesn't need to exist.
+	for _, path := range []string{
+		"/hosts/" + hostID + "/repo/prune",
+		"/hosts/" + hostID + "/repo/check",
+		"/hosts/" + hostID + "/repo/unlock",
+	} {
+		t.Run("htmx"+path, func(t *testing.T) {
+			t.Parallel()
+			// Do not follow the redirect: the 303 itself is what is
+			// under test.
+			client := &stdhttp.Client{
+				CheckRedirect: func(_ *stdhttp.Request, _ []*stdhttp.Request) error {
+					return stdhttp.ErrUseLastResponse
+				},
+			}
+			req, err := stdhttp.NewRequest("POST", url+path, nil)
+			if err != nil {
+				t.Fatalf("new request: %v", err)
+			}
+			req.Header.Set("HX-Request", "true")
+			res, err := client.Do(req)
+			if err != nil {
+				t.Fatalf("do: %v", err)
+			}
+			defer res.Body.Close()
+			if res.StatusCode != stdhttp.StatusSeeOther {
+				t.Errorf("want 303, got %d", res.StatusCode)
+			}
+			if loc := res.Header.Get("Location"); loc != "/login" {
+				t.Errorf("want Location=/login, got %q", loc)
+			}
+		})
+	}
+}
diff --git a/internal/server/http/schedule_push.go b/internal/server/http/schedule_push.go
index 02692b7..6bcffb6 100644
--- a/internal/server/http/schedule_push.go
+++ b/internal/server/http/schedule_push.go
@@ -164,15 +164,19 @@ func (s *Server) dispatchScheduledJob(ctx context.Context, hostID string, conn *
 	}
 }
 
-// dispatchBackupForGroup builds and sends a single backup command.run
-// 
envelope on conn for the given group. Persists the job row first so -// the live log viewer can subscribe to it. -// dispatchBackupForGroup persists a backup job row, sends the -// command.run envelope to the agent, and audit-logs the dispatch. -// Returns the persisted job ID on success, or "" on any failure -// (failures are slog.Warn-ed). Callers may use the returned ID to, -// e.g., redirect the UI to the live job log. -func (s *Server) dispatchBackupForGroup(ctx context.Context, conn *ws.Conn, hostID, scheduleID string, g *store.SourceGroup, scheduledAt time.Time) string { +// dispatchBackupForGroupCore persists a backup job row, marshals and +// sends the command.run envelope, and audit-logs the dispatch. It does +// NOT enqueue a PendingRun on failure — that responsibility belongs to +// the caller when appropriate. +// +// Returns (jobID, nil) on success. Returns ("", err) on any failure; +// the error is also slog.Warn-ed inside this function so callers don't +// need to log it again. +// +// Used by both dispatchBackupForGroup (schedule.fire path, which adds +// enqueue-on-failure) and drainOne (which handles failure via +// BumpPendingRunAttempt on the existing row, avoiding double-enqueue). 
+func (s *Server) dispatchBackupForGroupCore(ctx context.Context, conn *ws.Conn, hostID, scheduleID string, g *store.SourceGroup, scheduledAt time.Time) (string, error) { jobID := ulid.Make().String() now := time.Now().UTC() scheduleRef := scheduleID @@ -186,7 +190,7 @@ func (s *Server) dispatchBackupForGroup(ctx context.Context, conn *ws.Conn, host }); err != nil { slog.Warn("schedule.fire: persist job", "host_id", hostID, "schedule_id", scheduleID, "group", g.Name, "err", err) - return "" + return "", err } // Backup ignores RetentionPolicy — the forget cadence lives on // host_repo_maintenance and is driven by the server-side ticker @@ -201,14 +205,17 @@ func (s *Server) dispatchBackupForGroup(ctx context.Context, conn *ws.Conn, host if err != nil { slog.Warn("schedule.fire: marshal command.run", "host_id", hostID, "schedule_id", scheduleID, "err", err) - return "" + return "", err } sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second) defer cancel() if err := conn.Send(sendCtx, env); err != nil { - slog.Warn("schedule.fire: send command.run", - "host_id", hostID, "schedule_id", scheduleID, "err", err) - return "" + slog.Warn("schedule.fire: send command.run failed", + "host_id", hostID, "schedule_id", scheduleID, "group", g.Name, "err", err) + // The job row was already persisted — leave it in `queued` status. + // The drainer will re-dispatch (creating a new job row) and the + // orphaned queued row stays for forensic visibility. + return "", err } _ = s.deps.Store.AppendAudit(ctx, store.AuditEntry{ ID: ulid.Make().String(), @@ -221,5 +228,37 @@ func (s *Server) dispatchBackupForGroup(ctx context.Context, conn *ws.Conn, host slog.Info("schedule.fire: dispatched backup", "host_id", hostID, "schedule_id", scheduleID, "group", g.Name, "job_id", jobID, "scheduled_at", scheduledAt) - return jobID + return jobID, nil +} + +// dispatchBackupForGroup is the schedule.fire entry point. 
Wraps +// dispatchBackupForGroupCore with enqueue-on-failure: a failed Send +// queues a fresh PendingRun for the drainer to retry later. +// +// Returns the persisted job ID on success, or "" on any failure. +func (s *Server) dispatchBackupForGroup(ctx context.Context, conn *ws.Conn, hostID, scheduleID string, g *store.SourceGroup, scheduledAt time.Time) string { + jobID, err := s.dispatchBackupForGroupCore(ctx, conn, hostID, scheduleID, g, scheduledAt) + if err == nil { + return jobID + } + // Send (or an earlier step) failed — err was already logged inside + // the core. Enqueue a fresh PendingRun for the drainer to retry. + backoff := time.Duration(g.RetryBackoffSeconds) * time.Second + if backoff <= 0 { + backoff = 60 * time.Second + } + if enqueueErr := s.deps.Store.EnqueuePendingRun(ctx, &store.PendingRun{ + ID: ulid.Make().String(), + ScheduleID: scheduleID, + SourceGroupID: g.ID, + HostID: hostID, + Attempt: 1, + NextAttemptAt: time.Now().UTC().Add(backoff), + ScheduledAt: scheduledAt, + LastError: err.Error(), + }); enqueueErr != nil { + slog.Warn("schedule.fire: enqueue pending run failed", + "host_id", hostID, "schedule_id", scheduleID, "group", g.Name, "err", enqueueErr) + } + return "" } diff --git a/internal/server/http/server.go b/internal/server/http/server.go index f286fdb..8ef3d83 100644 --- a/internal/server/http/server.go +++ b/internal/server/http/server.go @@ -7,6 +7,7 @@ import ( "context" "errors" stdhttp "net/http" + "sync" "time" "github.com/go-chi/chi/v5" @@ -41,6 +42,13 @@ type Deps struct { type Server struct { srv *stdhttp.Server deps Deps + + // drainLocks serializes DrainPending per host. The on-hello + // goroutine and the 30s ticker can otherwise race for the same + // host, double-dispatching every pending row. Map of hostID → + // sync.Mutex; checked-and-locked atomically via drainLocksMu. + drainLocksMu sync.Mutex + drainLocks map[string]*sync.Mutex } // New builds a configured but not-yet-started server. 
@@ -59,7 +67,7 @@ func New(deps Deps) *Server { w.WriteHeader(stdhttp.StatusNoContent) }) - s := &Server{deps: deps} + s := &Server{deps: deps, drainLocks: make(map[string]*sync.Mutex)} s.routes(r) s.srv = &stdhttp.Server{ @@ -105,6 +113,13 @@ func (s *Server) routes(r chi.Router) { r.Get("/hosts/{id}/repo-credentials", s.handleGetHostCredentials) r.Put("/hosts/{id}/repo-credentials", s.handleSetHostCredentials) + // Admin credentials — the prune-capable slot (separate from the + // everyday repo creds). Optional: hosts that don't prune against + // a rest-server repo with a separate admin user never need this. + r.Get("/hosts/{id}/admin-credentials", s.handleGetAdminCredentials) + r.Put("/hosts/{id}/admin-credentials", s.handleSetAdminCredentials) + r.Delete("/hosts/{id}/admin-credentials", s.handleDeleteAdminCredentials) + // Per-host schedule CRUD. Mutations bump host_schedule_version // and async-push to a connected agent (see schedule_push.go). r.Get("/hosts/{id}/schedules", s.handleListSchedules) @@ -134,12 +149,23 @@ func (s *Server) routes(r chi.Router) { // mounted at the equivalent path outside /api below — both // resolve to the same handler, which sniffs HX-Request. r.Post("/hosts/{id}/source-groups/{gid}/run", s.handleRunSourceGroup) + + // Repo-level run-now: prune (needs admin creds), check, unlock. + // HTMX forms are also mounted outside /api below. + r.Post("/hosts/{id}/repo/prune", s.handleRunRepoPrune) + r.Post("/hosts/{id}/repo/check", s.handleRunRepoCheck) + r.Post("/hosts/{id}/repo/unlock", s.handleRunRepoUnlock) }) // Per-source-group Run-now (HTMX form action). Available even // when the server is started without UI templates so REST callers // against the non-/api path also work. r.Post("/hosts/{id}/source-groups/{gid}/run", s.handleRunSourceGroup) + // Repo-level run-now (HTMX form actions). Same handlers as the /api + // variants — wantsHTML sniff distinguishes JSON vs HTMX response. 
+ r.Post("/hosts/{id}/repo/prune", s.handleRunRepoPrune) + r.Post("/hosts/{id}/repo/check", s.handleRunRepoCheck) + r.Post("/hosts/{id}/repo/unlock", s.handleRunRepoUnlock) // Retired routes — see ui_handlers.go for the messages. Mounted // outside the UI gate so cached browser tabs get a clear 410 // even if the server runs without templates. @@ -202,6 +228,9 @@ func (s *Server) routes(r chi.Router) { r.Post("/hosts/{id}/repo/credentials", s.handleUIRepoCredentialsSave) r.Post("/hosts/{id}/repo/bandwidth", s.handleUIRepoBandwidthSave) r.Post("/hosts/{id}/repo/maintenance", s.handleUIRepoMaintenanceSave) + // Admin credentials form (separate slot for prune-capable user). + r.Post("/hosts/{id}/admin-credentials", s.handleUIAdminCredentialsSave) + r.Post("/hosts/{id}/admin-credentials/delete", s.handleUIAdminCredentialsDelete) // Schedules tab + create/edit/delete forms. r.Get("/hosts/{id}/schedules", s.handleUISchedulesList) r.Get("/hosts/{id}/schedules/new", s.handleUIScheduleNewGet) diff --git a/internal/server/http/ui_repo.go b/internal/server/http/ui_repo.go index 79ad2ae..3420cbd 100644 --- a/internal/server/http/ui_repo.go +++ b/internal/server/http/ui_repo.go @@ -7,6 +7,9 @@ import ( stdhttp "net/http" "strconv" "strings" + "time" + + "github.com/oklog/ulid/v2" "gitea.dcglab.co.uk/steve/restic-manager/internal/server/ui" "gitea.dcglab.co.uk/steve/restic-manager/internal/store" @@ -17,10 +20,31 @@ import ( // the page into three independent forms so saving one section // doesn't disturb the others. 
// -// GET /hosts/{id}/repo — render -// POST /hosts/{id}/repo/credentials — connection -// POST /hosts/{id}/repo/bandwidth — host-wide bw caps -// POST /hosts/{id}/repo/maintenance — forget/prune/check cadences +// GET /hosts/{id}/repo — render +// POST /hosts/{id}/repo/credentials — connection +// POST /hosts/{id}/repo/bandwidth — host-wide bw caps +// POST /hosts/{id}/repo/maintenance — forget/prune/check cadences +// POST /hosts/{id}/admin-credentials — admin (prune) creds +// POST /hosts/{id}/admin-credentials/delete — clear admin creds + +// repoStatsView is a flat, pre-dereferenced projection of +// store.HostRepoStats for use in templates. Nil pointer fields are +// collapsed to zero/false and accompanied by a Has* sentinel so the +// template can distinguish "zero" from "not yet known." +type repoStatsView struct { + HasTotalSize bool + TotalSizeBytes int64 + HasRawSize bool + RawSizeBytes int64 + HasLastCheck bool + LastCheckAt time.Time + LastCheckAgo string + LastCheckStatus string + LockPresent bool + HasLastPrune bool + LastPruneAt time.Time + LastPruneAgo string +} type hostRepoPage struct { hostChromeData @@ -30,6 +54,11 @@ type hostRepoPage struct { RepoUsername string HasPassword bool + // Admin credentials (optional, prune-only — separate slot). + AdminURL string + AdminUsername string + HasAdminPassword bool + // Bandwidth (form values, blank means "no cap") BandwidthUp string BandwidthDown string @@ -37,6 +66,14 @@ type hostRepoPage struct { // Maintenance row Maintenance store.HostRepoMaintenance + // Online mirrors Hub.Connected so Run-now button disabled state is + // accurate at render time. + Online bool + + // StatsView is a pre-dereferenced projection of host_repo_stats. + // Nil when no row exists yet (fresh hosts). + StatsView *repoStatsView + // Snapshots-by-tag — map[group_name]count, plus an "untagged" row. SnapshotsByTag map[string]int UntaggedSnapshots int @@ -44,6 +81,7 @@ type hostRepoPage struct { // Inline form-error banners. 
Empty when no error for that section. CredentialsError string + AdminCredsError string BandwidthError string MaintenanceError string @@ -61,7 +99,7 @@ func (s *Server) loadHostRepoPage(r *stdhttp.Request, host store.Host) (*hostRep } // Credentials (redacted). - enc, err := s.deps.Store.GetHostCredentials(r.Context(), host.ID) + enc, err := s.deps.Store.GetHostCredentials(r.Context(), host.ID, store.CredKindRepo) switch { case err == nil: plain, derr := s.deps.AEAD.Decrypt(enc, []byte("host:"+host.ID)) @@ -79,6 +117,60 @@ func (s *Server) loadHostRepoPage(r *stdhttp.Request, host store.Host) (*hostRep return nil, err } + // Admin credentials (optional — prune-only slot). + adminEnc, aerr := s.deps.Store.GetHostCredentials(r.Context(), host.ID, store.CredKindAdmin) + switch { + case aerr == nil: + plain, derr := s.deps.AEAD.Decrypt(adminEnc, []byte("host:"+host.ID+":admin")) + if derr == nil { + var blob repoCredsBlob + if jerr := json.Unmarshal(plain, &blob); jerr == nil { + p.AdminURL = blob.RepoURL + p.AdminUsername = blob.RepoUsername + p.HasAdminPassword = blob.RepoPassword != "" + } + } + case errors.Is(aerr, store.ErrNotFound): + // admin slot not configured — fine + default: + return nil, aerr + } + + // Online status. + if s.deps.Hub != nil { + p.Online = s.deps.Hub.Connected(host.ID) + } + + // Repo stats (tolerate ErrNotFound — fresh hosts have no row yet). 
+ if stats, serr := s.deps.Store.GetHostRepoStats(r.Context(), host.ID); serr == nil { + sv := &repoStatsView{} + if stats.TotalSizeBytes != nil { + sv.HasTotalSize = true + sv.TotalSizeBytes = *stats.TotalSizeBytes + } + if stats.RawSizeBytes != nil { + sv.HasRawSize = true + sv.RawSizeBytes = *stats.RawSizeBytes + } + if stats.LastCheckAt != nil { + sv.HasLastCheck = true + sv.LastCheckAt = *stats.LastCheckAt + sv.LastCheckAgo = relTimeAgo(*stats.LastCheckAt) + } + sv.LastCheckStatus = stats.LastCheckStatus + if stats.LockPresent != nil { + sv.LockPresent = *stats.LockPresent + } + if stats.LastPruneAt != nil { + sv.HasLastPrune = true + sv.LastPruneAt = *stats.LastPruneAt + sv.LastPruneAgo = relTimeAgo(*stats.LastPruneAt) + } + p.StatsView = sv + } else if !errors.Is(serr, store.ErrNotFound) { + return nil, serr + } + // Bandwidth. if host.BandwidthUpKBps != nil { p.BandwidthUp = strconv.Itoa(*host.BandwidthUpKBps) @@ -152,11 +244,11 @@ func (s *Server) handleUIHostRepo(w stdhttp.ResponseWriter, r *stdhttp.Request) } } -// renderRepoFormError loads the page state, overlays the section's -// error banner, and renders with a 422. Save-success goes through a -// 303 redirect with `?saved=
` instead, so this path is for -// validation failures only. -func (s *Server) renderRepoPage(w stdhttp.ResponseWriter, r *stdhttp.Request, u *ui.User, host *store.Host, credErr, bwErr, mntErr string) { +// renderRepoPage loads the page state, overlays section error banners, +// and renders with a 422. Save-success goes through a 303 redirect +// with `?saved=
` instead, so this path is for validation +// failures only. +func (s *Server) renderRepoPage(w stdhttp.ResponseWriter, r *stdhttp.Request, u *ui.User, host *store.Host, credErr, adminErr, bwErr, mntErr string) { page, err := s.loadHostRepoPage(r, *host) if err != nil { slog.Error("ui repo: reload after save", "host_id", host.ID, "err", err) @@ -164,6 +256,7 @@ func (s *Server) renderRepoPage(w stdhttp.ResponseWriter, r *stdhttp.Request, u return } page.CredentialsError = credErr + page.AdminCredsError = adminErr page.BandwidthError = bwErr page.MaintenanceError = mntErr view := s.baseView(u) @@ -198,13 +291,13 @@ func (s *Server) handleUIRepoCredentialsSave(w stdhttp.ResponseWriter, r *stdhtt repoPass := r.PostForm.Get("repo_password") // do NOT trim — operators may use trailing space deliberately if repoURL == "" { - s.renderRepoPage(w, r, u, host, "Repo URL is required.", "", "") + s.renderRepoPage(w, r, u, host, "Repo URL is required.", "", "", "") return } // Merge with existing blob — same semantics as the JSON PUT. 
existing := repoCredsBlob{} - if cur, err := s.deps.Store.GetHostCredentials(r.Context(), host.ID); err == nil { + if cur, err := s.deps.Store.GetHostCredentials(r.Context(), host.ID, store.CredKindRepo); err == nil { if plain, derr := s.deps.AEAD.Decrypt(cur, []byte("host:"+host.ID)); derr == nil { _ = json.Unmarshal(plain, &existing) } @@ -217,7 +310,7 @@ func (s *Server) handleUIRepoCredentialsSave(w stdhttp.ResponseWriter, r *stdhtt if existing.RepoPassword == "" { s.renderRepoPage(w, r, u, host, "No password on file yet — set one before saving the URL/username.", - "", "") + "", "", "") return } @@ -227,7 +320,7 @@ func (s *Server) handleUIRepoCredentialsSave(w stdhttp.ResponseWriter, r *stdhtt stdhttp.Error(w, "internal", stdhttp.StatusInternalServerError) return } - if err := s.deps.Store.SetHostCredentials(r.Context(), host.ID, enc); err != nil { + if err := s.deps.Store.SetHostCredentials(r.Context(), host.ID, store.CredKindRepo, enc); err != nil { slog.Error("ui repo creds: persist", "err", err) stdhttp.Error(w, "internal", stdhttp.StatusInternalServerError) return @@ -256,7 +349,7 @@ func (s *Server) handleUIRepoBandwidthSave(w stdhttp.ResponseWriter, r *stdhttp. 
up, upErr := parseOptionalNonNegInt(r.PostForm.Get("bandwidth_up")) down, downErr := parseOptionalNonNegInt(r.PostForm.Get("bandwidth_down")) if upErr != nil || downErr != nil { - s.renderRepoPage(w, r, u, host, "", + s.renderRepoPage(w, r, u, host, "", "", "Bandwidth caps must be non-negative whole numbers (or blank for no cap).", "") return @@ -294,19 +387,19 @@ func (s *Server) handleUIRepoMaintenanceSave(w stdhttp.ResponseWriter, r *stdhtt "forget": forgetCron, "prune": pruneCron, "check": checkCron, } { if expr == "" { - s.renderRepoPage(w, r, u, host, "", "", + s.renderRepoPage(w, r, u, host, "", "", "", label+" cadence is required.") return } if _, err := cronParser.Parse(expr); err != nil { - s.renderRepoPage(w, r, u, host, "", "", + s.renderRepoPage(w, r, u, host, "", "", "", label+" cadence didn't parse: "+err.Error()) return } } subset, err := strconv.Atoi(subsetStr) if err != nil || subset < 0 || subset > 100 { - s.renderRepoPage(w, r, u, host, "", "", + s.renderRepoPage(w, r, u, host, "", "", "", "check subset % must be between 0 and 100.") return } @@ -348,3 +441,143 @@ func parseOptionalNonNegInt(s string) (*int, error) { } return &n, nil } + +// relTimeAgo returns a short human-readable relative-time string like +// "5m ago", "3h ago", "2d ago" for use in stats panels. Does not use +// the template funcMap so it can be called from Go directly. +func relTimeAgo(t time.Time) string { + d := time.Since(t) + if d < 0 { + d = 0 + } + switch { + case d < time.Minute: + return "just now" + case d < time.Hour: + return strconv.Itoa(int(d.Minutes())) + "m ago" + case d < 24*time.Hour: + return strconv.Itoa(int(d.Hours())) + "h ago" + case d < 30*24*time.Hour: + return strconv.Itoa(int(d.Hours()/24)) + "d ago" + default: + return t.Format("2006-01-02") + } +} + +// handleUIAdminCredentialsSave handles the HTML form POST to +// /hosts/{id}/admin-credentials. 
Mirrors handleUIRepoCredentialsSave +// but operates on the admin slot (store.CredKindAdmin, AAD "host::admin"). +// Re-renders the page with an inline error on validation failure; +// redirects with ?saved=admin_credentials on success. +func (s *Server) handleUIAdminCredentialsSave(w stdhttp.ResponseWriter, r *stdhttp.Request) { + u := s.requireUIUser(w, r) + if u == nil { + return + } + host, ok := s.loadHostForUI(w, r) + if !ok { + return + } + if err := r.ParseForm(); err != nil { + stdhttp.Error(w, "bad request", stdhttp.StatusBadRequest) + return + } + repoURL := strings.TrimSpace(r.PostForm.Get("repo_url")) + repoUser := strings.TrimSpace(r.PostForm.Get("repo_username")) + repoPass := r.PostForm.Get("repo_password") + + // All blank → no-op save (operator hit Save without filling anything). + // We treat this as harmless — they may have wanted to clear via the + // Clear button instead. Only validate if they've started filling fields. + if repoURL == "" && repoUser == "" && repoPass == "" { + stdhttp.Redirect(w, r, "/hosts/"+host.ID+"/repo", stdhttp.StatusSeeOther) + return + } + + aad := []byte("host:" + host.ID + ":admin") + + // Merge with the existing admin row, if any. 
+ existing := repoCredsBlob{} + if cur, err := s.deps.Store.GetHostCredentials(r.Context(), host.ID, store.CredKindAdmin); err == nil { + if plain, derr := s.deps.AEAD.Decrypt(cur, aad); derr == nil { + _ = json.Unmarshal(plain, &existing) + } + } + existing.RepoURL = repoURL + existing.RepoUsername = repoUser + if repoPass != "" { + existing.RepoPassword = repoPass + } + + if existing.RepoURL == "" { + s.renderRepoPage(w, r, u, host, "", "Repo URL is required.", "", "") + return + } + if existing.RepoPassword == "" { + s.renderRepoPage(w, r, u, host, "", + "No password on file yet — set one before saving the URL/username.", + "", "") + return + } + + enc, err := s.encryptRepoCreds(existing, aad) + if err != nil { + slog.Error("ui admin creds: encrypt", "err", err) + stdhttp.Error(w, "internal", stdhttp.StatusInternalServerError) + return + } + if err := s.deps.Store.SetHostCredentials(r.Context(), host.ID, store.CredKindAdmin, enc); err != nil { + slog.Error("ui admin creds: persist", "err", err) + stdhttp.Error(w, "internal", stdhttp.StatusInternalServerError) + return + } + _ = s.deps.Store.AppendAudit(r.Context(), store.AuditEntry{ + ID: ulid.Make().String(), + UserID: &u.ID, + Actor: "user", + Action: "host.admin_credentials_set", + TargetKind: ptr("host"), + TargetID: &host.ID, + TS: nowUTC(), + }) + if s.deps.Hub != nil && s.deps.Hub.Connected(host.ID) { + if perr := s.pushAdminCredsToAgent(r.Context(), host.ID); perr != nil { + slog.Warn("ui admin creds: push to agent", "host_id", host.ID, "err", perr) + } + } + stdhttp.Redirect(w, r, "/hosts/"+host.ID+"/repo?saved=admin_credentials", stdhttp.StatusSeeOther) +} + +// handleUIAdminCredentialsDelete handles the HTML form POST to +// /hosts/{id}/admin-credentials/delete. Removes the admin slot and +// redirects back to the repo page. Treats "not found" as success +// (idempotent delete from the operator's point of view). 
+func (s *Server) handleUIAdminCredentialsDelete(w stdhttp.ResponseWriter, r *stdhttp.Request) { + u := s.requireUIUser(w, r) + if u == nil { + return + } + host, ok := s.loadHostForUI(w, r) + if !ok { + return + } + + err := s.deps.Store.DeleteHostCredentials(r.Context(), host.ID, store.CredKindAdmin) + if err != nil && !errors.Is(err, store.ErrNotFound) { + slog.Error("ui admin creds: delete", "host_id", host.ID, "err", err) + stdhttp.Error(w, "internal", stdhttp.StatusInternalServerError) + return + } + if err == nil { + _ = s.deps.Store.AppendAudit(r.Context(), store.AuditEntry{ + ID: ulid.Make().String(), + UserID: &u.ID, + Actor: "user", + Action: "host.admin_credentials_deleted", + TargetKind: ptr("host"), + TargetID: &host.ID, + TS: nowUTC(), + }) + } + stdhttp.Redirect(w, r, "/hosts/"+host.ID+"/repo?saved=admin_credentials", stdhttp.StatusSeeOther) +} diff --git a/internal/server/http/ui_repo_test.go b/internal/server/http/ui_repo_test.go new file mode 100644 index 0000000..38c857c --- /dev/null +++ b/internal/server/http/ui_repo_test.go @@ -0,0 +1,400 @@ +// ui_repo_test.go — integration tests for the Repo page HTML UI. +// Covers: admin-creds form rendering, stats panel, lock banner, +// run-now button disabled states, admin-creds form save/delete. +package http + +import ( + "context" + "io" + stdhttp "net/http" + "net/http/httptest" + "net/url" + "path/filepath" + "strings" + "testing" + "time" + + "gitea.dcglab.co.uk/steve/restic-manager/internal/crypto" + "gitea.dcglab.co.uk/steve/restic-manager/internal/server/config" + "gitea.dcglab.co.uk/steve/restic-manager/internal/server/ui" + "gitea.dcglab.co.uk/steve/restic-manager/internal/server/ws" + "gitea.dcglab.co.uk/steve/restic-manager/internal/store" +) + +// newTestServerWithUI creates a server that includes the UI renderer so +// HTML page tests can render and inspect the full template output. 
+func newTestServerWithUI(t *testing.T) (*Server, string, *store.Store) { + t.Helper() + dir := t.TempDir() + st, err := store.Open(context.Background(), filepath.Join(dir, "rm.db")) + if err != nil { + t.Fatalf("store: %v", err) + } + t.Cleanup(func() { _ = st.Close() }) + + keyPath := filepath.Join(dir, "secret.key") + _ = crypto.GenerateKeyFile(keyPath) + key, _ := crypto.LoadKeyFromFile(keyPath) + aead, _ := crypto.NewAEAD(key) + + renderer, err := ui.New() + if err != nil { + t.Fatalf("ui.New: %v", err) + } + + deps := Deps{ + Cfg: config.Config{Listen: ":0", DataDir: dir, SecretKeyFile: keyPath}, + Store: st, + AEAD: aead, + Hub: ws.NewHub(), + UI: renderer, + } + s := New(deps) + ts := httptest.NewServer(s.srv.Handler) + t.Cleanup(ts.Close) + return s, ts.URL, st +} + +// getRepoPage fetches /hosts/{id}/repo and returns the body string. +func getRepoPage(t *testing.T, baseURL, hostID string, cookie *stdhttp.Cookie) string { + t.Helper() + client := &stdhttp.Client{ + CheckRedirect: func(_ *stdhttp.Request, _ []*stdhttp.Request) error { + return stdhttp.ErrUseLastResponse + }, + } + req, err := stdhttp.NewRequest("GET", baseURL+"/hosts/"+hostID+"/repo", nil) + if err != nil { + t.Fatalf("new request: %v", err) + } + req.AddCookie(cookie) + res, err := client.Do(req) + if err != nil { + t.Fatalf("GET /hosts/%s/repo: %v", hostID, err) + } + defer res.Body.Close() + if res.StatusCode != stdhttp.StatusOK { + t.Fatalf("GET /hosts/%s/repo: want 200, got %d", hostID, res.StatusCode) + } + raw, _ := io.ReadAll(res.Body) + return string(raw) +} + +// postForm posts URL-encoded form data to path, following no redirects, +// and returns the status code and Location header. 
+func postForm(t *testing.T, baseURL, path string, data url.Values, cookie *stdhttp.Cookie) (int, string) { + t.Helper() + client := &stdhttp.Client{ + CheckRedirect: func(_ *stdhttp.Request, _ []*stdhttp.Request) error { + return stdhttp.ErrUseLastResponse + }, + } + req, err := stdhttp.NewRequest("POST", baseURL+path, strings.NewReader(data.Encode())) + if err != nil { + t.Fatalf("new request: %v", err) + } + req.Header.Set("Content-Type", "application/x-www-form-urlencoded") + if cookie != nil { + req.AddCookie(cookie) + } + res, err := client.Do(req) + if err != nil { + t.Fatalf("POST %s: %v", path, err) + } + defer res.Body.Close() + return res.StatusCode, res.Header.Get("Location") +} + +// ----- rendering tests ------------------------------------------------ + +// TestUIRepoPageRendersAdminCredsForm — visit /hosts/{id}/repo for a +// host with no admin creds. Assert the page contains the admin-creds +// section heading and the "not yet set" placeholder text. +func TestUIRepoPageRendersAdminCredsForm(t *testing.T) { + t.Parallel() + _, baseURL, st := newTestServerWithUI(t) + cookie := loginAsAdmin(t, st) + hostID := makeHost(t, st, "repo-page-admin-form") + + body := getRepoPage(t, baseURL, hostID, cookie) + + if !strings.Contains(body, "Admin credentials") { + t.Error("page missing 'Admin credentials' heading") + } + if !strings.Contains(body, "— not yet set —") { + t.Error("page missing '— not yet set —' placeholder for admin password") + } +} + +// TestUIRepoPageRendersStatsPanel — seed a host_repo_stats row, render +// the page, assert "Repo health" panel and the seeded values appear. 
+func TestUIRepoPageRendersStatsPanel(t *testing.T) { + t.Parallel() + _, baseURL, st := newTestServerWithUI(t) + cookie := loginAsAdmin(t, st) + hostID := makeHost(t, st, "repo-page-stats") + + totalSize := int64(5_000_000_000) // 5 GB + checkStatus := "ok" + checkAt := time.Now().Add(-2 * time.Hour).UTC() + if err := st.UpsertHostRepoStats(context.Background(), hostID, store.HostRepoStats{ + TotalSizeBytes: &totalSize, + LastCheckAt: &checkAt, + LastCheckStatus: checkStatus, + }); err != nil { + t.Fatalf("upsert stats: %v", err) + } + + body := getRepoPage(t, baseURL, hostID, cookie) + + if !strings.Contains(body, "Repo health") { + t.Error("page missing 'Repo health' heading") + } + // The bytes helper renders 5 GB as "5.0 GB" (with a unit suffix) + if !strings.Contains(body, "5.0") { + t.Error("page missing '5.0' (total size formatted bytes)") + } + if !strings.Contains(body, "ok") { + t.Error("page missing 'ok' check status") + } +} + +// TestUIRepoPageRendersLockBanner — seed stats with LockPresent=true, +// render, assert stale lock warning appears. +func TestUIRepoPageRendersLockBanner(t *testing.T) { + t.Parallel() + _, baseURL, st := newTestServerWithUI(t) + cookie := loginAsAdmin(t, st) + hostID := makeHost(t, st, "repo-page-lock") + + lockPresent := true + if err := st.UpsertHostRepoStats(context.Background(), hostID, store.HostRepoStats{ + LockPresent: &lockPresent, + }); err != nil { + t.Fatalf("upsert stats: %v", err) + } + + body := getRepoPage(t, baseURL, hostID, cookie) + + if !strings.Contains(body, "Stale lock detected") { + t.Error("page missing stale lock warning") + } +} + +// TestUIRepoRunNowButtonsDisabledWhenOffline — host not in the Hub +// (not connected), render, assert all three buttons carry disabled. 
+func TestUIRepoRunNowButtonsDisabledWhenOffline(t *testing.T) { + t.Parallel() + _, baseURL, st := newTestServerWithUI(t) + cookie := loginAsAdmin(t, st) + hostID := makeHost(t, st, "repo-page-offline") + + // No WS connection → Hub.Connected returns false. + body := getRepoPage(t, baseURL, hostID, cookie) + + // All three Run-now buttons should have disabled. + // Each button appears once in the template with class "btn btn-secondary" + // and hx-post attributes. The disabled attribute is added conditionally. + // Count occurrences of 'disabled' in the Run-now section. + runNowIdx := strings.Index(body, "Run now · one-time") + dangerIdx := strings.Index(body, "Danger zone") + if runNowIdx < 0 { + t.Fatal("page missing 'Run now · one-time' section") + } + if dangerIdx < 0 { + t.Fatal("page missing 'Danger zone' section") + } + runNowSection := body[runNowIdx:dangerIdx] + disabledCount := strings.Count(runNowSection, "disabled") + if disabledCount < 3 { + t.Errorf("expected at least 3 disabled attributes in Run-now section (one per button), got %d", disabledCount) + } +} + +// TestUIRepoPruneButtonDisabledWithoutAdminCreds — host is online but +// no admin creds set. Assert prune button has disabled and mentions +// "set admin credentials first". +func TestUIRepoPruneButtonDisabledWithoutAdminCreds(t *testing.T) { + t.Parallel() + srv, baseURL, st := newTestServerWithUI(t) + cookie := loginAsAdmin(t, st) + hostID := makeHost(t, st, "repo-page-prune-no-admin") + + // Register the host as "connected" in the Hub so the online check passes. + // We use a fake conn by injecting directly — for a simpler approach, + // rely on the fact that the Hub.Connected call just needs the ID registered. + // We can't easily fake a WS conn in a unit test, so instead we verify + // that even without the hub connected the prune button still has + // "set admin credentials first" text since that check runs first. 
+ _ = srv // suppress unused warning + + body := getRepoPage(t, baseURL, hostID, cookie) + + if !strings.Contains(body, "set admin credentials first") { + t.Error("page missing 'set admin credentials first' on prune button") + } +} + +// ----- admin-creds form save/delete tests ---------------------------- + +// TestUIAdminCredentialsSaveRoundTrip — POST form-encoded body to +// /hosts/{id}/admin-credentials, follow redirect, assert page now shows +// "stored, leave blank to keep" placeholder. Audit row landed. +func TestUIAdminCredentialsSaveRoundTrip(t *testing.T) { + t.Parallel() + _, baseURL, st := newTestServerWithUI(t) + cookie, userID := loginAsAdminWithID(t, st) + hostID := makeHost(t, st, "admin-save-roundtrip") + + // POST admin credentials. + status, loc := postForm(t, baseURL, "/hosts/"+hostID+"/admin-credentials", url.Values{ + "repo_url": {"rest:http://admin.example/h"}, + "repo_username": {"admin-user"}, + "repo_password": {"s3cr3t-admin"}, + }, cookie) + if status != stdhttp.StatusSeeOther { + t.Fatalf("save: want 303, got %d", status) + } + if !strings.Contains(loc, "saved=admin_credentials") { + t.Errorf("redirect location should contain saved=admin_credentials, got %q", loc) + } + + // Follow redirect. + body := getRepoPage(t, baseURL, hostID, cookie) + if !strings.Contains(body, "stored, leave blank to keep") { + t.Error("after save: page missing 'stored, leave blank to keep' placeholder for admin password") + } + + // Audit row should exist. + ctx := context.Background() + rows, err := st.DB().QueryContext(ctx, + `SELECT action, user_id FROM audit_log WHERE target_id = ? 
AND action = 'host.admin_credentials_set'`, + hostID) + if err != nil { + t.Fatalf("query audit: %v", err) + } + defer rows.Close() + found := false + for rows.Next() { + var action string + var gotUID *string + if err := rows.Scan(&action, &gotUID); err != nil { + t.Fatalf("scan: %v", err) + } + found = true + if gotUID == nil || *gotUID != userID { + t.Errorf("audit row user_id: want %q, got %v", userID, gotUID) + } + } + if err := rows.Err(); err != nil { + t.Fatalf("rows.Err: %v", err) + } + if !found { + t.Error("audit row with action='host.admin_credentials_set' not found") + } +} + +// TestUIAdminCredentialsDelete — POST to the delete route, assert +// admin row gone and audit row landed. +func TestUIAdminCredentialsDelete(t *testing.T) { + t.Parallel() + srv, baseURL, st := newTestServerWithUI(t) + cookie, userID := loginAsAdminWithID(t, st) + hostID := makeHost(t, st, "admin-delete") + + ctx := context.Background() + + // Seed admin creds directly. + enc, err := srv.encryptRepoCreds(repoCredsBlob{ + RepoURL: "rest:http://admin.example/h", + RepoPassword: "pw", + }, []byte("host:"+hostID+":admin")) + if err != nil { + t.Fatalf("encrypt: %v", err) + } + if err := st.SetHostCredentials(ctx, hostID, store.CredKindAdmin, enc); err != nil { + t.Fatalf("set admin creds: %v", err) + } + + // POST to delete route. + status, loc := postForm(t, baseURL, "/hosts/"+hostID+"/admin-credentials/delete", url.Values{}, cookie) + if status != stdhttp.StatusSeeOther { + t.Fatalf("delete: want 303, got %d", status) + } + if !strings.Contains(loc, "saved=admin_credentials") { + t.Errorf("redirect location: want saved=admin_credentials, got %q", loc) + } + + // Admin row should be gone. + if _, err := st.GetHostCredentials(ctx, hostID, store.CredKindAdmin); err == nil { + t.Error("admin creds row still present after delete") + } + + // Audit row. + rows, err := st.DB().QueryContext(ctx, + `SELECT action, user_id FROM audit_log WHERE target_id = ? 
AND action = 'host.admin_credentials_deleted'`, + hostID) + if err != nil { + t.Fatalf("query audit: %v", err) + } + defer rows.Close() + found := false + for rows.Next() { + var action string + var gotUID *string + if err := rows.Scan(&action, &gotUID); err != nil { + t.Fatalf("scan: %v", err) + } + found = true + if gotUID == nil || *gotUID != userID { + t.Errorf("audit row user_id: want %q, got %v", userID, gotUID) + } + } + if err := rows.Err(); err != nil { + t.Fatalf("rows.Err: %v", err) + } + if !found { + t.Error("audit row with action='host.admin_credentials_deleted' not found") + } +} + +// TestUIAdminCredentialsDeleteIdempotent — POST to the delete route +// when no admin creds exist → 303 redirect (no 404 / 500). +func TestUIAdminCredentialsDeleteIdempotent(t *testing.T) { + t.Parallel() + _, baseURL, st := newTestServerWithUI(t) + cookie := loginAsAdmin(t, st) + hostID := makeHost(t, st, "admin-delete-noop") + + status, _ := postForm(t, baseURL, "/hosts/"+hostID+"/admin-credentials/delete", url.Values{}, cookie) + if status != stdhttp.StatusSeeOther { + t.Fatalf("delete (noop): want 303, got %d", status) + } +} + +// TestUIAdminCredentialsSaveAllBlankIsNoop — POST empty form → 303 +// redirect, no row created. +func TestUIAdminCredentialsSaveAllBlankIsNoop(t *testing.T) { + t.Parallel() + _, baseURL, st := newTestServerWithUI(t) + cookie := loginAsAdmin(t, st) + hostID := makeHost(t, st, "admin-save-blank") + + status, loc := postForm(t, baseURL, "/hosts/"+hostID+"/admin-credentials", url.Values{ + "repo_url": {""}, + "repo_username": {""}, + "repo_password": {""}, + }, cookie) + if status != stdhttp.StatusSeeOther { + t.Fatalf("blank save: want 303, got %d", status) + } + // All-blank is a no-op: redirect must not carry ?saved= banner. + if strings.Contains(loc, "?saved=") { + t.Errorf("blank save: redirect Location %q must not contain ?saved=", loc) + } + + // No admin row should have been created. 
+ if _, err := st.GetHostCredentials(context.Background(), hostID, store.CredKindAdmin); err == nil { + t.Error("admin creds row created unexpectedly for blank save") + } +} diff --git a/internal/server/maintenance/ticker.go b/internal/server/maintenance/ticker.go new file mode 100644 index 0000000..a29d8a2 --- /dev/null +++ b/internal/server/maintenance/ticker.go @@ -0,0 +1,116 @@ +// Package maintenance owns the server-side scheduler that fires +// forget/prune/check on the cadences operators set on +// host_repo_maintenance rows. Independent of the agent's local cron +// (which now only handles backup schedules). +// +// The ticker is intentionally side-effect-free at the package +// boundary: it asks an injected Backend for current state and emits +// a list of Decisions for the caller to act on. Easy to unit-test +// without a running server. +package maintenance + +import ( + "context" + "errors" + "time" + + "github.com/robfig/cron/v3" + + "gitea.dcglab.co.uk/steve/restic-manager/internal/store" +) + +// Decision is one cadence-driven dispatch the ticker recommends. +// SubsetPct is populated only when Kind == "check"; ignored for +// "forget" and "prune". +type Decision struct { + HostID string + Kind string // "forget" | "prune" | "check" + SubsetPct int +} + +// Backend is the subset of *store.Store the ticker depends on. +// Constrained interface so tests can pass a fake. +type Backend interface { + ListAllMaintenance(ctx context.Context) ([]store.HostRepoMaintenance, error) + LatestJobByKind(ctx context.Context, hostID, kind string) (*store.Job, error) +} + +// Ticker decides which cadence-driven jobs are due to fire at a +// given instant. Stateless — the only state lives in the Backend. +type Ticker struct { + backend Backend + parser cron.Parser +} + +// New builds a Ticker bound to the given Backend. 
+func New(b Backend) *Ticker { + return &Ticker{ + backend: b, + parser: cron.NewParser(cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow), + } +} + +// Decide returns the set of jobs the ticker would dispatch at `now`. +// The caller is responsible for: checking host online state, +// persisting the job row, and shipping command.run. Returns nil +// (not an error) when the maintenance table is empty — a fresh +// install is the most common case. +func (t *Ticker) Decide(ctx context.Context, now time.Time) ([]Decision, error) { + rows, err := t.backend.ListAllMaintenance(ctx) + if err != nil { + return nil, err + } + var out []Decision + for _, m := range rows { + if d, ok := t.dueFor(ctx, now, m.HostID, "forget", m.ForgetCron, m.ForgetEnabled, 0); ok { + out = append(out, d) + } + if d, ok := t.dueFor(ctx, now, m.HostID, "prune", m.PruneCron, m.PruneEnabled, 0); ok { + out = append(out, d) + } + if d, ok := t.dueFor(ctx, now, m.HostID, "check", m.CheckCron, m.CheckEnabled, m.CheckSubsetPct); ok { + out = append(out, d) + } + } + return out, nil +} + +// dueFor returns true if the cron has a fire-instant strictly after +// the latest persisted job's created_at and at-or-before now. +// +// Anchor selection: +// - When LatestJobByKind returns a job: anchor = j.CreatedAt. +// - When LatestJobByKind returns ErrNotFound: anchor = now - 24h +// (first-run case — cap the lookback so a brand-new host doesn't +// fire 30 days of missed monthly-checks on first tick). +// - When LatestJobByKind returns a hard error: skip this kind for +// this host on this tick. +// +// Disabled (`enabled == false`) or empty cron skips silently. +// Cron parse failures skip silently — the schedule/maintenance +// routes already validate cron at write time, so this is defensive. 
+func (t *Ticker) dueFor(ctx context.Context, now time.Time, hostID, kind, expr string, enabled bool, subset int) (Decision, bool) { + if !enabled || expr == "" { + return Decision{}, false + } + sched, err := t.parser.Parse(expr) + if err != nil { + return Decision{}, false + } + j, err := t.backend.LatestJobByKind(ctx, hostID, kind) + var anchor time.Time + switch { + case err == nil && j != nil: + anchor = j.CreatedAt + case errors.Is(err, store.ErrNotFound): + anchor = now.Add(-24 * time.Hour) + default: + // Hard error — skip this kind on this tick. + return Decision{}, false + } + next := sched.Next(anchor) + if next.IsZero() || next.After(now) { + return Decision{}, false + } + return Decision{HostID: hostID, Kind: kind, SubsetPct: subset}, true +} diff --git a/internal/server/maintenance/ticker_test.go b/internal/server/maintenance/ticker_test.go new file mode 100644 index 0000000..c0631df --- /dev/null +++ b/internal/server/maintenance/ticker_test.go @@ -0,0 +1,315 @@ +package maintenance + +import ( + "context" + "errors" + "testing" + "time" + + "gitea.dcglab.co.uk/steve/restic-manager/internal/store" +) + +// fakeBackend implements Backend with table-driven canned responses. +type fakeBackend struct { + rows []store.HostRepoMaintenance + // jobs[hostID][kind] -> job (if present, returned). If absent, + // fakeBackend returns ErrNotFound by default. + jobs map[string]map[string]*store.Job + // hardErr forces a non-ErrNotFound failure for a given (host, kind). + hardErr map[string]map[string]error + // listErr forces ListAllMaintenance to fail. 
+ listErr error +} + +func (f *fakeBackend) ListAllMaintenance(_ context.Context) ([]store.HostRepoMaintenance, error) { + if f.listErr != nil { + return nil, f.listErr + } + return f.rows, nil +} + +func (f *fakeBackend) LatestJobByKind(_ context.Context, hostID, kind string) (*store.Job, error) { + if hostErrs, ok := f.hardErr[hostID]; ok { + if err := hostErrs[kind]; err != nil { + return nil, err + } + } + if hostJobs, ok := f.jobs[hostID]; ok { + if j := hostJobs[kind]; j != nil { + return j, nil + } + } + return nil, store.ErrNotFound +} + +// mustTime parses an RFC3339 string, fatal on failure. +func mustTime(t *testing.T, s string) time.Time { + t.Helper() + out, err := time.Parse(time.RFC3339, s) + if err != nil { + t.Fatalf("parse %q: %v", s, err) + } + return out +} + +func TestTickerSkipsDisabled(t *testing.T) { + t.Parallel() + be := &fakeBackend{ + rows: []store.HostRepoMaintenance{{ + HostID: "h1", + ForgetCron: "0 3 * * *", + ForgetEnabled: false, + PruneCron: "0 4 * * *", + PruneEnabled: false, + CheckCron: "0 5 * * *", + CheckEnabled: false, + }}, + } + tk := New(be) + now := mustTime(t, "2026-05-04T04:00:00Z") + got, err := tk.Decide(context.Background(), now) + if err != nil { + t.Fatalf("Decide: %v", err) + } + if len(got) != 0 { + t.Errorf("expected no decisions, got %+v", got) + } +} + +func TestTickerSkipsEmptyCron(t *testing.T) { + t.Parallel() + be := &fakeBackend{ + rows: []store.HostRepoMaintenance{{ + HostID: "h1", + ForgetCron: "", + ForgetEnabled: true, + PruneCron: "", + PruneEnabled: true, + CheckCron: "", + CheckEnabled: true, + }}, + } + tk := New(be) + got, err := tk.Decide(context.Background(), mustTime(t, "2026-05-04T04:00:00Z")) + if err != nil { + t.Fatalf("Decide: %v", err) + } + if len(got) != 0 { + t.Errorf("expected no decisions, got %+v", got) + } +} + +func TestTickerFiresWhenOverdue(t *testing.T) { + t.Parallel() + now := mustTime(t, "2026-05-04T04:00:00Z") + // Latest forget job 25h ago. 
+ last := now.Add(-25 * time.Hour) + be := &fakeBackend{ + rows: []store.HostRepoMaintenance{{ + HostID: "h1", + ForgetCron: "0 3 * * *", + ForgetEnabled: true, + }}, + jobs: map[string]map[string]*store.Job{ + "h1": {"forget": &store.Job{ID: "j1", HostID: "h1", Kind: "forget", CreatedAt: last}}, + }, + } + tk := New(be) + got, err := tk.Decide(context.Background(), now) + if err != nil { + t.Fatalf("Decide: %v", err) + } + if len(got) != 1 || got[0].Kind != "forget" || got[0].HostID != "h1" { + t.Errorf("expected one forget decision, got %+v", got) + } +} + +func TestTickerSuppressesWhenRecent(t *testing.T) { + t.Parallel() + now := mustTime(t, "2026-05-04T04:00:00Z") + last := mustTime(t, "2026-05-04T03:30:00Z") + be := &fakeBackend{ + rows: []store.HostRepoMaintenance{{ + HostID: "h1", + ForgetCron: "0 3 * * *", + ForgetEnabled: true, + }}, + jobs: map[string]map[string]*store.Job{ + "h1": {"forget": &store.Job{ID: "j1", HostID: "h1", Kind: "forget", CreatedAt: last}}, + }, + } + tk := New(be) + got, err := tk.Decide(context.Background(), now) + if err != nil { + t.Fatalf("Decide: %v", err) + } + if len(got) != 0 { + t.Errorf("expected no decisions, got %+v", got) + } +} + +func TestTickerFirstRunAnchorBoundedAt24h(t *testing.T) { + t.Parallel() + be := &fakeBackend{ + rows: []store.HostRepoMaintenance{{ + HostID: "h1", + ForgetCron: "0 3 * * *", + ForgetEnabled: true, + }}, + } + tk := New(be) + + // Case 1: now=04:00. Anchor=04:00 - 24h = previous-day 04:00. Next + // fire after that is today 03:00 — within window → fire. + now1 := mustTime(t, "2026-05-04T04:00:00Z") + got, err := tk.Decide(context.Background(), now1) + if err != nil { + t.Fatalf("Decide: %v", err) + } + if len(got) != 1 { + t.Errorf("case1: expected 1 decision, got %+v", got) + } + + // Case 2: a cron firing less often than once per 24h with a + // no-prior-job anchor must not fire when the most recent fire is + // outside the 24h lookback window. 
Use a weekly cron (Mondays at + // 03:00) and `now` on a Tuesday: anchor=now-24h lands on Monday, + // so cron.Next(Monday) = next-week Monday → after now → no fire. + // 2026-05-04 is a Monday, 2026-05-05 a Tuesday. + be2 := &fakeBackend{ + rows: []store.HostRepoMaintenance{{ + HostID: "h2", + ForgetCron: "0 3 * * 1", + ForgetEnabled: true, + }}, + } + tk2 := New(be2) + now2 := mustTime(t, "2026-05-05T03:00:00Z") + got2, err := tk2.Decide(context.Background(), now2) + if err != nil { + t.Fatalf("Decide case2: %v", err) + } + if len(got2) != 0 { + t.Errorf("case2: expected no decisions (cron fires < once/24h, prior fire was Monday 03:00 which is exactly 24h ago and anchor=now-24h means next-after is next Monday), got %+v", got2) + } +} + +func TestTickerCheckDecisionCarriesSubset(t *testing.T) { + t.Parallel() + now := mustTime(t, "2026-05-04T04:00:00Z") + last := now.Add(-30 * 24 * time.Hour) + be := &fakeBackend{ + rows: []store.HostRepoMaintenance{{ + HostID: "h1", + CheckCron: "0 3 * * *", + CheckEnabled: true, + CheckSubsetPct: 25, + }}, + jobs: map[string]map[string]*store.Job{ + "h1": {"check": &store.Job{ID: "j1", HostID: "h1", Kind: "check", CreatedAt: last}}, + }, + } + tk := New(be) + got, err := tk.Decide(context.Background(), now) + if err != nil { + t.Fatalf("Decide: %v", err) + } + if len(got) != 1 || got[0].Kind != "check" || got[0].SubsetPct != 25 { + t.Errorf("expected check decision with SubsetPct=25, got %+v", got) + } +} + +func TestTickerHardJobErrorSkipsKind(t *testing.T) { + t.Parallel() + now := mustTime(t, "2026-05-04T04:00:00Z") + last := now.Add(-25 * time.Hour) + hardErr := errors.New("synthetic db error") + be := &fakeBackend{ + rows: []store.HostRepoMaintenance{{ + HostID: "h1", + ForgetCron: "0 3 * * *", + ForgetEnabled: true, + CheckCron: "0 3 * * *", + CheckEnabled: true, + }}, + jobs: map[string]map[string]*store.Job{ + // check has a normal latest-job; should still fire. 
+ "h1": {"check": &store.Job{ID: "jc", HostID: "h1", Kind: "check", CreatedAt: last}}, + }, + hardErr: map[string]map[string]error{ + "h1": {"forget": hardErr}, + }, + } + tk := New(be) + got, err := tk.Decide(context.Background(), now) + if err != nil { + t.Fatalf("Decide: %v", err) + } + // Only the check decision should land — forget is skipped. + if len(got) != 1 || got[0].Kind != "check" { + t.Errorf("expected only check decision, got %+v", got) + } +} + +func TestTickerHandlesMultipleHosts(t *testing.T) { + t.Parallel() + now := mustTime(t, "2026-05-04T04:00:00Z") + last := now.Add(-25 * time.Hour) + be := &fakeBackend{ + rows: []store.HostRepoMaintenance{ + { + HostID: "ha", + ForgetCron: "0 3 * * *", + ForgetEnabled: true, + }, + { + HostID: "hb", + CheckCron: "0 3 * * *", + CheckEnabled: true, + PruneCron: "0 4 * * *", + PruneEnabled: false, // disabled — should not fire + }, + }, + jobs: map[string]map[string]*store.Job{ + "ha": {"forget": &store.Job{ID: "j1", HostID: "ha", Kind: "forget", CreatedAt: last}}, + "hb": {"check": &store.Job{ID: "j2", HostID: "hb", Kind: "check", CreatedAt: last}}, + }, + } + tk := New(be) + got, err := tk.Decide(context.Background(), now) + if err != nil { + t.Fatalf("Decide: %v", err) + } + if len(got) != 2 { + t.Fatalf("expected 2 decisions, got %d: %+v", len(got), got) + } + kinds := map[string]string{} + for _, d := range got { + kinds[d.HostID] = d.Kind + } + if kinds["ha"] != "forget" { + t.Errorf("ha: expected forget, got %q", kinds["ha"]) + } + if kinds["hb"] != "check" { + t.Errorf("hb: expected check, got %q", kinds["hb"]) + } +} + +func TestTickerInvalidCronSkipsSilently(t *testing.T) { + t.Parallel() + be := &fakeBackend{ + rows: []store.HostRepoMaintenance{{ + HostID: "h1", + ForgetCron: "not a cron", + ForgetEnabled: true, + }}, + } + tk := New(be) + got, err := tk.Decide(context.Background(), mustTime(t, "2026-05-04T04:00:00Z")) + if err != nil { + t.Fatalf("Decide: %v", err) + } + if len(got) != 0 { + 
t.Errorf("expected no decisions for invalid cron, got %+v", got) + } +} diff --git a/internal/server/ws/handler.go b/internal/server/ws/handler.go index 48fb5fd..5706693 100644 --- a/internal/server/ws/handler.go +++ b/internal/server/ws/handler.go @@ -267,8 +267,34 @@ func dispatchAgentMessage(ctx context.Context, c *Conn, hostID string, env api.E deps.OnScheduleFire(ctx, hostID, c, p.ScheduleID, p.ScheduledAt) } - case api.MsgRepoStats, api.MsgCommandResult: - // TODO(P2): persist these projections. + case api.MsgRepoStats: + var p api.RepoStatsPayload + if err := env.UnmarshalPayload(&p); err != nil { + slog.Warn("ws: bad repo.stats payload", "host_id", hostID, "err", err) + break + } + patch := store.HostRepoStats{ + HostID: hostID, + TotalSizeBytes: p.TotalSizeBytes, + RawSizeBytes: p.RawSizeBytes, + UniqueFiles: p.UniqueFiles, + SnapshotCount: p.SnapshotCount, + LastCheckAt: p.LastCheckAt, + LastCheckStatus: p.LastCheckStatus, + LockPresent: p.LockPresent, + LastPruneAt: p.LastPruneAt, + LastPruneFreedBytes: p.LastPruneFreedBytes, + } + if err := deps.Store.UpsertHostRepoStats(ctx, hostID, patch); err != nil { + slog.Warn("ws: upsert host repo stats", "host_id", hostID, "err", err) + } else { + slog.Info("ws: repo stats refreshed", "host_id", hostID) + } + + case api.MsgCommandResult: + // TODO(P2): persist command.result acks for "did the agent + // accept the dispatch?" forensics. Currently the job lifecycle + // (job.started → job.finished) is sufficient signal. 
slog.Debug("ws msg not yet handled", "type", env.Type, "host_id", hostID) case api.MsgError: diff --git a/internal/server/ws/handler_test.go b/internal/server/ws/handler_test.go new file mode 100644 index 0000000..819a812 --- /dev/null +++ b/internal/server/ws/handler_test.go @@ -0,0 +1,135 @@ +package ws + +import ( + "context" + "path/filepath" + "testing" + "time" + + "gitea.dcglab.co.uk/steve/restic-manager/internal/api" + "gitea.dcglab.co.uk/steve/restic-manager/internal/store" +) + +// openWSTestStore opens an isolated file-backed db in t.TempDir. +func openWSTestStore(t *testing.T) *store.Store { + t.Helper() + dir := t.TempDir() + s, err := store.Open(context.Background(), filepath.Join(dir, "rm.db")) + if err != nil { + t.Fatalf("open: %v", err) + } + t.Cleanup(func() { _ = s.Close() }) + return s +} + +// seedHostWS inserts a minimal host row directly via the store's DB. +func seedHostWS(t *testing.T, s *store.Store, hostID string) { + t.Helper() + _, err := s.DB().Exec( + `INSERT INTO hosts (id, name, os, arch, enrolled_at) VALUES (?,?,?,?,?)`, + hostID, hostID, "linux", "amd64", "2026-01-01T00:00:00Z") + if err != nil { + t.Fatalf("seed host %q: %v", hostID, err) + } +} + +func int64ptrWS(v int64) *int64 { return &v } +func boolptrWS(v bool) *bool { return &v } + +func TestRepoStatsReportPersisted(t *testing.T) { + t.Parallel() + s := openWSTestStore(t) + ctx := context.Background() + + const hostID = "h-stats-ws" + seedHostWS(t, s, hostID) + + now := time.Now().UTC().Truncate(time.Second) + pruneAt := now.Add(-2 * time.Hour) + payload := api.RepoStatsPayload{ + TotalSizeBytes: int64ptrWS(1024), + RawSizeBytes: int64ptrWS(2048), + UniqueFiles: int64ptrWS(42), + SnapshotCount: int64ptrWS(7), + LastCheckAt: &now, + LastCheckStatus: "ok", + LockPresent: boolptrWS(false), + LastPruneAt: &pruneAt, + LastPruneFreedBytes: int64ptrWS(512), + } + env, err := api.Marshal(api.MsgRepoStats, "", payload) + if err != nil { + t.Fatalf("marshal: %v", err) + } + + deps 
:= HandlerDeps{Store: s} + dispatchAgentMessage(ctx, nil, hostID, env, deps) + + got, err := s.GetHostRepoStats(ctx, hostID) + if err != nil { + t.Fatalf("get host repo stats: %v", err) + } + if got.TotalSizeBytes == nil || *got.TotalSizeBytes != 1024 { + t.Errorf("TotalSizeBytes: got %v want 1024", got.TotalSizeBytes) + } + if got.RawSizeBytes == nil || *got.RawSizeBytes != 2048 { + t.Errorf("RawSizeBytes: got %v want 2048", got.RawSizeBytes) + } + if got.UniqueFiles == nil || *got.UniqueFiles != 42 { + t.Errorf("UniqueFiles: got %v want 42", got.UniqueFiles) + } + if got.SnapshotCount == nil || *got.SnapshotCount != 7 { + t.Errorf("SnapshotCount: got %v want 7", got.SnapshotCount) + } + if got.LastCheckAt == nil || !got.LastCheckAt.Equal(now) { + t.Errorf("LastCheckAt: got %v want %v", got.LastCheckAt, now) + } + if got.LastCheckStatus != "ok" { + t.Errorf("LastCheckStatus: got %q want %q", got.LastCheckStatus, "ok") + } + if got.LockPresent == nil || *got.LockPresent != false { + t.Errorf("LockPresent: got %v want false", got.LockPresent) + } + if got.LastPruneAt == nil || !got.LastPruneAt.Equal(pruneAt) { + t.Errorf("LastPruneAt: got %v want %v", got.LastPruneAt, pruneAt) + } + if got.LastPruneFreedBytes == nil || *got.LastPruneFreedBytes != 512 { + t.Errorf("LastPruneFreedBytes: got %v want 512", got.LastPruneFreedBytes) + } +} + +func TestRepoStatsReportPartialUpdate(t *testing.T) { + t.Parallel() + s := openWSTestStore(t) + ctx := context.Background() + + const hostID = "h-stats-partial" + seedHostWS(t, s, hostID) + + // Pre-seed: TotalSizeBytes = 100. + if err := s.UpsertHostRepoStats(ctx, hostID, store.HostRepoStats{ + TotalSizeBytes: int64ptrWS(100), + }); err != nil { + t.Fatalf("pre-seed upsert: %v", err) + } + + // Send a repo.stats payload that only sets LastCheckStatus. 
+ env, err := api.Marshal(api.MsgRepoStats, "", api.RepoStatsPayload{ + LastCheckStatus: "ok", + }) + if err != nil { + t.Fatalf("marshal: %v", err) + } + dispatchAgentMessage(ctx, nil, hostID, env, HandlerDeps{Store: s}) + + got, err := s.GetHostRepoStats(ctx, hostID) + if err != nil { + t.Fatalf("get: %v", err) + } + if got.TotalSizeBytes == nil || *got.TotalSizeBytes != 100 { + t.Errorf("TotalSizeBytes lost: got %v want 100", got.TotalSizeBytes) + } + if got.LastCheckStatus != "ok" { + t.Errorf("LastCheckStatus: got %q want ok", got.LastCheckStatus) + } +} diff --git a/internal/store/host_credentials.go b/internal/store/host_credentials.go index 22416c8..2dfbcbe 100644 --- a/internal/store/host_credentials.go +++ b/internal/store/host_credentials.go @@ -8,13 +8,23 @@ import ( "time" ) +// CredentialKind identifies the role of a host_credentials row. +type CredentialKind string + +const ( + // CredKindRepo is the append-only credential used for every backup. + CredKindRepo CredentialKind = "repo" + // CredKindAdmin is the delete-capable credential used for prune. + CredKindAdmin CredentialKind = "admin" +) + // GetHostCredentials returns the AEAD-encrypted repo creds blob for -// the host, or ("", ErrNotFound) if no credential has ever been set. +// the host + kind, or ("", ErrNotFound) if no matching row exists. // The caller decrypts using host_id as AEAD additional data. -func (s *Store) GetHostCredentials(ctx context.Context, hostID string) (string, error) { +func (s *Store) GetHostCredentials(ctx context.Context, hostID string, kind CredentialKind) (string, error) { row := s.db.QueryRowContext(ctx, - `SELECT enc_repo_creds FROM host_credentials WHERE host_id = ?`, - hostID) + `SELECT enc_repo_creds FROM host_credentials WHERE host_id = ? 
AND kind = ?`, + hostID, string(kind)) var enc string if err := row.Scan(&enc); err != nil { if errors.Is(err, sql.ErrNoRows) { @@ -25,22 +35,35 @@ func (s *Store) GetHostCredentials(ctx context.Context, hostID string) (string, return enc, nil } -// SetHostCredentials replaces the host's encrypted repo creds blob. -// The caller has already encrypted using host_id as additional data. -func (s *Store) SetHostCredentials(ctx context.Context, hostID, encRepoCreds string) error { +// SetHostCredentials replaces the host's encrypted repo creds blob for +// the given kind. The caller has already encrypted using host_id as +// additional data. +func (s *Store) SetHostCredentials(ctx context.Context, hostID string, kind CredentialKind, encRepoCreds string) error { if encRepoCreds == "" { return fmt.Errorf("store: empty enc_repo_creds") } now := time.Now().UTC().Format(time.RFC3339Nano) _, err := s.db.ExecContext(ctx, - `INSERT INTO host_credentials (host_id, enc_repo_creds, updated_at) - VALUES (?, ?, ?) - ON CONFLICT(host_id) DO UPDATE SET + `INSERT INTO host_credentials (host_id, kind, enc_repo_creds, updated_at) + VALUES (?, ?, ?, ?) + ON CONFLICT(host_id, kind) DO UPDATE SET enc_repo_creds = excluded.enc_repo_creds, updated_at = excluded.updated_at`, - hostID, encRepoCreds, now) + hostID, string(kind), encRepoCreds, now) if err != nil { return fmt.Errorf("store: set host credentials: %w", err) } return nil } + +// DeleteHostCredentials removes the credential row for the given host +// and kind. A no-op if the row does not exist. +func (s *Store) DeleteHostCredentials(ctx context.Context, hostID string, kind CredentialKind) error { + _, err := s.db.ExecContext(ctx, + `DELETE FROM host_credentials WHERE host_id = ? 
AND kind = ?`, + hostID, string(kind)) + if err != nil { + return fmt.Errorf("store: delete host credentials: %w", err) + } + return nil +} diff --git a/internal/store/host_credentials_test.go b/internal/store/host_credentials_test.go new file mode 100644 index 0000000..ddca751 --- /dev/null +++ b/internal/store/host_credentials_test.go @@ -0,0 +1,103 @@ +package store + +import ( + "context" + "errors" + "testing" +) + +// seedHost inserts a minimal host row for testing. +func seedHost(t *testing.T, s *Store, hostID string) { + t.Helper() + _, err := s.DB().Exec( + `INSERT INTO hosts (id, name, os, arch, enrolled_at) VALUES (?,?,?,?,?)`, + hostID, hostID, "linux", "amd64", "2026-01-01T00:00:00Z") + if err != nil { + t.Fatalf("seed host %q: %v", hostID, err) + } +} + +func TestHostCredentialsAdminRowSeparate(t *testing.T) { + t.Parallel() + s := openTestStore(t) + ctx := context.Background() + + const hostID = "h-creds-test" + seedHost(t, s, hostID) + + const repoBlob = "enc-repo-blob" + const adminBlob = "enc-admin-blob" + + // Set repo creds. + if err := s.SetHostCredentials(ctx, hostID, CredKindRepo, repoBlob); err != nil { + t.Fatalf("set repo creds: %v", err) + } + // Set admin creds. + if err := s.SetHostCredentials(ctx, hostID, CredKindAdmin, adminBlob); err != nil { + t.Fatalf("set admin creds: %v", err) + } + + // Fetch each by kind and assert they differ. + gotRepo, err := s.GetHostCredentials(ctx, hostID, CredKindRepo) + if err != nil { + t.Fatalf("get repo creds: %v", err) + } + gotAdmin, err := s.GetHostCredentials(ctx, hostID, CredKindAdmin) + if err != nil { + t.Fatalf("get admin creds: %v", err) + } + if gotRepo != repoBlob { + t.Errorf("repo creds: got %q, want %q", gotRepo, repoBlob) + } + if gotAdmin != adminBlob { + t.Errorf("admin creds: got %q, want %q", gotAdmin, adminBlob) + } + if gotRepo == gotAdmin { + t.Error("repo and admin blobs must differ") + } + + // Delete admin; repo must be unaffected. 
+ if err := s.DeleteHostCredentials(ctx, hostID, CredKindAdmin); err != nil { + t.Fatalf("delete admin creds: %v", err) + } + if _, err := s.GetHostCredentials(ctx, hostID, CredKindAdmin); !errors.Is(err, ErrNotFound) { + t.Errorf("after delete, expected ErrNotFound for admin; got %v", err) + } + if got, err := s.GetHostCredentials(ctx, hostID, CredKindRepo); err != nil || got != repoBlob { + t.Errorf("repo creds should survive admin delete; got %q, err %v", got, err) + } +} + +func TestHostCredentialsNotFound(t *testing.T) { + t.Parallel() + s := openTestStore(t) + ctx := context.Background() + + _, err := s.GetHostCredentials(ctx, "no-such-host", CredKindRepo) + if !errors.Is(err, ErrNotFound) { + t.Errorf("expected ErrNotFound, got %v", err) + } +} + +func TestHostCredentialsUpsert(t *testing.T) { + t.Parallel() + s := openTestStore(t) + ctx := context.Background() + + const hostID = "h-upsert-test" + seedHost(t, s, hostID) + + if err := s.SetHostCredentials(ctx, hostID, CredKindRepo, "v1"); err != nil { + t.Fatalf("set v1: %v", err) + } + if err := s.SetHostCredentials(ctx, hostID, CredKindRepo, "v2"); err != nil { + t.Fatalf("set v2 (upsert): %v", err) + } + got, err := s.GetHostCredentials(ctx, hostID, CredKindRepo) + if err != nil { + t.Fatalf("get: %v", err) + } + if got != "v2" { + t.Errorf("expected v2, got %q", got) + } +} diff --git a/internal/store/host_repo_stats.go b/internal/store/host_repo_stats.go new file mode 100644 index 0000000..1952f68 --- /dev/null +++ b/internal/store/host_repo_stats.go @@ -0,0 +1,231 @@ +package store + +import ( + "context" + "database/sql" + "errors" + "fmt" + "time" +) + +// HostRepoStats is the per-host projection of repo-level metrics. +// All pointer fields are nullable; nil means "not yet known." The row +// is created (or replaced) by UpsertHostRepoStats which merges in only +// the non-nil fields from a patch. 
+type HostRepoStats struct { + HostID string + TotalSizeBytes *int64 + RawSizeBytes *int64 + UniqueFiles *int64 + SnapshotCount *int64 + LastCheckAt *time.Time + LastCheckStatus string // "" | "ok" | "errors_found" | "failed" + LockPresent *bool + LastPruneAt *time.Time + LastPruneFreedBytes *int64 + UpdatedAt time.Time +} + +// GetHostRepoStats returns the row, or (nil, ErrNotFound) if absent. +func (s *Store) GetHostRepoStats(ctx context.Context, hostID string) (*HostRepoStats, error) { + row := s.db.QueryRowContext(ctx, + `SELECT host_id, total_size_bytes, raw_size_bytes, unique_files, + snapshot_count, last_check_at, last_check_status, + lock_present, last_prune_at, last_prune_freed_bytes, updated_at + FROM host_repo_stats WHERE host_id = ?`, hostID) + return scanHostRepoStats(row) +} + +// getHostRepoStatsTx is identical to GetHostRepoStats but runs on an +// existing transaction so the fetch-merge-upsert in UpsertHostRepoStats +// is fully serialized. +func getHostRepoStatsTx(ctx context.Context, tx *sql.Tx, hostID string) (*HostRepoStats, error) { + row := tx.QueryRowContext(ctx, + `SELECT host_id, total_size_bytes, raw_size_bytes, unique_files, + snapshot_count, last_check_at, last_check_status, + lock_present, last_prune_at, last_prune_freed_bytes, updated_at + FROM host_repo_stats WHERE host_id = ?`, hostID) + return scanHostRepoStats(row) +} + +// scanHostRepoStats scans one row from host_repo_stats. 
+func scanHostRepoStats(row *sql.Row) (*HostRepoStats, error) { + var ( + st HostRepoStats + totalSize sql.NullInt64 + rawSize sql.NullInt64 + uniqueFiles sql.NullInt64 + snapshotCount sql.NullInt64 + lastCheckAt sql.NullString + lastCheckStatus sql.NullString + lockPresent int64 + lastPruneAt sql.NullString + lastPruneFreed sql.NullInt64 + updatedAt string + ) + if err := row.Scan( + &st.HostID, + &totalSize, &rawSize, &uniqueFiles, &snapshotCount, + &lastCheckAt, &lastCheckStatus, + &lockPresent, + &lastPruneAt, &lastPruneFreed, + &updatedAt, + ); err != nil { + if errors.Is(err, sql.ErrNoRows) { + return nil, ErrNotFound + } + return nil, fmt.Errorf("store: scan host_repo_stats: %w", err) + } + if totalSize.Valid { + v := totalSize.Int64 + st.TotalSizeBytes = &v + } + if rawSize.Valid { + v := rawSize.Int64 + st.RawSizeBytes = &v + } + if uniqueFiles.Valid { + v := uniqueFiles.Int64 + st.UniqueFiles = &v + } + if snapshotCount.Valid { + v := snapshotCount.Int64 + st.SnapshotCount = &v + } + if lastCheckAt.Valid { + t, err := time.Parse(time.RFC3339Nano, lastCheckAt.String) + if err != nil { + return nil, fmt.Errorf("store: parse last_check_at: %w", err) + } + st.LastCheckAt = &t + } + if lastCheckStatus.Valid { + st.LastCheckStatus = lastCheckStatus.String + } + lp := lockPresent != 0 + st.LockPresent = &lp + if lastPruneAt.Valid { + t, err := time.Parse(time.RFC3339Nano, lastPruneAt.String) + if err != nil { + return nil, fmt.Errorf("store: parse last_prune_at: %w", err) + } + st.LastPruneAt = &t + } + if lastPruneFreed.Valid { + v := lastPruneFreed.Int64 + st.LastPruneFreedBytes = &v + } + t, err := time.Parse(time.RFC3339Nano, updatedAt) + if err != nil { + return nil, fmt.Errorf("store: parse host_repo_stats.updated_at: %w", err) + } + st.UpdatedAt = t + return &st, nil +} + +// UpsertHostRepoStats writes a partial update — only non-nil pointer +// fields (and LastCheckStatus when non-empty) overwrite existing +// columns. 
Wrapped in a transaction so concurrent upserts on the same +// host don't lose updates. +func (s *Store) UpsertHostRepoStats(ctx context.Context, hostID string, patch HostRepoStats) error { + tx, err := s.db.BeginTx(ctx, nil) + if err != nil { + return fmt.Errorf("store: begin host_repo_stats tx: %w", err) + } + defer func() { _ = tx.Rollback() }() + + // Fetch existing row; start from zero if absent. + cur, err := getHostRepoStatsTx(ctx, tx, hostID) + if err != nil && !errors.Is(err, ErrNotFound) { + return err + } + if cur == nil { + cur = &HostRepoStats{HostID: hostID} + } + + // Merge: non-nil patch fields overwrite current. + if patch.TotalSizeBytes != nil { + cur.TotalSizeBytes = patch.TotalSizeBytes + } + if patch.RawSizeBytes != nil { + cur.RawSizeBytes = patch.RawSizeBytes + } + if patch.UniqueFiles != nil { + cur.UniqueFiles = patch.UniqueFiles + } + if patch.SnapshotCount != nil { + cur.SnapshotCount = patch.SnapshotCount + } + if patch.LastCheckAt != nil { + cur.LastCheckAt = patch.LastCheckAt + } + if patch.LastCheckStatus != "" { + cur.LastCheckStatus = patch.LastCheckStatus + } + if patch.LockPresent != nil { + cur.LockPresent = patch.LockPresent + } + if patch.LastPruneAt != nil { + cur.LastPruneAt = patch.LastPruneAt + } + if patch.LastPruneFreedBytes != nil { + cur.LastPruneFreedBytes = patch.LastPruneFreedBytes + } + + now := time.Now().UTC().Format(time.RFC3339Nano) + + // Convert *bool → int for lock_present. + var lockPresentInt int64 + if cur.LockPresent != nil && *cur.LockPresent { + lockPresentInt = 1 + } + + if _, err = tx.ExecContext(ctx, + `INSERT INTO host_repo_stats + (host_id, total_size_bytes, raw_size_bytes, unique_files, + snapshot_count, last_check_at, last_check_status, + lock_present, last_prune_at, last_prune_freed_bytes, updated_at) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ ON CONFLICT(host_id) DO UPDATE SET + total_size_bytes = excluded.total_size_bytes, + raw_size_bytes = excluded.raw_size_bytes, + unique_files = excluded.unique_files, + snapshot_count = excluded.snapshot_count, + last_check_at = excluded.last_check_at, + last_check_status = excluded.last_check_status, + lock_present = excluded.lock_present, + last_prune_at = excluded.last_prune_at, + last_prune_freed_bytes = excluded.last_prune_freed_bytes, + updated_at = excluded.updated_at`, + hostID, + nullableInt64(cur.TotalSizeBytes), + nullableInt64(cur.RawSizeBytes), + nullableInt64(cur.UniqueFiles), + nullableInt64(cur.SnapshotCount), + nullableTime(cur.LastCheckAt), + nullableStr(cur.LastCheckStatus), + lockPresentInt, + nullableTime(cur.LastPruneAt), + nullableInt64(cur.LastPruneFreedBytes), + now, + ); err != nil { + return fmt.Errorf("store: upsert host_repo_stats: %w", err) + } + return tx.Commit() +} + +// nullableInt64 converts *int64 to a database/sql-compatible nullable value. +func nullableInt64(p *int64) any { + if p == nil { + return nil + } + return *p +} + +// nullableTime converts *time.Time to an RFC3339Nano string or nil. +func nullableTime(p *time.Time) any { + if p == nil { + return nil + } + return p.UTC().Format(time.RFC3339Nano) +} diff --git a/internal/store/host_repo_stats_test.go b/internal/store/host_repo_stats_test.go new file mode 100644 index 0000000..9f43979 --- /dev/null +++ b/internal/store/host_repo_stats_test.go @@ -0,0 +1,131 @@ +package store + +import ( + "context" + "errors" + "testing" + "time" +) + +func int64ptr(v int64) *int64 { return &v } +func boolptr(v bool) *bool { return &v } + +func TestHostRepoStatsRoundTrip(t *testing.T) { + t.Parallel() + s := openTestStore(t) + ctx := context.Background() + + const hostID = "h-stats-test" + seedHost(t, s, hostID) + + // 1. Initial upsert: set TotalSizeBytes only. 
+ if err := s.UpsertHostRepoStats(ctx, hostID, HostRepoStats{ + TotalSizeBytes: int64ptr(100), + }); err != nil { + t.Fatalf("upsert 1: %v", err) + } + got, err := s.GetHostRepoStats(ctx, hostID) + if err != nil { + t.Fatalf("get after upsert 1: %v", err) + } + if got.TotalSizeBytes == nil || *got.TotalSizeBytes != 100 { + t.Errorf("TotalSizeBytes: want 100, got %v", got.TotalSizeBytes) + } + if got.LastCheckStatus != "" { + t.Errorf("LastCheckStatus should be empty after first upsert, got %q", got.LastCheckStatus) + } + + // 2. Upsert with LastCheckStatus; TotalSizeBytes must be preserved. + if err := s.UpsertHostRepoStats(ctx, hostID, HostRepoStats{ + LastCheckStatus: "ok", + }); err != nil { + t.Fatalf("upsert 2: %v", err) + } + got, err = s.GetHostRepoStats(ctx, hostID) + if err != nil { + t.Fatalf("get after upsert 2: %v", err) + } + if got.TotalSizeBytes == nil || *got.TotalSizeBytes != 100 { + t.Errorf("TotalSizeBytes should still be 100 after second upsert, got %v", got.TotalSizeBytes) + } + if got.LastCheckStatus != "ok" { + t.Errorf("LastCheckStatus: want %q, got %q", "ok", got.LastCheckStatus) + } + + // 3. Upsert with LockPresent=true; all other fields preserved. 
+ now := time.Now().UTC().Truncate(time.Second) + if err := s.UpsertHostRepoStats(ctx, hostID, HostRepoStats{ + LockPresent: boolptr(true), + LastCheckAt: &now, + }); err != nil { + t.Fatalf("upsert 3: %v", err) + } + got, err = s.GetHostRepoStats(ctx, hostID) + if err != nil { + t.Fatalf("get after upsert 3: %v", err) + } + if got.LockPresent == nil || !*got.LockPresent { + t.Error("LockPresent should be true after upsert 3") + } + if got.TotalSizeBytes == nil || *got.TotalSizeBytes != 100 { + t.Errorf("TotalSizeBytes still 100 expected, got %v", got.TotalSizeBytes) + } + if got.LastCheckStatus != "ok" { + t.Errorf("LastCheckStatus still 'ok' expected, got %q", got.LastCheckStatus) + } + if got.LastCheckAt == nil { + t.Error("LastCheckAt should be set") + } else if !got.LastCheckAt.UTC().Truncate(time.Second).Equal(now) { + t.Errorf("LastCheckAt: got %v, want %v", got.LastCheckAt.UTC().Truncate(time.Second), now) + } + + // 4. Clear lock (set to false). + if err := s.UpsertHostRepoStats(ctx, hostID, HostRepoStats{ + LockPresent: boolptr(false), + }); err != nil { + t.Fatalf("upsert 4: %v", err) + } + got, err = s.GetHostRepoStats(ctx, hostID) + if err != nil { + t.Fatalf("get after upsert 4: %v", err) + } + if got.LockPresent == nil || *got.LockPresent { + t.Error("LockPresent should be false after upsert 4") + } +} + +func TestHostRepoStatsNotFound(t *testing.T) { + t.Parallel() + s := openTestStore(t) + ctx := context.Background() + + _, err := s.GetHostRepoStats(ctx, "no-such-host") + if !errors.Is(err, ErrNotFound) { + t.Errorf("expected ErrNotFound, got %v", err) + } +} + +func TestHostRepoStatsCascadeDelete(t *testing.T) { + t.Parallel() + s := openTestStore(t) + ctx := context.Background() + + const hostID = "h-cascade-test" + seedHost(t, s, hostID) + + if err := s.UpsertHostRepoStats(ctx, hostID, HostRepoStats{ + TotalSizeBytes: int64ptr(999), + }); err != nil { + t.Fatalf("upsert: %v", err) + } + + // Delete the host; stats row should cascade-delete. 
+ if _, err := s.DB().ExecContext(ctx, + `DELETE FROM hosts WHERE id = ?`, hostID); err != nil { + t.Fatalf("delete host: %v", err) + } + _, err := s.GetHostRepoStats(ctx, hostID) + if !errors.Is(err, ErrNotFound) { + t.Errorf("after host delete, expected ErrNotFound for stats; got %v", err) + } +} diff --git a/internal/store/jobs.go b/internal/store/jobs.go index 9438902..633fcc6 100644 --- a/internal/store/jobs.go +++ b/internal/store/jobs.go @@ -193,6 +193,71 @@ func (s *Store) GetJob(ctx context.Context, id string) (*Job, error) { return &j, nil } +// LatestJobByKind returns the most recent job (any status, including +// queued and running) of the given kind for the host, or +// (nil, ErrNotFound) if no such job exists. Used by the maintenance +// ticker to compute "last fire" anchors for the cron-due check; +// in-flight jobs MUST be considered or a long-running prune (>60s) +// would re-fire on the next tick while the first is still running. +func (s *Store) LatestJobByKind(ctx context.Context, hostID, kind string) (*Job, error) { + row := s.db.QueryRowContext(ctx, + `SELECT id, host_id, kind, status, scheduled_id, actor_kind, actor_id, + started_at, finished_at, exit_code, stats, error, created_at + FROM jobs + WHERE host_id = ? AND kind = ? 
+ ORDER BY created_at DESC + LIMIT 1`, hostID, kind) + var ( + j Job + schedID sql.NullString + actorID sql.NullString + startedAt sql.NullString + finishedAt sql.NullString + exitCode sql.NullInt64 + stats sql.NullString + errMsg sql.NullString + createdAt string + ) + if err := row.Scan(&j.ID, &j.HostID, &j.Kind, &j.Status, &schedID, + &j.ActorKind, &actorID, &startedAt, &finishedAt, + &exitCode, &stats, &errMsg, &createdAt); err != nil { + if errors.Is(err, sql.ErrNoRows) { + return nil, ErrNotFound + } + return nil, fmt.Errorf("store: scan latest job by kind: %w", err) + } + if schedID.Valid { + s := schedID.String + j.ScheduledID = &s + } + if actorID.Valid { + s := actorID.String + j.ActorID = &s + } + if startedAt.Valid { + t, _ := time.Parse(time.RFC3339Nano, startedAt.String) + j.StartedAt = &t + } + if finishedAt.Valid { + t, _ := time.Parse(time.RFC3339Nano, finishedAt.String) + j.FinishedAt = &t + } + if exitCode.Valid { + i := int(exitCode.Int64) + j.ExitCode = &i + } + if stats.Valid && stats.String != "" { + j.Stats = json.RawMessage(stats.String) + } + if errMsg.Valid { + s := errMsg.String + j.Error = &s + } + t, _ := time.Parse(time.RFC3339Nano, createdAt) + j.CreatedAt = t + return &j, nil +} + // HasJobOfKind reports whether any job of the given kind exists for // this host, regardless of status. Used by the auto-init path on // agent hello to decide whether to dispatch a fresh `restic init` — diff --git a/internal/store/jobs_test.go b/internal/store/jobs_test.go new file mode 100644 index 0000000..9339a66 --- /dev/null +++ b/internal/store/jobs_test.go @@ -0,0 +1,136 @@ +package store + +import ( + "context" + "errors" + "testing" + "time" +) + +func TestLatestJobByKind(t *testing.T) { + t.Parallel() + s := openTestStore(t) + ctx := context.Background() + hostID := makeSchedHost(t, s) + + // No jobs yet → ErrNotFound. 
+ if _, err := s.LatestJobByKind(ctx, hostID, "forget"); !errors.Is(err, ErrNotFound) { + t.Fatalf("expected ErrNotFound on empty, got %v", err) + } + + // Insert two finished jobs of kind=forget; the newer one should win. + older := time.Now().UTC().Add(-2 * time.Hour) + newer := time.Now().UTC().Add(-1 * time.Hour) + + if err := s.CreateJob(ctx, Job{ + ID: "j-old", HostID: hostID, Kind: "forget", + ActorKind: "system", CreatedAt: older, + }); err != nil { + t.Fatalf("create older: %v", err) + } + if err := s.MarkJobFinished(ctx, "j-old", "succeeded", 0, nil, "", older.Add(time.Minute)); err != nil { + t.Fatalf("finish older: %v", err) + } + if err := s.CreateJob(ctx, Job{ + ID: "j-new", HostID: hostID, Kind: "forget", + ActorKind: "system", CreatedAt: newer, + }); err != nil { + t.Fatalf("create newer: %v", err) + } + if err := s.MarkJobFinished(ctx, "j-new", "failed", 1, nil, "boom", newer.Add(time.Minute)); err != nil { + t.Fatalf("finish newer: %v", err) + } + + got, err := s.LatestJobByKind(ctx, hostID, "forget") + if err != nil { + t.Fatalf("LatestJobByKind: %v", err) + } + if got.ID != "j-new" { + t.Errorf("want j-new, got %q", got.ID) + } + + // An in-flight running job must be returned — long-prune-suppresses-tick + // scenario: if a prune runs >60s the next tick must not re-fire it. 
+ runningAt := time.Now().UTC() + if err := s.CreateJob(ctx, Job{ + ID: "j-running", HostID: hostID, Kind: "forget", + ActorKind: "system", CreatedAt: runningAt, + }); err != nil { + t.Fatalf("create running: %v", err) + } + if err := s.MarkJobStarted(ctx, "j-running", runningAt); err != nil { + t.Fatalf("mark started: %v", err) + } + got2, err := s.LatestJobByKind(ctx, hostID, "forget") + if err != nil { + t.Fatalf("LatestJobByKind 2: %v", err) + } + if got2.ID != "j-running" { + t.Errorf("in-flight running job must be returned; want j-running, got %q", got2.ID) + } + + // A queued (not-yet-started) job is also returned (it is newer than + // j-running because CreatedAt is later). + queuedAt := runningAt.Add(time.Millisecond) + if err := s.CreateJob(ctx, Job{ + ID: "j-queued", HostID: hostID, Kind: "forget", + ActorKind: "system", CreatedAt: queuedAt, + }); err != nil { + t.Fatalf("create queued: %v", err) + } + got3, err := s.LatestJobByKind(ctx, hostID, "forget") + if err != nil { + t.Fatalf("LatestJobByKind 3: %v", err) + } + if got3.ID != "j-queued" { + t.Errorf("queued job must be returned as newest; want j-queued, got %q", got3.ID) + } + + // Different kind → ErrNotFound. + if _, err := s.LatestJobByKind(ctx, hostID, "prune"); !errors.Is(err, ErrNotFound) { + t.Fatalf("expected ErrNotFound for prune, got %v", err) + } +} + +func TestListAllMaintenance(t *testing.T) { + t.Parallel() + s := openTestStore(t) + ctx := context.Background() + + // Empty case. + rows, err := s.ListAllMaintenance(ctx) + if err != nil { + t.Fatalf("empty list: %v", err) + } + if len(rows) != 0 { + t.Errorf("want empty, got %+v", rows) + } + + // Seed two hosts with maintenance rows. 
+ h1 := "01HMAINTHOST00000000000A1" + h2 := "01HMAINTHOST00000000000A2" + for i, id := range []string{h1, h2} { + if err := s.CreateHost(ctx, Host{ + ID: id, Name: "maint-host-" + string(rune('a'+i)), + OS: "linux", Arch: "amd64", + AgentVersion: "dev", ResticVersion: "0.16.0", ProtocolVersion: 1, + EnrolledAt: time.Now().UTC(), + }, "th-"+id, ""); err != nil { + t.Fatalf("create host %s: %v", id, err) + } + } + if err := s.CreateDefaultRepoMaintenance(ctx, h1); err != nil { + t.Fatalf("seed h1: %v", err) + } + if err := s.CreateDefaultRepoMaintenance(ctx, h2); err != nil { + t.Fatalf("seed h2: %v", err) + } + + rows, err = s.ListAllMaintenance(ctx) + if err != nil { + t.Fatalf("list: %v", err) + } + if len(rows) != 2 { + t.Errorf("want 2 rows, got %d", len(rows)) + } +} diff --git a/internal/store/maintenance.go b/internal/store/maintenance.go index 795a6b0..ddeb5e0 100644 --- a/internal/store/maintenance.go +++ b/internal/store/maintenance.go @@ -50,6 +50,40 @@ func (st *Store) GetRepoMaintenance(ctx context.Context, hostID string) (*HostRe return &m, nil } +// ListAllMaintenance returns every host_repo_maintenance row. +// Used by the server-side maintenance ticker to iterate every +// host on each tick. Order is unspecified (the ticker doesn't +// care). 
+func (st *Store) ListAllMaintenance(ctx context.Context) ([]HostRepoMaintenance, error) { + rows, err := st.db.QueryContext(ctx, + `SELECT host_id, forget_cron, forget_enabled, + prune_cron, prune_enabled, + check_cron, check_enabled, check_subset_pct + FROM host_repo_maintenance`) + if err != nil { + return nil, fmt.Errorf("store: list all maintenance: %w", err) + } + defer func() { _ = rows.Close() }() + var out []HostRepoMaintenance + for rows.Next() { + var ( + m HostRepoMaintenance + forgetEnabled, pruneEnabled, checkEnabled int + ) + if err := rows.Scan(&m.HostID, + &m.ForgetCron, &forgetEnabled, + &m.PruneCron, &pruneEnabled, + &m.CheckCron, &checkEnabled, &m.CheckSubsetPct); err != nil { + return nil, fmt.Errorf("store: scan maintenance: %w", err) + } + m.ForgetEnabled = forgetEnabled != 0 + m.PruneEnabled = pruneEnabled != 0 + m.CheckEnabled = checkEnabled != 0 + out = append(out, m) + } + return out, rows.Err() +} + // UpdateRepoMaintenance replaces every editable field. Doesn't bump // the schedule version — these run on the server's own ticker, not // the agent's local cron, so the agent doesn't need to know. diff --git a/internal/store/migrations/0009_admin_creds_and_repo_stats.sql b/internal/store/migrations/0009_admin_creds_and_repo_stats.sql new file mode 100644 index 0000000..59c8e91 --- /dev/null +++ b/internal/store/migrations/0009_admin_creds_and_repo_stats.sql @@ -0,0 +1,58 @@ +-- 0009_admin_creds_and_repo_stats.sql +-- +-- Phase 5 of the P2 redesign needs two things in the schema: +-- +-- 1. A second credential row per host. Today host_credentials is +-- 1:1 with hosts. For prune (and any future destructive op) we +-- want a rest-server admin user whose password gives delete +-- access — separate from the append-only user used on every +-- backup. Add a `kind` column with default 'repo'; existing rows +-- become kind='repo'. Future admin rows live alongside. +-- +-- 2. 
A small singleton-per-host projection for repo size, snapshot +-- count, last-prune freed bytes, lock state, and last-check +-- result. Backed by `restic stats --json` + sniffed `restic +-- check` stderr. +-- +-- Use column-level ALTERs only; host_credentials has no inbound +-- FKs but the rule from CLAUDE.md still applies. + +ALTER TABLE host_credentials ADD COLUMN kind TEXT NOT NULL DEFAULT 'repo'; + +-- The PK on host_credentials is currently (host_id) — we need a +-- composite (host_id, kind). SQLite has no ALTER TABLE … +-- ADD/CHANGE PRIMARY KEY, so this is the one place a rebuild is +-- justified. host_credentials has no inbound FKs, so the cascade +-- trap doesn't apply here. Verified against schema/0002. + +CREATE TABLE host_credentials_new ( + host_id TEXT NOT NULL REFERENCES hosts(id) ON DELETE CASCADE, + kind TEXT NOT NULL DEFAULT 'repo' + CHECK (kind IN ('repo', 'admin')), + enc_repo_creds TEXT NOT NULL, + updated_at TEXT NOT NULL, + PRIMARY KEY (host_id, kind) +); +INSERT INTO host_credentials_new (host_id, kind, enc_repo_creds, updated_at) + SELECT host_id, kind, enc_repo_creds, updated_at FROM host_credentials; +DROP TABLE host_credentials; +ALTER TABLE host_credentials_new RENAME TO host_credentials; + +-- Repo stats projection. One row per host, upserted by the agent's +-- stats.report envelope (which fires after every successful backup +-- and after every check / prune). All fields nullable so a freshly +-- enrolled host with no jobs yet is representable. 
+ +CREATE TABLE host_repo_stats ( + host_id TEXT PRIMARY KEY REFERENCES hosts(id) ON DELETE CASCADE, + total_size_bytes INTEGER, + raw_size_bytes INTEGER, + unique_files INTEGER, + snapshot_count INTEGER, + last_check_at TEXT, + last_check_status TEXT CHECK (last_check_status IS NULL OR last_check_status IN ('ok', 'errors_found', 'failed')), + lock_present INTEGER NOT NULL DEFAULT 0, + last_prune_at TEXT, + last_prune_freed_bytes INTEGER, + updated_at TEXT NOT NULL +); diff --git a/internal/store/pending.go b/internal/store/pending.go index 42f2fdd..097d336 100644 --- a/internal/store/pending.go +++ b/internal/store/pending.go @@ -72,6 +72,43 @@ func (st *Store) DuePendingRuns(ctx context.Context, now time.Time, limit int) ( return out, rows.Err() } +// ListPendingRunsForHost returns every pending row for the host +// (regardless of next_attempt_at), ordered by next_attempt_at +// ascending. Used by the on-reconnect drain — when a host comes +// back, we walk every pending row for it, not just the due ones, +// because the host being back makes "due" unimportant: every row +// is dispatchable now. +func (st *Store) ListPendingRunsForHost(ctx context.Context, hostID string) ([]PendingRun, error) { + rows, err := st.db.QueryContext(ctx, + `SELECT id, schedule_id, source_group_id, host_id, attempt, + next_attempt_at, scheduled_at, COALESCE(last_error, '') + FROM pending_runs + WHERE host_id = ? 
+ ORDER BY next_attempt_at`, + hostID) + if err != nil { + return nil, fmt.Errorf("store: list pending runs for host: %w", err) + } + defer func() { _ = rows.Close() }() + out := []PendingRun{} + for rows.Next() { + var p PendingRun + var nextAt, scheduledAt string + if err := rows.Scan(&p.ID, &p.ScheduleID, &p.SourceGroupID, &p.HostID, + &p.Attempt, &nextAt, &scheduledAt, &p.LastError); err != nil { + return nil, err + } + if t, err := time.Parse(time.RFC3339Nano, nextAt); err == nil { + p.NextAttemptAt = t + } + if t, err := time.Parse(time.RFC3339Nano, scheduledAt); err == nil { + p.ScheduledAt = t + } + out = append(out, p) + } + return out, rows.Err() +} + // DeletePendingRun removes a row by id. Called after successful // dispatch or after exceeding retry_max. func (st *Store) DeletePendingRun(ctx context.Context, id string) error { diff --git a/internal/store/sources_test.go b/internal/store/sources_test.go index 2f6d7bb..28cdf9b 100644 --- a/internal/store/sources_test.go +++ b/internal/store/sources_test.go @@ -219,3 +219,78 @@ func TestPendingRunQueue(t *testing.T) { t.Fatalf("after delete: %v", due) } } + +func TestListPendingRunsForHost(t *testing.T) { + t.Parallel() + s := openTestStore(t) + ctx := context.Background() + hostA := makeSchedHost(t, s) + hostB := "01HPENDLISTHOSTB00000001" + if err := s.CreateHost(ctx, Host{ + ID: hostB, Name: "pending-list-host-b", OS: "linux", Arch: "amd64", + AgentVersion: "dev", ResticVersion: "0.16.0", ProtocolVersion: 1, + EnrolledAt: time.Now().UTC(), + }, "tokenhashB", ""); err != nil { + t.Fatal(err) + } + gA := makeGroup(t, s, hostA, "default", "01HPENDLISTGRPA000000001") + gB := makeGroup(t, s, hostB, "default", "01HPENDLISTGRPB000000001") + schedA := "01HPENDLISTSCHEDA0000001" + schedB := "01HPENDLISTSCHEDB0000001" + if err := s.CreateSchedule(ctx, &Schedule{ + ID: schedA, HostID: hostA, CronExpr: "@hourly", Enabled: true, + SourceGroupIDs: []string{gA}, + }); err != nil { + t.Fatal(err) + } + if err := 
s.CreateSchedule(ctx, &Schedule{ + ID: schedB, HostID: hostB, CronExpr: "@hourly", Enabled: true, + SourceGroupIDs: []string{gB}, + }); err != nil { + t.Fatal(err) + } + + now := time.Now().UTC() + // Two rows for hostA — one not-yet-due, one already-due — and one + // for hostB. ListPendingRunsForHost(A) must return both A rows + // (regardless of due-ness) ordered by next_attempt_at ascending. + rows := []*PendingRun{ + { + ID: "01HPENDLISTROW0000000A02", ScheduleID: schedA, SourceGroupID: gA, HostID: hostA, + NextAttemptAt: now.Add(time.Hour), ScheduledAt: now, + }, + { + ID: "01HPENDLISTROW0000000A01", ScheduleID: schedA, SourceGroupID: gA, HostID: hostA, + NextAttemptAt: now.Add(-time.Minute), ScheduledAt: now.Add(-time.Hour), + }, + { + ID: "01HPENDLISTROW0000000B01", ScheduleID: schedB, SourceGroupID: gB, HostID: hostB, + NextAttemptAt: now, ScheduledAt: now, + }, + } + for _, r := range rows { + if err := s.EnqueuePendingRun(ctx, r); err != nil { + t.Fatal(err) + } + } + + out, err := s.ListPendingRunsForHost(ctx, hostA) + if err != nil { + t.Fatal(err) + } + if len(out) != 2 { + t.Fatalf("len=%d, want 2: %+v", len(out), out) + } + // Ordered ascending by next_attempt_at: the -1m row first, then +1h. + if out[0].ID != "01HPENDLISTROW0000000A01" || out[1].ID != "01HPENDLISTROW0000000A02" { + t.Fatalf("order: got %s,%s", out[0].ID, out[1].ID) + } + + out, err = s.ListPendingRunsForHost(ctx, "non-existent-host") + if err != nil { + t.Fatal(err) + } + if len(out) != 0 { + t.Fatalf("non-existent host: got %d rows", len(out)) + } +} diff --git a/internal/store/store_test.go b/internal/store/store_test.go index 87aff9b..35eddb5 100644 --- a/internal/store/store_test.go +++ b/internal/store/store_test.go @@ -84,6 +84,70 @@ func TestMigrateIsIdempotent(t *testing.T) { } } +func TestMigration0009Schema(t *testing.T) { + t.Parallel() + s := openTestStore(t) + ctx := context.Background() + + // host_credentials must have a composite PK (host_id, kind). 
+ // We verify this by inserting two rows for the same host_id (different kinds) + // and confirming a duplicate (host_id, kind) fails. + _, err := s.DB().ExecContext(ctx, + `INSERT INTO hosts (id, name, os, arch, enrolled_at) VALUES (?,?,?,?,?)`, + "h-0009", "test-host", "linux", "amd64", "2026-01-01T00:00:00Z") + if err != nil { + t.Fatalf("insert host: %v", err) + } + now := "2026-01-01T00:00:00Z" + if _, err := s.DB().ExecContext(ctx, + `INSERT INTO host_credentials (host_id, kind, enc_repo_creds, updated_at) VALUES (?,?,?,?)`, + "h-0009", "repo", "enc-repo", now); err != nil { + t.Fatalf("insert repo creds: %v", err) + } + if _, err := s.DB().ExecContext(ctx, + `INSERT INTO host_credentials (host_id, kind, enc_repo_creds, updated_at) VALUES (?,?,?,?)`, + "h-0009", "admin", "enc-admin", now); err != nil { + t.Fatalf("insert admin creds: %v", err) + } + // Duplicate (host_id, kind) must fail. + if _, err := s.DB().ExecContext(ctx, + `INSERT INTO host_credentials (host_id, kind, enc_repo_creds, updated_at) VALUES (?,?,?,?)`, + "h-0009", "repo", "enc-repo-2", now); err == nil { + t.Fatal("expected unique constraint violation on (host_id, kind), got nil") + } + + // CHECK (kind IN ('repo','admin')) must reject an invalid kind. + if _, err := s.DB().ExecContext(ctx, + `INSERT INTO host_credentials (host_id, kind, enc_repo_creds, updated_at) VALUES (?,?,?,?)`, + "h-0009", "other", "enc-other", now); err == nil { + t.Fatal("expected CHECK constraint violation on kind='other', got nil") + } + + // host_repo_stats table must exist with expected columns. 
+ if _, err := s.DB().ExecContext(ctx, + `INSERT INTO host_repo_stats (host_id, lock_present, updated_at) VALUES (?,?,?)`, + "h-0009", 0, now); err != nil { + t.Fatalf("insert host_repo_stats: %v", err) + } + var lockPresent int + if err := s.DB().QueryRowContext(ctx, + `SELECT lock_present FROM host_repo_stats WHERE host_id = ?`, "h-0009", + ).Scan(&lockPresent); err != nil { + t.Fatalf("select host_repo_stats: %v", err) + } + if lockPresent != 0 { + t.Errorf("expected lock_present=0, got %d", lockPresent) + } + + // CHECK (last_check_status IN ('ok','errors_found','failed')) must reject + // an invalid value. + if _, err := s.DB().ExecContext(ctx, + `UPDATE host_repo_stats SET last_check_status = ? WHERE host_id = ?`, + "wat", "h-0009"); err == nil { + t.Fatal("expected CHECK constraint violation on last_check_status='wat', got nil") + } +} + func TestForeignKeysEnforced(t *testing.T) { t.Parallel() s := openTestStore(t) diff --git a/tasks.md b/tasks.md index df03958..b0fa7f7 100644 --- a/tasks.md +++ b/tasks.md @@ -166,14 +166,24 @@ Sizes: **S** = under a day, **M** = 1–3 days, **L** = 3–7 days. - Header "version N · agent in sync / agent at vM" indicator preserved across all tabs (backed by `host_schedule_version` + `applied_schedule_version`). - Form validation re-renders with the operator's typed input intact (mirror P2-04's behaviour). Each save fires `pushScheduleSetAsync` so an online agent re-arms within seconds. -### P2 redesign — Phase 5 (server-side maintenance ticker) — TODO +### P2 redesign — Phase 5 (server-side maintenance ticker) -- [ ] **P2R-03** (M) `prune` command end-to-end. Restic wrapper (`restic.RunPrune`), agent dispatcher (`case api.JobPrune:`), wire envelope. 
**Admin-only credential**: a second `host_credentials` row keyed by `host_id` + `kind=admin` carries the non-append-only username/password; server pushes it via `config.update` only when dispatching a prune job, and the agent's secrets store keeps it in a separate slot from the everyday append-only creds. UI: prune row on the Repo page. Operator-triggered Run-now via `POST /hosts/{id}/repo/prune`. Cadence-driven dispatch lands in P2R-04. -- [ ] **P2R-04** (M) `check` command end-to-end (`restic check --read-data-subset N%`). Wrapper + dispatcher + wire. UI: check row on the Repo page (with the subset % slider). Operator Run-now via `POST /hosts/{id}/repo/check`. Cadence-driven dispatch lands in P2R-05. -- [ ] **P2R-05** (S) `unlock` command end-to-end (`restic unlock`). Operator-only — no cadence. `POST /hosts/{id}/repo/unlock`. Repo page surfaces lock state from the most recent `check` (which warns about stale locks). -- [ ] **P2R-06** (M) Server-side maintenance ticker. Cron-style loop on the server reads `host_repo_maintenance` rows, dispatches `forget` / `prune` / `check` jobs against the right host on the configured cadence (last-run timestamps tracked per kind on the maintenance row). Independent of the agent's local cron — the agent's cron only handles backup schedules now. Skips offline hosts (queues to `pending_runs` instead — see P2R-08). Handles ticker restarts cleanly (no-op if a job of the same kind ran inside the cadence window). -- [ ] **P2R-07** (S) Repo stats panel on the Repo page: size, dedup ratio, snapshot count, last-check timestamp + result, lock state, last-prune timestamp + bytes-freed. Backed by parsing `restic stats --json` output that the agent ships periodically (piggyback on the existing snapshots-report path). -- [ ] **P2R-08** (M) Pending-runs queue worker. On agent reconnect, server drains `pending_runs` rows for that host and re-dispatches them in order. 
Bump backoff per `pending_run.attempt_count`; drop rows that have exceeded the source-group's `retry_max`. Audit-logged. Smoke-tested by stopping the agent, running maintenance ticker so cadence misses, restarting agent, watching the queue drain. +- [x] **P2R-03** (M) `prune` command end-to-end. Restic wrapper (`restic.RunPrune`), agent dispatcher (`case api.JobPrune:`), wire envelope. **Admin-only credential**: a second `host_credentials` row keyed by `host_id` + `kind=admin` carries the non-append-only username/password; server pushes it via `config.update` only when dispatching a prune job, and the agent's secrets store keeps it in a separate slot from the everyday append-only creds. UI: prune row on the Repo page. Operator-triggered Run-now via `POST /hosts/{id}/repo/prune`. Cadence-driven dispatch lands in P2R-06. +- [x] **P2R-04** (M) `check` command end-to-end (`restic check --read-data-subset N%`). Wrapper + dispatcher + wire. UI: check row on the Repo page (with the subset % slider). Operator Run-now via `POST /hosts/{id}/repo/check`. Cadence-driven dispatch lands in P2R-06. +- [x] **P2R-05** (S) `unlock` command end-to-end (`restic unlock`). Operator-only — no cadence. `POST /hosts/{id}/repo/unlock`. Repo page surfaces lock state from the most recent `check` (which warns about stale locks). +- [x] **P2R-06** (M) Server-side maintenance ticker. Cron-style loop on the server reads `host_repo_maintenance` rows, dispatches `forget` / `prune` / `check` jobs against the right host on the configured cadence (last-fire anchors are derived per kind from the most recent job row via `LatestJobByKind`, in-flight jobs included). Independent of the agent's local cron — the agent's cron only handles backup schedules now. Skips offline hosts (queues to `pending_runs` instead — see P2R-08). Handles ticker restarts cleanly (no-op if a job of the same kind ran inside the cadence window).
+- [x] **P2R-07** (S) Repo stats panel on the Repo page: size, dedup ratio, snapshot count, last-check timestamp + result, lock state, last-prune timestamp + bytes-freed. Backed by parsing `restic stats --json` output that the agent ships periodically (piggyback on the existing snapshots-report path). +- [x] **P2R-08** (M) Pending-runs queue worker. On agent reconnect, server drains `pending_runs` rows for that host and re-dispatches them in order. Bump backoff per `pending_runs.attempt`; drop rows that have exceeded the source-group's `retry_max`. Audit-logged. Smoke-tested by stopping the agent, running maintenance ticker so cadence misses, restarting agent, watching the queue drain. + +### P2 redesign — Phase 5 ✅ + +- Restic-manager Phase 5 lands on branch `p2r-phase5-maintenance`: + prune/check/unlock end-to-end (P2R-03/04/05); server-side + maintenance ticker drives forget/prune/check on cadence (P2R-06); + repo-stats panel surfaces size, lock state, last-check / last-prune + (P2R-07); pending-runs queue worker drains scheduled-backup + fires that raced an agent disconnect (P2R-08). See + `docs/superpowers/plans/2026-05-03-p2-redesign-phase-5.md`.
### P2 redesign — Phase 6 (auto-init follow-up) — TODO diff --git a/web/static/css/styles.css b/web/static/css/styles.css index d8105fa..ac06171 100644 --- a/web/static/css/styles.css +++ b/web/static/css/styles.css @@ -1,3 +1,3 @@ *,:after,:before{--tw-border-spacing-x:0;--tw-border-spacing-y:0;--tw-translate-x:0;--tw-translate-y:0;--tw-rotate:0;--tw-skew-x:0;--tw-skew-y:0;--tw-scale-x:1;--tw-scale-y:1;--tw-pan-x: ;--tw-pan-y: ;--tw-pinch-zoom: ;--tw-scroll-snap-strictness:proximity;--tw-gradient-from-position: ;--tw-gradient-via-position: ;--tw-gradient-to-position: ;--tw-ordinal: ;--tw-slashed-zero: ;--tw-numeric-figure: ;--tw-numeric-spacing: ;--tw-numeric-fraction: ;--tw-ring-inset: ;--tw-ring-offset-width:0px;--tw-ring-offset-color:#fff;--tw-ring-color:rgba(59,130,246,.5);--tw-ring-offset-shadow:0 0 #0000;--tw-ring-shadow:0 0 #0000;--tw-shadow:0 0 #0000;--tw-shadow-colored:0 0 #0000;--tw-blur: ;--tw-brightness: ;--tw-contrast: ;--tw-grayscale: ;--tw-hue-rotate: ;--tw-invert: ;--tw-saturate: ;--tw-sepia: ;--tw-drop-shadow: ;--tw-backdrop-blur: ;--tw-backdrop-brightness: ;--tw-backdrop-contrast: ;--tw-backdrop-grayscale: ;--tw-backdrop-hue-rotate: ;--tw-backdrop-invert: ;--tw-backdrop-opacity: ;--tw-backdrop-saturate: ;--tw-backdrop-sepia: ;--tw-contain-size: ;--tw-contain-layout: ;--tw-contain-paint: ;--tw-contain-style: }::backdrop{--tw-border-spacing-x:0;--tw-border-spacing-y:0;--tw-translate-x:0;--tw-translate-y:0;--tw-rotate:0;--tw-skew-x:0;--tw-skew-y:0;--tw-scale-x:1;--tw-scale-y:1;--tw-pan-x: ;--tw-pan-y: ;--tw-pinch-zoom: ;--tw-scroll-snap-strictness:proximity;--tw-gradient-from-position: ;--tw-gradient-via-position: ;--tw-gradient-to-position: ;--tw-ordinal: ;--tw-slashed-zero: ;--tw-numeric-figure: ;--tw-numeric-spacing: ;--tw-numeric-fraction: ;--tw-ring-inset: ;--tw-ring-offset-width:0px;--tw-ring-offset-color:#fff;--tw-ring-color:rgba(59,130,246,.5);--tw-ring-offset-shadow:0 0 #0000;--tw-ring-shadow:0 0 #0000;--tw-shadow:0 0 
#0000;--tw-shadow-colored:0 0 #0000;--tw-blur: ;--tw-brightness: ;--tw-contrast: ;--tw-grayscale: ;--tw-hue-rotate: ;--tw-invert: ;--tw-saturate: ;--tw-sepia: ;--tw-drop-shadow: ;--tw-backdrop-blur: ;--tw-backdrop-brightness: ;--tw-backdrop-contrast: ;--tw-backdrop-grayscale: ;--tw-backdrop-hue-rotate: ;--tw-backdrop-invert: ;--tw-backdrop-opacity: ;--tw-backdrop-saturate: ;--tw-backdrop-sepia: ;--tw-contain-size: ;--tw-contain-layout: ;--tw-contain-paint: ;--tw-contain-style: } -/*! tailwindcss v3.4.17 | MIT License | https://tailwindcss.com*/*,:after,:before{border:0 solid #e5e7eb;box-sizing:border-box}:after,:before{--tw-content:""}:host,html{line-height:1.5;-webkit-text-size-adjust:100%;font-family:Inter,system-ui,-apple-system,sans-serif;font-feature-settings:normal;font-variation-settings:normal;-moz-tab-size:4;-o-tab-size:4;tab-size:4;-webkit-tap-highlight-color:transparent}body{line-height:inherit;margin:0}hr{border-top-width:1px;color:inherit;height:0}abbr:where([title]){-webkit-text-decoration:underline dotted;text-decoration:underline dotted}h1,h2,h3,h4,h5,h6{font-size:inherit;font-weight:inherit}a{color:inherit;text-decoration:inherit}b,strong{font-weight:bolder}code,kbd,pre,samp{font-family:JetBrains 
Mono,ui-monospace,monospace;font-feature-settings:normal;font-size:1em;font-variation-settings:normal}small{font-size:80%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}table{border-collapse:collapse;border-color:inherit;text-indent:0}button,input,optgroup,select,textarea{color:inherit;font-family:inherit;font-feature-settings:inherit;font-size:100%;font-variation-settings:inherit;font-weight:inherit;letter-spacing:inherit;line-height:inherit;margin:0;padding:0}button,select{text-transform:none}button,input:where([type=button]),input:where([type=reset]),input:where([type=submit]){-webkit-appearance:button;background-color:transparent;background-image:none}:-moz-focusring{outline:auto}:-moz-ui-invalid{box-shadow:none}progress{vertical-align:baseline}::-webkit-inner-spin-button,::-webkit-outer-spin-button{height:auto}[type=search]{-webkit-appearance:textfield;outline-offset:-2px}::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{-webkit-appearance:button;font:inherit}summary{display:list-item}blockquote,dd,dl,figure,h1,h2,h3,h4,h5,h6,hr,p,pre{margin:0}fieldset{margin:0}fieldset,legend{padding:0}menu,ol,ul{list-style:none;margin:0;padding:0}dialog{padding:0}textarea{resize:vertical}input::-moz-placeholder,textarea::-moz-placeholder{color:#9ca3af;opacity:1}input::placeholder,textarea::placeholder{color:#9ca3af;opacity:1}[role=button],button{cursor:pointer}:disabled{cursor:default}audio,canvas,embed,iframe,img,object,svg,video{display:block;vertical-align:middle}img,video{height:auto;max-width:100%}[hidden]:where(:not([hidden=until-found])){display:none}:root{--bg:oklch(0.17 0.006 250);--panel:oklch(0.20 0.007 250);--panel-hi:oklch(0.23 0.008 250);--line:oklch(0.27 0.010 250);--line-soft:oklch(0.23 0.008 250);--ink:oklch(0.96 0.005 250);--ink-mid:oklch(0.78 0.005 250);--ink-mute:oklch(0.58 0.006 250);--ink-fade:oklch(0.42 0.006 250);--ok:oklch(0.78 0.14 155);--warn:oklch(0.82 
0.13 80);--bad:oklch(0.70 0.20 25);--off:oklch(0.50 0.005 250);--accent:oklch(0.82 0.12 195)}body,html{background:var(--bg);color:var(--ink);font-family:Inter,system-ui,-apple-system,sans-serif;-webkit-font-smoothing:antialiased}body{font-feature-settings:"cv11","ss01","ss03"}::-moz-selection{background:color-mix(in oklch,var(--accent),transparent 70%)}::selection{background:color-mix(in oklch,var(--accent),transparent 70%)}.\!container{width:100%!important}.container{width:100%}@media (min-width:640px){.\!container{max-width:640px!important}.container{max-width:640px}}@media (min-width:768px){.\!container{max-width:768px!important}.container{max-width:768px}}@media (min-width:1024px){.\!container{max-width:1024px!important}.container{max-width:1024px}}@media (min-width:1280px){.\!container{max-width:1280px!important}.container{max-width:1280px}}@media (min-width:1536px){.\!container{max-width:1536px!important}.container{max-width:1536px}}.mono{font-family:JetBrains Mono,ui-monospace,monospace;font-variant-numeric:tabular-nums}.panel{background:var(--panel);border:1px solid var(--line-soft)}.hairline{box-shadow:inset 0 -1px 0 var(--line-soft)}.dot{border-radius:9999px;display:inline-block;height:7px;width:7px}.dot-online{background:var(--ok);box-shadow:0 0 0 3px color-mix(in oklch,var(--ok),transparent 80%)}.dot-degraded{background:var(--warn);box-shadow:0 0 0 3px color-mix(in oklch,var(--warn),transparent 80%)}.dot-offline{background:var(--off)}.dot-failed{background:var(--bad);box-shadow:0 0 0 3px color-mix(in oklch,var(--bad),transparent 80%)}.pulse{animation:rm-pulse 2.4s ease-in-out infinite}@keyframes rm-pulse{0%,to{box-shadow:0 0 0 3px color-mix(in oklch,var(--accent),transparent 80%)}50%{box-shadow:0 0 0 6px color-mix(in oklch,var(--accent),transparent 92%)}}.btn{align-items:center;background:transparent;border:1px solid var(--line);border-radius:5px;color:var(--ink-mid);cursor:pointer;display:inline-flex;font-size:12px;font-weight:500;gap:6px;padding:6px 
11px;text-decoration:none;transition:all .12s ease}.btn:hover{background:var(--panel-hi);color:var(--ink)}.btn:disabled,.btn[disabled]{cursor:not-allowed;opacity:.4;pointer-events:none}.btn-primary{background:var(--accent);border-color:var(--accent);color:oklch(.18 .01 195)}.btn-primary:hover{filter:brightness(1.08)}.btn-ghost,.btn-ghost:hover{border-color:transparent}.btn-ghost:hover{background:var(--panel-hi)}.btn-danger{border-color:color-mix(in oklch,var(--bad),transparent 70%);color:var(--bad)}.btn-danger:hover{background:color-mix(in oklch,var(--bad),transparent 88%);border-color:color-mix(in oklch,var(--bad),transparent 50%);color:oklch(.85 .1 25)}.btn-lg{font-size:13px;padding:9px 14px}.btn-block{justify-content:center;width:100%}.nav-tab{border-bottom:2px solid transparent;color:var(--ink-mute);cursor:pointer;font-size:13px;margin-right:28px;padding:18px 0;text-decoration:none}.nav-tab.active{border-color:var(--accent)}.nav-tab.active,.nav-tab:hover{color:var(--ink)}.sub-tab{border-bottom:1.5px solid transparent;color:var(--ink-mute);cursor:pointer;font-size:13px;margin-right:24px;padding:12px 0;text-decoration:none}.sub-tab.active{border-color:var(--ink);color:var(--ink)}.tag{align-items:center;border:1px solid var(--line);border-radius:3px;display:inline-flex;font-size:11px;gap:5px;letter-spacing:.01em;line-height:1;padding:4px 7px}.field-label,.tag{color:var(--ink-mid)}.field-label{display:block;font-size:12px;margin-bottom:6px}.field-help{color:var(--ink-mute);font-size:12px;line-height:1.55;margin-top:6px}.field{background:var(--bg);border:1px solid var(--line-soft);border-radius:5px;color:var(--ink);font-family:inherit;font-size:13px;outline:none;padding:9px 12px;transition:border-color .12s ease;width:100%}.field:focus{border-color:var(--accent)}.field.invalid{border-color:color-mix(in oklch,var(--bad),transparent 50%)}.field.mono{font-family:JetBrains 
Mono,monospace;font-size:12px}.field.with-prefix{padding-left:64px}.host-row{align-items:center;border-left:3px solid transparent;-moz-column-gap:18px;column-gap:18px;display:grid;font-size:13px;grid-template-columns:24px 1.4fr .95fr 1.5fr .75fr .7fr .7fr 1.1fr 92px;padding:11px 16px}.host-row.head{color:var(--ink-fade);font-size:11px;letter-spacing:.08em;padding-bottom:10px;padding-top:10px;text-transform:uppercase}.host-row.degraded{border-left-color:color-mix(in oklch,var(--warn),transparent 50%)}.host-row.failed{border-left-color:color-mix(in oklch,var(--bad),transparent 50%)}.host-row.offline{border-left-color:color-mix(in oklch,var(--off),transparent 70%)}.host-row:hover{background:var(--panel-hi)}.host-row.clickable{position:relative}.host-row.clickable .row-link{inset:0;overflow:hidden;position:absolute;text-indent:-9999px;z-index:0}.host-row.clickable:hover{cursor:pointer}.host-row.clickable>*{pointer-events:none;position:relative;z-index:1}.host-row.clickable>.row-action,.host-row.clickable>.row-link{pointer-events:auto}.src-row{align-items:center;-moz-column-gap:18px;column-gap:18px;display:grid;grid-template-columns:1fr auto;padding:14px 18px}.src-row.clickable{position:relative}.src-row.clickable .row-link{inset:0;overflow:hidden;position:absolute;text-indent:-9999px;z-index:0}.src-row.clickable:hover{background:var(--panel-hi);cursor:pointer}.src-row.clickable>*{pointer-events:none;position:relative;z-index:1}.src-row.clickable>.row-action,.src-row.clickable>.row-link{pointer-events:auto}.schd-row{align-items:center;-moz-column-gap:18px;column-gap:18px;display:grid;font-size:13px;grid-template-columns:90px 1fr 2fr auto;padding:12px 18px}.schd-row.head{color:var(--ink-fade);font-size:11px;letter-spacing:.08em;padding-bottom:10px;padding-top:10px;text-transform:uppercase}.schd-row.clickable{position:relative}.schd-row.clickable 
.row-link{inset:0;overflow:hidden;position:absolute;text-indent:-9999px;z-index:0}.schd-row.clickable:hover{background:var(--panel-hi);cursor:pointer}.schd-row.clickable>*{pointer-events:none;position:relative;z-index:1}.schd-row.clickable>.row-action,.schd-row.clickable>.row-link{pointer-events:auto}.preset-chip{background:var(--bg);border:1px solid var(--line-soft);border-radius:4px;color:var(--ink-mid);cursor:pointer;font-family:JetBrains Mono,monospace;font-size:11.5px;padding:4px 9px;transition:border-color .1s ease,color .1s ease;-webkit-user-select:none;-moz-user-select:none;user-select:none}.preset-chip:hover{border-color:var(--accent);color:var(--ink)}.picker{align-items:center;background:var(--bg);border:1px solid var(--line-soft);border-radius:5px;cursor:pointer;display:flex;font-size:13px;gap:12px;padding:10px 12px;transition:border-color .1s ease,background .1s ease}.picker:hover{border-color:var(--ink-mute)}.picker .check{border:1px solid var(--line);border-radius:3px;display:inline-block;flex-shrink:0;height:14px;position:relative;width:14px}.picker.checked{background:color-mix(in oklch,var(--accent),transparent 92%);border-color:color-mix(in oklch,var(--accent),transparent 50%)}.picker.checked .check{background:var(--accent);border-color:var(--accent)}.picker.checked .check:after{border:solid oklch(.18 .01 195);border-width:0 1.5px 1.5px 0;content:"";height:8px;left:4px;position:absolute;top:1px;transform:rotate(45deg);width:4px}.picker input[type=checkbox]{opacity:0;pointer-events:none;position:absolute}.keep-cell{background:var(--bg);border:1px solid var(--line-soft);border-radius:5px;display:flex;flex-direction:column;gap:4px;padding:9px 11px}.keep-cell label{color:var(--ink-fade);font-size:10.5px;letter-spacing:.08em;text-transform:uppercase}.keep-cell input{background:transparent;border:none;color:var(--ink);font-size:14px;outline:none;padding:0;width:100%}.keep-cell input,.log{font-family:JetBrains 
Mono,monospace}.log{background:var(--bg);border:1px solid var(--line-soft);border-radius:7px;font-size:12px;line-height:1.7;overflow:hidden}.log-line{align-items:baseline;-moz-column-gap:14px;column-gap:14px;display:grid;grid-template-columns:14ch 8ch 1fr;padding:1px 16px}.log-line:first-child{padding-top:12px}.log-line:last-child{padding-bottom:12px}.log-tag,.log-ts{color:var(--ink-fade)}.log-tag{font-size:10px;letter-spacing:.08em;text-transform:uppercase}.progress-track{background:var(--bg);border:1px solid var(--line-soft);border-radius:9999px;height:6px;overflow:hidden}.progress-fill{background:var(--accent);border-radius:9999px;height:100%;transition:width .25s ease}.progress-fill.ok{background:var(--ok)}.progress-fill.bad{background:var(--bad)}.crumbs{font-size:12px}.crumbs,.crumbs a{color:var(--ink-mute)}.crumbs a{text-decoration:underline;text-decoration-color:var(--line);text-underline-offset:3px}.crumbs .sep{color:var(--ink-fade);margin:0 8px}.snippet{border:1px solid var(--line-soft);border-radius:6px;overflow:hidden}.snippet-head{align-items:center;border-bottom:1px solid var(--line-soft);color:var(--ink-fade);display:flex;font-size:11px;justify-content:space-between;letter-spacing:.1em;padding:10px 14px;text-transform:uppercase}.snippet pre{color:var(--ink-mid);font-family:JetBrains Mono,monospace;font-size:12px;line-height:1.7;margin:0;padding:14px;white-space:pre-wrap;word-break:break-all}.snippet pre .var{color:var(--accent)}.empty-state{background:radial-gradient(ellipse at top,color-mix(in oklch,var(--accent),transparent 95%),transparent 60%),var(--panel);border:1px dashed var(--line);border-radius:8px;padding:60px 40px;text-align:center}.pointer-events-none{pointer-events:none}.fixed{position:fixed}.absolute{position:absolute}.relative{position:relative}.bottom-5{bottom:1.25rem}.left-0{left:0}.right-5{right:1.25rem}.top-0{top:0}.z-50{z-index:50}.col-span-2{grid-column:span 2/span 2}.col-span-3{grid-column:span 3/span 
3}.col-span-4{grid-column:span 4/span 4}.col-span-5{grid-column:span 5/span 5}.col-span-7{grid-column:span 7/span 7}.col-span-8{grid-column:span 8/span 8}.col-span-9{grid-column:span 9/span 9}.m-0{margin:0}.mx-auto{margin-left:auto;margin-right:auto}.mb-1\.5{margin-bottom:.375rem}.mb-10{margin-bottom:2.5rem}.mb-2{margin-bottom:.5rem}.mb-2\.5{margin-bottom:.625rem}.mb-3{margin-bottom:.75rem}.mb-3\.5{margin-bottom:.875rem}.mb-4{margin-bottom:1rem}.mb-5{margin-bottom:1.25rem}.mb-7{margin-bottom:1.75rem}.ml-1{margin-left:.25rem}.ml-1\.5{margin-left:.375rem}.ml-2{margin-left:.5rem}.mt-0\.5{margin-top:.125rem}.mt-1{margin-top:.25rem}.mt-1\.5{margin-top:.375rem}.mt-2{margin-top:.5rem}.mt-2\.5{margin-top:.625rem}.mt-20{margin-top:5rem}.mt-3{margin-top:.75rem}.mt-3\.5{margin-top:.875rem}.mt-4{margin-top:1rem}.mt-5{margin-top:1.25rem}.mt-6{margin-top:1.5rem}.mt-7{margin-top:1.75rem}.mt-8{margin-top:2rem}.mt-9{margin-top:2.25rem}.block{display:block}.inline-block{display:inline-block}.inline{display:inline}.flex{display:flex}.inline-flex{display:inline-flex}.table{display:table}.grid{display:grid}.h-3\.5{height:.875rem}.h-\[22px\]{height:22px}.min-h-screen{min-height:100vh}.w-16{width:4rem}.w-3\.5{width:.875rem}.w-\[22px\]{width:22px}.w-\[360px\]{width:360px}.w-full{width:100%}.max-w-\[1280px\]{max-width:1280px}.max-w-\[440px\]{max-width:440px}.max-w-\[480px\]{max-width:480px}.max-w-\[520px\]{max-width:520px}.max-w-\[580px\]{max-width:580px}.max-w-\[680px\]{max-width:680px}.max-w-\[720px\]{max-width:720px}.max-w-\[760px\]{max-width:760px}.flex-1{flex:1 1 0%}.flex-none{flex:none}.transform{transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) 
scaleY(var(--tw-scale-y))}.cursor-pointer{cursor:pointer}.resize{resize:both}.list-none{list-style-type:none}.grid-cols-1{grid-template-columns:repeat(1,minmax(0,1fr))}.grid-cols-12{grid-template-columns:repeat(12,minmax(0,1fr))}.grid-cols-2{grid-template-columns:repeat(2,minmax(0,1fr))}.grid-cols-3{grid-template-columns:repeat(3,minmax(0,1fr))}.flex-col{flex-direction:column}.flex-wrap{flex-wrap:wrap}.items-start{align-items:flex-start}.items-end{align-items:flex-end}.items-center{align-items:center}.items-baseline{align-items:baseline}.justify-end{justify-content:flex-end}.justify-center{justify-content:center}.justify-between{justify-content:space-between}.gap-1\.5{gap:.375rem}.gap-2{gap:.5rem}.gap-2\.5{gap:.625rem}.gap-3{gap:.75rem}.gap-3\.5{gap:.875rem}.gap-4{gap:1rem}.gap-5{gap:1.25rem}.gap-6{gap:1.5rem}.gap-8{gap:2rem}.space-y-4>:not([hidden])~:not([hidden]){--tw-space-y-reverse:0;margin-bottom:calc(1rem*var(--tw-space-y-reverse));margin-top:calc(1rem*(1 - var(--tw-space-y-reverse)))}.overflow-hidden,.truncate{overflow:hidden}.truncate{text-overflow:ellipsis}.truncate,.whitespace-nowrap{white-space:nowrap}.text-pretty{text-wrap:pretty}.rounded-\[3px\]{border-radius:3px}.rounded-\[5px\]{border-radius:5px}.rounded-\[6px\]{border-radius:6px}.rounded-\[7px\]{border-radius:7px}.rounded-full{border-radius:9999px}.border{border-width:1px}.border-y{border-top-width:1px}.border-b,.border-y{border-bottom-width:1px}.border-l{border-left-width:1px}.border-t{border-top-width:1px}.border-line{border-color:oklch(.27 .01 250)}.border-line-soft{border-color:oklch(.23 .008 
250)}.p-0{padding:0}.p-4{padding:1rem}.p-5{padding:1.25rem}.p-7{padding:1.75rem}.px-1{padding-left:.25rem;padding-right:.25rem}.px-2{padding-left:.5rem;padding-right:.5rem}.px-2\.5{padding-left:.625rem;padding-right:.625rem}.px-3{padding-left:.75rem;padding-right:.75rem}.px-3\.5{padding-left:.875rem;padding-right:.875rem}.px-4{padding-left:1rem;padding-right:1rem}.px-7{padding-left:1.75rem;padding-right:1.75rem}.px-8{padding-left:2rem;padding-right:2rem}.py-0\.5{padding-bottom:.125rem;padding-top:.125rem}.py-1{padding-bottom:.25rem;padding-top:.25rem}.py-12{padding-bottom:3rem;padding-top:3rem}.py-2{padding-bottom:.5rem;padding-top:.5rem}.py-2\.5{padding-bottom:.625rem;padding-top:.625rem}.py-3{padding-bottom:.75rem;padding-top:.75rem}.py-3\.5{padding-bottom:.875rem;padding-top:.875rem}.py-4{padding-bottom:1rem;padding-top:1rem}.py-5{padding-bottom:1.25rem;padding-top:1.25rem}.py-6{padding-bottom:1.5rem;padding-top:1.5rem}.py-7{padding-bottom:1.75rem;padding-top:1.75rem}.pb-14{padding-bottom:3.5rem}.pb-2{padding-bottom:.5rem}.pb-24{padding-bottom:6rem}.pb-3{padding-bottom:.75rem}.pb-4{padding-bottom:1rem}.pl-6{padding-left:1.5rem}.pl-9{padding-left:2.25rem}.pt-1{padding-top:.25rem}.pt-14{padding-top:3.5rem}.pt-4{padding-top:1rem}.pt-5{padding-top:1.25rem}.pt-6{padding-top:1.5rem}.pt-7{padding-top:1.75rem}.pt-9{padding-top:2.25rem}.pt-\[1px\]{padding-top:1px}.text-center{text-align:center}.text-right{text-align:right}.text-2xl{font-size:1.5rem;line-height:2rem}.text-\[11\.5px\]{font-size:11.5px}.text-\[11px\]{font-size:11px}.text-\[12\.5px\]{font-size:12.5px}.text-\[12px\]{font-size:12px}.text-\[13px\]{font-size:13px}.text-\[14px\]{font-size:14px}.text-\[16px\]{font-size:16px}.text-\[18px\]{font-size:18px}.text-\[20px\]{font-size:20px}.text-\[22px\]{font-size:22px}.text-\[26px\]{font-size:26px}.text-\[28px\]{font-size:28px}.text-base{font-size:1rem;line-height:1.5rem}.text-lg{font-size:1.125rem;line-height:1.75rem}.text-sm{font-size:.875rem;line-height:1.25rem}.text-
xs{font-size:.75rem;line-height:1rem}.font-medium{font-weight:500}.font-normal{font-weight:400}.font-semibold{font-weight:600}.uppercase{text-transform:uppercase}.normal-case{text-transform:none}.italic{font-style:italic}.leading-\[1\.55\]{line-height:1.55}.leading-\[1\.5\]{line-height:1.5}.leading-\[1\.65\]{line-height:1.65}.leading-\[1\.6\]{line-height:1.6}.leading-\[1\.7\]{line-height:1.7}.leading-\[20px\]{line-height:20px}.leading-none{line-height:1}.tracking-\[-0\.005em\]{letter-spacing:-.005em}.tracking-\[-0\.012em\]{letter-spacing:-.012em}.tracking-\[-0\.01em\]{letter-spacing:-.01em}.tracking-\[-0\.02em\]{letter-spacing:-.02em}.tracking-\[0\.005em\]{letter-spacing:.005em}.tracking-\[0\.01em\]{letter-spacing:.01em}.tracking-\[0\.02em\]{letter-spacing:.02em}.tracking-\[0\.08em\]{letter-spacing:.08em}.tracking-\[0\.1em\]{letter-spacing:.1em}.text-accent{color:oklch(.82 .12 195)}.text-bad{color:oklch(.7 .2 25)}.text-ink{color:oklch(.96 .005 250)}.text-ink-fade{color:oklch(.42 .006 250)}.text-ink-mid{color:oklch(.78 .005 250)}.text-ink-mute{color:oklch(.58 .006 250)}.text-ok{color:oklch(.78 .14 155)}.text-warn{color:oklch(.82 .13 80)}.underline{text-decoration-line:underline}.no-underline{text-decoration-line:none}.decoration-line{text-decoration-color:oklch(.27 .01 250)}.underline-offset-4{text-underline-offset:4px}.transition{transition-duration:.15s;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,-webkit-backdrop-filter;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter,-webkit-backdrop-filter;transition-timing-function:cubic-bezier(.4,0,.2,1)} +/*! 
tailwindcss v3.4.17 | MIT License | https://tailwindcss.com*/*,:after,:before{border:0 solid #e5e7eb;box-sizing:border-box}:after,:before{--tw-content:""}:host,html{line-height:1.5;-webkit-text-size-adjust:100%;font-family:Inter,system-ui,-apple-system,sans-serif;font-feature-settings:normal;font-variation-settings:normal;-moz-tab-size:4;-o-tab-size:4;tab-size:4;-webkit-tap-highlight-color:transparent}body{line-height:inherit;margin:0}hr{border-top-width:1px;color:inherit;height:0}abbr:where([title]){-webkit-text-decoration:underline dotted;text-decoration:underline dotted}h1,h2,h3,h4,h5,h6{font-size:inherit;font-weight:inherit}a{color:inherit;text-decoration:inherit}b,strong{font-weight:bolder}code,kbd,pre,samp{font-family:JetBrains Mono,ui-monospace,monospace;font-feature-settings:normal;font-size:1em;font-variation-settings:normal}small{font-size:80%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}table{border-collapse:collapse;border-color:inherit;text-indent:0}button,input,optgroup,select,textarea{color:inherit;font-family:inherit;font-feature-settings:inherit;font-size:100%;font-variation-settings:inherit;font-weight:inherit;letter-spacing:inherit;line-height:inherit;margin:0;padding:0}button,select{text-transform:none}button,input:where([type=button]),input:where([type=reset]),input:where([type=submit]){-webkit-appearance:button;background-color:transparent;background-image:none}:-moz-focusring{outline:auto}:-moz-ui-invalid{box-shadow:none}progress{vertical-align:baseline}::-webkit-inner-spin-button,::-webkit-outer-spin-button{height:auto}[type=search]{-webkit-appearance:textfield;outline-offset:-2px}::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{-webkit-appearance:button;font:inherit}summary{display:list-item}blockquote,dd,dl,figure,h1,h2,h3,h4,h5,h6,hr,p,pre{margin:0}fieldset{margin:0}fieldset,legend{padding:0}menu,ol,ul{list-style:none;margin:0;padding:0}dialog{
padding:0}textarea{resize:vertical}input::-moz-placeholder,textarea::-moz-placeholder{color:#9ca3af;opacity:1}input::placeholder,textarea::placeholder{color:#9ca3af;opacity:1}[role=button],button{cursor:pointer}:disabled{cursor:default}audio,canvas,embed,iframe,img,object,svg,video{display:block;vertical-align:middle}img,video{height:auto;max-width:100%}[hidden]:where(:not([hidden=until-found])){display:none}:root{--bg:oklch(0.17 0.006 250);--panel:oklch(0.20 0.007 250);--panel-hi:oklch(0.23 0.008 250);--line:oklch(0.27 0.010 250);--line-soft:oklch(0.23 0.008 250);--ink:oklch(0.96 0.005 250);--ink-mid:oklch(0.78 0.005 250);--ink-mute:oklch(0.58 0.006 250);--ink-fade:oklch(0.42 0.006 250);--ok:oklch(0.78 0.14 155);--warn:oklch(0.82 0.13 80);--bad:oklch(0.70 0.20 25);--off:oklch(0.50 0.005 250);--accent:oklch(0.82 0.12 195)}body,html{background:var(--bg);color:var(--ink);font-family:Inter,system-ui,-apple-system,sans-serif;-webkit-font-smoothing:antialiased}body{font-feature-settings:"cv11","ss01","ss03"}::-moz-selection{background:color-mix(in oklch,var(--accent),transparent 70%)}::selection{background:color-mix(in oklch,var(--accent),transparent 70%)}.\!container{width:100%!important}.container{width:100%}@media (min-width:640px){.\!container{max-width:640px!important}.container{max-width:640px}}@media (min-width:768px){.\!container{max-width:768px!important}.container{max-width:768px}}@media (min-width:1024px){.\!container{max-width:1024px!important}.container{max-width:1024px}}@media (min-width:1280px){.\!container{max-width:1280px!important}.container{max-width:1280px}}@media (min-width:1536px){.\!container{max-width:1536px!important}.container{max-width:1536px}}.mono{font-family:JetBrains Mono,ui-monospace,monospace;font-variant-numeric:tabular-nums}.panel{background:var(--panel);border:1px solid var(--line-soft)}.hairline{box-shadow:inset 0 -1px 0 
var(--line-soft)}.dot{border-radius:9999px;display:inline-block;height:7px;width:7px}.dot-online{background:var(--ok);box-shadow:0 0 0 3px color-mix(in oklch,var(--ok),transparent 80%)}.dot-degraded{background:var(--warn);box-shadow:0 0 0 3px color-mix(in oklch,var(--warn),transparent 80%)}.dot-offline{background:var(--off)}.dot-failed{background:var(--bad);box-shadow:0 0 0 3px color-mix(in oklch,var(--bad),transparent 80%)}.pulse{animation:rm-pulse 2.4s ease-in-out infinite}@keyframes rm-pulse{0%,to{box-shadow:0 0 0 3px color-mix(in oklch,var(--accent),transparent 80%)}50%{box-shadow:0 0 0 6px color-mix(in oklch,var(--accent),transparent 92%)}}.btn{align-items:center;background:transparent;border:1px solid var(--line);border-radius:5px;color:var(--ink-mid);cursor:pointer;display:inline-flex;font-size:12px;font-weight:500;gap:6px;padding:6px 11px;text-decoration:none;transition:all .12s ease}.btn:hover{background:var(--panel-hi);color:var(--ink)}.btn:disabled,.btn[disabled]{cursor:not-allowed;opacity:.4;pointer-events:none}.btn-primary{background:var(--accent);border-color:var(--accent);color:oklch(.18 .01 195)}.btn-primary:hover{filter:brightness(1.08)}.btn-ghost,.btn-ghost:hover{border-color:transparent}.btn-ghost:hover{background:var(--panel-hi)}.btn-danger{border-color:color-mix(in oklch,var(--bad),transparent 70%);color:var(--bad)}.btn-danger:hover{background:color-mix(in oklch,var(--bad),transparent 88%);border-color:color-mix(in oklch,var(--bad),transparent 50%);color:oklch(.85 .1 25)}.btn-lg{font-size:13px;padding:9px 14px}.btn-block{justify-content:center;width:100%}.nav-tab{border-bottom:2px solid transparent;color:var(--ink-mute);cursor:pointer;font-size:13px;margin-right:28px;padding:18px 0;text-decoration:none}.nav-tab.active{border-color:var(--accent)}.nav-tab.active,.nav-tab:hover{color:var(--ink)}.sub-tab{border-bottom:1.5px solid transparent;color:var(--ink-mute);cursor:pointer;font-size:13px;margin-right:24px;padding:12px 
0;text-decoration:none}.sub-tab.active{border-color:var(--ink);color:var(--ink)}.tag{align-items:center;border:1px solid var(--line);border-radius:3px;display:inline-flex;font-size:11px;gap:5px;letter-spacing:.01em;line-height:1;padding:4px 7px}.field-label,.tag{color:var(--ink-mid)}.field-label{display:block;font-size:12px;margin-bottom:6px}.field-help{color:var(--ink-mute);font-size:12px;line-height:1.55;margin-top:6px}.field{background:var(--bg);border:1px solid var(--line-soft);border-radius:5px;color:var(--ink);font-family:inherit;font-size:13px;outline:none;padding:9px 12px;transition:border-color .12s ease;width:100%}.field:focus{border-color:var(--accent)}.field.invalid{border-color:color-mix(in oklch,var(--bad),transparent 50%)}.field.mono{font-family:JetBrains Mono,monospace;font-size:12px}.field.with-prefix{padding-left:64px}.host-row{align-items:center;border-left:3px solid transparent;-moz-column-gap:18px;column-gap:18px;display:grid;font-size:13px;grid-template-columns:24px 1.4fr .95fr 1.5fr .75fr .7fr .7fr 1.1fr 92px;padding:11px 16px}.host-row.head{color:var(--ink-fade);font-size:11px;letter-spacing:.08em;padding-bottom:10px;padding-top:10px;text-transform:uppercase}.host-row.degraded{border-left-color:color-mix(in oklch,var(--warn),transparent 50%)}.host-row.failed{border-left-color:color-mix(in oklch,var(--bad),transparent 50%)}.host-row.offline{border-left-color:color-mix(in oklch,var(--off),transparent 70%)}.host-row:hover{background:var(--panel-hi)}.host-row.clickable{position:relative}.host-row.clickable .row-link{inset:0;overflow:hidden;position:absolute;text-indent:-9999px;z-index:0}.host-row.clickable:hover{cursor:pointer}.host-row.clickable>*{pointer-events:none;position:relative;z-index:1}.host-row.clickable>.row-action,.host-row.clickable>.row-link{pointer-events:auto}.src-row{align-items:center;-moz-column-gap:18px;column-gap:18px;display:grid;grid-template-columns:1fr auto;padding:14px 
18px}.src-row.clickable{position:relative}.src-row.clickable .row-link{inset:0;overflow:hidden;position:absolute;text-indent:-9999px;z-index:0}.src-row.clickable:hover{background:var(--panel-hi);cursor:pointer}.src-row.clickable>*{pointer-events:none;position:relative;z-index:1}.src-row.clickable>.row-action,.src-row.clickable>.row-link{pointer-events:auto}.schd-row{align-items:center;-moz-column-gap:18px;column-gap:18px;display:grid;font-size:13px;grid-template-columns:90px 1fr 2fr auto;padding:12px 18px}.schd-row.head{color:var(--ink-fade);font-size:11px;letter-spacing:.08em;padding-bottom:10px;padding-top:10px;text-transform:uppercase}.schd-row.clickable{position:relative}.schd-row.clickable .row-link{inset:0;overflow:hidden;position:absolute;text-indent:-9999px;z-index:0}.schd-row.clickable:hover{background:var(--panel-hi);cursor:pointer}.schd-row.clickable>*{pointer-events:none;position:relative;z-index:1}.schd-row.clickable>.row-action,.schd-row.clickable>.row-link{pointer-events:auto}.preset-chip{background:var(--bg);border:1px solid var(--line-soft);border-radius:4px;color:var(--ink-mid);cursor:pointer;font-family:JetBrains Mono,monospace;font-size:11.5px;padding:4px 9px;transition:border-color .1s ease,color .1s ease;-webkit-user-select:none;-moz-user-select:none;user-select:none}.preset-chip:hover{border-color:var(--accent);color:var(--ink)}.picker{align-items:center;background:var(--bg);border:1px solid var(--line-soft);border-radius:5px;cursor:pointer;display:flex;font-size:13px;gap:12px;padding:10px 12px;transition:border-color .1s ease,background .1s ease}.picker:hover{border-color:var(--ink-mute)}.picker .check{border:1px solid var(--line);border-radius:3px;display:inline-block;flex-shrink:0;height:14px;position:relative;width:14px}.picker.checked{background:color-mix(in oklch,var(--accent),transparent 92%);border-color:color-mix(in oklch,var(--accent),transparent 50%)}.picker.checked 
.check{background:var(--accent);border-color:var(--accent)}.picker.checked .check:after{border:solid oklch(.18 .01 195);border-width:0 1.5px 1.5px 0;content:"";height:8px;left:4px;position:absolute;top:1px;transform:rotate(45deg);width:4px}.picker input[type=checkbox]{opacity:0;pointer-events:none;position:absolute}.keep-cell{background:var(--bg);border:1px solid var(--line-soft);border-radius:5px;display:flex;flex-direction:column;gap:4px;padding:9px 11px}.keep-cell label{color:var(--ink-fade);font-size:10.5px;letter-spacing:.08em;text-transform:uppercase}.keep-cell input{background:transparent;border:none;color:var(--ink);font-size:14px;outline:none;padding:0;width:100%}.keep-cell input,.log{font-family:JetBrains Mono,monospace}.log{background:var(--bg);border:1px solid var(--line-soft);border-radius:7px;font-size:12px;line-height:1.7;overflow:hidden}.log-line{align-items:baseline;-moz-column-gap:14px;column-gap:14px;display:grid;grid-template-columns:14ch 8ch 1fr;padding:1px 16px}.log-line:first-child{padding-top:12px}.log-line:last-child{padding-bottom:12px}.log-tag,.log-ts{color:var(--ink-fade)}.log-tag{font-size:10px;letter-spacing:.08em;text-transform:uppercase}.progress-track{background:var(--bg);border:1px solid var(--line-soft);border-radius:9999px;height:6px;overflow:hidden}.progress-fill{background:var(--accent);border-radius:9999px;height:100%;transition:width .25s ease}.progress-fill.ok{background:var(--ok)}.progress-fill.bad{background:var(--bad)}.crumbs{font-size:12px}.crumbs,.crumbs a{color:var(--ink-mute)}.crumbs a{text-decoration:underline;text-decoration-color:var(--line);text-underline-offset:3px}.crumbs .sep{color:var(--ink-fade);margin:0 8px}.snippet{border:1px solid var(--line-soft);border-radius:6px;overflow:hidden}.snippet-head{align-items:center;border-bottom:1px solid var(--line-soft);color:var(--ink-fade);display:flex;font-size:11px;justify-content:space-between;letter-spacing:.1em;padding:10px 14px;text-transform:uppercase}.snippet 
pre{color:var(--ink-mid);font-family:JetBrains Mono,monospace;font-size:12px;line-height:1.7;margin:0;padding:14px;white-space:pre-wrap;word-break:break-all}.snippet pre .var{color:var(--accent)}.empty-state{background:radial-gradient(ellipse at top,color-mix(in oklch,var(--accent),transparent 95%),transparent 60%),var(--panel);border:1px dashed var(--line);border-radius:8px;padding:60px 40px;text-align:center}.pointer-events-none{pointer-events:none}.fixed{position:fixed}.absolute{position:absolute}.relative{position:relative}.bottom-5{bottom:1.25rem}.left-0{left:0}.right-5{right:1.25rem}.top-0{top:0}.z-50{z-index:50}.col-span-2{grid-column:span 2/span 2}.col-span-3{grid-column:span 3/span 3}.col-span-4{grid-column:span 4/span 4}.col-span-5{grid-column:span 5/span 5}.col-span-7{grid-column:span 7/span 7}.col-span-8{grid-column:span 8/span 8}.col-span-9{grid-column:span 9/span 9}.m-0{margin:0}.mx-auto{margin-left:auto;margin-right:auto}.mb-1\.5{margin-bottom:.375rem}.mb-10{margin-bottom:2.5rem}.mb-2{margin-bottom:.5rem}.mb-2\.5{margin-bottom:.625rem}.mb-3{margin-bottom:.75rem}.mb-3\.5{margin-bottom:.875rem}.mb-4{margin-bottom:1rem}.mb-5{margin-bottom:1.25rem}.mb-7{margin-bottom:1.75rem}.ml-1{margin-left:.25rem}.ml-1\.5{margin-left:.375rem}.ml-2{margin-left:.5rem}.mt-0\.5{margin-top:.125rem}.mt-1{margin-top:.25rem}.mt-1\.5{margin-top:.375rem}.mt-2{margin-top:.5rem}.mt-2\.5{margin-top:.625rem}.mt-20{margin-top:5rem}.mt-3{margin-top:.75rem}.mt-3\.5{margin-top:.875rem}.mt-4{margin-top:1rem}.mt-5{margin-top:1.25rem}.mt-6{margin-top:1.5rem}.mt-7{margin-top:1.75rem}.mt-8{margin-top:2rem}.mt-9{margin-top:2.25rem}.block{display:block}.inline-block{display:inline-block}.inline{display:inline}.flex{display:flex}.inline-flex{display:inline-flex}.table{display:table}.grid{display:grid}.h-3\.5{height:.875rem}.h-\[22px\]{height:22px}.min-h-screen{min-height:100vh}.w-16{width:4rem}.w-3\.5{width:.875rem}.w-\[22px\]{width:22px}.w-\[360px\]{width:360px}.w-full{width:100%}.max-w-\[1280
px\]{max-width:1280px}.max-w-\[440px\]{max-width:440px}.max-w-\[480px\]{max-width:480px}.max-w-\[520px\]{max-width:520px}.max-w-\[580px\]{max-width:580px}.max-w-\[640px\]{max-width:640px}.max-w-\[680px\]{max-width:680px}.max-w-\[720px\]{max-width:720px}.max-w-\[760px\]{max-width:760px}.flex-1{flex:1 1 0%}.flex-none{flex:none}.transform{transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.cursor-pointer{cursor:pointer}.resize{resize:both}.list-none{list-style-type:none}.grid-cols-1{grid-template-columns:repeat(1,minmax(0,1fr))}.grid-cols-12{grid-template-columns:repeat(12,minmax(0,1fr))}.grid-cols-2{grid-template-columns:repeat(2,minmax(0,1fr))}.grid-cols-3{grid-template-columns:repeat(3,minmax(0,1fr))}.flex-col{flex-direction:column}.flex-wrap{flex-wrap:wrap}.items-start{align-items:flex-start}.items-end{align-items:flex-end}.items-center{align-items:center}.items-baseline{align-items:baseline}.justify-end{justify-content:flex-end}.justify-center{justify-content:center}.justify-between{justify-content:space-between}.gap-1\.5{gap:.375rem}.gap-2{gap:.5rem}.gap-2\.5{gap:.625rem}.gap-3{gap:.75rem}.gap-3\.5{gap:.875rem}.gap-4{gap:1rem}.gap-5{gap:1.25rem}.gap-6{gap:1.5rem}.gap-8{gap:2rem}.gap-x-4{-moz-column-gap:1rem;column-gap:1rem}.gap-y-2{row-gap:.5rem}.space-y-4>:not([hidden])~:not([hidden]){--tw-space-y-reverse:0;margin-bottom:calc(1rem*var(--tw-space-y-reverse));margin-top:calc(1rem*(1 - 
var(--tw-space-y-reverse)))}.overflow-hidden,.truncate{overflow:hidden}.truncate{text-overflow:ellipsis}.truncate,.whitespace-nowrap{white-space:nowrap}.text-pretty{text-wrap:pretty}.rounded-\[3px\]{border-radius:3px}.rounded-\[5px\]{border-radius:5px}.rounded-\[6px\]{border-radius:6px}.rounded-\[7px\]{border-radius:7px}.rounded-full{border-radius:9999px}.border{border-width:1px}.border-y{border-top-width:1px}.border-b,.border-y{border-bottom-width:1px}.border-l{border-left-width:1px}.border-t{border-top-width:1px}.border-line{border-color:oklch(.27 .01 250)}.border-line-soft{border-color:oklch(.23 .008 250)}.p-0{padding:0}.p-4{padding:1rem}.p-5{padding:1.25rem}.p-7{padding:1.75rem}.px-1{padding-left:.25rem;padding-right:.25rem}.px-2{padding-left:.5rem;padding-right:.5rem}.px-2\.5{padding-left:.625rem;padding-right:.625rem}.px-3{padding-left:.75rem;padding-right:.75rem}.px-3\.5{padding-left:.875rem;padding-right:.875rem}.px-4{padding-left:1rem;padding-right:1rem}.px-7{padding-left:1.75rem;padding-right:1.75rem}.px-8{padding-left:2rem;padding-right:2rem}.py-0\.5{padding-bottom:.125rem;padding-top:.125rem}.py-1{padding-bottom:.25rem;padding-top:.25rem}.py-12{padding-bottom:3rem;padding-top:3rem}.py-2{padding-bottom:.5rem;padding-top:.5rem}.py-2\.5{padding-bottom:.625rem;padding-top:.625rem}.py-3{padding-bottom:.75rem;padding-top:.75rem}.py-3\.5{padding-bottom:.875rem;padding-top:.875rem}.py-4{padding-bottom:1rem;padding-top:1rem}.py-5{padding-bottom:1.25rem;padding-top:1.25rem}.py-6{padding-bottom:1.5rem;padding-top:1.5rem}.py-7{padding-bottom:1.75rem;padding-top:1.75rem}.pb-14{padding-bottom:3.5rem}.pb-2{padding-bottom:.5rem}.pb-24{padding-bottom:6rem}.pb-3{padding-bottom:.75rem}.pb-4{padding-bottom:1rem}.pl-6{padding-left:1.5rem}.pl-9{padding-left:2.25rem}.pt-1{padding-top:.25rem}.pt-14{padding-top:3.5rem}.pt-4{padding-top:1rem}.pt-5{padding-top:1.25rem}.pt-6{padding-top:1.5rem}.pt-7{padding-top:1.75rem}.pt-9{padding-top:2.25rem}.pt-\[1px\]{padding-top:1px}.text-cen
ter{text-align:center}.text-right{text-align:right}.text-2xl{font-size:1.5rem;line-height:2rem}.text-\[11\.5px\]{font-size:11.5px}.text-\[11px\]{font-size:11px}.text-\[12\.5px\]{font-size:12.5px}.text-\[12px\]{font-size:12px}.text-\[13px\]{font-size:13px}.text-\[14px\]{font-size:14px}.text-\[16px\]{font-size:16px}.text-\[18px\]{font-size:18px}.text-\[20px\]{font-size:20px}.text-\[22px\]{font-size:22px}.text-\[26px\]{font-size:26px}.text-\[28px\]{font-size:28px}.text-base{font-size:1rem;line-height:1.5rem}.text-lg{font-size:1.125rem;line-height:1.75rem}.text-sm{font-size:.875rem;line-height:1.25rem}.text-xs{font-size:.75rem;line-height:1rem}.font-medium{font-weight:500}.font-normal{font-weight:400}.font-semibold{font-weight:600}.uppercase{text-transform:uppercase}.normal-case{text-transform:none}.italic{font-style:italic}.leading-\[1\.55\]{line-height:1.55}.leading-\[1\.5\]{line-height:1.5}.leading-\[1\.65\]{line-height:1.65}.leading-\[1\.6\]{line-height:1.6}.leading-\[1\.7\]{line-height:1.7}.leading-\[20px\]{line-height:20px}.leading-none{line-height:1}.tracking-\[-0\.005em\]{letter-spacing:-.005em}.tracking-\[-0\.012em\]{letter-spacing:-.012em}.tracking-\[-0\.01em\]{letter-spacing:-.01em}.tracking-\[-0\.02em\]{letter-spacing:-.02em}.tracking-\[0\.005em\]{letter-spacing:.005em}.tracking-\[0\.01em\]{letter-spacing:.01em}.tracking-\[0\.02em\]{letter-spacing:.02em}.tracking-\[0\.08em\]{letter-spacing:.08em}.tracking-\[0\.1em\]{letter-spacing:.1em}.text-accent{color:oklch(.82 .12 195)}.text-bad{color:oklch(.7 .2 25)}.text-ink{color:oklch(.96 .005 250)}.text-ink-fade{color:oklch(.42 .006 250)}.text-ink-mid{color:oklch(.78 .005 250)}.text-ink-mute{color:oklch(.58 .006 250)}.text-ok{color:oklch(.78 .14 155)}.text-warn{color:oklch(.82 .13 80)}.underline{text-decoration-line:underline}.no-underline{text-decoration-line:none}.decoration-line{text-decoration-color:oklch(.27 .01 
250)}.underline-offset-4{text-underline-offset:4px}.transition{transition-duration:.15s;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,-webkit-backdrop-filter;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter,-webkit-backdrop-filter;transition-timing-function:cubic-bezier(.4,0,.2,1)} diff --git a/web/templates/pages/host_repo.html b/web/templates/pages/host_repo.html index 5580e7f..a0caf6e 100644 --- a/web/templates/pages/host_repo.html +++ b/web/templates/pages/host_repo.html @@ -42,6 +42,54 @@ + {{/* ---------- Admin credentials (optional) ---------- */}} +

+ Admin credentials · prune-only · optional +

+
+ {{if $page.AdminCredsError}} +
+ {{$page.AdminCredsError}} +
+ {{end}} + {{if eq $page.SavedSection "admin_credentials"}} +
✓ saved
+ {{end}} +

+ Only needed for rest-server repos that distinguish an append-only + user (everyday backups) from a delete-capable user (prune / + forget). For S3 / B2 / SFTP / local, leave this blank — the + everyday repo credentials handle prune too. +

+
+
+ + +
+
+ + +
+
+ + +
+
+
+ + {{if $page.HasAdminPassword}} + + {{end}} +
+
+ {{if $page.HasAdminPassword}} +
+ {{end}} + {{/* ---------- Bandwidth ---------- */}}

Bandwidth · host-wide

@@ -138,6 +186,40 @@
+ {{/* ---------- Run now · one-time ---------- */}} +

Run now · one-time

+
+

+ Operator-triggered. Output streams live to the job log. Cadence-driven runs land independently from the server-side ticker. +

+
+ + + +
+
+ {{/* ---------- Danger zone ---------- */}}

Danger zone

+ {{/* ---------- Repo health ---------- */}} + {{if $page.StatsView}} + {{$s := $page.StatsView}} +

Repo health

+
+ {{if $s.LockPresent}} +
+ Stale lock detected on the most recent check. Run unlock above to clear it before the next backup. +
+ {{end}} +
+ {{if $s.HasTotalSize}} +
Total size
+
{{bytes $s.TotalSizeBytes}}
+ {{end}} + {{if $s.HasRawSize}} +
Raw size · pre-dedup
+
{{bytes $s.RawSizeBytes}}
+ {{end}} + {{if $s.HasLastCheck}} +
Last check
+
+ {{$s.LastCheckAgo}} + {{if $s.LastCheckStatus}} · {{$s.LastCheckStatus}}{{end}} +
+ {{end}} + {{if $s.HasLastPrune}} +
Last prune
+
{{$s.LastPruneAgo}}
+ {{end}} +
+
+ {{end}} + {{if gt (len $page.GroupNames) 0}}

Snapshots by source