server: maintenance ticker drives forget/prune/check on cadence

Wires a 60s server-side ticker to the pure-logic maintenance.Decide
introduced in the previous commit. Decisions flow through a new
DispatchMaintenance method on *Server, which:

  - skips offline hosts (no pending_runs queueing — maintenance is
    not a backup, so missed fires shouldn't pile up)
  - silently skips prune when admin creds aren't bound
  - pushes admin creds before prune, then dispatches with
    RequiresAdminCreds=true (same as operator-driven prune)
  - persists job rows with actor_kind="system"

Reshapes the forget wire payload from a single RetentionPolicy to a
ForgetGroups list (one entry per source group, each carrying its tag
and its own keep-* settings). The agent walks the groups and runs
`restic forget --tag <name> --keep-*` once per group. Dead code
removed: CommandRunPayload.RetentionPolicy, the old forget JSON-decode
in cmd/agent, and the single-policy form of restic.RunForget.
This commit is contained in:
2026-05-03 23:40:35 +01:00
parent a131419b1a
commit 7d2c2ae1c2
8 changed files with 559 additions and 62 deletions
+19
View File
@@ -16,6 +16,7 @@ import (
"gitea.dcglab.co.uk/steve/restic-manager/internal/crypto"
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/config"
rmhttp "gitea.dcglab.co.uk/steve/restic-manager/internal/server/http"
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/maintenance"
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ui"
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ws"
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
@@ -139,6 +140,14 @@ func run() error {
defer purgeTick.Stop()
offlineTick := time.NewTicker(30 * time.Second)
defer offlineTick.Stop()
// Maintenance ticker: drives forget/prune/check on the cadences
// operators set per-host. Independent of the agent's local cron
// (which only handles backup schedules). 60s cadence — the cron
// expressions are minute-grained, so anything finer is wasted
// work.
maintenanceTick := time.NewTicker(60 * time.Second)
defer maintenanceTick.Stop()
mt := maintenance.New(st)
go func() {
for {
select {
@@ -156,6 +165,16 @@ func run() error {
if n, err := st.MarkHostsOfflineStale(ctx, cutoff); err == nil && n > 0 {
slog.Info("marked hosts offline (stale heartbeat)", "n", n)
}
case <-maintenanceTick.C:
decisions, err := mt.Decide(ctx, time.Now().UTC())
if err != nil {
slog.Warn("maintenance ticker: decide", "err", err)
continue
}
if len(decisions) > 0 {
slog.Info("maintenance ticker: dispatching", "n", len(decisions))
srv.DispatchMaintenance(ctx, decisions)
}
}
}
}()