server: maintenance ticker drives forget/prune/check on cadence

Wires a 60s server-side ticker to the pure-logic maintenance.Decide
introduced in the previous commit. Decisions flow through a new
DispatchMaintenance method on *Server, which:

  - skips offline hosts (no pending_runs queueing — maintenance is
    not a backup, missed fires shouldn't pile up)
  - silently skips prune when admin creds aren't bound
  - pushes admin creds before prune, then dispatches with
    RequiresAdminCreds=true (same as operator-driven prune)
  - persists job rows with actor_kind="system"

Reshapes the forget wire payload from a single RetentionPolicy to a
ForgetGroups list (one entry per source group, each carrying that
group's tag and its own keep-* policy). The
agent walks the groups and runs `restic forget --tag <name> --keep-*`
once per group. Dead code removed: CommandRunPayload.RetentionPolicy,
the old forget JSON-decode in cmd/agent, and the single-policy form of
restic.RunForget.
This commit is contained in:
2026-05-03 23:40:35 +01:00
parent ae96983877
commit 14b703be58
8 changed files with 559 additions and 62 deletions
+18 -21
View File
@@ -2,7 +2,6 @@ package main
import (
"context"
"encoding/json"
"errors"
"flag"
"fmt"
@@ -336,28 +335,26 @@ func (d *dispatcher) runJob(ctx context.Context, p api.CommandRunPayload, tx wsc
slog.Info("agent: init job complete", "job_id", p.JobID)
}()
case api.JobForget:
var policy restic.ForgetPolicy
if len(p.RetentionPolicy) > 0 {
var raw struct {
KeepLast *int `json:"keep_last,omitempty"`
KeepHourly *int `json:"keep_hourly,omitempty"`
KeepDaily *int `json:"keep_daily,omitempty"`
KeepWeekly *int `json:"keep_weekly,omitempty"`
KeepMonthly *int `json:"keep_monthly,omitempty"`
KeepYearly *int `json:"keep_yearly,omitempty"`
}
if err := json.Unmarshal(p.RetentionPolicy, &raw); err != nil {
return fmt.Errorf("forget: decode retention_policy: %w", err)
}
policy = restic.ForgetPolicy{
KeepLast: raw.KeepLast, KeepHourly: raw.KeepHourly,
KeepDaily: raw.KeepDaily, KeepWeekly: raw.KeepWeekly,
KeepMonthly: raw.KeepMonthly, KeepYearly: raw.KeepYearly,
}
if len(p.ForgetGroups) == 0 {
return fmt.Errorf("forget: command.run carried no forget_groups (server didn't populate them)")
}
slog.Info("agent: accepting forget job", "job_id", p.JobID, "policy", p.RetentionPolicy)
groups := make([]restic.ForgetGroup, 0, len(p.ForgetGroups))
for _, g := range p.ForgetGroups {
groups = append(groups, restic.ForgetGroup{
Tag: g.Tag,
Policy: restic.ForgetPolicy{
KeepLast: g.Policy.KeepLast,
KeepHourly: g.Policy.KeepHourly,
KeepDaily: g.Policy.KeepDaily,
KeepWeekly: g.Policy.KeepWeekly,
KeepMonthly: g.Policy.KeepMonthly,
KeepYearly: g.Policy.KeepYearly,
},
})
}
slog.Info("agent: accepting forget job", "job_id", p.JobID, "groups", len(groups))
go func() {
if err := r.RunForget(ctx, p.JobID, policy); err != nil {
if err := r.RunForget(ctx, p.JobID, groups); err != nil {
slog.Warn("agent: forget job failed", "job_id", p.JobID, "err", err)
return
}
+19
View File
@@ -16,6 +16,7 @@ import (
"gitea.dcglab.co.uk/steve/restic-manager/internal/crypto"
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/config"
rmhttp "gitea.dcglab.co.uk/steve/restic-manager/internal/server/http"
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/maintenance"
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ui"
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ws"
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
@@ -139,6 +140,14 @@ func run() error {
defer purgeTick.Stop()
offlineTick := time.NewTicker(30 * time.Second)
defer offlineTick.Stop()
// Maintenance ticker: drives forget/prune/check on the cadences
// operators set per-host. Independent of the agent's local cron
// (which only handles backup schedules). 60s cadence — the cron
// expressions are minute-grained, so anything finer is wasted
// work.
maintenanceTick := time.NewTicker(60 * time.Second)
defer maintenanceTick.Stop()
mt := maintenance.New(st)
go func() {
for {
select {
@@ -156,6 +165,16 @@ func run() error {
if n, err := st.MarkHostsOfflineStale(ctx, cutoff); err == nil && n > 0 {
slog.Info("marked hosts offline (stale heartbeat)", "n", n)
}
case <-maintenanceTick.C:
decisions, err := mt.Decide(ctx, time.Now().UTC())
if err != nil {
slog.Warn("maintenance ticker: decide", "err", err)
continue
}
if len(decisions) > 0 {
slog.Info("maintenance ticker: dispatching", "n", len(decisions))
srv.DispatchMaintenance(ctx, decisions)
}
}
}
}()