server: maintenance ticker drives forget/prune/check on cadence
Wires a 60s server-side ticker to the pure-logic maintenance.Decide
introduced in the previous commit. Decisions flow through a new
DispatchMaintenance method on *Server, which:
- skips offline hosts (no pending_runs queueing — maintenance is
not a backup, so missed fires shouldn't pile up)
- silently skips prune when admin creds aren't bound
- pushes admin creds before prune, then dispatches with
RequiresAdminCreds=true (same as operator-driven prune)
- persists job rows with actor_kind="system"
Reshapes the forget wire payload from a single RetentionPolicy to a
ForgetGroups list (one tag + per-group keep-* per source group). The
agent walks the groups and runs `restic forget --tag <name> --keep-*`
once per group. Dead code removed: CommandRunPayload.RetentionPolicy,
the old forget JSON-decode in cmd/agent, and the single-policy form of
restic.RunForget.
This commit is contained in:
+18
-21
@@ -2,7 +2,6 @@ package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
@@ -336,28 +335,26 @@ func (d *dispatcher) runJob(ctx context.Context, p api.CommandRunPayload, tx wsc
|
||||
slog.Info("agent: init job complete", "job_id", p.JobID)
|
||||
}()
|
||||
case api.JobForget:
|
||||
var policy restic.ForgetPolicy
|
||||
if len(p.RetentionPolicy) > 0 {
|
||||
var raw struct {
|
||||
KeepLast *int `json:"keep_last,omitempty"`
|
||||
KeepHourly *int `json:"keep_hourly,omitempty"`
|
||||
KeepDaily *int `json:"keep_daily,omitempty"`
|
||||
KeepWeekly *int `json:"keep_weekly,omitempty"`
|
||||
KeepMonthly *int `json:"keep_monthly,omitempty"`
|
||||
KeepYearly *int `json:"keep_yearly,omitempty"`
|
||||
}
|
||||
if err := json.Unmarshal(p.RetentionPolicy, &raw); err != nil {
|
||||
return fmt.Errorf("forget: decode retention_policy: %w", err)
|
||||
}
|
||||
policy = restic.ForgetPolicy{
|
||||
KeepLast: raw.KeepLast, KeepHourly: raw.KeepHourly,
|
||||
KeepDaily: raw.KeepDaily, KeepWeekly: raw.KeepWeekly,
|
||||
KeepMonthly: raw.KeepMonthly, KeepYearly: raw.KeepYearly,
|
||||
}
|
||||
if len(p.ForgetGroups) == 0 {
|
||||
return fmt.Errorf("forget: command.run carried no forget_groups (server didn't populate them)")
|
||||
}
|
||||
slog.Info("agent: accepting forget job", "job_id", p.JobID, "policy", p.RetentionPolicy)
|
||||
groups := make([]restic.ForgetGroup, 0, len(p.ForgetGroups))
|
||||
for _, g := range p.ForgetGroups {
|
||||
groups = append(groups, restic.ForgetGroup{
|
||||
Tag: g.Tag,
|
||||
Policy: restic.ForgetPolicy{
|
||||
KeepLast: g.Policy.KeepLast,
|
||||
KeepHourly: g.Policy.KeepHourly,
|
||||
KeepDaily: g.Policy.KeepDaily,
|
||||
KeepWeekly: g.Policy.KeepWeekly,
|
||||
KeepMonthly: g.Policy.KeepMonthly,
|
||||
KeepYearly: g.Policy.KeepYearly,
|
||||
},
|
||||
})
|
||||
}
|
||||
slog.Info("agent: accepting forget job", "job_id", p.JobID, "groups", len(groups))
|
||||
go func() {
|
||||
if err := r.RunForget(ctx, p.JobID, policy); err != nil {
|
||||
if err := r.RunForget(ctx, p.JobID, groups); err != nil {
|
||||
slog.Warn("agent: forget job failed", "job_id", p.JobID, "err", err)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -16,6 +16,7 @@ import (
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/crypto"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/config"
|
||||
rmhttp "gitea.dcglab.co.uk/steve/restic-manager/internal/server/http"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/maintenance"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ui"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ws"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||
@@ -139,6 +140,14 @@ func run() error {
|
||||
defer purgeTick.Stop()
|
||||
offlineTick := time.NewTicker(30 * time.Second)
|
||||
defer offlineTick.Stop()
|
||||
// Maintenance ticker: drives forget/prune/check on the cadences
|
||||
// operators set per-host. Independent of the agent's local cron
|
||||
// (which only handles backup schedules). 60s cadence — the cron
|
||||
// expressions are minute-grained, so anything finer is wasted
|
||||
// work.
|
||||
maintenanceTick := time.NewTicker(60 * time.Second)
|
||||
defer maintenanceTick.Stop()
|
||||
mt := maintenance.New(st)
|
||||
go func() {
|
||||
for {
|
||||
select {
|
||||
@@ -156,6 +165,16 @@ func run() error {
|
||||
if n, err := st.MarkHostsOfflineStale(ctx, cutoff); err == nil && n > 0 {
|
||||
slog.Info("marked hosts offline (stale heartbeat)", "n", n)
|
||||
}
|
||||
case <-maintenanceTick.C:
|
||||
decisions, err := mt.Decide(ctx, time.Now().UTC())
|
||||
if err != nil {
|
||||
slog.Warn("maintenance ticker: decide", "err", err)
|
||||
continue
|
||||
}
|
||||
if len(decisions) > 0 {
|
||||
slog.Info("maintenance ticker: dispatching", "n", len(decisions))
|
||||
srv.DispatchMaintenance(ctx, decisions)
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
Reference in New Issue
Block a user