Files
restic-manager/internal/agent/scheduler/scheduler.go
T
steve 8fb1c100fd P2-04.5: kill host.default_paths in favour of manual schedules
Having two independent path lists for "what does this host back up?" was
a real divergence footgun — operator types one set at Add-host time
and a different set into a schedule, both end up in the same repo,
the snapshot history looks fine until restore. Resolution: drop
host.default_paths entirely; add a `manual` flag on schedules.
A manual schedule has paths/excludes/tags/retention like any other
but no cron — it fires only via per-schedule Run-now. Single source
of truth for what gets backed up.

Schema (migration 0007):
* schedules.manual INTEGER NOT NULL DEFAULT 0.
* For every host with non-empty default_paths, seed a manual
  schedule with those paths and bump host_schedule_version.
* ALTER TABLE hosts DROP COLUMN default_paths.
* ALTER TABLE enrollment_tokens RENAME COLUMN default_paths
  TO initial_paths.

Original draft of this migration rebuilt hosts via the
create-new + drop-old + rename-new pattern. With foreign_keys=ON
(set in the connection DSN), DROP TABLE on the parent fired
ON DELETE CASCADE on every child of hosts(id) — schedules /
jobs / snapshots / host_credentials all wiped on the smoke env
when I tried it. SQLite 3.35+ supports column-level ALTERs
directly, so we skip the rebuild dance and avoid the cascade
trap. Six lines of SQL instead of sixty, no FK risk.

Run-now rewiring:
* New `dispatchScheduleNow(hostID, scheduleID, conn?)` helper
  unifies the agent-driven path (cron fire → schedule.fire →
  OnScheduleFire callback) and the UI-driven path (operator
  clicks Run-now on a schedule row). Conn arg is optional; nil
  falls back to Hub.Send.
* New POST /hosts/{id}/schedules/{sid}/run endpoint — per-row
  Run-now button on the schedules list.
* Dashboard's per-host Run-now (handleUIRunBackup) now picks the
  host's only enabled manual schedule, falls back to the only
  enabled schedule, else returns "pick one in Schedules tab".
  Keeps one-click for the common case.

Agent:
* Scheduler skips manual schedules in cron build (silent — they're
  a normal data shape, not an error).
* Wire Schedule struct gains Manual flag.
* Schedule.fire flow unchanged — the agent only ever fires
  non-manual schedules anyway.

UI:
* Add-host form retitled "Initial schedule · manual" so the
  operator knows the paths become an editable schedule under
  the Schedules tab. Result page calls out the manual schedule
  + points at Host > Schedules.
* Schedule edit form: "Manual schedule" checkbox at the top of
  the When section; toggling it hides/shows the cron field via
  inline JS. Server-side validator skips the cron requirement
  when manual=true.
* Schedule list shows a "manual" tag under the status pill and
  renders the When column as "— run-now only —" for manual rows.
  Each row gets a Run-now button when the schedule is enabled
  and the host is online.

Tests + go test ./... green.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-02 12:26:06 +01:00

178 lines
5.4 KiB
Go

package scheduler
import (
"log/slog"
"sync"
"time"
"github.com/robfig/cron/v3"
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
)
// Sender is the scheduler's view of the agent's outbound WS channel.
// The scheduler pushes schedule.fire and schedule.ack envelopes
// through it. It mirrors runner.Sender but is declared separately on
// purpose, so scheduler tests don't need to import the runner.
type Sender interface {
	// Send transmits one envelope and reports any write failure.
	Send(envelope api.Envelope) error
}
// Scheduler maintains the agent's local cron entries. Schedules
// arrive from the server via Apply (driven by MsgScheduleSet); on
// each fire, the entry sends a schedule.fire to the server and
// lets the server's existing dispatch path turn that into a
// command.run. The scheduler itself never builds CommandRunPayloads.
//
// Lifecycle:
//   - Build with New at agent boot; no cron runs until the first Apply.
//   - Apply on every MsgScheduleSet — replaces the active cron with
//     a fresh one, then emits schedule.ack with the version we just
//     applied.
//   - Stop on agent shutdown.
//
// The active Sender is updated on every Apply call. This handles
// reconnects naturally: a new connection's first MsgScheduleSet
// re-arms the scheduler with a working tx; cron entries that fire
// against a dropped connection just log and skip the tick.
type Scheduler struct {
// mu guards every field below; Apply, Stop, Version and fire all
// take it before touching them.
mu sync.Mutex
// current is the cron built by the most recent Apply, or nil
// before the first Apply and after Stop.
current *cron.Cron
// version is the payload.Version recorded by the most recent Apply.
version int64
// tx is the Sender handed to the most recent Apply; nil until the
// first Apply, in which case fire skips the tick.
tx Sender
}
// New returns an idle Scheduler: no cron is running and no Sender is
// attached. The first Apply call is what actually starts a cron loop.
func New() *Scheduler {
	return new(Scheduler)
}
// Stop halts whatever cron is currently running and waits for any
// in-flight entries to return. Safe to call multiple times; calls
// made when no cron is installed are no-ops.
//
// The cron is detached under s.mu but the wait happens after the lock
// is released. Every cron entry calls fire, which itself takes s.mu;
// waiting for those entries while still holding the lock would
// deadlock against an entry that has started but not yet acquired it.
func (s *Scheduler) Stop() {
	s.mu.Lock()
	prev := s.current
	s.current = nil
	s.mu.Unlock()

	if prev == nil {
		return
	}
	// cron.Stop's returned context is done once running jobs finish.
	<-prev.Stop().Done()
}
// Apply reconciles the active cron with payload and acknowledges it.
//
// In order, it:
//  1. records tx as the active Sender and detaches the previous cron;
//  2. stops the previous cron and waits for in-flight entries to return;
//  3. builds a new cron from every enabled, non-manual schedule;
//  4. starts it, installs it and records payload.Version;
//  5. sends schedule.ack carrying payload.Version on tx (skipped when
//     tx is nil).
//
// Schedule entries with malformed cron exprs are logged and skipped —
// the server's validator should have caught these, but better
// skip-and-warn than crash the loop.
//
// Payload's order doesn't matter; we always rebuild from scratch.
// Empty Schedules is a valid input that effectively disables every
// timed job for this host.
func (s *Scheduler) Apply(payload api.ScheduleSetPayload, tx Sender) {
	// Swap in the new Sender and take ownership of the old cron under
	// the lock, but do NOT wait for it here: each cron entry calls
	// fire, which takes s.mu, so waiting for in-flight entries while
	// holding s.mu deadlocks against an entry that has started but not
	// yet acquired the lock.
	s.mu.Lock()
	s.tx = tx
	prev := s.current
	s.current = nil
	s.mu.Unlock()

	if prev != nil {
		// cron.Stop's returned context is done once running jobs
		// finish. Those jobs only send a schedule.fire envelope, so
		// the wait is short.
		<-prev.Stop().Done()
	}

	c := cron.New()
	added := 0
	for _, sch := range payload.Schedules {
		if !sch.Enabled {
			continue
		}
		// Manual schedules carry paths/retention/etc. but have no
		// cron — they only fire via operator-driven run-now, which is
		// resolved server-side without an agent cron tick. Skip
		// without warning: they're a normal data shape.
		if sch.Manual {
			continue
		}
		// Capture by value so the closure doesn't share the loop
		// variable across iterations.
		entry := sch
		_, err := c.AddFunc(entry.CronExpr, func() {
			s.fire(entry)
		})
		if err != nil {
			slog.Warn("scheduler: skipping entry with bad cron expr",
				"schedule_id", entry.ID, "expr", entry.CronExpr, "err", err)
			continue
		}
		added++
	}
	c.Start()

	s.mu.Lock()
	// If another Apply installed a cron while we were unlocked, take
	// it over so it can be stopped instead of leaking a running loop.
	displaced := s.current
	s.current = c
	s.version = payload.Version
	s.mu.Unlock()

	if displaced != nil {
		<-displaced.Stop().Done()
	}

	slog.Info("scheduler: applied", "version", payload.Version,
		"received", len(payload.Schedules), "active", added)

	// Ack outside the lock — Send() shouldn't take long, but holding
	// s.mu across an external call would needlessly serialise other
	// callers. The ack goes out on the Sender that came with this
	// payload.
	ackEnv, err := api.Marshal(api.MsgScheduleAck, "", api.ScheduleAckPayload{
		Version:   payload.Version,
		AppliedAt: time.Now().UTC(),
	})
	if err != nil {
		slog.Error("scheduler: marshal schedule.ack", "err", err)
		return
	}
	if tx == nil {
		return
	}
	if err := tx.Send(ackEnv); err != nil {
		slog.Warn("scheduler: send schedule.ack — server will retry on reconnect",
			"version", payload.Version, "err", err)
	}
}
// Version reports the payload version recorded by the most recent
// Apply (zero if Apply has never run). Handy for tests and
// diagnostics.
func (s *Scheduler) Version() int64 {
	s.mu.Lock()
	applied := s.version
	s.mu.Unlock()
	return applied
}
// fire is the callback behind every cron entry. It snapshots the
// current Sender under the lock and pushes a schedule.fire envelope
// for entry.ID, stamped with the current UTC time. Nothing is
// returned or retried: with no Sender attached, a marshal error, or a
// failed Send, the tick is logged and dropped, and the next tick
// proceeds as usual.
func (s *Scheduler) fire(entry api.Schedule) {
	// Copy the Sender out so the lock is not held across the send.
	s.mu.Lock()
	sender := s.tx
	s.mu.Unlock()

	if sender == nil {
		slog.Info("scheduler: tick fired with no active connection — skipping",
			"schedule_id", entry.ID)
		return
	}

	payload := api.ScheduleFirePayload{
		ScheduleID:  entry.ID,
		ScheduledAt: time.Now().UTC(),
	}
	envelope, marshalErr := api.Marshal(api.MsgScheduleFire, "", payload)
	if marshalErr != nil {
		slog.Error("scheduler: marshal schedule.fire",
			"schedule_id", entry.ID, "err", marshalErr)
		return
	}

	sendErr := sender.Send(envelope)
	if sendErr == nil {
		return
	}
	slog.Warn("scheduler: send schedule.fire — skipping this tick",
		"schedule_id", entry.ID, "err", sendErr)
}