Files
restic-manager/internal/agent/scheduler/scheduler_test.go
T
steve 608962441b P2-02 (agent side) + P2-03: agent scheduler + schedule.fire dispatch
Closes the schedule reconciliation loop end-to-end.

* New `internal/agent/scheduler` package wraps robfig/cron/v3 with
  the lifecycle the agent needs:
  - Apply(ScheduleSetPayload, Sender) stops the prior cron (waiting
    for in-flight entries to return), rebuilds from scratch, starts,
    and emits schedule.ack with the version we just applied.
  - Disabled entries are skipped silently; bad cron expressions
    (which shouldn't reach us because the server validates them,
    but we check defensively) log a warning and are skipped.
  - On each cron tick the entry sends a new schedule.fire envelope
    to the server with {schedule_id, scheduled_at}. The scheduler
    itself never builds CommandRunPayloads — server is the source
    of truth for jobs.
  - tx is swapped on every Apply, so reconnect is handled
    naturally: cron entries that fire against a dropped tx log
    "no active connection" and skip the tick.
  - Stop() is idempotent and waits for the cron's in-flight
    workers via cron.Stop().Done().

* New wire message api.MsgScheduleFire + api.ScheduleFirePayload
  for the agent → server "I just fired locally" RPC.

* Server-side dispatch (schedule_push.go: dispatchScheduledJob):
  looks up the schedule by id, validates ownership + that it's
  enabled, builds args from kind (paths for backup; other kinds
  are still arg-less in Phase 2 and grow as those job kinds land
  in P2-05..08), persists a jobs row with actor_kind=schedule +
  scheduled_id, and writes command.run back on the same conn so
  the agent runs through its existing dispatch path.

* store.CreateJob now writes scheduled_id. This column has been in
  the schema since 0001 but was never populated — the original P1
  path only had operator-driven jobs, so actor_kind was always
  'user' and scheduled_id was always nil.

* cmd/agent/main.go integration: dispatcher gains a
  *scheduler.Scheduler; the MsgScheduleSet case now hands the
  payload to scheduler.Apply (in a goroutine so the WS read loop
  keeps draining other messages).

* WS dispatcher gains OnScheduleFire alongside OnScheduleAck.

* Tests:
  - scheduler unit tests (4): ack-on-apply, cron tick fires
    schedule.fire envelope, disabled entries don't fire, replace-
    prior-state stops the old cron.
  - Server-side end-to-end: schedule.fire → command.run with the
    right job_id / kind / args, plus jobs row with actor_kind=
    "schedule" and scheduled_id linking back to the schedule.

Persistence of next-fire times across agent restarts is
deliberately deferred. A missed fire window during downtime
simply fires once on reconnect — that's the desirable behaviour
(the operator wants the missed backup to run, not be silently
skipped because we lost track of when it was due).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-02 11:29:12 +01:00

160 lines
3.6 KiB
Go

package scheduler
import (
"sync"
"testing"
"time"
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
)
// recSender is a Sender test double: it keeps every envelope passed to
// Send so a test can inspect, via snapshot, what was emitted.
type recSender struct {
	// mu guards envs; Send and snapshot both take it.
	mu sync.Mutex
	// envs holds the received envelopes in the order Send was called.
	envs []api.Envelope
}
// Send appends env to the recorded envelopes while holding the mutex.
// It always returns nil.
func (r *recSender) Send(env api.Envelope) error {
	r.mu.Lock()
	r.envs = append(r.envs, env)
	r.mu.Unlock()

	return nil
}
// snapshot returns a copy of the envelopes recorded so far. The copy is
// made under the mutex, so the caller can iterate over it without holding
// the lock while Send keeps appending.
func (r *recSender) snapshot() []api.Envelope {
	r.mu.Lock()
	defer r.mu.Unlock()

	dup := make([]api.Envelope, len(r.envs))
	copy(dup, r.envs)
	return dup
}
// TestApplyEmitsAck checks that after Apply the scheduler reports the
// applied version and that exactly one envelope — a schedule.ack carrying
// that same version — was handed to the sender.
func TestApplyEmitsAck(t *testing.T) {
	t.Parallel()
	tx := &recSender{}
	s := New()
	defer s.Stop()

	// @hourly is used so that no cron tick is expected while the test
	// runs; the ack should be the only envelope recorded.
	s.Apply(api.ScheduleSetPayload{
		Version: 7,
		Schedules: []api.Schedule{
			{ID: "s1", Kind: api.JobBackup, CronExpr: "@hourly", Enabled: true},
		},
	}, tx)

	if got := s.Version(); got != 7 {
		t.Fatalf("Version: got %d, want 7", got)
	}

	envs := tx.snapshot()
	if len(envs) != 1 {
		t.Fatalf("expected 1 envelope (ack), got %d", len(envs))
	}
	if envs[0].Type != api.MsgScheduleAck {
		t.Fatalf("envelope type: got %s, want %s", envs[0].Type, api.MsgScheduleAck)
	}

	// Surface decode failures explicitly: a zero-valued ack would
	// otherwise be reported as a confusing version mismatch.
	var ack api.ScheduleAckPayload
	if err := envs[0].UnmarshalPayload(&ack); err != nil {
		t.Fatalf("decode ack payload: %v", err)
	}
	if ack.Version != 7 {
		t.Fatalf("ack version: got %d, want 7", ack.Version)
	}
}
// TestApplyTickFiresScheduleFire applies a schedule that ticks about once
// per second and polls the recording sender until a schedule.fire envelope
// for that schedule ID shows up, failing if none arrives within 3 seconds.
func TestApplyTickFiresScheduleFire(t *testing.T) {
	t.Parallel()
	tx := &recSender{}
	s := New()
	defer s.Stop()

	// Cron expression that fires roughly every second; close enough
	// to be reliable in CI without making the test slow.
	s.Apply(api.ScheduleSetPayload{
		Version: 1,
		Schedules: []api.Schedule{
			{ID: "every-second", Kind: api.JobBackup, CronExpr: "@every 1s", Enabled: true},
		},
	}, tx)

	deadline := time.Now().Add(3 * time.Second)
	for time.Now().Before(deadline) {
		for _, e := range tx.snapshot() {
			if e.Type != api.MsgScheduleFire {
				continue
			}
			// A fire envelope that cannot be decoded is a real failure;
			// report it directly instead of letting it turn into a
			// generic timeout below.
			var p api.ScheduleFirePayload
			if err := e.UnmarshalPayload(&p); err != nil {
				t.Fatalf("decode schedule.fire payload: %v", err)
			}
			if p.ScheduleID == "every-second" {
				return
			}
		}
		time.Sleep(50 * time.Millisecond)
	}
	t.Fatal("schedule.fire did not arrive within 3s")
}
// TestApplyDisabledEntriesSkipped applies a single schedule with
// Enabled=false and asserts that no schedule.fire envelope is recorded
// while the test waits.
func TestApplyDisabledEntriesSkipped(t *testing.T) {
	t.Parallel()

	sender := &recSender{}
	sched := New()
	defer sched.Stop()

	payload := api.ScheduleSetPayload{
		Version: 1,
		Schedules: []api.Schedule{
			{ID: "off", Kind: api.JobBackup, CronExpr: "@every 1s", Enabled: false},
		},
	}
	sched.Apply(payload, sender)

	// Wait a little over two seconds — enough for an enabled
	// "@every 1s" entry to have ticked a couple of times — so that
	// silence here is meaningful.
	time.Sleep(2200 * time.Millisecond)

	recorded := sender.snapshot()
	for i := range recorded {
		if recorded[i].Type != api.MsgScheduleFire {
			continue
		}
		t.Fatalf("disabled schedule fired: %+v", recorded[i])
	}
}
// TestApplyReplacesPriorState applies a once-per-second schedule, confirms
// it has fired at least once, then applies a second payload with no
// schedules and asserts that the number of schedule.fire envelopes stops
// growing afterwards.
func TestApplyReplacesPriorState(t *testing.T) {
	t.Parallel()
	tx := &recSender{}
	s := New()
	defer s.Stop()

	// countFires reports how many schedule.fire envelopes have been
	// recorded so far (acks and any other types are ignored).
	countFires := func() int {
		n := 0
		for _, e := range tx.snapshot() {
			if e.Type == api.MsgScheduleFire {
				n++
			}
		}
		return n
	}

	s.Apply(api.ScheduleSetPayload{
		Version: 1,
		Schedules: []api.Schedule{
			{ID: "old", Kind: api.JobBackup, CronExpr: "@every 1s", Enabled: true},
		},
	}, tx)

	// Require that the first version actually fired. Without this, a
	// cron that never ticks would leave both counts at zero and the
	// comparison below would pass without proving anything.
	deadline := time.Now().Add(3 * time.Second)
	for countFires() == 0 {
		if time.Now().After(deadline) {
			t.Fatal("old schedule never fired within 3s; cannot verify replacement")
		}
		time.Sleep(50 * time.Millisecond)
	}

	// Now replace with version 2 that doesn't include "old".
	s.Apply(api.ScheduleSetPayload{
		Version:   2,
		Schedules: []api.Schedule{},
	}, tx)

	// Count *after* the replacement, then give a still-running old cron
	// a couple of seconds to betray itself with further fires.
	before := countFires()
	time.Sleep(2 * time.Second)
	after := countFires()

	if after != before {
		t.Fatalf("schedule.fire count grew after replacement (before=%d after=%d) — old cron still firing",
			before, after)
	}
}