P2-02 (agent side) + P2-03: agent scheduler + schedule.fire dispatch
Closes the schedule reconciliation loop end-to-end.
* New `internal/agent/scheduler` package wraps robfig/cron/v3 with
the lifecycle the agent needs:
- Apply(ScheduleSetPayload, Sender) stops the prior cron (waiting
for in-flight entries to return), rebuilds from scratch, starts,
and emits schedule.ack with the version we just applied.
- Disabled entries are skipped silently; invalid cron expressions
(which shouldn't reach us, since the server validates them, but
we stay defensive) are logged at warn level and skipped.
- On each cron tick the entry sends a new schedule.fire envelope
to the server with {schedule_id, scheduled_at}. The scheduler
itself never builds CommandRunPayloads — server is the source
of truth for jobs.
- tx is swapped on every Apply, so reconnect is handled
naturally: cron entries that fire against a dropped tx log
"no active connection" and skip the tick.
- Stop() is idempotent and waits for the cron's in-flight
workers via cron.Stop().Done().
* New wire message api.MsgScheduleFire + api.ScheduleFirePayload
for the agent → server "I just fired locally" RPC.
* Server-side dispatch (schedule_push.go: dispatchScheduledJob):
looks up the schedule by id, validates ownership + that it's
enabled, builds args from kind (paths for backup; other kinds
are still arg-less in Phase 2 and grow as those job kinds land
in P2-05..08), persists a jobs row with actor_kind=schedule +
scheduled_id, and writes command.run back on the same conn so
the agent runs through its existing dispatch path.
* store.CreateJob now writes scheduled_id. This column has been in
the schema since 0001 but was never populated — the original P1
path only had operator-driven jobs, so actor_kind was always
'user' and scheduled_id was always nil.
* cmd/agent/main.go integration: dispatcher gains a
*scheduler.Scheduler; the MsgScheduleSet case now hands the
payload to scheduler.Apply (in a goroutine so the WS read loop
keeps draining other messages).
* WS dispatcher gains OnScheduleFire alongside OnScheduleAck.
* Tests:
- scheduler unit tests (4): ack-on-apply, cron tick fires
schedule.fire envelope, disabled entries don't fire, replace-
prior-state stops the old cron.
- Server-side end-to-end: schedule.fire → command.run with the
right job_id / kind / args, plus jobs row with actor_kind=
"schedule" and scheduled_id linking back to the schedule.
Persistence of next-fire times across agent restarts is
deliberately deferred. A missed fire window during downtime
simply fires once on reconnect — that's the desirable behaviour
(the operator wants the missed backup to run, not be silently
skipped because we lost track of when it was due).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -6,6 +6,8 @@ import (
|
||||
"log/slog"
|
||||
"time"
|
||||
|
||||
"github.com/oklog/ulid/v2"
|
||||
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ws"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||
@@ -141,6 +143,90 @@ func (s *Server) applyScheduleAck(ctx context.Context, hostID string, version in
|
||||
"host_id", hostID, "version", version, "applied_at", appliedAt)
|
||||
}
|
||||
|
||||
// dispatchScheduledJob is invoked when the agent reports a local
|
||||
// cron fire via `schedule.fire`. We look up the schedule, build the
|
||||
// CommandRunPayload from it, persist a job row (actor=schedule,
|
||||
// linked back to scheduled_id), and write MsgCommandRun straight
|
||||
// back on the same conn so the agent runs the job through its
|
||||
// normal command dispatch path.
|
||||
//
|
||||
// On any error we log and bail — the agent's cron will fire again
|
||||
// at the next tick. We deliberately don't try to retry: schedules
|
||||
// are by definition repeating, and a missed tick is less bad than
|
||||
// a confused operator-visible "phantom job" that never actually
|
||||
// ran restic.
|
||||
func (s *Server) dispatchScheduledJob(ctx context.Context, hostID string, conn *ws.Conn, scheduleID string, scheduledAt time.Time) {
|
||||
sched, err := s.deps.Store.GetSchedule(ctx, hostID, scheduleID)
|
||||
if err != nil {
|
||||
slog.Warn("schedule.fire: schedule not found",
|
||||
"host_id", hostID, "schedule_id", scheduleID, "err", err)
|
||||
return
|
||||
}
|
||||
if !sched.Enabled {
|
||||
// The agent shouldn't be firing disabled schedules — its
|
||||
// local cron is rebuilt from the canonical version after
|
||||
// every push — but treat as belt-and-braces.
|
||||
slog.Info("schedule.fire: ignoring disabled schedule",
|
||||
"host_id", hostID, "schedule_id", scheduleID)
|
||||
return
|
||||
}
|
||||
|
||||
// Args differ by kind. For backup we ship the schedule's paths;
|
||||
// other kinds are still arg-less in Phase 2 (forget/prune/check
|
||||
// take their parameters from RetentionPolicy / Options at exec
|
||||
// time on the agent — handled when those job kinds land).
|
||||
var args []string
|
||||
if sched.Kind == string(api.JobBackup) {
|
||||
args = append(args, sched.Paths...)
|
||||
}
|
||||
|
||||
jobID := ulid.Make().String()
|
||||
now := time.Now().UTC()
|
||||
if err := s.deps.Store.CreateJob(ctx, store.Job{
|
||||
ID: jobID,
|
||||
HostID: hostID,
|
||||
Kind: sched.Kind,
|
||||
ScheduledID: &sched.ID,
|
||||
ActorKind: "schedule",
|
||||
ActorID: &sched.ID,
|
||||
CreatedAt: now,
|
||||
}); err != nil {
|
||||
slog.Warn("schedule.fire: create job",
|
||||
"host_id", hostID, "schedule_id", scheduleID, "err", err)
|
||||
return
|
||||
}
|
||||
|
||||
env, err := api.Marshal(api.MsgCommandRun, jobID, api.CommandRunPayload{
|
||||
JobID: jobID,
|
||||
Kind: api.JobKind(sched.Kind),
|
||||
Args: args,
|
||||
})
|
||||
if err != nil {
|
||||
slog.Error("schedule.fire: marshal command.run",
|
||||
"host_id", hostID, "schedule_id", scheduleID, "err", err)
|
||||
return
|
||||
}
|
||||
sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
|
||||
defer cancel()
|
||||
if err := conn.Send(sendCtx, env); err != nil {
|
||||
slog.Warn("schedule.fire: send command.run",
|
||||
"host_id", hostID, "job_id", jobID, "err", err)
|
||||
return
|
||||
}
|
||||
|
||||
_ = s.deps.Store.AppendAudit(ctx, store.AuditEntry{
|
||||
ID: ulid.Make().String(),
|
||||
Actor: "schedule",
|
||||
Action: "job.run_now",
|
||||
TargetKind: ptr("job"),
|
||||
TargetID: &jobID,
|
||||
TS: now,
|
||||
})
|
||||
slog.Info("schedule.fire: dispatched",
|
||||
"host_id", hostID, "schedule_id", scheduleID,
|
||||
"job_id", jobID, "kind", sched.Kind, "scheduled_at", scheduledAt)
|
||||
}
|
||||
|
||||
// Compile-time guard that the store actually implements the methods
|
||||
// schedule_push.go calls. Useful when mocking the store in tests.
|
||||
var _ scheduleStore = (*store.Store)(nil)
|
||||
|
||||
@@ -148,6 +148,98 @@ func TestSchedulePushOnHelloAndAckRoundtrip(t *testing.T) {
|
||||
h.AppliedScheduleVersion, pushed.Version)
|
||||
}
|
||||
|
||||
func TestScheduleFireDispatchesCommandRun(t *testing.T) {
|
||||
t.Parallel()
|
||||
srv, url, st := newTestServerWithHub(t)
|
||||
_ = srv
|
||||
cookie := loginAndCookie(t, url)
|
||||
hostID, agentToken := makePushHost(t, st)
|
||||
|
||||
// Pre-create one backup schedule.
|
||||
body, _ := json.Marshal(scheduleAPI{
|
||||
Kind: "backup", CronExpr: "@hourly",
|
||||
Paths: []string{"/etc/hostname"}, Enabled: true,
|
||||
})
|
||||
req, _ := stdhttp.NewRequest("POST",
|
||||
url+"/api/hosts/"+hostID+"/schedules", bytes.NewReader(body))
|
||||
req.AddCookie(cookie)
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
res, err := stdhttp.DefaultClient.Do(req)
|
||||
if err != nil {
|
||||
t.Fatalf("create: %v", err)
|
||||
}
|
||||
got, _ := io.ReadAll(res.Body)
|
||||
res.Body.Close()
|
||||
var created scheduleAPI
|
||||
_ = json.Unmarshal(got, &created)
|
||||
|
||||
// Connect as the agent.
|
||||
wsURL := "ws" + strings.TrimPrefix(url, "http") + "/ws/agent"
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
c, _, err := websocket.Dial(ctx, wsURL, &websocket.DialOptions{
|
||||
HTTPHeader: stdhttp.Header{"Authorization": []string{"Bearer " + agentToken}},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("dial: %v", err)
|
||||
}
|
||||
defer c.CloseNow()
|
||||
|
||||
helloEnv, _ := api.Marshal(api.MsgHello, "", api.HelloPayload{
|
||||
ProtocolVersion: api.CurrentProtocolVersion,
|
||||
AgentVersion: "test", ResticVersion: "test",
|
||||
Hostname: "ph", OS: api.OSLinux, Arch: api.ArchAmd64,
|
||||
})
|
||||
raw, _ := json.Marshal(helloEnv)
|
||||
_ = c.Write(ctx, websocket.MessageText, raw)
|
||||
|
||||
// Drain the on-hello schedule.set.
|
||||
_ = readUntilType(ctx, t, c, api.MsgScheduleSet)
|
||||
|
||||
// Pretend our local cron just fired this schedule.
|
||||
fireEnv, _ := api.Marshal(api.MsgScheduleFire, "", api.ScheduleFirePayload{
|
||||
ScheduleID: created.ID,
|
||||
ScheduledAt: time.Now().UTC(),
|
||||
})
|
||||
raw, _ = json.Marshal(fireEnv)
|
||||
if err := c.Write(ctx, websocket.MessageText, raw); err != nil {
|
||||
t.Fatalf("write fire: %v", err)
|
||||
}
|
||||
|
||||
// Server should respond with command.run.
|
||||
cmdEnv := readUntilType(ctx, t, c, api.MsgCommandRun)
|
||||
var cmd api.CommandRunPayload
|
||||
if err := cmdEnv.UnmarshalPayload(&cmd); err != nil {
|
||||
t.Fatalf("decode command.run: %v", err)
|
||||
}
|
||||
if cmd.JobID == "" || cmd.Kind != api.JobBackup {
|
||||
t.Fatalf("command.run: %+v", cmd)
|
||||
}
|
||||
if len(cmd.Args) != 1 || cmd.Args[0] != "/etc/hostname" {
|
||||
t.Fatalf("command.run args: %+v", cmd.Args)
|
||||
}
|
||||
|
||||
// Verify the job row landed with actor_kind=schedule.
|
||||
deadline := time.Now().Add(2 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
var actorKind, scheduledID string
|
||||
row := st.DB().QueryRowContext(context.Background(),
|
||||
`SELECT actor_kind, COALESCE(scheduled_id,'') FROM jobs WHERE id = ?`,
|
||||
cmd.JobID)
|
||||
if err := row.Scan(&actorKind, &scheduledID); err == nil {
|
||||
if actorKind != "schedule" {
|
||||
t.Fatalf("job actor_kind: %q", actorKind)
|
||||
}
|
||||
if scheduledID != created.ID {
|
||||
t.Fatalf("job scheduled_id: %q want %q", scheduledID, created.ID)
|
||||
}
|
||||
return
|
||||
}
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
}
|
||||
t.Fatalf("job row %s never landed", cmd.JobID)
|
||||
}
|
||||
|
||||
func TestSchedulePushOnCRUD(t *testing.T) {
|
||||
t.Parallel()
|
||||
srv, url, st := newTestServerWithHub(t)
|
||||
|
||||
@@ -117,11 +117,12 @@ func (s *Server) routes(r chi.Router) {
|
||||
// Agent ↔ server WebSocket. Bearer-authenticated inside the handler.
|
||||
if s.deps.Hub != nil {
|
||||
r.Mount("/ws/agent", ws.AgentHandler(ws.HandlerDeps{
|
||||
Hub: s.deps.Hub,
|
||||
Store: s.deps.Store,
|
||||
JobHub: s.deps.JobHub,
|
||||
OnHello: s.onAgentHello,
|
||||
OnScheduleAck: s.applyScheduleAck,
|
||||
Hub: s.deps.Hub,
|
||||
Store: s.deps.Store,
|
||||
JobHub: s.deps.JobHub,
|
||||
OnHello: s.onAgentHello,
|
||||
OnScheduleAck: s.applyScheduleAck,
|
||||
OnScheduleFire: s.dispatchScheduledJob,
|
||||
}))
|
||||
}
|
||||
|
||||
|
||||
@@ -30,6 +30,11 @@ type HandlerDeps struct {
|
||||
// OnScheduleAck is called when an agent confirms it has applied
|
||||
// a particular schedule version (P2-02 reconciliation). Optional.
|
||||
OnScheduleAck func(ctx context.Context, hostID string, version int64, appliedAt time.Time)
|
||||
// OnScheduleFire is called when an agent's local cron fires. The
|
||||
// callback is expected to look up the schedule, persist a job
|
||||
// row, and emit MsgCommandRun back on conn so the agent can run
|
||||
// the job using its normal job dispatch path. Optional.
|
||||
OnScheduleFire func(ctx context.Context, hostID string, conn *Conn, scheduleID string, scheduledAt time.Time)
|
||||
}
|
||||
|
||||
// AgentHandler is the http.Handler that owns /ws/agent. Agents
|
||||
@@ -268,6 +273,16 @@ func dispatchAgentMessage(ctx context.Context, c *Conn, hostID string, env api.E
|
||||
deps.OnScheduleAck(ctx, hostID, p.Version, p.AppliedAt)
|
||||
}
|
||||
|
||||
case api.MsgScheduleFire:
|
||||
var p api.ScheduleFirePayload
|
||||
if err := env.UnmarshalPayload(&p); err != nil {
|
||||
slog.Warn("ws: bad schedule.fire payload", "host_id", hostID, "err", err)
|
||||
break
|
||||
}
|
||||
if deps.OnScheduleFire != nil {
|
||||
deps.OnScheduleFire(ctx, hostID, c, p.ScheduleID, p.ScheduledAt)
|
||||
}
|
||||
|
||||
case api.MsgRepoStats, api.MsgCommandResult:
|
||||
// TODO(P2): persist these projections.
|
||||
slog.Debug("ws msg not yet handled", "type", env.Type, "host_id", hostID)
|
||||
|
||||
Reference in New Issue
Block a user