P3-X1: cancel-job feature

Wires the existing job_detail Cancel button (which was a UI stub) into
real backend behaviour:

- internal/api already declared MsgCommandCancel + CommandCancelPayload;
  promote those from forward-declarations to a working envelope. Agent
  side: cmd/agent/main.go drops the TODO-stub and gains a per-job
  context.CancelFunc map. runJob's switch is refactored around a small
  spawn() helper so each kind's goroutine derives a per-job context,
  registers the cancel, and removes itself on completion regardless of
  outcome. command.cancel looks up the func and fires it.
- internal/agent/runner.sendFinished now takes ctx and rebadges
  context.Canceled errors as JobCancelled (exit 130) rather than
  JobFailed. All Run* call sites updated.
- internal/restic.resticCmd sets cmd.Cancel to send SIGTERM (via
  build-tagged sigterm constant; os.Kill on Windows since SIGTERM
  isn't deliverable there) and cmd.WaitDelay=5s for the SIGKILL
  fallback. SIGTERM lets restic remove its lock file before exiting.
- New POST /api/jobs/{id}/cancel server endpoint validates the job
  is non-terminal and the host is online, sends command.cancel via
  the hub, writes a job.cancel audit row, returns 202. The agent's
  resulting job.finished (status=cancelled) is what actually
  transitions the row.

Tests:
- internal/server/http/cancel_test.go covers happy path (envelope
  shape + audit row), 409 for terminal jobs, 404 for missing jobs,
  503 for offline hosts.
- internal/agent/runner/cancel_test.go covers cancel mid-run: a fake
  restic that execs into 'sleep 30' is cancelled 150ms after start,
  and the resulting job.finished reports JobCancelled with exit 130,
  arriving well within the WaitDelay.

Foundational for P3 restore: an operator must be able to cancel a
running backup when a restore is needed urgently. Also independently
useful for aborting prune/check/backup jobs that are stuck.
This commit is contained in:
2026-05-04 15:11:49 +01:00
parent d325a27439
commit 94149a7324
9 changed files with 543 additions and 47 deletions
+88 -38
View File
@@ -210,6 +210,45 @@ type dispatcher struct {
bwMu sync.Mutex
bwUpKBps int
bwDownKBps int
// Per-running-job cancellation handles. Populated when runJob
// spawns the goroutine, removed when it returns. Looked up by
// the command.cancel handler (server → agent) to abort an
// in-flight restic invocation.
cancelMu sync.Mutex
cancels map[string]context.CancelFunc
}
// trackJob records cancel as the cancellation handle for jobID so that
// cancelJob can find it later, lazily allocating the map on first use.
//
// The returned func removes the jobID entry again. It only runs on
// every exit path (including a panic) if the caller defers it in the
// job goroutine; trackJob itself does not arrange that.
//
// Entries are keyed by jobID alone: registering the same jobID twice
// replaces the earlier handle, and either returned func deletes
// whatever entry is stored under that jobID at the time it runs.
func (d *dispatcher) trackJob(jobID string, cancel context.CancelFunc) func() {
	d.cancelMu.Lock()
	defer d.cancelMu.Unlock()

	if d.cancels == nil {
		d.cancels = map[string]context.CancelFunc{}
	}
	d.cancels[jobID] = cancel

	// The closure takes the lock itself; it is invoked later, long
	// after the registration above has released it.
	return func() {
		d.cancelMu.Lock()
		defer d.cancelMu.Unlock()
		delete(d.cancels, jobID)
	}
}
// cancelJob invokes the cancel func registered for jobID, if any, and
// reports whether an entry existed. A false result means no job with
// that ID is currently tracked by this dispatcher.
//
// The entry is not removed here; removal is left to the cleanup func
// returned by trackJob. Per the original note on this method, the
// runner is expected to turn the resulting context.Canceled into a
// JobCancelled status in its job.finished envelope (see
// runner.sendFinished) — that code is not visible in this file view.
func (d *dispatcher) cancelJob(jobID string) bool {
	// Hold the mutex only for the map lookup; the cancel func itself
	// is called after the lock has been released.
	d.cancelMu.Lock()
	stop, known := d.cancels[jobID]
	d.cancelMu.Unlock()

	if known {
		stop()
	}
	return known
}
func (d *dispatcher) handle(ctx context.Context, env api.Envelope, tx wsclient.Sender) error {
@@ -222,8 +261,18 @@ func (d *dispatcher) handle(ctx context.Context, env api.Envelope, tx wsclient.S
return d.runJob(ctx, p, tx)
case api.MsgCommandCancel:
// TODO(P2): cancellation requires keeping a job→cancelFunc map.
slog.Info("ws agent: command.cancel received (cancellation lands in P2)", "id", env.ID)
var p api.CommandCancelPayload
if err := env.UnmarshalPayload(&p); err != nil {
return fmt.Errorf("command.cancel: %w", err)
}
if d.cancelJob(p.JobID) {
slog.Info("ws agent: command.cancel applied", "job_id", p.JobID)
} else {
// Job already finished or was never seen on this agent.
// Not an error — operator may have raced cancel against
// natural completion. Server-side state is authoritative.
slog.Info("ws agent: command.cancel for unknown job (already finished?)", "job_id", p.JobID)
}
case api.MsgScheduleSet:
var p api.ScheduleSetPayload
@@ -374,6 +423,25 @@ func (d *dispatcher) runJob(ctx context.Context, p api.CommandRunPayload, tx wsc
LimitDownloadKBps: downKBps,
}, tx, time.Second)
// spawn wraps the kind-specific goroutine: derives a per-job
// cancellable context from the connection-scoped ctx, registers
// the cancel func so command.cancel can fire it, deregisters on
// completion. Per-job ctx means canceling one job doesn't kill
// any other in-flight invocations.
spawn := func(name string, fn func(ctx context.Context) error) {
jobCtx, cancel := context.WithCancel(ctx)
cleanup := d.trackJob(p.JobID, cancel)
go func() {
defer cleanup()
defer cancel() // release ctx resources on goroutine exit
if err := fn(jobCtx); err != nil {
slog.Warn("agent: "+name+" job failed", "job_id", p.JobID, "err", err)
return
}
slog.Info("agent: "+name+" job complete", "job_id", p.JobID)
}()
}
switch p.Kind {
case api.JobBackup:
// Includes/Excludes/Tag come from the source group resolved
@@ -391,22 +459,14 @@ func (d *dispatcher) runJob(ctx context.Context, p api.CommandRunPayload, tx wsc
slog.Info("agent: accepting backup job",
"job_id", p.JobID, "paths", paths, "excludes", p.Excludes, "tag", p.Tag)
hooks := runner.BackupHooks{Pre: p.PreHook, Post: p.PostHook}
go func() {
if err := r.RunBackup(ctx, p.JobID, paths, p.Excludes, tags, hooks); err != nil {
slog.Warn("agent: backup job failed", "job_id", p.JobID, "err", err)
return
}
slog.Info("agent: backup job complete", "job_id", p.JobID)
}()
spawn("backup", func(jobCtx context.Context) error {
return r.RunBackup(jobCtx, p.JobID, paths, p.Excludes, tags, hooks)
})
case api.JobInit:
slog.Info("agent: accepting init job", "job_id", p.JobID)
go func() {
if err := r.RunInit(ctx, p.JobID); err != nil {
slog.Warn("agent: init job failed", "job_id", p.JobID, "err", err)
return
}
slog.Info("agent: init job complete", "job_id", p.JobID)
}()
spawn("init", func(jobCtx context.Context) error {
return r.RunInit(jobCtx, p.JobID)
})
case api.JobForget:
if len(p.ForgetGroups) == 0 {
// Hard-error rather than fall back to a single-policy form:
@@ -433,13 +493,9 @@ func (d *dispatcher) runJob(ctx context.Context, p api.CommandRunPayload, tx wsc
})
}
slog.Info("agent: accepting forget job", "job_id", p.JobID, "groups", len(groups))
go func() {
if err := r.RunForget(ctx, p.JobID, groups); err != nil {
slog.Warn("agent: forget job failed", "job_id", p.JobID, "err", err)
return
}
slog.Info("agent: forget job complete", "job_id", p.JobID)
}()
spawn("forget", func(jobCtx context.Context) error {
return r.RunForget(jobCtx, p.JobID, groups)
})
case api.JobPrune:
// Prune may require admin creds (delete authority on rest-server).
runCreds := creds
@@ -462,29 +518,23 @@ func (d *dispatcher) runJob(ctx context.Context, p api.CommandRunPayload, tx wsc
LimitDownloadKBps: downKBps,
}, tx, time.Second)
slog.Info("agent: accepting prune job", "job_id", p.JobID, "admin_creds", p.RequiresAdminCreds)
go func() {
if err := prr.RunPrune(ctx, p.JobID); err != nil {
slog.Warn("agent: prune job failed", "job_id", p.JobID, "err", err)
}
}()
spawn("prune", func(jobCtx context.Context) error {
return prr.RunPrune(jobCtx, p.JobID)
})
case api.JobCheck:
subset := 0
if len(p.Args) > 0 {
subset, _ = strconv.Atoi(p.Args[0])
}
slog.Info("agent: accepting check job", "job_id", p.JobID, "subset_pct", subset)
go func() {
if err := r.RunCheck(ctx, p.JobID, subset); err != nil {
slog.Warn("agent: check job failed", "job_id", p.JobID, "err", err)
}
}()
spawn("check", func(jobCtx context.Context) error {
return r.RunCheck(jobCtx, p.JobID, subset)
})
case api.JobUnlock:
slog.Info("agent: accepting unlock job", "job_id", p.JobID)
go func() {
if err := r.RunUnlock(ctx, p.JobID); err != nil {
slog.Warn("agent: unlock job failed", "job_id", p.JobID, "err", err)
}
}()
spawn("unlock", func(jobCtx context.Context) error {
return r.RunUnlock(jobCtx, p.JobID)
})
default:
return fmt.Errorf("kind %q not implemented yet (Phase 2 lands the rest)", p.Kind)
}