P3-X1: cancel-job feature

Wires the existing job_detail Cancel button (which was a UI stub) into real backend behaviour:

- internal/api already declared MsgCommandCancel + CommandCancelPayload; promote those from forward-declarations to a working envelope. Agent side: cmd/agent/main.go drops the TODO-stub and gains a per-job context.CancelFunc map. runJob's switch is refactored around a small spawn() helper so each kind's goroutine derives a per-job context, registers the cancel, and removes itself on completion regardless of outcome. command.cancel looks up the func and fires it.
- internal/agent/runner.sendFinished now takes ctx and, whenever ctx is already done, rebadges the error as JobCancelled (exit 130) rather than JobFailed. All Run* call sites updated.
- internal/restic.resticCmd sets cmd.Cancel to send SIGTERM (via a build-tagged sigterm constant; os.Kill on Windows since SIGTERM isn't deliverable there) and cmd.WaitDelay=5s for the SIGKILL fallback. SIGTERM lets restic remove its lock file before exiting.
- New POST /api/jobs/{id}/cancel server endpoint validates that the job is non-terminal and the host is online, sends command.cancel via the hub, writes a job.cancel audit row, and returns 202. The agent's resulting job.finished (status=cancelled) is what actually transitions the row.

Tests:

- internal/server/http/cancel_test.go covers the happy path (envelope shape + audit row), 409 for terminal jobs, 404 for missing jobs, 503 for offline hosts.
- internal/agent/runner/cancel_test.go covers cancel mid-run: a fake restic that execs into 'sleep 30' is canceled 150ms after start, and the resulting job.finished reports JobCancelled with exit 130 in well under the WaitDelay.

Foundational for P3 restore (the operator needs to be able to cancel a running backup if they need to restore urgently). Independently useful for prune/check/backup jobs that are stuck.
2026-05-04 15:11:49 +01:00
parent d325a27439
commit 94149a7324
9 changed files with 543 additions and 47 deletions
@@ -95,8 +95,10 @@ func (r *Runner) streamHandler(jobID string, seq *atomic.Int64) restic.LineHandl
 }

 // sendFinished ships a job.finished envelope. err==nil → succeeded;
-// otherwise failed. statsBlob is forwarded as JobFinishedPayload.Stats.
-func (r *Runner) sendFinished(jobID string, finishedAt time.Time, err error, statsBlob json.RawMessage) {
+// otherwise failed (or canceled if ctx was canceled — operator
+// hit the Cancel button or the agent is shutting down).
+// statsBlob is forwarded as JobFinishedPayload.Stats.
+func (r *Runner) sendFinished(ctx context.Context, jobID string, finishedAt time.Time, err error, statsBlob json.RawMessage) {
 	status := api.JobSucceeded
 	exit := 0
 	errMsg := ""
@@ -104,6 +106,16 @@ func (r *Runner) sendFinished(jobID string, finishedAt time.Time, err error, sta
 		status = api.JobFailed
 		exit = -1
 		errMsg = err.Error()
+		// If the context was canceled, the failure is operator-driven
+		// (or shutdown). Surface as JobCancelled so the UI shows a
+		// neutral "canceled" state rather than a red "failed" one.
+		// exec.CommandContext returns the process's exit error on
+		// ctx-cancel, which we'd otherwise rebadge as failed.
+		if ctxErr := ctx.Err(); ctxErr != nil {
+			status = api.JobCancelled
+			exit = 130  // 128+SIGINT by shell convention (a SIGTERM death would be 143)
+			errMsg = "" // no need to surface the underlying restic error
+		}
 	}
 	finEnv, _ := api.Marshal(api.MsgJobFinished, jobID, api.JobFinishedPayload{
 		JobID:      jobID,
@@ -138,7 +150,7 @@ func (r *Runner) RunBackup(ctx context.Context, jobID string, paths, excludes, t
 	if hooks.Pre != "" {
 		if err := r.runHook(ctx, jobID, "pre", hooks.Pre, "", &seq); err != nil {
 			finishedAt := time.Now().UTC()
-			r.sendFinished(jobID, finishedAt, err, nil)
+			r.sendFinished(ctx, jobID, finishedAt, err, nil)
 			return fmt.Errorf("pre_hook failed: %w", err)
 		}
 	}
@@ -206,7 +218,7 @@ func (r *Runner) RunBackup(ctx context.Context, jobID string, paths, excludes, t
 		}
 	}

-	r.sendFinished(jobID, finishedAt, err, statsBlob)
+	r.sendFinished(ctx, jobID, finishedAt, err, statsBlob)

 	// On a successful backup, refresh the server's snapshot projection.
 	// We do this *after* job.finished so the UI sees the job land first;
@@ -240,7 +252,7 @@ func (r *Runner) RunInit(ctx context.Context, jobID string) error {
 	var seq atomic.Int64
 	err := env.RunInit(ctx, r.streamHandler(jobID, &seq))
 	finishedAt := time.Now().UTC()
-	r.sendFinished(jobID, finishedAt, err, nil)
+	r.sendFinished(ctx, jobID, finishedAt, err, nil)
 	if err != nil {
 		return fmt.Errorf("runner init: %w", err)
 	}
@@ -262,7 +274,7 @@ func (r *Runner) RunForget(ctx context.Context, jobID string, groups []restic.Fo
 	var seq atomic.Int64
 	err := env.RunForget(ctx, groups, r.streamHandler(jobID, &seq))
 	finishedAt := time.Now().UTC()
-	r.sendFinished(jobID, finishedAt, err, nil)
+	r.sendFinished(ctx, jobID, finishedAt, err, nil)

 	// Refresh the server's snapshot projection — forget rewrites the
 	// index so the host's snapshot list almost certainly shrunk.
@@ -300,7 +312,7 @@ func (r *Runner) RunPrune(ctx context.Context, jobID string) error {
 		}
 	}

-	r.sendFinished(jobID, finishedAt, err, nil)
+	r.sendFinished(ctx, jobID, finishedAt, err, nil)

 	if err != nil {
 		return fmt.Errorf("runner prune: %w", err)
@@ -339,7 +351,7 @@ func (r *Runner) RunCheck(ctx context.Context, jobID string, subsetPct int) erro
 		slog.Warn("runner: stats.report after check failed", "job_id", jobID, "err", rerr)
 	}

-	r.sendFinished(jobID, finishedAt, err, nil)
+	r.sendFinished(ctx, jobID, finishedAt, err, nil)

 	if err != nil {
 		return fmt.Errorf("runner check: %w", err)
@@ -366,7 +378,7 @@ func (r *Runner) RunUnlock(ctx context.Context, jobID string) error {
 		}
 	}

-	r.sendFinished(jobID, finishedAt, err, nil)
+	r.sendFinished(ctx, jobID, finishedAt, err, nil)

 	if err != nil {
 		return fmt.Errorf("runner unlock: %w", err)