Files
restic-manager/internal/agent/runner/cancel_test.go
T
steve 9fa2ef48f0 P3-X1: cancel-job feature
Wires the existing job_detail Cancel button (which was a UI stub) into
real backend behaviour:

- internal/api already declared MsgCommandCancel + CommandCancelPayload;
  promote those from forward-declarations to a working envelope. Agent
  side: cmd/agent/main.go drops the TODO-stub and gains a per-job
  context.CancelFunc map. runJob's switch is refactored around a small
  spawn() helper so each kind's goroutine derives a per-job context,
  registers the cancel, and removes itself on completion regardless of
  outcome. command.cancel looks up the func and fires it.
- internal/agent/runner.sendFinished now takes ctx and rebadges
  context.Canceled errors as JobCancelled (exit 130) rather than
  JobFailed. All Run* call sites updated.
- internal/restic.resticCmd sets cmd.Cancel to send SIGTERM (via
  build-tagged sigterm constant; os.Kill on Windows since SIGTERM
  isn't deliverable there) and cmd.WaitDelay=5s for the SIGKILL
  fallback. SIGTERM lets restic remove its lock file before exiting.
- New POST /api/jobs/{id}/cancel server endpoint validates the job
  is non-terminal and the host is online, sends command.cancel via
  the hub, writes a job.cancel audit row, returns 202. The agent's
  resulting job.finished (status=cancelled) is what actually
  transitions the row.

Tests:
- internal/server/http/cancel_test.go covers happy path (envelope
  shape + audit row), 409 for terminal jobs, 404 for missing jobs,
  503 for offline hosts.
- internal/agent/runner/cancel_test.go covers cancel mid-run: a fake
  restic that exec'd into 'sleep 30' is canceled 150ms after start
  and the resulting job.finished reports JobCancelled with exit 130
  in well under the WaitDelay.

Foundational for P3 restore (operator needs to be able to cancel a
running backup if they need to restore urgently). Independently useful
for prune/check/backup that are stuck.
2026-05-04 15:11:49 +01:00

101 lines
2.8 KiB
Go

package runner
import (
"context"
"strings"
"sync"
"testing"
"time"
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
)
// safeSender is a thread-safe variant of fakeSender. The cancel test
// has the runner goroutine sending envelopes while the test goroutine
// is reading the slice, so we need a mutex.
type safeSender struct {
	// mu guards envs; both Send and snapshot hold it while touching the slice.
	mu sync.Mutex
	// envs accumulates every envelope passed to Send, in call order.
	envs []api.Envelope
}
// Send appends e to the recorded envelopes while holding the mutex.
// It never fails: the returned error is always nil.
func (s *safeSender) Send(e api.Envelope) error {
	s.mu.Lock()
	defer s.mu.Unlock()

	s.envs = append(s.envs, e)
	return nil
}
// snapshot returns a copy of the envelopes recorded so far. The copy is
// made under the mutex, so the caller can range over the result without
// racing against concurrent Send calls.
func (s *safeSender) snapshot() []api.Envelope {
	s.mu.Lock()
	defer s.mu.Unlock()

	// Pre-sized, non-nil destination: the result has the same length and
	// capacity as a make+copy of s.envs would.
	dup := make([]api.Envelope, 0, len(s.envs))
	return append(dup, s.envs...)
}
// TestRunBackupCanceledMidRunReportsCanceled spawns a backup against
// a fake restic that sleeps for 30 seconds, cancels the context after
// a short delay, and confirms the resulting job.finished envelope
// reports api.JobCancelled (not a failure), exit code 130, and an
// empty error string.
func TestRunBackupCanceledMidRunReportsCanceled(t *testing.T) {
	t.Parallel()

	// Fake restic: replace the shell with a long sleep via `exec` so the
	// process tree is one process — SIGTERM goes directly to sleep and
	// it exits. Without `exec`, the shell stays in the foreground while
	// sleep is its child; SIGTERM-to-shell may or may not propagate to
	// sleep depending on the shell, leading to the WaitDelay-then-
	// SIGKILL fallback path firing — slower and noisier.
	bin := setupScript(t, `exec sleep 30`)

	tx := &safeSender{}
	r := New(Config{ResticBin: bin}, tx, 0)

	ctx, cancel := context.WithCancel(context.Background())
	// A CancelFunc may be called more than once, so deferring it next to
	// the explicit call below is safe and guarantees the context is
	// released on every exit path.
	defer cancel()

	done := make(chan error, 1)
	go func() {
		done <- r.RunBackup(ctx, "job-cancel", []string{"/tmp/x"}, nil, nil, BackupHooks{})
	}()

	// Wait long enough for the subprocess to actually start before
	// canceling. Without this, exec.CommandContext can race the
	// kill against Start and produce a different error path.
	time.Sleep(150 * time.Millisecond)
	cancel()

	select {
	case err := <-done:
		// The assertions below are about the job.finished envelope, not
		// about RunBackup's return value; log it only as a debugging aid.
		t.Logf("RunBackup returned: %v", err)
	case <-time.After(15 * time.Second):
		t.Fatal("RunBackup did not return within 15s of cancel")
	}

	// Locate the job.finished envelope and check its status.
	envs := tx.snapshot()
	var finEnv api.Envelope
	var found bool
	for _, e := range envs {
		if e.Type == api.MsgJobFinished {
			finEnv = e
			found = true
			break
		}
	}
	if !found {
		t.Fatal("no job.finished envelope was sent")
	}

	var fin api.JobFinishedPayload
	if err := finEnv.UnmarshalPayload(&fin); err != nil {
		t.Fatalf("unmarshal: %v", err)
	}
	if fin.Status != api.JobCancelled {
		t.Fatalf("status: got %q, want %q", fin.Status, api.JobCancelled)
	}
	if fin.ExitCode != 130 {
		t.Errorf("exit_code: got %d, want 130 (POSIX cancel convention)", fin.ExitCode)
	}

	// The error message should be empty for canceled jobs (see
	// runner.sendFinished). A plain emptiness check is the whole
	// assertion; on failure, additionally point out when the text is the
	// raw context.Canceled message, to speed up diagnosis.
	if fin.Error != "" {
		var hint string
		if strings.Contains(fin.Error, context.Canceled.Error()) {
			hint = " (context.Canceled text leaked into the payload)"
		}
		t.Errorf("error: got %q, want empty for canceled jobs%s", fin.Error, hint)
	}
}