P3-X1: cancel-job feature

Wires the existing job_detail Cancel button (which was a UI stub) into
real backend behaviour:

- internal/api already declared MsgCommandCancel + CommandCancelPayload;
  promote those from forward-declarations to a working envelope. Agent
  side: cmd/agent/main.go drops the TODO-stub and gains a per-job
  context.CancelFunc map. runJob's switch is refactored around a small
  spawn() helper so each kind's goroutine derives a per-job context,
  registers the cancel, and removes itself on completion regardless of
  outcome. command.cancel looks up the func and fires it.
- internal/agent/runner.sendFinished now takes ctx and rebadges
  context.Canceled errors as JobCancelled (exit 130) rather than
  JobFailed. All Run* call sites updated.
- internal/restic.resticCmd sets cmd.Cancel to send SIGTERM (via
  build-tagged sigterm constant; os.Kill on Windows since SIGTERM
  isn't deliverable there) and cmd.WaitDelay=5s for the SIGKILL
  fallback. SIGTERM lets restic remove its lock file before exiting.
- New POST /api/jobs/{id}/cancel server endpoint validates the job
  is non-terminal and the host is online, sends command.cancel via
  the hub, writes a job.cancel audit row, returns 202. The agent's
  resulting job.finished (status=cancelled) is what actually
  transitions the row.

Tests:
- internal/server/http/cancel_test.go covers happy path (envelope
  shape + audit row), 409 for terminal jobs, 404 for missing jobs,
  503 for offline hosts.
- internal/agent/runner/cancel_test.go covers cancel mid-run: a fake
  restic that exec'd into 'sleep 30' is canceled 150ms after start
  and the resulting job.finished reports JobCancelled with exit 130
  in well under the WaitDelay.

Foundational for P3 restore (operator needs to be able to cancel a
running backup if they need to restore urgently). Independently useful
for prune/check/backup that are stuck.
This commit is contained in:
2026-05-04 15:11:49 +01:00
parent 454a2415dc
commit 9fa2ef48f0
9 changed files with 543 additions and 47 deletions
+86
View File
@@ -0,0 +1,86 @@
package http
import (
stdhttp "net/http"
"time"
"github.com/go-chi/chi/v5"
"github.com/oklog/ulid/v2"
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
)
// handleCancelJob serves POST /api/jobs/{id}/cancel.
//
// It dispatches a command.cancel envelope, via the hub, to the host that
// owns the job. The handler never writes to the jobs table itself: the
// agent kills the running restic subprocess and its follow-up
// job.finished envelope (status api.JobCancelled) is what transitions
// the row. The 202 response signals that the cancel was handed off, not
// that it has completed.
//
// Responses:
//   - 202: envelope sent (an audit row is appended best-effort)
//   - 400 missing_job_id: the {id} URL parameter is empty
//   - 401 unauthorized: requireUser did not yield a user
//   - 404 job_not_found: the store lookup returned an error
//   - 409 job_terminal: job is already succeeded, failed or cancelled
//   - 500 internal: the envelope could not be marshalled
//   - 503 host_offline: the hub has no connection for the host, or the
//     send itself failed
//
// The action is audit-logged as job.cancel with the job ID as target.
func (s *Server) handleCancelJob(w stdhttp.ResponseWriter, r *stdhttp.Request) {
	user, authed := s.requireUser(r)
	if !authed {
		writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorized", "")
		return
	}

	ctx := r.Context()

	jobID := chi.URLParam(r, "id")
	if jobID == "" {
		writeJSONError(w, stdhttp.StatusBadRequest, "missing_job_id", "")
		return
	}

	// Any lookup error is reported to the caller as "not found".
	job, lookupErr := s.deps.Store.GetJob(ctx, jobID)
	if lookupErr != nil {
		writeJSONError(w, stdhttp.StatusNotFound, "job_not_found", "")
		return
	}

	// A job that has already finished has nothing left to cancel.
	if st := api.JobStatus(job.Status); st == api.JobSucceeded || st == api.JobFailed || st == api.JobCancelled {
		writeJSONError(w, stdhttp.StatusConflict, "job_terminal",
			"job is already in a terminal state ("+job.Status+")")
		return
	}

	hostID := job.HostID
	if !s.deps.Hub.Connected(hostID) {
		writeJSONError(w, stdhttp.StatusServiceUnavailable, "host_offline",
			"agent is not connected; can't deliver cancel signal")
		return
	}

	payload := api.CommandCancelPayload{JobID: jobID}
	envelope, marshalErr := api.Marshal(api.MsgCommandCancel, jobID, payload)
	if marshalErr != nil {
		writeJSONError(w, stdhttp.StatusInternalServerError, "internal", "")
		return
	}

	// The host can drop between the Connected check and the send, so a
	// send failure is surfaced with the same host_offline code.
	if sendErr := s.deps.Hub.Send(ctx, hostID, envelope); sendErr != nil {
		writeJSONError(w, stdhttp.StatusServiceUnavailable, "host_offline", sendErr.Error())
		return
	}

	// Attribute the audit row to the user when one is present; otherwise
	// fall back to the "system" actor with no user ID.
	actor, actorID := "system", (*string)(nil)
	if user != nil {
		actor, actorID = "user", &user.ID
	}

	entry := store.AuditEntry{
		ID:         ulid.Make().String(),
		UserID:     actorID,
		Actor:      actor,
		Action:     "job.cancel",
		TargetKind: ptr("job"),
		TargetID:   &jobID,
		TS:         time.Now().UTC(),
	}
	// Best-effort: the cancel has already been dispatched, so an audit
	// write failure does not change the response.
	_ = s.deps.Store.AppendAudit(ctx, entry)

	w.WriteHeader(stdhttp.StatusAccepted)
}
+204
View File
@@ -0,0 +1,204 @@
// cancel_test.go — covers POST /api/jobs/{id}/cancel.
package http
import (
"context"
"encoding/json"
stdhttp "net/http"
"strings"
"testing"
"time"
"github.com/coder/websocket"
"github.com/oklog/ulid/v2"
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
)
// TestCancelJobRunningHappyPath drives the cancel endpoint for a started
// job whose host has a live WebSocket connection. It expects a 202
// response, a command.cancel envelope on the agent socket carrying the
// job's ID, and exactly one job.cancel audit row targeting that job.
func TestCancelJobRunningHappyPath(t *testing.T) {
	t.Parallel()
	srv, ts, st := rawTestServer(t)

	// Bring a fake agent online and skip past the initial schedule push.
	const hostName = "cancel-host"
	hostID, token := enrolHostForWS(t, srv, st, hostName)
	conn := agentDial(t, srv, ts, hostID, token)
	sendHello(t, conn, hostName)
	_ = drainUntil(t, conn, api.MsgScheduleSet)

	// Seed a job in the started state so there is something to cancel.
	bg := context.Background()
	jobID := ulid.Make().String()
	startedAt := time.Now().UTC()
	seed := store.Job{
		ID:        jobID,
		HostID:    hostID,
		Kind:      "backup",
		ActorKind: "user",
		CreatedAt: startedAt,
	}
	if err := st.CreateJob(bg, seed); err != nil {
		t.Fatalf("create job: %v", err)
	}
	if err := st.MarkJobStarted(bg, jobID, startedAt); err != nil {
		t.Fatalf("mark started: %v", err)
	}

	// Issue the cancel request as an admin.
	session := loginAsAdmin(t, st)
	req, _ := stdhttp.NewRequest(stdhttp.MethodPost, ts.URL+"/api/jobs/"+jobID+"/cancel", nil)
	req.AddCookie(session)
	resp, err := stdhttp.DefaultClient.Do(req)
	if err != nil {
		t.Fatalf("do: %v", err)
	}
	defer resp.Body.Close()
	if resp.StatusCode != stdhttp.StatusAccepted {
		t.Fatalf("status: got %d, want 202", resp.StatusCode)
	}

	// Pull frames off the agent socket until the command.cancel envelope
	// shows up. Each read is capped at 500ms and the loop as a whole at
	// 2s; a read error (timeout included) ends the search.
	var envelope api.Envelope
	for stopAt := time.Now().Add(2 * time.Second); time.Now().Before(stopAt); {
		readCtx, stop := context.WithTimeout(context.Background(), 500*time.Millisecond)
		kind, frame, readErr := conn.Read(readCtx)
		stop()
		if readErr != nil {
			break
		}
		if kind == websocket.MessageText && strings.Contains(string(frame), `"command.cancel"`) {
			if err := json.Unmarshal(frame, &envelope); err != nil {
				t.Fatalf("unmarshal: %v", err)
			}
			break
		}
	}
	if envelope.Type != api.MsgCommandCancel {
		t.Fatalf("never received command.cancel envelope")
	}

	var payload api.CommandCancelPayload
	if err := envelope.UnmarshalPayload(&payload); err != nil {
		t.Fatalf("unmarshal payload: %v", err)
	}
	if payload.JobID != jobID {
		t.Fatalf("payload job_id: got %q want %q", payload.JobID, jobID)
	}

	// The handler must have recorded exactly one audit row for this job.
	var auditRows int
	row := st.DB().QueryRow(
		`SELECT COUNT(*) FROM audit_log WHERE action = 'job.cancel' AND target_id = ?`,
		jobID)
	if err := row.Scan(&auditRows); err != nil {
		t.Fatalf("audit count: %v", err)
	}
	if auditRows != 1 {
		t.Fatalf("audit rows: got %d, want 1", auditRows)
	}
}
// TestCancelJobAlreadyTerminal checks that cancelling a job already in a
// terminal state is rejected with 409 and that no command.cancel
// envelope reaches the connected agent. The seeded job is marked
// "succeeded"; the other terminal states are not exercised here.
func TestCancelJobAlreadyTerminal(t *testing.T) {
	t.Parallel()
	srv, ts, st := rawTestServer(t)

	// Bring a fake agent online so an envelope could be delivered if the
	// handler (wrongly) sent one.
	const hostName = "term-host"
	hostID, token := enrolHostForWS(t, srv, st, hostName)
	conn := agentDial(t, srv, ts, hostID, token)
	sendHello(t, conn, hostName)
	_ = drainUntil(t, conn, api.MsgScheduleSet)

	// Seed a job and finish it straight away.
	bg := context.Background()
	jobID := ulid.Make().String()
	finishedAt := time.Now().UTC()
	seed := store.Job{
		ID:        jobID,
		HostID:    hostID,
		Kind:      "backup",
		ActorKind: "user",
		CreatedAt: finishedAt,
	}
	if err := st.CreateJob(bg, seed); err != nil {
		t.Fatalf("create job: %v", err)
	}
	if err := st.MarkJobFinished(bg, jobID, "succeeded", 0, nil, "", finishedAt); err != nil {
		t.Fatalf("mark finished: %v", err)
	}

	session := loginAsAdmin(t, st)
	req, _ := stdhttp.NewRequest(stdhttp.MethodPost, ts.URL+"/api/jobs/"+jobID+"/cancel", nil)
	req.AddCookie(session)
	resp, err := stdhttp.DefaultClient.Do(req)
	if err != nil {
		t.Fatalf("do: %v", err)
	}
	defer resp.Body.Close()
	if resp.StatusCode != stdhttp.StatusConflict {
		t.Fatalf("status: got %d, want 409", resp.StatusCode)
	}

	// Watch the socket for 300ms in total; any command.cancel frame seen
	// in that window is a failure. The first read error (normally the
	// timeout) ends the watch.
	drainCtx, stop := context.WithTimeout(context.Background(), 300*time.Millisecond)
	defer stop()
	for {
		kind, frame, readErr := conn.Read(drainCtx)
		if readErr != nil {
			return
		}
		if kind != websocket.MessageText {
			continue
		}
		if strings.Contains(string(frame), `"command.cancel"`) {
			t.Fatalf("unexpected command.cancel envelope for terminal job")
		}
	}
}
// TestCancelJobNotFound posts a cancel for a freshly generated job ID
// that was never stored and expects the endpoint to answer 404.
func TestCancelJobNotFound(t *testing.T) {
	t.Parallel()
	_, ts, st := rawTestServer(t)

	session := loginAsAdmin(t, st)
	target := ts.URL + "/api/jobs/" + ulid.Make().String() + "/cancel"
	req, _ := stdhttp.NewRequest(stdhttp.MethodPost, target, nil)
	req.AddCookie(session)

	resp, err := stdhttp.DefaultClient.Do(req)
	if err != nil {
		t.Fatalf("do: %v", err)
	}
	defer resp.Body.Close()

	if got := resp.StatusCode; got != stdhttp.StatusNotFound {
		t.Fatalf("status: got %d, want 404", got)
	}
}
// TestCancelJobHostOffline seeds a started job on a host that exists in
// the store but never opens a WebSocket connection, and expects the
// cancel endpoint to answer 503.
func TestCancelJobHostOffline(t *testing.T) {
	t.Parallel()
	_, ts, st := rawTestServer(t)
	bg := context.Background()

	// Register the host directly in the store; no agent dials in for it.
	hostID := ulid.Make().String()
	host := store.Host{
		ID:         hostID,
		Name:       "offline-host",
		OS:         "linux",
		Arch:       "amd64",
		EnrolledAt: time.Now().UTC(),
	}
	if err := st.CreateHost(bg, host, "deadbeef", ""); err != nil {
		t.Fatalf("create host: %v", err)
	}

	// Seed a job in the started state so the terminal-state check passes
	// and the handler reaches the connectivity check.
	jobID := ulid.Make().String()
	startedAt := time.Now().UTC()
	seed := store.Job{
		ID:        jobID,
		HostID:    hostID,
		Kind:      "backup",
		ActorKind: "user",
		CreatedAt: startedAt,
	}
	if err := st.CreateJob(bg, seed); err != nil {
		t.Fatalf("create job: %v", err)
	}
	if err := st.MarkJobStarted(bg, jobID, startedAt); err != nil {
		t.Fatalf("mark started: %v", err)
	}

	session := loginAsAdmin(t, st)
	req, _ := stdhttp.NewRequest(stdhttp.MethodPost, ts.URL+"/api/jobs/"+jobID+"/cancel", nil)
	req.AddCookie(session)
	resp, err := stdhttp.DefaultClient.Do(req)
	if err != nil {
		t.Fatalf("do: %v", err)
	}
	defer resp.Body.Close()

	if got := resp.StatusCode; got != stdhttp.StatusServiceUnavailable {
		t.Fatalf("status: got %d, want 503", got)
	}
}
+6
View File
@@ -178,6 +178,12 @@ func (s *Server) routes(r chi.Router) {
r.Post("/hosts/{id}/repo/prune", s.handleRunRepoPrune)
r.Post("/hosts/{id}/repo/check", s.handleRunRepoCheck)
r.Post("/hosts/{id}/repo/unlock", s.handleRunRepoUnlock)
// Cancel a running job. Operator-driven, sends command.cancel
// to the agent which kills the restic subprocess; the agent's
// resulting job.finished (status=canceled) is what flips the
// job row.
r.Post("/jobs/{id}/cancel", s.handleCancelJob)
})
// Per-source-group Run-now (HTMX form action). Available even