Phase 3 — Restore (P3-X1, X2, 01, 02, 03, 09, X3-X6) #6
+88
-38
@@ -210,6 +210,45 @@ type dispatcher struct {
|
|||||||
bwMu sync.Mutex
|
bwMu sync.Mutex
|
||||||
bwUpKBps int
|
bwUpKBps int
|
||||||
bwDownKBps int
|
bwDownKBps int
|
||||||
|
|
||||||
|
// Per-running-job cancellation handles. Populated when runJob
|
||||||
|
// spawns the goroutine, removed when it returns. Looked up by
|
||||||
|
// the command.cancel handler (server → agent) to abort an
|
||||||
|
// in-flight restic invocation.
|
||||||
|
cancelMu sync.Mutex
|
||||||
|
cancels map[string]context.CancelFunc
|
||||||
|
}
|
||||||
|
|
||||||
|
// trackJob registers a cancel func for an in-flight job and returns a
|
||||||
|
// cleanup that removes it. Call cleanup when the job goroutine exits
|
||||||
|
// regardless of outcome — runs even on panic.
|
||||||
|
func (d *dispatcher) trackJob(jobID string, cancel context.CancelFunc) func() {
|
||||||
|
d.cancelMu.Lock()
|
||||||
|
if d.cancels == nil {
|
||||||
|
d.cancels = make(map[string]context.CancelFunc)
|
||||||
|
}
|
||||||
|
d.cancels[jobID] = cancel
|
||||||
|
d.cancelMu.Unlock()
|
||||||
|
return func() {
|
||||||
|
d.cancelMu.Lock()
|
||||||
|
delete(d.cancels, jobID)
|
||||||
|
d.cancelMu.Unlock()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// cancelJob fires the cancel func for jobID if there is one and
|
||||||
|
// returns whether the job was actually known. The runner is expected
|
||||||
|
// to surface the resulting context.Canceled as a JobCancelled status
|
||||||
|
// in its job.finished envelope (see runner.sendFinished).
|
||||||
|
func (d *dispatcher) cancelJob(jobID string) bool {
|
||||||
|
d.cancelMu.Lock()
|
||||||
|
cancel, ok := d.cancels[jobID]
|
||||||
|
d.cancelMu.Unlock()
|
||||||
|
if !ok {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
cancel()
|
||||||
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
func (d *dispatcher) handle(ctx context.Context, env api.Envelope, tx wsclient.Sender) error {
|
func (d *dispatcher) handle(ctx context.Context, env api.Envelope, tx wsclient.Sender) error {
|
||||||
@@ -222,8 +261,18 @@ func (d *dispatcher) handle(ctx context.Context, env api.Envelope, tx wsclient.S
|
|||||||
return d.runJob(ctx, p, tx)
|
return d.runJob(ctx, p, tx)
|
||||||
|
|
||||||
case api.MsgCommandCancel:
|
case api.MsgCommandCancel:
|
||||||
// TODO(P2): cancellation requires keeping a job→cancelFunc map.
|
var p api.CommandCancelPayload
|
||||||
slog.Info("ws agent: command.cancel received (cancellation lands in P2)", "id", env.ID)
|
if err := env.UnmarshalPayload(&p); err != nil {
|
||||||
|
return fmt.Errorf("command.cancel: %w", err)
|
||||||
|
}
|
||||||
|
if d.cancelJob(p.JobID) {
|
||||||
|
slog.Info("ws agent: command.cancel applied", "job_id", p.JobID)
|
||||||
|
} else {
|
||||||
|
// Job already finished or was never seen on this agent.
|
||||||
|
// Not an error — operator may have raced cancel against
|
||||||
|
// natural completion. Server-side state is authoritative.
|
||||||
|
slog.Info("ws agent: command.cancel for unknown job (already finished?)", "job_id", p.JobID)
|
||||||
|
}
|
||||||
|
|
||||||
case api.MsgScheduleSet:
|
case api.MsgScheduleSet:
|
||||||
var p api.ScheduleSetPayload
|
var p api.ScheduleSetPayload
|
||||||
@@ -374,6 +423,25 @@ func (d *dispatcher) runJob(ctx context.Context, p api.CommandRunPayload, tx wsc
|
|||||||
LimitDownloadKBps: downKBps,
|
LimitDownloadKBps: downKBps,
|
||||||
}, tx, time.Second)
|
}, tx, time.Second)
|
||||||
|
|
||||||
|
// spawn wraps the kind-specific goroutine: derives a per-job
|
||||||
|
// cancellable context from the connection-scoped ctx, registers
|
||||||
|
// the cancel func so command.cancel can fire it, deregisters on
|
||||||
|
// completion. Per-job ctx means canceling one job doesn't kill
|
||||||
|
// any other in-flight invocations.
|
||||||
|
spawn := func(name string, fn func(ctx context.Context) error) {
|
||||||
|
jobCtx, cancel := context.WithCancel(ctx)
|
||||||
|
cleanup := d.trackJob(p.JobID, cancel)
|
||||||
|
go func() {
|
||||||
|
defer cleanup()
|
||||||
|
defer cancel() // release ctx resources on goroutine exit
|
||||||
|
if err := fn(jobCtx); err != nil {
|
||||||
|
slog.Warn("agent: "+name+" job failed", "job_id", p.JobID, "err", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
slog.Info("agent: "+name+" job complete", "job_id", p.JobID)
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
switch p.Kind {
|
switch p.Kind {
|
||||||
case api.JobBackup:
|
case api.JobBackup:
|
||||||
// Includes/Excludes/Tag come from the source group resolved
|
// Includes/Excludes/Tag come from the source group resolved
|
||||||
@@ -391,22 +459,14 @@ func (d *dispatcher) runJob(ctx context.Context, p api.CommandRunPayload, tx wsc
|
|||||||
slog.Info("agent: accepting backup job",
|
slog.Info("agent: accepting backup job",
|
||||||
"job_id", p.JobID, "paths", paths, "excludes", p.Excludes, "tag", p.Tag)
|
"job_id", p.JobID, "paths", paths, "excludes", p.Excludes, "tag", p.Tag)
|
||||||
hooks := runner.BackupHooks{Pre: p.PreHook, Post: p.PostHook}
|
hooks := runner.BackupHooks{Pre: p.PreHook, Post: p.PostHook}
|
||||||
go func() {
|
spawn("backup", func(jobCtx context.Context) error {
|
||||||
if err := r.RunBackup(ctx, p.JobID, paths, p.Excludes, tags, hooks); err != nil {
|
return r.RunBackup(jobCtx, p.JobID, paths, p.Excludes, tags, hooks)
|
||||||
slog.Warn("agent: backup job failed", "job_id", p.JobID, "err", err)
|
})
|
||||||
return
|
|
||||||
}
|
|
||||||
slog.Info("agent: backup job complete", "job_id", p.JobID)
|
|
||||||
}()
|
|
||||||
case api.JobInit:
|
case api.JobInit:
|
||||||
slog.Info("agent: accepting init job", "job_id", p.JobID)
|
slog.Info("agent: accepting init job", "job_id", p.JobID)
|
||||||
go func() {
|
spawn("init", func(jobCtx context.Context) error {
|
||||||
if err := r.RunInit(ctx, p.JobID); err != nil {
|
return r.RunInit(jobCtx, p.JobID)
|
||||||
slog.Warn("agent: init job failed", "job_id", p.JobID, "err", err)
|
})
|
||||||
return
|
|
||||||
}
|
|
||||||
slog.Info("agent: init job complete", "job_id", p.JobID)
|
|
||||||
}()
|
|
||||||
case api.JobForget:
|
case api.JobForget:
|
||||||
if len(p.ForgetGroups) == 0 {
|
if len(p.ForgetGroups) == 0 {
|
||||||
// Hard-error rather than fall back to a single-policy form:
|
// Hard-error rather than fall back to a single-policy form:
|
||||||
@@ -433,13 +493,9 @@ func (d *dispatcher) runJob(ctx context.Context, p api.CommandRunPayload, tx wsc
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
slog.Info("agent: accepting forget job", "job_id", p.JobID, "groups", len(groups))
|
slog.Info("agent: accepting forget job", "job_id", p.JobID, "groups", len(groups))
|
||||||
go func() {
|
spawn("forget", func(jobCtx context.Context) error {
|
||||||
if err := r.RunForget(ctx, p.JobID, groups); err != nil {
|
return r.RunForget(jobCtx, p.JobID, groups)
|
||||||
slog.Warn("agent: forget job failed", "job_id", p.JobID, "err", err)
|
})
|
||||||
return
|
|
||||||
}
|
|
||||||
slog.Info("agent: forget job complete", "job_id", p.JobID)
|
|
||||||
}()
|
|
||||||
case api.JobPrune:
|
case api.JobPrune:
|
||||||
// Prune may require admin creds (delete authority on rest-server).
|
// Prune may require admin creds (delete authority on rest-server).
|
||||||
runCreds := creds
|
runCreds := creds
|
||||||
@@ -462,29 +518,23 @@ func (d *dispatcher) runJob(ctx context.Context, p api.CommandRunPayload, tx wsc
|
|||||||
LimitDownloadKBps: downKBps,
|
LimitDownloadKBps: downKBps,
|
||||||
}, tx, time.Second)
|
}, tx, time.Second)
|
||||||
slog.Info("agent: accepting prune job", "job_id", p.JobID, "admin_creds", p.RequiresAdminCreds)
|
slog.Info("agent: accepting prune job", "job_id", p.JobID, "admin_creds", p.RequiresAdminCreds)
|
||||||
go func() {
|
spawn("prune", func(jobCtx context.Context) error {
|
||||||
if err := prr.RunPrune(ctx, p.JobID); err != nil {
|
return prr.RunPrune(jobCtx, p.JobID)
|
||||||
slog.Warn("agent: prune job failed", "job_id", p.JobID, "err", err)
|
})
|
||||||
}
|
|
||||||
}()
|
|
||||||
case api.JobCheck:
|
case api.JobCheck:
|
||||||
subset := 0
|
subset := 0
|
||||||
if len(p.Args) > 0 {
|
if len(p.Args) > 0 {
|
||||||
subset, _ = strconv.Atoi(p.Args[0])
|
subset, _ = strconv.Atoi(p.Args[0])
|
||||||
}
|
}
|
||||||
slog.Info("agent: accepting check job", "job_id", p.JobID, "subset_pct", subset)
|
slog.Info("agent: accepting check job", "job_id", p.JobID, "subset_pct", subset)
|
||||||
go func() {
|
spawn("check", func(jobCtx context.Context) error {
|
||||||
if err := r.RunCheck(ctx, p.JobID, subset); err != nil {
|
return r.RunCheck(jobCtx, p.JobID, subset)
|
||||||
slog.Warn("agent: check job failed", "job_id", p.JobID, "err", err)
|
})
|
||||||
}
|
|
||||||
}()
|
|
||||||
case api.JobUnlock:
|
case api.JobUnlock:
|
||||||
slog.Info("agent: accepting unlock job", "job_id", p.JobID)
|
slog.Info("agent: accepting unlock job", "job_id", p.JobID)
|
||||||
go func() {
|
spawn("unlock", func(jobCtx context.Context) error {
|
||||||
if err := r.RunUnlock(ctx, p.JobID); err != nil {
|
return r.RunUnlock(jobCtx, p.JobID)
|
||||||
slog.Warn("agent: unlock job failed", "job_id", p.JobID, "err", err)
|
})
|
||||||
}
|
|
||||||
}()
|
|
||||||
default:
|
default:
|
||||||
return fmt.Errorf("kind %q not implemented yet (Phase 2 lands the rest)", p.Kind)
|
return fmt.Errorf("kind %q not implemented yet (Phase 2 lands the rest)", p.Kind)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,100 @@
|
|||||||
|
package runner
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
|
||||||
|
)
|
||||||
|
|
||||||
|
// safeSender is a thread-safe variant of fakeSender. The cancel test
|
||||||
|
// has the runner goroutine sending envelopes while the test goroutine
|
||||||
|
// is reading the slice, so we need a mutex.
|
||||||
|
type safeSender struct {
|
||||||
|
mu sync.Mutex
|
||||||
|
envs []api.Envelope
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *safeSender) Send(e api.Envelope) error {
|
||||||
|
s.mu.Lock()
|
||||||
|
s.envs = append(s.envs, e)
|
||||||
|
s.mu.Unlock()
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *safeSender) snapshot() []api.Envelope {
|
||||||
|
s.mu.Lock()
|
||||||
|
defer s.mu.Unlock()
|
||||||
|
out := make([]api.Envelope, len(s.envs))
|
||||||
|
copy(out, s.envs)
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestRunBackupCanceledMidRunReportsCanceled spawns a backup against
|
||||||
|
// a fake restic that sleeps for 30 seconds, cancels the context after
|
||||||
|
// a short delay, and confirms the resulting job.finished envelope
|
||||||
|
// reports status=canceled (not failed).
|
||||||
|
func TestRunBackupCanceledMidRunReportsCanceled(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
// Fake restic: replace the shell with a long sleep via `exec` so the
|
||||||
|
// process tree is one process — SIGTERM goes directly to sleep and
|
||||||
|
// it exits. Without `exec`, the shell stays in the foreground while
|
||||||
|
// sleep is its child; SIGTERM-to-shell may or may not propagate to
|
||||||
|
// sleep depending on the shell, leading to the WaitDelay-then-
|
||||||
|
// SIGKILL fallback path firing — slower and noisier.
|
||||||
|
bin := setupScript(t, `exec sleep 30`)
|
||||||
|
|
||||||
|
tx := &safeSender{}
|
||||||
|
r := New(Config{ResticBin: bin}, tx, 0)
|
||||||
|
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
done := make(chan error, 1)
|
||||||
|
go func() {
|
||||||
|
done <- r.RunBackup(ctx, "job-cancel", []string{"/tmp/x"}, nil, nil, BackupHooks{})
|
||||||
|
}()
|
||||||
|
|
||||||
|
// Wait long enough for the subprocess to actually start before
|
||||||
|
// canceling. Without this, exec.CommandContext can race the
|
||||||
|
// kill against Start and produce a different error path.
|
||||||
|
time.Sleep(150 * time.Millisecond)
|
||||||
|
cancel()
|
||||||
|
|
||||||
|
select {
|
||||||
|
case <-done:
|
||||||
|
case <-time.After(15 * time.Second):
|
||||||
|
t.Fatal("RunBackup did not return within 15s of cancel")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Locate the job.finished envelope and check its status.
|
||||||
|
envs := tx.snapshot()
|
||||||
|
var finEnv api.Envelope
|
||||||
|
var found bool
|
||||||
|
for _, e := range envs {
|
||||||
|
if e.Type == api.MsgJobFinished {
|
||||||
|
finEnv = e
|
||||||
|
found = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !found {
|
||||||
|
t.Fatal("no job.finished envelope was sent")
|
||||||
|
}
|
||||||
|
var fin api.JobFinishedPayload
|
||||||
|
if err := finEnv.UnmarshalPayload(&fin); err != nil {
|
||||||
|
t.Fatalf("unmarshal: %v", err)
|
||||||
|
}
|
||||||
|
if fin.Status != api.JobCancelled {
|
||||||
|
t.Fatalf("status: got %q, want %q", fin.Status, api.JobCancelled)
|
||||||
|
}
|
||||||
|
if fin.ExitCode != 130 {
|
||||||
|
t.Errorf("exit_code: got %d, want 130 (POSIX cancel convention)", fin.ExitCode)
|
||||||
|
}
|
||||||
|
// The error message should be empty for canceled jobs (see runner.sendFinished).
|
||||||
|
if !strings.HasPrefix(fin.Error, "") || fin.Error != "" {
|
||||||
|
t.Errorf("error: got %q, want empty for canceled jobs", fin.Error)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -95,8 +95,10 @@ func (r *Runner) streamHandler(jobID string, seq *atomic.Int64) restic.LineHandl
|
|||||||
}
|
}
|
||||||
|
|
||||||
// sendFinished ships a job.finished envelope. err==nil → succeeded;
|
// sendFinished ships a job.finished envelope. err==nil → succeeded;
|
||||||
// otherwise failed. statsBlob is forwarded as JobFinishedPayload.Stats.
|
// otherwise failed (or canceled if ctx was canceled — operator
|
||||||
func (r *Runner) sendFinished(jobID string, finishedAt time.Time, err error, statsBlob json.RawMessage) {
|
// hit the Cancel button or the agent is shutting down).
|
||||||
|
// statsBlob is forwarded as JobFinishedPayload.Stats.
|
||||||
|
func (r *Runner) sendFinished(ctx context.Context, jobID string, finishedAt time.Time, err error, statsBlob json.RawMessage) {
|
||||||
status := api.JobSucceeded
|
status := api.JobSucceeded
|
||||||
exit := 0
|
exit := 0
|
||||||
errMsg := ""
|
errMsg := ""
|
||||||
@@ -104,6 +106,16 @@ func (r *Runner) sendFinished(jobID string, finishedAt time.Time, err error, sta
|
|||||||
status = api.JobFailed
|
status = api.JobFailed
|
||||||
exit = -1
|
exit = -1
|
||||||
errMsg = err.Error()
|
errMsg = err.Error()
|
||||||
|
// If the context was canceled, the failure is operator-driven
|
||||||
|
// (or shutdown). Surface as JobCancelled so the UI shows a
|
||||||
|
// neutral "canceled" state rather than a red "failed" one.
|
||||||
|
// exec.CommandContext returns the process's exit error on
|
||||||
|
// ctx-cancel, which we'd otherwise rebadge as failed.
|
||||||
|
if ctxErr := ctx.Err(); ctxErr != nil {
|
||||||
|
status = api.JobCancelled
|
||||||
|
exit = 130 // POSIX convention for SIGINT/SIGTERM-killed
|
||||||
|
errMsg = "" // no need to surface the underlying restic error
|
||||||
|
}
|
||||||
}
|
}
|
||||||
finEnv, _ := api.Marshal(api.MsgJobFinished, jobID, api.JobFinishedPayload{
|
finEnv, _ := api.Marshal(api.MsgJobFinished, jobID, api.JobFinishedPayload{
|
||||||
JobID: jobID,
|
JobID: jobID,
|
||||||
@@ -138,7 +150,7 @@ func (r *Runner) RunBackup(ctx context.Context, jobID string, paths, excludes, t
|
|||||||
if hooks.Pre != "" {
|
if hooks.Pre != "" {
|
||||||
if err := r.runHook(ctx, jobID, "pre", hooks.Pre, "", &seq); err != nil {
|
if err := r.runHook(ctx, jobID, "pre", hooks.Pre, "", &seq); err != nil {
|
||||||
finishedAt := time.Now().UTC()
|
finishedAt := time.Now().UTC()
|
||||||
r.sendFinished(jobID, finishedAt, err, nil)
|
r.sendFinished(ctx, jobID, finishedAt, err, nil)
|
||||||
return fmt.Errorf("pre_hook failed: %w", err)
|
return fmt.Errorf("pre_hook failed: %w", err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -206,7 +218,7 @@ func (r *Runner) RunBackup(ctx context.Context, jobID string, paths, excludes, t
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
r.sendFinished(jobID, finishedAt, err, statsBlob)
|
r.sendFinished(ctx, jobID, finishedAt, err, statsBlob)
|
||||||
|
|
||||||
// On a successful backup, refresh the server's snapshot projection.
|
// On a successful backup, refresh the server's snapshot projection.
|
||||||
// We do this *after* job.finished so the UI sees the job land first;
|
// We do this *after* job.finished so the UI sees the job land first;
|
||||||
@@ -240,7 +252,7 @@ func (r *Runner) RunInit(ctx context.Context, jobID string) error {
|
|||||||
var seq atomic.Int64
|
var seq atomic.Int64
|
||||||
err := env.RunInit(ctx, r.streamHandler(jobID, &seq))
|
err := env.RunInit(ctx, r.streamHandler(jobID, &seq))
|
||||||
finishedAt := time.Now().UTC()
|
finishedAt := time.Now().UTC()
|
||||||
r.sendFinished(jobID, finishedAt, err, nil)
|
r.sendFinished(ctx, jobID, finishedAt, err, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("runner init: %w", err)
|
return fmt.Errorf("runner init: %w", err)
|
||||||
}
|
}
|
||||||
@@ -262,7 +274,7 @@ func (r *Runner) RunForget(ctx context.Context, jobID string, groups []restic.Fo
|
|||||||
var seq atomic.Int64
|
var seq atomic.Int64
|
||||||
err := env.RunForget(ctx, groups, r.streamHandler(jobID, &seq))
|
err := env.RunForget(ctx, groups, r.streamHandler(jobID, &seq))
|
||||||
finishedAt := time.Now().UTC()
|
finishedAt := time.Now().UTC()
|
||||||
r.sendFinished(jobID, finishedAt, err, nil)
|
r.sendFinished(ctx, jobID, finishedAt, err, nil)
|
||||||
|
|
||||||
// Refresh the server's snapshot projection — forget rewrites the
|
// Refresh the server's snapshot projection — forget rewrites the
|
||||||
// index so the host's snapshot list almost certainly shrunk.
|
// index so the host's snapshot list almost certainly shrunk.
|
||||||
@@ -300,7 +312,7 @@ func (r *Runner) RunPrune(ctx context.Context, jobID string) error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
r.sendFinished(jobID, finishedAt, err, nil)
|
r.sendFinished(ctx, jobID, finishedAt, err, nil)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("runner prune: %w", err)
|
return fmt.Errorf("runner prune: %w", err)
|
||||||
@@ -339,7 +351,7 @@ func (r *Runner) RunCheck(ctx context.Context, jobID string, subsetPct int) erro
|
|||||||
slog.Warn("runner: stats.report after check failed", "job_id", jobID, "err", rerr)
|
slog.Warn("runner: stats.report after check failed", "job_id", jobID, "err", rerr)
|
||||||
}
|
}
|
||||||
|
|
||||||
r.sendFinished(jobID, finishedAt, err, nil)
|
r.sendFinished(ctx, jobID, finishedAt, err, nil)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("runner check: %w", err)
|
return fmt.Errorf("runner check: %w", err)
|
||||||
@@ -366,7 +378,7 @@ func (r *Runner) RunUnlock(ctx context.Context, jobID string) error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
r.sendFinished(jobID, finishedAt, err, nil)
|
r.sendFinished(ctx, jobID, finishedAt, err, nil)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("runner unlock: %w", err)
|
return fmt.Errorf("runner unlock: %w", err)
|
||||||
|
|||||||
@@ -0,0 +1,7 @@
|
|||||||
|
//go:build !windows
|
||||||
|
|
||||||
|
package restic
|
||||||
|
|
||||||
|
import "syscall"
|
||||||
|
|
||||||
|
var sigterm = syscall.SIGTERM
|
||||||
@@ -0,0 +1,12 @@
|
|||||||
|
//go:build windows
|
||||||
|
|
||||||
|
package restic
|
||||||
|
|
||||||
|
import "os"
|
||||||
|
|
||||||
|
// Windows has no SIGTERM. The closest equivalent is os.Interrupt
|
||||||
|
// (CTRL_BREAK_EVENT), but Go's exec.Cmd.Process.Signal() on Windows
|
||||||
|
// only supports os.Kill — sending anything else returns an error and
|
||||||
|
// no signal is delivered. Fall back to os.Kill so Cancel still works
|
||||||
|
// (immediate force-kill); WaitDelay is unused but harmless.
|
||||||
|
var sigterm = os.Kill
|
||||||
@@ -72,11 +72,30 @@ func (e Env) globalArgs() []string {
|
|||||||
// before the supplied subcommand args. Centralizing this so every
|
// before the supplied subcommand args. Centralizing this so every
|
||||||
// command (backup/forget/prune/check/unlock/init/stats) honors
|
// command (backup/forget/prune/check/unlock/init/stats) honors
|
||||||
// the caps without each call site having to remember.
|
// the caps without each call site having to remember.
|
||||||
|
//
|
||||||
|
// Cancellation: by default exec.CommandContext sends SIGKILL when
|
||||||
|
// ctx is canceled, which leaves restic no chance to clean up its
|
||||||
|
// repository lock. Override Cmd.Cancel to send SIGTERM first, and
|
||||||
|
// set Cmd.WaitDelay so the process is force-killed if it doesn't
|
||||||
|
// exit within five seconds. Restic responds to SIGTERM by removing
|
||||||
|
// its lock file before exiting, which is what we want when an
|
||||||
|
// operator cancels a long-running backup/restore from the UI.
|
||||||
func (e Env) resticCmd(ctx context.Context, sub ...string) *exec.Cmd {
|
func (e Env) resticCmd(ctx context.Context, sub ...string) *exec.Cmd {
|
||||||
args := append(e.globalArgs(), sub...)
|
args := append(e.globalArgs(), sub...)
|
||||||
cmd := exec.CommandContext(ctx, e.Bin, args...)
|
cmd := exec.CommandContext(ctx, e.Bin, args...)
|
||||||
cmd.Env = e.envSlice()
|
cmd.Env = e.envSlice()
|
||||||
cmd.Dir = e.WorkDir
|
cmd.Dir = e.WorkDir
|
||||||
|
cmd.Cancel = func() error {
|
||||||
|
// Cmd.Process is set after Start; Cancel only fires post-Start
|
||||||
|
// so the nil check is defensive against the documented but
|
||||||
|
// unlikely race. Signal returns ErrProcessDone if the process
|
||||||
|
// already exited; that's not a problem here either.
|
||||||
|
if cmd.Process == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return cmd.Process.Signal(sigterm)
|
||||||
|
}
|
||||||
|
cmd.WaitDelay = 5 * time.Second
|
||||||
return cmd
|
return cmd
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,86 @@
|
|||||||
|
package http
|
||||||
|
|
||||||
|
import (
|
||||||
|
stdhttp "net/http"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/go-chi/chi/v5"
|
||||||
|
"github.com/oklog/ulid/v2"
|
||||||
|
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||||
|
)
|
||||||
|
|
||||||
|
// handleCancelJob is POST /api/jobs/{id}/cancel. Sends a command.cancel
|
||||||
|
// envelope to the host that owns the job; the agent kills the running
|
||||||
|
// restic subprocess, and the resulting job.finished envelope (status =
|
||||||
|
// canceled) is what actually transitions the job row — this handler
|
||||||
|
// does not touch the jobs table directly. Returning 202 makes that
|
||||||
|
// asynchronicity explicit.
|
||||||
|
//
|
||||||
|
// 4xx cases:
|
||||||
|
// - job not found (404)
|
||||||
|
// - job already in a terminal state (409 — nothing to cancel)
|
||||||
|
// - host offline (503 — same code path the run-now endpoint uses)
|
||||||
|
//
|
||||||
|
// Audit-logged as job.cancel with the job ID as target.
|
||||||
|
func (s *Server) handleCancelJob(w stdhttp.ResponseWriter, r *stdhttp.Request) {
|
||||||
|
user, ok := s.requireUser(r)
|
||||||
|
if !ok {
|
||||||
|
writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorized", "")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
jobID := chi.URLParam(r, "id")
|
||||||
|
if jobID == "" {
|
||||||
|
writeJSONError(w, stdhttp.StatusBadRequest, "missing_job_id", "")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
job, err := s.deps.Store.GetJob(r.Context(), jobID)
|
||||||
|
if err != nil {
|
||||||
|
writeJSONError(w, stdhttp.StatusNotFound, "job_not_found", "")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
switch api.JobStatus(job.Status) {
|
||||||
|
case api.JobSucceeded, api.JobFailed, api.JobCancelled:
|
||||||
|
writeJSONError(w, stdhttp.StatusConflict, "job_terminal",
|
||||||
|
"job is already in a terminal state ("+job.Status+")")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if !s.deps.Hub.Connected(job.HostID) {
|
||||||
|
writeJSONError(w, stdhttp.StatusServiceUnavailable, "host_offline",
|
||||||
|
"agent is not connected; can't deliver cancel signal")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
env, err := api.Marshal(api.MsgCommandCancel, jobID, api.CommandCancelPayload{
|
||||||
|
JobID: jobID,
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
writeJSONError(w, stdhttp.StatusInternalServerError, "internal", "")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := s.deps.Hub.Send(r.Context(), job.HostID, env); err != nil {
|
||||||
|
writeJSONError(w, stdhttp.StatusServiceUnavailable, "host_offline", err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
var actorID *string
|
||||||
|
actor := "system"
|
||||||
|
if user != nil {
|
||||||
|
actor = "user"
|
||||||
|
actorID = &user.ID
|
||||||
|
}
|
||||||
|
_ = s.deps.Store.AppendAudit(r.Context(), store.AuditEntry{
|
||||||
|
ID: ulid.Make().String(),
|
||||||
|
UserID: actorID,
|
||||||
|
Actor: actor,
|
||||||
|
Action: "job.cancel",
|
||||||
|
TargetKind: ptr("job"),
|
||||||
|
TargetID: &jobID,
|
||||||
|
TS: time.Now().UTC(),
|
||||||
|
})
|
||||||
|
|
||||||
|
w.WriteHeader(stdhttp.StatusAccepted)
|
||||||
|
}
|
||||||
@@ -0,0 +1,204 @@
|
|||||||
|
// cancel_test.go — covers POST /api/jobs/{id}/cancel.
|
||||||
|
package http
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
stdhttp "net/http"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/coder/websocket"
|
||||||
|
"github.com/oklog/ulid/v2"
|
||||||
|
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestCancelJobRunningHappyPath: a running job's cancel endpoint sends
|
||||||
|
// a command.cancel envelope with the right job id, returns 202, and
|
||||||
|
// writes a job.cancel audit row.
|
||||||
|
func TestCancelJobRunningHappyPath(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
srv, ts, st := rawTestServer(t)
|
||||||
|
hostID, token := enrolHostForWS(t, srv, st, "cancel-host")
|
||||||
|
c := agentDial(t, srv, ts, hostID, token)
|
||||||
|
sendHello(t, c, "cancel-host")
|
||||||
|
_ = drainUntil(t, c, api.MsgScheduleSet)
|
||||||
|
|
||||||
|
// Seed a running job we can target.
|
||||||
|
jobID := ulid.Make().String()
|
||||||
|
now := time.Now().UTC()
|
||||||
|
if err := st.CreateJob(context.Background(), store.Job{
|
||||||
|
ID: jobID, HostID: hostID, Kind: "backup",
|
||||||
|
ActorKind: "user", CreatedAt: now,
|
||||||
|
}); err != nil {
|
||||||
|
t.Fatalf("create job: %v", err)
|
||||||
|
}
|
||||||
|
if err := st.MarkJobStarted(context.Background(), jobID, now); err != nil {
|
||||||
|
t.Fatalf("mark started: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
cookie := loginAsAdmin(t, st)
|
||||||
|
req, _ := stdhttp.NewRequest("POST",
|
||||||
|
ts.URL+"/api/jobs/"+jobID+"/cancel", nil)
|
||||||
|
req.AddCookie(cookie)
|
||||||
|
res, err := stdhttp.DefaultClient.Do(req)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("do: %v", err)
|
||||||
|
}
|
||||||
|
defer res.Body.Close()
|
||||||
|
if res.StatusCode != stdhttp.StatusAccepted {
|
||||||
|
t.Fatalf("status: got %d, want 202", res.StatusCode)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read the dispatched command.cancel envelope.
|
||||||
|
deadline := time.Now().Add(2 * time.Second)
|
||||||
|
var got api.Envelope
|
||||||
|
for time.Now().Before(deadline) {
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
|
||||||
|
mt, raw, rerr := c.Read(ctx)
|
||||||
|
cancel()
|
||||||
|
if rerr != nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if mt != websocket.MessageText {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if !strings.Contains(string(raw), `"command.cancel"`) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if err := json.Unmarshal(raw, &got); err != nil {
|
||||||
|
t.Fatalf("unmarshal: %v", err)
|
||||||
|
}
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if got.Type != api.MsgCommandCancel {
|
||||||
|
t.Fatalf("never received command.cancel envelope")
|
||||||
|
}
|
||||||
|
var cp api.CommandCancelPayload
|
||||||
|
if err := got.UnmarshalPayload(&cp); err != nil {
|
||||||
|
t.Fatalf("unmarshal payload: %v", err)
|
||||||
|
}
|
||||||
|
if cp.JobID != jobID {
|
||||||
|
t.Fatalf("payload job_id: got %q want %q", cp.JobID, jobID)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Audit row exists.
|
||||||
|
var n int
|
||||||
|
if err := st.DB().QueryRow(
|
||||||
|
`SELECT COUNT(*) FROM audit_log WHERE action = 'job.cancel' AND target_id = ?`,
|
||||||
|
jobID).Scan(&n); err != nil {
|
||||||
|
t.Fatalf("audit count: %v", err)
|
||||||
|
}
|
||||||
|
if n != 1 {
|
||||||
|
t.Fatalf("audit rows: got %d, want 1", n)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestCancelJobAlreadyTerminal: a job in succeeded/failed/canceled
|
||||||
|
// state returns 409 and does NOT send a WS envelope.
|
||||||
|
func TestCancelJobAlreadyTerminal(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
srv, ts, st := rawTestServer(t)
|
||||||
|
hostID, token := enrolHostForWS(t, srv, st, "term-host")
|
||||||
|
c := agentDial(t, srv, ts, hostID, token)
|
||||||
|
sendHello(t, c, "term-host")
|
||||||
|
_ = drainUntil(t, c, api.MsgScheduleSet)
|
||||||
|
|
||||||
|
jobID := ulid.Make().String()
|
||||||
|
now := time.Now().UTC()
|
||||||
|
if err := st.CreateJob(context.Background(), store.Job{
|
||||||
|
ID: jobID, HostID: hostID, Kind: "backup",
|
||||||
|
ActorKind: "user", CreatedAt: now,
|
||||||
|
}); err != nil {
|
||||||
|
t.Fatalf("create job: %v", err)
|
||||||
|
}
|
||||||
|
if err := st.MarkJobFinished(context.Background(), jobID, "succeeded", 0, nil, "", now); err != nil {
|
||||||
|
t.Fatalf("mark finished: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
cookie := loginAsAdmin(t, st)
|
||||||
|
req, _ := stdhttp.NewRequest("POST",
|
||||||
|
ts.URL+"/api/jobs/"+jobID+"/cancel", nil)
|
||||||
|
req.AddCookie(cookie)
|
||||||
|
res, err := stdhttp.DefaultClient.Do(req)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("do: %v", err)
|
||||||
|
}
|
||||||
|
defer res.Body.Close()
|
||||||
|
if res.StatusCode != stdhttp.StatusConflict {
|
||||||
|
t.Fatalf("status: got %d, want 409", res.StatusCode)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Drain — no command.cancel should arrive.
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 300*time.Millisecond)
|
||||||
|
defer cancel()
|
||||||
|
for {
|
||||||
|
mt, raw, rerr := c.Read(ctx)
|
||||||
|
if rerr != nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if mt == websocket.MessageText && strings.Contains(string(raw), `"command.cancel"`) {
|
||||||
|
t.Fatalf("unexpected command.cancel envelope for terminal job")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestCancelJobNotFound: 404 for a job id that doesn't exist.
|
||||||
|
func TestCancelJobNotFound(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
_, ts, st := rawTestServer(t)
|
||||||
|
cookie := loginAsAdmin(t, st)
|
||||||
|
req, _ := stdhttp.NewRequest("POST",
|
||||||
|
ts.URL+"/api/jobs/"+ulid.Make().String()+"/cancel", nil)
|
||||||
|
req.AddCookie(cookie)
|
||||||
|
res, err := stdhttp.DefaultClient.Do(req)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("do: %v", err)
|
||||||
|
}
|
||||||
|
defer res.Body.Close()
|
||||||
|
if res.StatusCode != stdhttp.StatusNotFound {
|
||||||
|
t.Fatalf("status: got %d, want 404", res.StatusCode)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestCancelJobHostOffline: a queued/running job whose host has no
|
||||||
|
// active WS connection returns 503.
|
||||||
|
func TestCancelJobHostOffline(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
_, ts, st := rawTestServer(t)
|
||||||
|
// Create a host but don't connect a WS for it.
|
||||||
|
hostID := ulid.Make().String()
|
||||||
|
if err := st.CreateHost(context.Background(), store.Host{
|
||||||
|
ID: hostID, Name: "offline-host", OS: "linux", Arch: "amd64",
|
||||||
|
EnrolledAt: time.Now().UTC(),
|
||||||
|
}, "deadbeef", ""); err != nil {
|
||||||
|
t.Fatalf("create host: %v", err)
|
||||||
|
}
|
||||||
|
jobID := ulid.Make().String()
|
||||||
|
now := time.Now().UTC()
|
||||||
|
if err := st.CreateJob(context.Background(), store.Job{
|
||||||
|
ID: jobID, HostID: hostID, Kind: "backup",
|
||||||
|
ActorKind: "user", CreatedAt: now,
|
||||||
|
}); err != nil {
|
||||||
|
t.Fatalf("create job: %v", err)
|
||||||
|
}
|
||||||
|
if err := st.MarkJobStarted(context.Background(), jobID, now); err != nil {
|
||||||
|
t.Fatalf("mark started: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
cookie := loginAsAdmin(t, st)
|
||||||
|
req, _ := stdhttp.NewRequest("POST",
|
||||||
|
ts.URL+"/api/jobs/"+jobID+"/cancel", nil)
|
||||||
|
req.AddCookie(cookie)
|
||||||
|
res, err := stdhttp.DefaultClient.Do(req)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("do: %v", err)
|
||||||
|
}
|
||||||
|
defer res.Body.Close()
|
||||||
|
if res.StatusCode != stdhttp.StatusServiceUnavailable {
|
||||||
|
t.Fatalf("status: got %d, want 503", res.StatusCode)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -178,6 +178,12 @@ func (s *Server) routes(r chi.Router) {
|
|||||||
r.Post("/hosts/{id}/repo/prune", s.handleRunRepoPrune)
|
r.Post("/hosts/{id}/repo/prune", s.handleRunRepoPrune)
|
||||||
r.Post("/hosts/{id}/repo/check", s.handleRunRepoCheck)
|
r.Post("/hosts/{id}/repo/check", s.handleRunRepoCheck)
|
||||||
r.Post("/hosts/{id}/repo/unlock", s.handleRunRepoUnlock)
|
r.Post("/hosts/{id}/repo/unlock", s.handleRunRepoUnlock)
|
||||||
|
|
||||||
|
// Cancel a running job. Operator-driven, sends command.cancel
|
||||||
|
// to the agent which kills the restic subprocess; the agent's
|
||||||
|
// resulting job.finished (status=canceled) is what flips the
|
||||||
|
// job row.
|
||||||
|
r.Post("/jobs/{id}/cancel", s.handleCancelJob)
|
||||||
})
|
})
|
||||||
|
|
||||||
// Per-source-group Run-now (HTMX form action). Available even
|
// Per-source-group Run-now (HTMX form action). Available even
|
||||||
|
|||||||
Reference in New Issue
Block a user