agent+server: P2R-11 pre/post hook execution for backup jobs

Agent: new runner.BackupHooks struct + runHook helper invoked via
/bin/sh -c (cmd.exe /C on Windows). pre_hook non-zero exit aborts
the backup; post_hook always runs with RM_JOB_STATUS=succeeded|failed
in env. Output streamed as 'hook(<phase>): …' log.stream lines.
Hooks only run for kind=backup (other kinds skip both phases).

Server: resolveBackupHooks resolves group → host default → empty,
decrypts via crypto.AEAD with per-slot ad bytes, plumbs plaintext
into CommandRunPayload for both schedule.fire and per-group
Run-now dispatch sites. Decrypt failures are logged and degrade to
the next fallback (group → host default → no hook) so a malformed
blob can't poison every backup.
This commit is contained in:
2026-05-04 10:57:28 +01:00
parent 18b0bf976d
commit 7b1990cf11
11 changed files with 379 additions and 52 deletions
+2 -1
View File
@@ -359,8 +359,9 @@ func (d *dispatcher) runJob(ctx context.Context, p api.CommandRunPayload, tx wsc
}
slog.Info("agent: accepting backup job",
"job_id", p.JobID, "paths", paths, "excludes", p.Excludes, "tag", p.Tag)
hooks := runner.BackupHooks{Pre: p.PreHook, Post: p.PostHook}
go func() {
if err := r.RunBackup(ctx, p.JobID, paths, p.Excludes, tags); err != nil {
if err := r.RunBackup(ctx, p.JobID, paths, p.Excludes, tags, hooks); err != nil {
slog.Warn("agent: backup job failed", "job_id", p.JobID, "err", err)
return
}
+106
View File
@@ -0,0 +1,106 @@
// hooks.go — pre/post backup hooks for the agent runner (P2R-11).
//
// Hooks fire only for backup jobs (the runner's other kinds —
// init/forget/prune/check/unlock — call shell scripts that touch
// repo internals; running operator hooks for those would be
// surprising). Hook bodies arrive plaintext on the wire (server
// decrypted before the WS push). The agent never persists them
// to disk; they live in memory for the lifetime of one job.
//
// Failure semantics:
// - pre_hook non-zero exit aborts the backup: the runner returns
// the error, the job is recorded as failed, and the actual
// restic invocation never runs.
// - post_hook non-zero exit is logged as a warning in the agent's
// own log (slog) but does NOT change the job status — the operator
// wants the backup result preserved even if the cleanup step
// misbehaved.
//
// Streaming: each line of the hook's stdout/stderr is shipped as a
// log.stream envelope with payload prefixed `hook(<phase>): ` so the
// live log viewer can visually separate it from restic's own output.
package runner
import (
	"bufio"
	"context"
	"fmt"
	"io"
	"os"
	"os/exec"
	"runtime"
	"sync/atomic"
	"time"

	"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
)
// runHook executes script via the host shell (see defaultShell) and
// streams its stdout/stderr through pumpHookLines.
//
// status is the value passed as RM_JOB_STATUS in the env (empty for
// pre-hooks, in which case the variable is omitted; the final job
// status — "succeeded" or "failed" — for post-hooks). RM_JOB_ID and
// RM_HOOK_PHASE are always set. seq is the shared log sequence
// counter for the job. An empty script is a no-op and returns nil.
//
// Returns an error when a pipe cannot be created, the shell cannot be
// started, or the hook did not finish cleanly (non-zero exit, or
// killed because ctx was cancelled).
func (r *Runner) runHook(ctx context.Context, jobID, phase, script, status string, seq *atomic.Int64) error {
	if script == "" {
		return nil
	}
	shell, flag := defaultShell()
	cmd := exec.CommandContext(ctx, shell, flag, script)

	// The hook runs with an explicit, minimal environment instead of
	// inheriting the agent's. The fixed search path below is only
	// meaningful to /bin/sh; handing it to cmd.exe would leave Windows
	// hooks unable to locate any external program, so the agent's own
	// PATH is passed through on that platform.
	searchPath := "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
	if runtime.GOOS == "windows" {
		searchPath = os.Getenv("PATH")
	}
	cmd.Env = []string{"PATH=" + searchPath}
	if status != "" {
		cmd.Env = append(cmd.Env, "RM_JOB_STATUS="+status)
	}
	cmd.Env = append(cmd.Env, "RM_JOB_ID="+jobID, "RM_HOOK_PHASE="+phase)

	stdout, err := cmd.StdoutPipe()
	if err != nil {
		return fmt.Errorf("hook %s: stdout pipe: %w", phase, err)
	}
	stderr, err := cmd.StderrPipe()
	if err != nil {
		return fmt.Errorf("hook %s: stderr pipe: %w", phase, err)
	}
	if err := cmd.Start(); err != nil {
		return fmt.Errorf("hook %s: start: %w", phase, err)
	}

	// Both pipes must be read to completion before Wait is called:
	// Wait closes them, so calling it earlier could drop output.
	done := make(chan struct{}, 2)
	go func() { r.pumpHookLines(stdout, "stdout", phase, jobID, seq); done <- struct{}{} }()
	go func() { r.pumpHookLines(stderr, "stderr", phase, jobID, seq); done <- struct{}{} }()
	<-done
	<-done

	if werr := cmd.Wait(); werr != nil {
		return fmt.Errorf("hook %s exited non-zero: %w", phase, werr)
	}
	return nil
}
// pumpHookLines reads rd line by line and ships every line as a
// log.stream envelope whose payload is prefixed with
// "hook(<phase>): " so the live log can visually separate hook output
// from everything else. stream names the source ("stdout"/"stderr")
// and seq is the job-wide sequence counter shared with other writers.
//
// Lines may be up to 256 KiB long. If the scanner stops early (an
// over-long line or a read error) a single notice line is emitted and
// the rest of rd is drained and discarded. Draining matters: if
// nobody keeps reading, the hook blocks as soon as the pipe buffer
// fills, never exits, and the caller waits on it forever.
func (r *Runner) pumpHookLines(rd io.Reader, stream, phase, jobID string, seq *atomic.Int64) {
	prefix := "hook(" + phase + "): "
	emit := func(text string) {
		// Log streaming is best-effort: a marshal or send failure must
		// not interrupt the hook, so both errors are dropped on purpose.
		env, _ := api.Marshal(api.MsgLogStream, "", api.LogStreamLine{
			JobID:   jobID,
			Seq:     seq.Add(1),
			TS:      time.Now().UTC(),
			Stream:  api.LogStream(stream),
			Payload: prefix + text,
		})
		_ = r.tx.Send(env)
	}

	scanner := bufio.NewScanner(rd)
	scanner.Buffer(make([]byte, 0, 64*1024), 256*1024)
	for scanner.Scan() {
		emit(scanner.Text())
	}
	if err := scanner.Err(); err != nil {
		emit("[output truncated: " + err.Error() + "]")
		// Keep the pipe flowing so the hook can finish; the remaining
		// output is intentionally thrown away.
		_, _ = io.Copy(io.Discard, rd)
	}
}
// defaultShell picks the bootstrap interpreter for a hook together
// with the flag that makes it accept the script as a single argument,
// i.e. the command line is `<shell> <flag> "<script>"`. It yields
// cmd.exe /C when runtime.GOOS is "windows" and /bin/sh -c on every
// other platform. Hook authors who want another interpreter
// (PowerShell, bash, …) invoke it from inside the script body.
func defaultShell() (string, string) {
	switch runtime.GOOS {
	case "windows":
		return "cmd.exe", "/C"
	default:
		return "/bin/sh", "-c"
	}
}
+90
View File
@@ -0,0 +1,90 @@
// hooks_test.go — pre/post backup hook semantics (P2R-11).
package runner
import (
"context"
"strings"
"testing"
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
)
// TestPreHookFailureAbortsBackup: pre_hook exits 1 → restic never
// runs, job is recorded failed with the hook's error.
//
// Payload decode errors are treated as test failures. Ignoring them
// would leave the decoded struct empty, which lets the "restic must
// not have run" scan pass without having inspected anything.
func TestPreHookFailureAbortsBackup(t *testing.T) {
	t.Parallel()

	// Stand-in restic script that prints a marker. If restic was
	// called we'll see "restic-was-here" in the captured log.
	bin := setupScript(t, `echo "restic-was-here"`)
	tx := &fakeSender{}
	r := New(Config{ResticBin: bin}, tx, 0)

	err := r.RunBackup(context.Background(), "job-pre",
		[]string{"/etc"}, nil, []string{"tag"},
		BackupHooks{Pre: "exit 1"})
	if err == nil {
		t.Fatal("expected RunBackup to return an error from failed pre_hook")
	}
	if !strings.Contains(err.Error(), "pre_hook failed") {
		t.Fatalf("error message: %q (want 'pre_hook failed')", err)
	}

	// job.finished arrived with status=failed.
	finEnv := firstEnvOfType(t, tx.envs, api.MsgJobFinished)
	var fin api.JobFinishedPayload
	if uerr := finEnv.UnmarshalPayload(&fin); uerr != nil {
		t.Fatalf("decode job.finished payload: %v", uerr)
	}
	if fin.Status != api.JobFailed {
		t.Fatalf("status: %q, want failed", fin.Status)
	}

	// restic must NOT have run.
	for _, env := range tx.envs {
		if env.Type != api.MsgLogStream {
			continue
		}
		var l api.LogStreamLine
		if uerr := env.UnmarshalPayload(&l); uerr != nil {
			t.Fatalf("decode log.stream payload: %v", uerr)
		}
		if strings.Contains(l.Payload, "restic-was-here") {
			t.Fatal("restic was invoked despite pre_hook failure")
		}
	}
}
// TestPostHookRunsAfterBackup checks that a post_hook is executed
// once a backup has completed successfully, and that the hook sees
// RM_JOB_STATUS=succeeded and RM_HOOK_PHASE=post in its environment.
func TestPostHookRunsAfterBackup(t *testing.T) {
	t.Parallel()

	// Stand-in restic: answers the sub-commands the runner issues with
	// canned output and exits 0 for anything else.
	fakeRestic := `
case "$1" in
backup) echo '{"message_type":"summary","snapshot_id":"abc"}' ;;
snapshots) echo '[]' ;;
stats) echo '{"total_size":0,"total_uncompressed_size":0,"snapshots_count":0,"total_file_count":0,"total_blob_count":0}' ;;
*) exit 0 ;;
esac
`
	resticBin := setupScript(t, fakeRestic)
	sender := &fakeSender{}
	run := New(Config{ResticBin: resticBin}, sender, 0)

	// The hook echoes the variables under test so they surface in the
	// streamed log.
	hooks := BackupHooks{Post: `echo "post-status=$RM_JOB_STATUS phase=$RM_HOOK_PHASE"`}
	err := run.RunBackup(context.Background(), "job-post", []string{"/etc"}, nil, nil, hooks)
	if err != nil {
		t.Fatalf("RunBackup: %v", err)
	}

	// Scan the captured log.stream envelopes for the hook's echo line.
	sawPostLine := false
	for _, e := range sender.envs {
		if e.Type == api.MsgLogStream {
			var entry api.LogStreamLine
			_ = e.UnmarshalPayload(&entry)
			hasStatus := strings.Contains(entry.Payload, "post-status=succeeded")
			if hasStatus && strings.Contains(entry.Payload, "phase=post") {
				sawPostLine = true
				break
			}
		}
	}
	if !sawPostLine {
		t.Fatal("post_hook output not found in log.stream envelopes")
	}
}
+36 -3
View File
@@ -116,15 +116,34 @@ func (r *Runner) sendFinished(jobID string, finishedAt time.Time, err error, sta
_ = r.tx.Send(finEnv)
}
// BackupHooks carries the optional shell snippets to run before (Pre)
// and after (Post) a backup. A field left empty means that phase is
// skipped. The server resolves which snippet applies (group → host
// default) before dispatch; the agent simply runs what the payload
// contains.
type BackupHooks struct {
	Pre, Post string
}
// RunBackup executes a backup job and reports back via the sender.
// Returns nil on a clean (or "incomplete-but-snapshot-created") finish.
func (r *Runner) RunBackup(ctx context.Context, jobID string, paths, excludes, tags []string) error {
func (r *Runner) RunBackup(ctx context.Context, jobID string, paths, excludes, tags []string, hooks BackupHooks) error {
startedAt := time.Now().UTC()
r.sendStarted(jobID, api.JobBackup, startedAt)
env := r.resticEnv()
var seq atomic.Int64
// pre_hook: non-zero exit aborts the backup. The job is recorded
// as failed with the hook's error and restic never runs.
if hooks.Pre != "" {
if err := r.runHook(ctx, jobID, "pre", hooks.Pre, "", &seq); err != nil {
finishedAt := time.Now().UTC()
r.sendFinished(jobID, finishedAt, err, nil)
return fmt.Errorf("pre_hook failed: %w", err)
}
}
env := r.resticEnv()
lastProgress := time.Now()
handle := func(stream string, line string, ev any) {
@@ -173,6 +192,20 @@ func (r *Runner) RunBackup(ctx context.Context, jobID string, paths, excludes, t
if summary != nil {
statsBlob, _ = json.Marshal(summary)
}
// post_hook: always runs regardless of backup outcome. Receives
// RM_JOB_STATUS=succeeded|failed in env. Non-zero exit is logged
// but does not change the recorded job status.
if hooks.Post != "" {
status := "succeeded"
if err != nil {
status = "failed"
}
if perr := r.runHook(ctx, jobID, "post", hooks.Post, status, &seq); perr != nil {
slog.Warn("runner: post_hook exited non-zero", "job_id", jobID, "err", perr)
}
}
r.sendFinished(jobID, finishedAt, err, statsBlob)
// On a successful backup, refresh the server's snapshot projection.
+75
View File
@@ -0,0 +1,75 @@
// hooks_resolve.go — server-side resolution of pre/post hooks for a
// backup dispatch (P2R-11). The agent receives plaintext hook bodies
// in CommandRunPayload; this file is where the AEAD blob on the
// source group (or the host's default) gets decrypted into the
// strings the wire payload carries.
//
// Resolution order:
// 1. source_group.<phase>_hook (per-group override)
// 2. host.<phase>_hook_default (host-wide default)
// 3. "" (no hook → agent skips that phase)
//
// Decrypt errors are logged and treated as "no hook configured" so
// a malformed blob can't poison every backup. The audit trail
// captures the underlying state regardless.
package http
import (
"log/slog"
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
)
// resolveBackupHooks yields the plaintext (pre, post) hook bodies the
// agent should run around a backup of group g on host. Each phase is
// resolved independently through decryptHookOrFallback. Both results
// are empty when the server has no AEAD configured or when nothing is
// set for that phase at either level.
func (s *Server) resolveBackupHooks(host *store.Host, g *store.SourceGroup) (pre, post string) {
	if s.deps.AEAD != nil {
		pre = s.decryptHookOrFallback(g.PreHook, host.PreHookDefault, host.ID, "pre")
		post = s.decryptHookOrFallback(g.PostHook, host.PostHookDefault, host.ID, "post")
	}
	return pre, post
}
// decryptHookOrFallback walks the candidate blobs in priority order —
// the per-group hook first, then the host default — and returns the
// plaintext of the first one that is non-empty and decrypts cleanly.
// The associated data binds each blob to its host, slot ("group" or
// "host") and phase. A blob that fails to decrypt is logged and
// skipped, so the next candidate is tried; "" is returned when no
// candidate yields a plaintext.
func (s *Server) decryptHookOrFallback(group, hostDefault, hostID, phase string) string {
	candidates := [2]struct{ blob, slot string }{
		{blob: group, slot: "group"},
		{blob: hostDefault, slot: "host"},
	}
	for _, c := range candidates {
		if c.blob == "" {
			continue
		}
		ad := []byte("hook:" + hostID + ":" + c.slot + ":" + phase)
		plain, err := s.deps.AEAD.Decrypt(c.blob, ad)
		if err != nil {
			slog.Error("decrypt hook", "host_id", hostID, "phase", phase, "slot", c.slot, "err", err)
			continue
		}
		return string(plain)
	}
	return ""
}
// EncryptHookForGroup seals a plaintext hook body (as submitted from a
// UI form) for storage on a source group. The ciphertext is bound to
// hostID, the "group" slot and phase through the associated data. An
// empty body yields "" with no error so the store persists NULL
// (cleared).
//
// NOTE(review): unlike resolveBackupHooks this does not check for a
// nil s.deps.AEAD — confirm callers only reach it when encryption is
// configured.
func (s *Server) EncryptHookForGroup(hostID, phase, body string) (string, error) {
	if body != "" {
		ad := []byte("hook:" + hostID + ":group:" + phase)
		return s.deps.AEAD.Encrypt([]byte(body), ad)
	}
	return "", nil
}
// EncryptHookForHost seals a plaintext hook body for storage as a
// host-wide default. It mirrors EncryptHookForGroup but binds the
// ciphertext to the "host" slot. An empty body yields "" with no
// error.
//
// NOTE(review): unlike resolveBackupHooks this does not check for a
// nil s.deps.AEAD — confirm callers only reach it when encryption is
// configured.
func (s *Server) EncryptHookForHost(hostID, phase, body string) (string, error) {
	if body != "" {
		ad := []byte("hook:" + hostID + ":host:" + phase)
		return s.deps.AEAD.Encrypt([]byte(body), ad)
	}
	return "", nil
}
+9
View File
@@ -79,6 +79,13 @@ func (s *Server) handleRunSourceGroup(w stdhttp.ResponseWriter, r *stdhttp.Reque
return
}
// Resolve hooks (group → host default → empty). Best-effort host
// lookup; failure proceeds with no hook rather than block the run.
var preHook, postHook string
if host, herr := s.deps.Store.GetHost(r.Context(), hostID); herr == nil {
preHook, postHook = s.resolveBackupHooks(host, g)
}
// Backup invocations don't consume RetentionPolicy — that lives on
// forget. Sending the resolved set here would just be dead weight.
res, status, code, msg := s.dispatchJobWithPayload(r.Context(), user, hostID, api.JobBackup,
@@ -88,6 +95,8 @@ func (s *Server) handleRunSourceGroup(w stdhttp.ResponseWriter, r *stdhttp.Reque
Tag: g.Name,
BandwidthUpKBps: upOverride,
BandwidthDownKBps: downOverride,
PreHook: preHook,
PostHook: postHook,
})
if code != "" {
s.runGroupError(w, r, status, code, msg)
+14
View File
@@ -192,6 +192,18 @@ func (s *Server) dispatchBackupForGroupCore(ctx context.Context, conn *ws.Conn,
"schedule_id", scheduleID, "group", g.Name, "err", err)
return "", err
}
// Resolve pre/post hooks (group → host default → empty) so they
// ride on the backup payload as plaintext. The host lookup is
// cheap; failure here is non-fatal (we proceed without hooks
// rather than block the backup).
var preHook, postHook string
if host, herr := s.deps.Store.GetHost(ctx, hostID); herr == nil {
preHook, postHook = s.resolveBackupHooks(host, g)
} else {
slog.Warn("schedule.fire: load host for hook resolve",
"host_id", hostID, "err", herr)
}
// Backup ignores RetentionPolicy — the forget cadence lives on
// host_repo_maintenance and is driven by the server-side ticker
// (P2R-06). Don't ship the field on backup dispatches.
@@ -201,6 +213,8 @@ func (s *Server) dispatchBackupForGroupCore(ctx context.Context, conn *ws.Conn,
Includes: g.Includes,
Excludes: g.Excludes,
Tag: g.Name,
PreHook: preHook,
PostHook: postHook,
})
if err != nil {
slog.Warn("schedule.fire: marshal command.run",
+15 -15
View File
@@ -42,8 +42,8 @@ func TestSourceGroupHooksRoundTrip(t *testing.T) {
g := &SourceGroup{
ID: ulid.Make().String(), HostID: hostID, Name: "etc",
PreHook: []byte("ENC-PRE"),
PostHook: []byte("ENC-POST"),
PreHook: "ENC-PRE",
PostHook: "ENC-POST",
}
if err := st.CreateSourceGroup(context.Background(), g); err != nil {
t.Fatalf("create: %v", err)
@@ -52,16 +52,16 @@ func TestSourceGroupHooksRoundTrip(t *testing.T) {
if err != nil {
t.Fatalf("get: %v", err)
}
if string(got.PreHook) != "ENC-PRE" {
if got.PreHook != "ENC-PRE" {
t.Fatalf("PreHook: got %q, want ENC-PRE", got.PreHook)
}
if string(got.PostHook) != "ENC-POST" {
if got.PostHook != "ENC-POST" {
t.Fatalf("PostHook: got %q, want ENC-POST", got.PostHook)
}
// Update: clear PreHook, change PostHook.
got.PreHook = nil
got.PostHook = []byte("ENC-POST-2")
got.PreHook = ""
got.PostHook = "ENC-POST-2"
if err := st.UpdateSourceGroup(context.Background(), got); err != nil {
t.Fatalf("update: %v", err)
}
@@ -69,10 +69,10 @@ func TestSourceGroupHooksRoundTrip(t *testing.T) {
if err != nil {
t.Fatalf("get: %v", err)
}
if got.PreHook != nil {
t.Fatalf("PreHook: want nil after clear, got %q", got.PreHook)
if got.PreHook != "" {
t.Fatalf("PreHook: want empty after clear, got %q", got.PreHook)
}
if string(got.PostHook) != "ENC-POST-2" {
if got.PostHook != "ENC-POST-2" {
t.Fatalf("PostHook: got %q, want ENC-POST-2", got.PostHook)
}
}
@@ -82,25 +82,25 @@ func TestHostHookDefaultsRoundTrip(t *testing.T) {
st := newTestStore(t)
hostID := makeHostInStore(t, st, "host-hooks-host")
if err := st.SetHostHooks(context.Background(), hostID, []byte("PRE"), []byte("POST")); err != nil {
if err := st.SetHostHooks(context.Background(), hostID, "PRE", "POST"); err != nil {
t.Fatalf("set: %v", err)
}
h, err := st.GetHost(context.Background(), hostID)
if err != nil {
t.Fatalf("get: %v", err)
}
if string(h.PreHookDefault) != "PRE" || string(h.PostHookDefault) != "POST" {
if h.PreHookDefault != "PRE" || h.PostHookDefault != "POST" {
t.Fatalf("after set: pre=%q post=%q", h.PreHookDefault, h.PostHookDefault)
}
// Clear by passing nil.
if err := st.SetHostHooks(context.Background(), hostID, nil, nil); err != nil {
// Clear by passing empty strings.
if err := st.SetHostHooks(context.Background(), hostID, "", ""); err != nil {
t.Fatalf("clear: %v", err)
}
h, err = st.GetHost(context.Background(), hostID)
if err != nil {
t.Fatalf("get: %v", err)
}
if h.PreHookDefault != nil || h.PostHookDefault != nil {
t.Fatalf("after clear: pre=%v post=%v (want nil)", h.PreHookDefault, h.PostHookDefault)
if h.PreHookDefault != "" || h.PostHookDefault != "" {
t.Fatalf("after clear: pre=%q post=%q (want empty)", h.PreHookDefault, h.PostHookDefault)
}
}
+11 -7
View File
@@ -158,7 +158,7 @@ func scanHostRow(s hostScanner) (*Host, error) {
enrolled string
tags string
bwUp, bwDown sql.NullInt64
preHook, postHook []byte
preHook, postHook sql.NullString
)
err := s.Scan(&h.ID, &h.Name, &h.OS, &h.Arch,
&h.AgentVersion, &h.ResticVersion, &h.ProtocolVersion,
@@ -215,18 +215,22 @@ func scanHostRow(s hostScanner) (*Host, error) {
v := int(bwDown.Int64)
h.BandwidthDownKBps = &v
}
h.PreHookDefault = preHook
h.PostHookDefault = postHook
if preHook.Valid {
h.PreHookDefault = preHook.String
}
if postHook.Valid {
h.PostHookDefault = postHook.String
}
return &h, nil
}
// SetHostHooks replaces the host-wide pre/post hook defaults. Pass
// nil/empty to clear that hook. Stored verbatim — caller is expected
// to encrypt the bytes before they reach this layer.
func (s *Store) SetHostHooks(ctx context.Context, hostID string, pre, post []byte) error {
// the empty string to clear that hook. Stored verbatim — caller is
// expected to encrypt the values before they reach this layer.
func (s *Store) SetHostHooks(ctx context.Context, hostID string, pre, post string) error {
_, err := s.db.ExecContext(ctx,
`UPDATE hosts SET pre_hook_default = ?, post_hook_default = ? WHERE id = ?`,
nullableBytes(pre), nullableBytes(post), hostID)
nullableString(pre), nullableString(post), hostID)
if err != nil {
return fmt.Errorf("store: set host hooks: %w", err)
}
+9 -15
View File
@@ -52,7 +52,7 @@ func (st *Store) CreateSourceGroup(ctx context.Context, g *SourceGroup) error {
g.RetryMax, g.RetryBackoffSeconds,
nullableString(g.ConflictDimension),
now.Format(time.RFC3339Nano), now.Format(time.RFC3339Nano),
nullableBytes(g.PreHook), nullableBytes(g.PostHook),
nullableString(g.PreHook), nullableString(g.PostHook),
); err != nil {
return fmt.Errorf("store: create source group: %w", err)
}
@@ -96,7 +96,7 @@ func (st *Store) UpdateSourceGroup(ctx context.Context, g *SourceGroup) error {
g.RetryMax, g.RetryBackoffSeconds,
nullableString(g.ConflictDimension),
now.Format(time.RFC3339Nano),
nullableBytes(g.PreHook), nullableBytes(g.PostHook),
nullableString(g.PreHook), nullableString(g.PostHook),
g.ID, g.HostID,
)
if err != nil {
@@ -226,7 +226,7 @@ func scanSourceGroupRow(s sourceGroupScanner) (*SourceGroup, error) {
includes, excludes, retention string
conflict sql.NullString
createdAt, updatedAt string
preHook, postHook []byte
preHook, postHook sql.NullString
)
err := s.Scan(&out.ID, &out.HostID, &out.Name,
&includes, &excludes, &retention,
@@ -235,8 +235,12 @@ func scanSourceGroupRow(s sourceGroupScanner) (*SourceGroup, error) {
if err != nil {
return nil, err
}
out.PreHook = preHook
out.PostHook = postHook
if preHook.Valid {
out.PreHook = preHook.String
}
if postHook.Valid {
out.PostHook = postHook.String
}
if includes != "" {
_ = json.Unmarshal([]byte(includes), &out.Includes)
}
@@ -264,13 +268,3 @@ func nullableString(s string) any {
}
return s
}
// nullableBytes returns nil for an empty/nil slice so SQL stores it
// as NULL rather than an empty BLOB. The agent treats both the same
// (no hook), but NULL is the canonical "absent" form on disk.
func nullableBytes(b []byte) any {
if len(b) == 0 {
return nil
}
return b
}
+12 -11
View File
@@ -67,11 +67,12 @@ type Host struct {
BandwidthUpKBps *int
BandwidthDownKBps *int
// PreHookDefault / PostHookDefault are AEAD-encrypted host-wide
// hook bodies. Per source group hooks (SourceGroup.PreHook /
// PostHook) override these when set. nil = no default configured.
PreHookDefault []byte
PostHookDefault []byte
// PreHookDefault / PostHookDefault are AEAD ciphertext (string
// blob produced by crypto.AEAD.Encrypt). Per source group hooks
// (SourceGroup.PreHook / PostHook) override these when set.
// Empty = no default configured.
PreHookDefault string
PostHookDefault string
}
// Schedule is now intentionally slim: cron + which groups + enabled.
@@ -113,12 +114,12 @@ type SourceGroup struct {
CreatedAt time.Time
UpdatedAt time.Time
// PreHook / PostHook are AEAD-encrypted shell snippets (raw blob).
// nil means "no hook configured." Encryption/decryption happens at
// the HTTP layer (where AEAD lives); the store layer just persists
// the bytes verbatim.
PreHook []byte
PostHook []byte
// PreHook / PostHook are AEAD ciphertext (string blob produced by
// crypto.AEAD.Encrypt). Empty means "no hook configured."
// Encryption/decryption happens at the HTTP layer (where AEAD
// lives); the store layer just persists the string verbatim.
PreHook string
PostHook string
}
// RetentionPolicy is the typed view of `restic forget --keep-*`.