p6-01/02: agent self-update + fleet update server cluster

- alert: update_failed (per-host, dedup=hostID) + fleet_update_halted (system-scoped, host_id NULL via new RaiseOrTouchSystem helper). - ws: UpdateWatcher tracks in-flight command.update dispatches and reconciles them against incoming hello envelopes — success path marks the job succeeded and auto-resolves the alert; 90s timeout marks the job failed and raises update_failed. - http: POST /api/hosts/{id}/update (admin-only JSON) + the HTMX /hosts/{id}/update form variant. Pre-checks: host exists, online, agent_version != current, no running update job. Refactored core into Server.dispatchHostUpdate so the fleet worker can share it without going through HTTP. - fleetupdate: rolling worker iterating through host slots, halting on first failure and raising fleet_update_halted. Polling-based version-match (re-read hosts.agent_version every 1s up to 95s) — no extra plumbing into the WS hello path. At-most-one-running is enforced at the store layer (ErrFleetUpdateRunning). - cmd/server: wire UpdateWatcher and FleetWorker into the main goroutine; the worker uses a small serverDispatcher adapter that delegates back into Server.DispatchHostUpdate. Tests: watcher (success/timeout/mismatch/late-hello), HTTP endpoint (happy + four pre-check branches + RBAC), worker (two-host happy, timeout-halt, host-offline-halt, already-at-target skip, cancel mid-run, double-Start guard).
2026-05-06 22:03:50 +01:00
parent d413896302
commit 6fd2a2ff77
11 changed files with 1540 additions and 2 deletions
@@ -16,6 +16,7 @@ import (
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/auth"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
 )

 // HandlerDeps is the set of collaborators the agent WS handler needs.
@@ -26,6 +27,9 @@ type HandlerDeps struct {
 	// AlertEngine receives job-finished and host-online events so the
 	// alert engine can evaluate its rules. Optional; nil = no-op.
 	AlertEngine *alert.Engine
+	// UpdateWatcher reconciles in-flight agent-update dispatches against
+	// hello envelopes. Optional; nil = no-op.
+	UpdateWatcher *UpdateWatcher
 	// OnHello is called once per successful hello, after the host row
 	// has been touched and the conn registered. Used by the HTTP
 	// layer to push host_credentials down as a config.update before
@@ -147,6 +151,9 @@ func runAgentLoop(ctx context.Context, c *Conn, hostID string, deps HandlerDeps)
 	if deps.AlertEngine != nil {
 		deps.AlertEngine.NotifyHostOnline(hostID)
 	}
+	if deps.UpdateWatcher != nil {
+		deps.UpdateWatcher.OnHello(ctx, hostID, helloPayload.AgentVersion, version.Version)
+	}

 	deps.Hub.Register(hostID, c)
 	defer deps.Hub.Unregister(hostID, c)
@@ -0,0 +1,151 @@
+package ws
+
+import (
+	"context"
+	"fmt"
+	"log/slog"
+	"sync"
+	"time"
+
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
+)
+
+// updateTimeout bounds how long the watcher waits for an agent to come
+// back with its new version after a command.update dispatch. var (not
+// const) so tests can shrink it.
+var updateTimeout = 90 * time.Second
+
+// AlertRaiser is the slim subset of *alert.Engine the update watcher
+// touches. Defined here (not in the alert package) so the dependency
+// arrow points the right way.
+type AlertRaiser interface {
+	RaiseUpdateFailed(ctx context.Context, hostID, jobID, reason string, when time.Time)
+	ResolveUpdateFailed(ctx context.Context, hostID string, when time.Time)
+}
+
+// UpdateWatcher tracks in-flight agent-update dispatches and reconciles
+// them against incoming hello envelopes. Entries land on Track and
+// resolve via OnHello (success path) or the periodic sweep (timeout).
+type UpdateWatcher struct {
+	store  *store.Store
+	alerts AlertRaiser
+
+	mu      sync.Mutex
+	entries map[string]*updateEntry // hostID → entry
+
+	tickPeriod time.Duration
+}
+
+type updateEntry struct {
+	jobID     string
+	startedAt time.Time
+	// terminated is set once the entry has reached a terminal state so
+	// late OnHellos don't resurrect it.
+	terminated bool
+}
+
+// NewUpdateWatcher builds an unstarted watcher. Call Run in a goroutine
+// to start the periodic sweep.
+func NewUpdateWatcher(st *store.Store, alerts AlertRaiser) *UpdateWatcher {
+	return &UpdateWatcher{
+		store:      st,
+		alerts:     alerts,
+		entries:    make(map[string]*updateEntry),
+		tickPeriod: 5 * time.Second,
+	}
+}
+
+// Track registers a freshly-dispatched update job. A subsequent Track
+// for the same host replaces the prior entry (last-write-wins).
+func (w *UpdateWatcher) Track(jobID, hostID string) {
+	if w == nil {
+		return
+	}
+	w.mu.Lock()
+	w.entries[hostID] = &updateEntry{jobID: jobID, startedAt: time.Now()}
+	w.mu.Unlock()
+}
+
+// OnHello is called by the WS handler after a successful hello has been
+// persisted. If a tracked update for the host matches the targetVersion,
+// the job is marked succeeded and any open update_failed alert is
+// auto-resolved. A non-matching version is a no-op (the watcher keeps
+// waiting until the timeout).
+func (w *UpdateWatcher) OnHello(ctx context.Context, hostID, agentVersion, targetVersion string) {
+	if w == nil {
+		return
+	}
+	w.mu.Lock()
+	e, ok := w.entries[hostID]
+	if !ok || e.terminated {
+		w.mu.Unlock()
+		return
+	}
+	if agentVersion != targetVersion {
+		// Not the version we asked for — keep waiting.
+		w.mu.Unlock()
+		return
+	}
+	e.terminated = true
+	jobID := e.jobID
+	delete(w.entries, hostID)
+	w.mu.Unlock()
+
+	now := time.Now().UTC()
+	if err := w.store.MarkJobFinished(ctx, jobID, "succeeded", 0, nil, "", now); err != nil {
+		slog.Warn("ws update watcher: mark succeeded", "job_id", jobID, "host_id", hostID, "err", err)
+	}
+	if w.alerts != nil {
+		w.alerts.ResolveUpdateFailed(ctx, hostID, now)
+	}
+}
+
+// Run drives the periodic sweep. Returns when ctx is done.
+func (w *UpdateWatcher) Run(ctx context.Context) {
+	if w == nil {
+		return
+	}
+	t := time.NewTicker(w.tickPeriod)
+	defer t.Stop()
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case now := <-t.C:
+			w.sweep(ctx, now)
+		}
+	}
+}
+
+func (w *UpdateWatcher) sweep(ctx context.Context, now time.Time) {
+	type expired struct {
+		hostID string
+		jobID  string
+		age    time.Duration
+	}
+	var toFail []expired
+	w.mu.Lock()
+	for hostID, e := range w.entries {
+		if e.terminated {
+			continue
+		}
+		if now.Sub(e.startedAt) >= updateTimeout {
+			toFail = append(toFail, expired{hostID: hostID, jobID: e.jobID, age: now.Sub(e.startedAt)})
+			e.terminated = true
+			delete(w.entries, hostID)
+		}
+	}
+	w.mu.Unlock()
+
+	for _, x := range toFail {
+		reason := fmt.Sprintf("timeout: agent did not reconnect within %s", updateTimeout)
+		stamp := now.UTC()
+		errMsg := reason
+		if err := w.store.MarkJobFinished(ctx, x.jobID, "failed", -1, nil, errMsg, stamp); err != nil {
+			slog.Warn("ws update watcher: mark failed", "job_id", x.jobID, "host_id", x.hostID, "err", err)
+		}
+		if w.alerts != nil {
+			w.alerts.RaiseUpdateFailed(ctx, x.hostID, x.jobID, reason, stamp)
+		}
+	}
+}
@@ -0,0 +1,161 @@
+package ws
+
+import (
+	"context"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/oklog/ulid/v2"
+
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
+)
+
+type fakeAlerts struct {
+	mu       sync.Mutex
+	raised   []string // hostIDs
+	resolved []string
+	reasons  []string
+}
+
+func (f *fakeAlerts) RaiseUpdateFailed(_ context.Context, hostID, _ /*jobID*/, reason string, _ time.Time) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.raised = append(f.raised, hostID)
+	f.reasons = append(f.reasons, reason)
+}
+
+func (f *fakeAlerts) ResolveUpdateFailed(_ context.Context, hostID string, _ time.Time) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.resolved = append(f.resolved, hostID)
+}
+
+func seedJob(t *testing.T, st *store.Store, hostID string) string {
+	t.Helper()
+	jobID := ulid.Make().String()
+	if err := st.CreateJob(context.Background(), store.Job{
+		ID: jobID, HostID: hostID, Kind: "update",
+		ActorKind: "user", CreatedAt: time.Now().UTC(),
+	}); err != nil {
+		t.Fatalf("create job: %v", err)
+	}
+	return jobID
+}
+
+func TestUpdateWatcherOnHelloSuccess(t *testing.T) {
+	st := openWSTestStore(t)
+	hostID := ulid.Make().String()
+	seedHostWS(t, st, hostID)
+	jobID := seedJob(t, st, hostID)
+
+	a := &fakeAlerts{}
+	w := NewUpdateWatcher(st, a)
+	w.Track(jobID, hostID)
+
+	w.OnHello(context.Background(), hostID, "v2", "v2")
+
+	job, err := st.GetJob(context.Background(), jobID)
+	if err != nil {
+		t.Fatalf("get job: %v", err)
+	}
+	if job.Status != "succeeded" {
+		t.Fatalf("status: got %q want succeeded", job.Status)
+	}
+	a.mu.Lock()
+	defer a.mu.Unlock()
+	if len(a.resolved) != 1 || a.resolved[0] != hostID {
+		t.Fatalf("resolve calls: %v", a.resolved)
+	}
+	if len(a.raised) != 0 {
+		t.Fatalf("unexpected raises: %v", a.raised)
+	}
+}
+
+func TestUpdateWatcherTimeout(t *testing.T) {
+	prev := updateTimeout
+	updateTimeout = 50 * time.Millisecond
+	t.Cleanup(func() { updateTimeout = prev })
+
+	st := openWSTestStore(t)
+	hostID := ulid.Make().String()
+	seedHostWS(t, st, hostID)
+	jobID := seedJob(t, st, hostID)
+
+	a := &fakeAlerts{}
+	w := NewUpdateWatcher(st, a)
+	w.Track(jobID, hostID)
+
+	time.Sleep(80 * time.Millisecond)
+	w.sweep(context.Background(), time.Now())
+
+	job, err := st.GetJob(context.Background(), jobID)
+	if err != nil {
+		t.Fatalf("get job: %v", err)
+	}
+	if job.Status != "failed" {
+		t.Fatalf("status: got %q want failed", job.Status)
+	}
+	a.mu.Lock()
+	defer a.mu.Unlock()
+	if len(a.raised) != 1 || a.raised[0] != hostID {
+		t.Fatalf("raise calls: %v", a.raised)
+	}
+	if len(a.reasons) == 0 || a.reasons[0] == "" {
+		t.Fatalf("missing reason")
+	}
+}
+
+func TestUpdateWatcherMismatchedVersionNoOp(t *testing.T) {
+	st := openWSTestStore(t)
+	hostID := ulid.Make().String()
+	seedHostWS(t, st, hostID)
+	jobID := seedJob(t, st, hostID)
+
+	a := &fakeAlerts{}
+	w := NewUpdateWatcher(st, a)
+	w.Track(jobID, hostID)
+
+	w.OnHello(context.Background(), hostID, "v1", "v2")
+
+	job, _ := st.GetJob(context.Background(), jobID)
+	if job.Status == "succeeded" || job.Status == "failed" {
+		t.Fatalf("status flipped on mismatched hello: %q", job.Status)
+	}
+	a.mu.Lock()
+	defer a.mu.Unlock()
+	if len(a.raised) != 0 || len(a.resolved) != 0 {
+		t.Fatalf("unexpected alert calls raised=%v resolved=%v", a.raised, a.resolved)
+	}
+}
+
+func TestUpdateWatcherHelloAfterTimeoutIsNoOp(t *testing.T) {
+	prev := updateTimeout
+	updateTimeout = 50 * time.Millisecond
+	t.Cleanup(func() { updateTimeout = prev })
+
+	st := openWSTestStore(t)
+	hostID := ulid.Make().String()
+	seedHostWS(t, st, hostID)
+	jobID := seedJob(t, st, hostID)
+
+	a := &fakeAlerts{}
+	w := NewUpdateWatcher(st, a)
+	w.Track(jobID, hostID)
+
+	time.Sleep(80 * time.Millisecond)
+	w.sweep(context.Background(), time.Now())
+
+	// Hello arrives after sweep — entry already gone, must be no-op.
+	w.OnHello(context.Background(), hostID, "v2", "v2")
+
+	job, _ := st.GetJob(context.Background(), jobID)
+	if job.Status != "failed" {
+		t.Fatalf("status flipped from failed → %q", job.Status)
+	}
+	a.mu.Lock()
+	defer a.mu.Unlock()
+	if len(a.resolved) != 0 {
+		t.Fatalf("late hello triggered ResolveUpdateFailed: %v", a.resolved)
+	}
+}