p6-01/02: agent self-update + fleet update server cluster
- alert: update_failed (per-host, dedup=hostID) + fleet_update_halted
(system-scoped, host_id NULL via new RaiseOrTouchSystem helper).
- ws: UpdateWatcher tracks in-flight command.update dispatches and
reconciles them against incoming hello envelopes — success path
marks the job succeeded and auto-resolves the alert; 90s timeout
marks the job failed and raises update_failed.
- http: POST /api/hosts/{id}/update (admin-only JSON) + the HTMX
/hosts/{id}/update form variant. Pre-checks: host exists, online,
agent_version != current, no running update job. Refactored core
into Server.dispatchHostUpdate so the fleet worker can share it
without going through HTTP.
- fleetupdate: rolling worker iterating through host slots, halting
on first failure and raising fleet_update_halted. Polling-based
version-match (re-read hosts.agent_version every 1s up to 95s) —
no extra plumbing into the WS hello path. At-most-one-running is
enforced at the store layer (ErrFleetUpdateRunning).
- cmd/server: wire UpdateWatcher and FleetWorker into the main
goroutine; the worker uses a small serverDispatcher adapter that
delegates back into Server.DispatchHostUpdate.
Tests: watcher (success/timeout/mismatch/late-hello), HTTP endpoint
(happy + four pre-check branches + RBAC), worker (two-host happy,
timeout-halt, host-offline-halt, already-at-target skip, cancel
mid-run, double-Start guard).
This commit is contained in:
@@ -16,6 +16,7 @@ import (
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/auth"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
|
||||
)
|
||||
|
||||
// HandlerDeps is the set of collaborators the agent WS handler needs.
|
||||
@@ -26,6 +27,9 @@ type HandlerDeps struct {
|
||||
// AlertEngine receives job-finished and host-online events so the
|
||||
// alert engine can evaluate its rules. Optional; nil = no-op.
|
||||
AlertEngine *alert.Engine
|
||||
// UpdateWatcher reconciles in-flight agent-update dispatches against
|
||||
// hello envelopes. Optional; nil = no-op.
|
||||
UpdateWatcher *UpdateWatcher
|
||||
// OnHello is called once per successful hello, after the host row
|
||||
// has been touched and the conn registered. Used by the HTTP
|
||||
// layer to push host_credentials down as a config.update before
|
||||
@@ -147,6 +151,9 @@ func runAgentLoop(ctx context.Context, c *Conn, hostID string, deps HandlerDeps)
|
||||
if deps.AlertEngine != nil {
|
||||
deps.AlertEngine.NotifyHostOnline(hostID)
|
||||
}
|
||||
if deps.UpdateWatcher != nil {
|
||||
deps.UpdateWatcher.OnHello(ctx, hostID, helloPayload.AgentVersion, version.Version)
|
||||
}
|
||||
|
||||
deps.Hub.Register(hostID, c)
|
||||
defer deps.Hub.Unregister(hostID, c)
|
||||
|
||||
@@ -0,0 +1,151 @@
|
||||
package ws
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||
)
|
||||
|
||||
// updateTimeout bounds how long the watcher waits for an agent to come
|
||||
// back with its new version after a command.update dispatch. var (not
|
||||
// const) so tests can shrink it.
|
||||
var updateTimeout = 90 * time.Second
|
||||
|
||||
// AlertRaiser is the slim subset of *alert.Engine the update watcher
|
||||
// touches. Defined here (not in the alert package) so the dependency
|
||||
// arrow points the right way.
|
||||
type AlertRaiser interface {
|
||||
RaiseUpdateFailed(ctx context.Context, hostID, jobID, reason string, when time.Time)
|
||||
ResolveUpdateFailed(ctx context.Context, hostID string, when time.Time)
|
||||
}
|
||||
|
||||
// UpdateWatcher tracks in-flight agent-update dispatches and reconciles
|
||||
// them against incoming hello envelopes. Entries land on Track and
|
||||
// resolve via OnHello (success path) or the periodic sweep (timeout).
|
||||
type UpdateWatcher struct {
|
||||
store *store.Store
|
||||
alerts AlertRaiser
|
||||
|
||||
mu sync.Mutex
|
||||
entries map[string]*updateEntry // hostID → entry
|
||||
|
||||
tickPeriod time.Duration
|
||||
}
|
||||
|
||||
type updateEntry struct {
|
||||
jobID string
|
||||
startedAt time.Time
|
||||
// terminated is set once the entry has reached a terminal state so
|
||||
// late OnHellos don't resurrect it.
|
||||
terminated bool
|
||||
}
|
||||
|
||||
// NewUpdateWatcher builds an unstarted watcher. Call Run in a goroutine
|
||||
// to start the periodic sweep.
|
||||
func NewUpdateWatcher(st *store.Store, alerts AlertRaiser) *UpdateWatcher {
|
||||
return &UpdateWatcher{
|
||||
store: st,
|
||||
alerts: alerts,
|
||||
entries: make(map[string]*updateEntry),
|
||||
tickPeriod: 5 * time.Second,
|
||||
}
|
||||
}
|
||||
|
||||
// Track registers a freshly-dispatched update job. A subsequent Track
|
||||
// for the same host replaces the prior entry (last-write-wins).
|
||||
func (w *UpdateWatcher) Track(jobID, hostID string) {
|
||||
if w == nil {
|
||||
return
|
||||
}
|
||||
w.mu.Lock()
|
||||
w.entries[hostID] = &updateEntry{jobID: jobID, startedAt: time.Now()}
|
||||
w.mu.Unlock()
|
||||
}
|
||||
|
||||
// OnHello is called by the WS handler after a successful hello has been
|
||||
// persisted. If a tracked update for the host matches the targetVersion,
|
||||
// the job is marked succeeded and any open update_failed alert is
|
||||
// auto-resolved. A non-matching version is a no-op (the watcher keeps
|
||||
// waiting until the timeout).
|
||||
func (w *UpdateWatcher) OnHello(ctx context.Context, hostID, agentVersion, targetVersion string) {
|
||||
if w == nil {
|
||||
return
|
||||
}
|
||||
w.mu.Lock()
|
||||
e, ok := w.entries[hostID]
|
||||
if !ok || e.terminated {
|
||||
w.mu.Unlock()
|
||||
return
|
||||
}
|
||||
if agentVersion != targetVersion {
|
||||
// Not the version we asked for — keep waiting.
|
||||
w.mu.Unlock()
|
||||
return
|
||||
}
|
||||
e.terminated = true
|
||||
jobID := e.jobID
|
||||
delete(w.entries, hostID)
|
||||
w.mu.Unlock()
|
||||
|
||||
now := time.Now().UTC()
|
||||
if err := w.store.MarkJobFinished(ctx, jobID, "succeeded", 0, nil, "", now); err != nil {
|
||||
slog.Warn("ws update watcher: mark succeeded", "job_id", jobID, "host_id", hostID, "err", err)
|
||||
}
|
||||
if w.alerts != nil {
|
||||
w.alerts.ResolveUpdateFailed(ctx, hostID, now)
|
||||
}
|
||||
}
|
||||
|
||||
// Run drives the periodic sweep. Returns when ctx is done.
|
||||
func (w *UpdateWatcher) Run(ctx context.Context) {
|
||||
if w == nil {
|
||||
return
|
||||
}
|
||||
t := time.NewTicker(w.tickPeriod)
|
||||
defer t.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case now := <-t.C:
|
||||
w.sweep(ctx, now)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (w *UpdateWatcher) sweep(ctx context.Context, now time.Time) {
|
||||
type expired struct {
|
||||
hostID string
|
||||
jobID string
|
||||
age time.Duration
|
||||
}
|
||||
var toFail []expired
|
||||
w.mu.Lock()
|
||||
for hostID, e := range w.entries {
|
||||
if e.terminated {
|
||||
continue
|
||||
}
|
||||
if now.Sub(e.startedAt) >= updateTimeout {
|
||||
toFail = append(toFail, expired{hostID: hostID, jobID: e.jobID, age: now.Sub(e.startedAt)})
|
||||
e.terminated = true
|
||||
delete(w.entries, hostID)
|
||||
}
|
||||
}
|
||||
w.mu.Unlock()
|
||||
|
||||
for _, x := range toFail {
|
||||
reason := fmt.Sprintf("timeout: agent did not reconnect within %s", updateTimeout)
|
||||
stamp := now.UTC()
|
||||
errMsg := reason
|
||||
if err := w.store.MarkJobFinished(ctx, x.jobID, "failed", -1, nil, errMsg, stamp); err != nil {
|
||||
slog.Warn("ws update watcher: mark failed", "job_id", x.jobID, "host_id", x.hostID, "err", err)
|
||||
}
|
||||
if w.alerts != nil {
|
||||
w.alerts.RaiseUpdateFailed(ctx, x.hostID, x.jobID, reason, stamp)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,161 @@
|
||||
package ws
|
||||
|
||||
import (
|
||||
"context"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/oklog/ulid/v2"
|
||||
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||
)
|
||||
|
||||
type fakeAlerts struct {
|
||||
mu sync.Mutex
|
||||
raised []string // hostIDs
|
||||
resolved []string
|
||||
reasons []string
|
||||
}
|
||||
|
||||
func (f *fakeAlerts) RaiseUpdateFailed(_ context.Context, hostID, _ /*jobID*/, reason string, _ time.Time) {
|
||||
f.mu.Lock()
|
||||
defer f.mu.Unlock()
|
||||
f.raised = append(f.raised, hostID)
|
||||
f.reasons = append(f.reasons, reason)
|
||||
}
|
||||
|
||||
func (f *fakeAlerts) ResolveUpdateFailed(_ context.Context, hostID string, _ time.Time) {
|
||||
f.mu.Lock()
|
||||
defer f.mu.Unlock()
|
||||
f.resolved = append(f.resolved, hostID)
|
||||
}
|
||||
|
||||
func seedJob(t *testing.T, st *store.Store, hostID string) string {
|
||||
t.Helper()
|
||||
jobID := ulid.Make().String()
|
||||
if err := st.CreateJob(context.Background(), store.Job{
|
||||
ID: jobID, HostID: hostID, Kind: "update",
|
||||
ActorKind: "user", CreatedAt: time.Now().UTC(),
|
||||
}); err != nil {
|
||||
t.Fatalf("create job: %v", err)
|
||||
}
|
||||
return jobID
|
||||
}
|
||||
|
||||
func TestUpdateWatcherOnHelloSuccess(t *testing.T) {
|
||||
st := openWSTestStore(t)
|
||||
hostID := ulid.Make().String()
|
||||
seedHostWS(t, st, hostID)
|
||||
jobID := seedJob(t, st, hostID)
|
||||
|
||||
a := &fakeAlerts{}
|
||||
w := NewUpdateWatcher(st, a)
|
||||
w.Track(jobID, hostID)
|
||||
|
||||
w.OnHello(context.Background(), hostID, "v2", "v2")
|
||||
|
||||
job, err := st.GetJob(context.Background(), jobID)
|
||||
if err != nil {
|
||||
t.Fatalf("get job: %v", err)
|
||||
}
|
||||
if job.Status != "succeeded" {
|
||||
t.Fatalf("status: got %q want succeeded", job.Status)
|
||||
}
|
||||
a.mu.Lock()
|
||||
defer a.mu.Unlock()
|
||||
if len(a.resolved) != 1 || a.resolved[0] != hostID {
|
||||
t.Fatalf("resolve calls: %v", a.resolved)
|
||||
}
|
||||
if len(a.raised) != 0 {
|
||||
t.Fatalf("unexpected raises: %v", a.raised)
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateWatcherTimeout(t *testing.T) {
|
||||
prev := updateTimeout
|
||||
updateTimeout = 50 * time.Millisecond
|
||||
t.Cleanup(func() { updateTimeout = prev })
|
||||
|
||||
st := openWSTestStore(t)
|
||||
hostID := ulid.Make().String()
|
||||
seedHostWS(t, st, hostID)
|
||||
jobID := seedJob(t, st, hostID)
|
||||
|
||||
a := &fakeAlerts{}
|
||||
w := NewUpdateWatcher(st, a)
|
||||
w.Track(jobID, hostID)
|
||||
|
||||
time.Sleep(80 * time.Millisecond)
|
||||
w.sweep(context.Background(), time.Now())
|
||||
|
||||
job, err := st.GetJob(context.Background(), jobID)
|
||||
if err != nil {
|
||||
t.Fatalf("get job: %v", err)
|
||||
}
|
||||
if job.Status != "failed" {
|
||||
t.Fatalf("status: got %q want failed", job.Status)
|
||||
}
|
||||
a.mu.Lock()
|
||||
defer a.mu.Unlock()
|
||||
if len(a.raised) != 1 || a.raised[0] != hostID {
|
||||
t.Fatalf("raise calls: %v", a.raised)
|
||||
}
|
||||
if len(a.reasons) == 0 || a.reasons[0] == "" {
|
||||
t.Fatalf("missing reason")
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateWatcherMismatchedVersionNoOp(t *testing.T) {
|
||||
st := openWSTestStore(t)
|
||||
hostID := ulid.Make().String()
|
||||
seedHostWS(t, st, hostID)
|
||||
jobID := seedJob(t, st, hostID)
|
||||
|
||||
a := &fakeAlerts{}
|
||||
w := NewUpdateWatcher(st, a)
|
||||
w.Track(jobID, hostID)
|
||||
|
||||
w.OnHello(context.Background(), hostID, "v1", "v2")
|
||||
|
||||
job, _ := st.GetJob(context.Background(), jobID)
|
||||
if job.Status == "succeeded" || job.Status == "failed" {
|
||||
t.Fatalf("status flipped on mismatched hello: %q", job.Status)
|
||||
}
|
||||
a.mu.Lock()
|
||||
defer a.mu.Unlock()
|
||||
if len(a.raised) != 0 || len(a.resolved) != 0 {
|
||||
t.Fatalf("unexpected alert calls raised=%v resolved=%v", a.raised, a.resolved)
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateWatcherHelloAfterTimeoutIsNoOp(t *testing.T) {
|
||||
prev := updateTimeout
|
||||
updateTimeout = 50 * time.Millisecond
|
||||
t.Cleanup(func() { updateTimeout = prev })
|
||||
|
||||
st := openWSTestStore(t)
|
||||
hostID := ulid.Make().String()
|
||||
seedHostWS(t, st, hostID)
|
||||
jobID := seedJob(t, st, hostID)
|
||||
|
||||
a := &fakeAlerts{}
|
||||
w := NewUpdateWatcher(st, a)
|
||||
w.Track(jobID, hostID)
|
||||
|
||||
time.Sleep(80 * time.Millisecond)
|
||||
w.sweep(context.Background(), time.Now())
|
||||
|
||||
// Hello arrives after sweep — entry already gone, must be no-op.
|
||||
w.OnHello(context.Background(), hostID, "v2", "v2")
|
||||
|
||||
job, _ := st.GetJob(context.Background(), jobID)
|
||||
if job.Status != "failed" {
|
||||
t.Fatalf("status flipped from failed → %q", job.Status)
|
||||
}
|
||||
a.mu.Lock()
|
||||
defer a.mu.Unlock()
|
||||
if len(a.resolved) != 0 {
|
||||
t.Fatalf("late hello triggered ResolveUpdateFailed: %v", a.resolved)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user