p6-01/02: agent self-update + fleet update server cluster

- alert: update_failed (per-host, dedup=hostID) + fleet_update_halted
  (system-scoped, host_id NULL via new RaiseOrTouchSystem helper).
- ws: UpdateWatcher tracks in-flight command.update dispatches and
  reconciles them against incoming hello envelopes — success path
  marks the job succeeded and auto-resolves the alert; 90s timeout
  marks the job failed and raises update_failed.
- http: POST /api/hosts/{id}/update (admin-only JSON) + the HTMX
  /hosts/{id}/update form variant. Pre-checks: host exists, online,
  agent_version != current, no running update job. Refactored core
  into Server.dispatchHostUpdate so the fleet worker can share it
  without going through HTTP.
- fleetupdate: rolling worker iterating through host slots, halting
  on first failure and raising fleet_update_halted. Polling-based
  version-match (re-read hosts.agent_version every 1s up to 95s) —
  no extra plumbing into the WS hello path. At-most-one-running is
  enforced at the store layer (ErrFleetUpdateRunning).
- cmd/server: wire UpdateWatcher and FleetWorker into the main
  goroutine; the worker uses a small serverDispatcher adapter that
  delegates back into Server.DispatchHostUpdate.

Tests: watcher (success/timeout/mismatch/late-hello), HTTP endpoint
(happy + four pre-check branches + RBAC), worker (two-host happy,
timeout-halt, host-offline-halt, already-at-target skip, cancel
mid-run, double-Start guard).
This commit is contained in:
2026-05-06 22:03:50 +01:00
parent c37954aa3f
commit 9d5775fb47
11 changed files with 1540 additions and 2 deletions
+217
View File
@@ -0,0 +1,217 @@
package http
import (
"context"
"encoding/json"
stdhttp "net/http"
"time"
"github.com/go-chi/chi/v5"
"github.com/oklog/ulid/v2"
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
)
// UpdateWatcher is the narrow view of *ws.UpdateWatcher that this
// package needs in order to register in-flight update dispatches. It is
// declared as an interface so tests can inject a stub.
type UpdateWatcher interface {
// Track registers update job jobID as in flight for host hostID.
// dispatchHostUpdate calls it once the command.update envelope has
// been handed to the hub.
Track(jobID, hostID string)
}
// FleetWorker is the narrow view of the fleetupdate.Worker that this
// package holds in Deps. The host update endpoint itself never calls it;
// it is declared here ahead of the fleet update endpoints (P6-15).
type FleetWorker interface {
// NOTE(review): presumably starts a rolling update of hostIDs to
// targetVersion on behalf of userID and returns the fleet-update ID
// that Cancel accepts — confirm against fleetupdate.Worker.
Start(ctx context.Context, userID, targetVersion string, hostIDs []string) (string, error)
// NOTE(review): presumably stops the fleet update identified by
// fleetUpdateID; the exact cancellation semantics are not visible
// from this package — confirm against fleetupdate.Worker.
Cancel(ctx context.Context, fleetUpdateID string) error
}
// dispatchHostUpdateResult is the structured outcome of
// dispatchHostUpdate. Failures are reported through Code/Status/Msg
// rather than an error value, so the HTTP handlers and the fleet worker
// can each render them in their own idiom.
type dispatchHostUpdateResult struct {
// JobID is the newly created job on success. For "update_in_progress"
// it carries the value returned by RunningUpdateJobForHost instead;
// every other failure leaves it empty.
JobID string
Code string // machine-readable failure code; "" on success
Status int // HTTP status the handlers respond with on failure; zero on success
Msg string // human-readable detail (optional)
}
// dispatchHostUpdate is the shared "send command.update to one host"
// path behind both HTTP handlers and the exported DispatchHostUpdate.
//
// Pre-checks run in this order: the host must exist, be connected to
// the hub, not already report version.Version, and have no running
// update job. On success it creates the jobs row, sends the WS
// envelope, registers the job with the UpdateWatcher (when one is
// wired) and appends an audit entry.
//
// Failures are returned as structured codes (plus a suggested HTTP
// status) rather than errors, so the fleet worker can map them onto its
// own per-host status enum without parsing strings. actorKind and
// actorID are recorded on the job and the audit entry; actorID may be
// nil.
func (s *Server) dispatchHostUpdate(ctx context.Context, hostID string, actorKind string, actorID *string) dispatchHostUpdateResult {
	// internal wraps an unexpected error in the generic 500 result.
	internal := func(err error) dispatchHostUpdateResult {
		return dispatchHostUpdateResult{Code: "internal", Status: stdhttp.StatusInternalServerError, Msg: err.Error()}
	}

	// Any lookup error is reported as host_not_found, not only a
	// missing row.
	host, err := s.deps.Store.GetHost(ctx, hostID)
	if err != nil || host == nil {
		return dispatchHostUpdateResult{Code: "host_not_found", Status: stdhttp.StatusNotFound}
	}
	if !s.deps.Hub.Connected(host.ID) {
		return dispatchHostUpdateResult{
			Code: "host_offline", Status: stdhttp.StatusConflict,
			Msg: "agent is not currently connected",
		}
	}
	// An empty agent_version never counts as up to date.
	if host.AgentVersion != "" && host.AgentVersion == version.Version {
		return dispatchHostUpdateResult{
			Code: "already_up_to_date", Status: stdhttp.StatusConflict,
			Msg: "agent already running version " + version.Version,
		}
	}
	existing, err := s.deps.Store.RunningUpdateJobForHost(ctx, hostID)
	if err != nil {
		return internal(err)
	}
	if existing != "" {
		return dispatchHostUpdateResult{
			Code: "update_in_progress", Status: stdhttp.StatusConflict,
			Msg:   "an update job is already in flight for this host",
			JobID: existing,
		}
	}

	jobID := ulid.Make().String()
	now := time.Now().UTC()

	// Build the envelope before touching the store: if marshalling
	// fails, no jobs row is left behind to trip the update_in_progress
	// pre-check on the next attempt.
	env, err := api.Marshal(api.MsgCommandUpdate, ulid.Make().String(), api.CommandUpdatePayload{
		JobID: jobID,
	})
	if err != nil {
		return internal(err)
	}

	if err := s.deps.Store.CreateJob(ctx, store.Job{
		ID: jobID, HostID: hostID, Kind: "update",
		ActorKind: actorKind, ActorID: actorID,
		CreatedAt: now,
	}); err != nil {
		return internal(err)
	}

	if err := s.deps.Hub.Send(ctx, hostID, env); err != nil {
		// Roll the job to failed so we don't leak a queued row. Best
		// effort: the send error is what the caller needs to see.
		_ = s.deps.Store.MarkJobFinished(ctx, jobID, "failed", -1, nil, err.Error(), time.Now().UTC())
		return dispatchHostUpdateResult{
			Code: "host_offline", Status: stdhttp.StatusConflict, Msg: err.Error(),
		}
	}

	// The watcher is optional; without it the dispatch still succeeds.
	if s.deps.UpdateWatcher != nil {
		s.deps.UpdateWatcher.Track(jobID, hostID)
	}

	// Marshalling a map[string]string cannot fail, and auditing is best
	// effort: the command has already been sent at this point.
	auditPayload, _ := json.Marshal(map[string]string{
		"job_id":         jobID,
		"target_version": version.Version,
	})
	_ = s.deps.Store.AppendAudit(ctx, store.AuditEntry{
		ID:         ulid.Make().String(),
		UserID:     actorID,
		Actor:      actorKind,
		Action:     "host.update_dispatched",
		TargetKind: ptr("host"),
		TargetID:   &hostID,
		TS:         now,
		Payload:    auditPayload,
	})
	return dispatchHostUpdateResult{JobID: jobID}
}
// handleHostUpdate serves POST /api/hosts/{id}/update, the JSON variant
// of the single-host update endpoint (mounted behind the admin role
// check). It replies 202 with {"job_id": ...} once the command has been
// dispatched, or with the code/status/message reported by
// dispatchHostUpdate when a pre-check or the dispatch itself fails.
func (s *Server) handleHostUpdate(w stdhttp.ResponseWriter, r *stdhttp.Request) {
	caller, authed := s.requireUser(r)
	if !authed {
		writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorised", "")
		return
	}

	id := chi.URLParam(r, "id")
	if id == "" {
		writeJSONError(w, stdhttp.StatusBadRequest, "missing_host_id", "")
		return
	}

	// The actor ID is only attached when requireUser returned a user.
	var callerID *string
	if caller != nil {
		callerID = &caller.ID
	}

	outcome := s.dispatchHostUpdate(r.Context(), id, "user", callerID)
	if outcome.Code == "" {
		writeJSON(w, stdhttp.StatusAccepted, map[string]string{"job_id": outcome.JobID})
		return
	}
	writeJSONError(w, outcome.Status, outcome.Code, outcome.Msg)
}
// handleHostUpdateForm serves POST /hosts/{id}/update, the HTMX variant
// of the single-host update endpoint. A successful dispatch answers 200
// with an HX-Redirect header pointing at /jobs/{jobID}; a failed one
// answers with the status from dispatchHostUpdate and a small HTML
// error banner suitable for an inline swap.
func (s *Server) handleHostUpdateForm(w stdhttp.ResponseWriter, r *stdhttp.Request) {
	caller, authed := s.requireUser(r)
	if !authed {
		stdhttp.Error(w, "unauthorised", stdhttp.StatusUnauthorized)
		return
	}

	id := chi.URLParam(r, "id")
	if id == "" {
		stdhttp.Error(w, "missing host_id", stdhttp.StatusBadRequest)
		return
	}

	// The actor ID is only attached when requireUser returned a user.
	var callerID *string
	if caller != nil {
		callerID = &caller.ID
	}

	outcome := s.dispatchHostUpdate(r.Context(), id, "user", callerID)
	if outcome.Code == "" {
		w.Header().Set("HX-Redirect", "/jobs/"+outcome.JobID)
		w.WriteHeader(stdhttp.StatusOK)
		return
	}

	// Failure: render the friendly message as an escaped text/html
	// fragment.
	banner := `<div class="banner banner-error" role="alert">` +
		htmlEscape(hostUpdateErrorMessage(outcome.Code, outcome.Msg)) +
		`</div>`
	w.Header().Set("Content-Type", "text/html; charset=utf-8")
	w.WriteHeader(outcome.Status)
	_, _ = w.Write([]byte(banner))
}
// hostUpdateErrorMessage maps a dispatch failure code to the sentence
// shown in the HTMX error banner. Known codes always use their fixed
// wording; any other code falls back to msg, or to a generic sentence
// when msg is empty.
func hostUpdateErrorMessage(code, msg string) string {
	text := msg
	switch code {
	case "host_not_found":
		text = "Host not found."
	case "host_offline":
		text = "Agent is offline; can't deliver the update command."
	case "already_up_to_date":
		text = "Agent is already running the current version."
	case "update_in_progress":
		text = "An update is already in progress for this host."
	}
	if text == "" {
		return "Update dispatch failed."
	}
	return text
}
// htmlEscape is a minimal escaper for text placed in HTML element
// content or a quoted attribute value. It replaces &, <, >, " and '
// with entities and copies every other byte through unchanged, which
// keeps multi-byte UTF-8 sequences intact. Avoids pulling in
// html/template for a one-shot inline banner.
func htmlEscape(s string) string {
	out := make([]byte, 0, len(s))
	for i := 0; i < len(s); i++ {
		switch c := s[i]; c {
		case '&':
			out = append(out, "&amp;"...)
		case '<':
			out = append(out, "&lt;"...)
		case '>':
			out = append(out, "&gt;"...)
		case '"':
			out = append(out, "&quot;"...)
		case '\'':
			// Needed for single-quoted attribute values; "&#39;" rather
			// than "&apos;" because the latter is not defined in HTML 4.
			out = append(out, "&#39;"...)
		default:
			out = append(out, c)
		}
	}
	return string(out)
}
+270
View File
@@ -0,0 +1,270 @@
// host_update_test.go — covers POST /api/hosts/{id}/update.
package http
import (
"context"
"encoding/json"
"io"
stdhttp "net/http"
"strings"
"sync"
"testing"
"time"
"github.com/coder/websocket"
"github.com/oklog/ulid/v2"
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
)
// stubWatcher is a test double for UpdateWatcher. It remembers the host
// ID of every Track call so a test can assert that the handler notified
// the watcher.
type stubWatcher struct {
	mu      sync.Mutex
	tracked []string // host IDs, in call order
}

// Track appends hostID to the recorded list under the mutex; the job ID
// argument is ignored.
func (w *stubWatcher) Track(_, hostID string) {
	w.mu.Lock()
	w.tracked = append(w.tracked, hostID)
	w.mu.Unlock()
}
// TestHostUpdateHappyPath drives POST /api/hosts/{id}/update as an admin
// for a connected agent whose stored version differs from the server's.
// It checks the 202 response and its job_id, the command.update envelope
// arriving on the agent socket with the same job ID, the watcher Track
// call, and the host.update_dispatched audit row.
func TestHostUpdateHappyPath(t *testing.T) {
t.Parallel()
srv, ts, st := rawTestServer(t)
watcher := &stubWatcher{}
srv.deps.UpdateWatcher = watcher
hostID, token := enrolHostForWS(t, srv, st, "upd-host")
c := agentDial(t, srv, ts, hostID, token)
sendHello(t, c, "upd-host")
_ = drainUntil(t, c, api.MsgScheduleSet)
// Force a version mismatch so the dispatch isn't short-circuited.
if err := st.MarkHostHello(context.Background(), hostID, "v0", "0.17", api.CurrentProtocolVersion, time.Now().UTC()); err != nil {
t.Fatalf("mark hello: %v", err)
}
cookie := loginAsAdmin(t, st)
req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/hosts/"+hostID+"/update", nil)
req.AddCookie(cookie)
res, err := stdhttp.DefaultClient.Do(req)
if err != nil {
t.Fatalf("do: %v", err)
}
defer res.Body.Close()
if res.StatusCode != stdhttp.StatusAccepted {
t.Fatalf("status: got %d, want 202", res.StatusCode)
}
var out struct {
JobID string `json:"job_id"`
}
if err := json.NewDecoder(res.Body).Decode(&out); err != nil {
t.Fatalf("decode: %v", err)
}
if out.JobID == "" {
t.Fatal("missing job_id in response")
}
// command.update envelope arrives. Frames are read for up to two
// seconds, each read with its own 500ms budget; non-text frames and
// frames that don't mention "command.update" are skipped.
deadline := time.Now().Add(2 * time.Second)
var got api.Envelope
for time.Now().Before(deadline) {
ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
mt, raw, rerr := c.Read(ctx)
cancel()
if rerr != nil {
// Any read error (including the per-read timeout) ends the search;
// got stays zero-valued and the type check below fails the test.
break
}
if mt != websocket.MessageText {
continue
}
if !strings.Contains(string(raw), `"command.update"`) {
continue
}
// A decode failure is deliberately ignored here: it leaves got.Type
// unset, which the check right after the loop reports.
_ = json.Unmarshal(raw, &got)
break
}
if got.Type != api.MsgCommandUpdate {
t.Fatal("never received command.update envelope")
}
var cp api.CommandUpdatePayload
if err := got.UnmarshalPayload(&cp); err != nil {
t.Fatalf("payload: %v", err)
}
if cp.JobID != out.JobID {
t.Fatalf("payload job_id: got %q want %q", cp.JobID, out.JobID)
}
// Watcher tracked. The stub's mutex stays held until the test returns.
watcher.mu.Lock()
defer watcher.mu.Unlock()
if len(watcher.tracked) != 1 || watcher.tracked[0] != hostID {
t.Fatalf("watcher tracked: %v", watcher.tracked)
}
// Audit row exists.
var n int
if err := st.DB().QueryRow(
`SELECT COUNT(*) FROM audit_log WHERE action = 'host.update_dispatched' AND target_id = ?`,
hostID).Scan(&n); err != nil {
t.Fatalf("audit count: %v", err)
}
if n != 1 {
t.Fatalf("audit rows: got %d, want 1", n)
}
}
// TestHostUpdateNotFound checks that an admin POST for a host ID that
// was never created is answered with 404.
func TestHostUpdateNotFound(t *testing.T) {
	t.Parallel()
	_, ts, st := rawTestServer(t)

	adminCookie := loginAsAdmin(t, st)
	target := ts.URL + "/api/hosts/no-such/update"
	request, _ := stdhttp.NewRequest("POST", target, nil)
	request.AddCookie(adminCookie)

	resp, err := stdhttp.DefaultClient.Do(request)
	if err != nil {
		t.Fatalf("do: %v", err)
	}
	defer resp.Body.Close()

	if got := resp.StatusCode; got != stdhttp.StatusNotFound {
		t.Fatalf("status: got %d want 404", got)
	}
}
// TestHostUpdateOffline creates a host row without ever connecting an
// agent for it and checks that the update endpoint answers 409 with the
// host_offline code.
func TestHostUpdateOffline(t *testing.T) {
	t.Parallel()
	_, ts, st := rawTestServer(t)

	hostID := ulid.Make().String()
	offline := store.Host{
		ID: hostID, Name: "off", OS: "linux", Arch: "amd64",
		EnrolledAt: time.Now().UTC(),
	}
	if err := st.CreateHost(context.Background(), offline, "deadbeef", ""); err != nil {
		t.Fatalf("create: %v", err)
	}

	adminCookie := loginAsAdmin(t, st)
	target := ts.URL + "/api/hosts/" + hostID + "/update"
	request, _ := stdhttp.NewRequest("POST", target, nil)
	request.AddCookie(adminCookie)

	resp, err := stdhttp.DefaultClient.Do(request)
	if err != nil {
		t.Fatalf("do: %v", err)
	}
	defer resp.Body.Close()

	if got := resp.StatusCode; got != stdhttp.StatusConflict {
		t.Fatalf("status: got %d want 409", got)
	}
	if code := readJSONError(t, resp.Body).Code; code != "host_offline" {
		t.Fatalf("code: %q", code)
	}
}
// TestHostUpdateAlreadyUpToDate connects an agent, stores version.Version
// as its agent version, and checks that the update endpoint refuses with
// 409 and the already_up_to_date code.
func TestHostUpdateAlreadyUpToDate(t *testing.T) {
	t.Parallel()
	srv, ts, st := rawTestServer(t)

	const hostName = "uptodate-host"
	hostID, token := enrolHostForWS(t, srv, st, hostName)
	conn := agentDial(t, srv, ts, hostID, token)
	sendHello(t, conn, hostName)
	_ = drainUntil(t, conn, api.MsgScheduleSet)

	// Make the stored agent_version equal to the server's own version.
	err := st.MarkHostHello(context.Background(), hostID, version.Version, "0.17", api.CurrentProtocolVersion, time.Now().UTC())
	if err != nil {
		t.Fatalf("mark hello: %v", err)
	}

	adminCookie := loginAsAdmin(t, st)
	target := ts.URL + "/api/hosts/" + hostID + "/update"
	request, _ := stdhttp.NewRequest("POST", target, nil)
	request.AddCookie(adminCookie)

	resp, err := stdhttp.DefaultClient.Do(request)
	if err != nil {
		t.Fatalf("do: %v", err)
	}
	defer resp.Body.Close()

	if got := resp.StatusCode; got != stdhttp.StatusConflict {
		t.Fatalf("status: got %d want 409", got)
	}
	if code := readJSONError(t, resp.Body).Code; code != "already_up_to_date" {
		t.Fatalf("code: %q", code)
	}
}
// TestHostUpdateInProgress connects an out-of-date agent, seeds an
// update job for it directly in the store, and checks that a second
// dispatch is refused with 409 and the update_in_progress code.
func TestHostUpdateInProgress(t *testing.T) {
	t.Parallel()
	srv, ts, st := rawTestServer(t)

	const hostName = "inprog-host"
	hostID, token := enrolHostForWS(t, srv, st, hostName)
	conn := agentDial(t, srv, ts, hostID, token)
	sendHello(t, conn, hostName)
	_ = drainUntil(t, conn, api.MsgScheduleSet)

	bg := context.Background()
	if err := st.MarkHostHello(bg, hostID, "v0", "0.17", api.CurrentProtocolVersion, time.Now().UTC()); err != nil {
		t.Fatalf("mark hello: %v", err)
	}

	// Pre-seed an in-flight update job.
	seeded := store.Job{
		ID: ulid.Make().String(), HostID: hostID, Kind: "update",
		ActorKind: "user", CreatedAt: time.Now().UTC(),
	}
	if err := st.CreateJob(context.Background(), seeded); err != nil {
		t.Fatalf("seed job: %v", err)
	}

	adminCookie := loginAsAdmin(t, st)
	target := ts.URL + "/api/hosts/" + hostID + "/update"
	request, _ := stdhttp.NewRequest("POST", target, nil)
	request.AddCookie(adminCookie)

	resp, err := stdhttp.DefaultClient.Do(request)
	if err != nil {
		t.Fatalf("do: %v", err)
	}
	defer resp.Body.Close()

	if got := resp.StatusCode; got != stdhttp.StatusConflict {
		t.Fatalf("status: got %d want 409", got)
	}
	if code := readJSONError(t, resp.Body).Code; code != "update_in_progress" {
		t.Fatalf("code: %q", code)
	}
}
// TestHostUpdateRBAC checks that the update endpoint answers 403 for
// sessions holding the viewer or operator role.
func TestHostUpdateRBAC(t *testing.T) {
	t.Parallel()
	_, ts, st := rawTestServer(t)

	hostID := ulid.Make().String()
	fixture := store.Host{
		ID: hostID, Name: "rbac-host", OS: "linux", Arch: "amd64",
		EnrolledAt: time.Now().UTC(),
	}
	if err := st.CreateHost(context.Background(), fixture, "deadbeef", ""); err != nil {
		t.Fatalf("create: %v", err)
	}
	target := ts.URL + "/api/hosts/" + hostID + "/update"

	nonAdmin := []store.Role{store.RoleViewer, store.RoleOperator}
	for _, candidate := range nonAdmin {
		role := candidate
		t.Run(string(role), func(t *testing.T) {
			roleCookie := loginAsRole(t, st, role)
			request, _ := stdhttp.NewRequest("POST", target, nil)
			request.AddCookie(roleCookie)

			resp, err := stdhttp.DefaultClient.Do(request)
			if err != nil {
				t.Fatalf("do: %v", err)
			}
			defer resp.Body.Close()

			if got := resp.StatusCode; got != stdhttp.StatusForbidden {
				t.Fatalf("status for %s: got %d want 403", role, got)
			}
		})
	}
}
// jsonErrBody is the shape these tests decode JSON error responses
// into: a machine-readable code plus an optional message.
type jsonErrBody struct {
Code string `json:"code"`
Message string `json:"message,omitempty"`
}
// readJSONError decodes a single JSON error object from body and
// returns it, failing the calling test if the body does not decode.
func readJSONError(t *testing.T, body io.Reader) jsonErrBody {
	t.Helper()

	decoded := jsonErrBody{}
	dec := json.NewDecoder(body)
	if err := dec.Decode(&decoded); err != nil {
		t.Fatalf("decode error body: %v", err)
	}
	return decoded
}
+37 -2
View File
@@ -39,6 +39,13 @@ type Deps struct {
// NotificationHub (optional, wired in G1) is used by the test-fire
// endpoint to dispatch a single synthetic payload through a channel.
NotificationHub *notification.Hub
// UpdateWatcher tracks in-flight agent self-update dispatches and
// reconciles them against incoming hello envelopes. Optional;
// nil = no-op (handlers degrade by skipping the Track call).
UpdateWatcher UpdateWatcher
// FleetWorker drives the rolling fleet-update worker. Optional;
// nil = fleet update endpoints (P6-15) report unavailable.
FleetWorker FleetWorker
// Version is the binary's build version, surfaced in the chrome.
// Empty falls back to "dev".
Version string
@@ -125,7 +132,7 @@ func (s *Server) routes(r chi.Router) {
r.Get("/install/*", s.handleInstallAsset)
r.Get("/api/version", s.handleVersion)
if s.deps.Hub != nil {
r.Mount("/ws/agent", ws.AgentHandler(ws.HandlerDeps{
hd := ws.HandlerDeps{
Hub: s.deps.Hub,
Store: s.deps.Store,
JobHub: s.deps.JobHub,
@@ -133,7 +140,11 @@ func (s *Server) routes(r chi.Router) {
OnHello: s.onAgentHello,
OnScheduleAck: s.applyScheduleAck,
OnScheduleFire: s.dispatchScheduledJob,
}))
}
if w, ok := s.deps.UpdateWatcher.(*ws.UpdateWatcher); ok && w != nil {
hd.UpdateWatcher = w
}
r.Mount("/ws/agent", ws.AgentHandler(hd))
}
r.Get("/ws/agent/pending", s.handlePendingWS)
r.Mount("/static/", staticHandler())
@@ -271,6 +282,9 @@ func (s *Server) routes(r chi.Router) {
r.Group(func(r chi.Router) {
r.Use(s.requireRole(store.RoleAdmin))
r.Post("/api/hosts/{id}/update", s.handleHostUpdate)
r.Post("/hosts/{id}/update", s.handleHostUpdateForm)
r.Get("/api/users", s.handleAPIUsersList)
r.Post("/api/users", s.handleAPIUserCreate)
r.Get("/api/users/{id}", s.handleAPIUserGet)
@@ -322,6 +336,27 @@ func (s *Server) Shutdown(ctx context.Context) error {
return s.srv.Shutdown(ctx)
}
// SetFleetWorker stores fw as the server's fleet-update worker after the
// server has been constructed. It exists to break the wiring loop in
// cmd/server: the worker needs a dispatcher that delegates back into the
// server's host-update path, so the server has to be built first.
func (s *Server) SetFleetWorker(fw FleetWorker) {
	s.deps.FleetWorker = fw
}
// DispatchHostUpdate is the exported entry point for callers (the fleet
// worker) that need the same dispatch path as the HTTP handlers without
// going through HTTP.
//
// An empty actorUserID is passed on as "no actor ID"; the actor kind is
// always "user". The result is flattened to (jobID, code): code is ""
// on success and a pre-check/dispatch failure code otherwise, in which
// case jobID is only set for "update_in_progress". err is always nil in
// the current implementation; failures are reported through code.
func (s *Server) DispatchHostUpdate(ctx context.Context, hostID, actorUserID string) (jobID string, code string, err error) {
	var actor *string
	if actorUserID != "" {
		actor = &actorUserID
	}
	outcome := s.dispatchHostUpdate(ctx, hostID, "user", actor)
	return outcome.JobID, outcome.Code, nil
}
// Addr reports the listen address held by the underlying HTTP server.
// Useful in tests when the caller passes :0 to get a random port.
func (s *Server) Addr() string {
	return s.srv.Addr
}