p6-01/02: agent self-update + fleet update server cluster

- alert: update_failed (per-host, dedup=hostID) + fleet_update_halted
  (system-scoped, host_id NULL via new RaiseOrTouchSystem helper).
- ws: UpdateWatcher tracks in-flight command.update dispatches and
  reconciles them against incoming hello envelopes — success path
  marks the job succeeded and auto-resolves the alert; 90s timeout
  marks the job failed and raises update_failed.
- http: POST /api/hosts/{id}/update (admin-only JSON) + the HTMX
  /hosts/{id}/update form variant. Pre-checks: host exists, online,
  agent_version != current, no running update job. Refactored core
  into Server.dispatchHostUpdate so the fleet worker can share it
  without going through HTTP.
- fleetupdate: rolling worker iterating through host slots, halting
  on first failure and raising fleet_update_halted. Polling-based
  version-match (re-read hosts.agent_version every 1s up to 95s) —
  no extra plumbing into the WS hello path. At-most-one-running is
  enforced at the store layer (ErrFleetUpdateRunning).
- cmd/server: wire UpdateWatcher and FleetWorker into the main
  goroutine; the worker uses a small serverDispatcher adapter that
  delegates back into Server.DispatchHostUpdate.

Tests: watcher (success/timeout/mismatch/late-hello), HTTP endpoint
(happy + four pre-check branches + RBAC), worker (two-host happy,
timeout-halt, host-offline-halt, already-at-target skip, cancel
mid-run, double-Start guard).
This commit is contained in:
2026-05-06 22:03:50 +01:00
parent c37954aa3f
commit 9d5775fb47
11 changed files with 1540 additions and 2 deletions
+37 -2
View File
@@ -39,6 +39,13 @@ type Deps struct {
// NotificationHub (optional, wired in G1) is used by the test-fire
// endpoint to dispatch a single synthetic payload through a channel.
NotificationHub *notification.Hub
// UpdateWatcher tracks in-flight agent self-update dispatches and
// reconciles them against incoming hello envelopes. Optional;
// nil = no-op (handlers degrade by skipping the Track call).
UpdateWatcher UpdateWatcher
// FleetWorker drives the rolling fleet-update worker. Optional;
// nil = fleet update endpoints (P6-15) report unavailable.
FleetWorker FleetWorker
// Version is the binary's build version, surfaced in the chrome.
// Empty falls back to "dev".
Version string
@@ -125,7 +132,7 @@ func (s *Server) routes(r chi.Router) {
r.Get("/install/*", s.handleInstallAsset)
r.Get("/api/version", s.handleVersion)
if s.deps.Hub != nil {
r.Mount("/ws/agent", ws.AgentHandler(ws.HandlerDeps{
hd := ws.HandlerDeps{
Hub: s.deps.Hub,
Store: s.deps.Store,
JobHub: s.deps.JobHub,
@@ -133,7 +140,11 @@ func (s *Server) routes(r chi.Router) {
OnHello: s.onAgentHello,
OnScheduleAck: s.applyScheduleAck,
OnScheduleFire: s.dispatchScheduledJob,
}))
}
if w, ok := s.deps.UpdateWatcher.(*ws.UpdateWatcher); ok && w != nil {
hd.UpdateWatcher = w
}
r.Mount("/ws/agent", ws.AgentHandler(hd))
}
r.Get("/ws/agent/pending", s.handlePendingWS)
r.Mount("/static/", staticHandler())
@@ -271,6 +282,9 @@ func (s *Server) routes(r chi.Router) {
r.Group(func(r chi.Router) {
r.Use(s.requireRole(store.RoleAdmin))
r.Post("/api/hosts/{id}/update", s.handleHostUpdate)
r.Post("/hosts/{id}/update", s.handleHostUpdateForm)
r.Get("/api/users", s.handleAPIUsersList)
r.Post("/api/users", s.handleAPIUserCreate)
r.Get("/api/users/{id}", s.handleAPIUserGet)
@@ -322,6 +336,27 @@ func (s *Server) Shutdown(ctx context.Context) error {
return s.srv.Shutdown(ctx)
}
// SetFleetWorker attaches the fleet-update worker to an already
// constructed Server. cmd/server relies on this to break a wiring
// cycle: the worker is built around a dispatcher that delegates back
// into the server's host-update path, so it cannot be supplied up front.
func (s *Server) SetFleetWorker(fw FleetWorker) {
	s.deps.FleetWorker = fw
}
// DispatchHostUpdate exposes the host-update dispatch path used by the
// HTTP handler to in-process callers such as the fleet worker, so they
// do not have to go through HTTP.
//
// An empty actorUserID is forwarded as a nil actor; any other value is
// forwarded by pointer. The dispatch is always attributed to the "user"
// actor kind.
//
// It returns the job ID and the result code from the shared dispatch
// path; an empty code means no code was reported. The error result is
// currently always nil: callers map the returned code onto their own
// status enum.
func (s *Server) DispatchHostUpdate(ctx context.Context, hostID, actorUserID string) (jobID string, code string, err error) {
	actor := &actorUserID
	if actorUserID == "" {
		// No actor supplied: pass nil rather than a pointer to "".
		actor = nil
	}

	result := s.dispatchHostUpdate(ctx, hostID, "user", actor)
	return result.JobID, result.Code, nil
}
// Addr reports the listen address held by the underlying server. Tests
// that pass :0 to obtain a random port use it to look the address up.
// NOTE(review): this reads the configured Addr value — confirm it
// reflects the actual bound port when :0 is used.
func (s *Server) Addr() string {
	return s.srv.Addr
}