// fleet_update.go — admin-only fleet rolling-update endpoints + page.
//
// Surface:
//   - POST /api/fleet/update → starts a fleet update (JSON)
//   - POST /api/fleet-updates/{id}/cancel
//   - GET /api/fleet-updates/{id} → JSON parent + per-host array
//   - GET /settings/fleet-update → admin UI page
//   - GET /settings/fleet-update/partial → htmx polling fragment
//
// All routes are mounted in the admin band (see routes()).
package http

import (
	"context"
	"encoding/json"
	"errors"
	"log/slog"
	stdhttp "net/http"
	"time"

	"github.com/go-chi/chi/v5"
	"github.com/oklog/ulid/v2"

	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
	"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
)

// fleetUpdateStartReq is the JSON body for POST /api/fleet/update.
// Both fields are optional: an empty target_version defaults to the
// server's current version (version.Version), and an empty host_ids
// makes the handler derive the out-of-date online subset itself.
type fleetUpdateStartReq struct {
	TargetVersion string   `json:"target_version,omitempty"`
	HostIDs       []string `json:"host_ids,omitempty"`
}

// fleetUpdateHostView is one row in the JSON response for GET
// /api/fleet-updates/{id}. HostName is hydrated from the store so
// callers don't need a second round-trip per host; it is omitted from
// the JSON when no name was found for the host ID. JobID and
// FailedReason are likewise omitted when empty.
//
// NOTE(review): Position presumably is the host's order within the
// roll — confirm against the store's per-host row definition.
type fleetUpdateHostView struct {
	HostID       string `json:"host_id"`
	HostName     string `json:"host_name,omitempty"`
	Position     int    `json:"position"`
	Status       string `json:"status"`
	JobID        string `json:"job_id,omitempty"`
	FailedReason string `json:"failed_reason,omitempty"`
}

// fleetUpdateView is the JSON projection of the parent + children.
type fleetUpdateView struct { ID string `json:"id"` StartedAt string `json:"started_at"` StartedByUserID string `json:"started_by_user_id"` TargetVersion string `json:"target_version"` Status string `json:"status"` CurrentHostID string `json:"current_host_id,omitempty"` HaltedReason string `json:"halted_reason,omitempty"` CompletedAt *string `json:"completed_at,omitempty"` Hosts []fleetUpdateHostView `json:"hosts"` } // fleetUpdatePage backs both the full /settings/fleet-update page // and the partial polled fragment. Idle / Active are mutually // exclusive: if Active is non-nil, render the progress view. type fleetUpdatePage struct { // Idle-state fields. OutOfDateHosts []store.Host // online hosts whose version != target TargetVersion string // Active-state fields. Nil when no fleet update has ever run. Active *store.FleetUpdate ActiveRows []fleetUpdateHostView // Common. HostNames map[string]string // PollURL is the partial endpoint htmx polls every few seconds. PollURL string } // handleAPIFleetUpdateStart is POST /api/fleet/update. func (s *Server) handleAPIFleetUpdateStart(w stdhttp.ResponseWriter, r *stdhttp.Request) { user, ok := s.requireUser(r) if !ok { writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorised", "") return } if s.deps.FleetWorker == nil { writeJSONError(w, stdhttp.StatusServiceUnavailable, "fleet_worker_unavailable", "") return } var body fleetUpdateStartReq // Empty body is fine — both fields are optional. 
if r.ContentLength != 0 { if err := json.NewDecoder(r.Body).Decode(&body); err != nil { writeJSONError(w, stdhttp.StatusBadRequest, "invalid_json", err.Error()) return } } target := body.TargetVersion if target == "" { target = version.Version } hostIDs := body.HostIDs if len(hostIDs) == 0 { derived, err := s.deriveOutOfDateOnlineHostIDs(r.Context(), target) if err != nil { writeJSONError(w, stdhttp.StatusInternalServerError, "internal", err.Error()) return } hostIDs = derived } if len(hostIDs) == 0 { writeJSONError(w, stdhttp.StatusConflict, "no_hosts_eligible", "no online hosts are out of date") return } fuID, err := s.deps.FleetWorker.Start(r.Context(), user.ID, target, hostIDs) if err != nil { if errors.Is(err, store.ErrFleetUpdateRunning) { writeJSONError(w, stdhttp.StatusConflict, "fleet_update_in_progress", err.Error()) return } writeJSONError(w, stdhttp.StatusInternalServerError, "internal", err.Error()) return } auditPayload, _ := json.Marshal(map[string]any{ "fleet_update_id": fuID, "target_version": target, "host_count": len(hostIDs), }) _ = s.deps.Store.AppendAudit(r.Context(), store.AuditEntry{ ID: ulid.Make().String(), UserID: &user.ID, Actor: "user", Action: "fleet.update_started", TargetKind: ptr("fleet_update"), TargetID: &fuID, TS: time.Now().UTC(), Payload: auditPayload, }) writeJSON(w, stdhttp.StatusAccepted, map[string]string{"fleet_update_id": fuID}) } // handleAPIFleetUpdateCancel is POST /api/fleet-updates/{id}/cancel. 
func (s *Server) handleAPIFleetUpdateCancel(w stdhttp.ResponseWriter, r *stdhttp.Request) { user, ok := s.requireUser(r) if !ok { writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorised", "") return } if s.deps.FleetWorker == nil { writeJSONError(w, stdhttp.StatusServiceUnavailable, "fleet_worker_unavailable", "") return } fuID := chi.URLParam(r, "id") if fuID == "" { writeJSONError(w, stdhttp.StatusBadRequest, "missing_id", "") return } fu, _, err := s.deps.Store.GetFleetUpdate(r.Context(), fuID) if err != nil { if errors.Is(err, store.ErrNotFound) { writeJSONError(w, stdhttp.StatusNotFound, "fleet_update_not_found", "") return } writeJSONError(w, stdhttp.StatusInternalServerError, "internal", err.Error()) return } if fu.Status != "running" { writeJSONError(w, stdhttp.StatusConflict, "fleet_update_not_running", "fleet update is not in the running state") return } if err := s.deps.FleetWorker.Cancel(r.Context(), fuID); err != nil { writeJSONError(w, stdhttp.StatusInternalServerError, "internal", err.Error()) return } _ = s.deps.Store.AppendAudit(r.Context(), store.AuditEntry{ ID: ulid.Make().String(), UserID: &user.ID, Actor: "user", Action: "fleet.update_cancelled", TargetKind: ptr("fleet_update"), TargetID: &fuID, TS: time.Now().UTC(), }) w.WriteHeader(stdhttp.StatusNoContent) } // handleAPIFleetUpdateGet is GET /api/fleet-updates/{id}. 
func (s *Server) handleAPIFleetUpdateGet(w stdhttp.ResponseWriter, r *stdhttp.Request) { if _, ok := s.requireUser(r); !ok { writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorised", "") return } fuID := chi.URLParam(r, "id") fu, hosts, err := s.deps.Store.GetFleetUpdate(r.Context(), fuID) if err != nil { if errors.Is(err, store.ErrNotFound) { writeJSONError(w, stdhttp.StatusNotFound, "fleet_update_not_found", "") return } writeJSONError(w, stdhttp.StatusInternalServerError, "internal", err.Error()) return } names := s.hostNameMap(r) view := fleetUpdateView{ ID: fu.ID, StartedAt: fu.StartedAt.UTC().Format(time.RFC3339Nano), StartedByUserID: fu.StartedByUserID, TargetVersion: fu.TargetVersion, Status: fu.Status, CurrentHostID: fu.CurrentHostID, HaltedReason: fu.HaltedReason, Hosts: make([]fleetUpdateHostView, 0, len(hosts)), } if fu.CompletedAt != nil { s := fu.CompletedAt.UTC().Format(time.RFC3339Nano) view.CompletedAt = &s } for _, h := range hosts { view.Hosts = append(view.Hosts, fleetUpdateHostView{ HostID: h.HostID, HostName: names[h.HostID], Position: h.Position, Status: h.Status, JobID: h.JobID, FailedReason: h.FailedReason, }) } writeJSON(w, stdhttp.StatusOK, view) } // handleUIFleetUpdate renders /settings/fleet-update. func (s *Server) handleUIFleetUpdate(w stdhttp.ResponseWriter, r *stdhttp.Request) { u := s.requireUIUser(w, r) if u == nil { return } page, err := s.buildFleetUpdatePage(r) if err != nil { slog.Error("ui fleet update: build page", "err", err) stdhttp.Error(w, "internal", stdhttp.StatusInternalServerError) return } view := s.baseView(r, u) view.Title = "Fleet update · restic-manager" view.Active = "settings" view.Page = page if err := s.deps.UI.Render(w, "fleet_update", view); err != nil { slog.Error("ui fleet update: render", "err", err) } } // handleUIFleetUpdatePartial renders just the inner panel for htmx // auto-refresh polling — same data, no chrome. 
func (s *Server) handleUIFleetUpdatePartial(w stdhttp.ResponseWriter, r *stdhttp.Request) { u := s.requireUIUser(w, r) if u == nil { return } page, err := s.buildFleetUpdatePage(r) if err != nil { slog.Error("ui fleet update partial: build page", "err", err) stdhttp.Error(w, "internal", stdhttp.StatusInternalServerError) return } view := s.baseView(r, u) view.Page = page if err := s.deps.UI.RenderPartial(w, "fleet_update_inner", view); err != nil { slog.Error("ui fleet update partial: render", "err", err) } } // buildFleetUpdatePage assembles the data both /settings/fleet-update // and its partial render against. Resolves the most-recent fleet // update (active OR completed/cancelled/halted) so the page can show // the last roll's result instead of disappearing into "idle" the // instant a roll finishes. func (s *Server) buildFleetUpdatePage(r *stdhttp.Request) (fleetUpdatePage, error) { page := fleetUpdatePage{ TargetVersion: version.Version, HostNames: map[string]string{}, PollURL: "/settings/fleet-update/partial", } hosts, err := s.deps.Store.ListHosts(r.Context()) if err != nil { return page, err } for _, h := range hosts { page.HostNames[h.ID] = h.Name } active, err := s.deps.Store.ActiveFleetUpdate(r.Context()) if err != nil { return page, err } mostRecent := active if mostRecent == nil { // Fall back to the most recent terminal row so the page can // show "completed" / "halted" / "cancelled" once the worker // finishes. One small bespoke query — keeps the page from // flashing back to "idle" the instant a roll wraps up. var id string err := s.deps.Store.DB().QueryRowContext(r.Context(), `SELECT id FROM fleet_updates ORDER BY started_at DESC LIMIT 1`). 
Scan(&id) if err == nil { fu, _, gerr := s.deps.Store.GetFleetUpdate(r.Context(), id) if gerr == nil { mostRecent = fu } } } if mostRecent != nil { _, rows, gerr := s.deps.Store.GetFleetUpdate(r.Context(), mostRecent.ID) if gerr == nil { page.Active = mostRecent page.ActiveRows = make([]fleetUpdateHostView, 0, len(rows)) for _, hr := range rows { page.ActiveRows = append(page.ActiveRows, fleetUpdateHostView{ HostID: hr.HostID, HostName: page.HostNames[hr.HostID], Position: hr.Position, Status: hr.Status, JobID: hr.JobID, FailedReason: hr.FailedReason, }) } } } // Idle list (or "still out of date" reference even when an active // roll is running — cheap to compute, harmless to attach). for _, h := range hosts { if h.Status != "online" { continue } if h.AgentVersion == "" || h.AgentVersion == page.TargetVersion { continue } page.OutOfDateHosts = append(page.OutOfDateHosts, h) } return page, nil } // deriveOutOfDateOnlineHostIDs returns the list of host IDs that // (a) are online (Hub.Connected) and (b) have an agent_version that's // non-empty AND != target. Used by the start endpoint when the caller // omits host_ids. func (s *Server) deriveOutOfDateOnlineHostIDs(ctx context.Context, target string) ([]string, error) { hosts, err := s.deps.Store.ListHosts(ctx) if err != nil { return nil, err } out := []string{} for _, h := range hosts { if h.AgentVersion == "" || h.AgentVersion == target { continue } if !s.deps.Hub.Connected(h.ID) { continue } out = append(out, h.ID) } return out, nil } // hostNameMap returns hostID → name; used to hydrate fleet-update // JSON responses. func (s *Server) hostNameMap(r *stdhttp.Request) map[string]string { out := map[string]string{} hosts, err := s.deps.Store.ListHosts(r.Context()) if err != nil { return out } for _, h := range hosts { out[h.ID] = h.Name } return out }