p6-01/02: agent self-update + fleet update server cluster
- alert: update_failed (per-host, dedup=hostID) + fleet_update_halted
(system-scoped, host_id NULL via new RaiseOrTouchSystem helper).
- ws: UpdateWatcher tracks in-flight command.update dispatches and
reconciles them against incoming hello envelopes — success path
marks the job succeeded and auto-resolves the alert; 90s timeout
marks the job failed and raises update_failed.
- http: POST /api/hosts/{id}/update (admin-only JSON) + the HTMX
/hosts/{id}/update form variant. Pre-checks: host exists, online,
agent_version != current, no running update job. Refactored core
into Server.DispatchHostUpdate so the fleet worker can share it
without going through HTTP.
- fleetupdate: rolling worker iterating through host slots, halting
on first failure and raising fleet_update_halted. Polling-based
version-match (re-read hosts.agent_version every 1s up to 95s) —
no extra plumbing into the WS hello path. At-most-one-running is
enforced at the store layer (ErrFleetUpdateRunning).
- cmd/server: wire UpdateWatcher and FleetWorker into run() (the
watcher runs in its own goroutine); the worker uses a small
serverDispatcher adapter that delegates back into
Server.DispatchHostUpdate.
Tests: watcher (success/timeout/mismatch/late-hello), HTTP endpoint
(happy + four pre-check branches + RBAC), worker (two-host happy,
timeout-halt, host-offline-halt, already-at-target skip, cancel
mid-run, double-Start guard).
This commit is contained in:
@@ -17,6 +17,7 @@ import (
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/crypto"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/notification"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/config"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/fleetupdate"
|
||||
rmhttp "gitea.dcglab.co.uk/steve/restic-manager/internal/server/http"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/maintenance"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/oidc"
|
||||
@@ -91,6 +92,7 @@ func run() error {
|
||||
|
||||
notifHub := notification.NewHub(st, aead, cfg.BaseURL)
|
||||
alertEngine := alert.NewEngine(st, notifHub)
|
||||
updateWatcher := ws.NewUpdateWatcher(st, alertEngine)
|
||||
|
||||
renderer, err := ui.New()
|
||||
if err != nil {
|
||||
@@ -116,6 +118,7 @@ func run() error {
|
||||
JobHub: jobHub,
|
||||
AlertEngine: alertEngine,
|
||||
NotificationHub: notifHub,
|
||||
UpdateWatcher: updateWatcher,
|
||||
UI: renderer,
|
||||
Version: version,
|
||||
OIDC: oidcClient,
|
||||
@@ -147,10 +150,17 @@ func run() error {
|
||||
|
||||
srv := rmhttp.New(deps)
|
||||
|
||||
// Fleet-update worker — built after the HTTP server because the
|
||||
// dispatcher delegates back into srv.DispatchHostUpdate.
|
||||
fleetWorker := fleetupdate.NewWorker(st, hub,
|
||||
&serverDispatcher{srv: srv}, alertEngine)
|
||||
srv.SetFleetWorker(fleetWorker)
|
||||
|
||||
ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
|
||||
defer stop()
|
||||
|
||||
go alertEngine.Run(ctx)
|
||||
go updateWatcher.Run(ctx)
|
||||
|
||||
errCh := make(chan error, 1)
|
||||
go func() {
|
||||
@@ -243,3 +253,12 @@ func run() error {
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// serverDispatcher adapts the http.Server's DispatchHostUpdate method
|
||||
// to the fleetupdate.Dispatcher interface. Lives in main so the
|
||||
// http and fleetupdate packages don't need to know about each other.
|
||||
type serverDispatcher struct{ srv *rmhttp.Server }
|
||||
|
||||
func (d *serverDispatcher) DispatchUpdate(ctx context.Context, hostID, actorUserID string) (string, string, error) {
|
||||
return d.srv.DispatchHostUpdate(ctx, hostID, actorUserID)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user