p6-01/02: agent self-update + fleet update server cluster
- alert: update_failed (per-host, dedup=hostID) + fleet_update_halted
(system-scoped, host_id NULL via new RaiseOrTouchSystem helper).
- ws: UpdateWatcher tracks in-flight command.update dispatches and
reconciles them against incoming hello envelopes — success path
marks the job succeeded and auto-resolves the alert; 90s timeout
marks the job failed and raises update_failed.
- http: POST /api/hosts/{id}/update (admin-only JSON) + the HTMX
/hosts/{id}/update form variant. Pre-checks: host exists, online,
agent_version != current, no running update job. Refactored core
into Server.dispatchHostUpdate so the fleet worker can share it
without going through HTTP.
- fleetupdate: rolling worker iterating through host slots, halting
on first failure and raising fleet_update_halted. Polling-based
version-match (re-read hosts.agent_version every 1s up to 95s) —
no extra plumbing into the WS hello path. At-most-one-running is
enforced at the store layer (ErrFleetUpdateRunning).
- cmd/server: wire UpdateWatcher and FleetWorker into the main
goroutine; the worker uses a small serverDispatcher adapter that
delegates back into Server.DispatchHostUpdate.
Tests: watcher (success/timeout/mismatch/late-hello), HTTP endpoint
(happy + four pre-check branches + RBAC), worker (two-host happy,
timeout-halt, host-offline-halt, already-at-target skip, cancel
mid-run, double-Start guard).
This commit is contained in:
@@ -0,0 +1,63 @@
|
||||
package alert
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"time"
|
||||
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/notification"
|
||||
)
|
||||
|
||||
// Alert kinds introduced by the P6 agent self-update and fleet-update flows.
const (
	// KindUpdateFailed marks a single host whose agent did not report the
	// expected version after a command.update dispatch, either because the
	// wait timed out or because it came back on a different version. A
	// later hello carrying the matching version resolves it.
	KindUpdateFailed = "update_failed"

	// KindFleetUpdateHalted marks a fleet-update run that stopped partway
	// because one host failed to update or went offline. The alert is
	// system-scoped (it has no host) and stays open until an admin
	// resolves it by hand.
	KindFleetUpdateHalted = "fleet_update_halted"
)
|
||||
|
||||
// RaiseUpdateFailed records a per-host update failure. dedupKey is the
|
||||
// hostID so a re-dispatch on the same host touches the existing alert
|
||||
// rather than spawning a duplicate.
|
||||
func (e *Engine) RaiseUpdateFailed(ctx context.Context, hostID, jobID, reason string, when time.Time) {
|
||||
msg := fmt.Sprintf("Agent update failed (job %s): %s", jobID, reason)
|
||||
e.raiseAndNotify(ctx, hostID, KindUpdateFailed, hostID, "warning", msg, when)
|
||||
}
|
||||
|
||||
// ResolveUpdateFailed clears any open update_failed alert for hostID.
|
||||
// Called from the WS hello path when the agent reconnects with the
|
||||
// target version.
|
||||
func (e *Engine) ResolveUpdateFailed(ctx context.Context, hostID string, when time.Time) {
|
||||
e.resolveAndNotify(ctx, hostID, KindUpdateFailed, hostID, when)
|
||||
}
|
||||
|
||||
// RaiseFleetUpdateHalted is host-less — the fleet update is a
|
||||
// system-level concept. We persist it via the dedicated host-less
|
||||
// alert path so the alerts table's host_id column carries NULL.
|
||||
func (e *Engine) RaiseFleetUpdateHalted(ctx context.Context, fleetUpdateID, reason string, when time.Time) {
|
||||
msg := fmt.Sprintf("Fleet update %s halted: %s", fleetUpdateID, reason)
|
||||
id, didRaise, err := e.store.RaiseOrTouchSystem(ctx, KindFleetUpdateHalted, fleetUpdateID, "warning", msg, when)
|
||||
if err != nil {
|
||||
slog.Warn("alert: raise fleet_update_halted", "fu_id", fleetUpdateID, "err", err)
|
||||
return
|
||||
}
|
||||
if !didRaise {
|
||||
return
|
||||
}
|
||||
go e.hub.Dispatch(ctx, notification.Payload{
|
||||
Event: notification.EventRaised,
|
||||
AlertID: id,
|
||||
Severity: "warning",
|
||||
Kind: KindFleetUpdateHalted,
|
||||
HostID: "",
|
||||
HostName: "",
|
||||
Message: msg,
|
||||
RaisedAt: when,
|
||||
})
|
||||
}
|
||||
Reference in New Issue
Block a user