9d5775fb47
- alert: update_failed (per-host, dedup=hostID) + fleet_update_halted
(system-scoped, host_id NULL via new RaiseOrTouchSystem helper).
- ws: UpdateWatcher tracks in-flight command.update dispatches and
reconciles them against incoming hello envelopes — success path
marks the job succeeded and auto-resolves the alert; 90s timeout
marks the job failed and raises update_failed.
- http: POST /api/hosts/{id}/update (admin-only JSON) + the HTMX
/hosts/{id}/update form variant. Pre-checks: host exists, online,
agent_version != current, no running update job. Refactored core
into Server.DispatchHostUpdate so the fleet worker can share it
without going through HTTP.
- fleetupdate: rolling worker iterating through host slots, halting
on first failure and raising fleet_update_halted. Polling-based
version-match (re-read hosts.agent_version every 1s up to 95s) —
no extra plumbing into the WS hello path. At-most-one-running is
enforced at the store layer (ErrFleetUpdateRunning).
- cmd/server: wire UpdateWatcher and FleetWorker into the main
goroutine; the worker uses a small serverDispatcher adapter that
delegates back into Server.DispatchHostUpdate.
Tests: watcher (success/timeout/mismatch/late-hello), HTTP endpoint
(happy + four pre-check branches + RBAC), worker (two-host happy,
timeout-halt, host-offline-halt, already-at-target skip, cancel
mid-run, double-Start guard).
64 lines
2.2 KiB
Go
64 lines
2.2 KiB
Go
package alert
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"log/slog"
|
|
"time"
|
|
|
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/notification"
|
|
)
|
|
|
|
// Alert kinds used by the P6 self-update flows.
const (
	// KindUpdateFailed marks a host whose agent did not report the
	// expected version after a command.update dispatch, either because
	// it timed out or because it came back on a different version. A
	// later hello carrying the matching version resolves it.
	KindUpdateFailed = "update_failed"

	// KindFleetUpdateHalted marks a fleet-update run that the worker
	// stopped part-way because a host failed to update or went offline.
	// The alert is system-scoped (not tied to a host) and is resolved
	// manually by an admin.
	KindFleetUpdateHalted = "fleet_update_halted"
)
|
|
|
|
// RaiseUpdateFailed records a per-host update failure. dedupKey is the
|
|
// hostID so a re-dispatch on the same host touches the existing alert
|
|
// rather than spawning a duplicate.
|
|
func (e *Engine) RaiseUpdateFailed(ctx context.Context, hostID, jobID, reason string, when time.Time) {
|
|
msg := fmt.Sprintf("Agent update failed (job %s): %s", jobID, reason)
|
|
e.raiseAndNotify(ctx, hostID, KindUpdateFailed, hostID, "warning", msg, when)
|
|
}
|
|
|
|
// ResolveUpdateFailed clears any open update_failed alert for hostID.
|
|
// Called from the WS hello path when the agent reconnects with the
|
|
// target version.
|
|
func (e *Engine) ResolveUpdateFailed(ctx context.Context, hostID string, when time.Time) {
|
|
e.resolveAndNotify(ctx, hostID, KindUpdateFailed, hostID, when)
|
|
}
|
|
|
|
// RaiseFleetUpdateHalted is host-less — the fleet update is a
|
|
// system-level concept. We persist it via the dedicated host-less
|
|
// alert path so the alerts table's host_id column carries NULL.
|
|
func (e *Engine) RaiseFleetUpdateHalted(ctx context.Context, fleetUpdateID, reason string, when time.Time) {
|
|
msg := fmt.Sprintf("Fleet update %s halted: %s", fleetUpdateID, reason)
|
|
id, didRaise, err := e.store.RaiseOrTouchSystem(ctx, KindFleetUpdateHalted, fleetUpdateID, "warning", msg, when)
|
|
if err != nil {
|
|
slog.Warn("alert: raise fleet_update_halted", "fu_id", fleetUpdateID, "err", err)
|
|
return
|
|
}
|
|
if !didRaise {
|
|
return
|
|
}
|
|
go e.hub.Dispatch(ctx, notification.Payload{
|
|
Event: notification.EventRaised,
|
|
AlertID: id,
|
|
Severity: "warning",
|
|
Kind: KindFleetUpdateHalted,
|
|
HostID: "",
|
|
HostName: "",
|
|
Message: msg,
|
|
RaisedAt: when,
|
|
})
|
|
}
|