Files
restic-manager/internal/alert/update_alerts.go
T
steve 6fd2a2ff77 p6-01/02: agent self-update + fleet update server cluster
- alert: update_failed (per-host, dedup=hostID) + fleet_update_halted
  (system-scoped, host_id NULL via new RaiseOrTouchSystem helper).
- ws: UpdateWatcher tracks in-flight command.update dispatches and
  reconciles them against incoming hello envelopes — success path
  marks the job succeeded and auto-resolves the alert; 90s timeout
  marks the job failed and raises update_failed.
- http: POST /api/hosts/{id}/update (admin-only JSON) + the HTMX
  /hosts/{id}/update form variant. Pre-checks: host exists, online,
  agent_version != current, no running update job. Refactored core
  into Server.dispatchHostUpdate so the fleet worker can share it
  without going through HTTP.
- fleetupdate: rolling worker iterating through host slots, halting
  on first failure and raising fleet_update_halted. Polling-based
  version-match (re-read hosts.agent_version every 1s up to 95s) —
  no extra plumbing into the WS hello path. At-most-one-running is
  enforced at the store layer (ErrFleetUpdateRunning).
- cmd/server: wire UpdateWatcher and FleetWorker into the main
  goroutine; the worker uses a small serverDispatcher adapter that
  delegates back into Server.DispatchHostUpdate.

Tests: watcher (success/timeout/mismatch/late-hello), HTTP endpoint
(happy + four pre-check branches + RBAC), worker (two-host happy,
timeout-halt, host-offline-halt, already-at-target skip, cancel
mid-run, double-Start guard).
2026-05-06 22:03:50 +01:00

64 lines
2.2 KiB
Go

package alert
import (
"context"
"fmt"
"log/slog"
"time"
"gitea.dcglab.co.uk/steve/restic-manager/internal/notification"
)
// Alert kinds used by the P6 agent self-update and fleet-update flows.
const (
	// KindFleetUpdateHalted marks a fleet-update run that the worker
	// abandoned part-way through, either because a host failed to
	// update or because it went offline. The alert is system-scoped
	// (it has no host) and is cleared manually by an admin.
	KindFleetUpdateHalted = "fleet_update_halted"

	// KindUpdateFailed marks an agent that did not report the expected
	// version after a command.update dispatch, whether through a
	// timeout or a version mismatch. A later hello carrying the
	// matching version resolves it.
	KindUpdateFailed = "update_failed"
)
// RaiseUpdateFailed raises a "warning" update_failed alert for hostID,
// with a message that names the failed job and the reason.
//
// The host ID is also used as the dedup key, so dispatching another
// update to the same host touches the alert that is already open
// instead of creating a second one.
func (e *Engine) RaiseUpdateFailed(ctx context.Context, hostID, jobID, reason string, when time.Time) {
	// One open update_failed alert per host: key the alert on the host.
	dedupKey := hostID
	text := fmt.Sprintf("Agent update failed (job %s): %s", jobID, reason)

	e.raiseAndNotify(ctx, hostID, KindUpdateFailed, dedupKey, "warning", text, when)
}
// ResolveUpdateFailed resolves the open update_failed alert for hostID,
// if there is one. The WS hello path invokes it once the agent has
// reconnected reporting the target version.
func (e *Engine) ResolveUpdateFailed(ctx context.Context, hostID string, when time.Time) {
	// update_failed alerts are raised with the host ID as their dedup
	// key, so the same value is handed over here to find the alert
	// (presumably the fourth argument is that key — confirm against
	// resolveAndNotify).
	dedupKey := hostID

	e.resolveAndNotify(ctx, hostID, KindUpdateFailed, dedupKey, when)
}
// RaiseFleetUpdateHalted raises a "warning" fleet_update_halted alert
// for the fleet update identified by fleetUpdateID.
//
// A fleet update is a system-level concept rather than something that
// belongs to one host, so the alert is stored through the host-less
// RaiseOrTouchSystem path, leaving the alerts table's host_id column
// NULL. fleetUpdateID is passed as the key for that call (presumably
// the dedup key — confirm against the store).
//
// A store error is logged at warn level and swallowed. A notification
// is dispatched only when the store reports that a new alert was
// raised; touching an existing alert sends nothing.
func (e *Engine) RaiseFleetUpdateHalted(ctx context.Context, fleetUpdateID, reason string, when time.Time) {
	const severity = "warning"

	text := fmt.Sprintf("Fleet update %s halted: %s", fleetUpdateID, reason)

	alertID, raised, err := e.store.RaiseOrTouchSystem(ctx, KindFleetUpdateHalted, fleetUpdateID, severity, text, when)
	switch {
	case err != nil:
		slog.Warn("alert: raise fleet_update_halted", "fu_id", fleetUpdateID, "err", err)
		return
	case !raised:
		// An existing alert was touched; do not notify again.
		return
	}

	// Host fields stay empty on purpose: this alert has no host.
	payload := notification.Payload{
		Event:    notification.EventRaised,
		AlertID:  alertID,
		Severity: severity,
		Kind:     KindFleetUpdateHalted,
		HostID:   "",
		HostName: "",
		Message:  text,
		RaisedAt: when,
	}

	// Dispatch off the caller's goroutine so raising the alert does not
	// wait on notification delivery.
	// NOTE(review): the goroutine reuses the caller's ctx. Whether a
	// cancellation after this function returns can abort delivery
	// depends on hub.Dispatch, which is not visible here — confirm.
	go e.hub.Dispatch(ctx, payload)
}