p6-01/02: agent self-update + fleet update server cluster
- alert: update_failed (per-host, dedup=hostID) + fleet_update_halted
(system-scoped, host_id NULL via new RaiseOrTouchSystem helper).
- ws: UpdateWatcher tracks in-flight command.update dispatches and
reconciles them against incoming hello envelopes — success path
marks the job succeeded and auto-resolves the alert; 90s timeout
marks the job failed and raises update_failed.
- http: POST /api/hosts/{id}/update (admin-only JSON) + the HTMX
/hosts/{id}/update form variant. Pre-checks: host exists, online,
agent_version != current, no running update job. Refactored core
into Server.dispatchHostUpdate so the fleet worker can share it
without going through HTTP.
- fleetupdate: rolling worker iterating through host slots, halting
on first failure and raising fleet_update_halted. Polling-based
version-match (re-read hosts.agent_version every 1s up to 95s) —
no extra plumbing into the WS hello path. At-most-one-running is
enforced at the store layer (ErrFleetUpdateRunning).
- cmd/server: wire UpdateWatcher and FleetWorker into the main
goroutine; the worker uses a small serverDispatcher adapter that
delegates back into Server.DispatchHostUpdate.
Tests: watcher (success/timeout/mismatch/late-hello), HTTP endpoint
(happy + four pre-check branches + RBAC), worker (two-host happy,
timeout-halt, host-offline-halt, already-at-target skip, cancel
mid-run, double-Start guard).
This commit is contained in:
@@ -77,6 +77,56 @@ func (s *Store) RaiseOrTouch(ctx context.Context, hostID, kind, dedupKey, severi
|
||||
return id, true, nil
|
||||
}
|
||||
|
||||
// RaiseOrTouchSystem is the host-less variant of RaiseOrTouch — the
|
||||
// alert row's host_id is stored as NULL, so the FK to hosts is bypassed.
|
||||
// Used by fleet-wide alerts (e.g. fleet_update_halted) where the
|
||||
// failure surface isn't pinned to a single host.
|
||||
func (s *Store) RaiseOrTouchSystem(ctx context.Context, kind, dedupKey, severity, message string, when time.Time) (id string, didRaise bool, err error) {
|
||||
tx, err := s.db.BeginTx(ctx, nil)
|
||||
if err != nil {
|
||||
return "", false, fmt.Errorf("store: begin: %w", err)
|
||||
}
|
||||
defer func() { _ = tx.Rollback() }()
|
||||
|
||||
row := tx.QueryRowContext(ctx,
|
||||
`SELECT id FROM alerts
|
||||
WHERE host_id IS NULL AND kind = ? AND dedup_key = ? AND resolved_at IS NULL
|
||||
LIMIT 1`,
|
||||
kind, dedupKey)
|
||||
var existing string
|
||||
switch err := row.Scan(&existing); {
|
||||
case err == nil:
|
||||
_, uerr := tx.ExecContext(ctx,
|
||||
`UPDATE alerts SET last_seen_at = ?, message = ? WHERE id = ?`,
|
||||
when.UTC().Format(time.RFC3339Nano), message, existing)
|
||||
if uerr != nil {
|
||||
return "", false, fmt.Errorf("store: touch alert: %w", uerr)
|
||||
}
|
||||
if err := tx.Commit(); err != nil {
|
||||
return "", false, err
|
||||
}
|
||||
return existing, false, nil
|
||||
case errors.Is(err, sql.ErrNoRows):
|
||||
// fall through to insert
|
||||
default:
|
||||
return "", false, fmt.Errorf("store: lookup alert: %w", err)
|
||||
}
|
||||
|
||||
id = ulid.Make().String()
|
||||
whenStr := when.UTC().Format(time.RFC3339Nano)
|
||||
_, err = tx.ExecContext(ctx,
|
||||
`INSERT INTO alerts (id, host_id, kind, dedup_key, severity, message, created_at, last_seen_at)
|
||||
VALUES (?, NULL, ?, ?, ?, ?, ?, ?)`,
|
||||
id, kind, dedupKey, severity, message, whenStr, whenStr)
|
||||
if err != nil {
|
||||
return "", false, fmt.Errorf("store: insert alert: %w", err)
|
||||
}
|
||||
if err := tx.Commit(); err != nil {
|
||||
return "", false, err
|
||||
}
|
||||
return id, true, nil
|
||||
}
|
||||
|
||||
// refreshHostOpenAlertCount recomputes hosts.open_alert_count from the
|
||||
// alerts table for one host. Self-healing: idempotent and survives
|
||||
// out-of-order edits. Best-effort — errors are returned but callers
|
||||
|
||||
Reference in New Issue
Block a user