diff --git a/internal/alert/engine.go b/internal/alert/engine.go
index 21c591b..0f94b5a 100644
--- a/internal/alert/engine.go
+++ b/internal/alert/engine.go
@@ -196,11 +196,9 @@ func (e *Engine) handleHostOnline(ctx context.Context, hostID string) {
 // tick is the 60-second sweep. Responsibilities:
 //  1. Re-evaluate agent_offline for every offline host that may have
 //     crossed the floor between explicit events.
-//  2. Stale-schedule detection — declared in the spec but intentionally
-//     left as a no-op in v1. The precise "expected to have fired but
-//     didn't" trigger requires a store helper that lands in a later
-//     task. The KindStaleSchedule constant is exported so UI code can
-//     reference the tag string today.
+//  2. Stale-schedule detection for intermittent hosts — raises
+//     stale_schedule when LastBackupAt is older than 7 days and the
+//     host has an enabled schedule. Always-on hosts are excluded.
 func (e *Engine) tick(ctx context.Context, now time.Time) {
 	// User-management cleanup piggy-backed here for now. Setup tokens
 	// have a 1h expiry; the alert engine tick is the cheapest existing
@@ -232,7 +230,11 @@ func (e *Engine) tick(ctx context.Context, now time.Time) {
 			continue
 		}
 		hasEnabled, err := e.hostHasEnabledSchedule(ctx, h.ID)
-		if err != nil || !hasEnabled {
+		if err != nil {
+			slog.Warn("alert: tick list schedules", "host_id", h.ID, "err", err)
+			continue
+		}
+		if !hasEnabled {
 			continue
 		}
 		e.raiseAndNotify(ctx, h.ID, KindStaleSchedule, "", "warning",
diff --git a/internal/alert/intermittent_test.go b/internal/alert/intermittent_test.go
index fd316f5..d741831 100644
--- a/internal/alert/intermittent_test.go
+++ b/internal/alert/intermittent_test.go
@@ -170,3 +170,99 @@ func TestNoStalenessWithoutEnabledSchedule(t *testing.T) {
 		}
 	}
 }
+
+// TestResolveOnModeChangeClearsOfflineAlert checks that ResolveOnModeChange
+// clears an open agent_offline alert when a host's mode is toggled.
+func TestResolveOnModeChangeClearsOfflineAlert(t *testing.T) {
+	t.Parallel()
+	eng, st, hostID := setupEngine(t)
+	ctx := context.Background()
+
+	// Make always-on and set it offline with a stale last_seen_at.
+	if err := st.SetHostAlwaysOn(ctx, hostID, true); err != nil {
+		t.Fatalf("SetHostAlwaysOn: %v", err)
+	}
+	if _, err := st.DB().Exec(
+		`UPDATE hosts SET last_seen_at = ?, status = ? WHERE id = ?`,
+		time.Now().UTC().Add(-2*time.Hour).Format(time.RFC3339Nano),
+		"offline",
+		hostID,
+	); err != nil {
+		t.Fatalf("update last_seen_at: %v", err)
+	}
+
+	// Raise the offline alert.
+	eng.handleHostOffline(ctx, hostID)
+
+	open, err := st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
+	if err != nil {
+		t.Fatalf("ListAlerts before mode change: %v", err)
+	}
+	if len(open) != 1 || open[0].Kind != KindAgentOffline {
+		t.Fatalf("expected 1 agent_offline alert before mode change; got %d: %+v", len(open), open)
+	}
+
+	// Toggle mode — should clear the alert.
+	eng.ResolveOnModeChange(ctx, hostID, time.Now().UTC())
+
+	// Check the error: an empty result from a failed query would let the
+	// loop below pass vacuously.
+	open, err = st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
+	if err != nil {
+		t.Fatalf("ListAlerts after mode change: %v", err)
+	}
+	for _, a := range open {
+		if a.Kind == KindAgentOffline {
+			t.Fatalf("expected agent_offline to be resolved after mode change; still open: %+v", a)
+		}
+	}
+}
+
+// TestNoStalenessWhenNeverBackedUp checks that no stale_schedule alert is
+// raised for an intermittent host that has never backed up (nil LastBackupAt).
+func TestNoStalenessWhenNeverBackedUp(t *testing.T) {
+	t.Parallel()
+	eng, st, hostID := setupEngine(t)
+	ctx := context.Background()
+
+	// Make intermittent.
+	if err := st.SetHostAlwaysOn(ctx, hostID, false); err != nil {
+		t.Fatalf("SetHostAlwaysOn: %v", err)
+	}
+
+	// Create a source group and an enabled schedule — but do NOT set LastBackupAt.
+	sgID := ulid.Make().String()
+	if err := st.CreateSourceGroup(ctx, &store.SourceGroup{
+		ID:       sgID,
+		HostID:   hostID,
+		Name:     "default",
+		Includes: []string{"/home"},
+	}); err != nil {
+		t.Fatalf("CreateSourceGroup: %v", err)
+	}
+
+	schedID := ulid.Make().String()
+	if err := st.CreateSchedule(ctx, &store.Schedule{
+		ID:             schedID,
+		HostID:         hostID,
+		CronExpr:       "0 2 * * *",
+		Enabled:        true,
+		SourceGroupIDs: []string{sgID},
+	}); err != nil {
+		t.Fatalf("CreateSchedule: %v", err)
+	}
+
+	eng.tick(ctx, time.Now().UTC())
+
+	// Check the error: an empty result from a failed query would let the
+	// loop below pass vacuously.
+	open, err := st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
+	if err != nil {
+		t.Fatalf("ListAlerts: %v", err)
+	}
+	for _, a := range open {
+		if a.Kind == KindStaleSchedule {
+			t.Fatalf("expected no stale_schedule when never backed up; got: %+v", a)
+		}
+	}
+}
diff --git a/internal/alert/rules.go b/internal/alert/rules.go
index d44daeb..f5e779c 100644
--- a/internal/alert/rules.go
+++ b/internal/alert/rules.go
@@ -27,10 +27,10 @@ const (
 	// integrity is at risk) when a check job fails.
 	KindCheckFailed = "check_failed"
 
-	// KindStaleSchedule is declared for completeness but intentionally
-	// left as a no-op in v1. The precise "expected to have fired but
-	// didn't" logic requires a store helper that lands in a follow-up
-	// task. Ask the team before implementing.
+	// KindStaleSchedule is raised for intermittent (non-always-on) hosts
+	// when their last successful backup is older than staleBackupThreshold
+	// (7 days) and they have at least one enabled schedule. Resolved on
+	// backup success or when the host is switched to always-on mode.
 	KindStaleSchedule = "stale_schedule"
 
 	// KindAgentOffline is raised when a host's last_seen_at is older