diff --git a/internal/alert/engine.go b/internal/alert/engine.go
index 607ed91..21c591b 100644
--- a/internal/alert/engine.go
+++ b/internal/alert/engine.go
@@ -22,6 +22,12 @@ import (
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
 )
 
+// staleBackupThreshold is how long an intermittent host may go without
+// a successful backup before we raise a stale_schedule alert. Global
+// constant for v1 (may become per-host later). Only intermittent hosts
+// are evaluated — always-on hosts' stale_schedule stays a no-op.
+const staleBackupThreshold = 7 * 24 * time.Hour
+
 // JobFinishedEvent carries everything the engine needs to evaluate
 // the failed-X rules. Pushed via Engine.NotifyJobFinished from the
 // MarkJobFinished site.
@@ -149,6 +155,10 @@ func (e *Engine) handleJobFinished(ctx context.Context, ev JobFinishedEvent) {
 			fmt.Sprintf("%s job %s failed", ev.Kind, ev.JobID), ev.When)
 	case "succeeded":
 		e.resolveAndNotify(ctx, ev.HostID, kind, dedupKey, ev.When)
+		if ev.Kind == "backup" {
+			// A fresh backup clears staleness for intermittent hosts.
+			e.resolveAndNotify(ctx, ev.HostID, KindStaleSchedule, "", ev.When)
+		}
 	}
 }
 
@@ -157,6 +167,12 @@ func (e *Engine) handleHostOffline(ctx context.Context, hostID string) {
 	if err != nil {
 		return
 	}
+	// Intermittent hosts (laptops) legitimately disappear — never raise
+	// agent_offline for them. The stale_schedule sweep in tick() is the
+	// only staleness signal for these hosts.
+	if !host.AlwaysOn {
+		return
+	}
 	// Apply the 15-min floor — raise only when last_seen_at is older
 	// than agentOfflineFloor. A nil last_seen_at (host enrolled but
 	// never connected) is treated as "now" so we don't raise
@@ -203,6 +219,37 @@ func (e *Engine) tick(ctx context.Context, now time.Time) {
 		return
 	}
 	for _, h := range hosts {
+		// Intermittent hosts: suppress agent_offline entirely; instead
+		// raise stale_schedule when they have gone too long with no
+		// successful backup AND they have at least one enabled schedule
+		// to be measured against. A nil LastBackupAt (never backed up)
+		// has no baseline — onboarding/repo_status covers that case.
+		if !h.AlwaysOn {
+			if h.LastBackupAt == nil {
+				continue
+			}
+			if now.Sub(*h.LastBackupAt) < staleBackupThreshold {
+				continue
+			}
+			hasEnabled, err := e.hostHasEnabledSchedule(ctx, h.ID)
+			if err != nil {
+				// Lookup failed: keep whatever alert state exists and
+				// retry on the next tick rather than guessing.
+				continue
+			}
+			if !hasEnabled {
+				// No schedule left to measure against: clear a
+				// stale_schedule alert raised while one was enabled,
+				// otherwise it would stay open indefinitely.
+				e.resolveAndNotify(ctx, h.ID, KindStaleSchedule, "", now)
+				continue
+			}
+			e.raiseAndNotify(ctx, h.ID, KindStaleSchedule, "", "warning",
+				fmt.Sprintf("No backup in %s (threshold %s)",
+					roundDur(now.Sub(*h.LastBackupAt)), staleBackupThreshold), now)
+			continue
+		}
+		// Always-on hosts: existing agent_offline re-evaluation.
 		if h.Status != "offline" || h.LastSeenAt == nil {
 			continue
 		}
@@ -212,7 +259,6 @@ func (e *Engine) tick(ctx context.Context, now time.Time) {
 					roundDur(now.Sub(*h.LastSeenAt)), e.agentOfflineFloor), now)
 		}
 	}
-	// Stale-schedule sweep — no-op in v1. See KindStaleSchedule doc comment.
 }
 
 // roundDur returns a human-readable duration string, rounding to the
@@ -224,3 +270,19 @@ func roundDur(d time.Duration) string {
 	}
 	return d.Round(time.Minute).String()
 }
+
+// hostHasEnabledSchedule reports whether the host has at least one
+// enabled backup schedule — the precondition for a stale_schedule
+// alert (no schedule = no backup expectation to measure against).
+func (e *Engine) hostHasEnabledSchedule(ctx context.Context, hostID string) (bool, error) {
+	schedules, err := e.store.ListSchedulesByHost(ctx, hostID)
+	if err != nil {
+		return false, err
+	}
+	for _, sc := range schedules {
+		if sc.Enabled {
+			return true, nil
+		}
+	}
+	return false, nil
+}
diff --git a/internal/alert/intermittent_test.go b/internal/alert/intermittent_test.go
new file mode 100644
index 0000000..fd316f5
--- /dev/null
+++ b/internal/alert/intermittent_test.go
@@ -0,0 +1,187 @@
+package alert
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	"github.com/oklog/ulid/v2"
+
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
+)
+
+// TestIntermittentHostSuppressesOfflineAlert checks that handleHostOffline
+// does NOT raise agent_offline for a host with AlwaysOn=false.
+func TestIntermittentHostSuppressesOfflineAlert(t *testing.T) {
+	t.Parallel()
+	eng, st, hostID := setupEngine(t)
+	ctx := context.Background()
+
+	// Make the host intermittent.
+	if err := st.SetHostAlwaysOn(ctx, hostID, false); err != nil {
+		t.Fatalf("SetHostAlwaysOn: %v", err)
+	}
+
+	// Give it a stale last_seen_at well past the floor.
+	if _, err := st.DB().Exec(
+		`UPDATE hosts SET last_seen_at = ?, status = ? WHERE id = ?`,
+		time.Now().UTC().Add(-2*time.Hour).Format(time.RFC3339Nano),
+		"offline",
+		hostID,
+	); err != nil {
+		t.Fatalf("update last_seen_at: %v", err)
+	}
+
+	eng.handleHostOffline(ctx, hostID)
+
+	open, err := st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
+	if err != nil {
+		t.Fatalf("ListAlerts: %v", err)
+	}
+	if len(open) != 0 {
+		t.Fatalf("expected 0 open alerts for intermittent host; got %d: %+v", len(open), open)
+	}
+}
+
+// TestAlwaysOnHostStillRaisesOfflineAlert checks that always-on hosts still
+// get an agent_offline alert when offline past the floor.
+func TestAlwaysOnHostStillRaisesOfflineAlert(t *testing.T) {
+	t.Parallel()
+	eng, st, hostID := setupEngine(t)
+	ctx := context.Background()
+
+	// always_on=true is the default, but be explicit.
+	if err := st.SetHostAlwaysOn(ctx, hostID, true); err != nil {
+		t.Fatalf("SetHostAlwaysOn: %v", err)
+	}
+
+	// Give it a stale last_seen_at well past the 15m floor.
+	if _, err := st.DB().Exec(
+		`UPDATE hosts SET last_seen_at = ?, status = ? WHERE id = ?`,
+		time.Now().UTC().Add(-2*time.Hour).Format(time.RFC3339Nano),
+		"offline",
+		hostID,
+	); err != nil {
+		t.Fatalf("update last_seen_at: %v", err)
+	}
+
+	eng.handleHostOffline(ctx, hostID)
+
+	open, err := st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
+	if err != nil {
+		t.Fatalf("ListAlerts: %v", err)
+	}
+	if len(open) != 1 || open[0].Kind != KindAgentOffline {
+		t.Fatalf("expected 1 agent_offline alert; got %d: %+v", len(open), open)
+	}
+}
+
+// TestStalenessAlertForIntermittentHost checks that tick raises stale_schedule
+// for an intermittent host whose last backup is older than 7 days AND has an
+// enabled schedule. Also verifies that a succeeded backup clears the alert.
+func TestStalenessAlertForIntermittentHost(t *testing.T) {
+	t.Parallel()
+	eng, st, hostID := setupEngine(t)
+	ctx := context.Background()
+
+	// Make intermittent.
+	if err := st.SetHostAlwaysOn(ctx, hostID, false); err != nil {
+		t.Fatalf("SetHostAlwaysOn: %v", err)
+	}
+
+	// Create a source group to attach the schedule to.
+	sgID := ulid.Make().String()
+	if err := st.CreateSourceGroup(ctx, &store.SourceGroup{
+		ID:       sgID,
+		HostID:   hostID,
+		Name:     "default",
+		Includes: []string{"/home"},
+	}); err != nil {
+		t.Fatalf("CreateSourceGroup: %v", err)
+	}
+
+	// Create an enabled schedule pointing at the source group.
+	schedID := ulid.Make().String()
+	if err := st.CreateSchedule(ctx, &store.Schedule{
+		ID:             schedID,
+		HostID:         hostID,
+		CronExpr:       "0 2 * * *",
+		Enabled:        true,
+		SourceGroupIDs: []string{sgID},
+	}); err != nil {
+		t.Fatalf("CreateSchedule: %v", err)
+	}
+
+	// Set last_backup_at to 8 days ago.
+	eightDaysAgo := time.Now().UTC().Add(-8 * 24 * time.Hour)
+	if err := st.SetHostLastBackup(ctx, hostID, "succeeded", eightDaysAgo); err != nil {
+		t.Fatalf("SetHostLastBackup: %v", err)
+	}
+
+	eng.tick(ctx, time.Now().UTC())
+
+	open, err := st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
+	if err != nil {
+		t.Fatalf("ListAlerts: %v", err)
+	}
+	var staleCount int
+	for _, a := range open {
+		if a.Kind == KindStaleSchedule {
+			staleCount++
+		}
+	}
+	if staleCount != 1 {
+		t.Fatalf("expected 1 stale_schedule alert after tick; got %d (all open: %+v)", staleCount, open)
+	}
+
+	// A succeeded backup should clear the stale_schedule alert.
+	eng.handleJobFinished(ctx, JobFinishedEvent{
+		HostID:        hostID,
+		JobID:         ulid.Make().String(),
+		Kind:          "backup",
+		Status:        "succeeded",
+		SourceGroupID: sgID,
+		When:          time.Now().UTC(),
+	})
+
+	open, err = st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
+	if err != nil {
+		t.Fatalf("ListAlerts: %v", err)
+	}
+	for _, a := range open {
+		if a.Kind == KindStaleSchedule {
+			t.Fatalf("expected stale_schedule to be resolved after backup succeeded; still open: %+v", a)
+		}
+	}
+}
+
+// TestNoStalenessWithoutEnabledSchedule checks that no stale_schedule is
+// raised for an intermittent host with a stale backup but no enabled schedule.
+func TestNoStalenessWithoutEnabledSchedule(t *testing.T) {
+	t.Parallel()
+	eng, st, hostID := setupEngine(t)
+	ctx := context.Background()
+
+	// Make intermittent.
+	if err := st.SetHostAlwaysOn(ctx, hostID, false); err != nil {
+		t.Fatalf("SetHostAlwaysOn: %v", err)
+	}
+
+	// Set last_backup_at to 8 days ago — stale — but no schedule.
+	eightDaysAgo := time.Now().UTC().Add(-8 * 24 * time.Hour)
+	if err := st.SetHostLastBackup(ctx, hostID, "succeeded", eightDaysAgo); err != nil {
+		t.Fatalf("SetHostLastBackup: %v", err)
+	}
+
+	eng.tick(ctx, time.Now().UTC())
+
+	open, err := st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
+	if err != nil {
+		t.Fatalf("ListAlerts: %v", err)
+	}
+	for _, a := range open {
+		if a.Kind == KindStaleSchedule {
+			t.Fatalf("expected no stale_schedule without an enabled schedule; got: %+v", a)
+		}
+	}
+}
diff --git a/internal/alert/rules.go b/internal/alert/rules.go
index 54e2015..d44daeb 100644
--- a/internal/alert/rules.go
+++ b/internal/alert/rules.go
@@ -122,6 +122,16 @@ func alertPayload(ctx context.Context, st *store.Store, ev notification.Event, a
 	}
 }
 
+// ResolveOnModeChange clears any open agent_offline and stale_schedule
+// alerts for a host whose always-on flag was just toggled. The next
+// 60s tick re-raises whichever still applies under the new mode, so
+// this is a self-correcting "wipe and let the sweep settle" call.
+// Safe to invoke from the HTTP layer (it only touches the store + hub).
+func (e *Engine) ResolveOnModeChange(ctx context.Context, hostID string, when time.Time) {
+	e.resolveAndNotify(ctx, hostID, KindAgentOffline, "", when)
+	e.resolveAndNotify(ctx, hostID, KindStaleSchedule, "", when)
+}
+
 // resolveAndNotify clears the open (or acknowledged) alert matching
 // (host_id, kind, dedup_key) via store.AutoResolve, then fires
 // alert.resolved for the row(s) actually closed. Best-effort —