Always-On vs intermittent host mode (laptops): suppress offline noise, catch up missed backups #31

Merged
steve merged 17 commits from feat-laptop-host-mode into main 2026-06-15 23:01:04 +01:00
3 changed files with 95 additions and 10 deletions
Showing only changes of commit 9e6524788f - Show all commits
+8 -6
View File
@@ -196,11 +196,9 @@ func (e *Engine) handleHostOnline(ctx context.Context, hostID string) {
// tick is the 60-second sweep. Responsibilities:
// 1. Re-evaluate agent_offline for every offline host that may have
// crossed the floor between explicit events.
// 2. Stale-schedule detection — declared in the spec but intentionally
// left as a no-op in v1. The precise "expected to have fired but
// didn't" trigger requires a store helper that lands in a later
// task. The KindStaleSchedule constant is exported so UI code can
// reference the tag string today.
// 2. Stale-schedule detection for intermittent hosts — raises
// stale_schedule when LastBackupAt is older than 7 days and the
// host has an enabled schedule. Always-on hosts are excluded.
func (e *Engine) tick(ctx context.Context, now time.Time) {
// User-management cleanup piggy-backed here for now. Setup tokens
// have a 1h expiry; the alert engine tick is the cheapest existing
@@ -232,7 +230,11 @@ func (e *Engine) tick(ctx context.Context, now time.Time) {
continue
}
hasEnabled, err := e.hostHasEnabledSchedule(ctx, h.ID)
if err != nil || !hasEnabled {
if err != nil {
slog.Warn("alert: tick list schedules", "host_id", h.ID, "err", err)
continue
}
if !hasEnabled {
continue
}
e.raiseAndNotify(ctx, h.ID, KindStaleSchedule, "", "warning",
+83
View File
@@ -170,3 +170,86 @@ func TestNoStalenessWithoutEnabledSchedule(t *testing.T) {
}
}
}
// TestResolveOnModeChangeClearsOfflineAlert checks that ResolveOnModeChange
// clears an open agent_offline alert when a host's mode is toggled.
//
// The host is flagged always-on, its row is rewritten to status "offline"
// with a last_seen_at two hours in the past, and handleHostOffline is
// invoked. The test then requires exactly one open agent_offline alert as a
// precondition, calls ResolveOnModeChange, and fails if any agent_offline
// alert is still open afterwards.
func TestResolveOnModeChangeClearsOfflineAlert(t *testing.T) {
	t.Parallel()
	eng, st, hostID := setupEngine(t)
	ctx := context.Background()

	// Make always-on and set it offline with a stale last_seen_at.
	if err := st.SetHostAlwaysOn(ctx, hostID, true); err != nil {
		t.Fatalf("SetHostAlwaysOn: %v", err)
	}
	if _, err := st.DB().Exec(
		`UPDATE hosts SET last_seen_at = ?, status = ? WHERE id = ?`,
		time.Now().UTC().Add(-2*time.Hour).Format(time.RFC3339Nano),
		"offline",
		hostID,
	); err != nil {
		t.Fatalf("update last_seen_at: %v", err)
	}

	// Raise the offline alert.
	eng.handleHostOffline(ctx, hostID)

	// A listing failure must fail the test with its own message rather than
	// being misreported as "expected 1 agent_offline alert".
	open, err := st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
	if err != nil {
		t.Fatalf("ListAlerts before mode change: %v", err)
	}
	if len(open) != 1 || open[0].Kind != KindAgentOffline {
		t.Fatalf("expected 1 agent_offline alert before mode change; got %d: %+v", len(open), open)
	}

	// Toggle mode — should clear the alert.
	eng.ResolveOnModeChange(ctx, hostID, time.Now().UTC())

	// Without this error check a failed listing would leave the slice empty
	// and the loop below would pass vacuously.
	open, err = st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
	if err != nil {
		t.Fatalf("ListAlerts after mode change: %v", err)
	}
	for _, a := range open {
		if a.Kind == KindAgentOffline {
			t.Fatalf("expected agent_offline to be resolved after mode change; still open: %+v", a)
		}
	}
}
// TestNoStalenessWhenNeverBackedUp checks that no stale_schedule alert is
// raised for an intermittent host that has never backed up (nil LastBackupAt).
//
// The host is switched to non-always-on, given one source group and one
// enabled schedule referencing it, and then a single engine tick is run at
// the current time. The test fails if any open alert of kind
// KindStaleSchedule exists for the host afterwards.
func TestNoStalenessWhenNeverBackedUp(t *testing.T) {
	t.Parallel()
	eng, st, hostID := setupEngine(t)
	ctx := context.Background()

	// Make intermittent.
	if err := st.SetHostAlwaysOn(ctx, hostID, false); err != nil {
		t.Fatalf("SetHostAlwaysOn: %v", err)
	}

	// Create a source group and an enabled schedule — but do NOT set LastBackupAt.
	sgID := ulid.Make().String()
	if err := st.CreateSourceGroup(ctx, &store.SourceGroup{
		ID:       sgID,
		HostID:   hostID,
		Name:     "default",
		Includes: []string{"/home"},
	}); err != nil {
		t.Fatalf("CreateSourceGroup: %v", err)
	}
	schedID := ulid.Make().String()
	if err := st.CreateSchedule(ctx, &store.Schedule{
		ID:             schedID,
		HostID:         hostID,
		CronExpr:       "0 2 * * *",
		Enabled:        true,
		SourceGroupIDs: []string{sgID},
	}); err != nil {
		t.Fatalf("CreateSchedule: %v", err)
	}

	eng.tick(ctx, time.Now().UTC())

	// This is a negative assertion, so a swallowed listing error would make
	// the test pass without having inspected any alerts. Surface it instead.
	open, err := st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
	if err != nil {
		t.Fatalf("ListAlerts: %v", err)
	}
	for _, a := range open {
		if a.Kind == KindStaleSchedule {
			t.Fatalf("expected no stale_schedule when never backed up; got: %+v", a)
		}
	}
}
+4 -4
View File
@@ -27,10 +27,10 @@ const (
// integrity is at risk) when a check job fails.
KindCheckFailed = "check_failed"
// KindStaleSchedule is declared for completeness but intentionally
// left as a no-op in v1. The precise "expected to have fired but
// didn't" logic requires a store helper that lands in a follow-up
// task. Ask the team before implementing.
// KindStaleSchedule is raised for intermittent (non-always-on) hosts
// when their last successful backup is older than staleBackupThreshold
// (7 days) and they have at least one enabled schedule. Resolved on
// backup success or when the host is switched to always-on mode.
KindStaleSchedule = "stale_schedule"
// KindAgentOffline is raised when a host's last_seen_at is older