refactor(alert): refresh stale_schedule docs; log tick schedule errors; add mode-change + never-backed-up tests
This commit is contained in:
@@ -196,11 +196,9 @@ func (e *Engine) handleHostOnline(ctx context.Context, hostID string) {
|
||||
// tick is the 60-second sweep. Responsibilities:
|
||||
// 1. Re-evaluate agent_offline for every offline host that may have
|
||||
// crossed the floor between explicit events.
|
||||
// 2. Stale-schedule detection — declared in the spec but intentionally
|
||||
// left as a no-op in v1. The precise "expected to have fired but
|
||||
// didn't" trigger requires a store helper that lands in a later
|
||||
// task. The KindStaleSchedule constant is exported so UI code can
|
||||
// reference the tag string today.
|
||||
// 2. Stale-schedule detection for intermittent hosts — raises
|
||||
// stale_schedule when LastBackupAt is older than 7 days and the
|
||||
// host has an enabled schedule. Always-on hosts are excluded.
|
||||
func (e *Engine) tick(ctx context.Context, now time.Time) {
|
||||
// User-management cleanup piggy-backed here for now. Setup tokens
|
||||
// have a 1h expiry; the alert engine tick is the cheapest existing
|
||||
@@ -232,7 +230,11 @@ func (e *Engine) tick(ctx context.Context, now time.Time) {
|
||||
continue
|
||||
}
|
||||
hasEnabled, err := e.hostHasEnabledSchedule(ctx, h.ID)
|
||||
if err != nil || !hasEnabled {
|
||||
if err != nil {
|
||||
slog.Warn("alert: tick list schedules", "host_id", h.ID, "err", err)
|
||||
continue
|
||||
}
|
||||
if !hasEnabled {
|
||||
continue
|
||||
}
|
||||
e.raiseAndNotify(ctx, h.ID, KindStaleSchedule, "", "warning",
|
||||
|
||||
@@ -170,3 +170,86 @@ func TestNoStalenessWithoutEnabledSchedule(t *testing.T) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestResolveOnModeChangeClearsOfflineAlert checks that ResolveOnModeChange
|
||||
// clears an open agent_offline alert when a host's mode is toggled.
|
||||
func TestResolveOnModeChangeClearsOfflineAlert(t *testing.T) {
|
||||
t.Parallel()
|
||||
eng, st, hostID := setupEngine(t)
|
||||
ctx := context.Background()
|
||||
|
||||
// Make always-on and set it offline with a stale last_seen_at.
|
||||
if err := st.SetHostAlwaysOn(ctx, hostID, true); err != nil {
|
||||
t.Fatalf("SetHostAlwaysOn: %v", err)
|
||||
}
|
||||
if _, err := st.DB().Exec(
|
||||
`UPDATE hosts SET last_seen_at = ?, status = ? WHERE id = ?`,
|
||||
time.Now().UTC().Add(-2*time.Hour).Format(time.RFC3339Nano),
|
||||
"offline",
|
||||
hostID,
|
||||
); err != nil {
|
||||
t.Fatalf("update last_seen_at: %v", err)
|
||||
}
|
||||
|
||||
// Raise the offline alert.
|
||||
eng.handleHostOffline(ctx, hostID)
|
||||
|
||||
open, _ := st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
|
||||
if len(open) != 1 || open[0].Kind != KindAgentOffline {
|
||||
t.Fatalf("expected 1 agent_offline alert before mode change; got %d: %+v", len(open), open)
|
||||
}
|
||||
|
||||
// Toggle mode — should clear the alert.
|
||||
eng.ResolveOnModeChange(ctx, hostID, time.Now().UTC())
|
||||
|
||||
open, _ = st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
|
||||
for _, a := range open {
|
||||
if a.Kind == KindAgentOffline {
|
||||
t.Fatalf("expected agent_offline to be resolved after mode change; still open: %+v", a)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestNoStalenessWhenNeverBackedUp checks that no stale_schedule alert is
|
||||
// raised for an intermittent host that has never backed up (nil LastBackupAt).
|
||||
func TestNoStalenessWhenNeverBackedUp(t *testing.T) {
|
||||
t.Parallel()
|
||||
eng, st, hostID := setupEngine(t)
|
||||
ctx := context.Background()
|
||||
|
||||
// Make intermittent.
|
||||
if err := st.SetHostAlwaysOn(ctx, hostID, false); err != nil {
|
||||
t.Fatalf("SetHostAlwaysOn: %v", err)
|
||||
}
|
||||
|
||||
// Create a source group and an enabled schedule — but do NOT set LastBackupAt.
|
||||
sgID := ulid.Make().String()
|
||||
if err := st.CreateSourceGroup(ctx, &store.SourceGroup{
|
||||
ID: sgID,
|
||||
HostID: hostID,
|
||||
Name: "default",
|
||||
Includes: []string{"/home"},
|
||||
}); err != nil {
|
||||
t.Fatalf("CreateSourceGroup: %v", err)
|
||||
}
|
||||
|
||||
schedID := ulid.Make().String()
|
||||
if err := st.CreateSchedule(ctx, &store.Schedule{
|
||||
ID: schedID,
|
||||
HostID: hostID,
|
||||
CronExpr: "0 2 * * *",
|
||||
Enabled: true,
|
||||
SourceGroupIDs: []string{sgID},
|
||||
}); err != nil {
|
||||
t.Fatalf("CreateSchedule: %v", err)
|
||||
}
|
||||
|
||||
eng.tick(ctx, time.Now().UTC())
|
||||
|
||||
open, _ := st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
|
||||
for _, a := range open {
|
||||
if a.Kind == KindStaleSchedule {
|
||||
t.Fatalf("expected no stale_schedule when never backed up; got: %+v", a)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -27,10 +27,10 @@ const (
|
||||
// integrity is at risk) when a check job fails.
|
||||
KindCheckFailed = "check_failed"
|
||||
|
||||
// KindStaleSchedule is declared for completeness but intentionally
|
||||
// left as a no-op in v1. The precise "expected to have fired but
|
||||
// didn't" logic requires a store helper that lands in a follow-up
|
||||
// task. Ask the team before implementing.
|
||||
// KindStaleSchedule is raised for intermittent (non-always-on) hosts
|
||||
// when their last successful backup is older than staleBackupThreshold
|
||||
// (7 days) and they have at least one enabled schedule. Resolved on
|
||||
// backup success or when the host is switched to always-on mode.
|
||||
KindStaleSchedule = "stale_schedule"
|
||||
|
||||
// KindAgentOffline is raised when a host's last_seen_at is older
|
||||
|
||||
Reference in New Issue
Block a user