refactor(alert): refresh stale_schedule docs; log tick schedule errors; add mode-change + never-backed-up tests
This commit is contained in:
@@ -196,11 +196,9 @@ func (e *Engine) handleHostOnline(ctx context.Context, hostID string) {
|
|||||||
// tick is the 60-second sweep. Responsibilities:
|
// tick is the 60-second sweep. Responsibilities:
|
||||||
// 1. Re-evaluate agent_offline for every offline host that may have
|
// 1. Re-evaluate agent_offline for every offline host that may have
|
||||||
// crossed the floor between explicit events.
|
// crossed the floor between explicit events.
|
||||||
// 2. Stale-schedule detection — declared in the spec but intentionally
|
// 2. Stale-schedule detection for intermittent hosts — raises
|
||||||
// left as a no-op in v1. The precise "expected to have fired but
|
// stale_schedule when LastBackupAt is older than 7 days and the
|
||||||
// didn't" trigger requires a store helper that lands in a later
|
// host has an enabled schedule. Always-on hosts are excluded.
|
||||||
// task. The KindStaleSchedule constant is exported so UI code can
|
|
||||||
// reference the tag string today.
|
|
||||||
func (e *Engine) tick(ctx context.Context, now time.Time) {
|
func (e *Engine) tick(ctx context.Context, now time.Time) {
|
||||||
// User-management cleanup piggy-backed here for now. Setup tokens
|
// User-management cleanup piggy-backed here for now. Setup tokens
|
||||||
// have a 1h expiry; the alert engine tick is the cheapest existing
|
// have a 1h expiry; the alert engine tick is the cheapest existing
|
||||||
@@ -232,7 +230,11 @@ func (e *Engine) tick(ctx context.Context, now time.Time) {
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
hasEnabled, err := e.hostHasEnabledSchedule(ctx, h.ID)
|
hasEnabled, err := e.hostHasEnabledSchedule(ctx, h.ID)
|
||||||
if err != nil || !hasEnabled {
|
if err != nil {
|
||||||
|
slog.Warn("alert: tick list schedules", "host_id", h.ID, "err", err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if !hasEnabled {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
e.raiseAndNotify(ctx, h.ID, KindStaleSchedule, "", "warning",
|
e.raiseAndNotify(ctx, h.ID, KindStaleSchedule, "", "warning",
|
||||||
|
|||||||
@@ -170,3 +170,86 @@ func TestNoStalenessWithoutEnabledSchedule(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestResolveOnModeChangeClearsOfflineAlert checks that ResolveOnModeChange
|
||||||
|
// clears an open agent_offline alert when a host's mode is toggled.
|
||||||
|
func TestResolveOnModeChangeClearsOfflineAlert(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
eng, st, hostID := setupEngine(t)
|
||||||
|
ctx := context.Background()
|
||||||
|
|
||||||
|
// Make always-on and set it offline with a stale last_seen_at.
|
||||||
|
if err := st.SetHostAlwaysOn(ctx, hostID, true); err != nil {
|
||||||
|
t.Fatalf("SetHostAlwaysOn: %v", err)
|
||||||
|
}
|
||||||
|
if _, err := st.DB().Exec(
|
||||||
|
`UPDATE hosts SET last_seen_at = ?, status = ? WHERE id = ?`,
|
||||||
|
time.Now().UTC().Add(-2*time.Hour).Format(time.RFC3339Nano),
|
||||||
|
"offline",
|
||||||
|
hostID,
|
||||||
|
); err != nil {
|
||||||
|
t.Fatalf("update last_seen_at: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Raise the offline alert.
|
||||||
|
eng.handleHostOffline(ctx, hostID)
|
||||||
|
|
||||||
|
open, _ := st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
|
||||||
|
if len(open) != 1 || open[0].Kind != KindAgentOffline {
|
||||||
|
t.Fatalf("expected 1 agent_offline alert before mode change; got %d: %+v", len(open), open)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Toggle mode — should clear the alert.
|
||||||
|
eng.ResolveOnModeChange(ctx, hostID, time.Now().UTC())
|
||||||
|
|
||||||
|
open, _ = st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
|
||||||
|
for _, a := range open {
|
||||||
|
if a.Kind == KindAgentOffline {
|
||||||
|
t.Fatalf("expected agent_offline to be resolved after mode change; still open: %+v", a)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestNoStalenessWhenNeverBackedUp checks that no stale_schedule alert is
|
||||||
|
// raised for an intermittent host that has never backed up (nil LastBackupAt).
|
||||||
|
func TestNoStalenessWhenNeverBackedUp(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
eng, st, hostID := setupEngine(t)
|
||||||
|
ctx := context.Background()
|
||||||
|
|
||||||
|
// Make intermittent.
|
||||||
|
if err := st.SetHostAlwaysOn(ctx, hostID, false); err != nil {
|
||||||
|
t.Fatalf("SetHostAlwaysOn: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create a source group and an enabled schedule — but do NOT set LastBackupAt.
|
||||||
|
sgID := ulid.Make().String()
|
||||||
|
if err := st.CreateSourceGroup(ctx, &store.SourceGroup{
|
||||||
|
ID: sgID,
|
||||||
|
HostID: hostID,
|
||||||
|
Name: "default",
|
||||||
|
Includes: []string{"/home"},
|
||||||
|
}); err != nil {
|
||||||
|
t.Fatalf("CreateSourceGroup: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
schedID := ulid.Make().String()
|
||||||
|
if err := st.CreateSchedule(ctx, &store.Schedule{
|
||||||
|
ID: schedID,
|
||||||
|
HostID: hostID,
|
||||||
|
CronExpr: "0 2 * * *",
|
||||||
|
Enabled: true,
|
||||||
|
SourceGroupIDs: []string{sgID},
|
||||||
|
}); err != nil {
|
||||||
|
t.Fatalf("CreateSchedule: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
eng.tick(ctx, time.Now().UTC())
|
||||||
|
|
||||||
|
open, _ := st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
|
||||||
|
for _, a := range open {
|
||||||
|
if a.Kind == KindStaleSchedule {
|
||||||
|
t.Fatalf("expected no stale_schedule when never backed up; got: %+v", a)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -27,10 +27,10 @@ const (
|
|||||||
// integrity is at risk) when a check job fails.
|
// integrity is at risk) when a check job fails.
|
||||||
KindCheckFailed = "check_failed"
|
KindCheckFailed = "check_failed"
|
||||||
|
|
||||||
// KindStaleSchedule is declared for completeness but intentionally
|
// KindStaleSchedule is raised for intermittent (non-always-on) hosts
|
||||||
// left as a no-op in v1. The precise "expected to have fired but
|
// when their last successful backup is older than staleBackupThreshold
|
||||||
// didn't" logic requires a store helper that lands in a follow-up
|
// (7 days) and they have at least one enabled schedule. Resolved on
|
||||||
// task. Ask the team before implementing.
|
// backup success or when the host is switched to always-on mode.
|
||||||
KindStaleSchedule = "stale_schedule"
|
KindStaleSchedule = "stale_schedule"
|
||||||
|
|
||||||
// KindAgentOffline is raised when a host's last_seen_at is older
|
// KindAgentOffline is raised when a host's last_seen_at is older
|
||||||
|
|||||||
Reference in New Issue
Block a user