feat(alert): suppress offline + add staleness alert for intermittent hosts

This commit is contained in:
2026-06-15 21:09:39 +01:00
parent e408de9610
commit 25c55e5e4d
3 changed files with 236 additions and 1 deletions
+54 -1
View File
@@ -22,6 +22,12 @@ import (
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
)
// staleBackupThreshold is the maximum age of an intermittent host's
// last successful backup before a stale_schedule alert is raised.
// It is a single global value in v1 (it may become per-host later).
// Always-on hosts are never measured against it; for them
// stale_schedule remains a no-op.
const staleBackupThreshold time.Duration = 7 * (24 * time.Hour)
// JobFinishedEvent carries everything the engine needs to evaluate
// the failed-X rules. Pushed via Engine.NotifyJobFinished from the
// MarkJobFinished site.
@@ -149,6 +155,10 @@ func (e *Engine) handleJobFinished(ctx context.Context, ev JobFinishedEvent) {
fmt.Sprintf("%s job %s failed", ev.Kind, ev.JobID), ev.When)
case "succeeded":
e.resolveAndNotify(ctx, ev.HostID, kind, dedupKey, ev.When)
if ev.Kind == "backup" {
// A fresh backup clears staleness for intermittent hosts.
e.resolveAndNotify(ctx, ev.HostID, KindStaleSchedule, "", ev.When)
}
}
}
@@ -157,6 +167,12 @@ func (e *Engine) handleHostOffline(ctx context.Context, hostID string) {
if err != nil {
return
}
// Intermittent hosts (laptops) legitimately disappear — never raise
// agent_offline for them. The stale_schedule sweep in tick() is the
// only staleness signal for these hosts.
if !host.AlwaysOn {
return
}
// Apply the 15-min floor — raise only when last_seen_at is older
// than agentOfflineFloor. A nil last_seen_at (host enrolled but
// never connected) is treated as "now" so we don't raise
@@ -203,6 +219,28 @@ func (e *Engine) tick(ctx context.Context, now time.Time) {
return
}
for _, h := range hosts {
// Intermittent hosts: suppress agent_offline entirely; instead
// raise stale_schedule when they have gone too long with no
// successful backup AND they have at least one enabled schedule
// to be measured against. A nil LastBackupAt (never backed up)
// has no baseline — onboarding/repo_status covers that case.
if !h.AlwaysOn {
if h.LastBackupAt == nil {
continue
}
if now.Sub(*h.LastBackupAt) < staleBackupThreshold {
continue
}
hasEnabled, err := e.hostHasEnabledSchedule(ctx, h.ID)
if err != nil || !hasEnabled {
continue
}
e.raiseAndNotify(ctx, h.ID, KindStaleSchedule, "", "warning",
fmt.Sprintf("No backup in %s (threshold %s)",
roundDur(now.Sub(*h.LastBackupAt)), staleBackupThreshold), now)
continue
}
// Always-on hosts: existing agent_offline re-evaluation.
if h.Status != "offline" || h.LastSeenAt == nil {
continue
}
@@ -212,7 +250,6 @@ func (e *Engine) tick(ctx context.Context, now time.Time) {
roundDur(now.Sub(*h.LastSeenAt)), e.agentOfflineFloor), now)
}
}
// Stale-schedule sweep — no-op in v1. See KindStaleSchedule doc comment.
}
// roundDur returns a human-readable duration string, rounding to the
@@ -224,3 +261,19 @@ func roundDur(d time.Duration) string {
}
return d.Round(time.Minute).String()
}
// hostHasEnabledSchedule looks up the schedules stored for hostID and
// reports whether at least one of them has Enabled set. The periodic
// sweep uses this as the precondition for a stale_schedule alert: a
// host with no enabled schedule has no backup expectation to be late
// against.
//
// A store lookup failure is returned unchanged together with false.
func (e *Engine) hostHasEnabledSchedule(ctx context.Context, hostID string) (bool, error) {
	all, listErr := e.store.ListSchedulesByHost(ctx, hostID)
	if listErr != nil {
		return false, listErr
	}

	// Stop scanning at the first enabled schedule; one is enough.
	anyEnabled := false
	for _, s := range all {
		if anyEnabled = s.Enabled; anyEnabled {
			break
		}
	}
	return anyEnabled, nil
}