feat(alert): suppress offline + add staleness alert for intermittent hosts
This commit is contained in:
@@ -22,6 +22,12 @@ import (
|
|||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// staleBackupThreshold bounds the time an intermittent host may go
// without a successful backup; once exceeded, a stale_schedule alert
// is raised. It is a single global value in v1 and may become a
// per-host setting later. Always-on hosts are never measured against
// it — stale_schedule remains a no-op for them.
const staleBackupThreshold time.Duration = 7 * 24 * time.Hour
|
||||||
|
|
||||||
// JobFinishedEvent carries everything the engine needs to evaluate
|
// JobFinishedEvent carries everything the engine needs to evaluate
|
||||||
// the failed-X rules. Pushed via Engine.NotifyJobFinished from the
|
// the failed-X rules. Pushed via Engine.NotifyJobFinished from the
|
||||||
// MarkJobFinished site.
|
// MarkJobFinished site.
|
||||||
@@ -149,6 +155,10 @@ func (e *Engine) handleJobFinished(ctx context.Context, ev JobFinishedEvent) {
|
|||||||
fmt.Sprintf("%s job %s failed", ev.Kind, ev.JobID), ev.When)
|
fmt.Sprintf("%s job %s failed", ev.Kind, ev.JobID), ev.When)
|
||||||
case "succeeded":
|
case "succeeded":
|
||||||
e.resolveAndNotify(ctx, ev.HostID, kind, dedupKey, ev.When)
|
e.resolveAndNotify(ctx, ev.HostID, kind, dedupKey, ev.When)
|
||||||
|
if ev.Kind == "backup" {
|
||||||
|
// A fresh backup clears staleness for intermittent hosts.
|
||||||
|
e.resolveAndNotify(ctx, ev.HostID, KindStaleSchedule, "", ev.When)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -157,6 +167,12 @@ func (e *Engine) handleHostOffline(ctx context.Context, hostID string) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
// Intermittent hosts (laptops) legitimately disappear — never raise
|
||||||
|
// agent_offline for them. The stale_schedule sweep in tick() is the
|
||||||
|
// only staleness signal for these hosts.
|
||||||
|
if !host.AlwaysOn {
|
||||||
|
return
|
||||||
|
}
|
||||||
// Apply the 15-min floor — raise only when last_seen_at is older
|
// Apply the 15-min floor — raise only when last_seen_at is older
|
||||||
// than agentOfflineFloor. A nil last_seen_at (host enrolled but
|
// than agentOfflineFloor. A nil last_seen_at (host enrolled but
|
||||||
// never connected) is treated as "now" so we don't raise
|
// never connected) is treated as "now" so we don't raise
|
||||||
@@ -203,6 +219,28 @@ func (e *Engine) tick(ctx context.Context, now time.Time) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
for _, h := range hosts {
|
for _, h := range hosts {
|
||||||
|
// Intermittent hosts: suppress agent_offline entirely; instead
|
||||||
|
// raise stale_schedule when they have gone too long with no
|
||||||
|
// successful backup AND they have at least one enabled schedule
|
||||||
|
// to be measured against. A nil LastBackupAt (never backed up)
|
||||||
|
// has no baseline — onboarding/repo_status covers that case.
|
||||||
|
if !h.AlwaysOn {
|
||||||
|
if h.LastBackupAt == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if now.Sub(*h.LastBackupAt) < staleBackupThreshold {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
hasEnabled, err := e.hostHasEnabledSchedule(ctx, h.ID)
|
||||||
|
if err != nil || !hasEnabled {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
e.raiseAndNotify(ctx, h.ID, KindStaleSchedule, "", "warning",
|
||||||
|
fmt.Sprintf("No backup in %s (threshold %s)",
|
||||||
|
roundDur(now.Sub(*h.LastBackupAt)), staleBackupThreshold), now)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Always-on hosts: existing agent_offline re-evaluation.
|
||||||
if h.Status != "offline" || h.LastSeenAt == nil {
|
if h.Status != "offline" || h.LastSeenAt == nil {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@@ -212,7 +250,6 @@ func (e *Engine) tick(ctx context.Context, now time.Time) {
|
|||||||
roundDur(now.Sub(*h.LastSeenAt)), e.agentOfflineFloor), now)
|
roundDur(now.Sub(*h.LastSeenAt)), e.agentOfflineFloor), now)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Stale-schedule sweep — no-op in v1. See KindStaleSchedule doc comment.
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// roundDur returns a human-readable duration string, rounding to the
|
// roundDur returns a human-readable duration string, rounding to the
|
||||||
@@ -224,3 +261,19 @@ func roundDur(d time.Duration) string {
|
|||||||
}
|
}
|
||||||
return d.Round(time.Minute).String()
|
return d.Round(time.Minute).String()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// hostHasEnabledSchedule reports whether the host has at least one
|
||||||
|
// enabled backup schedule — the precondition for a stale_schedule
|
||||||
|
// alert (no schedule = no backup expectation to measure against).
|
||||||
|
func (e *Engine) hostHasEnabledSchedule(ctx context.Context, hostID string) (bool, error) {
|
||||||
|
schedules, err := e.store.ListSchedulesByHost(ctx, hostID)
|
||||||
|
if err != nil {
|
||||||
|
return false, err
|
||||||
|
}
|
||||||
|
for _, sc := range schedules {
|
||||||
|
if sc.Enabled {
|
||||||
|
return true, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false, nil
|
||||||
|
}
|
||||||
|
|||||||
@@ -0,0 +1,172 @@
|
|||||||
|
package alert
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/oklog/ulid/v2"
|
||||||
|
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestIntermittentHostSuppressesOfflineAlert checks that handleHostOffline
|
||||||
|
// does NOT raise agent_offline for a host with AlwaysOn=false.
|
||||||
|
func TestIntermittentHostSuppressesOfflineAlert(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
eng, st, hostID := setupEngine(t)
|
||||||
|
ctx := context.Background()
|
||||||
|
|
||||||
|
// Make the host intermittent.
|
||||||
|
if err := st.SetHostAlwaysOn(ctx, hostID, false); err != nil {
|
||||||
|
t.Fatalf("SetHostAlwaysOn: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Give it a stale last_seen_at well past the floor.
|
||||||
|
if _, err := st.DB().Exec(
|
||||||
|
`UPDATE hosts SET last_seen_at = ?, status = ? WHERE id = ?`,
|
||||||
|
time.Now().UTC().Add(-2*time.Hour).Format(time.RFC3339Nano),
|
||||||
|
"offline",
|
||||||
|
hostID,
|
||||||
|
); err != nil {
|
||||||
|
t.Fatalf("update last_seen_at: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
eng.handleHostOffline(ctx, hostID)
|
||||||
|
|
||||||
|
open, _ := st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
|
||||||
|
if len(open) != 0 {
|
||||||
|
t.Fatalf("expected 0 open alerts for intermittent host; got %d: %+v", len(open), open)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestAlwaysOnHostStillRaisesOfflineAlert checks that always-on hosts still
|
||||||
|
// get an agent_offline alert when offline past the floor.
|
||||||
|
func TestAlwaysOnHostStillRaisesOfflineAlert(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
eng, st, hostID := setupEngine(t)
|
||||||
|
ctx := context.Background()
|
||||||
|
|
||||||
|
// always_on=true is the default, but be explicit.
|
||||||
|
if err := st.SetHostAlwaysOn(ctx, hostID, true); err != nil {
|
||||||
|
t.Fatalf("SetHostAlwaysOn: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Give it a stale last_seen_at well past the 15m floor.
|
||||||
|
if _, err := st.DB().Exec(
|
||||||
|
`UPDATE hosts SET last_seen_at = ?, status = ? WHERE id = ?`,
|
||||||
|
time.Now().UTC().Add(-2*time.Hour).Format(time.RFC3339Nano),
|
||||||
|
"offline",
|
||||||
|
hostID,
|
||||||
|
); err != nil {
|
||||||
|
t.Fatalf("update last_seen_at: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
eng.handleHostOffline(ctx, hostID)
|
||||||
|
|
||||||
|
open, _ := st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
|
||||||
|
if len(open) != 1 || open[0].Kind != KindAgentOffline {
|
||||||
|
t.Fatalf("expected 1 agent_offline alert; got %d: %+v", len(open), open)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestStalenessAlertForIntermittentHost checks that tick raises stale_schedule
|
||||||
|
// for an intermittent host whose last backup is older than 7 days AND has an
|
||||||
|
// enabled schedule. Also verifies that a succeeded backup clears the alert.
|
||||||
|
func TestStalenessAlertForIntermittentHost(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
eng, st, hostID := setupEngine(t)
|
||||||
|
ctx := context.Background()
|
||||||
|
|
||||||
|
// Make intermittent.
|
||||||
|
if err := st.SetHostAlwaysOn(ctx, hostID, false); err != nil {
|
||||||
|
t.Fatalf("SetHostAlwaysOn: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create a source group to attach the schedule to.
|
||||||
|
sgID := ulid.Make().String()
|
||||||
|
if err := st.CreateSourceGroup(ctx, &store.SourceGroup{
|
||||||
|
ID: sgID,
|
||||||
|
HostID: hostID,
|
||||||
|
Name: "default",
|
||||||
|
Includes: []string{"/home"},
|
||||||
|
}); err != nil {
|
||||||
|
t.Fatalf("CreateSourceGroup: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create an enabled schedule pointing at the source group.
|
||||||
|
schedID := ulid.Make().String()
|
||||||
|
if err := st.CreateSchedule(ctx, &store.Schedule{
|
||||||
|
ID: schedID,
|
||||||
|
HostID: hostID,
|
||||||
|
CronExpr: "0 2 * * *",
|
||||||
|
Enabled: true,
|
||||||
|
SourceGroupIDs: []string{sgID},
|
||||||
|
}); err != nil {
|
||||||
|
t.Fatalf("CreateSchedule: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set last_backup_at to 8 days ago.
|
||||||
|
eightDaysAgo := time.Now().UTC().Add(-8 * 24 * time.Hour)
|
||||||
|
if err := st.SetHostLastBackup(ctx, hostID, "succeeded", eightDaysAgo); err != nil {
|
||||||
|
t.Fatalf("SetHostLastBackup: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
eng.tick(ctx, time.Now().UTC())
|
||||||
|
|
||||||
|
open, _ := st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
|
||||||
|
var staleCount int
|
||||||
|
for _, a := range open {
|
||||||
|
if a.Kind == KindStaleSchedule {
|
||||||
|
staleCount++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if staleCount != 1 {
|
||||||
|
t.Fatalf("expected 1 stale_schedule alert after tick; got %d (all open: %+v)", staleCount, open)
|
||||||
|
}
|
||||||
|
|
||||||
|
// A succeeded backup should clear the stale_schedule alert.
|
||||||
|
eng.handleJobFinished(ctx, JobFinishedEvent{
|
||||||
|
HostID: hostID,
|
||||||
|
JobID: ulid.Make().String(),
|
||||||
|
Kind: "backup",
|
||||||
|
Status: "succeeded",
|
||||||
|
SourceGroupID: sgID,
|
||||||
|
When: time.Now().UTC(),
|
||||||
|
})
|
||||||
|
|
||||||
|
open, _ = st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
|
||||||
|
for _, a := range open {
|
||||||
|
if a.Kind == KindStaleSchedule {
|
||||||
|
t.Fatalf("expected stale_schedule to be resolved after backup succeeded; still open: %+v", a)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestNoStalenessWithoutEnabledSchedule checks that no stale_schedule is
|
||||||
|
// raised for an intermittent host with a stale backup but no enabled schedule.
|
||||||
|
func TestNoStalenessWithoutEnabledSchedule(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
eng, st, hostID := setupEngine(t)
|
||||||
|
ctx := context.Background()
|
||||||
|
|
||||||
|
// Make intermittent.
|
||||||
|
if err := st.SetHostAlwaysOn(ctx, hostID, false); err != nil {
|
||||||
|
t.Fatalf("SetHostAlwaysOn: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set last_backup_at to 8 days ago — stale — but no schedule.
|
||||||
|
eightDaysAgo := time.Now().UTC().Add(-8 * 24 * time.Hour)
|
||||||
|
if err := st.SetHostLastBackup(ctx, hostID, "succeeded", eightDaysAgo); err != nil {
|
||||||
|
t.Fatalf("SetHostLastBackup: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
eng.tick(ctx, time.Now().UTC())
|
||||||
|
|
||||||
|
open, _ := st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
|
||||||
|
for _, a := range open {
|
||||||
|
if a.Kind == KindStaleSchedule {
|
||||||
|
t.Fatalf("expected no stale_schedule without an enabled schedule; got: %+v", a)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -122,6 +122,16 @@ func alertPayload(ctx context.Context, st *store.Store, ev notification.Event, a
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ResolveOnModeChange clears any open agent_offline and stale_schedule
|
||||||
|
// alerts for a host whose always-on flag was just toggled. The next
|
||||||
|
// 60s tick re-raises whichever still applies under the new mode, so
|
||||||
|
// this is a self-correcting "wipe and let the sweep settle" call.
|
||||||
|
// Safe to invoke from the HTTP layer (it only touches the store + hub).
|
||||||
|
func (e *Engine) ResolveOnModeChange(ctx context.Context, hostID string, when time.Time) {
|
||||||
|
e.resolveAndNotify(ctx, hostID, KindAgentOffline, "", when)
|
||||||
|
e.resolveAndNotify(ctx, hostID, KindStaleSchedule, "", when)
|
||||||
|
}
|
||||||
|
|
||||||
// resolveAndNotify clears the open (or acknowledged) alert matching
|
// resolveAndNotify clears the open (or acknowledged) alert matching
|
||||||
// (host_id, kind, dedup_key) via store.AutoResolve, then fires
|
// (host_id, kind, dedup_key) via store.AutoResolve, then fires
|
||||||
// alert.resolved for the row(s) actually closed. Best-effort —
|
// alert.resolved for the row(s) actually closed. Best-effort —
|
||||||
|
|||||||
Reference in New Issue
Block a user