diff --git a/internal/alert/engine.go b/internal/alert/engine.go index 56c8427..2ef67db 100644 --- a/internal/alert/engine.go +++ b/internal/alert/engine.go @@ -13,6 +13,7 @@ package alert import ( "context" + "fmt" "log/slog" "sync" "time" @@ -116,19 +117,89 @@ func (e *Engine) NotifyHostOnline(hostID string) { } } -// Placeholder method stubs for C2 implementation func (e *Engine) handleJobFinished(ctx context.Context, ev JobFinishedEvent) { - // Implemented in C2 + // Determine which kind/severity pair this job maps to. Jobs not + // listed here (init, unlock, restore, diff) produce no alerts in v1. + var kind, severity string + switch ev.Kind { + case "backup": + kind, severity = KindBackupFailed, "warning" + case "forget": + kind, severity = KindForgetFailed, "warning" + case "prune": + kind, severity = KindPruneFailed, "warning" + case "check": + kind, severity = KindCheckFailed, "critical" + default: + return + } + switch ev.Status { + case "failed": + e.raiseAndNotify(ctx, ev.HostID, kind, severity, + fmt.Sprintf("%s job %s failed", ev.Kind, ev.JobID), ev.When) + case "succeeded": + e.resolveAndNotify(ctx, ev.HostID, kind, ev.When) + } } func (e *Engine) handleHostOffline(ctx context.Context, hostID string) { - // Implemented in C2 + host, err := e.store.GetHost(ctx, hostID) + if err != nil { + return + } + // Apply the 15-min floor — raise only when last_seen_at is older + // than agentOfflineFloor. A nil last_seen_at (host enrolled but + // never connected) is treated as "now" so we don't raise + // immediately on enrolment. + if host.LastSeenAt == nil { + return + } + if time.Since(*host.LastSeenAt) < e.agentOfflineFloor { + return + } + e.raiseAndNotify(ctx, hostID, KindAgentOffline, "warning", + fmt.Sprintf("Agent offline for %s (threshold %s)", + roundDur(time.Since(*host.LastSeenAt)), e.agentOfflineFloor), + time.Now().UTC()) } func (e *Engine) handleHostOnline(ctx context.Context, hostID string) { - // Implemented in C2 + e.resolveAndNotify(ctx, hostID, KindAgentOffline, time.Now().UTC()) } +// tick is the 60-second sweep. Responsibilities: +// 1. Re-evaluate agent_offline for every offline host that may have +// crossed the floor between explicit events. +// 2. Stale-schedule detection — declared in the spec but intentionally +// left as a no-op in v1. The precise "expected to have fired but +// didn't" trigger requires a store helper that lands in a later +// task. The KindStaleSchedule constant is exported so UI code can +// reference the tag string today. func (e *Engine) tick(ctx context.Context, now time.Time) { - // Implemented in C2 + hosts, err := e.store.ListHosts(ctx) + if err != nil { + slog.Warn("alert: tick list hosts", "err", err) + return + } + for _, h := range hosts { + if h.Status != "offline" || h.LastSeenAt == nil { + continue + } + if now.Sub(*h.LastSeenAt) >= e.agentOfflineFloor { + e.raiseAndNotify(ctx, h.ID, KindAgentOffline, "warning", + fmt.Sprintf("Agent offline for %s (threshold %s)", + roundDur(now.Sub(*h.LastSeenAt)), e.agentOfflineFloor), now) + } + } + // Stale-schedule sweep — no-op in v1. See KindStaleSchedule doc comment. +} + +// roundDur returns a human-readable duration string, rounding to the +// nearest minute. Durations under a minute are reported as "less than +// a minute". +func roundDur(d time.Duration) string { + if d < time.Minute { + return "less than a minute" + } + return d.Round(time.Minute).String() } diff --git a/internal/alert/rules.go b/internal/alert/rules.go new file mode 100644 index 0000000..3207d63 --- /dev/null +++ b/internal/alert/rules.go @@ -0,0 +1,110 @@ +package alert + +import ( + "context" + "fmt" + "log/slog" + "time" + + "gitea.dcglab.co.uk/steve/restic-manager/internal/notification" + "gitea.dcglab.co.uk/steve/restic-manager/internal/store" +) + +// Alert kind constants — keep in lockstep with the engine logic and +// the UI tag-colour table. +const ( + // KindBackupFailed is raised when a backup job finishes with + // status "failed" and resolved on next backup success. + KindBackupFailed = "backup_failed" + + // KindForgetFailed mirrors KindBackupFailed for forget jobs. + KindForgetFailed = "forget_failed" + + // KindPruneFailed mirrors KindBackupFailed for prune jobs. + KindPruneFailed = "prune_failed" + + // KindCheckFailed is raised at "critical" severity (repository + // integrity is at risk) when a check job fails. + KindCheckFailed = "check_failed" + + // KindStaleSchedule is declared for completeness but intentionally + // left as a no-op in v1. The precise "expected to have fired but + // didn't" logic requires a store helper that lands in a follow-up + // task. Ask the team before implementing. + KindStaleSchedule = "stale_schedule" + + // KindAgentOffline is raised when a host's last_seen_at is older + // than the 15-minute floor and resolved when the host reconnects. + KindAgentOffline = "agent_offline" +) + +// raiseAndNotify is the standard raise pattern: store.RaiseOrTouch +// deduplicates, and notification.Hub.Dispatch fires only on the first +// raise (didRaise=true). Subsequent occurrences of the same open alert +// are "touched" (last_seen_at bumped) without a second notification. +func (e *Engine) raiseAndNotify(ctx context.Context, hostID, kind, severity, message string, when time.Time) { + id, didRaise, err := e.store.RaiseOrTouch(ctx, hostID, kind, severity, message, when) + if err != nil { + slog.Warn("alert: raise", "kind", kind, "host_id", hostID, "err", err) + return + } + if !didRaise { + return + } + host, err := e.store.GetHost(ctx, hostID) + hostName := hostID + if err == nil { + hostName = host.Name + } + go e.hub.Dispatch(ctx, notification.Payload{ + Event: notification.EventRaised, + AlertID: id, + Severity: severity, + Kind: kind, + HostID: hostID, + HostName: hostName, + Message: message, + RaisedAt: when, + }) +} + +// resolveAndNotify clears every open (or acknowledged) alert for +// (host_id, kind) via store.AutoResolve, then fires alert.resolved +// for each row that was actually open. Best-effort — errors are +// logged but do not propagate. +func (e *Engine) resolveAndNotify(ctx context.Context, hostID, kind string, when time.Time) { + open, err := e.store.ListAlerts(ctx, store.AlertFilter{ + Status: "open", HostID: hostID, + }) + if err != nil { + return + } + openAcked, _ := e.store.ListAlerts(ctx, store.AlertFilter{ + Status: "acknowledged", HostID: hostID, + }) + all := append(open, openAcked...) + if err := e.store.AutoResolve(ctx, hostID, kind, when); err != nil { + slog.Warn("alert: auto-resolve", "kind", kind, "host_id", hostID, "err", err) + return + } + host, _ := e.store.GetHost(ctx, hostID) + hostName := hostID + if host != nil { + hostName = host.Name + } + for _, a := range all { + if a.Kind != kind { + continue + } + go e.hub.Dispatch(ctx, notification.Payload{ + Event: notification.EventResolved, + AlertID: a.ID, + Severity: a.Severity, + Kind: a.Kind, + HostID: hostID, + HostName: hostName, + Message: fmt.Sprintf("Auto-resolved (%s)", kind), + RaisedAt: when, + }) + } +} diff --git a/internal/alert/rules_test.go b/internal/alert/rules_test.go new file mode 100644 index 0000000..c8f9d32 --- /dev/null +++ b/internal/alert/rules_test.go @@ -0,0 +1,125 @@ +package alert + +import ( + "context" + "path/filepath" + "testing" + "time" + + "github.com/oklog/ulid/v2" + + "gitea.dcglab.co.uk/steve/restic-manager/internal/crypto" + "gitea.dcglab.co.uk/steve/restic-manager/internal/notification" + "gitea.dcglab.co.uk/steve/restic-manager/internal/store" +) + +func setupEngine(t *testing.T) (*Engine, *store.Store, string) { + t.Helper() + dir := t.TempDir() + st, _ := store.Open(context.Background(), filepath.Join(dir, "rm.db")) + t.Cleanup(func() { _ = st.Close() }) + keyPath := filepath.Join(dir, "secret.key") + _ = crypto.GenerateKeyFile(keyPath) + key, _ := crypto.LoadKeyFromFile(keyPath) + aead, _ := crypto.NewAEAD(key) + hub := notification.NewHub(st, aead, "https://rm.example") + eng := NewEngine(st, hub) + hostID := ulid.Make().String() + if err := st.CreateHost(context.Background(), store.Host{ + ID: hostID, Name: "alfa-01", OS: "linux", Arch: "amd64", + EnrolledAt: time.Now().UTC(), + }, "deadbeef", ""); err != nil { + t.Fatalf("create host: %v", err) + } + return eng, st, hostID +} + +func TestEngineBackupFailedRaisesThenResolves(t *testing.T) { + t.Parallel() + eng, st, hostID := setupEngine(t) + ctx := context.Background() + + eng.handleJobFinished(ctx, JobFinishedEvent{ + HostID: hostID, JobID: "j1", Kind: "backup", Status: "failed", + When: time.Now().UTC(), + }) + open, _ := st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID}) + if len(open) != 1 || open[0].Kind != KindBackupFailed { + t.Fatalf("expected one backup_failed open; got %+v", open) + } + + // Second failed job should TOUCH (not raise a fresh row). + eng.handleJobFinished(ctx, JobFinishedEvent{ + HostID: hostID, JobID: "j2", Kind: "backup", Status: "failed", + When: time.Now().UTC().Add(time.Minute), + }) + open, _ = st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID}) + if len(open) != 1 { + t.Fatalf("expected dedup to stay at 1 open; got %d", len(open)) + } + + // Success auto-resolves. + eng.handleJobFinished(ctx, JobFinishedEvent{ + HostID: hostID, JobID: "j3", Kind: "backup", Status: "succeeded", + When: time.Now().UTC().Add(2 * time.Minute), + }) + open, _ = st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID}) + if len(open) != 0 { + t.Fatalf("expected zero open after success; got %d", len(open)) + } +} + +func TestEngineCheckFailedSeverityCritical(t *testing.T) { + t.Parallel() + eng, st, hostID := setupEngine(t) + eng.handleJobFinished(context.Background(), JobFinishedEvent{ + HostID: hostID, Kind: "check", Status: "failed", When: time.Now().UTC(), + }) + open, _ := st.ListAlerts(context.Background(), + store.AlertFilter{Status: "open", HostID: hostID}) + if len(open) != 1 || open[0].Severity != "critical" { + t.Fatalf("got %+v", open) + } +} + +func TestEngineAgentOfflineRespects15MinFloor(t *testing.T) { + t.Parallel() + eng, st, hostID := setupEngine(t) + // Host's last_seen_at defaulted to NULL via CreateHost (enrolled but never + // seen). Force a stale value for the test by direct DB update. + if _, err := st.DB().Exec( + `UPDATE hosts SET last_seen_at = ? WHERE id = ?`, + time.Now().UTC().Add(-20*time.Minute).Format(time.RFC3339Nano), hostID, + ); err != nil { + t.Fatalf("update last_seen_at: %v", err) + } + eng.handleHostOffline(context.Background(), hostID) + open, _ := st.ListAlerts(context.Background(), + store.AlertFilter{Status: "open", HostID: hostID}) + if len(open) != 1 { + t.Fatalf("expected agent_offline raised; got %d", len(open)) + } + + // Bring back online — should auto-resolve. + eng.handleHostOnline(context.Background(), hostID) + open, _ = st.ListAlerts(context.Background(), + store.AlertFilter{Status: "open", HostID: hostID}) + if len(open) != 0 { + t.Fatalf("expected agent_offline resolved; got %d", len(open)) + } +} + +func TestEngineAgentOfflineUnderFloorNoRaise(t *testing.T) { + t.Parallel() + eng, st, hostID := setupEngine(t) + // last_seen_at is NULL from CreateHost (never touched). A nil + // last_seen_at means the host was enrolled but never connected — + // treat that as "now" for the floor check so we don't raise + // immediately. handleHostOffline must skip the raise. + eng.handleHostOffline(context.Background(), hostID) + open, _ := st.ListAlerts(context.Background(), + store.AlertFilter{Status: "open", HostID: hostID}) + if len(open) != 0 { + t.Fatalf("expected no raise within 15-min floor; got %d", len(open)) + } +}