package alert

import (
	"context"
	"fmt"
	"log/slog"
	"time"

	"gitea.dcglab.co.uk/steve/restic-manager/internal/notification"
	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
)

// Alert kind constants — keep in lockstep with the engine logic and
// the UI tag-colour table.
const (
	// KindBackupFailed is raised when a backup job finishes with
	// status "failed" and resolved on next backup success.
	KindBackupFailed = "backup_failed"
	// KindForgetFailed mirrors KindBackupFailed for forget jobs.
	KindForgetFailed = "forget_failed"
	// KindPruneFailed mirrors KindBackupFailed for prune jobs.
	KindPruneFailed = "prune_failed"
	// KindCheckFailed is raised at "critical" severity (repository
	// integrity is at risk) when a check job fails.
	KindCheckFailed = "check_failed"
	// KindStaleSchedule is raised for intermittent (non-always-on) hosts
	// when their last successful backup is older than staleBackupThreshold
	// (7 days) and they have at least one enabled schedule. Resolved on
	// backup success or when the host is switched to always-on mode.
	KindStaleSchedule = "stale_schedule"
	// KindAgentOffline is raised when a host's last_seen_at is older
	// than the 15-minute floor and resolved when the host reconnects.
	KindAgentOffline = "agent_offline"
)

// raiseAndNotify is the standard raise pattern: store.RaiseOrTouch
// deduplicates, and notification.Hub.Dispatch fires only on the first
// raise (didRaise=true). Subsequent occurrences of the same open alert
// are "touched" (last_seen_at bumped) without a second notification.
func (e *Engine) raiseAndNotify(ctx context.Context, hostID, kind, dedupKey, severity, message string, when time.Time) { id, didRaise, err := e.store.RaiseOrTouch(ctx, hostID, kind, dedupKey, severity, message, when) if err != nil { slog.Warn("alert: raise", "kind", kind, "host_id", hostID, "dedup_key", dedupKey, "err", err) return } if !didRaise { return } host, err := e.store.GetHost(ctx, hostID) hostName := hostID if err == nil { hostName = host.Name } go e.hub.Dispatch(ctx, notification.Payload{ Event: notification.EventRaised, AlertID: id, Severity: severity, Kind: kind, HostID: hostID, HostName: hostName, Message: message, RaisedAt: when, }) } // Acknowledge updates the alert row and fans out alert.acknowledged to // every enabled channel. Best-effort: store errors are logged but the // dispatch still fires only when the store update succeeds. func (e *Engine) Acknowledge(ctx context.Context, alertID, userID string, when time.Time) error { if err := e.store.Acknowledge(ctx, alertID, userID, when); err != nil { return err } a, lerr := e.store.GetAlert(ctx, alertID) if lerr != nil || a == nil { // Acknowledge already succeeded; dispatch is best-effort. return nil //nolint:nilerr } p := alertPayload(ctx, e.store, notification.EventAcknowledged, a) go e.hub.Dispatch(context.WithoutCancel(ctx), p) return nil } // Resolve marks the alert resolved and fans out alert.resolved. func (e *Engine) Resolve(ctx context.Context, alertID string, when time.Time) error { a, _ := e.store.GetAlert(ctx, alertID) if err := e.store.Resolve(ctx, alertID, when); err != nil { return err } if a == nil { return nil } p := alertPayload(ctx, e.store, notification.EventResolved, a) go e.hub.Dispatch(context.WithoutCancel(ctx), p) return nil } // alertPayload builds a Payload from a stored Alert, looking up the host // name when HostID is set. 
func alertPayload(ctx context.Context, st *store.Store, ev notification.Event, a *store.Alert) notification.Payload { hostID, hostName := "", "" if a.HostID != nil { hostID = *a.HostID hostName = hostID if h, err := st.GetHost(ctx, hostID); err == nil && h != nil { hostName = h.Name } } return notification.Payload{ Event: ev, AlertID: a.ID, Severity: a.Severity, Kind: a.Kind, HostID: hostID, HostName: hostName, Message: a.Message, RaisedAt: a.CreatedAt, } } // ResolveOnModeChange clears any open agent_offline and stale_schedule // alerts for a host whose always-on flag was just toggled. The next // 60s tick re-raises whichever still applies under the new mode, so // this is a self-correcting "wipe and let the sweep settle" call. // Safe to invoke from the HTTP layer (it only touches the store + hub). func (e *Engine) ResolveOnModeChange(ctx context.Context, hostID string, when time.Time) { e.resolveAndNotify(ctx, hostID, KindAgentOffline, "", when) e.resolveAndNotify(ctx, hostID, KindStaleSchedule, "", when) } // resolveAndNotify clears the open (or acknowledged) alert matching // (host_id, kind, dedup_key) via store.AutoResolve, then fires // alert.resolved for the row(s) actually closed. Best-effort — // errors are logged but do not propagate. func (e *Engine) resolveAndNotify(ctx context.Context, hostID, kind, dedupKey string, when time.Time) { open, err := e.store.ListAlerts(ctx, store.AlertFilter{ Status: "open", HostID: hostID, }) if err != nil { return } openAcked, _ := e.store.ListAlerts(ctx, store.AlertFilter{ Status: "acknowledged", HostID: hostID, }) all := append(open, openAcked...) 
if err := e.store.AutoResolve(ctx, hostID, kind, dedupKey, when); err != nil { slog.Warn("alert: auto-resolve", "kind", kind, "host_id", hostID, "dedup_key", dedupKey, "err", err) return } host, _ := e.store.GetHost(ctx, hostID) hostName := hostID if host != nil { hostName = host.Name } for _, a := range all { if a.Kind != kind || a.DedupKey != dedupKey { continue } go e.hub.Dispatch(ctx, notification.Payload{ Event: notification.EventResolved, AlertID: a.ID, Severity: a.Severity, Kind: a.Kind, HostID: hostID, HostName: hostName, Message: fmt.Sprintf("Auto-resolved (%s)", kind), RaisedAt: when, }) } }