feat(alerts): per-source-group dedup so two failing backups produce two alerts

Until now the open-alert key was (host_id, kind, resolved_at IS NULL). A host with two source groups both failing collapsed onto one backup_failed row — the second failure bumped last_seen_at and overwrote the message but never triggered a new fan-out. Operators saw one alert that appeared to flap, not two distinct broken things.

Schema changes (column-level ALTER, no rebuild):
- 0015 jobs.source_group_id (FK → source_groups, ON DELETE SET NULL, index). Populated for backup jobs in CreateJob.
- 0016 alerts.dedup_key (NOT NULL DEFAULT ''). The old alerts_open partial index gets dropped and replaced with a UNIQUE partial index on (host_id, kind, dedup_key) WHERE resolved_at IS NULL — the index is now the actual dedup primitive.

Plumbing:
- RaiseOrTouch / AutoResolve / Alert struct gain dedup_key.
- engine.JobFinishedEvent gains SourceGroupID; handleJobFinished passes it through for backup_failed only (forget/prune/check stay repo-scoped with key='').
- ws.handler reads SourceGroupID off the freshly-loaded job row.
- dispatchJobWithPayload gains a *string sourceGroupID arg; the per-group Run-now path and schedule.fire path pass &g.ID.

Test coverage: TestRaiseOrTouchDedupsPerSourceGroup proves two distinct groups produce two distinct open alerts and that resolving one does not auto-resolve the other.

Dev tool: cmd/_fake_alert gains -dedup-key flag.
2026-05-04 22:58:29 +01:00
parent 9d7a714102
commit 350be3f19d
15 changed files with 214 additions and 95 deletions
@@ -26,11 +26,12 @@ import (
 // the failed-X rules. Pushed via Engine.NotifyJobFinished from the
 // MarkJobFinished site.
 type JobFinishedEvent struct {
-	HostID string
-	JobID  string
-	Kind   string // backup | forget | prune | check | unlock | restore | diff
-	Status string // succeeded | failed | cancelled
-	When   time.Time
+	HostID        string
+	JobID         string
+	Kind          string // backup | forget | prune | check | unlock | restore | diff
+	Status        string // succeeded | failed | cancelled
+	SourceGroupID string // source group of the job; used as the alert dedup key for backup only (forget/prune/check stay repo-scoped with an empty key)
+	When          time.Time
 }

 // Engine evaluates hardcoded alert rules and dispatches via notification.Hub.
@@ -133,12 +134,21 @@ func (e *Engine) handleJobFinished(ctx context.Context, ev JobFinishedEvent) {
 	default:
 		return
 	}
+	// dedupKey scopes the alert to a specific subject. For backups it's
+	// the source-group id (each group = its own restic run = its own
+	// failure surface). forget/prune/check are repo-scoped — leave the
+	// key empty so we get one alert per host per kind, matching the
+	// "is this repo healthy?" mental model.
+	dedupKey := ""
+	if ev.Kind == "backup" {
+		dedupKey = ev.SourceGroupID
+	}
 	switch ev.Status {
 	case "failed":
-		e.raiseAndNotify(ctx, ev.HostID, kind, severity,
+		e.raiseAndNotify(ctx, ev.HostID, kind, dedupKey, severity,
 			fmt.Sprintf("%s job %s failed", ev.Kind, ev.JobID), ev.When)
 	case "succeeded":
-		e.resolveAndNotify(ctx, ev.HostID, kind, ev.When)
+		e.resolveAndNotify(ctx, ev.HostID, kind, dedupKey, ev.When)
 	}
 }

@@ -157,14 +167,14 @@ func (e *Engine) handleHostOffline(ctx context.Context, hostID string) {
 	if time.Since(*host.LastSeenAt) < e.agentOfflineFloor {
 		return
 	}
-	e.raiseAndNotify(ctx, hostID, KindAgentOffline, "warning",
+	e.raiseAndNotify(ctx, hostID, KindAgentOffline, "", "warning",
 		fmt.Sprintf("Agent offline for %s (threshold %s)",
 			roundDur(time.Since(*host.LastSeenAt)), e.agentOfflineFloor),
 		time.Now().UTC())
 }

 func (e *Engine) handleHostOnline(ctx context.Context, hostID string) {
-	e.resolveAndNotify(ctx, hostID, KindAgentOffline, time.Now().UTC())
+	e.resolveAndNotify(ctx, hostID, KindAgentOffline, "", time.Now().UTC())
 }

 // tick is the 60-second sweep. Responsibilities:
@@ -186,7 +196,7 @@ func (e *Engine) tick(ctx context.Context, now time.Time) {
 			continue
 		}
 		if now.Sub(*h.LastSeenAt) >= e.agentOfflineFloor {
-			e.raiseAndNotify(ctx, h.ID, KindAgentOffline, "warning",
+			e.raiseAndNotify(ctx, h.ID, KindAgentOffline, "", "warning",
 				fmt.Sprintf("Agent offline for %s (threshold %s)",
 					roundDur(now.Sub(*h.LastSeenAt)), e.agentOfflineFloor), now)
 		}
@@ -42,10 +42,10 @@ const (
 // deduplicates, and notification.Hub.Dispatch fires only on the first
 // raise (didRaise=true). Subsequent occurrences of the same open alert
 // are "touched" (last_seen_at bumped) without a second notification.
-func (e *Engine) raiseAndNotify(ctx context.Context, hostID, kind, severity, message string, when time.Time) {
-	id, didRaise, err := e.store.RaiseOrTouch(ctx, hostID, kind, severity, message, when)
+func (e *Engine) raiseAndNotify(ctx context.Context, hostID, kind, dedupKey, severity, message string, when time.Time) {
+	id, didRaise, err := e.store.RaiseOrTouch(ctx, hostID, kind, dedupKey, severity, message, when)
 	if err != nil {
-		slog.Warn("alert: raise", "kind", kind, "host_id", hostID, "err", err)
+		slog.Warn("alert: raise", "kind", kind, "host_id", hostID, "dedup_key", dedupKey, "err", err)
 		return
 	}
 	if !didRaise {
@@ -122,11 +122,11 @@ func alertPayload(ctx context.Context, st *store.Store, ev notification.Event, a
 	}
 }

-// resolveAndNotify clears every open (or acknowledged) alert for
-// (host_id, kind) via store.AutoResolve, then fires alert.resolved
-// for each row that was actually open. Best-effort — errors are
-// logged but do not propagate.
-func (e *Engine) resolveAndNotify(ctx context.Context, hostID, kind string, when time.Time) {
+// resolveAndNotify clears the open (or acknowledged) alert matching
+// (host_id, kind, dedup_key) via store.AutoResolve, then fires
+// alert.resolved for the row(s) actually closed. Best-effort —
+// errors are logged but do not propagate.
+func (e *Engine) resolveAndNotify(ctx context.Context, hostID, kind, dedupKey string, when time.Time) {
 	open, err := e.store.ListAlerts(ctx, store.AlertFilter{
 		Status: "open", HostID: hostID,
 	})
@@ -137,8 +137,8 @@ func (e *Engine) resolveAndNotify(ctx context.Context, hostID, kind string, when
 		Status: "acknowledged", HostID: hostID,
 	})
 	all := append(open, openAcked...)
-	if err := e.store.AutoResolve(ctx, hostID, kind, when); err != nil {
-		slog.Warn("alert: auto-resolve", "kind", kind, "host_id", hostID, "err", err)
+	if err := e.store.AutoResolve(ctx, hostID, kind, dedupKey, when); err != nil {
+		slog.Warn("alert: auto-resolve", "kind", kind, "host_id", hostID, "dedup_key", dedupKey, "err", err)
 		return
 	}
 	host, _ := e.store.GetHost(ctx, hostID)
@@ -147,7 +147,7 @@ func (e *Engine) resolveAndNotify(ctx context.Context, hostID, kind string, when
 		hostName = host.Name
 	}
 	for _, a := range all {
-		if a.Kind != kind {
+		if a.Kind != kind || a.DedupKey != dedupKey {
 			continue
 		}
 		go e.hub.Dispatch(ctx, notification.Payload{