feat(alerts): per-source-group dedup so two failing backups produce two alerts

Until now the open-alert key was (host_id, kind, resolved_at IS NULL).
A host with two source groups both failing collapsed onto one
backup_failed row — the second failure bumped last_seen_at and
overwrote the message but never fanned out again. Operators saw one
alert that appeared to flap, not two distinct broken things.

Schema changes (column-level ALTER, no rebuild):

- 0015 jobs.source_group_id (FK → source_groups, ON DELETE SET NULL,
  index). Populated for backup jobs in CreateJob.
- 0016 alerts.dedup_key (NOT NULL DEFAULT ''). The old alerts_open
  partial index gets dropped and replaced with a UNIQUE partial
  index on (host_id, kind, dedup_key) WHERE resolved_at IS NULL —
  the index is now the actual dedup primitive.

Plumbing:

- RaiseOrTouch / AutoResolve / Alert struct gain dedup_key.
- engine.JobFinishedEvent gains SourceGroupID; handleJobFinished
  passes it through for backup_failed only (forget/prune/check stay
  repo-scoped with key='').
- ws.handler reads SourceGroupID off the freshly-loaded job row.
- dispatchJobWithPayload gains a *string sourceGroupID arg; the
  per-group Run-now path and schedule.fire path pass &g.ID.

Test coverage: TestRaiseOrTouchDedupsPerSourceGroup proves two
distinct groups produce two distinct open alerts and that resolving
one does not auto-resolve the other.

Dev tool: cmd/_fake_alert gains -dedup-key flag.
This commit is contained in:
2026-05-04 22:58:29 +01:00
parent 7792aadb94
commit a45c801884
15 changed files with 214 additions and 95 deletions
+11 -11
View File
@@ -42,10 +42,10 @@ const (
// deduplicates, and notification.Hub.Dispatch fires only on the first
// raise (didRaise=true). Subsequent occurrences of the same open alert
// are "touched" (last_seen_at bumped) without a second notification.
func (e *Engine) raiseAndNotify(ctx context.Context, hostID, kind, severity, message string, when time.Time) {
id, didRaise, err := e.store.RaiseOrTouch(ctx, hostID, kind, severity, message, when)
func (e *Engine) raiseAndNotify(ctx context.Context, hostID, kind, dedupKey, severity, message string, when time.Time) {
id, didRaise, err := e.store.RaiseOrTouch(ctx, hostID, kind, dedupKey, severity, message, when)
if err != nil {
slog.Warn("alert: raise", "kind", kind, "host_id", hostID, "err", err)
slog.Warn("alert: raise", "kind", kind, "host_id", hostID, "dedup_key", dedupKey, "err", err)
return
}
if !didRaise {
@@ -122,11 +122,11 @@ func alertPayload(ctx context.Context, st *store.Store, ev notification.Event, a
}
}
// resolveAndNotify clears every open (or acknowledged) alert for
// (host_id, kind) via store.AutoResolve, then fires alert.resolved
// for each row that was actually open. Best-effort — errors are
// logged but do not propagate.
func (e *Engine) resolveAndNotify(ctx context.Context, hostID, kind string, when time.Time) {
// resolveAndNotify clears the open (or acknowledged) alert matching
// (host_id, kind, dedup_key) via store.AutoResolve, then fires
// alert.resolved for the row(s) actually closed. Best-effort —
// errors are logged but do not propagate.
func (e *Engine) resolveAndNotify(ctx context.Context, hostID, kind, dedupKey string, when time.Time) {
open, err := e.store.ListAlerts(ctx, store.AlertFilter{
Status: "open", HostID: hostID,
})
@@ -137,8 +137,8 @@ func (e *Engine) resolveAndNotify(ctx context.Context, hostID, kind string, when
Status: "acknowledged", HostID: hostID,
})
all := append(open, openAcked...)
if err := e.store.AutoResolve(ctx, hostID, kind, when); err != nil {
slog.Warn("alert: auto-resolve", "kind", kind, "host_id", hostID, "err", err)
if err := e.store.AutoResolve(ctx, hostID, kind, dedupKey, when); err != nil {
slog.Warn("alert: auto-resolve", "kind", kind, "host_id", hostID, "dedup_key", dedupKey, "err", err)
return
}
host, _ := e.store.GetHost(ctx, hostID)
@@ -147,7 +147,7 @@ func (e *Engine) resolveAndNotify(ctx context.Context, hostID, kind string, when
hostName = host.Name
}
for _, a := range all {
if a.Kind != kind {
if a.Kind != kind || a.DedupKey != dedupKey {
continue
}
go e.hub.Dispatch(ctx, notification.Payload{