feat(alerts): per-source-group dedup so two failing backups produce two alerts

Until now the open-alert key was (host_id, kind, resolved_at IS NULL).
A host with two source groups both failing collapsed onto one
backup_failed row — the second failure bumped last_seen_at and
overwrote the message but never fanned out again. Operators saw one
alert that appeared to flap, not two distinct broken things.

Schema changes (column-level ALTER, no rebuild):

- 0015 jobs.source_group_id (FK → source_groups, ON DELETE SET NULL,
  index). Populated for backup jobs in CreateJob.
- 0016 alerts.dedup_key (NOT NULL DEFAULT ''). The old alerts_open
  partial index gets dropped and replaced with a UNIQUE partial
  index on (host_id, kind, dedup_key) WHERE resolved_at IS NULL —
  the index is now the actual dedup primitive.

Plumbing:

- RaiseOrTouch / AutoResolve / Alert struct gain dedup_key.
- engine.JobFinishedEvent gains SourceGroupID; handleJobFinished
  passes it through for backup_failed only (forget/prune/check stay
  repo-scoped with key='').
- ws.handler reads SourceGroupID off the freshly-loaded job row.
- dispatchJobWithPayload gains a *string sourceGroupID arg; the
  per-group Run-now path and schedule.fire path pass &g.ID.

Test coverage: TestRaiseOrTouchDedupsPerSourceGroup proves two
distinct groups produce two distinct open alerts and that resolving
one does not auto-resolve the other.

Dev tool: cmd/_fake_alert gains -dedup-key flag.
This commit is contained in:
2026-05-04 22:58:29 +01:00
parent 9d7a714102
commit 350be3f19d
15 changed files with 214 additions and 95 deletions
+41 -9
View File
@@ -33,7 +33,7 @@ func TestRaiseOrTouchInsertsThenTouches(t *testing.T) {
ctx := context.Background()
t0 := time.Now().UTC()
id1, didRaise, err := st.RaiseOrTouch(ctx, hostID, "backup_failed", "warning",
id1, didRaise, err := st.RaiseOrTouch(ctx, hostID, "backup_failed", "", "warning",
"Backup failed: 401", t0)
if err != nil {
t.Fatalf("first raise: %v", err)
@@ -47,7 +47,7 @@ func TestRaiseOrTouchInsertsThenTouches(t *testing.T) {
// Second call within the same open window should touch, not insert.
t1 := t0.Add(60 * time.Second)
id2, didRaise2, err := st.RaiseOrTouch(ctx, hostID, "backup_failed", "warning",
id2, didRaise2, err := st.RaiseOrTouch(ctx, hostID, "backup_failed", "", "warning",
"Backup failed: 401 (still)", t1)
if err != nil {
t.Fatalf("touch: %v", err)
@@ -78,7 +78,7 @@ func TestResolveAndReRaiseStartsFreshAlert(t *testing.T) {
ctx := context.Background()
t0 := time.Now().UTC()
id1, _, err := st.RaiseOrTouch(ctx, hostID, "backup_failed", "warning", "first", t0)
id1, _, err := st.RaiseOrTouch(ctx, hostID, "backup_failed", "", "warning", "first", t0)
if err != nil {
t.Fatalf("raise: %v", err)
}
@@ -86,7 +86,7 @@ func TestResolveAndReRaiseStartsFreshAlert(t *testing.T) {
t.Fatalf("resolve: %v", err)
}
id2, didRaise, err := st.RaiseOrTouch(ctx, hostID, "backup_failed", "warning", "second", t0.Add(2*time.Minute))
id2, didRaise, err := st.RaiseOrTouch(ctx, hostID, "backup_failed", "", "warning", "second", t0.Add(2*time.Minute))
if err != nil {
t.Fatalf("re-raise: %v", err)
}
@@ -98,6 +98,38 @@ func TestResolveAndReRaiseStartsFreshAlert(t *testing.T) {
}
}
// TestRaiseOrTouchDedupsPerSourceGroup checks that the dedup key is part of
// an open alert's identity. Two source groups failing on the same host must
// produce two distinct open alerts (not one collapsed row). Pre-dedup-key,
// the second raise would have touched the existing row and silently dropped
// the second failure.
//
// It also checks that auto-resolving one group's alert closes that alert
// and leaves the other group's alert open.
func TestRaiseOrTouchDedupsPerSourceGroup(t *testing.T) {
	t.Parallel()
	st, hostID := newTestStoreWithHost(t)
	ctx := context.Background()

	t0 := time.Now().UTC()
	idA, didA, err := st.RaiseOrTouch(ctx, hostID, "backup_failed", "group-a",
		"warning", "group A failed", t0)
	if err != nil || !didA {
		t.Fatalf("group A raise: id=%q didRaise=%v err=%v", idA, didA, err)
	}
	idB, didB, err := st.RaiseOrTouch(ctx, hostID, "backup_failed", "group-b",
		"warning", "group B failed", t0.Add(time.Second))
	if err != nil || !didB {
		t.Fatalf("group B raise: id=%q didRaise=%v err=%v", idB, didB, err)
	}
	if idA == idB {
		t.Fatalf("expected distinct alert ids per source group, got %q twice", idA)
	}

	// Resolving group A must not auto-resolve group B.
	if err := st.AutoResolve(ctx, hostID, "backup_failed", "group-a", t0.Add(2*time.Second)); err != nil {
		t.Fatalf("auto-resolve A: %v", err)
	}

	// Confirm A really was resolved; otherwise the check on B below would
	// pass vacuously if AutoResolve matched no rows at all.
	gotA, err := st.GetAlert(ctx, idA)
	if err != nil {
		t.Fatalf("get alert A: %v", err)
	}
	if gotA.ResolvedAt == nil {
		t.Errorf("group A still open after its own auto-resolve")
	}

	// Fail loudly on a lookup error rather than inspecting a zero/nil alert.
	gotB, err := st.GetAlert(ctx, idB)
	if err != nil {
		t.Fatalf("get alert B: %v", err)
	}
	if gotB.ResolvedAt != nil {
		t.Errorf("group B got auto-resolved by group A's recovery; resolved_at=%v", gotB.ResolvedAt)
	}
}
func TestAcknowledgeKeepsAlertOpen(t *testing.T) {
t.Parallel()
st, hostID := newTestStoreWithHost(t)
@@ -112,7 +144,7 @@ func TestAcknowledgeKeepsAlertOpen(t *testing.T) {
t.Fatalf("create user: %v", err)
}
id, _, err := st.RaiseOrTouch(ctx, hostID, "backup_failed", "warning", "m", time.Now().UTC())
id, _, err := st.RaiseOrTouch(ctx, hostID, "backup_failed", "", "warning", "m", time.Now().UTC())
if err != nil {
t.Fatalf("raise: %v", err)
}
@@ -140,8 +172,8 @@ func TestAutoResolveClearsOpenAlerts(t *testing.T) {
ctx := context.Background()
t0 := time.Now().UTC()
id, _, _ := st.RaiseOrTouch(ctx, hostID, "backup_failed", "warning", "m", t0)
if err := st.AutoResolve(ctx, hostID, "backup_failed", t0.Add(time.Minute)); err != nil {
id, _, _ := st.RaiseOrTouch(ctx, hostID, "backup_failed", "", "warning", "m", t0)
if err := st.AutoResolve(ctx, hostID, "backup_failed", "", t0.Add(time.Minute)); err != nil {
t.Fatalf("auto-resolve: %v", err)
}
got, _ := st.GetAlert(ctx, id)
@@ -157,8 +189,8 @@ func TestListAlertsFilters(t *testing.T) {
t0 := time.Now().UTC()
// One open warning + one resolved info.
_, _, _ = st.RaiseOrTouch(ctx, hostID, "backup_failed", "warning", "open", t0)
id2, _, _ := st.RaiseOrTouch(ctx, hostID, "stale_schedule", "info", "done", t0)
_, _, _ = st.RaiseOrTouch(ctx, hostID, "backup_failed", "", "warning", "open", t0)
id2, _, _ := st.RaiseOrTouch(ctx, hostID, "stale_schedule", "", "info", "done", t0)
_ = st.Resolve(ctx, id2, t0.Add(time.Minute))
open, err := st.ListAlerts(ctx, AlertFilter{Status: "open"})