feat(alerts): per-source-group dedup so two failing backups produce two alerts
Until now the open-alert key was (host_id, kind, resolved_at IS NULL). A host with two source groups both failing collapsed onto one backup_failed row — the second failure bumped last_seen_at and overwrote the message but never re-fanned out. Operators saw one alert that appeared to flap, not two distinct broken things.

Schema changes (column-level ALTER, no rebuild):
- 0015 jobs.source_group_id (FK → source_groups, ON DELETE SET NULL, index). Populated for backup jobs in CreateJob.
- 0016 alerts.dedup_key (NOT NULL DEFAULT ''). The old alerts_open partial index gets dropped and replaced with a UNIQUE partial index on (host_id, kind, dedup_key) WHERE resolved_at IS NULL — the index is now the actual dedup primitive.

Plumbing:
- RaiseOrTouch / AutoResolve / Alert struct gain dedup_key.
- engine.JobFinishedEvent gains SourceGroupID; handleJobFinished passes it through for backup_failed only (forget/prune/check stay repo-scoped with key='').
- ws.handler reads SourceGroupID off the freshly-loaded job row.
- dispatchJobWithPayload gains a *string sourceGroupID arg; the per-group Run-now path and schedule.fire path pass &g.ID.

Test coverage: TestRaiseOrTouchDedupsPerSourceGroup proves that two distinct groups produce two distinct open alerts and that resolving one does not auto-resolve the other.

Dev tool: cmd/_fake_alert gains a -dedup-key flag.
This commit is contained in:
@@ -33,7 +33,7 @@ func TestRaiseOrTouchInsertsThenTouches(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
|
||||
t0 := time.Now().UTC()
|
||||
id1, didRaise, err := st.RaiseOrTouch(ctx, hostID, "backup_failed", "warning",
|
||||
id1, didRaise, err := st.RaiseOrTouch(ctx, hostID, "backup_failed", "", "warning",
|
||||
"Backup failed: 401", t0)
|
||||
if err != nil {
|
||||
t.Fatalf("first raise: %v", err)
|
||||
@@ -47,7 +47,7 @@ func TestRaiseOrTouchInsertsThenTouches(t *testing.T) {
|
||||
|
||||
// Second call within the same open window should touch, not insert.
|
||||
t1 := t0.Add(60 * time.Second)
|
||||
id2, didRaise2, err := st.RaiseOrTouch(ctx, hostID, "backup_failed", "warning",
|
||||
id2, didRaise2, err := st.RaiseOrTouch(ctx, hostID, "backup_failed", "", "warning",
|
||||
"Backup failed: 401 (still)", t1)
|
||||
if err != nil {
|
||||
t.Fatalf("touch: %v", err)
|
||||
@@ -78,7 +78,7 @@ func TestResolveAndReRaiseStartsFreshAlert(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
|
||||
t0 := time.Now().UTC()
|
||||
id1, _, err := st.RaiseOrTouch(ctx, hostID, "backup_failed", "warning", "first", t0)
|
||||
id1, _, err := st.RaiseOrTouch(ctx, hostID, "backup_failed", "", "warning", "first", t0)
|
||||
if err != nil {
|
||||
t.Fatalf("raise: %v", err)
|
||||
}
|
||||
@@ -86,7 +86,7 @@ func TestResolveAndReRaiseStartsFreshAlert(t *testing.T) {
|
||||
t.Fatalf("resolve: %v", err)
|
||||
}
|
||||
|
||||
id2, didRaise, err := st.RaiseOrTouch(ctx, hostID, "backup_failed", "warning", "second", t0.Add(2*time.Minute))
|
||||
id2, didRaise, err := st.RaiseOrTouch(ctx, hostID, "backup_failed", "", "warning", "second", t0.Add(2*time.Minute))
|
||||
if err != nil {
|
||||
t.Fatalf("re-raise: %v", err)
|
||||
}
|
||||
@@ -98,6 +98,38 @@ func TestResolveAndReRaiseStartsFreshAlert(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// Two source groups failing on the same host produce two distinct
|
||||
// open alerts (not one collapsed). Pre-dedup-key, this would have
|
||||
// touched the existing row and silently dropped the second failure.
|
||||
func TestRaiseOrTouchDedupsPerSourceGroup(t *testing.T) {
|
||||
t.Parallel()
|
||||
st, hostID := newTestStoreWithHost(t)
|
||||
ctx := context.Background()
|
||||
t0 := time.Now().UTC()
|
||||
|
||||
idA, didA, err := st.RaiseOrTouch(ctx, hostID, "backup_failed", "group-a",
|
||||
"warning", "group A failed", t0)
|
||||
if err != nil || !didA {
|
||||
t.Fatalf("group A raise: id=%q didRaise=%v err=%v", idA, didA, err)
|
||||
}
|
||||
idB, didB, err := st.RaiseOrTouch(ctx, hostID, "backup_failed", "group-b",
|
||||
"warning", "group B failed", t0.Add(time.Second))
|
||||
if err != nil || !didB {
|
||||
t.Fatalf("group B raise: id=%q didRaise=%v err=%v", idB, didB, err)
|
||||
}
|
||||
if idA == idB {
|
||||
t.Fatalf("expected distinct alert ids per source group, got %q twice", idA)
|
||||
}
|
||||
// Resolving group A must not auto-resolve group B.
|
||||
if err := st.AutoResolve(ctx, hostID, "backup_failed", "group-a", t0.Add(2*time.Second)); err != nil {
|
||||
t.Fatalf("auto-resolve A: %v", err)
|
||||
}
|
||||
gotB, _ := st.GetAlert(ctx, idB)
|
||||
if gotB.ResolvedAt != nil {
|
||||
t.Errorf("group B got auto-resolved by group A's recovery; resolved_at=%v", gotB.ResolvedAt)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAcknowledgeKeepsAlertOpen(t *testing.T) {
|
||||
t.Parallel()
|
||||
st, hostID := newTestStoreWithHost(t)
|
||||
@@ -112,7 +144,7 @@ func TestAcknowledgeKeepsAlertOpen(t *testing.T) {
|
||||
t.Fatalf("create user: %v", err)
|
||||
}
|
||||
|
||||
id, _, err := st.RaiseOrTouch(ctx, hostID, "backup_failed", "warning", "m", time.Now().UTC())
|
||||
id, _, err := st.RaiseOrTouch(ctx, hostID, "backup_failed", "", "warning", "m", time.Now().UTC())
|
||||
if err != nil {
|
||||
t.Fatalf("raise: %v", err)
|
||||
}
|
||||
@@ -140,8 +172,8 @@ func TestAutoResolveClearsOpenAlerts(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
|
||||
t0 := time.Now().UTC()
|
||||
id, _, _ := st.RaiseOrTouch(ctx, hostID, "backup_failed", "warning", "m", t0)
|
||||
if err := st.AutoResolve(ctx, hostID, "backup_failed", t0.Add(time.Minute)); err != nil {
|
||||
id, _, _ := st.RaiseOrTouch(ctx, hostID, "backup_failed", "", "warning", "m", t0)
|
||||
if err := st.AutoResolve(ctx, hostID, "backup_failed", "", t0.Add(time.Minute)); err != nil {
|
||||
t.Fatalf("auto-resolve: %v", err)
|
||||
}
|
||||
got, _ := st.GetAlert(ctx, id)
|
||||
@@ -157,8 +189,8 @@ func TestListAlertsFilters(t *testing.T) {
|
||||
t0 := time.Now().UTC()
|
||||
|
||||
// One open warning + one resolved info.
|
||||
_, _, _ = st.RaiseOrTouch(ctx, hostID, "backup_failed", "warning", "open", t0)
|
||||
id2, _, _ := st.RaiseOrTouch(ctx, hostID, "stale_schedule", "info", "done", t0)
|
||||
_, _, _ = st.RaiseOrTouch(ctx, hostID, "backup_failed", "", "warning", "open", t0)
|
||||
id2, _, _ := st.RaiseOrTouch(ctx, hostID, "stale_schedule", "", "info", "done", t0)
|
||||
_ = st.Resolve(ctx, id2, t0.Add(time.Minute))
|
||||
|
||||
open, err := st.ListAlerts(ctx, AlertFilter{Status: "open"})
|
||||
|
||||
Reference in New Issue
Block a user