feat(alerts): per-source-group dedup so two failing backups produce two alerts
Until now the open-alert key was (host_id, kind, resolved_at IS NULL). A host with two source groups both failing collapsed onto one backup_failed row — the second failure bumped last_seen_at and overwrote the message but never re-fanned out. Operators saw one alert that appeared to flap, not two distinct broken things. Schema changes (column-level ALTER, no rebuild): - 0015 jobs.source_group_id (FK → source_groups, ON DELETE SET NULL, index). Populated for backup jobs in CreateJob. - 0016 alerts.dedup_key (NOT NULL DEFAULT ''). The old alerts_open partial index gets dropped and replaced with a UNIQUE partial index on (host_id, kind, dedup_key) WHERE resolved_at IS NULL — the index is now the actual dedup primitive. Plumbing: - RaiseOrTouch / AutoResolve / Alert struct gain dedup_key. - engine.JobFinishedEvent gains SourceGroupID; handleJobFinished passes it through for backup_failed only (forget/prune/check stay repo-scoped with key=''). - ws.handler reads SourceGroupID off the freshly-loaded job row. - dispatchJobWithPayload gains a *string sourceGroupID arg; the per-group Run-now path and schedule.fire path pass &g.ID. Test coverage: TestRaiseOrTouchDedupsPerSourceGroup proves two distinct groups produce two distinct open alerts and that resolving one does not auto-resolve the other. Dev tool: cmd/_fake_alert gains -dedup-key flag.
This commit is contained in:
@@ -65,7 +65,7 @@ func (s *Server) handleRunNow(w stdhttp.ResponseWriter, r *stdhttp.Request) {
|
||||
func (s *Server) dispatchJob(ctx context.Context, user *store.User,
|
||||
hostID string, kind api.JobKind, args []string,
|
||||
) (res runNowResponse, status int, code, msg string) {
|
||||
return s.dispatchJobWithPayload(ctx, user, hostID, kind, api.CommandRunPayload{
|
||||
return s.dispatchJobWithPayload(ctx, user, hostID, kind, nil, api.CommandRunPayload{
|
||||
Kind: kind,
|
||||
Args: args,
|
||||
})
|
||||
@@ -75,8 +75,12 @@ func (s *Server) dispatchJob(ctx context.Context, user *store.User,
|
||||
// fill in structured fields (Includes/Excludes/Tag/ForgetGroups/RequiresAdminCreds)
|
||||
// — used by the per-source-group Run-now path. JobID is filled in
|
||||
// here; callers leave it zero on the input payload.
|
||||
//
|
||||
// sourceGroupID is the dedup key the alert engine will key on for
|
||||
// backup_failed. Pass non-nil for backups; nil for forget/prune/check/unlock
|
||||
// (those are repo-scoped and dedup on (host_id, kind) with an empty key).
|
||||
func (s *Server) dispatchJobWithPayload(ctx context.Context, user *store.User,
|
||||
hostID string, kind api.JobKind, payload api.CommandRunPayload,
|
||||
hostID string, kind api.JobKind, sourceGroupID *string, payload api.CommandRunPayload,
|
||||
) (res runNowResponse, status int, code, msg string) {
|
||||
if !validJobKind(kind) {
|
||||
return res, stdhttp.StatusBadRequest, "invalid_kind",
|
||||
@@ -100,12 +104,13 @@ func (s *Server) dispatchJobWithPayload(ctx context.Context, user *store.User,
|
||||
actorID = &user.ID
|
||||
}
|
||||
if err := s.deps.Store.CreateJob(ctx, store.Job{
|
||||
ID: jobID,
|
||||
HostID: host.ID,
|
||||
Kind: string(kind),
|
||||
ActorKind: actor,
|
||||
ActorID: actorID,
|
||||
CreatedAt: now,
|
||||
ID: jobID,
|
||||
HostID: host.ID,
|
||||
Kind: string(kind),
|
||||
SourceGroupID: sourceGroupID,
|
||||
ActorKind: actor,
|
||||
ActorID: actorID,
|
||||
CreatedAt: now,
|
||||
}); err != nil {
|
||||
return res, stdhttp.StatusInternalServerError, "internal", ""
|
||||
}
|
||||
|
||||
@@ -43,7 +43,7 @@ func (s *Server) DispatchMaintenance(ctx context.Context, decisions []maintenanc
|
||||
"host_id", d.HostID)
|
||||
continue
|
||||
}
|
||||
_, _, code, msg := s.dispatchJobWithPayload(ctx, nil, d.HostID, api.JobForget, payload)
|
||||
_, _, code, msg := s.dispatchJobWithPayload(ctx, nil, d.HostID, api.JobForget, nil, payload)
|
||||
if code != "" {
|
||||
slog.Warn("maintenance: forget dispatch failed",
|
||||
"host_id", d.HostID, "code", code, "msg", msg)
|
||||
@@ -65,14 +65,14 @@ func (s *Server) DispatchMaintenance(ctx context.Context, decisions []maintenanc
|
||||
continue
|
||||
}
|
||||
payload := api.CommandRunPayload{RequiresAdminCreds: true}
|
||||
_, _, code, msg := s.dispatchJobWithPayload(ctx, nil, d.HostID, api.JobPrune, payload)
|
||||
_, _, code, msg := s.dispatchJobWithPayload(ctx, nil, d.HostID, api.JobPrune, nil, payload)
|
||||
if code != "" {
|
||||
slog.Warn("maintenance: prune dispatch failed",
|
||||
"host_id", d.HostID, "code", code, "msg", msg)
|
||||
}
|
||||
case "check":
|
||||
payload := api.CommandRunPayload{Args: []string{strconv.Itoa(d.SubsetPct)}}
|
||||
_, _, code, msg := s.dispatchJobWithPayload(ctx, nil, d.HostID, api.JobCheck, payload)
|
||||
_, _, code, msg := s.dispatchJobWithPayload(ctx, nil, d.HostID, api.JobCheck, nil, payload)
|
||||
if code != "" {
|
||||
slog.Warn("maintenance: check dispatch failed",
|
||||
"host_id", d.HostID, "code", code, "msg", msg)
|
||||
|
||||
@@ -52,7 +52,7 @@ func (s *Server) handleRunRepoPrune(w stdhttp.ResponseWriter, r *stdhttp.Request
|
||||
return
|
||||
}
|
||||
|
||||
res, status, code, msg := s.dispatchJobWithPayload(r.Context(), user, hostID, api.JobPrune,
|
||||
res, status, code, msg := s.dispatchJobWithPayload(r.Context(), user, hostID, api.JobPrune, nil,
|
||||
api.CommandRunPayload{RequiresAdminCreds: true})
|
||||
if code != "" {
|
||||
s.runOpError(w, r, status, code, msg)
|
||||
@@ -107,7 +107,7 @@ func (s *Server) handleRunRepoCheck(w stdhttp.ResponseWriter, r *stdhttp.Request
|
||||
// Non-numeric ?subset silently falls back to DB value.
|
||||
}
|
||||
|
||||
res, status, code, msg := s.dispatchJobWithPayload(r.Context(), user, hostID, api.JobCheck,
|
||||
res, status, code, msg := s.dispatchJobWithPayload(r.Context(), user, hostID, api.JobCheck, nil,
|
||||
api.CommandRunPayload{Args: []string{strconv.Itoa(subset)}})
|
||||
if code != "" {
|
||||
s.runOpError(w, r, status, code, msg)
|
||||
@@ -134,7 +134,7 @@ func (s *Server) handleRunRepoUnlock(w stdhttp.ResponseWriter, r *stdhttp.Reques
|
||||
return
|
||||
}
|
||||
|
||||
res, status, code, msg := s.dispatchJobWithPayload(r.Context(), user, hostID, api.JobUnlock,
|
||||
res, status, code, msg := s.dispatchJobWithPayload(r.Context(), user, hostID, api.JobUnlock, nil,
|
||||
api.CommandRunPayload{})
|
||||
if code != "" {
|
||||
s.runOpError(w, r, status, code, msg)
|
||||
|
||||
@@ -88,7 +88,7 @@ func (s *Server) handleRunSourceGroup(w stdhttp.ResponseWriter, r *stdhttp.Reque
|
||||
|
||||
// Backup invocations don't consume RetentionPolicy — that lives on
|
||||
// forget. Sending the resolved set here would just be dead weight.
|
||||
res, status, code, msg := s.dispatchJobWithPayload(r.Context(), user, hostID, api.JobBackup,
|
||||
res, status, code, msg := s.dispatchJobWithPayload(r.Context(), user, hostID, api.JobBackup, &g.ID,
|
||||
api.CommandRunPayload{
|
||||
Includes: g.Includes,
|
||||
Excludes: g.Excludes,
|
||||
|
||||
@@ -180,13 +180,15 @@ func (s *Server) dispatchBackupForGroupCore(ctx context.Context, conn *ws.Conn,
|
||||
jobID := ulid.Make().String()
|
||||
now := time.Now().UTC()
|
||||
scheduleRef := scheduleID
|
||||
groupRef := g.ID
|
||||
if err := s.deps.Store.CreateJob(ctx, store.Job{
|
||||
ID: jobID,
|
||||
HostID: hostID,
|
||||
Kind: string(api.JobBackup),
|
||||
ScheduledID: &scheduleRef,
|
||||
ActorKind: "schedule",
|
||||
CreatedAt: now,
|
||||
ID: jobID,
|
||||
HostID: hostID,
|
||||
Kind: string(api.JobBackup),
|
||||
ScheduledID: &scheduleRef,
|
||||
SourceGroupID: &groupRef,
|
||||
ActorKind: "schedule",
|
||||
CreatedAt: now,
|
||||
}); err != nil {
|
||||
slog.Warn("schedule.fire: persist job", "host_id", hostID,
|
||||
"schedule_id", scheduleID, "group", g.Name, "err", err)
|
||||
|
||||
@@ -17,7 +17,7 @@ func TestAPIAlertsListsOpen(t *testing.T) {
|
||||
srv, ts, st := rawTestServer(t)
|
||||
hostID, _ := enrolHostForWS(t, srv, st, "host-alerts")
|
||||
_, _, _ = st.RaiseOrTouch(context.Background(), hostID,
|
||||
"backup_failed", "warning", "x", time.Now().UTC())
|
||||
"backup_failed", "", "warning", "x", time.Now().UTC())
|
||||
cookie := loginAsAdmin(t, st)
|
||||
|
||||
req, _ := stdhttp.NewRequest("GET", ts.URL+"/api/alerts?status=open", nil)
|
||||
|
||||
Reference in New Issue
Block a user