feat(alerts): per-source-group dedup so two failing backups produce two alerts

Until now the open-alert key was (host_id, kind, resolved_at IS NULL).
A host with two source groups both failing collapsed onto one
backup_failed row — the second failure bumped last_seen_at and
overwrote the message but never fanned out again. Operators saw one
alert that appeared to flap, not two distinct broken things.

Schema changes (column-level ALTER, no rebuild):

- 0015 jobs.source_group_id (FK → source_groups, ON DELETE SET NULL,
  index). Populated for backup jobs in CreateJob.
- 0016 alerts.dedup_key (NOT NULL DEFAULT ''). The old alerts_open
  partial index gets dropped and replaced with a UNIQUE partial
  index on (host_id, kind, dedup_key) WHERE resolved_at IS NULL —
  the index is now the actual dedup primitive.

Plumbing:

- RaiseOrTouch / AutoResolve / Alert struct gain dedup_key.
- engine.JobFinishedEvent gains SourceGroupID; handleJobFinished
  passes it through for backup_failed only (forget/prune/check stay
  repo-scoped with key='').
- ws.handler reads SourceGroupID off the freshly-loaded job row.
- dispatchJobWithPayload gains a *string sourceGroupID arg; the
  per-group Run-now path and schedule.fire path pass &g.ID.

Test coverage: TestRaiseOrTouchDedupsPerSourceGroup proves two
distinct groups produce two distinct open alerts and that resolving
one does not auto-resolve the other.

Dev tool: cmd/_fake_alert gains -dedup-key flag.
This commit is contained in:
2026-05-04 22:58:29 +01:00
parent 9d7a714102
commit 350be3f19d
15 changed files with 214 additions and 95 deletions
+20 -10
View File
@@ -26,11 +26,12 @@ import (
// the failed-X rules. Pushed via Engine.NotifyJobFinished from the
// MarkJobFinished site.
type JobFinishedEvent struct {
HostID string
JobID string
Kind string // backup | forget | prune | check | unlock | restore | diff
Status string // succeeded | failed | cancelled
When time.Time
HostID string
JobID string
Kind string // backup | forget | prune | check | unlock | restore | diff
Status string // succeeded | failed | cancelled
SourceGroupID string // source group of the job; used as the alert dedup key for backup only (forget/prune/check use an empty key)
When time.Time
}
// Engine evaluates hardcoded alert rules and dispatches via notification.Hub.
@@ -133,12 +134,21 @@ func (e *Engine) handleJobFinished(ctx context.Context, ev JobFinishedEvent) {
default:
return
}
// dedupKey scopes the alert to a specific subject. For backups it's
// the source-group id (each group = its own restic run = its own
// failure surface). forget/prune/check are repo-scoped — leave the
// key empty so we get one alert per host per kind, matching the
// "is this repo healthy?" mental model.
dedupKey := ""
if ev.Kind == "backup" {
dedupKey = ev.SourceGroupID
}
switch ev.Status {
case "failed":
e.raiseAndNotify(ctx, ev.HostID, kind, severity,
e.raiseAndNotify(ctx, ev.HostID, kind, dedupKey, severity,
fmt.Sprintf("%s job %s failed", ev.Kind, ev.JobID), ev.When)
case "succeeded":
e.resolveAndNotify(ctx, ev.HostID, kind, ev.When)
e.resolveAndNotify(ctx, ev.HostID, kind, dedupKey, ev.When)
}
}
@@ -157,14 +167,14 @@ func (e *Engine) handleHostOffline(ctx context.Context, hostID string) {
if time.Since(*host.LastSeenAt) < e.agentOfflineFloor {
return
}
e.raiseAndNotify(ctx, hostID, KindAgentOffline, "warning",
e.raiseAndNotify(ctx, hostID, KindAgentOffline, "", "warning",
fmt.Sprintf("Agent offline for %s (threshold %s)",
roundDur(time.Since(*host.LastSeenAt)), e.agentOfflineFloor),
time.Now().UTC())
}
// handleHostOnline resolves the host's agent-offline alert, stamped with
// the current UTC time. The dedup key is empty: agent-offline alerts are
// keyed by host and kind only.
func (e *Engine) handleHostOnline(ctx context.Context, hostID string) {
e.resolveAndNotify(ctx, hostID, KindAgentOffline, time.Now().UTC())
e.resolveAndNotify(ctx, hostID, KindAgentOffline, "", time.Now().UTC())
}
// tick is the 60-second sweep. Responsibilities:
@@ -186,7 +196,7 @@ func (e *Engine) tick(ctx context.Context, now time.Time) {
continue
}
if now.Sub(*h.LastSeenAt) >= e.agentOfflineFloor {
e.raiseAndNotify(ctx, h.ID, KindAgentOffline, "warning",
e.raiseAndNotify(ctx, h.ID, KindAgentOffline, "", "warning",
fmt.Sprintf("Agent offline for %s (threshold %s)",
roundDur(now.Sub(*h.LastSeenAt)), e.agentOfflineFloor), now)
}
+11 -11
View File
@@ -42,10 +42,10 @@ const (
// deduplicates, and notification.Hub.Dispatch fires only on the first
// raise (didRaise=true). Subsequent occurrences of the same open alert
// are "touched" (last_seen_at bumped) without a second notification.
func (e *Engine) raiseAndNotify(ctx context.Context, hostID, kind, severity, message string, when time.Time) {
id, didRaise, err := e.store.RaiseOrTouch(ctx, hostID, kind, severity, message, when)
func (e *Engine) raiseAndNotify(ctx context.Context, hostID, kind, dedupKey, severity, message string, when time.Time) {
id, didRaise, err := e.store.RaiseOrTouch(ctx, hostID, kind, dedupKey, severity, message, when)
if err != nil {
slog.Warn("alert: raise", "kind", kind, "host_id", hostID, "err", err)
slog.Warn("alert: raise", "kind", kind, "host_id", hostID, "dedup_key", dedupKey, "err", err)
return
}
if !didRaise {
@@ -122,11 +122,11 @@ func alertPayload(ctx context.Context, st *store.Store, ev notification.Event, a
}
}
// resolveAndNotify clears every open (or acknowledged) alert for
// (host_id, kind) via store.AutoResolve, then fires alert.resolved
// for each row that was actually open. Best-effort — errors are
// logged but do not propagate.
func (e *Engine) resolveAndNotify(ctx context.Context, hostID, kind string, when time.Time) {
// resolveAndNotify clears the open (or acknowledged) alert matching
// (host_id, kind, dedup_key) via store.AutoResolve, then fires
// alert.resolved for the row(s) actually closed. Best-effort —
// errors are logged but do not propagate.
func (e *Engine) resolveAndNotify(ctx context.Context, hostID, kind, dedupKey string, when time.Time) {
open, err := e.store.ListAlerts(ctx, store.AlertFilter{
Status: "open", HostID: hostID,
})
@@ -137,8 +137,8 @@ func (e *Engine) resolveAndNotify(ctx context.Context, hostID, kind string, when
Status: "acknowledged", HostID: hostID,
})
all := append(open, openAcked...)
if err := e.store.AutoResolve(ctx, hostID, kind, when); err != nil {
slog.Warn("alert: auto-resolve", "kind", kind, "host_id", hostID, "err", err)
if err := e.store.AutoResolve(ctx, hostID, kind, dedupKey, when); err != nil {
slog.Warn("alert: auto-resolve", "kind", kind, "host_id", hostID, "dedup_key", dedupKey, "err", err)
return
}
host, _ := e.store.GetHost(ctx, hostID)
@@ -147,7 +147,7 @@ func (e *Engine) resolveAndNotify(ctx context.Context, hostID, kind string, when
hostName = host.Name
}
for _, a := range all {
if a.Kind != kind {
if a.Kind != kind || a.DedupKey != dedupKey {
continue
}
go e.hub.Dispatch(ctx, notification.Payload{