feat(alerts): per-source-group dedup so two failing backups produce two alerts
Until now the open-alert key was (host_id, kind, resolved_at IS NULL). A host with two source groups both failing collapsed onto one backup_failed row — the second failure bumped last_seen_at and overwrote the message but never triggered another notification fan-out. Operators saw one alert that appeared to flap, not two distinct broken things.

Schema changes (column-level ALTER, no rebuild):
- 0015 jobs.source_group_id (FK → source_groups, ON DELETE SET NULL, index). Populated for backup jobs in CreateJob.
- 0016 alerts.dedup_key (NOT NULL DEFAULT ''). The old alerts_open partial index gets dropped and replaced with a UNIQUE partial index on (host_id, kind, dedup_key) WHERE resolved_at IS NULL — the index is now the actual dedup primitive.

Plumbing:
- RaiseOrTouch / AutoResolve / Alert struct gain dedup_key.
- engine.JobFinishedEvent gains SourceGroupID; handleJobFinished passes it through for backup_failed only (forget/prune/check stay repo-scoped with key='').
- ws.handler reads SourceGroupID off the freshly-loaded job row.
- dispatchJobWithPayload gains a *string sourceGroupID arg; the per-group Run-now path and schedule.fire path pass &g.ID.

Test coverage: TestRaiseOrTouchDedupsPerSourceGroup proves two distinct groups produce two distinct open alerts and that resolving one does not auto-resolve the other.

Dev tool: cmd/_fake_alert gains -dedup-key flag.
This commit is contained in:
+20
-10
@@ -26,11 +26,12 @@ import (
|
||||
// the failed-X rules. Pushed via Engine.NotifyJobFinished from the
|
||||
// MarkJobFinished site.
|
||||
type JobFinishedEvent struct {
|
||||
HostID string
|
||||
JobID string
|
||||
Kind string // backup | forget | prune | check | unlock | restore | diff
|
||||
Status string // succeeded | failed | cancelled
|
||||
When time.Time
|
||||
HostID string
|
||||
JobID string
|
||||
Kind string // backup | forget | prune | check | unlock | restore | diff
|
||||
Status string // succeeded | failed | cancelled
|
||||
SourceGroupID string // source group of a backup job, used as its alert dedup key; forget/prune/check alerts use an empty key
|
||||
When time.Time
|
||||
}
|
||||
|
||||
// Engine evaluates hardcoded alert rules and dispatches via notification.Hub.
|
||||
@@ -133,12 +134,21 @@ func (e *Engine) handleJobFinished(ctx context.Context, ev JobFinishedEvent) {
|
||||
default:
|
||||
return
|
||||
}
|
||||
// dedupKey scopes the alert to a specific subject. For backups it's
|
||||
// the source-group id (each group = its own restic run = its own
|
||||
// failure surface). forget/prune/check are repo-scoped — leave the
|
||||
// key empty so we get one alert per host per kind, matching the
|
||||
// "is this repo healthy?" mental model.
|
||||
dedupKey := ""
|
||||
if ev.Kind == "backup" {
|
||||
dedupKey = ev.SourceGroupID
|
||||
}
|
||||
switch ev.Status {
|
||||
case "failed":
|
||||
e.raiseAndNotify(ctx, ev.HostID, kind, severity,
|
||||
e.raiseAndNotify(ctx, ev.HostID, kind, dedupKey, severity,
|
||||
fmt.Sprintf("%s job %s failed", ev.Kind, ev.JobID), ev.When)
|
||||
case "succeeded":
|
||||
e.resolveAndNotify(ctx, ev.HostID, kind, ev.When)
|
||||
e.resolveAndNotify(ctx, ev.HostID, kind, dedupKey, ev.When)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -157,14 +167,14 @@ func (e *Engine) handleHostOffline(ctx context.Context, hostID string) {
|
||||
if time.Since(*host.LastSeenAt) < e.agentOfflineFloor {
|
||||
return
|
||||
}
|
||||
e.raiseAndNotify(ctx, hostID, KindAgentOffline, "warning",
|
||||
e.raiseAndNotify(ctx, hostID, KindAgentOffline, "", "warning",
|
||||
fmt.Sprintf("Agent offline for %s (threshold %s)",
|
||||
roundDur(time.Since(*host.LastSeenAt)), e.agentOfflineFloor),
|
||||
time.Now().UTC())
|
||||
}
|
||||
|
||||
func (e *Engine) handleHostOnline(ctx context.Context, hostID string) {
|
||||
e.resolveAndNotify(ctx, hostID, KindAgentOffline, time.Now().UTC())
|
||||
e.resolveAndNotify(ctx, hostID, KindAgentOffline, "", time.Now().UTC())
|
||||
}
|
||||
|
||||
// tick is the 60-second sweep. Responsibilities:
|
||||
@@ -186,7 +196,7 @@ func (e *Engine) tick(ctx context.Context, now time.Time) {
|
||||
continue
|
||||
}
|
||||
if now.Sub(*h.LastSeenAt) >= e.agentOfflineFloor {
|
||||
e.raiseAndNotify(ctx, h.ID, KindAgentOffline, "warning",
|
||||
e.raiseAndNotify(ctx, h.ID, KindAgentOffline, "", "warning",
|
||||
fmt.Sprintf("Agent offline for %s (threshold %s)",
|
||||
roundDur(now.Sub(*h.LastSeenAt)), e.agentOfflineFloor), now)
|
||||
}
|
||||
|
||||
+11
-11
@@ -42,10 +42,10 @@ const (
|
||||
// deduplicates, and notification.Hub.Dispatch fires only on the first
|
||||
// raise (didRaise=true). Subsequent occurrences of the same open alert
|
||||
// are "touched" (last_seen_at bumped) without a second notification.
|
||||
func (e *Engine) raiseAndNotify(ctx context.Context, hostID, kind, severity, message string, when time.Time) {
|
||||
id, didRaise, err := e.store.RaiseOrTouch(ctx, hostID, kind, severity, message, when)
|
||||
func (e *Engine) raiseAndNotify(ctx context.Context, hostID, kind, dedupKey, severity, message string, when time.Time) {
|
||||
id, didRaise, err := e.store.RaiseOrTouch(ctx, hostID, kind, dedupKey, severity, message, when)
|
||||
if err != nil {
|
||||
slog.Warn("alert: raise", "kind", kind, "host_id", hostID, "err", err)
|
||||
slog.Warn("alert: raise", "kind", kind, "host_id", hostID, "dedup_key", dedupKey, "err", err)
|
||||
return
|
||||
}
|
||||
if !didRaise {
|
||||
@@ -122,11 +122,11 @@ func alertPayload(ctx context.Context, st *store.Store, ev notification.Event, a
|
||||
}
|
||||
}
|
||||
|
||||
// resolveAndNotify clears every open (or acknowledged) alert for
|
||||
// (host_id, kind) via store.AutoResolve, then fires alert.resolved
|
||||
// for each row that was actually open. Best-effort — errors are
|
||||
// logged but do not propagate.
|
||||
func (e *Engine) resolveAndNotify(ctx context.Context, hostID, kind string, when time.Time) {
|
||||
// resolveAndNotify clears the open (or acknowledged) alert matching
|
||||
// (host_id, kind, dedup_key) via store.AutoResolve, then fires
|
||||
// alert.resolved for the row(s) actually closed. Best-effort —
|
||||
// errors are logged but do not propagate.
|
||||
func (e *Engine) resolveAndNotify(ctx context.Context, hostID, kind, dedupKey string, when time.Time) {
|
||||
open, err := e.store.ListAlerts(ctx, store.AlertFilter{
|
||||
Status: "open", HostID: hostID,
|
||||
})
|
||||
@@ -137,8 +137,8 @@ func (e *Engine) resolveAndNotify(ctx context.Context, hostID, kind string, when
|
||||
Status: "acknowledged", HostID: hostID,
|
||||
})
|
||||
all := append(open, openAcked...)
|
||||
if err := e.store.AutoResolve(ctx, hostID, kind, when); err != nil {
|
||||
slog.Warn("alert: auto-resolve", "kind", kind, "host_id", hostID, "err", err)
|
||||
if err := e.store.AutoResolve(ctx, hostID, kind, dedupKey, when); err != nil {
|
||||
slog.Warn("alert: auto-resolve", "kind", kind, "host_id", hostID, "dedup_key", dedupKey, "err", err)
|
||||
return
|
||||
}
|
||||
host, _ := e.store.GetHost(ctx, hostID)
|
||||
@@ -147,7 +147,7 @@ func (e *Engine) resolveAndNotify(ctx context.Context, hostID, kind string, when
|
||||
hostName = host.Name
|
||||
}
|
||||
for _, a := range all {
|
||||
if a.Kind != kind {
|
||||
if a.Kind != kind || a.DedupKey != dedupKey {
|
||||
continue
|
||||
}
|
||||
go e.hub.Dispatch(ctx, notification.Payload{
|
||||
|
||||
Reference in New Issue
Block a user