feat(alerts): per-source-group dedup so two failing backups produce two alerts
Until now the open-alert key was (host_id, kind, resolved_at IS NULL). A host with two source groups both failing collapsed onto one backup_failed row — the second failure bumped last_seen_at and overwrote the message but never re-fanned out. Operators saw one alert that appeared to flap, not two distinct broken things. Schema changes (column-level ALTER, no rebuild): - 0015 jobs.source_group_id (FK → source_groups, ON DELETE SET NULL, index). Populated for backup jobs in CreateJob. - 0016 alerts.dedup_key (NOT NULL DEFAULT ''). The old alerts_open partial index gets dropped and replaced with a UNIQUE partial index on (host_id, kind, dedup_key) WHERE resolved_at IS NULL — the index is now the actual dedup primitive. Plumbing: - RaiseOrTouch / AutoResolve / Alert struct gain dedup_key. - engine.JobFinishedEvent gains SourceGroupID; handleJobFinished passes it through for backup_failed only (forget/prune/check stay repo-scoped with key=''). - ws.handler reads SourceGroupID off the freshly-loaded job row. - dispatchJobWithPayload gains a *string sourceGroupID arg; the per-group Run-now path and schedule.fire path pass &g.ID. Test coverage: TestRaiseOrTouchDedupsPerSourceGroup proves two distinct groups produce two distinct open alerts and that resolving one does not auto-resolve the other. Dev tool: cmd/_fake_alert gains -dedup-key flag.
This commit is contained in:
+31
-18
@@ -21,11 +21,16 @@ type AlertFilter struct {
|
||||
}
|
||||
|
||||
// RaiseOrTouch implements the dedup + last_seen_at bump pattern. If
|
||||
// an alert with (host_id, kind, resolved_at IS NULL) already exists,
|
||||
// it touches last_seen_at + message and returns (id, false). Otherwise
|
||||
// inserts a fresh row and returns (id, true). Caller fires a
|
||||
// notification only when didRaise=true.
|
||||
func (s *Store) RaiseOrTouch(ctx context.Context, hostID, kind, severity, message string, when time.Time) (id string, didRaise bool, err error) {
|
||||
// an alert with (host_id, kind, dedup_key, resolved_at IS NULL)
|
||||
// already exists, it touches last_seen_at + message and returns
|
||||
// (id, false). Otherwise inserts a fresh row and returns (id, true).
|
||||
// Caller fires a notification only when didRaise=true.
|
||||
//
|
||||
// dedupKey is the source-group id for backup
|
||||
// failures (so two failing groups on the same host produce two open
|
||||
// alerts), and the empty string for repo-scoped forget/prune/check
|
||||
|
||||
// failures and one-per-host alerts like agent_offline / stale_schedule.
|
||||
func (s *Store) RaiseOrTouch(ctx context.Context, hostID, kind, dedupKey, severity, message string, when time.Time) (id string, didRaise bool, err error) {
|
||||
tx, err := s.db.BeginTx(ctx, nil)
|
||||
if err != nil {
|
||||
return "", false, fmt.Errorf("store: begin: %w", err)
|
||||
@@ -33,8 +38,10 @@ func (s *Store) RaiseOrTouch(ctx context.Context, hostID, kind, severity, messag
|
||||
defer func() { _ = tx.Rollback() }()
|
||||
|
||||
row := tx.QueryRowContext(ctx,
|
||||
`SELECT id FROM alerts WHERE host_id = ? AND kind = ? AND resolved_at IS NULL LIMIT 1`,
|
||||
hostID, kind)
|
||||
`SELECT id FROM alerts
|
||||
WHERE host_id = ? AND kind = ? AND dedup_key = ? AND resolved_at IS NULL
|
||||
LIMIT 1`,
|
||||
hostID, kind, dedupKey)
|
||||
var existing string
|
||||
switch err := row.Scan(&existing); {
|
||||
case err == nil:
|
||||
@@ -57,9 +64,9 @@ func (s *Store) RaiseOrTouch(ctx context.Context, hostID, kind, severity, messag
|
||||
id = ulid.Make().String()
|
||||
whenStr := when.UTC().Format(time.RFC3339Nano)
|
||||
_, err = tx.ExecContext(ctx,
|
||||
`INSERT INTO alerts (id, host_id, kind, severity, message, created_at, last_seen_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?)`,
|
||||
id, hostID, kind, severity, message, whenStr, whenStr)
|
||||
`INSERT INTO alerts (id, host_id, kind, dedup_key, severity, message, created_at, last_seen_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?)`,
|
||||
id, hostID, kind, dedupKey, severity, message, whenStr, whenStr)
|
||||
if err != nil {
|
||||
return "", false, fmt.Errorf("store: insert alert: %w", err)
|
||||
}
|
||||
@@ -127,14 +134,20 @@ func (s *Store) Resolve(ctx context.Context, id string, when time.Time) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// AutoResolve closes every open alert for the (host_id, kind) pair.
|
||||
// AutoResolve closes the open alert for (host_id, kind, dedup_key).
|
||||
// Used by the engine when a rule's underlying condition clears (e.g.
|
||||
// next backup succeeded so backup_failed clears).
|
||||
func (s *Store) AutoResolve(ctx context.Context, hostID, kind string, when time.Time) error {
|
||||
// next backup succeeded for the same source group so backup_failed
|
||||
// clears). Pass dedupKey="" for one-per-host alerts (agent_offline).
|
||||
//
|
||||
// Closes only the dedup-key-matching row, not every open alert of
|
||||
// the same kind on the host — distinct source groups now have
|
||||
// distinct rows and a recovery in one shouldn't auto-resolve the
|
||||
// others.
|
||||
func (s *Store) AutoResolve(ctx context.Context, hostID, kind, dedupKey string, when time.Time) error {
|
||||
_, err := s.db.ExecContext(ctx,
|
||||
`UPDATE alerts SET resolved_at = ?
|
||||
WHERE host_id = ? AND kind = ? AND resolved_at IS NULL`,
|
||||
when.UTC().Format(time.RFC3339Nano), hostID, kind)
|
||||
WHERE host_id = ? AND kind = ? AND dedup_key = ? AND resolved_at IS NULL`,
|
||||
when.UTC().Format(time.RFC3339Nano), hostID, kind, dedupKey)
|
||||
if err != nil {
|
||||
return fmt.Errorf("store: auto-resolve: %w", err)
|
||||
}
|
||||
@@ -145,7 +158,7 @@ func (s *Store) AutoResolve(ctx context.Context, hostID, kind string, when time.
|
||||
// GetAlert reads one row.
|
||||
func (s *Store) GetAlert(ctx context.Context, id string) (*Alert, error) {
|
||||
row := s.db.QueryRowContext(ctx,
|
||||
`SELECT id, host_id, kind, severity, message, created_at, last_seen_at,
|
||||
`SELECT id, host_id, kind, dedup_key, severity, message, created_at, last_seen_at,
|
||||
acknowledged_at, acknowledged_by, resolved_at
|
||||
FROM alerts WHERE id = ?`, id)
|
||||
return scanAlert(row.Scan)
|
||||
@@ -153,7 +166,7 @@ func (s *Store) GetAlert(ctx context.Context, id string) (*Alert, error) {
|
||||
|
||||
// ListAlerts is the filtered list. Sort: open-first, then by created_at desc.
|
||||
func (s *Store) ListAlerts(ctx context.Context, f AlertFilter) ([]Alert, error) {
|
||||
q := `SELECT id, host_id, kind, severity, message, created_at, last_seen_at,
|
||||
q := `SELECT id, host_id, kind, dedup_key, severity, message, created_at, last_seen_at,
|
||||
acknowledged_at, acknowledged_by, resolved_at FROM alerts`
|
||||
conds := []string{}
|
||||
args := []any{}
|
||||
@@ -209,7 +222,7 @@ func scanAlert(scan func(...any) error) (*Alert, error) {
|
||||
var a Alert
|
||||
var hostID, lastSeen, ackedAt, ackedBy, resolvedAt sql.NullString
|
||||
var createdAt string
|
||||
if err := scan(&a.ID, &hostID, &a.Kind, &a.Severity, &a.Message,
|
||||
if err := scan(&a.ID, &hostID, &a.Kind, &a.DedupKey, &a.Severity, &a.Message,
|
||||
&createdAt, &lastSeen, &ackedAt, &ackedBy, &resolvedAt); err != nil {
|
||||
if errors.Is(err, sql.ErrNoRows) {
|
||||
return nil, ErrNotFound
|
||||
|
||||
Reference in New Issue
Block a user