feat(alerts): per-source-group dedup so two failing backups produce two alerts
Until now the open-alert key was (host_id, kind, resolved_at IS NULL). A host with two source groups that were both failing collapsed onto one backup_failed row — the second failure bumped last_seen_at and overwrote the message but never fanned out again. Operators saw one alert that appeared to flap, not two distinct broken things.

Schema changes (column-level ALTER, no rebuild):
- 0015 jobs.source_group_id (FK → source_groups, ON DELETE SET NULL, index). Populated for backup jobs in CreateJob.
- 0016 alerts.dedup_key (NOT NULL DEFAULT ''). The old alerts_open partial index gets dropped and replaced with a UNIQUE partial index on (host_id, kind, dedup_key) WHERE resolved_at IS NULL — the index is now the actual dedup primitive.

Plumbing:
- RaiseOrTouch / AutoResolve / Alert struct gain dedup_key.
- engine.JobFinishedEvent gains SourceGroupID; handleJobFinished passes it through for backup_failed only (forget/prune/check stay repo-scoped with key='').
- ws.handler reads SourceGroupID off the freshly-loaded job row.
- dispatchJobWithPayload gains a *string sourceGroupID arg; the per-group Run-now path and schedule.fire path pass &g.ID.

Test coverage: TestRaiseOrTouchDedupsPerSourceGroup proves that two distinct groups produce two distinct open alerts and that resolving one does not auto-resolve the other.

Dev tool: cmd/_fake_alert gains a -dedup-key flag.
This commit is contained in:
+32
-20
@@ -11,19 +11,20 @@ import (
|
||||
|
||||
// Job mirrors the jobs table.
|
||||
type Job struct {
|
||||
ID string
|
||||
HostID string
|
||||
Kind string
|
||||
Status string
|
||||
ScheduledID *string
|
||||
ActorKind string // user|schedule|system
|
||||
ActorID *string
|
||||
StartedAt *time.Time
|
||||
FinishedAt *time.Time
|
||||
ExitCode *int
|
||||
Stats json.RawMessage
|
||||
Error *string
|
||||
CreatedAt time.Time
|
||||
ID string
|
||||
HostID string
|
||||
Kind string
|
||||
Status string
|
||||
ScheduledID *string
|
||||
SourceGroupID *string // populated for backup jobs; alert engine dedup key
|
||||
ActorKind string // user|schedule|system
|
||||
ActorID *string
|
||||
StartedAt *time.Time
|
||||
FinishedAt *time.Time
|
||||
ExitCode *int
|
||||
Stats json.RawMessage
|
||||
Error *string
|
||||
CreatedAt time.Time
|
||||
}
|
||||
|
||||
// CreateJob inserts a queued job. The agent will mark it running
|
||||
@@ -32,10 +33,11 @@ type Job struct {
|
||||
// operator-driven run-now.
|
||||
func (s *Store) CreateJob(ctx context.Context, j Job) error {
|
||||
_, err := s.db.ExecContext(ctx,
|
||||
`INSERT INTO jobs (id, host_id, kind, status, scheduled_id, actor_kind, actor_id, created_at)
|
||||
VALUES (?, ?, ?, 'queued', ?, ?, ?, ?)`,
|
||||
`INSERT INTO jobs (id, host_id, kind, status, scheduled_id, source_group_id, actor_kind, actor_id, created_at)
|
||||
VALUES (?, ?, ?, 'queued', ?, ?, ?, ?, ?)`,
|
||||
j.ID, j.HostID, j.Kind,
|
||||
nullable(j.ScheduledID), j.ActorKind, nullable(j.ActorID),
|
||||
nullable(j.ScheduledID), nullable(j.SourceGroupID),
|
||||
j.ActorKind, nullable(j.ActorID),
|
||||
j.CreatedAt.UTC().Format(time.RFC3339Nano))
|
||||
if err != nil {
|
||||
return fmt.Errorf("store: create job: %w", err)
|
||||
@@ -139,12 +141,13 @@ func (s *Store) ListJobLogs(ctx context.Context, jobID string, afterSeq int64, l
|
||||
// GetJob returns a job row.
|
||||
func (s *Store) GetJob(ctx context.Context, id string) (*Job, error) {
|
||||
row := s.db.QueryRowContext(ctx,
|
||||
`SELECT id, host_id, kind, status, scheduled_id, actor_kind, actor_id,
|
||||
`SELECT id, host_id, kind, status, scheduled_id, source_group_id, actor_kind, actor_id,
|
||||
started_at, finished_at, exit_code, stats, error, created_at
|
||||
FROM jobs WHERE id = ?`, id)
|
||||
var (
|
||||
j Job
|
||||
schedID sql.NullString
|
||||
groupID sql.NullString
|
||||
actorID sql.NullString
|
||||
startedAt sql.NullString
|
||||
finishedAt sql.NullString
|
||||
@@ -153,7 +156,7 @@ func (s *Store) GetJob(ctx context.Context, id string) (*Job, error) {
|
||||
errMsg sql.NullString
|
||||
createdAt string
|
||||
)
|
||||
if err := row.Scan(&j.ID, &j.HostID, &j.Kind, &j.Status, &schedID,
|
||||
if err := row.Scan(&j.ID, &j.HostID, &j.Kind, &j.Status, &schedID, &groupID,
|
||||
&j.ActorKind, &actorID, &startedAt, &finishedAt,
|
||||
&exitCode, &stats, &errMsg, &createdAt); err != nil {
|
||||
if errors.Is(err, sql.ErrNoRows) {
|
||||
@@ -165,6 +168,10 @@ func (s *Store) GetJob(ctx context.Context, id string) (*Job, error) {
|
||||
s := schedID.String
|
||||
j.ScheduledID = &s
|
||||
}
|
||||
if groupID.Valid {
|
||||
s := groupID.String
|
||||
j.SourceGroupID = &s
|
||||
}
|
||||
if actorID.Valid {
|
||||
s := actorID.String
|
||||
j.ActorID = &s
|
||||
@@ -201,7 +208,7 @@ func (s *Store) GetJob(ctx context.Context, id string) (*Job, error) {
|
||||
// would re-fire on the next tick while the first is still running.
|
||||
func (s *Store) LatestJobByKind(ctx context.Context, hostID, kind string) (*Job, error) {
|
||||
row := s.db.QueryRowContext(ctx,
|
||||
`SELECT id, host_id, kind, status, scheduled_id, actor_kind, actor_id,
|
||||
`SELECT id, host_id, kind, status, scheduled_id, source_group_id, actor_kind, actor_id,
|
||||
started_at, finished_at, exit_code, stats, error, created_at
|
||||
FROM jobs
|
||||
WHERE host_id = ? AND kind = ?
|
||||
@@ -210,6 +217,7 @@ func (s *Store) LatestJobByKind(ctx context.Context, hostID, kind string) (*Job,
|
||||
var (
|
||||
j Job
|
||||
schedID sql.NullString
|
||||
groupID sql.NullString
|
||||
actorID sql.NullString
|
||||
startedAt sql.NullString
|
||||
finishedAt sql.NullString
|
||||
@@ -218,7 +226,7 @@ func (s *Store) LatestJobByKind(ctx context.Context, hostID, kind string) (*Job,
|
||||
errMsg sql.NullString
|
||||
createdAt string
|
||||
)
|
||||
if err := row.Scan(&j.ID, &j.HostID, &j.Kind, &j.Status, &schedID,
|
||||
if err := row.Scan(&j.ID, &j.HostID, &j.Kind, &j.Status, &schedID, &groupID,
|
||||
&j.ActorKind, &actorID, &startedAt, &finishedAt,
|
||||
&exitCode, &stats, &errMsg, &createdAt); err != nil {
|
||||
if errors.Is(err, sql.ErrNoRows) {
|
||||
@@ -230,6 +238,10 @@ func (s *Store) LatestJobByKind(ctx context.Context, hostID, kind string) (*Job,
|
||||
s := schedID.String
|
||||
j.ScheduledID = &s
|
||||
}
|
||||
if groupID.Valid {
|
||||
s := groupID.String
|
||||
j.SourceGroupID = &s
|
||||
}
|
||||
if actorID.Valid {
|
||||
s := actorID.String
|
||||
j.ActorID = &s
|
||||
|
||||
Reference in New Issue
Block a user