Files
restic-manager/internal/alert/rules.go
T
steve 350be3f19d feat(alerts): per-source-group dedup so two failing backups produce two alerts
Until now the open-alert key was (host_id, kind, resolved_at IS NULL).
A host with two source groups both failing collapsed onto one
backup_failed row — the second failure bumped last_seen_at and
overwrote the message but never fanned out again. Operators saw one
alert that appeared to flap, not two distinct broken things.

Schema changes (column-level ALTER, no rebuild):

- 0015 jobs.source_group_id (FK → source_groups, ON DELETE SET NULL,
  index). Populated for backup jobs in CreateJob.
- 0016 alerts.dedup_key (NOT NULL DEFAULT ''). The old alerts_open
  partial index gets dropped and replaced with a UNIQUE partial
  index on (host_id, kind, dedup_key) WHERE resolved_at IS NULL —
  the index is now the actual dedup primitive.

Plumbing:

- RaiseOrTouch / AutoResolve / Alert struct gain dedup_key.
- engine.JobFinishedEvent gains SourceGroupID; handleJobFinished
  passes it through for backup_failed only (forget/prune/check stay
  repo-scoped with key='').
- ws.handler reads SourceGroupID off the freshly-loaded job row.
- dispatchJobWithPayload gains a *string sourceGroupID arg; the
  per-group Run-now path and schedule.fire path pass &g.ID.

Test coverage: TestRaiseOrTouchDedupsPerSourceGroup proves two
distinct groups produce two distinct open alerts and that resolving
one does not auto-resolve the other.

Dev tool: cmd/_fake_alert gains -dedup-key flag.
2026-05-04 22:59:48 +01:00

165 lines
5.1 KiB
Go

package alert
import (
"context"
"fmt"
"log/slog"
"time"
"gitea.dcglab.co.uk/steve/restic-manager/internal/notification"
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
)
// Alert kind constants. Each value is the string passed as the "kind"
// argument to the store's RaiseOrTouch / AutoResolve calls and copied
// into notification payloads. Keep in lockstep with the engine logic
// and the UI tag-colour table.
const (
// KindBackupFailed is raised when a backup job finishes with
// status "failed" and is resolved on the next backup success.
KindBackupFailed = "backup_failed"
// KindForgetFailed mirrors KindBackupFailed for forget jobs.
KindForgetFailed = "forget_failed"
// KindPruneFailed mirrors KindBackupFailed for prune jobs.
KindPruneFailed = "prune_failed"
// KindCheckFailed is raised at "critical" severity (repository
// integrity is at risk) when a check job fails.
KindCheckFailed = "check_failed"
// KindStaleSchedule is declared for completeness but is intentionally
// a no-op in v1: nothing raises it yet. The precise "expected to have
// fired but didn't" logic requires a store helper that lands in a
// follow-up task. Ask the team before implementing.
KindStaleSchedule = "stale_schedule"
// KindAgentOffline is raised when a host's last_seen_at is older
// than the 15-minute floor and is resolved when the host reconnects.
KindAgentOffline = "agent_offline"
)
// raiseAndNotify is the standard raise pattern: store.RaiseOrTouch
// deduplicates the open alert for (hostID, kind, dedupKey), and the
// hub's Dispatch fires only on the first raise (didRaise=true).
// Subsequent occurrences of the same open alert are "touched"
// (last_seen_at bumped) without a second notification.
//
// Best-effort: a RaiseOrTouch failure is logged and nothing is
// dispatched; no error is returned to the caller. when is used both as
// the store timestamp and as the payload's RaisedAt.
func (e *Engine) raiseAndNotify(ctx context.Context, hostID, kind, dedupKey, severity, message string, when time.Time) {
	id, didRaise, err := e.store.RaiseOrTouch(ctx, hostID, kind, dedupKey, severity, message, when)
	if err != nil {
		slog.Warn("alert: raise", "kind", kind, "host_id", hostID, "dedup_key", dedupKey, "err", err)
		return
	}
	if !didRaise {
		return
	}

	// Fall back to the raw host ID when the host row cannot be loaded.
	// The nil check matters: the other lookups in this file treat a nil
	// host with a nil error as possible, so dereferencing on err == nil
	// alone could panic.
	hostName := hostID
	if host, herr := e.store.GetHost(ctx, hostID); herr == nil && host != nil {
		hostName = host.Name
	}

	// Detach from the caller's cancellation so the notification is not
	// aborted when the caller's context ends right after we return
	// (same pattern as Acknowledge and Resolve).
	go e.hub.Dispatch(context.WithoutCancel(ctx), notification.Payload{
		Event:    notification.EventRaised,
		AlertID:  id,
		Severity: severity,
		Kind:     kind,
		HostID:   hostID,
		HostName: hostName,
		Message:  message,
		RaisedAt: when,
	})
}
// Acknowledge marks the alert as acknowledged by userID at when and
// then fans out alert.acknowledged through the notification hub.
//
// A store.Acknowledge failure is returned to the caller and nothing is
// dispatched. The notification itself is best-effort: if the alert row
// cannot be re-read after the update, the failure is logged and nil is
// returned, because the acknowledgement has already been persisted.
func (e *Engine) Acknowledge(ctx context.Context, alertID, userID string, when time.Time) error {
	if err := e.store.Acknowledge(ctx, alertID, userID, when); err != nil {
		return err
	}

	a, lerr := e.store.GetAlert(ctx, alertID)
	if lerr != nil {
		// Acknowledge already succeeded; dispatch is best-effort, so
		// record why the notification was skipped instead of failing.
		slog.Warn("alert: acknowledge: load alert for dispatch", "alert_id", alertID, "err", lerr)
		return nil //nolint:nilerr
	}
	if a == nil {
		return nil
	}

	p := alertPayload(ctx, e.store, notification.EventAcknowledged, a)
	// Detached context: the dispatch must outlive the caller's ctx.
	go e.hub.Dispatch(context.WithoutCancel(ctx), p)
	return nil
}
// Resolve marks the alert resolved at when and fans out alert.resolved
// through the notification hub.
//
// A store.Resolve failure is returned to the caller and nothing is
// dispatched. The notification is best-effort: when the alert row could
// not be loaded, the resolve still proceeds, the lookup failure is
// logged, and no notification is sent.
func (e *Engine) Resolve(ctx context.Context, alertID string, when time.Time) error {
	// The row is read before the update and used to build the payload.
	// NOTE(review): presumably so the payload reflects the alert as it
	// was while open — confirm before reordering.
	a, lerr := e.store.GetAlert(ctx, alertID)
	if lerr != nil {
		slog.Warn("alert: resolve: load alert for dispatch", "alert_id", alertID, "err", lerr)
	}

	if err := e.store.Resolve(ctx, alertID, when); err != nil {
		return err
	}
	if a == nil {
		return nil
	}

	p := alertPayload(ctx, e.store, notification.EventResolved, a)
	// Detached context: the dispatch must outlive the caller's ctx.
	go e.hub.Dispatch(context.WithoutCancel(ctx), p)
	return nil
}
// alertPayload converts a stored Alert into a notification.Payload for
// event ev. Host fields stay empty when the alert has no HostID;
// otherwise HostName is resolved through st.GetHost and falls back to
// the host ID itself when the lookup fails or returns no host.
func alertPayload(ctx context.Context, st *store.Store, ev notification.Event, a *store.Alert) notification.Payload {
	p := notification.Payload{
		Event:    ev,
		AlertID:  a.ID,
		Severity: a.Severity,
		Kind:     a.Kind,
		Message:  a.Message,
		RaisedAt: a.CreatedAt,
	}
	if a.HostID == nil {
		return p
	}

	p.HostID = *a.HostID
	p.HostName = p.HostID
	h, err := st.GetHost(ctx, p.HostID)
	if err == nil && h != nil {
		p.HostName = h.Name
	}
	return p
}
// resolveAndNotify clears the open (or acknowledged) alert matching
// (host_id, kind, dedup_key) via store.AutoResolve, then fires
// alert.resolved for each row that was listed as open or acknowledged
// for that key just before the resolve. Best-effort — errors are
// logged but do not propagate.
//
// The candidates are snapshotted before AutoResolve is called and are
// what the notifications are built from. If the open-alert listing
// fails, the function returns without resolving anything.
func (e *Engine) resolveAndNotify(ctx context.Context, hostID, kind, dedupKey string, when time.Time) {
	candidates, err := e.store.ListAlerts(ctx, store.AlertFilter{
		Status: "open", HostID: hostID,
	})
	if err != nil {
		slog.Warn("alert: auto-resolve: list open", "kind", kind, "host_id", hostID, "dedup_key", dedupKey, "err", err)
		return
	}
	acked, err := e.store.ListAlerts(ctx, store.AlertFilter{
		Status: "acknowledged", HostID: hostID,
	})
	if err != nil {
		// Not fatal: open alerts can still be resolved and notified;
		// acknowledged ones just miss their notification.
		slog.Warn("alert: auto-resolve: list acknowledged", "kind", kind, "host_id", hostID, "dedup_key", dedupKey, "err", err)
	}
	candidates = append(candidates, acked...)

	if err := e.store.AutoResolve(ctx, hostID, kind, dedupKey, when); err != nil {
		slog.Warn("alert: auto-resolve", "kind", kind, "host_id", hostID, "dedup_key", dedupKey, "err", err)
		return
	}

	// Fall back to the raw host ID when the host row cannot be loaded.
	host, _ := e.store.GetHost(ctx, hostID)
	hostName := hostID
	if host != nil {
		hostName = host.Name
	}

	// Detach from the caller's cancellation so notifications are not
	// aborted when the caller's context ends right after we return
	// (same pattern as Acknowledge and Resolve).
	dispatchCtx := context.WithoutCancel(ctx)
	for _, a := range candidates {
		if a.Kind != kind || a.DedupKey != dedupKey {
			continue
		}
		// NOTE(review): RaisedAt carries the resolve time here, while
		// alertPayload uses the alert's CreatedAt — confirm which one
		// notification consumers expect for resolved events.
		go e.hub.Dispatch(dispatchCtx, notification.Payload{
			Event:    notification.EventResolved,
			AlertID:  a.ID,
			Severity: a.Severity,
			Kind:     a.Kind,
			HostID:   hostID,
			HostName: hostName,
			Message:  fmt.Sprintf("Auto-resolved (%s)", kind),
			RaisedAt: when,
		})
	}
}