Files
restic-manager/internal/alert/rules.go
T
steve 04dde93acd fix: dispatch alert.acknowledged + alert.resolved on UI ack/resolve
Spotted during the live Playwright sweep: clicking Acknowledge or
Resolve updated the alert row but never fanned out a notification.
The handlers went straight to Store.Acknowledge/Resolve, bypassing
the hub.

Add Engine.Acknowledge and Engine.Resolve that wrap the store call
and dispatch the matching event to every enabled channel. The UI
handlers prefer the engine path when wired, and fall back to the
direct store call so unit tests that construct a Server without an
engine still work.

Use context.WithoutCancel for the goroutine dispatch — the request
context is cancelled the instant the handler returns 204, so the
naive 'go e.hub.Dispatch(ctx, ...)' was racing the response and
losing the channel-list query with 'context canceled'.
2026-05-04 21:00:44 +01:00

165 lines
5.0 KiB
Go

package alert
import (
"context"
"fmt"
"log/slog"
"time"
"gitea.dcglab.co.uk/steve/restic-manager/internal/notification"
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
)
// Alert kind constants. Each value is an untyped string constant.
// Keep them in lockstep with the engine logic and the UI tag-colour
// table.
//
// NOTE(review): presumably these are the values passed as the `kind`
// argument to the engine's raise/resolve helpers and persisted on the
// alert row — confirm against the store schema before renaming any.
const (
// KindBackupFailed is raised when a backup job finishes with
// status "failed" and is resolved on the next backup success.
KindBackupFailed = "backup_failed"
// KindForgetFailed mirrors KindBackupFailed for forget jobs.
KindForgetFailed = "forget_failed"
// KindPruneFailed mirrors KindBackupFailed for prune jobs.
KindPruneFailed = "prune_failed"
// KindCheckFailed is raised at "critical" severity (repository
// integrity is at risk) when a check job fails.
KindCheckFailed = "check_failed"
// KindStaleSchedule is declared for completeness but intentionally
// left as a no-op in v1. The precise "expected to have fired but
// didn't" logic requires a store helper that lands in a follow-up
// task. Ask the team before implementing.
KindStaleSchedule = "stale_schedule"
// KindAgentOffline is raised when a host's last_seen_at is older
// than the 15-minute floor and is resolved when the host reconnects.
KindAgentOffline = "agent_offline"
)
// raiseAndNotify is the standard raise pattern: store.RaiseOrTouch
// deduplicates, and the notification hub is dispatched to only on the
// first raise (didRaise=true). Subsequent occurrences of the same open
// alert are "touched" by the store without a second notification.
//
// The function is best-effort and returns nothing: a RaiseOrTouch
// failure is logged and aborts the call. The payload's HostName falls
// back to hostID when the host lookup fails or yields no row.
func (e *Engine) raiseAndNotify(ctx context.Context, hostID, kind, severity, message string, when time.Time) {
	id, didRaise, err := e.store.RaiseOrTouch(ctx, hostID, kind, severity, message, when)
	if err != nil {
		slog.Warn("alert: raise", "kind", kind, "host_id", hostID, "err", err)
		return
	}
	if !didRaise {
		return
	}

	// Guard against a nil host as well as an error, matching the other
	// host-name lookups in this file.
	hostName := hostID
	if host, err := e.store.GetHost(ctx, hostID); err == nil && host != nil {
		hostName = host.Name
	}

	// The dispatch runs in its own goroutine and may outlive the caller.
	// Detach it from the caller's cancellation so it is not aborted with
	// "context canceled" the moment the caller returns; context values
	// are preserved.
	go e.hub.Dispatch(context.WithoutCancel(ctx), notification.Payload{
		Event:    notification.EventRaised,
		AlertID:  id,
		Severity: severity,
		Kind:     kind,
		HostID:   hostID,
		HostName: hostName,
		Message:  message,
		RaisedAt: when,
	})
}
// Acknowledge updates the alert row and, only when that update
// succeeds, fans out alert.acknowledged through the notification hub.
//
// A store.Acknowledge failure is returned to the caller and nothing is
// dispatched. The notification itself is best-effort: if the alert
// cannot be re-read afterwards, the failure is logged and nil is still
// returned, because the acknowledgement has already been persisted.
func (e *Engine) Acknowledge(ctx context.Context, alertID, userID string, when time.Time) error {
	if err := e.store.Acknowledge(ctx, alertID, userID, when); err != nil {
		return err
	}

	a, lerr := e.store.GetAlert(ctx, alertID)
	if lerr != nil {
		// Acknowledge already succeeded; dispatch is best-effort, but
		// leave a trace so a missing notification can be diagnosed.
		slog.Warn("alert: acknowledge: load for dispatch", "alert_id", alertID, "err", lerr)
		return nil //nolint:nilerr
	}
	if a == nil {
		return nil
	}

	// Build the payload synchronously with the live ctx, then detach the
	// goroutine from the caller's cancellation so it can outlive the call.
	p := alertPayload(ctx, e.store, notification.EventAcknowledged, a)
	go e.hub.Dispatch(context.WithoutCancel(ctx), p)
	return nil
}
// Resolve marks the alert resolved and fans out alert.resolved through
// the notification hub.
//
// The alert is read before store.Resolve is called, so the payload is
// built from the pre-resolve row. A store.Resolve failure is returned
// and nothing is dispatched. The notification is best-effort: if the
// alert could not be read (lookup error or no row), the resolve still
// proceeds and nil is returned without dispatching.
func (e *Engine) Resolve(ctx context.Context, alertID string, when time.Time) error {
	a, lerr := e.store.GetAlert(ctx, alertID)
	if lerr != nil {
		// Do not block the resolve on a failed lookup, but log it so a
		// missing notification can be diagnosed, and do not trust the
		// value returned alongside the error.
		slog.Warn("alert: resolve: load for dispatch", "alert_id", alertID, "err", lerr)
		a = nil
	}

	if err := e.store.Resolve(ctx, alertID, when); err != nil {
		return err
	}
	if a == nil {
		return nil
	}

	// Build the payload synchronously with the live ctx, then detach the
	// goroutine from the caller's cancellation so it can outlive the call.
	p := alertPayload(ctx, e.store, notification.EventResolved, a)
	go e.hub.Dispatch(context.WithoutCancel(ctx), p)
	return nil
}
// alertPayload converts a stored Alert into a notification Payload for
// event ev. HostID and HostName stay empty when the alert carries no
// host. Otherwise HostName defaults to the host ID and is replaced by
// the host's Name when the store lookup succeeds with a non-nil host.
func alertPayload(ctx context.Context, st *store.Store, ev notification.Event, a *store.Alert) notification.Payload {
	p := notification.Payload{
		Event:    ev,
		AlertID:  a.ID,
		Severity: a.Severity,
		Kind:     a.Kind,
		Message:  a.Message,
		RaisedAt: a.CreatedAt,
	}

	// Alerts without a host keep the zero-value host fields.
	if a.HostID == nil {
		return p
	}

	p.HostID = *a.HostID
	p.HostName = p.HostID // fallback when the host cannot be loaded

	host, err := st.GetHost(ctx, p.HostID)
	if err == nil && host != nil {
		p.HostName = host.Name
	}
	return p
}
// resolveAndNotify clears every open (or acknowledged) alert for
// (host_id, kind) via store.AutoResolve, then fires alert.resolved
// for each alert of that kind that was listed as open or acknowledged
// just before the resolve. Best-effort — errors are logged but do not
// propagate.
//
// Failure handling:
//   - listing open alerts fails: logged, nothing is resolved or sent;
//   - listing acknowledged alerts fails: logged, processing continues
//     with the open alerts only;
//   - AutoResolve fails: logged, nothing is sent.
//
// NOTE(review): the payload's RaisedAt carries `when` (the resolve
// time), not the alert's creation time — confirm that is intended.
func (e *Engine) resolveAndNotify(ctx context.Context, hostID, kind string, when time.Time) {
	open, err := e.store.ListAlerts(ctx, store.AlertFilter{
		Status: "open", HostID: hostID,
	})
	if err != nil {
		slog.Warn("alert: auto-resolve: list open", "kind", kind, "host_id", hostID, "err", err)
		return
	}

	acked, err := e.store.ListAlerts(ctx, store.AlertFilter{
		Status: "acknowledged", HostID: hostID,
	})
	if err != nil {
		slog.Warn("alert: auto-resolve: list acknowledged", "kind", kind, "host_id", hostID, "err", err)
		acked = nil
	}

	// The full slice expression caps capacity at len(open), so append
	// allocates a fresh backing array instead of writing into spare
	// capacity of the slice returned by the store.
	candidates := append(open[:len(open):len(open)], acked...)

	if err := e.store.AutoResolve(ctx, hostID, kind, when); err != nil {
		slog.Warn("alert: auto-resolve", "kind", kind, "host_id", hostID, "err", err)
		return
	}

	// Fall back to the host ID when the host cannot be loaded.
	hostName := hostID
	if host, _ := e.store.GetHost(ctx, hostID); host != nil {
		hostName = host.Name
	}

	// The dispatch goroutines may outlive the caller; detach them from
	// its cancellation so they are not aborted with "context canceled".
	dispatchCtx := context.WithoutCancel(ctx)
	message := fmt.Sprintf("Auto-resolved (%s)", kind)
	for _, a := range candidates {
		// The listings are filtered by host only; keep the requested kind.
		if a.Kind != kind {
			continue
		}
		go e.hub.Dispatch(dispatchCtx, notification.Payload{
			Event:    notification.EventResolved,
			AlertID:  a.ID,
			Severity: a.Severity,
			Kind:     a.Kind,
			HostID:   hostID,
			HostName: hostName,
			Message:  message,
			RaisedAt: when,
		})
	}
}