// Package alert raises, acknowledges and resolves alerts in the store
// and dispatches the matching events through the notification hub.
package alert
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"log/slog"
|
|
"time"
|
|
|
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/notification"
|
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
|
)
|
|
|
|
// Alert kinds. These identifiers are shared with the engine logic and
// the UI tag-colour table, so a change here must be mirrored in both.
const (
	// KindBackupFailed marks a backup job that finished with status
	// "failed". The next successful backup resolves it.
	KindBackupFailed = "backup_failed"

	// KindForgetFailed is the forget-job counterpart of KindBackupFailed.
	KindForgetFailed = "forget_failed"

	// KindPruneFailed is the prune-job counterpart of KindBackupFailed.
	KindPruneFailed = "prune_failed"

	// KindCheckFailed marks a failed check job. It is raised at
	// "critical" severity because repository integrity is at risk.
	KindCheckFailed = "check_failed"

	// KindStaleSchedule exists for completeness only: v1 deliberately
	// never raises it. Detecting a schedule that "should have fired but
	// didn't" needs a store helper planned for a follow-up task, so
	// check with the team before implementing it.
	KindStaleSchedule = "stale_schedule"

	// KindAgentOffline marks a host whose last_seen_at has fallen
	// behind the 15-minute floor. It resolves when the host reconnects.
	KindAgentOffline = "agent_offline"
)
|
|
|
|
// raiseAndNotify is the standard raise pattern: store.RaiseOrTouch
|
|
// deduplicates, and notification.Hub.Dispatch fires only on the first
|
|
// raise (didRaise=true). Subsequent occurrences of the same open alert
|
|
// are "touched" (last_seen_at bumped) without a second notification.
|
|
func (e *Engine) raiseAndNotify(ctx context.Context, hostID, kind, dedupKey, severity, message string, when time.Time) {
|
|
id, didRaise, err := e.store.RaiseOrTouch(ctx, hostID, kind, dedupKey, severity, message, when)
|
|
if err != nil {
|
|
slog.Warn("alert: raise", "kind", kind, "host_id", hostID, "dedup_key", dedupKey, "err", err)
|
|
return
|
|
}
|
|
if !didRaise {
|
|
return
|
|
}
|
|
host, err := e.store.GetHost(ctx, hostID)
|
|
hostName := hostID
|
|
if err == nil {
|
|
hostName = host.Name
|
|
}
|
|
go e.hub.Dispatch(ctx, notification.Payload{
|
|
Event: notification.EventRaised,
|
|
AlertID: id,
|
|
Severity: severity,
|
|
Kind: kind,
|
|
HostID: hostID,
|
|
HostName: hostName,
|
|
Message: message,
|
|
RaisedAt: when,
|
|
})
|
|
}
|
|
|
|
// Acknowledge updates the alert row and fans out alert.acknowledged to
|
|
// every enabled channel. Best-effort: store errors are logged but the
|
|
// dispatch still fires only when the store update succeeds.
|
|
func (e *Engine) Acknowledge(ctx context.Context, alertID, userID string, when time.Time) error {
|
|
if err := e.store.Acknowledge(ctx, alertID, userID, when); err != nil {
|
|
return err
|
|
}
|
|
a, lerr := e.store.GetAlert(ctx, alertID)
|
|
if lerr != nil || a == nil {
|
|
// Acknowledge already succeeded; dispatch is best-effort.
|
|
return nil //nolint:nilerr
|
|
}
|
|
p := alertPayload(ctx, e.store, notification.EventAcknowledged, a)
|
|
go e.hub.Dispatch(context.WithoutCancel(ctx), p)
|
|
return nil
|
|
}
|
|
|
|
// Resolve marks the alert resolved and fans out alert.resolved.
|
|
func (e *Engine) Resolve(ctx context.Context, alertID string, when time.Time) error {
|
|
a, _ := e.store.GetAlert(ctx, alertID)
|
|
if err := e.store.Resolve(ctx, alertID, when); err != nil {
|
|
return err
|
|
}
|
|
if a == nil {
|
|
return nil
|
|
}
|
|
p := alertPayload(ctx, e.store, notification.EventResolved, a)
|
|
go e.hub.Dispatch(context.WithoutCancel(ctx), p)
|
|
return nil
|
|
}
|
|
|
|
// alertPayload builds a Payload from a stored Alert, looking up the host
|
|
// name when HostID is set.
|
|
func alertPayload(ctx context.Context, st *store.Store, ev notification.Event, a *store.Alert) notification.Payload {
|
|
hostID, hostName := "", ""
|
|
if a.HostID != nil {
|
|
hostID = *a.HostID
|
|
hostName = hostID
|
|
if h, err := st.GetHost(ctx, hostID); err == nil && h != nil {
|
|
hostName = h.Name
|
|
}
|
|
}
|
|
return notification.Payload{
|
|
Event: ev,
|
|
AlertID: a.ID,
|
|
Severity: a.Severity,
|
|
Kind: a.Kind,
|
|
HostID: hostID,
|
|
HostName: hostName,
|
|
Message: a.Message,
|
|
RaisedAt: a.CreatedAt,
|
|
}
|
|
}
|
|
|
|
// ResolveOnModeChange clears any open agent_offline and stale_schedule
|
|
// alerts for a host whose always-on flag was just toggled. The next
|
|
// 60s tick re-raises whichever still applies under the new mode, so
|
|
// this is a self-correcting "wipe and let the sweep settle" call.
|
|
// Safe to invoke from the HTTP layer (it only touches the store + hub).
|
|
func (e *Engine) ResolveOnModeChange(ctx context.Context, hostID string, when time.Time) {
|
|
e.resolveAndNotify(ctx, hostID, KindAgentOffline, "", when)
|
|
e.resolveAndNotify(ctx, hostID, KindStaleSchedule, "", when)
|
|
}
|
|
|
|
// resolveAndNotify clears the open (or acknowledged) alert matching
|
|
// (host_id, kind, dedup_key) via store.AutoResolve, then fires
|
|
// alert.resolved for the row(s) actually closed. Best-effort —
|
|
// errors are logged but do not propagate.
|
|
func (e *Engine) resolveAndNotify(ctx context.Context, hostID, kind, dedupKey string, when time.Time) {
|
|
open, err := e.store.ListAlerts(ctx, store.AlertFilter{
|
|
Status: "open", HostID: hostID,
|
|
})
|
|
if err != nil {
|
|
return
|
|
}
|
|
openAcked, _ := e.store.ListAlerts(ctx, store.AlertFilter{
|
|
Status: "acknowledged", HostID: hostID,
|
|
})
|
|
all := append(open, openAcked...)
|
|
if err := e.store.AutoResolve(ctx, hostID, kind, dedupKey, when); err != nil {
|
|
slog.Warn("alert: auto-resolve", "kind", kind, "host_id", hostID, "dedup_key", dedupKey, "err", err)
|
|
return
|
|
}
|
|
host, _ := e.store.GetHost(ctx, hostID)
|
|
hostName := hostID
|
|
if host != nil {
|
|
hostName = host.Name
|
|
}
|
|
for _, a := range all {
|
|
if a.Kind != kind || a.DedupKey != dedupKey {
|
|
continue
|
|
}
|
|
go e.hub.Dispatch(ctx, notification.Payload{
|
|
Event: notification.EventResolved,
|
|
AlertID: a.ID,
|
|
Severity: a.Severity,
|
|
Kind: a.Kind,
|
|
HostID: hostID,
|
|
HostName: hostName,
|
|
Message: fmt.Sprintf("Auto-resolved (%s)", kind),
|
|
RaisedAt: when,
|
|
})
|
|
}
|
|
}
|