alert: rule logic for the six v1 rules
This commit is contained in:
@@ -0,0 +1,110 @@
|
||||
package alert
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"time"
|
||||
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/notification"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||
)
|
||||
|
||||
// Alert kind constants. These are the values passed as the "kind" of an
// alert when it is raised or resolved. Keep them in lockstep with the
// engine logic and the UI tag-colour table.
const (
	// KindBackupFailed is raised when a backup job finishes with
	// status "failed" and resolved on next backup success.
	KindBackupFailed = "backup_failed"

	// KindForgetFailed mirrors KindBackupFailed for forget jobs.
	KindForgetFailed = "forget_failed"

	// KindPruneFailed mirrors KindBackupFailed for prune jobs.
	KindPruneFailed = "prune_failed"

	// KindCheckFailed is raised at "critical" severity (repository
	// integrity is at risk) when a check job fails.
	KindCheckFailed = "check_failed"

	// KindStaleSchedule is declared for completeness but intentionally
	// left as a no-op in v1. The precise "expected to have fired but
	// didn't" logic requires a store helper that lands in a follow-up
	// task. Ask the team before implementing.
	KindStaleSchedule = "stale_schedule"

	// KindAgentOffline is raised when a host's last_seen_at is older
	// than the 15-minute floor and resolved when the host reconnects.
	KindAgentOffline = "agent_offline"
)
|
||||
|
||||
// raiseAndNotify is the standard raise pattern: store.RaiseOrTouch
|
||||
// deduplicates, and notification.Hub.Dispatch fires only on the first
|
||||
// raise (didRaise=true). Subsequent occurrences of the same open alert
|
||||
// are "touched" (last_seen_at bumped) without a second notification.
|
||||
func (e *Engine) raiseAndNotify(ctx context.Context, hostID, kind, severity, message string, when time.Time) {
|
||||
id, didRaise, err := e.store.RaiseOrTouch(ctx, hostID, kind, severity, message, when)
|
||||
if err != nil {
|
||||
slog.Warn("alert: raise", "kind", kind, "host_id", hostID, "err", err)
|
||||
return
|
||||
}
|
||||
if !didRaise {
|
||||
return
|
||||
}
|
||||
host, err := e.store.GetHost(ctx, hostID)
|
||||
hostName := hostID
|
||||
if err == nil {
|
||||
hostName = host.Name
|
||||
}
|
||||
go e.hub.Dispatch(ctx, notification.Payload{
|
||||
Event: notification.EventRaised,
|
||||
AlertID: id,
|
||||
Severity: severity,
|
||||
Kind: kind,
|
||||
HostID: hostID,
|
||||
HostName: hostName,
|
||||
Message: message,
|
||||
RaisedAt: when,
|
||||
})
|
||||
}
|
||||
|
||||
// resolveAndNotify clears every open (or acknowledged) alert for
|
||||
// (host_id, kind) via store.AutoResolve, then fires alert.resolved
|
||||
// for each row that was actually open. Best-effort — errors are
|
||||
// logged but do not propagate.
|
||||
func (e *Engine) resolveAndNotify(ctx context.Context, hostID, kind string, when time.Time) {
|
||||
open, err := e.store.ListAlerts(ctx, store.AlertFilter{
|
||||
Status: "open", HostID: hostID,
|
||||
})
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
openAcked, _ := e.store.ListAlerts(ctx, store.AlertFilter{
|
||||
Status: "acknowledged", HostID: hostID,
|
||||
})
|
||||
all := append(open, openAcked...)
|
||||
if err := e.store.AutoResolve(ctx, hostID, kind, when); err != nil {
|
||||
slog.Warn("alert: auto-resolve", "kind", kind, "host_id", hostID, "err", err)
|
||||
return
|
||||
}
|
||||
host, _ := e.store.GetHost(ctx, hostID)
|
||||
hostName := hostID
|
||||
if host != nil {
|
||||
hostName = host.Name
|
||||
}
|
||||
for _, a := range all {
|
||||
if a.Kind != kind {
|
||||
continue
|
||||
}
|
||||
go e.hub.Dispatch(ctx, notification.Payload{
|
||||
Event: notification.EventResolved,
|
||||
AlertID: a.ID,
|
||||
Severity: a.Severity,
|
||||
Kind: a.Kind,
|
||||
HostID: hostID,
|
||||
HostName: hostName,
|
||||
Message: fmt.Sprintf("Auto-resolved (%s)", kind),
|
||||
RaisedAt: when,
|
||||
})
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user