alert: rule logic for the six v1 rules
This commit is contained in:
@@ -13,6 +13,7 @@ package alert
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"sync"
|
||||
"time"
|
||||
@@ -116,19 +117,89 @@ func (e *Engine) NotifyHostOnline(hostID string) {
|
||||
}
|
||||
}
|
||||
|
||||
// Event handlers and periodic sweep implementing the v1 alert rules.
|
||||
func (e *Engine) handleJobFinished(ctx context.Context, ev JobFinishedEvent) {
|
||||
// Implemented in C2
|
||||
// Determine which kind/severity pair this job maps to. Jobs not
|
||||
// listed here (init, unlock, restore, diff) produce no alerts in v1.
|
||||
var kind, severity string
|
||||
switch ev.Kind {
|
||||
case "backup":
|
||||
kind, severity = KindBackupFailed, "warning"
|
||||
case "forget":
|
||||
kind, severity = KindForgetFailed, "warning"
|
||||
case "prune":
|
||||
kind, severity = KindPruneFailed, "warning"
|
||||
case "check":
|
||||
kind, severity = KindCheckFailed, "critical"
|
||||
default:
|
||||
return
|
||||
}
|
||||
switch ev.Status {
|
||||
case "failed":
|
||||
e.raiseAndNotify(ctx, ev.HostID, kind, severity,
|
||||
fmt.Sprintf("%s job %s failed", ev.Kind, ev.JobID), ev.When)
|
||||
case "succeeded":
|
||||
e.resolveAndNotify(ctx, ev.HostID, kind, ev.When)
|
||||
}
|
||||
}
|
||||
|
||||
func (e *Engine) handleHostOffline(ctx context.Context, hostID string) {
|
||||
// Implemented in C2
|
||||
host, err := e.store.GetHost(ctx, hostID)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
// Apply the 15-min floor — raise only when last_seen_at is older
|
||||
// than agentOfflineFloor. A nil last_seen_at (host enrolled but
|
||||
// never connected) is treated as "now" so we don't raise
|
||||
// immediately on enrolment.
|
||||
if host.LastSeenAt == nil {
|
||||
return
|
||||
}
|
||||
if time.Since(*host.LastSeenAt) < e.agentOfflineFloor {
|
||||
return
|
||||
}
|
||||
e.raiseAndNotify(ctx, hostID, KindAgentOffline, "warning",
|
||||
fmt.Sprintf("Agent offline for %s (threshold %s)",
|
||||
roundDur(time.Since(*host.LastSeenAt)), e.agentOfflineFloor),
|
||||
time.Now().UTC())
|
||||
}
|
||||
|
||||
func (e *Engine) handleHostOnline(ctx context.Context, hostID string) {
|
||||
// Implemented in C2
|
||||
e.resolveAndNotify(ctx, hostID, KindAgentOffline, time.Now().UTC())
|
||||
}
|
||||
|
||||
// tick is the 60-second sweep. Responsibilities:
|
||||
// 1. Re-evaluate agent_offline for every offline host that may have
|
||||
// crossed the floor between explicit events.
|
||||
// 2. Stale-schedule detection — declared in the spec but intentionally
|
||||
// left as a no-op in v1. The precise "expected to have fired but
|
||||
// didn't" trigger requires a store helper that lands in a later
|
||||
// task. The KindStaleSchedule constant is exported so UI code can
|
||||
// reference the tag string today.
|
||||
func (e *Engine) tick(ctx context.Context, now time.Time) {
|
||||
// Implemented in C2
|
||||
hosts, err := e.store.ListHosts(ctx)
|
||||
if err != nil {
|
||||
slog.Warn("alert: tick list hosts", "err", err)
|
||||
return
|
||||
}
|
||||
for _, h := range hosts {
|
||||
if h.Status != "offline" || h.LastSeenAt == nil {
|
||||
continue
|
||||
}
|
||||
if now.Sub(*h.LastSeenAt) >= e.agentOfflineFloor {
|
||||
e.raiseAndNotify(ctx, h.ID, KindAgentOffline, "warning",
|
||||
fmt.Sprintf("Agent offline for %s (threshold %s)",
|
||||
roundDur(now.Sub(*h.LastSeenAt)), e.agentOfflineFloor), now)
|
||||
}
|
||||
}
|
||||
// Stale-schedule sweep — no-op in v1. See KindStaleSchedule doc comment.
|
||||
}
|
||||
|
||||
// roundDur returns a human-readable duration string, rounding to the
|
||||
// nearest minute. Durations under a minute are reported as "less than
|
||||
// a minute".
|
||||
func roundDur(d time.Duration) string {
|
||||
if d < time.Minute {
|
||||
return "less than a minute"
|
||||
}
|
||||
return d.Round(time.Minute).String()
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user