alert: rule logic for the six v1 rules

This commit is contained in:
2026-05-04 19:50:33 +01:00
parent 9abe817aae
commit 5e655d756d
3 changed files with 311 additions and 5 deletions
+76 -5
View File
@@ -13,6 +13,7 @@ package alert
import (
"context"
"fmt"
"log/slog"
"sync"
"time"
@@ -116,19 +117,89 @@ func (e *Engine) NotifyHostOnline(hostID string) {
}
}
// handleJobFinished turns a finished-job event into an alert transition.
//
// Only backup, forget, prune and check jobs take part; every other job
// kind (init, unlock, restore, diff) produces no alerts in v1. A
// "failed" status raises the alert mapped to the job kind, a
// "succeeded" status resolves it, and any other status is ignored.
func (e *Engine) handleJobFinished(ctx context.Context, ev JobFinishedEvent) {
	// Only terminal failed/succeeded outcomes drive alert state.
	failed := ev.Status == "failed"
	if !failed && ev.Status != "succeeded" {
		return
	}

	// Map the job kind onto its alert kind. Everything is a warning
	// except a failed check, which is critical.
	var alertKind string
	level := "warning"
	switch ev.Kind {
	case "backup":
		alertKind = KindBackupFailed
	case "forget":
		alertKind = KindForgetFailed
	case "prune":
		alertKind = KindPruneFailed
	case "check":
		alertKind, level = KindCheckFailed, "critical"
	default:
		return
	}

	if !failed {
		e.resolveAndNotify(ctx, ev.HostID, alertKind, ev.When)
		return
	}
	msg := fmt.Sprintf("%s job %s failed", ev.Kind, ev.JobID)
	e.raiseAndNotify(ctx, ev.HostID, alertKind, level, msg, ev.When)
}
// handleHostOffline raises KindAgentOffline for hostID once the host
// has been unseen for at least e.agentOfflineFloor.
//
// The host record is loaded from the store. If the lookup fails the
// error is logged and no alert is raised. A host whose last_seen_at is
// nil (enrolled but never connected) is skipped so that enrolment does
// not raise an alert immediately.
func (e *Engine) handleHostOffline(ctx context.Context, hostID string) {
	host, err := e.store.GetHost(ctx, hostID)
	if err != nil {
		// Best-effort: log and move on, matching how tick reports
		// store failures, instead of dropping the error silently.
		slog.Warn("alert: host offline get host", "host_id", hostID, "err", err)
		return
	}
	if host.LastSeenAt == nil {
		return
	}

	// Take a single clock reading so the threshold check, the message
	// and the alert timestamp all describe the same instant.
	now := time.Now().UTC()
	offlineFor := now.Sub(*host.LastSeenAt)
	if offlineFor < e.agentOfflineFloor {
		return
	}

	e.raiseAndNotify(ctx, hostID, KindAgentOffline, "warning",
		fmt.Sprintf("Agent offline for %s (threshold %s)",
			roundDur(offlineFor), e.agentOfflineFloor),
		now)
}
// handleHostOnline asks resolveAndNotify to clear KindAgentOffline for
// hostID, stamped with the current UTC time.
func (e *Engine) handleHostOnline(ctx context.Context, hostID string) {
	resolvedAt := time.Now().UTC()
	e.resolveAndNotify(ctx, hostID, KindAgentOffline, resolvedAt)
}
// tick is the periodic (60-second) sweep, evaluated against the
// supplied now. It covers two responsibilities:
//
//  1. Agent-offline re-evaluation: every host listed by the store with
//     status "offline" and a non-nil last_seen_at is checked, and
//     KindAgentOffline is raised for those that have reached
//     e.agentOfflineFloor since they were last seen. This catches
//     hosts that crossed the floor between explicit events.
//  2. Stale-schedule detection: declared in the spec but deliberately
//     a no-op in v1, because the "expected to have fired but didn't"
//     trigger needs a store helper that lands in a later task.
//     KindStaleSchedule is exported so UI code can already reference
//     the tag string.
//
// If listing hosts fails, the error is logged and the sweep is skipped.
func (e *Engine) tick(ctx context.Context, now time.Time) {
	hosts, err := e.store.ListHosts(ctx)
	if err != nil {
		slog.Warn("alert: tick list hosts", "err", err)
		return
	}

	for _, host := range hosts {
		// Hosts that never connected or are not offline are not candidates.
		if host.LastSeenAt == nil || host.Status != "offline" {
			continue
		}
		elapsed := now.Sub(*host.LastSeenAt)
		if elapsed < e.agentOfflineFloor {
			continue
		}
		msg := fmt.Sprintf("Agent offline for %s (threshold %s)",
			roundDur(elapsed), e.agentOfflineFloor)
		e.raiseAndNotify(ctx, host.ID, KindAgentOffline, "warning", msg, now)
	}

	// Stale-schedule sweep: no-op in v1 (see the function comment).
}
// roundDur formats d for use in alert messages. Anything shorter than
// one minute (negative durations included) is reported as "less than a
// minute"; longer durations are rounded to the nearest minute and
// rendered with time.Duration's String form, e.g. "15m0s".
func roundDur(d time.Duration) string {
	if d >= time.Minute {
		rounded := d.Round(time.Minute)
		return rounded.String()
	}
	return "less than a minute"
}