alert: rule logic for the six v1 rules
This commit is contained in:
@@ -13,6 +13,7 @@ package alert
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"fmt"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
@@ -116,19 +117,89 @@ func (e *Engine) NotifyHostOnline(hostID string) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Placeholder method stubs for C2 implementation
|
|
||||||
func (e *Engine) handleJobFinished(ctx context.Context, ev JobFinishedEvent) {
|
func (e *Engine) handleJobFinished(ctx context.Context, ev JobFinishedEvent) {
|
||||||
// Implemented in C2
|
// Determine which kind/severity pair this job maps to. Jobs not
|
||||||
|
// listed here (init, unlock, restore, diff) produce no alerts in v1.
|
||||||
|
var kind, severity string
|
||||||
|
switch ev.Kind {
|
||||||
|
case "backup":
|
||||||
|
kind, severity = KindBackupFailed, "warning"
|
||||||
|
case "forget":
|
||||||
|
kind, severity = KindForgetFailed, "warning"
|
||||||
|
case "prune":
|
||||||
|
kind, severity = KindPruneFailed, "warning"
|
||||||
|
case "check":
|
||||||
|
kind, severity = KindCheckFailed, "critical"
|
||||||
|
default:
|
||||||
|
return
|
||||||
|
}
|
||||||
|
switch ev.Status {
|
||||||
|
case "failed":
|
||||||
|
e.raiseAndNotify(ctx, ev.HostID, kind, severity,
|
||||||
|
fmt.Sprintf("%s job %s failed", ev.Kind, ev.JobID), ev.When)
|
||||||
|
case "succeeded":
|
||||||
|
e.resolveAndNotify(ctx, ev.HostID, kind, ev.When)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (e *Engine) handleHostOffline(ctx context.Context, hostID string) {
|
func (e *Engine) handleHostOffline(ctx context.Context, hostID string) {
|
||||||
// Implemented in C2
|
host, err := e.store.GetHost(ctx, hostID)
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
// Apply the 15-min floor — raise only when last_seen_at is older
|
||||||
|
// than agentOfflineFloor. A nil last_seen_at (host enrolled but
|
||||||
|
// never connected) is treated as "now" so we don't raise
|
||||||
|
// immediately on enrolment.
|
||||||
|
if host.LastSeenAt == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if time.Since(*host.LastSeenAt) < e.agentOfflineFloor {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
e.raiseAndNotify(ctx, hostID, KindAgentOffline, "warning",
|
||||||
|
fmt.Sprintf("Agent offline for %s (threshold %s)",
|
||||||
|
roundDur(time.Since(*host.LastSeenAt)), e.agentOfflineFloor),
|
||||||
|
time.Now().UTC())
|
||||||
}
|
}
|
||||||
|
|
||||||
func (e *Engine) handleHostOnline(ctx context.Context, hostID string) {
|
func (e *Engine) handleHostOnline(ctx context.Context, hostID string) {
|
||||||
// Implemented in C2
|
e.resolveAndNotify(ctx, hostID, KindAgentOffline, time.Now().UTC())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// tick is the 60-second sweep. Responsibilities:
|
||||||
|
// 1. Re-evaluate agent_offline for every offline host that may have
|
||||||
|
// crossed the floor between explicit events.
|
||||||
|
// 2. Stale-schedule detection — declared in the spec but intentionally
|
||||||
|
// left as a no-op in v1. The precise "expected to have fired but
|
||||||
|
// didn't" trigger requires a store helper that lands in a later
|
||||||
|
// task. The KindStaleSchedule constant is exported so UI code can
|
||||||
|
// reference the tag string today.
|
||||||
func (e *Engine) tick(ctx context.Context, now time.Time) {
|
func (e *Engine) tick(ctx context.Context, now time.Time) {
|
||||||
// Implemented in C2
|
hosts, err := e.store.ListHosts(ctx)
|
||||||
|
if err != nil {
|
||||||
|
slog.Warn("alert: tick list hosts", "err", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, h := range hosts {
|
||||||
|
if h.Status != "offline" || h.LastSeenAt == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if now.Sub(*h.LastSeenAt) >= e.agentOfflineFloor {
|
||||||
|
e.raiseAndNotify(ctx, h.ID, KindAgentOffline, "warning",
|
||||||
|
fmt.Sprintf("Agent offline for %s (threshold %s)",
|
||||||
|
roundDur(now.Sub(*h.LastSeenAt)), e.agentOfflineFloor), now)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Stale-schedule sweep — no-op in v1. See KindStaleSchedule doc comment.
|
||||||
|
}
|
||||||
|
|
||||||
|
// roundDur returns a human-readable duration string, rounding to the
|
||||||
|
// nearest minute. Durations under a minute are reported as "less than
|
||||||
|
// a minute".
|
||||||
|
func roundDur(d time.Duration) string {
|
||||||
|
if d < time.Minute {
|
||||||
|
return "less than a minute"
|
||||||
|
}
|
||||||
|
return d.Round(time.Minute).String()
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,110 @@
|
|||||||
|
package alert
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"log/slog"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/notification"
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Alert kind constants — keep in lockstep with the engine logic and
|
||||||
|
// the UI tag-colour table.
|
||||||
|
const (
|
||||||
|
// KindBackupFailed is raised when a backup job finishes with
|
||||||
|
// status "failed" and resolved on next backup success.
|
||||||
|
KindBackupFailed = "backup_failed"
|
||||||
|
|
||||||
|
// KindForgetFailed mirrors KindBackupFailed for forget jobs.
|
||||||
|
KindForgetFailed = "forget_failed"
|
||||||
|
|
||||||
|
// KindPruneFailed mirrors KindBackupFailed for prune jobs.
|
||||||
|
KindPruneFailed = "prune_failed"
|
||||||
|
|
||||||
|
// KindCheckFailed is raised at "critical" severity (repository
|
||||||
|
// integrity is at risk) when a check job fails.
|
||||||
|
KindCheckFailed = "check_failed"
|
||||||
|
|
||||||
|
// KindStaleSchedule is declared for completeness but intentionally
|
||||||
|
// left as a no-op in v1. The precise "expected to have fired but
|
||||||
|
// didn't" logic requires a store helper that lands in a follow-up
|
||||||
|
// task. Ask the team before implementing.
|
||||||
|
KindStaleSchedule = "stale_schedule"
|
||||||
|
|
||||||
|
// KindAgentOffline is raised when a host's last_seen_at is older
|
||||||
|
// than the 15-minute floor and resolved when the host reconnects.
|
||||||
|
KindAgentOffline = "agent_offline"
|
||||||
|
)
|
||||||
|
|
||||||
|
// raiseAndNotify is the standard raise pattern: store.RaiseOrTouch
|
||||||
|
// deduplicates, and notification.Hub.Dispatch fires only on the first
|
||||||
|
// raise (didRaise=true). Subsequent occurrences of the same open alert
|
||||||
|
// are "touched" (last_seen_at bumped) without a second notification.
|
||||||
|
func (e *Engine) raiseAndNotify(ctx context.Context, hostID, kind, severity, message string, when time.Time) {
|
||||||
|
id, didRaise, err := e.store.RaiseOrTouch(ctx, hostID, kind, severity, message, when)
|
||||||
|
if err != nil {
|
||||||
|
slog.Warn("alert: raise", "kind", kind, "host_id", hostID, "err", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if !didRaise {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
host, err := e.store.GetHost(ctx, hostID)
|
||||||
|
hostName := hostID
|
||||||
|
if err == nil {
|
||||||
|
hostName = host.Name
|
||||||
|
}
|
||||||
|
go e.hub.Dispatch(ctx, notification.Payload{
|
||||||
|
Event: notification.EventRaised,
|
||||||
|
AlertID: id,
|
||||||
|
Severity: severity,
|
||||||
|
Kind: kind,
|
||||||
|
HostID: hostID,
|
||||||
|
HostName: hostName,
|
||||||
|
Message: message,
|
||||||
|
RaisedAt: when,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// resolveAndNotify clears every open (or acknowledged) alert for
|
||||||
|
// (host_id, kind) via store.AutoResolve, then fires alert.resolved
|
||||||
|
// for each row that was actually open. Best-effort — errors are
|
||||||
|
// logged but do not propagate.
|
||||||
|
func (e *Engine) resolveAndNotify(ctx context.Context, hostID, kind string, when time.Time) {
|
||||||
|
open, err := e.store.ListAlerts(ctx, store.AlertFilter{
|
||||||
|
Status: "open", HostID: hostID,
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
openAcked, _ := e.store.ListAlerts(ctx, store.AlertFilter{
|
||||||
|
Status: "acknowledged", HostID: hostID,
|
||||||
|
})
|
||||||
|
all := append(open, openAcked...)
|
||||||
|
if err := e.store.AutoResolve(ctx, hostID, kind, when); err != nil {
|
||||||
|
slog.Warn("alert: auto-resolve", "kind", kind, "host_id", hostID, "err", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
host, _ := e.store.GetHost(ctx, hostID)
|
||||||
|
hostName := hostID
|
||||||
|
if host != nil {
|
||||||
|
hostName = host.Name
|
||||||
|
}
|
||||||
|
for _, a := range all {
|
||||||
|
if a.Kind != kind {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
go e.hub.Dispatch(ctx, notification.Payload{
|
||||||
|
Event: notification.EventResolved,
|
||||||
|
AlertID: a.ID,
|
||||||
|
Severity: a.Severity,
|
||||||
|
Kind: a.Kind,
|
||||||
|
HostID: hostID,
|
||||||
|
HostName: hostName,
|
||||||
|
Message: fmt.Sprintf("Auto-resolved (%s)", kind),
|
||||||
|
RaisedAt: when,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,125 @@
|
|||||||
|
package alert
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/oklog/ulid/v2"
|
||||||
|
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/crypto"
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/notification"
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||||
|
)
|
||||||
|
|
||||||
|
func setupEngine(t *testing.T) (*Engine, *store.Store, string) {
|
||||||
|
t.Helper()
|
||||||
|
dir := t.TempDir()
|
||||||
|
st, _ := store.Open(context.Background(), filepath.Join(dir, "rm.db"))
|
||||||
|
t.Cleanup(func() { _ = st.Close() })
|
||||||
|
keyPath := filepath.Join(dir, "secret.key")
|
||||||
|
_ = crypto.GenerateKeyFile(keyPath)
|
||||||
|
key, _ := crypto.LoadKeyFromFile(keyPath)
|
||||||
|
aead, _ := crypto.NewAEAD(key)
|
||||||
|
hub := notification.NewHub(st, aead, "https://rm.example")
|
||||||
|
eng := NewEngine(st, hub)
|
||||||
|
hostID := ulid.Make().String()
|
||||||
|
if err := st.CreateHost(context.Background(), store.Host{
|
||||||
|
ID: hostID, Name: "alfa-01", OS: "linux", Arch: "amd64",
|
||||||
|
EnrolledAt: time.Now().UTC(),
|
||||||
|
}, "deadbeef", ""); err != nil {
|
||||||
|
t.Fatalf("create host: %v", err)
|
||||||
|
}
|
||||||
|
return eng, st, hostID
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestEngineBackupFailedRaisesThenResolves(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
eng, st, hostID := setupEngine(t)
|
||||||
|
ctx := context.Background()
|
||||||
|
|
||||||
|
eng.handleJobFinished(ctx, JobFinishedEvent{
|
||||||
|
HostID: hostID, JobID: "j1", Kind: "backup", Status: "failed",
|
||||||
|
When: time.Now().UTC(),
|
||||||
|
})
|
||||||
|
open, _ := st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
|
||||||
|
if len(open) != 1 || open[0].Kind != KindBackupFailed {
|
||||||
|
t.Fatalf("expected one backup_failed open; got %+v", open)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Second failed job should TOUCH (not raise a fresh row).
|
||||||
|
eng.handleJobFinished(ctx, JobFinishedEvent{
|
||||||
|
HostID: hostID, JobID: "j2", Kind: "backup", Status: "failed",
|
||||||
|
When: time.Now().UTC().Add(time.Minute),
|
||||||
|
})
|
||||||
|
open, _ = st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
|
||||||
|
if len(open) != 1 {
|
||||||
|
t.Fatalf("expected dedup to stay at 1 open; got %d", len(open))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Success auto-resolves.
|
||||||
|
eng.handleJobFinished(ctx, JobFinishedEvent{
|
||||||
|
HostID: hostID, JobID: "j3", Kind: "backup", Status: "succeeded",
|
||||||
|
When: time.Now().UTC().Add(2 * time.Minute),
|
||||||
|
})
|
||||||
|
open, _ = st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
|
||||||
|
if len(open) != 0 {
|
||||||
|
t.Fatalf("expected zero open after success; got %d", len(open))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestEngineCheckFailedSeverityCritical(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
eng, st, hostID := setupEngine(t)
|
||||||
|
eng.handleJobFinished(context.Background(), JobFinishedEvent{
|
||||||
|
HostID: hostID, Kind: "check", Status: "failed", When: time.Now().UTC(),
|
||||||
|
})
|
||||||
|
open, _ := st.ListAlerts(context.Background(),
|
||||||
|
store.AlertFilter{Status: "open", HostID: hostID})
|
||||||
|
if len(open) != 1 || open[0].Severity != "critical" {
|
||||||
|
t.Fatalf("got %+v", open)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestEngineAgentOfflineRespects15MinFloor(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
eng, st, hostID := setupEngine(t)
|
||||||
|
// Host's last_seen_at defaulted to NULL via CreateHost (enrolled but never
|
||||||
|
// seen). Force a stale value for the test by direct DB update.
|
||||||
|
if _, err := st.DB().Exec(
|
||||||
|
`UPDATE hosts SET last_seen_at = ? WHERE id = ?`,
|
||||||
|
time.Now().UTC().Add(-20*time.Minute).Format(time.RFC3339Nano), hostID,
|
||||||
|
); err != nil {
|
||||||
|
t.Fatalf("update last_seen_at: %v", err)
|
||||||
|
}
|
||||||
|
eng.handleHostOffline(context.Background(), hostID)
|
||||||
|
open, _ := st.ListAlerts(context.Background(),
|
||||||
|
store.AlertFilter{Status: "open", HostID: hostID})
|
||||||
|
if len(open) != 1 {
|
||||||
|
t.Fatalf("expected agent_offline raised; got %d", len(open))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Bring back online — should auto-resolve.
|
||||||
|
eng.handleHostOnline(context.Background(), hostID)
|
||||||
|
open, _ = st.ListAlerts(context.Background(),
|
||||||
|
store.AlertFilter{Status: "open", HostID: hostID})
|
||||||
|
if len(open) != 0 {
|
||||||
|
t.Fatalf("expected agent_offline resolved; got %d", len(open))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestEngineAgentOfflineUnderFloorNoRaise(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
eng, st, hostID := setupEngine(t)
|
||||||
|
// last_seen_at is NULL from CreateHost (never touched). A nil
|
||||||
|
// last_seen_at means the host was enrolled but never connected —
|
||||||
|
// treat that as "now" for the floor check so we don't raise
|
||||||
|
// immediately. handleHostOffline must skip the raise.
|
||||||
|
eng.handleHostOffline(context.Background(), hostID)
|
||||||
|
open, _ := st.ListAlerts(context.Background(),
|
||||||
|
store.AlertFilter{Status: "open", HostID: hostID})
|
||||||
|
if len(open) != 0 {
|
||||||
|
t.Fatalf("expected no raise within 15-min floor; got %d", len(open))
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user