alert: rule logic for the six v1 rules
This commit is contained in:
@@ -0,0 +1,125 @@
|
||||
package alert
|
||||
|
||||
import (
|
||||
"context"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/oklog/ulid/v2"
|
||||
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/crypto"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/notification"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||
)
|
||||
|
||||
func setupEngine(t *testing.T) (*Engine, *store.Store, string) {
|
||||
t.Helper()
|
||||
dir := t.TempDir()
|
||||
st, _ := store.Open(context.Background(), filepath.Join(dir, "rm.db"))
|
||||
t.Cleanup(func() { _ = st.Close() })
|
||||
keyPath := filepath.Join(dir, "secret.key")
|
||||
_ = crypto.GenerateKeyFile(keyPath)
|
||||
key, _ := crypto.LoadKeyFromFile(keyPath)
|
||||
aead, _ := crypto.NewAEAD(key)
|
||||
hub := notification.NewHub(st, aead, "https://rm.example")
|
||||
eng := NewEngine(st, hub)
|
||||
hostID := ulid.Make().String()
|
||||
if err := st.CreateHost(context.Background(), store.Host{
|
||||
ID: hostID, Name: "alfa-01", OS: "linux", Arch: "amd64",
|
||||
EnrolledAt: time.Now().UTC(),
|
||||
}, "deadbeef", ""); err != nil {
|
||||
t.Fatalf("create host: %v", err)
|
||||
}
|
||||
return eng, st, hostID
|
||||
}
|
||||
|
||||
func TestEngineBackupFailedRaisesThenResolves(t *testing.T) {
|
||||
t.Parallel()
|
||||
eng, st, hostID := setupEngine(t)
|
||||
ctx := context.Background()
|
||||
|
||||
eng.handleJobFinished(ctx, JobFinishedEvent{
|
||||
HostID: hostID, JobID: "j1", Kind: "backup", Status: "failed",
|
||||
When: time.Now().UTC(),
|
||||
})
|
||||
open, _ := st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
|
||||
if len(open) != 1 || open[0].Kind != KindBackupFailed {
|
||||
t.Fatalf("expected one backup_failed open; got %+v", open)
|
||||
}
|
||||
|
||||
// Second failed job should TOUCH (not raise a fresh row).
|
||||
eng.handleJobFinished(ctx, JobFinishedEvent{
|
||||
HostID: hostID, JobID: "j2", Kind: "backup", Status: "failed",
|
||||
When: time.Now().UTC().Add(time.Minute),
|
||||
})
|
||||
open, _ = st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
|
||||
if len(open) != 1 {
|
||||
t.Fatalf("expected dedup to stay at 1 open; got %d", len(open))
|
||||
}
|
||||
|
||||
// Success auto-resolves.
|
||||
eng.handleJobFinished(ctx, JobFinishedEvent{
|
||||
HostID: hostID, JobID: "j3", Kind: "backup", Status: "succeeded",
|
||||
When: time.Now().UTC().Add(2 * time.Minute),
|
||||
})
|
||||
open, _ = st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
|
||||
if len(open) != 0 {
|
||||
t.Fatalf("expected zero open after success; got %d", len(open))
|
||||
}
|
||||
}
|
||||
|
||||
func TestEngineCheckFailedSeverityCritical(t *testing.T) {
|
||||
t.Parallel()
|
||||
eng, st, hostID := setupEngine(t)
|
||||
eng.handleJobFinished(context.Background(), JobFinishedEvent{
|
||||
HostID: hostID, Kind: "check", Status: "failed", When: time.Now().UTC(),
|
||||
})
|
||||
open, _ := st.ListAlerts(context.Background(),
|
||||
store.AlertFilter{Status: "open", HostID: hostID})
|
||||
if len(open) != 1 || open[0].Severity != "critical" {
|
||||
t.Fatalf("got %+v", open)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEngineAgentOfflineRespects15MinFloor(t *testing.T) {
|
||||
t.Parallel()
|
||||
eng, st, hostID := setupEngine(t)
|
||||
// Host's last_seen_at defaulted to NULL via CreateHost (enrolled but never
|
||||
// seen). Force a stale value for the test by direct DB update.
|
||||
if _, err := st.DB().Exec(
|
||||
`UPDATE hosts SET last_seen_at = ? WHERE id = ?`,
|
||||
time.Now().UTC().Add(-20*time.Minute).Format(time.RFC3339Nano), hostID,
|
||||
); err != nil {
|
||||
t.Fatalf("update last_seen_at: %v", err)
|
||||
}
|
||||
eng.handleHostOffline(context.Background(), hostID)
|
||||
open, _ := st.ListAlerts(context.Background(),
|
||||
store.AlertFilter{Status: "open", HostID: hostID})
|
||||
if len(open) != 1 {
|
||||
t.Fatalf("expected agent_offline raised; got %d", len(open))
|
||||
}
|
||||
|
||||
// Bring back online — should auto-resolve.
|
||||
eng.handleHostOnline(context.Background(), hostID)
|
||||
open, _ = st.ListAlerts(context.Background(),
|
||||
store.AlertFilter{Status: "open", HostID: hostID})
|
||||
if len(open) != 0 {
|
||||
t.Fatalf("expected agent_offline resolved; got %d", len(open))
|
||||
}
|
||||
}
|
||||
|
||||
func TestEngineAgentOfflineUnderFloorNoRaise(t *testing.T) {
|
||||
t.Parallel()
|
||||
eng, st, hostID := setupEngine(t)
|
||||
// last_seen_at is NULL from CreateHost (never touched). A nil
|
||||
// last_seen_at means the host was enrolled but never connected —
|
||||
// treat that as "now" for the floor check so we don't raise
|
||||
// immediately. handleHostOffline must skip the raise.
|
||||
eng.handleHostOffline(context.Background(), hostID)
|
||||
open, _ := st.ListAlerts(context.Background(),
|
||||
store.AlertFilter{Status: "open", HostID: hostID})
|
||||
if len(open) != 0 {
|
||||
t.Fatalf("expected no raise within 15-min floor; got %d", len(open))
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user