// File: restic-manager/internal/alert/engine.go

// Package alert evaluates the hardcoded rule set and persists alert
// raises, acknowledgements, and resolutions. Three event sources feed it:
//
//   - JobFinishedEvent — pushed when a job lands a terminal state
//     (the existing MarkJobFinished site)
//   - host offline / online notifications (NotifyHostOffline /
//     NotifyHostOnline) — pushed by the offline sweeper and by the ws
//     hello handler
//   - 60s ticker (internal) — drives stale-schedule + auto-resolve
//
// All output goes through store.RaiseOrTouch / Acknowledge / Resolve
// and the notification.Hub. The engine is one goroutine started at
// boot; hot paths hand events to it with non-blocking sends.
package alert
import (
"context"
"log/slog"
"sync"
"time"
"gitea.dcglab.co.uk/steve/restic-manager/internal/notification"
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
)
// JobFinishedEvent is the payload handed to Engine.NotifyJobFinished
// from the MarkJobFinished site. It holds what the engine needs in
// order to evaluate the failed-X rules for a finished job.
type JobFinishedEvent struct {
	// HostID and JobID name the host and the job the event refers to.
	HostID, JobID string

	// Kind is one of:
	// backup | forget | prune | check | unlock | restore | diff
	Kind string

	// Status is one of: succeeded | failed | cancelled
	Status string

	// When is the event timestamp.
	// NOTE(review): presumably the moment the job reached its terminal
	// state — confirm against the caller.
	When time.Time
}
// Engine evaluates the hardcoded alert rules and dispatches through a
// notification.Hub. Build one with NewEngine and drive it with Run.
type Engine struct {
	// Collaborators supplied to NewEngine.
	store *store.Store
	hub   *notification.Hub

	// Inbound event queues. The Notify* methods send on them without
	// blocking; Run receives from them.
	jobs             chan JobFinishedEvent
	hostDown, hostUp chan string // both carry a host_id

	// agentOfflineFloor is the duration a host must be offline before
	// we raise. Configurable for tests; default 15m.
	agentOfflineFloor time.Duration

	// tickPeriod is the interval of the ticker Run creates to invoke
	// tick. NewEngine sets it to 60s.
	tickPeriod time.Duration

	// done is closed by Run when its context is done; closeOnce makes
	// sure that close happens at most once.
	closeOnce sync.Once
	done      chan struct{}
}
// NewEngine returns an Engine wired to st and hub. The three event
// queues are buffered (32 entries each), agentOfflineFloor is set to
// 15 minutes and tickPeriod to 60 seconds. The caller still has to
// start Run for events to be processed.
func NewEngine(st *store.Store, hub *notification.Hub) *Engine {
	const (
		queueDepth   = 32
		offlineFloor = 15 * time.Minute
		tickEvery    = 60 * time.Second
	)

	e := new(Engine)
	e.store = st
	e.hub = hub
	e.jobs = make(chan JobFinishedEvent, queueDepth)
	e.hostDown = make(chan string, queueDepth)
	e.hostUp = make(chan string, queueDepth)
	e.agentOfflineFloor = offlineFloor
	e.tickPeriod = tickEvery
	e.done = make(chan struct{})
	return e
}
// Run drives the event loop. Returns when ctx is done. Blocks; call in
// its own goroutine.
//
// Each loop iteration services one ready source: a finished-job event,
// a host-offline or host-online notification, or a ticker fire (which
// invokes tick with the tick time). When ctx is done, e.done is closed
// (at most once, guarded by closeOnce) and Run returns.
//
// If e.tickPeriod is zero or negative, Run uses 60s instead, because
// time.NewTicker panics on a non-positive interval.
func (e *Engine) Run(ctx context.Context) {
	// Mirrors the tick period NewEngine installs.
	const fallbackTickPeriod = 60 * time.Second

	period := e.tickPeriod
	if period <= 0 {
		// Engine built without NewEngine, or a test zeroed the field:
		// fall back rather than let NewTicker panic.
		period = fallbackTickPeriod
	}

	t := time.NewTicker(period)
	defer t.Stop()

	for {
		select {
		case <-ctx.Done():
			e.closeOnce.Do(func() { close(e.done) })
			return
		case ev := <-e.jobs:
			e.handleJobFinished(ctx, ev)
		case hostID := <-e.hostDown:
			e.handleHostOffline(ctx, hostID)
		case hostID := <-e.hostUp:
			e.handleHostOnline(ctx, hostID)
		case now := <-t.C:
			e.tick(ctx, now)
		}
	}
}
// NotifyJobFinished queues ev for the engine. It is the hot-path hook
// called from MarkJobFinished's caller
// (ws.handler.dispatchAgentMessage) and never blocks: if the jobs
// channel has no free slot the event is dropped and a slog warning is
// emitted.
func (e *Engine) NotifyJobFinished(ev JobFinishedEvent) {
	select {
	case e.jobs <- ev:
		return // queued
	default:
	}

	// No room in the buffer: drop instead of stalling the caller.
	slog.Warn("alert: jobs channel full; dropping event", "kind", ev.Kind, "host_id", ev.HostID)
}
// NotifyHostOffline tells the engine that the host identified by
// hostID is offline. The send is non-blocking: when the hostDown
// channel is full the notification is dropped and a slog warning is
// emitted.
func (e *Engine) NotifyHostOffline(hostID string) {
	select {
	case e.hostDown <- hostID:
		return // queued
	default:
	}

	// No room in the buffer: drop instead of stalling the caller.
	slog.Warn("alert: hostDown channel full; dropping", "host_id", hostID)
}
// NotifyHostOnline tells the engine that the host identified by hostID
// is online. The send is non-blocking: when the hostUp channel is full
// the notification is dropped and a slog warning is emitted.
func (e *Engine) NotifyHostOnline(hostID string) {
	select {
	case e.hostUp <- hostID:
		return // queued
	default:
	}

	// No room in the buffer: drop instead of stalling the caller.
	slog.Warn("alert: hostUp channel full; dropping", "host_id", hostID)
}
// Placeholder method stubs for C2 implementation

// handleJobFinished is called by Run for every JobFinishedEvent
// received on e.jobs, with Run's context. It is currently an empty
// placeholder and does nothing.
func (e *Engine) handleJobFinished(ctx context.Context, ev JobFinishedEvent) {
	// Implemented in C2
}
// handleHostOffline is called by Run for every host ID received on
// e.hostDown, with Run's context. It is currently an empty placeholder
// and does nothing.
func (e *Engine) handleHostOffline(ctx context.Context, hostID string) {
	// Implemented in C2
}
// handleHostOnline is called by Run for every host ID received on
// e.hostUp, with Run's context. It is currently an empty placeholder
// and does nothing.
func (e *Engine) handleHostOnline(ctx context.Context, hostID string) {
	// Implemented in C2
}
// tick is called by Run each time its ticker fires; now is the time
// value delivered on the ticker channel. It is currently an empty
// placeholder and does nothing.
func (e *Engine) tick(ctx context.Context, now time.Time) {
	// Implemented in C2
}