restic-manager/internal/alert/engine.go

// Package alert evaluates the hardcoded rule set and persists raises
// / acknowledges / resolves. Three event sources feed it:
//   - JobFinishedEvent — pushed when a job lands a terminal state
//     (the existing MarkJobFinished site)
//   - host offline / online notifications — pushed via
//     NotifyHostOffline / NotifyHostOnline by the offline sweeper and
//     by the ws hello handler
//   - 60s ticker (internal) — drives stale-schedule + auto-resolve
//
// All output goes through store.RaiseOrTouch / Acknowledge / Resolve
// and the notification.Hub. The engine runs as a single goroutine
// started at boot; hot paths hand events to it with non-blocking sends.
package alert

import (
	"context"
	"fmt"
	"log/slog"
	"sync"
	"time"

	"gitea.dcglab.co.uk/steve/restic-manager/internal/notification"
	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
)

// staleBackupThreshold (7 days) is how long an intermittent host may go
// without a successful backup before the periodic sweep raises a
// stale_schedule alert. It is a single global constant in v1 (may
// become per-host later). Only intermittent hosts (AlwaysOn == false)
// are measured against it — always-on hosts' stale_schedule stays a
// no-op. The value is also rendered into the alert message as the
// threshold.
const staleBackupThreshold = 7 * 24 * time.Hour

// JobFinishedEvent carries everything the engine needs to evaluate
// the failed-X rules. Pushed via Engine.NotifyJobFinished from the
// MarkJobFinished site.
type JobFinishedEvent struct {
	// HostID is the host the job ran on; alerts are raised and resolved
	// against this id.
	HostID        string
	// JobID is only used to build the human-readable alert message.
	JobID         string
	// Kind selects the alert rule. Only backup, forget, prune and check
	// map to an alert; every other kind is ignored by the engine.
	Kind          string // backup | forget | prune | check | unlock | restore | diff
	// Status decides the action: failed raises, succeeded resolves, any
	// other value (e.g. cancelled) does nothing.
	Status        string // succeeded | failed | cancelled
	// SourceGroupID is used as the alert dedup key for backup jobs only;
	// the engine ignores it for every other kind (those alerts use an
	// empty key, i.e. one alert per host per kind).
	SourceGroupID string
	// When is the timestamp passed along with the raise / resolve.
	When          time.Time
}

// Engine evaluates hardcoded alert rules and dispatches via notification.Hub.
// Build it with NewEngine and drive it with Run; the Notify* methods
// are the entry points for external events.
type Engine struct {
	// store is used for host / schedule lookups and the periodic
	// housekeeping queries.
	store *store.Store
	hub   *notification.Hub

	// Buffered event queues drained by Run. The Notify* methods send to
	// them without blocking and drop the event when a queue is full.
	jobs     chan JobFinishedEvent
	hostDown chan string // host_id
	hostUp   chan string

	// agentOfflineFloor is the duration a host must be offline before
	// we raise. Configurable for tests; default 15m.
	agentOfflineFloor time.Duration
	// tickPeriod is the interval of the ticker that drives the periodic
	// sweep in Run. NewEngine sets it to 60s.
	tickPeriod        time.Duration

	// done is closed (at most once, guarded by closeOnce) when Run
	// observes context cancellation.
	closeOnce sync.Once
	done      chan struct{}
}

// NewEngine wires an Engine to the given store and notification hub.
// Each of the three event queues is created with room for 32 pending
// events. agentOfflineFloor is initialised to 15 minutes and
// tickPeriod to 60 seconds; neither is a parameter here, so code that
// needs different values (e.g. in-package tests) overrides the fields
// after construction. The returned engine is idle until Run is called.
func NewEngine(st *store.Store, hub *notification.Hub) *Engine {
	const queueDepth = 32

	e := new(Engine)
	e.store = st
	e.hub = hub
	e.jobs = make(chan JobFinishedEvent, queueDepth)
	e.hostDown = make(chan string, queueDepth)
	e.hostUp = make(chan string, queueDepth)
	e.agentOfflineFloor = 15 * time.Minute
	e.tickPeriod = 60 * time.Second
	e.done = make(chan struct{})
	return e
}

// Run is the engine's event loop. It blocks until ctx is cancelled, so
// callers start it in its own goroutine. Every iteration services
// exactly one ready source: a ticker fire (every tickPeriod), a
// host-online or host-offline notification, or a finished-job event.
// When several sources are ready at once, select picks among them at
// random, so the order of the cases below carries no priority. On
// cancellation the done channel is closed (once) and Run returns; the
// ticker is stopped on the way out.
func (e *Engine) Run(ctx context.Context) {
	ticker := time.NewTicker(e.tickPeriod)
	defer ticker.Stop()

	cancelled := ctx.Done()
	for {
		select {
		case at := <-ticker.C:
			e.tick(ctx, at)
		case id := <-e.hostUp:
			e.handleHostOnline(ctx, id)
		case id := <-e.hostDown:
			e.handleHostOffline(ctx, id)
		case job := <-e.jobs:
			e.handleJobFinished(ctx, job)
		case <-cancelled:
			e.closeOnce.Do(func() { close(e.done) })
			return
		}
	}
}

// NotifyJobFinished queues a finished-job event for the engine. It is
// meant to be called from the hot path that records a job's terminal
// state (the MarkJobFinished call site), so it never blocks: when the
// jobs queue is full the event is dropped and a warning is logged.
func (e *Engine) NotifyJobFinished(ev JobFinishedEvent) {
	select {
	case e.jobs <- ev:
		return
	default:
		// Queue is full; fall through and report the drop.
	}
	slog.Warn("alert: jobs channel full; dropping event", "kind", ev.Kind, "host_id", ev.HostID)
}

// NotifyHostOffline queues a "host went offline" notification for the
// engine. The send never blocks: if the hostDown queue is full the
// notification is dropped and a warning is logged.
func (e *Engine) NotifyHostOffline(hostID string) {
	select {
	case e.hostDown <- hostID:
		return
	default:
		// Queue is full; fall through and report the drop.
	}
	slog.Warn("alert: hostDown channel full; dropping", "host_id", hostID)
}

// NotifyHostOnline queues a "host came online" notification for the
// engine. The send never blocks: if the hostUp queue is full the
// notification is dropped and a warning is logged.
func (e *Engine) NotifyHostOnline(hostID string) {
	select {
	case e.hostUp <- hostID:
		return
	default:
		// Queue is full; fall through and report the drop.
	}
	slog.Warn("alert: hostUp channel full; dropping", "host_id", hostID)
}

// handleJobFinished applies the failed-X rules to one finished job.
//
// Only backup, forget, prune and check jobs map to an alert kind
// (check is "critical", the rest "warning"); any other job kind is
// ignored. A failed job raises the alert, a succeeded job resolves it,
// and any other status (e.g. cancelled) leaves alert state untouched.
// A succeeded backup additionally resolves the host's stale_schedule
// alert, since a fresh backup clears staleness.
func (e *Engine) handleJobFinished(ctx context.Context, ev JobFinishedEvent) {
	isBackup := ev.Kind == "backup"

	var alertKind, sev string
	switch {
	case isBackup:
		alertKind, sev = KindBackupFailed, "warning"
	case ev.Kind == "forget":
		alertKind, sev = KindForgetFailed, "warning"
	case ev.Kind == "prune":
		alertKind, sev = KindPruneFailed, "warning"
	case ev.Kind == "check":
		alertKind, sev = KindCheckFailed, "critical"
	default:
		// No rule for this job kind in v1.
		return
	}

	// The dedup key narrows the alert to one subject. Backups are keyed
	// by source group (each group is its own restic run and so its own
	// failure surface). forget / prune / check describe the repository
	// as a whole, so they keep an empty key: one alert per host per
	// kind.
	var subject string
	if isBackup {
		subject = ev.SourceGroupID
	}

	if ev.Status == "failed" {
		msg := fmt.Sprintf("%s job %s failed", ev.Kind, ev.JobID)
		e.raiseAndNotify(ctx, ev.HostID, alertKind, subject, sev, msg, ev.When)
		return
	}
	if ev.Status != "succeeded" {
		return
	}

	e.resolveAndNotify(ctx, ev.HostID, alertKind, subject, ev.When)
	if isBackup {
		// A fresh backup also clears any stale_schedule alert.
		e.resolveAndNotify(ctx, ev.HostID, KindStaleSchedule, "", ev.When)
	}
}

// handleHostOffline reacts to a host-offline notification and raises
// an agent_offline warning when all of the following hold:
//   - the host can be loaded from the store (a lookup failure is
//     logged and the notification is dropped),
//   - the host is always-on,
//   - it has a last_seen_at, and
//   - that timestamp is at least agentOfflineFloor in the past.
//
// The elapsed time is computed once, so the floor check, the message
// text and the alert timestamp all describe the same instant.
func (e *Engine) handleHostOffline(ctx context.Context, hostID string) {
	host, err := e.store.GetHost(ctx, hostID)
	if err != nil {
		// Without the host row we cannot evaluate the rule. Surface the
		// failure instead of dropping the notification silently.
		slog.Warn("alert: host offline lookup", "host_id", hostID, "err", err)
		return
	}
	// Intermittent hosts (laptops) legitimately disappear — never raise
	// agent_offline for them. The stale_schedule sweep in tick() is the
	// only staleness signal for these hosts.
	if !host.AlwaysOn {
		return
	}
	// A nil last_seen_at (host enrolled but never connected) gives us no
	// baseline to measure the floor against, so we do not raise
	// immediately on enrolment.
	if host.LastSeenAt == nil {
		return
	}
	now := time.Now().UTC()
	offlineFor := now.Sub(*host.LastSeenAt)
	if offlineFor < e.agentOfflineFloor {
		return
	}
	e.raiseAndNotify(ctx, hostID, KindAgentOffline, "", "warning",
		fmt.Sprintf("Agent offline for %s (threshold %s)",
			roundDur(offlineFor), e.agentOfflineFloor),
		now)
}

// handleHostOnline reacts to a host-online notification by asking
// resolveAndNotify to clear the host's agent_offline alert (empty
// dedup key), stamped with the current UTC time.
func (e *Engine) handleHostOnline(ctx context.Context, hostID string) {
	resolvedAt := time.Now().UTC()
	e.resolveAndNotify(ctx, hostID, KindAgentOffline, "", resolvedAt)
}

// tick is the periodic sweep, run once per tickPeriod (60s by
// default). Responsibilities:
//  1. Housekeeping piggy-backed on this loop: expired setup tokens are
//     cleaned up as of now, and OIDC state is cleaned up with a cutoff
//     five minutes in the past. Failures are logged and do not stop
//     the sweep.
//  2. Intermittent hosts (AlwaysOn == false): raise stale_schedule when
//     the last backup is at least staleBackupThreshold old and the
//     host has at least one enabled schedule. agent_offline is never
//     raised for these hosts.
//  3. Always-on hosts: raise agent_offline for every host whose status
//     is "offline" and whose last_seen_at is at least
//     agentOfflineFloor old, catching hosts that crossed the floor
//     between explicit offline events.
//
// now is normalised to UTC so that alerts raised here carry timestamps
// in the same zone as those raised by the event handlers.
func (e *Engine) tick(ctx context.Context, now time.Time) {
	now = now.UTC()

	// User-management cleanup piggy-backed here for now. Setup tokens
	// have a 1h expiry; the alert engine tick is the cheapest existing
	// 60s loop. If more housekeeping queries appear, extract a
	// dedicated maintenance loop.
	if _, err := e.store.CleanupExpiredSetupTokens(ctx, now); err != nil {
		slog.Warn("alert: cleanup expired setup tokens", "err", err)
	}
	if _, err := e.store.CleanupExpiredOIDCState(ctx, now.Add(-5*time.Minute)); err != nil {
		slog.Warn("alert: cleanup expired oidc state", "err", err)
	}

	hosts, err := e.store.ListHosts(ctx)
	if err != nil {
		slog.Warn("alert: tick list hosts", "err", err)
		return
	}
	for _, h := range hosts {
		// Stop early on shutdown rather than issuing (and logging) one
		// doomed store call per remaining host.
		if ctx.Err() != nil {
			return
		}
		// Intermittent hosts: suppress agent_offline entirely; instead
		// raise stale_schedule when they have gone too long with no
		// successful backup AND they have at least one enabled schedule
		// to be measured against. A nil LastBackupAt (never backed up)
		// has no baseline — onboarding/repo_status covers that case.
		if !h.AlwaysOn {
			if h.LastBackupAt == nil {
				continue
			}
			sinceBackup := now.Sub(*h.LastBackupAt)
			if sinceBackup < staleBackupThreshold {
				continue
			}
			hasEnabled, err := e.hostHasEnabledSchedule(ctx, h.ID)
			if err != nil {
				// Skip this host for this tick, but leave a trace: a
				// persistent lookup failure would otherwise hide stale
				// hosts indefinitely.
				slog.Warn("alert: tick list schedules", "host_id", h.ID, "err", err)
				continue
			}
			if !hasEnabled {
				continue
			}
			e.raiseAndNotify(ctx, h.ID, KindStaleSchedule, "", "warning",
				fmt.Sprintf("No backup in %s (threshold %s)",
					roundDur(sinceBackup), staleBackupThreshold), now)
			continue
		}
		// Always-on hosts: agent_offline re-evaluation.
		if h.Status != "offline" || h.LastSeenAt == nil {
			continue
		}
		if offlineFor := now.Sub(*h.LastSeenAt); offlineFor >= e.agentOfflineFloor {
			e.raiseAndNotify(ctx, h.ID, KindAgentOffline, "", "warning",
				fmt.Sprintf("Agent offline for %s (threshold %s)",
					roundDur(offlineFor), e.agentOfflineFloor), now)
		}
	}
}

// roundDur formats d for use in alert messages. Anything shorter than
// one minute (including zero and negative durations) is reported as
// "less than a minute"; everything else is rounded to the nearest
// minute and rendered with time.Duration's String form, e.g. "15m0s"
// or "168h0m0s".
func roundDur(d time.Duration) string {
	switch {
	case d < time.Minute:
		return "less than a minute"
	default:
		rounded := d.Round(time.Minute)
		return rounded.String()
	}
}

// hostHasEnabledSchedule reports whether at least one of the schedules
// the store lists for hostID is enabled. That is the precondition for
// a stale_schedule alert: with no enabled schedule there is no backup
// expectation to measure against. A store error is returned unchanged
// together with false.
// NOTE(review): any enabled schedule counts; this assumes the host's
// schedules are all backup schedules — confirm against the store.
func (e *Engine) hostHasEnabledSchedule(ctx context.Context, hostID string) (bool, error) {
	list, err := e.store.ListSchedulesByHost(ctx, hostID)
	if err != nil {
		return false, err
	}
	for i := range list {
		if !list[i].Enabled {
			continue
		}
		return true, nil
	}
	return false, nil
}