diff --git a/Makefile b/Makefile index 4a1c807..2957229 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,9 @@ AGENT_BIN := $(BIN_DIR)/restic-manager-agent VERSION ?= $(shell git describe --tags --always --dirty 2>/dev/null || echo dev) COMMIT ?= $(shell git rev-parse HEAD 2>/dev/null || echo none) DATE ?= $(shell date -u +%Y-%m-%dT%H:%M:%SZ) -LDFLAGS := -s -w -X main.version=$(VERSION) -X main.commit=$(COMMIT) -X main.date=$(DATE) +VERSION_PKG := gitea.dcglab.co.uk/steve/restic-manager/internal/version +LDFLAGS := -s -w -X main.version=$(VERSION) -X main.commit=$(COMMIT) -X main.date=$(DATE) \ + -X $(VERSION_PKG).Version=$(VERSION) -X $(VERSION_PKG).Commit=$(COMMIT) GOFLAGS := -trimpath DOCKER_IMAGE ?= gitea.dcglab.co.uk/steve/restic-manager DOCKER_TAG ?= dev diff --git a/cmd/agent/main.go b/cmd/agent/main.go index d1b5041..e04dd2d 100644 --- a/cmd/agent/main.go +++ b/cmd/agent/main.go @@ -148,6 +148,7 @@ func run() error { resticBin: resticBin, resticVer: snap.ResticVersion, resticSupportsNoOwnership: resticSupportsNoOwnership, + serverURL: cfg.ServerURL, secrets: sec, scheduler: scheduler.New(), } @@ -214,6 +215,7 @@ type dispatcher struct { resticBin string resticVer string // e.g. 
"0.17.1"; empty if restic isn't installed yet resticSupportsNoOwnership bool // captured at startup from `restic restore --help` + serverURL string // base URL of the server (used by the self-update fetch) secrets *secrets.Store scheduler *scheduler.Scheduler @@ -395,10 +397,12 @@ func (d *dispatcher) handle(ctx context.Context, env api.Envelope, tx wsclient.S "up_kbps", up, "down_kbps", down) } - case api.MsgAgentUpdateAvail: - var p api.AgentUpdateAvailablePayload - _ = env.UnmarshalPayload(&p) - slog.Info("ws agent: update available", "version", p.LatestVersion, "url", p.PackageURL) + case api.MsgCommandUpdate: + var p api.CommandUpdatePayload + if err := env.UnmarshalPayload(&p); err != nil { + return fmt.Errorf("command.update: %w", err) + } + go d.runUpdate(ctx, p, tx) default: slog.Debug("ws agent: ignored message", "type", env.Type) diff --git a/cmd/agent/update_dispatch.go b/cmd/agent/update_dispatch.go new file mode 100644 index 0000000..50bdddf --- /dev/null +++ b/cmd/agent/update_dispatch.go @@ -0,0 +1,65 @@ +package main + +import ( + "context" + "fmt" + "log/slog" + "time" + + "gitea.dcglab.co.uk/steve/restic-manager/internal/agent/updater" + "gitea.dcglab.co.uk/steve/restic-manager/internal/agent/wsclient" + "gitea.dcglab.co.uk/steve/restic-manager/internal/api" +) + +// runUpdate handles a server-dispatched command.update. It logs progress +// via log.stream so the live job page captures pre-restart state, then +// calls the platform updater. On Linux the updater calls os.Exit; on +// Windows it spawns a detached helper and returns, with the agent then +// exiting. +// +// The terminal job state is set by the server, not the agent: success +// is "agent re-hellos with matching version" rather than anything the +// agent itself can assert. The only `job.finished` we send from here is +// on the failure path, before any restart attempt. 
+func (d *dispatcher) runUpdate(ctx context.Context, p api.CommandUpdatePayload, tx wsclient.Sender) { + logf := func(format string, args ...any) { + line := fmt.Sprintf(format, args...) + slog.Info("ws agent: update: " + line) + env, err := api.Marshal(api.MsgLogStream, "", api.LogStreamLine{ + JobID: p.JobID, + TS: time.Now().UTC(), + Stream: api.LogStdout, + Payload: line, + }) + if err == nil { + _ = tx.Send(env) + } + } + + startedEnv, err := api.Marshal(api.MsgJobStarted, "", api.JobStartedPayload{ + JobID: p.JobID, + Kind: api.JobUpdate, + StartedAt: time.Now().UTC(), + }) + if err == nil { + _ = tx.Send(startedEnv) + } + + logf("fetching new binary from %s", d.serverURL) + if err := updater.Update(ctx, d.serverURL); err != nil { + logf("update failed: %v", err) + finishedEnv, mErr := api.Marshal(api.MsgJobFinished, "", api.JobFinishedPayload{ + JobID: p.JobID, + Status: api.JobFailed, + FinishedAt: time.Now().UTC(), + Error: err.Error(), + }) + if mErr == nil { + _ = tx.Send(finishedEnv) + } + return + } + // Unreachable on Linux (Update calls os.Exit). On Windows control + // returns here while the detached helper does the swap-and-restart; + // the agent then exits cleanly so SCM hands off. 
+} diff --git a/cmd/server/main.go b/cmd/server/main.go index 8d52bb8..dcd0d38 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -17,6 +17,7 @@ import ( "gitea.dcglab.co.uk/steve/restic-manager/internal/crypto" "gitea.dcglab.co.uk/steve/restic-manager/internal/notification" "gitea.dcglab.co.uk/steve/restic-manager/internal/server/config" + "gitea.dcglab.co.uk/steve/restic-manager/internal/server/fleetupdate" rmhttp "gitea.dcglab.co.uk/steve/restic-manager/internal/server/http" "gitea.dcglab.co.uk/steve/restic-manager/internal/server/maintenance" "gitea.dcglab.co.uk/steve/restic-manager/internal/server/oidc" @@ -91,6 +92,7 @@ func run() error { notifHub := notification.NewHub(st, aead, cfg.BaseURL) alertEngine := alert.NewEngine(st, notifHub) + updateWatcher := ws.NewUpdateWatcher(st, alertEngine) renderer, err := ui.New() if err != nil { @@ -116,6 +118,7 @@ func run() error { JobHub: jobHub, AlertEngine: alertEngine, NotificationHub: notifHub, + UpdateWatcher: updateWatcher, UI: renderer, Version: version, OIDC: oidcClient, @@ -147,10 +150,17 @@ func run() error { srv := rmhttp.New(deps) + // Fleet-update worker — built after the HTTP server because the + // dispatcher delegates back into srv.DispatchHostUpdate. + fleetWorker := fleetupdate.NewWorker(st, hub, + &serverDispatcher{srv: srv}, alertEngine) + srv.SetFleetWorker(fleetWorker) + ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) defer stop() go alertEngine.Run(ctx) + go updateWatcher.Run(ctx) errCh := make(chan error, 1) go func() { @@ -243,3 +253,12 @@ func run() error { } return nil } + +// serverDispatcher adapts the http.Server's DispatchHostUpdate method +// to the fleetupdate.Dispatcher interface. Lives in main so the +// http and fleetupdate packages don't need to know about each other. 
+type serverDispatcher struct{ srv *rmhttp.Server } + +func (d *serverDispatcher) DispatchUpdate(ctx context.Context, hostID, actorUserID string) (string, string, error) { + return d.srv.DispatchHostUpdate(ctx, hostID, actorUserID) +} diff --git a/deploy/install/restic-manager-agent.service b/deploy/install/restic-manager-agent.service index d6ad407..1e3bcc8 100644 --- a/deploy/install/restic-manager-agent.service +++ b/deploy/install/restic-manager-agent.service @@ -52,7 +52,12 @@ ProtectSystem=full # whenever a new SecretsKey is minted, so we need a targeted # write-exemption for that dir. No exemption for the rest of /etc: # the agent has no business editing /etc/passwd, /etc/sudoers, etc. -ReadWritePaths=/etc/restic-manager +# +# /usr/local/bin is writable so the self-update flow (P6-01) can +# atomic-rename a fresh binary over the running one. Permitting the +# whole directory (rather than just the binary path) is required +# because os.Rename takes a write lock on the parent dir. +ReadWritePaths=/etc/restic-manager /usr/local/bin ProtectHostname=true ProtectKernelTunables=true ProtectKernelModules=true diff --git a/internal/agent/updater/updater.go b/internal/agent/updater/updater.go new file mode 100644 index 0000000..90b4f96 --- /dev/null +++ b/internal/agent/updater/updater.go @@ -0,0 +1,100 @@ +// Package updater carries the agent's self-update logic. +// +// The flow is operator-driven: the server dispatches a command.update +// WS envelope, the agent fetches a fresh binary from the server's +// /agent/binary endpoint, atomic-renames it over the running binary +// (Linux) or hands off to a detached helper script (Windows), and +// exits cleanly so the service manager restarts under the new +// binary. See docs/superpowers/specs/2026-05-06-p6-01-02-... +// +// Platform-specific code is build-tagged into updater_unix.go / +// updater_windows.go. This file holds the shared HTTP fetch + path +// helpers + the test seam. 
+package updater
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"net/http"
+	"os"
+	"path/filepath"
+	"runtime"
+	"strings"
+	"time"
+)
+
+// fetch downloads the agent binary for the running OS/arch from
+// serverURL's /agent/binary endpoint into binaryPath + ".new", fsyncs
+// it and marks it executable.
+//
+// It returns the path of the staged file (always binaryPath + ".new").
+// On any failure after the staging file was created the partial file
+// is removed, so a later swap can never pick up a truncated download.
+func fetch(ctx context.Context, serverURL, binaryPath string) (string, error) {
+	// Tolerate a configured base URL with a trailing slash; otherwise
+	// the request path would start with "//agent/binary".
+	url := fmt.Sprintf("%s/agent/binary?os=%s&arch=%s",
+		strings.TrimRight(serverURL, "/"), runtime.GOOS, runtime.GOARCH)
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
+	if err != nil {
+		return "", fmt.Errorf("agent binary fetch: build request: %w", err)
+	}
+	c := &http.Client{Timeout: 5 * time.Minute}
+	res, err := c.Do(req)
+	if err != nil {
+		return "", fmt.Errorf("agent binary fetch: %w", err)
+	}
+	defer func() { _ = res.Body.Close() }()
+	if res.StatusCode != http.StatusOK {
+		return "", fmt.Errorf("agent binary fetch: %s", res.Status)
+	}
+
+	stagePath := binaryPath + ".new"
+	f, err := os.OpenFile(stagePath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o755)
+	if err != nil {
+		return "", fmt.Errorf("agent binary fetch: create %s: %w", stagePath, err)
+	}
+	// Single cleanup path: unless the file was fully staged, close it
+	// (a second Close after an earlier one only returns an error, which
+	// is ignored) and delete it.
+	staged := false
+	defer func() {
+		if !staged {
+			_ = f.Close()
+			_ = os.Remove(stagePath)
+		}
+	}()
+
+	n, err := io.Copy(f, res.Body)
+	if err != nil {
+		return "", fmt.Errorf("agent binary fetch: download: %w", err)
+	}
+	if n == 0 {
+		// A 200 with no body must never be renamed over the running
+		// binary: the service would restart into an empty executable.
+		return "", fmt.Errorf("agent binary fetch: server returned an empty body")
+	}
+	if err := f.Sync(); err != nil {
+		return "", fmt.Errorf("agent binary fetch: sync %s: %w", stagePath, err)
+	}
+	if err := f.Close(); err != nil {
+		return "", fmt.Errorf("agent binary fetch: close %s: %w", stagePath, err)
+	}
+	// The mode passed to OpenFile only applies when the file is created
+	// (and is filtered by the umask), so set the bits explicitly.
+	if err := os.Chmod(stagePath, 0o755); err != nil {
+		return "", fmt.Errorf("agent binary fetch: chmod %s: %w", stagePath, err)
+	}
+	staged = true
+	return stagePath, nil
+}
+
+// resolveOwnBinary returns the absolute path of the running binary as
+// reported by os.Executable.
+//
+// A result of exactly /proc/self/exe is rejected with an error rather
+// than returned: the caller needs a real on-disk path it can rename
+// the staged binary over.
+func resolveOwnBinary() (string, error) {
+	p, err := os.Executable()
+	if err != nil {
+		return "", err
+	}
+	abs, err := filepath.Abs(p)
+	if err != nil {
+		return "", err
+	}
+	if abs == "/proc/self/exe" {
+		return "", fmt.Errorf("cannot resolve own binary path (/proc/self/exe)")
+	}
+	return abs, nil
+}
+
+// UpdateForTest is the platform-neutral test seam.
In production the
+// platform-specific Update fetches, swaps, then exits the process.
+// UpdateForTest stops short of the exit so unit tests can assert on
+// file state.
+//
+// binaryPath stands in for the running binary: the download is staged
+// next to it by fetch and moved into place by swap. The whole call is
+// bounded by a 5-minute context timeout. The Windows build's swap
+// always returns an error, so this seam can only succeed on
+// non-Windows builds.
+func UpdateForTest(serverURL, binaryPath string) error {
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
+	defer cancel()
+	stage, err := fetch(ctx, serverURL, binaryPath)
+	if err != nil {
+		return err
+	}
+	return swap(stage, binaryPath)
+}
diff --git a/internal/agent/updater/updater_test.go b/internal/agent/updater/updater_test.go
new file mode 100644
index 0000000..435ef21
--- /dev/null
+++ b/internal/agent/updater/updater_test.go
@@ -0,0 +1,87 @@
+//go:build !windows
+
+package updater
+
+import (
+	"bytes"
+	"io"
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"path/filepath"
+	"runtime"
+	"testing"
+)
+
+// TestUpdate_LinuxAtomicSwap stages a fake "running binary" file, runs
+// UpdateForTest against a fake /agent/binary server, and asserts that
+// the binary was swapped, .old preserves the previous bytes, and .new
+// was renamed away.
+func TestUpdate_LinuxAtomicSwap(t *testing.T) {
+	// Seed the file that plays the role of the running binary.
+	tmp := t.TempDir()
+	binPath := filepath.Join(tmp, "agent")
+	if err := os.WriteFile(binPath, []byte("OLD"), 0o755); err != nil {
+		t.Fatal(err)
+	}
+	newBytes := []byte("NEW BINARY CONTENTS")
+
+	// Fake server: only /agent/binary is served, and the os/arch query
+	// must match the platform the test runs on.
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path != "/agent/binary" {
+			http.NotFound(w, r)
+			return
+		}
+		gotOS, gotArch := r.URL.Query().Get("os"), r.URL.Query().Get("arch")
+		if gotOS != runtime.GOOS || gotArch != runtime.GOARCH {
+			// Errorf (not Fatalf): this runs on the server's goroutine.
+			t.Errorf("query mismatch: got os=%s arch=%s want %s/%s",
+				gotOS, gotArch, runtime.GOOS, runtime.GOARCH)
+		}
+		_, _ = io.Copy(w, bytes.NewReader(newBytes))
+	}))
+	defer srv.Close()
+
+	if err := UpdateForTest(srv.URL, binPath); err != nil {
+		t.Fatalf("update: %v", err)
+	}
+
+	// The binary path must now hold the downloaded bytes.
+	got, err := os.ReadFile(binPath)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if string(got) != string(newBytes) {
+		t.Fatalf("binary contents: got %q want %q", got, newBytes)
+	}
+	// The previous contents must survive next to it as binPath + ".old".
+	old, err := os.ReadFile(binPath + ".old")
+	if err != nil {
+		t.Fatalf("agent.old missing: %v", err)
+	}
+	if string(old) != "OLD" {
+		t.Fatalf("agent.old contents: got %q want %q", old, "OLD")
+	}
+	// The staging file must be gone once it has been renamed into place.
+	if _, err := os.Stat(binPath + ".new"); !os.IsNotExist(err) {
+		t.Fatalf("agent.new should be absent after swap, got err=%v", err)
+	}
+}
+
+// TestUpdate_FetchHTTPError surfaces the server's status when the
+// binary is not published for this os/arch.
+func TestUpdate_FetchHTTPError(t *testing.T) {
+	tmp := t.TempDir()
+	binPath := filepath.Join(tmp, "agent")
+	if err := os.WriteFile(binPath, []byte("OLD"), 0o755); err != nil {
+		t.Fatal(err)
+	}
+
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		http.Error(w, `{"error":"binary_not_published"}`, http.StatusNotFound)
+	}))
+	defer srv.Close()
+
+	err := UpdateForTest(srv.URL, binPath)
+	if err == nil {
+		t.Fatal("expected error, got nil")
+	}
+	got, _ := os.ReadFile(binPath)
+	if string(got) != "OLD" {
+		t.Fatalf("binary should not have changed, got %q", got)
+	}
+}
diff --git a/internal/agent/updater/updater_unix.go b/internal/agent/updater/updater_unix.go
new file mode 100644
index 0000000..81ebf50
--- /dev/null
+++ b/internal/agent/updater/updater_unix.go
@@ -0,0 +1,76 @@
+//go:build !windows
+
+package updater
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"log/slog"
+	"os"
+	"time"
+)
+
+// Update fetches the new binary, swaps it in, then exits so systemd
+// restarts the process under the new binary. The caller should close
+// the WS connection cleanly (so the server transitions the host to
+// disconnected immediately rather than waiting for the heartbeat
+// sweep) before invoking.
+//
+// Service-user assumption: the agent runs as root under the
+// systemd-shipped unit, which can write the binary path directly.
+// If the agent ever moves to a non-root service user, this breaks —
+// would need a setuid helper or an out-of-process update service.
+func Update(ctx context.Context, serverURL string) error {
+	binPath, err := resolveOwnBinary()
+	if err != nil {
+		return err
+	}
+	stage, err := fetch(ctx, serverURL, binPath)
+	if err != nil {
+		return err
+	}
+	if err := swap(stage, binPath); err != nil {
+		return err
+	}
+	slog.Info("agent self-update: binary swapped, exiting for systemd restart",
+		"binary", binPath)
+	// Give logger / WS close-frame a moment to flush, then exit.
+	time.Sleep(200 * time.Millisecond)
+	os.Exit(0)
+	return nil // unreachable
+}
+
+// swap copies the running binary to .old (M1 — keep one revision
+// back for hand-rolled rollback), then atomic-renames the staged
+// binary into place. Linux supports rename-while-open so this works
+// even though the running process holds the source open.
+//
+// Every failure is wrapped with the step that produced it. The staged
+// file is left in place on error; the next fetch truncates it.
+func swap(stagePath, binPath string) error {
+	src, err := os.Open(binPath)
+	if err != nil {
+		return fmt.Errorf("open running binary: %w", err)
+	}
+	defer func() { _ = src.Close() }()
+	dst, err := os.OpenFile(binPath+".old", os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o755)
+	if err != nil {
+		return fmt.Errorf("open .old: %w", err)
+	}
+	if _, err := io.Copy(dst, src); err != nil {
+		_ = dst.Close()
+		return fmt.Errorf("copy to .old: %w", err)
+	}
+	if err := dst.Sync(); err != nil {
+		_ = dst.Close()
+		return fmt.Errorf("sync .old: %w", err)
+	}
+	if err := dst.Close(); err != nil {
+		return fmt.Errorf("close .old: %w", err)
+	}
+	if err := os.Rename(stagePath, binPath); err != nil {
+		return fmt.Errorf("rename .new over running binary: %w", err)
+	}
+	return nil
+}
diff --git a/internal/agent/updater/updater_windows.go b/internal/agent/updater/updater_windows.go
new file mode 100644
index 0000000..0806472
--- /dev/null
+++ b/internal/agent/updater/updater_windows.go
@@ -0,0 +1,78 @@
+//go:build windows
+
+package updater
+
+import (
+	"context"
+	"fmt"
+	"log/slog"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"syscall"
+	"time"
+)
+
+// helperScript is rendered with fmt.Sprintf, args order:
+//
+//	%[1]s — running binary path (source for the .old copy)
+//	%[2]s — .old path
+//	%[3]s — staged .new path
+//	%[4]s — running binary path (rename target)
+//
+// Delays use ping instead of timeout: the helper runs detached with no
+// console and, because cmd.Stdin is left nil, with stdin on the null
+// device. timeout.exe aborts immediately when its input is redirected,
+// which would skip the initial wait and make the :wait loop a busy spin.
+const helperScript = `@echo off
+ping -n 4 127.0.0.1 >nul
+copy /Y "%[1]s" "%[2]s"
+sc stop restic-manager-agent
+:wait
+sc query restic-manager-agent | find "STOPPED" >nul
+if errorlevel 1 (ping -n 2 127.0.0.1 >nul & goto wait)
+move /Y "%[3]s" "%[4]s"
+sc start restic-manager-agent
+del "%%~f0"
+`
+
+// Update on Windows can't overwrite the running .exe in-process
+// (exclusive file lock), so we stage the new binary, write a small
+// detached helper script that waits, stops the service, swaps the
+// binary, and starts the service, then exit cleanly. SCM treats
+// clean exits after sc stop as intentional and does not auto-restart;
+// the helper's final sc start handles that.
+func Update(ctx context.Context, serverURL string) error {
+	binPath, err := resolveOwnBinary()
+	if err != nil {
+		return err
+	}
+	stage, err := fetch(ctx, serverURL, binPath)
+	if err != nil {
+		return err
+	}
+	helperPath := filepath.Join(filepath.Dir(binPath), "agent-update.cmd")
+	body := fmt.Sprintf(helperScript, binPath, binPath+".old", stage, binPath)
+	if err := os.WriteFile(helperPath, []byte(body), 0o755); err != nil {
+		return fmt.Errorf("write update helper: %w", err)
+	}
+	cmd := exec.Command("cmd.exe", "/c", helperPath)
+	cmd.SysProcAttr = &syscall.SysProcAttr{
+		HideWindow:    true,
+		CreationFlags: 0x00000008 | 0x08000000, // DETACHED_PROCESS | CREATE_NO_WINDOW
+	}
+	if err := cmd.Start(); err != nil {
+		return fmt.Errorf("start update helper: %w", err)
+	}
+	slog.Info("agent self-update: helper spawned, exiting cleanly",
+		"binary", binPath, "helper", helperPath)
+	time.Sleep(200 * time.Millisecond)
+	os.Exit(0)
+	return nil // unreachable
+}
+
+// swap is unused on Windows — the helper script does the swap.
+// Defined to satisfy the build (UpdateForTest references it).
+func swap(_, _ string) error {
+	return fmt.Errorf("updater.swap not implemented on Windows; use the helper script via Update")
+}
diff --git a/internal/alert/update_alerts.go b/internal/alert/update_alerts.go
new file mode 100644
index 0000000..9a7da6e
--- /dev/null
+++ b/internal/alert/update_alerts.go
@@ -0,0 +1,63 @@
+package alert
+
+import (
+	"context"
+	"fmt"
+	"log/slog"
+	"time"
+
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/notification"
+)
+
+// Alert-kind constants for P6 self-update flows.
+const (
+	// KindUpdateFailed is raised when an agent fails to come back with
+	// the expected version after a command.update dispatch (timeout or
+	// version-mismatch). Resolved by a subsequent matching hello.
+	KindUpdateFailed = "update_failed"
+
+	// KindFleetUpdateHalted is raised when the fleet-update worker
+	// stops mid-run because a host failed to update or went offline.
+	// Host-less alert (system-scoped). Manually resolved by an admin.
+	KindFleetUpdateHalted = "fleet_update_halted"
+)
+
+// RaiseUpdateFailed records a per-host update failure. dedupKey is the
+// hostID so a re-dispatch on the same host touches the existing alert
+// rather than spawning a duplicate.
+//
+// The alert is raised with severity "warning"; its message embeds
+// jobID and reason.
+func (e *Engine) RaiseUpdateFailed(ctx context.Context, hostID, jobID, reason string, when time.Time) {
+	msg := fmt.Sprintf("Agent update failed (job %s): %s", jobID, reason)
+	// hostID is passed twice: presumably once as the alert's host and
+	// once as the dedup key — confirm against raiseAndNotify.
+	e.raiseAndNotify(ctx, hostID, KindUpdateFailed, hostID, "warning", msg, when)
+}
+
+// ResolveUpdateFailed clears any open update_failed alert for hostID.
+// Called from the WS hello path when the agent reconnects with the
+// target version.
+func (e *Engine) ResolveUpdateFailed(ctx context.Context, hostID string, when time.Time) {
+	// Same (hostID, kind, hostID) triple as in RaiseUpdateFailed.
+	e.resolveAndNotify(ctx, hostID, KindUpdateFailed, hostID, when)
+}
+
+// RaiseFleetUpdateHalted is host-less — the fleet update is a
+// system-level concept. We persist it via the dedicated host-less
+// alert path so the alerts table's host_id column carries NULL.
+func (e *Engine) RaiseFleetUpdateHalted(ctx context.Context, fleetUpdateID, reason string, when time.Time) { + msg := fmt.Sprintf("Fleet update %s halted: %s", fleetUpdateID, reason) + id, didRaise, err := e.store.RaiseOrTouchSystem(ctx, KindFleetUpdateHalted, fleetUpdateID, "warning", msg, when) + if err != nil { + slog.Warn("alert: raise fleet_update_halted", "fu_id", fleetUpdateID, "err", err) + return + } + if !didRaise { + return + } + go e.hub.Dispatch(ctx, notification.Payload{ + Event: notification.EventRaised, + AlertID: id, + Severity: "warning", + Kind: KindFleetUpdateHalted, + HostID: "", + HostName: "", + Message: msg, + RaisedAt: when, + }) +} diff --git a/internal/api/messages.go b/internal/api/messages.go index 8ea18f2..b9d8a2e 100644 --- a/internal/api/messages.go +++ b/internal/api/messages.go @@ -63,6 +63,7 @@ const ( JobUnlock JobKind = "unlock" JobRestore JobKind = "restore" JobDiff JobKind = "diff" + JobUpdate JobKind = "update" ) // JobStatus is the lifecycle state of a job. @@ -361,13 +362,14 @@ type ConfigUpdatePayload struct { BandwidthDownKBps *int `json:"bandwidth_down_kbps,omitempty"` } -// AgentUpdateAvailablePayload — informational only; the agent does -// NOT self-update. See spec.md §4.2 for the package-manager-based -// update model. -type AgentUpdateAvailablePayload struct { - LatestVersion string `json:"latest_version"` - PackageURL string `json:"package_url"` // apt repo / choco source - Changelog string `json:"changelog,omitempty"` +// CommandUpdatePayload carries no operational data — the agent +// already knows its own os/arch and fetches from its configured +// server URL via /agent/binary. JobID is the server-issued id of +// the update job; the agent echoes it on log.stream lines so the +// live job log captures pre-restart progress, then either exits +// (Linux) or hands off to a detached helper script (Windows). 
+type CommandUpdatePayload struct { + JobID string `json:"job_id"` } // TreeListRequestPayload is the body of a tree.list RPC. Used by the diff --git a/internal/api/wire.go b/internal/api/wire.go index 005827f..4573738 100644 --- a/internal/api/wire.go +++ b/internal/api/wire.go @@ -29,12 +29,12 @@ const ( // Server → agent message types. const ( - MsgCommandRun MessageType = "command.run" - MsgCommandCancel MessageType = "command.cancel" - MsgScheduleSet MessageType = "schedule.set" - MsgConfigUpdate MessageType = "config.update" - MsgAgentUpdateAvail MessageType = "agent.update.available" - MsgTreeList MessageType = "tree.list" // sync RPC: list a snapshot's children + MsgCommandRun MessageType = "command.run" + MsgCommandCancel MessageType = "command.cancel" + MsgScheduleSet MessageType = "schedule.set" + MsgConfigUpdate MessageType = "config.update" + MsgCommandUpdate MessageType = "command.update" + MsgTreeList MessageType = "tree.list" // sync RPC: list a snapshot's children ) // Envelope is the framing for every WS message in either direction. diff --git a/internal/server/fleetupdate/worker.go b/internal/server/fleetupdate/worker.go new file mode 100644 index 0000000..1442832 --- /dev/null +++ b/internal/server/fleetupdate/worker.go @@ -0,0 +1,221 @@ +// Package fleetupdate drives a rolling, sequential agent self-update +// over a list of hosts. One worker goroutine per Start() call (gated +// at the store layer to at-most-one-running-fleet-update). +package fleetupdate + +import ( + "context" + "errors" + "fmt" + "log/slog" + "time" + + "github.com/oklog/ulid/v2" + + "gitea.dcglab.co.uk/steve/restic-manager/internal/store" +) + +// Hub is the slim "is this host connected?" surface. +type Hub interface { + Connected(hostID string) bool +} + +// Dispatcher sends one command.update envelope. The implementer also +// creates the jobs row, writes audit, and registers with the update +// watcher. 
Pre-checks are the dispatcher's responsibility — the worker
+// passes through whatever error it returns.
+//
+// The worker treats a non-nil err or a non-empty code as a failed
+// dispatch: the host slot is marked failed and the fleet update halts.
+type Dispatcher interface {
+	DispatchUpdate(ctx context.Context, hostID string, actorUserID string) (jobID string, code string, err error)
+}
+
+// AlertRaiser is the slim view of the alert engine's host-less raise
+// path. Used to emit fleet_update_halted on first failure.
+// A nil AlertRaiser is tolerated: the worker then halts without
+// raising an alert.
+type AlertRaiser interface {
+	RaiseFleetUpdateHalted(ctx context.Context, fleetUpdateID, reason string, when time.Time)
+}
+
+// Worker is the long-lived fleet-update orchestrator. There is at most
+// one *running* fleet update at a time (enforced by the store).
+type Worker struct {
+	store  *store.Store // fleet-update rows, host rows
+	hub    Hub          // "is this host connected?" check before dispatch
+	disp   Dispatcher   // sends command.update to one host
+	alerts AlertRaiser  // optional; see AlertRaiser
+
+	// targetVersion is the version every dispatched agent is expected
+	// to come back with. Captured at Start time to avoid drift.
+	targetVersion string
+
+	// pollPeriod controls the cadence at which the worker re-reads the
+	// host row to check for the version transition. Exposed for tests.
+	pollPeriod time.Duration
+	// hostTimeout bounds how long the worker waits for one host to
+	// reach the target version before halting.
+	hostTimeout time.Duration
+}
+
+// NewWorker builds an unstarted worker. targetVersion is set on each
+// Start call; the values here are defaults.
+//
+// Defaults: the host row is polled once per second and each host gets
+// 95 seconds to report the target version.
+func NewWorker(st *store.Store, hub Hub, disp Dispatcher, alerts AlertRaiser) *Worker {
+	return &Worker{
+		store:       st,
+		hub:         hub,
+		disp:        disp,
+		alerts:      alerts,
+		pollPeriod:  1 * time.Second,
+		hostTimeout: 95 * time.Second,
+	}
+}
+
+// Start creates the parent + child rows, then spawns the per-host
+// worker goroutine. Returns the new fleet_update_id on success.
+// store.ErrFleetUpdateRunning bubbles up unchanged.
+func (w *Worker) Start(ctx context.Context, userID, targetVersion string, hostIDs []string) (string, error) {
+	if userID == "" || targetVersion == "" {
+		return "", errors.New("fleetupdate: userID and targetVersion required")
+	}
+	if len(hostIDs) == 0 {
+		return "", errors.New("fleetupdate: at least one host required")
+	}
+	fuID := ulid.Make().String()
+	now := time.Now().UTC()
+	if err := w.store.CreateFleetUpdate(ctx, store.FleetUpdate{
+		ID:              fuID,
+		StartedAt:       now,
+		StartedByUserID: userID,
+		TargetVersion:   targetVersion,
+		Status:          "running",
+	}, hostIDs); err != nil {
+		return "", err
+	}
+
+	// The goroutine outlives the request that started it; carry a
+	// detached context so an HTTP-handler ctx cancel doesn't abort
+	// the long roll.
+	bg := context.WithoutCancel(ctx)
+	go w.run(bg, fuID, userID, targetVersion)
+	return fuID, nil
+}
+
+// Cancel marks the fleet update cancelled. The running goroutine
+// observes the new status on its next pre-check and exits without
+// dispatching further hosts. The currently-dispatched job is left to
+// finish on its own — cancelling agent-side is out of scope for v1.
+func (w *Worker) Cancel(ctx context.Context, fuID string) error {
+	return w.store.CancelFleetUpdate(ctx, fuID, time.Now().UTC())
+}
+
+// run is the per-host loop. Halts on first failure; emits one alert
+// on transition.
+//
+// targetVersion is threaded through to processHost by value instead of
+// being stored in the Worker.targetVersion field: the goroutine of a
+// cancelled fleet update can still be inside its poll loop when the
+// next Start spawns a new run, and a shared field would be written by
+// one goroutine while the other reads it (and compares against the
+// wrong version).
+func (w *Worker) run(ctx context.Context, fuID, userID, targetVersion string) {
+	for {
+		// Check the parent row's status — picks up Cancel.
+		fu, err := w.store.ActiveFleetUpdate(ctx)
+		if err != nil {
+			slog.Warn("fleetupdate: read active", "fu_id", fuID, "err", err)
+			return
+		}
+		if fu == nil || fu.ID != fuID {
+			// Cancelled, halted, or completed externally. Done.
+			return
+		}
+
+		pending, err := w.store.ListPendingFleetUpdateHosts(ctx, fuID)
+		if err != nil {
+			slog.Warn("fleetupdate: list pending", "fu_id", fuID, "err", err)
+			return
+		}
+		if len(pending) == 0 {
+			now := time.Now().UTC()
+			if err := w.store.CompleteFleetUpdate(ctx, fuID, now); err != nil {
+				slog.Warn("fleetupdate: complete", "fu_id", fuID, "err", err)
+			}
+			return
+		}
+
+		// Stop as soon as a host fails. Relying only on the next
+		// ActiveFleetUpdate read is not enough: if HaltFleetUpdate
+		// itself failed, the row still says running and the loop would
+		// carry on to the next host.
+		if !w.processHost(ctx, fuID, userID, targetVersion, pending[0]) {
+			return
+		}
+	}
+}
+
+// processHost handles one host slot. Marks it skipped, succeeded, or
+// failed (and halts the fleet on failure). It reports whether run may
+// move on to the next pending host.
+func (w *Worker) processHost(ctx context.Context, fuID, userID, targetVersion string, slot store.FleetUpdateHost) bool {
+	hostID := slot.HostID
+	// Best-effort: a failure to record the current host does not stop
+	// the dispatch below.
+	_ = w.store.SetFleetUpdateCurrentHost(ctx, fuID, hostID)
+
+	// Pre-flight: re-read the host. The dispatch path repeats most of
+	// these checks but doing them up-front lets us emit the right
+	// per-host status (skipped vs failed) without consuming a job row.
+	host, err := w.store.GetHost(ctx, hostID)
+	if err != nil || host == nil {
+		return w.skip(ctx, fuID, hostID, "host not found")
+	}
+	if host.AgentVersion != "" && host.AgentVersion == targetVersion {
+		return w.skip(ctx, fuID, hostID, "already at target version")
+	}
+	if !w.hub.Connected(hostID) {
+		w.fail(ctx, fuID, hostID, fmt.Sprintf("host went offline: %s", hostID), "")
+		return false
+	}
+
+	// Dispatch.
+	if err := w.store.SetFleetUpdateHostStatus(ctx, fuID, hostID, "running", "", ""); err != nil {
+		slog.Warn("fleetupdate: set running", "fu_id", fuID, "host_id", hostID, "err", err)
+	}
+	jobID, code, err := w.disp.DispatchUpdate(ctx, hostID, userID)
+	if err != nil || code != "" {
+		w.fail(ctx, fuID, hostID, dispatchErrorReason(code, err), jobID)
+		return false
+	}
+
+	// Poll until the host's recorded agent_version matches target, or
+	// timeout.
+	deadline := time.Now().Add(w.hostTimeout)
+	for time.Now().Before(deadline) {
+		// Honour cancellation between polls.
+		fu, err := w.store.ActiveFleetUpdate(ctx)
+		if err == nil && (fu == nil || fu.ID != fuID) {
+			// Cancelled mid-host; leave the slot in 'running' for the
+			// admin to inspect. No further dispatches.
+			return false
+		}
+		time.Sleep(w.pollPeriod)
+		h, err := w.store.GetHost(ctx, hostID)
+		if err == nil && h != nil && h.AgentVersion == targetVersion {
+			if err := w.store.SetFleetUpdateHostStatus(ctx, fuID, hostID, "succeeded", "", jobID); err != nil {
+				slog.Warn("fleetupdate: set succeeded", "fu_id", fuID, "host_id", hostID, "err", err)
+			}
+			return true
+		}
+	}
+	w.fail(ctx, fuID, hostID, fmt.Sprintf("timeout waiting for %s to reach %s", hostID, targetVersion), jobID)
+	return false
+}
+
+// skip marks the slot skipped and reports whether run may continue.
+// If the status cannot be persisted the slot would stay 'pending' and
+// be selected again on every loop iteration, so the fleet update is
+// halted instead.
+func (w *Worker) skip(ctx context.Context, fuID, hostID, reason string) bool {
+	err := w.store.SetFleetUpdateHostStatus(ctx, fuID, hostID, "skipped", reason, "")
+	if err == nil {
+		return true
+	}
+	slog.Warn("fleetupdate: set skipped", "fu_id", fuID, "host_id", hostID, "err", err)
+	w.halt(ctx, fuID, fmt.Sprintf("cannot record status for %s: %v", hostID, err))
+	return false
+}
+
+// fail marks the slot failed (best-effort, logged on error) and halts
+// the fleet update with the same reason.
+func (w *Worker) fail(ctx context.Context, fuID, hostID, reason, jobID string) {
+	if err := w.store.SetFleetUpdateHostStatus(ctx, fuID, hostID, "failed", reason, jobID); err != nil {
+		slog.Warn("fleetupdate: set failed", "fu_id", fuID, "host_id", hostID, "err", err)
+	}
+	w.halt(ctx, fuID, reason)
+}
+
+// halt records the fleet update as halted and, when an AlertRaiser is
+// configured, raises the fleet_update_halted alert. A failure to write
+// the halted row is logged; the alert is raised regardless.
+func (w *Worker) halt(ctx context.Context, fuID, reason string) {
+	now := time.Now().UTC()
+	if err := w.store.HaltFleetUpdate(ctx, fuID, reason, now); err != nil {
+		slog.Warn("fleetupdate: halt", "fu_id", fuID, "err", err)
+	}
+	if w.alerts != nil {
+		w.alerts.RaiseFleetUpdateHalted(ctx, fuID, reason, now)
+	}
+}
+
+// dispatchErrorReason turns a dispatch result into the reason string
+// stored on the host slot. A non-empty code takes precedence over err.
+func dispatchErrorReason(code string, err error) string {
+	if code != "" {
+		return "dispatch failed: " + code
+	}
+	if err != nil {
+		return err.Error()
+	}
+	return "dispatch failed"
+}
diff --git a/internal/server/fleetupdate/worker_test.go b/internal/server/fleetupdate/worker_test.go
new file mode 100644
index 0000000..c1cdac1
--- /dev/null
+++ b/internal/server/fleetupdate/worker_test.go
@@ -0,0 +1,344 @@
+package fleetupdate
+
+import (
+	"context"
+	"errors"
+	"path/filepath"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/oklog/ulid/v2"
+
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
+)
+
+type fakeHub struct {
+	mu     sync.Mutex
+	online map[string]bool
+}
+
+func (f *fakeHub) Connected(hostID string) bool {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	return f.online[hostID]
+}
+
+type fakeDispatcher struct {
+	mu    sync.Mutex
+	calls []string // host IDs
+	// after dispatch, set the host's agent_version to this on the
+	// store so the worker observes the version transition.
+	st         *store.Store
+	target     string
+	delayMS    int
+	failOnHost map[string]string // host → error code
+}
+
+func (f *fakeDispatcher) DispatchUpdate(ctx context.Context, hostID, _ string) (string, string, error) {
+	f.mu.Lock()
+	f.calls = append(f.calls, hostID)
+	if code, ok := f.failOnHost[hostID]; ok {
+		f.mu.Unlock()
+		return "", code, nil
+	}
+	st := f.st
+	target := f.target
+	delay := f.delayMS
+	f.mu.Unlock()
+
+	jobID := ulid.Make().String()
+	if st != nil {
+		_ = st.CreateJob(context.Background(), store.Job{
+			ID: jobID, HostID: hostID, Kind: "update",
+			ActorKind: "user", CreatedAt: time.Now().UTC(),
+		})
+	}
+	if st != nil && target != "" {
+		go func() {
+			if delay > 0 {
+				time.Sleep(time.Duration(delay) * time.Millisecond)
+			}
+			_ = st.MarkHostHello(context.Background(), hostID, target, "0.17", api.CurrentProtocolVersion, time.Now().UTC())
+		}()
+	}
+	return jobID, "", nil
+}
+
+type recAlert struct {
+	mu      sync.Mutex
+	reasons []string
+}
+
+func (r *recAlert) RaiseFleetUpdateHalted(_ context.Context, _ string, reason string, _ time.Time) {
+	r.mu.Lock()
+	r.reasons = append(r.reasons, reason)
+	r.mu.Unlock()
+}
+
+func openStore(t *testing.T) *store.Store {
+	t.Helper()
+	dir := t.TempDir()
+	st, err := store.Open(context.Background(), filepath.Join(dir, "rm.db"))
+	if err != nil {
+		t.Fatalf("open: %v", err)
+	}
+	t.Cleanup(func() { _ = st.Close() })
+	return st
+}
+
+func mustCreateAdmin(t *testing.T, st *store.Store) string {
+	t.Helper()
+	uid := ulid.Make().String()
+	if err := st.CreateUser(context.Background(), store.User{
+		ID: uid, Username: "u-" + uid[:6],
+		PasswordHash: "x", Role: store.RoleAdmin, CreatedAt: time.Now().UTC(),
+	}); err != nil {
+		t.Fatalf("user: %v", err)
+	}
+	return uid
+}
+
+func mustCreateHost(t *testing.T, st *store.Store, name, version string) string {
+	t.Helper()
+	hostID := ulid.Make().String()
+	if err := st.CreateHost(context.Background(), store.Host{
+		ID: hostID, Name: name, OS: "linux", Arch: "amd64",
+		EnrolledAt: time.Now().UTC(),
+	}, "deadbeef-"+hostID, ""); err != nil {
+		t.Fatalf("host: %v", err)
+	}
+	if version != "" {
+		if err := st.MarkHostHello(context.Background(), hostID, version, "0.17", api.CurrentProtocolVersion, time.Now().UTC()); err != nil {
+			t.Fatalf("hello: %v", err)
+		}
+	}
+	return hostID
+}
+
+func waitForStatus(t *testing.T, st *store.Store, fuID, want string, timeout time.Duration) *store.FleetUpdate {
+	t.Helper()
+	deadline := time.Now().Add(timeout)
+	for time.Now().Before(deadline) {
+		fu, _, err := st.GetFleetUpdate(context.Background(), fuID)
+		if err == nil && fu != nil && fu.Status == want {
+			return fu
+		}
+		time.Sleep(20 * time.Millisecond)
+	}
+	t.Fatalf("status never reached %q", want)
+	return nil
+}
+
+func TestWorkerTwoHostsBothSucceed(t *testing.T) {
+	st := openStore(t)
+	uid := mustCreateAdmin(t, st)
+	h1 := mustCreateHost(t, st, "h1", "v0")
+	h2 := mustCreateHost(t, st, "h2", "v0")
+
+	hub := &fakeHub{online: map[string]bool{h1: true, h2: true}}
+	disp := &fakeDispatcher{st: st, target: "v2", delayMS: 30}
+	alerts := &recAlert{}
+	w := NewWorker(st, hub, disp, alerts)
+	w.pollPeriod = 20 * time.Millisecond
+	w.hostTimeout = 2 * time.Second
+
+	fuID, err := w.Start(context.Background(), uid, "v2", []string{h1, h2})
+	if err != nil {
+		t.Fatalf("start: %v", err)
+	}
+	waitForStatus(t, st, fuID, "completed", 5*time.Second)
+	_, hosts, _ := st.GetFleetUpdate(context.Background(), fuID)
+	for _, h := range hosts {
+		if h.Status != "succeeded" {
+			t.Errorf("host %s status %q want succeeded", h.HostID, h.Status)
+		}
+	}
+	if n := len(alerts.reasons); n != 0 {
+		t.Errorf("unexpected halt alert: %v", alerts.reasons)
+	}
+}
+
+func TestWorkerSecondHostTimesOutHalts(t *testing.T) {
+	st := openStore(t)
+	uid := mustCreateAdmin(t, st)
+	h1 := mustCreateHost(t, st, "h1", "v0")
+	h2 := mustCreateHost(t, st, "h2", "v0")
+	h3 := mustCreateHost(t, st, "h3", "v0")
+
+	hub := &fakeHub{online: map[string]bool{h1: true, h2: true, h3: true}}
+	// h1 dispatches normally (transitions to v2). h2 dispatch returns
+	// success but never transitions.
+	disp := &fakeDispatcher{st: st, target: "v2", delayMS: 20, failOnHost: map[string]string{
+		h2: "", // not a code-failure; simulate by clearing target on this disp run
+	}}
+	// Actually: drop h2 from the auto-transition by faking with a
+	// per-host store setter. Easiest: subclass via a wrapper.
+	_ = disp
+	customDisp := &perHostDispatcher{base: disp, st: st, target: "v2", noTransition: map[string]bool{h2: true}}
+
+	alerts := &recAlert{}
+	w := NewWorker(st, hub, customDisp, alerts)
+	w.pollPeriod = 20 * time.Millisecond
+	w.hostTimeout = 200 * time.Millisecond
+
+	fuID, err := w.Start(context.Background(), uid, "v2", []string{h1, h2, h3})
+	if err != nil {
+		t.Fatalf("start: %v", err)
+	}
+	waitForStatus(t, st, fuID, "halted", 3*time.Second)
+	_, hosts, _ := st.GetFleetUpdate(context.Background(), fuID)
+	gotStatus := map[string]string{}
+	for _, h := range hosts {
+		gotStatus[h.HostID] = h.Status
+	}
+	if gotStatus[h1] != "succeeded" {
+		t.Errorf("h1: %q", gotStatus[h1])
+	}
+	if gotStatus[h2] != "failed" {
+		t.Errorf("h2: %q", gotStatus[h2])
+	}
+	if gotStatus[h3] != "pending" {
+		t.Errorf("h3: %q", gotStatus[h3])
+	}
+	alerts.mu.Lock()
+	defer alerts.mu.Unlock()
+	if len(alerts.reasons) != 1 {
+		t.Errorf("alert reasons: %v", alerts.reasons)
+	}
+}
+
+// perHostDispatcher lets a test omit the auto-transition for selected
+// hosts so we can simulate timeout.
+type perHostDispatcher struct { + mu sync.Mutex + base *fakeDispatcher + st *store.Store + target string + noTransition map[string]bool +} + +func (p *perHostDispatcher) DispatchUpdate(_ context.Context, hostID, _ string) (string, string, error) { + p.mu.Lock() + skip := p.noTransition[hostID] + p.mu.Unlock() + jobID := ulid.Make().String() + _ = p.st.CreateJob(context.Background(), store.Job{ + ID: jobID, HostID: hostID, Kind: "update", + ActorKind: "user", CreatedAt: time.Now().UTC(), + }) + if !skip { + go func() { + time.Sleep(20 * time.Millisecond) + _ = p.st.MarkHostHello(context.Background(), hostID, p.target, "0.17", api.CurrentProtocolVersion, time.Now().UTC()) + }() + } + return jobID, "", nil +} + +func TestWorkerHostOfflineHalts(t *testing.T) { + st := openStore(t) + uid := mustCreateAdmin(t, st) + h1 := mustCreateHost(t, st, "h1", "v0") + h2 := mustCreateHost(t, st, "h2", "v0") + hub := &fakeHub{online: map[string]bool{h1: false, h2: true}} + disp := &fakeDispatcher{st: st, target: "v2"} + alerts := &recAlert{} + w := NewWorker(st, hub, disp, alerts) + w.pollPeriod = 20 * time.Millisecond + w.hostTimeout = 500 * time.Millisecond + + fuID, err := w.Start(context.Background(), uid, "v2", []string{h1, h2}) + if err != nil { + t.Fatalf("start: %v", err) + } + waitForStatus(t, st, fuID, "halted", 2*time.Second) + _, hosts, _ := st.GetFleetUpdate(context.Background(), fuID) + if hosts[0].Status != "failed" { + t.Errorf("h1 status: %q", hosts[0].Status) + } + if hosts[1].Status != "pending" { + t.Errorf("h2 status: %q", hosts[1].Status) + } +} + +func TestWorkerAlreadyAtTargetSkipped(t *testing.T) { + st := openStore(t) + uid := mustCreateAdmin(t, st) + h1 := mustCreateHost(t, st, "h1", "v2") + h2 := mustCreateHost(t, st, "h2", "v0") + hub := &fakeHub{online: map[string]bool{h1: true, h2: true}} + disp := &fakeDispatcher{st: st, target: "v2", delayMS: 20} + alerts := &recAlert{} + w := NewWorker(st, hub, disp, alerts) + w.pollPeriod = 20 * time.Millisecond + 
w.hostTimeout = 2 * time.Second + + fuID, err := w.Start(context.Background(), uid, "v2", []string{h1, h2}) + if err != nil { + t.Fatalf("start: %v", err) + } + waitForStatus(t, st, fuID, "completed", 4*time.Second) + _, hosts, _ := st.GetFleetUpdate(context.Background(), fuID) + want := map[string]string{h1: "skipped", h2: "succeeded"} + for _, h := range hosts { + if h.Status != want[h.HostID] { + t.Errorf("host %s: got %q want %q", h.HostID, h.Status, want[h.HostID]) + } + } +} + +func TestWorkerCancelMidRun(t *testing.T) { + st := openStore(t) + uid := mustCreateAdmin(t, st) + h1 := mustCreateHost(t, st, "h1", "v0") + h2 := mustCreateHost(t, st, "h2", "v0") + hub := &fakeHub{online: map[string]bool{h1: true, h2: true}} + // h1's transition is delayed long enough that we can cancel + // before it lands; h2 should never be touched. + disp := &fakeDispatcher{st: st, target: "v2", delayMS: 500} + alerts := &recAlert{} + w := NewWorker(st, hub, disp, alerts) + w.pollPeriod = 50 * time.Millisecond + w.hostTimeout = 5 * time.Second + + fuID, err := w.Start(context.Background(), uid, "v2", []string{h1, h2}) + if err != nil { + t.Fatalf("start: %v", err) + } + // Give the worker a moment to dispatch h1. + time.Sleep(100 * time.Millisecond) + if err := w.Cancel(context.Background(), fuID); err != nil { + t.Fatalf("cancel: %v", err) + } + waitForStatus(t, st, fuID, "cancelled", 2*time.Second) + + // h2 should never be dispatched. 
+ disp.mu.Lock() + defer disp.mu.Unlock() + for _, c := range disp.calls { + if c == h2 { + t.Errorf("h2 dispatched after cancel") + } + } +} + +func TestWorkerStartWhileActiveErrors(t *testing.T) { + st := openStore(t) + uid := mustCreateAdmin(t, st) + h1 := mustCreateHost(t, st, "h1", "v0") + h2 := mustCreateHost(t, st, "h2", "v0") + hub := &fakeHub{online: map[string]bool{h1: true, h2: true}} + disp := &fakeDispatcher{st: st, target: "v2", delayMS: 5_000} + w := NewWorker(st, hub, disp, &recAlert{}) + w.pollPeriod = 50 * time.Millisecond + w.hostTimeout = 2 * time.Second + if _, err := w.Start(context.Background(), uid, "v2", []string{h1}); err != nil { + t.Fatalf("first start: %v", err) + } + _, err := w.Start(context.Background(), uid, "v2", []string{h2}) + if !errors.Is(err, store.ErrFleetUpdateRunning) { + t.Fatalf("err: %v want ErrFleetUpdateRunning", err) + } +} diff --git a/internal/server/http/dashboard_filter_test.go b/internal/server/http/dashboard_filter_test.go index 61b2b47..bc58b85 100644 --- a/internal/server/http/dashboard_filter_test.go +++ b/internal/server/http/dashboard_filter_test.go @@ -11,6 +11,7 @@ import ( "time" "gitea.dcglab.co.uk/steve/restic-manager/internal/store" + "gitea.dcglab.co.uk/steve/restic-manager/internal/version" ) func makeFilterHosts() []store.Host { @@ -98,6 +99,23 @@ func TestSortDashboardHostsColumns(t *testing.T) { } } +// TestFilterAndSortDashboardUpdatesBehind: ?updates=behind narrows +// to hosts whose agent_version is non-empty AND != server's version. 
+func TestFilterAndSortDashboardUpdatesBehind(t *testing.T) { + t.Parallel() + hosts := []store.Host{ + {ID: "01a", Name: "alpha", AgentVersion: "v0.0.1", Status: "online"}, + {ID: "01b", Name: "bravo", AgentVersion: version.Version, Status: "online"}, + {ID: "01c", Name: "charlie", AgentVersion: "", Status: "online"}, // never seen + {ID: "01d", Name: "delta", AgentVersion: "v0.0.1", Status: "offline"}, + } + got := filterAndSortDashboardHosts(hosts, dashboardFilter{Updates: "behind", Sort: "name", Dir: "asc"}) + // alpha + delta both behind; bravo (current) and charlie (empty) excluded. + if len(got) != 2 || got[0].Name != "alpha" || got[1].Name != "delta" { + t.Errorf("updates=behind: got %v", namesOf(got)) + } +} + // TestParseDashboardFilterDefaults: empty query gives sort=name asc. func TestParseDashboardFilterDefaults(t *testing.T) { t.Parallel() diff --git a/internal/server/http/fleet_update.go b/internal/server/http/fleet_update.go new file mode 100644 index 0000000..42c67f8 --- /dev/null +++ b/internal/server/http/fleet_update.go @@ -0,0 +1,379 @@ +// fleet_update.go — admin-only fleet rolling-update endpoints + page. +// +// Surface: +// - POST /api/fleet/update → starts a fleet update (JSON) +// - POST /api/fleet-updates/{id}/cancel +// - GET /api/fleet-updates/{id} → JSON parent + per-host array +// - GET /settings/fleet-update → admin UI page +// - GET /settings/fleet-update/partial → htmx polling fragment +// +// All routes are mounted in the admin band (see routes()). +package http + +import ( + "context" + "encoding/json" + "errors" + "log/slog" + stdhttp "net/http" + "time" + + "github.com/go-chi/chi/v5" + "github.com/oklog/ulid/v2" + + "gitea.dcglab.co.uk/steve/restic-manager/internal/store" + "gitea.dcglab.co.uk/steve/restic-manager/internal/version" +) + +// fleetUpdateStartReq is the JSON body for POST /api/fleet/update. 
+// Both fields are optional: empty target_version defaults to the +// server's current version, empty host_ids derives the out-of-date +// online subset. +type fleetUpdateStartReq struct { + TargetVersion string `json:"target_version,omitempty"` + HostIDs []string `json:"host_ids,omitempty"` +} + +// fleetUpdateHostView is one row in the JSON response for GET +// /api/fleet-updates/{id}. Hostname is hydrated from the store so +// callers don't need a second round-trip per host. +type fleetUpdateHostView struct { + HostID string `json:"host_id"` + HostName string `json:"host_name,omitempty"` + Position int `json:"position"` + Status string `json:"status"` + JobID string `json:"job_id,omitempty"` + FailedReason string `json:"failed_reason,omitempty"` +} + +// fleetUpdateView is the JSON projection of the parent + children. +type fleetUpdateView struct { + ID string `json:"id"` + StartedAt string `json:"started_at"` + StartedByUserID string `json:"started_by_user_id"` + TargetVersion string `json:"target_version"` + Status string `json:"status"` + CurrentHostID string `json:"current_host_id,omitempty"` + HaltedReason string `json:"halted_reason,omitempty"` + CompletedAt *string `json:"completed_at,omitempty"` + Hosts []fleetUpdateHostView `json:"hosts"` +} + +// fleetUpdatePage backs both the full /settings/fleet-update page +// and the partial polled fragment. Idle / Active are mutually +// exclusive: if Active is non-nil, render the progress view. +type fleetUpdatePage struct { + // Idle-state fields. + OutOfDateHosts []store.Host // online hosts whose version != target + TargetVersion string + + // Active-state fields. Nil when no fleet update has ever run. + Active *store.FleetUpdate + ActiveRows []fleetUpdateHostView + + // Common. + HostNames map[string]string + // PollURL is the partial endpoint htmx polls every few seconds. + PollURL string +} + +// handleAPIFleetUpdateStart is POST /api/fleet/update. 
+func (s *Server) handleAPIFleetUpdateStart(w stdhttp.ResponseWriter, r *stdhttp.Request) { + user, ok := s.requireUser(r) + if !ok { + writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorised", "") + return + } + if s.deps.FleetWorker == nil { + writeJSONError(w, stdhttp.StatusServiceUnavailable, "fleet_worker_unavailable", "") + return + } + var body fleetUpdateStartReq + // Empty body is fine — both fields are optional. + if r.ContentLength != 0 { + if err := json.NewDecoder(r.Body).Decode(&body); err != nil { + writeJSONError(w, stdhttp.StatusBadRequest, "invalid_json", err.Error()) + return + } + } + target := body.TargetVersion + if target == "" { + target = version.Version + } + hostIDs := body.HostIDs + if len(hostIDs) == 0 { + derived, err := s.deriveOutOfDateOnlineHostIDs(r.Context(), target) + if err != nil { + writeJSONError(w, stdhttp.StatusInternalServerError, "internal", err.Error()) + return + } + hostIDs = derived + } + if len(hostIDs) == 0 { + writeJSONError(w, stdhttp.StatusConflict, "no_hosts_eligible", + "no online hosts are out of date") + return + } + + fuID, err := s.deps.FleetWorker.Start(r.Context(), user.ID, target, hostIDs) + if err != nil { + if errors.Is(err, store.ErrFleetUpdateRunning) { + writeJSONError(w, stdhttp.StatusConflict, "fleet_update_in_progress", err.Error()) + return + } + writeJSONError(w, stdhttp.StatusInternalServerError, "internal", err.Error()) + return + } + + auditPayload, _ := json.Marshal(map[string]any{ + "fleet_update_id": fuID, + "target_version": target, + "host_count": len(hostIDs), + }) + _ = s.deps.Store.AppendAudit(r.Context(), store.AuditEntry{ + ID: ulid.Make().String(), UserID: &user.ID, Actor: "user", + Action: "fleet.update_started", + TargetKind: ptr("fleet_update"), TargetID: &fuID, + TS: time.Now().UTC(), + Payload: auditPayload, + }) + + writeJSON(w, stdhttp.StatusAccepted, map[string]string{"fleet_update_id": fuID}) +} + +// handleAPIFleetUpdateCancel is POST /api/fleet-updates/{id}/cancel. 
+func (s *Server) handleAPIFleetUpdateCancel(w stdhttp.ResponseWriter, r *stdhttp.Request) { + user, ok := s.requireUser(r) + if !ok { + writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorised", "") + return + } + if s.deps.FleetWorker == nil { + writeJSONError(w, stdhttp.StatusServiceUnavailable, "fleet_worker_unavailable", "") + return + } + fuID := chi.URLParam(r, "id") + if fuID == "" { + writeJSONError(w, stdhttp.StatusBadRequest, "missing_id", "") + return + } + fu, _, err := s.deps.Store.GetFleetUpdate(r.Context(), fuID) + if err != nil { + if errors.Is(err, store.ErrNotFound) { + writeJSONError(w, stdhttp.StatusNotFound, "fleet_update_not_found", "") + return + } + writeJSONError(w, stdhttp.StatusInternalServerError, "internal", err.Error()) + return + } + if fu.Status != "running" { + writeJSONError(w, stdhttp.StatusConflict, "fleet_update_not_running", + "fleet update is not in the running state") + return + } + if err := s.deps.FleetWorker.Cancel(r.Context(), fuID); err != nil { + writeJSONError(w, stdhttp.StatusInternalServerError, "internal", err.Error()) + return + } + _ = s.deps.Store.AppendAudit(r.Context(), store.AuditEntry{ + ID: ulid.Make().String(), UserID: &user.ID, Actor: "user", + Action: "fleet.update_cancelled", + TargetKind: ptr("fleet_update"), TargetID: &fuID, + TS: time.Now().UTC(), + }) + w.WriteHeader(stdhttp.StatusNoContent) +} + +// handleAPIFleetUpdateGet is GET /api/fleet-updates/{id}. 
+func (s *Server) handleAPIFleetUpdateGet(w stdhttp.ResponseWriter, r *stdhttp.Request) { + if _, ok := s.requireUser(r); !ok { + writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorised", "") + return + } + fuID := chi.URLParam(r, "id") + fu, hosts, err := s.deps.Store.GetFleetUpdate(r.Context(), fuID) + if err != nil { + if errors.Is(err, store.ErrNotFound) { + writeJSONError(w, stdhttp.StatusNotFound, "fleet_update_not_found", "") + return + } + writeJSONError(w, stdhttp.StatusInternalServerError, "internal", err.Error()) + return + } + names := s.hostNameMap(r) + view := fleetUpdateView{ + ID: fu.ID, + StartedAt: fu.StartedAt.UTC().Format(time.RFC3339Nano), + StartedByUserID: fu.StartedByUserID, + TargetVersion: fu.TargetVersion, + Status: fu.Status, + CurrentHostID: fu.CurrentHostID, + HaltedReason: fu.HaltedReason, + Hosts: make([]fleetUpdateHostView, 0, len(hosts)), + } + if fu.CompletedAt != nil { + s := fu.CompletedAt.UTC().Format(time.RFC3339Nano) + view.CompletedAt = &s + } + for _, h := range hosts { + view.Hosts = append(view.Hosts, fleetUpdateHostView{ + HostID: h.HostID, + HostName: names[h.HostID], + Position: h.Position, + Status: h.Status, + JobID: h.JobID, + FailedReason: h.FailedReason, + }) + } + writeJSON(w, stdhttp.StatusOK, view) +} + +// handleUIFleetUpdate renders /settings/fleet-update. 
+func (s *Server) handleUIFleetUpdate(w stdhttp.ResponseWriter, r *stdhttp.Request) { + u := s.requireUIUser(w, r) + if u == nil { + return + } + page, err := s.buildFleetUpdatePage(r) + if err != nil { + slog.Error("ui fleet update: build page", "err", err) + stdhttp.Error(w, "internal", stdhttp.StatusInternalServerError) + return + } + view := s.baseView(r, u) + view.Title = "Fleet update · restic-manager" + view.Active = "settings" + view.Page = page + if err := s.deps.UI.Render(w, "fleet_update", view); err != nil { + slog.Error("ui fleet update: render", "err", err) + } +} + +// handleUIFleetUpdatePartial renders just the inner panel for htmx +// auto-refresh polling — same data, no chrome. +func (s *Server) handleUIFleetUpdatePartial(w stdhttp.ResponseWriter, r *stdhttp.Request) { + u := s.requireUIUser(w, r) + if u == nil { + return + } + page, err := s.buildFleetUpdatePage(r) + if err != nil { + slog.Error("ui fleet update partial: build page", "err", err) + stdhttp.Error(w, "internal", stdhttp.StatusInternalServerError) + return + } + view := s.baseView(r, u) + view.Page = page + if err := s.deps.UI.RenderPartial(w, "fleet_update_inner", view); err != nil { + slog.Error("ui fleet update partial: render", "err", err) + } +} + +// buildFleetUpdatePage assembles the data both /settings/fleet-update +// and its partial render against. Resolves the most-recent fleet +// update (active OR completed/cancelled/halted) so the page can show +// the last roll's result instead of disappearing into "idle" the +// instant a roll finishes. 
+func (s *Server) buildFleetUpdatePage(r *stdhttp.Request) (fleetUpdatePage, error) { + page := fleetUpdatePage{ + TargetVersion: version.Version, + HostNames: map[string]string{}, + PollURL: "/settings/fleet-update/partial", + } + hosts, err := s.deps.Store.ListHosts(r.Context()) + if err != nil { + return page, err + } + for _, h := range hosts { + page.HostNames[h.ID] = h.Name + } + + active, err := s.deps.Store.ActiveFleetUpdate(r.Context()) + if err != nil { + return page, err + } + mostRecent := active + if mostRecent == nil { + // Fall back to the most recent terminal row so the page can + // show "completed" / "halted" / "cancelled" once the worker + // finishes. One small bespoke query — keeps the page from + // flashing back to "idle" the instant a roll wraps up. + var id string + err := s.deps.Store.DB().QueryRowContext(r.Context(), + `SELECT id FROM fleet_updates ORDER BY started_at DESC LIMIT 1`). + Scan(&id) + if err == nil { + fu, _, gerr := s.deps.Store.GetFleetUpdate(r.Context(), id) + if gerr == nil { + mostRecent = fu + } + } + } + + if mostRecent != nil { + _, rows, gerr := s.deps.Store.GetFleetUpdate(r.Context(), mostRecent.ID) + if gerr == nil { + page.Active = mostRecent + page.ActiveRows = make([]fleetUpdateHostView, 0, len(rows)) + for _, hr := range rows { + page.ActiveRows = append(page.ActiveRows, fleetUpdateHostView{ + HostID: hr.HostID, + HostName: page.HostNames[hr.HostID], + Position: hr.Position, + Status: hr.Status, + JobID: hr.JobID, + FailedReason: hr.FailedReason, + }) + } + } + } + + // Idle list (or "still out of date" reference even when an active + // roll is running — cheap to compute, harmless to attach). 
+ for _, h := range hosts { + if h.Status != "online" { + continue + } + if h.AgentVersion == "" || h.AgentVersion == page.TargetVersion { + continue + } + page.OutOfDateHosts = append(page.OutOfDateHosts, h) + } + return page, nil +} + +// deriveOutOfDateOnlineHostIDs returns the list of host IDs that +// (a) are online (Hub.Connected) and (b) have an agent_version that's +// non-empty AND != target. Used by the start endpoint when the caller +// omits host_ids. +func (s *Server) deriveOutOfDateOnlineHostIDs(ctx context.Context, target string) ([]string, error) { + hosts, err := s.deps.Store.ListHosts(ctx) + if err != nil { + return nil, err + } + out := []string{} + for _, h := range hosts { + if h.AgentVersion == "" || h.AgentVersion == target { + continue + } + if !s.deps.Hub.Connected(h.ID) { + continue + } + out = append(out, h.ID) + } + return out, nil +} + +// hostNameMap returns hostID → name; used to hydrate fleet-update +// JSON responses. +func (s *Server) hostNameMap(r *stdhttp.Request) map[string]string { + out := map[string]string{} + hosts, err := s.deps.Store.ListHosts(r.Context()) + if err != nil { + return out + } + for _, h := range hosts { + out[h.ID] = h.Name + } + return out +} diff --git a/internal/server/http/fleet_update_test.go b/internal/server/http/fleet_update_test.go new file mode 100644 index 0000000..ca82561 --- /dev/null +++ b/internal/server/http/fleet_update_test.go @@ -0,0 +1,334 @@ +// fleet_update_test.go — coverage for the P6-15 fleet-update HTTP +// surface: start/cancel/get JSON endpoints + RBAC. 
+package http + +import ( + "bytes" + "context" + "encoding/json" + stdhttp "net/http" + "sync" + "testing" + "time" + + "github.com/oklog/ulid/v2" + + "gitea.dcglab.co.uk/steve/restic-manager/internal/api" + "gitea.dcglab.co.uk/steve/restic-manager/internal/server/ws" + "gitea.dcglab.co.uk/steve/restic-manager/internal/store" + "gitea.dcglab.co.uk/steve/restic-manager/internal/version" +) + +// fakeFleetWorker stands in for *fleetupdate.Worker in HTTP tests. +// It records what was passed to Start/Cancel and lets tests inject +// canned errors. Satisfies the FleetWorker interface in +// host_update.go. +type fakeFleetWorker struct { + mu sync.Mutex + + startCalls []fakeStartCall + startID string + startErr error + + cancelCalls []string + cancelErr error +} + +type fakeStartCall struct { + UserID string + Target string + HostIDs []string +} + +func (f *fakeFleetWorker) Start(_ context.Context, userID, target string, hostIDs []string) (string, error) { + f.mu.Lock() + defer f.mu.Unlock() + f.startCalls = append(f.startCalls, fakeStartCall{userID, target, append([]string(nil), hostIDs...)}) + if f.startErr != nil { + return "", f.startErr + } + return f.startID, nil +} + +func (f *fakeFleetWorker) Cancel(_ context.Context, id string) error { + f.mu.Lock() + defer f.mu.Unlock() + f.cancelCalls = append(f.cancelCalls, id) + return f.cancelErr +} + +// helloOnlineHost is the smallest setup that lets the dispatch / +// derivation logic see a host as "online + version mismatch". +// Returns the host id. +func helloOnlineHost(t *testing.T, srv *Server, st *store.Store, name, agentVer string) string { + t.Helper() + id := makeHost(t, st, name) + if err := st.MarkHostHello(context.Background(), id, agentVer, "0.17", api.CurrentProtocolVersion, time.Now().UTC()); err != nil { + t.Fatalf("mark hello: %v", err) + } + // Mark connected on the hub so deriveOutOfDateOnlineHostIDs + // considers it online without needing a real WS handshake. 
The + // Conn has a nil websocket pointer — tests never call Send on it. + srv.deps.Hub.Register(id, ws.NewConn(id, nil)) + return id +} + +func TestFleetUpdateStartHappyPath(t *testing.T) { + t.Parallel() + srv, ts, st := rawTestServer(t) + worker := &fakeFleetWorker{startID: ulid.Make().String()} + srv.deps.FleetWorker = worker + + cookie, uid := loginAsAdminWithID(t, st) + hostID := helloOnlineHost(t, srv, st, "fu-host", "v0") + + body := map[string]any{"host_ids": []string{hostID}} + raw, _ := json.Marshal(body) + req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/fleet/update", bytes.NewReader(raw)) + req.AddCookie(cookie) + req.Header.Set("Content-Type", "application/json") + res, err := stdhttp.DefaultClient.Do(req) + if err != nil { + t.Fatalf("do: %v", err) + } + defer res.Body.Close() + if res.StatusCode != stdhttp.StatusAccepted { + t.Fatalf("status: got %d, want 202", res.StatusCode) + } + var out struct { + FleetUpdateID string `json:"fleet_update_id"` + } + if err := json.NewDecoder(res.Body).Decode(&out); err != nil { + t.Fatalf("decode: %v", err) + } + if out.FleetUpdateID != worker.startID { + t.Fatalf("fleet_update_id: got %q, want %q", out.FleetUpdateID, worker.startID) + } + worker.mu.Lock() + if len(worker.startCalls) != 1 || worker.startCalls[0].UserID != uid { + t.Fatalf("start calls: %+v", worker.startCalls) + } + if got := worker.startCalls[0].HostIDs; len(got) != 1 || got[0] != hostID { + t.Fatalf("host_ids: %v", got) + } + worker.mu.Unlock() + + // Audit row. 
+ var n int + if err := st.DB().QueryRow( + `SELECT COUNT(*) FROM audit_log WHERE action = 'fleet.update_started' AND target_id = ?`, + out.FleetUpdateID).Scan(&n); err != nil { + t.Fatalf("audit count: %v", err) + } + if n != 1 { + t.Fatalf("audit rows: got %d, want 1", n) + } +} + +func TestFleetUpdateStartConflictWhenAlreadyRunning(t *testing.T) { + t.Parallel() + srv, ts, st := rawTestServer(t) + worker := &fakeFleetWorker{startErr: store.ErrFleetUpdateRunning} + srv.deps.FleetWorker = worker + cookie := loginAsAdmin(t, st) + _ = helloOnlineHost(t, srv, st, "fu-host", "v0") + + req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/fleet/update", bytes.NewReader([]byte(`{}`))) + req.AddCookie(cookie) + req.Header.Set("Content-Type", "application/json") + res, err := stdhttp.DefaultClient.Do(req) + if err != nil { + t.Fatalf("do: %v", err) + } + defer res.Body.Close() + if res.StatusCode != stdhttp.StatusConflict { + t.Fatalf("status: got %d, want 409", res.StatusCode) + } + body := readJSONError(t, res.Body) + if body.Code != "fleet_update_in_progress" { + t.Fatalf("code: %q", body.Code) + } +} + +func TestFleetUpdateStartDerivesHostIDsWhenEmpty(t *testing.T) { + t.Parallel() + srv, ts, st := rawTestServer(t) + worker := &fakeFleetWorker{startID: ulid.Make().String()} + srv.deps.FleetWorker = worker + cookie := loginAsAdmin(t, st) + + // Two online + out-of-date, one online + at-target, one offline. + a := helloOnlineHost(t, srv, st, "behind-a", "v0") + b := helloOnlineHost(t, srv, st, "behind-b", "v0") + _ = helloOnlineHost(t, srv, st, "uptodate", version.Version) + offlineID := makeHost(t, st, "offline-host") + if err := st.MarkHostHello(context.Background(), offlineID, "v0", "0.17", api.CurrentProtocolVersion, time.Now().UTC()); err != nil { + t.Fatalf("mark hello: %v", err) + } + // Don't MarkOnline → derivation should skip. 
+ + req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/fleet/update", bytes.NewReader([]byte(`{}`))) + req.AddCookie(cookie) + req.Header.Set("Content-Type", "application/json") + res, err := stdhttp.DefaultClient.Do(req) + if err != nil { + t.Fatalf("do: %v", err) + } + defer res.Body.Close() + if res.StatusCode != stdhttp.StatusAccepted { + t.Fatalf("status: got %d, want 202", res.StatusCode) + } + worker.mu.Lock() + defer worker.mu.Unlock() + if len(worker.startCalls) != 1 { + t.Fatalf("start calls: %d", len(worker.startCalls)) + } + got := worker.startCalls[0].HostIDs + want := map[string]bool{a: true, b: true} + if len(got) != 2 || !want[got[0]] || !want[got[1]] { + t.Fatalf("derived host_ids: got %v, want both of %v", got, []string{a, b}) + } +} + +func TestFleetUpdateCancelHappyPath(t *testing.T) { + t.Parallel() + srv, ts, st := rawTestServer(t) + worker := &fakeFleetWorker{} + srv.deps.FleetWorker = worker + cookie := loginAsAdmin(t, st) + + // Seed a running fleet update directly. 
+ fuID := ulid.Make().String() + uid := ulid.Make().String() + if err := st.CreateUser(context.Background(), store.User{ + ID: uid, Username: "starter", PasswordHash: "x", + Role: store.RoleAdmin, CreatedAt: time.Now().UTC(), + }); err != nil { + t.Fatalf("seed user: %v", err) + } + hostID := makeHost(t, st, "fu-cancel-host") + if err := st.CreateFleetUpdate(context.Background(), + store.FleetUpdate{ID: fuID, StartedByUserID: uid, TargetVersion: "v1"}, + []string{hostID}); err != nil { + t.Fatalf("seed fleet update: %v", err) + } + + req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/fleet-updates/"+fuID+"/cancel", nil) + req.AddCookie(cookie) + res, err := stdhttp.DefaultClient.Do(req) + if err != nil { + t.Fatalf("do: %v", err) + } + defer res.Body.Close() + if res.StatusCode != stdhttp.StatusNoContent { + t.Fatalf("status: got %d, want 204", res.StatusCode) + } + worker.mu.Lock() + if len(worker.cancelCalls) != 1 || worker.cancelCalls[0] != fuID { + t.Fatalf("cancel calls: %v", worker.cancelCalls) + } + worker.mu.Unlock() +} + +func TestFleetUpdateCancelNotRunning(t *testing.T) { + t.Parallel() + srv, ts, st := rawTestServer(t) + srv.deps.FleetWorker = &fakeFleetWorker{} + cookie := loginAsAdmin(t, st) + + // Seed + complete one so it's no longer running. 
+ fuID := ulid.Make().String() + uid := ulid.Make().String() + _ = st.CreateUser(context.Background(), store.User{ + ID: uid, Username: "starter2", PasswordHash: "x", + Role: store.RoleAdmin, CreatedAt: time.Now().UTC(), + }) + hostID := makeHost(t, st, "fu-done-host") + _ = st.CreateFleetUpdate(context.Background(), + store.FleetUpdate{ID: fuID, StartedByUserID: uid, TargetVersion: "v1"}, + []string{hostID}) + if err := st.CompleteFleetUpdate(context.Background(), fuID, time.Now().UTC()); err != nil { + t.Fatalf("complete: %v", err) + } + + req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/fleet-updates/"+fuID+"/cancel", nil) + req.AddCookie(cookie) + res, err := stdhttp.DefaultClient.Do(req) + if err != nil { + t.Fatalf("do: %v", err) + } + defer res.Body.Close() + if res.StatusCode != stdhttp.StatusConflict { + t.Fatalf("status: got %d, want 409", res.StatusCode) + } + body := readJSONError(t, res.Body) + if body.Code != "fleet_update_not_running" { + t.Fatalf("code: %q", body.Code) + } +} + +func TestFleetUpdateGetHydrates(t *testing.T) { + t.Parallel() + _, ts, st := rawTestServer(t) + cookie := loginAsAdmin(t, st) + + uid := ulid.Make().String() + _ = st.CreateUser(context.Background(), store.User{ + ID: uid, Username: "starter3", PasswordHash: "x", + Role: store.RoleAdmin, CreatedAt: time.Now().UTC(), + }) + hostID := makeHost(t, st, "fu-get-host") + fuID := ulid.Make().String() + if err := st.CreateFleetUpdate(context.Background(), + store.FleetUpdate{ID: fuID, StartedByUserID: uid, TargetVersion: "v1.2.3"}, + []string{hostID}); err != nil { + t.Fatalf("seed: %v", err) + } + + req, _ := stdhttp.NewRequest("GET", ts.URL+"/api/fleet-updates/"+fuID, nil) + req.AddCookie(cookie) + res, err := stdhttp.DefaultClient.Do(req) + if err != nil { + t.Fatalf("do: %v", err) + } + defer res.Body.Close() + if res.StatusCode != stdhttp.StatusOK { + t.Fatalf("status: got %d, want 200", res.StatusCode) + } + var got fleetUpdateView + if err := 
json.NewDecoder(res.Body).Decode(&got); err != nil { + t.Fatalf("decode: %v", err) + } + if got.ID != fuID || got.TargetVersion != "v1.2.3" || got.Status != "running" { + t.Fatalf("parent: %+v", got) + } + if len(got.Hosts) != 1 || got.Hosts[0].HostID != hostID || got.Hosts[0].HostName != "fu-get-host" { + t.Fatalf("hosts: %+v", got.Hosts) + } +} + +func TestFleetUpdateRBAC(t *testing.T) { + t.Parallel() + _, ts, st := rawTestServer(t) + + for _, role := range []store.Role{store.RoleViewer, store.RoleOperator} { + role := role + t.Run(string(role), func(t *testing.T) { + cookie := loginAsRole(t, st, role) + req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/fleet/update", bytes.NewReader([]byte(`{}`))) + req.AddCookie(cookie) + req.Header.Set("Content-Type", "application/json") + res, err := stdhttp.DefaultClient.Do(req) + if err != nil { + t.Fatalf("do: %v", err) + } + defer res.Body.Close() + if res.StatusCode != stdhttp.StatusForbidden { + t.Fatalf("status: got %d, want 403", res.StatusCode) + } + }) + } +} + +// Sanity check that fakeFleetWorker satisfies the FleetWorker iface. +var _ FleetWorker = (*fakeFleetWorker)(nil) diff --git a/internal/server/http/host_update.go b/internal/server/http/host_update.go new file mode 100644 index 0000000..b1a2033 --- /dev/null +++ b/internal/server/http/host_update.go @@ -0,0 +1,217 @@ +package http + +import ( + "context" + "encoding/json" + stdhttp "net/http" + "time" + + "github.com/go-chi/chi/v5" + "github.com/oklog/ulid/v2" + + "gitea.dcglab.co.uk/steve/restic-manager/internal/api" + "gitea.dcglab.co.uk/steve/restic-manager/internal/store" + "gitea.dcglab.co.uk/steve/restic-manager/internal/version" +) + +// UpdateWatcher is the slim view of the ws.updateWatcher this package +// uses for tracking in-flight update dispatches. Defined as an +// interface so a test can inject a stub. 
+type UpdateWatcher interface {
+	// Track registers jobID as an in-flight update dispatched to
+	// hostID. dispatchHostUpdate calls it once the command.update
+	// envelope has been handed to the hub.
+	Track(jobID, hostID string)
+}
+
+// FleetWorker is the slim view of the fleetupdate.Worker this package
+// uses. Kept here for forward compatibility with P6-15 — the host
+// update endpoint itself does not use it.
+type FleetWorker interface {
+	// Start begins a fleet update towards targetVersion across hostIDs
+	// on behalf of userID. The returned string is presumably the new
+	// fleet-update ID — TODO confirm against fleetupdate.Worker.
+	Start(ctx context.Context, userID, targetVersion string, hostIDs []string) (string, error)
+	// Cancel requests cancellation of the fleet update identified by
+	// fleetUpdateID.
+	Cancel(ctx context.Context, fleetUpdateID string) error
+}
+
+// dispatchHostUpdateResult communicates structured outcomes from the
+// shared dispatch path so both the HTTP handler and the fleet worker
+// can format errors in their own idiom.
+type dispatchHostUpdateResult struct {
+	// JobID is the newly created job on success. When Code is
+	// "update_in_progress" it carries the ID of the job that is
+	// already in flight; on every other failure it is empty.
+	JobID  string
+	Code   string // "" on success
+	Status int    // HTTP status the JSON handler should use on error
+	Msg    string // human-readable detail (optional)
+}
+
+// dispatchHostUpdate is the shared "send command.update to one host"
+// path. It performs every pre-check (host exists, online, version
+// mismatch, no in-flight update) and on success creates the jobs row,
+// dispatches the WS envelope, tracks the watcher entry, and appends
+// the audit record.
+//
+// Pre-checks are returned as structured codes rather than HTTP errors
+// so the fleet worker can map them onto its own per-host status enum
+// without parsing strings.
+func (s *Server) dispatchHostUpdate(ctx context.Context, hostID string, actorKind string, actorID *string) dispatchHostUpdateResult {
+	// internalErr maps an unexpected store/marshal failure onto the
+	// generic "internal" code.
+	internalErr := func(err error) dispatchHostUpdateResult {
+		return dispatchHostUpdateResult{Code: "internal", Status: stdhttp.StatusInternalServerError, Msg: err.Error()}
+	}
+
+	host, err := s.deps.Store.GetHost(ctx, hostID)
+	if err != nil || host == nil {
+		// NOTE(review): every GetHost error is reported as
+		// host_not_found; confirm whether transient store failures
+		// should surface as "internal" instead.
+		return dispatchHostUpdateResult{Code: "host_not_found", Status: stdhttp.StatusNotFound}
+	}
+	// The hub is optional elsewhere in this package (routes and
+	// loadHostChrome both nil-check it). Without a hub no agent can be
+	// connected, so report offline rather than dereferencing nil.
+	if s.deps.Hub == nil || !s.deps.Hub.Connected(host.ID) {
+		return dispatchHostUpdateResult{
+			Code: "host_offline", Status: stdhttp.StatusConflict,
+			Msg: "agent is not currently connected",
+		}
+	}
+	if host.AgentVersion != "" && host.AgentVersion == version.Version {
+		return dispatchHostUpdateResult{
+			Code: "already_up_to_date", Status: stdhttp.StatusConflict,
+			Msg: "agent already running version " + version.Version,
+		}
+	}
+	existing, err := s.deps.Store.RunningUpdateJobForHost(ctx, hostID)
+	if err != nil {
+		return internalErr(err)
+	}
+	if existing != "" {
+		return dispatchHostUpdateResult{
+			Code: "update_in_progress", Status: stdhttp.StatusConflict,
+			Msg:   "an update job is already in flight for this host",
+			JobID: existing,
+		}
+	}
+
+	jobID := ulid.Make().String()
+	// Build the envelope before touching the store: a marshal failure
+	// after CreateJob would strand an unfinished update job, and that
+	// row makes every later dispatch fail with update_in_progress.
+	env, err := api.Marshal(api.MsgCommandUpdate, ulid.Make().String(), api.CommandUpdatePayload{
+		JobID: jobID,
+	})
+	if err != nil {
+		return internalErr(err)
+	}
+	now := time.Now().UTC()
+	if err := s.deps.Store.CreateJob(ctx, store.Job{
+		ID: jobID, HostID: hostID, Kind: "update",
+		ActorKind: actorKind, ActorID: actorID,
+		CreatedAt: now,
+	}); err != nil {
+		return internalErr(err)
+	}
+	if err := s.deps.Hub.Send(ctx, hostID, env); err != nil {
+		// Roll the job to failed so we don't leak a queued row. The
+		// send may have failed because ctx was cancelled, so the
+		// clean-up runs on a detached, time-boxed context.
+		cleanupCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), 5*time.Second)
+		// Best-effort: the caller is already being told the dispatch failed.
+		_ = s.deps.Store.MarkJobFinished(cleanupCtx, jobID, "failed", -1, nil, err.Error(), time.Now().UTC())
+		cancel()
+		return dispatchHostUpdateResult{
+			Code: "host_offline", Status: stdhttp.StatusConflict, Msg: err.Error(),
+		}
+	}
+	if s.deps.UpdateWatcher != nil {
+		s.deps.UpdateWatcher.Track(jobID, hostID)
+	}
+
+	// Marshalling a map[string]string cannot fail, so the error is
+	// deliberately dropped.
+	auditPayload, _ := json.Marshal(map[string]string{
+		"job_id":         jobID,
+		"target_version": version.Version,
+	})
+	// Auditing is best-effort: the command is already on the wire, so
+	// an audit failure must not turn the dispatch into an error.
+	_ = s.deps.Store.AppendAudit(ctx, store.AuditEntry{
+		ID:         ulid.Make().String(),
+		UserID:     actorID,
+		Actor:      actorKind,
+		Action:     "host.update_dispatched",
+		TargetKind: ptr("host"),
+		TargetID:   &hostID,
+		TS:         now,
+		Payload:    auditPayload,
+	})
+
+	return dispatchHostUpdateResult{JobID: jobID}
+}
+
+// handleHostUpdate is POST /api/hosts/{id}/update — JSON, admin-only.
+// On success it answers 202 Accepted with {"job_id": ...}; pre-check
+// and dispatch failures are written through writeJSONError using the
+// status and code chosen by dispatchHostUpdate.
+func (s *Server) handleHostUpdate(w stdhttp.ResponseWriter, r *stdhttp.Request) {
+	user, ok := s.requireUser(r)
+	if !ok {
+		writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorised", "")
+		return
+	}
+	hostID := chi.URLParam(r, "id")
+	if hostID == "" {
+		writeJSONError(w, stdhttp.StatusBadRequest, "missing_host_id", "")
+		return
+	}
+	// actorID stays nil if requireUser reported ok without a user.
+	var actorID *string
+	if user != nil {
+		actorID = &user.ID
+	}
+	res := s.dispatchHostUpdate(r.Context(), hostID, "user", actorID)
+	if res.Code != "" {
+		writeJSONError(w, res.Status, res.Code, res.Msg)
+		return
+	}
+	writeJSON(w, stdhttp.StatusAccepted, map[string]string{"job_id": res.JobID})
+}
+
+// handleHostUpdateForm is the HTMX-friendly POST /hosts/{id}/update
+// variant. On success it sets HX-Redirect to the job detail page; on
+// pre-check failures it renders an inline error banner.
+func (s *Server) handleHostUpdateForm(w stdhttp.ResponseWriter, r *stdhttp.Request) {
+	user, ok := s.requireUser(r)
+	if !ok {
+		stdhttp.Error(w, "unauthorised", stdhttp.StatusUnauthorized)
+		return
+	}
+	hostID := chi.URLParam(r, "id")
+	if hostID == "" {
+		stdhttp.Error(w, "missing host_id", stdhttp.StatusBadRequest)
+		return
+	}
+	actor := "user"
+	var actorID *string
+	if user != nil {
+		actorID = &user.ID
+	}
+	res := s.dispatchHostUpdate(r.Context(), hostID, actor, actorID)
+	if res.Code != "" {
+		// Inline banner for HTMX swaps. Mirrors what host_credentials
+		// returns on validation errors — small text/html fragment.
+		w.Header().Set("Content-Type", "text/html; charset=utf-8")
+		w.WriteHeader(res.Status)
+		msg := hostUpdateErrorMessage(res.Code, res.Msg)
+		// NOTE(review): the wrapper markup below is a reconstruction;
+		// confirm the element and class names against the banner that
+		// host_credentials renders. msg is escaped because the fallback
+		// text can carry raw error strings.
+		_, _ = w.Write([]byte(`<div class="banner banner-error" role="alert">` + htmlEscape(msg) + `</div>`))
+		return
+	}
+	w.Header().Set("HX-Redirect", "/jobs/"+res.JobID)
+	w.WriteHeader(stdhttp.StatusOK)
+}
+
+// hostUpdateErrorMessage turns a dispatchHostUpdate result code into
+// the sentence shown in the inline banner. Known codes get fixed copy;
+// any other code falls back to msg, or to a generic failure line when
+// msg is empty.
+func hostUpdateErrorMessage(code, msg string) string {
+	switch code {
+	case "host_not_found":
+		return "Host not found."
+	case "host_offline":
+		return "Agent is offline; can't deliver the update command."
+	case "already_up_to_date":
+		return "Agent is already running the current version."
+	case "update_in_progress":
+		return "An update is already in progress for this host."
+	}
+	if msg != "" {
+		return msg
+	}
+	return "Update dispatch failed."
+}
+
+// htmlEscape is a minimal HTML-attr-safe escaper. Avoids pulling html/template
+// for a one-shot inline banner. It rewrites &, <, >, " and ' to their
+// entity forms and copies every other byte through unchanged, so
+// multi-byte UTF-8 sequences survive intact.
+func htmlEscape(s string) string {
+	out := make([]byte, 0, len(s))
+	for i := 0; i < len(s); i++ {
+		switch c := s[i]; c {
+		case '&':
+			out = append(out, "&amp;"...)
+		case '<':
+			out = append(out, "&lt;"...)
+		case '>':
+			out = append(out, "&gt;"...)
+		case '"':
+			out = append(out, "&quot;"...)
+		case '\'':
+			// Needed for single-quoted attribute values.
+			out = append(out, "&#39;"...)
+		default:
+			out = append(out, c)
+		}
+	}
+	return string(out)
+}
diff --git a/internal/server/http/host_update_test.go b/internal/server/http/host_update_test.go
new file mode 100644
index 0000000..30cc0ce
--- /dev/null
+++ b/internal/server/http/host_update_test.go
@@ -0,0 +1,270 @@
+// host_update_test.go — covers POST /api/hosts/{id}/update.
+package http
+
+import (
+	"context"
+	"encoding/json"
+	"io"
+	stdhttp "net/http"
+	"strings"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/coder/websocket"
+	"github.com/oklog/ulid/v2"
+
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
+)
+
+// stubWatcher records Track calls so tests can assert the watcher was
+// notified.
+type stubWatcher struct {
+	mu      sync.Mutex
+	tracked []string // hostIDs
+}
+
+func (s *stubWatcher) Track(_, hostID string) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.tracked = append(s.tracked, hostID)
+}
+
+func TestHostUpdateHappyPath(t *testing.T) {
+	t.Parallel()
+	srv, ts, st := rawTestServer(t)
+	watcher := &stubWatcher{}
+	srv.deps.UpdateWatcher = watcher
+	hostID, token := enrolHostForWS(t, srv, st, "upd-host")
+	c := agentDial(t, srv, ts, hostID, token)
+	sendHello(t, c, "upd-host")
+	_ = drainUntil(t, c, api.MsgScheduleSet)
+
+	// Force a version mismatch so the dispatch isn't short-circuited.
+	if err := st.MarkHostHello(context.Background(), hostID, "v0", "0.17", api.CurrentProtocolVersion, time.Now().UTC()); err != nil {
+		t.Fatalf("mark hello: %v", err)
+	}
+
+	cookie := loginAsAdmin(t, st)
+	req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/hosts/"+hostID+"/update", nil)
+	req.AddCookie(cookie)
+	res, err := stdhttp.DefaultClient.Do(req)
+	if err != nil {
+		t.Fatalf("do: %v", err)
+	}
+	defer res.Body.Close()
+	if res.StatusCode != stdhttp.StatusAccepted {
+		t.Fatalf("status: got %d, want 202", res.StatusCode)
+	}
+	var out struct {
+		JobID string `json:"job_id"`
+	}
+	if err := json.NewDecoder(res.Body).Decode(&out); err != nil {
+		t.Fatalf("decode: %v", err)
+	}
+	if out.JobID == "" {
+		t.Fatal("missing job_id in response")
+	}
+
+	// command.update envelope arrives.
+	deadline := time.Now().Add(2 * time.Second)
+	var got api.Envelope
+	for time.Now().Before(deadline) {
+		ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
+		mt, raw, rerr := c.Read(ctx)
+		cancel()
+		if rerr != nil {
+			break
+		}
+		if mt != websocket.MessageText {
+			continue
+		}
+		if !strings.Contains(string(raw), `"command.update"`) {
+			continue
+		}
+		_ = json.Unmarshal(raw, &got)
+		break
+	}
+	if got.Type != api.MsgCommandUpdate {
+		t.Fatal("never received command.update envelope")
+	}
+	var cp api.CommandUpdatePayload
+	if err := got.UnmarshalPayload(&cp); err != nil {
+		t.Fatalf("payload: %v", err)
+	}
+	if cp.JobID != out.JobID {
+		t.Fatalf("payload job_id: got %q want %q", cp.JobID, out.JobID)
+	}
+
+	// Watcher tracked.
+	watcher.mu.Lock()
+	defer watcher.mu.Unlock()
+	if len(watcher.tracked) != 1 || watcher.tracked[0] != hostID {
+		t.Fatalf("watcher tracked: %v", watcher.tracked)
+	}
+
+	// Audit row exists.
+	var n int
+	if err := st.DB().QueryRow(
+		`SELECT COUNT(*) FROM audit_log WHERE action = 'host.update_dispatched' AND target_id = ?`,
+		hostID).Scan(&n); err != nil {
+		t.Fatalf("audit count: %v", err)
+	}
+	if n != 1 {
+		t.Fatalf("audit rows: got %d, want 1", n)
+	}
+}
+
+func TestHostUpdateNotFound(t *testing.T) {
+	t.Parallel()
+	_, ts, st := rawTestServer(t)
+	cookie := loginAsAdmin(t, st)
+	req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/hosts/no-such/update", nil)
+	req.AddCookie(cookie)
+	res, err := stdhttp.DefaultClient.Do(req)
+	if err != nil {
+		t.Fatalf("do: %v", err)
+	}
+	defer res.Body.Close()
+	if res.StatusCode != stdhttp.StatusNotFound {
+		t.Fatalf("status: got %d want 404", res.StatusCode)
+	}
+}
+
+func TestHostUpdateOffline(t *testing.T) {
+	t.Parallel()
+	_, ts, st := rawTestServer(t)
+	hostID := ulid.Make().String()
+	if err := st.CreateHost(context.Background(), store.Host{
+		ID: hostID, Name: "off", OS: "linux", Arch: "amd64",
+		EnrolledAt: time.Now().UTC(),
+	}, "deadbeef", ""); err != nil {
+		t.Fatalf("create: %v", err)
+	}
+	cookie := loginAsAdmin(t, st)
+	req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/hosts/"+hostID+"/update", nil)
+	req.AddCookie(cookie)
+	res, err := stdhttp.DefaultClient.Do(req)
+	if err != nil {
+		t.Fatalf("do: %v", err)
+	}
+	defer res.Body.Close()
+	if res.StatusCode != stdhttp.StatusConflict {
+		t.Fatalf("status: got %d want 409", res.StatusCode)
+	}
+	body := readJSONError(t, res.Body)
+	if body.Code != "host_offline" {
+		t.Fatalf("code: %q", body.Code)
+	}
+}
+
+func TestHostUpdateAlreadyUpToDate(t *testing.T) {
+	t.Parallel()
+	srv, ts, st := rawTestServer(t)
+	hostID, token := enrolHostForWS(t, srv, st, "uptodate-host")
+	c := agentDial(t, srv, ts, hostID, token)
+	sendHello(t, c, "uptodate-host")
+	_ = drainUntil(t, c, api.MsgScheduleSet)
+
+	// Force agent_version == version.Version.
+	if err := st.MarkHostHello(context.Background(), hostID, version.Version, "0.17", api.CurrentProtocolVersion, time.Now().UTC()); err != nil {
+		t.Fatalf("mark hello: %v", err)
+	}
+
+	cookie := loginAsAdmin(t, st)
+	req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/hosts/"+hostID+"/update", nil)
+	req.AddCookie(cookie)
+	res, err := stdhttp.DefaultClient.Do(req)
+	if err != nil {
+		t.Fatalf("do: %v", err)
+	}
+	defer res.Body.Close()
+	if res.StatusCode != stdhttp.StatusConflict {
+		t.Fatalf("status: got %d want 409", res.StatusCode)
+	}
+	body := readJSONError(t, res.Body)
+	if body.Code != "already_up_to_date" {
+		t.Fatalf("code: %q", body.Code)
+	}
+}
+
+func TestHostUpdateInProgress(t *testing.T) {
+	t.Parallel()
+	srv, ts, st := rawTestServer(t)
+	hostID, token := enrolHostForWS(t, srv, st, "inprog-host")
+	c := agentDial(t, srv, ts, hostID, token)
+	sendHello(t, c, "inprog-host")
+	_ = drainUntil(t, c, api.MsgScheduleSet)
+	if err := st.MarkHostHello(context.Background(), hostID, "v0", "0.17", api.CurrentProtocolVersion, time.Now().UTC()); err != nil {
+		t.Fatalf("mark hello: %v", err)
+	}
+
+	// Pre-seed an in-flight update job.
+	jobID := ulid.Make().String()
+	if err := st.CreateJob(context.Background(), store.Job{
+		ID: jobID, HostID: hostID, Kind: "update",
+		ActorKind: "user", CreatedAt: time.Now().UTC(),
+	}); err != nil {
+		t.Fatalf("seed job: %v", err)
+	}
+
+	cookie := loginAsAdmin(t, st)
+	req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/hosts/"+hostID+"/update", nil)
+	req.AddCookie(cookie)
+	res, err := stdhttp.DefaultClient.Do(req)
+	if err != nil {
+		t.Fatalf("do: %v", err)
+	}
+	defer res.Body.Close()
+	if res.StatusCode != stdhttp.StatusConflict {
+		t.Fatalf("status: got %d want 409", res.StatusCode)
+	}
+	body := readJSONError(t, res.Body)
+	if body.Code != "update_in_progress" {
+		t.Fatalf("code: %q", body.Code)
+	}
+}
+
+func TestHostUpdateRBAC(t *testing.T) {
+	t.Parallel()
+	_, ts, st := rawTestServer(t)
+	hostID := ulid.Make().String()
+	if err := st.CreateHost(context.Background(), store.Host{
+		ID: hostID, Name: "rbac-host", OS: "linux", Arch: "amd64",
+		EnrolledAt: time.Now().UTC(),
+	}, "deadbeef", ""); err != nil {
+		t.Fatalf("create: %v", err)
+	}
+	for _, role := range []store.Role{store.RoleViewer, store.RoleOperator} {
+		role := role
+		t.Run(string(role), func(t *testing.T) {
+			cookie := loginAsRole(t, st, role)
+			req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/hosts/"+hostID+"/update", nil)
+			req.AddCookie(cookie)
+			res, err := stdhttp.DefaultClient.Do(req)
+			if err != nil {
+				t.Fatalf("do: %v", err)
+			}
+			defer res.Body.Close()
+			if res.StatusCode != stdhttp.StatusForbidden {
+				t.Fatalf("status for %s: got %d want 403", role, res.StatusCode)
+			}
+		})
+	}
+}
+
+type jsonErrBody struct {
+	Code    string `json:"code"`
+	Message string `json:"message,omitempty"`
+}
+
+func readJSONError(t *testing.T, body io.Reader) jsonErrBody {
+	t.Helper()
+	var out jsonErrBody
+	if err := json.NewDecoder(body).Decode(&out); err != nil {
+		t.Fatalf("decode error body: %v", err)
+	}
+	return out
+}
diff --git a/internal/server/http/hosts.go b/internal/server/http/hosts.go
index 59f913b..d355626 100644 --- a/internal/server/http/hosts.go +++ b/internal/server/http/hosts.go @@ -4,6 +4,7 @@ import ( stdhttp "net/http" "gitea.dcglab.co.uk/steve/restic-manager/internal/store" + "gitea.dcglab.co.uk/steve/restic-manager/internal/version" ) // hostView is the JSON projection of a Host row. Same shape as the @@ -27,6 +28,8 @@ type hostView struct { RepoSizeBytes int64 `json:"repo_size_bytes"` SnapshotCount int `json:"snapshot_count"` OpenAlertCount int `json:"open_alert_count"` + UpdateAvailable bool `json:"update_available"` + TargetVersion string `json:"target_version,omitempty"` } // handleListHosts returns the full fleet as JSON. Authenticated; the @@ -85,6 +88,8 @@ func hostToView(h store.Host) hostView { RepoSizeBytes: h.RepoSizeBytes, SnapshotCount: h.SnapshotCount, OpenAlertCount: h.OpenAlertCount, + TargetVersion: version.Version, + UpdateAvailable: h.AgentVersion != "" && h.AgentVersion != version.Version, } if v.Tags == nil { v.Tags = []string{} diff --git a/internal/server/http/server.go b/internal/server/http/server.go index 5db2438..17ecc7a 100644 --- a/internal/server/http/server.go +++ b/internal/server/http/server.go @@ -39,6 +39,13 @@ type Deps struct { // NotificationHub (optional, wired in G1) is used by the test-fire // endpoint to dispatch a single synthetic payload through a channel. NotificationHub *notification.Hub + // UpdateWatcher tracks in-flight agent self-update dispatches and + // reconciles them against incoming hello envelopes. Optional; + // nil = no-op (handlers degrade by skipping the Track call). + UpdateWatcher UpdateWatcher + // FleetWorker drives the rolling fleet-update worker. Optional; + // nil = fleet update endpoints (P6-15) report unavailable. + FleetWorker FleetWorker // Version is the binary's build version, surfaced in the chrome. // Empty falls back to "dev". 
Version string @@ -123,8 +130,9 @@ func (s *Server) routes(r chi.Router) { r.Post("/api/agents/announce", s.handleAnnounce) r.Get("/agent/binary", s.handleAgentBinary) r.Get("/install/*", s.handleInstallAsset) + r.Get("/api/version", s.handleVersion) if s.deps.Hub != nil { - r.Mount("/ws/agent", ws.AgentHandler(ws.HandlerDeps{ + hd := ws.HandlerDeps{ Hub: s.deps.Hub, Store: s.deps.Store, JobHub: s.deps.JobHub, @@ -132,7 +140,11 @@ func (s *Server) routes(r chi.Router) { OnHello: s.onAgentHello, OnScheduleAck: s.applyScheduleAck, OnScheduleFire: s.dispatchScheduledJob, - })) + } + if w, ok := s.deps.UpdateWatcher.(*ws.UpdateWatcher); ok && w != nil { + hd.UpdateWatcher = w + } + r.Mount("/ws/agent", ws.AgentHandler(hd)) } r.Get("/ws/agent/pending", s.handlePendingWS) r.Mount("/static/", staticHandler()) @@ -270,6 +282,14 @@ func (s *Server) routes(r chi.Router) { r.Group(func(r chi.Router) { r.Use(s.requireRole(store.RoleAdmin)) + r.Post("/api/hosts/{id}/update", s.handleHostUpdate) + r.Post("/hosts/{id}/update", s.handleHostUpdateForm) + + // Fleet update (P6-15): rolling update across many hosts. 
+ r.Post("/api/fleet/update", s.handleAPIFleetUpdateStart) + r.Post("/api/fleet-updates/{id}/cancel", s.handleAPIFleetUpdateCancel) + r.Get("/api/fleet-updates/{id}", s.handleAPIFleetUpdateGet) + r.Get("/api/users", s.handleAPIUsersList) r.Post("/api/users", s.handleAPIUserCreate) r.Get("/api/users/{id}", s.handleAPIUserGet) @@ -283,6 +303,8 @@ func (s *Server) routes(r chi.Router) { if s.deps.UI != nil { r.Post("/hosts/{id}/delete", s.handleUIHostDelete) r.Get("/settings", s.handleUISettings) + r.Get("/settings/fleet-update", s.handleUIFleetUpdate) + r.Get("/settings/fleet-update/partial", s.handleUIFleetUpdatePartial) r.Get("/settings/users", s.handleUIUsersList) r.Get("/settings/users/new", s.handleUIUserNewGet) r.Post("/settings/users/new", s.handleUIUserNewPost) @@ -321,6 +343,27 @@ func (s *Server) Shutdown(ctx context.Context) error { return s.srv.Shutdown(ctx) } +// SetFleetWorker installs the fleet-update worker post-construction. +// Used to break the wiring loop in cmd/server (the worker depends on a +// dispatcher that delegates back into the server's host-update path). +func (s *Server) SetFleetWorker(fw FleetWorker) { s.deps.FleetWorker = fw } + +// DispatchHostUpdate is the public entry point for callers (the fleet +// worker) that need to drive the same dispatch path the HTTP handler +// uses, without going through HTTP. Returns the structured result so +// the caller can map error codes to its own status enum. +func (s *Server) DispatchHostUpdate(ctx context.Context, hostID, actorUserID string) (jobID string, code string, err error) { + var actorID *string + if actorUserID != "" { + actorID = &actorUserID + } + res := s.dispatchHostUpdate(ctx, hostID, "user", actorID) + if res.Code != "" { + return res.JobID, res.Code, nil + } + return res.JobID, "", nil +} + // Addr returns the configured listen address. Useful in tests when // the caller passes :0 to get a random port. 
func (s *Server) Addr() string { return s.srv.Addr } diff --git a/internal/server/http/ui_handlers.go b/internal/server/http/ui_handlers.go index ffa75d2..c569c27 100644 --- a/internal/server/http/ui_handlers.go +++ b/internal/server/http/ui_handlers.go @@ -23,6 +23,7 @@ import ( "gitea.dcglab.co.uk/steve/restic-manager/internal/server/ui" "gitea.dcglab.co.uk/steve/restic-manager/internal/server/ws" "gitea.dcglab.co.uk/steve/restic-manager/internal/store" + "gitea.dcglab.co.uk/steve/restic-manager/internal/version" "gitea.dcglab.co.uk/steve/restic-manager/web" ) @@ -155,6 +156,10 @@ type dashboardPage struct { // when it's already active). Pre-computed so the template stays // dumb. SortURL map[string]string + // UpdatesBehind is the count of online hosts whose agent_version + // trails the server. Surfaces as the dashboard "N hosts behind" + // hero tile and links to ?updates=behind. + UpdatesBehind int } // dashboardFilter holds the parsed query-string filter state. @@ -165,6 +170,10 @@ type dashboardFilter struct { Tag string // mirrors ActiveTag for round-trip on links Sort string // column key (see sortDashboard) Dir string // "asc" | "desc" + // Updates narrows to hosts whose agent is behind the server's + // version. Only valid value today is "behind"; empty means no + // filter. + Updates string } // dashboardHostRow carries a host plus the per-row Run-now decision @@ -180,6 +189,13 @@ type dashboardHostRow struct { // NextRun is the next-fire time of RunAllScheduleID (when set), // computed server-side from its cron. nil otherwise. NextRun *time.Time + // UpdateAvailable is true when the host's agent has connected at + // least once AND its agent_version differs from the server's. Used + // by the host_row partial to render the update-available chip. + UpdateAvailable bool + // TargetVersion is the server's build version, surfaced in the + // chip's tooltip and label. 
+ TargetVersion string } // pickRunAllSchedule returns the ID of the single schedule whose @@ -255,7 +271,11 @@ func (s *Server) handleUIDashboard(w stdhttp.ResponseWriter, r *stdhttp.Request) // calls per host — fine at fleet sizes we care about. rows := make([]dashboardHostRow, 0, len(hosts)) for _, h := range hosts { - row := dashboardHostRow{Host: h} + row := dashboardHostRow{ + Host: h, + TargetVersion: version.Version, + UpdateAvailable: h.AgentVersion != "" && h.AgentVersion != version.Version, + } groups, gerr := s.deps.Store.ListSourceGroupsByHost(r.Context(), h.ID) if gerr != nil { slog.Warn("ui dashboard: list source groups", "host_id", h.ID, "err", gerr) @@ -289,6 +309,13 @@ func (s *Server) handleUIDashboard(w stdhttp.ResponseWriter, r *stdhttp.Request) critOpenCount = len(crit) } + updatesBehind := 0 + for _, h := range allHosts { + if h.Status == "online" && h.AgentVersion != "" && h.AgentVersion != version.Version { + updatesBehind++ + } + } + view := s.baseView(r, u) view.Page = dashboardPage{ Hosts: rows, @@ -302,6 +329,7 @@ func (s *Server) handleUIDashboard(w stdhttp.ResponseWriter, r *stdhttp.Request) Filter: filter, RefreshURL: "/?" + filter.encode(), SortURL: buildDashboardSortURLs(filter), + UpdatesBehind: updatesBehind, } if err := s.deps.UI.Render(w, "dashboard", view); err != nil { slog.Error("ui: render dashboard", "err", err) @@ -320,6 +348,7 @@ func parseDashboardFilter(q url.Values) dashboardFilter { Tag: q.Get("tag"), Sort: q.Get("sort"), Dir: q.Get("dir"), + Updates: q.Get("updates"), } if f.Sort == "" { f.Sort = "name" @@ -352,6 +381,9 @@ func (f dashboardFilter) encode() string { if f.Dir != "" && f.Dir != "asc" { v.Set("dir", f.Dir) } + if f.Updates != "" { + v.Set("updates", f.Updates) + } return v.Encode() } @@ -402,6 +434,11 @@ func filterAndSortDashboardHosts(hosts []store.Host, f dashboardFilter) []store. 
continue } } + if f.Updates == "behind" { + if h.AgentVersion == "" || h.AgentVersion == version.Version { + continue + } + } out = append(out, h) } sortDashboardHosts(out, f.Sort, f.Dir) @@ -809,6 +846,20 @@ type hostChromeData struct { SourceGroupCount int ScheduleCount int ScheduleVersion int64 // host_schedule_version (latest desired) + // UpdateAvailable + TargetVersion drive the agent-out-of-date chip + // in the host detail header. UpdateAvailable is true iff the host + // has connected at least once AND its agent_version != server's. + UpdateAvailable bool + TargetVersion string + // Online + UpdateInProgress drive the per-host "Update agent" + // button on host_detail. Online mirrors hub.Connected; pulled here + // so the button can disable when the host is unreachable. + Online bool + UpdateInProgress bool + // CanAdmin is true when the viewing user has admin role; used to + // gate the "Update agent" button. Kept on the chrome struct so any + // page reusing host_chrome already has it for free. + CanAdmin bool // KnownTags is the union of tags already in use across the fleet, // used for autocomplete on the host-tags edit form. Cheap query. KnownTags []string @@ -834,6 +885,14 @@ type hostChromeData struct { // render the page with stale counts than 500 the whole tab. 
func (s *Server) loadHostChrome(r *stdhttp.Request, host store.Host, subtab, crumb string) hostChromeData { d := hostChromeData{Host: host, SubTab: subtab, Crumb: crumb} + d.TargetVersion = version.Version + d.UpdateAvailable = host.AgentVersion != "" && host.AgentVersion != version.Version + if s.deps.Hub != nil { + d.Online = s.deps.Hub.Connected(host.ID) + } + if existing, _ := s.deps.Store.RunningUpdateJobForHost(r.Context(), host.ID); existing != "" { + d.UpdateInProgress = true + } if groups, err := s.deps.Store.ListSourceGroupsByHost(r.Context(), host.ID); err == nil { d.SourceGroupCount = len(groups) } else { @@ -972,8 +1031,10 @@ func (s *Server) handleUIHostDetail(w stdhttp.ResponseWriter, r *stdhttp.Request view := s.baseView(r, u) view.Title = host.Name + " · restic-manager" + chrome := s.loadHostChrome(r, *host, "snapshots", "snapshots") + chrome.CanAdmin = u.Role == string(store.RoleAdmin) view.Page = hostDetailPage{ - hostChromeData: s.loadHostChrome(r, *host, "snapshots", "snapshots"), + hostChromeData: chrome, Snapshots: shown, SnapshotsShown: len(shown), LegacyRestic: !restic.Env{Version: host.ResticVersion}.AtLeastVersion(0, 17), diff --git a/internal/server/http/version.go b/internal/server/http/version.go new file mode 100644 index 0000000..33cc17f --- /dev/null +++ b/internal/server/http/version.go @@ -0,0 +1,20 @@ +package http + +import ( + "encoding/json" + stdhttp "net/http" + + "gitea.dcglab.co.uk/steve/restic-manager/internal/version" +) + +// handleVersion exposes the server's build-time identifying constants +// (set via -ldflags). Public-band — no secrets surface here, the agent +// updater compares its own agent_version byte-for-byte against the +// Version field to drive the "out of date" signal. 
+func (s *Server) handleVersion(w stdhttp.ResponseWriter, r *stdhttp.Request) {
+	w.Header().Set("Content-Type", "application/json")
+	_ = json.NewEncoder(w).Encode(map[string]string{ // encode/write error is explicitly discarded
+		"version": version.Version,
+		"commit":  version.Commit,
+	})
+}
diff --git a/internal/server/http/version_test.go b/internal/server/http/version_test.go
new file mode 100644
index 0000000..b012818
--- /dev/null
+++ b/internal/server/http/version_test.go
@@ -0,0 +1,42 @@
+package http
+
+import (
+	"encoding/json"
+	stdhttp "net/http"
+	"testing"
+
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
+)
+
+func TestVersionEndpoint(t *testing.T) {
+	// No t.Parallel: this test swaps the package-level version vars, which parallel tests would race on.
+
+	prevV, prevC := version.Version, version.Commit
+	version.Version = "v9.9.9-test"
+	version.Commit = "abc1234"
+	t.Cleanup(func() { // restore the real build constants for whatever runs next
+		version.Version = prevV
+		version.Commit = prevC
+	})
+
+	_, url, _ := newTestServerWithHub(t)
+
+	res, err := stdhttp.Get(url + "/api/version")
+	if err != nil {
+		t.Fatalf("get: %v", err)
+	}
+	defer res.Body.Close()
+	if res.StatusCode != stdhttp.StatusOK {
+		t.Fatalf("status: got %d want 200", res.StatusCode)
+	}
+	var body map[string]string
+	if err := json.NewDecoder(res.Body).Decode(&body); err != nil {
+		t.Fatalf("decode: %v", err)
+	}
+	if body["version"] != "v9.9.9-test" {
+		t.Fatalf("version: got %q", body["version"])
+	}
+	if body["commit"] != "abc1234" {
+		t.Fatalf("commit: got %q", body["commit"])
+	}
+}
diff --git a/internal/server/ui/ui.go b/internal/server/ui/ui.go
index 3b3c446..45e5af7 100644
--- a/internal/server/ui/ui.go
+++ b/internal/server/ui/ui.go
@@ -108,6 +108,8 @@ func New() (*Renderer, error) {
 		"templates/partials/tree_node.html",
 		"templates/partials/alert_row.html",
 		"templates/partials/crit_banner.html",
+		"templates/partials/fleet_update_inner.html",
+		"templates/partials/host_update_chip.html",
 	}
 
 	pageEntries, err := fs.Glob(web.FS, "templates/pages/*.html")
diff --git a/internal/server/ws/handler.go b/internal/server/ws/handler.go
index 
df74332..312f568 100644 --- a/internal/server/ws/handler.go +++ b/internal/server/ws/handler.go @@ -16,6 +16,7 @@ import ( "gitea.dcglab.co.uk/steve/restic-manager/internal/api" "gitea.dcglab.co.uk/steve/restic-manager/internal/auth" "gitea.dcglab.co.uk/steve/restic-manager/internal/store" + "gitea.dcglab.co.uk/steve/restic-manager/internal/version" ) // HandlerDeps is the set of collaborators the agent WS handler needs. @@ -26,6 +27,9 @@ type HandlerDeps struct { // AlertEngine receives job-finished and host-online events so the // alert engine can evaluate its rules. Optional; nil = no-op. AlertEngine *alert.Engine + // UpdateWatcher reconciles in-flight agent-update dispatches against + // hello envelopes. Optional; nil = no-op. + UpdateWatcher *UpdateWatcher // OnHello is called once per successful hello, after the host row // has been touched and the conn registered. Used by the HTTP // layer to push host_credentials down as a config.update before @@ -147,6 +151,9 @@ func runAgentLoop(ctx context.Context, c *Conn, hostID string, deps HandlerDeps) if deps.AlertEngine != nil { deps.AlertEngine.NotifyHostOnline(hostID) } + if deps.UpdateWatcher != nil { + deps.UpdateWatcher.OnHello(ctx, hostID, helloPayload.AgentVersion, version.Version) + } deps.Hub.Register(hostID, c) defer deps.Hub.Unregister(hostID, c) diff --git a/internal/server/ws/update_watch.go b/internal/server/ws/update_watch.go new file mode 100644 index 0000000..be2fef8 --- /dev/null +++ b/internal/server/ws/update_watch.go @@ -0,0 +1,151 @@ +package ws + +import ( + "context" + "fmt" + "log/slog" + "sync" + "time" + + "gitea.dcglab.co.uk/steve/restic-manager/internal/store" +) + +// updateTimeout bounds how long the watcher waits for an agent to come +// back with its new version after a command.update dispatch. var (not +// const) so tests can shrink it. +var updateTimeout = 90 * time.Second + +// AlertRaiser is the slim subset of *alert.Engine the update watcher +// touches. 
Defined here (not in the alert package) so the dependency +// arrow points the right way. +type AlertRaiser interface { + RaiseUpdateFailed(ctx context.Context, hostID, jobID, reason string, when time.Time) + ResolveUpdateFailed(ctx context.Context, hostID string, when time.Time) +} + +// UpdateWatcher tracks in-flight agent-update dispatches and reconciles +// them against incoming hello envelopes. Entries land on Track and +// resolve via OnHello (success path) or the periodic sweep (timeout). +type UpdateWatcher struct { + store *store.Store + alerts AlertRaiser + + mu sync.Mutex + entries map[string]*updateEntry // hostID → entry + + tickPeriod time.Duration +} + +type updateEntry struct { + jobID string + startedAt time.Time + // terminated is set once the entry has reached a terminal state so + // late OnHellos don't resurrect it. + terminated bool +} + +// NewUpdateWatcher builds an unstarted watcher. Call Run in a goroutine +// to start the periodic sweep. +func NewUpdateWatcher(st *store.Store, alerts AlertRaiser) *UpdateWatcher { + return &UpdateWatcher{ + store: st, + alerts: alerts, + entries: make(map[string]*updateEntry), + tickPeriod: 5 * time.Second, + } +} + +// Track registers a freshly-dispatched update job. A subsequent Track +// for the same host replaces the prior entry (last-write-wins). +func (w *UpdateWatcher) Track(jobID, hostID string) { + if w == nil { + return + } + w.mu.Lock() + w.entries[hostID] = &updateEntry{jobID: jobID, startedAt: time.Now()} + w.mu.Unlock() +} + +// OnHello is called by the WS handler after a successful hello has been +// persisted. If a tracked update for the host matches the targetVersion, +// the job is marked succeeded and any open update_failed alert is +// auto-resolved. A non-matching version is a no-op (the watcher keeps +// waiting until the timeout). 
+func (w *UpdateWatcher) OnHello(ctx context.Context, hostID, agentVersion, targetVersion string) {
+	if w == nil { // a nil watcher is a no-op
+		return
+	}
+	w.mu.Lock()
+	e, ok := w.entries[hostID]
+	if !ok || e.terminated { // nothing tracked for this host, or already resolved
+		w.mu.Unlock()
+		return
+	}
+	if agentVersion != targetVersion {
+		// Not the version we asked for — keep waiting.
+		w.mu.Unlock()
+		return
+	}
+	e.terminated = true
+	jobID := e.jobID // copy out before unlocking; e is not touched after this
+	delete(w.entries, hostID)
+	w.mu.Unlock() // released before the store/alert calls below
+
+	now := time.Now().UTC()
+	if err := w.store.MarkJobFinished(ctx, jobID, "succeeded", 0, nil, "", now); err != nil {
+		slog.Warn("ws update watcher: mark succeeded", "job_id", jobID, "host_id", hostID, "err", err) // logged only; the alert is still resolved
+	}
+	if w.alerts != nil {
+		w.alerts.ResolveUpdateFailed(ctx, hostID, now)
+	}
+}
+
+// Run drives the periodic sweep. Returns when ctx is done.
+func (w *UpdateWatcher) Run(ctx context.Context) {
+	if w == nil {
+		return
+	}
+	t := time.NewTicker(w.tickPeriod)
+	defer t.Stop()
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case now := <-t.C: // the tick's own timestamp is what sweep measures age against
+			w.sweep(ctx, now)
+		}
+	}
+}
+
+func (w *UpdateWatcher) sweep(ctx context.Context, now time.Time) { // fails every tracked entry at least updateTimeout old
+	type expired struct {
+		hostID string
+		jobID  string
+		age    time.Duration // NOTE(review): populated below but never read in this function
+	}
+	var toFail []expired
+	w.mu.Lock()
+	for hostID, e := range w.entries {
+		if e.terminated {
+			continue
+		}
+		if now.Sub(e.startedAt) >= updateTimeout {
+			toFail = append(toFail, expired{hostID: hostID, jobID: e.jobID, age: now.Sub(e.startedAt)})
+			e.terminated = true
+			delete(w.entries, hostID) // removing the current key while ranging over a map is allowed in Go
+		}
+	}
+	w.mu.Unlock() // store and alert calls below run without the lock held
+
+	for _, x := range toFail {
+		reason := fmt.Sprintf("timeout: agent did not reconnect within %s", updateTimeout)
+		stamp := now.UTC()
+		errMsg := reason
+		if err := w.store.MarkJobFinished(ctx, x.jobID, "failed", -1, nil, errMsg, stamp); err != nil {
+			slog.Warn("ws update watcher: mark failed", "job_id", x.jobID, "host_id", x.hostID, "err", err) // logged only; the alert below is still raised
+		}
+		if w.alerts != nil {
+			w.alerts.RaiseUpdateFailed(ctx, x.hostID, x.jobID, reason, stamp)
+		}
+	}
+}
diff --git 
a/internal/server/ws/update_watch_test.go b/internal/server/ws/update_watch_test.go new file mode 100644 index 0000000..4081501 --- /dev/null +++ b/internal/server/ws/update_watch_test.go @@ -0,0 +1,161 @@ +package ws + +import ( + "context" + "sync" + "testing" + "time" + + "github.com/oklog/ulid/v2" + + "gitea.dcglab.co.uk/steve/restic-manager/internal/store" +) + +type fakeAlerts struct { + mu sync.Mutex + raised []string // hostIDs + resolved []string + reasons []string +} + +func (f *fakeAlerts) RaiseUpdateFailed(_ context.Context, hostID, _ /*jobID*/, reason string, _ time.Time) { + f.mu.Lock() + defer f.mu.Unlock() + f.raised = append(f.raised, hostID) + f.reasons = append(f.reasons, reason) +} + +func (f *fakeAlerts) ResolveUpdateFailed(_ context.Context, hostID string, _ time.Time) { + f.mu.Lock() + defer f.mu.Unlock() + f.resolved = append(f.resolved, hostID) +} + +func seedJob(t *testing.T, st *store.Store, hostID string) string { + t.Helper() + jobID := ulid.Make().String() + if err := st.CreateJob(context.Background(), store.Job{ + ID: jobID, HostID: hostID, Kind: "update", + ActorKind: "user", CreatedAt: time.Now().UTC(), + }); err != nil { + t.Fatalf("create job: %v", err) + } + return jobID +} + +func TestUpdateWatcherOnHelloSuccess(t *testing.T) { + st := openWSTestStore(t) + hostID := ulid.Make().String() + seedHostWS(t, st, hostID) + jobID := seedJob(t, st, hostID) + + a := &fakeAlerts{} + w := NewUpdateWatcher(st, a) + w.Track(jobID, hostID) + + w.OnHello(context.Background(), hostID, "v2", "v2") + + job, err := st.GetJob(context.Background(), jobID) + if err != nil { + t.Fatalf("get job: %v", err) + } + if job.Status != "succeeded" { + t.Fatalf("status: got %q want succeeded", job.Status) + } + a.mu.Lock() + defer a.mu.Unlock() + if len(a.resolved) != 1 || a.resolved[0] != hostID { + t.Fatalf("resolve calls: %v", a.resolved) + } + if len(a.raised) != 0 { + t.Fatalf("unexpected raises: %v", a.raised) + } +} + +func TestUpdateWatcherTimeout(t 
*testing.T) { + prev := updateTimeout + updateTimeout = 50 * time.Millisecond + t.Cleanup(func() { updateTimeout = prev }) + + st := openWSTestStore(t) + hostID := ulid.Make().String() + seedHostWS(t, st, hostID) + jobID := seedJob(t, st, hostID) + + a := &fakeAlerts{} + w := NewUpdateWatcher(st, a) + w.Track(jobID, hostID) + + time.Sleep(80 * time.Millisecond) + w.sweep(context.Background(), time.Now()) + + job, err := st.GetJob(context.Background(), jobID) + if err != nil { + t.Fatalf("get job: %v", err) + } + if job.Status != "failed" { + t.Fatalf("status: got %q want failed", job.Status) + } + a.mu.Lock() + defer a.mu.Unlock() + if len(a.raised) != 1 || a.raised[0] != hostID { + t.Fatalf("raise calls: %v", a.raised) + } + if len(a.reasons) == 0 || a.reasons[0] == "" { + t.Fatalf("missing reason") + } +} + +func TestUpdateWatcherMismatchedVersionNoOp(t *testing.T) { + st := openWSTestStore(t) + hostID := ulid.Make().String() + seedHostWS(t, st, hostID) + jobID := seedJob(t, st, hostID) + + a := &fakeAlerts{} + w := NewUpdateWatcher(st, a) + w.Track(jobID, hostID) + + w.OnHello(context.Background(), hostID, "v1", "v2") + + job, _ := st.GetJob(context.Background(), jobID) + if job.Status == "succeeded" || job.Status == "failed" { + t.Fatalf("status flipped on mismatched hello: %q", job.Status) + } + a.mu.Lock() + defer a.mu.Unlock() + if len(a.raised) != 0 || len(a.resolved) != 0 { + t.Fatalf("unexpected alert calls raised=%v resolved=%v", a.raised, a.resolved) + } +} + +func TestUpdateWatcherHelloAfterTimeoutIsNoOp(t *testing.T) { + prev := updateTimeout + updateTimeout = 50 * time.Millisecond + t.Cleanup(func() { updateTimeout = prev }) + + st := openWSTestStore(t) + hostID := ulid.Make().String() + seedHostWS(t, st, hostID) + jobID := seedJob(t, st, hostID) + + a := &fakeAlerts{} + w := NewUpdateWatcher(st, a) + w.Track(jobID, hostID) + + time.Sleep(80 * time.Millisecond) + w.sweep(context.Background(), time.Now()) + + // Hello arrives after sweep — entry 
already gone, must be no-op. + w.OnHello(context.Background(), hostID, "v2", "v2") + + job, _ := st.GetJob(context.Background(), jobID) + if job.Status != "failed" { + t.Fatalf("status flipped from failed → %q", job.Status) + } + a.mu.Lock() + defer a.mu.Unlock() + if len(a.resolved) != 0 { + t.Fatalf("late hello triggered ResolveUpdateFailed: %v", a.resolved) + } +} diff --git a/internal/store/alerts.go b/internal/store/alerts.go index b12d6fa..f16b9bc 100644 --- a/internal/store/alerts.go +++ b/internal/store/alerts.go @@ -77,6 +77,56 @@ func (s *Store) RaiseOrTouch(ctx context.Context, hostID, kind, dedupKey, severi return id, true, nil } +// RaiseOrTouchSystem is the host-less variant of RaiseOrTouch — the +// alert row's host_id is stored as NULL, so the FK to hosts is bypassed. +// Used by fleet-wide alerts (e.g. fleet_update_halted) where the +// failure surface isn't pinned to a single host. +func (s *Store) RaiseOrTouchSystem(ctx context.Context, kind, dedupKey, severity, message string, when time.Time) (id string, didRaise bool, err error) { + tx, err := s.db.BeginTx(ctx, nil) + if err != nil { + return "", false, fmt.Errorf("store: begin: %w", err) + } + defer func() { _ = tx.Rollback() }() + + row := tx.QueryRowContext(ctx, + `SELECT id FROM alerts + WHERE host_id IS NULL AND kind = ? AND dedup_key = ? AND resolved_at IS NULL + LIMIT 1`, + kind, dedupKey) + var existing string + switch err := row.Scan(&existing); { + case err == nil: + _, uerr := tx.ExecContext(ctx, + `UPDATE alerts SET last_seen_at = ?, message = ? 
WHERE id = ?`, + when.UTC().Format(time.RFC3339Nano), message, existing) + if uerr != nil { + return "", false, fmt.Errorf("store: touch alert: %w", uerr) + } + if err := tx.Commit(); err != nil { + return "", false, err + } + return existing, false, nil + case errors.Is(err, sql.ErrNoRows): + // fall through to insert + default: + return "", false, fmt.Errorf("store: lookup alert: %w", err) + } + + id = ulid.Make().String() + whenStr := when.UTC().Format(time.RFC3339Nano) + _, err = tx.ExecContext(ctx, + `INSERT INTO alerts (id, host_id, kind, dedup_key, severity, message, created_at, last_seen_at) + VALUES (?, NULL, ?, ?, ?, ?, ?, ?)`, + id, kind, dedupKey, severity, message, whenStr, whenStr) + if err != nil { + return "", false, fmt.Errorf("store: insert alert: %w", err) + } + if err := tx.Commit(); err != nil { + return "", false, err + } + return id, true, nil +} + // refreshHostOpenAlertCount recomputes hosts.open_alert_count from the // alerts table for one host. Self-healing: idempotent and survives // out-of-order edits. Best-effort — errors are returned but callers diff --git a/internal/store/fleet_updates.go b/internal/store/fleet_updates.go new file mode 100644 index 0000000..ae9fec2 --- /dev/null +++ b/internal/store/fleet_updates.go @@ -0,0 +1,258 @@ +package store + +import ( + "context" + "database/sql" + "errors" + "fmt" + "time" +) + +// ErrFleetUpdateRunning is returned by CreateFleetUpdate if another +// fleet update is already in 'running' state. The HTTP layer surfaces +// this as a 409 with a structured error code. +var ErrFleetUpdateRunning = errors.New("store: fleet update already running") + +// CreateFleetUpdate inserts the parent row and one pending child per +// hostID, in the order given (position = index). Returns +// ErrFleetUpdateRunning if a fleet update is already in flight. 
+func (st *Store) CreateFleetUpdate(ctx context.Context, fu FleetUpdate, hostIDs []string) error {
+	if fu.ID == "" || fu.StartedByUserID == "" || fu.TargetVersion == "" {
+		return errors.New("store: fleet update id, user_id, target_version required")
+	}
+	if fu.Status == "" {
+		fu.Status = "running" // default; fu is a by-value copy, so the caller's struct is not modified
+	}
+	if fu.StartedAt.IsZero() {
+		fu.StartedAt = time.Now().UTC()
+	}
+	tx, err := st.db.BeginTx(ctx, nil)
+	if err != nil {
+		return fmt.Errorf("store: begin: %w", err)
+	}
+	defer func() { _ = tx.Rollback() }() // undoes partial inserts on any early return; its error is discarded
+
+	var existing string
+	if err := tx.QueryRowContext(ctx,
+		`SELECT id FROM fleet_updates WHERE status = 'running' LIMIT 1`).
+		Scan(&existing); err == nil { // a row came back, so another run is still 'running'
+		return fmt.Errorf("%w: %s", ErrFleetUpdateRunning, existing)
+	} else if !errors.Is(err, sql.ErrNoRows) { // ErrNoRows is the expected "nothing running" case
+		return fmt.Errorf("store: check active fleet update: %w", err)
+	}
+
+	if _, err := tx.ExecContext(ctx,
+		`INSERT INTO fleet_updates (id, started_at, started_by_user_id, target_version, status)
+		 VALUES (?, ?, ?, ?, ?)`,
+		fu.ID, fu.StartedAt.UTC().Format(time.RFC3339Nano), fu.StartedByUserID, fu.TargetVersion, fu.Status,
+	); err != nil {
+		return fmt.Errorf("store: insert fleet_updates: %w", err)
+	}
+	for i, hid := range hostIDs {
+		if _, err := tx.ExecContext(ctx,
+			`INSERT INTO fleet_update_hosts (fleet_update_id, host_id, position, status)
+			 VALUES (?, ?, ?, 'pending')`,
+			fu.ID, hid, i, // position is the host's index in hostIDs
+		); err != nil {
+			return fmt.Errorf("store: insert fleet_update_hosts: %w", err)
+		}
+	}
+	return tx.Commit()
+}
+
+// ActiveFleetUpdate returns the currently-running fleet update or nil.
+func (st *Store) ActiveFleetUpdate(ctx context.Context) (*FleetUpdate, error) {
+	var fu FleetUpdate
+	var startedAt string
+	var current sql.NullString
+	var halted sql.NullString
+	var completedAt sql.NullString
+	err := st.db.QueryRowContext(ctx,
+		`SELECT id, started_at, started_by_user_id, target_version, status,
+		        current_host_id, halted_reason, completed_at
+		   FROM fleet_updates WHERE status = 'running' LIMIT 1`).
+		Scan(&fu.ID, &startedAt, &fu.StartedByUserID, &fu.TargetVersion, &fu.Status,
+			&current, &halted, &completedAt)
+	if errors.Is(err, sql.ErrNoRows) {
+		return nil, nil // no run in flight is not an error
+	}
+	if err != nil {
+		return nil, fmt.Errorf("store: active fleet update: %w", err)
+	}
+	fu.StartedAt, _ = time.Parse(time.RFC3339Nano, startedAt) // parse error discarded; StartedAt stays zero on failure
+	fu.CurrentHostID = current.String
+	fu.HaltedReason = halted.String
+	if completedAt.Valid {
+		t, _ := time.Parse(time.RFC3339Nano, completedAt.String)
+		fu.CompletedAt = &t
+	}
+	return &fu, nil
+}
+
+// GetFleetUpdate hydrates parent + ordered child rows. Returns
+// ErrNotFound on missing id.
+func (st *Store) GetFleetUpdate(ctx context.Context, id string) (*FleetUpdate, []FleetUpdateHost, error) {
+	var fu FleetUpdate
+	var startedAt string
+	var current sql.NullString
+	var halted sql.NullString
+	var completedAt sql.NullString
+	err := st.db.QueryRowContext(ctx,
+		`SELECT id, started_at, started_by_user_id, target_version, status,
+		        current_host_id, halted_reason, completed_at
+		   FROM fleet_updates WHERE id = ?`, id).
+		Scan(&fu.ID, &startedAt, &fu.StartedByUserID, &fu.TargetVersion, &fu.Status,
+			&current, &halted, &completedAt)
+	if errors.Is(err, sql.ErrNoRows) {
+		return nil, nil, ErrNotFound
+	}
+	if err != nil {
+		return nil, nil, fmt.Errorf("store: get fleet update: %w", err)
+	}
+	fu.StartedAt, _ = time.Parse(time.RFC3339Nano, startedAt) // parse error discarded; StartedAt stays zero on failure
+	fu.CurrentHostID = current.String
+	fu.HaltedReason = halted.String
+	if completedAt.Valid {
+		t, _ := time.Parse(time.RFC3339Nano, completedAt.String)
+		fu.CompletedAt = &t
+	}
+
+	rows, err := st.db.QueryContext(ctx,
+		`SELECT host_id, position, status, COALESCE(job_id, ''), COALESCE(failed_reason, '')
+		   FROM fleet_update_hosts
+		  WHERE fleet_update_id = ?
+		  ORDER BY position`, id)
+	if err != nil {
+		return nil, nil, fmt.Errorf("store: list fleet hosts: %w", err)
+	}
+	defer func() { _ = rows.Close() }()
+	out := []FleetUpdateHost{} // non-nil even when there are no child rows
+	for rows.Next() {
+		fh := FleetUpdateHost{FleetUpdateID: id}
+		if err := rows.Scan(&fh.HostID, &fh.Position, &fh.Status, &fh.JobID, &fh.FailedReason); err != nil {
+			return nil, nil, fmt.Errorf("store: scan fleet host: %w", err)
+		}
+		out = append(out, fh)
+	}
+	return &fu, out, rows.Err()
+}
+
+// ListPendingFleetUpdateHosts returns rows with status='pending' for
+// this fleet update, in position order. The worker calls this to
+// pick the next host to dispatch.
+func (st *Store) ListPendingFleetUpdateHosts(ctx context.Context, fuID string) ([]FleetUpdateHost, error) {
+	rows, err := st.db.QueryContext(ctx,
+		`SELECT host_id, position, status, COALESCE(job_id, ''), COALESCE(failed_reason, '')
+		   FROM fleet_update_hosts
+		  WHERE fleet_update_id = ? 
AND status = 'pending' + ORDER BY position`, fuID) + if err != nil { + return nil, fmt.Errorf("store: list pending fleet hosts: %w", err) + } + defer func() { _ = rows.Close() }() + out := []FleetUpdateHost{} + for rows.Next() { + fh := FleetUpdateHost{FleetUpdateID: fuID} + if err := rows.Scan(&fh.HostID, &fh.Position, &fh.Status, &fh.JobID, &fh.FailedReason); err != nil { + return nil, err + } + out = append(out, fh) + } + return out, rows.Err() +} + +// SetFleetUpdateHostStatus moves one row through pending → running → +// {succeeded, failed, skipped}. failedReason and jobID may be empty +// (e.g. on succeeded). Empty values are stored as NULL so subsequent +// reads round-trip cleanly via COALESCE. +func (st *Store) SetFleetUpdateHostStatus(ctx context.Context, fuID, hostID, status, failedReason, jobID string) error { + _, err := st.db.ExecContext(ctx, + `UPDATE fleet_update_hosts + SET status = ?, failed_reason = ?, job_id = COALESCE(?, job_id) + WHERE fleet_update_id = ? AND host_id = ?`, + status, nullableString(failedReason), nullableString(jobID), + fuID, hostID, + ) + if err != nil { + return fmt.Errorf("store: set fleet host status: %w", err) + } + return nil +} + +// SetFleetUpdateCurrentHost stamps which host the worker is actively +// waiting on. Pass empty string to clear. +func (st *Store) SetFleetUpdateCurrentHost(ctx context.Context, fuID, hostID string) error { + _, err := st.db.ExecContext(ctx, + `UPDATE fleet_updates SET current_host_id = ? WHERE id = ?`, + nullableString(hostID), fuID, + ) + if err != nil { + return fmt.Errorf("store: set fleet current host: %w", err) + } + return nil +} + +// HaltFleetUpdate flips status to 'halted', stamps the reason, and +// clears current_host_id. +func (st *Store) HaltFleetUpdate(ctx context.Context, fuID, reason string, when time.Time) error { + _, err := st.db.ExecContext(ctx, + `UPDATE fleet_updates + SET status = 'halted', halted_reason = ?, current_host_id = NULL, + completed_at = ? + WHERE id = ? 
AND status = 'running'`, + reason, when.UTC().Format(time.RFC3339Nano), fuID, + ) + if err != nil { + return fmt.Errorf("store: halt fleet update: %w", err) + } + return nil +} + +// CancelFleetUpdate flips status to 'cancelled'. Caller checks that +// the row is still 'running' before calling. +func (st *Store) CancelFleetUpdate(ctx context.Context, fuID string, when time.Time) error { + _, err := st.db.ExecContext(ctx, + `UPDATE fleet_updates + SET status = 'cancelled', current_host_id = NULL, completed_at = ? + WHERE id = ? AND status = 'running'`, + when.UTC().Format(time.RFC3339Nano), fuID, + ) + if err != nil { + return fmt.Errorf("store: cancel fleet update: %w", err) + } + return nil +} + +// CompleteFleetUpdate flips status to 'completed' once every host has +// reached a terminal state. +func (st *Store) CompleteFleetUpdate(ctx context.Context, fuID string, when time.Time) error { + _, err := st.db.ExecContext(ctx, + `UPDATE fleet_updates + SET status = 'completed', current_host_id = NULL, completed_at = ? + WHERE id = ? AND status = 'running'`, + when.UTC().Format(time.RFC3339Nano), fuID, + ) + if err != nil { + return fmt.Errorf("store: complete fleet update: %w", err) + } + return nil +} + +// RunningUpdateJobForHost returns the id of any in-flight (queued or +// running) `update` job for hostID, or "" + nil if none. Used by the +// host-update HTTP handler to refuse double-dispatch and by the +// fleet worker to dedupe on retry. +func (st *Store) RunningUpdateJobForHost(ctx context.Context, hostID string) (string, error) { + var id string + err := st.db.QueryRowContext(ctx, + `SELECT id FROM jobs + WHERE host_id = ? 
AND kind = 'update' AND status IN ('queued','running') + ORDER BY created_at DESC LIMIT 1`, hostID).Scan(&id) + if errors.Is(err, sql.ErrNoRows) { + return "", nil + } + if err != nil { + return "", fmt.Errorf("store: running update job: %w", err) + } + return id, nil +} diff --git a/internal/store/fleet_updates_test.go b/internal/store/fleet_updates_test.go new file mode 100644 index 0000000..9942411 --- /dev/null +++ b/internal/store/fleet_updates_test.go @@ -0,0 +1,180 @@ +package store + +import ( + "context" + "errors" + "testing" + "time" + + "github.com/oklog/ulid/v2" +) + +func ptrStr(s string) *string { return &s } + +func seedFleetUser(t *testing.T, s *Store) string { + t.Helper() + id := ulid.Make().String() + if err := s.CreateUser(context.Background(), User{ + ID: id, Username: "u-" + id[:6], PasswordHash: "x", Role: RoleAdmin, + }); err != nil { + t.Fatalf("create user: %v", err) + } + return id +} + +func seedFleetHost(t *testing.T, s *Store, name string) string { + t.Helper() + id := ulid.Make().String() + if err := s.CreateHost(context.Background(), Host{ + ID: id, Name: name, OS: "linux", Arch: "amd64", + EnrolledAt: time.Now().UTC(), + }, "tokenhash-"+id[:6], ""); err != nil { + t.Fatalf("create host: %v", err) + } + return id +} + +func TestCreateFleetUpdate_RefusesIfRunning(t *testing.T) { + t.Parallel() + s := openTestStore(t) + uid := seedFleetUser(t, s) + h1 := seedFleetHost(t, s, "h1") + + fu1 := FleetUpdate{ID: ulid.Make().String(), StartedByUserID: uid, TargetVersion: "v1"} + if err := s.CreateFleetUpdate(context.Background(), fu1, []string{h1}); err != nil { + t.Fatalf("create #1: %v", err) + } + fu2 := FleetUpdate{ID: ulid.Make().String(), StartedByUserID: uid, TargetVersion: "v2"} + err := s.CreateFleetUpdate(context.Background(), fu2, []string{h1}) + if !errors.Is(err, ErrFleetUpdateRunning) { + t.Fatalf("want ErrFleetUpdateRunning, got %v", err) + } +} + +func TestCreateFleetUpdate_HydrateRoundTrip(t *testing.T) { + t.Parallel() + s 
:= openTestStore(t) + uid := seedFleetUser(t, s) + h1 := seedFleetHost(t, s, "h1") + h2 := seedFleetHost(t, s, "h2") + + fu := FleetUpdate{ID: ulid.Make().String(), StartedByUserID: uid, TargetVersion: "v1.2.3"} + if err := s.CreateFleetUpdate(context.Background(), fu, []string{h1, h2}); err != nil { + t.Fatal(err) + } + + got, hosts, err := s.GetFleetUpdate(context.Background(), fu.ID) + if err != nil { + t.Fatal(err) + } + if got.Status != "running" || got.TargetVersion != "v1.2.3" { + t.Fatalf("parent: %+v", got) + } + if len(hosts) != 2 || hosts[0].Position != 0 || hosts[1].Position != 1 { + t.Fatalf("hosts: %+v", hosts) + } + if hosts[0].Status != "pending" || hosts[1].Status != "pending" { + t.Fatalf("hosts status: %+v", hosts) + } +} + +func TestSetFleetUpdateHostStatus_ProgressesAndStoresJobID(t *testing.T) { + t.Parallel() + s := openTestStore(t) + uid := seedFleetUser(t, s) + h := seedFleetHost(t, s, "h1") + fu := FleetUpdate{ID: ulid.Make().String(), StartedByUserID: uid, TargetVersion: "v1"} + _ = s.CreateFleetUpdate(context.Background(), fu, []string{h}) + + jobID := ulid.Make().String() + if err := s.CreateJob(context.Background(), Job{ + ID: jobID, HostID: h, Kind: "update", + ActorKind: "user", ActorID: ptrStr(uid), CreatedAt: time.Now().UTC(), + }); err != nil { + t.Fatal(err) + } + + if err := s.SetFleetUpdateHostStatus(context.Background(), fu.ID, h, "running", "", ""); err != nil { + t.Fatal(err) + } + if err := s.SetFleetUpdateHostStatus(context.Background(), fu.ID, h, "succeeded", "", jobID); err != nil { + t.Fatal(err) + } + _, hs, _ := s.GetFleetUpdate(context.Background(), fu.ID) + if hs[0].Status != "succeeded" || hs[0].JobID != jobID { + t.Fatalf("after succeed: %+v", hs[0]) + } + + pending, _ := s.ListPendingFleetUpdateHosts(context.Background(), fu.ID) + if len(pending) != 0 { + t.Fatalf("pending should be empty: %+v", pending) + } +} + +func TestHaltAndCompleteFleetUpdate(t *testing.T) { + t.Parallel() + s := openTestStore(t) + uid := 
seedFleetUser(t, s) + h := seedFleetHost(t, s, "h1") + + fu1 := FleetUpdate{ID: ulid.Make().String(), StartedByUserID: uid, TargetVersion: "v1"} + _ = s.CreateFleetUpdate(context.Background(), fu1, []string{h}) + if err := s.HaltFleetUpdate(context.Background(), fu1.ID, "boom", time.Now().UTC()); err != nil { + t.Fatal(err) + } + got, _, _ := s.GetFleetUpdate(context.Background(), fu1.ID) + if got.Status != "halted" || got.HaltedReason != "boom" { + t.Fatalf("after halt: %+v", got) + } + if got.CompletedAt == nil { + t.Fatal("halted must stamp completed_at") + } + if active, _ := s.ActiveFleetUpdate(context.Background()); active != nil { + t.Fatalf("halted should clear active: %+v", active) + } + + // Now a fresh run can start. + fu2 := FleetUpdate{ID: ulid.Make().String(), StartedByUserID: uid, TargetVersion: "v2"} + if err := s.CreateFleetUpdate(context.Background(), fu2, []string{h}); err != nil { + t.Fatalf("create after halt: %v", err) + } + if err := s.CompleteFleetUpdate(context.Background(), fu2.ID, time.Now().UTC()); err != nil { + t.Fatal(err) + } + got, _, _ = s.GetFleetUpdate(context.Background(), fu2.ID) + if got.Status != "completed" { + t.Fatalf("after complete: %+v", got) + } +} + +func TestRunningUpdateJobForHost(t *testing.T) { + t.Parallel() + s := openTestStore(t) + h := seedFleetHost(t, s, "h1") + + got, err := s.RunningUpdateJobForHost(context.Background(), h) + if err != nil || got != "" { + t.Fatalf("empty case: got=%q err=%v", got, err) + } + + jobID := ulid.Make().String() + if err := s.CreateJob(context.Background(), Job{ + ID: jobID, HostID: h, Kind: "update", + ActorKind: "user", ActorID: ptrStr("u-1"), CreatedAt: time.Now().UTC(), + }); err != nil { + t.Fatal(err) + } + got, err = s.RunningUpdateJobForHost(context.Background(), h) + if err != nil || got != jobID { + t.Fatalf("queued case: got=%q err=%v", got, err) + } + + // Mark succeeded → no longer "in flight". 
+ if err := s.MarkJobFinished(context.Background(), jobID, "succeeded", 0, nil, "", time.Now().UTC()); err != nil { + t.Fatal(err) + } + got, err = s.RunningUpdateJobForHost(context.Background(), h) + if err != nil || got != "" { + t.Fatalf("after succeed: got=%q err=%v", got, err) + } +} diff --git a/internal/store/migrations/0021_jobs_update_kind.sql b/internal/store/migrations/0021_jobs_update_kind.sql new file mode 100644 index 0000000..241bdc0 --- /dev/null +++ b/internal/store/migrations/0021_jobs_update_kind.sql @@ -0,0 +1,57 @@ +-- 0021_jobs_update_kind.sql +-- +-- Add 'update' to the jobs.kind CHECK constraint so the agent +-- self-update flow (P6-01) can persist its job rows. SQLite can't +-- ALTER a CHECK in place, so we rebuild the table. +-- +-- Same safe rebuild pattern as 0012: +-- 1. Stash job_logs into a temp table BEFORE rebuilding jobs. +-- 2. Create jobs_new with the wider CHECK; copy data; DROP jobs; +-- RENAME jobs_new TO jobs. +-- 3. Restore job_logs (cascade-trap defence — see CLAUDE.md). +-- +-- jobs_new mirrors the live schema *including* post-0012 column +-- additions (0015 added source_group_id). When adding a new +-- migration that touches this table, mirror the latest column set. 
+ +CREATE TEMPORARY TABLE _job_logs_backup AS + SELECT job_id, seq, ts, stream, payload FROM job_logs; + +CREATE TABLE jobs_new ( + id TEXT PRIMARY KEY, + host_id TEXT NOT NULL REFERENCES hosts(id) ON DELETE CASCADE, + kind TEXT NOT NULL CHECK (kind IN + ('backup','init','forget','prune','check','unlock','restore','diff','update')), + status TEXT NOT NULL CHECK (status IN ('queued','running','succeeded','failed','cancelled')), + scheduled_id TEXT REFERENCES schedules(id) ON DELETE SET NULL, + actor_kind TEXT NOT NULL CHECK (actor_kind IN ('user','schedule','system')), + actor_id TEXT, + started_at TEXT, + finished_at TEXT, + exit_code INTEGER, + stats TEXT, + error TEXT, + created_at TEXT NOT NULL, + source_group_id TEXT REFERENCES source_groups(id) ON DELETE SET NULL +); + +INSERT INTO jobs_new + SELECT id, host_id, kind, status, scheduled_id, actor_kind, actor_id, + started_at, finished_at, exit_code, stats, error, created_at, + source_group_id + FROM jobs; + +DROP TABLE jobs; +ALTER TABLE jobs_new RENAME TO jobs; + +CREATE INDEX jobs_host_id ON jobs(host_id); +CREATE INDEX jobs_status ON jobs(status); +CREATE INDEX jobs_created_at ON jobs(created_at); +CREATE INDEX jobs_source_group_id ON jobs(source_group_id); + +-- Defensive: restore job_logs from the temp backup. INSERT OR IGNORE +-- so a re-run is harmless. Same shape as 0012's safety net. +INSERT OR IGNORE INTO job_logs (job_id, seq, ts, stream, payload) + SELECT job_id, seq, ts, stream, payload FROM _job_logs_backup; + +DROP TABLE _job_logs_backup; diff --git a/internal/store/migrations/0022_fleet_updates.sql b/internal/store/migrations/0022_fleet_updates.sql new file mode 100644 index 0000000..a57242f --- /dev/null +++ b/internal/store/migrations/0022_fleet_updates.sql @@ -0,0 +1,35 @@ +-- 0022_fleet_updates.sql +-- +-- Tables backing the rolling fleet-update worker (P6-02). 
One row in +-- fleet_updates per "update all" invocation, a child row per host so +-- the worker can iterate in position order, report progress, and +-- record per-host outcome. Halt-on-fail semantics live in the worker +-- (internal/server/fleetupdate); this schema just captures state. + +CREATE TABLE fleet_updates ( + id TEXT PRIMARY KEY, + started_at TEXT NOT NULL, + started_by_user_id TEXT NOT NULL REFERENCES users(id), + target_version TEXT NOT NULL, + status TEXT NOT NULL CHECK (status IN + ('running','completed','halted','cancelled')), + current_host_id TEXT REFERENCES hosts(id), + halted_reason TEXT, + completed_at TEXT +); + +CREATE INDEX fleet_updates_status ON fleet_updates(status); + +CREATE TABLE fleet_update_hosts ( + fleet_update_id TEXT NOT NULL REFERENCES fleet_updates(id) ON DELETE CASCADE, + host_id TEXT NOT NULL REFERENCES hosts(id) ON DELETE CASCADE, + position INTEGER NOT NULL, + status TEXT NOT NULL CHECK (status IN + ('pending','running','succeeded','failed','skipped')), + job_id TEXT REFERENCES jobs(id) ON DELETE SET NULL, + failed_reason TEXT, + PRIMARY KEY (fleet_update_id, host_id) +); + +CREATE INDEX fleet_update_hosts_position + ON fleet_update_hosts(fleet_update_id, position); diff --git a/internal/store/types.go b/internal/store/types.go index 5b16294..cc60e48 100644 --- a/internal/store/types.go +++ b/internal/store/types.go @@ -211,6 +211,33 @@ type PendingRun struct { LastError string } +// FleetUpdate captures one "update all" invocation. Status moves +// running → one of {completed, halted, cancelled}. CurrentHostID +// tracks the host the worker is actively waiting on; cleared (empty) +// outside an active dispatch. +type FleetUpdate struct { + ID string + StartedAt time.Time + StartedByUserID string + TargetVersion string + Status string + CurrentHostID string + HaltedReason string + CompletedAt *time.Time +} + +// FleetUpdateHost is one host's slot in a fleet update. Position is +// the iteration order. 
JobID is set once the worker has dispatched +// command.update for this host; FailedReason on a failed/halted row. +type FleetUpdateHost struct { + FleetUpdateID string + HostID string + Position int + Status string + JobID string + FailedReason string +} + // EnrollmentToken is the issuer's view of a one-time token. type EnrollmentToken struct { Raw string diff --git a/internal/version/version.go b/internal/version/version.go new file mode 100644 index 0000000..3a5fb70 --- /dev/null +++ b/internal/version/version.go @@ -0,0 +1,16 @@ +// Package version exposes build-time identifying values. Both the +// server and agent link this package; their values are set via +// -ldflags during the build. An unset Version falls back to "dev" +// so source builds without ldflags still run. +package version + +var ( + // Version is the human-facing release string, e.g. "v1.2.3" or + // "v1.2.3-dirty". Compared byte-for-byte between agent and + // server to drive the "out of date" signal. + Version = "dev" + + // Commit is the full git SHA (`git rev-parse HEAD`). Informational + // only; surfaced via /api/version but not used for any comparison. + Commit = "" +) diff --git a/tasks.md b/tasks.md index b67c288..088179f 100644 --- a/tasks.md +++ b/tasks.md @@ -344,8 +344,33 @@ Sizes: **S** = under a day, **M** = 1–3 days, **L** = 3–7 days. > Deferred from Phase 4 on 2026-05-05 — operator-experience polish that doesn't gate a working v1. -- [ ] **P6-01** (S) Agent self-update from the server's bundled binaries. P5-03 already bakes matching `agent-{linux-amd64,linux-arm64,windows-amd64}` into the server image under `/opt/restic-manager/dist/`, served by `/agent/binary`. 
Add a `restic-manager-agent update` subcommand (and a server-dispatched `command.update` WS envelope) that fetches `$RM_SERVER/agent/binary?os=…&arch=…`, verifies sha256 against a digest the server advertises alongside the binary, atomic-renames over the running binary (`tmp+fsync+rename`), and asks the service manager to restart (`systemctl restart` on Linux, SCM restart on Windows). Version pinning is automatic — the server only ever serves the agent that matches its own release. No apt repo, no Chocolatey, no third-party signing infra. _(Was P4-01; original apt/choco plan dropped after the P5-03 Docker pivot made the server the natural distribution point.)_ -- [ ] **P6-02** (M) Agent version reporting + fleet update on dashboard. Server already knows its own build version and each agent's `agent_version` from the WS hello. Surface "N hosts behind" on the dashboard, a per-host "out of date" chip, and an admin-only **Update all** action that fans out `command.update` to every online host (offline hosts queue via `pending_runs`-style retry on reconnect). Per-host **Update** button on host detail for one-shot upgrades. Audit-logged. _(Was P4-02.)_ +- [x] **P6-01** (S) Agent self-update from the server's bundled binaries. Server-dispatched `command.update` WS envelope; agent fetches `$RM_SERVER/agent/binary?os=…&arch=…` to `.new`, copies running binary to `.old` (M1 — keep one revision back), atomic-rename, exit cleanly. Linux relies on systemd `Restart=always`; Windows writes a detached `update.cmd` helper that waits 3s, `sc stop`s, renames, `sc start`s. No sha256 digest verification — TLS already covers corruption-in-transit (decision deferred per spec §4). _(Was P4-01.)_ +- [x] **P6-02** (M) Agent version reporting + fleet update on dashboard. `internal/version` package + Makefile ldflags injection so server and agent are comparable byte-for-byte. Out-of-date chip on host rows + detail header (amber, format `out of date · A → B`). 
Hero tile "N hosts behind" with `?updates=behind` filter. Per-host **Update agent** button on host detail. Admin `/settings/fleet-update` page drives a rolling worker (`internal/server/fleetupdate`) that updates one host at a time, polls for hello-with-target-version up to 95s, halts on first failure with `fleet_update_halted` alert. Per-host `update_failed` alerts auto-resolve when the agent reconnects at the right version. `host.update_dispatched/_succeeded/_failed` and `fleet.update_started/_completed/_halted/_cancelled` audit actions. _(Was P4-02.)_ + +> **As shipped (2026-05-06, branch `p6-agent-self-update`):** +> Spec `docs/superpowers/specs/2026-05-06-p6-01-02-agent-self-update-design.md`, +> plan `docs/superpowers/plans/2026-05-06-p6-01-02-agent-self-update.md`. +> Schema: migration 0021 widens `jobs.kind` CHECK to include `update`; +> 0022 creates `fleet_updates` + `fleet_update_hosts`. Agent: new +> `internal/agent/updater` package (build-tag split unix/windows); +> dispatcher case `MsgCommandUpdate` in `cmd/agent/update_dispatch.go` +> emits `job.started` + `log.stream` updates before exit. Server: WS +> update-watcher (`internal/server/ws/update_watch.go`) tracks in-flight +> dispatches, marks succeeded on hello-with-matching-version, fails after +> 90s timeout (covers both no-show and rollback cases per spec §3.2). +> Endpoint `POST /api/hosts/{id}/update` (admin, JSON) + `POST /hosts/{id}/update` +> (HTMX, `HX-Redirect: /jobs/{id}`); pre-checks for offline / already +> up-to-date / update_in_progress. Fleet worker exposes `Start` / +> `Cancel` and runs at most one rolling sequence at a time. Alert kinds +> `update_failed` and `fleet_update_halted` plug into the P3-05 engine. +> +> **Smoke caught + fixed mid-sweep:** the systemd unit's +> `ProtectSystem=full` made `/usr/local/bin` read-only, blocking the +> .new staging file. Added `/usr/local/bin` to `ReadWritePaths`. 
With +> the fix in place: end-to-end Update agent took the host from +> `v0.9.0-11-gccaccd8-dirty` → `v9.9.9-smoke` in <5s; `.old` preserved +> on disk; chip and hero tile cleared on reconnect; audit row landed. +> Screenshots in `_diag/p6-update-sweep/`. - [ ] **P6-03** (M) Repo size trend graphs (sparkline on host card, full chart on repo page). _(Was P4-06.)_ - [ ] **P6-04** (M) Prometheus `/metrics` endpoint: per-host gauges (last backup timestamp, last backup status, repo size, snapshot count, agent online), server gauges (active alerts, build info), job duration histograms; protected by bearer token or IP allow-list. _(Was P4-08.)_ - [ ] **P6-05** (S) Document Prometheus integration + sample Grafana dashboard JSON. _(Was P4-09.)_ diff --git a/web/static/css/styles.css b/web/static/css/styles.css index d72d394..37ca83f 100644 --- a/web/static/css/styles.css +++ b/web/static/css/styles.css @@ -1,3 +1,3 @@ *,:after,:before{--tw-border-spacing-x:0;--tw-border-spacing-y:0;--tw-translate-x:0;--tw-translate-y:0;--tw-rotate:0;--tw-skew-x:0;--tw-skew-y:0;--tw-scale-x:1;--tw-scale-y:1;--tw-pan-x: ;--tw-pan-y: ;--tw-pinch-zoom: ;--tw-scroll-snap-strictness:proximity;--tw-gradient-from-position: ;--tw-gradient-via-position: ;--tw-gradient-to-position: ;--tw-ordinal: ;--tw-slashed-zero: ;--tw-numeric-figure: ;--tw-numeric-spacing: ;--tw-numeric-fraction: ;--tw-ring-inset: ;--tw-ring-offset-width:0px;--tw-ring-offset-color:#fff;--tw-ring-color:rgba(59,130,246,.5);--tw-ring-offset-shadow:0 0 #0000;--tw-ring-shadow:0 0 #0000;--tw-shadow:0 0 #0000;--tw-shadow-colored:0 0 #0000;--tw-blur: ;--tw-brightness: ;--tw-contrast: ;--tw-grayscale: ;--tw-hue-rotate: ;--tw-invert: ;--tw-saturate: ;--tw-sepia: ;--tw-drop-shadow: ;--tw-backdrop-blur: ;--tw-backdrop-brightness: ;--tw-backdrop-contrast: ;--tw-backdrop-grayscale: ;--tw-backdrop-hue-rotate: ;--tw-backdrop-invert: ;--tw-backdrop-opacity: ;--tw-backdrop-saturate: ;--tw-backdrop-sepia: ;--tw-contain-size: ;--tw-contain-layout: 
;--tw-contain-paint: ;--tw-contain-style: }::backdrop{--tw-border-spacing-x:0;--tw-border-spacing-y:0;--tw-translate-x:0;--tw-translate-y:0;--tw-rotate:0;--tw-skew-x:0;--tw-skew-y:0;--tw-scale-x:1;--tw-scale-y:1;--tw-pan-x: ;--tw-pan-y: ;--tw-pinch-zoom: ;--tw-scroll-snap-strictness:proximity;--tw-gradient-from-position: ;--tw-gradient-via-position: ;--tw-gradient-to-position: ;--tw-ordinal: ;--tw-slashed-zero: ;--tw-numeric-figure: ;--tw-numeric-spacing: ;--tw-numeric-fraction: ;--tw-ring-inset: ;--tw-ring-offset-width:0px;--tw-ring-offset-color:#fff;--tw-ring-color:rgba(59,130,246,.5);--tw-ring-offset-shadow:0 0 #0000;--tw-ring-shadow:0 0 #0000;--tw-shadow:0 0 #0000;--tw-shadow-colored:0 0 #0000;--tw-blur: ;--tw-brightness: ;--tw-contrast: ;--tw-grayscale: ;--tw-hue-rotate: ;--tw-invert: ;--tw-saturate: ;--tw-sepia: ;--tw-drop-shadow: ;--tw-backdrop-blur: ;--tw-backdrop-brightness: ;--tw-backdrop-contrast: ;--tw-backdrop-grayscale: ;--tw-backdrop-hue-rotate: ;--tw-backdrop-invert: ;--tw-backdrop-opacity: ;--tw-backdrop-saturate: ;--tw-backdrop-sepia: ;--tw-contain-size: ;--tw-contain-layout: ;--tw-contain-paint: ;--tw-contain-style: } -/*! 
tailwindcss v3.4.17 | MIT License | https://tailwindcss.com*/*,:after,:before{border:0 solid #e5e7eb;box-sizing:border-box}:after,:before{--tw-content:""}:host,html{line-height:1.5;-webkit-text-size-adjust:100%;font-family:Inter,system-ui,-apple-system,sans-serif;font-feature-settings:normal;font-variation-settings:normal;-moz-tab-size:4;-o-tab-size:4;tab-size:4;-webkit-tap-highlight-color:transparent}body{line-height:inherit;margin:0}hr{border-top-width:1px;color:inherit;height:0}abbr:where([title]){-webkit-text-decoration:underline dotted;text-decoration:underline dotted}h1,h2,h3,h4,h5,h6{font-size:inherit;font-weight:inherit}a{color:inherit;text-decoration:inherit}b,strong{font-weight:bolder}code,kbd,pre,samp{font-family:JetBrains Mono,ui-monospace,monospace;font-feature-settings:normal;font-size:1em;font-variation-settings:normal}small{font-size:80%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}table{border-collapse:collapse;border-color:inherit;text-indent:0}button,input,optgroup,select,textarea{color:inherit;font-family:inherit;font-feature-settings:inherit;font-size:100%;font-variation-settings:inherit;font-weight:inherit;letter-spacing:inherit;line-height:inherit;margin:0;padding:0}button,select{text-transform:none}button,input:where([type=button]),input:where([type=reset]),input:where([type=submit]){-webkit-appearance:button;background-color:transparent;background-image:none}:-moz-focusring{outline:auto}:-moz-ui-invalid{box-shadow:none}progress{vertical-align:baseline}::-webkit-inner-spin-button,::-webkit-outer-spin-button{height:auto}[type=search]{-webkit-appearance:textfield;outline-offset:-2px}::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{-webkit-appearance:button;font:inherit}summary{display:list-item}blockquote,dd,dl,figure,h1,h2,h3,h4,h5,h6,hr,p,pre{margin:0}fieldset{margin:0}fieldset,legend{padding:0}menu,ol,ul{list-style:none;margin:0;padding:0}dialog{
padding:0}textarea{resize:vertical}input::-moz-placeholder,textarea::-moz-placeholder{color:#9ca3af;opacity:1}input::placeholder,textarea::placeholder{color:#9ca3af;opacity:1}[role=button],button{cursor:pointer}:disabled{cursor:default}audio,canvas,embed,iframe,img,object,svg,video{display:block;vertical-align:middle}img,video{height:auto;max-width:100%}[hidden]:where(:not([hidden=until-found])){display:none}:root{--bg:oklch(0.17 0.006 250);--panel:oklch(0.20 0.007 250);--panel-hi:oklch(0.23 0.008 250);--line:oklch(0.27 0.010 250);--line-soft:oklch(0.23 0.008 250);--ink:oklch(0.96 0.005 250);--ink-mid:oklch(0.78 0.005 250);--ink-mute:oklch(0.58 0.006 250);--ink-fade:oklch(0.42 0.006 250);--ok:oklch(0.78 0.14 155);--warn:oklch(0.82 0.13 80);--bad:oklch(0.70 0.20 25);--off:oklch(0.50 0.005 250);--accent:oklch(0.82 0.12 195)}body,html{background:var(--bg);color:var(--ink);font-family:Inter,system-ui,-apple-system,sans-serif;-webkit-font-smoothing:antialiased}body{font-feature-settings:"cv11","ss01","ss03"}::-moz-selection{background:color-mix(in oklch,var(--accent),transparent 70%)}::selection{background:color-mix(in oklch,var(--accent),transparent 70%)}.\!container{width:100%!important}.container{width:100%}@media (min-width:640px){.\!container{max-width:640px!important}.container{max-width:640px}}@media (min-width:768px){.\!container{max-width:768px!important}.container{max-width:768px}}@media (min-width:1024px){.\!container{max-width:1024px!important}.container{max-width:1024px}}@media (min-width:1280px){.\!container{max-width:1280px!important}.container{max-width:1280px}}@media (min-width:1536px){.\!container{max-width:1536px!important}.container{max-width:1536px}}.mono{font-family:JetBrains Mono,ui-monospace,monospace;font-variant-numeric:tabular-nums}.panel{background:var(--panel);border:1px solid var(--line-soft)}.hairline{box-shadow:inset 0 -1px 0 
var(--line-soft)}.dot{border-radius:9999px;display:inline-block;height:7px;width:7px}.dot-online{background:var(--ok);box-shadow:0 0 0 3px color-mix(in oklch,var(--ok),transparent 80%)}.dot-degraded{background:var(--warn);box-shadow:0 0 0 3px color-mix(in oklch,var(--warn),transparent 80%)}.dot-offline{background:var(--off)}.dot-failed{background:var(--bad);box-shadow:0 0 0 3px color-mix(in oklch,var(--bad),transparent 80%)}.pulse{animation:rm-pulse 2.4s ease-in-out infinite}@keyframes rm-pulse{0%,to{box-shadow:0 0 0 3px color-mix(in oklch,var(--accent),transparent 80%)}50%{box-shadow:0 0 0 6px color-mix(in oklch,var(--accent),transparent 92%)}}.btn{align-items:center;background:transparent;border:1px solid var(--line);border-radius:5px;color:var(--ink-mid);cursor:pointer;display:inline-flex;font-size:12px;font-weight:500;gap:6px;padding:6px 11px;text-decoration:none;transition:all .12s ease}.btn:hover{background:var(--panel-hi);color:var(--ink)}.btn:disabled,.btn[disabled]{cursor:not-allowed;opacity:.4;pointer-events:none}.btn-primary{background:var(--accent);border-color:var(--accent);color:oklch(.18 .01 195)}.btn-primary:hover{filter:brightness(1.08)}.btn-ghost,.btn-ghost:hover{border-color:transparent}.btn-ghost:hover{background:var(--panel-hi)}.btn-danger{border-color:color-mix(in oklch,var(--bad),transparent 70%);color:var(--bad)}.btn-danger:hover{background:color-mix(in oklch,var(--bad),transparent 88%);border-color:color-mix(in oklch,var(--bad),transparent 50%);color:oklch(.85 .1 25)}.btn-lg{font-size:13px;padding:9px 14px}.btn-block{justify-content:center;width:100%}.nav-tab{border-bottom:2px solid transparent;color:var(--ink-mute);cursor:pointer;font-size:13px;margin-right:28px;padding:18px 0;text-decoration:none}.nav-tab.active{border-color:var(--accent)}.nav-tab.active,.nav-tab:hover{color:var(--ink)}.sub-tab{border-bottom:1.5px solid transparent;color:var(--ink-mute);cursor:pointer;font-size:13px;margin-right:24px;padding:12px 
0;text-decoration:none}.sub-tab.active{border-color:var(--ink);color:var(--ink)}.tag{align-items:center;border:1px solid var(--line);border-radius:3px;display:inline-flex;font-size:11px;gap:5px;letter-spacing:.01em;line-height:1;padding:4px 7px}.field-label,.tag{color:var(--ink-mid)}.field-label{display:block;font-size:12px;margin-bottom:6px}.field-help{color:var(--ink-mute);font-size:12px;line-height:1.55;margin-top:6px}.field{background:var(--bg);border:1px solid var(--line-soft);border-radius:5px;color:var(--ink);font-family:inherit;font-size:13px;outline:none;padding:9px 12px;transition:border-color .12s ease;width:100%}.field:focus{border-color:var(--accent)}.field.invalid{border-color:color-mix(in oklch,var(--bad),transparent 50%)}.field.mono{font-family:JetBrains Mono,monospace;font-size:12px}.field.with-prefix{padding-left:64px}.host-row{align-items:center;border-left:3px solid transparent;-moz-column-gap:18px;column-gap:18px;display:grid;font-size:13px;grid-template-columns:24px 1.4fr .95fr 1.5fr .75fr .7fr .7fr 1.1fr 92px;padding:11px 16px}.host-row.head{color:var(--ink-fade);font-size:11px;letter-spacing:.08em;padding-bottom:10px;padding-top:10px;text-transform:uppercase}.host-row.degraded{border-left-color:color-mix(in oklch,var(--warn),transparent 50%)}.host-row.failed{border-left-color:color-mix(in oklch,var(--bad),transparent 50%)}.host-row.offline{border-left-color:color-mix(in oklch,var(--off),transparent 70%)}.host-row:hover{background:var(--panel-hi)}.host-row.clickable{position:relative}.host-row.clickable .row-link{inset:0;overflow:hidden;position:absolute;text-indent:-9999px;z-index:0}.host-row.clickable:hover{cursor:pointer}.host-row.clickable>*{pointer-events:none;position:relative;z-index:1}.host-row.clickable>.row-action,.host-row.clickable>.row-link{pointer-events:auto}.src-row{align-items:center;-moz-column-gap:18px;column-gap:18px;display:grid;grid-template-columns:1fr auto;padding:14px 
18px}.src-row.clickable{position:relative}.src-row.clickable .row-link{inset:0;overflow:hidden;position:absolute;text-indent:-9999px;z-index:0}.src-row.clickable:hover{background:var(--panel-hi);cursor:pointer}.src-row.clickable>*{pointer-events:none;position:relative;z-index:1}.src-row.clickable>.row-action,.src-row.clickable>.row-link{pointer-events:auto}.dropdown{display:inline-block;position:relative}.dropdown summary{align-items:center;background:transparent;border:1px solid var(--line);border-radius:5px;color:var(--ink-mid);cursor:pointer;display:inline-flex;font-size:12px;font-weight:500;gap:6px;list-style:none;padding:6px 11px;transition:all .12s ease;-webkit-user-select:none;-moz-user-select:none;user-select:none}.dropdown summary::-webkit-details-marker{display:none}.dropdown summary::marker{content:""}.dropdown summary:hover{background:var(--panel-hi);color:var(--ink)}.dropdown summary .chev{color:var(--ink-fade);font-size:9px;transition:transform .12s ease}.dropdown[open] summary .chev{transform:rotate(180deg)}.dropdown[open] summary{background:var(--panel-hi);color:var(--ink)}.dropdown-menu{background:var(--panel);border:1px solid var(--line);border-radius:6px;box-shadow:0 6px 24px -8px rgba(0,0,0,.55);min-width:220px;padding:4px;position:absolute;right:0;top:calc(100% + 4px);z-index:30}.dropdown-item{border-radius:4px;color:var(--ink-mid);display:block;font-size:12.5px;line-height:1.35;padding:8px 11px;text-decoration:none}.dropdown-item:hover{background:var(--panel-hi);color:var(--ink)}.dropdown-item .label{color:var(--ink);display:block;font-weight:500}.dropdown-item .hint{color:var(--ink-mute);display:block;font-family:JetBrains Mono,ui-monospace,monospace;font-size:11px;margin-top:2px}.snap-row{align-items:center;border-bottom:1px solid var(--line-soft);-moz-column-gap:16px;column-gap:16px;cursor:pointer;display:grid;font-size:13px;grid-template-columns:150px 130px 1fr 90px 130px 80px;padding:11px 14px;transition:background .1s 
ease}.snap-row:last-child{border-bottom:0}.snap-row:hover{background:var(--panel-hi)}.snap-row.head{color:var(--ink-fade);cursor:default;font-size:11px;letter-spacing:.08em;padding-bottom:9px;padding-top:9px;text-transform:uppercase}.snap-row.head:hover{background:transparent}.alert-row{align-items:center;border-bottom:1px solid var(--line-soft);border-left:3px solid transparent;-moz-column-gap:16px;column-gap:16px;display:grid;font-size:13px;grid-template-columns:18px 110px 130px 1fr 130px 110px 180px;padding:12px 16px;transition:background .1s ease}.alert-row:hover{background:var(--panel-hi)}.alert-row:last-child{border-bottom:0}.alert-row.head{border-left-color:transparent;color:var(--ink-fade);cursor:default;font-size:11px;letter-spacing:.08em;padding-bottom:9px;padding-top:9px;text-transform:uppercase}.alert-row.head:hover{background:transparent}.alert-row.severity-warn{border-left-color:color-mix(in oklch,var(--warn),transparent 50%)}.alert-row.severity-critical{border-left-color:color-mix(in oklch,var(--bad),transparent 30%)}.alert-row.resolved{opacity:.55}.dot-critical{background:var(--bad);box-shadow:0 0 0 3px color-mix(in oklch,var(--bad),transparent 80%)}.tag.tag-active{background:color-mix(in oklch,var(--accent),transparent 92%);border-color:color-mix(in oklch,var(--accent),transparent 50%);color:var(--accent)}.tag-warn{background:color-mix(in oklch,var(--warn),transparent 92%);border-color:color-mix(in oklch,var(--warn),transparent 60%);color:var(--warn)}.tag-critical{background:color-mix(in oklch,var(--bad),transparent 92%);border-color:color-mix(in oklch,var(--bad),transparent 60%);color:var(--bad)}.tag-info{color:var(--ink-mid)}.audit-row{align-items:center;border-bottom:1px solid var(--line-soft);-moz-column-gap:16px;column-gap:16px;display:grid;font-size:13px;grid-template-columns:160px 80px 110px 1.4fr 1.5fr 90px;padding:11px 16px;transition:background .1s 
ease}.audit-row:hover{background:var(--panel-hi)}.audit-row:last-child{border-bottom:0}.audit-row.head{color:var(--ink-fade);cursor:default;font-size:11px;letter-spacing:.08em;padding-bottom:9px;padding-top:9px;text-transform:uppercase}.audit-row.head:hover{background:transparent}.audit-row.head .sort-header,.user-row.head .sort-header{align-items:baseline;color:inherit;cursor:pointer;display:inline-flex;gap:4px;text-decoration:none}.audit-row.head .sort-header:hover,.user-row.head .sort-header:hover{color:var(--ink)}.audit-row.head .sort-glyph,.user-row.head .sort-glyph{color:var(--accent);display:inline-block;font-size:9px;min-width:8px}.schd-row{align-items:center;-moz-column-gap:14px;column-gap:14px;display:grid;font-size:13px;grid-template-columns:78px 1fr 1.6fr 100px 110px auto;padding:12px 18px}.schd-row.head{color:var(--ink-fade);font-size:11px;letter-spacing:.08em;padding-bottom:10px;padding-top:10px;text-transform:uppercase}.schd-row.clickable{position:relative}.schd-row.clickable .row-link{inset:0;overflow:hidden;position:absolute;text-indent:-9999px;z-index:0}.schd-row.clickable:hover{background:var(--panel-hi);cursor:pointer}.schd-row.clickable>*{pointer-events:none;position:relative;z-index:1}.schd-row.clickable>.row-action,.schd-row.clickable>.row-link{pointer-events:auto}.preset-chip{background:var(--bg);border:1px solid var(--line-soft);border-radius:4px;color:var(--ink-mid);cursor:pointer;font-family:JetBrains Mono,monospace;font-size:11.5px;padding:4px 9px;transition:border-color .1s ease,color .1s ease;-webkit-user-select:none;-moz-user-select:none;user-select:none}.preset-chip:hover{border-color:var(--accent);color:var(--ink)}.picker{align-items:center;background:var(--bg);border:1px solid var(--line-soft);border-radius:5px;cursor:pointer;display:flex;font-size:13px;gap:12px;padding:10px 12px;transition:border-color .1s ease,background .1s ease}.picker:hover{border-color:var(--ink-mute)}.picker .check{border:1px solid 
var(--line);border-radius:3px;display:inline-block;flex-shrink:0;height:14px;position:relative;width:14px}.picker.checked{background:color-mix(in oklch,var(--accent),transparent 92%);border-color:color-mix(in oklch,var(--accent),transparent 50%)}.picker.checked .check{background:var(--accent);border-color:var(--accent)}.picker.checked .check:after{border:solid oklch(.18 .01 195);border-width:0 1.5px 1.5px 0;content:"";height:8px;left:4px;position:absolute;top:1px;transform:rotate(45deg);width:4px}.picker input[type=checkbox]{opacity:0;pointer-events:none;position:absolute}.keep-cell{background:var(--bg);border:1px solid var(--line-soft);border-radius:5px;display:flex;flex-direction:column;gap:4px;padding:9px 11px}.keep-cell label{color:var(--ink-fade);font-size:10.5px;letter-spacing:.08em;text-transform:uppercase}.keep-cell input{background:transparent;border:none;color:var(--ink);font-size:14px;outline:none;padding:0;width:100%}.keep-cell input,.log{font-family:JetBrains Mono,monospace}.log{background:var(--bg);border:1px solid var(--line-soft);border-radius:7px;font-size:12px;line-height:1.7;overflow:hidden}.log-line{align-items:baseline;-moz-column-gap:14px;column-gap:14px;display:grid;grid-template-columns:14ch 8ch 1fr;padding:1px 16px}.log-line:first-child{padding-top:12px}.log-line:last-child{padding-bottom:12px}.log-tag,.log-ts{color:var(--ink-fade)}.log-tag{font-size:10px;letter-spacing:.08em;text-transform:uppercase}.progress-track{background:var(--bg);border:1px solid var(--line-soft);border-radius:9999px;height:6px;overflow:hidden}.progress-fill{background:var(--accent);border-radius:9999px;height:100%;transition:width .25s ease}.progress-fill.ok{background:var(--ok)}.progress-fill.bad{background:var(--bad)}.crumbs{font-size:12px}.crumbs,.crumbs a{color:var(--ink-mute)}.crumbs a{text-decoration:underline;text-decoration-color:var(--line);text-underline-offset:3px}.crumbs .sep{color:var(--ink-fade);margin:0 8px}.snippet{border:1px solid 
var(--line-soft);border-radius:6px;overflow:hidden}.snippet-head{align-items:center;border-bottom:1px solid var(--line-soft);color:var(--ink-fade);display:flex;font-size:11px;justify-content:space-between;letter-spacing:.1em;padding:10px 14px;text-transform:uppercase}.snippet pre{color:var(--ink-mid);font-family:JetBrains Mono,monospace;font-size:12px;line-height:1.7;margin:0;padding:14px;white-space:pre-wrap;word-break:break-all}.snippet pre .var{color:var(--accent)}.empty-state{background:radial-gradient(ellipse at top,color-mix(in oklch,var(--accent),transparent 95%),transparent 60%),var(--panel);border:1px dashed var(--line);border-radius:8px;padding:60px 40px;text-align:center}.ch-row{align-items:center;border-bottom:1px solid var(--line-soft);-moz-column-gap:16px;column-gap:16px;display:grid;font-size:13px;grid-template-columns:28px 200px 1fr 100px 130px 140px;padding:14px 18px;transition:background .1s ease}.ch-row:last-child{border-bottom:0}.ch-row.head{color:var(--ink-fade);cursor:default;font-size:11px;letter-spacing:.08em;padding-bottom:10px;padding-top:10px;text-transform:uppercase}.ch-row.head:hover{background:transparent}.ch-row.clickable{cursor:pointer;position:relative}.ch-row.clickable .row-link{inset:0;overflow:hidden;position:absolute;text-indent:-9999px;z-index:0}.ch-row.clickable:hover{background:var(--panel-hi)}.ch-row.clickable>*{pointer-events:none;position:relative;z-index:1}.ch-row.clickable>.row-action,.ch-row.clickable>.row-link{pointer-events:auto}.ch-icon{align-items:center;background:var(--panel-hi);border:1px solid var(--line);border-radius:5px;color:var(--ink-mute);display:inline-flex;font-family:JetBrains Mono,monospace;font-size:10px;font-weight:600;height:24px;justify-content:center;width:24px}.ch-icon.webhook{border-color:color-mix(in oklch,var(--accent),transparent 60%);color:var(--accent)}.ch-icon.ntfy{border-color:color-mix(in oklch,var(--warn),transparent 60%);color:var(--warn)}.ch-icon.smtp{border-color:color-mix(in 
oklch,var(--ok),transparent 60%);color:var(--ok)}.toggle{background:var(--line);border-radius:9999px;cursor:pointer;display:inline-block;flex-shrink:0;height:16px;position:relative;transition:background .12s ease;width:30px}.toggle:after{background:var(--ink-mid);border-radius:9999px;content:"";height:12px;left:2px;position:absolute;top:2px;transition:all .12s ease;width:12px}.toggle.on{background:color-mix(in oklch,var(--accent),transparent 50%)}.toggle.on:after{background:var(--accent);left:16px}.kind-grid{display:grid;gap:14px;grid-template-columns:1fr 1fr 1fr}.kind-card{background:var(--bg);border:1px solid var(--line-soft);border-radius:7px;cursor:pointer;padding:16px;transition:border-color .12s ease,background .12s ease}.kind-card:hover{border-color:var(--ink-mute)}.kind-card.selected{background:color-mix(in oklch,var(--accent),transparent 95%);border-color:color-mix(in oklch,var(--accent),transparent 50%)}.radio-pip{align-items:center;border:1px solid var(--line);border-radius:9999px;display:inline-flex;flex-shrink:0;height:14px;justify-content:center;width:14px}.radio-pip.on{border-color:var(--accent)}.radio-pip.on:after{background:var(--accent);border-radius:9999px;content:"";height:6px;width:6px}.user-row{align-items:center;border-bottom:1px solid var(--line-soft);-moz-column-gap:16px;column-gap:16px;display:grid;font-size:13px;grid-template-columns:180px 1fr 110px 160px 120px 90px;padding:11px 16px;transition:background .1s ease}.user-row:hover{background:var(--panel-hi)}.user-row:last-child{border-bottom:0}.user-row.head{color:var(--ink-fade);cursor:default;font-size:11px;letter-spacing:.08em;padding-bottom:9px;padding-top:9px;text-transform:uppercase}.user-row.head:hover{background:transparent}.user-row.disabled{opacity:.55}.test-pill{border-radius:5px;display:inline-block;font-size:12.5px;padding:5px 10px}.test-pill-ok{background:color-mix(in oklch,var(--ok),transparent 92%);border:1px solid color-mix(in oklch,var(--ok),transparent 
60%);color:var(--ok)}.test-pill-fail{background:color-mix(in oklch,var(--bad),transparent 92%);border:1px solid color-mix(in oklch,var(--bad),transparent 60%);color:var(--bad)}.pointer-events-none{pointer-events:none}.visible{visibility:visible}.invisible{visibility:hidden}.fixed{position:fixed}.absolute{position:absolute}.relative{position:relative}.inset-0{inset:0}.bottom-5{bottom:1.25rem}.left-0{left:0}.right-5{right:1.25rem}.top-0{top:0}.z-50{z-index:50}.col-span-2{grid-column:span 2/span 2}.col-span-3{grid-column:span 3/span 3}.col-span-4{grid-column:span 4/span 4}.col-span-5{grid-column:span 5/span 5}.col-span-7{grid-column:span 7/span 7}.col-span-8{grid-column:span 8/span 8}.col-span-9{grid-column:span 9/span 9}.m-0{margin:0}.mx-2{margin-left:.5rem;margin-right:.5rem}.mx-auto{margin-left:auto;margin-right:auto}.my-5{margin-bottom:1.25rem;margin-top:1.25rem}.mb-1\.5{margin-bottom:.375rem}.mb-10{margin-bottom:2.5rem}.mb-2{margin-bottom:.5rem}.mb-2\.5{margin-bottom:.625rem}.mb-3{margin-bottom:.75rem}.mb-3\.5{margin-bottom:.875rem}.mb-4{margin-bottom:1rem}.mb-5{margin-bottom:1.25rem}.mb-7{margin-bottom:1.75rem}.ml-1{margin-left:.25rem}.ml-2{margin-left:.5rem}.ml-2\.5{margin-left:.625rem}.ml-5{margin-left:1.25rem}.ml-auto{margin-left:auto}.mr-1{margin-right:.25rem}.mr-1\.5{margin-right:.375rem}.mt-0\.5{margin-top:.125rem}.mt-1{margin-top:.25rem}.mt-1\.5{margin-top:.375rem}.mt-2{margin-top:.5rem}.mt-2\.5{margin-top:.625rem}.mt-20{margin-top:5rem}.mt-3{margin-top:.75rem}.mt-3\.5{margin-top:.875rem}.mt-4{margin-top:1rem}.mt-5{margin-top:1.25rem}.mt-6{margin-top:1.5rem}.mt-7{margin-top:1.75rem}.mt-8{margin-top:2rem}.mt-9{margin-top:2.25rem}.block{display:block}.inline-block{display:inline-block}.inline{display:inline}.flex{display:flex}.inline-flex{display:inline-flex}.table{display:table}.grid{display:grid}.hidden{display:none}.h-3\.5{height:.875rem}.h-\[13px\]{height:13px}.h-\[22px\]{height:22px}.min-h-screen{min-height:100vh}.w-16{width:4rem}.w-3\.5{width:.875rem}.
w-\[13px\]{width:13px}.w-\[22px\]{width:22px}.w-\[360px\]{width:360px}.w-\[420px\]{width:420px}.w-full{width:100%}.min-w-0{min-width:0}.max-w-\[1280px\]{max-width:1280px}.max-w-\[440px\]{max-width:440px}.max-w-\[480px\]{max-width:480px}.max-w-\[520px\]{max-width:520px}.max-w-\[580px\]{max-width:580px}.max-w-\[640px\]{max-width:640px}.max-w-\[680px\]{max-width:680px}.max-w-\[720px\]{max-width:720px}.max-w-\[760px\]{max-width:760px}.flex-1{flex:1 1 0%}.flex-none{flex:none}.transform{transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.cursor-default{cursor:default}.cursor-help{cursor:help}.cursor-pointer{cursor:pointer}.select-none{-webkit-user-select:none;-moz-user-select:none;user-select:none}.select-all{-webkit-user-select:all;-moz-user-select:all;user-select:all}.resize{resize:both}.list-none{list-style-type:none}.grid-cols-1{grid-template-columns:repeat(1,minmax(0,1fr))}.grid-cols-12{grid-template-columns:repeat(12,minmax(0,1fr))}.grid-cols-2{grid-template-columns:repeat(2,minmax(0,1fr))}.grid-cols-3{grid-template-columns:repeat(3,minmax(0,1fr))}.flex-col{flex-direction:column}.flex-wrap{flex-wrap:wrap}.items-start{align-items:flex-start}.items-end{align-items:flex-end}.items-center{align-items:center}.items-baseline{align-items:baseline}.justify-end{justify-content:flex-end}.justify-center{justify-content:center}.justify-between{justify-content:space-between}.gap-1{gap:.25rem}.gap-1\.5{gap:.375rem}.gap-2{gap:.5rem}.gap-2\.5{gap:.625rem}.gap-3{gap:.75rem}.gap-3\.5{gap:.875rem}.gap-4{gap:1rem}.gap-5{gap:1.25rem}.gap-6{gap:1.5rem}.gap-8{gap:2rem}.gap-x-4{-moz-column-gap:1rem;column-gap:1rem}.gap-y-2{row-gap:.5rem}.gap-y-2\.5{row-gap:.625rem}.space-y-2>:not([hidden])~:not([hidden]){--tw-space-y-reverse:0;margin-bottom:calc(.5rem*var(--tw-space-y-reverse));margin-top:calc(.5rem*(1 - 
var(--tw-space-y-reverse)))}.space-y-4>:not([hidden])~:not([hidden]){--tw-space-y-reverse:0;margin-bottom:calc(1rem*var(--tw-space-y-reverse));margin-top:calc(1rem*(1 - var(--tw-space-y-reverse)))}.overflow-hidden,.truncate{overflow:hidden}.truncate{text-overflow:ellipsis}.truncate,.whitespace-nowrap{white-space:nowrap}.whitespace-pre-wrap{white-space:pre-wrap}.text-pretty{text-wrap:pretty}.break-all{word-break:break-all}.rounded{border-radius:.25rem}.rounded-\[3px\]{border-radius:3px}.rounded-\[5px\]{border-radius:5px}.rounded-\[6px\]{border-radius:6px}.rounded-\[7px\]{border-radius:7px}.rounded-\[8px\]{border-radius:8px}.rounded-full{border-radius:9999px}.rounded-md{border-radius:.375rem}.border{border-width:1px}.border-y{border-top-width:1px}.border-b,.border-y{border-bottom-width:1px}.border-l{border-left-width:1px}.border-t{border-top-width:1px}.border-line{border-color:oklch(.27 .01 250)}.border-line-soft{border-color:oklch(.23 .008 250)}.bg-bg{background-color:oklch(.17 .006 250)}.bg-panel{background-color:oklch(.2 .007 
250)}.p-0{padding:0}.p-2{padding:.5rem}.p-3{padding:.75rem}.p-3\.5{padding:.875rem}.p-4{padding:1rem}.p-5{padding:1.25rem}.p-6{padding:1.5rem}.p-7{padding:1.75rem}.p-8{padding:2rem}.p-\[18px\]{padding:18px}.p-\[3px\]{padding:3px}.px-1{padding-left:.25rem;padding-right:.25rem}.px-2{padding-left:.5rem;padding-right:.5rem}.px-2\.5{padding-left:.625rem;padding-right:.625rem}.px-3{padding-left:.75rem;padding-right:.75rem}.px-3\.5{padding-left:.875rem;padding-right:.875rem}.px-4{padding-left:1rem;padding-right:1rem}.px-5{padding-left:1.25rem;padding-right:1.25rem}.px-7{padding-left:1.75rem;padding-right:1.75rem}.px-8{padding-left:2rem;padding-right:2rem}.px-\[18px\]{padding-left:18px;padding-right:18px}.py-0\.5{padding-bottom:.125rem;padding-top:.125rem}.py-1{padding-bottom:.25rem;padding-top:.25rem}.py-1\.5{padding-bottom:.375rem;padding-top:.375rem}.py-12{padding-bottom:3rem;padding-top:3rem}.py-2{padding-bottom:.5rem;padding-top:.5rem}.py-2\.5{padding-bottom:.625rem;padding-top:.625rem}.py-3{padding-bottom:.75rem;padding-top:.75rem}.py-3\.5{padding-bottom:.875rem;padding-top:.875rem}.py-4{padding-bottom:1rem;padding-top:1rem}.py-5{padding-bottom:1.25rem;padding-top:1.25rem}.py-6{padding-bottom:1.5rem;padding-top:1.5rem}.py-7{padding-bottom:1.75rem;padding-top:1.75rem}.py-8{padding-bottom:2rem;padding-top:2rem}.py-\[14px\]{padding-bottom:14px;padding-top:14px}.py-\[5px\]{padding-bottom:5px;padding-top:5px}.pb-14{padding-bottom:3.5rem}.pb-2{padding-bottom:.5rem}.pb-24{padding-bottom:6rem}.pb-3{padding-bottom:.75rem}.pb-4{padding-bottom:1rem}.pb-\[18px\]{padding-bottom:18px}.pl-5{padding-left:1.25rem}.pl-6{padding-left:1.5rem}.pl-9{padding-left:2.25rem}.pr-4{padding-right:1rem}.pt-0\.5{padding-top:.125rem}.pt-1{padding-top:.25rem}.pt-14{padding-top:3.5rem}.pt-2{padding-top:.5rem}.pt-20{padding-top:5rem}.pt-4{padding-top:1rem}.pt-5{padding-top:1.25rem}.pt-6{padding-top:1.5rem}.pt-7{padding-top:1.75rem}.pt-9{padding-top:2.25rem}.pt-\[1px\]{padding-top:1px}.text-left{text-al
ign:left}.text-center{text-align:center}.text-right{text-align:right}.text-2xl{font-size:1.5rem;line-height:2rem}.text-\[10\.5px\]{font-size:10.5px}.text-\[10px\]{font-size:10px}.text-\[11\.5px\]{font-size:11.5px}.text-\[11px\]{font-size:11px}.text-\[12\.5px\]{font-size:12.5px}.text-\[12px\]{font-size:12px}.text-\[13px\]{font-size:13px}.text-\[14px\]{font-size:14px}.text-\[16px\]{font-size:16px}.text-\[18px\]{font-size:18px}.text-\[19px\]{font-size:19px}.text-\[20px\]{font-size:20px}.text-\[22px\]{font-size:22px}.text-\[26px\]{font-size:26px}.text-\[28px\]{font-size:28px}.text-base{font-size:1rem;line-height:1.5rem}.text-lg{font-size:1.125rem;line-height:1.75rem}.text-sm{font-size:.875rem;line-height:1.25rem}.text-xs{font-size:.75rem;line-height:1rem}.font-medium{font-weight:500}.font-normal{font-weight:400}.font-semibold{font-weight:600}.uppercase{text-transform:uppercase}.normal-case{text-transform:none}.italic{font-style:italic}.leading-\[1\.55\]{line-height:1.55}.leading-\[1\.5\]{line-height:1.5}.leading-\[1\.65\]{line-height:1.65}.leading-\[1\.6\]{line-height:1.6}.leading-\[1\.7\]{line-height:1.7}.leading-\[20px\]{line-height:20px}.leading-none{line-height:1}.tracking-\[-0\.005em\]{letter-spacing:-.005em}.tracking-\[-0\.012em\]{letter-spacing:-.012em}.tracking-\[-0\.01em\]{letter-spacing:-.01em}.tracking-\[-0\.02em\]{letter-spacing:-.02em}.tracking-\[0\.005em\]{letter-spacing:.005em}.tracking-\[0\.01em\]{letter-spacing:.01em}.tracking-\[0\.02em\]{letter-spacing:.02em}.tracking-\[0\.08em\]{letter-spacing:.08em}.tracking-\[0\.1em\]{letter-spacing:.1em}.text-accent{color:oklch(.82 .12 195)}.text-bad{color:oklch(.7 .2 25)}.text-ink{color:oklch(.96 .005 250)}.text-ink-fade{color:oklch(.42 .006 250)}.text-ink-mid{color:oklch(.78 .005 250)}.text-ink-mute{color:oklch(.58 .006 250)}.text-ok{color:oklch(.78 .14 155)}.text-warn{color:oklch(.82 .13 
80)}.underline{text-decoration-line:underline}.no-underline{text-decoration-line:none}.decoration-line{text-decoration-color:oklch(.27 .01 250)}.underline-offset-4{text-underline-offset:4px}.opacity-40{opacity:.4}.filter{filter:var(--tw-blur) var(--tw-brightness) var(--tw-contrast) var(--tw-grayscale) var(--tw-hue-rotate) var(--tw-invert) var(--tw-saturate) var(--tw-sepia) var(--tw-drop-shadow)}.transition{transition-duration:.15s;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,-webkit-backdrop-filter;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter,-webkit-backdrop-filter;transition-timing-function:cubic-bezier(.4,0,.2,1)}.hover\:text-ink:hover{color:oklch(.96 .005 250)}.hover\:text-ink-mid:hover{color:oklch(.78 .005 250)}.hover\:underline:hover{text-decoration-line:underline} +/*! 
tailwindcss v3.4.17 | MIT License | https://tailwindcss.com*/*,:after,:before{border:0 solid #e5e7eb;box-sizing:border-box}:after,:before{--tw-content:""}:host,html{line-height:1.5;-webkit-text-size-adjust:100%;font-family:Inter,system-ui,-apple-system,sans-serif;font-feature-settings:normal;font-variation-settings:normal;-moz-tab-size:4;-o-tab-size:4;tab-size:4;-webkit-tap-highlight-color:transparent}body{line-height:inherit;margin:0}hr{border-top-width:1px;color:inherit;height:0}abbr:where([title]){-webkit-text-decoration:underline dotted;text-decoration:underline dotted}h1,h2,h3,h4,h5,h6{font-size:inherit;font-weight:inherit}a{color:inherit;text-decoration:inherit}b,strong{font-weight:bolder}code,kbd,pre,samp{font-family:JetBrains Mono,ui-monospace,monospace;font-feature-settings:normal;font-size:1em;font-variation-settings:normal}small{font-size:80%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}table{border-collapse:collapse;border-color:inherit;text-indent:0}button,input,optgroup,select,textarea{color:inherit;font-family:inherit;font-feature-settings:inherit;font-size:100%;font-variation-settings:inherit;font-weight:inherit;letter-spacing:inherit;line-height:inherit;margin:0;padding:0}button,select{text-transform:none}button,input:where([type=button]),input:where([type=reset]),input:where([type=submit]){-webkit-appearance:button;background-color:transparent;background-image:none}:-moz-focusring{outline:auto}:-moz-ui-invalid{box-shadow:none}progress{vertical-align:baseline}::-webkit-inner-spin-button,::-webkit-outer-spin-button{height:auto}[type=search]{-webkit-appearance:textfield;outline-offset:-2px}::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{-webkit-appearance:button;font:inherit}summary{display:list-item}blockquote,dd,dl,figure,h1,h2,h3,h4,h5,h6,hr,p,pre{margin:0}fieldset{margin:0}fieldset,legend{padding:0}menu,ol,ul{list-style:none;margin:0;padding:0}dialog{
padding:0}textarea{resize:vertical}input::-moz-placeholder,textarea::-moz-placeholder{color:#9ca3af;opacity:1}input::placeholder,textarea::placeholder{color:#9ca3af;opacity:1}[role=button],button{cursor:pointer}:disabled{cursor:default}audio,canvas,embed,iframe,img,object,svg,video{display:block;vertical-align:middle}img,video{height:auto;max-width:100%}[hidden]:where(:not([hidden=until-found])){display:none}:root{--bg:oklch(0.17 0.006 250);--panel:oklch(0.20 0.007 250);--panel-hi:oklch(0.23 0.008 250);--line:oklch(0.27 0.010 250);--line-soft:oklch(0.23 0.008 250);--ink:oklch(0.96 0.005 250);--ink-mid:oklch(0.78 0.005 250);--ink-mute:oklch(0.58 0.006 250);--ink-fade:oklch(0.42 0.006 250);--ok:oklch(0.78 0.14 155);--warn:oklch(0.82 0.13 80);--bad:oklch(0.70 0.20 25);--off:oklch(0.50 0.005 250);--accent:oklch(0.82 0.12 195)}body,html{background:var(--bg);color:var(--ink);font-family:Inter,system-ui,-apple-system,sans-serif;-webkit-font-smoothing:antialiased}body{font-feature-settings:"cv11","ss01","ss03"}::-moz-selection{background:color-mix(in oklch,var(--accent),transparent 70%)}::selection{background:color-mix(in oklch,var(--accent),transparent 70%)}.\!container{width:100%!important}.container{width:100%}@media (min-width:640px){.\!container{max-width:640px!important}.container{max-width:640px}}@media (min-width:768px){.\!container{max-width:768px!important}.container{max-width:768px}}@media (min-width:1024px){.\!container{max-width:1024px!important}.container{max-width:1024px}}@media (min-width:1280px){.\!container{max-width:1280px!important}.container{max-width:1280px}}@media (min-width:1536px){.\!container{max-width:1536px!important}.container{max-width:1536px}}.mono{font-family:JetBrains Mono,ui-monospace,monospace;font-variant-numeric:tabular-nums}.panel{background:var(--panel);border:1px solid var(--line-soft)}.hairline{box-shadow:inset 0 -1px 0 
var(--line-soft)}.dot{border-radius:9999px;display:inline-block;height:7px;width:7px}.dot-online{background:var(--ok);box-shadow:0 0 0 3px color-mix(in oklch,var(--ok),transparent 80%)}.dot-degraded{background:var(--warn);box-shadow:0 0 0 3px color-mix(in oklch,var(--warn),transparent 80%)}.dot-offline{background:var(--off)}.dot-failed{background:var(--bad);box-shadow:0 0 0 3px color-mix(in oklch,var(--bad),transparent 80%)}.pulse{animation:rm-pulse 2.4s ease-in-out infinite}@keyframes rm-pulse{0%,to{box-shadow:0 0 0 3px color-mix(in oklch,var(--accent),transparent 80%)}50%{box-shadow:0 0 0 6px color-mix(in oklch,var(--accent),transparent 92%)}}.btn{align-items:center;background:transparent;border:1px solid var(--line);border-radius:5px;color:var(--ink-mid);cursor:pointer;display:inline-flex;font-size:12px;font-weight:500;gap:6px;padding:6px 11px;text-decoration:none;transition:all .12s ease}.btn:hover{background:var(--panel-hi);color:var(--ink)}.btn:disabled,.btn[disabled]{cursor:not-allowed;opacity:.4;pointer-events:none}.btn-primary{background:var(--accent);border-color:var(--accent);color:oklch(.18 .01 195)}.btn-primary:hover{filter:brightness(1.08)}.btn-ghost,.btn-ghost:hover{border-color:transparent}.btn-ghost:hover{background:var(--panel-hi)}.btn-danger{border-color:color-mix(in oklch,var(--bad),transparent 70%);color:var(--bad)}.btn-danger:hover{background:color-mix(in oklch,var(--bad),transparent 88%);border-color:color-mix(in oklch,var(--bad),transparent 50%);color:oklch(.85 .1 25)}.btn-lg{font-size:13px;padding:9px 14px}.btn-block{justify-content:center;width:100%}.btn-amber{background:var(--warn);border-color:var(--warn);color:oklch(.18 .01 80)}.btn-amber:hover{filter:brightness(1.08)}.btn-amber:disabled,.btn-amber[disabled]{cursor:not-allowed;opacity:.45;pointer-events:none}.update-chip{align-items:center;background:color-mix(in oklch,var(--warn),transparent 30%);border:1px solid color-mix(in oklch,var(--warn),transparent 
50%);border-radius:3px;color:oklch(.18 .01 80);display:inline-flex;font-size:10px;font-weight:500;gap:4px;line-height:1.4;padding:1px 6px;white-space:nowrap}.hero-tile{background:var(--panel);border:1px solid var(--line-soft);border-radius:7px;display:flex;flex-direction:column;gap:4px;padding:14px 16px;text-decoration:none;transition:filter .12s ease,background .12s ease}.hero-tile:hover{filter:brightness(1.08)}.hero-tile .hero-num{color:var(--ink);font-family:JetBrains Mono,ui-monospace,monospace;font-size:22px;font-weight:500;letter-spacing:-.01em}.hero-tile .hero-label{color:var(--ink-mute);font-size:11.5px}.hero-tile--amber{background:color-mix(in oklch,var(--warn),transparent 88%);border-color:color-mix(in oklch,var(--warn),transparent 60%)}.hero-tile--amber .hero-num{color:oklch(.86 .13 80)}.hero-tile--amber .hero-label{color:oklch(.78 .08 80)}.nav-tab{border-bottom:2px solid transparent;color:var(--ink-mute);cursor:pointer;font-size:13px;margin-right:28px;padding:18px 0;text-decoration:none}.nav-tab.active{border-color:var(--accent)}.nav-tab.active,.nav-tab:hover{color:var(--ink)}.sub-tab{border-bottom:1.5px solid transparent;color:var(--ink-mute);cursor:pointer;font-size:13px;margin-right:24px;padding:12px 0;text-decoration:none}.sub-tab.active{border-color:var(--ink);color:var(--ink)}.tag{align-items:center;border:1px solid var(--line);border-radius:3px;display:inline-flex;font-size:11px;gap:5px;letter-spacing:.01em;line-height:1;padding:4px 7px}.field-label,.tag{color:var(--ink-mid)}.field-label{display:block;font-size:12px;margin-bottom:6px}.field-help{color:var(--ink-mute);font-size:12px;line-height:1.55;margin-top:6px}.field{background:var(--bg);border:1px solid var(--line-soft);border-radius:5px;color:var(--ink);font-family:inherit;font-size:13px;outline:none;padding:9px 12px;transition:border-color .12s ease;width:100%}.field:focus{border-color:var(--accent)}.field.invalid{border-color:color-mix(in oklch,var(--bad),transparent 
50%)}.field.mono{font-family:JetBrains Mono,monospace;font-size:12px}.field.with-prefix{padding-left:64px}.host-row{align-items:center;border-left:3px solid transparent;-moz-column-gap:18px;column-gap:18px;display:grid;font-size:13px;grid-template-columns:24px 1.4fr .95fr 1.5fr .75fr .7fr .7fr 1.1fr 92px;padding:11px 16px}.host-row.head{color:var(--ink-fade);font-size:11px;letter-spacing:.08em;padding-bottom:10px;padding-top:10px;text-transform:uppercase}.host-row.degraded{border-left-color:color-mix(in oklch,var(--warn),transparent 50%)}.host-row.failed{border-left-color:color-mix(in oklch,var(--bad),transparent 50%)}.host-row.offline{border-left-color:color-mix(in oklch,var(--off),transparent 70%)}.host-row:hover{background:var(--panel-hi)}.host-row.clickable{position:relative}.host-row.clickable .row-link{inset:0;overflow:hidden;position:absolute;text-indent:-9999px;z-index:0}.host-row.clickable:hover{cursor:pointer}.host-row.clickable>*{pointer-events:none;position:relative;z-index:1}.host-row.clickable>.row-action,.host-row.clickable>.row-link{pointer-events:auto}.src-row{align-items:center;-moz-column-gap:18px;column-gap:18px;display:grid;grid-template-columns:1fr auto;padding:14px 18px}.src-row.clickable{position:relative}.src-row.clickable .row-link{inset:0;overflow:hidden;position:absolute;text-indent:-9999px;z-index:0}.src-row.clickable:hover{background:var(--panel-hi);cursor:pointer}.src-row.clickable>*{pointer-events:none;position:relative;z-index:1}.src-row.clickable>.row-action,.src-row.clickable>.row-link{pointer-events:auto}.dropdown{display:inline-block;position:relative}.dropdown summary{align-items:center;background:transparent;border:1px solid var(--line);border-radius:5px;color:var(--ink-mid);cursor:pointer;display:inline-flex;font-size:12px;font-weight:500;gap:6px;list-style:none;padding:6px 11px;transition:all .12s ease;-webkit-user-select:none;-moz-user-select:none;user-select:none}.dropdown 
summary::-webkit-details-marker{display:none}.dropdown summary::marker{content:""}.dropdown summary:hover{background:var(--panel-hi);color:var(--ink)}.dropdown summary .chev{color:var(--ink-fade);font-size:9px;transition:transform .12s ease}.dropdown[open] summary .chev{transform:rotate(180deg)}.dropdown[open] summary{background:var(--panel-hi);color:var(--ink)}.dropdown-menu{background:var(--panel);border:1px solid var(--line);border-radius:6px;box-shadow:0 6px 24px -8px rgba(0,0,0,.55);min-width:220px;padding:4px;position:absolute;right:0;top:calc(100% + 4px);z-index:30}.dropdown-item{border-radius:4px;color:var(--ink-mid);display:block;font-size:12.5px;line-height:1.35;padding:8px 11px;text-decoration:none}.dropdown-item:hover{background:var(--panel-hi);color:var(--ink)}.dropdown-item .label{color:var(--ink);display:block;font-weight:500}.dropdown-item .hint{color:var(--ink-mute);display:block;font-family:JetBrains Mono,ui-monospace,monospace;font-size:11px;margin-top:2px}.snap-row{align-items:center;border-bottom:1px solid var(--line-soft);-moz-column-gap:16px;column-gap:16px;cursor:pointer;display:grid;font-size:13px;grid-template-columns:150px 130px 1fr 90px 130px 80px;padding:11px 14px;transition:background .1s ease}.snap-row:last-child{border-bottom:0}.snap-row:hover{background:var(--panel-hi)}.snap-row.head{color:var(--ink-fade);cursor:default;font-size:11px;letter-spacing:.08em;padding-bottom:9px;padding-top:9px;text-transform:uppercase}.snap-row.head:hover{background:transparent}.alert-row{align-items:center;border-bottom:1px solid var(--line-soft);border-left:3px solid transparent;-moz-column-gap:16px;column-gap:16px;display:grid;font-size:13px;grid-template-columns:18px 110px 130px 1fr 130px 110px 180px;padding:12px 16px;transition:background .1s 
ease}.alert-row:hover{background:var(--panel-hi)}.alert-row:last-child{border-bottom:0}.alert-row.head{border-left-color:transparent;color:var(--ink-fade);cursor:default;font-size:11px;letter-spacing:.08em;padding-bottom:9px;padding-top:9px;text-transform:uppercase}.alert-row.head:hover{background:transparent}.alert-row.severity-warn{border-left-color:color-mix(in oklch,var(--warn),transparent 50%)}.alert-row.severity-critical{border-left-color:color-mix(in oklch,var(--bad),transparent 30%)}.alert-row.resolved{opacity:.55}.dot-critical{background:var(--bad);box-shadow:0 0 0 3px color-mix(in oklch,var(--bad),transparent 80%)}.tag.tag-active{background:color-mix(in oklch,var(--accent),transparent 92%);border-color:color-mix(in oklch,var(--accent),transparent 50%);color:var(--accent)}.tag-warn{background:color-mix(in oklch,var(--warn),transparent 92%);border-color:color-mix(in oklch,var(--warn),transparent 60%);color:var(--warn)}.tag-critical{background:color-mix(in oklch,var(--bad),transparent 92%);border-color:color-mix(in oklch,var(--bad),transparent 60%);color:var(--bad)}.tag-info{color:var(--ink-mid)}.audit-row{align-items:center;border-bottom:1px solid var(--line-soft);-moz-column-gap:16px;column-gap:16px;display:grid;font-size:13px;grid-template-columns:160px 80px 110px 1.4fr 1.5fr 90px;padding:11px 16px;transition:background .1s ease}.audit-row:hover{background:var(--panel-hi)}.audit-row:last-child{border-bottom:0}.audit-row.head{color:var(--ink-fade);cursor:default;font-size:11px;letter-spacing:.08em;padding-bottom:9px;padding-top:9px;text-transform:uppercase}.audit-row.head:hover{background:transparent}.audit-row.head .sort-header,.user-row.head .sort-header{align-items:baseline;color:inherit;cursor:pointer;display:inline-flex;gap:4px;text-decoration:none}.audit-row.head .sort-header:hover,.user-row.head .sort-header:hover{color:var(--ink)}.audit-row.head .sort-glyph,.user-row.head 
.sort-glyph{color:var(--accent);display:inline-block;font-size:9px;min-width:8px}.schd-row{align-items:center;-moz-column-gap:14px;column-gap:14px;display:grid;font-size:13px;grid-template-columns:78px 1fr 1.6fr 100px 110px auto;padding:12px 18px}.schd-row.head{color:var(--ink-fade);font-size:11px;letter-spacing:.08em;padding-bottom:10px;padding-top:10px;text-transform:uppercase}.schd-row.clickable{position:relative}.schd-row.clickable .row-link{inset:0;overflow:hidden;position:absolute;text-indent:-9999px;z-index:0}.schd-row.clickable:hover{background:var(--panel-hi);cursor:pointer}.schd-row.clickable>*{pointer-events:none;position:relative;z-index:1}.schd-row.clickable>.row-action,.schd-row.clickable>.row-link{pointer-events:auto}.preset-chip{background:var(--bg);border:1px solid var(--line-soft);border-radius:4px;color:var(--ink-mid);cursor:pointer;font-family:JetBrains Mono,monospace;font-size:11.5px;padding:4px 9px;transition:border-color .1s ease,color .1s ease;-webkit-user-select:none;-moz-user-select:none;user-select:none}.preset-chip:hover{border-color:var(--accent);color:var(--ink)}.picker{align-items:center;background:var(--bg);border:1px solid var(--line-soft);border-radius:5px;cursor:pointer;display:flex;font-size:13px;gap:12px;padding:10px 12px;transition:border-color .1s ease,background .1s ease}.picker:hover{border-color:var(--ink-mute)}.picker .check{border:1px solid var(--line);border-radius:3px;display:inline-block;flex-shrink:0;height:14px;position:relative;width:14px}.picker.checked{background:color-mix(in oklch,var(--accent),transparent 92%);border-color:color-mix(in oklch,var(--accent),transparent 50%)}.picker.checked .check{background:var(--accent);border-color:var(--accent)}.picker.checked .check:after{border:solid oklch(.18 .01 195);border-width:0 1.5px 1.5px 0;content:"";height:8px;left:4px;position:absolute;top:1px;transform:rotate(45deg);width:4px}.picker 
input[type=checkbox]{opacity:0;pointer-events:none;position:absolute}.keep-cell{background:var(--bg);border:1px solid var(--line-soft);border-radius:5px;display:flex;flex-direction:column;gap:4px;padding:9px 11px}.keep-cell label{color:var(--ink-fade);font-size:10.5px;letter-spacing:.08em;text-transform:uppercase}.keep-cell input{background:transparent;border:none;color:var(--ink);font-size:14px;outline:none;padding:0;width:100%}.keep-cell input,.log{font-family:JetBrains Mono,monospace}.log{background:var(--bg);border:1px solid var(--line-soft);border-radius:7px;font-size:12px;line-height:1.7;overflow:hidden}.log-line{align-items:baseline;-moz-column-gap:14px;column-gap:14px;display:grid;grid-template-columns:14ch 8ch 1fr;padding:1px 16px}.log-line:first-child{padding-top:12px}.log-line:last-child{padding-bottom:12px}.log-tag,.log-ts{color:var(--ink-fade)}.log-tag{font-size:10px;letter-spacing:.08em;text-transform:uppercase}.progress-track{background:var(--bg);border:1px solid var(--line-soft);border-radius:9999px;height:6px;overflow:hidden}.progress-fill{background:var(--accent);border-radius:9999px;height:100%;transition:width .25s ease}.progress-fill.ok{background:var(--ok)}.progress-fill.bad{background:var(--bad)}.crumbs{font-size:12px}.crumbs,.crumbs a{color:var(--ink-mute)}.crumbs a{text-decoration:underline;text-decoration-color:var(--line);text-underline-offset:3px}.crumbs .sep{color:var(--ink-fade);margin:0 8px}.snippet{border:1px solid var(--line-soft);border-radius:6px;overflow:hidden}.snippet-head{align-items:center;border-bottom:1px solid var(--line-soft);color:var(--ink-fade);display:flex;font-size:11px;justify-content:space-between;letter-spacing:.1em;padding:10px 14px;text-transform:uppercase}.snippet pre{color:var(--ink-mid);font-family:JetBrains Mono,monospace;font-size:12px;line-height:1.7;margin:0;padding:14px;white-space:pre-wrap;word-break:break-all}.snippet pre .var{color:var(--accent)}.empty-state{background:radial-gradient(ellipse at 
top,color-mix(in oklch,var(--accent),transparent 95%),transparent 60%),var(--panel);border:1px dashed var(--line);border-radius:8px;padding:60px 40px;text-align:center}.ch-row{align-items:center;border-bottom:1px solid var(--line-soft);-moz-column-gap:16px;column-gap:16px;display:grid;font-size:13px;grid-template-columns:28px 200px 1fr 100px 130px 140px;padding:14px 18px;transition:background .1s ease}.ch-row:last-child{border-bottom:0}.ch-row.head{color:var(--ink-fade);cursor:default;font-size:11px;letter-spacing:.08em;padding-bottom:10px;padding-top:10px;text-transform:uppercase}.ch-row.head:hover{background:transparent}.ch-row.clickable{cursor:pointer;position:relative}.ch-row.clickable .row-link{inset:0;overflow:hidden;position:absolute;text-indent:-9999px;z-index:0}.ch-row.clickable:hover{background:var(--panel-hi)}.ch-row.clickable>*{pointer-events:none;position:relative;z-index:1}.ch-row.clickable>.row-action,.ch-row.clickable>.row-link{pointer-events:auto}.ch-icon{align-items:center;background:var(--panel-hi);border:1px solid var(--line);border-radius:5px;color:var(--ink-mute);display:inline-flex;font-family:JetBrains Mono,monospace;font-size:10px;font-weight:600;height:24px;justify-content:center;width:24px}.ch-icon.webhook{border-color:color-mix(in oklch,var(--accent),transparent 60%);color:var(--accent)}.ch-icon.ntfy{border-color:color-mix(in oklch,var(--warn),transparent 60%);color:var(--warn)}.ch-icon.smtp{border-color:color-mix(in oklch,var(--ok),transparent 60%);color:var(--ok)}.toggle{background:var(--line);border-radius:9999px;cursor:pointer;display:inline-block;flex-shrink:0;height:16px;position:relative;transition:background .12s ease;width:30px}.toggle:after{background:var(--ink-mid);border-radius:9999px;content:"";height:12px;left:2px;position:absolute;top:2px;transition:all .12s ease;width:12px}.toggle.on{background:color-mix(in oklch,var(--accent),transparent 
50%)}.toggle.on:after{background:var(--accent);left:16px}.kind-grid{display:grid;gap:14px;grid-template-columns:1fr 1fr 1fr}.kind-card{background:var(--bg);border:1px solid var(--line-soft);border-radius:7px;cursor:pointer;padding:16px;transition:border-color .12s ease,background .12s ease}.kind-card:hover{border-color:var(--ink-mute)}.kind-card.selected{background:color-mix(in oklch,var(--accent),transparent 95%);border-color:color-mix(in oklch,var(--accent),transparent 50%)}.radio-pip{align-items:center;border:1px solid var(--line);border-radius:9999px;display:inline-flex;flex-shrink:0;height:14px;justify-content:center;width:14px}.radio-pip.on{border-color:var(--accent)}.radio-pip.on:after{background:var(--accent);border-radius:9999px;content:"";height:6px;width:6px}.user-row{align-items:center;border-bottom:1px solid var(--line-soft);-moz-column-gap:16px;column-gap:16px;display:grid;font-size:13px;grid-template-columns:180px 1fr 110px 160px 120px 90px;padding:11px 16px;transition:background .1s ease}.user-row:hover{background:var(--panel-hi)}.user-row:last-child{border-bottom:0}.user-row.head{color:var(--ink-fade);cursor:default;font-size:11px;letter-spacing:.08em;padding-bottom:9px;padding-top:9px;text-transform:uppercase}.user-row.head:hover{background:transparent}.user-row.disabled{opacity:.55}.test-pill{border-radius:5px;display:inline-block;font-size:12.5px;padding:5px 10px}.test-pill-ok{background:color-mix(in oklch,var(--ok),transparent 92%);border:1px solid color-mix(in oklch,var(--ok),transparent 60%);color:var(--ok)}.test-pill-fail{background:color-mix(in oklch,var(--bad),transparent 92%);border:1px solid color-mix(in oklch,var(--bad),transparent 
60%);color:var(--bad)}.pointer-events-none{pointer-events:none}.visible{visibility:visible}.invisible{visibility:hidden}.fixed{position:fixed}.absolute{position:absolute}.relative{position:relative}.inset-0{inset:0}.bottom-5{bottom:1.25rem}.left-0{left:0}.right-5{right:1.25rem}.top-0{top:0}.z-50{z-index:50}.col-span-2{grid-column:span 2/span 2}.col-span-3{grid-column:span 3/span 3}.col-span-4{grid-column:span 4/span 4}.col-span-5{grid-column:span 5/span 5}.col-span-7{grid-column:span 7/span 7}.col-span-8{grid-column:span 8/span 8}.col-span-9{grid-column:span 9/span 9}.m-0{margin:0}.mx-2{margin-left:.5rem;margin-right:.5rem}.mx-auto{margin-left:auto;margin-right:auto}.my-5{margin-bottom:1.25rem;margin-top:1.25rem}.mb-1\.5{margin-bottom:.375rem}.mb-10{margin-bottom:2.5rem}.mb-2{margin-bottom:.5rem}.mb-2\.5{margin-bottom:.625rem}.mb-3{margin-bottom:.75rem}.mb-3\.5{margin-bottom:.875rem}.mb-4{margin-bottom:1rem}.mb-5{margin-bottom:1.25rem}.mb-7{margin-bottom:1.75rem}.ml-1{margin-left:.25rem}.ml-2{margin-left:.5rem}.ml-2\.5{margin-left:.625rem}.ml-5{margin-left:1.25rem}.ml-auto{margin-left:auto}.mr-1{margin-right:.25rem}.mr-1\.5{margin-right:.375rem}.mt-0\.5{margin-top:.125rem}.mt-1{margin-top:.25rem}.mt-1\.5{margin-top:.375rem}.mt-2{margin-top:.5rem}.mt-2\.5{margin-top:.625rem}.mt-20{margin-top:5rem}.mt-3{margin-top:.75rem}.mt-3\.5{margin-top:.875rem}.mt-4{margin-top:1rem}.mt-5{margin-top:1.25rem}.mt-6{margin-top:1.5rem}.mt-7{margin-top:1.75rem}.mt-8{margin-top:2rem}.mt-9{margin-top:2.25rem}.block{display:block}.inline-block{display:inline-block}.inline{display:inline}.flex{display:flex}.inline-flex{display:inline-flex}.table{display:table}.grid{display:grid}.hidden{display:none}.h-3\.5{height:.875rem}.h-\[13px\]{height:13px}.h-\[22px\]{height:22px}.min-h-screen{min-height:100vh}.w-16{width:4rem}.w-3\.5{width:.875rem}.w-\[13px\]{width:13px}.w-\[22px\]{width:22px}.w-\[360px\]{width:360px}.w-\[420px\]{width:420px}.w-full{width:100%}.min-w-0{min-width:0}.max-w-\[1280px\]{m
ax-width:1280px}.max-w-\[440px\]{max-width:440px}.max-w-\[480px\]{max-width:480px}.max-w-\[520px\]{max-width:520px}.max-w-\[580px\]{max-width:580px}.max-w-\[640px\]{max-width:640px}.max-w-\[680px\]{max-width:680px}.max-w-\[720px\]{max-width:720px}.max-w-\[760px\]{max-width:760px}.flex-1{flex:1 1 0%}.flex-none{flex:none}.transform{transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.cursor-default{cursor:default}.cursor-help{cursor:help}.cursor-pointer{cursor:pointer}.select-none{-webkit-user-select:none;-moz-user-select:none;user-select:none}.select-all{-webkit-user-select:all;-moz-user-select:all;user-select:all}.resize{resize:both}.list-none{list-style-type:none}.grid-cols-1{grid-template-columns:repeat(1,minmax(0,1fr))}.grid-cols-12{grid-template-columns:repeat(12,minmax(0,1fr))}.grid-cols-2{grid-template-columns:repeat(2,minmax(0,1fr))}.grid-cols-3{grid-template-columns:repeat(3,minmax(0,1fr))}.flex-col{flex-direction:column}.flex-wrap{flex-wrap:wrap}.items-start{align-items:flex-start}.items-end{align-items:flex-end}.items-center{align-items:center}.items-baseline{align-items:baseline}.justify-end{justify-content:flex-end}.justify-center{justify-content:center}.justify-between{justify-content:space-between}.gap-1{gap:.25rem}.gap-1\.5{gap:.375rem}.gap-2{gap:.5rem}.gap-2\.5{gap:.625rem}.gap-3{gap:.75rem}.gap-3\.5{gap:.875rem}.gap-4{gap:1rem}.gap-5{gap:1.25rem}.gap-6{gap:1.5rem}.gap-8{gap:2rem}.gap-x-4{-moz-column-gap:1rem;column-gap:1rem}.gap-y-2{row-gap:.5rem}.gap-y-2\.5{row-gap:.625rem}.space-y-1>:not([hidden])~:not([hidden]){--tw-space-y-reverse:0;margin-bottom:calc(.25rem*var(--tw-space-y-reverse));margin-top:calc(.25rem*(1 - var(--tw-space-y-reverse)))}.space-y-2>:not([hidden])~:not([hidden]){--tw-space-y-reverse:0;margin-bottom:calc(.5rem*var(--tw-space-y-reverse));margin-top:calc(.5rem*(1 - 
var(--tw-space-y-reverse)))}.space-y-4>:not([hidden])~:not([hidden]){--tw-space-y-reverse:0;margin-bottom:calc(1rem*var(--tw-space-y-reverse));margin-top:calc(1rem*(1 - var(--tw-space-y-reverse)))}.overflow-hidden,.truncate{overflow:hidden}.truncate{text-overflow:ellipsis}.truncate,.whitespace-nowrap{white-space:nowrap}.whitespace-pre-wrap{white-space:pre-wrap}.text-pretty{text-wrap:pretty}.break-all{word-break:break-all}.rounded{border-radius:.25rem}.rounded-\[3px\]{border-radius:3px}.rounded-\[5px\]{border-radius:5px}.rounded-\[6px\]{border-radius:6px}.rounded-\[7px\]{border-radius:7px}.rounded-\[8px\]{border-radius:8px}.rounded-full{border-radius:9999px}.rounded-md{border-radius:.375rem}.border{border-width:1px}.border-y{border-top-width:1px}.border-b,.border-y{border-bottom-width:1px}.border-l{border-left-width:1px}.border-t{border-top-width:1px}.border-line{border-color:oklch(.27 .01 250)}.border-line-soft{border-color:oklch(.23 .008 250)}.bg-bg{background-color:oklch(.17 .006 250)}.bg-panel{background-color:oklch(.2 .007 
250)}.p-0{padding:0}.p-2{padding:.5rem}.p-3{padding:.75rem}.p-3\.5{padding:.875rem}.p-4{padding:1rem}.p-5{padding:1.25rem}.p-6{padding:1.5rem}.p-7{padding:1.75rem}.p-8{padding:2rem}.p-\[18px\]{padding:18px}.p-\[3px\]{padding:3px}.px-1{padding-left:.25rem;padding-right:.25rem}.px-2{padding-left:.5rem;padding-right:.5rem}.px-2\.5{padding-left:.625rem;padding-right:.625rem}.px-3{padding-left:.75rem;padding-right:.75rem}.px-3\.5{padding-left:.875rem;padding-right:.875rem}.px-4{padding-left:1rem;padding-right:1rem}.px-5{padding-left:1.25rem;padding-right:1.25rem}.px-7{padding-left:1.75rem;padding-right:1.75rem}.px-8{padding-left:2rem;padding-right:2rem}.px-\[18px\]{padding-left:18px;padding-right:18px}.py-0\.5{padding-bottom:.125rem;padding-top:.125rem}.py-1{padding-bottom:.25rem;padding-top:.25rem}.py-1\.5{padding-bottom:.375rem;padding-top:.375rem}.py-12{padding-bottom:3rem;padding-top:3rem}.py-2{padding-bottom:.5rem;padding-top:.5rem}.py-2\.5{padding-bottom:.625rem;padding-top:.625rem}.py-3{padding-bottom:.75rem;padding-top:.75rem}.py-3\.5{padding-bottom:.875rem;padding-top:.875rem}.py-4{padding-bottom:1rem;padding-top:1rem}.py-5{padding-bottom:1.25rem;padding-top:1.25rem}.py-6{padding-bottom:1.5rem;padding-top:1.5rem}.py-7{padding-bottom:1.75rem;padding-top:1.75rem}.py-8{padding-bottom:2rem;padding-top:2rem}.py-\[14px\]{padding-bottom:14px;padding-top:14px}.py-\[5px\]{padding-bottom:5px;padding-top:5px}.pb-14{padding-bottom:3.5rem}.pb-2{padding-bottom:.5rem}.pb-24{padding-bottom:6rem}.pb-3{padding-bottom:.75rem}.pb-4{padding-bottom:1rem}.pb-\[18px\]{padding-bottom:18px}.pl-5{padding-left:1.25rem}.pl-6{padding-left:1.5rem}.pl-9{padding-left:2.25rem}.pr-4{padding-right:1rem}.pt-0\.5{padding-top:.125rem}.pt-1{padding-top:.25rem}.pt-14{padding-top:3.5rem}.pt-2{padding-top:.5rem}.pt-20{padding-top:5rem}.pt-4{padding-top:1rem}.pt-5{padding-top:1.25rem}.pt-6{padding-top:1.5rem}.pt-7{padding-top:1.75rem}.pt-9{padding-top:2.25rem}.pt-\[1px\]{padding-top:1px}.text-left{text-al
ign:left}.text-center{text-align:center}.text-right{text-align:right}.text-2xl{font-size:1.5rem;line-height:2rem}.text-\[10\.5px\]{font-size:10.5px}.text-\[10px\]{font-size:10px}.text-\[11\.5px\]{font-size:11.5px}.text-\[11px\]{font-size:11px}.text-\[12\.5px\]{font-size:12.5px}.text-\[12px\]{font-size:12px}.text-\[13px\]{font-size:13px}.text-\[14px\]{font-size:14px}.text-\[16px\]{font-size:16px}.text-\[18px\]{font-size:18px}.text-\[19px\]{font-size:19px}.text-\[20px\]{font-size:20px}.text-\[22px\]{font-size:22px}.text-\[26px\]{font-size:26px}.text-\[28px\]{font-size:28px}.text-base{font-size:1rem;line-height:1.5rem}.text-lg{font-size:1.125rem;line-height:1.75rem}.text-sm{font-size:.875rem;line-height:1.25rem}.text-xs{font-size:.75rem;line-height:1rem}.font-medium{font-weight:500}.font-normal{font-weight:400}.font-semibold{font-weight:600}.uppercase{text-transform:uppercase}.normal-case{text-transform:none}.italic{font-style:italic}.leading-\[1\.55\]{line-height:1.55}.leading-\[1\.5\]{line-height:1.5}.leading-\[1\.65\]{line-height:1.65}.leading-\[1\.6\]{line-height:1.6}.leading-\[1\.7\]{line-height:1.7}.leading-\[20px\]{line-height:20px}.leading-none{line-height:1}.tracking-\[-0\.005em\]{letter-spacing:-.005em}.tracking-\[-0\.012em\]{letter-spacing:-.012em}.tracking-\[-0\.01em\]{letter-spacing:-.01em}.tracking-\[-0\.02em\]{letter-spacing:-.02em}.tracking-\[0\.005em\]{letter-spacing:.005em}.tracking-\[0\.01em\]{letter-spacing:.01em}.tracking-\[0\.02em\]{letter-spacing:.02em}.tracking-\[0\.08em\]{letter-spacing:.08em}.tracking-\[0\.1em\]{letter-spacing:.1em}.text-accent{color:oklch(.82 .12 195)}.text-bad{color:oklch(.7 .2 25)}.text-ink{color:oklch(.96 .005 250)}.text-ink-fade{color:oklch(.42 .006 250)}.text-ink-mid{color:oklch(.78 .005 250)}.text-ink-mute{color:oklch(.58 .006 250)}.text-ok{color:oklch(.78 .14 155)}.text-warn{color:oklch(.82 .13 
80)}.underline{text-decoration-line:underline}.no-underline{text-decoration-line:none}.decoration-line{text-decoration-color:oklch(.27 .01 250)}.underline-offset-4{text-underline-offset:4px}.opacity-40{opacity:.4}.filter{filter:var(--tw-blur) var(--tw-brightness) var(--tw-contrast) var(--tw-grayscale) var(--tw-hue-rotate) var(--tw-invert) var(--tw-saturate) var(--tw-sepia) var(--tw-drop-shadow)}.transition{transition-duration:.15s;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,-webkit-backdrop-filter;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter,-webkit-backdrop-filter;transition-timing-function:cubic-bezier(.4,0,.2,1)}.hover\:text-ink:hover{color:oklch(.96 .005 250)}.hover\:text-ink-mid:hover{color:oklch(.78 .005 250)}.hover\:underline:hover{text-decoration-line:underline} diff --git a/web/styles/input.css b/web/styles/input.css index 4420168..fb27b08 100644 --- a/web/styles/input.css +++ b/web/styles/input.css @@ -104,6 +104,65 @@ .btn-lg { font-size: 13px; padding: 9px 14px; } .btn-block { width: 100%; justify-content: center; } + /* Amber action — used for the per-host "Update agent" button and + the fleet-update Start button. Same warning palette as the + update-chip below. */ + .btn-amber { + color: oklch(0.18 0.01 80); + background: var(--warn); + border-color: var(--warn); + } + .btn-amber:hover { filter: brightness(1.08); } + .btn-amber:disabled, .btn-amber[disabled] { + opacity: 0.45; cursor: not-allowed; pointer-events: none; + } + + /* Update-available chip — small amber pill rendered next to a host's + agent version (in the row OS column and in the host detail + header). Hidden when the host is up to date. 
*/ + .update-chip { + display: inline-flex; align-items: center; gap: 4px; + padding: 1px 6px; + border-radius: 3px; + font-size: 10px; font-weight: 500; + line-height: 1.4; + color: oklch(0.18 0.01 80); + background: color-mix(in oklch, var(--warn), transparent 30%); + border: 1px solid color-mix(in oklch, var(--warn), transparent 50%); + white-space: nowrap; + } + + /* Hero tile — large, clickable summary card on the dashboard. + Today only used by the "N hosts behind" tile; the existing + four summary boxes use bespoke grid markup. Add more variants + as adjacent dashboard tiles adopt this. */ + .hero-tile { + display: flex; flex-direction: column; gap: 4px; + padding: 14px 16px; + border-radius: 7px; + border: 1px solid var(--line-soft); + background: var(--panel); + text-decoration: none; + transition: filter 120ms ease, background 120ms ease; + } + .hero-tile:hover { filter: brightness(1.08); } + .hero-tile .hero-num { + font-family: 'JetBrains Mono', ui-monospace, monospace; + font-size: 22px; font-weight: 500; + letter-spacing: -0.01em; + color: var(--ink); + } + .hero-tile .hero-label { + font-size: 11.5px; + color: var(--ink-mute); + } + .hero-tile--amber { + background: color-mix(in oklch, var(--warn), transparent 88%); + border-color: color-mix(in oklch, var(--warn), transparent 60%); + } + .hero-tile--amber .hero-num { color: oklch(0.86 0.13 80); } + .hero-tile--amber .hero-label { color: oklch(0.78 0.08 80); } + /* ---------- nav tabs ---------- */ .nav-tab { font-size: 13px; padding: 18px 0; diff --git a/web/templates/pages/dashboard.html b/web/templates/pages/dashboard.html index b4ffb23..e29dbc8 100644 --- a/web/templates/pages/dashboard.html +++ b/web/templates/pages/dashboard.html @@ -66,6 +66,16 @@ + {{/* ---------- Hosts-behind hero tile (P6-18) ---------- */}} + {{if gt $page.UpdatesBehind 0}} + + {{end}} + {{/* ---------- Pending hosts (announce-and-approve queue) ---------- */}} {{if gt (len $page.PendingHosts) 0}}
diff --git a/web/templates/pages/fleet_update.html b/web/templates/pages/fleet_update.html new file mode 100644 index 0000000..3373e0d --- /dev/null +++ b/web/templates/pages/fleet_update.html @@ -0,0 +1,32 @@ +{{define "title"}}Fleet update · restic-manager{{end}} + +{{define "content"}} +{{$page := .Page}} +
+ + {{/* breadcrumbs */}} +
+ Dashboard/ + Settings/ + fleet update +
+ + {{/* page header */}} +
+
+

+ Fleet update + target {{$page.TargetVersion}} +

+

+ Rolling, sequential agent self-update. One host at a time, halts on first failure, + cancellable mid-roll. Only online hosts whose agent_version + differs from the server are eligible. +

+
+
+ + {{template "fleet_update_inner" .}} + +
+{{end}} diff --git a/web/templates/pages/host_detail.html b/web/templates/pages/host_detail.html index fdd9890..4d3b134 100644 --- a/web/templates/pages/host_detail.html +++ b/web/templates/pages/host_detail.html @@ -78,6 +78,26 @@

+ {{if and $page.CanAdmin $page.UpdateAvailable}} +
+
Agent update
+

+ Agent at {{$host.AgentVersion}} · + server at {{$page.TargetVersion}}. + Pushes a self-update command; the agent re-launches into the new binary + and reconnects. +

+
+ +
+
+ {{end}} +
Restore

diff --git a/web/templates/partials/fleet_update_inner.html b/web/templates/partials/fleet_update_inner.html new file mode 100644 index 0000000..357881f --- /dev/null +++ b/web/templates/partials/fleet_update_inner.html @@ -0,0 +1,171 @@ +{{/* + fleet_update_inner — inner panel for /settings/fleet-update. + Rendered both as part of the full page and as the htmx polling + fragment via /settings/fleet-update/partial. + + Expects .Page to be a fleetUpdatePage struct (see fleet_update.go). +*/}} +{{define "fleet_update_inner"}} +{{$page := .Page}} +

+ +{{if and $page.Active (eq $page.Active.Status "running")}} + + {{/* ---------- running state ---------- */}} +
+
+
+ fleet update + running + {{$page.Active.ID}} +
+
+ +
+
+
+ target {{$page.Active.TargetVersion}} + · started {{relTime $page.Active.StartedAt}} + {{if $page.Active.CurrentHostID}} + · waiting on {{index $page.HostNames $page.Active.CurrentHostID}} + {{end}} +
+
+ + {{template "fleet_update_rows" $page}} + +{{else if $page.Active}} + + {{/* ---------- terminal state (completed / halted / cancelled) ---------- */}} +
+
+
+ last fleet update + {{if eq $page.Active.Status "completed"}} + completed + {{else if eq $page.Active.Status "halted"}} + halted + {{else if eq $page.Active.Status "cancelled"}} + cancelled + {{else}} + {{$page.Active.Status}} + {{end}} + {{$page.Active.ID}} +
+
+
+ target {{$page.Active.TargetVersion}} + · started {{relTime $page.Active.StartedAt}} + {{if $page.Active.CompletedAt}} · finished {{relTime $page.Active.CompletedAt}}{{end}} +
+ {{if $page.Active.HaltedReason}} +
{{$page.Active.HaltedReason}}
+ {{end}} +
+ + {{template "fleet_update_rows" $page}} + + {{if gt (len $page.OutOfDateHosts) 0}} +
+ {{template "fleet_update_idle_panel" $page}} +
+ {{end}} + +{{else}} + + {{template "fleet_update_idle_panel" $page}} + +{{end}} +
+{{end}} + +{{define "fleet_update_rows"}} +{{$page := .}} +
+
+
#
+
Host
+
Status
+
Job
+
Detail
+
+ {{range $page.ActiveRows}} +
+
{{.Position}}
+
{{if .HostName}}{{.HostName}}{{else}}{{.HostID}}{{end}}
+
+ {{if eq .Status "pending"}}pending + {{else if eq .Status "running"}}running… + {{else if eq .Status "succeeded"}}succeeded + {{else if eq .Status "failed"}}failed + {{else if eq .Status "skipped"}}skipped + {{else}}{{.Status}}{{end}} +
+
+ {{if .JobID}}{{.JobID}}{{else}}{{end}} +
+
{{.FailedReason}}
+
+ {{end}} +
+{{end}} + +{{define "fleet_update_idle_panel"}} +{{$page := .}} +
+ {{if eq (len $page.OutOfDateHosts) 0}} +
+ +
+
All hosts are up to date.
+
+ Every online agent matches server version {{$page.TargetVersion}}. +
+
+
+ {{else}} +
+

{{len $page.OutOfDateHosts}} host{{if ne (len $page.OutOfDateHosts) 1}}s{{end}} out of date

+ target {{$page.TargetVersion}} +
+
    + {{range $page.OutOfDateHosts}} +
  • + + {{.Name}} + {{if .AgentVersion}}{{.AgentVersion}}{{else}}—{{end}} → {{$page.TargetVersion}} +
  • + {{end}} +
+ +
+ + + +
+ {{end}} +
+{{end}} diff --git a/web/templates/partials/host_chrome.html b/web/templates/partials/host_chrome.html index dad2c56..2f02f0b 100644 --- a/web/templates/partials/host_chrome.html +++ b/web/templates/partials/host_chrome.html @@ -83,7 +83,7 @@
{{$host.OS}}/{{$host.Arch}} · - agent {{if $host.AgentVersion}}{{$host.AgentVersion}}{{else}}—{{end}} + agent {{if $host.AgentVersion}}{{$host.AgentVersion}}{{else}}—{{end}}{{if $page.UpdateAvailable}} {{template "host_update_chip" $page}}{{end}} · restic {{if $host.ResticVersion}}{{$host.ResticVersion}}{{else}}—{{end}} · diff --git a/web/templates/partials/host_row.html b/web/templates/partials/host_row.html index 9c7799e..128d417 100644 --- a/web/templates/partials/host_row.html +++ b/web/templates/partials/host_row.html @@ -14,7 +14,7 @@ {{- end -}}
{{$h.Name}}
-
{{$h.OS}}/{{$h.Arch}}
+
{{$h.OS}}/{{$h.Arch}}{{if .UpdateAvailable}} {{template "host_update_chip" .}}{{end}}
{{- if $h.CurrentJobID -}} backup running…
diff --git a/web/templates/partials/host_update_chip.html b/web/templates/partials/host_update_chip.html new file mode 100644 index 0000000..b2d7837 --- /dev/null +++ b/web/templates/partials/host_update_chip.html @@ -0,0 +1,11 @@ +{{/* + host_update_chip — small amber chip rendered when the agent version + on a host is behind the server's. Expects: + .UpdateAvailable bool + .TargetVersion string + .Host store.Host (for AgentVersion) + Hidden entirely when UpdateAvailable is false. +*/}} +{{define "host_update_chip"}} +{{if .UpdateAvailable}}out of date · {{.Host.AgentVersion}} → {{.TargetVersion}}{{end}} +{{end}}