Merge pull request 'P6-01 + P6-02: agent self-update + fleet update' (#19) from p6-agent-self-update into main
Reviewed-on: #19
This commit is contained in:
@@ -7,7 +7,9 @@ AGENT_BIN := $(BIN_DIR)/restic-manager-agent
|
|||||||
VERSION ?= $(shell git describe --tags --always --dirty 2>/dev/null || echo dev)
|
VERSION ?= $(shell git describe --tags --always --dirty 2>/dev/null || echo dev)
|
||||||
COMMIT ?= $(shell git rev-parse HEAD 2>/dev/null || echo none)
|
COMMIT ?= $(shell git rev-parse HEAD 2>/dev/null || echo none)
|
||||||
DATE ?= $(shell date -u +%Y-%m-%dT%H:%M:%SZ)
|
DATE ?= $(shell date -u +%Y-%m-%dT%H:%M:%SZ)
|
||||||
LDFLAGS := -s -w -X main.version=$(VERSION) -X main.commit=$(COMMIT) -X main.date=$(DATE)
|
VERSION_PKG := gitea.dcglab.co.uk/steve/restic-manager/internal/version
|
||||||
|
LDFLAGS := -s -w -X main.version=$(VERSION) -X main.commit=$(COMMIT) -X main.date=$(DATE) \
|
||||||
|
-X $(VERSION_PKG).Version=$(VERSION) -X $(VERSION_PKG).Commit=$(COMMIT)
|
||||||
GOFLAGS := -trimpath
|
GOFLAGS := -trimpath
|
||||||
DOCKER_IMAGE ?= gitea.dcglab.co.uk/steve/restic-manager
|
DOCKER_IMAGE ?= gitea.dcglab.co.uk/steve/restic-manager
|
||||||
DOCKER_TAG ?= dev
|
DOCKER_TAG ?= dev
|
||||||
|
|||||||
+8
-4
@@ -148,6 +148,7 @@ func run() error {
|
|||||||
resticBin: resticBin,
|
resticBin: resticBin,
|
||||||
resticVer: snap.ResticVersion,
|
resticVer: snap.ResticVersion,
|
||||||
resticSupportsNoOwnership: resticSupportsNoOwnership,
|
resticSupportsNoOwnership: resticSupportsNoOwnership,
|
||||||
|
serverURL: cfg.ServerURL,
|
||||||
secrets: sec,
|
secrets: sec,
|
||||||
scheduler: scheduler.New(),
|
scheduler: scheduler.New(),
|
||||||
}
|
}
|
||||||
@@ -214,6 +215,7 @@ type dispatcher struct {
|
|||||||
resticBin string
|
resticBin string
|
||||||
resticVer string // e.g. "0.17.1"; empty if restic isn't installed yet
|
resticVer string // e.g. "0.17.1"; empty if restic isn't installed yet
|
||||||
resticSupportsNoOwnership bool // captured at startup from `restic restore --help`
|
resticSupportsNoOwnership bool // captured at startup from `restic restore --help`
|
||||||
|
serverURL string // base URL of the server (used by the self-update fetch)
|
||||||
secrets *secrets.Store
|
secrets *secrets.Store
|
||||||
scheduler *scheduler.Scheduler
|
scheduler *scheduler.Scheduler
|
||||||
|
|
||||||
@@ -395,10 +397,12 @@ func (d *dispatcher) handle(ctx context.Context, env api.Envelope, tx wsclient.S
|
|||||||
"up_kbps", up, "down_kbps", down)
|
"up_kbps", up, "down_kbps", down)
|
||||||
}
|
}
|
||||||
|
|
||||||
case api.MsgAgentUpdateAvail:
|
case api.MsgCommandUpdate:
|
||||||
var p api.AgentUpdateAvailablePayload
|
var p api.CommandUpdatePayload
|
||||||
_ = env.UnmarshalPayload(&p)
|
if err := env.UnmarshalPayload(&p); err != nil {
|
||||||
slog.Info("ws agent: update available", "version", p.LatestVersion, "url", p.PackageURL)
|
return fmt.Errorf("command.update: %w", err)
|
||||||
|
}
|
||||||
|
go d.runUpdate(ctx, p, tx)
|
||||||
|
|
||||||
default:
|
default:
|
||||||
slog.Debug("ws agent: ignored message", "type", env.Type)
|
slog.Debug("ws agent: ignored message", "type", env.Type)
|
||||||
|
|||||||
@@ -0,0 +1,65 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"log/slog"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/agent/updater"
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/agent/wsclient"
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
|
||||||
|
)
|
||||||
|
|
||||||
|
// runUpdate handles a server-dispatched command.update. It logs progress
|
||||||
|
// via log.stream so the live job page captures pre-restart state, then
|
||||||
|
// calls the platform updater. On Linux the updater calls os.Exit; on
|
||||||
|
// Windows it spawns a detached helper and returns, with the agent then
|
||||||
|
// exiting.
|
||||||
|
//
|
||||||
|
// The terminal job state is set by the server, not the agent: success
|
||||||
|
// is "agent re-hellos with matching version" rather than anything the
|
||||||
|
// agent itself can assert. The only `job.finished` we send from here is
|
||||||
|
// on the failure path, before any restart attempt.
|
||||||
|
func (d *dispatcher) runUpdate(ctx context.Context, p api.CommandUpdatePayload, tx wsclient.Sender) {
|
||||||
|
logf := func(format string, args ...any) {
|
||||||
|
line := fmt.Sprintf(format, args...)
|
||||||
|
slog.Info("ws agent: update: " + line)
|
||||||
|
env, err := api.Marshal(api.MsgLogStream, "", api.LogStreamLine{
|
||||||
|
JobID: p.JobID,
|
||||||
|
TS: time.Now().UTC(),
|
||||||
|
Stream: api.LogStdout,
|
||||||
|
Payload: line,
|
||||||
|
})
|
||||||
|
if err == nil {
|
||||||
|
_ = tx.Send(env)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
startedEnv, err := api.Marshal(api.MsgJobStarted, "", api.JobStartedPayload{
|
||||||
|
JobID: p.JobID,
|
||||||
|
Kind: api.JobUpdate,
|
||||||
|
StartedAt: time.Now().UTC(),
|
||||||
|
})
|
||||||
|
if err == nil {
|
||||||
|
_ = tx.Send(startedEnv)
|
||||||
|
}
|
||||||
|
|
||||||
|
logf("fetching new binary from %s", d.serverURL)
|
||||||
|
if err := updater.Update(ctx, d.serverURL); err != nil {
|
||||||
|
logf("update failed: %v", err)
|
||||||
|
finishedEnv, mErr := api.Marshal(api.MsgJobFinished, "", api.JobFinishedPayload{
|
||||||
|
JobID: p.JobID,
|
||||||
|
Status: api.JobFailed,
|
||||||
|
FinishedAt: time.Now().UTC(),
|
||||||
|
Error: err.Error(),
|
||||||
|
})
|
||||||
|
if mErr == nil {
|
||||||
|
_ = tx.Send(finishedEnv)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
// Unreachable on Linux (Update calls os.Exit). On Windows control
|
||||||
|
// returns here while the detached helper does the swap-and-restart;
|
||||||
|
// the agent then exits cleanly so SCM hands off.
|
||||||
|
}
|
||||||
@@ -17,6 +17,7 @@ import (
|
|||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/crypto"
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/crypto"
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/notification"
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/notification"
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/config"
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/config"
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/fleetupdate"
|
||||||
rmhttp "gitea.dcglab.co.uk/steve/restic-manager/internal/server/http"
|
rmhttp "gitea.dcglab.co.uk/steve/restic-manager/internal/server/http"
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/maintenance"
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/maintenance"
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/oidc"
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/oidc"
|
||||||
@@ -91,6 +92,7 @@ func run() error {
|
|||||||
|
|
||||||
notifHub := notification.NewHub(st, aead, cfg.BaseURL)
|
notifHub := notification.NewHub(st, aead, cfg.BaseURL)
|
||||||
alertEngine := alert.NewEngine(st, notifHub)
|
alertEngine := alert.NewEngine(st, notifHub)
|
||||||
|
updateWatcher := ws.NewUpdateWatcher(st, alertEngine)
|
||||||
|
|
||||||
renderer, err := ui.New()
|
renderer, err := ui.New()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -116,6 +118,7 @@ func run() error {
|
|||||||
JobHub: jobHub,
|
JobHub: jobHub,
|
||||||
AlertEngine: alertEngine,
|
AlertEngine: alertEngine,
|
||||||
NotificationHub: notifHub,
|
NotificationHub: notifHub,
|
||||||
|
UpdateWatcher: updateWatcher,
|
||||||
UI: renderer,
|
UI: renderer,
|
||||||
Version: version,
|
Version: version,
|
||||||
OIDC: oidcClient,
|
OIDC: oidcClient,
|
||||||
@@ -147,10 +150,17 @@ func run() error {
|
|||||||
|
|
||||||
srv := rmhttp.New(deps)
|
srv := rmhttp.New(deps)
|
||||||
|
|
||||||
|
// Fleet-update worker — built after the HTTP server because the
|
||||||
|
// dispatcher delegates back into srv.DispatchHostUpdate.
|
||||||
|
fleetWorker := fleetupdate.NewWorker(st, hub,
|
||||||
|
&serverDispatcher{srv: srv}, alertEngine)
|
||||||
|
srv.SetFleetWorker(fleetWorker)
|
||||||
|
|
||||||
ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
|
ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
|
||||||
defer stop()
|
defer stop()
|
||||||
|
|
||||||
go alertEngine.Run(ctx)
|
go alertEngine.Run(ctx)
|
||||||
|
go updateWatcher.Run(ctx)
|
||||||
|
|
||||||
errCh := make(chan error, 1)
|
errCh := make(chan error, 1)
|
||||||
go func() {
|
go func() {
|
||||||
@@ -243,3 +253,12 @@ func run() error {
|
|||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// serverDispatcher adapts the http.Server's DispatchHostUpdate method
|
||||||
|
// to the fleetupdate.Dispatcher interface. Lives in main so the
|
||||||
|
// http and fleetupdate packages don't need to know about each other.
|
||||||
|
type serverDispatcher struct{ srv *rmhttp.Server }
|
||||||
|
|
||||||
|
func (d *serverDispatcher) DispatchUpdate(ctx context.Context, hostID, actorUserID string) (string, string, error) {
|
||||||
|
return d.srv.DispatchHostUpdate(ctx, hostID, actorUserID)
|
||||||
|
}
|
||||||
|
|||||||
@@ -52,7 +52,12 @@ ProtectSystem=full
|
|||||||
# whenever a new SecretsKey is minted, so we need a targeted
|
# whenever a new SecretsKey is minted, so we need a targeted
|
||||||
# write-exemption for that dir. No exemption for the rest of /etc:
|
# write-exemption for that dir. No exemption for the rest of /etc:
|
||||||
# the agent has no business editing /etc/passwd, /etc/sudoers, etc.
|
# the agent has no business editing /etc/passwd, /etc/sudoers, etc.
|
||||||
ReadWritePaths=/etc/restic-manager
|
#
|
||||||
|
# /usr/local/bin is writable so the self-update flow (P6-01) can
|
||||||
|
# atomic-rename a fresh binary over the running one. Permitting the
|
||||||
|
# whole directory (rather than just the binary path) is required
|
||||||
|
# because os.Rename needs write permission on the parent dir.
|
||||||
|
ReadWritePaths=/etc/restic-manager /usr/local/bin
|
||||||
ProtectHostname=true
|
ProtectHostname=true
|
||||||
ProtectKernelTunables=true
|
ProtectKernelTunables=true
|
||||||
ProtectKernelModules=true
|
ProtectKernelModules=true
|
||||||
|
|||||||
@@ -0,0 +1,100 @@
|
|||||||
|
// Package updater carries the agent's self-update logic.
|
||||||
|
//
|
||||||
|
// The flow is operator-driven: the server dispatches a command.update
|
||||||
|
// WS envelope, the agent fetches a fresh binary from the server's
|
||||||
|
// /agent/binary endpoint, atomic-renames it over the running binary
|
||||||
|
// (Linux) or hands off to a detached helper script (Windows), and
|
||||||
|
// exits cleanly so the service manager restarts under the new
|
||||||
|
// binary. See docs/superpowers/specs/2026-05-06-p6-01-02-...
|
||||||
|
//
|
||||||
|
// Platform-specific code is build-tagged into updater_unix.go /
|
||||||
|
// updater_windows.go. This file holds the shared HTTP fetch + path
|
||||||
|
// helpers + the test seam.
|
||||||
|
package updater
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"net/http"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"runtime"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// fetch downloads the agent binary for the running GOOS/GOARCH from
// <serverURL>/agent/binary and stages it at binaryPath + ".new".
//
// The staged file is written, fsynced, closed and chmodded to 0o755
// before its path is returned. If anything fails after the stage file has
// been created, the file is removed so a partial download is never left
// where a later swap could pick it up.
//
// A 200 response with an empty body is rejected: renaming a zero-byte
// file over the running binary would leave the service unable to start.
//
// Returns the path of the staged file (always binaryPath + ".new") or a
// wrapped error describing which step failed. A non-200 response yields
// "agent binary fetch: <status>".
func fetch(ctx context.Context, serverURL, binaryPath string) (string, error) {
	url := fmt.Sprintf("%s/agent/binary?os=%s&arch=%s", serverURL, runtime.GOOS, runtime.GOARCH)
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return "", fmt.Errorf("agent binary fetch: build request: %w", err)
	}
	// The client timeout bounds the whole exchange, body read included.
	c := &http.Client{Timeout: 5 * time.Minute}
	res, err := c.Do(req)
	if err != nil {
		return "", fmt.Errorf("agent binary fetch: %w", err)
	}
	defer func() { _ = res.Body.Close() }()
	if res.StatusCode != http.StatusOK {
		return "", fmt.Errorf("agent binary fetch: %s", res.Status)
	}

	stagePath := binaryPath + ".new"
	f, err := os.OpenFile(stagePath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o755)
	if err != nil {
		return "", fmt.Errorf("agent binary fetch: create stage file: %w", err)
	}
	// Single cleanup point for every failure below. Closing an already
	// closed file only returns an error, which is deliberately ignored.
	staged := false
	defer func() {
		if !staged {
			_ = f.Close()
			_ = os.Remove(stagePath)
		}
	}()

	n, err := io.Copy(f, res.Body)
	if err != nil {
		return "", fmt.Errorf("agent binary fetch: download: %w", err)
	}
	if n == 0 {
		return "", fmt.Errorf("agent binary fetch: server returned an empty body")
	}
	if err := f.Sync(); err != nil {
		return "", fmt.Errorf("agent binary fetch: sync stage file: %w", err)
	}
	if err := f.Close(); err != nil {
		return "", fmt.Errorf("agent binary fetch: close stage file: %w", err)
	}
	// OpenFile's mode is filtered by the umask; chmod makes 0o755 exact.
	if err := os.Chmod(stagePath, 0o755); err != nil {
		return "", fmt.Errorf("agent binary fetch: chmod stage file: %w", err)
	}
	staged = true
	return stagePath, nil
}
|
||||||
|
|
||||||
|
// resolveOwnBinary reports the absolute path of the executable that
// started this process, as given by os.Executable and made absolute with
// filepath.Abs.
//
// A result of exactly "/proc/self/exe" is refused with an error: that is
// what os.Executable returns on some systems, and it is not a path a new
// binary can be renamed over.
func resolveOwnBinary() (string, error) {
	exe, err := os.Executable()
	if err != nil {
		return "", err
	}
	resolved, err := filepath.Abs(exe)
	switch {
	case err != nil:
		return "", err
	case resolved == "/proc/self/exe":
		return "", fmt.Errorf("cannot resolve own binary path (/proc/self/exe)")
	}
	return resolved, nil
}
|
||||||
|
|
||||||
|
// UpdateForTest is the platform-neutral test seam. In production the
|
||||||
|
// platform-specific Update fetches, swaps, then exits the process.
|
||||||
|
// UpdateForTest stops short of the exit so unit tests can assert on
|
||||||
|
// file state.
|
||||||
|
func UpdateForTest(serverURL, binaryPath string) error {
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
|
||||||
|
defer cancel()
|
||||||
|
stage, err := fetch(ctx, serverURL, binaryPath)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return swap(stage, binaryPath)
|
||||||
|
}
|
||||||
@@ -0,0 +1,87 @@
|
|||||||
|
//go:build !windows
|
||||||
|
|
||||||
|
package updater
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"io"
|
||||||
|
"net/http"
|
||||||
|
"net/http/httptest"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"runtime"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestUpdate_LinuxAtomicSwap stages a fake "running binary" file, runs
|
||||||
|
// UpdateForTest against a fake /agent/binary server, and asserts that
|
||||||
|
// the binary was swapped, .old preserves the previous bytes, and .new
|
||||||
|
// was renamed away.
|
||||||
|
func TestUpdate_LinuxAtomicSwap(t *testing.T) {
|
||||||
|
tmp := t.TempDir()
|
||||||
|
binPath := filepath.Join(tmp, "agent")
|
||||||
|
if err := os.WriteFile(binPath, []byte("OLD"), 0o755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
newBytes := []byte("NEW BINARY CONTENTS")
|
||||||
|
|
||||||
|
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if r.URL.Path != "/agent/binary" {
|
||||||
|
http.NotFound(w, r)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
gotOS, gotArch := r.URL.Query().Get("os"), r.URL.Query().Get("arch")
|
||||||
|
if gotOS != runtime.GOOS || gotArch != runtime.GOARCH {
|
||||||
|
t.Errorf("query mismatch: got os=%s arch=%s want %s/%s",
|
||||||
|
gotOS, gotArch, runtime.GOOS, runtime.GOARCH)
|
||||||
|
}
|
||||||
|
_, _ = io.Copy(w, bytes.NewReader(newBytes))
|
||||||
|
}))
|
||||||
|
defer srv.Close()
|
||||||
|
|
||||||
|
if err := UpdateForTest(srv.URL, binPath); err != nil {
|
||||||
|
t.Fatalf("update: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
got, err := os.ReadFile(binPath)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if string(got) != string(newBytes) {
|
||||||
|
t.Fatalf("binary contents: got %q want %q", got, newBytes)
|
||||||
|
}
|
||||||
|
old, err := os.ReadFile(binPath + ".old")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("agent.old missing: %v", err)
|
||||||
|
}
|
||||||
|
if string(old) != "OLD" {
|
||||||
|
t.Fatalf("agent.old contents: got %q want %q", old, "OLD")
|
||||||
|
}
|
||||||
|
if _, err := os.Stat(binPath + ".new"); !os.IsNotExist(err) {
|
||||||
|
t.Fatalf("agent.new should be absent after swap, got err=%v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestUpdate_FetchHTTPError surfaces the server's status when the
|
||||||
|
// binary is not published for this os/arch.
|
||||||
|
func TestUpdate_FetchHTTPError(t *testing.T) {
|
||||||
|
tmp := t.TempDir()
|
||||||
|
binPath := filepath.Join(tmp, "agent")
|
||||||
|
if err := os.WriteFile(binPath, []byte("OLD"), 0o755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
http.Error(w, `{"error":"binary_not_published"}`, http.StatusNotFound)
|
||||||
|
}))
|
||||||
|
defer srv.Close()
|
||||||
|
|
||||||
|
err := UpdateForTest(srv.URL, binPath)
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("expected error, got nil")
|
||||||
|
}
|
||||||
|
got, _ := os.ReadFile(binPath)
|
||||||
|
if string(got) != "OLD" {
|
||||||
|
t.Fatalf("binary should not have changed, got %q", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,73 @@
|
|||||||
|
//go:build !windows
|
||||||
|
|
||||||
|
package updater
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"log/slog"
|
||||||
|
"os"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Update fetches the new binary, swaps it in, then exits so systemd
|
||||||
|
// restarts the process under the new binary. The caller should close
|
||||||
|
// the WS connection cleanly (so the server transitions the host to
|
||||||
|
// disconnected immediately rather than waiting for the heartbeat
|
||||||
|
// sweep) before invoking.
|
||||||
|
//
|
||||||
|
// Service-user assumption: the agent runs as root under the
|
||||||
|
// systemd-shipped unit, which can write the binary path directly.
|
||||||
|
// If the agent ever moves to a non-root service user, this breaks —
|
||||||
|
// would need a setuid helper or an out-of-process update service.
|
||||||
|
func Update(ctx context.Context, serverURL string) error {
|
||||||
|
binPath, err := resolveOwnBinary()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
stage, err := fetch(ctx, serverURL, binPath)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := swap(stage, binPath); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
slog.Info("agent self-update: binary swapped, exiting for systemd restart",
|
||||||
|
"binary", binPath)
|
||||||
|
// Give logger / WS close-frame a moment to flush, then exit.
|
||||||
|
time.Sleep(200 * time.Millisecond)
|
||||||
|
os.Exit(0)
|
||||||
|
return nil // unreachable
|
||||||
|
}
|
||||||
|
|
||||||
|
// swap installs the staged binary at binPath, keeping one revision back:
// the current contents of binPath are first copied to binPath + ".old"
// (M1 — for hand-rolled rollback), then stagePath is renamed over
// binPath. Linux supports rename-while-open, so this works even though
// the running process holds the target open.
//
// On error the staged file is left where it is.
func swap(stagePath, binPath string) error {
	if err := copyToOld(binPath, binPath+".old"); err != nil {
		return err
	}
	if err := os.Rename(stagePath, binPath); err != nil {
		return fmt.Errorf("rename .new over running binary: %w", err)
	}
	return nil
}

// copyToOld writes a synced copy of the file at from to the path to,
// creating or truncating the destination with mode 0o755.
func copyToOld(from, to string) error {
	in, err := os.Open(from)
	if err != nil {
		return fmt.Errorf("open running binary: %w", err)
	}
	defer func() { _ = in.Close() }()

	out, err := os.OpenFile(to, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o755)
	if err != nil {
		return fmt.Errorf("open .old: %w", err)
	}
	if _, err = io.Copy(out, in); err != nil {
		_ = out.Close()
		return fmt.Errorf("copy to .old: %w", err)
	}
	if err = out.Sync(); err != nil {
		_ = out.Close()
		return err
	}
	return out.Close()
}
|
||||||
@@ -0,0 +1,73 @@
|
|||||||
|
//go:build windows
|
||||||
|
|
||||||
|
package updater
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"log/slog"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"syscall"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// helperScript is the batch file the Windows updater writes next to the
// agent binary and launches detached. It waits briefly, copies the
// running binary to .old, stops the service, polls until the service
// reports STOPPED, moves the staged binary into place, starts the service
// and finally deletes itself.
//
// It is rendered with fmt.Sprintf, args order:
//
//	%[1]s — running binary path (source for the .old copy)
//	%[2]s — .old path
//	%[3]s — staged .new path
//	%[4]s — running binary path (rename target)
//
// "%%~f0" renders as "%~f0", the script's own path.
//
// Delays use `ping -n N 127.0.0.1` (roughly N-1 seconds) instead of
// `timeout /t`: the helper is started detached with no stdin attached,
// and timeout.exe refuses to run without console input ("Input
// redirection is not supported"). With timeout the initial grace period
// would be skipped and the wait loop would spin without pausing.
const helperScript = `@echo off
ping -n 4 127.0.0.1 >nul
copy /Y "%[1]s" "%[2]s"
sc stop restic-manager-agent
:wait
sc query restic-manager-agent | find "STOPPED" >nul
if errorlevel 1 (ping -n 2 127.0.0.1 >nul & goto wait)
move /Y "%[3]s" "%[4]s"
sc start restic-manager-agent
del "%%~f0"
`
|
||||||
|
|
||||||
|
// Update on Windows can't overwrite the running .exe in-process
|
||||||
|
// (exclusive file lock), so we stage the new binary, write a small
|
||||||
|
// detached helper script that waits, stops the service, swaps the
|
||||||
|
// binary, and starts the service, then exit cleanly. SCM treats
|
||||||
|
// clean exits after sc stop as intentional and does not auto-restart;
|
||||||
|
// the helper's final sc start handles that.
|
||||||
|
func Update(ctx context.Context, serverURL string) error {
|
||||||
|
binPath, err := resolveOwnBinary()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
stage, err := fetch(ctx, serverURL, binPath)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
helperPath := filepath.Join(filepath.Dir(binPath), "agent-update.cmd")
|
||||||
|
body := fmt.Sprintf(helperScript, binPath, binPath+".old", stage, binPath)
|
||||||
|
if err := os.WriteFile(helperPath, []byte(body), 0o755); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
cmd := exec.Command("cmd.exe", "/c", helperPath)
|
||||||
|
cmd.SysProcAttr = &syscall.SysProcAttr{
|
||||||
|
HideWindow: true,
|
||||||
|
CreationFlags: 0x00000008 | 0x08000000, // DETACHED_PROCESS | CREATE_NO_WINDOW
|
||||||
|
}
|
||||||
|
if err := cmd.Start(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
slog.Info("agent self-update: helper spawned, exiting cleanly",
|
||||||
|
"binary", binPath, "helper", helperPath)
|
||||||
|
time.Sleep(200 * time.Millisecond)
|
||||||
|
os.Exit(0)
|
||||||
|
return nil // unreachable
|
||||||
|
}
|
||||||
|
|
||||||
|
// swap exists on Windows only so the package builds (UpdateForTest
// references it). The real swap is done by the helper script, so this
// stub ignores both arguments and always returns an error.
func swap(_, _ string) error {
	const reason = "updater.swap not implemented on Windows; use the helper script via Update"
	return fmt.Errorf(reason)
}
|
||||||
@@ -0,0 +1,63 @@
|
|||||||
|
package alert
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"log/slog"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/notification"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Alert kinds used by the P6 self-update flows.
const (
	// KindUpdateFailed marks a per-host alert: the agent did not come
	// back with the expected version after a command.update dispatch
	// (timeout or version mismatch). A later hello with the matching
	// version resolves it.
	KindUpdateFailed = "update_failed"

	// KindFleetUpdateHalted marks a system-scoped (host-less) alert: the
	// fleet-update worker stopped mid-run because a host failed to
	// update or went offline. An admin resolves it manually.
	KindFleetUpdateHalted = "fleet_update_halted"
)
|
||||||
|
|
||||||
|
// RaiseUpdateFailed records a per-host update failure. dedupKey is the
|
||||||
|
// hostID so a re-dispatch on the same host touches the existing alert
|
||||||
|
// rather than spawning a duplicate.
|
||||||
|
func (e *Engine) RaiseUpdateFailed(ctx context.Context, hostID, jobID, reason string, when time.Time) {
|
||||||
|
msg := fmt.Sprintf("Agent update failed (job %s): %s", jobID, reason)
|
||||||
|
e.raiseAndNotify(ctx, hostID, KindUpdateFailed, hostID, "warning", msg, when)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ResolveUpdateFailed clears any open update_failed alert for hostID.
|
||||||
|
// Called from the WS hello path when the agent reconnects with the
|
||||||
|
// target version.
|
||||||
|
func (e *Engine) ResolveUpdateFailed(ctx context.Context, hostID string, when time.Time) {
|
||||||
|
e.resolveAndNotify(ctx, hostID, KindUpdateFailed, hostID, when)
|
||||||
|
}
|
||||||
|
|
||||||
|
// RaiseFleetUpdateHalted is host-less — the fleet update is a
|
||||||
|
// system-level concept. We persist it via the dedicated host-less
|
||||||
|
// alert path so the alerts table's host_id column carries NULL.
|
||||||
|
func (e *Engine) RaiseFleetUpdateHalted(ctx context.Context, fleetUpdateID, reason string, when time.Time) {
|
||||||
|
msg := fmt.Sprintf("Fleet update %s halted: %s", fleetUpdateID, reason)
|
||||||
|
id, didRaise, err := e.store.RaiseOrTouchSystem(ctx, KindFleetUpdateHalted, fleetUpdateID, "warning", msg, when)
|
||||||
|
if err != nil {
|
||||||
|
slog.Warn("alert: raise fleet_update_halted", "fu_id", fleetUpdateID, "err", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if !didRaise {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
go e.hub.Dispatch(ctx, notification.Payload{
|
||||||
|
Event: notification.EventRaised,
|
||||||
|
AlertID: id,
|
||||||
|
Severity: "warning",
|
||||||
|
Kind: KindFleetUpdateHalted,
|
||||||
|
HostID: "",
|
||||||
|
HostName: "",
|
||||||
|
Message: msg,
|
||||||
|
RaisedAt: when,
|
||||||
|
})
|
||||||
|
}
|
||||||
@@ -63,6 +63,7 @@ const (
|
|||||||
JobUnlock JobKind = "unlock"
|
JobUnlock JobKind = "unlock"
|
||||||
JobRestore JobKind = "restore"
|
JobRestore JobKind = "restore"
|
||||||
JobDiff JobKind = "diff"
|
JobDiff JobKind = "diff"
|
||||||
|
JobUpdate JobKind = "update"
|
||||||
)
|
)
|
||||||
|
|
||||||
// JobStatus is the lifecycle state of a job.
|
// JobStatus is the lifecycle state of a job.
|
||||||
@@ -361,13 +362,14 @@ type ConfigUpdatePayload struct {
|
|||||||
BandwidthDownKBps *int `json:"bandwidth_down_kbps,omitempty"`
|
BandwidthDownKBps *int `json:"bandwidth_down_kbps,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// AgentUpdateAvailablePayload — informational only; the agent does
|
// CommandUpdatePayload carries no operational data — the agent
|
||||||
// NOT self-update. See spec.md §4.2 for the package-manager-based
|
// already knows its own os/arch and fetches from its configured
|
||||||
// update model.
|
// server URL via /agent/binary. JobID is the server-issued id of
|
||||||
type AgentUpdateAvailablePayload struct {
|
// the update job; the agent echoes it on log.stream lines so the
|
||||||
LatestVersion string `json:"latest_version"`
|
// live job log captures pre-restart progress, then either exits
|
||||||
PackageURL string `json:"package_url"` // apt repo / choco source
|
// (Linux) or hands off to a detached helper script (Windows).
|
||||||
Changelog string `json:"changelog,omitempty"`
|
type CommandUpdatePayload struct {
|
||||||
|
JobID string `json:"job_id"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// TreeListRequestPayload is the body of a tree.list RPC. Used by the
|
// TreeListRequestPayload is the body of a tree.list RPC. Used by the
|
||||||
|
|||||||
@@ -29,12 +29,12 @@ const (
|
|||||||
|
|
||||||
// Server → agent message types.
|
// Server → agent message types.
|
||||||
const (
|
const (
|
||||||
MsgCommandRun MessageType = "command.run"
|
MsgCommandRun MessageType = "command.run"
|
||||||
MsgCommandCancel MessageType = "command.cancel"
|
MsgCommandCancel MessageType = "command.cancel"
|
||||||
MsgScheduleSet MessageType = "schedule.set"
|
MsgScheduleSet MessageType = "schedule.set"
|
||||||
MsgConfigUpdate MessageType = "config.update"
|
MsgConfigUpdate MessageType = "config.update"
|
||||||
MsgAgentUpdateAvail MessageType = "agent.update.available"
|
MsgCommandUpdate MessageType = "command.update"
|
||||||
MsgTreeList MessageType = "tree.list" // sync RPC: list a snapshot's children
|
MsgTreeList MessageType = "tree.list" // sync RPC: list a snapshot's children
|
||||||
)
|
)
|
||||||
|
|
||||||
// Envelope is the framing for every WS message in either direction.
|
// Envelope is the framing for every WS message in either direction.
|
||||||
|
|||||||
@@ -0,0 +1,221 @@
|
|||||||
|
// Package fleetupdate drives a rolling, sequential agent self-update
|
||||||
|
// over a list of hosts. One worker goroutine per Start() call (gated
|
||||||
|
// at the store layer to at-most-one-running-fleet-update).
|
||||||
|
package fleetupdate
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"log/slog"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/oklog/ulid/v2"
|
||||||
|
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Hub is the minimal view of the connection hub the fleet-update worker
// needs: a yes/no answer to "is this host connected?".
type Hub interface {
	// Connected reports whether the host with the given ID is connected.
	Connected(hostID string) bool
}
|
||||||
|
|
||||||
|
// Dispatcher sends a single command.update envelope to one host. The
// implementer is also expected to create the jobs row, write the audit
// entry and register with the update watcher. Pre-checks are the
// implementer's responsibility too — the worker passes through whatever
// error it returns.
type Dispatcher interface {
	// DispatchUpdate dispatches an update to hostID on behalf of
	// actorUserID and returns the job ID, a result code and an error.
	DispatchUpdate(ctx context.Context, hostID, actorUserID string) (jobID, code string, err error)
}
|
||||||
|
|
||||||
|
// AlertRaiser is the minimal view of the alert engine the worker needs:
// its host-less raise path, used to emit fleet_update_halted on first
// failure.
type AlertRaiser interface {
	// RaiseFleetUpdateHalted raises the halted alert for one fleet update.
	RaiseFleetUpdateHalted(ctx context.Context, fleetUpdateID, reason string, when time.Time)
}
|
||||||
|
|
||||||
|
// Worker is the long-lived fleet-update orchestrator. There is at most
|
||||||
|
// one *running* fleet update at a time (enforced by the store).
|
||||||
|
type Worker struct {
|
||||||
|
store *store.Store
|
||||||
|
hub Hub
|
||||||
|
disp Dispatcher
|
||||||
|
alerts AlertRaiser
|
||||||
|
|
||||||
|
// targetVersion is the version every dispatched agent is expected
|
||||||
|
// to come back with. Captured at Start time to avoid drift.
|
||||||
|
targetVersion string
|
||||||
|
|
||||||
|
// pollPeriod controls the cadence at which the worker re-reads the
|
||||||
|
// host row to check for the version transition. Exposed for tests.
|
||||||
|
pollPeriod time.Duration
|
||||||
|
// hostTimeout bounds how long the worker waits for one host to
|
||||||
|
// reach the target version before halting.
|
||||||
|
hostTimeout time.Duration
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewWorker builds an unstarted worker. targetVersion is set on each
|
||||||
|
// Start call; the values here are defaults.
|
||||||
|
func NewWorker(st *store.Store, hub Hub, disp Dispatcher, alerts AlertRaiser) *Worker {
|
||||||
|
return &Worker{
|
||||||
|
store: st,
|
||||||
|
hub: hub,
|
||||||
|
disp: disp,
|
||||||
|
alerts: alerts,
|
||||||
|
pollPeriod: 1 * time.Second,
|
||||||
|
hostTimeout: 95 * time.Second,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Start creates the parent + child rows, then spawns the per-host
|
||||||
|
// worker goroutine. Returns the new fleet_update_id on success.
|
||||||
|
// store.ErrFleetUpdateRunning bubbles up unchanged.
|
||||||
|
func (w *Worker) Start(ctx context.Context, userID, targetVersion string, hostIDs []string) (string, error) {
|
||||||
|
if userID == "" || targetVersion == "" {
|
||||||
|
return "", errors.New("fleetupdate: userID and targetVersion required")
|
||||||
|
}
|
||||||
|
if len(hostIDs) == 0 {
|
||||||
|
return "", errors.New("fleetupdate: at least one host required")
|
||||||
|
}
|
||||||
|
fuID := ulid.Make().String()
|
||||||
|
now := time.Now().UTC()
|
||||||
|
if err := w.store.CreateFleetUpdate(ctx, store.FleetUpdate{
|
||||||
|
ID: fuID,
|
||||||
|
StartedAt: now,
|
||||||
|
StartedByUserID: userID,
|
||||||
|
TargetVersion: targetVersion,
|
||||||
|
Status: "running",
|
||||||
|
}, hostIDs); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
|
||||||
|
// The goroutine outlives the request that started it; carry a
|
||||||
|
// detached context so an HTTP-handler ctx cancel doesn't abort
|
||||||
|
// the long roll.
|
||||||
|
bg := context.WithoutCancel(ctx)
|
||||||
|
go w.run(bg, fuID, userID, targetVersion)
|
||||||
|
return fuID, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Cancel marks the fleet update cancelled. The running goroutine
|
||||||
|
// observes the new status on its next pre-check and exits without
|
||||||
|
// dispatching further hosts. The currently-dispatched job is left to
|
||||||
|
// finish on its own — cancelling agent-side is out of scope for v1.
|
||||||
|
func (w *Worker) Cancel(ctx context.Context, fuID string) error {
|
||||||
|
return w.store.CancelFleetUpdate(ctx, fuID, time.Now().UTC())
|
||||||
|
}
|
||||||
|
|
||||||
|
// run is the per-host loop. Halts on first failure; emits one alert
|
||||||
|
// on transition.
|
||||||
|
func (w *Worker) run(ctx context.Context, fuID, userID, targetVersion string) {
|
||||||
|
w.targetVersion = targetVersion
|
||||||
|
|
||||||
|
for {
|
||||||
|
// Check the parent row's status — picks up Cancel.
|
||||||
|
fu, err := w.store.ActiveFleetUpdate(ctx)
|
||||||
|
if err != nil {
|
||||||
|
slog.Warn("fleetupdate: read active", "fu_id", fuID, "err", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if fu == nil || fu.ID != fuID {
|
||||||
|
// Cancelled, halted, or completed externally. Done.
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
pending, err := w.store.ListPendingFleetUpdateHosts(ctx, fuID)
|
||||||
|
if err != nil {
|
||||||
|
slog.Warn("fleetupdate: list pending", "fu_id", fuID, "err", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if len(pending) == 0 {
|
||||||
|
now := time.Now().UTC()
|
||||||
|
if err := w.store.CompleteFleetUpdate(ctx, fuID, now); err != nil {
|
||||||
|
slog.Warn("fleetupdate: complete", "fu_id", fuID, "err", err)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
next := pending[0]
|
||||||
|
w.processHost(ctx, fuID, userID, next)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// processHost handles one host slot. Marks it skipped, succeeded, or
|
||||||
|
// failed (and halts the fleet on failure).
|
||||||
|
func (w *Worker) processHost(ctx context.Context, fuID, userID string, slot store.FleetUpdateHost) {
|
||||||
|
hostID := slot.HostID
|
||||||
|
_ = w.store.SetFleetUpdateCurrentHost(ctx, fuID, hostID)
|
||||||
|
|
||||||
|
// Pre-flight: re-read the host. The dispatch path repeats most of
|
||||||
|
// these checks but doing them up-front lets us emit the right
|
||||||
|
// per-host status (skipped vs failed) without consuming a job row.
|
||||||
|
host, err := w.store.GetHost(ctx, hostID)
|
||||||
|
if err != nil || host == nil {
|
||||||
|
_ = w.store.SetFleetUpdateHostStatus(ctx, fuID, hostID, "skipped", "host not found", "")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if host.AgentVersion != "" && host.AgentVersion == w.targetVersion {
|
||||||
|
_ = w.store.SetFleetUpdateHostStatus(ctx, fuID, hostID, "skipped", "already at target version", "")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if !w.hub.Connected(hostID) {
|
||||||
|
reason := fmt.Sprintf("host went offline: %s", hostID)
|
||||||
|
_ = w.store.SetFleetUpdateHostStatus(ctx, fuID, hostID, "failed", reason, "")
|
||||||
|
w.halt(ctx, fuID, reason)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Dispatch.
|
||||||
|
_ = w.store.SetFleetUpdateHostStatus(ctx, fuID, hostID, "running", "", "")
|
||||||
|
jobID, code, err := w.disp.DispatchUpdate(ctx, hostID, userID)
|
||||||
|
if err != nil || code != "" {
|
||||||
|
reason := dispatchErrorReason(code, err)
|
||||||
|
_ = w.store.SetFleetUpdateHostStatus(ctx, fuID, hostID, "failed", reason, jobID)
|
||||||
|
w.halt(ctx, fuID, reason)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Poll until the host's recorded agent_version matches target, or
|
||||||
|
// timeout.
|
||||||
|
deadline := time.Now().Add(w.hostTimeout)
|
||||||
|
for time.Now().Before(deadline) {
|
||||||
|
// Honour cancellation between polls.
|
||||||
|
fu, err := w.store.ActiveFleetUpdate(ctx)
|
||||||
|
if err == nil && (fu == nil || fu.ID != fuID) {
|
||||||
|
// Cancelled mid-host; leave the slot in 'running' for the
|
||||||
|
// admin to inspect. No further dispatches.
|
||||||
|
return
|
||||||
|
}
|
||||||
|
time.Sleep(w.pollPeriod)
|
||||||
|
h, err := w.store.GetHost(ctx, hostID)
|
||||||
|
if err == nil && h != nil && h.AgentVersion == w.targetVersion {
|
||||||
|
if err := w.store.SetFleetUpdateHostStatus(ctx, fuID, hostID, "succeeded", "", jobID); err != nil {
|
||||||
|
slog.Warn("fleetupdate: set succeeded", "fu_id", fuID, "host_id", hostID, "err", err)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
reason := fmt.Sprintf("timeout waiting for %s to reach %s", hostID, w.targetVersion)
|
||||||
|
_ = w.store.SetFleetUpdateHostStatus(ctx, fuID, hostID, "failed", reason, jobID)
|
||||||
|
w.halt(ctx, fuID, reason)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *Worker) halt(ctx context.Context, fuID, reason string) {
|
||||||
|
now := time.Now().UTC()
|
||||||
|
if err := w.store.HaltFleetUpdate(ctx, fuID, reason, now); err != nil {
|
||||||
|
slog.Warn("fleetupdate: halt", "fu_id", fuID, "err", err)
|
||||||
|
}
|
||||||
|
if w.alerts != nil {
|
||||||
|
w.alerts.RaiseFleetUpdateHalted(ctx, fuID, reason, now)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// dispatchErrorReason turns the (code, err) pair returned by a failed
// dispatch into the human-readable reason stored on the host slot and
// on the halted fleet update.
//
//   - code only:  "dispatch failed: <code>"
//   - err only:   err.Error()
//   - both:       "dispatch failed: <code>: <err>" — the underlying error
//     is kept instead of being dropped in favour of the code
//   - neither:    "dispatch failed"
func dispatchErrorReason(code string, err error) string {
	const prefix = "dispatch failed"
	switch {
	case code != "" && err != nil:
		return prefix + ": " + code + ": " + err.Error()
	case code != "":
		return prefix + ": " + code
	case err != nil:
		return err.Error()
	default:
		return prefix
	}
}
|
||||||
@@ -0,0 +1,344 @@
|
|||||||
|
package fleetupdate
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"errors"
|
||||||
|
"path/filepath"
|
||||||
|
"sync"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/oklog/ulid/v2"
|
||||||
|
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||||
|
)
|
||||||
|
|
||||||
|
// fakeHub is the test stand-in for the worker's Hub: connectivity is
// whatever the online map says. mu guards the map.
type fakeHub struct {
	mu     sync.Mutex
	online map[string]bool // host ID → connected
}
|
||||||
|
|
||||||
|
func (f *fakeHub) Connected(hostID string) bool {
|
||||||
|
f.mu.Lock()
|
||||||
|
defer f.mu.Unlock()
|
||||||
|
return f.online[hostID]
|
||||||
|
}
|
||||||
|
|
||||||
|
type fakeDispatcher struct {
|
||||||
|
mu sync.Mutex
|
||||||
|
calls []string // host IDs
|
||||||
|
// after dispatch, set the host's agent_version to this on the
|
||||||
|
// store so the worker observes the version transition.
|
||||||
|
st *store.Store
|
||||||
|
target string
|
||||||
|
delayMS int
|
||||||
|
failOnHost map[string]string // host → error code
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f *fakeDispatcher) DispatchUpdate(ctx context.Context, hostID, _ string) (string, string, error) {
|
||||||
|
f.mu.Lock()
|
||||||
|
f.calls = append(f.calls, hostID)
|
||||||
|
if code, ok := f.failOnHost[hostID]; ok {
|
||||||
|
f.mu.Unlock()
|
||||||
|
return "", code, nil
|
||||||
|
}
|
||||||
|
st := f.st
|
||||||
|
target := f.target
|
||||||
|
delay := f.delayMS
|
||||||
|
f.mu.Unlock()
|
||||||
|
|
||||||
|
jobID := ulid.Make().String()
|
||||||
|
if st != nil {
|
||||||
|
_ = st.CreateJob(context.Background(), store.Job{
|
||||||
|
ID: jobID, HostID: hostID, Kind: "update",
|
||||||
|
ActorKind: "user", CreatedAt: time.Now().UTC(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
if st != nil && target != "" {
|
||||||
|
go func() {
|
||||||
|
if delay > 0 {
|
||||||
|
time.Sleep(time.Duration(delay) * time.Millisecond)
|
||||||
|
}
|
||||||
|
_ = st.MarkHostHello(context.Background(), hostID, target, "0.17", api.CurrentProtocolVersion, time.Now().UTC())
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
return jobID, "", nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// recAlert is a recording AlertRaiser: it keeps the reason of every
// halt alert it is asked to raise. mu guards reasons.
type recAlert struct {
	mu      sync.Mutex
	reasons []string
}
|
||||||
|
|
||||||
|
func (r *recAlert) RaiseFleetUpdateHalted(_ context.Context, _ string, reason string, _ time.Time) {
|
||||||
|
r.mu.Lock()
|
||||||
|
r.reasons = append(r.reasons, reason)
|
||||||
|
r.mu.Unlock()
|
||||||
|
}
|
||||||
|
|
||||||
|
func openStore(t *testing.T) *store.Store {
|
||||||
|
t.Helper()
|
||||||
|
dir := t.TempDir()
|
||||||
|
st, err := store.Open(context.Background(), filepath.Join(dir, "rm.db"))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("open: %v", err)
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { _ = st.Close() })
|
||||||
|
return st
|
||||||
|
}
|
||||||
|
|
||||||
|
func mustCreateAdmin(t *testing.T, st *store.Store) string {
|
||||||
|
t.Helper()
|
||||||
|
uid := ulid.Make().String()
|
||||||
|
if err := st.CreateUser(context.Background(), store.User{
|
||||||
|
ID: uid, Username: "u-" + uid[:6],
|
||||||
|
PasswordHash: "x", Role: store.RoleAdmin, CreatedAt: time.Now().UTC(),
|
||||||
|
}); err != nil {
|
||||||
|
t.Fatalf("user: %v", err)
|
||||||
|
}
|
||||||
|
return uid
|
||||||
|
}
|
||||||
|
|
||||||
|
func mustCreateHost(t *testing.T, st *store.Store, name, version string) string {
|
||||||
|
t.Helper()
|
||||||
|
hostID := ulid.Make().String()
|
||||||
|
if err := st.CreateHost(context.Background(), store.Host{
|
||||||
|
ID: hostID, Name: name, OS: "linux", Arch: "amd64",
|
||||||
|
EnrolledAt: time.Now().UTC(),
|
||||||
|
}, "deadbeef-"+hostID, ""); err != nil {
|
||||||
|
t.Fatalf("host: %v", err)
|
||||||
|
}
|
||||||
|
if version != "" {
|
||||||
|
if err := st.MarkHostHello(context.Background(), hostID, version, "0.17", api.CurrentProtocolVersion, time.Now().UTC()); err != nil {
|
||||||
|
t.Fatalf("hello: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return hostID
|
||||||
|
}
|
||||||
|
|
||||||
|
func waitForStatus(t *testing.T, st *store.Store, fuID, want string, timeout time.Duration) *store.FleetUpdate {
|
||||||
|
t.Helper()
|
||||||
|
deadline := time.Now().Add(timeout)
|
||||||
|
for time.Now().Before(deadline) {
|
||||||
|
fu, _, err := st.GetFleetUpdate(context.Background(), fuID)
|
||||||
|
if err == nil && fu != nil && fu.Status == want {
|
||||||
|
return fu
|
||||||
|
}
|
||||||
|
time.Sleep(20 * time.Millisecond)
|
||||||
|
}
|
||||||
|
t.Fatalf("status never reached %q", want)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestWorkerTwoHostsBothSucceed(t *testing.T) {
|
||||||
|
st := openStore(t)
|
||||||
|
uid := mustCreateAdmin(t, st)
|
||||||
|
h1 := mustCreateHost(t, st, "h1", "v0")
|
||||||
|
h2 := mustCreateHost(t, st, "h2", "v0")
|
||||||
|
|
||||||
|
hub := &fakeHub{online: map[string]bool{h1: true, h2: true}}
|
||||||
|
disp := &fakeDispatcher{st: st, target: "v2", delayMS: 30}
|
||||||
|
alerts := &recAlert{}
|
||||||
|
w := NewWorker(st, hub, disp, alerts)
|
||||||
|
w.pollPeriod = 20 * time.Millisecond
|
||||||
|
w.hostTimeout = 2 * time.Second
|
||||||
|
|
||||||
|
fuID, err := w.Start(context.Background(), uid, "v2", []string{h1, h2})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("start: %v", err)
|
||||||
|
}
|
||||||
|
waitForStatus(t, st, fuID, "completed", 5*time.Second)
|
||||||
|
_, hosts, _ := st.GetFleetUpdate(context.Background(), fuID)
|
||||||
|
for _, h := range hosts {
|
||||||
|
if h.Status != "succeeded" {
|
||||||
|
t.Errorf("host %s status %q want succeeded", h.HostID, h.Status)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if n := len(alerts.reasons); n != 0 {
|
||||||
|
t.Errorf("unexpected halt alert: %v", alerts.reasons)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestWorkerSecondHostTimesOutHalts(t *testing.T) {
|
||||||
|
st := openStore(t)
|
||||||
|
uid := mustCreateAdmin(t, st)
|
||||||
|
h1 := mustCreateHost(t, st, "h1", "v0")
|
||||||
|
h2 := mustCreateHost(t, st, "h2", "v0")
|
||||||
|
h3 := mustCreateHost(t, st, "h3", "v0")
|
||||||
|
|
||||||
|
hub := &fakeHub{online: map[string]bool{h1: true, h2: true, h3: true}}
|
||||||
|
// h1 dispatches normally (transitions to v2). h2 dispatch returns
|
||||||
|
// success but never transitions.
|
||||||
|
disp := &fakeDispatcher{st: st, target: "v2", delayMS: 20, failOnHost: map[string]string{
|
||||||
|
h2: "", // not a code-failure; simulate by clearing target on this disp run
|
||||||
|
}}
|
||||||
|
// Actually: drop h2 from the auto-transition by faking with a
|
||||||
|
// per-host store setter. Easiest: subclass via a wrapper.
|
||||||
|
_ = disp
|
||||||
|
customDisp := &perHostDispatcher{base: disp, st: st, target: "v2", noTransition: map[string]bool{h2: true}}
|
||||||
|
|
||||||
|
alerts := &recAlert{}
|
||||||
|
w := NewWorker(st, hub, customDisp, alerts)
|
||||||
|
w.pollPeriod = 20 * time.Millisecond
|
||||||
|
w.hostTimeout = 200 * time.Millisecond
|
||||||
|
|
||||||
|
fuID, err := w.Start(context.Background(), uid, "v2", []string{h1, h2, h3})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("start: %v", err)
|
||||||
|
}
|
||||||
|
waitForStatus(t, st, fuID, "halted", 3*time.Second)
|
||||||
|
_, hosts, _ := st.GetFleetUpdate(context.Background(), fuID)
|
||||||
|
gotStatus := map[string]string{}
|
||||||
|
for _, h := range hosts {
|
||||||
|
gotStatus[h.HostID] = h.Status
|
||||||
|
}
|
||||||
|
if gotStatus[h1] != "succeeded" {
|
||||||
|
t.Errorf("h1: %q", gotStatus[h1])
|
||||||
|
}
|
||||||
|
if gotStatus[h2] != "failed" {
|
||||||
|
t.Errorf("h2: %q", gotStatus[h2])
|
||||||
|
}
|
||||||
|
if gotStatus[h3] != "pending" {
|
||||||
|
t.Errorf("h3: %q", gotStatus[h3])
|
||||||
|
}
|
||||||
|
alerts.mu.Lock()
|
||||||
|
defer alerts.mu.Unlock()
|
||||||
|
if len(alerts.reasons) != 1 {
|
||||||
|
t.Errorf("alert reasons: %v", alerts.reasons)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// perHostDispatcher lets a test omit the auto-transition for selected
|
||||||
|
// hosts so we can simulate timeout.
|
||||||
|
type perHostDispatcher struct {
|
||||||
|
mu sync.Mutex
|
||||||
|
base *fakeDispatcher
|
||||||
|
st *store.Store
|
||||||
|
target string
|
||||||
|
noTransition map[string]bool
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p *perHostDispatcher) DispatchUpdate(_ context.Context, hostID, _ string) (string, string, error) {
|
||||||
|
p.mu.Lock()
|
||||||
|
skip := p.noTransition[hostID]
|
||||||
|
p.mu.Unlock()
|
||||||
|
jobID := ulid.Make().String()
|
||||||
|
_ = p.st.CreateJob(context.Background(), store.Job{
|
||||||
|
ID: jobID, HostID: hostID, Kind: "update",
|
||||||
|
ActorKind: "user", CreatedAt: time.Now().UTC(),
|
||||||
|
})
|
||||||
|
if !skip {
|
||||||
|
go func() {
|
||||||
|
time.Sleep(20 * time.Millisecond)
|
||||||
|
_ = p.st.MarkHostHello(context.Background(), hostID, p.target, "0.17", api.CurrentProtocolVersion, time.Now().UTC())
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
return jobID, "", nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestWorkerHostOfflineHalts(t *testing.T) {
|
||||||
|
st := openStore(t)
|
||||||
|
uid := mustCreateAdmin(t, st)
|
||||||
|
h1 := mustCreateHost(t, st, "h1", "v0")
|
||||||
|
h2 := mustCreateHost(t, st, "h2", "v0")
|
||||||
|
hub := &fakeHub{online: map[string]bool{h1: false, h2: true}}
|
||||||
|
disp := &fakeDispatcher{st: st, target: "v2"}
|
||||||
|
alerts := &recAlert{}
|
||||||
|
w := NewWorker(st, hub, disp, alerts)
|
||||||
|
w.pollPeriod = 20 * time.Millisecond
|
||||||
|
w.hostTimeout = 500 * time.Millisecond
|
||||||
|
|
||||||
|
fuID, err := w.Start(context.Background(), uid, "v2", []string{h1, h2})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("start: %v", err)
|
||||||
|
}
|
||||||
|
waitForStatus(t, st, fuID, "halted", 2*time.Second)
|
||||||
|
_, hosts, _ := st.GetFleetUpdate(context.Background(), fuID)
|
||||||
|
if hosts[0].Status != "failed" {
|
||||||
|
t.Errorf("h1 status: %q", hosts[0].Status)
|
||||||
|
}
|
||||||
|
if hosts[1].Status != "pending" {
|
||||||
|
t.Errorf("h2 status: %q", hosts[1].Status)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestWorkerAlreadyAtTargetSkipped(t *testing.T) {
|
||||||
|
st := openStore(t)
|
||||||
|
uid := mustCreateAdmin(t, st)
|
||||||
|
h1 := mustCreateHost(t, st, "h1", "v2")
|
||||||
|
h2 := mustCreateHost(t, st, "h2", "v0")
|
||||||
|
hub := &fakeHub{online: map[string]bool{h1: true, h2: true}}
|
||||||
|
disp := &fakeDispatcher{st: st, target: "v2", delayMS: 20}
|
||||||
|
alerts := &recAlert{}
|
||||||
|
w := NewWorker(st, hub, disp, alerts)
|
||||||
|
w.pollPeriod = 20 * time.Millisecond
|
||||||
|
w.hostTimeout = 2 * time.Second
|
||||||
|
|
||||||
|
fuID, err := w.Start(context.Background(), uid, "v2", []string{h1, h2})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("start: %v", err)
|
||||||
|
}
|
||||||
|
waitForStatus(t, st, fuID, "completed", 4*time.Second)
|
||||||
|
_, hosts, _ := st.GetFleetUpdate(context.Background(), fuID)
|
||||||
|
want := map[string]string{h1: "skipped", h2: "succeeded"}
|
||||||
|
for _, h := range hosts {
|
||||||
|
if h.Status != want[h.HostID] {
|
||||||
|
t.Errorf("host %s: got %q want %q", h.HostID, h.Status, want[h.HostID])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestWorkerCancelMidRun(t *testing.T) {
|
||||||
|
st := openStore(t)
|
||||||
|
uid := mustCreateAdmin(t, st)
|
||||||
|
h1 := mustCreateHost(t, st, "h1", "v0")
|
||||||
|
h2 := mustCreateHost(t, st, "h2", "v0")
|
||||||
|
hub := &fakeHub{online: map[string]bool{h1: true, h2: true}}
|
||||||
|
// h1's transition is delayed long enough that we can cancel
|
||||||
|
// before it lands; h2 should never be touched.
|
||||||
|
disp := &fakeDispatcher{st: st, target: "v2", delayMS: 500}
|
||||||
|
alerts := &recAlert{}
|
||||||
|
w := NewWorker(st, hub, disp, alerts)
|
||||||
|
w.pollPeriod = 50 * time.Millisecond
|
||||||
|
w.hostTimeout = 5 * time.Second
|
||||||
|
|
||||||
|
fuID, err := w.Start(context.Background(), uid, "v2", []string{h1, h2})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("start: %v", err)
|
||||||
|
}
|
||||||
|
// Give the worker a moment to dispatch h1.
|
||||||
|
time.Sleep(100 * time.Millisecond)
|
||||||
|
if err := w.Cancel(context.Background(), fuID); err != nil {
|
||||||
|
t.Fatalf("cancel: %v", err)
|
||||||
|
}
|
||||||
|
waitForStatus(t, st, fuID, "cancelled", 2*time.Second)
|
||||||
|
|
||||||
|
// h2 should never be dispatched.
|
||||||
|
disp.mu.Lock()
|
||||||
|
defer disp.mu.Unlock()
|
||||||
|
for _, c := range disp.calls {
|
||||||
|
if c == h2 {
|
||||||
|
t.Errorf("h2 dispatched after cancel")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestWorkerStartWhileActiveErrors(t *testing.T) {
|
||||||
|
st := openStore(t)
|
||||||
|
uid := mustCreateAdmin(t, st)
|
||||||
|
h1 := mustCreateHost(t, st, "h1", "v0")
|
||||||
|
h2 := mustCreateHost(t, st, "h2", "v0")
|
||||||
|
hub := &fakeHub{online: map[string]bool{h1: true, h2: true}}
|
||||||
|
disp := &fakeDispatcher{st: st, target: "v2", delayMS: 5_000}
|
||||||
|
w := NewWorker(st, hub, disp, &recAlert{})
|
||||||
|
w.pollPeriod = 50 * time.Millisecond
|
||||||
|
w.hostTimeout = 2 * time.Second
|
||||||
|
if _, err := w.Start(context.Background(), uid, "v2", []string{h1}); err != nil {
|
||||||
|
t.Fatalf("first start: %v", err)
|
||||||
|
}
|
||||||
|
_, err := w.Start(context.Background(), uid, "v2", []string{h2})
|
||||||
|
if !errors.Is(err, store.ErrFleetUpdateRunning) {
|
||||||
|
t.Fatalf("err: %v want ErrFleetUpdateRunning", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -11,6 +11,7 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
|
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
|
||||||
)
|
)
|
||||||
|
|
||||||
func makeFilterHosts() []store.Host {
|
func makeFilterHosts() []store.Host {
|
||||||
@@ -98,6 +99,23 @@ func TestSortDashboardHostsColumns(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestFilterAndSortDashboardUpdatesBehind: ?updates=behind narrows
|
||||||
|
// to hosts whose agent_version is non-empty AND != server's version.
|
||||||
|
func TestFilterAndSortDashboardUpdatesBehind(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
hosts := []store.Host{
|
||||||
|
{ID: "01a", Name: "alpha", AgentVersion: "v0.0.1", Status: "online"},
|
||||||
|
{ID: "01b", Name: "bravo", AgentVersion: version.Version, Status: "online"},
|
||||||
|
{ID: "01c", Name: "charlie", AgentVersion: "", Status: "online"}, // never seen
|
||||||
|
{ID: "01d", Name: "delta", AgentVersion: "v0.0.1", Status: "offline"},
|
||||||
|
}
|
||||||
|
got := filterAndSortDashboardHosts(hosts, dashboardFilter{Updates: "behind", Sort: "name", Dir: "asc"})
|
||||||
|
// alpha + delta both behind; bravo (current) and charlie (empty) excluded.
|
||||||
|
if len(got) != 2 || got[0].Name != "alpha" || got[1].Name != "delta" {
|
||||||
|
t.Errorf("updates=behind: got %v", namesOf(got))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// TestParseDashboardFilterDefaults: empty query gives sort=name asc.
|
// TestParseDashboardFilterDefaults: empty query gives sort=name asc.
|
||||||
func TestParseDashboardFilterDefaults(t *testing.T) {
|
func TestParseDashboardFilterDefaults(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|||||||
@@ -0,0 +1,379 @@
|
|||||||
|
// fleet_update.go — admin-only fleet rolling-update endpoints + page.
|
||||||
|
//
|
||||||
|
// Surface:
|
||||||
|
// - POST /api/fleet/update → starts a fleet update (JSON)
|
||||||
|
// - POST /api/fleet-updates/{id}/cancel
|
||||||
|
// - GET /api/fleet-updates/{id} → JSON parent + per-host array
|
||||||
|
// - GET /settings/fleet-update → admin UI page
|
||||||
|
// - GET /settings/fleet-update/partial → htmx polling fragment
|
||||||
|
//
|
||||||
|
// All routes are mounted in the admin band (see routes()).
|
||||||
|
package http
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
|
"log/slog"
|
||||||
|
stdhttp "net/http"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/go-chi/chi/v5"
|
||||||
|
"github.com/oklog/ulid/v2"
|
||||||
|
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
|
||||||
|
)
|
||||||
|
|
||||||
|
// fleetUpdateStartReq is the request body accepted by
// POST /api/fleet/update. Every field may be omitted; the handler then
// fills in a default.
type fleetUpdateStartReq struct {
	// TargetVersion is the agent version to roll to. Empty means the
	// server's own version.
	TargetVersion string `json:"target_version,omitempty"`
	// HostIDs are the hosts to update. Empty means every online host
	// that is not on the target version.
	HostIDs []string `json:"host_ids,omitempty"`
}
|
||||||
|
|
||||||
|
// fleetUpdateHostView is the JSON shape of a single host slot in the
// GET /api/fleet-updates/{id} response. The host name is looked up
// server-side so clients do not need an extra request per host.
type fleetUpdateHostView struct {
	// Identity.
	HostID   string `json:"host_id"`
	HostName string `json:"host_name,omitempty"`

	// Place in the roll and outcome so far.
	Position     int    `json:"position"`
	Status       string `json:"status"`
	JobID        string `json:"job_id,omitempty"`
	FailedReason string `json:"failed_reason,omitempty"`
}
|
||||||
|
|
||||||
|
// fleetUpdateView is the JSON projection of the parent + children.
|
||||||
|
type fleetUpdateView struct {
|
||||||
|
ID string `json:"id"`
|
||||||
|
StartedAt string `json:"started_at"`
|
||||||
|
StartedByUserID string `json:"started_by_user_id"`
|
||||||
|
TargetVersion string `json:"target_version"`
|
||||||
|
Status string `json:"status"`
|
||||||
|
CurrentHostID string `json:"current_host_id,omitempty"`
|
||||||
|
HaltedReason string `json:"halted_reason,omitempty"`
|
||||||
|
CompletedAt *string `json:"completed_at,omitempty"`
|
||||||
|
Hosts []fleetUpdateHostView `json:"hosts"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// fleetUpdatePage backs both the full /settings/fleet-update page
|
||||||
|
// and the partial polled fragment. Idle / Active are mutually
|
||||||
|
// exclusive: if Active is non-nil, render the progress view.
|
||||||
|
type fleetUpdatePage struct {
|
||||||
|
// Idle-state fields.
|
||||||
|
OutOfDateHosts []store.Host // online hosts whose version != target
|
||||||
|
TargetVersion string
|
||||||
|
|
||||||
|
// Active-state fields. Nil when no fleet update has ever run.
|
||||||
|
Active *store.FleetUpdate
|
||||||
|
ActiveRows []fleetUpdateHostView
|
||||||
|
|
||||||
|
// Common.
|
||||||
|
HostNames map[string]string
|
||||||
|
// PollURL is the partial endpoint htmx polls every few seconds.
|
||||||
|
PollURL string
|
||||||
|
}
|
||||||
|
|
||||||
|
// handleAPIFleetUpdateStart is POST /api/fleet/update.
|
||||||
|
func (s *Server) handleAPIFleetUpdateStart(w stdhttp.ResponseWriter, r *stdhttp.Request) {
|
||||||
|
user, ok := s.requireUser(r)
|
||||||
|
if !ok {
|
||||||
|
writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorised", "")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if s.deps.FleetWorker == nil {
|
||||||
|
writeJSONError(w, stdhttp.StatusServiceUnavailable, "fleet_worker_unavailable", "")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var body fleetUpdateStartReq
|
||||||
|
// Empty body is fine — both fields are optional.
|
||||||
|
if r.ContentLength != 0 {
|
||||||
|
if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
|
||||||
|
writeJSONError(w, stdhttp.StatusBadRequest, "invalid_json", err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
target := body.TargetVersion
|
||||||
|
if target == "" {
|
||||||
|
target = version.Version
|
||||||
|
}
|
||||||
|
hostIDs := body.HostIDs
|
||||||
|
if len(hostIDs) == 0 {
|
||||||
|
derived, err := s.deriveOutOfDateOnlineHostIDs(r.Context(), target)
|
||||||
|
if err != nil {
|
||||||
|
writeJSONError(w, stdhttp.StatusInternalServerError, "internal", err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
hostIDs = derived
|
||||||
|
}
|
||||||
|
if len(hostIDs) == 0 {
|
||||||
|
writeJSONError(w, stdhttp.StatusConflict, "no_hosts_eligible",
|
||||||
|
"no online hosts are out of date")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
fuID, err := s.deps.FleetWorker.Start(r.Context(), user.ID, target, hostIDs)
|
||||||
|
if err != nil {
|
||||||
|
if errors.Is(err, store.ErrFleetUpdateRunning) {
|
||||||
|
writeJSONError(w, stdhttp.StatusConflict, "fleet_update_in_progress", err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
writeJSONError(w, stdhttp.StatusInternalServerError, "internal", err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
auditPayload, _ := json.Marshal(map[string]any{
|
||||||
|
"fleet_update_id": fuID,
|
||||||
|
"target_version": target,
|
||||||
|
"host_count": len(hostIDs),
|
||||||
|
})
|
||||||
|
_ = s.deps.Store.AppendAudit(r.Context(), store.AuditEntry{
|
||||||
|
ID: ulid.Make().String(), UserID: &user.ID, Actor: "user",
|
||||||
|
Action: "fleet.update_started",
|
||||||
|
TargetKind: ptr("fleet_update"), TargetID: &fuID,
|
||||||
|
TS: time.Now().UTC(),
|
||||||
|
Payload: auditPayload,
|
||||||
|
})
|
||||||
|
|
||||||
|
writeJSON(w, stdhttp.StatusAccepted, map[string]string{"fleet_update_id": fuID})
|
||||||
|
}
|
||||||
|
|
||||||
|
// handleAPIFleetUpdateCancel is POST /api/fleet-updates/{id}/cancel.
|
||||||
|
func (s *Server) handleAPIFleetUpdateCancel(w stdhttp.ResponseWriter, r *stdhttp.Request) {
|
||||||
|
user, ok := s.requireUser(r)
|
||||||
|
if !ok {
|
||||||
|
writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorised", "")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if s.deps.FleetWorker == nil {
|
||||||
|
writeJSONError(w, stdhttp.StatusServiceUnavailable, "fleet_worker_unavailable", "")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
fuID := chi.URLParam(r, "id")
|
||||||
|
if fuID == "" {
|
||||||
|
writeJSONError(w, stdhttp.StatusBadRequest, "missing_id", "")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
fu, _, err := s.deps.Store.GetFleetUpdate(r.Context(), fuID)
|
||||||
|
if err != nil {
|
||||||
|
if errors.Is(err, store.ErrNotFound) {
|
||||||
|
writeJSONError(w, stdhttp.StatusNotFound, "fleet_update_not_found", "")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
writeJSONError(w, stdhttp.StatusInternalServerError, "internal", err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if fu.Status != "running" {
|
||||||
|
writeJSONError(w, stdhttp.StatusConflict, "fleet_update_not_running",
|
||||||
|
"fleet update is not in the running state")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := s.deps.FleetWorker.Cancel(r.Context(), fuID); err != nil {
|
||||||
|
writeJSONError(w, stdhttp.StatusInternalServerError, "internal", err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
_ = s.deps.Store.AppendAudit(r.Context(), store.AuditEntry{
|
||||||
|
ID: ulid.Make().String(), UserID: &user.ID, Actor: "user",
|
||||||
|
Action: "fleet.update_cancelled",
|
||||||
|
TargetKind: ptr("fleet_update"), TargetID: &fuID,
|
||||||
|
TS: time.Now().UTC(),
|
||||||
|
})
|
||||||
|
w.WriteHeader(stdhttp.StatusNoContent)
|
||||||
|
}
|
||||||
|
|
||||||
|
// handleAPIFleetUpdateGet is GET /api/fleet-updates/{id}.
|
||||||
|
func (s *Server) handleAPIFleetUpdateGet(w stdhttp.ResponseWriter, r *stdhttp.Request) {
|
||||||
|
if _, ok := s.requireUser(r); !ok {
|
||||||
|
writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorised", "")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
fuID := chi.URLParam(r, "id")
|
||||||
|
fu, hosts, err := s.deps.Store.GetFleetUpdate(r.Context(), fuID)
|
||||||
|
if err != nil {
|
||||||
|
if errors.Is(err, store.ErrNotFound) {
|
||||||
|
writeJSONError(w, stdhttp.StatusNotFound, "fleet_update_not_found", "")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
writeJSONError(w, stdhttp.StatusInternalServerError, "internal", err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
names := s.hostNameMap(r)
|
||||||
|
view := fleetUpdateView{
|
||||||
|
ID: fu.ID,
|
||||||
|
StartedAt: fu.StartedAt.UTC().Format(time.RFC3339Nano),
|
||||||
|
StartedByUserID: fu.StartedByUserID,
|
||||||
|
TargetVersion: fu.TargetVersion,
|
||||||
|
Status: fu.Status,
|
||||||
|
CurrentHostID: fu.CurrentHostID,
|
||||||
|
HaltedReason: fu.HaltedReason,
|
||||||
|
Hosts: make([]fleetUpdateHostView, 0, len(hosts)),
|
||||||
|
}
|
||||||
|
if fu.CompletedAt != nil {
|
||||||
|
s := fu.CompletedAt.UTC().Format(time.RFC3339Nano)
|
||||||
|
view.CompletedAt = &s
|
||||||
|
}
|
||||||
|
for _, h := range hosts {
|
||||||
|
view.Hosts = append(view.Hosts, fleetUpdateHostView{
|
||||||
|
HostID: h.HostID,
|
||||||
|
HostName: names[h.HostID],
|
||||||
|
Position: h.Position,
|
||||||
|
Status: h.Status,
|
||||||
|
JobID: h.JobID,
|
||||||
|
FailedReason: h.FailedReason,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
writeJSON(w, stdhttp.StatusOK, view)
|
||||||
|
}
|
||||||
|
|
||||||
|
// handleUIFleetUpdate renders /settings/fleet-update.
|
||||||
|
func (s *Server) handleUIFleetUpdate(w stdhttp.ResponseWriter, r *stdhttp.Request) {
|
||||||
|
u := s.requireUIUser(w, r)
|
||||||
|
if u == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
page, err := s.buildFleetUpdatePage(r)
|
||||||
|
if err != nil {
|
||||||
|
slog.Error("ui fleet update: build page", "err", err)
|
||||||
|
stdhttp.Error(w, "internal", stdhttp.StatusInternalServerError)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
view := s.baseView(r, u)
|
||||||
|
view.Title = "Fleet update · restic-manager"
|
||||||
|
view.Active = "settings"
|
||||||
|
view.Page = page
|
||||||
|
if err := s.deps.UI.Render(w, "fleet_update", view); err != nil {
|
||||||
|
slog.Error("ui fleet update: render", "err", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// handleUIFleetUpdatePartial renders just the inner panel for htmx
|
||||||
|
// auto-refresh polling — same data, no chrome.
|
||||||
|
func (s *Server) handleUIFleetUpdatePartial(w stdhttp.ResponseWriter, r *stdhttp.Request) {
|
||||||
|
u := s.requireUIUser(w, r)
|
||||||
|
if u == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
page, err := s.buildFleetUpdatePage(r)
|
||||||
|
if err != nil {
|
||||||
|
slog.Error("ui fleet update partial: build page", "err", err)
|
||||||
|
stdhttp.Error(w, "internal", stdhttp.StatusInternalServerError)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
view := s.baseView(r, u)
|
||||||
|
view.Page = page
|
||||||
|
if err := s.deps.UI.RenderPartial(w, "fleet_update_inner", view); err != nil {
|
||||||
|
slog.Error("ui fleet update partial: render", "err", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// buildFleetUpdatePage assembles the data both /settings/fleet-update
|
||||||
|
// and its partial render against. Resolves the most-recent fleet
|
||||||
|
// update (active OR completed/cancelled/halted) so the page can show
|
||||||
|
// the last roll's result instead of disappearing into "idle" the
|
||||||
|
// instant a roll finishes.
|
||||||
|
func (s *Server) buildFleetUpdatePage(r *stdhttp.Request) (fleetUpdatePage, error) {
|
||||||
|
page := fleetUpdatePage{
|
||||||
|
TargetVersion: version.Version,
|
||||||
|
HostNames: map[string]string{},
|
||||||
|
PollURL: "/settings/fleet-update/partial",
|
||||||
|
}
|
||||||
|
hosts, err := s.deps.Store.ListHosts(r.Context())
|
||||||
|
if err != nil {
|
||||||
|
return page, err
|
||||||
|
}
|
||||||
|
for _, h := range hosts {
|
||||||
|
page.HostNames[h.ID] = h.Name
|
||||||
|
}
|
||||||
|
|
||||||
|
active, err := s.deps.Store.ActiveFleetUpdate(r.Context())
|
||||||
|
if err != nil {
|
||||||
|
return page, err
|
||||||
|
}
|
||||||
|
mostRecent := active
|
||||||
|
if mostRecent == nil {
|
||||||
|
// Fall back to the most recent terminal row so the page can
|
||||||
|
// show "completed" / "halted" / "cancelled" once the worker
|
||||||
|
// finishes. One small bespoke query — keeps the page from
|
||||||
|
// flashing back to "idle" the instant a roll wraps up.
|
||||||
|
var id string
|
||||||
|
err := s.deps.Store.DB().QueryRowContext(r.Context(),
|
||||||
|
`SELECT id FROM fleet_updates ORDER BY started_at DESC LIMIT 1`).
|
||||||
|
Scan(&id)
|
||||||
|
if err == nil {
|
||||||
|
fu, _, gerr := s.deps.Store.GetFleetUpdate(r.Context(), id)
|
||||||
|
if gerr == nil {
|
||||||
|
mostRecent = fu
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if mostRecent != nil {
|
||||||
|
_, rows, gerr := s.deps.Store.GetFleetUpdate(r.Context(), mostRecent.ID)
|
||||||
|
if gerr == nil {
|
||||||
|
page.Active = mostRecent
|
||||||
|
page.ActiveRows = make([]fleetUpdateHostView, 0, len(rows))
|
||||||
|
for _, hr := range rows {
|
||||||
|
page.ActiveRows = append(page.ActiveRows, fleetUpdateHostView{
|
||||||
|
HostID: hr.HostID,
|
||||||
|
HostName: page.HostNames[hr.HostID],
|
||||||
|
Position: hr.Position,
|
||||||
|
Status: hr.Status,
|
||||||
|
JobID: hr.JobID,
|
||||||
|
FailedReason: hr.FailedReason,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Idle list (or "still out of date" reference even when an active
|
||||||
|
// roll is running — cheap to compute, harmless to attach).
|
||||||
|
for _, h := range hosts {
|
||||||
|
if h.Status != "online" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if h.AgentVersion == "" || h.AgentVersion == page.TargetVersion {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
page.OutOfDateHosts = append(page.OutOfDateHosts, h)
|
||||||
|
}
|
||||||
|
return page, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// deriveOutOfDateOnlineHostIDs returns the list of host IDs that
|
||||||
|
// (a) are online (Hub.Connected) and (b) have an agent_version that's
|
||||||
|
// non-empty AND != target. Used by the start endpoint when the caller
|
||||||
|
// omits host_ids.
|
||||||
|
func (s *Server) deriveOutOfDateOnlineHostIDs(ctx context.Context, target string) ([]string, error) {
|
||||||
|
hosts, err := s.deps.Store.ListHosts(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
out := []string{}
|
||||||
|
for _, h := range hosts {
|
||||||
|
if h.AgentVersion == "" || h.AgentVersion == target {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if !s.deps.Hub.Connected(h.ID) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
out = append(out, h.ID)
|
||||||
|
}
|
||||||
|
return out, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// hostNameMap returns hostID → name; used to hydrate fleet-update
|
||||||
|
// JSON responses.
|
||||||
|
func (s *Server) hostNameMap(r *stdhttp.Request) map[string]string {
|
||||||
|
out := map[string]string{}
|
||||||
|
hosts, err := s.deps.Store.ListHosts(r.Context())
|
||||||
|
if err != nil {
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
for _, h := range hosts {
|
||||||
|
out[h.ID] = h.Name
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
@@ -0,0 +1,334 @@
|
|||||||
|
// fleet_update_test.go — coverage for the P6-15 fleet-update HTTP
|
||||||
|
// surface: start/cancel/get JSON endpoints + RBAC.
|
||||||
|
package http
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
stdhttp "net/http"
|
||||||
|
"sync"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/oklog/ulid/v2"
|
||||||
|
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ws"
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
|
||||||
|
)
|
||||||
|
|
||||||
|
// fakeFleetWorker is the HTTP tests' stand-in for *fleetupdate.Worker.
// It records the arguments of every Start / Cancel call and returns
// whatever canned ID or error the test configured. It satisfies the
// FleetWorker interface declared in host_update.go.
type fakeFleetWorker struct {
	// mu guards the recorded call slices.
	mu sync.Mutex

	// Start: recorded calls plus the canned result.
	startCalls []fakeStartCall
	startID    string
	startErr   error

	// Cancel: recorded fleet-update IDs plus the canned result.
	cancelCalls []string
	cancelErr   error
}

// fakeStartCall captures the arguments of one fakeFleetWorker.Start call.
type fakeStartCall struct {
	UserID  string
	Target  string
	HostIDs []string
}

// Start records the call and returns startErr when set, otherwise
// startID. hostIDs is copied so later mutation by the caller cannot
// change what was recorded.
func (f *fakeFleetWorker) Start(_ context.Context, userID, target string, hostIDs []string) (string, error) {
	f.mu.Lock()
	defer f.mu.Unlock()

	call := fakeStartCall{
		UserID:  userID,
		Target:  target,
		HostIDs: append([]string(nil), hostIDs...),
	}
	f.startCalls = append(f.startCalls, call)

	if f.startErr == nil {
		return f.startID, nil
	}
	return "", f.startErr
}

// Cancel records the fleet-update ID and returns the canned cancelErr.
func (f *fakeFleetWorker) Cancel(_ context.Context, id string) error {
	f.mu.Lock()
	defer f.mu.Unlock()

	f.cancelCalls = append(f.cancelCalls, id)
	return f.cancelErr
}
|
||||||
|
|
||||||
|
// helloOnlineHost is the smallest setup that lets the dispatch /
|
||||||
|
// derivation logic see a host as "online + version mismatch".
|
||||||
|
// Returns the host id.
|
||||||
|
func helloOnlineHost(t *testing.T, srv *Server, st *store.Store, name, agentVer string) string {
|
||||||
|
t.Helper()
|
||||||
|
id := makeHost(t, st, name)
|
||||||
|
if err := st.MarkHostHello(context.Background(), id, agentVer, "0.17", api.CurrentProtocolVersion, time.Now().UTC()); err != nil {
|
||||||
|
t.Fatalf("mark hello: %v", err)
|
||||||
|
}
|
||||||
|
// Mark connected on the hub so deriveOutOfDateOnlineHostIDs
|
||||||
|
// considers it online without needing a real WS handshake. The
|
||||||
|
// Conn has a nil websocket pointer — tests never call Send on it.
|
||||||
|
srv.deps.Hub.Register(id, ws.NewConn(id, nil))
|
||||||
|
return id
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestFleetUpdateStartHappyPath(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
srv, ts, st := rawTestServer(t)
|
||||||
|
worker := &fakeFleetWorker{startID: ulid.Make().String()}
|
||||||
|
srv.deps.FleetWorker = worker
|
||||||
|
|
||||||
|
cookie, uid := loginAsAdminWithID(t, st)
|
||||||
|
hostID := helloOnlineHost(t, srv, st, "fu-host", "v0")
|
||||||
|
|
||||||
|
body := map[string]any{"host_ids": []string{hostID}}
|
||||||
|
raw, _ := json.Marshal(body)
|
||||||
|
req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/fleet/update", bytes.NewReader(raw))
|
||||||
|
req.AddCookie(cookie)
|
||||||
|
req.Header.Set("Content-Type", "application/json")
|
||||||
|
res, err := stdhttp.DefaultClient.Do(req)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("do: %v", err)
|
||||||
|
}
|
||||||
|
defer res.Body.Close()
|
||||||
|
if res.StatusCode != stdhttp.StatusAccepted {
|
||||||
|
t.Fatalf("status: got %d, want 202", res.StatusCode)
|
||||||
|
}
|
||||||
|
var out struct {
|
||||||
|
FleetUpdateID string `json:"fleet_update_id"`
|
||||||
|
}
|
||||||
|
if err := json.NewDecoder(res.Body).Decode(&out); err != nil {
|
||||||
|
t.Fatalf("decode: %v", err)
|
||||||
|
}
|
||||||
|
if out.FleetUpdateID != worker.startID {
|
||||||
|
t.Fatalf("fleet_update_id: got %q, want %q", out.FleetUpdateID, worker.startID)
|
||||||
|
}
|
||||||
|
worker.mu.Lock()
|
||||||
|
if len(worker.startCalls) != 1 || worker.startCalls[0].UserID != uid {
|
||||||
|
t.Fatalf("start calls: %+v", worker.startCalls)
|
||||||
|
}
|
||||||
|
if got := worker.startCalls[0].HostIDs; len(got) != 1 || got[0] != hostID {
|
||||||
|
t.Fatalf("host_ids: %v", got)
|
||||||
|
}
|
||||||
|
worker.mu.Unlock()
|
||||||
|
|
||||||
|
// Audit row.
|
||||||
|
var n int
|
||||||
|
if err := st.DB().QueryRow(
|
||||||
|
`SELECT COUNT(*) FROM audit_log WHERE action = 'fleet.update_started' AND target_id = ?`,
|
||||||
|
out.FleetUpdateID).Scan(&n); err != nil {
|
||||||
|
t.Fatalf("audit count: %v", err)
|
||||||
|
}
|
||||||
|
if n != 1 {
|
||||||
|
t.Fatalf("audit rows: got %d, want 1", n)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestFleetUpdateStartConflictWhenAlreadyRunning(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
srv, ts, st := rawTestServer(t)
|
||||||
|
worker := &fakeFleetWorker{startErr: store.ErrFleetUpdateRunning}
|
||||||
|
srv.deps.FleetWorker = worker
|
||||||
|
cookie := loginAsAdmin(t, st)
|
||||||
|
_ = helloOnlineHost(t, srv, st, "fu-host", "v0")
|
||||||
|
|
||||||
|
req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/fleet/update", bytes.NewReader([]byte(`{}`)))
|
||||||
|
req.AddCookie(cookie)
|
||||||
|
req.Header.Set("Content-Type", "application/json")
|
||||||
|
res, err := stdhttp.DefaultClient.Do(req)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("do: %v", err)
|
||||||
|
}
|
||||||
|
defer res.Body.Close()
|
||||||
|
if res.StatusCode != stdhttp.StatusConflict {
|
||||||
|
t.Fatalf("status: got %d, want 409", res.StatusCode)
|
||||||
|
}
|
||||||
|
body := readJSONError(t, res.Body)
|
||||||
|
if body.Code != "fleet_update_in_progress" {
|
||||||
|
t.Fatalf("code: %q", body.Code)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestFleetUpdateStartDerivesHostIDsWhenEmpty(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
srv, ts, st := rawTestServer(t)
|
||||||
|
worker := &fakeFleetWorker{startID: ulid.Make().String()}
|
||||||
|
srv.deps.FleetWorker = worker
|
||||||
|
cookie := loginAsAdmin(t, st)
|
||||||
|
|
||||||
|
// Two online + out-of-date, one online + at-target, one offline.
|
||||||
|
a := helloOnlineHost(t, srv, st, "behind-a", "v0")
|
||||||
|
b := helloOnlineHost(t, srv, st, "behind-b", "v0")
|
||||||
|
_ = helloOnlineHost(t, srv, st, "uptodate", version.Version)
|
||||||
|
offlineID := makeHost(t, st, "offline-host")
|
||||||
|
if err := st.MarkHostHello(context.Background(), offlineID, "v0", "0.17", api.CurrentProtocolVersion, time.Now().UTC()); err != nil {
|
||||||
|
t.Fatalf("mark hello: %v", err)
|
||||||
|
}
|
||||||
|
// Don't MarkOnline → derivation should skip.
|
||||||
|
|
||||||
|
req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/fleet/update", bytes.NewReader([]byte(`{}`)))
|
||||||
|
req.AddCookie(cookie)
|
||||||
|
req.Header.Set("Content-Type", "application/json")
|
||||||
|
res, err := stdhttp.DefaultClient.Do(req)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("do: %v", err)
|
||||||
|
}
|
||||||
|
defer res.Body.Close()
|
||||||
|
if res.StatusCode != stdhttp.StatusAccepted {
|
||||||
|
t.Fatalf("status: got %d, want 202", res.StatusCode)
|
||||||
|
}
|
||||||
|
worker.mu.Lock()
|
||||||
|
defer worker.mu.Unlock()
|
||||||
|
if len(worker.startCalls) != 1 {
|
||||||
|
t.Fatalf("start calls: %d", len(worker.startCalls))
|
||||||
|
}
|
||||||
|
got := worker.startCalls[0].HostIDs
|
||||||
|
want := map[string]bool{a: true, b: true}
|
||||||
|
if len(got) != 2 || !want[got[0]] || !want[got[1]] {
|
||||||
|
t.Fatalf("derived host_ids: got %v, want both of %v", got, []string{a, b})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestFleetUpdateCancelHappyPath(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
srv, ts, st := rawTestServer(t)
|
||||||
|
worker := &fakeFleetWorker{}
|
||||||
|
srv.deps.FleetWorker = worker
|
||||||
|
cookie := loginAsAdmin(t, st)
|
||||||
|
|
||||||
|
// Seed a running fleet update directly.
|
||||||
|
fuID := ulid.Make().String()
|
||||||
|
uid := ulid.Make().String()
|
||||||
|
if err := st.CreateUser(context.Background(), store.User{
|
||||||
|
ID: uid, Username: "starter", PasswordHash: "x",
|
||||||
|
Role: store.RoleAdmin, CreatedAt: time.Now().UTC(),
|
||||||
|
}); err != nil {
|
||||||
|
t.Fatalf("seed user: %v", err)
|
||||||
|
}
|
||||||
|
hostID := makeHost(t, st, "fu-cancel-host")
|
||||||
|
if err := st.CreateFleetUpdate(context.Background(),
|
||||||
|
store.FleetUpdate{ID: fuID, StartedByUserID: uid, TargetVersion: "v1"},
|
||||||
|
[]string{hostID}); err != nil {
|
||||||
|
t.Fatalf("seed fleet update: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/fleet-updates/"+fuID+"/cancel", nil)
|
||||||
|
req.AddCookie(cookie)
|
||||||
|
res, err := stdhttp.DefaultClient.Do(req)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("do: %v", err)
|
||||||
|
}
|
||||||
|
defer res.Body.Close()
|
||||||
|
if res.StatusCode != stdhttp.StatusNoContent {
|
||||||
|
t.Fatalf("status: got %d, want 204", res.StatusCode)
|
||||||
|
}
|
||||||
|
worker.mu.Lock()
|
||||||
|
if len(worker.cancelCalls) != 1 || worker.cancelCalls[0] != fuID {
|
||||||
|
t.Fatalf("cancel calls: %v", worker.cancelCalls)
|
||||||
|
}
|
||||||
|
worker.mu.Unlock()
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestFleetUpdateCancelNotRunning(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
srv, ts, st := rawTestServer(t)
|
||||||
|
srv.deps.FleetWorker = &fakeFleetWorker{}
|
||||||
|
cookie := loginAsAdmin(t, st)
|
||||||
|
|
||||||
|
// Seed + complete one so it's no longer running.
|
||||||
|
fuID := ulid.Make().String()
|
||||||
|
uid := ulid.Make().String()
|
||||||
|
_ = st.CreateUser(context.Background(), store.User{
|
||||||
|
ID: uid, Username: "starter2", PasswordHash: "x",
|
||||||
|
Role: store.RoleAdmin, CreatedAt: time.Now().UTC(),
|
||||||
|
})
|
||||||
|
hostID := makeHost(t, st, "fu-done-host")
|
||||||
|
_ = st.CreateFleetUpdate(context.Background(),
|
||||||
|
store.FleetUpdate{ID: fuID, StartedByUserID: uid, TargetVersion: "v1"},
|
||||||
|
[]string{hostID})
|
||||||
|
if err := st.CompleteFleetUpdate(context.Background(), fuID, time.Now().UTC()); err != nil {
|
||||||
|
t.Fatalf("complete: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/fleet-updates/"+fuID+"/cancel", nil)
|
||||||
|
req.AddCookie(cookie)
|
||||||
|
res, err := stdhttp.DefaultClient.Do(req)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("do: %v", err)
|
||||||
|
}
|
||||||
|
defer res.Body.Close()
|
||||||
|
if res.StatusCode != stdhttp.StatusConflict {
|
||||||
|
t.Fatalf("status: got %d, want 409", res.StatusCode)
|
||||||
|
}
|
||||||
|
body := readJSONError(t, res.Body)
|
||||||
|
if body.Code != "fleet_update_not_running" {
|
||||||
|
t.Fatalf("code: %q", body.Code)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestFleetUpdateGetHydrates(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
_, ts, st := rawTestServer(t)
|
||||||
|
cookie := loginAsAdmin(t, st)
|
||||||
|
|
||||||
|
uid := ulid.Make().String()
|
||||||
|
_ = st.CreateUser(context.Background(), store.User{
|
||||||
|
ID: uid, Username: "starter3", PasswordHash: "x",
|
||||||
|
Role: store.RoleAdmin, CreatedAt: time.Now().UTC(),
|
||||||
|
})
|
||||||
|
hostID := makeHost(t, st, "fu-get-host")
|
||||||
|
fuID := ulid.Make().String()
|
||||||
|
if err := st.CreateFleetUpdate(context.Background(),
|
||||||
|
store.FleetUpdate{ID: fuID, StartedByUserID: uid, TargetVersion: "v1.2.3"},
|
||||||
|
[]string{hostID}); err != nil {
|
||||||
|
t.Fatalf("seed: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
req, _ := stdhttp.NewRequest("GET", ts.URL+"/api/fleet-updates/"+fuID, nil)
|
||||||
|
req.AddCookie(cookie)
|
||||||
|
res, err := stdhttp.DefaultClient.Do(req)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("do: %v", err)
|
||||||
|
}
|
||||||
|
defer res.Body.Close()
|
||||||
|
if res.StatusCode != stdhttp.StatusOK {
|
||||||
|
t.Fatalf("status: got %d, want 200", res.StatusCode)
|
||||||
|
}
|
||||||
|
var got fleetUpdateView
|
||||||
|
if err := json.NewDecoder(res.Body).Decode(&got); err != nil {
|
||||||
|
t.Fatalf("decode: %v", err)
|
||||||
|
}
|
||||||
|
if got.ID != fuID || got.TargetVersion != "v1.2.3" || got.Status != "running" {
|
||||||
|
t.Fatalf("parent: %+v", got)
|
||||||
|
}
|
||||||
|
if len(got.Hosts) != 1 || got.Hosts[0].HostID != hostID || got.Hosts[0].HostName != "fu-get-host" {
|
||||||
|
t.Fatalf("hosts: %+v", got.Hosts)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestFleetUpdateRBAC(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
_, ts, st := rawTestServer(t)
|
||||||
|
|
||||||
|
for _, role := range []store.Role{store.RoleViewer, store.RoleOperator} {
|
||||||
|
role := role
|
||||||
|
t.Run(string(role), func(t *testing.T) {
|
||||||
|
cookie := loginAsRole(t, st, role)
|
||||||
|
req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/fleet/update", bytes.NewReader([]byte(`{}`)))
|
||||||
|
req.AddCookie(cookie)
|
||||||
|
req.Header.Set("Content-Type", "application/json")
|
||||||
|
res, err := stdhttp.DefaultClient.Do(req)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("do: %v", err)
|
||||||
|
}
|
||||||
|
defer res.Body.Close()
|
||||||
|
if res.StatusCode != stdhttp.StatusForbidden {
|
||||||
|
t.Fatalf("status: got %d, want 403", res.StatusCode)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sanity check that fakeFleetWorker satisfies the FleetWorker iface.
|
||||||
|
var _ FleetWorker = (*fakeFleetWorker)(nil)
|
||||||
@@ -0,0 +1,217 @@
|
|||||||
|
package http
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
stdhttp "net/http"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/go-chi/chi/v5"
|
||||||
|
"github.com/oklog/ulid/v2"
|
||||||
|
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
|
||||||
|
)
|
||||||
|
|
||||||
|
// UpdateWatcher is the narrow slice of ws.updateWatcher that this
// package depends on: registering an in-flight update dispatch so it can
// be tracked. It is an interface so tests can inject a stub.
type UpdateWatcher interface {
	// Track registers the update job jobID dispatched to host hostID.
	Track(jobID string, hostID string)
}
|
||||||
|
|
||||||
|
// FleetWorker is the narrow slice of fleetupdate.Worker that this
// package depends on. It is declared here for forward compatibility
// with P6-15; the single-host update endpoint does not use it.
type FleetWorker interface {
	// Start begins a fleet update to targetVersion across hostIDs on
	// behalf of userID and returns the new fleet update's ID.
	Start(ctx context.Context, userID string, targetVersion string, hostIDs []string) (string, error)

	// Cancel requests cancellation of the fleet update fleetUpdateID.
	Cancel(ctx context.Context, fleetUpdateID string) error
}
|
||||||
|
|
||||||
|
// dispatchHostUpdateResult is the structured outcome of the shared
// dispatch path. Returning codes instead of formatted errors lets the
// HTTP handlers and the fleet worker each present failures in their own
// idiom.
type dispatchHostUpdateResult struct {
	// JobID is the created job on success, or the already-running job
	// when Code is "update_in_progress".
	JobID string
	// Code is the machine-readable failure code; "" means success.
	Code string
	// Status is the HTTP status the JSON handler should use on error.
	Status int
	// Msg is optional human-readable detail.
	Msg string
}
|
||||||
|
|
||||||
|
// dispatchHostUpdate is the shared "send command.update to one host"
|
||||||
|
// path. It performs every pre-check (host exists, online, version
|
||||||
|
// mismatch, no in-flight update) and on success creates the jobs row,
|
||||||
|
// audits, dispatches the WS envelope, and tracks the watcher entry.
|
||||||
|
//
|
||||||
|
// Pre-checks are returned as structured codes rather than HTTP errors
|
||||||
|
// so the fleet worker can map them onto its own per-host status enum
|
||||||
|
// without parsing strings.
|
||||||
|
func (s *Server) dispatchHostUpdate(ctx context.Context, hostID string, actorKind string, actorID *string) dispatchHostUpdateResult {
|
||||||
|
host, err := s.deps.Store.GetHost(ctx, hostID)
|
||||||
|
if err != nil || host == nil {
|
||||||
|
return dispatchHostUpdateResult{Code: "host_not_found", Status: stdhttp.StatusNotFound}
|
||||||
|
}
|
||||||
|
if !s.deps.Hub.Connected(host.ID) {
|
||||||
|
return dispatchHostUpdateResult{
|
||||||
|
Code: "host_offline", Status: stdhttp.StatusConflict,
|
||||||
|
Msg: "agent is not currently connected",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if host.AgentVersion != "" && host.AgentVersion == version.Version {
|
||||||
|
return dispatchHostUpdateResult{
|
||||||
|
Code: "already_up_to_date", Status: stdhttp.StatusConflict,
|
||||||
|
Msg: "agent already running version " + version.Version,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
existing, err := s.deps.Store.RunningUpdateJobForHost(ctx, hostID)
|
||||||
|
if err != nil {
|
||||||
|
return dispatchHostUpdateResult{Code: "internal", Status: stdhttp.StatusInternalServerError, Msg: err.Error()}
|
||||||
|
}
|
||||||
|
if existing != "" {
|
||||||
|
return dispatchHostUpdateResult{
|
||||||
|
Code: "update_in_progress", Status: stdhttp.StatusConflict,
|
||||||
|
Msg: "an update job is already in flight for this host",
|
||||||
|
JobID: existing,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
jobID := ulid.Make().String()
|
||||||
|
now := time.Now().UTC()
|
||||||
|
if err := s.deps.Store.CreateJob(ctx, store.Job{
|
||||||
|
ID: jobID, HostID: hostID, Kind: "update",
|
||||||
|
ActorKind: actorKind, ActorID: actorID,
|
||||||
|
CreatedAt: now,
|
||||||
|
}); err != nil {
|
||||||
|
return dispatchHostUpdateResult{Code: "internal", Status: stdhttp.StatusInternalServerError, Msg: err.Error()}
|
||||||
|
}
|
||||||
|
env, err := api.Marshal(api.MsgCommandUpdate, ulid.Make().String(), api.CommandUpdatePayload{
|
||||||
|
JobID: jobID,
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
return dispatchHostUpdateResult{Code: "internal", Status: stdhttp.StatusInternalServerError, Msg: err.Error()}
|
||||||
|
}
|
||||||
|
if err := s.deps.Hub.Send(ctx, hostID, env); err != nil {
|
||||||
|
// Roll the job to failed so we don't leak a queued row.
|
||||||
|
_ = s.deps.Store.MarkJobFinished(ctx, jobID, "failed", -1, nil, err.Error(), time.Now().UTC())
|
||||||
|
return dispatchHostUpdateResult{
|
||||||
|
Code: "host_offline", Status: stdhttp.StatusConflict, Msg: err.Error(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if s.deps.UpdateWatcher != nil {
|
||||||
|
s.deps.UpdateWatcher.Track(jobID, hostID)
|
||||||
|
}
|
||||||
|
|
||||||
|
auditPayload, _ := json.Marshal(map[string]string{
|
||||||
|
"job_id": jobID,
|
||||||
|
"target_version": version.Version,
|
||||||
|
})
|
||||||
|
_ = s.deps.Store.AppendAudit(ctx, store.AuditEntry{
|
||||||
|
ID: ulid.Make().String(),
|
||||||
|
UserID: actorID,
|
||||||
|
Actor: actorKind,
|
||||||
|
Action: "host.update_dispatched",
|
||||||
|
TargetKind: ptr("host"),
|
||||||
|
TargetID: &hostID,
|
||||||
|
TS: now,
|
||||||
|
Payload: auditPayload,
|
||||||
|
})
|
||||||
|
|
||||||
|
return dispatchHostUpdateResult{JobID: jobID}
|
||||||
|
}
|
||||||
|
|
||||||
|
// handleHostUpdate is POST /api/hosts/{id}/update — JSON, admin-only.
|
||||||
|
func (s *Server) handleHostUpdate(w stdhttp.ResponseWriter, r *stdhttp.Request) {
|
||||||
|
user, ok := s.requireUser(r)
|
||||||
|
if !ok {
|
||||||
|
writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorised", "")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
hostID := chi.URLParam(r, "id")
|
||||||
|
if hostID == "" {
|
||||||
|
writeJSONError(w, stdhttp.StatusBadRequest, "missing_host_id", "")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
actor := "user"
|
||||||
|
var actorID *string
|
||||||
|
if user != nil {
|
||||||
|
actorID = &user.ID
|
||||||
|
}
|
||||||
|
res := s.dispatchHostUpdate(r.Context(), hostID, actor, actorID)
|
||||||
|
if res.Code != "" {
|
||||||
|
writeJSONError(w, res.Status, res.Code, res.Msg)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
writeJSON(w, stdhttp.StatusAccepted, map[string]string{"job_id": res.JobID})
|
||||||
|
}
|
||||||
|
|
||||||
|
// handleHostUpdateForm is the HTMX-friendly POST /hosts/{id}/update
|
||||||
|
// variant. On success it sets HX-Redirect to the job detail page; on
|
||||||
|
// pre-check failures it renders an inline error banner.
|
||||||
|
func (s *Server) handleHostUpdateForm(w stdhttp.ResponseWriter, r *stdhttp.Request) {
|
||||||
|
user, ok := s.requireUser(r)
|
||||||
|
if !ok {
|
||||||
|
stdhttp.Error(w, "unauthorised", stdhttp.StatusUnauthorized)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
hostID := chi.URLParam(r, "id")
|
||||||
|
if hostID == "" {
|
||||||
|
stdhttp.Error(w, "missing host_id", stdhttp.StatusBadRequest)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
actor := "user"
|
||||||
|
var actorID *string
|
||||||
|
if user != nil {
|
||||||
|
actorID = &user.ID
|
||||||
|
}
|
||||||
|
res := s.dispatchHostUpdate(r.Context(), hostID, actor, actorID)
|
||||||
|
if res.Code != "" {
|
||||||
|
// Inline banner for HTMX swaps. Mirrors what host_credentials
|
||||||
|
// returns on validation errors — small text/html fragment.
|
||||||
|
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||||
|
w.WriteHeader(res.Status)
|
||||||
|
msg := hostUpdateErrorMessage(res.Code, res.Msg)
|
||||||
|
_, _ = w.Write([]byte(`<div class="banner banner-error" role="alert">` + htmlEscape(msg) + `</div>`))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
w.Header().Set("HX-Redirect", "/jobs/"+res.JobID)
|
||||||
|
w.WriteHeader(stdhttp.StatusOK)
|
||||||
|
}
|
||||||
|
|
||||||
|
// hostUpdateErrorMessage turns a dispatch failure code into the text
// shown in the inline error banner. Known codes map to fixed sentences;
// for any other code the supplied msg is used when non-empty, otherwise
// a generic fallback.
func hostUpdateErrorMessage(code, msg string) string {
	known := map[string]string{
		"host_not_found":     "Host not found.",
		"host_offline":       "Agent is offline; can't deliver the update command.",
		"already_up_to_date": "Agent is already running the current version.",
		"update_in_progress": "An update is already in progress for this host.",
	}
	if text, ok := known[code]; ok {
		return text
	}
	if msg == "" {
		return "Update dispatch failed."
	}
	return msg
}
|
||||||
|
|
||||||
|
// htmlEscape is a minimal HTML escaper for the one-shot inline banner;
// it avoids pulling in html/template for a single fragment.
//
// It replaces the four characters & < > " with their entity references
// (&amp; &lt; &gt; &quot;) and copies every other byte through
// unchanged, so multi-byte UTF-8 sequences are preserved. The result is
// safe in element content and in double-quoted attribute values; single
// quotes are not escaped, so it must not be used inside single-quoted
// attributes.
func htmlEscape(s string) string {
	out := make([]byte, 0, len(s))
	for i := 0; i < len(s); i++ {
		switch s[i] {
		case '&':
			out = append(out, "&amp;"...)
		case '<':
			out = append(out, "&lt;"...)
		case '>':
			out = append(out, "&gt;"...)
		case '"':
			out = append(out, "&quot;"...)
		default:
			out = append(out, s[i])
		}
	}
	return string(out)
}
|
||||||
@@ -0,0 +1,270 @@
|
|||||||
|
// host_update_test.go — covers POST /api/hosts/{id}/update.
|
||||||
|
package http
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"io"
|
||||||
|
stdhttp "net/http"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/coder/websocket"
|
||||||
|
"github.com/oklog/ulid/v2"
|
||||||
|
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
|
||||||
|
)
|
||||||
|
|
||||||
|
// stubWatcher is a test double for UpdateWatcher: it remembers the host
// ID of every Track call so a test can assert the watcher was notified.
type stubWatcher struct {
	mu sync.Mutex
	// tracked holds the host IDs passed to Track, in call order.
	tracked []string
}

// Track records hostID; the job ID is ignored.
func (w *stubWatcher) Track(_, hostID string) {
	w.mu.Lock()
	w.tracked = append(w.tracked, hostID)
	w.mu.Unlock()
}
|
||||||
|
|
||||||
|
func TestHostUpdateHappyPath(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
srv, ts, st := rawTestServer(t)
|
||||||
|
watcher := &stubWatcher{}
|
||||||
|
srv.deps.UpdateWatcher = watcher
|
||||||
|
hostID, token := enrolHostForWS(t, srv, st, "upd-host")
|
||||||
|
c := agentDial(t, srv, ts, hostID, token)
|
||||||
|
sendHello(t, c, "upd-host")
|
||||||
|
_ = drainUntil(t, c, api.MsgScheduleSet)
|
||||||
|
|
||||||
|
// Force a version mismatch so the dispatch isn't short-circuited.
|
||||||
|
if err := st.MarkHostHello(context.Background(), hostID, "v0", "0.17", api.CurrentProtocolVersion, time.Now().UTC()); err != nil {
|
||||||
|
t.Fatalf("mark hello: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
cookie := loginAsAdmin(t, st)
|
||||||
|
req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/hosts/"+hostID+"/update", nil)
|
||||||
|
req.AddCookie(cookie)
|
||||||
|
res, err := stdhttp.DefaultClient.Do(req)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("do: %v", err)
|
||||||
|
}
|
||||||
|
defer res.Body.Close()
|
||||||
|
if res.StatusCode != stdhttp.StatusAccepted {
|
||||||
|
t.Fatalf("status: got %d, want 202", res.StatusCode)
|
||||||
|
}
|
||||||
|
var out struct {
|
||||||
|
JobID string `json:"job_id"`
|
||||||
|
}
|
||||||
|
if err := json.NewDecoder(res.Body).Decode(&out); err != nil {
|
||||||
|
t.Fatalf("decode: %v", err)
|
||||||
|
}
|
||||||
|
if out.JobID == "" {
|
||||||
|
t.Fatal("missing job_id in response")
|
||||||
|
}
|
||||||
|
|
||||||
|
// command.update envelope arrives.
|
||||||
|
deadline := time.Now().Add(2 * time.Second)
|
||||||
|
var got api.Envelope
|
||||||
|
for time.Now().Before(deadline) {
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
|
||||||
|
mt, raw, rerr := c.Read(ctx)
|
||||||
|
cancel()
|
||||||
|
if rerr != nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if mt != websocket.MessageText {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if !strings.Contains(string(raw), `"command.update"`) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
_ = json.Unmarshal(raw, &got)
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if got.Type != api.MsgCommandUpdate {
|
||||||
|
t.Fatal("never received command.update envelope")
|
||||||
|
}
|
||||||
|
var cp api.CommandUpdatePayload
|
||||||
|
if err := got.UnmarshalPayload(&cp); err != nil {
|
||||||
|
t.Fatalf("payload: %v", err)
|
||||||
|
}
|
||||||
|
if cp.JobID != out.JobID {
|
||||||
|
t.Fatalf("payload job_id: got %q want %q", cp.JobID, out.JobID)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Watcher tracked.
|
||||||
|
watcher.mu.Lock()
|
||||||
|
defer watcher.mu.Unlock()
|
||||||
|
if len(watcher.tracked) != 1 || watcher.tracked[0] != hostID {
|
||||||
|
t.Fatalf("watcher tracked: %v", watcher.tracked)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Audit row exists.
|
||||||
|
var n int
|
||||||
|
if err := st.DB().QueryRow(
|
||||||
|
`SELECT COUNT(*) FROM audit_log WHERE action = 'host.update_dispatched' AND target_id = ?`,
|
||||||
|
hostID).Scan(&n); err != nil {
|
||||||
|
t.Fatalf("audit count: %v", err)
|
||||||
|
}
|
||||||
|
if n != 1 {
|
||||||
|
t.Fatalf("audit rows: got %d, want 1", n)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHostUpdateNotFound(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
_, ts, st := rawTestServer(t)
|
||||||
|
cookie := loginAsAdmin(t, st)
|
||||||
|
req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/hosts/no-such/update", nil)
|
||||||
|
req.AddCookie(cookie)
|
||||||
|
res, err := stdhttp.DefaultClient.Do(req)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("do: %v", err)
|
||||||
|
}
|
||||||
|
defer res.Body.Close()
|
||||||
|
if res.StatusCode != stdhttp.StatusNotFound {
|
||||||
|
t.Fatalf("status: got %d want 404", res.StatusCode)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHostUpdateOffline(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
_, ts, st := rawTestServer(t)
|
||||||
|
hostID := ulid.Make().String()
|
||||||
|
if err := st.CreateHost(context.Background(), store.Host{
|
||||||
|
ID: hostID, Name: "off", OS: "linux", Arch: "amd64",
|
||||||
|
EnrolledAt: time.Now().UTC(),
|
||||||
|
}, "deadbeef", ""); err != nil {
|
||||||
|
t.Fatalf("create: %v", err)
|
||||||
|
}
|
||||||
|
cookie := loginAsAdmin(t, st)
|
||||||
|
req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/hosts/"+hostID+"/update", nil)
|
||||||
|
req.AddCookie(cookie)
|
||||||
|
res, err := stdhttp.DefaultClient.Do(req)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("do: %v", err)
|
||||||
|
}
|
||||||
|
defer res.Body.Close()
|
||||||
|
if res.StatusCode != stdhttp.StatusConflict {
|
||||||
|
t.Fatalf("status: got %d want 409", res.StatusCode)
|
||||||
|
}
|
||||||
|
body := readJSONError(t, res.Body)
|
||||||
|
if body.Code != "host_offline" {
|
||||||
|
t.Fatalf("code: %q", body.Code)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHostUpdateAlreadyUpToDate(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
srv, ts, st := rawTestServer(t)
|
||||||
|
hostID, token := enrolHostForWS(t, srv, st, "uptodate-host")
|
||||||
|
c := agentDial(t, srv, ts, hostID, token)
|
||||||
|
sendHello(t, c, "uptodate-host")
|
||||||
|
_ = drainUntil(t, c, api.MsgScheduleSet)
|
||||||
|
|
||||||
|
// Force agent_version == version.Version.
|
||||||
|
if err := st.MarkHostHello(context.Background(), hostID, version.Version, "0.17", api.CurrentProtocolVersion, time.Now().UTC()); err != nil {
|
||||||
|
t.Fatalf("mark hello: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
cookie := loginAsAdmin(t, st)
|
||||||
|
req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/hosts/"+hostID+"/update", nil)
|
||||||
|
req.AddCookie(cookie)
|
||||||
|
res, err := stdhttp.DefaultClient.Do(req)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("do: %v", err)
|
||||||
|
}
|
||||||
|
defer res.Body.Close()
|
||||||
|
if res.StatusCode != stdhttp.StatusConflict {
|
||||||
|
t.Fatalf("status: got %d want 409", res.StatusCode)
|
||||||
|
}
|
||||||
|
body := readJSONError(t, res.Body)
|
||||||
|
if body.Code != "already_up_to_date" {
|
||||||
|
t.Fatalf("code: %q", body.Code)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHostUpdateInProgress(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
srv, ts, st := rawTestServer(t)
|
||||||
|
hostID, token := enrolHostForWS(t, srv, st, "inprog-host")
|
||||||
|
c := agentDial(t, srv, ts, hostID, token)
|
||||||
|
sendHello(t, c, "inprog-host")
|
||||||
|
_ = drainUntil(t, c, api.MsgScheduleSet)
|
||||||
|
if err := st.MarkHostHello(context.Background(), hostID, "v0", "0.17", api.CurrentProtocolVersion, time.Now().UTC()); err != nil {
|
||||||
|
t.Fatalf("mark hello: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pre-seed an in-flight update job.
|
||||||
|
jobID := ulid.Make().String()
|
||||||
|
if err := st.CreateJob(context.Background(), store.Job{
|
||||||
|
ID: jobID, HostID: hostID, Kind: "update",
|
||||||
|
ActorKind: "user", CreatedAt: time.Now().UTC(),
|
||||||
|
}); err != nil {
|
||||||
|
t.Fatalf("seed job: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
cookie := loginAsAdmin(t, st)
|
||||||
|
req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/hosts/"+hostID+"/update", nil)
|
||||||
|
req.AddCookie(cookie)
|
||||||
|
res, err := stdhttp.DefaultClient.Do(req)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("do: %v", err)
|
||||||
|
}
|
||||||
|
defer res.Body.Close()
|
||||||
|
if res.StatusCode != stdhttp.StatusConflict {
|
||||||
|
t.Fatalf("status: got %d want 409", res.StatusCode)
|
||||||
|
}
|
||||||
|
body := readJSONError(t, res.Body)
|
||||||
|
if body.Code != "update_in_progress" {
|
||||||
|
t.Fatalf("code: %q", body.Code)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHostUpdateRBAC(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
_, ts, st := rawTestServer(t)
|
||||||
|
hostID := ulid.Make().String()
|
||||||
|
if err := st.CreateHost(context.Background(), store.Host{
|
||||||
|
ID: hostID, Name: "rbac-host", OS: "linux", Arch: "amd64",
|
||||||
|
EnrolledAt: time.Now().UTC(),
|
||||||
|
}, "deadbeef", ""); err != nil {
|
||||||
|
t.Fatalf("create: %v", err)
|
||||||
|
}
|
||||||
|
for _, role := range []store.Role{store.RoleViewer, store.RoleOperator} {
|
||||||
|
role := role
|
||||||
|
t.Run(string(role), func(t *testing.T) {
|
||||||
|
cookie := loginAsRole(t, st, role)
|
||||||
|
req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/hosts/"+hostID+"/update", nil)
|
||||||
|
req.AddCookie(cookie)
|
||||||
|
res, err := stdhttp.DefaultClient.Do(req)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("do: %v", err)
|
||||||
|
}
|
||||||
|
defer res.Body.Close()
|
||||||
|
if res.StatusCode != stdhttp.StatusForbidden {
|
||||||
|
t.Fatalf("status for %s: got %d want 403", role, res.StatusCode)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// jsonErrBody is the shape the tests decode JSON error responses into.
type jsonErrBody struct {
	// Code is the short error identifier the tests assert on, e.g.
	// "host_offline".
	Code string `json:"code"`
	// Message is the optional free-text field of the error body.
	Message string `json:"message,omitempty"`
}
|
||||||
|
|
||||||
|
func readJSONError(t *testing.T, body io.Reader) jsonErrBody {
|
||||||
|
t.Helper()
|
||||||
|
var out jsonErrBody
|
||||||
|
if err := json.NewDecoder(body).Decode(&out); err != nil {
|
||||||
|
t.Fatalf("decode error body: %v", err)
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
@@ -4,6 +4,7 @@ import (
|
|||||||
stdhttp "net/http"
|
stdhttp "net/http"
|
||||||
|
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
|
||||||
)
|
)
|
||||||
|
|
||||||
// hostView is the JSON projection of a Host row. Same shape as the
|
// hostView is the JSON projection of a Host row. Same shape as the
|
||||||
@@ -27,6 +28,8 @@ type hostView struct {
|
|||||||
RepoSizeBytes int64 `json:"repo_size_bytes"`
|
RepoSizeBytes int64 `json:"repo_size_bytes"`
|
||||||
SnapshotCount int `json:"snapshot_count"`
|
SnapshotCount int `json:"snapshot_count"`
|
||||||
OpenAlertCount int `json:"open_alert_count"`
|
OpenAlertCount int `json:"open_alert_count"`
|
||||||
|
UpdateAvailable bool `json:"update_available"`
|
||||||
|
TargetVersion string `json:"target_version,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// handleListHosts returns the full fleet as JSON. Authenticated; the
|
// handleListHosts returns the full fleet as JSON. Authenticated; the
|
||||||
@@ -85,6 +88,8 @@ func hostToView(h store.Host) hostView {
|
|||||||
RepoSizeBytes: h.RepoSizeBytes,
|
RepoSizeBytes: h.RepoSizeBytes,
|
||||||
SnapshotCount: h.SnapshotCount,
|
SnapshotCount: h.SnapshotCount,
|
||||||
OpenAlertCount: h.OpenAlertCount,
|
OpenAlertCount: h.OpenAlertCount,
|
||||||
|
TargetVersion: version.Version,
|
||||||
|
UpdateAvailable: h.AgentVersion != "" && h.AgentVersion != version.Version,
|
||||||
}
|
}
|
||||||
if v.Tags == nil {
|
if v.Tags == nil {
|
||||||
v.Tags = []string{}
|
v.Tags = []string{}
|
||||||
|
|||||||
@@ -39,6 +39,13 @@ type Deps struct {
|
|||||||
// NotificationHub (optional, wired in G1) is used by the test-fire
|
// NotificationHub (optional, wired in G1) is used by the test-fire
|
||||||
// endpoint to dispatch a single synthetic payload through a channel.
|
// endpoint to dispatch a single synthetic payload through a channel.
|
||||||
NotificationHub *notification.Hub
|
NotificationHub *notification.Hub
|
||||||
|
// UpdateWatcher tracks in-flight agent self-update dispatches and
|
||||||
|
// reconciles them against incoming hello envelopes. Optional;
|
||||||
|
// nil = no-op (handlers degrade by skipping the Track call).
|
||||||
|
UpdateWatcher UpdateWatcher
|
||||||
|
// FleetWorker drives the rolling fleet-update worker. Optional;
|
||||||
|
// nil = fleet update endpoints (P6-15) report unavailable.
|
||||||
|
FleetWorker FleetWorker
|
||||||
// Version is the binary's build version, surfaced in the chrome.
|
// Version is the binary's build version, surfaced in the chrome.
|
||||||
// Empty falls back to "dev".
|
// Empty falls back to "dev".
|
||||||
Version string
|
Version string
|
||||||
@@ -123,8 +130,9 @@ func (s *Server) routes(r chi.Router) {
|
|||||||
r.Post("/api/agents/announce", s.handleAnnounce)
|
r.Post("/api/agents/announce", s.handleAnnounce)
|
||||||
r.Get("/agent/binary", s.handleAgentBinary)
|
r.Get("/agent/binary", s.handleAgentBinary)
|
||||||
r.Get("/install/*", s.handleInstallAsset)
|
r.Get("/install/*", s.handleInstallAsset)
|
||||||
|
r.Get("/api/version", s.handleVersion)
|
||||||
if s.deps.Hub != nil {
|
if s.deps.Hub != nil {
|
||||||
r.Mount("/ws/agent", ws.AgentHandler(ws.HandlerDeps{
|
hd := ws.HandlerDeps{
|
||||||
Hub: s.deps.Hub,
|
Hub: s.deps.Hub,
|
||||||
Store: s.deps.Store,
|
Store: s.deps.Store,
|
||||||
JobHub: s.deps.JobHub,
|
JobHub: s.deps.JobHub,
|
||||||
@@ -132,7 +140,11 @@ func (s *Server) routes(r chi.Router) {
|
|||||||
OnHello: s.onAgentHello,
|
OnHello: s.onAgentHello,
|
||||||
OnScheduleAck: s.applyScheduleAck,
|
OnScheduleAck: s.applyScheduleAck,
|
||||||
OnScheduleFire: s.dispatchScheduledJob,
|
OnScheduleFire: s.dispatchScheduledJob,
|
||||||
}))
|
}
|
||||||
|
if w, ok := s.deps.UpdateWatcher.(*ws.UpdateWatcher); ok && w != nil {
|
||||||
|
hd.UpdateWatcher = w
|
||||||
|
}
|
||||||
|
r.Mount("/ws/agent", ws.AgentHandler(hd))
|
||||||
}
|
}
|
||||||
r.Get("/ws/agent/pending", s.handlePendingWS)
|
r.Get("/ws/agent/pending", s.handlePendingWS)
|
||||||
r.Mount("/static/", staticHandler())
|
r.Mount("/static/", staticHandler())
|
||||||
@@ -270,6 +282,14 @@ func (s *Server) routes(r chi.Router) {
|
|||||||
r.Group(func(r chi.Router) {
|
r.Group(func(r chi.Router) {
|
||||||
r.Use(s.requireRole(store.RoleAdmin))
|
r.Use(s.requireRole(store.RoleAdmin))
|
||||||
|
|
||||||
|
r.Post("/api/hosts/{id}/update", s.handleHostUpdate)
|
||||||
|
r.Post("/hosts/{id}/update", s.handleHostUpdateForm)
|
||||||
|
|
||||||
|
// Fleet update (P6-15): rolling update across many hosts.
|
||||||
|
r.Post("/api/fleet/update", s.handleAPIFleetUpdateStart)
|
||||||
|
r.Post("/api/fleet-updates/{id}/cancel", s.handleAPIFleetUpdateCancel)
|
||||||
|
r.Get("/api/fleet-updates/{id}", s.handleAPIFleetUpdateGet)
|
||||||
|
|
||||||
r.Get("/api/users", s.handleAPIUsersList)
|
r.Get("/api/users", s.handleAPIUsersList)
|
||||||
r.Post("/api/users", s.handleAPIUserCreate)
|
r.Post("/api/users", s.handleAPIUserCreate)
|
||||||
r.Get("/api/users/{id}", s.handleAPIUserGet)
|
r.Get("/api/users/{id}", s.handleAPIUserGet)
|
||||||
@@ -283,6 +303,8 @@ func (s *Server) routes(r chi.Router) {
|
|||||||
if s.deps.UI != nil {
|
if s.deps.UI != nil {
|
||||||
r.Post("/hosts/{id}/delete", s.handleUIHostDelete)
|
r.Post("/hosts/{id}/delete", s.handleUIHostDelete)
|
||||||
r.Get("/settings", s.handleUISettings)
|
r.Get("/settings", s.handleUISettings)
|
||||||
|
r.Get("/settings/fleet-update", s.handleUIFleetUpdate)
|
||||||
|
r.Get("/settings/fleet-update/partial", s.handleUIFleetUpdatePartial)
|
||||||
r.Get("/settings/users", s.handleUIUsersList)
|
r.Get("/settings/users", s.handleUIUsersList)
|
||||||
r.Get("/settings/users/new", s.handleUIUserNewGet)
|
r.Get("/settings/users/new", s.handleUIUserNewGet)
|
||||||
r.Post("/settings/users/new", s.handleUIUserNewPost)
|
r.Post("/settings/users/new", s.handleUIUserNewPost)
|
||||||
@@ -321,6 +343,27 @@ func (s *Server) Shutdown(ctx context.Context) error {
|
|||||||
return s.srv.Shutdown(ctx)
|
return s.srv.Shutdown(ctx)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// SetFleetWorker installs the fleet-update worker post-construction.
|
||||||
|
// Used to break the wiring loop in cmd/server (the worker depends on a
|
||||||
|
// dispatcher that delegates back into the server's host-update path).
|
||||||
|
func (s *Server) SetFleetWorker(fw FleetWorker) { s.deps.FleetWorker = fw }
|
||||||
|
|
||||||
|
// DispatchHostUpdate is the public entry point for callers (the fleet
|
||||||
|
// worker) that need to drive the same dispatch path the HTTP handler
|
||||||
|
// uses, without going through HTTP. Returns the structured result so
|
||||||
|
// the caller can map error codes to its own status enum.
|
||||||
|
func (s *Server) DispatchHostUpdate(ctx context.Context, hostID, actorUserID string) (jobID string, code string, err error) {
|
||||||
|
var actorID *string
|
||||||
|
if actorUserID != "" {
|
||||||
|
actorID = &actorUserID
|
||||||
|
}
|
||||||
|
res := s.dispatchHostUpdate(ctx, hostID, "user", actorID)
|
||||||
|
if res.Code != "" {
|
||||||
|
return res.JobID, res.Code, nil
|
||||||
|
}
|
||||||
|
return res.JobID, "", nil
|
||||||
|
}
|
||||||
|
|
||||||
// Addr returns the configured listen address. Useful in tests when
|
// Addr returns the configured listen address. Useful in tests when
|
||||||
// the caller passes :0 to get a random port.
|
// the caller passes :0 to get a random port.
|
||||||
func (s *Server) Addr() string { return s.srv.Addr }
|
func (s *Server) Addr() string { return s.srv.Addr }
|
||||||
|
|||||||
@@ -23,6 +23,7 @@ import (
|
|||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ui"
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ui"
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ws"
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ws"
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/web"
|
"gitea.dcglab.co.uk/steve/restic-manager/web"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -155,6 +156,10 @@ type dashboardPage struct {
|
|||||||
// when it's already active). Pre-computed so the template stays
|
// when it's already active). Pre-computed so the template stays
|
||||||
// dumb.
|
// dumb.
|
||||||
SortURL map[string]string
|
SortURL map[string]string
|
||||||
|
// UpdatesBehind is the count of online hosts whose agent_version
|
||||||
|
// trails the server. Surfaces as the dashboard "N hosts behind"
|
||||||
|
// hero tile and links to ?updates=behind.
|
||||||
|
UpdatesBehind int
|
||||||
}
|
}
|
||||||
|
|
||||||
// dashboardFilter holds the parsed query-string filter state.
|
// dashboardFilter holds the parsed query-string filter state.
|
||||||
@@ -165,6 +170,10 @@ type dashboardFilter struct {
|
|||||||
Tag string // mirrors ActiveTag for round-trip on links
|
Tag string // mirrors ActiveTag for round-trip on links
|
||||||
Sort string // column key (see sortDashboard)
|
Sort string // column key (see sortDashboard)
|
||||||
Dir string // "asc" | "desc"
|
Dir string // "asc" | "desc"
|
||||||
|
// Updates narrows to hosts whose agent is behind the server's
|
||||||
|
// version. Only valid value today is "behind"; empty means no
|
||||||
|
// filter.
|
||||||
|
Updates string
|
||||||
}
|
}
|
||||||
|
|
||||||
// dashboardHostRow carries a host plus the per-row Run-now decision
|
// dashboardHostRow carries a host plus the per-row Run-now decision
|
||||||
@@ -180,6 +189,13 @@ type dashboardHostRow struct {
|
|||||||
// NextRun is the next-fire time of RunAllScheduleID (when set),
|
// NextRun is the next-fire time of RunAllScheduleID (when set),
|
||||||
// computed server-side from its cron. nil otherwise.
|
// computed server-side from its cron. nil otherwise.
|
||||||
NextRun *time.Time
|
NextRun *time.Time
|
||||||
|
// UpdateAvailable is true when the host's agent has connected at
|
||||||
|
// least once AND its agent_version differs from the server's. Used
|
||||||
|
// by the host_row partial to render the update-available chip.
|
||||||
|
UpdateAvailable bool
|
||||||
|
// TargetVersion is the server's build version, surfaced in the
|
||||||
|
// chip's tooltip and label.
|
||||||
|
TargetVersion string
|
||||||
}
|
}
|
||||||
|
|
||||||
// pickRunAllSchedule returns the ID of the single schedule whose
|
// pickRunAllSchedule returns the ID of the single schedule whose
|
||||||
@@ -255,7 +271,11 @@ func (s *Server) handleUIDashboard(w stdhttp.ResponseWriter, r *stdhttp.Request)
|
|||||||
// calls per host — fine at fleet sizes we care about.
|
// calls per host — fine at fleet sizes we care about.
|
||||||
rows := make([]dashboardHostRow, 0, len(hosts))
|
rows := make([]dashboardHostRow, 0, len(hosts))
|
||||||
for _, h := range hosts {
|
for _, h := range hosts {
|
||||||
row := dashboardHostRow{Host: h}
|
row := dashboardHostRow{
|
||||||
|
Host: h,
|
||||||
|
TargetVersion: version.Version,
|
||||||
|
UpdateAvailable: h.AgentVersion != "" && h.AgentVersion != version.Version,
|
||||||
|
}
|
||||||
groups, gerr := s.deps.Store.ListSourceGroupsByHost(r.Context(), h.ID)
|
groups, gerr := s.deps.Store.ListSourceGroupsByHost(r.Context(), h.ID)
|
||||||
if gerr != nil {
|
if gerr != nil {
|
||||||
slog.Warn("ui dashboard: list source groups", "host_id", h.ID, "err", gerr)
|
slog.Warn("ui dashboard: list source groups", "host_id", h.ID, "err", gerr)
|
||||||
@@ -289,6 +309,13 @@ func (s *Server) handleUIDashboard(w stdhttp.ResponseWriter, r *stdhttp.Request)
|
|||||||
critOpenCount = len(crit)
|
critOpenCount = len(crit)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
updatesBehind := 0
|
||||||
|
for _, h := range allHosts {
|
||||||
|
if h.Status == "online" && h.AgentVersion != "" && h.AgentVersion != version.Version {
|
||||||
|
updatesBehind++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
view := s.baseView(r, u)
|
view := s.baseView(r, u)
|
||||||
view.Page = dashboardPage{
|
view.Page = dashboardPage{
|
||||||
Hosts: rows,
|
Hosts: rows,
|
||||||
@@ -302,6 +329,7 @@ func (s *Server) handleUIDashboard(w stdhttp.ResponseWriter, r *stdhttp.Request)
|
|||||||
Filter: filter,
|
Filter: filter,
|
||||||
RefreshURL: "/?" + filter.encode(),
|
RefreshURL: "/?" + filter.encode(),
|
||||||
SortURL: buildDashboardSortURLs(filter),
|
SortURL: buildDashboardSortURLs(filter),
|
||||||
|
UpdatesBehind: updatesBehind,
|
||||||
}
|
}
|
||||||
if err := s.deps.UI.Render(w, "dashboard", view); err != nil {
|
if err := s.deps.UI.Render(w, "dashboard", view); err != nil {
|
||||||
slog.Error("ui: render dashboard", "err", err)
|
slog.Error("ui: render dashboard", "err", err)
|
||||||
@@ -320,6 +348,7 @@ func parseDashboardFilter(q url.Values) dashboardFilter {
|
|||||||
Tag: q.Get("tag"),
|
Tag: q.Get("tag"),
|
||||||
Sort: q.Get("sort"),
|
Sort: q.Get("sort"),
|
||||||
Dir: q.Get("dir"),
|
Dir: q.Get("dir"),
|
||||||
|
Updates: q.Get("updates"),
|
||||||
}
|
}
|
||||||
if f.Sort == "" {
|
if f.Sort == "" {
|
||||||
f.Sort = "name"
|
f.Sort = "name"
|
||||||
@@ -352,6 +381,9 @@ func (f dashboardFilter) encode() string {
|
|||||||
if f.Dir != "" && f.Dir != "asc" {
|
if f.Dir != "" && f.Dir != "asc" {
|
||||||
v.Set("dir", f.Dir)
|
v.Set("dir", f.Dir)
|
||||||
}
|
}
|
||||||
|
if f.Updates != "" {
|
||||||
|
v.Set("updates", f.Updates)
|
||||||
|
}
|
||||||
return v.Encode()
|
return v.Encode()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -402,6 +434,11 @@ func filterAndSortDashboardHosts(hosts []store.Host, f dashboardFilter) []store.
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if f.Updates == "behind" {
|
||||||
|
if h.AgentVersion == "" || h.AgentVersion == version.Version {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
out = append(out, h)
|
out = append(out, h)
|
||||||
}
|
}
|
||||||
sortDashboardHosts(out, f.Sort, f.Dir)
|
sortDashboardHosts(out, f.Sort, f.Dir)
|
||||||
@@ -809,6 +846,20 @@ type hostChromeData struct {
|
|||||||
SourceGroupCount int
|
SourceGroupCount int
|
||||||
ScheduleCount int
|
ScheduleCount int
|
||||||
ScheduleVersion int64 // host_schedule_version (latest desired)
|
ScheduleVersion int64 // host_schedule_version (latest desired)
|
||||||
|
// UpdateAvailable + TargetVersion drive the agent-out-of-date chip
|
||||||
|
// in the host detail header. UpdateAvailable is true iff the host
|
||||||
|
// has connected at least once AND its agent_version != server's.
|
||||||
|
UpdateAvailable bool
|
||||||
|
TargetVersion string
|
||||||
|
// Online + UpdateInProgress drive the per-host "Update agent"
|
||||||
|
// button on host_detail. Online mirrors hub.Connected; pulled here
|
||||||
|
// so the button can disable when the host is unreachable.
|
||||||
|
Online bool
|
||||||
|
UpdateInProgress bool
|
||||||
|
// CanAdmin is true when the viewing user has admin role; used to
|
||||||
|
// gate the "Update agent" button. Kept on the chrome struct so any
|
||||||
|
// page reusing host_chrome already has it for free.
|
||||||
|
CanAdmin bool
|
||||||
// KnownTags is the union of tags already in use across the fleet,
|
// KnownTags is the union of tags already in use across the fleet,
|
||||||
// used for autocomplete on the host-tags edit form. Cheap query.
|
// used for autocomplete on the host-tags edit form. Cheap query.
|
||||||
KnownTags []string
|
KnownTags []string
|
||||||
@@ -834,6 +885,14 @@ type hostChromeData struct {
|
|||||||
// render the page with stale counts than 500 the whole tab.
|
// render the page with stale counts than 500 the whole tab.
|
||||||
func (s *Server) loadHostChrome(r *stdhttp.Request, host store.Host, subtab, crumb string) hostChromeData {
|
func (s *Server) loadHostChrome(r *stdhttp.Request, host store.Host, subtab, crumb string) hostChromeData {
|
||||||
d := hostChromeData{Host: host, SubTab: subtab, Crumb: crumb}
|
d := hostChromeData{Host: host, SubTab: subtab, Crumb: crumb}
|
||||||
|
d.TargetVersion = version.Version
|
||||||
|
d.UpdateAvailable = host.AgentVersion != "" && host.AgentVersion != version.Version
|
||||||
|
if s.deps.Hub != nil {
|
||||||
|
d.Online = s.deps.Hub.Connected(host.ID)
|
||||||
|
}
|
||||||
|
if existing, _ := s.deps.Store.RunningUpdateJobForHost(r.Context(), host.ID); existing != "" {
|
||||||
|
d.UpdateInProgress = true
|
||||||
|
}
|
||||||
if groups, err := s.deps.Store.ListSourceGroupsByHost(r.Context(), host.ID); err == nil {
|
if groups, err := s.deps.Store.ListSourceGroupsByHost(r.Context(), host.ID); err == nil {
|
||||||
d.SourceGroupCount = len(groups)
|
d.SourceGroupCount = len(groups)
|
||||||
} else {
|
} else {
|
||||||
@@ -972,8 +1031,10 @@ func (s *Server) handleUIHostDetail(w stdhttp.ResponseWriter, r *stdhttp.Request
|
|||||||
|
|
||||||
view := s.baseView(r, u)
|
view := s.baseView(r, u)
|
||||||
view.Title = host.Name + " · restic-manager"
|
view.Title = host.Name + " · restic-manager"
|
||||||
|
chrome := s.loadHostChrome(r, *host, "snapshots", "snapshots")
|
||||||
|
chrome.CanAdmin = u.Role == string(store.RoleAdmin)
|
||||||
view.Page = hostDetailPage{
|
view.Page = hostDetailPage{
|
||||||
hostChromeData: s.loadHostChrome(r, *host, "snapshots", "snapshots"),
|
hostChromeData: chrome,
|
||||||
Snapshots: shown,
|
Snapshots: shown,
|
||||||
SnapshotsShown: len(shown),
|
SnapshotsShown: len(shown),
|
||||||
LegacyRestic: !restic.Env{Version: host.ResticVersion}.AtLeastVersion(0, 17),
|
LegacyRestic: !restic.Env{Version: host.ResticVersion}.AtLeastVersion(0, 17),
|
||||||
|
|||||||
@@ -0,0 +1,20 @@
|
|||||||
|
package http
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
stdhttp "net/http"
|
||||||
|
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
|
||||||
|
)
|
||||||
|
|
||||||
|
// handleVersion exposes the server's build-time identifying constants
|
||||||
|
// (set via -ldflags). Public-band — no secrets surface here, the agent
|
||||||
|
// updater compares its own agent_version byte-for-byte against the
|
||||||
|
// Version field to drive the "out of date" signal.
|
||||||
|
func (s *Server) handleVersion(w stdhttp.ResponseWriter, r *stdhttp.Request) {
|
||||||
|
w.Header().Set("Content-Type", "application/json")
|
||||||
|
_ = json.NewEncoder(w).Encode(map[string]string{
|
||||||
|
"version": version.Version,
|
||||||
|
"commit": version.Commit,
|
||||||
|
})
|
||||||
|
}
|
||||||
@@ -0,0 +1,42 @@
|
|||||||
|
package http
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
stdhttp "net/http"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestVersionEndpoint(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
prevV, prevC := version.Version, version.Commit
|
||||||
|
version.Version = "v9.9.9-test"
|
||||||
|
version.Commit = "abc1234"
|
||||||
|
t.Cleanup(func() {
|
||||||
|
version.Version = prevV
|
||||||
|
version.Commit = prevC
|
||||||
|
})
|
||||||
|
|
||||||
|
_, url, _ := newTestServerWithHub(t)
|
||||||
|
|
||||||
|
res, err := stdhttp.Get(url + "/api/version")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("get: %v", err)
|
||||||
|
}
|
||||||
|
defer res.Body.Close()
|
||||||
|
if res.StatusCode != stdhttp.StatusOK {
|
||||||
|
t.Fatalf("status: got %d want 200", res.StatusCode)
|
||||||
|
}
|
||||||
|
var body map[string]string
|
||||||
|
if err := json.NewDecoder(res.Body).Decode(&body); err != nil {
|
||||||
|
t.Fatalf("decode: %v", err)
|
||||||
|
}
|
||||||
|
if body["version"] != "v9.9.9-test" {
|
||||||
|
t.Fatalf("version: got %q", body["version"])
|
||||||
|
}
|
||||||
|
if body["commit"] != "abc1234" {
|
||||||
|
t.Fatalf("commit: got %q", body["commit"])
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -108,6 +108,8 @@ func New() (*Renderer, error) {
|
|||||||
"templates/partials/tree_node.html",
|
"templates/partials/tree_node.html",
|
||||||
"templates/partials/alert_row.html",
|
"templates/partials/alert_row.html",
|
||||||
"templates/partials/crit_banner.html",
|
"templates/partials/crit_banner.html",
|
||||||
|
"templates/partials/fleet_update_inner.html",
|
||||||
|
"templates/partials/host_update_chip.html",
|
||||||
}
|
}
|
||||||
|
|
||||||
pageEntries, err := fs.Glob(web.FS, "templates/pages/*.html")
|
pageEntries, err := fs.Glob(web.FS, "templates/pages/*.html")
|
||||||
|
|||||||
@@ -16,6 +16,7 @@ import (
|
|||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/auth"
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/auth"
|
||||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
|
||||||
)
|
)
|
||||||
|
|
||||||
// HandlerDeps is the set of collaborators the agent WS handler needs.
|
// HandlerDeps is the set of collaborators the agent WS handler needs.
|
||||||
@@ -26,6 +27,9 @@ type HandlerDeps struct {
|
|||||||
// AlertEngine receives job-finished and host-online events so the
|
// AlertEngine receives job-finished and host-online events so the
|
||||||
// alert engine can evaluate its rules. Optional; nil = no-op.
|
// alert engine can evaluate its rules. Optional; nil = no-op.
|
||||||
AlertEngine *alert.Engine
|
AlertEngine *alert.Engine
|
||||||
|
// UpdateWatcher reconciles in-flight agent-update dispatches against
|
||||||
|
// hello envelopes. Optional; nil = no-op.
|
||||||
|
UpdateWatcher *UpdateWatcher
|
||||||
// OnHello is called once per successful hello, after the host row
|
// OnHello is called once per successful hello, after the host row
|
||||||
// has been touched and the conn registered. Used by the HTTP
|
// has been touched and the conn registered. Used by the HTTP
|
||||||
// layer to push host_credentials down as a config.update before
|
// layer to push host_credentials down as a config.update before
|
||||||
@@ -147,6 +151,9 @@ func runAgentLoop(ctx context.Context, c *Conn, hostID string, deps HandlerDeps)
|
|||||||
if deps.AlertEngine != nil {
|
if deps.AlertEngine != nil {
|
||||||
deps.AlertEngine.NotifyHostOnline(hostID)
|
deps.AlertEngine.NotifyHostOnline(hostID)
|
||||||
}
|
}
|
||||||
|
if deps.UpdateWatcher != nil {
|
||||||
|
deps.UpdateWatcher.OnHello(ctx, hostID, helloPayload.AgentVersion, version.Version)
|
||||||
|
}
|
||||||
|
|
||||||
deps.Hub.Register(hostID, c)
|
deps.Hub.Register(hostID, c)
|
||||||
defer deps.Hub.Unregister(hostID, c)
|
defer deps.Hub.Unregister(hostID, c)
|
||||||
|
|||||||
@@ -0,0 +1,151 @@
|
|||||||
|
package ws
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"log/slog"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||||
|
)
|
||||||
|
|
||||||
|
// updateTimeout bounds how long the watcher waits for an agent to come
|
||||||
|
// back with its new version after a command.update dispatch. var (not
|
||||||
|
// const) so tests can shrink it.
|
||||||
|
var updateTimeout = 90 * time.Second
|
||||||
|
|
||||||
|
// AlertRaiser is the narrow slice of *alert.Engine that the update watcher
// needs. It is declared here rather than in the alert package so the
// dependency arrow points the right way.
type AlertRaiser interface {
	// RaiseUpdateFailed reports a failed update for hostID/jobID with a
	// free-text reason, timestamped at when.
	RaiseUpdateFailed(ctx context.Context, hostID, jobID, reason string, when time.Time)
	// ResolveUpdateFailed is called once a host has come back on the target
	// version, to clear any open update-failed alert for hostID.
	ResolveUpdateFailed(ctx context.Context, hostID string, when time.Time)
}
|
||||||
|
|
||||||
|
// UpdateWatcher tracks in-flight agent-update dispatches and reconciles
|
||||||
|
// them against incoming hello envelopes. Entries land on Track and
|
||||||
|
// resolve via OnHello (success path) or the periodic sweep (timeout).
|
||||||
|
type UpdateWatcher struct {
|
||||||
|
store *store.Store
|
||||||
|
alerts AlertRaiser
|
||||||
|
|
||||||
|
mu sync.Mutex
|
||||||
|
entries map[string]*updateEntry // hostID → entry
|
||||||
|
|
||||||
|
tickPeriod time.Duration
|
||||||
|
}
|
||||||
|
|
||||||
|
type updateEntry struct {
|
||||||
|
jobID string
|
||||||
|
startedAt time.Time
|
||||||
|
// terminated is set once the entry has reached a terminal state so
|
||||||
|
// late OnHellos don't resurrect it.
|
||||||
|
terminated bool
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewUpdateWatcher builds an unstarted watcher. Call Run in a goroutine
|
||||||
|
// to start the periodic sweep.
|
||||||
|
func NewUpdateWatcher(st *store.Store, alerts AlertRaiser) *UpdateWatcher {
|
||||||
|
return &UpdateWatcher{
|
||||||
|
store: st,
|
||||||
|
alerts: alerts,
|
||||||
|
entries: make(map[string]*updateEntry),
|
||||||
|
tickPeriod: 5 * time.Second,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Track registers a freshly-dispatched update job. A subsequent Track
|
||||||
|
// for the same host replaces the prior entry (last-write-wins).
|
||||||
|
func (w *UpdateWatcher) Track(jobID, hostID string) {
|
||||||
|
if w == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
w.mu.Lock()
|
||||||
|
w.entries[hostID] = &updateEntry{jobID: jobID, startedAt: time.Now()}
|
||||||
|
w.mu.Unlock()
|
||||||
|
}
|
||||||
|
|
||||||
|
// OnHello is called by the WS handler after a successful hello has been
|
||||||
|
// persisted. If a tracked update for the host matches the targetVersion,
|
||||||
|
// the job is marked succeeded and any open update_failed alert is
|
||||||
|
// auto-resolved. A non-matching version is a no-op (the watcher keeps
|
||||||
|
// waiting until the timeout).
|
||||||
|
func (w *UpdateWatcher) OnHello(ctx context.Context, hostID, agentVersion, targetVersion string) {
|
||||||
|
if w == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
w.mu.Lock()
|
||||||
|
e, ok := w.entries[hostID]
|
||||||
|
if !ok || e.terminated {
|
||||||
|
w.mu.Unlock()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if agentVersion != targetVersion {
|
||||||
|
// Not the version we asked for — keep waiting.
|
||||||
|
w.mu.Unlock()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
e.terminated = true
|
||||||
|
jobID := e.jobID
|
||||||
|
delete(w.entries, hostID)
|
||||||
|
w.mu.Unlock()
|
||||||
|
|
||||||
|
now := time.Now().UTC()
|
||||||
|
if err := w.store.MarkJobFinished(ctx, jobID, "succeeded", 0, nil, "", now); err != nil {
|
||||||
|
slog.Warn("ws update watcher: mark succeeded", "job_id", jobID, "host_id", hostID, "err", err)
|
||||||
|
}
|
||||||
|
if w.alerts != nil {
|
||||||
|
w.alerts.ResolveUpdateFailed(ctx, hostID, now)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Run drives the periodic sweep. Returns when ctx is done.
|
||||||
|
func (w *UpdateWatcher) Run(ctx context.Context) {
|
||||||
|
if w == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
t := time.NewTicker(w.tickPeriod)
|
||||||
|
defer t.Stop()
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return
|
||||||
|
case now := <-t.C:
|
||||||
|
w.sweep(ctx, now)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *UpdateWatcher) sweep(ctx context.Context, now time.Time) {
|
||||||
|
type expired struct {
|
||||||
|
hostID string
|
||||||
|
jobID string
|
||||||
|
age time.Duration
|
||||||
|
}
|
||||||
|
var toFail []expired
|
||||||
|
w.mu.Lock()
|
||||||
|
for hostID, e := range w.entries {
|
||||||
|
if e.terminated {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if now.Sub(e.startedAt) >= updateTimeout {
|
||||||
|
toFail = append(toFail, expired{hostID: hostID, jobID: e.jobID, age: now.Sub(e.startedAt)})
|
||||||
|
e.terminated = true
|
||||||
|
delete(w.entries, hostID)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
w.mu.Unlock()
|
||||||
|
|
||||||
|
for _, x := range toFail {
|
||||||
|
reason := fmt.Sprintf("timeout: agent did not reconnect within %s", updateTimeout)
|
||||||
|
stamp := now.UTC()
|
||||||
|
errMsg := reason
|
||||||
|
if err := w.store.MarkJobFinished(ctx, x.jobID, "failed", -1, nil, errMsg, stamp); err != nil {
|
||||||
|
slog.Warn("ws update watcher: mark failed", "job_id", x.jobID, "host_id", x.hostID, "err", err)
|
||||||
|
}
|
||||||
|
if w.alerts != nil {
|
||||||
|
w.alerts.RaiseUpdateFailed(ctx, x.hostID, x.jobID, reason, stamp)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,161 @@
|
|||||||
|
package ws
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"sync"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/oklog/ulid/v2"
|
||||||
|
|
||||||
|
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||||
|
)
|
||||||
|
|
||||||
|
// fakeAlerts is an in-memory AlertRaiser double that records every call
// so tests can assert on them afterwards.
type fakeAlerts struct {
	mu sync.Mutex
	// raised holds the hostID of each RaiseUpdateFailed call, in order.
	raised []string
	// resolved holds the hostID of each ResolveUpdateFailed call, in order.
	resolved []string
	// reasons holds the reason of each RaiseUpdateFailed call, index-aligned
	// with raised.
	reasons []string
}

// RaiseUpdateFailed records the host and reason; the context, job id and
// timestamp are ignored.
func (f *fakeAlerts) RaiseUpdateFailed(_ context.Context, hostID string, _ string, reason string, _ time.Time) {
	f.mu.Lock()
	f.raised = append(f.raised, hostID)
	f.reasons = append(f.reasons, reason)
	f.mu.Unlock()
}

// ResolveUpdateFailed records the host; the context and timestamp are
// ignored.
func (f *fakeAlerts) ResolveUpdateFailed(_ context.Context, hostID string, _ time.Time) {
	f.mu.Lock()
	f.resolved = append(f.resolved, hostID)
	f.mu.Unlock()
}
|
||||||
|
|
||||||
|
func seedJob(t *testing.T, st *store.Store, hostID string) string {
|
||||||
|
t.Helper()
|
||||||
|
jobID := ulid.Make().String()
|
||||||
|
if err := st.CreateJob(context.Background(), store.Job{
|
||||||
|
ID: jobID, HostID: hostID, Kind: "update",
|
||||||
|
ActorKind: "user", CreatedAt: time.Now().UTC(),
|
||||||
|
}); err != nil {
|
||||||
|
t.Fatalf("create job: %v", err)
|
||||||
|
}
|
||||||
|
return jobID
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestUpdateWatcherOnHelloSuccess(t *testing.T) {
|
||||||
|
st := openWSTestStore(t)
|
||||||
|
hostID := ulid.Make().String()
|
||||||
|
seedHostWS(t, st, hostID)
|
||||||
|
jobID := seedJob(t, st, hostID)
|
||||||
|
|
||||||
|
a := &fakeAlerts{}
|
||||||
|
w := NewUpdateWatcher(st, a)
|
||||||
|
w.Track(jobID, hostID)
|
||||||
|
|
||||||
|
w.OnHello(context.Background(), hostID, "v2", "v2")
|
||||||
|
|
||||||
|
job, err := st.GetJob(context.Background(), jobID)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("get job: %v", err)
|
||||||
|
}
|
||||||
|
if job.Status != "succeeded" {
|
||||||
|
t.Fatalf("status: got %q want succeeded", job.Status)
|
||||||
|
}
|
||||||
|
a.mu.Lock()
|
||||||
|
defer a.mu.Unlock()
|
||||||
|
if len(a.resolved) != 1 || a.resolved[0] != hostID {
|
||||||
|
t.Fatalf("resolve calls: %v", a.resolved)
|
||||||
|
}
|
||||||
|
if len(a.raised) != 0 {
|
||||||
|
t.Fatalf("unexpected raises: %v", a.raised)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestUpdateWatcherTimeout(t *testing.T) {
|
||||||
|
prev := updateTimeout
|
||||||
|
updateTimeout = 50 * time.Millisecond
|
||||||
|
t.Cleanup(func() { updateTimeout = prev })
|
||||||
|
|
||||||
|
st := openWSTestStore(t)
|
||||||
|
hostID := ulid.Make().String()
|
||||||
|
seedHostWS(t, st, hostID)
|
||||||
|
jobID := seedJob(t, st, hostID)
|
||||||
|
|
||||||
|
a := &fakeAlerts{}
|
||||||
|
w := NewUpdateWatcher(st, a)
|
||||||
|
w.Track(jobID, hostID)
|
||||||
|
|
||||||
|
time.Sleep(80 * time.Millisecond)
|
||||||
|
w.sweep(context.Background(), time.Now())
|
||||||
|
|
||||||
|
job, err := st.GetJob(context.Background(), jobID)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("get job: %v", err)
|
||||||
|
}
|
||||||
|
if job.Status != "failed" {
|
||||||
|
t.Fatalf("status: got %q want failed", job.Status)
|
||||||
|
}
|
||||||
|
a.mu.Lock()
|
||||||
|
defer a.mu.Unlock()
|
||||||
|
if len(a.raised) != 1 || a.raised[0] != hostID {
|
||||||
|
t.Fatalf("raise calls: %v", a.raised)
|
||||||
|
}
|
||||||
|
if len(a.reasons) == 0 || a.reasons[0] == "" {
|
||||||
|
t.Fatalf("missing reason")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestUpdateWatcherMismatchedVersionNoOp(t *testing.T) {
|
||||||
|
st := openWSTestStore(t)
|
||||||
|
hostID := ulid.Make().String()
|
||||||
|
seedHostWS(t, st, hostID)
|
||||||
|
jobID := seedJob(t, st, hostID)
|
||||||
|
|
||||||
|
a := &fakeAlerts{}
|
||||||
|
w := NewUpdateWatcher(st, a)
|
||||||
|
w.Track(jobID, hostID)
|
||||||
|
|
||||||
|
w.OnHello(context.Background(), hostID, "v1", "v2")
|
||||||
|
|
||||||
|
job, _ := st.GetJob(context.Background(), jobID)
|
||||||
|
if job.Status == "succeeded" || job.Status == "failed" {
|
||||||
|
t.Fatalf("status flipped on mismatched hello: %q", job.Status)
|
||||||
|
}
|
||||||
|
a.mu.Lock()
|
||||||
|
defer a.mu.Unlock()
|
||||||
|
if len(a.raised) != 0 || len(a.resolved) != 0 {
|
||||||
|
t.Fatalf("unexpected alert calls raised=%v resolved=%v", a.raised, a.resolved)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestUpdateWatcherHelloAfterTimeoutIsNoOp(t *testing.T) {
|
||||||
|
prev := updateTimeout
|
||||||
|
updateTimeout = 50 * time.Millisecond
|
||||||
|
t.Cleanup(func() { updateTimeout = prev })
|
||||||
|
|
||||||
|
st := openWSTestStore(t)
|
||||||
|
hostID := ulid.Make().String()
|
||||||
|
seedHostWS(t, st, hostID)
|
||||||
|
jobID := seedJob(t, st, hostID)
|
||||||
|
|
||||||
|
a := &fakeAlerts{}
|
||||||
|
w := NewUpdateWatcher(st, a)
|
||||||
|
w.Track(jobID, hostID)
|
||||||
|
|
||||||
|
time.Sleep(80 * time.Millisecond)
|
||||||
|
w.sweep(context.Background(), time.Now())
|
||||||
|
|
||||||
|
// Hello arrives after sweep — entry already gone, must be no-op.
|
||||||
|
w.OnHello(context.Background(), hostID, "v2", "v2")
|
||||||
|
|
||||||
|
job, _ := st.GetJob(context.Background(), jobID)
|
||||||
|
if job.Status != "failed" {
|
||||||
|
t.Fatalf("status flipped from failed → %q", job.Status)
|
||||||
|
}
|
||||||
|
a.mu.Lock()
|
||||||
|
defer a.mu.Unlock()
|
||||||
|
if len(a.resolved) != 0 {
|
||||||
|
t.Fatalf("late hello triggered ResolveUpdateFailed: %v", a.resolved)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -77,6 +77,56 @@ func (s *Store) RaiseOrTouch(ctx context.Context, hostID, kind, dedupKey, severi
|
|||||||
return id, true, nil
|
return id, true, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// RaiseOrTouchSystem is the host-less variant of RaiseOrTouch — the
|
||||||
|
// alert row's host_id is stored as NULL, so the FK to hosts is bypassed.
|
||||||
|
// Used by fleet-wide alerts (e.g. fleet_update_halted) where the
|
||||||
|
// failure surface isn't pinned to a single host.
|
||||||
|
func (s *Store) RaiseOrTouchSystem(ctx context.Context, kind, dedupKey, severity, message string, when time.Time) (id string, didRaise bool, err error) {
|
||||||
|
tx, err := s.db.BeginTx(ctx, nil)
|
||||||
|
if err != nil {
|
||||||
|
return "", false, fmt.Errorf("store: begin: %w", err)
|
||||||
|
}
|
||||||
|
defer func() { _ = tx.Rollback() }()
|
||||||
|
|
||||||
|
row := tx.QueryRowContext(ctx,
|
||||||
|
`SELECT id FROM alerts
|
||||||
|
WHERE host_id IS NULL AND kind = ? AND dedup_key = ? AND resolved_at IS NULL
|
||||||
|
LIMIT 1`,
|
||||||
|
kind, dedupKey)
|
||||||
|
var existing string
|
||||||
|
switch err := row.Scan(&existing); {
|
||||||
|
case err == nil:
|
||||||
|
_, uerr := tx.ExecContext(ctx,
|
||||||
|
`UPDATE alerts SET last_seen_at = ?, message = ? WHERE id = ?`,
|
||||||
|
when.UTC().Format(time.RFC3339Nano), message, existing)
|
||||||
|
if uerr != nil {
|
||||||
|
return "", false, fmt.Errorf("store: touch alert: %w", uerr)
|
||||||
|
}
|
||||||
|
if err := tx.Commit(); err != nil {
|
||||||
|
return "", false, err
|
||||||
|
}
|
||||||
|
return existing, false, nil
|
||||||
|
case errors.Is(err, sql.ErrNoRows):
|
||||||
|
// fall through to insert
|
||||||
|
default:
|
||||||
|
return "", false, fmt.Errorf("store: lookup alert: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
id = ulid.Make().String()
|
||||||
|
whenStr := when.UTC().Format(time.RFC3339Nano)
|
||||||
|
_, err = tx.ExecContext(ctx,
|
||||||
|
`INSERT INTO alerts (id, host_id, kind, dedup_key, severity, message, created_at, last_seen_at)
|
||||||
|
VALUES (?, NULL, ?, ?, ?, ?, ?, ?)`,
|
||||||
|
id, kind, dedupKey, severity, message, whenStr, whenStr)
|
||||||
|
if err != nil {
|
||||||
|
return "", false, fmt.Errorf("store: insert alert: %w", err)
|
||||||
|
}
|
||||||
|
if err := tx.Commit(); err != nil {
|
||||||
|
return "", false, err
|
||||||
|
}
|
||||||
|
return id, true, nil
|
||||||
|
}
|
||||||
|
|
||||||
// refreshHostOpenAlertCount recomputes hosts.open_alert_count from the
|
// refreshHostOpenAlertCount recomputes hosts.open_alert_count from the
|
||||||
// alerts table for one host. Self-healing: idempotent and survives
|
// alerts table for one host. Self-healing: idempotent and survives
|
||||||
// out-of-order edits. Best-effort — errors are returned but callers
|
// out-of-order edits. Best-effort — errors are returned but callers
|
||||||
|
|||||||
@@ -0,0 +1,258 @@
|
|||||||
|
package store
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ErrFleetUpdateRunning is returned by CreateFleetUpdate if another
|
||||||
|
// fleet update is already in 'running' state. The HTTP layer surfaces
|
||||||
|
// this as a 409 with a structured error code.
|
||||||
|
var ErrFleetUpdateRunning = errors.New("store: fleet update already running")
|
||||||
|
|
||||||
|
// CreateFleetUpdate inserts the parent row and one pending child per
|
||||||
|
// hostID, in the order given (position = index). Returns
|
||||||
|
// ErrFleetUpdateRunning if a fleet update is already in flight.
|
||||||
|
func (st *Store) CreateFleetUpdate(ctx context.Context, fu FleetUpdate, hostIDs []string) error {
|
||||||
|
if fu.ID == "" || fu.StartedByUserID == "" || fu.TargetVersion == "" {
|
||||||
|
return errors.New("store: fleet update id, user_id, target_version required")
|
||||||
|
}
|
||||||
|
if fu.Status == "" {
|
||||||
|
fu.Status = "running"
|
||||||
|
}
|
||||||
|
if fu.StartedAt.IsZero() {
|
||||||
|
fu.StartedAt = time.Now().UTC()
|
||||||
|
}
|
||||||
|
tx, err := st.db.BeginTx(ctx, nil)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("store: begin: %w", err)
|
||||||
|
}
|
||||||
|
defer func() { _ = tx.Rollback() }()
|
||||||
|
|
||||||
|
var existing string
|
||||||
|
if err := tx.QueryRowContext(ctx,
|
||||||
|
`SELECT id FROM fleet_updates WHERE status = 'running' LIMIT 1`).
|
||||||
|
Scan(&existing); err == nil {
|
||||||
|
return fmt.Errorf("%w: %s", ErrFleetUpdateRunning, existing)
|
||||||
|
} else if !errors.Is(err, sql.ErrNoRows) {
|
||||||
|
return fmt.Errorf("store: check active fleet update: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if _, err := tx.ExecContext(ctx,
|
||||||
|
`INSERT INTO fleet_updates (id, started_at, started_by_user_id, target_version, status)
|
||||||
|
VALUES (?, ?, ?, ?, ?)`,
|
||||||
|
fu.ID, fu.StartedAt.UTC().Format(time.RFC3339Nano), fu.StartedByUserID, fu.TargetVersion, fu.Status,
|
||||||
|
); err != nil {
|
||||||
|
return fmt.Errorf("store: insert fleet_updates: %w", err)
|
||||||
|
}
|
||||||
|
for i, hid := range hostIDs {
|
||||||
|
if _, err := tx.ExecContext(ctx,
|
||||||
|
`INSERT INTO fleet_update_hosts (fleet_update_id, host_id, position, status)
|
||||||
|
VALUES (?, ?, ?, 'pending')`,
|
||||||
|
fu.ID, hid, i,
|
||||||
|
); err != nil {
|
||||||
|
return fmt.Errorf("store: insert fleet_update_hosts: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return tx.Commit()
|
||||||
|
}
|
||||||
|
|
||||||
|
// ActiveFleetUpdate returns the currently-running fleet update or nil.
|
||||||
|
func (st *Store) ActiveFleetUpdate(ctx context.Context) (*FleetUpdate, error) {
|
||||||
|
var fu FleetUpdate
|
||||||
|
var startedAt string
|
||||||
|
var current sql.NullString
|
||||||
|
var halted sql.NullString
|
||||||
|
var completedAt sql.NullString
|
||||||
|
err := st.db.QueryRowContext(ctx,
|
||||||
|
`SELECT id, started_at, started_by_user_id, target_version, status,
|
||||||
|
current_host_id, halted_reason, completed_at
|
||||||
|
FROM fleet_updates WHERE status = 'running' LIMIT 1`).
|
||||||
|
Scan(&fu.ID, &startedAt, &fu.StartedByUserID, &fu.TargetVersion, &fu.Status,
|
||||||
|
¤t, &halted, &completedAt)
|
||||||
|
if errors.Is(err, sql.ErrNoRows) {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("store: active fleet update: %w", err)
|
||||||
|
}
|
||||||
|
fu.StartedAt, _ = time.Parse(time.RFC3339Nano, startedAt)
|
||||||
|
fu.CurrentHostID = current.String
|
||||||
|
fu.HaltedReason = halted.String
|
||||||
|
if completedAt.Valid {
|
||||||
|
t, _ := time.Parse(time.RFC3339Nano, completedAt.String)
|
||||||
|
fu.CompletedAt = &t
|
||||||
|
}
|
||||||
|
return &fu, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetFleetUpdate hydrates parent + ordered child rows. Returns
|
||||||
|
// ErrNotFound on missing id.
|
||||||
|
func (st *Store) GetFleetUpdate(ctx context.Context, id string) (*FleetUpdate, []FleetUpdateHost, error) {
|
||||||
|
var fu FleetUpdate
|
||||||
|
var startedAt string
|
||||||
|
var current sql.NullString
|
||||||
|
var halted sql.NullString
|
||||||
|
var completedAt sql.NullString
|
||||||
|
err := st.db.QueryRowContext(ctx,
|
||||||
|
`SELECT id, started_at, started_by_user_id, target_version, status,
|
||||||
|
current_host_id, halted_reason, completed_at
|
||||||
|
FROM fleet_updates WHERE id = ?`, id).
|
||||||
|
Scan(&fu.ID, &startedAt, &fu.StartedByUserID, &fu.TargetVersion, &fu.Status,
|
||||||
|
¤t, &halted, &completedAt)
|
||||||
|
if errors.Is(err, sql.ErrNoRows) {
|
||||||
|
return nil, nil, ErrNotFound
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil, fmt.Errorf("store: get fleet update: %w", err)
|
||||||
|
}
|
||||||
|
fu.StartedAt, _ = time.Parse(time.RFC3339Nano, startedAt)
|
||||||
|
fu.CurrentHostID = current.String
|
||||||
|
fu.HaltedReason = halted.String
|
||||||
|
if completedAt.Valid {
|
||||||
|
t, _ := time.Parse(time.RFC3339Nano, completedAt.String)
|
||||||
|
fu.CompletedAt = &t
|
||||||
|
}
|
||||||
|
|
||||||
|
rows, err := st.db.QueryContext(ctx,
|
||||||
|
`SELECT host_id, position, status, COALESCE(job_id, ''), COALESCE(failed_reason, '')
|
||||||
|
FROM fleet_update_hosts
|
||||||
|
WHERE fleet_update_id = ?
|
||||||
|
ORDER BY position`, id)
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil, fmt.Errorf("store: list fleet hosts: %w", err)
|
||||||
|
}
|
||||||
|
defer func() { _ = rows.Close() }()
|
||||||
|
out := []FleetUpdateHost{}
|
||||||
|
for rows.Next() {
|
||||||
|
fh := FleetUpdateHost{FleetUpdateID: id}
|
||||||
|
if err := rows.Scan(&fh.HostID, &fh.Position, &fh.Status, &fh.JobID, &fh.FailedReason); err != nil {
|
||||||
|
return nil, nil, fmt.Errorf("store: scan fleet host: %w", err)
|
||||||
|
}
|
||||||
|
out = append(out, fh)
|
||||||
|
}
|
||||||
|
return &fu, out, rows.Err()
|
||||||
|
}
|
||||||
|
|
||||||
|
// ListPendingFleetUpdateHosts returns rows with status='pending' for
|
||||||
|
// this fleet update, in position order. The worker calls this to
|
||||||
|
// pick the next host to dispatch.
|
||||||
|
func (st *Store) ListPendingFleetUpdateHosts(ctx context.Context, fuID string) ([]FleetUpdateHost, error) {
|
||||||
|
rows, err := st.db.QueryContext(ctx,
|
||||||
|
`SELECT host_id, position, status, COALESCE(job_id, ''), COALESCE(failed_reason, '')
|
||||||
|
FROM fleet_update_hosts
|
||||||
|
WHERE fleet_update_id = ? AND status = 'pending'
|
||||||
|
ORDER BY position`, fuID)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("store: list pending fleet hosts: %w", err)
|
||||||
|
}
|
||||||
|
defer func() { _ = rows.Close() }()
|
||||||
|
out := []FleetUpdateHost{}
|
||||||
|
for rows.Next() {
|
||||||
|
fh := FleetUpdateHost{FleetUpdateID: fuID}
|
||||||
|
if err := rows.Scan(&fh.HostID, &fh.Position, &fh.Status, &fh.JobID, &fh.FailedReason); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
out = append(out, fh)
|
||||||
|
}
|
||||||
|
return out, rows.Err()
|
||||||
|
}
|
||||||
|
|
||||||
|
// SetFleetUpdateHostStatus moves one row through pending → running →
|
||||||
|
// {succeeded, failed, skipped}. failedReason and jobID may be empty
|
||||||
|
// (e.g. on succeeded). Empty values are stored as NULL so subsequent
|
||||||
|
// reads round-trip cleanly via COALESCE.
|
||||||
|
func (st *Store) SetFleetUpdateHostStatus(ctx context.Context, fuID, hostID, status, failedReason, jobID string) error {
|
||||||
|
_, err := st.db.ExecContext(ctx,
|
||||||
|
`UPDATE fleet_update_hosts
|
||||||
|
SET status = ?, failed_reason = ?, job_id = COALESCE(?, job_id)
|
||||||
|
WHERE fleet_update_id = ? AND host_id = ?`,
|
||||||
|
status, nullableString(failedReason), nullableString(jobID),
|
||||||
|
fuID, hostID,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("store: set fleet host status: %w", err)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// SetFleetUpdateCurrentHost stamps which host the worker is actively
|
||||||
|
// waiting on. Pass empty string to clear.
|
||||||
|
func (st *Store) SetFleetUpdateCurrentHost(ctx context.Context, fuID, hostID string) error {
|
||||||
|
_, err := st.db.ExecContext(ctx,
|
||||||
|
`UPDATE fleet_updates SET current_host_id = ? WHERE id = ?`,
|
||||||
|
nullableString(hostID), fuID,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("store: set fleet current host: %w", err)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// HaltFleetUpdate flips status to 'halted', stamps the reason, and
|
||||||
|
// clears current_host_id.
|
||||||
|
func (st *Store) HaltFleetUpdate(ctx context.Context, fuID, reason string, when time.Time) error {
|
||||||
|
_, err := st.db.ExecContext(ctx,
|
||||||
|
`UPDATE fleet_updates
|
||||||
|
SET status = 'halted', halted_reason = ?, current_host_id = NULL,
|
||||||
|
completed_at = ?
|
||||||
|
WHERE id = ? AND status = 'running'`,
|
||||||
|
reason, when.UTC().Format(time.RFC3339Nano), fuID,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("store: halt fleet update: %w", err)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// CancelFleetUpdate flips status to 'cancelled'. Caller checks that
|
||||||
|
// the row is still 'running' before calling.
|
||||||
|
func (st *Store) CancelFleetUpdate(ctx context.Context, fuID string, when time.Time) error {
|
||||||
|
_, err := st.db.ExecContext(ctx,
|
||||||
|
`UPDATE fleet_updates
|
||||||
|
SET status = 'cancelled', current_host_id = NULL, completed_at = ?
|
||||||
|
WHERE id = ? AND status = 'running'`,
|
||||||
|
when.UTC().Format(time.RFC3339Nano), fuID,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("store: cancel fleet update: %w", err)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// CompleteFleetUpdate flips status to 'completed' once every host has
|
||||||
|
// reached a terminal state.
|
||||||
|
func (st *Store) CompleteFleetUpdate(ctx context.Context, fuID string, when time.Time) error {
|
||||||
|
_, err := st.db.ExecContext(ctx,
|
||||||
|
`UPDATE fleet_updates
|
||||||
|
SET status = 'completed', current_host_id = NULL, completed_at = ?
|
||||||
|
WHERE id = ? AND status = 'running'`,
|
||||||
|
when.UTC().Format(time.RFC3339Nano), fuID,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("store: complete fleet update: %w", err)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// RunningUpdateJobForHost returns the id of any in-flight (queued or
|
||||||
|
// running) `update` job for hostID, or "" + nil if none. Used by the
|
||||||
|
// host-update HTTP handler to refuse double-dispatch and by the
|
||||||
|
// fleet worker to dedupe on retry.
|
||||||
|
func (st *Store) RunningUpdateJobForHost(ctx context.Context, hostID string) (string, error) {
|
||||||
|
var id string
|
||||||
|
err := st.db.QueryRowContext(ctx,
|
||||||
|
`SELECT id FROM jobs
|
||||||
|
WHERE host_id = ? AND kind = 'update' AND status IN ('queued','running')
|
||||||
|
ORDER BY created_at DESC LIMIT 1`, hostID).Scan(&id)
|
||||||
|
if errors.Is(err, sql.ErrNoRows) {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("store: running update job: %w", err)
|
||||||
|
}
|
||||||
|
return id, nil
|
||||||
|
}
|
||||||
@@ -0,0 +1,180 @@
|
|||||||
|
package store
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"errors"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/oklog/ulid/v2"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ptrStr returns a pointer to a copy of s, for populating optional
// *string fields in test fixtures.
func ptrStr(s string) *string {
	v := s
	return &v
}
|
||||||
|
|
||||||
|
func seedFleetUser(t *testing.T, s *Store) string {
|
||||||
|
t.Helper()
|
||||||
|
id := ulid.Make().String()
|
||||||
|
if err := s.CreateUser(context.Background(), User{
|
||||||
|
ID: id, Username: "u-" + id[:6], PasswordHash: "x", Role: RoleAdmin,
|
||||||
|
}); err != nil {
|
||||||
|
t.Fatalf("create user: %v", err)
|
||||||
|
}
|
||||||
|
return id
|
||||||
|
}
|
||||||
|
|
||||||
|
func seedFleetHost(t *testing.T, s *Store, name string) string {
|
||||||
|
t.Helper()
|
||||||
|
id := ulid.Make().String()
|
||||||
|
if err := s.CreateHost(context.Background(), Host{
|
||||||
|
ID: id, Name: name, OS: "linux", Arch: "amd64",
|
||||||
|
EnrolledAt: time.Now().UTC(),
|
||||||
|
}, "tokenhash-"+id[:6], ""); err != nil {
|
||||||
|
t.Fatalf("create host: %v", err)
|
||||||
|
}
|
||||||
|
return id
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCreateFleetUpdate_RefusesIfRunning(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
s := openTestStore(t)
|
||||||
|
uid := seedFleetUser(t, s)
|
||||||
|
h1 := seedFleetHost(t, s, "h1")
|
||||||
|
|
||||||
|
fu1 := FleetUpdate{ID: ulid.Make().String(), StartedByUserID: uid, TargetVersion: "v1"}
|
||||||
|
if err := s.CreateFleetUpdate(context.Background(), fu1, []string{h1}); err != nil {
|
||||||
|
t.Fatalf("create #1: %v", err)
|
||||||
|
}
|
||||||
|
fu2 := FleetUpdate{ID: ulid.Make().String(), StartedByUserID: uid, TargetVersion: "v2"}
|
||||||
|
err := s.CreateFleetUpdate(context.Background(), fu2, []string{h1})
|
||||||
|
if !errors.Is(err, ErrFleetUpdateRunning) {
|
||||||
|
t.Fatalf("want ErrFleetUpdateRunning, got %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCreateFleetUpdate_HydrateRoundTrip(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
s := openTestStore(t)
|
||||||
|
uid := seedFleetUser(t, s)
|
||||||
|
h1 := seedFleetHost(t, s, "h1")
|
||||||
|
h2 := seedFleetHost(t, s, "h2")
|
||||||
|
|
||||||
|
fu := FleetUpdate{ID: ulid.Make().String(), StartedByUserID: uid, TargetVersion: "v1.2.3"}
|
||||||
|
if err := s.CreateFleetUpdate(context.Background(), fu, []string{h1, h2}); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
got, hosts, err := s.GetFleetUpdate(context.Background(), fu.ID)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if got.Status != "running" || got.TargetVersion != "v1.2.3" {
|
||||||
|
t.Fatalf("parent: %+v", got)
|
||||||
|
}
|
||||||
|
if len(hosts) != 2 || hosts[0].Position != 0 || hosts[1].Position != 1 {
|
||||||
|
t.Fatalf("hosts: %+v", hosts)
|
||||||
|
}
|
||||||
|
if hosts[0].Status != "pending" || hosts[1].Status != "pending" {
|
||||||
|
t.Fatalf("hosts status: %+v", hosts)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestSetFleetUpdateHostStatus_ProgressesAndStoresJobID(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
s := openTestStore(t)
|
||||||
|
uid := seedFleetUser(t, s)
|
||||||
|
h := seedFleetHost(t, s, "h1")
|
||||||
|
fu := FleetUpdate{ID: ulid.Make().String(), StartedByUserID: uid, TargetVersion: "v1"}
|
||||||
|
_ = s.CreateFleetUpdate(context.Background(), fu, []string{h})
|
||||||
|
|
||||||
|
jobID := ulid.Make().String()
|
||||||
|
if err := s.CreateJob(context.Background(), Job{
|
||||||
|
ID: jobID, HostID: h, Kind: "update",
|
||||||
|
ActorKind: "user", ActorID: ptrStr(uid), CreatedAt: time.Now().UTC(),
|
||||||
|
}); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := s.SetFleetUpdateHostStatus(context.Background(), fu.ID, h, "running", "", ""); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := s.SetFleetUpdateHostStatus(context.Background(), fu.ID, h, "succeeded", "", jobID); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
_, hs, _ := s.GetFleetUpdate(context.Background(), fu.ID)
|
||||||
|
if hs[0].Status != "succeeded" || hs[0].JobID != jobID {
|
||||||
|
t.Fatalf("after succeed: %+v", hs[0])
|
||||||
|
}
|
||||||
|
|
||||||
|
pending, _ := s.ListPendingFleetUpdateHosts(context.Background(), fu.ID)
|
||||||
|
if len(pending) != 0 {
|
||||||
|
t.Fatalf("pending should be empty: %+v", pending)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHaltAndCompleteFleetUpdate(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
s := openTestStore(t)
|
||||||
|
uid := seedFleetUser(t, s)
|
||||||
|
h := seedFleetHost(t, s, "h1")
|
||||||
|
|
||||||
|
fu1 := FleetUpdate{ID: ulid.Make().String(), StartedByUserID: uid, TargetVersion: "v1"}
|
||||||
|
_ = s.CreateFleetUpdate(context.Background(), fu1, []string{h})
|
||||||
|
if err := s.HaltFleetUpdate(context.Background(), fu1.ID, "boom", time.Now().UTC()); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
got, _, _ := s.GetFleetUpdate(context.Background(), fu1.ID)
|
||||||
|
if got.Status != "halted" || got.HaltedReason != "boom" {
|
||||||
|
t.Fatalf("after halt: %+v", got)
|
||||||
|
}
|
||||||
|
if got.CompletedAt == nil {
|
||||||
|
t.Fatal("halted must stamp completed_at")
|
||||||
|
}
|
||||||
|
if active, _ := s.ActiveFleetUpdate(context.Background()); active != nil {
|
||||||
|
t.Fatalf("halted should clear active: %+v", active)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Now a fresh run can start.
|
||||||
|
fu2 := FleetUpdate{ID: ulid.Make().String(), StartedByUserID: uid, TargetVersion: "v2"}
|
||||||
|
if err := s.CreateFleetUpdate(context.Background(), fu2, []string{h}); err != nil {
|
||||||
|
t.Fatalf("create after halt: %v", err)
|
||||||
|
}
|
||||||
|
if err := s.CompleteFleetUpdate(context.Background(), fu2.ID, time.Now().UTC()); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
got, _, _ = s.GetFleetUpdate(context.Background(), fu2.ID)
|
||||||
|
if got.Status != "completed" {
|
||||||
|
t.Fatalf("after complete: %+v", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRunningUpdateJobForHost(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
s := openTestStore(t)
|
||||||
|
h := seedFleetHost(t, s, "h1")
|
||||||
|
|
||||||
|
got, err := s.RunningUpdateJobForHost(context.Background(), h)
|
||||||
|
if err != nil || got != "" {
|
||||||
|
t.Fatalf("empty case: got=%q err=%v", got, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
jobID := ulid.Make().String()
|
||||||
|
if err := s.CreateJob(context.Background(), Job{
|
||||||
|
ID: jobID, HostID: h, Kind: "update",
|
||||||
|
ActorKind: "user", ActorID: ptrStr("u-1"), CreatedAt: time.Now().UTC(),
|
||||||
|
}); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
got, err = s.RunningUpdateJobForHost(context.Background(), h)
|
||||||
|
if err != nil || got != jobID {
|
||||||
|
t.Fatalf("queued case: got=%q err=%v", got, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Mark succeeded → no longer "in flight".
|
||||||
|
if err := s.MarkJobFinished(context.Background(), jobID, "succeeded", 0, nil, "", time.Now().UTC()); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
got, err = s.RunningUpdateJobForHost(context.Background(), h)
|
||||||
|
if err != nil || got != "" {
|
||||||
|
t.Fatalf("after succeed: got=%q err=%v", got, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,57 @@
|
|||||||
|
-- 0021_jobs_update_kind.sql
|
||||||
|
--
|
||||||
|
-- Add 'update' to the jobs.kind CHECK constraint so the agent
|
||||||
|
-- self-update flow (P6-01) can persist its job rows. SQLite can't
|
||||||
|
-- ALTER a CHECK in place, so we rebuild the table.
|
||||||
|
--
|
||||||
|
-- Same safe rebuild pattern as 0012:
|
||||||
|
-- 1. Stash job_logs into a temp table BEFORE rebuilding jobs.
|
||||||
|
-- 2. Create jobs_new with the wider CHECK; copy data; DROP jobs;
|
||||||
|
-- RENAME jobs_new TO jobs.
|
||||||
|
-- 3. Restore job_logs (cascade-trap defence — see CLAUDE.md).
|
||||||
|
--
|
||||||
|
-- jobs_new mirrors the live schema *including* post-0012 column
|
||||||
|
-- additions (0015 added source_group_id). When adding a new
|
||||||
|
-- migration that touches this table, mirror the latest column set.
|
||||||
|
|
||||||
|
CREATE TEMPORARY TABLE _job_logs_backup AS
|
||||||
|
SELECT job_id, seq, ts, stream, payload FROM job_logs;
|
||||||
|
|
||||||
|
CREATE TABLE jobs_new (
|
||||||
|
id TEXT PRIMARY KEY,
|
||||||
|
host_id TEXT NOT NULL REFERENCES hosts(id) ON DELETE CASCADE,
|
||||||
|
kind TEXT NOT NULL CHECK (kind IN
|
||||||
|
('backup','init','forget','prune','check','unlock','restore','diff','update')),
|
||||||
|
status TEXT NOT NULL CHECK (status IN ('queued','running','succeeded','failed','cancelled')),
|
||||||
|
scheduled_id TEXT REFERENCES schedules(id) ON DELETE SET NULL,
|
||||||
|
actor_kind TEXT NOT NULL CHECK (actor_kind IN ('user','schedule','system')),
|
||||||
|
actor_id TEXT,
|
||||||
|
started_at TEXT,
|
||||||
|
finished_at TEXT,
|
||||||
|
exit_code INTEGER,
|
||||||
|
stats TEXT,
|
||||||
|
error TEXT,
|
||||||
|
created_at TEXT NOT NULL,
|
||||||
|
source_group_id TEXT REFERENCES source_groups(id) ON DELETE SET NULL
|
||||||
|
);
|
||||||
|
|
||||||
|
INSERT INTO jobs_new
|
||||||
|
SELECT id, host_id, kind, status, scheduled_id, actor_kind, actor_id,
|
||||||
|
started_at, finished_at, exit_code, stats, error, created_at,
|
||||||
|
source_group_id
|
||||||
|
FROM jobs;
|
||||||
|
|
||||||
|
DROP TABLE jobs;
|
||||||
|
ALTER TABLE jobs_new RENAME TO jobs;
|
||||||
|
|
||||||
|
CREATE INDEX jobs_host_id ON jobs(host_id);
|
||||||
|
CREATE INDEX jobs_status ON jobs(status);
|
||||||
|
CREATE INDEX jobs_created_at ON jobs(created_at);
|
||||||
|
CREATE INDEX jobs_source_group_id ON jobs(source_group_id);
|
||||||
|
|
||||||
|
-- Defensive: restore job_logs from the temp backup. INSERT OR IGNORE
|
||||||
|
-- so a re-run is harmless. Same shape as 0012's safety net.
|
||||||
|
INSERT OR IGNORE INTO job_logs (job_id, seq, ts, stream, payload)
|
||||||
|
SELECT job_id, seq, ts, stream, payload FROM _job_logs_backup;
|
||||||
|
|
||||||
|
DROP TABLE _job_logs_backup;
|
||||||
@@ -0,0 +1,35 @@
|
|||||||
|
-- 0022_fleet_updates.sql
|
||||||
|
--
|
||||||
|
-- Tables backing the rolling fleet-update worker (P6-02). One row in
|
||||||
|
-- fleet_updates per "update all" invocation, a child row per host so
|
||||||
|
-- the worker can iterate in position order, report progress, and
|
||||||
|
-- record per-host outcome. Halt-on-fail semantics live in the worker
|
||||||
|
-- (internal/server/fleetupdate); this schema just captures state.
|
||||||
|
|
||||||
|
CREATE TABLE fleet_updates (
|
||||||
|
id TEXT PRIMARY KEY,
|
||||||
|
started_at TEXT NOT NULL,
|
||||||
|
started_by_user_id TEXT NOT NULL REFERENCES users(id),
|
||||||
|
target_version TEXT NOT NULL,
|
||||||
|
status TEXT NOT NULL CHECK (status IN
|
||||||
|
('running','completed','halted','cancelled')),
|
||||||
|
current_host_id TEXT REFERENCES hosts(id),
|
||||||
|
halted_reason TEXT,
|
||||||
|
completed_at TEXT
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE INDEX fleet_updates_status ON fleet_updates(status);
|
||||||
|
|
||||||
|
CREATE TABLE fleet_update_hosts (
|
||||||
|
fleet_update_id TEXT NOT NULL REFERENCES fleet_updates(id) ON DELETE CASCADE,
|
||||||
|
host_id TEXT NOT NULL REFERENCES hosts(id) ON DELETE CASCADE,
|
||||||
|
position INTEGER NOT NULL,
|
||||||
|
status TEXT NOT NULL CHECK (status IN
|
||||||
|
('pending','running','succeeded','failed','skipped')),
|
||||||
|
job_id TEXT REFERENCES jobs(id) ON DELETE SET NULL,
|
||||||
|
failed_reason TEXT,
|
||||||
|
PRIMARY KEY (fleet_update_id, host_id)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE INDEX fleet_update_hosts_position
|
||||||
|
ON fleet_update_hosts(fleet_update_id, position);
|
||||||
@@ -211,6 +211,33 @@ type PendingRun struct {
|
|||||||
LastError string
|
LastError string
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// FleetUpdate captures one "update all" invocation. Status moves
|
||||||
|
// running → one of {completed, halted, cancelled}. CurrentHostID
|
||||||
|
// tracks the host the worker is actively waiting on; cleared (empty)
|
||||||
|
// outside an active dispatch.
|
||||||
|
type FleetUpdate struct {
|
||||||
|
ID string
|
||||||
|
StartedAt time.Time
|
||||||
|
StartedByUserID string
|
||||||
|
TargetVersion string
|
||||||
|
Status string
|
||||||
|
CurrentHostID string
|
||||||
|
HaltedReason string
|
||||||
|
CompletedAt *time.Time
|
||||||
|
}
|
||||||
|
|
||||||
|
// FleetUpdateHost is one host's slot in a fleet update. Position is
|
||||||
|
// the iteration order. JobID is set once the worker has dispatched
|
||||||
|
// command.update for this host; FailedReason is set on a failed row.
|
||||||
|
type FleetUpdateHost struct {
|
||||||
|
FleetUpdateID string
|
||||||
|
HostID string
|
||||||
|
Position int
|
||||||
|
Status string
|
||||||
|
JobID string
|
||||||
|
FailedReason string
|
||||||
|
}
|
||||||
|
|
||||||
// EnrollmentToken is the issuer's view of a one-time token.
|
// EnrollmentToken is the issuer's view of a one-time token.
|
||||||
type EnrollmentToken struct {
|
type EnrollmentToken struct {
|
||||||
Raw string
|
Raw string
|
||||||
|
|||||||
@@ -0,0 +1,16 @@
|
|||||||
|
// Package version exposes build-time identifying constants. Both the
|
||||||
|
// server and agent link this package; its variables are set via
|
||||||
|
// -ldflags during the build. An unset Version falls back to "dev"
|
||||||
|
// so source builds without ldflags still run.
|
||||||
|
package version
|
||||||
|
|
||||||
|
var (
|
||||||
|
// Version is the human-facing release string, e.g. "v1.2.3" or
|
||||||
|
// "v1.2.3-dirty". Compared byte-for-byte between agent and
|
||||||
|
// server to drive the "out of date" signal.
|
||||||
|
Version = "dev"
|
||||||
|
|
||||||
|
// Commit is the full git SHA (`git rev-parse HEAD`). Informational only; surfaced via
|
||||||
|
// /api/version but not used for any comparison.
|
||||||
|
Commit = ""
|
||||||
|
)
|
||||||
@@ -344,8 +344,33 @@ Sizes: **S** = under a day, **M** = 1–3 days, **L** = 3–7 days.
|
|||||||
|
|
||||||
> Deferred from Phase 4 on 2026-05-05 — operator-experience polish that doesn't gate a working v1.
|
> Deferred from Phase 4 on 2026-05-05 — operator-experience polish that doesn't gate a working v1.
|
||||||
|
|
||||||
- [ ] **P6-01** (S) Agent self-update from the server's bundled binaries. P5-03 already bakes matching `agent-{linux-amd64,linux-arm64,windows-amd64}` into the server image under `/opt/restic-manager/dist/`, served by `/agent/binary`. Add a `restic-manager-agent update` subcommand (and a server-dispatched `command.update` WS envelope) that fetches `$RM_SERVER/agent/binary?os=…&arch=…`, verifies sha256 against a digest the server advertises alongside the binary, atomic-renames over the running binary (`tmp+fsync+rename`), and asks the service manager to restart (`systemctl restart` on Linux, SCM restart on Windows). Version pinning is automatic — the server only ever serves the agent that matches its own release. No apt repo, no Chocolatey, no third-party signing infra. _(Was P4-01; original apt/choco plan dropped after the P5-03 Docker pivot made the server the natural distribution point.)_
|
- [x] **P6-01** (S) Agent self-update from the server's bundled binaries. Server-dispatched `command.update` WS envelope; agent fetches `$RM_SERVER/agent/binary?os=…&arch=…` to `<bin>.new`, copies running binary to `<bin>.old` (M1 — keep one revision back), atomic-rename, exit cleanly. Linux relies on systemd `Restart=always`; Windows writes a detached `update.cmd` helper that waits 3s, `sc stop`s, renames, `sc start`s. No sha256 digest verification — TLS already covers corruption-in-transit (decision deferred per spec §4). _(Was P4-01.)_
|
||||||
- [ ] **P6-02** (M) Agent version reporting + fleet update on dashboard. Server already knows its own build version and each agent's `agent_version` from the WS hello. Surface "N hosts behind" on the dashboard, a per-host "out of date" chip, and an admin-only **Update all** action that fans out `command.update` to every online host (offline hosts queue via `pending_runs`-style retry on reconnect). Per-host **Update** button on host detail for one-shot upgrades. Audit-logged. _(Was P4-02.)_
|
- [x] **P6-02** (M) Agent version reporting + fleet update on dashboard. `internal/version` package + Makefile ldflags injection so server and agent are comparable byte-for-byte. Out-of-date chip on host rows + detail header (amber, format `out of date · A → B`). Hero tile "N hosts behind" with `?updates=behind` filter. Per-host **Update agent** button on host detail. Admin `/settings/fleet-update` page drives a rolling worker (`internal/server/fleetupdate`) that updates one host at a time, polls for hello-with-target-version up to 95s, halts on first failure with `fleet_update_halted` alert. Per-host `update_failed` alerts auto-resolve when the agent reconnects at the right version. `host.update_dispatched/_succeeded/_failed` and `fleet.update_started/_completed/_halted/_cancelled` audit actions. _(Was P4-02.)_
|
||||||
|
|
||||||
|
> **As shipped (2026-05-06, branch `p6-agent-self-update`):**
|
||||||
|
> Spec `docs/superpowers/specs/2026-05-06-p6-01-02-agent-self-update-design.md`,
|
||||||
|
> plan `docs/superpowers/plans/2026-05-06-p6-01-02-agent-self-update.md`.
|
||||||
|
> Schema: migration 0021 widens `jobs.kind` CHECK to include `update`;
|
||||||
|
> 0022 creates `fleet_updates` + `fleet_update_hosts`. Agent: new
|
||||||
|
> `internal/agent/updater` package (build-tag split unix/windows);
|
||||||
|
> dispatcher case `MsgCommandUpdate` in `cmd/agent/update_dispatch.go`
|
||||||
|
> emits `job.started` + `log.stream` updates before exit. Server: WS
|
||||||
|
> update-watcher (`internal/server/ws/update_watch.go`) tracks in-flight
|
||||||
|
> dispatches, marks succeeded on hello-with-matching-version, fails after
|
||||||
|
> 90s timeout (covers both no-show and rollback cases per spec §3.2).
|
||||||
|
> Endpoint `POST /api/hosts/{id}/update` (admin, JSON) + `POST /hosts/{id}/update`
|
||||||
|
> (HTMX, `HX-Redirect: /jobs/{id}`); pre-checks for offline / already
|
||||||
|
> up-to-date / update_in_progress. Fleet worker exposes `Start` /
|
||||||
|
> `Cancel` and runs at most one rolling sequence at a time. Alert kinds
|
||||||
|
> `update_failed` and `fleet_update_halted` plug into the P3-05 engine.
|
||||||
|
>
|
||||||
|
> **Smoke caught + fixed mid-sweep:** the systemd unit's
|
||||||
|
> `ProtectSystem=full` made `/usr/local/bin` read-only, blocking the
|
||||||
|
> .new staging file. Added `/usr/local/bin` to `ReadWritePaths`. With
|
||||||
|
> the fix in place: end-to-end Update agent took the host from
|
||||||
|
> `v0.9.0-11-gccaccd8-dirty` → `v9.9.9-smoke` in <5s; `.old` preserved
|
||||||
|
> on disk; chip and hero tile cleared on reconnect; audit row landed.
|
||||||
|
> Screenshots in `_diag/p6-update-sweep/`.
|
||||||
- [ ] **P6-03** (M) Repo size trend graphs (sparkline on host card, full chart on repo page). _(Was P4-06.)_
|
- [ ] **P6-03** (M) Repo size trend graphs (sparkline on host card, full chart on repo page). _(Was P4-06.)_
|
||||||
- [ ] **P6-04** (M) Prometheus `/metrics` endpoint: per-host gauges (last backup timestamp, last backup status, repo size, snapshot count, agent online), server gauges (active alerts, build info), job duration histograms; protected by bearer token or IP allow-list. _(Was P4-08.)_
|
- [ ] **P6-04** (M) Prometheus `/metrics` endpoint: per-host gauges (last backup timestamp, last backup status, repo size, snapshot count, agent online), server gauges (active alerts, build info), job duration histograms; protected by bearer token or IP allow-list. _(Was P4-08.)_
|
||||||
- [ ] **P6-05** (S) Document Prometheus integration + sample Grafana dashboard JSON. _(Was P4-09.)_
|
- [ ] **P6-05** (S) Document Prometheus integration + sample Grafana dashboard JSON. _(Was P4-09.)_
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
@@ -104,6 +104,65 @@
|
|||||||
.btn-lg { font-size: 13px; padding: 9px 14px; }
|
.btn-lg { font-size: 13px; padding: 9px 14px; }
|
||||||
.btn-block { width: 100%; justify-content: center; }
|
.btn-block { width: 100%; justify-content: center; }
|
||||||
|
|
||||||
|
/* Amber action — used for the per-host "Update agent" button and
|
||||||
|
the fleet-update Start button. Same warning palette as the
|
||||||
|
update-chip below. */
|
||||||
|
.btn-amber {
|
||||||
|
color: oklch(0.18 0.01 80);
|
||||||
|
background: var(--warn);
|
||||||
|
border-color: var(--warn);
|
||||||
|
}
|
||||||
|
.btn-amber:hover { filter: brightness(1.08); }
|
||||||
|
.btn-amber:disabled, .btn-amber[disabled] {
|
||||||
|
opacity: 0.45; cursor: not-allowed; pointer-events: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Update-available chip — small amber pill rendered next to a host's
|
||||||
|
agent version (in the row OS column and in the host detail
|
||||||
|
header). Hidden when the host is up to date. */
|
||||||
|
.update-chip {
|
||||||
|
display: inline-flex; align-items: center; gap: 4px;
|
||||||
|
padding: 1px 6px;
|
||||||
|
border-radius: 3px;
|
||||||
|
font-size: 10px; font-weight: 500;
|
||||||
|
line-height: 1.4;
|
||||||
|
color: oklch(0.18 0.01 80);
|
||||||
|
background: color-mix(in oklch, var(--warn), transparent 30%);
|
||||||
|
border: 1px solid color-mix(in oklch, var(--warn), transparent 50%);
|
||||||
|
white-space: nowrap;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Hero tile — large, clickable summary card on the dashboard.
|
||||||
|
Today only used by the "N hosts behind" tile; the existing
|
||||||
|
four summary boxes use bespoke grid markup. Add more variants
|
||||||
|
as adjacent dashboard tiles adopt this. */
|
||||||
|
.hero-tile {
|
||||||
|
display: flex; flex-direction: column; gap: 4px;
|
||||||
|
padding: 14px 16px;
|
||||||
|
border-radius: 7px;
|
||||||
|
border: 1px solid var(--line-soft);
|
||||||
|
background: var(--panel);
|
||||||
|
text-decoration: none;
|
||||||
|
transition: filter 120ms ease, background 120ms ease;
|
||||||
|
}
|
||||||
|
.hero-tile:hover { filter: brightness(1.08); }
|
||||||
|
.hero-tile .hero-num {
|
||||||
|
font-family: 'JetBrains Mono', ui-monospace, monospace;
|
||||||
|
font-size: 22px; font-weight: 500;
|
||||||
|
letter-spacing: -0.01em;
|
||||||
|
color: var(--ink);
|
||||||
|
}
|
||||||
|
.hero-tile .hero-label {
|
||||||
|
font-size: 11.5px;
|
||||||
|
color: var(--ink-mute);
|
||||||
|
}
|
||||||
|
.hero-tile--amber {
|
||||||
|
background: color-mix(in oklch, var(--warn), transparent 88%);
|
||||||
|
border-color: color-mix(in oklch, var(--warn), transparent 60%);
|
||||||
|
}
|
||||||
|
.hero-tile--amber .hero-num { color: oklch(0.86 0.13 80); }
|
||||||
|
.hero-tile--amber .hero-label { color: oklch(0.78 0.08 80); }
|
||||||
|
|
||||||
/* ---------- nav tabs ---------- */
|
/* ---------- nav tabs ---------- */
|
||||||
.nav-tab {
|
.nav-tab {
|
||||||
font-size: 13px; padding: 18px 0;
|
font-size: 13px; padding: 18px 0;
|
||||||
|
|||||||
@@ -66,6 +66,16 @@
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
{{/* ---------- Hosts-behind hero tile (P6-18) ---------- */}}
|
||||||
|
{{if gt $page.UpdatesBehind 0}}
|
||||||
|
<div class="pt-4">
|
||||||
|
<a href="?updates=behind" class="hero-tile hero-tile--amber" style="display:inline-flex;">
|
||||||
|
<span class="hero-num">{{$page.UpdatesBehind}}</span>
|
||||||
|
<span class="hero-label">{{if eq $page.UpdatesBehind 1}}host behind{{else}}hosts behind{{end}} · review →</span>
|
||||||
|
</a>
|
||||||
|
</div>
|
||||||
|
{{end}}
|
||||||
|
|
||||||
{{/* ---------- Pending hosts (announce-and-approve queue) ---------- */}}
|
{{/* ---------- Pending hosts (announce-and-approve queue) ---------- */}}
|
||||||
{{if gt (len $page.PendingHosts) 0}}
|
{{if gt (len $page.PendingHosts) 0}}
|
||||||
<div class="pt-6">
|
<div class="pt-6">
|
||||||
|
|||||||
@@ -0,0 +1,32 @@
|
|||||||
|
{{define "title"}}Fleet update · restic-manager{{end}}
|
||||||
|
|
||||||
|
{{define "content"}}
|
||||||
|
{{$page := .Page}}
|
||||||
|
<div class="max-w-[1280px] mx-auto px-8 pb-14">
|
||||||
|
|
||||||
|
{{/* breadcrumbs */}}
|
||||||
|
<div class="crumbs pt-6">
|
||||||
|
<a href="/">Dashboard</a><span class="sep">/</span>
|
||||||
|
<a href="/settings">Settings</a><span class="sep">/</span>
|
||||||
|
<span class="text-ink-mid">fleet update</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{{/* page header */}}
|
||||||
|
<div class="flex items-baseline justify-between mt-3.5">
|
||||||
|
<div>
|
||||||
|
<h1 class="text-[22px] font-medium tracking-[-0.005em]">
|
||||||
|
Fleet update
|
||||||
|
<span class="text-ink-fade font-normal text-[14px] ml-2 mono">target {{$page.TargetVersion}}</span>
|
||||||
|
</h1>
|
||||||
|
<p class="text-ink-mute text-[12px] mt-1 max-w-[760px] leading-[1.55]">
|
||||||
|
Rolling, sequential agent self-update. One host at a time, halts on first failure,
|
||||||
|
cancellable mid-roll. Only online hosts whose <span class="mono">agent_version</span>
|
||||||
|
differs from the server are eligible.
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{{template "fleet_update_inner" .}}
|
||||||
|
|
||||||
|
</div>
|
||||||
|
{{end}}
|
||||||
@@ -78,6 +78,26 @@
|
|||||||
</p>
|
</p>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
{{if and $page.CanAdmin $page.UpdateAvailable}}
|
||||||
|
<div class="panel rounded-[7px] px-4 py-3.5">
|
||||||
|
<div class="text-[11px] text-ink-fade uppercase tracking-[0.1em] mb-2.5">Agent update</div>
|
||||||
|
<p class="text-[12px] text-ink-mute leading-[1.55] mb-3">
|
||||||
|
Agent at <span class="mono text-ink-mid">{{$host.AgentVersion}}</span> ·
|
||||||
|
server at <span class="mono text-ink-mid">{{$page.TargetVersion}}</span>.
|
||||||
|
Pushes a self-update command; the agent re-launches into the new binary
|
||||||
|
and reconnects.
|
||||||
|
</p>
|
||||||
|
<form hx-post="/hosts/{{$host.ID}}/update" hx-swap="none">
|
||||||
|
<button class="btn btn-amber btn-block"
|
||||||
|
{{if not $page.Online}}disabled title="Agent must be online"
|
||||||
|
{{else if $page.UpdateInProgress}}disabled title="Update already in progress"
|
||||||
|
{{end}}>
|
||||||
|
Update agent
|
||||||
|
</button>
|
||||||
|
</form>
|
||||||
|
</div>
|
||||||
|
{{end}}
|
||||||
|
|
||||||
<div class="panel rounded-[7px] px-4 py-3.5">
|
<div class="panel rounded-[7px] px-4 py-3.5">
|
||||||
<div class="text-[11px] text-ink-fade uppercase tracking-[0.1em] mb-2.5">Restore</div>
|
<div class="text-[11px] text-ink-fade uppercase tracking-[0.1em] mb-2.5">Restore</div>
|
||||||
<p class="text-[12px] text-ink-mute leading-[1.55] mb-3">
|
<p class="text-[12px] text-ink-mute leading-[1.55] mb-3">
|
||||||
|
|||||||
@@ -0,0 +1,171 @@
|
|||||||
|
{{/*
|
||||||
|
fleet_update_inner — inner panel for /settings/fleet-update.
|
||||||
|
Rendered both as part of the full page and as the htmx polling
|
||||||
|
fragment via /settings/fleet-update/partial.
|
||||||
|
|
||||||
|
Expects .Page to be a fleetUpdatePage struct (see fleet_update.go).
|
||||||
|
*/}}
|
||||||
|
{{define "fleet_update_inner"}}
|
||||||
|
{{$page := .Page}}
|
||||||
|
<div id="fleet-update-panel" class="mt-5"
|
||||||
|
hx-get="{{$page.PollURL}}"
|
||||||
|
hx-trigger="every 3s [document.visibilityState==='visible']"
|
||||||
|
hx-select="#fleet-update-panel"
|
||||||
|
hx-swap="outerHTML">
|
||||||
|
|
||||||
|
{{if and $page.Active (eq $page.Active.Status "running")}}
|
||||||
|
|
||||||
|
{{/* ---------- running state ---------- */}}
|
||||||
|
<div class="panel rounded-[7px] px-5 py-4">
|
||||||
|
<div class="flex items-baseline justify-between">
|
||||||
|
<div>
|
||||||
|
<span class="mono text-[12px] text-ink-fade">fleet update</span>
|
||||||
|
<span class="mono text-[12px] text-accent ml-2">running</span>
|
||||||
|
<span class="mono text-[11px] text-ink-fade ml-2">{{$page.Active.ID}}</span>
|
||||||
|
</div>
|
||||||
|
<form hx-post="/api/fleet-updates/{{$page.Active.ID}}/cancel" hx-swap="none">
|
||||||
|
<button class="btn btn-danger" type="submit"
|
||||||
|
onclick="return confirm('Cancel this fleet update? Hosts already updated stay updated; pending hosts will be skipped.');">
|
||||||
|
Cancel
|
||||||
|
</button>
|
||||||
|
</form>
|
||||||
|
</div>
|
||||||
|
<div class="text-[11.5px] text-ink-mute mt-1">
|
||||||
|
target <span class="mono text-ink-mid">{{$page.Active.TargetVersion}}</span>
|
||||||
|
· started <span class="mono text-ink-mid">{{relTime $page.Active.StartedAt}}</span>
|
||||||
|
{{if $page.Active.CurrentHostID}}
|
||||||
|
· waiting on <span class="mono text-ink-mid">{{index $page.HostNames $page.Active.CurrentHostID}}</span>
|
||||||
|
{{end}}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{{template "fleet_update_rows" $page}}
|
||||||
|
|
||||||
|
{{else if $page.Active}}
|
||||||
|
|
||||||
|
{{/* ---------- terminal state (completed / halted / cancelled) ---------- */}}
|
||||||
|
<div class="panel rounded-[7px] px-5 py-4">
|
||||||
|
<div class="flex items-baseline justify-between">
|
||||||
|
<div>
|
||||||
|
<span class="mono text-[12px] text-ink-fade">last fleet update</span>
|
||||||
|
{{if eq $page.Active.Status "completed"}}
|
||||||
|
<span class="mono text-[12px] text-ok ml-2">completed</span>
|
||||||
|
{{else if eq $page.Active.Status "halted"}}
|
||||||
|
<span class="mono text-[12px] text-bad ml-2">halted</span>
|
||||||
|
{{else if eq $page.Active.Status "cancelled"}}
|
||||||
|
<span class="mono text-[12px] text-warn ml-2">cancelled</span>
|
||||||
|
{{else}}
|
||||||
|
<span class="mono text-[12px] text-ink-mid ml-2">{{$page.Active.Status}}</span>
|
||||||
|
{{end}}
|
||||||
|
<span class="mono text-[11px] text-ink-fade ml-2">{{$page.Active.ID}}</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="text-[11.5px] text-ink-mute mt-1">
|
||||||
|
target <span class="mono text-ink-mid">{{$page.Active.TargetVersion}}</span>
|
||||||
|
· started <span class="mono text-ink-mid">{{relTime $page.Active.StartedAt}}</span>
|
||||||
|
{{if $page.Active.CompletedAt}} · finished <span class="mono text-ink-mid">{{relTime $page.Active.CompletedAt}}</span>{{end}}
|
||||||
|
</div>
|
||||||
|
{{if $page.Active.HaltedReason}}
|
||||||
|
<div class="text-[12px] text-bad mt-2">{{$page.Active.HaltedReason}}</div>
|
||||||
|
{{end}}
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{{template "fleet_update_rows" $page}}
|
||||||
|
|
||||||
|
{{if gt (len $page.OutOfDateHosts) 0}}
|
||||||
|
<div class="mt-5">
|
||||||
|
{{template "fleet_update_idle_panel" $page}}
|
||||||
|
</div>
|
||||||
|
{{end}}
|
||||||
|
|
||||||
|
{{else}}
|
||||||
|
|
||||||
|
{{template "fleet_update_idle_panel" $page}}
|
||||||
|
|
||||||
|
{{end}}
|
||||||
|
</div>
|
||||||
|
{{end}}
|
||||||
|
|
||||||
|
{{define "fleet_update_rows"}}
|
||||||
|
{{$page := .}}
|
||||||
|
<div class="panel mt-3 rounded-[7px] overflow-hidden">
|
||||||
|
<div class="hairline grid items-baseline px-4 py-2.5 text-[11px] text-ink-fade uppercase tracking-[0.08em]"
|
||||||
|
style="grid-template-columns: 0.4fr 1.5fr 0.8fr 1.2fr 1.5fr; column-gap: 18px;">
|
||||||
|
<div>#</div>
|
||||||
|
<div>Host</div>
|
||||||
|
<div>Status</div>
|
||||||
|
<div>Job</div>
|
||||||
|
<div>Detail</div>
|
||||||
|
</div>
|
||||||
|
{{range $page.ActiveRows}}
|
||||||
|
<div class="grid items-center px-4 py-2.5 text-[12.5px] hairline"
|
||||||
|
style="grid-template-columns: 0.4fr 1.5fr 0.8fr 1.2fr 1.5fr; column-gap: 18px;">
|
||||||
|
<div class="mono text-ink-fade">{{.Position}}</div>
|
||||||
|
<div class="mono text-ink">{{if .HostName}}{{.HostName}}{{else}}{{.HostID}}{{end}}</div>
|
||||||
|
<div>
|
||||||
|
{{if eq .Status "pending"}}<span class="text-ink-fade">pending</span>
|
||||||
|
{{else if eq .Status "running"}}<span class="text-accent">running…</span>
|
||||||
|
{{else if eq .Status "succeeded"}}<span class="text-ok">succeeded</span>
|
||||||
|
{{else if eq .Status "failed"}}<span class="text-bad font-medium">failed</span>
|
||||||
|
{{else if eq .Status "skipped"}}<span class="text-ink-mute">skipped</span>
|
||||||
|
{{else}}<span class="text-ink-mute">{{.Status}}</span>{{end}}
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
{{if .JobID}}<a class="link mono text-[11.5px]" href="/jobs/{{.JobID}}">{{.JobID}}</a>{{else}}<span class="text-ink-fade">—</span>{{end}}
|
||||||
|
</div>
|
||||||
|
<div class="mono text-[11.5px] text-ink-mute truncate" title="{{.FailedReason}}">{{.FailedReason}}</div>
|
||||||
|
</div>
|
||||||
|
{{end}}
|
||||||
|
</div>
|
||||||
|
{{end}}
|
||||||
|
|
||||||
|
{{define "fleet_update_idle_panel"}}
|
||||||
|
{{$page := .}}
|
||||||
|
<div class="panel rounded-[7px] px-5 py-4">
|
||||||
|
{{if eq (len $page.OutOfDateHosts) 0}}
|
||||||
|
<div class="flex items-center gap-3">
|
||||||
|
<span class="dot dot-online"></span>
|
||||||
|
<div>
|
||||||
|
<div class="text-ink text-[14px] font-medium">All hosts are up to date.</div>
|
||||||
|
<div class="text-ink-mute text-[12px] mt-0.5">
|
||||||
|
Every online agent matches server version <span class="mono">{{$page.TargetVersion}}</span>.
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
{{else}}
|
||||||
|
<div class="flex items-baseline justify-between">
|
||||||
|
<h2 class="text-[14px] font-medium">{{len $page.OutOfDateHosts}} host{{if ne (len $page.OutOfDateHosts) 1}}s{{end}} out of date</h2>
|
||||||
|
<span class="mono text-[11px] text-ink-fade">target {{$page.TargetVersion}}</span>
|
||||||
|
</div>
|
||||||
|
<ul class="mt-3 space-y-1 text-[12px]">
|
||||||
|
{{range $page.OutOfDateHosts}}
|
||||||
|
<li class="flex items-center gap-3">
|
||||||
|
<span class="dot dot-online"></span>
|
||||||
|
<span class="mono text-ink">{{.Name}}</span>
|
||||||
|
<span class="mono text-ink-mute">{{if .AgentVersion}}{{.AgentVersion}}{{else}}—{{end}} → {{$page.TargetVersion}}</span>
|
||||||
|
</li>
|
||||||
|
{{end}}
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
<form id="fleet-update-start-form" class="mt-4 flex items-center gap-3"
|
||||||
|
hx-post="/api/fleet/update"
|
||||||
|
hx-headers='{"Content-Type":"application/json"}'
|
||||||
|
hx-vals='{}'
|
||||||
|
hx-swap="none"
|
||||||
|
hx-on::after-request="if(event.detail.successful) location.reload()">
|
||||||
|
<label class="text-[11.5px] text-ink-mute">
|
||||||
|
Type the count
|
||||||
|
<span class="mono text-ink-mid">({{len $page.OutOfDateHosts}})</span>
|
||||||
|
to enable Start:
|
||||||
|
</label>
|
||||||
|
<input type="text" id="fleet-update-confirm" class="field mono text-[12.5px]"
|
||||||
|
style="width: 80px; padding: 5px 8px;"
|
||||||
|
oninput="document.getElementById('fleet-update-start-btn').disabled = (this.value !== '{{len $page.OutOfDateHosts}}');"
|
||||||
|
autocomplete="off" />
|
||||||
|
<button type="submit" id="fleet-update-start-btn" class="btn btn-amber" disabled>
|
||||||
|
Start fleet update
|
||||||
|
</button>
|
||||||
|
</form>
|
||||||
|
{{end}}
|
||||||
|
</div>
|
||||||
|
{{end}}
|
||||||
@@ -83,7 +83,7 @@
|
|||||||
<div class="flex items-center gap-3 mt-3 text-[13px] text-ink-mute">
|
<div class="flex items-center gap-3 mt-3 text-[13px] text-ink-mute">
|
||||||
<span class="mono text-ink-mid">{{$host.OS}}/{{$host.Arch}}</span>
|
<span class="mono text-ink-mid">{{$host.OS}}/{{$host.Arch}}</span>
|
||||||
<span class="text-ink-fade">·</span>
|
<span class="text-ink-fade">·</span>
|
||||||
<span>agent <span class="mono text-ink-mid">{{if $host.AgentVersion}}{{$host.AgentVersion}}{{else}}—{{end}}</span></span>
|
<span>agent <span class="mono text-ink-mid">{{if $host.AgentVersion}}{{$host.AgentVersion}}{{else}}—{{end}}</span>{{if $page.UpdateAvailable}} {{template "host_update_chip" $page}}{{end}}</span>
|
||||||
<span class="text-ink-fade">·</span>
|
<span class="text-ink-fade">·</span>
|
||||||
<span>restic <span class="mono text-ink-mid">{{if $host.ResticVersion}}{{$host.ResticVersion}}{{else}}—{{end}}</span></span>
|
<span>restic <span class="mono text-ink-mid">{{if $host.ResticVersion}}{{$host.ResticVersion}}{{else}}—{{end}}</span></span>
|
||||||
<span class="text-ink-fade">·</span>
|
<span class="text-ink-fade">·</span>
|
||||||
|
|||||||
@@ -14,7 +14,7 @@
|
|||||||
{{- end -}}
|
{{- end -}}
|
||||||
</div>
|
</div>
|
||||||
<div class="mono {{if eq $h.Status "offline"}}text-ink-mid{{else}}text-ink{{end}} font-medium">{{$h.Name}}</div>
|
<div class="mono {{if eq $h.Status "offline"}}text-ink-mid{{else}}text-ink{{end}} font-medium">{{$h.Name}}</div>
|
||||||
<div class="mono text-ink-mid text-[12px]">{{$h.OS}}/{{$h.Arch}}</div>
|
<div class="mono text-ink-mid text-[12px]">{{$h.OS}}/{{$h.Arch}}{{if .UpdateAvailable}} {{template "host_update_chip" .}}{{end}}</div>
|
||||||
<div class="text-xs text-ink-mid">
|
<div class="text-xs text-ink-mid">
|
||||||
{{- if $h.CurrentJobID -}}
|
{{- if $h.CurrentJobID -}}
|
||||||
<span class="text-accent">backup running…</span><br>
|
<span class="text-accent">backup running…</span><br>
|
||||||
|
|||||||
@@ -0,0 +1,11 @@
|
|||||||
|
{{/*
|
||||||
|
host_update_chip — small amber chip rendered when the agent version
|
||||||
|
on a host is behind the server's. Expects:
|
||||||
|
.UpdateAvailable bool
|
||||||
|
.TargetVersion string
|
||||||
|
.Host store.Host (for AgentVersion)
|
||||||
|
Hidden entirely when UpdateAvailable is false.
|
||||||
|
*/}}
|
||||||
|
{{define "host_update_chip"}}
|
||||||
|
{{/* An empty AgentVersion renders as "—", as elsewhere in the UI, instead of a blank before the arrow. */}}{{if .UpdateAvailable}}<span class="update-chip" title="Agent at {{if .Host.AgentVersion}}{{.Host.AgentVersion}}{{else}}—{{end}}; server at {{.TargetVersion}}">out of date · {{if .Host.AgentVersion}}{{.Host.AgentVersion}}{{else}}—{{end}} → {{.TargetVersion}}</span>{{end}}
|
||||||
|
{{end}}
|
||||||
Reference in New Issue
Block a user