diff --git a/CLAUDE.md b/CLAUDE.md index c623059..8b1dd42 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -2,6 +2,10 @@ Project-specific rules for Claude when working in this repo. +## Repo + +The repo lives inside a Gitea instance; `tea` CLI is available for use by agents + ## Run `go vet` before every commit CI runs `go vet ./...` and will fail the build on any vet error. @@ -43,6 +47,8 @@ cp bin/restic-manager-agent \ /tmp/rm-smoke/data/agent-binaries/restic-manager-agent-linux-amd64 cp deploy/install/install.sh \ /tmp/rm-smoke/data/install/install.sh +cp deploy/install/install.ps1 \ + /tmp/rm-smoke/data/install/install.ps1 cp deploy/install/restic-manager-agent.service \ /tmp/rm-smoke/data/install/restic-manager-agent.service diff --git a/cmd/agent/announce.go b/cmd/agent/announce.go new file mode 100644 index 0000000..536baba --- /dev/null +++ b/cmd/agent/announce.go @@ -0,0 +1,262 @@ +// announce.go — agent-side announce-and-approve enrolment (P2-18c). +// +// Run path: when the agent has no AgentToken set but RM_SERVER is +// configured (and no -enroll-token was supplied), main() switches +// into announce mode: +// 1. Load (or mint+persist) an Ed25519 keypair in agent.yaml. +// 2. POST {hostname, os, arch, agent_version, restic_version, +// public_key} to /api/agents/announce. +// 3. Print the fingerprint to stderr in a copy-friendly banner so +// the operator can compare it against the dashboard. +// 4. Open /ws/agent/pending?pending_id=…, sign the nonce with our +// private key, wait for an `enrolled` message. +// 5. On enrolled: persist the bearer + repo creds, return; main() +// then drops into the normal WS run loop with the new bearer. +// 6. On reject: server closes the socket with code 4001; we exit +// with a clear message. 
+package main + +import ( + "context" + "crypto/ed25519" + "crypto/rand" + "encoding/base64" + "encoding/json" + "errors" + "fmt" + stdhttp "net/http" + "os" + "strings" + "time" + + "github.com/coder/websocket" + + "gitea.dcglab.co.uk/steve/restic-manager/internal/agent/config" + "gitea.dcglab.co.uk/steve/restic-manager/internal/agent/secrets" + "gitea.dcglab.co.uk/steve/restic-manager/internal/agent/sysinfo" + "gitea.dcglab.co.uk/steve/restic-manager/internal/store" +) + +// announceRequest mirrors the server's announceRequest. Duplicated +// here so cmd/agent stays decoupled from the http package. +type announceRequest struct { + Hostname string `json:"hostname"` + OS string `json:"os"` + Arch string `json:"arch"` + AgentVersion string `json:"agent_version"` + ResticVersion string `json:"restic_version"` + PublicKey string `json:"public_key"` +} + +type announceResponse struct { + PendingID string `json:"pending_id"` + Fingerprint string `json:"fingerprint"` + HostnameCollision bool `json:"hostname_collision"` +} + +type pendingNonceMessage struct { + Type string `json:"type"` + Nonce string `json:"nonce"` +} + +type pendingSignedMessage struct { + Type string `json:"type"` + Signature string `json:"signature"` +} + +type pendingEnrolledMessage struct { + Type string `json:"type"` + HostID string `json:"host_id"` + Bearer string `json:"bearer"` +} + +// doAnnounce runs the full announce → wait-for-accept flow. On +// success, persists the bearer + host_id into cfg + writes secrets +// for the repo creds the admin supplied at accept time. Returns +// only after the bearer has landed (or on hard error / reject). +func doAnnounce(serverURL string, cfg *config.Config, agentVersion string) error { + ctx, cancel := context.WithTimeout(context.Background(), 24*time.Hour) + defer cancel() + + // Ensure we have a keypair. 
+ priv, pub, err := loadOrMintAnnounceKey(cfg) + if err != nil { + return fmt.Errorf("announce: keypair: %w", err) + } + fingerprint := store.FingerprintForKey(pub) + + snap, err := sysinfo.Collect(ctx, cfg.ResticPath) + if err != nil { + return fmt.Errorf("announce: sysinfo: %w", err) + } + + // POST /api/agents/announce. + body, _ := json.Marshal(announceRequest{ + Hostname: snap.Hostname, OS: string(snap.OS), Arch: string(snap.Arch), + AgentVersion: agentVersion, ResticVersion: snap.ResticVersion, + PublicKey: base64.StdEncoding.EncodeToString(pub), + }) + req, err := stdhttp.NewRequestWithContext(ctx, "POST", + strings.TrimRight(serverURL, "/")+"/api/agents/announce", + strings.NewReader(string(body))) + if err != nil { + // A malformed server URL makes NewRequestWithContext return a nil + // request; fail cleanly here instead of panicking on req.Header. + return fmt.Errorf("announce: build request: %w", err) + } + req.Header.Set("Content-Type", "application/json") + res, err := stdhttp.DefaultClient.Do(req) + if err != nil { + return fmt.Errorf("announce: POST: %w", err) + } + rawBody := readAllShort(res) + _ = res.Body.Close() + if res.StatusCode != stdhttp.StatusOK { + return fmt.Errorf("announce: server returned %d: %s", res.StatusCode, rawBody) + } + var ar announceResponse + if err := json.Unmarshal(rawBody, &ar); err != nil { + return fmt.Errorf("announce: parse response: %w", err) + } + + // Print the fingerprint banner. 
+ fmt.Fprintln(os.Stderr, strings.Repeat("=", 64)) + fmt.Fprintln(os.Stderr, " Restic-manager: announce-and-approve enrolment") + fmt.Fprintln(os.Stderr, "") + fmt.Fprintln(os.Stderr, " Hostname : "+snap.Hostname) + fmt.Fprintln(os.Stderr, " Server : "+serverURL) + fmt.Fprintln(os.Stderr, " Pending ID : "+ar.PendingID) + fmt.Fprintln(os.Stderr, " Fingerprint : "+fingerprint) + if ar.HostnameCollision { + fmt.Fprintln(os.Stderr, "") + fmt.Fprintln(os.Stderr, " WARNING: another pending host already uses this hostname.") + fmt.Fprintln(os.Stderr, " Confirm the fingerprint above matches what you see in the UI.") + } + fmt.Fprintln(os.Stderr, "") + fmt.Fprintln(os.Stderr, " Compare the fingerprint with the one in the UI before accepting.") + fmt.Fprintln(os.Stderr, " Waiting for an admin to accept (1 hour timeout)…") + fmt.Fprintln(os.Stderr, strings.Repeat("=", 64)) + + // Open /ws/agent/pending and run the nonce-sign handshake. + wsURL := wsURLFromHTTP(serverURL) + "/ws/agent/pending?pending_id=" + ar.PendingID + dialCtx, dialCancel := context.WithTimeout(ctx, 30*time.Second) + c, dialRes, err := websocket.Dial(dialCtx, wsURL, nil) + dialCancel() + if err != nil { + return fmt.Errorf("announce: dial pending ws: %w", err) + } + if dialRes != nil && dialRes.Body != nil { + _ = dialRes.Body.Close() + } + defer func() { _ = c.CloseNow() }() + + // Read nonce. 
+ rctx, rcancel := context.WithTimeout(ctx, 30*time.Second) + _, raw, err := c.Read(rctx) + rcancel() + if err != nil { + return fmt.Errorf("announce: read nonce: %w", err) + } + var nm pendingNonceMessage + if err := json.Unmarshal(raw, &nm); err != nil { + return fmt.Errorf("announce: parse nonce: %w", err) + } + nonce, err := base64.StdEncoding.DecodeString(nm.Nonce) + if err != nil { + return fmt.Errorf("announce: decode nonce: %w", err) + } + sig := ed25519.Sign(priv, nonce) + reply, _ := json.Marshal(pendingSignedMessage{ + Type: "signed_nonce", Signature: base64.StdEncoding.EncodeToString(sig), + }) + wctx, wcancel := context.WithTimeout(ctx, 10*time.Second) + if err := c.Write(wctx, websocket.MessageText, reply); err != nil { + wcancel() + return fmt.Errorf("announce: write signed nonce: %w", err) + } + wcancel() + + // Block until enrolled (or reject / disconnect). + rctx2, rcancel2 := context.WithTimeout(ctx, 1*time.Hour) + defer rcancel2() + _, raw2, err := c.Read(rctx2) + if err != nil { + // CloseError with our reject code 4001 = admin rejected. + var ce websocket.CloseError + if errors.As(err, &ce) && ce.Code == 4001 { + return errors.New("announce: rejected by admin") + } + return fmt.Errorf("announce: wait for enrolled: %w", err) + } + var em pendingEnrolledMessage + if err := json.Unmarshal(raw2, &em); err != nil { + return fmt.Errorf("announce: parse enrolled: %w", err) + } + if em.Type != "enrolled" || em.Bearer == "" { + return fmt.Errorf("announce: bad enrolled payload: %s", raw2) + } + + // Persist the bearer + host_id. + cfg.ServerURL = serverURL + cfg.HostID = em.HostID + cfg.AgentToken = em.Bearer + if err := cfg.EnsureSecretsKey(); err != nil { + return fmt.Errorf("announce: mint secrets key: %w", err) + } + // Note: repo creds aren't pushed in the enrolled message — the + // server pushes them via `config.update` on first WS hello. The + // secrets store will start empty and fill in then. 
+ if err := cfg.Save(); err != nil { + return fmt.Errorf("announce: save config: %w", err) + } + // Touch the secrets store so it exists with the right perms. + keyBytes, _ := cfg.SecretsKeyBytes() + if _, err := secrets.New(cfg.ResolvedSecretsPath(), keyBytes); err != nil { + return fmt.Errorf("announce: open secrets store: %w", err) + } + fmt.Fprintln(os.Stderr, "Accepted. Bearer persisted; reconnecting via the standard WS.") + return nil +} + +// loadOrMintAnnounceKey returns the (priv, pub) keypair, generating +// + persisting one when AnnounceKey is empty. The private key holds +// the public half in its tail 32 bytes per ed25519 convention. +func loadOrMintAnnounceKey(cfg *config.Config) (ed25519.PrivateKey, ed25519.PublicKey, error) { + if cfg.AnnounceKey != "" { + raw, err := base64.StdEncoding.DecodeString(cfg.AnnounceKey) + if err != nil { + return nil, nil, fmt.Errorf("decode AnnounceKey: %w", err) + } + if len(raw) != ed25519.PrivateKeySize { + return nil, nil, fmt.Errorf("AnnounceKey must be %d bytes, got %d", + ed25519.PrivateKeySize, len(raw)) + } + priv := ed25519.PrivateKey(raw) + pub := priv.Public().(ed25519.PublicKey) + return priv, pub, nil + } + pub, priv, err := ed25519.GenerateKey(rand.Reader) + if err != nil { + return nil, nil, fmt.Errorf("generate keypair: %w", err) + } + cfg.AnnounceKey = base64.StdEncoding.EncodeToString(priv) + if err := cfg.Save(); err != nil { + return nil, nil, fmt.Errorf("persist AnnounceKey: %w", err) + } + return priv, pub, nil +} + +// wsURLFromHTTP swaps the http(s) scheme for ws(s). +func wsURLFromHTTP(httpURL string) string { + switch { + case strings.HasPrefix(httpURL, "https://"): + return "wss://" + strings.TrimPrefix(httpURL, "https://") + case strings.HasPrefix(httpURL, "http://"): + return "ws://" + strings.TrimPrefix(httpURL, "http://") + default: + return httpURL + } +} + +// readAllShort reads up to 64KB of the response body. 
The announce +// response is small; we cap to avoid pathological server replies. +// A single Read may return fewer bytes than the body holds, so we keep +// reading until EOF, an error, or the 64KB cap is reached. +func readAllShort(res *stdhttp.Response) []byte { + buf := make([]byte, 64*1024) + n := 0 + for n < len(buf) { + m, err := res.Body.Read(buf[n:]) + n += m + if err != nil { + break // io.EOF or a transport error: return what we have so far + } + } + return buf[:n] +} diff --git a/cmd/agent/main.go b/cmd/agent/main.go index d401640..ac43d3c 100644 --- a/cmd/agent/main.go +++ b/cmd/agent/main.go @@ -9,6 +9,7 @@ import ( "os" "os/signal" "strconv" + "sync" "syscall" "time" @@ -16,6 +17,7 @@ import ( "gitea.dcglab.co.uk/steve/restic-manager/internal/agent/runner" "gitea.dcglab.co.uk/steve/restic-manager/internal/agent/scheduler" "gitea.dcglab.co.uk/steve/restic-manager/internal/agent/secrets" + "gitea.dcglab.co.uk/steve/restic-manager/internal/agent/service" "gitea.dcglab.co.uk/steve/restic-manager/internal/agent/sysinfo" "gitea.dcglab.co.uk/steve/restic-manager/internal/agent/wsclient" "gitea.dcglab.co.uk/steve/restic-manager/internal/api" @@ -32,6 +34,27 @@ func main() { } func run() error { + // Optional first positional verb for SCM control on Windows. + // `restic-manager-agent install|uninstall|start|stop` route into + // the service package; everything else falls through to the + // flag-driven default (which is what systemd / interactive runs + // hit). On non-Windows builds these verbs return a clear error. + if len(os.Args) > 1 { + switch os.Args[1] { + case "install": + return service.Install() + case "uninstall": + return service.Uninstall() + case "start": + return service.Start() + case "stop": + return service.Stop() + case "run": + // Strip the verb so flag.Parse sees the rest unchanged. + os.Args = append([]string{os.Args[0]}, os.Args[2:]...) 
+ } + } + configPath := flag.String("config", config.DefaultPath(), "path to agent.yaml") enrollServer := flag.String("enroll-server", "", "server URL (used with -enroll-token to perform first-run enrollment)") enrollToken := flag.String("enroll-token", "", "one-time enrollment token (operator copies this from the UI)") @@ -58,8 +81,17 @@ func run() error { return doEnroll(*enrollServer, *enrollToken, cfg, version) } + // Announce-and-approve: -enroll-server set, no token, agent not + // yet enrolled. Run the announce flow inline; on success the cfg + // has the bearer + host_id and we drop into the normal run loop. + if !cfg.Enrolled() && *enrollServer != "" { + if err := doAnnounce(*enrollServer, cfg, version); err != nil { + return fmt.Errorf("announce: %w", err) + } + } + if !cfg.Enrolled() { - return fmt.Errorf("agent is not enrolled; run with -enroll-server and -enroll-token first (config %q)", *configPath) + return fmt.Errorf("agent is not enrolled; run with -enroll-server (and either -enroll-token or wait for admin to accept the announce) first (config %q)", *configPath) } ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) @@ -170,6 +202,14 @@ type dispatcher struct { resticBin string secrets *secrets.Store scheduler *scheduler.Scheduler + + // Bandwidth caps in KB/s pushed via config.update. Mutated under + // bwMu by the config.update handler; read by runJob when building + // the runner. <=0 means "no cap" (do not pass --limit-* to restic). + // Per-job overrides on CommandRunPayload take precedence. 
+ bwMu sync.Mutex + bwUpKBps int + bwDownKBps int } func (d *dispatcher) handle(ctx context.Context, env api.Envelope, tx wsclient.Sender) error { @@ -263,6 +303,24 @@ func (d *dispatcher) handle(ctx context.Context, env api.Envelope, tx wsclient.S slog.Warn("ws agent: unknown config.update slot, ignoring", "slot", p.Slot) } + // Bandwidth caps ride independently of the slot — they're host- + // wide and apply to every restic invocation regardless of which + // credentials slot the job uses. nil pointer = no change in this + // push; non-nil = set to that value (≤0 clears the cap). + if p.BandwidthUpKBps != nil || p.BandwidthDownKBps != nil { + d.bwMu.Lock() + if p.BandwidthUpKBps != nil { + d.bwUpKBps = *p.BandwidthUpKBps + } + if p.BandwidthDownKBps != nil { + d.bwDownKBps = *p.BandwidthDownKBps + } + up, down := d.bwUpKBps, d.bwDownKBps + d.bwMu.Unlock() + slog.Info("ws agent: bandwidth caps updated", + "up_kbps", up, "down_kbps", down) + } + case api.MsgAgentUpdateAvail: var p api.AgentUpdateAvailablePayload _ = env.UnmarshalPayload(&p) @@ -295,11 +353,25 @@ func (d *dispatcher) runJob(ctx context.Context, p api.CommandRunPayload, tx wsc // not on r). If you find yourself adding a new JobKind that // needs delete authority, mirror the JobPrune pattern below // — don't try to overload r. + // Resolve bandwidth caps: per-job override (if set) wins over the + // host-wide caps last pushed via config.update. <=0 means no cap. 
+ d.bwMu.Lock() + upKBps, downKBps := d.bwUpKBps, d.bwDownKBps + d.bwMu.Unlock() + if p.BandwidthUpKBps != nil { + upKBps = *p.BandwidthUpKBps + } + if p.BandwidthDownKBps != nil { + downKBps = *p.BandwidthDownKBps + } + r := runner.New(runner.Config{ - ResticBin: d.resticBin, - RepoURL: creds.URL, - RepoUsername: creds.Username, - RepoPassword: creds.Password, + ResticBin: d.resticBin, + RepoURL: creds.URL, + RepoUsername: creds.Username, + RepoPassword: creds.Password, + LimitUploadKBps: upKBps, + LimitDownloadKBps: downKBps, }, tx, time.Second) switch p.Kind { @@ -318,8 +390,9 @@ func (d *dispatcher) runJob(ctx context.Context, p api.CommandRunPayload, tx wsc } slog.Info("agent: accepting backup job", "job_id", p.JobID, "paths", paths, "excludes", p.Excludes, "tag", p.Tag) + hooks := runner.BackupHooks{Pre: p.PreHook, Post: p.PostHook} go func() { - if err := r.RunBackup(ctx, p.JobID, paths, p.Excludes, tags); err != nil { + if err := r.RunBackup(ctx, p.JobID, paths, p.Excludes, tags, hooks); err != nil { slog.Warn("agent: backup job failed", "job_id", p.JobID, "err", err) return } @@ -381,10 +454,12 @@ func (d *dispatcher) runJob(ctx context.Context, p api.CommandRunPayload, tx wsc runCreds = ac } prr := runner.New(runner.Config{ - ResticBin: d.resticBin, - RepoURL: runCreds.URL, - RepoUsername: runCreds.Username, - RepoPassword: runCreds.Password, + ResticBin: d.resticBin, + RepoURL: runCreds.URL, + RepoUsername: runCreds.Username, + RepoPassword: runCreds.Password, + LimitUploadKBps: upKBps, + LimitDownloadKBps: downKBps, }, tx, time.Second) slog.Info("agent: accepting prune job", "job_id", p.JobID, "admin_creds", p.RequiresAdminCreds) go func() { diff --git a/cmd/server/main.go b/cmd/server/main.go index c97b39d..a083a6d 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -156,6 +156,10 @@ func run() error { // shouldn't, but the queue exists either way). 
pendingDrainTick := time.NewTicker(30 * time.Second) defer pendingDrainTick.Stop() + // Pending-hosts expiry sweeper: drops announce rows past their 1h + // ceiling so the dashboard panel doesn't accumulate stale entries. + pendingExpiryTick := time.NewTicker(60 * time.Second) + defer pendingExpiryTick.Stop() mt := maintenance.New(st) go func() { for { @@ -176,6 +180,10 @@ func run() error { } case <-pendingDrainTick.C: srv.DrainAllDue(ctx) + case <-pendingExpiryTick.C: + if n, err := st.DeleteExpiredPendingHosts(ctx, time.Now().UTC()); err == nil && n > 0 { + slog.Info("expired pending hosts swept", "n", n) + } case <-maintenanceTick.C: decisions, err := mt.Decide(ctx, time.Now().UTC()) if err != nil { diff --git a/deploy/install/install.ps1 b/deploy/install/install.ps1 new file mode 100644 index 0000000..72b7c9d --- /dev/null +++ b/deploy/install/install.ps1 @@ -0,0 +1,133 @@ +# install.ps1 — Windows installer for the restic-manager agent (P2-17). +# +# Usage (Run as administrator): +# $env:RM_SERVER = "https://restic.lab.example" +# $env:RM_TOKEN = "" # omit for announce-and-approve +# iwr "$env:RM_SERVER/install/install.ps1" -UseBasicParsing | iex +# +# What it does: +# 1. checks for admin elevation +# 2. downloads the matching agent binary from the server +# 3. lays down C:\Program Files\restic-manager\ and +# C:\ProgramData\restic-manager\ (config + state) +# 4. registers the agent as a Windows service via the agent's own +# `install` subcommand (which uses the SCM API) +# 5. enrolls (token flow if RM_TOKEN set, otherwise announce flow) +# by spawning the agent with the right CLI flags and waits +# until config is written +# 6. surfaces (but does NOT disable) any existing scheduled tasks +# whose name contains "restic" so the operator can decide +# +# Idempotent — safe to re-run. 
+ +[CmdletBinding()] +param( + [string]$Server = $env:RM_SERVER, + [string]$Token = $env:RM_TOKEN, + [string]$InstallDir = 'C:\Program Files\restic-manager', + [string]$DataDir = 'C:\ProgramData\restic-manager' +) + +$ErrorActionPreference = 'Stop' + +function Test-Admin { + $id = [System.Security.Principal.WindowsIdentity]::GetCurrent() + $pri = New-Object System.Security.Principal.WindowsPrincipal($id) + return $pri.IsInRole([System.Security.Principal.WindowsBuiltInRole]::Administrator) +} + +function Detect-Arch { + switch ($env:PROCESSOR_ARCHITECTURE) { + 'AMD64' { return 'amd64' } + 'ARM64' { return 'arm64' } + default { throw "unsupported PROCESSOR_ARCHITECTURE: $($env:PROCESSOR_ARCHITECTURE)" } + } +} + +function Detect-ResticTasks { + Write-Host '' + Write-Host '— Existing restic-named scheduled tasks (review manually) —' + try { + $tasks = Get-ScheduledTask -ErrorAction SilentlyContinue | + Where-Object { $_.TaskName -match 'restic' -or $_.TaskPath -match 'restic' } + if ($tasks) { + foreach ($t in $tasks) { + Write-Host " * $($t.TaskPath)$($t.TaskName) state=$($t.State)" + Write-Host " Disable with: Disable-ScheduledTask -TaskName '$($t.TaskName)' -TaskPath '$($t.TaskPath)'" + } + } else { + Write-Host ' (none found)' + } + } catch { + Write-Host ' (Get-ScheduledTask failed; review the Task Scheduler UI manually)' + } + Write-Host '' +} + +# --- preflight ------------------------------------------------------- + +if (-not (Test-Admin)) { + throw 'install.ps1: must be run from an elevated PowerShell (Run as administrator).' +} +if (-not $Server) { + throw 'install.ps1: -Server (or $env:RM_SERVER) is required, e.g. 
https://restic.lab.example' +} + +$arch = Detect-Arch +Write-Host "install.ps1: server=$Server arch=$arch" + +# --- directories ----------------------------------------------------- + +New-Item -ItemType Directory -Force -Path $InstallDir | Out-Null +New-Item -ItemType Directory -Force -Path $DataDir | Out-Null + +# --- download agent -------------------------------------------------- + +$agentExe = Join-Path $InstallDir 'restic-manager-agent.exe' +$tmpExe = "$agentExe.tmp" +$dlURL = "$Server/agent/binary?os=windows&arch=$arch" +Write-Host "install.ps1: downloading $dlURL" +Invoke-WebRequest -UseBasicParsing -Uri $dlURL -OutFile $tmpExe +# Atomic-ish replace: stop service if running so the .exe isn't busy. +try { Stop-Service -Name 'restic-manager-agent' -ErrorAction SilentlyContinue } catch {} +Move-Item -Force -Path $tmpExe -Destination $agentExe + +# --- enroll / announce ----------------------------------------------- + +$cfgPath = Join-Path $DataDir 'agent.yaml' +# Named $agentArgs rather than $args: $args is a PowerShell automatic +# variable and should not be assigned to. +$agentArgs = @('-config', $cfgPath, '-enroll-server', $Server) +if ($Token) { + $agentArgs += @('-enroll-token', $Token) + Write-Host 'install.ps1: enrolling with one-time token' +} else { + Write-Host 'install.ps1: no RM_TOKEN — running announce-and-approve flow.' + Write-Host ' The fingerprint will print below. Compare it with the dashboard before clicking Accept.' +} +# NOTE(review): in the announce flow the agent appears to continue into its +# normal run loop after acceptance (see cmd/agent/main.go), so this call may +# not return and the service steps below may never run — confirm. +& $agentExe @agentArgs +if ($LASTEXITCODE -ne 0) { + throw "install.ps1: agent enrolment failed (exit $LASTEXITCODE)" +} + +# --- install + start service ---------------------------------------- + +# The 'install' subcommand registers the service via the SCM. If +# already registered, it errors loudly — re-run with -Force only if +# you've manually verified. +# A native command's non-zero exit does not raise a PowerShell exception, +# so the exit code is checked explicitly; catch only covers launch failures. +try { + & $agentExe install + if ($LASTEXITCODE -ne 0) { + Write-Host "install.ps1: 'install' exited $LASTEXITCODE (service may already be registered); continuing." + } +} catch { + Write-Host "install.ps1: service may already be registered ($_); continuing." +} +try { + Start-Service -Name 'restic-manager-agent' +} catch { + Write-Host "install.ps1: Start-Service failed ($_); check Event Viewer." 
+} + +Detect-ResticTasks + +Write-Host '' +Write-Host 'install.ps1: done.' +Write-Host " config : $cfgPath" +Write-Host " binary : $agentExe" +Write-Host " service: restic-manager-agent (Get-Service to inspect)" diff --git a/docs/superpowers/plans/2026-05-04-p2-completion.md b/docs/superpowers/plans/2026-05-04-p2-completion.md new file mode 100644 index 0000000..1bc93f2 --- /dev/null +++ b/docs/superpowers/plans/2026-05-04-p2-completion.md @@ -0,0 +1,259 @@ +# P2 Completion Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Close every remaining P2 task in `tasks.md`: P2R-09 (auto-init UX), P2R-10/11/12 (hooks), P2R-13 (bandwidth wiring + per-job override), P2R-14 (schedule next/last run), P2-16 (Windows svc), P2-17 (`install.ps1`), P2-18 (announce-and-approve). + +**Architecture:** Server stays HTTP+WS; agent stays a single binary that auto-restages via `make build`. Hooks live on `source_groups` (and host-level defaults). Announce-and-approve adds a separate WS path (`/ws/agent/pending`) and a Pending hosts panel; token-flow stays default. Windows service support uses `golang.org/x/sys/windows/svc` behind a `//go:build windows` tag — Linux builds untouched. **Operator is away — make best guesses on small UX choices, but commit each item separately so the choices are reviewable.** + +**Tech Stack:** Go 1.23+, chi router, modernc/sqlite, `coder/websocket`, `robfig/cron/v3`, HTMX + Tailwind, `golang.org/x/sys/windows/svc`, Ed25519 (stdlib). + +--- + +## Pre-flight + +- [ ] **Run baseline:** `go vet ./... && go build ./... && go test ./...` — must be green before starting. Restage agent + restart server (per CLAUDE.md restage block) so smoke env is warm. + +## Order of execution + +Smallest blast-radius first. UI polish → bandwidth → next/last → hooks → announce → Windows. Commit and restage at each task boundary. Run `go vet ./... 
&& go test ./...` before every commit. + +--- + +## Task 1 — P2R-13a: Wire bandwidth caps into restic invocations + +**Files:** +- Modify: `internal/restic/runner.go` (add `LimitUploadKBps`, `LimitDownloadKBps` to `Env` or to a per-call options struct already present; emit `--limit-upload N`/`--limit-download N` on `restic backup|forget|prune|check|restore`) +- Modify: `internal/agent/runner/*.go` — pass host-wide caps into the runner. Caps come from `agent.config.Config` or are pushed via `config.update`. Decision: ship caps in the existing `config.update` envelope as new fields `bandwidth_up_kbps`, `bandwidth_down_kbps`. Server pushes on hello + on `PUT /api/hosts/{id}/bandwidth`. +- Modify: `internal/api/messages.go` — extend `ConfigUpdatePayload` with the two int pointers. +- Modify: `internal/server/ws/handler.go` (or wherever hello/config push lives) — include caps in the pushed config. +- Modify: `internal/server/http/host_bandwidth.go` — after `SetHostBandwidth`, fan out a `config.update` to the connected agent (mirror the credentials-edit path). +- Test: `internal/restic/runner_test.go` — assert flag injection. +- Test: `internal/server/ws/*_test.go` — assert config.update carries caps on hello and on edit. + +- [ ] **Step 1.1** Add `LimitUploadKBps *int`, `LimitDownloadKBps *int` to whatever per-host config the runner already consults. Existing pattern is `restic.Env{}`; extend it. +- [ ] **Step 1.2** Failing test in `internal/restic/runner_test.go`: build a backup command with `LimitUploadKBps=1024`, assert the resulting argv contains `--limit-upload 1024`. +- [ ] **Step 1.3** Implement: prepend the flags in argv builders for `backup`, `forget`, `prune`, `check`, `restore`. Skip when nil/<=0. +- [ ] **Step 1.4** Wire `config.update` payload — server reads `Host.BandwidthUpKBps`/`DownKBps`, includes them in the existing `ConfigUpdatePayload` push on hello and on bandwidth edit (mirror cred-edit fan-out in `internal/server/http/host_credentials.go`). 
+- [ ] **Step 1.5** Agent applies caps: store in the in-memory dispatcher state on `config.update`, attach to every restic call. +- [ ] **Step 1.6** `go vet ./... && go test ./... && make build && `. Commit: +``` +agent+server: apply host bandwidth caps to restic invocations +``` + +## Task 2 — P2R-13b: Per-job override on Run-now confirm dialog + +**Decision:** A small numeric input on the per-source-group Run-now button (and dashboard Run-all). Operator is away — keep it minimal: two optional inputs (up/down KB/s) on the dispatch endpoint; UI shows a `
<details>` "Limit bandwidth for this run" disclosure with two number inputs. + +**Files:** +- Modify: `internal/server/http/sources.go` (or wherever the per-group Run-now POST lives) — accept optional `bandwidth_up_kbps`/`bandwidth_down_kbps` form fields, pass through. +- Modify: dispatch path (`internal/server/dispatch_*.go` or `ws/handler.go` job-dispatch core) — accept overrides, include in the `command.run` payload. +- Modify: `internal/api/messages.go` — `CommandRunPayload` gains optional caps that take precedence over host-wide caps when present. +- Modify: agent dispatcher — use payload override if present, else fall back to config caps. +- Modify: `web/templates/pages/host_sources.html` (and the schedules Run-now form) — `<details>` block. +- Test: HTTP test for the new form fields; agent runner test for override precedence. + +- [ ] **Step 2.1** Failing test: POST to per-group Run-now with `bandwidth_up_kbps=512` → assert dispatched payload carries 512. +- [ ] **Step 2.2** Implement endpoint changes + payload extension. +- [ ] **Step 2.3** Agent override precedence test (payload wins over config). +- [ ] **Step 2.4** UI `<details>
` blocks (one per Run-now form). +- [ ] **Step 2.5** Playwright spot-check via `:8080` smoke env: open Sources tab, expand the Run-now disclosure, fire with limit=128, then open the live job log and confirm the agent's restic argv (read `/tmp/rm-smoke/server.log` for the dispatched command — it logs argv) shows `--limit-upload 128`. +- [ ] **Step 2.6** Commit. + +## Task 3 — P2R-14: Schedule "next run" / "last run" + +**Files:** +- Modify: `internal/store/schedules.go` — add `NextRunAt(time.Time)` derivation helper and `LatestScheduledJobAt(host_id, schedule_id) (time.Time, error)` (or a single batched fetch for all schedules of a host). +- Modify: dashboard host row (`web/templates/partials/host_row.html`) — show "Next: …" and "Last: …" when there's a single covering schedule (already detected in slice 5). +- Modify: `web/templates/pages/host_schedules.html` — add Next/Last columns to the schedules table. +- Modify: relevant page handlers (`internal/server/http/ui_schedules.go`, dashboard handler) — populate the data. +- Test: `schedules_test.go` for next-run derivation (parse cron, compute next from a fixed `now`). + +- [ ] **Step 3.1** Add `NextRun(cronExpr string, from time.Time) (time.Time, error)` helper using `robfig/cron/v3`'s `Parse(...).Next(from)`. Test with three crons. +- [ ] **Step 3.2** Add `LatestJobByActorKindForSchedule(host_id, schedule_id) (time.Time, status, error)` query against `jobs` (filter `actor_kind='schedule'` AND `schedule_id=?`, ORDER BY `started_at` DESC LIMIT 1). +- [ ] **Step 3.3** Wire schedules-page handler to populate Next/Last per row; render relative time + ISO tooltip (mirror existing `formatRelTime` template helper if it exists; otherwise use a simple "5m ago" helper). +- [ ] **Step 3.4** Wire dashboard row: when single covering schedule, surface "Next: 03:00" / "Last: 8h ago — succeeded". +- [ ] **Step 3.5** Playwright spot-check: a host with a schedule shows Next/Last; pause it → Next becomes "—" / "(paused)". 
+- [ ] **Step 3.6** Commit. + +## Task 4 — P2R-09: Auto-init UX polish + +**Files:** +- Modify: `web/templates/pages/host_repo.html` — danger-zone re-init button + two-step confirm (type the host name). +- Modify: `internal/server/http/ui_repo.go` (or new `repo_reinit.go`) — `POST /hosts/{id}/repo/reinit` admin-only, audit-logged. Server runs `restic init --force` (or wipes-then-inits — pick the safer of the two; restic doesn't truly wipe a repo, the operator must clear the bucket. **Best guess:** dispatch a normal `init` job with a flag that re-runs even if the repo claims to exist; if restic refuses, surface "the repo on the remote already has data — clear it manually before re-init" via the job log). +- Modify: host detail page header / vitals strip — surface init result line. Use the existing latest-`init`-job query to render "repo ready · initialised ago" or "init failed · job N · retry". +- Test: HTTP test for re-init endpoint (auth, audit, host-name confirm); template test that the result line renders for both states. + +- [ ] **Step 4.1** Add helper: `LatestJobByKind(host_id, "init")` — already exists from P2R-06 (`store.LatestJobByKind`). Reuse. +- [ ] **Step 4.2** Render init line into vitals strip; show "init failed" amber when latest init failed. +- [ ] **Step 4.3** Implement `POST /hosts/{id}/repo/reinit` handler — admin role check, requires a `confirm_hostname` form field that must equal `host.Name`, returns 400 otherwise. Dispatches a fresh `init` job. +- [ ] **Step 4.4** Add danger-zone re-init form to `host_repo.html` (currently disabled per slice 4). Two-step confirm with the typed hostname. +- [ ] **Step 4.5** Playwright: visit `/hosts/{id}/repo`, click re-init, type wrong hostname → blocked; type right hostname → dispatches init job → returns to live log. +- [ ] **Step 4.6** Commit. 
+ +## Task 5 — P2R-10: Hook schema (migration 0010) + +**Files:** +- Create: `internal/store/migrations/0010_hooks.sql` + - `ALTER TABLE source_groups ADD COLUMN pre_hook BLOB;` (AEAD ciphertext, NULLable) + - `ALTER TABLE source_groups ADD COLUMN post_hook BLOB;` + - `ALTER TABLE hosts ADD COLUMN pre_hook_default BLOB;` + - `ALTER TABLE hosts ADD COLUMN post_hook_default BLOB;` + - All four are AEAD ciphertext (existing `crypto.AEAD`); BLOB column type. +- Modify: `internal/store/types.go` — add `PreHook *string` (decrypted), `PostHook *string` to `SourceGroup`; same to `Host`. +- Modify: `internal/store/sources.go` + `internal/store/hosts.go` — getters/setters encrypt on write, decrypt on read. Pass `crypto.AEAD` through (pattern mirrors `host_credentials.go`). +- Test: encrypt/decrypt round-trip; setting `nil` clears the column. + +- [ ] **Step 5.1** Write migration SQL. Column-level ALTERs only (per CLAUDE.md). +- [ ] **Step 5.2** Update store types + getters/setters with AEAD encrypt/decrypt. Mirror `internal/store/host_credentials.go` patterns exactly. +- [ ] **Step 5.3** Round-trip test: set hook on a source group; reload; assert plaintext returned. Set nil; assert nil after reload. +- [ ] **Step 5.4** `go vet && go test`. Commit. + +## Task 6 — P2R-11: Agent execution of hooks + +**Files:** +- Modify: `internal/api/messages.go` — `ConfigUpdatePayload` (or the per-source-group bundle inside `ScheduleSetPayload`) carries `PreHook`, `PostHook` plaintext (server has decrypted by then; wire is authenticated WS, same trust boundary as repo creds). +- Modify: agent dispatcher — for `kind=backup` only: + - Run `pre_hook` (if present) via `os/exec` with the host shell (`/bin/sh -c` on Linux, `cmd.exe /C` on Windows). Capture stdout+stderr → JobLog with `hook:` prefix. Non-zero exit aborts the backup, marks the job failed with `pre_hook` error. + - Run `post_hook` (if present) **always** after the backup, with `RM_JOB_STATUS=succeeded|failed` env var. 
Capture into JobLog, prefix `hook:`. Non-zero exit on `post_hook` does NOT change job status (warning logged). +- Skip both for `kind` ∈ {forget, prune, check, unlock, init} per spec.md §14.3. +- Test: dispatcher test with a `pre_hook` that exits 1 → backup not started; `post_hook` always runs and sees `RM_JOB_STATUS`. + +- [ ] **Step 6.1** Plumb hooks through `ScheduleSetPayload` source-group bundle + per-group Run-now `command.run` payload (override host-default with group hook if both present). Server-side resolution: host default if group hook is empty. +- [ ] **Step 6.2** Agent dispatcher: factor hook execution into `internal/agent/runner/hooks.go`. Use `exec.CommandContext`, set env, plumb output to existing JobLog stream with `Source: "hook"` (or prefix the log lines `hook: …`). +- [ ] **Step 6.3** Failing test in `internal/agent/runner/runner_test.go` (create file if absent): `pre_hook=/bin/false` → job fails with `pre_hook failed (exit 1)` and the actual restic backup never runs (assert via mock-restic shim). +- [ ] **Step 6.4** Test: `post_hook` runs even when backup fails; receives `RM_JOB_STATUS=failed`. +- [ ] **Step 6.5** Test: hooks skipped on `forget`/`prune`/`check`/`unlock`/`init` jobs. +- [ ] **Step 6.6** `go vet && go test && make build`. Commit. + +## Task 7 — P2R-12: Hook editor UI + +**Files:** +- Modify: `web/templates/pages/source_group_edit.html` (new or extend existing source-group form) — ` + +
+ + +
+
+ +
+ + {{/* ---------- Danger zone ---------- */}}

Danger zone

secrets.enc is reused.

- +
+ + +
diff --git a/web/templates/pages/host_schedules.html b/web/templates/pages/host_schedules.html index 764ae2d..4a49e79 100644 --- a/web/templates/pages/host_schedules.html +++ b/web/templates/pages/host_schedules.html @@ -33,6 +33,8 @@
Status
Cron
Sources
+
Next
+
Last
{{range $i, $sc := $page.Schedules}} @@ -52,6 +54,14 @@ {{if $name}}{{$name}}{{else}}unknown{{end}} {{end}} +
+ {{if $sc.NextRun}}{{relTime $sc.NextRun}}{{else if not $sc.Enabled}}(paused){{else}}—{{end}} +
+
+ {{if $sc.LastRun}}{{relTime $sc.LastRun}}{{else}}—{{end}} +
{{if eq $host.Status "online"}} {{if $sc.Enabled}} diff --git a/web/templates/pages/host_sources.html b/web/templates/pages/host_sources.html index 36a8077..d0d1087 100644 --- a/web/templates/pages/host_sources.html +++ b/web/templates/pages/host_sources.html @@ -53,12 +53,27 @@ {{if gt $row.SnapshotCount 0}} · {{$row.SnapshotCount}} snapshot{{if ne $row.SnapshotCount 1}}s{{end}}{{end}}
-
+
{{if and (gt (len $g.Includes) 0) (eq $host.Status "online")}} - +
+ +
+ Limit bandwidth for this run +
+ + + + + KB/s +
+
+
{{else}} diff --git a/web/templates/pages/source_group_edit.html b/web/templates/pages/source_group_edit.html index 3ef784e..6bea017 100644 --- a/web/templates/pages/source_group_edit.html +++ b/web/templates/pages/source_group_edit.html @@ -95,6 +95,27 @@ Each retry doubles the wait. Manual run-now ignores this — it just fails immediately if the agent is offline.
+

+ Hooks + backup jobs only +

+
+ Hooks run as the agent service user — root on Linux, LocalSystem on Windows. Treat them like any other root cron entry. +
+
+ + +
Non-zero exit aborts the backup. Stored AEAD-encrypted.
+
+
+ + +
Always runs. RM_JOB_STATUS is set to the backup's outcome. Stored AEAD-encrypted.
+
+
Cancel diff --git a/web/templates/partials/host_chrome.html b/web/templates/partials/host_chrome.html index 01606de..9e3f741 100644 --- a/web/templates/partials/host_chrome.html +++ b/web/templates/partials/host_chrome.html @@ -105,6 +105,22 @@
+ {{/* ---------- repo init line (P2R-09) ---------- */}} + {{if $page.InitStatus}} +
+ {{if eq $page.InitStatus "succeeded"}} + repo ready · initialised {{relTime $page.InitAt}} + {{else if eq $page.InitStatus "failed"}} + init failed · + job {{$page.InitJobID}} · retry from the Repo tab's danger zone + {{else if eq $page.InitStatus "running"}} + init running… · live log → + {{else if eq $page.InitStatus "queued"}} + init queued · job {{$page.InitJobID}} + {{end}} +
+ {{end}} + {{/* ---------- secondary tabs ---------- */}}
Snapshots {{comma $host.SnapshotCount}} diff --git a/web/templates/partials/host_row.html b/web/templates/partials/host_row.html index 98b27ea..9c7799e 100644 --- a/web/templates/partials/host_row.html +++ b/web/templates/partials/host_row.html @@ -30,6 +30,9 @@ {{- else -}} never run {{- end -}} + {{- if .NextRun -}} +
next {{relTime .NextRun}} + {{- end -}}
{{bytes $h.RepoSizeBytes}}