ci(release): use DEV_TOKEN for registry login

The auto-issued GITHUB_TOKEN lacks write:package scope on this Gitea instance, so the v0.9.0 tag build failed at docker login. Switch to the user-level DEV_TOKEN secret which has the correct scope.
Merge pull request 'testing: bootstrap UI, agent reliability, NS-01..04 + alert username' (#18) from ns-batch-host-ops into main
2026-05-06 19:05:54 +01:00 · 2026-05-05 21:09:17 +00:00 · 2026-05-05 22:03:15 +01:00 · 2026-05-05 16:36:08 +00:00 · 2026-05-05 17:15:00 +01:00 · 2026-05-05 15:18:48 +01:00
133 changed files with 111 additions and 25882 deletions
@@ -1,32 +0,0 @@
 <!--
 Thanks for the PR! A few quick checks before submitting:
 * Did you open an issue first for non-trivial changes?
 * `make lint test` is green locally?
 * Commits are focused (one logical change per commit)?
 * No `Co-Authored-By` trailers (repo policy)?
 * No new dependencies without a one-line justification below?
 -->
 ## Summary
 <!-- One paragraph: what changed and why. -->
 ## Test plan
 <!-- Bullet list of what you actually ran. Be specific.
     - `make test` → green
     - Manually exercised the new flow at /hosts/{id}/foo
     - Smoke env: enrolled a fresh host, ran a backup end-to-end
 -->
 ## Notes for the reviewer
 <!-- Anything the reviewer needs to know that isn't obvious from the
     diff: related issue, follow-up work that's intentionally not
     in this PR, deferred concerns, design alternatives considered
     and rejected. -->
 ## Linked issues
 <!-- "Closes #123" / "Refs #456" / "Part of P5-06" -->
@@ -1,52 +0,0 @@
 ---
 name: Bug report
 about: Something isn't behaving the way the docs / code suggest it should
 title: "[bug] "
 labels: bug
 ---
 ## What happened
 <!-- A clear description of the actual behaviour. Include the exact
     UI surface, API endpoint, or CLI invocation involved. -->
 ## What you expected
 <!-- What you thought would happen, and where that expectation came from
     (docs page, command output, prior behaviour). -->
 ## Steps to reproduce
 1.
 2.
 3.
 ## Environment
 - restic-manager server version: <!-- `restic-manager-server --version` or footer of the UI -->
 - Agent version (if relevant): <!-- `restic-manager-agent --version` -->
 - restic version on affected host: <!-- `restic version` -->
 - Host OS: <!-- e.g. "Ubuntu 22.04 amd64" or "Windows Server 2022" -->
 - How was the server installed: <!-- docker compose / source build / other -->
 ## Logs / output
 <details><summary>Server log (sanitised)</summary>
 ```
 <!-- paste relevant lines; redact tokens, passwords, repo URLs -->
 ```
 </details>
 <details><summary>Agent log (sanitised)</summary>
 ```
 ```
 </details>
 ## Anything else
 <!-- Screenshots, related issues, recent changes you made before the
     bug appeared, anything that might help. -->
@@ -1,34 +0,0 @@
 ---
 name: Feature request
 about: Suggest a new capability or change to existing behaviour
 title: "[feature] "
 labels: enhancement
 ---
 ## What you're trying to do
 <!-- Describe the use case, not the proposed solution. Who is the
     operator, what are they trying to accomplish, and what's
     blocking them today? -->
 ## Why the current behaviour falls short
 <!-- What does the system do today, and where does it stop short of
     the use case above? -->
 ## Proposed direction (optional)
 <!-- If you have a specific design in mind, describe it. Skip this
     section if you'd rather leave it to the maintainer. -->
 ## Scope check
 - [ ] I've read [`spec.md`](../spec.md) §2 (Goals & Non-Goals).
 - [ ] This isn't already on the roadmap in [`tasks.md`](../tasks.md).
 - [ ] This fits the project's "small fleet, one person operating"
      target rather than enterprise / multi-tenant / SaaS use cases.
 ## Anything else
 <!-- Related restic features, prior art in similar tools, links to
     discussions you've had elsewhere. -->
@@ -1,98 +0,0 @@
 # P5-06 — End-to-end test suite.
 #
 # Spec : docs/superpowers/specs/2026-05-07-p5-oss-readiness-design.md
 # Stack: e2e/compose.e2e.yml (server + agent + rest-server + playwright)
 # Tests: e2e/playwright/tests/*.spec.ts
 #
 # Triggered on every PR into main and on workflow_dispatch. Runs
 # longer than the unit-test workflow (~3-4 minutes for a clean run);
 # kept separate so a slow e2e doesn't block the fast lint/test loop.
 #
 # Networking note: every interaction with the server (health probe,
 # Playwright) happens from a container on the compose `rmnet`
 # network, addressing the server as `http://server:8080`. We can't
 # rely on `127.0.0.1:8080` because Gitea's runner executes steps
 # inside its own container, where compose's host port-publish is
 # not visible.
 name: e2e
 on:
  pull_request:
    branches: [main]
  workflow_dispatch:
 jobs:
  # Single job: build the compose stack, boot it, scrape the bootstrap
  # token from the server log, start the agent, then run Playwright.
  e2e:
    name: Playwright vs docker-compose
    runs-on: ubuntu-latest
    timeout-minutes: 15
    steps:
      - uses: actions/checkout@v4
      - name: Build the e2e stack
        run: docker compose -f e2e/compose.e2e.yml build
      # Only server, rest-server and source-fixture are started here; the
      # agent is brought up in a later step, after the bootstrap token has
      # been captured.
      - name: Bring up the stack
        run: docker compose -f e2e/compose.e2e.yml up -d server rest-server source-fixture
      # Polls /api/version from a throwaway curl container attached to the
      # e2e_rmnet network: up to 30 attempts, 2 s apart. On timeout the
      # server logs are dumped and the step fails.
      - name: Wait for server health
        run: |
          set -eu
          for i in $(seq 1 30); do
            if docker run --rm --network e2e_rmnet curlimages/curl:8.10.1 \
                  -fsS http://server:8080/api/version >/dev/null 2>&1; then
              echo "server up"; exit 0
            fi
            sleep 2
          done
          echo "server didn't come up"; docker compose -f e2e/compose.e2e.yml logs server; exit 1
      # Greps the server logs for a 'bootstrap token' line (plus the two
      # lines after it) and takes the first run of 40+ characters from
      # [a-zA-Z0-9_-]. Retried up to 15 times, 1 s apart. The value is
      # exported to later steps through $GITHUB_ENV; only its length is
      # echoed on success.
      - name: Capture bootstrap token from server logs
        id: bootstrap
        run: |
          set -eu
          for i in $(seq 1 15); do
            line=$(docker compose -f e2e/compose.e2e.yml logs server 2>&1 | grep -E 'bootstrap token' -A2 | grep -Eo '[a-zA-Z0-9_-]{40,}' | head -1 || true)
            if [ -n "$line" ]; then
              echo "RM_BOOTSTRAP_TOKEN=$line" >> "$GITHUB_ENV"
              echo "got bootstrap token (${#line} chars)"
              exit 0
            fi
            sleep 1
          done
          echo "bootstrap token not found in logs"
          docker compose -f e2e/compose.e2e.yml logs server
          exit 1
      - name: Start the agent
        run: docker compose -f e2e/compose.e2e.yml up -d agent
      # Pre-creates the report directories and makes them writable by any
      # user. NOTE(review): presumably so the playwright container can write
      # into them whatever UID it runs as; confirm against compose.e2e.yml.
      - name: Prepare report mounts
        run: |
          mkdir -p e2e/playwright/playwright-report e2e/playwright/test-results
          chmod -R a+rwX e2e/playwright/playwright-report e2e/playwright/test-results
      - name: Run Playwright tests
        env:
          RM_BOOTSTRAP_TOKEN: ${{ env.RM_BOOTSTRAP_TOKEN }}
        run: docker compose -f e2e/compose.e2e.yml run --rm playwright
      # Diagnostics, only when an earlier step failed: the last 200 log
      # lines from each of server, agent and rest-server.
      - name: Compose logs (on failure)
        if: failure()
        run: |
          docker compose -f e2e/compose.e2e.yml logs --tail=200 server
          docker compose -f e2e/compose.e2e.yml logs --tail=200 agent
          docker compose -f e2e/compose.e2e.yml logs --tail=200 rest-server
      - name: Upload Playwright report (on failure)
        if: failure()
        uses: actions/upload-artifact@v3
        with:
          name: playwright-report
          path: e2e/playwright/playwright-report
          retention-days: 7
      # Runs regardless of earlier outcomes; `down -v` also removes the
      # stack's volumes.
      - name: Tear down
        if: always()
        run: docker compose -f e2e/compose.e2e.yml down -v
@@ -2,10 +2,6 @@
 /bin/
 /dist/
 # Generated mdBook output (source under docs/book/src is committed,
 # the rendered book/ directory is not).
 /docs/book/book/
 # Local data / runtime state
 /data/
 /certs/
@@ -38,7 +38,7 @@ but the **agent** is fetched by the install script from the server's
 **install script** are fetched from `<DataDir>/install/`. Plain
 `make build` doesn't touch any of those — the source-of-truth files
 in the working tree (`deploy/install/*`, `bin/restic-manager-agent`)
-must be copied into `$HOME/smoke/data/...` *and* the running agent
+must be copied into `/tmp/rm-smoke/data/...` *and* the running agent
 on this dev host needs replacing if the change touches agent code or
 the unit file.
@@ -53,13 +53,13 @@ asking the operator to test.**
 ```sh
 # 1. Restage what the install script serves (binary + unit + script).
 cp bin/restic-manager-agent \
-   $HOME/smoke/data/agent-binaries/restic-manager-agent-linux-amd64
+   /tmp/rm-smoke/data/agent-binaries/restic-manager-agent-linux-amd64
 cp deploy/install/install.sh \
-   $HOME/smoke/data/install/install.sh
+   /tmp/rm-smoke/data/install/install.sh
 cp deploy/install/install.ps1 \
-   $HOME/smoke/data/install/install.ps1
+   /tmp/rm-smoke/data/install/install.ps1
 cp deploy/install/restic-manager-agent.service \
-   $HOME/smoke/data/install/restic-manager-agent.service
+   /tmp/rm-smoke/data/install/restic-manager-agent.service
 # 2. Replace the running agent on this dev box and restart the
 #    service. Skip only when the change is server-side only AND
@@ -74,36 +74,15 @@ sudo -n systemctl restart restic-manager-agent
 # 3. The server runs from the working tree; restart it manually
 #    after a build that touches server code:
 pkill -f restic-manager-server
-RM_LISTEN=:8080 RM_DATA_DIR=$HOME/smoke/data \
+RM_LISTEN=:8080 RM_DATA_DIR=/tmp/rm-smoke/data \
 RM_BASE_URL=http://127.0.0.1:8080 \
-RM_SECRET_KEY_FILE=$HOME/smoke/data/secret.key \
+RM_SECRET_KEY_FILE=/tmp/rm-smoke/data/secret.key \
 RM_COOKIE_SECURE=false \
-./bin/restic-manager-server >> $HOME/smoke/server.log 2>&1 &
+./bin/restic-manager-server >> /tmp/rm-smoke/server.log 2>&1 &
 ```
-## Smoke server: use the Make targets, not raw `nohup`
+A `make smoke-deploy` target that bundles all of this would be a
-
+good follow-up.
 The smoke server runs as a transient `systemd --user` unit named
 `restic-manager-smoke.service` so it survives any sandbox or
 process-group boundary that would otherwise SIGTERM a backgrounded
 process. Use the Make targets:
 ```
 make smoke-restart   # rebuild server + (re)launch as systemd --user unit
 make smoke-status    # systemctl --user status
 make smoke-logs      # tail $HOME/smoke/server.log
 make smoke-stop      # stop the unit
 make smoke-deploy    # full rebuild + restage agent assets + restart
 ```
 `./bin/restic-manager-server &` from inside a Bash tool call gets
 reaped when the tool exits — don't do that. If the unit fails to
 start: `systemctl --user status restic-manager-smoke` and
 `$HOME/smoke/server.log` have the diagnosis.
 `smoke-deploy` does NOT touch `/usr/local/bin/restic-manager-agent`
 on this dev box; if your change requires the live agent here to
 update, run the agent restage block above by hand.
 ## Migrations: prefer column-level ALTERs over table rebuilds
@@ -1,69 +0,0 @@
 # Code of Conduct
 restic-manager is a small project run by one person. This Code of
 Conduct sets out the basic expectations for participating in the
 project's issue tracker, pull requests, and any other community
 spaces (chat, mailing lists) we may run in future.
 ## Expected behaviour
 - **Be civil.** Disagreement is fine; rudeness is not. The same
  comment can usually be made without making it personal.
 - **Assume good faith.** People asking what feels like a basic
  question may be new to the project. People proposing what feels
  like a duplicate idea may not have seen the prior discussion.
  Point them to the right place politely.
 - **Stay on topic.** Issue threads are for the issue. Tangential
  conversations belong in their own thread.
 - **Acknowledge the project's scope.** restic-manager is
  intentionally small in scope (see `spec.md` §2). Reasonable
  feature suggestions may still be declined for fit reasons.
 ## Unacceptable behaviour
 - Harassment, threats, or insults — public or private.
 - Discriminatory comments based on age, body size, disability,
  ethnicity, gender identity or expression, level of experience,
  nationality, personal appearance, race, religion, sexual identity
  or orientation.
 - Sustained disruption — derailing threads, ignoring repeated
  requests to take a discussion elsewhere, brigading.
 - Publishing other people's private information without permission.
 ## Reporting
 If someone in the project's spaces is behaving in a way that
 breaches this Code of Conduct, contact the maintainer directly
 through the contact details on their Gitea profile, or via the
 private security disclosure path documented in
 [SECURITY.md](./SECURITY.md). Reports stay confidential.
 The maintainer will review the report, gather context if needed,
 and respond. Possible outcomes include a private warning, a public
 clarification of expectations, a temporary or permanent ban from
 project spaces, or no action if the report doesn't hold up.
 There is no formal appeals process — this is a one-person project,
 not a foundation. If you think a decision was wrong you can say
 so, in writing, to the maintainer; that's it.
 ## Scope
 This Code of Conduct applies to interactions in any space the
 project owns or operates: the Gitea repository (issues, pull
 requests, discussions, wiki), any chat channels we publish, and
 any conferences or events the project is officially represented at.
 It does not apply to:
 - Forks of the project that aren't being submitted back upstream.
 - Conversations between contributors that don't reference the
  project.
 - Public criticism of the project itself.
 ## Acknowledgement
 This document borrows shape and language from the
 [Contributor Covenant](https://www.contributor-covenant.org/) v2.1
 but is intentionally shorter and adapted to the project's
 single-maintainer reality.
@@ -1,168 +1,30 @@
-# Contributing to restic-manager
+# Contributing
-Thanks for your interest in restic-manager. This document covers how
+Thanks for your interest in contributing to restic-manager.
 to set up a development environment, the conventions the project
 follows, and how patches make it from your machine into `main`.
-## Project status and scope
+> This is a placeholder. The project is in pre-alpha (Phase 1 / MVP). A
 > full contributor guide will land alongside the Phase 5 OSS-readiness
 > work — see [`tasks.md`](./tasks.md) P5-02. Until then the notes below
 > apply.
-restic-manager is in pre-1.0. Core functionality (Phases 0–4) is
+## Before opening a PR
 landed; OSS-readiness polish is in progress. The top of
 [`tasks.md`](./tasks.md) tracks what's next; [`spec.md`](./spec.md)
 is the canonical design doc and the source of truth for any
 "why is it built this way" question.
-The project is **single-maintainer, hobbyist-scale, and licensed
+1. Open an issue first for non-trivial changes — the design is still
-under [PolyForm Noncommercial 1.0.0](./LICENSE)**. That has two
+   moving (see [`spec.md`](./spec.md)) and unsolicited large PRs may
-practical implications:
+   conflict with in-flight work.
 2. `make lint test` should pass.
 3. Match the existing code style — `gofumpt`, `goimports`, no comments
   that just restate what the code does.
 4. Keep commits focused; one logical change per commit.
-1. Big PRs without prior discussion may be declined for fit
+## Reporting security issues
   reasons even when they're correct — opening an issue first lets
   us check alignment cheaply.
 2. Commercial use is not permitted by the license. Bug reports and
   patches from operators of personal/community deployments are
   very welcome.
-## Getting started
+Please do **not** open a public issue for security problems. A
-
+`SECURITY.md` with a private disclosure path will be added in Phase 5
-### Prerequisites
+(P5-05). Until then, contact the repository owner directly via the
-
+contact details on their Gitea profile.
 - Go 1.25 or newer (`go.mod` is the source of truth)
 - `make`
 - For the front-end CSS bundle: nothing extra — `make build`
  downloads a pinned `tailwindcss` standalone binary into `bin/`.
 - For the docs site: nothing extra — `make docs` does the same trick
  with `mdbook`.
 - For end-to-end tests: Docker + Docker Compose, plus `npx` for
  Playwright.
 ### One-time setup
 ```sh
 git clone https://gitea.dcglab.co.uk/steve/restic-manager.git
 cd restic-manager
 make build          # compiles bin/restic-manager-{server,agent}
 make test           # full unit + integration test sweep
 make lint           # gofumpt + goimports + golangci-lint
 ```
 ### Running locally
 For most development, the [smoke environment](./docs/e2e-smoke.md)
 is the path of least resistance:
 ```sh
 make smoke-restart  # rebuilds, launches as a systemd --user unit
 make smoke-logs     # tail of the server log
 ```
 Then point a browser at `http://127.0.0.1:8080`. The first run
 prints a one-time bootstrap token to the log; use it to create the
 admin user.
 ## Code conventions
 ### Style
 - `gofumpt` for formatting; `goimports` for import grouping.
  Both run via the pre-commit hook in this repo.
 - `golangci-lint` with `.golangci.yml` defaults; CI rejects on lint
  errors.
 - UK English in identifiers, comments, log messages, and UI strings
  (the misspell linter is configured for the UK locale — see
  P3-X5 for the original sweep).
 - Comments explain **why**, not what; avoid restating the code.
  A surprising invariant or an external constraint is worth
  writing down. "Adds 1 to x" is not.
 - `slog` for structured logs. Never log secrets — and especially
  never the merged-creds rest-server URL (see [`CLAUDE.md`](./CLAUDE.md)).
 ### File and package layout
 - `cmd/server` and `cmd/agent` are the two binary entry points.
 - `internal/` holds everything that's not part of the public Go
  API (which is none of it — restic-manager isn't a library).
 - Per-feature packages live under `internal/server/...` for the
  control plane and `internal/agent/...` for the agent.
 - `web/templates/` are HTML templates rendered with the standard
  library; embedded via `web.FS`.
 ### Tests
 - Unit tests live alongside the code as `*_test.go`. Use the
  in-process sqlite store (`store.Open(":memory:")`) when you need
  state — there is no test mock layer to maintain.
 - HTTP handlers test through `httptest.NewServer` against the real
  router; see `internal/server/http/auth_test.go` for the canonical
  fixture pattern.
 - End-to-end tests live in `e2e/` and run against a Docker Compose
  stack. See [`docs/e2e.md`](./docs/e2e.md).
 ### Database migrations
 - Migrations are hand-rolled SQL in `internal/store/migrations/`
  and embedded via `embed.FS`.
 - Prefer column-level `ALTER TABLE` over rebuilds — see
  [`CLAUDE.md`](./CLAUDE.md) "Migrations" section for the FK-cascade
  trap that bit migration 0007's first draft.
 ## Workflow
 ### Before opening a PR
 1. **Open an issue first** for non-trivial changes. The design is
   still moving; an issue lets us agree on direction cheaply.
 2. Run `make lint test` locally — both must pass.
 3. Match existing code style (see above).
 4. Keep commits focused: one logical change per commit. Imperative
   subject lines, body explaining why if it isn't obvious.
 5. Don't add `Co-Authored-By` trailers — repo policy. If you used
   AI assistance in writing the patch, that's fine; we just don't
   pollute every commit message with attribution boilerplate.
 ### Pull requests
 PRs target `main`. CI runs lint + tests on Linux amd64/arm64 and
 Windows amd64; all three must be green to merge. Squash-merge is
 the default; the PR title becomes the merge-commit subject, so
 keep it short and informative.
 The PR template asks for:
 - A short description of what changed and why.
 - A test plan (commands run, scenarios verified).
 - Anything reviewers need to know to assess the change (related
  issue, follow-up work, deferred concerns).
 ### Reporting bugs
 Open an issue with:
 - restic-manager version (`server --version`) and agent version.
 - restic version on the affected host.
 - Steps to reproduce.
 - Server and agent logs (sanitise any tokens before pasting).
 Security-sensitive bugs go through the [SECURITY.md](./SECURITY.md)
 disclosure path instead — please don't open a public issue for
 them.
 ### Suggesting features
 Open an issue describing the use case (not just the proposed
 solution). The roadmap in `tasks.md` shows where the project is
 heading; if the suggestion fits a future phase we'll wire it in
 there. If it falls outside the project's scope (multi-tenancy, SaaS,
 non-restic backends — see `spec.md` §2 non-goals) we'll say so
 early to save your time.
 ## Code of conduct
 Project participation is governed by [CODE_OF_CONDUCT.md](./CODE_OF_CONDUCT.md).
 The short version: be civil; assume good faith; harassment is not
 tolerated.
 ## License
-By contributing you agree that your contributions are licensed
+By contributing you agree that your contributions are licensed under
-under the [PolyForm Noncommercial 1.0.0](./LICENSE) license.
+the [PolyForm Noncommercial 1.0.0](./LICENSE) license.
@@ -7,9 +7,7 @@ AGENT_BIN      := $(BIN_DIR)/restic-manager-agent
 VERSION        ?= $(shell git describe --tags --always --dirty 2>/dev/null || echo dev)
 COMMIT         ?= $(shell git rev-parse HEAD 2>/dev/null || echo none)
 DATE           ?= $(shell date -u +%Y-%m-%dT%H:%M:%SZ)
-VERSION_PKG    := gitea.dcglab.co.uk/steve/restic-manager/internal/version
+LDFLAGS        := -s -w -X main.version=$(VERSION) -X main.commit=$(COMMIT) -X main.date=$(DATE)
 LDFLAGS        := -s -w -X main.version=$(VERSION) -X main.commit=$(COMMIT) -X main.date=$(DATE) \
                  -X $(VERSION_PKG).Version=$(VERSION) -X $(VERSION_PKG).Commit=$(COMMIT)
 GOFLAGS        := -trimpath
 DOCKER_IMAGE   ?= gitea.dcglab.co.uk/steve/restic-manager
 DOCKER_TAG     ?= dev
@@ -24,29 +22,7 @@ TAILWIND_URL      := https://github.com/tailwindlabs/tailwindcss/releases/downlo
 TAILWIND_INPUT    := web/styles/input.css
 TAILWIND_OUTPUT   := web/static/css/styles.css
-# mdBook for the docs site (P5-01). Single static binary, no
+.PHONY: help build server agent test test-race lint fmt tidy clean run-server run-agent docker release tailwind tailwind-watch setup hooks
 # Rust toolchain — same pattern as Tailwind.
 MDBOOK_VERSION    ?= v0.4.51
 MDBOOK_OS         := $(shell uname -s | tr A-Z a-z)
 MDBOOK_TRIPLE     := $(shell uname -m)-unknown-$(if $(filter darwin,$(MDBOOK_OS)),apple-darwin,linux-gnu)
 MDBOOK_BIN        := $(BIN_DIR)/mdbook
 MDBOOK_TARBALL    := mdbook-$(MDBOOK_VERSION)-$(MDBOOK_TRIPLE).tar.gz
 MDBOOK_URL        := https://github.com/rust-lang/mdBook/releases/download/$(MDBOOK_VERSION)/$(MDBOOK_TARBALL)
 DOCS_BOOK_DIR     := docs/book
 DOCS_BOOK_OUT     := $(DOCS_BOOK_DIR)/book
 .PHONY: help build server agent test test-race lint fmt tidy clean run-server run-agent docker release tailwind tailwind-watch docs docs-watch setup hooks smoke-restart smoke-stop smoke-status smoke-logs smoke-deploy
 # ---- smoke-env tooling -------------------------------------------------
 # The smoke server runs as a transient user-systemd unit so it survives
 # bash-tool boundaries and shell restarts. Use `make smoke-restart`
 # any time you've rebuilt the server. `make smoke-deploy` is the full
 # rebuild + restage + restart workflow described in CLAUDE.md.
 SMOKE_UNIT       := restic-manager-smoke
 SMOKE_DATA_DIR   := $(HOME)/smoke/data
 SMOKE_LOG_FILE   := $(HOME)/smoke/server.log
 SMOKE_BASE_URL   := http://127.0.0.1:8080
 SMOKE_LISTEN     := :8080
 help:
 	@grep -E '^[a-zA-Z_-]+:.*?## ' $(MAKEFILE_LIST) | awk 'BEGIN{FS=":.*?## "};{printf "  \033[36m%-14s\033[0m %s\n",$$1,$$2}'
@@ -71,18 +47,6 @@ tailwind-watch: $(TAILWIND_BIN) ## Watch and rebuild on every save
 	@mkdir -p $$(dirname $(TAILWIND_OUTPUT))
 	$(TAILWIND_BIN) -c tailwind.config.js -i $(TAILWIND_INPUT) -o $(TAILWIND_OUTPUT) --watch
 $(MDBOOK_BIN):
 	@mkdir -p $(BIN_DIR)
 	@echo "==> downloading mdbook $(MDBOOK_VERSION) ($(MDBOOK_TRIPLE))"
 	curl -fsSL "$(MDBOOK_URL)" | tar -xz -C $(BIN_DIR) mdbook
 	@chmod +x $@
 docs: $(MDBOOK_BIN) ## Build the docs/book/ mdBook site into docs/book/book/
 	$(MDBOOK_BIN) build $(DOCS_BOOK_DIR)
 docs-watch: $(MDBOOK_BIN) ## Serve the docs site at http://127.0.0.1:3000 with live reload
 	$(MDBOOK_BIN) serve $(DOCS_BOOK_DIR) -n 127.0.0.1 -p 3000
 agent: ## Build the agent binary
 	@mkdir -p $(BIN_DIR)
 	CGO_ENABLED=0 go build $(GOFLAGS) -ldflags "$(LDFLAGS)" -o $(AGENT_BIN) ./cmd/agent
@@ -113,7 +77,7 @@ tidy: ## go mod tidy
 	go mod tidy
 clean: ## Remove build artifacts
-	rm -rf $(BIN_DIR) coverage.out coverage.html $(TAILWIND_OUTPUT) $(DOCS_BOOK_OUT)
+	rm -rf $(BIN_DIR) coverage.out coverage.html $(TAILWIND_OUTPUT)
 run-server: server ## Build and run the server
 	$(SERVER_BIN)
@@ -128,48 +92,6 @@ docker: ## Build the server Docker image
 	  --build-arg DATE=$(DATE) \
 	  -t $(DOCKER_IMAGE):$(DOCKER_TAG) .
 # smoke-restart: rebuild the server (via the `server` prerequisite) and
 # relaunch it with systemd-run as the transient user unit $(SMOKE_UNIT),
 # with stdout/stderr appended to $(SMOKE_LOG_FILE). Any previous unit is
 # stopped and its failed state cleared first; both steps are best-effort
 # (errors are ignored). The binary path is anchored on $(CURDIR), which
 # make itself sets to its working directory (including under `make -C`),
 # rather than $(PWD), which is only inherited from the environment and
 # can be stale or unset. Afterwards /api/version is probed up to 5 times,
 # 1 s apart; if it never answers, the unit status is printed and the
 # target fails.
 smoke-restart: server ## (Re)start the smoke server as a transient user-systemd unit
 	@systemctl --user reset-failed $(SMOKE_UNIT) >/dev/null 2>&1 || true
 	@systemctl --user stop $(SMOKE_UNIT) >/dev/null 2>&1 || true
 	@echo "==> launching $(SMOKE_UNIT)"
 	systemd-run --user --unit=$(SMOKE_UNIT) \
 	  --setenv=RM_LISTEN=$(SMOKE_LISTEN) \
 	  --setenv=RM_DATA_DIR=$(SMOKE_DATA_DIR) \
 	  --setenv=RM_BASE_URL=$(SMOKE_BASE_URL) \
 	  --setenv=RM_SECRET_KEY_FILE=$(SMOKE_DATA_DIR)/secret.key \
 	  --setenv=RM_COOKIE_SECURE=false \
 	  --property=StandardOutput=append:$(SMOKE_LOG_FILE) \
 	  --property=StandardError=append:$(SMOKE_LOG_FILE) \
 	  --property=Restart=on-failure \
 	  $(CURDIR)/$(SERVER_BIN)
 	@for i in 1 2 3 4 5; do \
 	  curl -fsS -o /dev/null $(SMOKE_BASE_URL)/api/version 2>/dev/null && \
 	    { echo "==> smoke server up: $$(curl -s $(SMOKE_BASE_URL)/api/version)"; exit 0; }; \
 	  sleep 1; \
 	done; \
 	echo "!! smoke server did not respond on $(SMOKE_BASE_URL) — check $(SMOKE_LOG_FILE)" >&2; \
 	systemctl --user status --no-pager $(SMOKE_UNIT) || true; \
 	exit 1
 smoke-stop: ## Stop the smoke server
 	systemctl --user stop $(SMOKE_UNIT) || true
 	@systemctl --user reset-failed $(SMOKE_UNIT) >/dev/null 2>&1 || true
 smoke-status: ## Show status of the smoke server
 	@systemctl --user status --no-pager $(SMOKE_UNIT) 2>&1 | head -20 || true
 smoke-logs: ## Tail the smoke server log
 	tail -50 $(SMOKE_LOG_FILE)
 smoke-deploy: build smoke-restart ## Rebuild + restage agent into smoke + restart server (full per-CLAUDE.md cycle)
 	@echo "==> restaging agent + install assets into $(SMOKE_DATA_DIR)"
 	cp $(AGENT_BIN) $(SMOKE_DATA_DIR)/agent-binaries/restic-manager-agent-linux-amd64
 	cp deploy/install/install.sh $(SMOKE_DATA_DIR)/install/install.sh
 	cp deploy/install/install.ps1 $(SMOKE_DATA_DIR)/install/install.ps1
 	cp deploy/install/restic-manager-agent.service $(SMOKE_DATA_DIR)/install/restic-manager-agent.service
 	@echo "==> NOTE: this dev box's installed agent at /usr/local/bin/restic-manager-agent is NOT updated by this target."
 	@echo "    Run the agent restage block in CLAUDE.md if your change touches agent code or the unit file."
 release: ## Cross-compile for all supported platforms
 	@mkdir -p $(BIN_DIR)
 	@for target in linux/amd64 linux/arm64 windows/amd64; do                          \
@@ -1,62 +1,36 @@
 # restic-manager
 Self-hosted, browser-based, single-pane-of-glass for managing
-[restic](https://restic.net) backups across a fleet of Linux and
+[restic](https://restic.net) backups across a fleet of Linux and Windows
-Windows endpoints.
+endpoints.
-> **Status:** pre-1.0, feature-complete for the original use
+> Status: pre-alpha. Phase 0 (project bootstrap) complete; Phase 1 (MVP) in
-> case. Phases 0–4 + 6 are landed (MVP, scheduling, restore,
+> progress. See [`spec.md`](./spec.md) for the design and
-> RBAC + OIDC, observability); Phase 5 (OSS readiness — docs site,
+> [`tasks.md`](./tasks.md) for the roadmap.
 > contributor onboarding, end-to-end CI) is in flight. See
 > [`spec.md`](./spec.md) for the design and [`tasks.md`](./tasks.md)
 > for the live roadmap.
-## What it does
+## What it does (target)
- Central visibility into backup state for every endpoint.
+- Central visibility into backup state for every endpoint
- Trigger any restic operation remotely (`backup`, `forget`,
+- Trigger any restic operation remotely (`backup`, `forget`, `prune`,
-  `prune`, `check`, `unlock`, `snapshots`, `stats`, `diff`,
+  `check`, `unlock`, `snapshots`, `stats`, `diff`, `restore`)
-  `restore`).
+- Manage per-host backup schedules from the UI
- Per-host schedules with named source groups + retention.
+- Live job progress streamed back to the UI
- Live job log streamed to the browser; downloadable as
+- Restore wizard (browse snapshots, pick paths, restore to original or
-  text/NDJSON afterwards.
+  alternate host)
- Restore wizard: browse a snapshot's tree, pick paths, restore
+- Repo health surfacing (size, dedup ratio, last check, lock state)
-  in-place or to a new directory.
+- Alerting on failure or staleness
- Repo health surfacing (size, raw size, last check, lock state),
+- Cross-platform agent (Linux + Windows)
-  plus a 30/90-day repo-size trend.
+- Ransomware-resistant repo access via append-only credentials
 - Alerting over webhook, ntfy, or SMTP.
 - Cross-platform agent (Linux systemd + Windows SCM).
 - Append-only-friendly: separate admin credential for prune.
 - Optional Prometheus `/metrics` endpoint + sample Grafana
  dashboard.
 - Optional OIDC SSO (Authelia, Authentik, etc.).
-## Screenshots
+## Architecture (one-line summary)
-| Sign in | Empty dashboard | Add host |
+A small Go control-plane on the Proxmox host, lightweight Go agents on each
-|:-------:|:---------------:|:--------:|
+endpoint that hold an outbound WebSocket to the control-plane, and a
-| ![Sign in](docs/screenshots/01-login.png) | ![Dashboard, fresh](docs/screenshots/02-dashboard-empty.png) | ![Add host](docs/screenshots/03-add-host.png) |
+`restic/rest-server` on Unraid that holds the actual backup data. The
-
+control-plane never touches backup bytes.
 | Alerts | Settings | Audit log |
 |:------:|:--------:|:---------:|
 | ![Alerts](docs/screenshots/04-alerts.png) | ![Settings](docs/screenshots/05-settings.png) | ![Audit log](docs/screenshots/06-audit.png) |
 (Screenshots from a fresh smoke install with no hosts. A populated
 fleet view and the live-log + restore wizard surfaces are part of
 the docs site under [`docs/book/`](./docs/book) — `make docs` to
 render locally.)
 ## Architecture (one-line)
 A small Go control-plane in Docker, lightweight Go agents on each
 endpoint holding an outbound WebSocket to the control-plane, and
 a restic repository (rest-server, S3, B2, SFTP — anything restic
 speaks) that holds the actual backup data. **The control-plane
 never touches backup bytes.**
 Full architecture diagram and component breakdown:
-[`spec.md` §3](./spec.md), or the rendered version in the
+[`spec.md` §3](./spec.md).
 [docs site](./docs/book/src/concepts/architecture.md).
 ## Repository layout
@@ -64,63 +38,31 @@ Full architecture diagram and component breakdown:
 cmd/server/        control-plane binary
 cmd/agent/         endpoint agent binary
 internal/api       shared API types (REST + WS envelopes)
-internal/server/   HTTP, WS, UI handlers, alert engine
+internal/server/   HTTP, WS, UI handlers
 internal/agent/    service integration, restic runner, local scheduler
 internal/restic    restic CLI wrapper
 internal/store     SQLite persistence
-internal/crypto    secret encryption (AEAD)
+internal/crypto    secret encryption
 internal/auth      passwords, sessions, agent tokens
 web/               server-rendered templates + static assets
-deploy/            Dockerfile, docker-compose.yml, install scripts, Grafana dashboard
+deploy/            Dockerfile, docker-compose.yml, install scripts
-docs/              prose docs + the mdBook site under docs/book
+design/            UI wireframes (Phase 0 design pass)
 e2e/               compose stack + Playwright tests for end-to-end CI
 ```
 ## Quickstart
 The reference deployment is a single Docker container fronted by
 your existing reverse proxy. See the [installation guide](docs/book/src/getting-started/install.md)
 for the full path; the very short version:
 ```sh
 export RM_VERSION=v0.9.0    # pin a real tag
 export RM_BASE_URL=https://restic.example.com
 export RM_TRUSTED_PROXY=10.0.0.0/8
 docker compose -f deploy/docker-compose.yml up -d
 ```
 The server prints a one-time bootstrap token to the log on first
 start. POST it to `/api/bootstrap` (or open `/bootstrap` in a
 browser) to create the admin user.
 ## Local development
-Requires Go 1.25+. The floor is set by `modernc.org/sqlite` v1.50.
+Requires Go 1.25+ (built and tested on 1.26). The floor is set by
 `modernc.org/sqlite` v1.50.
 ```sh
 make build           # builds cmd/server and cmd/agent into ./bin
 make test            # runs go test ./...
 make lint            # runs golangci-lint
-make smoke-restart   # systemd --user smoke server (see CLAUDE.md)
+make run-server      # runs the server (dev defaults)
 make docs            # renders the mdBook site to docs/book/book/
 ```
 End-to-end test harness against a Docker Compose stack with a
 sibling Linux agent: see [`docs/e2e.md`](docs/e2e.md). Runs in CI
 on every PR.
 ## Documentation
 - **Concepts and operator guides**: [docs site](docs/book/src/intro.md),
  rendered with `make docs`.
 - **Reverse-proxy setup**: [docs/reverse-proxy.md](docs/reverse-proxy.md).
 - **Prometheus + Grafana**: [docs/prometheus.md](docs/prometheus.md).
 - **End-to-end test harness**: [docs/e2e.md](docs/e2e.md).
 - **Security policy**: [SECURITY.md](SECURITY.md).
 - **Contributing**: [CONTRIBUTING.md](CONTRIBUTING.md).
 ## License
-[PolyForm Noncommercial 1.0.0](./LICENSE). Free for personal,
+PolyForm Noncommercial 1.0.0 — see [`LICENSE`](./LICENSE). Free for personal,
-hobby, research, educational, governmental, and other noncommercial
+hobby, research, educational, governmental, and other noncommercial use.
-use. Commercial use requires a separate license.
+Commercial use requires a separate license.
@@ -1,137 +0,0 @@
 # Security policy
 restic-manager handles credentials that grant access to backup
 repositories — losing them means an attacker can read or destroy a
 fleet's backups. We take security reports seriously even at this
 project's small scale.
 ## Supported versions
 Pre-1.0, only the latest tagged release on `main` is supported.
 Backporting fixes to older tags is not currently offered.
 | Version            | Supported      |
 |--------------------|----------------|
 | `main` HEAD        | Yes            |
 | Latest released tag| Yes            |
 | Anything older     | No             |
 ## Reporting a vulnerability
 **Please don't open a public issue for security problems.**
 Instead, use one of these private channels:
 1. **Gitea private message** to the repository owner. The
   instance is at <https://gitea.dcglab.co.uk> and the owner's
   profile (`steve`) has direct-message contact set up.
 2. **Email** to the address on the maintainer's Gitea profile.
   Use a subject like `[SECURITY] restic-manager: <one-line summary>`
   so it doesn't get lost. PGP optional — if you want to encrypt,
   ask for a key first.
 If you don't get an acknowledgement within **3 working days**,
 please escalate through the other channel — solo maintainers do
 miss things, and the goal here is to fix the problem, not to
 preserve protocol.
 ### What to include
 - A description of the issue and the impact (what does an attacker
  gain? confidentiality, integrity, availability?).
 - Affected component (server, agent, install script, docs).
 - Affected version (`restic-manager-server --version`).
 - Reproduction steps if you have them. A working PoC is welcome
  but not required — a credible threat model is enough.
 - Whether you intend to publish a writeup, and any timing
  preferences.
 ### What we'll do
 1. Acknowledge receipt within 3 working days.
 2. Confirm or refute the issue, and agree a rough severity (CVSS
   or just "this is bad / this isn't"). Asking clarifying
   questions is normal at this stage — please don't read it as
   foot-dragging.
 3. Develop a fix on a private branch, test it, and prepare a
   release.
 4. Coordinate disclosure timing with you. The default is **30
   days from confirmed report to public disclosure**, with a
   patched release published before the disclosure date. Faster
   if a workable PoC is already circulating; slower only by
   mutual agreement.
 5. Credit the reporter in the release notes (or omit the credit
   if you'd rather stay anonymous — your choice).
 ## Scope
 In scope:
 - The server binary (`cmd/server`) and any HTTP, WebSocket, or CLI
  surface it exposes.
 - The agent binary (`cmd/agent`) and the way it consumes commands
  from the server.
 - The install scripts (`deploy/install/install.sh`, `install.ps1`)
  and the systemd unit shipped with them.
 - The docker-compose reference deployment and the docker image we
  publish.
 - Any cryptographic primitive choice or implementation detail
  (AEAD, token hashing, session handling, OIDC handshake).
 - Documentation that, if followed, leads operators into an
  insecure configuration.
 Out of scope (not because they aren't real problems, just not ones
 this report channel can act on):
 - Vulnerabilities in restic itself — report those upstream at
  <https://github.com/restic/restic>.
 - Vulnerabilities in third-party dependencies that haven't yet been
  patched upstream — report upstream first.
 - Issues that require pre-authenticated admin access on the control
  plane (admins can already do everything; that's not a privilege
  escalation, that's the design).
 - DoS via resource exhaustion on a deployment without the
  recommended reverse proxy / rate limiting in front (see
  `docs/reverse-proxy.md`).
 - Social-engineering scenarios that don't have a technical hook
  into the project's own surfaces.
 ## Threat model summary
 For context (longer version in [`spec.md`](./spec.md) §11):
 - The server is **HTTP-only**; TLS termination, ACME, HSTS, and
  edge rate-limiting are the reverse proxy's job.
 - Credentials are encrypted at rest with an AEAD key loaded from
  `RM_SECRET_KEY_FILE`. The same key encrypts agent secrets that
  travel to the agent over the WS channel.
 - Agents authenticate with bearer tokens issued at enrolment and
  hashed at rest. Compromise of the server DB does **not** leak
  bearer tokens in plaintext, but does leak the hashes (which is
  enough to log in *as* the agent until the operator revokes —
  see [NS-01 / NS-02](./tasks.md) for the revoke + regenerate
  flows).
 - The control plane intentionally **never touches backup bytes** —
  the agent runs `restic` directly against the repo. A
  compromised control plane can dispatch new jobs but cannot
  exfiltrate snapshot contents in-band.
 - Append-only credentials are first-class. Forget/prune jobs use a
  separate, admin-marked credential that the server only pushes
  for the duration of a maintenance dispatch.
 ## Hardening checklist for operators
 - Run behind a TLS-terminating reverse proxy (Caddy/nginx/Traefik).
 - Set `RM_TRUSTED_PROXY` to the proxy's CIDR so request IPs aren't
  spoofable.
 - Back up `RM_SECRET_KEY_FILE` separately from the database.
  Without it the encrypted creds are unrecoverable.
 - Use append-only credentials for the everyday backup path; only
  the optional admin credential should have write/forget/prune
  power.
 - Disable users (don't delete) when staff change roles — bearer
  tokens stay valid until rotated.
 - Watch the alert and audit-log views during enrolment of new
  hosts.
 Thanks for helping keep restic-manager users safe.
@@ -1,8 +0,0 @@
 # The ask!
 I have numerous servers deployed out in a lab, mainly Linux but some Windows
 All have restic installed on them
 I need to build a browser-based management service that allows me to have a central single pane of glass to monitor and manage all the endpoints
 All endpoints will be enabled for SSH (unless other methods are better?)
 Plan out how we would go about this please?
@@ -148,7 +148,6 @@ func run() error {
 		resticBin:                 resticBin,
 		resticVer:                 snap.ResticVersion,
 		resticSupportsNoOwnership: resticSupportsNoOwnership,
 		serverURL:                 cfg.ServerURL,
 		secrets:                   sec,
 		scheduler:                 scheduler.New(),
 	}
@@ -215,7 +214,6 @@ type dispatcher struct {
 	resticBin                 string
 	resticVer                 string // e.g. "0.17.1"; empty if restic isn't installed yet
 	resticSupportsNoOwnership bool   // captured at startup from `restic restore --help`
 	serverURL                 string // base URL of the server (used by the self-update fetch)
 	secrets                   *secrets.Store
 	scheduler                 *scheduler.Scheduler
@@ -397,12 +395,10 @@ func (d *dispatcher) handle(ctx context.Context, env api.Envelope, tx wsclient.S
 				"up_kbps", up, "down_kbps", down)
 		}
-	case api.MsgCommandUpdate:
+	case api.MsgAgentUpdateAvail:
-		var p api.CommandUpdatePayload
+		var p api.AgentUpdateAvailablePayload
-		if err := env.UnmarshalPayload(&p); err != nil {
+		_ = env.UnmarshalPayload(&p)
-			return fmt.Errorf("command.update: %w", err)
+		slog.Info("ws agent: update available", "version", p.LatestVersion, "url", p.PackageURL)
 		}
 		go d.runUpdate(ctx, p, tx)
 	default:
 		slog.Debug("ws agent: ignored message", "type", env.Type)
@@ -1,65 +0,0 @@
 package main
 import (
 	"context"
 	"fmt"
 	"log/slog"
 	"time"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/agent/updater"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/agent/wsclient"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
 )
 // runUpdate carries out a command.update dispatched by the server.
 // Progress lines are mirrored onto log.stream so the live job page
 // records what happened before any restart, and then the platform
 // updater is invoked. The updater is expected to call os.Exit on
 // Linux; on Windows it starts a detached helper and returns, after
 // which the agent exits.
 //
 // The agent never reports success for this job. The server settles the
 // terminal state once the agent re-hellos with the matching version,
 // because that is not something the agent can assert about itself. A
 // `job.finished` is emitted from here only when the update fails
 // before a restart is attempted.
 func (d *dispatcher) runUpdate(ctx context.Context, p api.CommandUpdatePayload, tx wsclient.Sender) {
 	// report writes one progress line to the local log and forwards
 	// the same text on the job's log stream. Forwarding is best-effort:
 	// a marshal or send failure is dropped.
 	report := func(format string, args ...any) {
 		msg := fmt.Sprintf(format, args...)
 		slog.Info("ws agent: update: " + msg)
 		streamEnv, marshalErr := api.Marshal(api.MsgLogStream, "", api.LogStreamLine{
 			JobID:   p.JobID,
 			TS:      time.Now().UTC(),
 			Stream:  api.LogStdout,
 			Payload: msg,
 		})
 		if marshalErr != nil {
 			return
 		}
 		_ = tx.Send(streamEnv)
 	}
 	// Announce the job as started before doing any work (best-effort).
 	if startEnv, marshalErr := api.Marshal(api.MsgJobStarted, "", api.JobStartedPayload{
 		JobID:     p.JobID,
 		Kind:      api.JobUpdate,
 		StartedAt: time.Now().UTC(),
 	}); marshalErr == nil {
 		_ = tx.Send(startEnv)
 	}
 	report("fetching new binary from %s", d.serverURL)
 	updateErr := updater.Update(ctx, d.serverURL)
 	if updateErr == nil {
 		// Unreachable on Linux (Update calls os.Exit). On Windows
 		// control returns here while the detached helper does the
 		// swap-and-restart; the agent then exits cleanly so SCM
 		// hands off.
 		return
 	}
 	// Failure path: this is the only place the agent itself sends a
 	// terminal job state for an update.
 	report("update failed: %v", updateErr)
 	failEnv, marshalErr := api.Marshal(api.MsgJobFinished, "", api.JobFinishedPayload{
 		JobID:      p.JobID,
 		Status:     api.JobFailed,
 		FinishedAt: time.Now().UTC(),
 		Error:      updateErr.Error(),
 	})
 	if marshalErr != nil {
 		return
 	}
 	_ = tx.Send(failEnv)
 }
@@ -17,7 +17,6 @@ import (
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/crypto"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/notification"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/config"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/fleetupdate"
 	rmhttp "gitea.dcglab.co.uk/steve/restic-manager/internal/server/http"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/maintenance"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/oidc"
@@ -92,7 +91,6 @@ func run() error {
 	notifHub := notification.NewHub(st, aead, cfg.BaseURL)
 	alertEngine := alert.NewEngine(st, notifHub)
 	updateWatcher := ws.NewUpdateWatcher(st, alertEngine, jobHub)
 	renderer, err := ui.New()
 	if err != nil {
@@ -118,7 +116,6 @@ func run() error {
 		JobHub:          jobHub,
 		AlertEngine:     alertEngine,
 		NotificationHub: notifHub,
 		UpdateWatcher:   updateWatcher,
 		UI:              renderer,
 		Version:         version,
 		OIDC:            oidcClient,
@@ -150,17 +147,10 @@ func run() error {
 	srv := rmhttp.New(deps)
 	// Fleet-update worker — built after the HTTP server because the
 	// dispatcher delegates back into srv.DispatchHostUpdate.
 	fleetWorker := fleetupdate.NewWorker(st, hub,
 		&serverDispatcher{srv: srv}, alertEngine)
 	srv.SetFleetWorker(fleetWorker)
 	ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
 	defer stop()
 	go alertEngine.Run(ctx)
 	go updateWatcher.Run(ctx)
 	errCh := make(chan error, 1)
 	go func() {
@@ -253,12 +243,3 @@ func run() error {
 	}
 	return nil
 }
 // serverDispatcher adapts the http.Server's DispatchHostUpdate method
 // to the fleetupdate.Dispatcher interface. Lives in main so the
 // http and fleetupdate packages don't need to know about each other.
 type serverDispatcher struct{ srv *rmhttp.Server }
 // DispatchUpdate passes ctx, hostID and actorUserID straight through to
 // srv.DispatchHostUpdate and returns its three results unchanged.
 func (d *serverDispatcher) DispatchUpdate(ctx context.Context, hostID, actorUserID string) (string, string, error) {
 	return d.srv.DispatchHostUpdate(ctx, hostID, actorUserID)
 }
@@ -52,12 +52,7 @@ ProtectSystem=full
 # whenever a new SecretsKey is minted, so we need a targeted
 # write-exemption for that dir. No exemption for the rest of /etc:
 # the agent has no business editing /etc/passwd, /etc/sudoers, etc.
-#
+ReadWritePaths=/etc/restic-manager
 # /usr/local/bin is writable so the self-update flow (P6-01) can
 # atomic-rename a fresh binary over the running one. Permitting the
 # whole directory (rather than just the binary path) is required
 # because os.Rename takes a write lock on the parent dir.
 ReadWritePaths=/etc/restic-manager /usr/local/bin
 ProtectHostname=true
 ProtectKernelTunables=true
 ProtectKernelModules=true
@@ -1,19 +0,0 @@
 [book]
 title = "restic-manager"
 description = "Self-hosted control plane for restic backups across a fleet of Linux and Windows endpoints."
 authors = ["Steve Cliff"]
 language = "en-GB"
 multilingual = false
 src = "src"
 [output.html]
 default-theme = "ayu"
 preferred-dark-theme = "ayu"
 git-repository-url = "https://gitea.dcglab.co.uk/steve/restic-manager"
 git-repository-icon = "fa-code-fork"
 edit-url-template = "https://gitea.dcglab.co.uk/steve/restic-manager/_edit/main/docs/book/{path}"
 no-section-label = false
 [output.html.fold]
 enable = true
 level = 2
@@ -1,40 +0,0 @@
 # Summary
 [Introduction](./intro.md)
 # Getting started
 - [Installing the server](./getting-started/install.md)
 - [Enrolling your first host](./getting-started/enrolling-hosts.md)
 - [Running behind a reverse proxy](./getting-started/reverse-proxy.md)
 # Concepts
 - [Architecture](./concepts/architecture.md)
 - [Credentials and how they flow](./concepts/credentials.md)
 - [Schedules and source groups](./concepts/schedules-and-source-groups.md)
 - [Repo maintenance](./concepts/repo-maintenance.md)
 # Operations
 - [Backups and restores](./operations/backups-and-restores.md)
 - [Alerts and notifications](./operations/alerts.md)
 - [Observability with Prometheus](./operations/observability.md)
 - [Updating agents](./operations/updates.md)
 # Security
 - [Threat model](./security/threat-model.md)
 - [Hardening checklist](./security/hardening.md)
 - [Reporting vulnerabilities](./security/disclosure.md)
 # Reference
 - [Environment variables](./reference/env-vars.md)
 - [HTTP endpoints](./reference/http-endpoints.md)
 ---
 [Contributing](./contributing.md)
 [Roadmap](./roadmap.md)
 [License](./license.md)
@@ -1,121 +0,0 @@
 # Architecture
 ## Components
 ```
 ┌────────────────────────────────────────────────────────────┐
 │  Server (control plane, single process)                    │
 │   * chi-based HTTP API + HTMX server-rendered UI           │
 │   * WebSocket hub for agent fan-out + browser fan-out      │
 │   * SQLite store (modernc.org/sqlite, pure Go)             │
 │   * AEAD encryption helpers                                │
 │   * Alert engine + notification hub                        │
 └────────────┬───────────────────────────────────┬───────────┘
             │ outbound WS only                   │ HTTP(S)
             │                                    │
 ┌────────────▼─────────────┐         ┌────────────▼─────────────┐
 │  Agent (per host)        │         │  Browser (operator)      │
 │   * coder/websocket      │         │   * htmx + a tiny bit    │
 │   * cron for schedules   │         │     of vanilla JS for    │
 │   * restic wrapper       │         │     live job updates     │
 │   * sysinfo collector    │         └──────────────────────────┘
 └────────────┬─────────────┘
             │ subprocess: restic ...
             │
 ┌────────────▼─────────────────────────────────────────────────┐
 │  restic repository (rest-server, S3, B2, SFTP, local …)      │
 │  Backup data flows directly here. Server never touches it.   │
 └──────────────────────────────────────────────────────────────┘
 ```
 ## Why outbound-only WebSockets?
 The agent dials the server on `/ws/agent` with a bearer token. The
 server doesn't initiate connections to the agent. Three reasons:
 1. **Firewall friendliness.** Nothing on the endpoint needs an
   inbound port; this works behind the typical "branch office NAT"
   without router config.
 2. **Single auth point.** The bearer token is the only credential
   that crosses the boundary; the agent never accepts an
   incoming socket.
 3. **Reconnect semantics are simpler.** When the connection drops
   (NAT timeout, server restart, transient network glitch) the
   agent backs off and re-dials; the server marks the host
   offline after 90s and lets the alert engine raise a stale-host
   alert.
 ## Why SQLite?
 SQLite fits because high availability is an explicit non-goal of this project. A small
 control plane managing twelve endpoints does not need replication
 or a separate database tier. SQLite gives us:
 - A single file to back up (plus the secret key).
 - Hand-rolled migrations under `internal/store/migrations/` —
  no migration framework lock-in.
 - `WAL` mode plus per-connection foreign-key enforcement.
 The migration files define the entire schema; there's no ORM or
 query-builder layer between Go code and SQL.
 ## Why the agent runs `restic` itself, not via the server
 The control plane never holds backup bytes in flight. That's
 deliberate:
 - A compromised control plane cannot exfiltrate snapshot
  contents in-band — at worst it can dispatch new backup or
  forget jobs (audit-logged) but the data path is between the
  agent and the repository.
 - The same agent process can target whichever transport restic
  natively supports (rest-server, S3, B2, SFTP, local), no
  separate mux on the server side.
 ## Job lifecycle
 ```
            ┌──────────────────────┐
 operator →  │ POST /hosts/{id}/    │
            │       run-backup     │
            └──────────┬───────────┘
                       │   1. INSERT INTO jobs (status='queued')
                       │   2. dispatch command.run over WS
                       ▼
            ┌──────────────────────┐
            │ Agent dispatches     │
            │ restic subprocess    │
            └──────────┬───────────┘
                       │
                       │   3. job.started   ───▶ store.MarkJobStarted
                       │   4. job.progress  ───▶ JobHub broadcast (live UI)
                       │   5. log.stream    ───▶ append to job_logs
                       │   6. job.finished  ───▶ store.MarkJobFinished
                       │                          + alert engine eval
                       │                          + (P6) metrics histogram
                       ▼
                  terminal: succeeded | failed | cancelled
 ```
 Operators see live updates because the browser subscribes to
 `/api/jobs/{id}/stream`, and the WS handler broadcasts each
 agent-emitted envelope to all live subscribers in addition to
 persisting it.
 ## What scheduling looks like
 - The agent runs a local `robfig/cron/v3` instance.
 - The server pushes the desired schedule set to the agent on
  hello + after every CRUD change.
 - When the agent's cron fires, it sends `schedule.fire` to the
  server. The server creates a job row, sends `command.run` back,
  and the agent dispatches a normal backup.
 - If the WS drops between fire and run, the server queues the
  schedule firing into `pending_runs` and drains on agent
  reconnect — no missed scheduled backups due to network blips.
 For everything that isn't a backup (forget, prune, check), the
 server runs a 60-second maintenance ticker against
 `host_repo_maintenance` rows and dispatches the relevant command
 when a cadence is due. The agent's local cron only handles
 backups.
@@ -1,98 +0,0 @@
 # Credentials and how they flow
 restic-manager handles three credential surfaces:
 1. **Operator credentials** — the username + password (or OIDC
   identity) that logs into the UI.
 2. **Agent bearer tokens** — issued at enrolment, used by the
   agent to authenticate its WebSocket to the server.
 3. **Repo credentials** — the rest-server / S3 / B2 / SFTP
   credentials the agent passes to `restic` itself.
 Each has a different threat model and storage strategy.
 ## Operator credentials
 - Local users are stored in `users` with a bcrypt password hash.
 - Sessions are random tokens minted at login, stored hashed in
  the `sessions` table, expired after 24h. Cookie is HttpOnly,
  SameSite=Lax, and Secure (when `RM_COOKIE_SECURE=true`,
  default).
 - OIDC users carry `auth_source='oidc'` and an `oidc_subject`
  pinning their IdP identity. Local password login is rejected
  for OIDC users.
 - Disabling a user soft-deletes them via `disabled_at` —
  pre-existing sessions are invalidated on the next request.
 ## Agent bearer tokens
 - Minted at enrolment, hashed at rest with `auth.HashToken`.
 - The plaintext token only exists in memory at enrolment time
  and on the agent's filesystem (`/etc/restic-manager/agent.yaml`,
  mode `0600`, owned by the service user).
 - Compromise of the server DB leaks the hashes, which is enough
  to *log in as that agent* until you revoke. Compromise of the
  agent host leaks the plaintext (via the config file) — same
  end result.
 - Rotation: re-enrol the host. Today there's no in-place rotate;
  the operator deletes the host (which cascades, including
  revoking the bearer hash) and re-runs the install command.
 ## Repo credentials
 This is the credential that ultimately matters for backup
 integrity. restic-manager keeps two slots per host:
 - **The everyday credential** (`host_credentials.kind = ''`).
  Append-only-friendly: this is the one your backup schedule
  uses. It can write but not delete or forget.
 - **The admin credential** (`host_credentials.kind = 'admin'`).
  Has full delete rights. Only pushed to the agent transiently
  while a `prune` or `forget` job is dispatching, and discarded
  by the agent after the job ends.
 ### Encryption flow
 1. Operator types the credential into the UI or the install form.
 2. Server AEAD-encrypts the cred (`crypto.AEAD.Encrypt`) using the
   key in `RM_SECRET_KEY_FILE`. The plaintext is dropped from
   memory.
 3. Encrypted blob is stored in `host_credentials.cred_blob`.
 4. When the agent connects, the server decrypts the blob and
   sends the **plaintext** down the WebSocket inside a
   `config.update` envelope.
 5. The agent stores the plaintext in its in-memory secrets store
   for the lifetime of the process; it's reloaded fresh on every
   server-side push.
 6. When a job runs, the agent merges the credential into the
   restic environment (`restic.Env.RepoURL` stays bare; the
   `user:pass@…` form is built only inside `envSlice()` at the
   moment of `exec.Command`).
 The merged form is **never logged**. The slog package's structured
 output gets `restic.RedactURL()` for any URL it has cause to
 mention.
 ### Why push plaintext over the wire?
 The transport itself is the trust boundary: the WebSocket runs
 inside the same TLS-terminated reverse-proxy connection your
 browser uses, and the agent has already authenticated with its
 bearer token. Re-encrypting the payload on top of that would just
 move the key-management problem somewhere else.
 If your reverse proxy isn't terminating TLS, the deployment is
 already broken — see [Hardening](../security/hardening.md).
 ## Setup tokens (admin-driven)
 When an admin creates a new user, the server mints a one-time
 setup link valid for 1 hour. The hash is stored; the raw token
 is shown to the admin once. The user opens the link, sets a
 password, and is dropped into a session. Expired tokens are
 swept on the alert engine's 60s tick.
 Same pattern for enrolment tokens: the raw token only exists in
 memory at mint time, and the install snippet is the operator's
 only chance to capture it. If you lose it, regenerate via the
 **Add host** page (NS-02).
@@ -1,85 +0,0 @@
 # Repo maintenance
 Backups go in; without maintenance, repos grow forever and
 eventually fall over. restic-manager runs three maintenance
 operations on a per-host cadence:
 | Command  | What it does                                                | Default cadence |
 |----------|-------------------------------------------------------------|-----------------|
 | `forget` | Marks snapshots eligible for removal per the retention policy attached to each source group. Cheap; runs append-only. | Daily after the last backup of the day |
 | `prune`  | Reclaims space from the repo. Requires the **admin** credential (write+delete). | Weekly, off-peak |
 | `check`  | Verifies repo integrity. Sub-options surface lock state. | Weekly, with `--read-data-subset N%` to sample pack files |
 A per-host record, `host_repo_maintenance`, holds the
 cron expressions and last-fire anchors. The maintenance ticker on
 the server runs every 60s, finds hosts whose next-fire is due,
 and dispatches the right command. The agent's local cron is
 **only** for backups.
 ## Why server-side and not agent-side?
 The agent's cron knows about backups because backups are
 per-source-group. Maintenance is per-repo, not per-source-group,
 so doing it server-side keeps the per-host wiring simple:
 - One ticker, not N agent crons to keep in sync.
 - Cancelling a maintenance dispatch is just "don't dispatch the
  next one" — no agent-side state to clean up.
 - Skipping offline hosts is trivial (no queue; only scheduled
  *backups* queue into `pending_runs`).
 ## Forget and the multi-group payload
 A single `forget` job can target several source groups at once.
 The wire envelope (`ForgetGroups`) carries one entry per group,
 each with its retention policy. The agent runs N
 `restic forget --tag <name> --keep-...` invocations in sequence,
 streams their output, and reports a single terminal status.
 ## Prune and the admin credential
 Prune mutates the repo. The everyday append-only credential
 **cannot** prune — that's the whole point of append-only.
 restic-manager keeps a second slot per host (`kind = 'admin'`)
 for the credential that can.
 When a prune is dispatched (cadence-driven or operator-driven):
 1. Server pushes the admin credential to the agent in a fresh
   `config.update`.
 2. Agent runs `restic prune` with the merged credential.
 3. Job finishes; agent discards the admin credential from its
   in-memory secrets store.
 The server never logs the merged URL (see
 [Credentials](./credentials.md)).
 ## Check and lock state
 `restic check` warns about stale locks when it finds them. The
 agent ships every check's output back as a `repo.stats` envelope
 and a stream of log lines; if a stale lock is detected, the
 **Repo** page surfaces a banner with an **Unlock** button. The
 operator-only `unlock` command runs `restic unlock` and clears
 the banner.
 `unlock` has no cadence — it's a manual action, never automatic.
 Auto-unlocking would mask the cause (probably a previously
 crashed long-running operation) and risk corrupting an
 operation the operator has merely lost track of.
 ## Repo stats
 After every backup, check, prune, and unlock, the agent runs
 `restic stats --json --mode raw-data` and ships the result as a
 `repo.stats` envelope. The server stores this in
 `host_repo_stats` (latest only) and `host_repo_stats_history`
 (one row per host per day, last-write-wins per column — a
 prune-only patch never nulls a backup-time size).
 The host detail page surfaces:
 - Total size + raw size in the vitals strip.
 - Last-check timestamp + colour-coded status.
 - Last-prune timestamp.
 - 30/90-day repo size trend chart.
@@ -1,105 +0,0 @@
 # Schedules and source groups
 Two related but separable ideas:
 - A **source group** is a named bundle of "what to back up":
  include paths, exclude patterns, retention policy, retry
  configuration, optional pre/post hooks. The group's name is
  used as the restic snapshot tag, so retention can target it
  with `restic forget --tag <name>`.
 - A **schedule** is a cron expression that, when it fires,
  triggers a backup of one or more source groups on a host.
 Decoupling them means you can have one schedule covering several
 groups (e.g. `0 1 * * *` running both `system` and `data`), and
 each group has its own retention without duplicating policy
 across schedules.
 ## Source group anatomy
 ```yaml
 name: data
 includes:
  - /var/lib/postgresql
  - /home
 excludes:
  - /home/*/.cache
  - /home/*/Downloads
 retention:
  keep_last: 7
  keep_daily: 14
  keep_weekly: 4
  keep_monthly: 6
 retry_max: 3
 retry_backoff_seconds: 600
 pre_hook: |
  pg_dump -U postgres -F c -f /var/lib/postgresql/dumps/all.dump
 post_hook: |
  rm -f /var/lib/postgresql/dumps/all.dump
 ```
 ### Conflict detection
 If your retention policy says `keep_hourly: 24` but no schedule
 points at this group sub-daily, the UI surfaces a
 **conflict-dimension banner** ("`hourly` won't be honoured —
 no schedule fires more often than once a day"). The flag is
 stored on the source group (`conflict_dimension`) and refreshed
 whenever a schedule or group changes.
 ### Hooks
 `pre_hook` and `post_hook` run on the agent host inside
 `/bin/sh -c` (`cmd.exe /C` on Windows). Output is streamed back
 to the live job log as `hook(<phase>): …` lines.
 - A non-zero `pre_hook` exit aborts the backup.
 - `post_hook` always runs, with `RM_JOB_STATUS=succeeded|failed`
  in the environment. Use this for cleanup that must happen
  whether the backup worked or not.
 - Hooks only run for `kind=backup` jobs. They do not run for
  `forget`, `prune`, `check`, etc.
 - Hook bodies are AEAD-encrypted at rest at the HTTP layer; the agent receives
  plaintext over the WS channel.
 A "host default" pair of hooks lives on the host itself; a
 source group's own hooks override them when set.
 ## Schedule anatomy
 ```yaml
 cron: "0 2 * * *"
 enabled: true
 source_group_ids:
  - <gid for "data">
  - <gid for "system">
 ```
 Slim by design: a schedule says **when** and **which groups**.
 Everything else (paths, retention, hooks) lives on the groups.
 The agent's local cron fires the schedule. If the WebSocket is
 down at fire time, the server queues the firing into
 `pending_runs` and drains it on the next agent reconnect — a
 short network blip won't lose the backup.
 ### Last / next run
 The schedules tab shows "next" (computed by parsing the cron
 expression with `robfig/cron/v3`) and "last" (the latest
 `actor_kind=schedule` job in the `jobs` table) for every
 schedule. The dashboard host row also surfaces `next 12h ago/from
 now` when a single covering schedule is the run-now candidate.
 ## Bandwidth limits
 Two places set restic's `--limit-upload` / `--limit-download`:
 1. **Host-wide caps** on the host row (`bandwidth_up_kbps`,
   `bandwidth_down_kbps`). Pushed to the agent on hello and
   after `PUT /api/hosts/{id}/bandwidth`. Apply to every restic
   invocation on the host.
 2. **Per-job overrides** on the per-source-group Run-now form.
   Win over host caps for the lifetime of that one job.
 If neither is set, restic runs unthrottled.
@@ -1,17 +0,0 @@
 # Contributing
 Full contributor guide:
 [`CONTRIBUTING.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/CONTRIBUTING.md)
 in the repository root.
 The short version:
 - Open an issue first for non-trivial changes; the design is
  still moving and unsolicited large PRs may conflict with
  in-flight work.
 - `make lint test` must pass.
 - One logical change per commit, no `Co-Authored-By` trailers.
 - UK English in identifiers and comments; comments explain the
  **why** not the **what**.
 Code of conduct: [`CODE_OF_CONDUCT.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/CODE_OF_CONDUCT.md).
@@ -1,113 +0,0 @@
 # Enrolling your first host
 The control plane only knows about hosts you've explicitly
 enrolled. Two paths exist:
 1. **Token-based enrolment** — admin generates a token, pastes it
   into an install command on the host. The host appears immediately,
   already mapped to the desired repo.
 2. **Announce-and-approve** — the agent runs without a token,
   "announces" itself to the server, and a human in the UI accepts
   the announcement.
 Token-based is the default and what most operators want; the
 announce flow exists for the case where you can't easily paste a
 secret onto the host (auto-imaged endpoints, scripted bring-ups
 from a config repo).
 ## Token-based enrolment
 ### From the UI
 1. Click **+ Add host** on the dashboard.
 2. Fill in the hostname, the restic repo URL, and the repo
   credentials. The credentials are AEAD-encrypted at the server
   immediately; what you paste is what the agent receives.
 3. Optionally pick the initial source paths — these become the
   first source group on the host.
 4. Submit. The server mints a one-time token and shows you a
   copy-pasteable install snippet.
 ### On the host (Linux)
 ```sh
 curl -fsSL https://restic.example.com/install/install.sh | \
    sudo RM_SERVER=https://restic.example.com \
         RM_ENROL_TOKEN=<token> \
         bash
 ```
 The script:
 1. Detects architecture (`amd64` or `arm64`).
 2. Downloads the agent binary from `/agent/binary?os=…&arch=…`.
 3. Drops the systemd unit at
   `/etc/systemd/system/restic-manager-agent.service`.
 4. Runs the agent in `-enrol` mode, which posts the token and
   stores the persistent bearer it gets back.
 5. Enables and starts the unit.
 Within seconds the host should appear on the dashboard as
 **online**.
 ### On the host (Windows)
 ```pwsh
 $env:RM_SERVER  = "https://restic.example.com"
 $env:RM_ENROL_TOKEN = "<token>"
 iwr -useb $env:RM_SERVER/install/install.ps1 | iex
 ```
 Equivalent shape: registers a Windows service via the SCM
 (see P2-16 for details), runs `-enrol`, starts the service.
 ## Recovering a lost token
 Tokens are single-use and short-lived (1h). If you closed the tab
 before pasting the install command, head to the **Add host** page —
 outstanding tokens are listed there with a **Regenerate** button.
 Regenerating revokes the old token's hash and mints a fresh raw
 token while preserving the original repo credentials and initial
 paths. (NS-02 in `tasks.md` if you want the design rationale.)
 ## Announce-and-approve
 If the host can reach the server but you don't want to paste a
 secret on it, run the agent in `-announce` mode:
 ```sh
 restic-manager-agent -announce \
                     -server https://restic.example.com \
                     -hostname myhost
 ```
 The host appears in the **Pending hosts** panel on the dashboard
 with its hostname, OS, arch, and the source IP that announced it.
 Click **Accept**, fill in the repo URL + credentials, and the
 server pushes the bearer over the still-open WebSocket. No
 back-and-forth round trip.
 If you don't accept within an hour the announcement is swept.
 ## What happens on the agent
 After enrolment, the agent:
 1. Connects via WebSocket to `/ws/agent` with its bearer token.
 2. Sends a `hello` envelope with its OS, arch, agent version,
   restic version, and protocol version.
 3. Receives a `config.update` carrying its encrypted repo
   credentials and any source-group paths.
 4. Sits idle, sending a heartbeat every 30s. Operator-driven
   "Run now" actions arrive as `command.run` envelopes; scheduled
   jobs are driven by the agent's local cron.
 ## Auto-init of the repository
 The first time a backup runs, the agent invokes `restic init`
 against the repo you configured at enrolment. If the repo already
 exists (`config file already exists`) the agent treats it as a
 success and proceeds. The host's repo status (`unknown` →
 `ready` / `init_failed`) is surfaced under the vitals strip on
 the host detail page; if init fails, save fresh credentials in
 the **Repo** tab to retry.
@@ -1,92 +0,0 @@
 # Installing the server
 The reference deployment is a single Docker container fronted by
 your existing reverse proxy. The image bundles the server binary,
 the cross-compiled agent binaries, and the install scripts.
 ## Prerequisites
 - A Linux host with Docker and Docker Compose.
 - A reverse proxy in front (Caddy, nginx, Traefik) terminating
  TLS on a public hostname. The server itself is HTTP-only by
  design — see [Reverse proxy](./reverse-proxy.md) for why.
 - A persistent volume for the server's data directory.
 ## Quick start
 The reference compose file lives at
 [`deploy/docker-compose.yml`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/deploy/docker-compose.yml):
 ```yaml
 services:
  restic-manager:
    image: gitea.dcglab.co.uk/steve/restic-manager:${RM_VERSION:-latest}
    restart: unless-stopped
    environment:
      RM_LISTEN: ":8080"
      RM_DATA_DIR: "/data"
      RM_BASE_URL: "https://restic.example.com"
      # Trust your reverse proxy's CIDR so X-Forwarded-* are honoured.
      RM_TRUSTED_PROXY: "10.0.0.0/8"
    volumes:
      - rm-data:/data
    ports:
      # Bind localhost only — your reverse proxy is the public face.
      - "127.0.0.1:8080:8080"
 volumes:
  rm-data:
 ```
 Bring it up:
 ```sh
 docker compose up -d
 docker compose logs -f restic-manager
 ```
 The first run prints a one-time **bootstrap token** to the log. Use
 it within an hour or it expires; if you miss the window the
 container prints it again on next start as long as no admin user
 exists.
 ## First-run admin setup
 Open `https://restic.example.com/bootstrap` (or whatever your
 public URL is). Paste the bootstrap token, pick a username and a
 password (≥ 12 characters), and submit. You'll land in the
 dashboard logged in as the new admin.
 If you'd rather curl it, the equivalent is:
 ```sh
 curl -X POST https://restic.example.com/api/bootstrap \
     -H 'Content-Type: application/json' \
     -d '{"token":"<token-from-log>","username":"admin","password":"<≥12 chars>"}'
 ```
 ## Backing up the secret key
 Inside the data volume, `secret.key` holds the AEAD key used to
 encrypt every credential at rest. **Back it up separately from
 the database.** Without it, encrypted credentials in the database
 are unrecoverable; you'd have to re-enrol every host.
 A simple working approach: copy `secret.key` to your password
 manager or to a separately-backed-up secrets vault the day you
 install. It doesn't change.
 ## Updating the server
 ```sh
 # Pin a new version in your compose file (.env or docker-compose.yml),
 # then:
 docker compose pull
 docker compose up -d
 ```
 Migrations run automatically on startup; the server will refuse to
 start if a migration fails (better to bail than to half-migrate).
 For the agent self-update story, see
 [Updating agents](../operations/updates.md).
@@ -1,95 +0,0 @@
 # Running behind a reverse proxy
 The restic-manager server is HTTP-only by design. TLS termination,
 public hostname, ACME, HSTS, and edge-level rate limiting all
 belong to a reverse proxy you already operate outside this project.
 ## What the proxy must forward
 The server reads four headers when (and only when) the immediate
 peer matches `RM_TRUSTED_PROXY`:
 | Header                 | Value                                              | Why |
 |------------------------|----------------------------------------------------|-----|
 | `X-Forwarded-For`      | The original client IP                             | Rate-limit keys, audit log entries, OIDC redirect-URI checks. |
 | `X-Forwarded-Proto`    | `https`                                            | Used for absolute URLs (e.g. OIDC redirect URIs). |
 | `Host`                 | The public hostname clients use                    | Cookies are scoped to this; `RM_BASE_URL` must match. |
 | `Connection` / `Upgrade` | Pass through unchanged                           | `/ws/agent` and `/api/jobs/{id}/stream` are WebSockets; without `Upgrade: websocket` they fail. |
 Set `RM_TRUSTED_PROXY` to the CIDR (or comma-separated list of
 CIDRs) the proxy connects from. Anything outside that range has
 its `X-Forwarded-*` headers ignored, so a stray request that
 bypasses the proxy can't spoof the client IP.
 ## Caddy
 ```caddyfile
 restic.example.com {
    encode zstd gzip
    reverse_proxy 127.0.0.1:8080 {
        header_up X-Real-IP {remote_host}
    }
 }
 ```
 Caddy adds `X-Forwarded-For` / `X-Forwarded-Proto` automatically
 and passes WebSocket headers through by default, so this is the
 whole config.
 ## nginx
 ```nginx
 server {
    listen 443 ssl http2;
    server_name restic.example.com;
    ssl_certificate     /etc/letsencrypt/live/restic.example.com/fullchain.pem;
    ssl_certificate_key /etc/letsencrypt/live/restic.example.com/privkey.pem;
    location / {
        proxy_pass         http://127.0.0.1:8080;
        proxy_http_version 1.1;
        proxy_set_header   Host              $host;
        proxy_set_header   X-Forwarded-For   $proxy_add_x_forwarded_for;
        proxy_set_header   X-Forwarded-Proto https;
        # WebSocket upgrade
        proxy_set_header   Upgrade           $http_upgrade;
        proxy_set_header   Connection        "upgrade";
        # Long-lived agent WS — raise the read timeout to 24h for this surface.
        proxy_read_timeout 86400s;
    }
 }
 ```
 ## Traefik
 ```yaml
 http:
  routers:
    restic-manager:
      rule: "Host(`restic.example.com`)"
      entryPoints: [websecure]
      tls:
        certResolver: letsencrypt
      service: restic-manager
  services:
    restic-manager:
      loadBalancer:
        servers:
          - url: "http://restic-manager:8080"
        passHostHeader: true
 ```
 Traefik forwards WebSocket upgrades and the standard
 `X-Forwarded-*` set out of the box.
 ## Verification
 After bringing the proxy up, the audit log should show your real
 client IP for an interactive login (not the proxy's local
 address). If you see `127.0.0.1` or the proxy's container IP, your
 `RM_TRUSTED_PROXY` is wrong or `X-Forwarded-For` isn't being
 forwarded.
@@ -1,86 +0,0 @@
 # restic-manager
 restic-manager is a self-hosted, browser-based, single-pane-of-glass
 for managing [restic](https://restic.net) backups across a fleet of
 Linux and Windows endpoints. It's designed for **small fleets** —
 the original target was twelve endpoints — and **one operator**.
 ## What it does
 - Centralised view of every endpoint's last backup, repo size,
  snapshot count, and recent jobs.
 - Trigger any restic operation remotely (`backup`, `forget`, `prune`,
  `check`, `unlock`, `snapshots`, `stats`, `diff`, `restore`).
 - Per-host backup schedules with source groups (named bundles of
  paths + retention policy).
 - Live job log streamed to the browser; downloadable as text or NDJSON.
 - Restore wizard with snapshot tree browse + path selection.
 - Repo-level health surfacing (size, raw size, last-check, lock
  state) plus a 30/90-day size trend.
 - Alerting over webhook, ntfy, or SMTP.
 - Cross-platform agent (Linux + Windows).
 - Append-only-credential-friendly with a separate admin credential
  for forget/prune.
 ## What it isn't
 - **Not a SaaS.** Single-instance, single-tenant, by design.
 - **Not a replacement for restic** — it's a control plane. The agent
  shells out to a real `restic` binary.
 - **Not highly available.** SQLite, single process; if you need
  HA backups, you're shopping in the wrong aisle.
 - **Not a multi-protocol backup tool.** restic only.
 ## How it fits together
 ```
 ┌──────────────────────────────────────────────┐
 │  Server (control plane, Docker)              │
 │   - REST + WebSocket API                     │
 │   - SQLite store                             │
 │   - Embedded HTMX UI                         │
 └──────────┬─────────────────────────┬─────────┘
           │ outbound WS              │ HTTP(S)
           │                          │
 ┌──────────▼──────────┐    ┌──────────▼─────────┐
 │  Agent (per host)   │    │  Browser (operator) │
 │   - restic wrapper  │    └─────────────────────┘
 │   - cron for sched. │
 └──────────┬──────────┘
           │ restic
 ┌──────────▼──────────────────────────────────┐
 │  rest-server / S3 / SFTP / local repo       │
 │  (the actual backup data — server never     │
 │   touches it)                               │
 └─────────────────────────────────────────────┘
 ```
 The control plane is a Go binary that runs in Docker. Each endpoint
 runs a small Go agent that holds an outbound WebSocket to the
 control plane. Backup data flows directly between the agent and the
 restic repository — the control plane never sees a snapshot byte.
 ## Where to start
 - [Installing the server](./getting-started/install.md) walks
  through the Docker-based reference deployment.
 - [Enrolling your first host](./getting-started/enrolling-hosts.md)
  covers the install scripts and the announce-and-approve flow.
 - [Architecture](./concepts/architecture.md) is the right read if
  you want to know why something is the way it is before running
  the install.
 ## Project status
 Pre-1.0 but feature-complete for the original use case. Phases
 0–4 are landed (MVP, scheduling, restore, RBAC + OIDC); Phase 5
 (this docs site, contributor onboarding, end-to-end CI) is in
 flight. See [`tasks.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/tasks.md)
 for the live roadmap and [`spec.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/spec.md)
 for the canonical design doc.
 ## License
 [PolyForm Noncommercial 1.0.0](https://polyformproject.org/licenses/noncommercial/1.0.0/).
 Personal and community deployments welcome; commercial use
 requires a separate license.
@@ -1,39 +0,0 @@
 # License
 restic-manager is licensed under
 [**PolyForm Noncommercial 1.0.0**](https://polyformproject.org/licenses/noncommercial/1.0.0/).
 The full text lives at
 [`LICENSE`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/LICENSE)
 in the repository root.
 ## What this means
 - **Personal, hobbyist, educational, charitable, and similar
  noncommercial use** is fully permitted, including modification
  and redistribution.
 - **Commercial use is not permitted** without a separate
  license. The maintainer is not currently offering one — if
  you need commercial rights, open an issue to start the
  conversation.
 - The license is permissive about everything except commercial
  use: you can fork, modify, deploy in your home/lab, and
  contribute back.
 ## Why this license
 The PolyForm Noncommercial license was chosen because:
 - It's a real, legal, plainly-worded license (not a custom
  half-written variant).
 - It permits the realistic uses for a hobby project (the
  maintainer's homelab, a friend's fleet, a charity's IT
  closet) without inviting commercial vendors to repackage
  the work.
 - It's compatible with the project staying small and
  maintainable — the maintainer doesn't want to be on the hook
  for SLA-grade commercial support.
 ## Contributions
 By contributing, you agree your contributions are licensed
 under the same PolyForm Noncommercial 1.0.0 license.
@@ -1,73 +0,0 @@
 # Alerts and notifications
 restic-manager raises alerts on conditions that need human
 attention. The alert engine evaluates rules on a 60s tick and
 on every job-finished / host-online event.
 ## Built-in alert kinds
 | Kind                | Trigger | Severity |
 |---------------------|---------|----------|
 | `backup_failed`     | A backup job ends in `failed` or `cancelled` | warning |
 | `forget_failed`     | A forget job ends in `failed` | warning |
 | `prune_failed`      | A prune job ends in `failed` | critical |
 | `check_failed`      | A check job ends in `failed` | critical |
 | `agent_offline`     | A host has been offline more than 90s past its heartbeat cadence | warning |
 | `stale_schedule`    | A schedule's "last run" is more than 1.5 × its interval ago | warning |
 | `update_failed`     | An agent self-update reported a failure or didn't reconnect within 90s | warning |
 | `fleet_update_halted`| The rolling fleet-update worker stopped on a failure | critical |
 Each alert has a `dedup_key` so re-firing the same condition
 just bumps `last_seen_at` — the operator gets one row per
 condition, not a thousand.
 ## Lifecycle
 ```
 raised  ──acknowledge──▶  acknowledged  ──resolve──▶  resolved
   │                          │
   └────────auto-resolve──────┘
   (e.g. agent_offline auto-resolves on agent_online)
 ```
 - **Acknowledge** says "I've seen this, stop notifying about it".
 - **Resolve** says "the underlying condition is gone".
 - Some alerts auto-resolve when the condition clears
  (`agent_offline` is the canonical example).
 ## Notification channels
 Configure under **Settings → Notifications**. Each channel can
 subscribe to all alerts or filter by severity.
 ### Webhook
 Posts a JSON envelope to a URL of your choice. Useful for
 piping into Slack via an Incoming Webhook URL or into your own
 alerting tooling.
 ### ntfy
 Pushes a plain-text alert to an [ntfy.sh](https://ntfy.sh/)
 topic. Configure the topic URL; optional bearer token if you
 self-host with auth.
 ### SMTP
 Plain SMTP (with optional TLS). Configure host, port,
 username, password, and the recipient list.
 ## Test fire
 Each channel exposes a **Test fire** button that dispatches a
 single synthetic alert through the channel without touching the
 alert engine. Use this when you've added a channel and want to
 verify connectivity before the next real failure happens.
 ## What gets logged
 Every alert raise / acknowledge / resolve writes an audit log
 entry. The audit log UI at **Settings → Audit log** filters by
 user, action, target, and time range — useful for the
 post-incident "who clicked acknowledge on the prune-failure
 alert" question.
@@ -1,73 +0,0 @@
 # Backups and restores
 ## Running a backup
 Three ways to trigger one:
 1. **Scheduled** — the agent's local cron fires at the time set
   on the schedule.
 2. **Run-now** — operator clicks **Run now** on the host detail
   right rail. Posts to `/hosts/{id}/run-backup` (defaults to all
   source groups) or to a per-group form for finer control.
 3. **API** — `POST /api/hosts/{id}/jobs` with the appropriate
   payload. Same audit + dispatch path.
 In every case the server creates a `jobs` row, broadcasts a
 `command.run` to the host, and lands the operator on the live
 job log page (HTMX `HX-Redirect`).
 ## Cancelling a job
 Any running job — backup, forget, prune, restore, anything —
 exposes a **Cancel** button on its detail page. The server
 broadcasts `command.cancel`, and the agent kills the running
 restic subprocess via context cancel: SIGTERM first, SIGKILL
 after a 5s grace (`cmd.Cancel` + `cmd.WaitDelay`). On Windows the
 SIGTERM step is replaced with `os.Kill` because Windows can't
 deliver SIGTERM. Result: a cancelled job lands as `cancelled`
 within a couple of hundred milliseconds.
 ## Restore wizard
 Restoring a file or path goes through a four-step wizard at
 `/hosts/{id}/restore`:
 1. **Pick a snapshot.** Search by id or by date; the page is
   pre-populated when you launched the wizard from a snapshot row.
 2. **Browse the snapshot tree.** Lazy-loaded children via the
   `MsgTreeList` synchronous WS RPC; results are cached
   per-wizard-session for 30 minutes. Pick the absolute paths
   you want.
 3. **Choose a target.** Either **In place** (overwrites the
   live filesystem; requires you to type the hostname to
   confirm) or **New directory** (default
   `$HOME/rm-restore/<job-id>/`; agent expands `$HOME` /
   `${HOME}` / `~/` and creates the directory chain).
 4. **Review and submit.** Server mints a job, dispatches
   `command.run` with a `RestorePayload`, and `HX-Redirect`s to
   the live job log.
 `--no-ownership` is gated on restic ≥ 0.17 (the flag was added
 in that release). Hosts running 0.16 don't get the flag and
 restore as the running user instead.
 ## Snapshot diff
 Enter two snapshot ids in the **Diff** form on the host detail page
 to start a `JobDiff` job that runs `restic diff <a> <b>`. Output
 streams to the standard live job log. Useful when investigating a
 suspiciously-sized backup.
 ## Job log artefacts
 Every job's log is persisted in `job_logs` (one row per line),
 not just streamed in-memory. That gives you:
 - A live view at `/jobs/{id}` while the job runs.
 - Two download formats from the same page header dropdown:
  - **txt** — one line per row, `HH:MM:SS.mmm  TAG  payload`.
  - **ndjson** — one self-contained JSON object per line
    (`{seq, ts, stream, payload}`), perfect for `jq`.
 Downloads work whether the job is running or finished —
 the source is the DB, not the live socket.
@@ -1,61 +0,0 @@
 # Observability with Prometheus
 restic-manager can expose a Prometheus scrape endpoint at
 `GET /metrics`. The endpoint is **opt-in** — without an explicit
 auth gate it isn't even mounted, so a forgotten config can't
 accidentally publish fleet state.
 The full reference lives at
 [`docs/prometheus.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/docs/prometheus.md);
 the short version follows.
 ## Enable the endpoint
 Set at least one of:
 - `RM_METRICS_TOKEN` — `Authorization: Bearer <token>` required.
 - `RM_METRICS_TRUSTED_CIDR` — restricts source IPs (comma-CIDR).
 When both are set, a request must satisfy both (logical AND). The
 token is compared in constant time; the CIDR check honours
 `X-Forwarded-For` only when the immediate hop matches
 `RM_TRUSTED_PROXY`.
 ## Metrics emitted
 - **Server gauges**: `rm_hosts_total`, `rm_hosts_online`,
  `rm_active_alerts{severity}`, `rm_build_info{...}`.
 - **Per-host gauges**: `rm_host_agent_online`,
  `rm_host_last_backup_timestamp_seconds`,
  `rm_host_last_backup_success`, `rm_host_repo_size_bytes`,
  `rm_host_snapshot_count`, `rm_host_open_alerts`,
  `rm_host_repo_status`.
 - **Histogram**:
  `rm_job_duration_seconds{kind,status,le=…}` (buckets
  `1, 5, 30, 60, 300, 1800, 3600, 21600, 86400, +Inf`).
 In-memory histogram only. Prometheus persists the scrapes; if
 you need durable history at hourly resolution that's
 Prometheus's job.
 ## Sample Grafana dashboard
 [`deploy/grafana/restic-manager-dashboard.json`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/deploy/grafana/restic-manager-dashboard.json)
 imports through Grafana's **+ → Import → Upload JSON file**.
 Six panels:
 1. Fleet status (online / total).
 2. Open alerts by severity.
 3. Backups failing on most-recent run.
 4. Hosts table — last backup, repo size, snapshots, open alerts.
 5. Repo size over time, one line per host.
 6. Job-duration p95 over a 1h window per kind.
 ## Alerting
 restic-manager already has a built-in alert engine
 ([Alerts](./alerts.md)). The dashboard intentionally doesn't
 duplicate it as Prometheus alert rules. If you want
 Prometheus-side alerts on top, write your own based on the
 metrics above — `rm_host_last_backup_success == 0`,
 `time() - rm_host_last_backup_timestamp_seconds > <max age>`,
 or whatever suits your environment.
@@ -1,50 +0,0 @@
 # Updating agents
 Server updates are a `docker compose pull && docker compose up -d` away.
 Agents update via the control plane.
 ## Single-host update
 Each host's detail page shows an **Update agent** button when
 the agent's reported version is older than the server's. The
 button:
 1. Dispatches a `command.update` to that host.
 2. The agent fetches the appropriate binary from
   `$RM_SERVER/agent/binary?os=…&arch=…` to
   `<binary-path>.new`.
 3. Copies the running binary to `<binary-path>.old` (one
   revision back, in case rollback is needed).
 4. Atomic-renames `.new` over the running binary.
 5. Exits cleanly. systemd's `Restart=always` (or Windows SCM)
   brings the process back on the new binary.
 A 90-second timer on the server side waits for a hello at the
 target version and marks the update succeeded — or, if the
 agent doesn't reconnect at the expected version in time, marks
 the update **failed** and raises an `update_failed` alert.
 ## Fleet update
 The admin-only **Settings → Fleet update** page drives a rolling
 update across every host in the fleet:
 - One host at a time.
 - Wait for hello-with-target-version (max 95s).
 - On any host failing, **halt** the rollout, raise a
  `fleet_update_halted` alert, leave the rest of the fleet on
  the old version. No surprise mass-failures.
 You can cancel an in-progress fleet update; the worker stops
 after the current host finishes.
 ## TLS and corruption
 Updates rely on the reverse proxy's TLS to detect corruption in
 transit. There's no separate sha256 verification step — we
 chose the simpler model on the basis that the same TLS already
 gates every other byte the server hands to the agent.
 If you'd like a separate signature step before applying updates,
 that's a future-phase enhancement (see `tasks.md` Phase 6
 candidates).
@@ -1,58 +0,0 @@
 # Environment variables
 The server reads its configuration from environment variables
 (canonical) with an optional YAML overlay. Env wins over YAML so
 operators can tweak a single setting without rewriting the file.
 ## Server
 | Variable                  | Default                          | Meaning |
 |---------------------------|----------------------------------|---------|
 | `RM_LISTEN`               | `:8080`                          | TCP listener for the HTTP server. |
 | `RM_DATA_DIR`             | `/data`                          | Persistent state directory (SQLite, secret key, agent assets). |
 | `RM_BASE_URL`             | (none)                           | Public URL clients use; required for OIDC redirects + cookie scope. |
 | `RM_SECRET_KEY_FILE`      | `${RM_DATA_DIR}/secret.key`      | Path to the AEAD key file. Auto-generated on first run. |
 | `RM_COOKIE_SECURE`        | `true`                           | Set `false` only for local HTTP testing. Controls `Secure` on session cookies. |
 | `RM_TRUSTED_PROXY`        | (none)                           | Comma-separated CIDRs trusted for `X-Forwarded-*`. |
 | `RM_BUNDLED_ASSETS_DIR`   | `/opt/restic-manager/dist`       | Read-only path with bundled agent binaries + install scripts (the docker image bakes them here). |
 | `RM_METRICS_TOKEN`        | (off)                            | When set, `GET /metrics` requires `Authorization: Bearer <token>`. |
 | `RM_METRICS_TRUSTED_CIDR` | (off)                            | When set, `GET /metrics` restricts source IPs (comma-CIDR). |
 OIDC variables (all optional; empty issuer disables OIDC):
 | Variable                       | Meaning |
 |--------------------------------|---------|
 | `RM_OIDC_ISSUER`               | OIDC discovery URL (e.g. `https://auth.example.com`). |
 | `RM_OIDC_CLIENT_ID`            | Client ID registered with the IdP. |
 | `RM_OIDC_CLIENT_SECRET`        | Client secret (or use `RM_OIDC_CLIENT_SECRET_FILE`). |
 | `RM_OIDC_CLIENT_SECRET_FILE`   | Path to a file holding the client secret. |
 | `RM_OIDC_DISPLAY_NAME`         | Button label on the login page (e.g. "Authelia"). |
 | `RM_OIDC_ROLE_CLAIM`           | Token claim that carries roles (default `groups`). |
 | `RM_OIDC_ROLE_MAPPING`         | `idp-group=role` entries, comma-separated (e.g. `rm-admin=admin,rm-ops=operator`). |
 | `RM_OIDC_REDIRECT_URL`         | Override for the redirect URL; defaults to `${RM_BASE_URL}/auth/oidc/callback`. |
 ## Agent
 | Variable             | Default | Meaning |
 |----------------------|---------|---------|
 | `RM_AGENT_CONFIG`    | `/etc/restic-manager/agent.yaml` (Linux) | Config file path. |
 The agent's other settings live in the YAML file (server URL,
 bearer token, optional cert pin). The install script writes that
 file for you at enrolment.
 ## Build-time
 The Makefile threads `-ldflags` from `git describe` into the
 `internal/version` package so `--version` and the dashboard
 footer show the right values:
 ```
 -X gitea.dcglab.co.uk/steve/restic-manager/internal/version.Version=$(VERSION)
 -X gitea.dcglab.co.uk/steve/restic-manager/internal/version.Commit=$(COMMIT)
 ```
 If you build with `go build` directly (no Makefile), `Version`
 falls back to `dev` and the agent-update comparison falls back
 to "always equal". Source-build deployments can still run; they
 just don't participate in the self-update flow.
@@ -1,82 +0,0 @@
 # HTTP endpoints
 A non-exhaustive map of the surfaces the control plane exposes.
 All `/api/*` routes return JSON; all other paths render HTML
 (server-rendered with HTMX in the loop).
 The canonical wiring lives at
 [`internal/server/http/server.go`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/internal/server/http/server.go);
 when in doubt, read the routes block there.
 ## Public (no auth)
 | Method | Path                       | Purpose |
 |--------|----------------------------|---------|
 | GET    | `/healthz`                 | Liveness probe. Returns 204. |
 | POST   | `/api/auth/login`          | Local-user login. JSON body: `{username, password}`. |
 | POST   | `/api/auth/logout`         | Invalidate the session cookie. |
 | POST   | `/api/bootstrap`           | First-run admin creation. Accepts the token printed at first start. |
 | POST   | `/api/agents/enroll`       | Token-based agent enrolment. |
 | POST   | `/api/agents/announce`     | Announce-and-approve agent enrolment. |
 | GET    | `/agent/binary?os=&arch=`  | Serves the agent binary for the install scripts. |
 | GET    | `/install/*`               | Serves the Linux + Windows install scripts and the systemd unit. |
 | GET    | `/api/version`             | Build version + commit JSON. |
 | GET    | `/metrics`                 | Prometheus exposition (only when opted-in via `RM_METRICS_TOKEN` / `RM_METRICS_TRUSTED_CIDR`). |
 | GET    | `/login`, `/setup`, `/bootstrap` | UI pages. |
 ## Authenticated (any role)
 | Method | Path                                     | Purpose |
 |--------|------------------------------------------|---------|
 | GET    | `/`                                      | Dashboard. |
 | GET    | `/hosts/{id}`                            | Host detail. |
 | GET    | `/hosts/{id}/repo`                       | Repo tab. |
 | GET    | `/hosts/{id}/jobs`                       | Jobs tab. |
 | GET    | `/hosts/{id}/sources`                    | Source groups list. |
 | GET    | `/hosts/{id}/schedules`                  | Schedules list. |
 | GET    | `/jobs/{id}`                             | Live job log. |
 | GET    | `/api/hosts`, `/api/fleet/summary`       | JSON list + summary. |
 | GET    | `/api/jobs/{id}/stream`                  | WebSocket subscription to a job's live log. |
 | GET    | `/api/jobs/{id}/log.{txt,ndjson}`        | Persisted log download. |
 ## Operator role and above
 | Method | Path                                  | Purpose |
 |--------|---------------------------------------|---------|
 | POST   | `/hosts/{id}/run-backup`              | Run-now (HTMX form-post). |
 | POST   | `/hosts/{id}/sources/{gid}/run-now`   | Per-source-group run-now. |
 | POST   | `/hosts/{id}/repo/{prune,check,unlock,reinit,probe}` | Maintenance actions. |
 | POST   | `/api/hosts/{id}/snapshots/diff`      | Snapshot-diff job. |
 | POST   | `/hosts/{id}/restore`                 | Restore wizard submit. |
 | POST   | `/api/jobs/{id}/cancel`               | Cancel a running job. |
 | POST   | `/hosts/{id}/tags`                    | Update host tags. |
 | POST   | `/hosts/{id}/sources` and friends     | Source-group CRUD. |
 | POST   | `/hosts/{id}/schedules` and friends   | Schedule CRUD. |
 | POST   | `/hosts/{id}/repo/credentials`, `/admin-credentials` | Credential update. |
 ## Admin role only
 | Method | Path                                  | Purpose |
 |--------|---------------------------------------|---------|
 | POST   | `/hosts/new`                          | Mint enrolment token (Add host). |
 | POST   | `/hosts/{id}/delete`                  | Delete + cascade. |
 | POST   | `/hosts/{id}/update`                  | Dispatch a single agent update. |
 | GET/POST | `/settings/users/...`                | User management. |
 | POST   | `/settings/notifications/...`         | Notification channel CRUD + test fire. |
 | POST   | `/settings/fleet-update/...`          | Fleet-update worker. |
 ## WebSocket
 | Path                           | Who connects | Auth |
 |--------------------------------|--------------|------|
 | `/ws/agent`                    | Agent        | Bearer token issued at enrolment. |
 | `/ws/agent/pending`            | Agent (announce flow) | Pending-id query param. |
 | `/api/jobs/{id}/stream`        | Browser      | Session cookie. |
 ## RBAC enforcement
 Routes are grouped into chi route-groups by required role
 (`viewer < operator < admin`); the `requireRole` middleware in
 `internal/server/http/middleware.go` is the bouncer. Sessions
 re-validate `disabled_at` on every request, so a disabled user's
 cookie stops working immediately.
@@ -1,32 +0,0 @@
 # Roadmap
 The live roadmap is in
 [`tasks.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/tasks.md).
 Phases ship in order; items inside a phase ship as the
 opportunity arises.
 ## Status snapshot
 | Phase | Theme                                            | Status |
 |-------|--------------------------------------------------|--------|
 | 0     | Project bootstrap                                | ✅ done |
 | 1     | MVP: enrolment, visibility, on-demand backup     | ✅ done |
 | 2     | Scheduling, retention, repo operations           | ✅ done |
 | 3     | Restore, alerts, audit                           | ✅ done |
 | 4     | RBAC, OIDC, host tags                            | ✅ done |
 | 5     | OSS readiness                                    | 🚧 in flight (this docs site is part of it) |
 | 6     | Update delivery + observability polish           | ✅ done |
 ## What's not on the roadmap
 These come from the non-goals list in [`spec.md` §2](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/spec.md):
 - Replacing restic itself or providing custom repo formats
 - Managing non-restic backup tools
 - Multi-tenancy / SaaS deployment
 - High availability of the control plane (SQLite, single-instance)
 - Mobile-native apps (responsive web only)
 If something there is critical to your use case, restic-manager
 isn't the right tool. That's not a closed door — it's a
 deliberate scope decision so the project stays maintainable.
@@ -1,35 +0,0 @@
 # Reporting vulnerabilities
 The full disclosure policy lives in
 [`SECURITY.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/SECURITY.md)
 at the repo root. The short version:
 - **Don't open a public issue.**
 - Send a Gitea private message to `steve` on
  <https://gitea.dcglab.co.uk>, or email the address on the
  maintainer's profile, with a subject like
  `[SECURITY] restic-manager: <one-line summary>`.
 - Expect an acknowledgement within 3 working days; escalate
  through the other channel if you don't get one.
 - Default disclosure window is **30 days from confirmed report
  to public disclosure**, faster if a PoC is already
  circulating, slower only by mutual agreement.
 ## What to include
 A description of the issue and the impact, the affected
 component (server / agent / install script / docs), the version,
 and reproduction steps. A working PoC is welcome but not
 required — a credible threat model is enough.
 ## In scope vs. out of scope
 See the full policy. Quick highlights:
 - **In scope:** server, agent, install scripts, docker image,
  docker-compose reference, crypto choices, docs that lead to
  insecure configs.
 - **Out of scope:** restic itself (report upstream), unpatched
  third-party deps (report upstream first), pre-authenticated
  admin abuse (admins are designed to have full power), DoS on
  deployments without the recommended reverse proxy.
@@ -1,72 +0,0 @@
 # Hardening checklist
 A baseline for new deployments. Most of these are defaults; the
 list is here to make audit easy.
 ## Server
 - [ ] Reverse proxy in front, TLS terminating at the proxy
      (Caddy/nginx/Traefik).
 - [ ] `RM_TRUSTED_PROXY` set to the proxy's CIDR.
 - [ ] `RM_BASE_URL` matches the public hostname and the cookie
      scope you want.
 - [ ] `RM_COOKIE_SECURE=true` (the default; only set `false`
      for local HTTP testing).
 - [ ] HTTP listener bound to **localhost** in the compose file,
      not `0.0.0.0`. The reverse proxy is the only thing that
      should reach it.
 - [ ] `secret.key` backed up separately from the database.
 - [ ] Bootstrap token consumed and the printed log line scrubbed
      from any log archive.
 ## Authentication
 - [ ] Admin user has a password ≥ 12 characters (the floor).
 - [ ] OIDC enabled if you have an IdP — local password auth
      stays as a break-glass.
 - [ ] Disabled (not deleted) any users who change roles or leave
      so their session is invalidated immediately.
 - [ ] The last-admin guard isn't tripped — there's always at
      least one enabled admin user.
 ## Repo credentials
 - [ ] Append-only credential set as the everyday cred for every
      host.
 - [ ] Admin credential set only where prune cadence is enabled.
 - [ ] No credentials reused across hosts. Each host should have
      its own credential pair so a single host compromise has a
      single blast radius.
 - [ ] If using rest-server, `--append-only` flag is on for the
      everyday user; the prune user is a separate identity.
 ## Agent
 - [ ] Agent runs as `root` (Linux) or `LocalSystem` (Windows)
      **only when** the source paths require it. Otherwise pin
      a service user that has read access to what's backed up
      and nothing else.
 - [ ] systemd unit's sandboxing flags are intact
      (`NoNewPrivileges`, `Protect*`, `MemoryDenyWriteExecute`).
 - [ ] Agent's config file `/etc/restic-manager/agent.yaml` is
      mode `0600` and owned by the service user. The bearer
      token lives in there.
 ## Operations
 - [ ] Alerts wired to a real channel (webhook into Slack,
      ntfy topic, SMTP) — not just sitting in the UI.
 - [ ] Test-fire each notification channel after configuring.
 - [ ] Audit-log retention is long enough to cover the operator's
      incident-response window.
 - [ ] Prometheus endpoint, if enabled, gated by token AND CIDR
      where practical (default is opt-in / off).
 ## Recovery
 - [ ] A documented procedure for rotating a leaked agent bearer
      (delete + re-enrol the host).
 - [ ] A test-restore done at least once, end-to-end, before
      relying on the system in anger.
 - [ ] `secret.key` and the SQLite database covered by separate
      backup paths, so that no single backup contains both the
      encrypted credentials and the key that decrypts them.
@@ -1,110 +0,0 @@
 # Threat model
 This page documents what restic-manager defends against, what it
 doesn't, and the trust assumptions a deployment is making. The
 canonical version lives in [`spec.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/spec.md)
 §11; the summary here is shaped for operators rather than
 implementers.
 ## Trust boundaries
 ```
 ┌──────────────────────────────────────────┐
 │  TRUSTED zone                            │
 │  ┌─────────────┐    ┌──────────────┐     │
 │  │  Operator's │    │   Reverse    │     │
 │  │   browser   │◄──►│    proxy     │     │  TLS terminates here
 │  └─────────────┘    └──────┬───────┘     │
 └────────────────────────────┼─────────────┘
                             │ HTTP, plaintext
                             │ (loopback or trusted LAN)
 ┌────────────────────────────▼─────────────┐
 │  Server (control plane)                  │
 └────────────┬─────────────────────────────┘
             │ outbound WebSocket (TLS to clients via proxy)
             │ — bearer-authenticated
 ┌────────────▼──────────────┐
 │  Agent (per host)         │  ◄── attacker model: assume one
 └────────────┬──────────────┘       endpoint can be compromised
             │ subprocess
             ▼
   restic ──▶ repository (rest-server / S3 / SFTP / …)
 ```
 ## What we defend against
 ### Network attacker between operator and server
 - HTTPS via the reverse proxy is the only operator-facing surface
  on a sane deployment.
 - `RM_COOKIE_SECURE=true` (default) means the session cookie
  refuses to ride a non-HTTPS connection.
 - `RM_TRUSTED_PROXY` gates whether `X-Forwarded-*` is honoured;
  a bypassing request can't spoof the client IP.
 ### Compromised agent host
 - The agent's bearer token can dispatch commands **only on its
  own host**. It can't read other hosts' state, dispatch jobs
  on other hosts, or escalate within the control plane.
 - If you suspect a host compromise:
  1. Delete the agent's host row via **Hosts → Delete**
     (the delete cascades to the bearer hash).
  2. Rotate the repo credential at the rest-server / object
     store side.
  3. Review the audit log — it lists every action that bearer ever drove.
 ### DB compromise without the secret key
 - Repo credentials are AEAD-encrypted at rest. A DB dump alone
  doesn't expose them.
 - Agent bearer **hashes** are leaked; that's enough to
  authenticate as any agent until you revoke. A rotation
  procedure is just "delete + re-enrol" today.
 - Operator passwords are bcrypt-hashed; OIDC users have no
  password to leak.
 - Session tokens are hashed; an attacker can't replay a
  session from a DB dump.
 ### DB compromise WITH the secret key
 The attacker can decrypt every credential. Treat
 `secret.key` with the same care as a password manager database.
 Back it up to a separate vault, not to the same Docker volume
 as the database.
 ### Forget/prune as a DoS vector
 - The everyday backup credential cannot prune (append-only).
 - The admin credential is only pushed to the agent at the
  moment of dispatch and discarded after the job ends.
 - Compromise of a single agent host does **not** grant prune
  rights — at worst the attacker gets fresh write access until
  the credential is rotated.
 ### Operator-side typo or bad copy-paste
 - Repo credentials are stored encrypted; mis-typed creds fail
  fast on the next `restic` invocation rather than silently
  corrupting state.
 - NS-03 added auto-init: the first dispatched job after creds
  change runs `restic init`, surfaces the error eagerly under
  the host's vitals strip if the creds are bad, and resets the
  host's `repo_status` so the operator can retry without
  hunting through job logs.
 ## What we don't defend against
 - **Insider threat at the maintainer level.** A malicious
  maintainer can publish a backdoored container; SBOM /
  signing infrastructure (Phase 6 candidate) would help here
  but isn't shipped today.
 - **Supply chain.** We pin module versions (`go.sum`) and
  pin the Tailwind binary's release tag, but a compromise in
  one of those upstreams would land here.
 - **Side-channel via restic itself.** A bug in restic that
  enables snapshot-content disclosure is restic's problem; the
  control plane doesn't see snapshot bytes either way.
 - **DoS via resource exhaustion** without the recommended
  reverse-proxy / rate-limit in front. Don't expose the
  server's HTTP port to the public internet directly.
@@ -1,120 +0,0 @@
 # End-to-end test harness
 The e2e harness stands up the full production-shaped stack
 (server + agent + rest-server) in Docker Compose and drives it
 through Playwright. CI runs it on every PR; operators can run it
 locally too.
 ## Files
 ```
 e2e/
 ├── compose.e2e.yml         compose stack: server + rest-server + agent
 ├── Dockerfile.agent        Linux container for the agent (alpine + restic)
 ├── agent-entrypoint.sh     decides between announce / token-enrol / run
 └── playwright/
    ├── package.json
    ├── playwright.config.ts
    └── tests/
        ├── lib/server.ts   bootstrap, login, accept, poll helpers
        └── smoke.spec.ts   happy-path: enrol → backup → succeeded
 ```
 ## Local run
 Prerequisites: Docker + Docker Compose, and `npx` for Playwright.
 ```sh
 # 1. Build + bring up the stack (server, rest-server, source data).
 docker compose -f e2e/compose.e2e.yml up --build -d server rest-server source-fixture
 # 2. Wait for the server, then scrape the bootstrap token from the log.
 until curl -fsS http://127.0.0.1:8080/api/version >/dev/null; do sleep 1; done
 RM_BOOTSTRAP_TOKEN=$(docker compose -f e2e/compose.e2e.yml logs server \
    | grep -Eo '[a-zA-Z0-9_-]{40,}' | head -1)
 export RM_BOOTSTRAP_TOKEN
 # 3. Start the agent (it announces against the running server).
 docker compose -f e2e/compose.e2e.yml up -d agent
 # 4. Install + run Playwright.
 cd e2e/playwright
 npm install
 npx playwright install --with-deps chromium
 npx playwright test
 ```
 When the test passes you'll see:
 ```
 Running 2 tests using 1 worker
  ✓  smoke: enrol-via-announce → backup › happy path completes in under a minute (47s)
  ✓  smoke: scrape /metrics › metrics endpoint exposes the host gauge (180ms)
  2 passed (47.5s)
 ```
 Tear-down:
 ```sh
 docker compose -f e2e/compose.e2e.yml down -v
 ```
 `-v` removes the named volumes too — important between runs because
 the rest-server volume holds an initialised repo and the
 agent-config volume holds a stale bearer.
 ## What the test exercises
 1. **Bootstrap.** Posts the admin-creation request to
   `/api/bootstrap` with the token scraped from the server log.
 2. **Login (UI).** Drives the login form via Playwright; verifies
   the dashboard loads with a session cookie set.
 3. **Pending host appears.** Polls the dashboard for the inline
   accept form generated by the announcing agent; reads the
   pending-id out of its action URL.
 4. **Accept.** POSTs `/api/pending-hosts/{id}/accept` with the
   rest-server URL + repo password. The server mints a Host row
   + bearer + AEAD-encrypted creds and pushes the bearer down
   the still-open pending WebSocket.
 5. **Online + auto-init.** Polls `/api/hosts` until the new host
   is `status=online`. Auto-init runs as part of this — the
   first dispatched job after creds save is `restic init`.
 6. **Run backup.** Submits the host detail page's `Run now`
   form; expects `HX-Redirect` to the live job page.
 7. **Verify.** Polls `/api/hosts` until the host's
   `last_backup_status` flips to `succeeded`.
 8. **Metrics.** Scrapes `/metrics` and asserts the
   server-gauge + build-info lines are present (the compose
   stack opens the endpoint via `RM_METRICS_TRUSTED_CIDR=0.0.0.0/0`).
 ## CI workflow
 [`.gitea/workflows/e2e.yml`](../.gitea/workflows/e2e.yml) runs the
 suite on every PR into `main`. On failure it dumps the last 200
 lines of each container log as a workflow annotation and uploads
 the Playwright HTML report as an artefact.
 ## When tests fail
 - **Pending host never appears.** Agent container probably
  couldn't reach the server. Check `docker compose logs agent`
  for connection errors and `docker compose logs server` for
  any 4xx on `/api/agents/announce`.
 - **Backup hangs in `running`.** The agent shells out to
  `restic`; check the live job log at
  `http://127.0.0.1:8080/jobs/<id>` (still up after a
  failed test as long as you didn't `down -v`).
 - **`RM_BOOTSTRAP_TOKEN not set`.** The server log scrape
  matched the wrong line or the token regex is too tight. The
  server prints the token on a line starting with `    ` (four
  spaces) inside a banner; widen the regex if your server log
  format changes.
 ## Adding new tests
 The harness is intentionally flat — one `*.spec.ts` per
 scenario. Reuse the helpers in `lib/server.ts` and avoid
 duplicating bootstrap / login boilerplate. Heavy fixtures
 (custom users, OIDC IdP) belong in their own compose override
 file rather than complicating `compose.e2e.yml`.
@@ -1,259 +0,0 @@
 # P2 Completion Implementation Plan
 > **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
 **Goal:** Close every remaining P2 task in `tasks.md`: P2R-09 (auto-init UX), P2R-10/11/12 (hooks), P2R-13 (bandwidth wiring + per-job override), P2R-14 (schedule next/last run), P2-16 (Windows svc), P2-17 (`install.ps1`), P2-18 (announce-and-approve).
 **Architecture:** Server stays HTTP+WS; agent stays a single binary that auto-restages via `make build`. Hooks live on `source_groups` (and host-level defaults). Announce-and-approve adds a separate WS path (`/ws/agent/pending`) and a Pending hosts panel; token-flow stays default. Windows service support uses `golang.org/x/sys/windows/svc` behind a `//go:build windows` tag — Linux builds untouched. **Operator is away — make best guesses on small UX choices, but commit each item separately so the choices are reviewable.**
 **Tech Stack:** Go 1.23+, chi router, modernc/sqlite, `coder/websocket`, `robfig/cron/v3`, HTMX + Tailwind, `golang.org/x/sys/windows/svc`, Ed25519 (stdlib).
 ---
 ## Pre-flight
 - [ ] **Run baseline:** `go vet ./... && go build ./... && go test ./...` — must be green before starting. Restage agent + restart server (per CLAUDE.md restage block) so smoke env is warm.
 ## Order of execution
 Smallest blast-radius first. UI polish → bandwidth → next/last → hooks → announce → Windows. Commit and restage at each task boundary. Run `go vet ./... && go test ./...` before every commit.
 ---
 ## Task 1 — P2R-13a: Wire bandwidth caps into restic invocations
 **Files:**
 - Modify: `internal/restic/runner.go` (add `LimitUploadKBps`, `LimitDownloadKBps` to `Env` or to a per-call options struct already present; emit `--limit-upload N`/`--limit-download N` on `restic backup|forget|prune|check|restore`)
 - Modify: `internal/agent/runner/*.go` — pass host-wide caps into the runner. Caps come from `agent.config.Config` or are pushed via `config.update`. Decision: ship caps in the existing `config.update` envelope as new fields `bandwidth_up_kbps`, `bandwidth_down_kbps`. Server pushes on hello + on `PUT /api/hosts/{id}/bandwidth`.
 - Modify: `internal/api/messages.go` — extend `ConfigUpdatePayload` with the two int pointers.
 - Modify: `internal/server/ws/handler.go` (or wherever hello/config push lives) — include caps in the pushed config.
 - Modify: `internal/server/http/host_bandwidth.go` — after `SetHostBandwidth`, fan out a `config.update` to the connected agent (mirror the credentials-edit path).
 - Test: `internal/restic/runner_test.go` — assert flag injection.
 - Test: `internal/server/ws/*_test.go` — assert config.update carries caps on hello and on edit.
 - [ ] **Step 1.1** Add `LimitUploadKBps *int`, `LimitDownloadKBps *int` to whatever per-host config the runner already consults. Existing pattern is `restic.Env{}`; extend it.
 - [ ] **Step 1.2** Failing test in `internal/restic/runner_test.go`: build a backup command with `LimitUploadKBps=1024`, assert the resulting argv contains `--limit-upload 1024`.
 - [ ] **Step 1.3** Implement: prepend the flags in argv builders for `backup`, `forget`, `prune`, `check`, `restore`. Skip when nil/<=0.
 - [ ] **Step 1.4** Wire `config.update` payload — server reads `Host.BandwidthUpKBps`/`DownKBps`, includes them in the existing `ConfigUpdatePayload` push on hello and on bandwidth edit (mirror cred-edit fan-out in `internal/server/http/host_credentials.go`).
 - [ ] **Step 1.5** Agent applies caps: store in the in-memory dispatcher state on `config.update`, attach to every restic call.
 - [ ] **Step 1.6** `go vet ./... && go test ./... && make build && <restage block>`. Commit:
 ```
 agent+server: apply host bandwidth caps to restic invocations
 ```
 ## Task 2 — P2R-13b: Per-job override on Run-now confirm dialog
 **Decision:** A small numeric input on the per-source-group Run-now button (and dashboard Run-all). Operator is away — keep it minimal: two optional inputs (up/down KB/s) on the dispatch endpoint; UI shows a `<details>` "Limit bandwidth for this run" disclosure with two number inputs.
 **Files:**
 - Modify: `internal/server/http/sources.go` (or wherever the per-group Run-now POST lives) — accept optional `bandwidth_up_kbps`/`bandwidth_down_kbps` form fields, pass through.
 - Modify: dispatch path (`internal/server/dispatch_*.go` or `ws/handler.go` job-dispatch core) — accept overrides, include in the `command.run` payload.
 - Modify: `internal/api/messages.go` — `CommandRunPayload` gains optional caps that take precedence over host-wide caps when present.
 - Modify: agent dispatcher — use the payload override if present, else fall back to config caps.
 - Modify: `web/templates/pages/host_sources.html` (and the schedules Run-now form) — `<details>` block.
 - Test: HTTP test for the new form fields; agent runner test for override precedence.
 - [ ] **Step 2.1** Failing test: POST to per-group Run-now with `bandwidth_up_kbps=512` → assert dispatched payload carries 512.
 - [ ] **Step 2.2** Implement endpoint changes + payload extension.
 - [ ] **Step 2.3** Agent override precedence test (payload wins over config).
 - [ ] **Step 2.4** UI `<details>` blocks (one per Run-now form).
 - [ ] **Step 2.5** Playwright spot-check via `:8080` smoke env: open Sources tab, expand the Run-now disclosure, fire with limit=128, then open the live job log and confirm the agent's restic argv (read `/tmp/rm-smoke/server.log` for the dispatched command — it logs argv) shows `--limit-upload 128`.
 - [ ] **Step 2.6** Commit.
 ## Task 3 — P2R-14: Schedule "next run" / "last run"
 **Files:**
 - Modify: `internal/store/schedules.go` — add `NextRunAt(time.Time)` derivation helper and `LatestScheduledJobAt(host_id, schedule_id) (time.Time, error)` (or a single batched fetch for all schedules of a host).
 - Modify: dashboard host row (`web/templates/partials/host_row.html`) — show "Next: …" and "Last: …" when there's a single covering schedule (already detected in slice 5).
 - Modify: `web/templates/pages/host_schedules.html` — add Next/Last columns to the schedules table.
 - Modify: relevant page handlers (`internal/server/http/ui_schedules.go`, dashboard handler) — populate the data.
 - Test: `schedules_test.go` for next-run derivation (parse cron, compute next from a fixed `now`).
 - [ ] **Step 3.1** Add `NextRun(cronExpr string, from time.Time) (time.Time, error)` helper using `robfig/cron/v3`'s `Parse(...).Next(from)`. Test with three crons.
 - [ ] **Step 3.2** Add `LatestJobByActorKindForSchedule(host_id, schedule_id) (time.Time, status, error)` query against `jobs` (filter `actor_kind='schedule'` AND `schedule_id=?`, ORDER BY `started_at` DESC LIMIT 1).
 - [ ] **Step 3.3** Wire schedules-page handler to populate Next/Last per row; render relative time + ISO tooltip (mirror existing `formatRelTime` template helper if it exists; otherwise use a simple "5m ago" helper).
 - [ ] **Step 3.4** Wire dashboard row: when single covering schedule, surface "Next: 03:00" / "Last: 8h ago — succeeded".
 - [ ] **Step 3.5** Playwright spot-check: a host with a schedule shows Next/Last; pause it → Next becomes "—" / "(paused)".
 - [ ] **Step 3.6** Commit.
 ## Task 4 — P2R-09: Auto-init UX polish
 **Files:**
 - Modify: `web/templates/pages/host_repo.html` — danger-zone re-init button + two-step confirm (type the host name).
 - Modify: `internal/server/http/ui_repo.go` (or new `repo_reinit.go`) — `POST /hosts/{id}/repo/reinit` admin-only, audit-logged. Server runs `restic init --force` (or wipes-then-inits — pick the safer of the two; restic doesn't truly wipe a repo, the operator must clear the bucket. **Best guess:** dispatch a normal `init` job with a flag that re-runs even if the repo claims to exist; if restic refuses, surface "the repo on the remote already has data — clear it manually before re-init" via the job log).
 - Modify: host detail page header / vitals strip — surface init result line. Use the existing latest-`init`-job query to render "repo ready · initialised <relative time> ago" or "init failed · job N · retry".
 - Test: HTTP test for re-init endpoint (auth, audit, host-name confirm); template test that the result line renders for both states.
 - [ ] **Step 4.1** Add helper: `LatestJobByKind(host_id, "init")` — already exists from P2R-06 (`store.LatestJobByKind`). Reuse.
 - [ ] **Step 4.2** Render init line into vitals strip; show "init failed" amber when latest init failed.
 - [ ] **Step 4.3** Implement `POST /hosts/{id}/repo/reinit` handler — admin role check, requires a `confirm_hostname` form field that must equal `host.Name`, returns 400 otherwise. Dispatches a fresh `init` job.
 - [ ] **Step 4.4** Add danger-zone re-init form to `host_repo.html` (currently disabled per slice 4). Two-step confirm with the typed hostname.
 - [ ] **Step 4.5** Playwright: visit `/hosts/{id}/repo`, click re-init, type wrong hostname → blocked; type right hostname → dispatches init job → returns to live log.
 - [ ] **Step 4.6** Commit.
 ## Task 5 — P2R-10: Hook schema (migration 0010)
 **Files:**
 - Create: `internal/store/migrations/0010_hooks.sql`
  - `ALTER TABLE source_groups ADD COLUMN pre_hook BLOB;`  (AEAD ciphertext, NULLable)
  - `ALTER TABLE source_groups ADD COLUMN post_hook BLOB;`
  - `ALTER TABLE hosts ADD COLUMN pre_hook_default BLOB;`
  - `ALTER TABLE hosts ADD COLUMN post_hook_default BLOB;`
  - All four are AEAD ciphertext (existing `crypto.AEAD`); BLOB column type.
 - Modify: `internal/store/types.go` — add `PreHook *string` (decrypted), `PostHook *string` to `SourceGroup`; same to `Host`.
 - Modify: `internal/store/sources.go` + `internal/store/hosts.go` — getters/setters encrypt on write, decrypt on read. Pass `crypto.AEAD` through (pattern mirrors `host_credentials.go`).
 - Test: encrypt/decrypt round-trip; setting `nil` clears the column.
 - [ ] **Step 5.1** Write migration SQL. Column-level ALTERs only (per CLAUDE.md).
 - [ ] **Step 5.2** Update store types + getters/setters with AEAD encrypt/decrypt. Mirror `internal/store/host_credentials.go` patterns exactly.
 - [ ] **Step 5.3** Round-trip test: set hook on a source group; reload; assert plaintext returned. Set nil; assert nil after reload.
 - [ ] **Step 5.4** `go vet && go test`. Commit.
 ## Task 6 — P2R-11: Agent execution of hooks
 **Files:**
 - Modify: `internal/api/messages.go` — `ConfigUpdatePayload` (or the per-source-group bundle inside `ScheduleSetPayload`) carries `PreHook`, `PostHook` plaintext (server has decrypted by then; wire is authenticated WS, same trust boundary as repo creds).
 - Modify: agent dispatcher — for `kind=backup` only:
  - Run `pre_hook` (if present) via `os/exec` with the host shell (`/bin/sh -c` on Linux, `cmd.exe /C` on Windows). Capture stdout+stderr → JobLog with `hook:` prefix. Non-zero exit aborts the backup, marks the job failed with `pre_hook` error.
  - Run `post_hook` (if present) **always** after the backup, with `RM_JOB_STATUS=succeeded|failed` env var. Capture into JobLog, prefix `hook:`. Non-zero exit on post_hook does NOT change job status (warning logged).
 - Skip both for `kind` ∈ {forget, prune, check, unlock, init} per spec.md §14.3.
 - Test: dispatcher test with a `pre_hook` that exits 1 → backup not started; `post_hook` always runs and sees `RM_JOB_STATUS`.
 - [ ] **Step 6.1** Plumb hooks through `ScheduleSetPayload` source-group bundle + per-group Run-now `command.run` payload (override host-default with group hook if both present). Server-side resolution: host default if group hook is empty.
 - [ ] **Step 6.2** Agent dispatcher: factor hook execution into `internal/agent/runner/hooks.go`. Use `exec.CommandContext`, set env, plumb output to existing JobLog stream with `Source: "hook"` (or prefix the log lines `hook: …`).
 - [ ] **Step 6.3** Failing test in `internal/agent/runner/runner_test.go` (create file if absent): `pre_hook=/bin/false` → job fails with `pre_hook failed (exit 1)` and the actual restic backup never runs (assert via mock-restic shim).
 - [ ] **Step 6.4** Test: `post_hook` runs even when backup fails; receives `RM_JOB_STATUS=failed`.
 - [ ] **Step 6.5** Test: hooks skipped on `forget`/`prune`/`check`/`unlock` jobs.
 - [ ] **Step 6.6** `go vet && go test && make build && <restage block>`. Commit.
 ## Task 7 — P2R-12: Hook editor UI
 **Files:**
 - Modify: `web/templates/pages/source_group_edit.html` (new or extend existing source-group form) — `<textarea>` for pre_hook, `<textarea>` for post_hook, with the warning banner: "this hook runs as the agent service user (root on Linux; LocalSystem on Windows)".
 - Modify: source-group HTTP handler (`internal/server/http/sources.go`) — accept hook fields on POST/PUT, encrypt-and-persist via store.
 - Host-level default hooks: a new "Settings" tab section on host detail was the first idea, but that tab is currently inert (per P1-25); the alternatives are a new sub-tab or extending the Repo page. **Decision:** add `pre_hook_default` / `post_hook_default` to the Repo page under a new "Hooks" section since Settings is still inert.
 - Modify: source-group form admin-only check; post-only edit allowed by operators? **Decision:** admin-only edit per spec; render but disable for operators.
 - Modify: audit-log writer — emit `source_group.hook_updated` and `host.default_hook_updated` events (without the hook body).
 - Test: HTTP test for create + update; admin-only enforcement; audit row written without secret.
 - [ ] **Step 7.1** Source-group form extension + handler wiring.
 - [ ] **Step 7.2** Repo page Hooks section (host defaults).
 - [ ] **Step 7.3** Audit entries.
 - [ ] **Step 7.4** Playwright: as admin, set a `pre_hook` of `echo hello`, fire Run-now, open live log, confirm `hook: hello` line appears.
 - [ ] **Step 7.5** Commit.
 ## Task 8 — P2-18a: Announce schema + endpoint
 **Files:**
 - Create: `internal/store/migrations/0011_pending_hosts.sql`
  ```sql
  CREATE TABLE pending_hosts (
    id                 TEXT PRIMARY KEY,
    hostname           TEXT NOT NULL,
    os                 TEXT NOT NULL,
    arch               TEXT NOT NULL,
    agent_version      TEXT NOT NULL,
    restic_version     TEXT NOT NULL,
    public_key         BLOB NOT NULL,             -- 32-byte Ed25519
    fingerprint        TEXT NOT NULL,             -- "SHA256:hex"
    announced_from_ip  TEXT NOT NULL,
    first_seen_at      TEXT NOT NULL,
    last_seen_at       TEXT NOT NULL,
    expires_at         TEXT NOT NULL
  );
  CREATE INDEX pending_hosts_expires ON pending_hosts(expires_at);
  CREATE INDEX pending_hosts_fingerprint ON pending_hosts(fingerprint);
  ```
 - Create: `internal/store/pending_hosts.go` — `CreatePendingHost`, `GetPendingHostByFingerprint`, `ListPendingHosts`, `DeletePendingHost`, `TouchPendingHost`, `DeleteExpiredPendingHosts`.
 - Create: `internal/server/http/announce.go` — `POST /api/agents/announce` accepts `{hostname, os, arch, agent_version, restic_version, public_key (base64)}`. Validates protocol_version implicitly via `agent_version` check. Token-bucket rate limit per source IP (10/min). Global cap 100 pending rows. Returns `{fingerprint, pending_id, hostname_collision: bool}`.
 - Test: `announce_test.go` — happy path; rate limit; cap; collision flag.
 - [ ] **Step 8.1** Migration + store layer + tests.
 - [ ] **Step 8.2** Endpoint + tests (use a fake clock + in-process token bucket).
 - [ ] **Step 8.3** Commit.
 ## Task 9 — P2-18b: Pending WS + accept/reject
 **Files:**
 - Create: `internal/server/ws/pending.go` — `GET /ws/agent/pending` upgrade. Server issues a 32-byte nonce; agent signs it with its Ed25519 private key; server verifies against the `public_key` stored on the pending row keyed by the supplied `pending_id`. If valid, hold the connection open; on accept, push a single `enrolled` message containing `{bearer_token, repo_credentials_aead_blob}` and close cleanly. On reject, close with code 4001 + reason "rejected".
 - Create: `internal/server/http/pending.go` — admin-only `POST /api/pending-hosts/{id}/accept` (atomically: mint bearer, AEAD-encrypt the admin-supplied repo creds (passed in the form), promote pending row → real `hosts` row, push `enrolled` to the open WS, audit-log) and `POST /api/pending-hosts/{id}/reject` (delete row + close socket).
 - Modify: server `main.go` route registration.
 - Test: integration test — fake agent opens pending WS, admin POST /accept, agent receives bearer.
 - [ ] **Step 9.1** Pending WS handler with nonce-sign verify.
 - [ ] **Step 9.2** Accept/reject endpoints. Accept reuses the existing token-consume path internally (mints persistent bearer from `crypto.RandomToken`-style helper, inserts host row + `host_credentials`).
 - [ ] **Step 9.3** Tests.
 - [ ] **Step 9.4** Commit.
 ## Task 10 — P2-18c: Agent announce path
 **Files:**
 - Modify: `cmd/agent/main.go` — when `RM_TOKEN` is unset, switch to announce mode instead of erroring out. `RM_SERVER` still required.
 - Create: `internal/agent/announce/announce.go` — generate-or-load Ed25519 keypair (persisted as a file alongside `secrets.enc`, mode 0600). POST `/api/agents/announce`. Open `/ws/agent/pending`. Wait. On `enrolled` message, persist bearer to `agent.yaml`, persist repo creds via existing secrets store, exit announce mode and reconnect via the normal WS path.
 - Modify: `deploy/install/install.sh` — when `RM_TOKEN` is missing, run agent in announce mode and `journalctl --follow` until the agent prints the fingerprint, print it to the operator's terminal in a large, copy-friendly format, then keep following until enrolled.
 - Test: end-to-end test in `internal/server/...` using a fake agent.
 - [ ] **Step 10.1** Keypair generation + persistence.
 - [ ] **Step 10.2** Announce client + pending WS client; print `SHA256:…` fingerprint to stdout in a banner.
 - [ ] **Step 10.3** Install script branch.
 - [ ] **Step 10.4** Playwright: register a host via announce mode (run agent locally with no RM_TOKEN), log into UI, see Pending hosts panel with the fingerprint, click Accept, confirm host appears.
 - [ ] **Step 10.5** Commit.
 ## Task 11 — P2-18d: Pending hosts UI panel
 **Files:**
 - Modify: `web/templates/pages/dashboard.html` — add Pending hosts panel above the host list when any pending rows exist.
 - Modify: dashboard handler — `Store.ListPendingHosts(now)` (auto-skips expired).
 - Add buttons → POST `/api/pending-hosts/{id}/accept` and `/reject` via HTMX.
 - Background sweeper for `DeleteExpiredPendingHosts` every 60s (mirror the existing offline-sweeper goroutine pattern).
 - [ ] **Step 11.1** Sweeper goroutine.
 - [ ] **Step 11.2** Dashboard handler + template.
 - [ ] **Step 11.3** Accept form must include the same repo URL/user/pw fields as the token-mint form (admin still supplies repo creds at accept time).
 - [ ] **Step 11.4** Playwright sweep.
 - [ ] **Step 11.5** Commit.
 ## Task 12 — P2-16: Windows service integration
 **Decision:** Cannot test on Windows from WSL. Goal is a clean compile under `GOOS=windows GOARCH=amd64` and code that follows the canonical `golang.org/x/sys/windows/svc/example` pattern. Untestable beyond compile + manual review; mark in commit message.
 **Files:**
 - Create: `internal/agent/service/service_windows.go` (build tag `//go:build windows`) — implements `svc.Handler`. `Execute` starts the agent's main loop in a goroutine, listens for `svc.Stop`/`svc.Shutdown`, cancels ctx, waits.
 - Create: `internal/agent/service/service_other.go` (build tag `//go:build !windows`) — stub `RunService` that just runs the agent loop in the foreground.
 - Create: `internal/agent/service/install_windows.go` — `Install`, `Uninstall`, `Start`, `Stop` thin wrappers around `mgr` package.
 - Modify: `cmd/agent/main.go` — sub-commands: `install`, `uninstall`, `start`, `stop`, `run` (default). `run` delegates to `service.Run()` which on Windows checks `svc.IsWindowsService()` and dispatches accordingly.
 - Test: `internal/agent/service/service_windows_test.go` (build-tagged) for argv parsing only — actual SCM interaction can't be tested in CI.
 - [ ] **Step 12.1** Implement the svc.Handler shell.
 - [ ] **Step 12.2** Install/uninstall wrappers (use `mgr.ConnectLocal()`, `m.CreateService(name, exepath, mgr.Config{...}, "run")`).
 - [ ] **Step 12.3** Cross-compile check: `GOOS=windows GOARCH=amd64 go build ./cmd/agent` must succeed.
 - [ ] **Step 12.4** Commit with note "untested on Windows; compile-verified only".
 ## Task 13 — P2-17: install.ps1
 **Files:**
 - Create: `deploy/install/install.ps1` — PowerShell 5.1+ compatible. Checks admin elevation. Downloads agent binary from `$RM_SERVER/agent/binary?os=windows&arch=amd64`. Drops it at `C:\Program Files\restic-manager\restic-manager-agent.exe`. Runs `restic-manager-agent.exe install` (registers service). Starts it. Detects existing tasks named `*restic*` via `Get-ScheduledTask` and prints them — does not auto-disable. Writes `C:\ProgramData\restic-manager\agent.yaml` with `RM_SERVER` + `RM_TOKEN` (or no token if announce-mode).
 - Modify: `internal/server/http/install.go` (or wherever install scripts are served) to also serve `/install/install.ps1`.
 - Modify: CLAUDE.md restage block to also stage `install.ps1`.
 - [ ] **Step 13.1** Write the script.
 - [ ] **Step 13.2** Wire serving + restage.
 - [ ] **Step 13.3** Smoke parse (requires pwsh on PATH): `pwsh -NoProfile -Command "Get-Command -Syntax (Get-ChildItem deploy/install/install.ps1)"`, or alternatively a parse-only check (`Set-StrictMode`-style) via `pwsh -c "$null = [scriptblock]::Create((Get-Content deploy/install/install.ps1 -Raw))"`. Skip if no pwsh available — note in commit.
 - [ ] **Step 13.4** Commit.
 ## Task 14 — Final integration sweep
 - [ ] **Step 14.1** `go vet ./... && go test ./... -race`. Full build. Restage. Restart server.
 - [ ] **Step 14.2** Playwright walkthrough on `:8080`: login → dashboard shows pending-hosts empty state → create source group → set a `pre_hook` → Run-now with bandwidth override → confirm hook fires + bandwidth applied → schedules tab shows next/last → repo page shows init-OK line → re-init flow gated by typed hostname.
 - [ ] **Step 14.3** Update `tasks.md`: tick P2R-09, P2R-10, P2R-11, P2R-12, P2R-13, P2R-14, P2-16, P2-17, P2-18 done. Update Phase 2 acceptance line items as satisfied.
 - [ ] **Step 14.4** Open PR `p2-completion → main` with a summary of every item closed.
 ---
 ## Decisions made on the operator's behalf (away)
 1. **Bandwidth UI for per-job override:** small `<details>` disclosure under each Run-now button. Simpler than a modal; matches the rest of the app's progressive-disclosure style.
 2. **Re-init UX:** server dispatches a fresh `init` job; if restic refuses because the repo already exists, surfaces the error in the job log and instructs the operator to clear the remote bucket. We don't try to forcibly wipe — too dangerous, and the agent doesn't have credentials to wipe S3/B2/etc generically.
 3. **Hooks editor lives on the Repo page (host defaults) + on the source-group edit form (per-group override).** Skips inventing a new "Settings" tab since that surface is still inert.
 4. **Announce flow:** admin still supplies repo creds at accept time (same form as the token-mint flow). The pending row only carries identity-of-the-endpoint material, never repo creds.
 5. **Windows service:** compile-verified only; untested. Commit message will say so.
@@ -1,131 +0,0 @@
 # P5-03 implementation plan — Docker-only release
 Spec: `docs/superpowers/specs/2026-05-05-p5-03-docker-only-release.md`.
 Branch: `p5-03-docker-release`. Do not auto-open a PR (see CLAUDE.md
 memory: CI runs are expensive on the self-hosted cluster).
 ---
 ## Slice 1 — Server config + handler fallback
 **Goal:** server can serve agent binaries / install scripts from a
 read-only "bundled assets" path when `<DataDir>` doesn't have them.
 1. `internal/server/config/config.go` (or wherever `Cfg` lives) gains
   a `BundledAssetsDir string` field, defaulting to
   `/opt/restic-manager/dist`. Wire from `RM_BUNDLED_ASSETS_DIR` env
   var, mirroring the existing env-var conventions.
 2. `internal/server/http/agent_assets.go`:
   - `handleAgentBinary`: try `<DataDir>/agent-binaries/<name>`
     first; on `os.Stat` ENOENT, try
     `<BundledAssetsDir>/agent-binaries/<name>`; on second ENOENT,
     existing 404.
   - `handleInstallAsset`: same dual-path, with `install/` subpath.
 3. Tests in `internal/server/http/agent_assets_test.go` (new file):
   - DataDir hit serves DataDir bytes.
   - DataDir miss + bundled hit serves bundled bytes.
   - DataDir hit shadows bundled.
   - Both miss → 404 + existing error envelope.
   - Path-traversal still rejected for `install/*` (regression check).
 **Verify:** `go vet ./...` + `go test ./internal/server/http/...`.
 ---
 ## Slice 2 — Version ldflags on both binaries
 1. `cmd/server/main.go`: keep `var version`, add
   `var commit = "none"` and `var date = "unknown"`. Surface via
   existing version-log line.
 2. `cmd/agent/main.go`: same three vars. Agent already reports
   `agent_version` in the WS hello — extend to include commit if
   it's already plumbed through `internal/api`; otherwise leave the
   commit out of the wire and just log it on startup.
 3. `Makefile`: extend the `make build` `-ldflags` to set all three
   from `git describe --tags --always` + `git rev-parse HEAD` +
   UTC timestamp. Source-build users get real values, not "dev".
 4. `deploy/Dockerfile.server`: add `ARG COMMIT=none` and
   `ARG DATE=unknown`; pass through `-ldflags`.
 **Verify:** `make build && ./bin/restic-manager-server -version`
 (or whatever the existing flag is) prints non-`dev` values.
 ---
 ## Slice 3 — Dockerfile bakes agents + install assets
 1. Build stage cross-compiles three agents:
   ```dockerfile
   RUN go build -trimpath -ldflags="-s -w \
         -X main.version=${VERSION} -X main.commit=${COMMIT} -X main.date=${DATE}" \
       -o /out/agent/restic-manager-agent-linux-amd64 ./cmd/agent
   ENV GOARCH=arm64
   RUN go build ... -o /out/agent/restic-manager-agent-linux-arm64 ./cmd/agent
   ENV GOOS=windows GOARCH=amd64
   RUN go build ... -o /out/agent/restic-manager-agent-windows-amd64.exe ./cmd/agent
   ```
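   (Reset `GOOS`/`GOARCH` between layers via `ENV` — including an explicit
   `ENV GOOS=linux GOARCH=amd64` before the first agent build, so that it
   does not inherit `$TARGETARCH` from the server build. Server build
   stays at `GOOS=linux GOARCH=$TARGETARCH`.)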
 2. Final stage `COPY --from=build`:
   - `/out/restic-manager-server` → `/usr/local/bin/`
   - `/out/agent/*` → `/opt/restic-manager/dist/agent-binaries/`
   - `deploy/install/install.sh` →
     `/opt/restic-manager/dist/install/install.sh`
   - `deploy/install/install.ps1` →
     `/opt/restic-manager/dist/install/install.ps1`
   - `deploy/install/restic-manager-agent.service` →
     `/opt/restic-manager/dist/install/restic-manager-agent.service`
 3. Set `--chmod=0755` on the agent binaries and `install.sh`,
   `--chmod=0644` on the unit file and `install.ps1`. Distroless
   final stage runs as `nonroot`; bundled assets are readable by
   anyone (mode `o+r`), so the user switch doesn't break reads.
 **Verify:**
 ```sh
 docker build -f deploy/Dockerfile.server -t rm:dev .
 docker run --rm -d -p 18080:8080 \
    -e RM_LISTEN=:8080 -e RM_DATA_DIR=/data \
    -e RM_BASE_URL=http://127.0.0.1:18080 \
    -v rm-test:/data rm:dev
 curl -fsSL "http://127.0.0.1:18080/agent/binary?os=linux&arch=amd64" | wc -c
 curl -fsSL "http://127.0.0.1:18080/install/install.sh" | head -1
 ```
 Both should succeed against a fresh volume (no operator staging).
 ---
 ## Slice 4 — Release workflow
 `.gitea/workflows/release.yml` per the spec. Two jobs:
 1. **`image`**: checkout → setup-qemu → setup-buildx → login → compute
   tags → buildx build+push.
 2. (Future) `release-notes`: stub left as a TODO comment for now.
   Operator can hand-write release notes via the Gitea UI on first
   cut.
 The `compute tags` shell step is the only non-trivial bit; tested
 inline by running the script with mocked `GITHUB_REF_TYPE` /
 `GITHUB_REF_NAME` env vars before committing.
 **Verify on first dispatch:** trigger `workflow_dispatch` from the
 Gitea UI, check the runner produces `:snapshot-<sha>` and pushes
 multi-arch.
 ---
 ## Slice 5 — Tasks.md + commit + push
 1. `tasks.md`: tick P5-03; add a one-line note that goreleaser was
   dropped in favour of Docker-only after a 2026-05-05 design pass
   (link the spec).
 2. `git add -A && git commit -m "p5-03: docker-only release path"`
   (no Co-Authored-By trailer — CLAUDE.md rule).
 3. `git push -u origin p5-03-docker-release`.
 4. **Stop.** Do not open a PR. Wait for operator review.
@@ -1,473 +0,0 @@
 # P3 — Alerts (design)
 > Phase 3 sub-spec covering the alerts engine, notification channels, and UI
 > (P3-05 / P3-06 / P3-07).
 >
 > Wireframe: `_diag/p3-alerts-wireframe/wireframe.html`. Screenshots in the
 > same directory. Spec brainstorm ran 2026-05-04; user approved all ten
 > design decisions before this spec was written.
 ## Scope locked
 Brainstorm decisions (in order asked):
 1. **Rule model.** Hardcoded rule set, no operator-tunable thresholds in v1.
   The engine knows about each rule type internally; per-rule config can land
   later if/when an operator asks.
 2. **Rule set.** Six rules: `backup_failed`, `forget_failed`, `prune_failed`,
   `check_failed`, `stale_schedule`, `agent_offline`.
 3. **Engine cadence.** Hybrid. Event hooks at the existing
   `MarkJobFinished` and offline-sweeper sites for the immediate triggers;
   one 60-second ticker handles stale-schedule detection and auto-resolution.
 4. **Resolution.** Auto-resolve when the underlying condition clears + manual
   Resolve at any time. Acknowledge is a separate "I've seen it" intermediate
   state that does NOT close the alert.
 5. **v1 channels.** Webhook + native ntfy + SMTP. Apprise deferred (the
   channel plumbing accepts new kinds without reshaping). SMTP added as
   a first-class channel post-brainstorm because the use case — overnight
   alerts the operator wants to read in the morning rather than be pinged
   about at 03:00 — is poorly served by ntfy's push model and clumsy via
   webhook → email-gateway.
 6. **Channel scope.** Global only. No per-host or per-severity routing in v1.
 7. **Notification body.** Structured JSON for webhooks, formatted
   title+body+click-URL for ntfy, plus a per-channel "Send test notification"
   button with inline result feedback.
 8. **Deduplication.** Open-alert uniqueness on `(host_id, kind)` with a
   `last_seen_at` bump on every confirming tick. One notification per
   occurrence; the UI shows "still happening · Ns ago" while a rule keeps
   matching.
 9. **Alert UI.** Top-level `/alerts` page (the existing nav stub becomes
   real). Per-host vitals "Open alerts" cell links to `/alerts?host_id=...`.
   Channel CRUD lives at `/settings/notifications`.
 10. **Delivery semantics.** Best-effort fire-and-forget with a 5s timeout
    per notification. Failures are logged but not retried. The alert row in
    the DB is the source of truth.
 ## Architecture
 The subsystem is three loosely-coupled units behind one `AlertEngine`
 goroutine:
 ```
                                 ┌───────────────────────────┐
   event hooks ─────────────────►│                           │
                                 │   AlertEngine             │ ──► raise/resolve
   60s ticker ──────────────────►│   (rule evaluation)       │     alert row
                                 │                           │
                                 └────────────┬──────────────┘
                                              │
                                              ▼
                                  ┌──────────────────────┐
                                  │   notification.Hub   │
                                  │   (fire-and-forget)  │
                                  └──┬────────┬──────────┘
                                     │        │
                              ┌──────▼──┐  ┌──▼──────┐
                              │ Webhook │  │  Ntfy   │  …future channels
                              └─────────┘  └─────────┘
 ```
 ### Component boundaries
 | Component                                | Purpose                                                                                  | Depends on                             |
 | ---------------------------------------- | ---------------------------------------------------------------------------------------- | -------------------------------------- |
 | `internal/alert.Engine`                  | Owns the rule evaluation. Exposes `OnJobFinished`, `OnHostOffline`, `OnHostOnline` event hooks; runs a 60s ticker for stale-schedule + auto-resolution sweeps. Persists raises/resolves through the store. | store, notification.Hub, slog          |
 | `internal/alert.Rule` + per-rule files   | Each of the six rules is a small struct with `Kind() string`, `Severity() string`, `MessageFor(ctx) string`. The engine iterates over a registered slice. | store models                           |
 | `internal/notification.Hub`              | Receives "alert raised/resolved/test" events; fans out to enabled channels in parallel; logs results to a new `notification_log` table.        | store, channel adapters                |
 | `internal/notification.Channel` (iface)  | Single method `Send(ctx, payload) error` with a 5s context for HTTP channels, 10s for SMTP. Three impls in v1: `webhookChannel`, `ntfyChannel`, `smtpChannel`. | http.Client; net/smtp + crypto/tls for SMTP |
 | `internal/store/alerts.go`               | CRUD on `alerts` table: `RaiseOrTouch(host_id, kind, severity, message)`, `Acknowledge(id, user)`, `Resolve(id, by user)`, `AutoResolve(host_id, kind)`, `ListAlerts(filter)`, plus the `last_seen_at` bump. | sqlite                                 |
 | `internal/store/notification_channels.go` | CRUD on `notification_channels` (new table) + `notification_log` (new table).            | sqlite, crypto.AEAD (for secrets)      |
 | `internal/server/http/ui_alerts.go`      | `/alerts` page handler + filter parsing + ack/resolve form actions.                      | store                                  |
 | `internal/server/http/ui_notifications.go` | `/settings/notifications` page + channel CRUD + "Send test" handler.                   | store, notification.Hub                |
 ### Engine event shape
 The engine runs as one goroutine per server process started in
 `cmd/server/main.go`. It exposes a small set of channels other code writes to:
 ```go
 type Engine struct {
    store *store.Store
    hub   *notification.Hub
    // Event channels (buffered, drop-on-full with a slog warning to keep
    // hot paths non-blocking). The engine drains them on its own
    // goroutine, evaluates the rule, and acts.
    jobFinished chan jobFinishedEvent  // from store.MarkJobFinished hook
    hostOffline chan string            // host_id; from offline sweeper
    hostOnline  chan string            // host_id; from ws handler hello
    // 60s ticker drives stale-schedule + auto-resolution sweeps.
    tick *time.Ticker
 }
 ```
 The hot-path call sites (`store.MarkJobFinished`, `ws.handler` offline
 sweep, `ws.handler` hello) push to these channels via the tiny
 `Engine.On*` hook methods (`OnJobFinished`, `OnHostOffline`, `OnHostOnline`),
 each of which does a non-blocking send. The engine's own goroutine handles
 every match, which keeps mutation off the hot path.
 ### Rule catalogue
 | Kind                | Severity | Trigger                                                                 | Auto-resolve when                                  |
 | ------------------- | -------- | ----------------------------------------------------------------------- | -------------------------------------------------- |
 | `backup_failed`     | warning  | `MarkJobFinished` with kind=backup, status=failed                       | next backup for the same host succeeds             |
 | `forget_failed`     | warning  | `MarkJobFinished` with kind=forget, status=failed                       | next forget for the same host succeeds             |
 | `prune_failed`      | warning  | `MarkJobFinished` with kind=prune, status=failed                        | next prune for the same host succeeds              |
 | `check_failed`      | critical | `MarkJobFinished` with kind=check, status=failed OR errors_found        | next check for the same host succeeds without errors |
 | `stale_schedule`    | warning  | 60s ticker: a schedule's next-fire time is more than 5 minutes in the past with no matching job since | next job for that schedule succeeds OR schedule deleted |
 | `agent_offline`     | warning  | offline-sweeper marks the host offline AND the host has been offline > 15 min (engine checks `last_seen_at`) | hostOnline event for that host                     |
 The 15-minute floor on `agent_offline` exists so a 30-second blip during
 agent restart doesn't generate a notification storm. The store's existing
 offline sweeper (`hosts.last_seen_at` with 90s threshold) already marks the
 host offline; the engine sees the event but waits for the threshold before
 raising.
 ### Dedup + last_seen_at
 `store.RaiseOrTouch(host_id, kind, severity, message)`:
 ```sql
 SELECT id, last_seen_at FROM alerts
 WHERE host_id = ? AND kind = ? AND resolved_at IS NULL
 LIMIT 1;
 ```
 - Found: `UPDATE alerts SET last_seen_at = ?, message = ? WHERE id = ?`,
  return `(id, didRaise=false)`.
 - Not found: `INSERT INTO alerts (id, host_id, kind, severity, message,
  created_at, last_seen_at) VALUES (?, ?, ?, ?, ?, ?, ?)`, return
  `(id, didRaise=true)`.
 The engine fires a notification through the Hub only when `didRaise=true`.
 Touch-only events keep the row's `last_seen_at` fresh so the UI can render
 "still happening · Ns ago" without spamming the operator's phone.
 ### Notification payload shapes
 **Webhook** — a single JSON envelope per event:
 ```json
 {
  "event":     "alert.raised",
  "alert_id":  "01KQT...",
  "severity":  "warning",
  "kind":      "backup_failed",
  "host_id":   "01KQ...",
  "host_name": "alfa-01",
  "message":   "Backup 'system-config' failed: rest-server returned 401",
  "raised_at": "2026-05-04T15:42:01Z",
  "link":      "https://restic-manager.example/alerts/01KQT..."
 }
 ```
 `event` is one of `alert.raised | alert.acknowledged | alert.resolved |
 alert.test`. The same envelope shape is reused across events — operators
 build one bridge, switch on `event` and `severity`.
 **SMTP** — single-recipient plain-text email per channel. The channel
 config carries the SMTP server credentials and a `to` address; one
 channel = one recipient (or one distribution-list address). Operators
 who want multiple recipients add multiple channels — keeps the config
 flat and the failure modes per-recipient.
 Subject pattern is hardcoded (no per-channel template in v1):
 ```
 Subject: [restic-manager] [<severity>] <host_name>: <kind>
 From: <configured-from-address>
 To: <configured-to-address>
 Date: <RFC 5322>
 Message-ID: <alert_id@<server-host>>
 <message line — same string the webhook/ntfy gets>
 —
 Raised at: 2026-05-04T15:42:01Z
 Severity:  warning
 Host:      alfa-01
 Kind:      backup_failed
 Open in restic-manager:
 https://restic-manager.example/alerts/01KQT...
 (This message was sent by restic-manager. Acknowledge or resolve in the UI.)
 ```
 The body is plain text only in v1 — no HTML alternative — both because
 the data is already structured well enough as text and because HTML
 email opens a long tail of rendering / sanitisation concerns. The
 `Message-ID` includes the alert id so a thread-aware client can group
 related events (raised → acknowledged → resolved) together.
 Encryption:
 - **STARTTLS** (default, port 587). Opportunistic upgrade. This is what
  most operator-facing relays expect.
 - **Implicit TLS** (port 465). Connect-then-TLS-handshake.
 - **None** (port 25). Plain. Hidden behind a "Yes I understand" warning
  on the form because the password goes over the wire.
 Auth:
 - **PLAIN** (RFC 4616) over TLS. Default and almost always what's wanted.
 - **CRAM-MD5** (RFC 2195). Offered if the server advertises it, no UI
  toggle — automatic.
 - No OAuth2 / XOAUTH2 in v1; that's a real next step if Gmail-without-
  app-passwords becomes a recurring ask.
 Per-message timeout is 10s (vs 5s for HTTP channels) — STARTTLS
 handshake + DATA over a slow link can legitimately take that long.
 **Ntfy** — uses the standard publish format:
 ```
 POST /<topic> HTTP/1.1
 Host: <server>
 Authorization: Bearer <access-token>   (if configured)
 Title: [warning] alfa-01 backup failed
 Priority: 4
 Tags: warning,backup_failed
 Click: https://restic-manager.example/alerts/01KQT...
 Backup 'system-config' failed: rest-server returned 401
 ```
 Severity → priority mapping:
 | Severity  | Priority |
 | --------- | -------- |
 | info      | 3 (default) |
 | warning   | 4 (high)    |
 | critical  | 5 (urgent)  |
 Per-channel `default_priority` setting overrides for non-critical alerts;
 critical always goes urgent regardless.
 ### Test notification
 `POST /api/notifications/{channel_id}/test` builds a synthetic event
 (severity=info, kind=test_notification, message="Test from
 restic-manager", link to the channel's edit page) and runs it through the
 real send path. Returns `{ok: bool, latency_ms: int, status_code?: int,
 error?: string}`. UI renders the green ✓ / red ✗ feedback inline.
 ## Routes added
 | Method  | Path                                                  | Purpose                                                       |
 | ------- | ----------------------------------------------------- | ------------------------------------------------------------- |
 | GET     | `/alerts`                                             | Fleet alerts list with filters (`?status=open&severity=warning&host_id=...&q=...`) |
 | POST    | `/alerts/{id}/acknowledge`                            | Mark alert acknowledged (HTMX form)                           |
 | POST    | `/alerts/{id}/resolve`                                | Manual resolve (HTMX form)                                    |
 | GET     | `/settings/notifications`                             | Channel list page                                             |
 | GET     | `/settings/notifications/new`                         | Channel kind picker + empty form                              |
 | POST    | `/settings/notifications/new`                         | Validate + create + redirect                                  |
 | GET     | `/settings/notifications/{id}/edit`                   | Channel edit form                                             |
 | POST    | `/settings/notifications/{id}/edit`                   | Validate + update                                             |
 | POST    | `/settings/notifications/{id}/delete`                 | Delete channel (typed-confirm name in the form)               |
 | POST    | `/api/notifications/{id}/test`                        | Fire test notification, return JSON result                    |
 | GET     | `/api/alerts`                                         | JSON list (mirrors the UI filters) for future REST callers    |
 ## Data model
 ### Migration 0013 — alerts.last_seen_at
 ```sql
 ALTER TABLE alerts ADD COLUMN last_seen_at TEXT;
 UPDATE alerts SET last_seen_at = created_at WHERE last_seen_at IS NULL;
 ```
 Existing alerts (currently zero in production — nothing writes them yet)
 get `last_seen_at = created_at`. Column is nullable for forwards-compat
 with rows from the alert-engine-pre-bump period.
 ### Migration 0014 — notification_channels + notification_log
 ```sql
 CREATE TABLE notification_channels (
  id              TEXT PRIMARY KEY,
  kind            TEXT NOT NULL CHECK (kind IN ('webhook', 'ntfy', 'smtp')),
  name            TEXT NOT NULL,
  enabled         INTEGER NOT NULL DEFAULT 1 CHECK (enabled IN (0, 1)),
  config          BLOB NOT NULL,        -- AEAD-encrypted JSON; per-kind shape
  default_priority TEXT,                -- ntfy only; null for webhook + smtp
  created_at      TEXT NOT NULL,
  updated_at      TEXT NOT NULL,
  last_fired_at   TEXT
 );
 CREATE INDEX notification_channels_enabled ON notification_channels(enabled) WHERE enabled = 1;
 CREATE TABLE notification_log (
  id           TEXT PRIMARY KEY,
  channel_id   TEXT NOT NULL REFERENCES notification_channels(id) ON DELETE CASCADE,
  alert_id     TEXT REFERENCES alerts(id) ON DELETE SET NULL,
  event        TEXT NOT NULL,           -- alert.raised | alert.acknowledged | alert.resolved | alert.test
  ok           INTEGER NOT NULL CHECK (ok IN (0, 1)),
  status_code  INTEGER,
  latency_ms   INTEGER,
  error        TEXT,
  fired_at     TEXT NOT NULL
 );
 CREATE INDEX notification_log_channel ON notification_log(channel_id, fired_at DESC);
 CREATE INDEX notification_log_alert ON notification_log(alert_id);
 ```
 `config` is an AEAD-encrypted JSON blob — bearer tokens for webhooks,
 access tokens for ntfy, and SMTP passwords live there. Per-kind config shapes:
 ```go
 type webhookConfig struct {
    URL          string `json:"url"`
    BearerToken  string `json:"bearer_token,omitempty"`
    HeaderName   string `json:"header_name,omitempty"`
    HeaderValue  string `json:"header_value,omitempty"`
 }
 type ntfyConfig struct {
    ServerURL    string `json:"server_url"`     // default https://ntfy.sh
    Topic        string `json:"topic"`
    AccessToken  string `json:"access_token,omitempty"`
 }
 type smtpConfig struct {
    Host       string `json:"host"`         // e.g. smtp.example.com
    Port       int    `json:"port"`         // default 587 (STARTTLS), 465 (TLS), 25 (none)
    Encryption string `json:"encryption"`   // "starttls" | "tls" | "none"
    Username   string `json:"username"`
    Password   string `json:"password"`     // sensitive — AEAD-encrypted with the rest of config
    From       string `json:"from"`         // RFC 5322 address; "alerts@example.com" or "Restic-Manager <alerts@…>"
    To         string `json:"to"`           // single recipient or distribution-list address; v1 = one channel = one to-line
 }
 ```
 ### Engine state
 The engine itself is stateless beyond the channels it owns; all
 persisted state is in the existing `alerts` table + the new
 `notification_log` table. A process restart re-evaluates from scratch:
 on next tick the stale-schedule + auto-resolution sweeps catch up with
 whatever happened during the downtime. No outbox to drain.
 ## UI templates
 | Template                                  | Purpose                                                |
 | ----------------------------------------- | ------------------------------------------------------ |
 | `web/templates/pages/alerts.html`         | Fleet alerts page                                      |
 | `web/templates/partials/alert_row.html`   | One alert row (used by both list and detail-fragment swap) |
 | `web/templates/pages/settings.html`       | Settings shell with Notifications / Users / Auth sub-tabs |
 | `web/templates/pages/notifications.html`  | Channel list (Notifications sub-tab body)              |
 | `web/templates/pages/notification_edit.html` | Channel kind picker + per-kind form + test button + payload preview |
 | `web/templates/partials/crit_banner.html` | Dashboard top-of-page banner                           |
 | `web/templates/partials/nav.html`         | Existing — gains a `data-alerts-count` attribute on the Alerts tab so the badge auto-updates |
 The Settings shell + Notifications sub-tab is the new chrome the wireframe
 introduced; Users + Authentication tabs are placeholder links that 404 in
 v1 (or render an "Lands later" notice). Same pattern P2R-02 used for
 inert sub-tabs.
 ## Tests (target coverage)
 - `internal/alert/engine_test.go` — rule firing per kind: backup_failed
  raises on `MarkJobFinished(kind=backup, status=failed)`; touch-only on
  the second failure for the same host (no second notification);
  auto-resolve on next success.
 - `internal/alert/agent_offline_test.go` — `OnHostOffline` emits without
  raising until the 15-min floor; `OnHostOnline` clears the alert.
 - `internal/alert/stale_schedule_test.go` — synthetic schedule whose next
  fire is in the past triggers; resets when a job lands.
 - `internal/notification/webhook_test.go` — payload shape pinned;
  authorisation header sent when bearer set; custom header echoed; 5s
  timeout enforced; error in `notification_log`.
 - `internal/notification/ntfy_test.go` — title/priority/tags/click headers
  match the severity mapping; access token sent as `Authorization: Bearer
  <token>`; default priority overridden by severity for critical.
 - `internal/notification/smtp_test.go` — round-trip against a local
  `net/smtp.NewServer`-style fake (or `mhog`/MailHog if convenient):
  STARTTLS handshake completes against a self-signed cert; PLAIN auth
  uses configured creds; subject + from + to + body bytes match the
  spec'd format; Message-ID contains the alert id; 10s timeout enforced;
  failure path (auth refused) lands in `notification_log` with the
  server's error string.
 - `internal/server/http/ui_alerts_test.go` — page renders with filters
  applied; ack/resolve POSTs flip the row + write audit; HX-Redirect
  bounces back to the filtered list.
 - `internal/server/http/ui_notifications_test.go` — CRUD happy paths,
  validation re-render, secrets-encrypted-at-rest assertion (load row,
  decrypt, compare), test-button hits the real send path against a
  test http.Server.
 - Migration 0013 + 0014 round-trip tested via `store.Open` on a fresh
  db.
 ## Playwright sweep
 End-of-phase sweep mirrors the P2R-02 / P3-restore pattern:
 1. Login → `/alerts` (initially empty) → see "All clear · last alert
   never" empty state.
 2. Trigger a fake-failed-backup via `POST /api/hosts/{id}/jobs` against a
   host with a deliberately-wrong rest-server URL. Wait for the
   `backup_failed` alert to appear in the list within ~2s of the job
   finishing.
 3. Acknowledge → row tints + ack actor visible.
 4. Take the agent offline (`systemctl stop`); wait 15 min OR mock
   `last_seen_at` to 16 min ago via the test harness; confirm
   `agent_offline` alert raises once.
 5. Restart the agent → `agent_offline` auto-resolves; `backup_failed` is
   still open.
 6. Configure a webhook channel pointing at a local test sink; click "Send
   test" → green ✓.
 7. Configure a ntfy channel pointing at a local sink → click "Send test"
   → green ✓.
 8. Configure an SMTP channel pointing at a local MailHog (Docker, port
   1025, no TLS for the local-only sweep) → click "Send test" → green ✓
   → MailHog UI at :8025 shows the test email with the right subject
   and Message-ID.
 9. Trigger a fresh failed backup → all three channels receive the
   notification (verified from sink logs + MailHog inbox);
   `notification_log` has three rows `event=alert.raised, ok=true`.
 10. Manually Resolve the open `backup_failed`; confirm all three channels
    receive `event=alert.resolved`.
 11. Critical-severity test: trigger `check_failed` (mocked) → dashboard
    banner appears; clicking it lands on `/alerts?severity=critical&status=open`.
 12. Empty the alerts again → banner disappears.
 Screenshots into `_diag/p3-alerts-sweep/`. End-to-end clean, zero console
 errors, before handing back.
 ## What does NOT change
 - Existing chrome/templates beyond the small additions noted above.
 - Existing `alerts.severity` CHECK (`info`/`warning`/`critical`) — already
  the right shape; no migration needed for that.
 - Audit log writer pattern — engine writes audit rows for ack/resolve
  the same way every other state-changing handler does.
 - The agent. Alerts are entirely a server concern; the agent doesn't
  know they exist.
 ## Open questions / explicit non-goals
 - **Per-rule cooldowns / re-raise on long-running issues.** Out of scope
  (brainstorm question 8 ruled this out). Operators see "still happening"
  in the UI; they don't get a reminder ping.
 - **SMTP HTML emails.** v1 is plain text only — operators wanting rich
  rendering can deploy a webhook → mail-merge bridge, or wait for a v2
  template engine. The Message-ID threading + plain text body should be
  enough for almost every overnight-digest workflow.
 - **SMTP OAuth2 / XOAUTH2.** Out of scope. Gmail / Microsoft 365 with
  modern OAuth requires an `app password` workaround in v1. Native
  XOAUTH2 lands when an operator asks (or when Google starts refusing
  app passwords for non-business accounts in earnest).
 - **Multi-recipient SMTP channels.** A channel = one `To`. Operators
  wanting multiple recipients add multiple channels. Keeps failure
  attribution per-recipient.
 - **Apprise sidecar integration.** Deferred per brainstorm. The
  `Channel` interface accepts a fourth impl without reshaping when we get
  there.
 - **Per-host or per-severity channel routing.** Out of scope. Likely
  next step if operators ask: a `min_severity` field on the channel row.
 - **Snooze / mute.** Out of scope. Acknowledge is the closest analogue;
  full silence-windows would need a new table and is YAGNI for v1.
 - **PagerDuty / OpsGenie.** Both have webhook receivers; operators wire
  them via the webhook channel today.
 - **Alert "rules" UI.** No CRUD; the rule set is hardcoded.
@@ -1,342 +0,0 @@
 # P3 — Restore (design)
 > Phase 3 sub-spec covering single-host restore (P3-01, P3-02, P3-03, P3-09).
 > P3-04 (cross-host restore) is deferred to a new "Future / unscheduled"
 > section in `tasks.md` — disaster recovery is already covered by re-enrolling
 > a replacement host with the same repo credentials.
 >
 > Wireframe: `_diag/p3-restore-wizard/wireframe.html`. Screenshot:
 > `_diag/p3-restore-wizard/01-full-wizard.png`.
 ## Scope locked
 Brainstorm decisions (in order asked):
 1. **In-place vs new-directory.** Default is a new directory under
   `/var/restic-restore/<job-id>/`. A "Restore in place (overwrite original
   paths)" toggle is gated by typed-confirmation of the host name, mirroring
   the repo re-init pattern.
 2. **Path-selection granularity.** Tree browser as the path selector, lazy-
   loaded via `restic ls --json <snapshot> <path>` per directory expansion.
 3. **Cross-host restore (P3-04).** Out of scope this phase. Move to
   "Future / unscheduled" in `tasks.md`. The disaster-recovery case is covered
   by the standard enrolment flow: stand up a replacement host, paste the
   original repo creds at enrolment, snapshots reappear, restore is
   same-host.
 4. **Snapshot diff (P3-09).** Diff-as-a-job. New `JobDiff` JobKind dispatched
   like every other agent operation. Output streams as `log.stream` and
   renders on the live job log page.
 5. **Wizard entry points.** Top-level "Restore" button on host detail
   (`/hosts/{id}/restore`, opens wizard at step 1) plus a per-snapshot
   Restore action on snapshot rows (`/hosts/{id}/snapshots/{sid}/restore`,
   skips step 1).
 6. **Wizard interaction model.** Single-page, sections progressively enable;
   tree-browser nodes lazy-load via HTMX partials. No `restore_drafts` table.
 7. **Tree-browser data path.** Synchronous WS RPC (`tree.list` ↔
   `tree.list.result`, correlation-ID) plus a per-wizard-session in-memory
   cache keyed by `{snapshot_id, path}` with ~30-min TTL.
 8. **Restore progress UI.** Restore-specific job-page variant: files-restored
   / bytes-restored / throughput / ETA / current-file display, driven by
   restic restore's JSON status events surfaced through `job.progress`.
 9. **Permissions/ownership.** Policy, not toggle. In-place restore preserves
   original ownership; new-directory restore drops ownership
   (`--no-ownership`).
 10. **Concurrency.** Single-flight per host (one job at a time across all
    kinds). Plus a real cancel-job feature: `command.cancel` envelope, agent
    kills the `restic` subprocess via context cancel (SIGTERM, SIGKILL after
    grace), server transitions the job to `cancelled`. The "Cancel" button
    already in the `job_detail` template becomes real for any running job
    kind.
 11. **Audit + safety.** Audit row on every restore dispatch (`host.restore`
    with snapshot ID, paths, target, in-place flag). Recent-restores panel
    on the host page surfacing the latest restore job alongside last-backup
    and last-init signals. Role gate deferred to P4-03.
 ## Architecture
 Restore composes from existing primitives plus three new pieces:
 - **New JobKind values**: `JobRestore`, `JobDiff`. Dispatcher cases mirror
  the prune/check pattern. Agent-side handlers wrap `restic.RunRestore` and
  `restic.RunDiff` (new methods on the `restic` package).
 - **New WS RPC**: `tree.list` request (`{snapshot_id, path}`) ↔
  `tree.list.result` reply (`{entries: [{name, type, size}], ...}` or
  `{error}`). Reuses existing correlation-ID infrastructure from P1-09. No
  `jobs` row.
 - **New cancel surface**: `command.cancel` request (`{job_id}`), agent
  cancels the running subprocess context, returns `command.ack` + `job.finished`
  with status `cancelled`. Server endpoint `POST /api/jobs/{id}/cancel`
  bridges UI button → WS envelope.
 Everything else (job lifecycle, log streaming, progress envelope, snapshot
 listing, audit log writer, host_chrome partial, danger-zone typed-confirmation)
 already exists and is reused verbatim.
 ### Component boundaries
 | Component                          | Purpose                                              | Depends on                                |
 | ---------------------------------- | ---------------------------------------------------- | ----------------------------------------- |
 | `internal/restic.RunRestore`        | Run `restic restore` with paths + target + ownership | `restic.Env`                              |
 | `internal/restic.RunDiff`           | Run `restic diff --json a b`                         | `restic.Env`                              |
 | `internal/agent/runner` cases       | Dispatch `JobRestore` / `JobDiff` jobs               | `restic.Run*`, hooks (skipped: backup-only) |
 | `internal/agent/runner` cancel hook | Wire WS `command.cancel` → ctx.CancelFunc per job   | runner job map                            |
 | `internal/agent/runner` tree-list   | Sync RPC handler: `restic ls --json` for one path   | `restic.Env`                              |
 | `internal/server/ws/cancel.go`      | Validate + send `command.cancel` envelope            | hub.Send, store.UpdateJobStatus           |
 | `internal/server/ws/tree.go`        | RPC mediator: `tree.list` request → reply, with cache | hub.SendRPC, in-memory cache              |
 | `internal/server/http/restore.go`   | Wizard routes + dispatch endpoint                    | store, ws, audit                          |
 | `internal/server/http/diff.go`      | Snapshot-diff dispatch endpoint                      | store, ws                                 |
 | `internal/server/http/cancel.go`    | `POST /api/jobs/{id}/cancel`                         | ws                                        |
 | `web/templates/pages/host_restore.html` | Wizard page                                      | host_chrome partial                       |
 | `web/templates/partials/tree_node.html` | Lazy-loaded tree node fragment for HTMX swap     | —                                         |
 | `web/templates/pages/job_detail.html` | Restore-kind progress widget (variant)             | existing job_detail                       |
 ### Data flow — wizard happy path
 ```
 operator
  ├─ GET /hosts/{id}/restore
  │     server renders wizard shell, snapshot table from store.ListSnapshotsByHost
  │
  ├─ click snapshot row (or arrives via /hosts/{id}/snapshots/{sid}/restore)
  │     wizard advances to step 2, snapshot summary card rendered
  │
  ├─ expand a tree node (chevron click)
  │     HTMX GET /hosts/{id}/restore/tree?snapshot={sid}&path=/etc
  │       server checks per-session cache (keyed by sid+path)
  │         hit  → render tree_node fragment from cache
  │         miss → hub.SendRPC(host_id, "tree.list", {sid, path}) → wait reply
  │                cache result, render tree_node fragment
  │
  ├─ tick file/dir checkboxes (form state, no round-trip)
  │
  ├─ pick target radio (and optionally type host name to unlock in-place)
  │
  └─ POST /hosts/{id}/restore  (form submit)
        server validates: ≥1 path, target mode, in-place ⇒ host name match
        write audit row host.restore
        store.CreateJob{kind=restore, payload={snapshot_id, paths, target, in_place}}
        hub.Send(host_id, "command.run", {job_id, kind=restore, payload})
        HX-Redirect: /jobs/{job_id}
 ```
 ### Data flow — agent restore execution
 ```
 agent.runner receives command.run kind=restore
  ├─ check single-flight: if r.activeJobID != "" → reply busy
  │   (server queues to pending_runs only for kind=backup; restore returns busy)
  ├─ allocate ctx, ctxCancel — store cancelFunc against job_id in r.cancels
  ├─ sendStarted(job_id, JobRestore, now)
  ├─ build target path: if in_place → "/" else "/var/restic-restore/<job_id>/"
  ├─ build flags: paths from payload, --no-ownership when !in_place
  ├─ restic.RunRestore(ctx, env, snapshot_id, paths, target, in_place):
  │   restic restore <sid> --target <path> [--no-ownership] --include <p1> --include <p2> ...
  │   parse stdout JSON: forward "status" → job.progress (1Hz throttle), "summary" → final
  ├─ on success: sendFinished(job_id, succeeded, exit=0)
  ├─ on ctx.Err() == context.Canceled: sendFinished(job_id, cancelled, exit=130)
  └─ delete cancel func from r.cancels
 ```
 ### Data flow — cancel
 ```
 operator clicks Cancel on /jobs/{id} (running)
  POST /api/jobs/{id}/cancel
    server: lookup job, ensure status=running, find host
    hub.Send(host_id, "command.cancel", {job_id})
  → agent.runner receives command.cancel
       cancelFunc, ok := r.cancels[job_id]
       ok && cancelFunc()
       → restic subprocess context done → exec.Cmd kills via SIGTERM
       → if still alive after 5s grace → SIGKILL
       → runner sendFinished(job_id, cancelled, exit=130)
  → server receives job.finished status=cancelled, persists, broadcasts
  → browser refresh shows cancelled state
 ```
 The cancel surface is independently useful for any kind (prune/check/backup) —
 not gated to restore. The button already in `job_detail.html` becomes real.
 ### Tree-list RPC details
 New WS message types (added to `internal/api/messages.go`):
 ```
 type TreeListRequestPayload struct {
    SnapshotID string `json:"snapshot_id"`
    Path       string `json:"path"`
 }
 type TreeListEntry struct {
    Name string `json:"name"`
    Type string `json:"type"`        // "dir" | "file" | "symlink"
    Size int64  `json:"size,omitempty"`
 }
 type TreeListResultPayload struct {
    SnapshotID string          `json:"snapshot_id"`
    Path       string          `json:"path"`
    Entries    []TreeListEntry `json:"entries,omitempty"`
    Error      string          `json:"error,omitempty"`
 }
 ```
 Server-side mediator (`ws.SendRPC`) takes a request envelope, registers the
 correlation ID in a pending map, sends, blocks on a per-call channel until
 the matching reply arrives (or 30s timeout). The pattern is small enough
 to inline in `internal/server/ws/rpc.go` as a generic helper — future
 synchronous RPCs reuse it.
 In-memory cache: `map[sessionID]map[cacheKey]TreeListResultPayload` with
 `cacheKey = snapshot_id + "\x00" + path`. Session ID minted per wizard
 load (HTTP-only cookie scoped to `/hosts/{id}/restore/tree`, lifetime 30
 min). On wizard close (browser navigation away) the entry expires
 naturally. No persistence, no migration.
 Agent handler runs `restic ls --json <sid> <path>` (non-recursive — restic
 defaults to recursive but `restic ls` accepts `--long` and a path filter;
 parse output line-by-line and emit only direct children of `path`). 60s
 context timeout, mirroring existing `restic snapshots` invocation.
 ### Restore payload
 `api.CommandRunPayload` gains a nested optional `restore` field:
 ```
 type RestorePayload struct {
    SnapshotID    string   `json:"snapshot_id"`
    Paths         []string `json:"paths"`           // absolute paths inside the snapshot
    InPlace       bool     `json:"in_place"`
    TargetDir     string   `json:"target_dir"`      // empty when in_place=true
    PreserveOwner bool     `json:"preserve_owner"`  // mirrors policy: in_place=>true, else=>false
 }
 ```
 The payload is set by the server when dispatching `JobRestore` and ignored
 on every other kind. Wire-shape test pinned in `wire_test.go`.
 ### Diff payload
 `api.CommandRunPayload` gains:
 ```
 type DiffPayload struct {
    SnapshotA string `json:"snapshot_a"`
    SnapshotB string `json:"snapshot_b"`
 }
 ```
 Set on `JobDiff`. Output is plain `restic diff --json <a> <b>` forwarded as
 `log.stream` lines. Job page renders unchanged — operator reads the diff
 output directly.
 ### Recent-restores panel
 A small panel rendered on the host detail page below the existing init-status
 line:
 ```
 last restore: succeeded 2h ago · job f73ab4c1… · 3 files to /var/restic-restore/...
 ```
 Backed by `store.LatestJobByKind(host_id, JobRestore)` — the same
 `store.LatestJobByKind` query already used for init/forget/prune/check in
 P2R-06, called with the new kind. One template addition in
 `host_chrome.html` next to the
 `InitStatus` block.
 ## Routes added
 | Method  | Path                                                      | Purpose                                                     |
 | ------- | --------------------------------------------------------- | ----------------------------------------------------------- |
 | GET     | `/hosts/{id}/restore`                                     | Wizard shell (step 1 = snapshot picker)                     |
 | GET     | `/hosts/{id}/snapshots/{sid}/restore`                     | Wizard shell with snapshot pre-selected (skips step 1)      |
 | GET     | `/hosts/{id}/restore/tree`                                | HTMX partial: tree node listing for `?snapshot=&path=`      |
 | POST    | `/hosts/{id}/restore`                                     | Validate + dispatch restore job, redirect to live job page  |
 | POST    | `/api/hosts/{id}/snapshots/diff`                          | Dispatch a diff job for `{snapshot_a, snapshot_b}`          |
 | POST    | `/api/jobs/{id}/cancel`                                   | Send `command.cancel` to host, transition job → cancelled   |
 ## Migrations
 None. Restore + diff piggyback on the existing `jobs` table (their `kind` is
 new but the schema already accepts arbitrary kind strings — there's no
 CHECK constraint on `kind`). The cancel feature uses the existing
 `JobCancelled` terminal status. The tree-list cache lives in process memory.
 ## Tests (target coverage)
 - `internal/restic/restore_test.go` — `RunRestore` invocation builds the
  expected argv (paths, --target, --no-ownership flag presence, in-place
  variant); JSON status parsing → `BackupStatus`-shaped progress envelopes.
 - `internal/restic/diff_test.go` — `RunDiff` argv shape and JSON forwarding.
 - `internal/agent/runner/restore_test.go` — happy path, cancel mid-run
  produces `cancelled` finished, in-place vs new-directory dispatch,
  single-flight rejects when another job is running.
 - `internal/agent/runner/tree_test.go` — `tree.list` handler returns
  direct children for a synthetic restic ls output, surfaces error on
  missing snapshot.
 - `internal/server/ws/rpc_test.go` — `SendRPC` correlation matching,
  timeout, concurrent calls.
 - `internal/server/http/restore_test.go` — wizard renders with snapshots,
  POST validates ≥1 path + in-place host-name match, audit row written,
  job dispatched with correct payload, in-place without typed-confirm
  re-renders form with input intact and an error.
 - `internal/server/http/diff_test.go` — POST dispatches `JobDiff`,
  snapshot IDs validated against the host's snapshot list.
 - `internal/server/http/cancel_test.go` — POST cancel happy path
  (running → cancelled), 4xx for non-running jobs, 4xx when host offline.
 - `internal/server/http/restore_e2e_test.go` — happy path: GET wizard,
  expand `/etc` (HTMX call returns expected fragment), submit, follow
  HX-Redirect to job page, see status.
 - `web/templates/pages/host_restore_test.go` (template-render test) —
  wizard renders all four sections; in-place card disabled until typed
  confirm.
 ## Playwright iteration / sweep
 A Playwright sweep at the end (mirroring P2R-02 Slice 6) runs against the
 local smoke server with a real agent enrolled. Steps:
 1. Login → navigate to alfa-01 host → click Restore.
 2. Wizard step 1: pick the most recent snapshot.
 3. Wizard step 2: expand a directory two levels, tick three files,
   verify tally updates.
 4. Wizard step 3: leave default new-directory.
 5. Wizard step 4: dispatch.
 6. Land on live job page, see progress widget animating, see log lines.
 7. Click Cancel mid-flight, verify status transitions to cancelled and
   the agent's subprocess actually died (log line `signal: killed` or exit
   130).
 8. Repeat with in-place mode: type host name, dispatch, verify red
   primary button, verify files actually overwritten on host.
 9. Snapshot diff: navigate to snapshots, pick two, dispatch diff, see
   diff output streamed.
 10. Screenshots into `_diag/p3-restore-sweep/`.
 End-to-end clean, zero console errors, before handing back.
 ## What does NOT change
 - `host_chrome.html` only grows the recent-restores line; sub-tab list
  unchanged (Restore is a top-level button on the host page, not a sub-tab).
 - `enrollment.go`, schedule reconciliation, source-group CRUD, repo
  maintenance ticker, hook execution — none of these are touched.
 - The CLAUDE.md restage block applies as-is when the agent binary changes
  (it does — runner gains restore/diff/cancel/tree handlers). The unit
  file does not change.
 ## Open questions / explicit non-goals
 - **Restore preview / dry-run.** Out of scope. (`restic restore --dry-run`
  exists only in restic 0.17 and later, so it cannot be assumed on every host.)
 - **Resumable restore.** Restic restore is idempotent per-file but not
  resumable mid-stream from where it left off. If a restore is cancelled,
  the operator re-runs (files already written are overwritten). No state
  to track.
 - **Restore to a glob/pattern (e.g. `*.conf`).** Out of scope; the tree
  picker requires explicit ticks. Power users can edit the URL or use the
  CLI.
 - **Bandwidth caps for restore.** Honoured automatically — restic's
  `--limit-download` is part of `restic.Env` already (P2R-13) and applies
  to restore unchanged.
 - **Pre/post hooks for restore.** Hooks today gate only `kind=backup`
  (P2R-11). Out of scope.
@@ -1,340 +0,0 @@
 # P4-03 / P4-04 — RBAC + User Management Design
 > **Date:** 2026-05-05
 > **Status:** brainstorm complete; ready for plan
 > **Closes:** P4-03 (RBAC enforcement at API layer), P4-04 (User management UI)
 ## Goal
 Enforce role-based access control at the HTTP layer (currently every authenticated user has admin powers) and ship the operator-facing screens for managing users, roles, and password lifecycle.
 ## Architecture
 Two coupled subsystems landing in one PR:
 1. **RBAC enforcement** — chi route-group middleware that gates each subtree by minimum role. Fail-closed default (admin) so a forgotten declaration doesn't accidentally widen access.
 2. **User management** — `/settings/users` sub-tab with list / add / edit / disable. Setup-link flow for new users (1-hour-expiry single-use token). Self-service password change at `/settings/account`.
 The audit log already records actor + user_id on every mutation; new endpoints fold in naturally.
 ## Role taxonomy
 Locked. Three roles, hierarchical (admin ⊇ operator ⊇ viewer):
 | Action | admin | operator | viewer |
 |---|:-:|:-:|:-:|
 | View dashboard / alerts / audit / hosts | ✓ | ✓ | ✓ |
 | Trigger Run-now / Restore / Snapshot diff | ✓ | ✓ | ✗ |
 | Acknowledge / resolve alerts | ✓ | ✓ | ✗ |
 | Edit schedules / source groups / retention / hooks | ✓ | ✓ | ✗ |
 | Add / remove hosts (enrolment, accept/reject pending) | ✓ | ✓ | ✗ |
 | Cancel running jobs | ✓ | ✓ | ✗ |
 | Edit repo credentials | ✓ | ✓ | ✗ |
 | Edit notification channels | ✓ | ✗ | ✗ |
 | Manage users | ✓ | ✗ | ✗ |
 | Self password change (`/settings/account`) | ✓ | ✓ | ✓ |
 The role enum already exists in the schema (`CHECK (role IN ('admin','operator','viewer'))`) and in `internal/store/types.go`. Bootstrap creates the first user as admin. Zero migration needed for existing installs.
 ## Schema changes
 All column-level ALTERs (CLAUDE.md prefers these over rebuilds; safe under `foreign_keys=ON`).
 ### Migration 0017 — `users` extensions
 ```sql
 ALTER TABLE users ADD COLUMN email TEXT;
 ALTER TABLE users ADD COLUMN disabled_at TEXT;
 ALTER TABLE users ADD COLUMN must_change_password INTEGER NOT NULL DEFAULT 0;
 -- Username case-insensitive lookup. Existing rows are kept as-is;
 -- normalisation only applies to new INSERTs (handled in Go).
 CREATE UNIQUE INDEX users_username_lower ON users(LOWER(username));
 ```
 ### Migration 0018 — `user_setup_tokens`
 ```sql
 CREATE TABLE user_setup_tokens (
  user_id     TEXT PRIMARY KEY REFERENCES users(id) ON DELETE CASCADE,
  token_hash  TEXT NOT NULL,           -- sha256(raw_token), hex
  expires_at  TEXT NOT NULL,
  created_at  TEXT NOT NULL,
  created_by  TEXT REFERENCES users(id) ON DELETE SET NULL  -- nullable: SET NULL cannot fire on a NOT NULL column
 );
 CREATE INDEX user_setup_tokens_expires ON user_setup_tokens(expires_at);
 ```
 `user_id` is PRIMARY KEY, not just FOREIGN KEY — only one outstanding setup token per user. Regenerating supersedes the old via `INSERT OR REPLACE`.
 ## RBAC enforcement
 ### Middleware
 ```go
 // requireRole returns chi middleware that 403s any request whose
 // session-resolved user doesn't meet the minimum role. Roles are
 // hierarchical: admin > operator > viewer.
 func (s *Server) requireRole(min store.Role) func(http.Handler) http.Handler
 ```
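 Hierarchy implemented as a small helper. A role missing from the map ranks 0, so `min` must always be one of the three known roles — an unrecognised `min` would compare `>= 0` and let every caller through, defeating the fail-closed default: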
 ```go
 func roleAtLeast(have, min store.Role) bool {
    rank := map[store.Role]int{
        store.RoleViewer:   1,
        store.RoleOperator: 2,
        store.RoleAdmin:    3,
    }
    return rank[have] >= rank[min]
 }
 ```
 ### Route grouping in `server.go`
 The existing `/api` and UI routes get re-grouped into three role bands plus a self-service group:
 ```
 /api/* viewer-readable    — GET endpoints anyone authenticated can hit
 /api/* operator+          — mutating endpoints up to host/source-group/schedule level
 /api/* admin-only         — /api/users/*, channel CRUD
 /api/account              — self-service password change
 /audit, /alerts, /hosts/{id}, etc.   — viewer
 /hosts/{id}/run, /alerts/{id}/ack    — operator
 /settings/users/*, /settings/notifications/* — admin
 /settings/account                    — viewer (any authenticated)
 ```
 Default at the bottom of `routes()` is admin (fail-closed). Any future endpoint that doesn't get explicitly placed lands in admin-only, surfacing the missing declaration as a permission error rather than a silent bypass.
 ### Per-handler nuance
 One case looks as if it might need a handler-level check on top of the route gate: `GET /settings/users/{id}/edit` is admin-only, while `POST /api/account/password` is open to any authenticated user (viewer and up). Because the two live on different routes, the split-by-route grouping already covers this; no per-handler overrides are expected in v1.
 ### Out of scope of role middleware
 - `/ws/agent` and `/api/agents/*` — agent bearer-token auth, separate chain
 - `/healthz` — unauthenticated
 - `/login`, `/logout`, `/bootstrap` — public
 ### 403 handling
 - JSON endpoints: `{"error":"forbidden","code":"insufficient_role"}` with HTTP 403
 - HTML endpoints: render a small "You don't have permission" panel inside the chrome (so the user keeps their nav and can move away), HTTP 403
 - **No audit row on 403** — too noisy with normal users hitting URLs they don't have access to
 ### Session re-validation
 Sessions need to honour `disabled_at` and current role on every request, not just at login. The session-validation middleware reads the user row each request (single PK lookup, fast in SQLite). If `disabled_at IS NOT NULL`, the session is invalidated and the request 401s. This makes "disable user" and "force logout" effectively immediate.
 Cost: one SELECT per authenticated request. SQLite handles this comfortably for the fleet sizes this codebase targets.
 ## Setup-token flow (replacing temp passwords)
 ### Add user
 1. Admin clicks **+ Add user** on `/settings/users`
 2. Form: username (required, lowercase-normalised), email (optional, validated), role (admin/operator/viewer)
 3. Server:
   - Validates username uniqueness (case-insensitive). On collision with a *disabled* user, return a 409 with `{"existing_user_id": "...", "disabled": true}` so the UI can pivot to a "re-enable existing user" prompt
   - On collision with an enabled user: 409 with a plain "username taken" error
   - Creates user row with `password_hash = ""`, `must_change_password = 1`, `disabled_at = NULL`
   - Generates 32 random bytes, hex-encodes → raw token (64 chars). Stores `sha256(token)` hex in `user_setup_tokens`. `expires_at = now + 1h`
   - Audit: `user.created`, payload `{"username": "...", "role": "...", "with_setup_token": true}`
 4. Server returns the admin to a one-time setup-link page: `/settings/users/{id}/setup-link`
   - Shows the URL `http(s)://<base>/setup?token=<raw>` with a Copy button
   - Countdown timer (live JS) showing time-to-expiry
   - Warning: "This is the only time you'll see this link. If you lose it, regenerate from the user edit page."
   - "Done" button → `/settings/users`
 The raw token is **never persisted** server-side. Lost tokens require regeneration.
 ### Setup landing page (public, no auth required)
 1. User clicks the link, lands on `/setup?token=<raw>`
 2. Server hashes the token, looks up `user_setup_tokens` row, validates `expires_at > now`
 3. On invalid / expired: render an error page with a "Contact your administrator" message. Audit: `user.setup_token.expired` (no actor).
 4. On valid: render a password-set form: `new password + confirm`. Submit:
   - Validates password meets policy (min 12 chars, no other constraints in v1 — same as bootstrap path)
   - Hashes via `auth.HashPassword` (existing helper)
   - Updates `users.password_hash`, sets `must_change_password = 0`
   - Deletes the `user_setup_tokens` row (single-use)
   - Logs the user in via the existing session helper
   - Audit: `user.setup_completed`, payload `{"user_id": "..."}`
   - Redirect to `/`
 ### Regenerate setup link (admin)
 `/settings/users/{id}/edit` shows a "Regenerate setup link" button when `must_change_password = 1`. Clicking it:
 1. Generates a new token + hash, INSERT OR REPLACE on `user_setup_tokens`
 2. Returns the admin to the same one-time link page as the add-user flow
 3. Audit: `user.setup_token.regenerated`
 ### Cleanup
 Expired tokens linger in the DB until cleaned. Add a cheap sweep on the existing maintenance ticker: `DELETE FROM user_setup_tokens WHERE expires_at < ?`. Runs at the same cadence as the alert engine tick (60s). No new ticker needed.
 ## Self-service password change
 `/settings/account`
 - Accessible to every authenticated user (any role)
 - Form: `current password + new password + confirm`
 - Server validates current password (re-uses login bcrypt comparison), updates hash, audits `user.password_changed`
 - Special case: if `must_change_password = 1`, the current-password field is hidden / not required (covers the legacy "admin reset password" path if we ever add one — current setup-token path doesn't use this)
 The bootstrap user's password change uses this same page (no special case for "first admin").
 ## User list / management UI
 ### `/settings/users` (admin-only)
 ```
 Settings · Users [3]
 ─────────────────────────────────────────────────
 [ + Add user ]                       [ ] Show disabled
 USERNAME       EMAIL              ROLE      LAST LOGIN     STATUS
 alice          alice@example.com  admin     2 mins ago     enabled
 bob            —                  operator  3 days ago     enabled
 charlie        c@example.com      viewer    never          setup pending  ← if has open setup token
 diane          d@example.com      operator  1 month ago    disabled       ← only when "Show disabled"
 Actions per row: Edit · (Re-enable | Disable)
 ```
 - "setup pending" badge for users with `must_change_password=1` — clicking the row goes to edit, which surfaces the regenerate-link button prominently
 - "Show disabled" is a checkbox querystring filter (`?show_disabled=1`)
 - Sort columns: clickable like the audit log (username, role, last_login). Reuse the same pattern (server-side sort + URL builder + glyph)
 ### `/settings/users/new` (admin-only)
 Single form: `username + email (optional) + role`. On submit → either landed on the setup-link page (success) or returned with an inline "username exists, re-enable existing?" panel (collision with disabled user) / red error (collision with enabled user).
 ### `/settings/users/{id}/edit` (admin-only)
 - Display-only block: id, created_at, last_login_at, status
 - **Editable**: email, role
 - **Buttons**:
  - "Regenerate setup link" — only when `must_change_password = 1`
  - "Disable user" — flips `disabled_at`; rejected if last enabled admin (server-side check). Confirmation modal with typed name to confirm.
  - "Re-enable user" — clears `disabled_at`. No confirmation.
  - "Force logout" — separate from disable; kills all of the user's sessions but keeps the user enabled. Useful for "I think Bob's session was hijacked" without locking him out.
 - Cancel / Save buttons at the bottom
 ### `/settings/users/{id}/setup-link` (admin-only)
 Renders the one-time link with copy button + countdown. Shown after add-user and after regenerate. Reload of this URL after the token is consumed: 410 Gone with a clear message.
 ### `/settings/account` (any authenticated)
 Self-service password change. Form-only page; no nav under Settings since most users will only see this one Settings page in v1.
 ## API surface
 ```
 GET    /api/users                        admin   — list (with ?show_disabled=1 filter)
 POST   /api/users                        admin   — create user, returns user_id + setup_url
 GET    /api/users/{id}                   admin   — read
 PATCH  /api/users/{id}                   admin   — update email, role
 POST   /api/users/{id}/disable           admin   — set disabled_at; rejects last-admin
 POST   /api/users/{id}/enable            admin   — clear disabled_at
 POST   /api/users/{id}/regenerate-setup  admin   — new token, returns setup_url
 POST   /api/users/{id}/force-logout      admin   — kill all sessions for this user
 POST   /api/account/password             any auth — self password change
 GET    /setup                            public — landing page (HTML form)
 POST   /setup                            public — submit new password
 ```
 UI routes mirror the API but at `/settings/users/...`.
 ## Last-admin self-protection
 Two operations that could lock everyone out are guarded:
 - **Disable user**: rejected if the user is admin AND there are no other enabled admins
 - **Demote admin to operator/viewer**: same check
 Server-side enforcement (single SELECT on `COUNT(*) FROM users WHERE role='admin' AND disabled_at IS NULL`). UI hint: edit page disables the role dropdown's non-admin options + disable button when the user is the last admin, with a tooltip explaining why.
 The bootstrap admin is just a regular admin row; this check covers it.
 ## Audit actions
 New action strings introduced:
 - `user.created`
 - `user.updated` (email / role change)
 - `user.disabled`
 - `user.enabled`
 - `user.password_changed`
 - `user.setup_completed`
 - `user.setup_token.regenerated`
 - `user.setup_token.expired` (system-driven, no actor; recorded when an invalid or expired token is presented at `/setup`)
 - `user.force_logout`
 All target_kind = `user`, target_id = the affected user's id. Existing payload conventions apply.
 ## Ordering / dependencies
 Slices in approximate landing order (writing-plans will firm this up):
 1. **A. Schema** — migrations 0017 + 0018, `Role` helper updates, store API extensions (email, disabled_at, must_change_password, setup_token CRUD, lowercase username constraints)
 2. **B. RBAC middleware** — `requireRole` + `roleAtLeast`, route re-grouping in server.go, 403 rendering for HTML + JSON
 3. **C. Session re-validation** — extend the existing session middleware to re-read user state per request, kick disabled users
 4. **D. Setup-token flow** — `/setup` GET+POST, the one-time link page after add-user
 5. **E. User CRUD API** — handlers + handlers' tests
 6. **F. UI** — `/settings/users` list, add, edit, setup-link page, account page
 7. **G. Sweep** — Playwright walk through the full lifecycle (add → setup link → user signs in → admin disables → user gets kicked → admin re-enables → user signs back in)
 Each slice can land as its own commit on the branch. RBAC middleware (B) goes in *before* user CRUD so we don't ship an open `/api/users/*` even briefly.
 ## Test strategy
 - **Store**: `Set/GetSetupToken`, `EnableUser`/`DisableUser`, last-admin guard, lowercase-username uniqueness, expired-token cleanup
 - **HTTP middleware**: `roleAtLeast` truth table; viewer hitting an operator route returns 403; disabled user gets 401 mid-session
 - **Setup flow integration**: create user → fetch setup URL → land on `/setup?token=...` → POST password → user can log in → token row gone
 - **UI**: existing Playwright sweep pattern, screenshots into `_diag/p4-03-04-sweep/`
 ## Out of scope (deferred)
 - **OIDC** (P4-05) — adds a parallel auth chain. This PR keeps the surface for it (role taxonomy, session middleware) but doesn't wire it.
 - **Email-the-setup-link** — explicitly deferred. Easy follow-up because the SMTP channel client from P3-06 is already there.
 - **Hard delete** — disable-only in v1; can add a typed-confirm "purge" later if it turns out to be needed.
 - **Password complexity / rotation policy** — current minimum (12 chars) and no rotation; tighten later if/when policy demands.
 - **Lockout on failed login** — a brute-force protection layer is its own task and orthogonal to RBAC.
 - **Audit on 403** — not in v1; revisit if compliance asks for it.
 ## Risks / gotchas to watch
 - **Existing tests** that assume "any logged-in user can hit any endpoint" will break. Audit the test fixtures: most use `loginAsAdmin`, which is fine; any tests currently exercising specific operator/viewer paths need explicit role assignment. (Quick grep suggests there aren't many — bootstrap-only.)
 - **Bootstrap user normalisation** — the existing admin row's username is whatever it was set to at first run. The new lowercase-uniqueness index uses `LOWER(username)`, which makes the existing row implicitly lowercase-keyed for lookups. No data migration needed.
 - **Session middleware re-read cost** — one SELECT per authenticated request. SQLite WAL handles this fine at expected fleet sizes; if it ever shows up on a profile we add a small in-memory cache keyed by session id with a 30s TTL.
 - **403 vs 401 distinction** — make sure unauthenticated requests still get 401 (login redirect) and authenticated-but-insufficient get 403. The middleware should compose: auth-required first, role-required second.
 ## Acceptance
 - [ ] An admin can add a user, copy the setup link, the new user can land on `/setup?token=...`, set a password, and reach `/`
 - [ ] An expired token (>1h) on `/setup?token=...` shows the "contact your administrator" page
 - [ ] Admin regenerates the link, old token is invalid, new token works
 - [ ] Operator user can trigger Run-now but cannot reach `/settings/users` (403) and the Users tab in Settings is hidden in their nav
 - [ ] Viewer user gets 403 on Run-now, 200 on dashboard / alerts / audit
 - [ ] Admin disables a user mid-session — the user's next request is 401 and they're redirected to login
 - [ ] Admin cannot disable themselves if they are the last enabled admin (server returns 409, UI button is greyed)
 - [ ] Self-service password change at `/settings/account` works for every role
 - [ ] All existing tests pass; new test suite covers role middleware, setup-token lifecycle, last-admin guard
 ## Self-review notes
 - ✅ All sections concrete, no TBD / TODO
 - ✅ Schema migrations are column-level (CLAUDE.md compliance)
 - ✅ Audit action vocabulary listed in one place; no string typos to drift
 - ✅ Out-of-scope list explicit so reviewers can challenge what we *aren't* doing
 - ✅ Last-admin guard handled both server-side and UI-hinted
 - ✅ Token storage hashes the secret server-side; raw is shown to admin once and never again
 - ✅ Session re-validation cost noted with a fallback if it shows up on a profile
@@ -1,215 +0,0 @@
 # P4-05 — OIDC Login Design
 > **Date:** 2026-05-05
 > **Status:** brainstorm complete; ready for plan
 > **Closes:** P4-05 (OIDC login)
 ## Goal
 Wire OpenID Connect authentication as a sign-in path alongside the existing local-user system, so a deployment that already has an IdP (Authelia, Authentik, Keycloak, Okta, Auth0, etc.) can use it for restic-manager logins.
 ## Architecture
 OIDC sits on top of the local-user system rather than replacing it. The first time a user signs in via OIDC the server **just-in-time provisions** a local user row marked `auth_source='oidc'`, with role derived from the IdP's configured role claim (`role_claim`, default `groups`). Subsequent sign-ins look up the same row by stable `oidc_subject` and refresh role + email from the latest claims. Once the row exists it behaves like any other local user — admin can disable it, force-logout, see it in audit logs, etc. — except password-login is rejected because there's no password.
 The Authorization Code flow (with PKCE) is implemented against the discovered well-known config of a single configured issuer. Front-channel logout: clicking Sign out drops the local session + redirects the browser to the IdP's `end_session_endpoint` (when advertised). Back-channel logout deferred.
 ## Locked decisions
 | Decision | Pick |
 |---|---|
 | User lifecycle | **B** — JIT-provision local rows on first OIDC login (`auth_source='oidc'`, `oidc_subject`) |
 | Role mapping config | **A** — YAML/env, claim name configurable (default `groups`, matching Authelia / Keycloak / Authentik), default = deny on no-match |
 | Username source | `preferred_username`, fallback to `email` |
 | Username collision with existing local user | **Refuse** with clear remediation message |
 | Provider config | **Single provider** — `providers:` array can come later |
 | Login page layout | SSO button **above** password form; password form labelled "or sign in with a local account" |
 | OIDC users + password login | **Disabled** — `auth_source='oidc'` rows have empty `password_hash`; password form rejects them |
 | Logout shape | **Front-channel only** — drop session + redirect to `end_session_endpoint` when advertised |
 | Role re-evaluation | **At login only** — claims read at the OIDC callback; admin can disable mid-session locally |
 ## Schema changes
 Migration 0019 — `users` extensions for OIDC bookkeeping:
 ```sql
 ALTER TABLE users ADD COLUMN auth_source TEXT NOT NULL DEFAULT 'local'
  CHECK (auth_source IN ('local', 'oidc'));
 ALTER TABLE users ADD COLUMN oidc_subject TEXT;
 CREATE UNIQUE INDEX users_oidc_subject ON users(oidc_subject)
  WHERE oidc_subject IS NOT NULL;
 ```
 Both column-level ALTERs (CLAUDE.md preference). The unique partial index defends the JIT-lookup invariant (one row per IdP subject) without blocking multiple rows with NULL oidc_subject (the local users).
 ## Configuration
 ```yaml
 # server config — extend existing config struct
 oidc:
  issuer:        https://auth.example.com    # well-known config discovered from this
  client_id:     restic-manager
  client_secret: ${RM_OIDC_CLIENT_SECRET}    # or via _FILE
  display_name:  Authelia                    # button label "Sign in with <display_name>"; default "SSO"
  scopes:        [openid, profile, email, groups]
  role_claim:    groups                      # default if absent (matches Authelia / Keycloak / Authentik)
  role_mapping:
    rm-admins:    admin
    rm-operators: operator
    rm-viewers:   viewer
  # Optional — auto-derived from BaseURL if absent.
  redirect_url:  https://rm.example.com/auth/oidc/callback
 ```
 Env-var overrides: `RM_OIDC_ISSUER`, `RM_OIDC_CLIENT_ID`, `RM_OIDC_CLIENT_SECRET`, `RM_OIDC_CLIENT_SECRET_FILE`. Mapping is YAML-only (env doesn't fit a multi-key string→string map cleanly).
 When `oidc.issuer` is empty or missing, OIDC is disabled (current behaviour). No restart-toggle UI; this is a deploy-time setting.
 ## Auth flow
 ### Login start
 `GET /auth/oidc/login` — only mounted when OIDC is configured.
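 1. Generate `state` (32 random bytes, base64url-encoded) and `code_verifier` (64 random bytes, base64url-encoded without padding, giving 86 characters — within the 43–128 range RFC 7636 requires); compute `code_challenge = base64url(sha256(code_verifier))`, also without padding.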
 2. Store `(state, code_verifier, created_at)` in a new ephemeral table (or in memory with a 5-minute TTL — see "State table" below).
 3. Redirect to `<authorization_endpoint>?response_type=code&client_id=...&redirect_uri=...&scope=...&state=...&code_challenge=...&code_challenge_method=S256`.
 ### Callback
 `GET /auth/oidc/callback?code=...&state=...` — also OIDC-only mount.
 1. Validate `state` against the stored value (one-shot — delete row on read). Reject if missing/expired/already used.
 2. Exchange `code` + `code_verifier` for tokens at `token_endpoint`.
 3. Validate the `id_token` JWT: signature against the JWKS endpoint, `iss`, `aud`, `exp`, `iat`, `nonce` (if used).
 4. Extract `sub`, `preferred_username`, `email`, and the configured `role_claim` (default `groups`).
 5. Pick username: `preferred_username` if non-empty, else `email`. Lowercase / trim per the existing local-user rules.
 6. Pick role: first match in `role_mapping` against the array of role-claim values. **No match → deny with a clear error page**, no row created.
 7. Look up user by `oidc_subject`. Three cases:
   - **Found** — refresh `email`, `role`, `last_login_at`. Don't touch `username` (changing it would break audit trails; if the IdP changes the username, that's an operator concern). Log `user.oidc_login`.
   - **Not found, username free** — INSERT row with `auth_source='oidc'`, `oidc_subject=<sub>`, `password_hash=''`, `must_change_password=0`. Log `user.created` with payload `{"auth_source":"oidc"}` + `user.oidc_login`.
   - **Not found, username taken by a local user** — render an error page: "This OIDC user (`<sub>`) wants to sign in as `alice`, but a local user with that name already exists. Ask your administrator to either rename / remove the local user, or exclude this user from the OIDC mapping." 403, no row created. Log `user.oidc_login_blocked`.
 8. Drop a session cookie + `MarkUserLogin` (the existing helper).
 9. Redirect to `/`.
 ### Logout
 `POST /logout` (existing handler) — augmented:
 1. Look up the session before deletion (we need the user row to know if they're an OIDC user).
 2. Delete the session as today.
 3. If the user is `auth_source='oidc'` AND the discovered `end_session_endpoint` is non-empty → 303 to `<end_session_endpoint>?id_token_hint=<id_token>&post_logout_redirect_uri=<base>/login`. Otherwise → existing 303 to `/login`.
 We need to keep the latest `id_token` per session to drive `id_token_hint`. Stash it in a new `sessions.id_token TEXT` column (one column-level ALTER on migration 0019 alongside the user columns), populated only for OIDC sessions.
 ## State table
 Two reasonable shapes for the short-lived state used during the OAuth round-trip:
 - **In-memory map** with a 5-minute TTL sweeper. Simpler, but multi-process deployments lose it (no multi-process today, but Phase 5 OSS readiness might add it).
 - **`oidc_state` table** — `(state_hash PK, code_verifier, created_at)`, swept on the same 60s alert-engine tick that already handles setup-token cleanup.
 I'll go with the **table**. Costs ~3 lines in the existing cleanup tick, behaves correctly under restarts, and survives a future scale-out. Migration 0019 includes:
 ```sql
 CREATE TABLE oidc_state (
  state_hash    TEXT PRIMARY KEY,    -- sha256(state) hex; raw state never persisted
  code_verifier TEXT NOT NULL,
  created_at    TEXT NOT NULL
 );
 CREATE INDEX oidc_state_created ON oidc_state(created_at);
 ```
 ## Login-page UI
 `/login` template branches based on `view.OIDCEnabled`:
 - **OIDC off** → current layout (just the password form).
 - **OIDC on** → a `Sign in with <provider name>` button at the top, then a faint divider line, then the existing password form labelled "Or sign in with a local account". Provider name comes from a new optional config `oidc.display_name` (defaults to "SSO").
 Failed-OIDC redirects (no role match, username collision, IdP error) land on `/login?oidc_error=<reason>` with a small banner above the buttons.
 ## Audit actions
 New entries in the action vocabulary:
 - `user.oidc_login` (target_kind=user, target_id=user_id, payload `{"sub":"…"}`)
 - `user.oidc_login_blocked` (target_kind=user, target_id=oidc_subject when no row was created, payload `{"username":"…", "reason":"username_taken|no_role_match|other"}`)
 - `user.created` already exists; OIDC's first-time provisioning fires this with payload `{"auth_source":"oidc"}` so the audit log distinguishes admin-created from JIT-provisioned rows.
 ## User-management UI changes
 Small additions, not new screens:
 - **Users list** — Status column adds a small `oidc` chip when `auth_source='oidc'` so admin can see at a glance which rows came from JIT-provisioning. Sortable by auth_source via the same sortable-headers pattern (lands as a small follow-up if anyone asks; out of scope for v1).
 - **Add user form** — disabled when OIDC is the only auth path, with a hint: "User provisioning is handled by your OIDC provider; users appear here on first sign-in." Configurable later via a `oidc.disable_local_users` flag if that becomes a real ask. Out of scope for v1; both paths stay open.
 - **Edit user form** — when `auth_source='oidc'`:
  - Username field disabled (the username is set at provisioning and never refreshed from the IdP; renaming it would break audit trails)
  - Role dropdown disabled, with a hint: "Role is managed by your OIDC provider's role-claim mapping. Edit the mapping in server config to change."
  - Email field disabled (refreshed from IdP on each login)
  - **Disable / Enable / Force logout** still work — disabling an OIDC user kicks their session and rejects future OIDC logins ("user disabled by administrator")
  - **Regenerate setup link** hidden — there's no setup token for OIDC users
 - **Login UI** — password form rejects users with `auth_source='oidc'` ("This account uses single sign-on. Click the SSO button above.")
 ## Middleware / handler changes
 - **Routes**: new public-band entries `GET /auth/oidc/login`, `GET /auth/oidc/callback`. Skipped entirely when OIDC isn't configured (`s.deps.OIDC == nil`).
 - **Logout handler** augmented to fetch the user row + decide between local logout (303 → `/login`) and OIDC logout (303 → `end_session_endpoint`).
 - **Login handler** rejects `auth_source='oidc'` users with the SSO-prompt error.
 - **Last-admin guard** — already covers OIDC users naturally because they live in the `users` table. The role-from-claims path could create a "every admin gets demoted to operator" situation if the IdP's claim mapping is wrong; the guard rejects that demotion at the moment it'd be applied (returns the user to the login page with `oidc_error=role_change_blocked` and audit entry; admin must fix the mapping or promote a local admin first).
 ## Implementation outline
 1. **Schema** — migration 0019 (users.auth_source + oidc_subject, sessions.id_token, oidc_state table)
 2. **Config** — extend `internal/server/config` with the OIDC block + env-var overrides; load JWKS lazily
 3. **Discovery + JWKS** — small helper that fetches `<issuer>/.well-known/openid-configuration` once at startup, caches `authorization_endpoint`, `token_endpoint`, `end_session_endpoint`, `jwks_uri`. JWKS refreshed on first failed verification.
 4. **Login start handler** — `/auth/oidc/login`
 5. **Callback handler** — `/auth/oidc/callback`, with the four claim-resolution branches
 6. **Logout handler augmentation** — branch on `auth_source`
 7. **Login form rejection** — local-user password form rejects OIDC accounts
 8. **State cleanup** — extend the alert engine's existing cleanup tick
 9. **UI** — `oidc` chip on users list, disabled fields on edit-form for OIDC users, login page SSO button + error banner
 10. **Tests** — config parse tests; happy-path callback test using a fake IdP (httptest server with a hand-rolled discovery doc + JWKS); username-collision test; no-role-match test; logout test
 11. **Sweep** — full Playwright walk against an actual IdP (Authelia in a Docker container) — admin gets in via OIDC, role mapping works, logout redirects through IdP, OIDC user can't password-login
 ## Test strategy
 The IdP is the hard part to test cleanly. Two layers:
 - **Unit / integration tests** use a stub OIDC provider built into the test harness — `httptest.Server` exposing `.well-known/openid-configuration`, a token endpoint that signs minted JWTs with a test ECDSA key, and a JWKS endpoint serving the public key. This covers every code path without a real IdP. Pattern: each test mints its own claims and runs the callback against the stub.
 - **Smoke env** runs against a real Authelia container (existing `compose.smoke.yaml`-style file or one-liner `docker run`) for the final sweep — confirms the discovery doc isn't being misread, real JWT verification works, real `end_session_endpoint` redirect works.
 ## Out of scope (deferred)
 - **Multi-provider** support (`providers:` array)
 - **Back-channel logout** (OpenID Connect Back-Channel Logout 1.0) — schema isn't blocked from adding it later
 - **UI-driven role mapping** (config-only in v1)
 - **Refresh tokens / mid-session role re-evaluation** — login-only refresh in v1
 - **`oidc.disable_local_users`** flag — both paths stay open in v1
 - **OIDC user dashboard chip / badges** beyond the small `oidc` indicator on the users list
 - **Per-user "auth source" filter on the users list** — sortable headers cover most of the use case
 ## Risks / gotchas
 - **JWKS key rotation** — refresh on first failed verification is the standard fix; document the cache TTL (1h) in the config block.
 - **Clock skew** — accept `iat`/`exp` with a 60s leeway; matches what most OIDC libraries do.
 - **End-session 404 / not advertised** — degrade gracefully; just drop the session and 303 to `/login`. Don't 500 the logout because the IdP doesn't implement RP-initiated logout.
 - **Username changes at the IdP** — silently keep the local username (matches our locked decision: subject is the stable key, username is display-only). Document.
 - **Role claim is sometimes a string, sometimes an array, sometimes a comma-separated string** depending on IdP — normalise into `[]string` before mapping. Authelia/Keycloak emit arrays; some custom setups emit strings; handle both.
 - **Authelia `sub` is an opaque UUID, not the username** (Authelia 4.39+ default for new clients). Don't assume `sub` is human-readable; it's stable but display value is `preferred_username` or `email`. The locked design already keys lookups on `sub` and uses `preferred_username` for the display username, so this is just a correctness note.
 - **`end_session_endpoint` may not be published** (Authelia doesn't advertise it for many configs). The locked logout flow already degrades to "drop session + redirect to /login" when the discovery doc lacks it; no extra config needed.
 - **Password-form bypass for OIDC users via /api/auth/login (JSON)** — same rejection rule applies, not just the HTML form.
 ## Acceptance
 - [ ] An OIDC user with `groups: ["rm-admins"]` can sign in, becomes an admin, is visible in `/settings/users` with an `oidc` chip
 - [ ] Same user signing in again resolves to the same row (no duplicate)
 - [ ] Same user with `groups: ["something-else"]` is denied, lands on `/login?oidc_error=no_role_match` with a banner, no row created
 - [ ] OIDC user can't password-login through `/login` or `/api/auth/login`
 - [ ] Admin disables an OIDC user → next OIDC login is rejected, existing session bounced (existing disable-mid-session)
 - [ ] Sign out as an OIDC user → 303 to IdP's end-session URL (when advertised); no end-session URL → 303 to `/login`
 - [ ] OIDC config absent → password login works exactly as today (zero behavioural change)
 - [ ] Username collision: a local `alice` exists, OIDC user with `preferred_username=alice` and a different `sub` → blocked at sign-in with the clear error page
 - [ ] Last-admin guard refuses to demote the only enabled admin even if the IdP's role mapping says otherwise
 - [ ] All existing tests pass; new test suite covers the four claim-resolution branches and logout
@@ -1,229 +0,0 @@
 # P5-03 — Docker-only release path
 **Status:** approved 2026-05-05. Pivots P5-03 away from `goreleaser` +
 binary archives toward a single Docker image as the only public
 deliverable.
 ## Goal
 One artifact per tag: the `restic-manager` server image, multi-arch
 (linux amd64 + arm64), published to the Gitea container registry of
 this self-hosted instance. The image bakes in cross-compiled agent
 binaries (linux amd64, linux arm64, windows amd64), the install
 scripts, and the systemd unit at a read-only image path. The running
 server distributes those agents and scripts via its existing
 `/agent/binary` and `/install/*` endpoints; operators on N hosts never
 download a release artifact directly.
 Source builds via `make build` remain a first-class path for anyone
 who wants binaries.
 ## Non-goals
 - Standalone binary archives (`.tar.gz`, `.zip`) on the release page.
 - darwin / windows-arm64 agent targets — neither is service-tested.
 - `goreleaser`. Not used.
 - `cosign`, `SBOM`, `in-toto`, `minisign`. Re-promote when we ship
  binaries outside an image (Phase 6 candidate).
 - GHCR / GitHub mirror. Single source of truth = Gitea.
 ## Decisions captured (with one-line rationale)
 | ID | Decision | Why |
 |----|----------|-----|
 | D1 | One artifact: server Docker image | Architecture already routes agent distribution through the server (`/agent/binary`); release surface should mirror that. |
 | D2 | Trigger: `tag-push` (`v*.*.*`) **plus** `workflow_dispatch` | Tag for real cuts; dispatch for snapshot iteration without polluting tag history. |
 | D3 | Build matrix: linux amd64+arm64 server image; agent cross-compiles for linux amd64+arm64+windows amd64 | Mirrors the existing CI build matrix; nothing ships that hasn't been service-tested. |
 | D4 | Image-baked, separate path (`/opt/restic-manager/dist/`); HTTP handler reads `<DataDir>/...` first, falls back to `/opt/...` | Volume stays purely operator state; image content is immutable per tag; eliminates the smoke-env "stale agent" footgun in production. |
 | D5 | Tag fan-out: `vX.Y.Z`, `X.Y`, `X`, `latest` — but `latest` is held back until `v1.0.0` | Standard rolling-minor pattern; pre-1.0 forces explicit pinning. |
 | D6 | Snapshot tag: `:snapshot-<shortsha>`, never moves `latest` | Operator can never accidentally pull an unblessed build. |
 | D7 | Version embedding via `-ldflags`: `main.version`, `main.commit`, `main.date` on both `cmd/server` and `cmd/agent` | Server already had `version`; add `commit`/`date` to both for parity and traceability. |
 | D8 | Registry: Gitea container registry on this instance, under `<host>/<owner>/restic-manager` | One source of truth, no external creds. |
 | D9 | Integrity: a `SHA256SUMS` file + the manifest digest in the release notes; nothing else | Image is the unit of trust; pull-by-digest is the verification primitive. |
 | D10 | P1-31 (signed binaries) stays deferred | Re-promote the day we ship binaries outside an image. |
 ## Image layout
 Multi-stage Dockerfile (extends today's `deploy/Dockerfile.server`):
 ```
 build stage (golang:1.25-alpine):
    cross-compile cmd/server for $TARGETARCH (linux)
    cross-compile cmd/agent for linux/amd64
    cross-compile cmd/agent for linux/arm64
    cross-compile cmd/agent for windows/amd64
    (CGO_ENABLED=0 throughout — pure-Go SQLite)
 final stage (gcr.io/distroless/static-debian12:nonroot):
    /usr/local/bin/restic-manager-server                   (matches image arch)
    /opt/restic-manager/dist/agent-binaries/
        restic-manager-agent-linux-amd64
        restic-manager-agent-linux-arm64
        restic-manager-agent-windows-amd64.exe
    /opt/restic-manager/dist/install/
        install.sh
        install.ps1
        restic-manager-agent.service
 ```
 `/opt/restic-manager/dist/` is owned by `root:root`, mode `0755` for
 directories, `0755` for `install.sh` (script must be executable when
 the install path uses `curl ... | sh` semantics) and `0644` for the
 unit file and `install.ps1`. The agent binaries are mode `0755`.
 `<DataDir>` keeps holding only operator state: `restic-manager.db`,
 `secret.key`, `secrets.enc`, `audit/`, `tls/`. Nothing the image
 owns gets written into the volume.
 ## Server-side handler change
 `internal/server/http/agent_assets.go` today reads from
 `<DataDir>/agent-binaries/<name>` and `<DataDir>/install/<name>`.
 Change: if the file isn't present in `<DataDir>`, fall back to
 `/opt/restic-manager/dist/<subpath>/<name>`. The fallback path is a
 new server-config field defaulted to `/opt/restic-manager/dist`,
 overridable via `RM_BUNDLED_ASSETS_DIR` for tests and source-build
 deployments. If neither path resolves, return 404 (existing
 `binary_not_published` / `not_found` body unchanged).
 This means:
 - A fresh container without any operator-staged overrides serves the
  baked-in agents. No first-run setup needed.
 - An operator can still drop a custom-built agent into
  `<DataDir>/agent-binaries/` to override the image's copy (handy for
  pre-release agent testing without rebuilding the server image).
 - Source-build dev (`bin/restic-manager-server` running out of the
  working tree) still works exactly as today — the fallback dir is
  configurable, and the `<DataDir>` path remains the primary lookup.
 Tests cover four cases: (a) DataDir hit, (b) fallback hit, (c) DataDir
 hit shadows fallback, (d) neither — 404.
 ## Versioning
 Both binaries grow `commit` and `date` ldflag-targets next to the
 existing `version`:
 ```go
 var (
    version = "dev"
    commit  = "none"
    date    = "unknown"
 )
 ```
 Dockerfile gains `ARG VERSION`, `ARG COMMIT`, `ARG DATE`, all
 `""`-defaulted; the `go build` line passes them via `-ldflags`. The
 release workflow fills them from `${{ gitea.ref_name }}`,
 `${{ gitea.sha }}`, and a UTC ISO-8601 timestamp.
 Snapshot builds (workflow_dispatch) compute
 `VERSION=0.0.0-snapshot-${SHORTSHA}` and tag the image as
 `:snapshot-${SHORTSHA}` only. They never touch `latest` or any
 `vX.Y.Z` tag.
 ## Workflow (`.gitea/workflows/release.yml`)
 ```yaml
 name: Release
 on:
  push:
    tags: ['v[0-9]+.[0-9]+.[0-9]+']
  workflow_dispatch:
 env:
  IMAGE: gitea.dcglab.co.uk/${{ gitea.repository }}
 jobs:
  image:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: docker/setup-qemu-action@v3
      - uses: docker/setup-buildx-action@v3
      - uses: docker/login-action@v3
        with:
          registry: gitea.dcglab.co.uk
          username: ${{ gitea.actor }}
           password: ${{ secrets.DEV_TOKEN }}
      - name: compute tags
        id: meta
        run: |
          # tag-push  → :vX.Y.Z, :X.Y, :X (only :latest if X >= 1)
          # dispatch  → :snapshot-<shortsha>
          ...
      - uses: docker/build-push-action@v6
        with:
          context: .
          file: deploy/Dockerfile.server
          platforms: linux/amd64,linux/arm64
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          build-args: |
            VERSION=${{ steps.meta.outputs.version }}
            COMMIT=${{ gitea.sha }}
            DATE=${{ steps.meta.outputs.date }}
 ```
 The `compute tags` step:
 - For `push:tags`: extract `vMAJOR.MINOR.PATCH`. Always emit
  `:vMAJOR.MINOR.PATCH`, `:MAJOR.MINOR`, `:MAJOR`. Emit `:latest`
  only when `MAJOR >= 1`.
 - For `workflow_dispatch`: emit `:snapshot-<shortsha>`. Nothing else.
 No release-asset upload step yet — the GHCR-equivalent registry push
 is the deliverable. A future iteration may attach a `SHA256SUMS` file
 to a Gitea release object once `tea release create` is wired in;
 that's not in scope for the first cut.
 ## Tests / verification
 1. `go vet ./...` (CLAUDE.md rule, runs locally pre-commit).
 2. `go test ./internal/server/http/...` covers the new fallback
   logic.
 3. Local manual smoke: `docker build -f deploy/Dockerfile.server .`
   produces an image; `docker run --rm -p 8080:8080 <image>` starts the
   server with its port published;
   `curl 'http://127.0.0.1:8080/agent/binary?os=linux&arch=amd64'`
   (quoted so the shell doesn't treat `&` as a background operator)
   serves bytes; `curl http://127.0.0.1:8080/install/install.sh`
   serves the script.
 4. Release workflow itself is exercised on first tag-push; until
   then, `workflow_dispatch` is the smoke test.
 ## Operator-facing changes
 - `README.md` install snippet becomes
  `docker run -v rm-data:/var/lib/restic-manager ...
  gitea.dcglab.co.uk/<owner>/restic-manager:vX.Y.Z`. Pre-1.0
  releases are pinned by exact tag; no `:latest` is published.
 - The CLAUDE.md "restage" block is dev-only (smoke env runs the
  server out of `bin/`). Production users on the image never see
  it.
 - `RM_BUNDLED_ASSETS_DIR` is documented in the server config
  reference (defaults to `/opt/restic-manager/dist`).
 ## Risks / footguns
 - **Image size growth.** Three agent binaries (~15-20 MB each
  stripped) add ~50 MB. Acceptable; we're already shipping a
  distroless server. Watch the trajectory once Phase 4 alerting is
  in.
 - **Dockerfile cross-compile multiplies build time** on the runner.
  Pure-Go means each leg is just a `go build`; total stage time
  should stay under 60s on the self-hosted runner.
 - **`ARG VERSION` leakage.** The current Dockerfile already accepts
  `ARG VERSION=dev`; we're tightening, not loosening.
 - **Operator overriding `<DataDir>/agent-binaries/<name>`** with a
  stale binary will silently shadow the image's copy. Documented in
  the server config reference; this is a feature (lets operators
  hot-patch a pre-release agent) not a bug.
 ## Out of scope (tracked for follow-up)
 - Cosign / SBOM / in-toto provenance — defer to Phase 6 with the rest
  of the supply-chain hardening.
 - GHCR mirror — defer until P5-01 docs site goes public.
 - `tea release create` integration — pending until we have something
  worth attaching beyond the image digest.
@@ -1,448 +0,0 @@
 # P6-01 + P6-02 — Agent self-update + fleet update
 Status: design approved 2026-05-06.
 Scope: P6-01 (agent self-update mechanism) and P6-02 (dashboard
 version reporting + fleet update UI). One spec, one branch — the
 two tasks are tightly coupled (P6-02 is the operator surface for
 the mechanism P6-01 ships).
 ## 1. Background
 P5-03 pivoted release distribution to a single multi-arch server
 Docker image, with cross-compiled agent binaries baked under
 `/opt/restic-manager/dist/agent-binaries/` and served via
 `GET /agent/binary?os=…&arch=…`. The plumbing already does
 dual-path lookup: `<DataDir>/agent-binaries/<name>` overrides the
 image-baked copy, so an operator can hot-patch a pre-release agent
 without rebuilding the image.
 That makes the server the natural distribution point for agent
 upgrades. "Update agent" collapses to "re-fetch from your own
 server" — no apt repo, no Chocolatey, no third-party signing infra,
 and version pinning is automatic because the server only ever
 serves the agent that matches its own release.
 This spec wires up the update mechanism end-to-end and the
 operator surface that drives it.
 ## 2. Decisions
 | # | Decision | Rationale |
 |---|----------|-----------|
 | 1 | Operator-driven only — no auto-update | Matches the rest of the app's job-dispatch model; avoids "bad release upgrades every host instantly"; auto-update can be added later as a setting flip if asked |
 | 2 | Linux: just exit, let systemd restart. Windows: detached helper script. | Linux supports rename-while-open; Windows holds an exclusive lock on the running .exe |
 | 3 | M1 (keep `agent.old` on disk) + M2 (rolling fleet update with halt-on-fail). Skip M3 (auto-rollback watchdog). | M1 is ~5 lines, M2 falls naturally out of P6-02's UI, M3 is a lot of plumbing for "shipped a binary that doesn't start" |
 | 4 | Skip sha256 digest verification for v1 | TLS already covers the corruption-in-transit threat; image-tampering is image-build's problem, not the agent's |
 | 5 | Exact string version match for "out of date" | With server-bundled binaries there's exactly one canonical version per server image — anything else is out of date by definition |
 | 6 | WS envelope only, no `restic-manager-agent update` CLI subcommand | YAGNI; no concrete consumer; the underlying logic is reusable when one appears |
 ## 3. Wire protocol
 ### 3.1 Server → agent: `command.update`
 ```
 {
  "type": "command.update",
  "id": "<envelope id>",
  "payload": {
    "job_id": "<ulid>"
  }
 }
 ```
 No `os` / `arch` / `version` in the payload — the agent already
 knows its own build target and fetches from its configured server
 URL via the existing `/agent/binary` handler. Including a target
 version would also tempt the agent into version-comparison logic;
 keep that on the server side.
 ### 3.2 Job lifecycle (server-driven)
 The agent has limited ability to report on its own restart, so the
 job state machine lives on the server:
 - **queued → running** when the envelope is dispatched.
 - **running → succeeded** when the agent re-hellos with
  `agent_version == server.Version` after dispatch and within
  the timeout. Audit `host.update_succeeded`.
 - **running → failed (timeout)** if 90 seconds pass without a
  hello carrying the matching version. Audit `host.update_failed`.
  Raise alert kind `update_failed` (reuses P3-05 alert engine).
  This single transition covers both the "agent never came back
  at all" case and the "agent came back at the wrong version"
  case — see §6.2 for why we don't transition immediately on a
  mismatched hello.
 Migration 0021 widens the `jobs.kind` CHECK constraint to include
 `update`. Same column-level pattern as 0012 (where 0012 added
 `restore` and `diff`).
 ## 4. Agent-side execution
 Lives in `internal/agent/updater`, build-tag split:
 - `updater_unix.go` — Linux + any future POSIX target.
 - `updater_windows.go` — Windows-only, uses the helper-script
  pattern.
 - `updater.go` — shared `Update(ctx, serverURL string) error`
  interface and the HTTP fetch/streaming code (no platform deps).
 ### 4.1 Linux flow
 1. Receive `command.update` from the WS dispatcher.
 2. Resolve own binary via `os.Executable()` and `filepath.Abs`.
   Refuse if the resolved path is `/proc/self/exe` or otherwise
   not a real file (defence in depth — shouldn't happen under
   systemd, but bail loudly if it does).
 3. `GET <server>/agent/binary?os=linux&arch=<runtime.GOARCH>`,
   stream to `<binary>.new` in the same directory as the running
   binary (same filesystem ⇒ atomic rename).
 4. fsync the file, `os.Chmod(0755)`.
 5. Copy current binary to `<binary>.old` (overwrite if it
   exists). M1 — one-revision rollback target.
 6. `os.Rename(<binary>.new, <binary>)`.
 7. Close the WS connection cleanly (sends close frame so the
   server transitions the connection to `disconnected` rather
   than waiting for the heartbeat-miss sweep).
 8. `os.Exit(0)`. Systemd's `Restart=always` (already in the unit)
   brings up the new binary within seconds.
 ### 4.2 Windows flow
 The .exe is exclusively locked by the OS while running, so steps
 5–6 above can't happen in-process. Use a detached helper:
 1. Steps 1–4 the same — fetch into `<binary>.exe.new`, fsync.
 2. Write `update.cmd` to a tmp path with the orchestration:
   ```
   timeout /t 3 /nobreak >nul
   copy /Y "<binary>.exe" "<binary>.exe.old"
   sc stop restic-manager-agent
   :wait
   sc query restic-manager-agent | find "STOPPED" >nul
   if errorlevel 1 (timeout /t 1 /nobreak >nul & goto wait)
   move /Y "<binary>.exe.new" "<binary>.exe"
   sc start restic-manager-agent
   del "%~f0"
   ```
 3. `CreateProcess` it detached
   (`DETACHED_PROCESS | CREATE_NO_WINDOW`, no parent handles).
 4. Close WS, `os.Exit(0)`. SCM sees clean stop and waits — does
   *not* try to restart, because `sc stop` is the helper's job,
   not a crash. (`Restart=always` semantics differ between
   systemd and SCM. SCM treats clean-exit-after-stop as
   intentional and does not auto-restart; only crashes restart.
   That's why the helper script needs the explicit `sc start`
   at the end.)
 ### 4.3 Service-user assumption
 Both Linux (`User=root` per the existing unit) and Windows
 (`LocalSystem` by default) can write the binary path directly. If
 the agent ever moves to a non-root service user, the updater
 breaks — would need either a setuid helper or an out-of-process
 update service. Add a `// NOTE:` comment in the updater package
 flagging this; not a v1 blocker.
 ## 5. Server build version
 New package `internal/version` exposing two package-level variables
 (`var`, not `const`, so that `-ldflags -X` can override them at link time):
 ```
 package version
 var (
    Version = "dev"
    Commit  = ""
 )
 ```
 Wired via `-ldflags` in the Makefile:
 ```
 GO_LDFLAGS = -X gitea.dcglab.co.uk/steve/restic-manager/internal/version.Version=$(VERSION) \
             -X gitea.dcglab.co.uk/steve/restic-manager/internal/version.Commit=$(COMMIT)
 VERSION := $(shell git describe --tags --always --dirty)
 COMMIT  := $(shell git rev-parse --short HEAD)
 ```
 Both `cmd/server` and `cmd/agent` link the same package, so an
 agent's `agent_version` (sent in the hello payload, already wired
 since P1-11) is comparable byte-for-byte to the server's
 `version.Version`.
 `make build` already does what's needed for source builds. The
 Phase 2 work in this spec is the Docker release path — confirm
 during plan execution that `.gitea/workflows/release.yml` passes
 `VERSION` and `COMMIT` into the Docker `--build-arg` chain so the
 in-image binaries embed the same string the image is tagged with.
 If not, add the wiring.
 Dirty/dev builds (`v1.2.3-dirty`) won't match clean server builds,
 so every dev environment will show every host as out-of-date. This
 is acceptable — the chip is a noop in dev, real ops always run
 tagged builds.
 A new `GET /api/version` endpoint returns
 `{"version": "...", "commit": "..."}`. Used by the dashboard
 header tile and by `/settings/fleet-update`. Public-band — exposes
 no secrets, lets the install scripts surface it too.
 ## 6. P6-01 server endpoints
 ### 6.1 `POST /api/hosts/{id}/update`
 Admin-only. Refuses (with structured error code) when:
 - Host is offline (`host_offline`).
 - Host's `agent_version == server.Version` (`already_up_to_date`).
 - An update job for this host is already running (`update_in_progress`).
 Happy path: creates `jobs` row with `kind=update`, dispatches
 `command.update` envelope, audit-logs `host.update_dispatched`,
 returns `{"job_id": "..."}`.
 UI form-post variant on `/hosts/{id}/update` returns
 `HX-Redirect` to the live job log.
 ### 6.2 Hello handler integration
 The existing `onAgentHello` (P1-11) already upserts
 `agent_version`. Extend it: after the upsert, look for any
 `update` job for this host with `status='running'`. If one
 exists:
 - `agent_version == server.Version` → mark job `succeeded`,
  audit `host.update_succeeded`.
 - `agent_version != server.Version` → leave the job running so
  the timeout path catches it as a rollback failure (don't fail
  immediately — gives the agent one chance to come back, restart,
  hello again with the right version).
 Adds a small in-memory map of pending updates so the timeout
 goroutine knows when to give up. Persisted state lives in the
 `jobs` table; the in-memory map is just for the timer.
 ## 7. P6-02 fleet update
 ### 7.1 Schema
 Migration 0022, purely additive (two new tables, no changes to existing ones):
 ```
 CREATE TABLE fleet_updates (
  id              TEXT PRIMARY KEY,
  started_at      TEXT NOT NULL,
  started_by_user_id TEXT NOT NULL REFERENCES users(id),
  target_version  TEXT NOT NULL,
  status          TEXT NOT NULL CHECK (status IN ('running','completed','halted','cancelled')),
  current_host_id TEXT REFERENCES hosts(id),
  halted_reason   TEXT,
  completed_at    TEXT
 );
 CREATE TABLE fleet_update_hosts (
  fleet_update_id TEXT NOT NULL REFERENCES fleet_updates(id) ON DELETE CASCADE,
  host_id         TEXT NOT NULL REFERENCES hosts(id) ON DELETE CASCADE,
  status          TEXT NOT NULL CHECK (status IN ('pending','running','succeeded','failed','skipped')),
  job_id          TEXT REFERENCES jobs(id),
  failed_reason   TEXT,
  PRIMARY KEY (fleet_update_id, host_id)
 );
 ```
 ### 7.2 Worker loop
 A single in-process goroutine — at most one fleet update may run
 at a time (enforced via a `sync.Mutex` + a precondition check on
 `POST /api/fleet/update`).
 ```
 for each pending fleet_update_hosts row in dispatch order:
    set fleet_updates.current_host_id = row.host_id
    set fleet_update_hosts.status = 'running'
    if host.agent_version == server.Version:
        # Already updated since we built the list — skip.
        set status = 'skipped'; continue
    if !host.online:
        # Offline since we built the list — halt.
        halt(reason="host went offline")
        return
    dispatch_update_for_host(host)  # reuses 6.1 logic
    wait_up_to_90s_for_hello_with_matching_version()
    if matched:
        set status = 'succeeded'; continue
    else:
        set status = 'failed', failed_reason = "..."
        halt(reason="update failed on host X")
        return
 set fleet_updates.status = 'completed', completed_at = now
 ```
 Halt: set `fleet_updates.status = 'halted'`, raise an alert kind
 `fleet_update_halted`, audit `fleet.update_halted` with the host
 id and reason. Subsequent hosts stay `pending` so the operator can
 see what was queued and decide whether to resume (resume = start a
 new fleet update with the still-out-of-date subset).
 Cancel: admin-only `POST /api/fleet-updates/{id}/cancel`. Sets
 `status='cancelled'`. The currently-dispatched host's update job
 keeps running (the agent is already mid-restart) — cancel only
 prevents the *next* host from being picked. Audit
 `fleet.update_cancelled`.
 ### 7.3 UI surfaces
 **Per-host chip (host_row partial + host detail chrome):**
 `out of date · v1.2.2 → v1.2.3` — amber-accented, mirrors `.tag`
 token shape. Only rendered when:
 ```
 host.agent_version != "" && host.agent_version != server.Version
 ```
 Empty `agent_version` (host enrolled but never connected) renders
 nothing rather than "out of date" — we don't know what version
 they have.
 **Dashboard summary tile:**
 The hero strip already has tiles. Add an "Updates" tile:
 `N hosts behind` linking to `/?updates=behind` (extends NS-04's
 filter machinery — adds an `updates` query param alongside
 `status`/`repo_status`/`tag`). Hidden when N == 0.
 **Per-host Update button on `/hosts/{id}`:**
 Right-rail, admin-only. Disabled with hover tooltip when host
 offline / already up to date / update in progress. POSTs to
 `/hosts/{id}/update`, `HX-Redirect` to the live job log.
 **Fleet update page `/settings/fleet-update`:**
 Admin-only. Two states:
 - **Idle**: lists out-of-date online hosts (table: hostname,
  current version, target version, last seen). Big "Start rolling
  update" button behind a typed-confirm dialog (operator types
  the host count, e.g. `12`, to enable the button — same shape as
  the host-delete confirm).
 - **Running/halted/completed**: shows the currently-active
  fleet_update row + per-host progress list. Polls every 3s (htmx
  trigger conditional on `document.visibilityState === 'visible'`,
  same pattern as the alerts page). Renders:
  ```
  Updated 3/12 · currently updating <hostname>
  Halted on <hostname>: <reason> · job log →
  ```
 Audit actions: `fleet.update_started`, `fleet.update_completed`,
 `fleet.update_halted`, `fleet.update_cancelled`.
 ### 7.4 Alert engine integration
 P3-05's alert engine already supports kind-based registration. Add
 two new kinds:
 - `update_failed` — per-host, raised on individual update failure.
  Auto-resolves when the host re-hellos with the matching version.
 - `fleet_update_halted` — global, raised on fleet halt. Auto-resolves
  when a subsequent fleet update completes successfully.
 ## 8. RBAC
 | Endpoint | Role |
 |----------|------|
 | `POST /api/hosts/{id}/update` | admin |
 | `POST /api/fleet/update` | admin |
 | `POST /api/fleet-updates/{id}/cancel` | admin |
 | `GET /api/fleet-updates/{id}` | admin (status polling) |
 | `GET /api/version` | public |
 Operator and viewer see the "out of date" chip but no update
 buttons. Mirrors the existing pattern: read affordances are
 visible to all roles, write affordances are gated.
 ## 9. Testing
 ### 9.1 Unit
 - `internal/agent/updater`: fake-`/agent/binary` HTTP server +
  tmp "running binary" file, assert post-state — binary swapped,
  `.old` present, no leftover `.new`. Linux path only (Windows
  helper covered by build-tag compile-only).
 - `internal/server/http`: `POST /api/hosts/{id}/update` happy
  path, refuses-when-offline, refuses-when-up-to-date,
  refuses-when-update-in-progress, RBAC enforcement, audit row
  written.
 - Hello handler: agent reconnects with matching version after
  `update` job dispatch → marks job `succeeded`, drops the
  in-memory pending entry. Mismatched version → no-op (timeout
  catches it).
 - Timeout path: synthetic `update` job + 90s elapsed →
  marks `failed`, raises alert.
 - Fleet worker: table-driven over the loop's state machine —
  success-then-success, success-then-timeout-halts,
  cancel-mid-flight, no-online-out-of-date-hosts-completes-immediately,
  host-disappears-from-list-mid-loop-skips.
 ### 9.2 Smoke validation (per CLAUDE.md restage block)
 1. Build server + agent at version A. Restage. Enrol a host;
   confirm `agent_version=A`.
 2. Bump version to B (`make build VERSION=B`), rebuild server
   only, restart server. Dashboard shows host as out-of-date with
   `A → B` chip. Updates tile reads "1 host behind".
 3. Rebuild agent at B, restage `<DataDir>/agent-binaries/`. Click
   **Update agent** on host detail. Agent fetches, swaps, exits;
   systemd restarts it; hello-back at B → job `succeeded`, chip
   gone, tile clears.
 4. Rollback path: leave `<DataDir>/agent-binaries/` at A, server
   at B, click Update — agent fetches A, swaps to A, restarts at
   A; hello says A != B; server marks job `failed` after 90s with
   reason "agent reconnected at version A, expected B".
 5. Fleet update: spin up two smoke hosts both out-of-date, fire
   **Start rolling update**, watch progress page tick host 1 →
   host 2 → completed.
 6. Halt path: replace one of the `<DataDir>/agent-binaries/`
   files with `/bin/false`. Run fleet update. First host gets
   broken binary, fails to come back up, fleet update halts at
   host 1 after 90s, alert raised, host 2 left as `pending`.
 Step 6 validates M2 end-to-end — the rolling halt is the actual
 safety guarantee, not a nice-to-have.
 ## 10. Out of scope
 - sha256 digest verification (deferred — see decision 4).
 - `restic-manager-agent update` CLI subcommand (deferred —
  decision 6).
 - Auto-update (deferred — decision 1).
 - Auto-rollback watchdog M3 (deferred — decision 3).
 - Migrating the agent off `User=root` (separate hardening track).
 - Cross-version protocol-compatibility checks beyond the existing
  `protocol_version` handshake (P1-11). If the new agent's
  `protocol_version` is incompatible with the server, the
  existing handshake rejects it; the update job will then
  correctly time out and be marked failed.
 ## 11. Migration plan
 1. `internal/version` package + Makefile ldflags wiring.
 2. Migration 0021 (jobs.kind widening) + 0022 (fleet_updates
   tables).
 3. `internal/agent/updater` package, Linux first.
 4. WS envelope wiring + `command.update` dispatcher.
 5. `POST /api/hosts/{id}/update` + hello-handler integration +
   timeout goroutine.
 6. UI: chip + per-host update button + dashboard tile + filter.
 7. Fleet update worker + page.
 8. Windows updater path.
 9. Alert engine kinds.
 10. Smoke validation per §9.2.
 Each step is independently testable; commits should land at each
 boundary so a failed Windows path (8) doesn't block the rest of
 the work.
@@ -1,223 +0,0 @@
 # P6-03 — Repo size trend graphs
 Sparkline on the dashboard host row + full chart on the host repo
 page, both showing repo growth over time. Closes the last
 operator-visibility gap in Phase 6 alongside Prometheus metrics
 (P6-04).
 ## Goals
 - Operators can see at a glance whether a host's repo is growing,
  stable, or shrinking, without leaving the dashboard.
 - A second screen on the repo page exposes the same data over a
  longer window with a snapshot-count overlay so retention
  behaviour can be eyeballed against size.
 - Zero new client-side dependencies; matches the existing
  HTMX + server-rendered idiom used everywhere else in the UI.
 ## Non-goals
 - No backfill of historical data. Trend lights up with whatever
  the agents report from the day this ships.
 - No per-source-group breakdown — repo-level only.
 - No alerting on growth rate (deferred to a future ticket if a
  user asks).
 - No JSON API surface. Prometheus exposure is P6-04, separate.
 ## Decisions taken in brainstorming
 - **Metrics:** `total_size_bytes` (sparkline + chart) and
  `snapshot_count` (chart only). Raw size dropped as redundant.
 - **Cadence:** one row per `(host_id, UTC date)`, last-write-wins
  per column. Bounded at ~365 rows/host/year regardless of job
  frequency.
 - **Backfill:** none. Pure forward-fill from launch day.
 - **Rendering:** server-rendered inline SVG, no JS library.
 - **Spans:** sparkline fixed at 30 days; chart has `30d | 90d | 1y`
  range selector, server-rendered swap.
 ## Schema
 New migration `internal/store/migrations/0023_host_repo_stats_history.sql`:
 ```sql
 CREATE TABLE host_repo_stats_history (
  host_id           TEXT NOT NULL REFERENCES hosts(id) ON DELETE CASCADE,
  day               TEXT NOT NULL,        -- 'YYYY-MM-DD' UTC
  total_size_bytes  INTEGER,              -- nullable; partial patches don't overwrite
  snapshot_count    INTEGER,              -- nullable
  recorded_at       TEXT NOT NULL,        -- RFC3339Nano of last write touching this row
  PRIMARY KEY (host_id, day)
 );
 CREATE INDEX host_repo_stats_history_host_day
  ON host_repo_stats_history(host_id, day DESC);
 ```
 FK cascade matches every other host-scoped table; deleting a host
 through `Store.DeleteHost` (NS-01) wipes its history automatically.
 ## Write path
 Hook the existing `MsgRepoStats` handler in
 `internal/server/ws/handler.go` (around line 319). After the
 existing `UpsertHostRepoStats(ctx, hostID, patch)` call, append:
 ```go
 day := time.Now().UTC().Format("2006-01-02")
 if err := deps.Store.UpsertHostRepoStatsHistory(ctx, hostID, day, patch); err != nil {
    slog.Warn("ws: upsert host repo stats history", "host_id", hostID, "err", err)
 }
 ```
 A history-write failure is logged and dropped — never blocks the
 main upsert. The partial-update contract that
 `UpsertHostRepoStats` already implements is preserved at the
 history layer:
 ```sql
 INSERT INTO host_repo_stats_history (host_id, day, total_size_bytes, snapshot_count, recorded_at)
 VALUES (?, ?, ?, ?, ?)
 ON CONFLICT(host_id, day) DO UPDATE SET
  total_size_bytes = COALESCE(excluded.total_size_bytes, host_repo_stats_history.total_size_bytes),
  snapshot_count   = COALESCE(excluded.snapshot_count,   host_repo_stats_history.snapshot_count),
  recorded_at      = excluded.recorded_at;
 ```
 This is critical: the agent's prune handler in
 `internal/agent/runner/runner.go:318` emits a stats patch that
 only carries `LastPruneAt`. Without `COALESCE`, that prune ack
 would null out a `total_size_bytes` we'd already captured from a
 backup earlier the same day.
 ## Read path
 Two new helpers in `internal/store/host_repo_stats_history.go`:
 ```go
 type RepoStatsHistoryPoint struct {
    Day            time.Time   // 00:00:00 UTC
    TotalSizeBytes *int64
    SnapshotCount  *int64
 }
 func (s *Store) ListHostRepoStatsHistory(
    ctx context.Context, hostID string, since time.Time,
 ) ([]RepoStatsHistoryPoint, error)
 ```
 Returns rows ordered by `day` ascending where at least one metric
 is non-null. The renderer connects available points with a
 straight line — there is no explicit gap representation. A host
 that was offline for a week shows a single segment spanning the
 gap, which is the right visual: the repo state didn't change.
 ## Rendering
 New package `internal/web/sparkline`. Pure Go, no template
 dependency:
 ```go
 type Series struct {
    Name   string
    Points []float64    // nil-points represented as math.NaN
    Stroke string       // CSS color
 }
 func RenderSparkline(points []float64, width, height int) template.HTML
 func RenderChart(series []Series, days []time.Time, opts ChartOpts) template.HTML
 ```
 `RenderChart` produces a 600×220 SVG with:
 - Light horizontal gridlines (4 bands).
 - Two y-axes: bytes (left, blue) and count (right, amber). Each
  series is normalised against its own axis.
 - X-axis labels at start, midpoint, and end of the window.
 - Per-point `<circle>` with a `<title>` for hover tooltips —
  accessible by default, no JS.
 - Empty state: faint dashed baseline + centered "no data yet"
  text.
 Sparkline is 80×20, single blue polyline, single `<title>` on the
 group element showing `"current → 30d ago"`.
 Two new partials:
 - `web/templates/partials/repo_size_sparkline.html`
 - `web/templates/partials/repo_size_chart.html`
 Both call into the renderer with the appropriate opts. No
 inline `<style>` — colours come from existing Tailwind palette
 classes already used elsewhere (`text-blue-500`, `text-amber-500`).
 ## UI placement
 ### Dashboard host row
 `web/templates/partials/host_row.html` gains one `<td>` between
 the existing "Repo size" cell and "Snapshots" cell. Width ≈ 88px.
 Cell renders the sparkline partial; if `len(points) < 2` the cell
 shows "—" centred (matches the existing no-data idiom for
 last-backup time in the same partial).
 The dashboard's existing 5-second htmx live-refresh
 (`hx-trigger="every 5s ..."` from NS-04) re-renders this cell
 along with the rest of the row. No extra polling.
 ### Host repo page
 `web/templates/pages/host_repo.html` gains a "Trend" panel
 inserted between the existing summary panel and the maintenance
 panel. Panel contains:
 - Range pills `30d | 90d | 1y` (anchor links with
  `hx-get="/hosts/{id}/repo/trend?range=…"` and
  `hx-target="#repo-trend-chart" hx-swap="outerHTML"`).
 - The chart partial wrapped in `<div id="repo-trend-chart">`.
 - A small legend strip below the chart.
 ## Endpoints
 - `GET /hosts/{id}/repo/trend?range=30d|90d|1y` — admin/operator,
  htmx fragment, returns the chart partial. Auth reuses the
  existing host-scoped middleware on the `/hosts/{id}` family.
  Invalid `range` falls back to 30d.
 No new admin-only surface — anyone with read access to the host
 can see the trend.
 ## Testing
 - `internal/store/host_repo_stats_history_test.go` — upsert
  merges partial patches without nulling; ordering; since-day
  filter; cascade on host delete.
 - `internal/web/sparkline/sparkline_test.go` — golden SVG files
  for: empty input, single point, full 30-day series, mixed
  null points. Goldens live under `testdata/`.
 - `internal/server/http/ui_repo_test.go` — trend panel renders
  with seeded history; range selector swaps server-side; empty
  state.
 - `internal/server/http/ui_dashboard_test.go` — host row sparkline
  cell present and renders SVG when points exist, "—" when not.
 - Smoke after build: dashboard row shows sparkline once two days
  of data exist; repo page chart toggles cleanly between ranges.
 ## Migration / rollout
 - Schema migration is additive — no risk to existing tables.
 - Write path is best-effort; on schema issue the main repo-stats
  upsert is unaffected.
 - No agent change required, so no fleet update needed.
 ## Acceptance
 - After two days of operation, the dashboard sparkline shows a
  visible line for any host that has run a backup or
  maintenance op on both days.
 - Host repo page renders the trend panel with the snapshot-count
  overlay; range selector switches view without a full page
  reload.
 - `go test ./...` and `go vet ./...` clean.
 - Smoke env exercise: backup → sparkline updates; range pills
  swap; FK cascade verified by deleting a host and checking the
  history table.
@@ -1,42 +0,0 @@
 # Build a Linux container that runs the restic-manager agent against a
 # sibling rest-server in the e2e compose stack. Used only by tests
 # (e2e/compose.e2e.yml + .gitea/workflows/e2e.yml).
 #
 # Two stages:
 #   1. golang:alpine to build the agent binary.
 #   2. alpine:3.20 with the `restic` package + the built binary.
 #
 # Base images are pinned to explicit version tags (not digests) for CI
 # reproducibility; switch to digest pins if stricter guarantees are needed.
 FROM golang:1.25-alpine AS build
 WORKDIR /src
 ENV CGO_ENABLED=0 \
    GOFLAGS="-trimpath"
 COPY go.mod go.sum* ./
 RUN go mod download
 COPY . .
 ARG VERSION=e2e
 RUN go build -ldflags="-s -w -X gitea.dcglab.co.uk/steve/restic-manager/internal/version.Version=${VERSION}" \
        -o /out/restic-manager-agent ./cmd/agent
 FROM alpine:3.20
 RUN apk add --no-cache restic ca-certificates curl
 COPY --from=build /out/restic-manager-agent /usr/local/bin/restic-manager-agent
 # Agents normally run as root because backup paths often need it. The
 # e2e fixture only backs up paths under /source (the volume the compose
 # stack mounts for the agent), so this container would tolerate a
 # non-root user — but staying root keeps parity with the production install.
 USER root
 # The agent needs a writable directory for its config + secrets store.
 RUN mkdir -p /etc/restic-manager /var/lib/restic-manager-agent
 ENV RM_AGENT_CONFIG=/etc/restic-manager/agent.yaml
 # The compose entrypoint sets the announce URL via env.
 COPY e2e/agent-entrypoint.sh /usr/local/bin/entrypoint.sh
 RUN chmod +x /usr/local/bin/entrypoint.sh
 ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
@@ -1,21 +0,0 @@
 # Playwright runner for the e2e suite. Built and run by
 # e2e/compose.e2e.yml so the test process sits on the same docker
 # network as the server, agent, and rest-server. The previous setup
 # ran Playwright on the workflow runner host and reached the server
 # via 127.0.0.1:8080; that fails on Gitea's act-style runners
 # because the workflow steps execute inside a runner container,
 # not on the host where compose publishes its ports.
 FROM mcr.microsoft.com/playwright:v1.59.1-jammy
 WORKDIR /work
 # Install npm deps in a separate layer keyed off package.json so
 # changes to specs don't bust the dep cache.
 COPY e2e/playwright/package.json /work/package.json
 RUN npm install --no-audit --no-fund
 COPY e2e/playwright/ /work/
 ENV CI=1
 ENTRYPOINT ["npx", "playwright", "test"]
@@ -1,27 +0,0 @@
 #!/bin/sh
 # Entrypoint for the e2e agent container.
 #
 # Three states:
 #   1. Already enrolled (agent.yaml has an `agent_token:` line): run the agent.
 #   2. Token supplied via $RM_ENROL_TOKEN: enrol then run.
 #   3. Otherwise: announce against $RM_SERVER and wait for an admin to
 #      accept us. The announce flow blocks until accepted, then drops
 #      straight into the normal run loop, so this is the test-friendly
 #      path.
 set -eu
 CFG="${RM_AGENT_CONFIG:-/etc/restic-manager/agent.yaml}"
 SERVER="${RM_SERVER:?set RM_SERVER}"
 if [ -f "$CFG" ] && grep -q '^agent_token:' "$CFG"; then
    exec restic-manager-agent -config "$CFG"
 fi
 if [ -n "${RM_ENROL_TOKEN:-}" ]; then
    exec restic-manager-agent -config "$CFG" \
        -enroll-server "$SERVER" \
        -enroll-token "$RM_ENROL_TOKEN"
 fi
 # Announce-and-approve: blocks until an admin accepts, then runs.
 exec restic-manager-agent -config "$CFG" -enroll-server "$SERVER"
@@ -1,108 +0,0 @@
 # End-to-end test stack — used by .gitea/workflows/e2e.yml and by
 # operators who want to run the Playwright suite locally.
 #
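 # Three core services (the profile-gated `playwright` runner and the
 # one-shot `source-fixture` init container are documented inline below):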
 #   * server      — restic-manager built from the working tree
 #   * agent       — restic-manager agent built from the working tree
 #                   (announces; Playwright accepts it during the test)
 #   * rest-server — the actual restic backend, sibling of the agent
 #
 # Run from the repo root:
 #   docker compose -f e2e/compose.e2e.yml up --build --abort-on-container-exit
 services:
  rest-server:
    image: restic/rest-server:0.13.0
    environment:
      DATA_DIR: /data
      OPTIONS: "--no-auth"
    volumes:
      - rest-data:/data
    networks: [rmnet]
  server:
    build:
      context: ..
      dockerfile: deploy/Dockerfile.server
      args:
        VERSION: e2e
    environment:
      RM_LISTEN: ":8080"
      RM_DATA_DIR: "/data"
      RM_BASE_URL: "http://server:8080"
      RM_COOKIE_SECURE: "false"
      # Bind the metrics endpoint loose for the test, so one of the
      # Playwright assertions can exercise it.
      RM_METRICS_TRUSTED_CIDR: "0.0.0.0/0"
    volumes:
      - server-data:/data
    ports:
      - "127.0.0.1:8080:8080"
    healthcheck:
      test: ["CMD", "/usr/local/bin/restic-manager-server", "--version"]
      interval: 2s
      timeout: 2s
      retries: 30
    networks: [rmnet]
  agent:
    build:
      context: ..
      dockerfile: e2e/Dockerfile.agent
      args:
        VERSION: e2e
    environment:
      RM_SERVER: "http://server:8080"
    depends_on:
      - server
    volumes:
      # Source paths the agent backs up. Compose pre-populates this
      # with a few files so the snapshot list isn't empty.
      - source-data:/source
      - agent-config:/etc/restic-manager
      - agent-state:/var/lib/restic-manager-agent
    networks: [rmnet]
  # Playwright test runner. Profile-gated so `compose up` doesn't
  # start it; CI runs it via `compose run --rm playwright`. Lives on
  # rmnet so it can reach the server via its compose-network DNS
  # name rather than depending on host port-publish (which doesn't
  # work on Gitea's container-based runners).
  playwright:
    profiles: [test]
    build:
      context: ..
      dockerfile: e2e/Dockerfile.playwright
    environment:
      RM_BASE_URL: "http://server:8080"
      RM_BOOTSTRAP_TOKEN: "${RM_BOOTSTRAP_TOKEN:-}"
    volumes:
      - ./playwright/playwright-report:/work/playwright-report
      - ./playwright/test-results:/work/test-results
    depends_on:
      - server
      - agent
    networks: [rmnet]
  # One-shot init container that drops a couple of files into the
  # source volume so backups have something to snapshot.
  source-fixture:
    image: alpine:3.20
    command: >
      sh -c 'mkdir -p /source && echo "hello world" > /source/hello.txt &&
             echo "another file" > /source/two.txt && sleep 0.2'
    volumes:
      - source-data:/source
    networks: [rmnet]
    restart: "no"
 volumes:
  server-data:
  rest-data:
  source-data:
  agent-config:
  agent-state:
 networks:
  rmnet:
    driver: bridge
@@ -1,14 +0,0 @@
 {
  "name": "restic-manager-e2e",
  "version": "0.0.0",
  "private": true,
  "type": "module",
  "scripts": {
    "test": "playwright test",
    "test:headed": "playwright test --headed",
    "test:debug": "PWDEBUG=1 playwright test"
  },
  "devDependencies": {
    "@playwright/test": "1.59.1"
  }
 }
@@ -1,31 +0,0 @@
 import { defineConfig, devices } from '@playwright/test';
 // Single-target Chromium config: the e2e suite is narrow (smoke
 // the production-shaped flow against the docker-compose stack).
 // Cross-browser matrix doesn't add signal — what we're verifying is
 // the server's HTML and the agent's WebSocket handshake, neither of
 // which depends on browser engine.
 const baseURL = process.env.RM_BASE_URL ?? 'http://127.0.0.1:8080';
 // Playwright configuration for the e2e suite: one Chromium project,
 // run serially against `baseURL`.
 export default defineConfig({
    testDir: './tests',
    // Per-test budget (ms); `expect` assertions get their own 10 s budget.
    timeout: 60_000,
    expect: { timeout: 10_000 },
    // Serial execution: no intra-file parallelism and a single worker.
    fullyParallel: false,
    // One retry when the CI env var is set, none otherwise.
    retries: process.env.CI ? 1 : 0,
    workers: 1,
    // Console list output plus an HTML report that is never auto-opened.
    reporter: [['list'], ['html', { open: 'never' }]],
    use: {
        baseURL,
        // Keep trace and video only for failed tests; screenshot on failure.
        trace: 'retain-on-failure',
        screenshot: 'only-on-failure',
        video: 'retain-on-failure',
    },
    projects: [
        {
            name: 'chromium',
            use: { ...devices['Desktop Chrome'] },
        },
    ],
 });
@@ -1,114 +0,0 @@
 // Helpers used by every test. The shape favours the JSON API for
 // reads + accept/dispatch (deterministic, easy to assert) and the
 // browser for human-facing surfaces (login form, dashboard render).
 import { APIRequestContext, expect, Page } from '@playwright/test';
 export const baseURL = process.env.RM_BASE_URL ?? 'http://127.0.0.1:8080';
 export interface HostJSON {
    id: string;
    name: string;
    status: string;
    last_backup_status?: string;
 }
 /**
 * Returns the bootstrap token from the RM_BOOTSTRAP_TOKEN environment
 * variable. Throws when the variable is unset or empty.
 */
 export async function readBootstrapToken(): Promise<string> {
    const { RM_BOOTSTRAP_TOKEN: token } = process.env;
    if (token) {
        return token;
    }
    throw new Error('RM_BOOTSTRAP_TOKEN not set — the harness scrapes it from server logs');
 }
 /**
 * Creates the first admin account through POST /api/bootstrap using the
 * bootstrap token. A 409 response (already bootstrapped) is treated as
 * success. Returns the credentials that were submitted.
 */
 export async function bootstrapAdmin(
    request: APIRequestContext,
    {
        username = 'admin',
        password = 'e2e-test-password-1234',
    }: { username?: string; password?: string } = {},
 ): Promise<{ username: string; password: string }> {
    const token = await readBootstrapToken();
    const payload = { token, username, password };
    const res = await request.post(`${baseURL}/api/bootstrap`, { data: payload });
    if (res.ok() || res.status() === 409 /* already bootstrapped */) {
        return { username, password };
    }
    throw new Error(`bootstrap: ${res.status()} ${await res.text()}`);
 }
 /**
 * Logs in through the browser login form and waits until the page has
 * navigated to the site root (with or without a trailing slash).
 *
 * baseURL is escaped before being embedded in the RegExp: it contains
 * regex metacharacters (the dots in `127.0.0.1`, for example) that
 * would otherwise match arbitrary characters.
 */
 export async function loginViaUI(page: Page, username: string, password: string): Promise<void> {
    const escapedBase = baseURL.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    await page.goto(`${baseURL}/login`);
    await page.locator('#login-username').fill(username);
    await page.locator('#login-password').fill(password);
    // Start waiting for the navigation before clicking so a fast
    // redirect is not missed.
    await Promise.all([
        page.waitForURL(new RegExp(`^${escapedBase}/?$`)),
        page.locator('form[action="/login"] button[type="submit"]').click(),
    ]);
 }
 /**
 * Waits (up to 60 s) for the first inline "accept pending host" form
 * to become visible on the current page and returns the pending-host
 * id embedded in that form's action URL.
 */
 export async function waitForPendingHostID(page: Page): Promise<string> {
    const acceptForm = page
        .locator('form[action^="/api/pending-hosts/"][action$="/accept"]')
        .first();
    await expect(acceptForm).toBeVisible({ timeout: 60_000 });

    const action = await acceptForm.getAttribute('action');
    if (!action) {
        throw new Error('pending host form has no action attribute');
    }
    const match = /\/api\/pending-hosts\/([^/]+)\/accept/.exec(action);
    if (match === null) {
        throw new Error(`unexpected action URL: ${action}`);
    }
    return match[1];
 }
 /**
 * Accepts a pending host via POST /api/pending-hosts/{id}/accept,
 * sending the repository URL and credentials as JSON. An omitted repo
 * username is sent as an empty string. Throws on any non-OK response.
 */
 export async function acceptPending(
    request: APIRequestContext,
    cookie: string,
    pendingID: string,
    repo: { url: string; username?: string; password: string },
 ): Promise<void> {
    const endpoint = `${baseURL}/api/pending-hosts/${pendingID}/accept`;
    const body = {
        repo_url: repo.url,
        repo_username: repo.username ?? '',
        repo_password: repo.password,
    };
    const res = await request.post(endpoint, {
        headers: { cookie, 'content-type': 'application/json' },
        data: body,
    });
    if (res.ok()) {
        return;
    }
    throw new Error(`accept: ${res.status()} ${await res.text()}`);
 }
 /**
 * Fetches GET /api/hosts with the given session cookie and returns the
 * host list. Accepts either an `items` or a `hosts` array in the
 * response body and falls back to an empty list when neither is present.
 */
 export async function listHosts(request: APIRequestContext, cookie: string): Promise<HostJSON[]> {
    const res = await request.get(`${baseURL}/api/hosts`, { headers: { cookie } });
    if (!res.ok()) {
        throw new Error(`list hosts: ${res.status()} ${await res.text()}`);
    }
    const { items, hosts } = (await res.json()) as { items?: HostJSON[]; hosts?: HostJSON[] };
    return items ?? hosts ?? [];
 }
 /**
 * Polls the host list roughly once per second until a host satisfies
 * `matcher`, and returns that host. Throws after `timeoutMs`.
 *
 * The timeout error carries the complete host list from the last poll
 * (not just its first entry), because the matcher may target a
 * specific host that is not first in the list. The pause between polls
 * is clamped so the loop does not sleep past the deadline.
 */
 export async function waitForHostStatus(
    request: APIRequestContext,
    cookie: string,
    matcher: (h: HostJSON) => boolean,
    timeoutMs = 60_000,
 ): Promise<HostJSON> {
    const deadline = Date.now() + timeoutMs;
    let lastSeen: HostJSON[] = [];
    while (Date.now() < deadline) {
        lastSeen = await listHosts(request, cookie);
        const hit = lastSeen.find(matcher);
        if (hit) return hit;
        const pause = Math.min(1_000, Math.max(0, deadline - Date.now()));
        await new Promise((r) => setTimeout(r, pause));
    }
    throw new Error(`waitForHostStatus: timeout. Last seen: ${JSON.stringify(lastSeen)}`);
 }
 /**
 * Reads the `rm_session` cookie from the page's browser context and
 * returns it as a `name=value` string suitable for a Cookie header.
 * Throws when the cookie is absent.
 */
 export async function getSessionCookie(page: Page): Promise<string> {
    const jar = await page.context().cookies();
    const session = jar.find(({ name }) => name === 'rm_session');
    if (session === undefined) {
        throw new Error('rm_session cookie not set after login');
    }
    return `${session.name}=${session.value}`;
 }
@@ -1,83 +0,0 @@
 // End-to-end smoke: bootstrap → accept pending host → run backup → see succeeded.
 //
 // The compose stack stands up a server, a sibling rest-server, and an
 // agent in announce-and-approve mode. This test drives the operator
 // path through the UI (login + dashboard) and the API
 // (accept + run-now + poll for terminal) — UI for the human surfaces,
 // API for the deterministic ones.
 import { test, expect } from '@playwright/test';
 import {
    baseURL,
    bootstrapAdmin,
    loginViaUI,
    waitForPendingHostID,
    acceptPending,
    waitForHostStatus,
    getSessionCookie,
 } from './lib/server';
 // Full operator flow: bootstrap the admin, log in through the UI,
 // accept the announcing agent, wait for it to come online, trigger a
 // backup from the host page, and wait for the host record to report a
 // succeeded backup.
 // NOTE(review): the Playwright config sets a 60 s per-test timeout,
 // while the waits below allow up to 60 s + 60 s + 120 s — confirm the
 // inner deadlines are reachable or raise the test timeout.
 test.describe('smoke: enrol-via-announce → backup', () => {
    test('happy path completes in under a minute', async ({ page, request }) => {
        // bootstrapAdmin tolerates an already-bootstrapped server (409).
        const { username, password } = await bootstrapAdmin(request);
        await loginViaUI(page, username, password);
        // Dashboard renders.
        await expect(page.locator('main')).toContainText(/host|fleet|pending/i, { timeout: 10_000 });
        // Pending host appears (the agent container has been
        // announcing since startup).
        const pendingID = await waitForPendingHostID(page);
        // Reuse the browser session for the API calls below.
        const cookie = await getSessionCookie(page);
        // Accept with the rest-server creds. compose's rest-server runs
        // --no-auth, so any credentials work; restic still demands a
        // password to encrypt the repo.
        await acceptPending(request, cookie, pendingID, {
            url: 'rest:http://rest-server:8000/',
            password: 'e2e-repo-password',
        });
        // Wait for the host to come online + auto-init to land.
        const onlineHost = await waitForHostStatus(
            request, cookie,
            (h) => h.status === 'online',
            60_000,
        );
        expect(onlineHost.id).toBeTruthy();
        // Trigger a backup via the UI form-post (HX-Redirect to /jobs/{id}).
        await page.goto(`${baseURL}/hosts/${onlineHost.id}`);
        // Register the URL wait before clicking so the redirect is not missed.
        await Promise.all([
            page.waitForURL(/\/jobs\//),
            page.locator('form[action$="/run-backup"] button[type="submit"]').first().click(),
        ]);
        // Wait for the host's last_backup_status to flip to 'succeeded'.
        // The job page itself is harder to assert on (it uses
        // server-pushed updates and a reload-on-finish pattern); the
        // host record is the source of truth and is what the dashboard
        // surfaces.
        const finishedHost = await waitForHostStatus(
            request, cookie,
            (h) => h.id === onlineHost.id && h.last_backup_status === 'succeeded',
            120_000,
        );
        expect(finishedHost.last_backup_status).toBe('succeeded');
    });
 });
 test.describe('smoke: scrape /metrics', () => {
    // The /metrics endpoint is documented (RM_METRICS_TOKEN /
    // RM_METRICS_TRUSTED_CIDR, gauges rm_hosts_total / rm_build_info)
    // but not yet implemented in the server. Skipping until the
    // Prometheus exposition lands; tracked separately from this
    // e2e harness.
    test.skip('metrics endpoint exposes the host gauge', async ({ request }) => {
        // Unauthenticated GET: no cookie or token header is sent.
        const res = await request.get(`${baseURL}/metrics`);
        expect(res.status()).toBe(200);
        const body = await res.text();
        expect(body).toContain('rm_hosts_total');
        // The trailing '{' requires rm_build_info to be followed by a
        // label set — presumably version/build labels; confirm once
        // the endpoint exists.
        expect(body).toContain('rm_build_info{');
    });
 });
@@ -1,100 +0,0 @@
 // Package updater carries the agent's self-update logic.
 //
 // The flow is operator-driven: the server dispatches a command.update
 // WS envelope, the agent fetches a fresh binary from the server's
 // /agent/binary endpoint, atomic-renames it over the running binary
 // (Linux) or hands off to a detached helper script (Windows), and
 // exits cleanly so the service manager restarts under the new
 // binary. See docs/superpowers/specs/2026-05-06-p6-01-02-...
 //
 // Platform-specific code is build-tagged into updater_unix.go /
 // updater_windows.go. This file holds the shared HTTP fetch + path
 // helpers + the test seam.
 package updater
 import (
 	"context"
 	"fmt"
 	"io"
 	"net/http"
 	"os"
 	"path/filepath"
 	"runtime"
 	"time"
 )
 // fetch downloads the agent binary for the running GOOS/GOARCH from
 // serverURL's /agent/binary endpoint and stages it at binaryPath+".new".
 // The staged file is fsynced, closed, and chmodded to 0755 before the
 // path is returned.
 //
 // Any failure after the stage file has been created removes it again,
 // so a partial download is never left behind. A 200 response with an
 // empty body is rejected: swapping a zero-byte file over the running
 // binary would leave the agent unable to restart.
 //
 // Returned errors are wrapped with the step that failed; the
 // underlying error stays reachable through errors.Is / errors.As.
 func fetch(ctx context.Context, serverURL, binaryPath string) (string, error) {
 	url := fmt.Sprintf("%s/agent/binary?os=%s&arch=%s", serverURL, runtime.GOOS, runtime.GOARCH)
 	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
 	if err != nil {
 		return "", fmt.Errorf("agent binary fetch: build request: %w", err)
 	}
 	c := &http.Client{Timeout: 5 * time.Minute}
 	res, err := c.Do(req)
 	if err != nil {
 		return "", fmt.Errorf("agent binary fetch: %w", err)
 	}
 	defer func() { _ = res.Body.Close() }()
 	if res.StatusCode != http.StatusOK {
 		return "", fmt.Errorf("agent binary fetch: %s", res.Status)
 	}
 	stagePath := binaryPath + ".new"
 	f, err := os.OpenFile(stagePath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o755)
 	if err != nil {
 		return "", fmt.Errorf("agent binary fetch: create stage file: %w", err)
 	}
 	// discard closes and removes the partial stage file, then reports cause.
 	discard := func(cause error) (string, error) {
 		_ = f.Close()
 		_ = os.Remove(stagePath)
 		return "", cause
 	}
 	n, err := io.Copy(f, res.Body)
 	if err != nil {
 		return discard(fmt.Errorf("agent binary fetch: download: %w", err))
 	}
 	if n == 0 {
 		return discard(fmt.Errorf("agent binary fetch: server returned an empty binary"))
 	}
 	if err := f.Sync(); err != nil {
 		return discard(fmt.Errorf("agent binary fetch: sync stage file: %w", err))
 	}
 	if err := f.Close(); err != nil {
 		_ = os.Remove(stagePath)
 		return "", fmt.Errorf("agent binary fetch: close stage file: %w", err)
 	}
 	// OpenFile's mode only applies when the file is created; chmod
 	// covers the case where a stale .new already existed.
 	if err := os.Chmod(stagePath, 0o755); err != nil {
 		_ = os.Remove(stagePath)
 		return "", fmt.Errorf("agent binary fetch: chmod stage file: %w", err)
 	}
 	return stagePath, nil
 }
 // resolveOwnBinary reports the absolute path of the currently running
 // executable. The literal path /proc/self/exe is rejected with an
 // error, since that path is not something the updater can rename a
 // new binary over.
 func resolveOwnBinary() (string, error) {
 	exe, err := os.Executable()
 	if err == nil {
 		exe, err = filepath.Abs(exe)
 	}
 	switch {
 	case err != nil:
 		return "", err
 	case exe == "/proc/self/exe":
 		return "", fmt.Errorf("cannot resolve own binary path (/proc/self/exe)")
 	}
 	return exe, nil
 }
 // UpdateForTest runs the fetch and swap steps against an explicit
 // binaryPath and returns instead of exiting the process, so unit
 // tests can inspect the resulting files. The whole operation is
 // bounded by a five-minute context.
 func UpdateForTest(serverURL, binaryPath string) error {
 	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
 	defer cancel()
 	staged, fetchErr := fetch(ctx, serverURL, binaryPath)
 	if fetchErr == nil {
 		return swap(staged, binaryPath)
 	}
 	return fetchErr
 }
@@ -1,87 +0,0 @@
 //go:build !windows
 package updater
 import (
 	"bytes"
 	"io"
 	"net/http"
 	"net/http/httptest"
 	"os"
 	"path/filepath"
 	"runtime"
 	"testing"
 )
 // TestUpdate_LinuxAtomicSwap writes a stand-in "running binary", serves
 // replacement bytes from a fake /agent/binary endpoint, and checks that
 // after UpdateForTest the binary holds the new bytes, <bin>.old holds
 // the previous bytes, and <bin>.new no longer exists.
 func TestUpdate_LinuxAtomicSwap(t *testing.T) {
 	binPath := filepath.Join(t.TempDir(), "agent")
 	if err := os.WriteFile(binPath, []byte("OLD"), 0o755); err != nil {
 		t.Fatal(err)
 	}
 	payload := []byte("NEW BINARY CONTENTS")
 	handler := func(w http.ResponseWriter, r *http.Request) {
 		if r.URL.Path != "/agent/binary" {
 			http.NotFound(w, r)
 			return
 		}
 		query := r.URL.Query()
 		gotOS, gotArch := query.Get("os"), query.Get("arch")
 		if gotOS != runtime.GOOS || gotArch != runtime.GOARCH {
 			t.Errorf("query mismatch: got os=%s arch=%s want %s/%s",
 				gotOS, gotArch, runtime.GOOS, runtime.GOARCH)
 		}
 		_, _ = io.Copy(w, bytes.NewReader(payload))
 	}
 	srv := httptest.NewServer(http.HandlerFunc(handler))
 	defer srv.Close()

 	if err := UpdateForTest(srv.URL, binPath); err != nil {
 		t.Fatalf("update: %v", err)
 	}

 	current, err := os.ReadFile(binPath)
 	if err != nil {
 		t.Fatal(err)
 	}
 	if string(current) != string(payload) {
 		t.Fatalf("binary contents: got %q want %q", current, payload)
 	}

 	previous, err := os.ReadFile(binPath + ".old")
 	if err != nil {
 		t.Fatalf("agent.old missing: %v", err)
 	}
 	if string(previous) != "OLD" {
 		t.Fatalf("agent.old contents: got %q want %q", previous, "OLD")
 	}

 	_, statErr := os.Stat(binPath + ".new")
 	if !os.IsNotExist(statErr) {
 		t.Fatalf("agent.new should be absent after swap, got err=%v", statErr)
 	}
 }
 // TestUpdate_FetchHTTPError checks that a non-200 response from the
 // binary endpoint makes UpdateForTest fail without touching the
 // existing binary and without leaving a staged .new file behind.
 func TestUpdate_FetchHTTPError(t *testing.T) {
 	tmp := t.TempDir()
 	binPath := filepath.Join(tmp, "agent")
 	if err := os.WriteFile(binPath, []byte("OLD"), 0o755); err != nil {
 		t.Fatal(err)
 	}
 	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		http.Error(w, `{"error":"binary_not_published"}`, http.StatusNotFound)
 	}))
 	defer srv.Close()
 	err := UpdateForTest(srv.URL, binPath)
 	if err == nil {
 		t.Fatal("expected error, got nil")
 	}
 	// A read failure here would otherwise surface as a confusing
 	// contents mismatch, so report it directly.
 	got, readErr := os.ReadFile(binPath)
 	if readErr != nil {
 		t.Fatalf("binary should still be readable: %v", readErr)
 	}
 	if string(got) != "OLD" {
 		t.Fatalf("binary should not have changed, got %q", got)
 	}
 	if _, statErr := os.Stat(binPath + ".new"); !os.IsNotExist(statErr) {
 		t.Fatalf("agent.new should not exist after a failed fetch, got err=%v", statErr)
 	}
 }
@@ -1,73 +0,0 @@
 //go:build !windows
 package updater
 import (
 	"context"
 	"fmt"
 	"io"
 	"log/slog"
 	"os"
 	"time"
 )
 // Update downloads the replacement binary, swaps it over the running
 // one, and then terminates the process with exit code 0 so the service
 // manager (systemd) starts the new binary. On success it never
 // returns; an error is returned only when resolving, fetching, or
 // swapping fails, in which case the process keeps running.
 //
 // Callers should close the WS connection cleanly before invoking so
 // the server marks the host disconnected immediately rather than
 // waiting for the heartbeat sweep.
 //
 // Service-user assumption: the agent runs as root under the shipped
 // systemd unit and can therefore write its own binary path. A
 // non-root service user would need a setuid helper or an
 // out-of-process update service instead.
 func Update(ctx context.Context, serverURL string) error {
 	self, err := resolveOwnBinary()
 	if err != nil {
 		return err
 	}
 	staged, err := fetch(ctx, serverURL, self)
 	if err != nil {
 		return err
 	}
 	if swapErr := swap(staged, self); swapErr != nil {
 		return swapErr
 	}
 	slog.Info("agent self-update: binary swapped, exiting for systemd restart",
 		"binary", self)
 	// Brief grace period so the logger and the WS close frame can flush.
 	const flushGrace = 200 * time.Millisecond
 	time.Sleep(flushGrace)
 	os.Exit(0)
 	return nil // unreachable
 }
 // swap copies the running binary to <bin>.old (M1 — keep one revision
 // back for hand-rolled rollback), then atomic-renames the staged
 // binary into place. Linux supports rename-while-open so this works
 // even though the running process holds the source open.
 //
 // Every failure is wrapped with the step that failed. If writing the
 // .old copy fails part-way, the truncated file is removed so it cannot
 // be mistaken for a usable rollback copy. The running binary is only
 // replaced by the final rename; on any earlier error it is untouched.
 func swap(stagePath, binPath string) error {
 	src, err := os.Open(binPath)
 	if err != nil {
 		return fmt.Errorf("open running binary: %w", err)
 	}
 	defer func() { _ = src.Close() }()
 	oldPath := binPath + ".old"
 	dst, err := os.OpenFile(oldPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o755)
 	if err != nil {
 		return fmt.Errorf("open .old: %w", err)
 	}
 	// fail closes and removes the incomplete .old copy, then wraps cause.
 	fail := func(step string, cause error) error {
 		_ = dst.Close()
 		_ = os.Remove(oldPath)
 		return fmt.Errorf("%s: %w", step, cause)
 	}
 	if _, err := io.Copy(dst, src); err != nil {
 		return fail("copy to .old", err)
 	}
 	if err := dst.Sync(); err != nil {
 		return fail("sync .old", err)
 	}
 	if err := dst.Close(); err != nil {
 		_ = os.Remove(oldPath)
 		return fmt.Errorf("close .old: %w", err)
 	}
 	if err := os.Rename(stagePath, binPath); err != nil {
 		return fmt.Errorf("rename .new over running binary: %w", err)
 	}
 	return nil
 }
@@ -1,73 +0,0 @@
 //go:build windows
 package updater
 import (
 	"context"
 	"fmt"
 	"log/slog"
 	"os"
 	"os/exec"
 	"path/filepath"
 	"syscall"
 	"time"
 )
 // helperScript is the batch-file template for the detached Windows
 // update helper. It is rendered with fmt.Sprintf, args order:
 //
 //	%[1]s — running binary path (source for the .old copy)
 //	%[2]s — .old path
 //	%[3]s — staged .new path
 //	%[4]s — running binary path (rename target)
 //
 // Script steps, in order: wait, copy the running binary to .old, stop
 // the restic-manager-agent service, loop until `sc query` reports
 // STOPPED, move the staged binary over the running one, start the
 // service, and delete the script itself. `%%~f0` is a Sprintf-escaped
 // `%~f0`, i.e. the script's own path.
 //
 // NOTE(review): paths are substituted verbatim inside double quotes; a
 // path containing `%` or `"` would presumably be mis-parsed by cmd.exe
 // — confirm install paths can never contain those characters.
 // NOTE(review): `timeout` is reported to exit immediately when stdin
 // is not a console; verify the waits actually pause when the helper
 // runs detached.
 const helperScript = `@echo off
 timeout /t 3 /nobreak >nul
 copy /Y "%[1]s" "%[2]s"
 sc stop restic-manager-agent
 :wait
 sc query restic-manager-agent | find "STOPPED" >nul
 if errorlevel 1 (timeout /t 1 /nobreak >nul & goto wait)
 move /Y "%[3]s" "%[4]s"
 sc start restic-manager-agent
 del "%%~f0"
 `
 // Update is the Windows self-update entry point. The running .exe
 // cannot be overwritten in-process (exclusive file lock), so this
 // stages the new binary, writes agent-update.cmd next to the running
 // binary, launches it detached and windowless via cmd.exe, and exits
 // with code 0. The helper script then waits, stops the service, swaps
 // the binary, and starts the service again. SCM treats clean exits
 // after sc stop as intentional and does not auto-restart; the helper's
 // final sc start handles that.
 //
 // On success this never returns. An error is returned when resolving
 // the binary path, fetching, writing the helper, or starting it fails.
 func Update(ctx context.Context, serverURL string) error {
 	// Win32 process-creation flags.
 	const (
 		detachedProcess = 0x00000008 // DETACHED_PROCESS
 		createNoWindow  = 0x08000000 // CREATE_NO_WINDOW
 	)
 	self, err := resolveOwnBinary()
 	if err != nil {
 		return err
 	}
 	staged, err := fetch(ctx, serverURL, self)
 	if err != nil {
 		return err
 	}
 	script := fmt.Sprintf(helperScript, self, self+".old", staged, self)
 	helperPath := filepath.Join(filepath.Dir(self), "agent-update.cmd")
 	if writeErr := os.WriteFile(helperPath, []byte(script), 0o755); writeErr != nil {
 		return writeErr
 	}
 	helper := exec.Command("cmd.exe", "/c", helperPath)
 	helper.SysProcAttr = &syscall.SysProcAttr{
 		HideWindow:    true,
 		CreationFlags: detachedProcess | createNoWindow,
 	}
 	if startErr := helper.Start(); startErr != nil {
 		return startErr
 	}
 	slog.Info("agent self-update: helper spawned, exiting cleanly",
 		"binary", self, "helper", helperPath)
 	// Brief grace period before exiting, matching the Unix updater.
 	time.Sleep(200 * time.Millisecond)
 	os.Exit(0)
 	return nil // unreachable
 }
 // swap exists on Windows only so that UpdateForTest compiles; the
 // real swap is performed by the helper script. It always returns an
 // error.
 func swap(_, _ string) error {
 	const msg = "updater.swap not implemented on Windows; use the helper script via Update"
 	return fmt.Errorf(msg)
 }
@@ -1,63 +0,0 @@
 package alert
 import (
 	"context"
 	"fmt"
 	"log/slog"
 	"time"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/notification"
 )
 // Alert-kind constants for P6 self-update flows.
 const (
 	// KindUpdateFailed is raised when an agent fails to come back with
 	// the expected version after a command.update dispatch (timeout or
 	// version-mismatch). Resolved by a subsequent matching hello.
 	KindUpdateFailed = "update_failed"
 	// KindFleetUpdateHalted is raised when the fleet-update worker
 	// stops mid-run because a host failed to update or went offline.
 	// Host-less alert (system-scoped). Manually resolved by an admin.
 	KindFleetUpdateHalted = "fleet_update_halted"
 )
 // RaiseUpdateFailed raises (or touches) a warning-severity
 // update_failed alert for hostID. The host ID doubles as the dedup
 // key, so a re-dispatch on the same host reuses the existing alert
 // rather than creating a duplicate.
 func (e *Engine) RaiseUpdateFailed(ctx context.Context, hostID, jobID, reason string, when time.Time) {
 	const severity = "warning"
 	dedupKey := hostID
 	e.raiseAndNotify(ctx, hostID, KindUpdateFailed, dedupKey, severity,
 		fmt.Sprintf("Agent update failed (job %s): %s", jobID, reason), when)
 }
 // ResolveUpdateFailed resolves the update_failed alert keyed by
 // hostID. It is intended for the WS hello path, once the agent has
 // reconnected reporting the target version.
 func (e *Engine) ResolveUpdateFailed(ctx context.Context, hostID string, when time.Time) {
 	dedupKey := hostID // same key RaiseUpdateFailed uses
 	e.resolveAndNotify(ctx, hostID, KindUpdateFailed, dedupKey, when)
 }
 // RaiseFleetUpdateHalted records a system-scoped (host-less) warning
 // alert for a halted fleet update, deduplicated on fleetUpdateID. It
 // uses the store's host-less raise path so no host is attached to the
 // alert row. A notification is dispatched asynchronously only when
 // the store reports a fresh raise; store errors are logged and
 // swallowed.
 func (e *Engine) RaiseFleetUpdateHalted(ctx context.Context, fleetUpdateID, reason string, when time.Time) {
 	const severity = "warning"
 	msg := fmt.Sprintf("Fleet update %s halted: %s", fleetUpdateID, reason)
 	id, didRaise, err := e.store.RaiseOrTouchSystem(ctx, KindFleetUpdateHalted, fleetUpdateID, severity, msg, when)
 	switch {
 	case err != nil:
 		slog.Warn("alert: raise fleet_update_halted", "fu_id", fleetUpdateID, "err", err)
 		return
 	case !didRaise:
 		// An existing alert was touched; no second notification.
 		return
 	}
 	payload := notification.Payload{
 		Event:    notification.EventRaised,
 		AlertID:  id,
 		Severity: severity,
 		Kind:     KindFleetUpdateHalted,
 		HostID:   "",
 		HostName: "",
 		Message:  msg,
 		RaisedAt: when,
 	}
 	go e.hub.Dispatch(ctx, payload)
 }
@@ -63,7 +63,6 @@ const (
 	JobUnlock  JobKind = "unlock"
 	JobRestore JobKind = "restore"
 	JobDiff    JobKind = "diff"
 	JobUpdate  JobKind = "update"
 )
 // JobStatus is the lifecycle state of a job.
@@ -362,14 +361,13 @@ type ConfigUpdatePayload struct {
 	BandwidthDownKBps *int `json:"bandwidth_down_kbps,omitempty"`
 }
-// CommandUpdatePayload carries no operational data — the agent
+// AgentUpdateAvailablePayload — informational only; the agent does
-// already knows its own os/arch and fetches from its configured
+// NOT self-update. See spec.md §4.2 for the package-manager-based
-// server URL via /agent/binary. JobID is the server-issued id of
+// update model.
-// the update job; the agent echoes it on log.stream lines so the
+type AgentUpdateAvailablePayload struct {
-// live job log captures pre-restart progress, then either exits
+	LatestVersion string `json:"latest_version"`
-// (Linux) or hands off to a detached helper script (Windows).
+	PackageURL    string `json:"package_url"` // apt repo / choco source
-type CommandUpdatePayload struct {
+	Changelog     string `json:"changelog,omitempty"`
 	JobID string `json:"job_id"`
 }
 // TreeListRequestPayload is the body of a tree.list RPC. Used by the
@@ -29,12 +29,12 @@ const (
 // Server → agent message types.
 const (
-	MsgCommandRun    MessageType = "command.run"
+	MsgCommandRun       MessageType = "command.run"
-	MsgCommandCancel MessageType = "command.cancel"
+	MsgCommandCancel    MessageType = "command.cancel"
-	MsgScheduleSet   MessageType = "schedule.set"
+	MsgScheduleSet      MessageType = "schedule.set"
-	MsgConfigUpdate  MessageType = "config.update"
+	MsgConfigUpdate     MessageType = "config.update"
-	MsgCommandUpdate MessageType = "command.update"
+	MsgAgentUpdateAvail MessageType = "agent.update.available"
-	MsgTreeList      MessageType = "tree.list" // sync RPC: list a snapshot's children
+	MsgTreeList         MessageType = "tree.list" // sync RPC: list a snapshot's children
 )
 // Envelope is the framing for every WS message in either direction.
@@ -1,221 +0,0 @@
 // Package fleetupdate drives a rolling, sequential agent self-update
 // over a list of hosts. One worker goroutine per Start() call (gated
 // at the store layer to at-most-one-running-fleet-update).
 package fleetupdate
 import (
 	"context"
 	"errors"
 	"fmt"
 	"log/slog"
 	"time"
 	"github.com/oklog/ulid/v2"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
 )
 // Hub is the slim "is this host connected?" surface.
 type Hub interface {
 	Connected(hostID string) bool
 }
 // Dispatcher sends one command.update envelope. The implementer also
 // creates the jobs row, writes audit, and registers with the update
 // watcher. Pre-checks are the dispatcher's responsibility — the worker
 // passes through whatever error it returns.
 type Dispatcher interface {
 	DispatchUpdate(ctx context.Context, hostID string, actorUserID string) (jobID string, code string, err error)
 }
 // AlertRaiser is the slim view of the alert engine's host-less raise
 // path. Used to emit fleet_update_halted on first failure.
 type AlertRaiser interface {
 	RaiseFleetUpdateHalted(ctx context.Context, fleetUpdateID, reason string, when time.Time)
 }
 // Worker is the long-lived fleet-update orchestrator. There is at most
 // one *running* fleet update at a time (enforced by the store).
 type Worker struct {
 	store  *store.Store
 	hub    Hub
 	disp   Dispatcher
 	alerts AlertRaiser
 	// targetVersion is the version every dispatched agent is expected
 	// to come back with. Captured at Start time to avoid drift.
 	targetVersion string
 	// pollPeriod controls the cadence at which the worker re-reads the
 	// host row to check for the version transition. Exposed for tests.
 	pollPeriod time.Duration
 	// hostTimeout bounds how long the worker waits for one host to
 	// reach the target version before halting.
 	hostTimeout time.Duration
 }
 // NewWorker returns a Worker wired to the given collaborators with
 // the default timing: a 1 s poll period and a 95 s per-host timeout.
 // No goroutine is started here; targetVersion is supplied per Start
 // call.
 func NewWorker(st *store.Store, hub Hub, disp Dispatcher, alerts AlertRaiser) *Worker {
 	w := new(Worker)
 	w.store, w.hub, w.disp, w.alerts = st, hub, disp, alerts
 	w.pollPeriod = time.Second
 	w.hostTimeout = 95 * time.Second
 	return w
 }
 // Start validates its arguments, persists a new "running" fleet
 // update together with its host list, and launches the background
 // loop that walks those hosts. It returns the new fleet-update ID.
 // Errors from CreateFleetUpdate (including
 // store.ErrFleetUpdateRunning) are returned unchanged.
 func (w *Worker) Start(ctx context.Context, userID, targetVersion string, hostIDs []string) (string, error) {
 	switch {
 	case userID == "" || targetVersion == "":
 		return "", errors.New("fleetupdate: userID and targetVersion required")
 	case len(hostIDs) == 0:
 		return "", errors.New("fleetupdate: at least one host required")
 	}
 	parent := store.FleetUpdate{
 		ID:              ulid.Make().String(),
 		StartedAt:       time.Now().UTC(),
 		StartedByUserID: userID,
 		TargetVersion:   targetVersion,
 		Status:          "running",
 	}
 	if err := w.store.CreateFleetUpdate(ctx, parent, hostIDs); err != nil {
 		return "", err
 	}
 	// The loop outlives the request that started it, so it gets a
 	// context detached from the caller's cancellation.
 	go w.run(context.WithoutCancel(ctx), parent.ID, userID, targetVersion)
 	return parent.ID, nil
 }
 // Cancel records the fleet update as cancelled in the store, stamped
 // with the current UTC time. The background loop notices the status
 // change at its next check and stops dispatching; a job that is
 // already in flight is left to finish on its own (agent-side
 // cancellation is out of scope for v1).
 func (w *Worker) Cancel(ctx context.Context, fuID string) error {
 	cancelledAt := time.Now().UTC()
 	return w.store.CancelFleetUpdate(ctx, fuID, cancelledAt)
 }
 // run is the per-host loop. Halts on first failure; emits one alert
 // on transition.
 //
 // Each iteration re-reads the active fleet update, exits if it is no
 // longer fuID, marks the update complete when no pending hosts remain,
 // and otherwise hands the first pending host to processHost. Store
 // read errors are logged and end the loop.
 func (w *Worker) run(ctx context.Context, fuID, userID, targetVersion string) {
 	// Stored on the Worker so processHost can compare against it.
 	// NOTE(review): this writes a shared Worker field from the
 	// goroutine; it appears to rely on the store allowing only one
 	// running fleet update at a time — confirm.
 	w.targetVersion = targetVersion
 	for {
 		// Check the parent row's status — picks up Cancel.
 		fu, err := w.store.ActiveFleetUpdate(ctx)
 		if err != nil {
 			slog.Warn("fleetupdate: read active", "fu_id", fuID, "err", err)
 			return
 		}
 		if fu == nil || fu.ID != fuID {
 			// Cancelled, halted, or completed externally. Done.
 			return
 		}
 		pending, err := w.store.ListPendingFleetUpdateHosts(ctx, fuID)
 		if err != nil {
 			slog.Warn("fleetupdate: list pending", "fu_id", fuID, "err", err)
 			return
 		}
 		if len(pending) == 0 {
 			// Nothing left to process: mark the whole update complete.
 			now := time.Now().UTC()
 			if err := w.store.CompleteFleetUpdate(ctx, fuID, now); err != nil {
 				slog.Warn("fleetupdate: complete", "fu_id", fuID, "err", err)
 			}
 			return
 		}
 		// Strictly sequential: one host per iteration.
 		// NOTE(review): processHost discards its status-write errors;
 		// if such a write keeps failing the slot presumably stays
 		// pending and is picked again here — confirm this cannot spin.
 		next := pending[0]
 		w.processHost(ctx, fuID, userID, next)
 	}
 }
 // processHost handles one host slot. Marks it skipped, succeeded, or
 // failed (and halts the fleet on failure).
 //
 // Outcomes, in order of evaluation:
 //   - host lookup fails or returns nil → "skipped"
 //   - host already reports the target version → "skipped"
 //   - host not connected → "failed" + halt
 //   - dispatch returns an error or a non-empty code → "failed" + halt
 //   - host reaches the target version before hostTimeout → "succeeded"
 //   - fleet update stops being active while polling → slot left "running"
 //   - timeout → "failed" + halt
 //
 // Errors from most status writes are deliberately ignored (best effort).
 func (w *Worker) processHost(ctx context.Context, fuID, userID string, slot store.FleetUpdateHost) {
 	hostID := slot.HostID
 	_ = w.store.SetFleetUpdateCurrentHost(ctx, fuID, hostID)
 	// Pre-flight: re-read the host. The dispatch path repeats most of
 	// these checks but doing them up-front lets us emit the right
 	// per-host status (skipped vs failed) without consuming a job row.
 	host, err := w.store.GetHost(ctx, hostID)
 	if err != nil || host == nil {
 		// NOTE(review): a store error is also reported as "host not
 		// found" here — confirm that is intended.
 		_ = w.store.SetFleetUpdateHostStatus(ctx, fuID, hostID, "skipped", "host not found", "")
 		return
 	}
 	if host.AgentVersion != "" && host.AgentVersion == w.targetVersion {
 		_ = w.store.SetFleetUpdateHostStatus(ctx, fuID, hostID, "skipped", "already at target version", "")
 		return
 	}
 	if !w.hub.Connected(hostID) {
 		reason := fmt.Sprintf("host went offline: %s", hostID)
 		_ = w.store.SetFleetUpdateHostStatus(ctx, fuID, hostID, "failed", reason, "")
 		w.halt(ctx, fuID, reason)
 		return
 	}
 	// Dispatch.
 	_ = w.store.SetFleetUpdateHostStatus(ctx, fuID, hostID, "running", "", "")
 	jobID, code, err := w.disp.DispatchUpdate(ctx, hostID, userID)
 	if err != nil || code != "" {
 		// Either a Go error or a non-empty code counts as failure.
 		reason := dispatchErrorReason(code, err)
 		_ = w.store.SetFleetUpdateHostStatus(ctx, fuID, hostID, "failed", reason, jobID)
 		w.halt(ctx, fuID, reason)
 		return
 	}
 	// Poll until the host's recorded agent_version matches target, or
 	// timeout.
 	deadline := time.Now().Add(w.hostTimeout)
 	for time.Now().Before(deadline) {
 		// Honour cancellation between polls.
 		fu, err := w.store.ActiveFleetUpdate(ctx)
 		if err == nil && (fu == nil || fu.ID != fuID) {
 			// Cancelled mid-host; leave the slot in 'running' for the
 			// admin to inspect. No further dispatches.
 			return
 		}
 		// Plain sleep: this wait does not observe ctx.
 		time.Sleep(w.pollPeriod)
 		h, err := w.store.GetHost(ctx, hostID)
 		// Read errors are treated as "not there yet" and retried.
 		if err == nil && h != nil && h.AgentVersion == w.targetVersion {
 			if err := w.store.SetFleetUpdateHostStatus(ctx, fuID, hostID, "succeeded", "", jobID); err != nil {
 				slog.Warn("fleetupdate: set succeeded", "fu_id", fuID, "host_id", hostID, "err", err)
 			}
 			return
 		}
 	}
 	reason := fmt.Sprintf("timeout waiting for %s to reach %s", hostID, w.targetVersion)
 	_ = w.store.SetFleetUpdateHostStatus(ctx, fuID, hostID, "failed", reason, jobID)
 	w.halt(ctx, fuID, reason)
 }
 // halt marks the fleet update as halted with reason and, when an
 // alert raiser is configured, raises the fleet_update_halted alert
 // with the same timestamp. A store failure is logged and does not
 // prevent the alert.
 func (w *Worker) halt(ctx context.Context, fuID, reason string) {
 	haltedAt := time.Now().UTC()
 	err := w.store.HaltFleetUpdate(ctx, fuID, reason, haltedAt)
 	if err != nil {
 		slog.Warn("fleetupdate: halt", "fu_id", fuID, "err", err)
 	}
 	if w.alerts == nil {
 		return
 	}
 	w.alerts.RaiseFleetUpdateHalted(ctx, fuID, reason, haltedAt)
 }
 // dispatchErrorReason turns a dispatch failure into the reason string
 // stored on the host slot. A non-empty code takes precedence over
 // err; with neither present it returns a generic message.
 func dispatchErrorReason(code string, err error) string {
 	switch {
 	case code != "":
 		return "dispatch failed: " + code
 	case err != nil:
 		return err.Error()
 	default:
 		return "dispatch failed"
 	}
 }
@@ -1,344 +0,0 @@
 package fleetupdate
 import (
 	"context"
 	"errors"
 	"path/filepath"
 	"sync"
 	"testing"
 	"time"
 	"github.com/oklog/ulid/v2"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
 )
 // fakeHub is a test double for the worker's hub dependency: it answers
 // connectivity queries from a fixed map instead of live connections.
 type fakeHub struct {
 	mu     sync.Mutex // guards online
 	online map[string]bool // host ID → whether Connected reports true
 }
 // Connected reports whether hostID is marked online in the fake's map.
 // Hosts absent from the map are reported as not connected.
 func (f *fakeHub) Connected(hostID string) bool {
 	f.mu.Lock()
 	up := f.online[hostID]
 	f.mu.Unlock()
 	return up
 }
 // fakeDispatcher is a test double for the worker's dispatcher. It
 // records every host it was asked to update and can either return a
 // canned error code or simulate the agent reporting a new version.
 type fakeDispatcher struct {
 	mu    sync.Mutex // guards the fields below while DispatchUpdate reads them
 	calls []string // host IDs
 	// after dispatch, set the host's agent_version to this on the
 	// store so the worker observes the version transition.
 	st         *store.Store
 	target     string
 	delayMS    int // milliseconds to wait before the simulated version change
 	failOnHost map[string]string // host → error code
 }
 // DispatchUpdate records the call and then behaves according to the
 // fake's configuration: hosts listed in failOnHost get their canned code
 // back with no job ID; otherwise a job row is created (when a store is
 // set) and, if a target version is configured, a background goroutine
 // marks the host as having reported that version after delayMS.
 func (f *fakeDispatcher) DispatchUpdate(ctx context.Context, hostID, _ string) (string, string, error) {
 	f.mu.Lock()
 	f.calls = append(f.calls, hostID)
 	failCode, shouldFail := f.failOnHost[hostID]
 	db, newVersion, delayMS := f.st, f.target, f.delayMS
 	f.mu.Unlock()
 	if shouldFail {
 		return "", failCode, nil
 	}
 	jobID := ulid.Make().String()
 	if db == nil {
 		return jobID, "", nil
 	}
 	// Best-effort seed of the job row; the fake ignores store errors.
 	_ = db.CreateJob(context.Background(), store.Job{
 		ID:        jobID,
 		HostID:    hostID,
 		Kind:      "update",
 		ActorKind: "user",
 		CreatedAt: time.Now().UTC(),
 	})
 	if newVersion == "" {
 		return jobID, "", nil
 	}
 	go func() {
 		if delayMS > 0 {
 			time.Sleep(time.Duration(delayMS) * time.Millisecond)
 		}
 		_ = db.MarkHostHello(context.Background(), hostID, newVersion, "0.17", api.CurrentProtocolVersion, time.Now().UTC())
 	}()
 	return jobID, "", nil
 }
 // recAlert is a recording alert sink for tests: it keeps the reason of
 // every halted alert it receives.
 type recAlert struct {
 	mu      sync.Mutex // guards reasons
 	reasons []string // one entry per RaiseFleetUpdateHalted call, in call order
 }
 // RaiseFleetUpdateHalted appends reason to the recorded list; the
 // context, fleet update ID and timestamp are ignored.
 func (r *recAlert) RaiseFleetUpdateHalted(_ context.Context, _ string, reason string, _ time.Time) {
 	r.mu.Lock()
 	defer r.mu.Unlock()
 	r.reasons = append(r.reasons, reason)
 }
 // openStore opens a fresh store backed by a database file in the test's
 // temporary directory and registers its Close as a cleanup. It fails the
 // test immediately if the store cannot be opened.
 func openStore(t *testing.T) *store.Store {
 	t.Helper()
 	dbPath := filepath.Join(t.TempDir(), "rm.db")
 	db, err := store.Open(context.Background(), dbPath)
 	if err != nil {
 		t.Fatalf("open: %v", err)
 	}
 	t.Cleanup(func() {
 		_ = db.Close()
 	})
 	return db
 }
 // mustCreateAdmin inserts an admin user with a freshly generated ID and
 // a username derived from that ID, and returns the ID. It fails the test
 // if the insert fails.
 func mustCreateAdmin(t *testing.T, st *store.Store) string {
 	t.Helper()
 	adminID := ulid.Make().String()
 	admin := store.User{
 		ID:           adminID,
 		Username:     "u-" + adminID[:6],
 		PasswordHash: "x",
 		Role:         store.RoleAdmin,
 		CreatedAt:    time.Now().UTC(),
 	}
 	if err := st.CreateUser(context.Background(), admin); err != nil {
 		t.Fatalf("user: %v", err)
 	}
 	return adminID
 }
 // mustCreateHost inserts a linux/amd64 host with the given name and
 // returns its generated ID. When version is non-empty the host is also
 // marked as having said hello with that agent version. Any store error
 // fails the test.
 func mustCreateHost(t *testing.T, st *store.Store, name, version string) string {
 	t.Helper()
 	ctx := context.Background()
 	hostID := ulid.Make().String()
 	host := store.Host{
 		ID:         hostID,
 		Name:       name,
 		OS:         "linux",
 		Arch:       "amd64",
 		EnrolledAt: time.Now().UTC(),
 	}
 	if err := st.CreateHost(ctx, host, "deadbeef-"+hostID, ""); err != nil {
 		t.Fatalf("host: %v", err)
 	}
 	if version == "" {
 		return hostID
 	}
 	err := st.MarkHostHello(ctx, hostID, version, "0.17", api.CurrentProtocolVersion, time.Now().UTC())
 	if err != nil {
 		t.Fatalf("hello: %v", err)
 	}
 	return hostID
 }
 // waitForStatus polls the store every 20ms until the fleet update fuID
 // reports status want, and returns that row. Lookup errors are treated
 // as "not yet". The test fails if timeout elapses first.
 func waitForStatus(t *testing.T, st *store.Store, fuID, want string, timeout time.Duration) *store.FleetUpdate {
 	t.Helper()
 	const pollEvery = 20 * time.Millisecond
 	for stopAt := time.Now().Add(timeout); time.Now().Before(stopAt); time.Sleep(pollEvery) {
 		fu, _, err := st.GetFleetUpdate(context.Background(), fuID)
 		if err != nil || fu == nil || fu.Status != want {
 			continue
 		}
 		return fu
 	}
 	t.Fatalf("status never reached %q", want)
 	return nil
 }
 // TestWorkerTwoHostsBothSucceed rolls two online, out-of-date hosts
 // whose fake agents report the target version shortly after dispatch.
 // The fleet update must complete with both hosts succeeded and without
 // any halt alert.
 func TestWorkerTwoHostsBothSucceed(t *testing.T) {
 	st := openStore(t)
 	uid := mustCreateAdmin(t, st)
 	h1 := mustCreateHost(t, st, "h1", "v0")
 	h2 := mustCreateHost(t, st, "h2", "v0")
 	hub := &fakeHub{online: map[string]bool{h1: true, h2: true}}
 	disp := &fakeDispatcher{st: st, target: "v2", delayMS: 30}
 	alerts := &recAlert{}
 	w := NewWorker(st, hub, disp, alerts)
 	w.pollPeriod = 20 * time.Millisecond
 	w.hostTimeout = 2 * time.Second
 	fuID, err := w.Start(context.Background(), uid, "v2", []string{h1, h2})
 	if err != nil {
 		t.Fatalf("start: %v", err)
 	}
 	waitForStatus(t, st, fuID, "completed", 5*time.Second)
 	_, hosts, err := st.GetFleetUpdate(context.Background(), fuID)
 	if err != nil {
 		t.Fatalf("get fleet update: %v", err)
 	}
 	// Guard against a vacuous pass: with no rows the loop below would
 	// assert nothing.
 	if len(hosts) != 2 {
 		t.Fatalf("host rows: got %d want 2", len(hosts))
 	}
 	for _, h := range hosts {
 		if h.Status != "succeeded" {
 			t.Errorf("host %s status %q want succeeded", h.HostID, h.Status)
 		}
 	}
 	// reasons is written through RaiseFleetUpdateHalted under alerts.mu,
 	// so read it under the same lock.
 	alerts.mu.Lock()
 	defer alerts.mu.Unlock()
 	if n := len(alerts.reasons); n != 0 {
 		t.Errorf("unexpected halt alert: %v", alerts.reasons)
 	}
 }
 // TestWorkerSecondHostTimesOutHalts rolls three hosts where the second
 // one accepts the dispatch but never reports the target version. The
 // worker must mark h1 succeeded, fail h2 on timeout, leave h3 pending,
 // halt the fleet update and raise exactly one alert.
 func TestWorkerSecondHostTimesOutHalts(t *testing.T) {
 	st := openStore(t)
 	uid := mustCreateAdmin(t, st)
 	h1 := mustCreateHost(t, st, "h1", "v0")
 	h2 := mustCreateHost(t, st, "h2", "v0")
 	h3 := mustCreateHost(t, st, "h3", "v0")
 	hub := &fakeHub{online: map[string]bool{h1: true, h2: true, h3: true}}
 	// Every dispatch succeeds, but h2 is excluded from the simulated
 	// version transition so the worker's poll loop times out on it.
 	customDisp := &perHostDispatcher{st: st, target: "v2", noTransition: map[string]bool{h2: true}}
 	alerts := &recAlert{}
 	w := NewWorker(st, hub, customDisp, alerts)
 	w.pollPeriod = 20 * time.Millisecond
 	w.hostTimeout = 200 * time.Millisecond
 	fuID, err := w.Start(context.Background(), uid, "v2", []string{h1, h2, h3})
 	if err != nil {
 		t.Fatalf("start: %v", err)
 	}
 	waitForStatus(t, st, fuID, "halted", 3*time.Second)
 	_, hosts, err := st.GetFleetUpdate(context.Background(), fuID)
 	if err != nil {
 		t.Fatalf("get fleet update: %v", err)
 	}
 	gotStatus := map[string]string{}
 	for _, h := range hosts {
 		gotStatus[h.HostID] = h.Status
 	}
 	if gotStatus[h1] != "succeeded" {
 		t.Errorf("h1: %q", gotStatus[h1])
 	}
 	if gotStatus[h2] != "failed" {
 		t.Errorf("h2: %q", gotStatus[h2])
 	}
 	if gotStatus[h3] != "pending" {
 		t.Errorf("h3: %q", gotStatus[h3])
 	}
 	// The worker persists the halted status before it raises the alert,
 	// so the alert may land slightly after waitForStatus returns. Give
 	// it a short grace period instead of asserting immediately.
 	alertDeadline := time.Now().Add(time.Second)
 	for {
 		alerts.mu.Lock()
 		n := len(alerts.reasons)
 		alerts.mu.Unlock()
 		if n > 0 || !time.Now().Before(alertDeadline) {
 			break
 		}
 		time.Sleep(10 * time.Millisecond)
 	}
 	alerts.mu.Lock()
 	defer alerts.mu.Unlock()
 	if len(alerts.reasons) != 1 {
 		t.Errorf("alert reasons: %v", alerts.reasons)
 	}
 }
 // perHostDispatcher lets a test omit the auto-transition for selected
 // hosts so we can simulate timeout. Every dispatch still succeeds and
 // creates a job row; only the follow-up version report is suppressed.
 type perHostDispatcher struct {
 	mu           sync.Mutex // guards noTransition reads in DispatchUpdate
 	base         *fakeDispatcher // not consulted by DispatchUpdate
 	st           *store.Store // store that receives job rows and hello updates
 	target       string // agent version reported for transitioning hosts
 	noTransition map[string]bool // host ID → true to skip the version report
 }
 // DispatchUpdate always reports success. It creates a job row for the
 // host and, unless the host is listed in noTransition, starts a
 // goroutine that marks the host as running the target version 20ms
 // later.
 func (p *perHostDispatcher) DispatchUpdate(_ context.Context, hostID, _ string) (string, string, error) {
 	p.mu.Lock()
 	suppress := p.noTransition[hostID]
 	p.mu.Unlock()
 	jobID := ulid.Make().String()
 	job := store.Job{
 		ID:        jobID,
 		HostID:    hostID,
 		Kind:      "update",
 		ActorKind: "user",
 		CreatedAt: time.Now().UTC(),
 	}
 	// Store errors are deliberately ignored by this fake.
 	_ = p.st.CreateJob(context.Background(), job)
 	if suppress {
 		return jobID, "", nil
 	}
 	go func() {
 		time.Sleep(20 * time.Millisecond)
 		_ = p.st.MarkHostHello(context.Background(), hostID, p.target, "0.17", api.CurrentProtocolVersion, time.Now().UTC())
 	}()
 	return jobID, "", nil
 }
 // TestWorkerHostOfflineHalts starts a roll whose first host is not
 // connected. The worker must fail that host, halt the fleet update and
 // leave the second host pending.
 func TestWorkerHostOfflineHalts(t *testing.T) {
 	st := openStore(t)
 	uid := mustCreateAdmin(t, st)
 	h1 := mustCreateHost(t, st, "h1", "v0")
 	h2 := mustCreateHost(t, st, "h2", "v0")
 	hub := &fakeHub{online: map[string]bool{h1: false, h2: true}}
 	disp := &fakeDispatcher{st: st, target: "v2"}
 	alerts := &recAlert{}
 	w := NewWorker(st, hub, disp, alerts)
 	w.pollPeriod = 20 * time.Millisecond
 	w.hostTimeout = 500 * time.Millisecond
 	fuID, err := w.Start(context.Background(), uid, "v2", []string{h1, h2})
 	if err != nil {
 		t.Fatalf("start: %v", err)
 	}
 	waitForStatus(t, st, fuID, "halted", 2*time.Second)
 	_, hosts, err := st.GetFleetUpdate(context.Background(), fuID)
 	if err != nil {
 		t.Fatalf("get fleet update: %v", err)
 	}
 	// Key by host ID rather than indexing the slice: a missing row then
 	// shows up as a failed assertion instead of an index panic, and the
 	// test no longer depends on row order.
 	gotStatus := make(map[string]string, len(hosts))
 	for _, h := range hosts {
 		gotStatus[h.HostID] = h.Status
 	}
 	if gotStatus[h1] != "failed" {
 		t.Errorf("h1 status: %q", gotStatus[h1])
 	}
 	if gotStatus[h2] != "pending" {
 		t.Errorf("h2 status: %q", gotStatus[h2])
 	}
 }
 // TestWorkerAlreadyAtTargetSkipped rolls one host that already runs the
 // target version and one that does not. The first must be recorded as
 // skipped, the second as succeeded, and the fleet update must complete.
 func TestWorkerAlreadyAtTargetSkipped(t *testing.T) {
 	st := openStore(t)
 	uid := mustCreateAdmin(t, st)
 	h1 := mustCreateHost(t, st, "h1", "v2")
 	h2 := mustCreateHost(t, st, "h2", "v0")
 	hub := &fakeHub{online: map[string]bool{h1: true, h2: true}}
 	disp := &fakeDispatcher{st: st, target: "v2", delayMS: 20}
 	alerts := &recAlert{}
 	w := NewWorker(st, hub, disp, alerts)
 	w.pollPeriod = 20 * time.Millisecond
 	w.hostTimeout = 2 * time.Second
 	fuID, err := w.Start(context.Background(), uid, "v2", []string{h1, h2})
 	if err != nil {
 		t.Fatalf("start: %v", err)
 	}
 	waitForStatus(t, st, fuID, "completed", 4*time.Second)
 	_, hosts, err := st.GetFleetUpdate(context.Background(), fuID)
 	if err != nil {
 		t.Fatalf("get fleet update: %v", err)
 	}
 	want := map[string]string{h1: "skipped", h2: "succeeded"}
 	// Without this check an empty result would pass the loop below
 	// without asserting anything.
 	if len(hosts) != len(want) {
 		t.Fatalf("host rows: got %d want %d", len(hosts), len(want))
 	}
 	for _, h := range hosts {
 		if h.Status != want[h.HostID] {
 			t.Errorf("host %s: got %q want %q", h.HostID, h.Status, want[h.HostID])
 		}
 	}
 }
 // TestWorkerCancelMidRun cancels a roll while the first host is still
 // waiting for its (deliberately slow) version transition, then checks
 // that the fleet update ends up cancelled and the second host was never
 // dispatched.
 func TestWorkerCancelMidRun(t *testing.T) {
 	ctx := context.Background()
 	st := openStore(t)
 	uid := mustCreateAdmin(t, st)
 	h1 := mustCreateHost(t, st, "h1", "v0")
 	h2 := mustCreateHost(t, st, "h2", "v0")
 	connected := &fakeHub{online: map[string]bool{h1: true, h2: true}}
 	// The simulated transition takes 500ms, far longer than the pause
 	// before Cancel below, so the cancel lands while h1 is in flight.
 	disp := &fakeDispatcher{st: st, target: "v2", delayMS: 500}
 	sink := &recAlert{}
 	worker := NewWorker(st, connected, disp, sink)
 	worker.pollPeriod = 50 * time.Millisecond
 	worker.hostTimeout = 5 * time.Second
 	fuID, err := worker.Start(ctx, uid, "v2", []string{h1, h2})
 	if err != nil {
 		t.Fatalf("start: %v", err)
 	}
 	// Let the worker get as far as dispatching h1.
 	time.Sleep(100 * time.Millisecond)
 	if cancelErr := worker.Cancel(ctx, fuID); cancelErr != nil {
 		t.Fatalf("cancel: %v", cancelErr)
 	}
 	waitForStatus(t, st, fuID, "cancelled", 2*time.Second)
 	// No dispatch for h2 may have been recorded.
 	disp.mu.Lock()
 	defer disp.mu.Unlock()
 	for i := range disp.calls {
 		if disp.calls[i] == h2 {
 			t.Errorf("h2 dispatched after cancel")
 		}
 	}
 }
 // TestWorkerStartWhileActiveErrors starts one roll that stays in flight
 // and verifies that a second Start is rejected with
 // store.ErrFleetUpdateRunning.
 func TestWorkerStartWhileActiveErrors(t *testing.T) {
 	ctx := context.Background()
 	st := openStore(t)
 	uid := mustCreateAdmin(t, st)
 	h1 := mustCreateHost(t, st, "h1", "v0")
 	h2 := mustCreateHost(t, st, "h2", "v0")
 	connected := &fakeHub{online: map[string]bool{h1: true, h2: true}}
 	// A 5s simulated transition keeps the first roll running while the
 	// second Start is attempted.
 	slowDisp := &fakeDispatcher{st: st, target: "v2", delayMS: 5_000}
 	worker := NewWorker(st, connected, slowDisp, &recAlert{})
 	worker.pollPeriod = 50 * time.Millisecond
 	worker.hostTimeout = 2 * time.Second
 	_, firstErr := worker.Start(ctx, uid, "v2", []string{h1})
 	if firstErr != nil {
 		t.Fatalf("first start: %v", firstErr)
 	}
 	_, err := worker.Start(ctx, uid, "v2", []string{h2})
 	if !errors.Is(err, store.ErrFleetUpdateRunning) {
 		t.Fatalf("err: %v want ErrFleetUpdateRunning", err)
 	}
 }
@@ -11,7 +11,6 @@ import (
 	"time"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
 )
 func makeFilterHosts() []store.Host {
@@ -99,23 +98,6 @@ func TestSortDashboardHostsColumns(t *testing.T) {
 	}
 }
 // TestFilterAndSortDashboardUpdatesBehind: ?updates=behind narrows
 // to hosts whose agent_version is non-empty AND != server's version.
 func TestFilterAndSortDashboardUpdatesBehind(t *testing.T) {
 	t.Parallel()
 	hosts := []store.Host{
 		{ID: "01a", Name: "alpha", AgentVersion: "v0.0.1", Status: "online"},
 		{ID: "01b", Name: "bravo", AgentVersion: version.Version, Status: "online"},
 		{ID: "01c", Name: "charlie", AgentVersion: "", Status: "online"}, // never seen
 		{ID: "01d", Name: "delta", AgentVersion: "v0.0.1", Status: "offline"},
 	}
 	got := filterAndSortDashboardHosts(hosts, dashboardFilter{Updates: "behind", Sort: "name", Dir: "asc"})
 	// alpha + delta both behind; bravo (current) and charlie (empty) excluded.
 	if len(got) != 2 || got[0].Name != "alpha" || got[1].Name != "delta" {
 		t.Errorf("updates=behind: got %v", namesOf(got))
 	}
 }
 // TestParseDashboardFilterDefaults: empty query gives sort=name asc.
 func TestParseDashboardFilterDefaults(t *testing.T) {
 	t.Parallel()
@@ -1,379 +0,0 @@
 // fleet_update.go — admin-only fleet rolling-update endpoints + page.
 //
 // Surface:
 //   - POST /api/fleet/update          → starts a fleet update (JSON)
 //   - POST /api/fleet-updates/{id}/cancel
 //   - GET  /api/fleet-updates/{id}    → JSON parent + per-host array
 //   - GET  /settings/fleet-update     → admin UI page
 //   - GET  /settings/fleet-update/partial → htmx polling fragment
 //
 // All routes are mounted in the admin band (see routes()).
 package http
 import (
 	"context"
 	"encoding/json"
 	"errors"
 	"io"
 	"log/slog"
 	stdhttp "net/http"
 	"time"
 	"github.com/go-chi/chi/v5"
 	"github.com/oklog/ulid/v2"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
 )
 // fleetUpdateStartReq is the JSON body for POST /api/fleet/update.
 // Both fields are optional: empty target_version defaults to the
 // server's current version, empty host_ids derives the out-of-date
 // online subset.
 type fleetUpdateStartReq struct {
 	// TargetVersion is the agent version to roll hosts to.
 	TargetVersion string   `json:"target_version,omitempty"`
 	// HostIDs lists the hosts to include in the roll.
 	HostIDs       []string `json:"host_ids,omitempty"`
 }
 // fleetUpdateHostView is one row in the JSON response for GET
 // /api/fleet-updates/{id}. Hostname is hydrated from the store so
 // callers don't need a second round-trip per host. The same type backs
 // the per-host rows of the fleet-update UI page.
 type fleetUpdateHostView struct {
 	HostID       string `json:"host_id"`
 	HostName     string `json:"host_name,omitempty"` // empty when the host ID has no entry in the name map
 	Position     int    `json:"position"`
 	Status       string `json:"status"`
 	JobID        string `json:"job_id,omitempty"`
 	FailedReason string `json:"failed_reason,omitempty"`
 }
 // fleetUpdateView is the JSON projection of the parent + children:
 // the fleet update row itself plus one fleetUpdateHostView per host.
 type fleetUpdateView struct {
 	ID              string                `json:"id"`
 	StartedAt       string                `json:"started_at"` // UTC, formatted as RFC 3339 with nanoseconds
 	StartedByUserID string                `json:"started_by_user_id"`
 	TargetVersion   string                `json:"target_version"`
 	Status          string                `json:"status"`
 	CurrentHostID   string                `json:"current_host_id,omitempty"`
 	HaltedReason    string                `json:"halted_reason,omitempty"`
 	CompletedAt     *string               `json:"completed_at,omitempty"` // nil (omitted) until the store row has a completion time
 	Hosts           []fleetUpdateHostView `json:"hosts"`
 }
 // fleetUpdatePage backs both the full /settings/fleet-update page
 // and the partial polled fragment. Idle / Active are mutually
 // exclusive: if Active is non-nil, render the progress view.
 // buildFleetUpdatePage still fills the idle-state fields in that case,
 // so choosing which view to show is left to the template.
 type fleetUpdatePage struct {
 	// Idle-state fields.
 	OutOfDateHosts []store.Host // online hosts whose version != target
 	TargetVersion  string
 	// Active-state fields. Nil when no fleet update has ever run.
 	// Active may also refer to the most recent finished update, not
 	// only one that is currently running.
 	Active     *store.FleetUpdate
 	ActiveRows []fleetUpdateHostView
 	// Common.
 	HostNames map[string]string // host ID → host name
 	// PollURL is the partial endpoint htmx polls every few seconds.
 	PollURL string
 }
 // handleAPIFleetUpdateStart is POST /api/fleet/update.
 //
 // It requires an authenticated user and a configured FleetWorker, then
 // starts a rolling update. Both request fields are optional: an empty
 // target_version falls back to version.Version, and an empty host_ids
 // list is replaced by the hosts that are connected and out of date.
 //
 // Responses: 202 with {"fleet_update_id": ...} on success; 400 for a
 // malformed body; 401 without a user; 409 when no host is eligible or
 // another fleet update is already running; 503 when no worker is wired
 // in; 500 for any other failure. A successful start is written to the
 // audit log on a best-effort basis.
 func (s *Server) handleAPIFleetUpdateStart(w stdhttp.ResponseWriter, r *stdhttp.Request) {
 	user, ok := s.requireUser(r)
 	if !ok {
 		writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorised", "")
 		return
 	}
 	if s.deps.FleetWorker == nil {
 		writeJSONError(w, stdhttp.StatusServiceUnavailable, "fleet_worker_unavailable", "")
 		return
 	}
 	var body fleetUpdateStartReq
 	// Empty body is fine — both fields are optional. ContentLength is
 	// -1 when the length is unknown (e.g. a chunked request), so an
 	// empty body can still reach the decoder; it reports that as io.EOF,
 	// which is treated the same as "no body" rather than invalid JSON.
 	if r.ContentLength != 0 {
 		if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
 			writeJSONError(w, stdhttp.StatusBadRequest, "invalid_json", err.Error())
 			return
 		}
 	}
 	target := body.TargetVersion
 	if target == "" {
 		target = version.Version
 	}
 	hostIDs := body.HostIDs
 	if len(hostIDs) == 0 {
 		derived, err := s.deriveOutOfDateOnlineHostIDs(r.Context(), target)
 		if err != nil {
 			writeJSONError(w, stdhttp.StatusInternalServerError, "internal", err.Error())
 			return
 		}
 		hostIDs = derived
 	}
 	if len(hostIDs) == 0 {
 		writeJSONError(w, stdhttp.StatusConflict, "no_hosts_eligible",
 			"no online hosts are out of date")
 		return
 	}
 	fuID, err := s.deps.FleetWorker.Start(r.Context(), user.ID, target, hostIDs)
 	if err != nil {
 		if errors.Is(err, store.ErrFleetUpdateRunning) {
 			writeJSONError(w, stdhttp.StatusConflict, "fleet_update_in_progress", err.Error())
 			return
 		}
 		writeJSONError(w, stdhttp.StatusInternalServerError, "internal", err.Error())
 		return
 	}
 	// Marshalling a map of strings and ints cannot fail, and a failed
 	// audit write must not turn an already-started roll into an error
 	// response, so both results are deliberately ignored.
 	auditPayload, _ := json.Marshal(map[string]any{
 		"fleet_update_id": fuID,
 		"target_version":  target,
 		"host_count":      len(hostIDs),
 	})
 	_ = s.deps.Store.AppendAudit(r.Context(), store.AuditEntry{
 		ID: ulid.Make().String(), UserID: &user.ID, Actor: "user",
 		Action:     "fleet.update_started",
 		TargetKind: ptr("fleet_update"), TargetID: &fuID,
 		TS:      time.Now().UTC(),
 		Payload: auditPayload,
 	})
 	writeJSON(w, stdhttp.StatusAccepted, map[string]string{"fleet_update_id": fuID})
 }
 // handleAPIFleetUpdateCancel is POST /api/fleet-updates/{id}/cancel.
 //
 // It answers 204 once the worker has accepted the cancel. Error
 // responses: 401 without a user, 503 without a worker, 400 for a missing
 // id, 404 for an unknown fleet update, 409 when the update is not in the
 // "running" state, 500 otherwise. The audit entry is best-effort.
 func (s *Server) handleAPIFleetUpdateCancel(w stdhttp.ResponseWriter, r *stdhttp.Request) {
 	user, authed := s.requireUser(r)
 	if !authed {
 		writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorised", "")
 		return
 	}
 	worker := s.deps.FleetWorker
 	if worker == nil {
 		writeJSONError(w, stdhttp.StatusServiceUnavailable, "fleet_worker_unavailable", "")
 		return
 	}
 	fuID := chi.URLParam(r, "id")
 	if fuID == "" {
 		writeJSONError(w, stdhttp.StatusBadRequest, "missing_id", "")
 		return
 	}
 	fu, _, err := s.deps.Store.GetFleetUpdate(r.Context(), fuID)
 	switch {
 	case errors.Is(err, store.ErrNotFound):
 		writeJSONError(w, stdhttp.StatusNotFound, "fleet_update_not_found", "")
 		return
 	case err != nil:
 		writeJSONError(w, stdhttp.StatusInternalServerError, "internal", err.Error())
 		return
 	case fu.Status != "running":
 		writeJSONError(w, stdhttp.StatusConflict, "fleet_update_not_running",
 			"fleet update is not in the running state")
 		return
 	}
 	if cancelErr := worker.Cancel(r.Context(), fuID); cancelErr != nil {
 		writeJSONError(w, stdhttp.StatusInternalServerError, "internal", cancelErr.Error())
 		return
 	}
 	// The cancel has already taken effect; an audit failure is ignored.
 	entry := store.AuditEntry{
 		ID:         ulid.Make().String(),
 		UserID:     &user.ID,
 		Actor:      "user",
 		Action:     "fleet.update_cancelled",
 		TargetKind: ptr("fleet_update"),
 		TargetID:   &fuID,
 		TS:         time.Now().UTC(),
 	}
 	_ = s.deps.Store.AppendAudit(r.Context(), entry)
 	w.WriteHeader(stdhttp.StatusNoContent)
 }
 // handleAPIFleetUpdateGet is GET /api/fleet-updates/{id}.
 //
 // It returns the fleet update and its per-host rows as a
 // fleetUpdateView, with host names filled in from the store. Error
 // responses: 401 without a user, 404 for an unknown id, 500 otherwise.
 func (s *Server) handleAPIFleetUpdateGet(w stdhttp.ResponseWriter, r *stdhttp.Request) {
 	if _, authed := s.requireUser(r); !authed {
 		writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorised", "")
 		return
 	}
 	fuID := chi.URLParam(r, "id")
 	fu, rows, err := s.deps.Store.GetFleetUpdate(r.Context(), fuID)
 	switch {
 	case errors.Is(err, store.ErrNotFound):
 		writeJSONError(w, stdhttp.StatusNotFound, "fleet_update_not_found", "")
 		return
 	case err != nil:
 		writeJSONError(w, stdhttp.StatusInternalServerError, "internal", err.Error())
 		return
 	}
 	hostNames := s.hostNameMap(r)
 	hostViews := make([]fleetUpdateHostView, 0, len(rows))
 	for _, row := range rows {
 		hostViews = append(hostViews, fleetUpdateHostView{
 			HostID:       row.HostID,
 			HostName:     hostNames[row.HostID],
 			Position:     row.Position,
 			Status:       row.Status,
 			JobID:        row.JobID,
 			FailedReason: row.FailedReason,
 		})
 	}
 	view := fleetUpdateView{
 		ID:              fu.ID,
 		StartedAt:       fu.StartedAt.UTC().Format(time.RFC3339Nano),
 		StartedByUserID: fu.StartedByUserID,
 		TargetVersion:   fu.TargetVersion,
 		Status:          fu.Status,
 		CurrentHostID:   fu.CurrentHostID,
 		HaltedReason:    fu.HaltedReason,
 		Hosts:           hostViews,
 	}
 	// completed_at is only present once the update has a completion time.
 	if fu.CompletedAt != nil {
 		completed := fu.CompletedAt.UTC().Format(time.RFC3339Nano)
 		view.CompletedAt = &completed
 	}
 	writeJSON(w, stdhttp.StatusOK, view)
 }
 // handleUIFleetUpdate renders /settings/fleet-update: the full page,
 // with chrome, built from buildFleetUpdatePage. A page-build failure
 // yields a 500; a render failure is only logged.
 func (s *Server) handleUIFleetUpdate(w stdhttp.ResponseWriter, r *stdhttp.Request) {
 	user := s.requireUIUser(w, r)
 	if user == nil {
 		return
 	}
 	page, buildErr := s.buildFleetUpdatePage(r)
 	if buildErr != nil {
 		slog.Error("ui fleet update: build page", "err", buildErr)
 		stdhttp.Error(w, "internal", stdhttp.StatusInternalServerError)
 		return
 	}
 	view := s.baseView(r, user)
 	view.Title = "Fleet update · restic-manager"
 	view.Active = "settings"
 	view.Page = page
 	renderErr := s.deps.UI.Render(w, "fleet_update", view)
 	if renderErr != nil {
 		slog.Error("ui fleet update: render", "err", renderErr)
 	}
 }
 // handleUIFleetUpdatePartial renders only the inner fleet-update panel,
 // without page chrome, for htmx auto-refresh polling. It uses the same
 // data as the full page. A page-build failure yields a 500; a render
 // failure is only logged.
 func (s *Server) handleUIFleetUpdatePartial(w stdhttp.ResponseWriter, r *stdhttp.Request) {
 	user := s.requireUIUser(w, r)
 	if user == nil {
 		return
 	}
 	page, buildErr := s.buildFleetUpdatePage(r)
 	if buildErr != nil {
 		slog.Error("ui fleet update partial: build page", "err", buildErr)
 		stdhttp.Error(w, "internal", stdhttp.StatusInternalServerError)
 		return
 	}
 	view := s.baseView(r, user)
 	view.Page = page
 	renderErr := s.deps.UI.RenderPartial(w, "fleet_update_inner", view)
 	if renderErr != nil {
 		slog.Error("ui fleet update partial: render", "err", renderErr)
 	}
 }
 // buildFleetUpdatePage assembles the data both /settings/fleet-update
 // and its partial render against. Resolves the most-recent fleet
 // update (active OR completed/cancelled/halted) so the page can show
 // the last roll's result instead of disappearing into "idle" the
 // instant a roll finishes.
 //
 // Errors from listing hosts or looking up the active update are
 // returned. Failures while resolving or loading the update to display
 // are best-effort: the page is then built without the Active fields.
 func (s *Server) buildFleetUpdatePage(r *stdhttp.Request) (fleetUpdatePage, error) {
 	ctx := r.Context()
 	page := fleetUpdatePage{
 		TargetVersion: version.Version,
 		HostNames:     map[string]string{},
 		PollURL:       "/settings/fleet-update/partial",
 	}
 	hosts, err := s.deps.Store.ListHosts(ctx)
 	if err != nil {
 		return page, err
 	}
 	for _, h := range hosts {
 		page.HostNames[h.ID] = h.Name
 	}
 	active, err := s.deps.Store.ActiveFleetUpdate(ctx)
 	if err != nil {
 		return page, err
 	}
 	// Decide which fleet update to show: the active one if there is
 	// one, otherwise the most recently started row. Resolving just the
 	// ID here lets a single GetFleetUpdate call below load the parent
 	// and its host rows together.
 	var shownID string
 	if active != nil {
 		shownID = active.ID
 	} else {
 		// Fall back to the most recent terminal row so the page can
 		// show "completed" / "halted" / "cancelled" once the worker
 		// finishes. One small bespoke query — keeps the page from
 		// flashing back to "idle" the instant a roll wraps up. Any
 		// error (including "no rows") simply leaves the page idle.
 		qerr := s.deps.Store.DB().QueryRowContext(ctx,
 			`SELECT id FROM fleet_updates ORDER BY started_at DESC LIMIT 1`).
 			Scan(&shownID)
 		if qerr != nil {
 			shownID = ""
 		}
 	}
 	if shownID != "" {
 		fu, rows, gerr := s.deps.Store.GetFleetUpdate(ctx, shownID)
 		if gerr == nil && fu != nil {
 			page.Active = fu
 			page.ActiveRows = make([]fleetUpdateHostView, 0, len(rows))
 			for _, hr := range rows {
 				page.ActiveRows = append(page.ActiveRows, fleetUpdateHostView{
 					HostID:       hr.HostID,
 					HostName:     page.HostNames[hr.HostID],
 					Position:     hr.Position,
 					Status:       hr.Status,
 					JobID:        hr.JobID,
 					FailedReason: hr.FailedReason,
 				})
 			}
 		}
 	}
 	// Idle list (or "still out of date" reference even when an active
 	// roll is running — cheap to compute, harmless to attach).
 	for _, h := range hosts {
 		if h.Status != "online" {
 			continue
 		}
 		if h.AgentVersion == "" || h.AgentVersion == page.TargetVersion {
 			continue
 		}
 		page.OutOfDateHosts = append(page.OutOfDateHosts, h)
 	}
 	return page, nil
 }
 // deriveOutOfDateOnlineHostIDs lists the IDs of hosts that have
 // reported an agent_version different from target and that the hub
 // currently reports as connected. Hosts with an empty agent_version are
 // excluded. The start endpoint uses it when the caller omits host_ids.
 // The result is a non-nil slice, possibly empty.
 func (s *Server) deriveOutOfDateOnlineHostIDs(ctx context.Context, target string) ([]string, error) {
 	hosts, err := s.deps.Store.ListHosts(ctx)
 	if err != nil {
 		return nil, err
 	}
 	ids := []string{}
 	for _, host := range hosts {
 		behind := host.AgentVersion != "" && host.AgentVersion != target
 		// The hub is only consulted for hosts that are actually behind.
 		if behind && s.deps.Hub.Connected(host.ID) {
 			ids = append(ids, host.ID)
 		}
 	}
 	return ids, nil
 }
 // hostNameMap builds a hostID → name lookup from the store for
 // hydrating fleet-update JSON responses. If the hosts cannot be listed
 // it returns an empty map rather than an error.
 func (s *Server) hostNameMap(r *stdhttp.Request) map[string]string {
 	names := map[string]string{}
 	hosts, err := s.deps.Store.ListHosts(r.Context())
 	if err == nil {
 		for _, host := range hosts {
 			names[host.ID] = host.Name
 		}
 	}
 	return names
 }
@@ -1,334 +0,0 @@
 // fleet_update_test.go — coverage for the P6-15 fleet-update HTTP
 // surface: start/cancel/get JSON endpoints + RBAC.
 package http
 import (
 	"bytes"
 	"context"
 	"encoding/json"
 	stdhttp "net/http"
 	"sync"
 	"testing"
 	"time"
 	"github.com/oklog/ulid/v2"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ws"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
 )
 // fakeFleetWorker stands in for *fleetupdate.Worker in HTTP tests.
 // It records what was passed to Start/Cancel and lets tests inject
 // canned errors. Satisfies the FleetWorker interface in
 // host_update.go.
 type fakeFleetWorker struct {
 	mu sync.Mutex // held by Start and Cancel while they touch the fields below
 	startCalls []fakeStartCall // one entry per Start call, in call order
 	startID    string // ID Start returns when startErr is nil
 	startErr   error // when non-nil, Start returns it with an empty ID
 	cancelCalls []string // fleet update IDs passed to Cancel, in call order
 	cancelErr   error // returned by every Cancel call
 }
 // fakeStartCall captures the arguments of one fakeFleetWorker.Start
 // call. Start builds it with a positional literal, so the field order
 // must stay as it is.
 type fakeStartCall struct {
 	UserID  string
 	Target  string
 	HostIDs []string
 }
 // Start records the call (with its own copy of hostIDs) and returns the
 // canned startErr if one is set, otherwise the canned startID.
 func (f *fakeFleetWorker) Start(_ context.Context, userID, target string, hostIDs []string) (string, error) {
 	call := fakeStartCall{
 		UserID:  userID,
 		Target:  target,
 		HostIDs: append([]string(nil), hostIDs...),
 	}
 	f.mu.Lock()
 	defer f.mu.Unlock()
 	f.startCalls = append(f.startCalls, call)
 	if err := f.startErr; err != nil {
 		return "", err
 	}
 	return f.startID, nil
 }
 // Cancel records the fleet update ID it was called with and returns the
 // canned cancelErr (nil by default).
 func (f *fakeFleetWorker) Cancel(_ context.Context, id string) error {
 	f.mu.Lock()
 	f.cancelCalls = append(f.cancelCalls, id)
 	result := f.cancelErr
 	f.mu.Unlock()
 	return result
 }
 // helloOnlineHost creates a host, marks it as having reported agentVer,
 // and registers it on the server's hub so the derivation logic treats it
 // as online. It returns the host ID.
 func helloOnlineHost(t *testing.T, srv *Server, st *store.Store, name, agentVer string) string {
 	t.Helper()
 	hostID := makeHost(t, st, name)
 	err := st.MarkHostHello(context.Background(), hostID, agentVer, "0.17", api.CurrentProtocolVersion, time.Now().UTC())
 	if err != nil {
 		t.Fatalf("mark hello: %v", err)
 	}
 	// Registering a Conn is enough for deriveOutOfDateOnlineHostIDs to
 	// see the host as connected; no real WS handshake is needed. The
 	// Conn wraps a nil websocket, which is fine because these tests
 	// never send on it.
 	conn := ws.NewConn(hostID, nil)
 	srv.deps.Hub.Register(hostID, conn)
 	return hostID
 }
 // TestFleetUpdateStartHappyPath posts an explicit host_ids list as an
 // admin and expects 202, the worker's ID echoed back, exactly one Start
 // call attributed to the caller, and one fleet.update_started audit row.
 func TestFleetUpdateStartHappyPath(t *testing.T) {
 	t.Parallel()
 	srv, ts, st := rawTestServer(t)
 	worker := &fakeFleetWorker{startID: ulid.Make().String()}
 	srv.deps.FleetWorker = worker
 	cookie, uid := loginAsAdminWithID(t, st)
 	hostID := helloOnlineHost(t, srv, st, "fu-host", "v0")
 	payload, _ := json.Marshal(map[string]any{"host_ids": []string{hostID}})
 	req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/fleet/update", bytes.NewReader(payload))
 	req.AddCookie(cookie)
 	req.Header.Set("Content-Type", "application/json")
 	res, err := stdhttp.DefaultClient.Do(req)
 	if err != nil {
 		t.Fatalf("do: %v", err)
 	}
 	defer res.Body.Close()
 	if res.StatusCode != stdhttp.StatusAccepted {
 		t.Fatalf("status: got %d, want 202", res.StatusCode)
 	}
 	var resp struct {
 		FleetUpdateID string `json:"fleet_update_id"`
 	}
 	if decodeErr := json.NewDecoder(res.Body).Decode(&resp); decodeErr != nil {
 		t.Fatalf("decode: %v", decodeErr)
 	}
 	if resp.FleetUpdateID != worker.startID {
 		t.Fatalf("fleet_update_id: got %q, want %q", resp.FleetUpdateID, worker.startID)
 	}
 	// Snapshot the recorded calls under the lock, then assert on the copy.
 	worker.mu.Lock()
 	calls := append([]fakeStartCall(nil), worker.startCalls...)
 	worker.mu.Unlock()
 	if len(calls) != 1 || calls[0].UserID != uid {
 		t.Fatalf("start calls: %+v", calls)
 	}
 	if ids := calls[0].HostIDs; len(ids) != 1 || ids[0] != hostID {
 		t.Fatalf("host_ids: %v", ids)
 	}
 	// The handler must have written one audit row for this update.
 	var auditRows int
 	row := st.DB().QueryRow(
 		`SELECT COUNT(*) FROM audit_log WHERE action = 'fleet.update_started' AND target_id = ?`,
 		resp.FleetUpdateID)
 	if scanErr := row.Scan(&auditRows); scanErr != nil {
 		t.Fatalf("audit count: %v", scanErr)
 	}
 	if auditRows != 1 {
 		t.Fatalf("audit rows: got %d, want 1", auditRows)
 	}
 }
 // TestFleetUpdateStartConflictWhenAlreadyRunning makes the worker
 // reject Start with store.ErrFleetUpdateRunning and expects the handler
 // to answer 409 with code "fleet_update_in_progress".
 func TestFleetUpdateStartConflictWhenAlreadyRunning(t *testing.T) {
 	t.Parallel()
 	srv, ts, st := rawTestServer(t)
 	srv.deps.FleetWorker = &fakeFleetWorker{startErr: store.ErrFleetUpdateRunning}
 	cookie := loginAsAdmin(t, st)
 	// One eligible host so the handler gets as far as calling Start.
 	helloOnlineHost(t, srv, st, "fu-host", "v0")
 	emptyJSON := bytes.NewReader([]byte(`{}`))
 	req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/fleet/update", emptyJSON)
 	req.AddCookie(cookie)
 	req.Header.Set("Content-Type", "application/json")
 	res, err := stdhttp.DefaultClient.Do(req)
 	if err != nil {
 		t.Fatalf("do: %v", err)
 	}
 	defer res.Body.Close()
 	if got := res.StatusCode; got != stdhttp.StatusConflict {
 		t.Fatalf("status: got %d, want 409", got)
 	}
 	apiErr := readJSONError(t, res.Body)
 	if apiErr.Code != "fleet_update_in_progress" {
 		t.Fatalf("code: %q", apiErr.Code)
 	}
 }
 // TestFleetUpdateStartDerivesHostIDsWhenEmpty posts an empty request and
 // expects the handler to pass the worker exactly the hosts that are both
 // connected and out of date, leaving out the up-to-date host and the
 // host that is not registered on the hub.
 func TestFleetUpdateStartDerivesHostIDsWhenEmpty(t *testing.T) {
 	t.Parallel()
 	srv, ts, st := rawTestServer(t)
 	worker := &fakeFleetWorker{startID: ulid.Make().String()}
 	srv.deps.FleetWorker = worker
 	cookie := loginAsAdmin(t, st)
 	// Two online + out-of-date, one online + at-target, one offline.
 	a := helloOnlineHost(t, srv, st, "behind-a", "v0")
 	b := helloOnlineHost(t, srv, st, "behind-b", "v0")
 	_ = helloOnlineHost(t, srv, st, "uptodate", version.Version)
 	offlineID := makeHost(t, st, "offline-host")
 	if err := st.MarkHostHello(context.Background(), offlineID, "v0", "0.17", api.CurrentProtocolVersion, time.Now().UTC()); err != nil {
 		t.Fatalf("mark hello: %v", err)
 	}
 	// offlineID is deliberately not registered on the hub, so the
 	// derivation should skip it even though its version is behind.
 	req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/fleet/update", bytes.NewReader([]byte(`{}`)))
 	req.AddCookie(cookie)
 	req.Header.Set("Content-Type", "application/json")
 	res, err := stdhttp.DefaultClient.Do(req)
 	if err != nil {
 		t.Fatalf("do: %v", err)
 	}
 	defer res.Body.Close()
 	if res.StatusCode != stdhttp.StatusAccepted {
 		t.Fatalf("status: got %d, want 202", res.StatusCode)
 	}
 	worker.mu.Lock()
 	defer worker.mu.Unlock()
 	if len(worker.startCalls) != 1 {
 		t.Fatalf("start calls: %d", len(worker.startCalls))
 	}
 	got := worker.startCalls[0].HostIDs
 	want := map[string]bool{a: true, b: true}
 	// got[0] == got[1] rejects a duplicated ID such as [a, a], which
 	// the membership checks alone would accept.
 	if len(got) != 2 || got[0] == got[1] || !want[got[0]] || !want[got[1]] {
 		t.Fatalf("derived host_ids: got %v, want both of %v", got, []string{a, b})
 	}
 }
 // TestFleetUpdateCancelHappyPath seeds a fleet update directly in the
 // store, cancels it through the API as an admin, and expects 204 plus
 // exactly one Cancel call on the worker for that ID.
 func TestFleetUpdateCancelHappyPath(t *testing.T) {
 	t.Parallel()
 	ctx := context.Background()
 	srv, ts, st := rawTestServer(t)
 	worker := &fakeFleetWorker{}
 	srv.deps.FleetWorker = worker
 	cookie := loginAsAdmin(t, st)
 	// Seed the user who "started" the roll, a host, and the roll itself.
 	fuID := ulid.Make().String()
 	starterID := ulid.Make().String()
 	starter := store.User{
 		ID: starterID, Username: "starter", PasswordHash: "x",
 		Role: store.RoleAdmin, CreatedAt: time.Now().UTC(),
 	}
 	if err := st.CreateUser(ctx, starter); err != nil {
 		t.Fatalf("seed user: %v", err)
 	}
 	hostID := makeHost(t, st, "fu-cancel-host")
 	seed := store.FleetUpdate{ID: fuID, StartedByUserID: starterID, TargetVersion: "v1"}
 	if err := st.CreateFleetUpdate(ctx, seed, []string{hostID}); err != nil {
 		t.Fatalf("seed fleet update: %v", err)
 	}
 	cancelURL := ts.URL + "/api/fleet-updates/" + fuID + "/cancel"
 	req, _ := stdhttp.NewRequest("POST", cancelURL, nil)
 	req.AddCookie(cookie)
 	res, err := stdhttp.DefaultClient.Do(req)
 	if err != nil {
 		t.Fatalf("do: %v", err)
 	}
 	defer res.Body.Close()
 	if res.StatusCode != stdhttp.StatusNoContent {
 		t.Fatalf("status: got %d, want 204", res.StatusCode)
 	}
 	// Snapshot the recorded calls under the lock, then assert on the copy.
 	worker.mu.Lock()
 	cancelled := append([]string(nil), worker.cancelCalls...)
 	worker.mu.Unlock()
 	if len(cancelled) != 1 || cancelled[0] != fuID {
 		t.Fatalf("cancel calls: %v", cancelled)
 	}
 }
 // TestFleetUpdateCancelNotRunning seeds a fleet update, marks it
 // completed, and expects a cancel request to be refused with 409 and
 // code "fleet_update_not_running".
 func TestFleetUpdateCancelNotRunning(t *testing.T) {
 	t.Parallel()
 	srv, ts, st := rawTestServer(t)
 	srv.deps.FleetWorker = &fakeFleetWorker{}
 	cookie := loginAsAdmin(t, st)
 	// Seed + complete one so it's no longer running. Seed failures are
 	// fatal: otherwise they would surface later as a confusing 404.
 	fuID := ulid.Make().String()
 	uid := ulid.Make().String()
 	if err := st.CreateUser(context.Background(), store.User{
 		ID: uid, Username: "starter2", PasswordHash: "x",
 		Role: store.RoleAdmin, CreatedAt: time.Now().UTC(),
 	}); err != nil {
 		t.Fatalf("seed user: %v", err)
 	}
 	hostID := makeHost(t, st, "fu-done-host")
 	if err := st.CreateFleetUpdate(context.Background(),
 		store.FleetUpdate{ID: fuID, StartedByUserID: uid, TargetVersion: "v1"},
 		[]string{hostID}); err != nil {
 		t.Fatalf("seed fleet update: %v", err)
 	}
 	if err := st.CompleteFleetUpdate(context.Background(), fuID, time.Now().UTC()); err != nil {
 		t.Fatalf("complete: %v", err)
 	}
 	req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/fleet-updates/"+fuID+"/cancel", nil)
 	req.AddCookie(cookie)
 	res, err := stdhttp.DefaultClient.Do(req)
 	if err != nil {
 		t.Fatalf("do: %v", err)
 	}
 	defer res.Body.Close()
 	if res.StatusCode != stdhttp.StatusConflict {
 		t.Fatalf("status: got %d, want 409", res.StatusCode)
 	}
 	body := readJSONError(t, res.Body)
 	if body.Code != "fleet_update_not_running" {
 		t.Fatalf("code: %q", body.Code)
 	}
 }
 // TestFleetUpdateGetHydrates seeds a fleet update with one host and
 // expects GET /api/fleet-updates/{id} to return the parent fields plus a
 // host row whose name has been filled in from the store.
 func TestFleetUpdateGetHydrates(t *testing.T) {
 	t.Parallel()
 	_, ts, st := rawTestServer(t)
 	cookie := loginAsAdmin(t, st)
 	uid := ulid.Make().String()
 	// A failed user seed is fatal so it cannot be mistaken for a
 	// handler problem further down.
 	if err := st.CreateUser(context.Background(), store.User{
 		ID: uid, Username: "starter3", PasswordHash: "x",
 		Role: store.RoleAdmin, CreatedAt: time.Now().UTC(),
 	}); err != nil {
 		t.Fatalf("seed user: %v", err)
 	}
 	hostID := makeHost(t, st, "fu-get-host")
 	fuID := ulid.Make().String()
 	if err := st.CreateFleetUpdate(context.Background(),
 		store.FleetUpdate{ID: fuID, StartedByUserID: uid, TargetVersion: "v1.2.3"},
 		[]string{hostID}); err != nil {
 		t.Fatalf("seed: %v", err)
 	}
 	req, _ := stdhttp.NewRequest("GET", ts.URL+"/api/fleet-updates/"+fuID, nil)
 	req.AddCookie(cookie)
 	res, err := stdhttp.DefaultClient.Do(req)
 	if err != nil {
 		t.Fatalf("do: %v", err)
 	}
 	defer res.Body.Close()
 	if res.StatusCode != stdhttp.StatusOK {
 		t.Fatalf("status: got %d, want 200", res.StatusCode)
 	}
 	var got fleetUpdateView
 	if err := json.NewDecoder(res.Body).Decode(&got); err != nil {
 		t.Fatalf("decode: %v", err)
 	}
 	if got.ID != fuID || got.TargetVersion != "v1.2.3" || got.Status != "running" {
 		t.Fatalf("parent: %+v", got)
 	}
 	if len(got.Hosts) != 1 || got.Hosts[0].HostID != hostID || got.Hosts[0].HostName != "fu-get-host" {
 		t.Fatalf("hosts: %+v", got.Hosts)
 	}
 }
 // TestFleetUpdateRBAC verifies that viewer and operator sessions get
 // 403 from POST /api/fleet/update.
 func TestFleetUpdateRBAC(t *testing.T) {
 	t.Parallel()
 	_, ts, st := rawTestServer(t)
 	startURL := ts.URL + "/api/fleet/update"
 	nonAdminRoles := []store.Role{store.RoleViewer, store.RoleOperator}
 	for _, role := range nonAdminRoles {
 		role := role
 		t.Run(string(role), func(t *testing.T) {
 			cookie := loginAsRole(t, st, role)
 			req, _ := stdhttp.NewRequest("POST", startURL, bytes.NewReader([]byte(`{}`)))
 			req.AddCookie(cookie)
 			req.Header.Set("Content-Type", "application/json")
 			res, err := stdhttp.DefaultClient.Do(req)
 			if err != nil {
 				t.Fatalf("do: %v", err)
 			}
 			defer res.Body.Close()
 			if got := res.StatusCode; got != stdhttp.StatusForbidden {
 				t.Fatalf("status: got %d, want 403", got)
 			}
 		})
 	}
 }
 // Compile-time check that fakeFleetWorker satisfies the FleetWorker
 // interface; the build fails here if the method sets drift apart.
 var _ FleetWorker = (*fakeFleetWorker)(nil)
@@ -1,217 +0,0 @@
 package http
 import (
 	"context"
 	"encoding/json"
 	stdhttp "net/http"
 	"time"
 	"github.com/go-chi/chi/v5"
 	"github.com/oklog/ulid/v2"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
 )
 // UpdateWatcher is the slim view of the ws.updateWatcher this package
 // uses for tracking in-flight update dispatches. Defined as an
 // interface so a test can inject a stub.
 type UpdateWatcher interface {
 	// Track records that update job jobID has been dispatched to
 	// hostID. dispatchHostUpdate calls it after the command.update
 	// envelope has been handed to the hub without error.
 	Track(jobID, hostID string)
 }
 // FleetWorker is the slim view of the fleetupdate.Worker this package
 // uses. Kept here for forward compatibility with P6-15 — the host
 // update endpoint itself does not use it.
 type FleetWorker interface {
 	// Start begins a fleet update on behalf of userID towards
 	// targetVersion for the given hostIDs.
 	// NOTE(review): the returned string is presumably the new
 	// fleet-update ID — confirm against the implementation.
 	Start(ctx context.Context, userID, targetVersion string, hostIDs []string) (string, error)
 	// Cancel requests cancellation of the fleet update identified by
 	// fleetUpdateID.
 	Cancel(ctx context.Context, fleetUpdateID string) error
 }
 // dispatchHostUpdateResult communicates structured outcomes from the
 // shared dispatch path so both the HTTP handler and the fleet worker
 // can format errors in their own idiom.
 //
 // On success only JobID is set. On failure Code and Status are set;
 // for the "update_in_progress" code JobID additionally carries the ID
 // of the update job that is already in flight.
 type dispatchHostUpdateResult struct {
 	JobID  string
 	Code   string // "" on success
 	Status int    // HTTP status the JSON handler should use on error
 	Msg    string // human-readable detail (optional)
 }
 // dispatchHostUpdate is the shared "send command.update to one host"
 // path. It performs every pre-check (host exists, online, version
 // mismatch, no in-flight update) and on success creates the jobs row,
 // dispatches the WS envelope, tracks the watcher entry, and audits.
 //
 // Pre-checks are returned as structured codes rather than HTTP errors
 // so the fleet worker can map them onto its own per-host status enum
 // without parsing strings.
 //
 // Parameters:
 //   - hostID: the host to update.
 //   - actorKind / actorID: recorded on the jobs row and the audit entry;
 //     actorID may be nil.
 //
 // Result: JobID is set and Code is "" on success. Otherwise Code is one
 // of "host_not_found", "host_offline", "already_up_to_date",
 // "update_in_progress" (JobID = the existing job) or "internal".
 func (s *Server) dispatchHostUpdate(ctx context.Context, hostID string, actorKind string, actorID *string) dispatchHostUpdateResult {
 	// internalErr wraps an unexpected failure as a 500-class result.
 	internalErr := func(err error) dispatchHostUpdateResult {
 		return dispatchHostUpdateResult{Code: "internal", Status: stdhttp.StatusInternalServerError, Msg: err.Error()}
 	}
 	host, err := s.deps.Store.GetHost(ctx, hostID)
 	if err != nil || host == nil {
 		return dispatchHostUpdateResult{Code: "host_not_found", Status: stdhttp.StatusNotFound}
 	}
 	if !s.deps.Hub.Connected(host.ID) {
 		return dispatchHostUpdateResult{
 			Code: "host_offline", Status: stdhttp.StatusConflict,
 			Msg: "agent is not currently connected",
 		}
 	}
 	// An empty AgentVersion never counts as "up to date", so a host with
 	// no recorded version still receives the update command.
 	if host.AgentVersion != "" && host.AgentVersion == version.Version {
 		return dispatchHostUpdateResult{
 			Code: "already_up_to_date", Status: stdhttp.StatusConflict,
 			Msg: "agent already running version " + version.Version,
 		}
 	}
 	existing, err := s.deps.Store.RunningUpdateJobForHost(ctx, hostID)
 	if err != nil {
 		return internalErr(err)
 	}
 	if existing != "" {
 		return dispatchHostUpdateResult{
 			Code: "update_in_progress", Status: stdhttp.StatusConflict,
 			Msg:   "an update job is already in flight for this host",
 			JobID: existing,
 		}
 	}
 	jobID := ulid.Make().String()
 	// Build the envelope before inserting the jobs row: a marshal failure
 	// then returns without leaving an unfinished job behind that a later
 	// RunningUpdateJobForHost call could report as still in flight.
 	env, err := api.Marshal(api.MsgCommandUpdate, ulid.Make().String(), api.CommandUpdatePayload{
 		JobID: jobID,
 	})
 	if err != nil {
 		return internalErr(err)
 	}
 	now := time.Now().UTC()
 	if err := s.deps.Store.CreateJob(ctx, store.Job{
 		ID: jobID, HostID: hostID, Kind: "update",
 		ActorKind: actorKind, ActorID: actorID,
 		CreatedAt: now,
 	}); err != nil {
 		return internalErr(err)
 	}
 	if err := s.deps.Hub.Send(ctx, hostID, env); err != nil {
 		// Roll the job to failed so we don't leak a queued row. Best
 		// effort: the send error is what gets reported to the caller.
 		_ = s.deps.Store.MarkJobFinished(ctx, jobID, "failed", -1, nil, err.Error(), time.Now().UTC())
 		return dispatchHostUpdateResult{
 			Code: "host_offline", Status: stdhttp.StatusConflict, Msg: err.Error(),
 		}
 	}
 	if s.deps.UpdateWatcher != nil {
 		s.deps.UpdateWatcher.Track(jobID, hostID)
 	}
 	// Marshalling a map[string]string cannot fail, and the audit append is
 	// best effort: the command has already been sent, so an audit failure
 	// must not turn the dispatch into an error.
 	auditPayload, _ := json.Marshal(map[string]string{
 		"job_id":         jobID,
 		"target_version": version.Version,
 	})
 	_ = s.deps.Store.AppendAudit(ctx, store.AuditEntry{
 		ID:         ulid.Make().String(),
 		UserID:     actorID,
 		Actor:      actorKind,
 		Action:     "host.update_dispatched",
 		TargetKind: ptr("host"),
 		TargetID:   &hostID,
 		TS:         now,
 		Payload:    auditPayload,
 	})
 	return dispatchHostUpdateResult{JobID: jobID}
 }
 // handleHostUpdate serves POST /api/hosts/{id}/update (JSON; mounted in
 // the admin-only route group). It answers 202 with {"job_id": ...} when
 // the update command was dispatched, or a JSON error carrying the code
 // and status produced by dispatchHostUpdate.
 func (s *Server) handleHostUpdate(w stdhttp.ResponseWriter, r *stdhttp.Request) {
 	user, authed := s.requireUser(r)
 	if !authed {
 		writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorised", "")
 		return
 	}
 	hostID := chi.URLParam(r, "id")
 	if hostID == "" {
 		writeJSONError(w, stdhttp.StatusBadRequest, "missing_host_id", "")
 		return
 	}
 	// The acting user's ID is attached when a user record is available.
 	var actorID *string
 	if user != nil {
 		actorID = &user.ID
 	}
 	switch out := s.dispatchHostUpdate(r.Context(), hostID, "user", actorID); out.Code {
 	case "":
 		writeJSON(w, stdhttp.StatusAccepted, map[string]string{"job_id": out.JobID})
 	default:
 		writeJSONError(w, out.Status, out.Code, out.Msg)
 	}
 }
 // handleHostUpdateForm serves the HTMX-friendly POST /hosts/{id}/update
 // variant. A successful dispatch answers 200 with an HX-Redirect header
 // pointing at the job detail page; a failed dispatch answers with the
 // dispatch status and a small text/html error banner.
 func (s *Server) handleHostUpdateForm(w stdhttp.ResponseWriter, r *stdhttp.Request) {
 	user, authed := s.requireUser(r)
 	if !authed {
 		stdhttp.Error(w, "unauthorised", stdhttp.StatusUnauthorized)
 		return
 	}
 	hostID := chi.URLParam(r, "id")
 	if hostID == "" {
 		stdhttp.Error(w, "missing host_id", stdhttp.StatusBadRequest)
 		return
 	}
 	var actorID *string
 	if user != nil {
 		actorID = &user.ID
 	}
 	out := s.dispatchHostUpdate(r.Context(), hostID, "user", actorID)
 	if out.Code == "" {
 		w.Header().Set("HX-Redirect", "/jobs/"+out.JobID)
 		w.WriteHeader(stdhttp.StatusOK)
 		return
 	}
 	// Inline banner for HTMX swaps: a small text/html fragment whose
 	// message text is escaped before being embedded in the markup.
 	banner := `<div class="banner banner-error" role="alert">` +
 		htmlEscape(hostUpdateErrorMessage(out.Code, out.Msg)) + `</div>`
 	w.Header().Set("Content-Type", "text/html; charset=utf-8")
 	w.WriteHeader(out.Status)
 	_, _ = w.Write([]byte(banner))
 }
 // hostUpdateErrorMessage maps a dispatch error code to the sentence
 // shown in the UI banner. Codes without a dedicated sentence fall back
 // to msg, and to a generic failure sentence when msg is empty.
 func hostUpdateErrorMessage(code, msg string) string {
 	text := msg
 	switch code {
 	case "host_not_found":
 		text = "Host not found."
 	case "host_offline":
 		text = "Agent is offline; can't deliver the update command."
 	case "already_up_to_date":
 		text = "Agent is already running the current version."
 	case "update_in_progress":
 		text = "An update is already in progress for this host."
 	default:
 		if text == "" {
 			text = "Update dispatch failed."
 		}
 	}
 	return text
 }
 // htmlEscape is a minimal HTML escaper that is safe for element content
 // and for both double- and single-quoted attribute values. It rewrites
 // the five characters & < > " ' as entities and copies every other byte
 // through unchanged (multi-byte UTF-8 sequences contain none of those
 // bytes, so they survive intact). Avoids pulling html/template for a
 // one-shot inline banner.
 func htmlEscape(s string) string {
 	out := make([]byte, 0, len(s))
 	for i := 0; i < len(s); i++ {
 		switch c := s[i]; c {
 		case '&':
 			out = append(out, "&amp;"...)
 		case '<':
 			out = append(out, "&lt;"...)
 		case '>':
 			out = append(out, "&gt;"...)
 		case '"':
 			out = append(out, "&quot;"...)
 		case '\'':
 			// "&#39;" rather than "&apos;": the numeric form is the one
 			// the standard library's html.EscapeString emits.
 			out = append(out, "&#39;"...)
 		default:
 			out = append(out, c)
 		}
 	}
 	return string(out)
 }
@@ -1,270 +0,0 @@
 // host_update_test.go — covers POST /api/hosts/{id}/update.
 package http
 import (
 	"context"
 	"encoding/json"
 	"io"
 	stdhttp "net/http"
 	"strings"
 	"sync"
 	"testing"
 	"time"
 	"github.com/coder/websocket"
 	"github.com/oklog/ulid/v2"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
 )
 // stubWatcher is a test double that remembers, in call order, the host
 // ID of every Track call so tests can assert the watcher was notified.
 type stubWatcher struct {
 	mu      sync.Mutex
 	tracked []string // hostIDs
 }
 // Track appends hostID to the recorded list under the mutex; the job ID
 // argument is ignored.
 func (w *stubWatcher) Track(_, hostID string) {
 	w.mu.Lock()
 	w.tracked = append(w.tracked, hostID)
 	w.mu.Unlock()
 }
 // TestHostUpdateHappyPath drives POST /api/hosts/{id}/update against a
 // host with a live agent websocket and a mismatching agent version. It
 // asserts a 202 response carrying a job_id, a command.update envelope on
 // the socket with the same job ID, one Track call on the injected
 // watcher, and exactly one host.update_dispatched audit row.
 func TestHostUpdateHappyPath(t *testing.T) {
 	t.Parallel()
 	srv, ts, st := rawTestServer(t)
 	// Inject the stub so the Track call made by the dispatch can be observed.
 	watcher := &stubWatcher{}
 	srv.deps.UpdateWatcher = watcher
 	hostID, token := enrolHostForWS(t, srv, st, "upd-host")
 	c := agentDial(t, srv, ts, hostID, token)
 	sendHello(t, c, "upd-host")
 	// Consume messages up to the schedule.set envelope before issuing the request.
 	_ = drainUntil(t, c, api.MsgScheduleSet)
 	// Force a version mismatch so the dispatch isn't short-circuited.
 	if err := st.MarkHostHello(context.Background(), hostID, "v0", "0.17", api.CurrentProtocolVersion, time.Now().UTC()); err != nil {
 		t.Fatalf("mark hello: %v", err)
 	}
 	cookie := loginAsAdmin(t, st)
 	req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/hosts/"+hostID+"/update", nil)
 	req.AddCookie(cookie)
 	res, err := stdhttp.DefaultClient.Do(req)
 	if err != nil {
 		t.Fatalf("do: %v", err)
 	}
 	defer res.Body.Close()
 	if res.StatusCode != stdhttp.StatusAccepted {
 		t.Fatalf("status: got %d, want 202", res.StatusCode)
 	}
 	var out struct {
 		JobID string `json:"job_id"`
 	}
 	if err := json.NewDecoder(res.Body).Decode(&out); err != nil {
 		t.Fatalf("decode: %v", err)
 	}
 	if out.JobID == "" {
 		t.Fatal("missing job_id in response")
 	}
 	// command.update envelope arrives.
 	// Poll the socket for up to 2s, 500ms per read, skipping non-text
 	// frames and any other envelope types.
 	deadline := time.Now().Add(2 * time.Second)
 	var got api.Envelope
 	for time.Now().Before(deadline) {
 		ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
 		mt, raw, rerr := c.Read(ctx)
 		cancel()
 		if rerr != nil {
 			// Read failed or timed out: stop waiting; the Type check
 			// below reports the missing envelope.
 			break
 		}
 		if mt != websocket.MessageText {
 			continue
 		}
 		if !strings.Contains(string(raw), `"command.update"`) {
 			continue
 		}
 		// A decode failure leaves got zero-valued, which the Type check
 		// below turns into a test failure.
 		_ = json.Unmarshal(raw, &got)
 		break
 	}
 	if got.Type != api.MsgCommandUpdate {
 		t.Fatal("never received command.update envelope")
 	}
 	var cp api.CommandUpdatePayload
 	if err := got.UnmarshalPayload(&cp); err != nil {
 		t.Fatalf("payload: %v", err)
 	}
 	if cp.JobID != out.JobID {
 		t.Fatalf("payload job_id: got %q want %q", cp.JobID, out.JobID)
 	}
 	// Watcher tracked.
 	// The deferred Unlock keeps the mutex held until the test returns.
 	watcher.mu.Lock()
 	defer watcher.mu.Unlock()
 	if len(watcher.tracked) != 1 || watcher.tracked[0] != hostID {
 		t.Fatalf("watcher tracked: %v", watcher.tracked)
 	}
 	// Audit row exists.
 	var n int
 	if err := st.DB().QueryRow(
 		`SELECT COUNT(*) FROM audit_log WHERE action = 'host.update_dispatched' AND target_id = ?`,
 		hostID).Scan(&n); err != nil {
 		t.Fatalf("audit count: %v", err)
 	}
 	if n != 1 {
 		t.Fatalf("audit rows: got %d, want 1", n)
 	}
 }
 // TestHostUpdateNotFound checks that an admin asking to update a host ID
 // with no matching row is answered with 404.
 func TestHostUpdateNotFound(t *testing.T) {
 	t.Parallel()
 	_, ts, st := rawTestServer(t)
 	session := loginAsAdmin(t, st)
 	target := ts.URL + "/api/hosts/no-such/update"
 	req, _ := stdhttp.NewRequest(stdhttp.MethodPost, target, nil)
 	req.AddCookie(session)
 	resp, err := stdhttp.DefaultClient.Do(req)
 	if err != nil {
 		t.Fatalf("do: %v", err)
 	}
 	defer resp.Body.Close()
 	if got := resp.StatusCode; got != stdhttp.StatusNotFound {
 		t.Fatalf("status: got %d want 404", got)
 	}
 }
 // TestHostUpdateOffline checks that updating a host which exists in the
 // store but has no agent connection yields 409 with code "host_offline".
 func TestHostUpdateOffline(t *testing.T) {
 	t.Parallel()
 	_, ts, st := rawTestServer(t)
 	// Seed the host row directly; this test never dials an agent for it.
 	hostID := ulid.Make().String()
 	offline := store.Host{
 		ID: hostID, Name: "off", OS: "linux", Arch: "amd64",
 		EnrolledAt: time.Now().UTC(),
 	}
 	if err := st.CreateHost(context.Background(), offline, "deadbeef", ""); err != nil {
 		t.Fatalf("create: %v", err)
 	}
 	session := loginAsAdmin(t, st)
 	req, _ := stdhttp.NewRequest(stdhttp.MethodPost, ts.URL+"/api/hosts/"+hostID+"/update", nil)
 	req.AddCookie(session)
 	resp, err := stdhttp.DefaultClient.Do(req)
 	if err != nil {
 		t.Fatalf("do: %v", err)
 	}
 	defer resp.Body.Close()
 	if got := resp.StatusCode; got != stdhttp.StatusConflict {
 		t.Fatalf("status: got %d want 409", got)
 	}
 	if code := readJSONError(t, resp.Body).Code; code != "host_offline" {
 		t.Fatalf("code: %q", code)
 	}
 }
 // TestHostUpdateAlreadyUpToDate checks that a connected host whose
 // recorded agent version equals the server's version.Version is refused
 // with 409 and code "already_up_to_date".
 func TestHostUpdateAlreadyUpToDate(t *testing.T) {
 	t.Parallel()
 	const hostName = "uptodate-host"
 	srv, ts, st := rawTestServer(t)
 	hostID, token := enrolHostForWS(t, srv, st, hostName)
 	conn := agentDial(t, srv, ts, hostID, token)
 	sendHello(t, conn, hostName)
 	_ = drainUntil(t, conn, api.MsgScheduleSet)
 	// Record the server's own version as the agent version.
 	helloErr := st.MarkHostHello(context.Background(), hostID, version.Version, "0.17", api.CurrentProtocolVersion, time.Now().UTC())
 	if helloErr != nil {
 		t.Fatalf("mark hello: %v", helloErr)
 	}
 	session := loginAsAdmin(t, st)
 	req, _ := stdhttp.NewRequest(stdhttp.MethodPost, ts.URL+"/api/hosts/"+hostID+"/update", nil)
 	req.AddCookie(session)
 	resp, err := stdhttp.DefaultClient.Do(req)
 	if err != nil {
 		t.Fatalf("do: %v", err)
 	}
 	defer resp.Body.Close()
 	if got := resp.StatusCode; got != stdhttp.StatusConflict {
 		t.Fatalf("status: got %d want 409", got)
 	}
 	if code := readJSONError(t, resp.Body).Code; code != "already_up_to_date" {
 		t.Fatalf("code: %q", code)
 	}
 }
 // TestHostUpdateInProgress checks that a second update request for a
 // host which already has an unfinished "update" job is refused with 409
 // and code "update_in_progress".
 func TestHostUpdateInProgress(t *testing.T) {
 	t.Parallel()
 	const hostName = "inprog-host"
 	bg := context.Background()
 	srv, ts, st := rawTestServer(t)
 	hostID, token := enrolHostForWS(t, srv, st, hostName)
 	conn := agentDial(t, srv, ts, hostID, token)
 	sendHello(t, conn, hostName)
 	_ = drainUntil(t, conn, api.MsgScheduleSet)
 	// Agent version "v0" so the version pre-check does not short-circuit.
 	helloErr := st.MarkHostHello(bg, hostID, "v0", "0.17", api.CurrentProtocolVersion, time.Now().UTC())
 	if helloErr != nil {
 		t.Fatalf("mark hello: %v", helloErr)
 	}
 	// Pre-seed an update job that is never marked finished.
 	seeded := store.Job{
 		ID: ulid.Make().String(), HostID: hostID, Kind: "update",
 		ActorKind: "user", CreatedAt: time.Now().UTC(),
 	}
 	if err := st.CreateJob(bg, seeded); err != nil {
 		t.Fatalf("seed job: %v", err)
 	}
 	session := loginAsAdmin(t, st)
 	req, _ := stdhttp.NewRequest(stdhttp.MethodPost, ts.URL+"/api/hosts/"+hostID+"/update", nil)
 	req.AddCookie(session)
 	resp, err := stdhttp.DefaultClient.Do(req)
 	if err != nil {
 		t.Fatalf("do: %v", err)
 	}
 	defer resp.Body.Close()
 	if got := resp.StatusCode; got != stdhttp.StatusConflict {
 		t.Fatalf("status: got %d want 409", got)
 	}
 	if code := readJSONError(t, resp.Body).Code; code != "update_in_progress" {
 		t.Fatalf("code: %q", code)
 	}
 }
 // TestHostUpdateRBAC checks that the viewer and operator roles are
 // answered with 403 by POST /api/hosts/{id}/update.
 func TestHostUpdateRBAC(t *testing.T) {
 	t.Parallel()
 	_, ts, st := rawTestServer(t)
 	hostID := ulid.Make().String()
 	seed := store.Host{
 		ID: hostID, Name: "rbac-host", OS: "linux", Arch: "amd64",
 		EnrolledAt: time.Now().UTC(),
 	}
 	if err := st.CreateHost(context.Background(), seed, "deadbeef", ""); err != nil {
 		t.Fatalf("create: %v", err)
 	}
 	endpoint := ts.URL + "/api/hosts/" + hostID + "/update"
 	nonAdmin := []store.Role{store.RoleViewer, store.RoleOperator}
 	for i := range nonAdmin {
 		role := nonAdmin[i]
 		t.Run(string(role), func(t *testing.T) {
 			session := loginAsRole(t, st, role)
 			req, _ := stdhttp.NewRequest(stdhttp.MethodPost, endpoint, nil)
 			req.AddCookie(session)
 			resp, err := stdhttp.DefaultClient.Do(req)
 			if err != nil {
 				t.Fatalf("do: %v", err)
 			}
 			defer resp.Body.Close()
 			if got := resp.StatusCode; got != stdhttp.StatusForbidden {
 				t.Fatalf("status for %s: got %d want 403", role, got)
 			}
 		})
 	}
 }
 // jsonErrBody mirrors the JSON error payload the tests decode: a
 // machine-readable code plus an optional message.
 type jsonErrBody struct {
 	Code    string `json:"code"`
 	Message string `json:"message,omitempty"`
 }
 // readJSONError decodes one JSON error object from body and returns it,
 // failing the calling test when the body cannot be decoded.
 func readJSONError(t *testing.T, body io.Reader) jsonErrBody {
 	t.Helper()
 	parsed := jsonErrBody{}
 	dec := json.NewDecoder(body)
 	if decErr := dec.Decode(&parsed); decErr != nil {
 		t.Fatalf("decode error body: %v", decErr)
 	}
 	return parsed
 }
@@ -4,7 +4,6 @@ import (
 	stdhttp "net/http"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
 )
 // hostView is the JSON projection of a Host row. Same shape as the
@@ -28,8 +27,6 @@ type hostView struct {
 	RepoSizeBytes    int64    `json:"repo_size_bytes"`
 	SnapshotCount    int      `json:"snapshot_count"`
 	OpenAlertCount   int      `json:"open_alert_count"`
 	UpdateAvailable  bool     `json:"update_available"`
 	TargetVersion    string   `json:"target_version,omitempty"`
 }
 // handleListHosts returns the full fleet as JSON. Authenticated; the
@@ -88,8 +85,6 @@ func hostToView(h store.Host) hostView {
 		RepoSizeBytes:    h.RepoSizeBytes,
 		SnapshotCount:    h.SnapshotCount,
 		OpenAlertCount:   h.OpenAlertCount,
 		TargetVersion:    version.Version,
 		UpdateAvailable:  h.AgentVersion != "" && h.AgentVersion != version.Version,
 	}
 	if v.Tags == nil {
 		v.Tags = []string{}
@@ -39,13 +39,6 @@ type Deps struct {
 	// NotificationHub (optional, wired in G1) is used by the test-fire
 	// endpoint to dispatch a single synthetic payload through a channel.
 	NotificationHub *notification.Hub
 	// UpdateWatcher tracks in-flight agent self-update dispatches and
 	// reconciles them against incoming hello envelopes. Optional;
 	// nil = no-op (handlers degrade by skipping the Track call).
 	UpdateWatcher UpdateWatcher
 	// FleetWorker drives the rolling fleet-update worker. Optional;
 	// nil = fleet update endpoints (P6-15) report unavailable.
 	FleetWorker FleetWorker
 	// Version is the binary's build version, surfaced in the chrome.
 	// Empty falls back to "dev".
 	Version string
@@ -130,9 +123,8 @@ func (s *Server) routes(r chi.Router) {
 	r.Post("/api/agents/announce", s.handleAnnounce)
 	r.Get("/agent/binary", s.handleAgentBinary)
 	r.Get("/install/*", s.handleInstallAsset)
 	r.Get("/api/version", s.handleVersion)
 	if s.deps.Hub != nil {
-		hd := ws.HandlerDeps{
+		r.Mount("/ws/agent", ws.AgentHandler(ws.HandlerDeps{
 			Hub:            s.deps.Hub,
 			Store:          s.deps.Store,
 			JobHub:         s.deps.JobHub,
@@ -140,11 +132,7 @@ func (s *Server) routes(r chi.Router) {
 			OnHello:        s.onAgentHello,
 			OnScheduleAck:  s.applyScheduleAck,
 			OnScheduleFire: s.dispatchScheduledJob,
-		}
+		}))
 		if w, ok := s.deps.UpdateWatcher.(*ws.UpdateWatcher); ok && w != nil {
 			hd.UpdateWatcher = w
 		}
 		r.Mount("/ws/agent", ws.AgentHandler(hd))
 	}
 	r.Get("/ws/agent/pending", s.handlePendingWS)
 	r.Mount("/static/", staticHandler())
@@ -195,9 +183,7 @@ func (s *Server) routes(r chi.Router) {
 			r.Get("/hosts/{id}/sources", s.handleUIHostSources)
 			r.Get("/hosts/{id}/sources/new", s.handleUISourceGroupNewGet)
 			r.Get("/hosts/{id}/sources/{gid}/edit", s.handleUISourceGroupEditGet)
 			r.Get("/hosts/{id}/jobs", s.handleUIHostJobs)
 			r.Get("/hosts/{id}/repo", s.handleUIHostRepo)
 			r.Get("/hosts/{id}/repo/trend", s.handleUIRepoTrend)
 			r.Get("/hosts/{id}/schedules", s.handleUISchedulesList)
 			r.Get("/hosts/{id}/schedules/new", s.handleUIScheduleNewGet)
 			r.Get("/hosts/{id}/schedules/{sid}/edit", s.handleUIScheduleEditGet)
@@ -284,14 +270,6 @@ func (s *Server) routes(r chi.Router) {
 	r.Group(func(r chi.Router) {
 		r.Use(s.requireRole(store.RoleAdmin))
 		r.Post("/api/hosts/{id}/update", s.handleHostUpdate)
 		r.Post("/hosts/{id}/update", s.handleHostUpdateForm)
 		// Fleet update (P6-15): rolling update across many hosts.
 		r.Post("/api/fleet/update", s.handleAPIFleetUpdateStart)
 		r.Post("/api/fleet-updates/{id}/cancel", s.handleAPIFleetUpdateCancel)
 		r.Get("/api/fleet-updates/{id}", s.handleAPIFleetUpdateGet)
 		r.Get("/api/users", s.handleAPIUsersList)
 		r.Post("/api/users", s.handleAPIUserCreate)
 		r.Get("/api/users/{id}", s.handleAPIUserGet)
@@ -305,8 +283,6 @@ func (s *Server) routes(r chi.Router) {
 		if s.deps.UI != nil {
 			r.Post("/hosts/{id}/delete", s.handleUIHostDelete)
 			r.Get("/settings", s.handleUISettings)
 			r.Get("/settings/fleet-update", s.handleUIFleetUpdate)
 			r.Get("/settings/fleet-update/partial", s.handleUIFleetUpdatePartial)
 			r.Get("/settings/users", s.handleUIUsersList)
 			r.Get("/settings/users/new", s.handleUIUserNewGet)
 			r.Post("/settings/users/new", s.handleUIUserNewPost)
@@ -345,27 +321,6 @@ func (s *Server) Shutdown(ctx context.Context) error {
 	return s.srv.Shutdown(ctx)
 }
 // SetFleetWorker stores fw as the server's fleet-update worker after the
 // server has been constructed. It exists to break the wiring loop in
 // cmd/server: the worker depends on a dispatcher that delegates back
 // into the server's host-update path.
 func (s *Server) SetFleetWorker(fw FleetWorker) {
 	s.deps.FleetWorker = fw
 }
 // DispatchHostUpdate is the public entry point for callers (the fleet
 // worker) that need to drive the same dispatch path the HTTP handler
 // uses, without going through HTTP.
 //
 // actorUserID is recorded as the acting user; pass "" when there is
 // none. The dispatch always runs with actor kind "user".
 //
 // On success jobID is the new job's ID and code is "". On a pre-check or
 // dispatch failure code carries the structured error code so the caller
 // can map it to its own status enum; jobID is then only populated for
 // "update_in_progress", where it names the job already in flight. err is
 // always nil: failures are reported through code.
 func (s *Server) DispatchHostUpdate(ctx context.Context, hostID, actorUserID string) (jobID string, code string, err error) {
 	var actorID *string
 	if actorUserID != "" {
 		actorID = &actorUserID
 	}
 	res := s.dispatchHostUpdate(ctx, hostID, "user", actorID)
 	// res.Code is "" on success, so a single return covers both outcomes.
 	return res.JobID, res.Code, nil
 }
 // Addr reports the listen address configured on the underlying HTTP
 // server. Handy in tests that pass :0 to obtain a random port.
 func (s *Server) Addr() string {
 	return s.srv.Addr
 }
@@ -1,83 +0,0 @@
 package http
 import (
 	"context"
 	"io"
 	stdhttp "net/http"
 	"strings"
 	"testing"
 	"time"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
 )
 // getDashboard issues GET baseURL+"/" with the given session cookie and
 // returns the response body as a string. Redirects are not followed, so
 // a redirect surfaces as a non-200 status. The calling test is failed
 // when the request cannot be built or sent, when the status is not 200,
 // or when the body cannot be read in full.
 func getDashboard(t *testing.T, baseURL string, cookie *stdhttp.Cookie) string {
 	t.Helper()
 	client := &stdhttp.Client{
 		// Hand back the first response instead of following redirects.
 		CheckRedirect: func(_ *stdhttp.Request, _ []*stdhttp.Request) error {
 			return stdhttp.ErrUseLastResponse
 		},
 	}
 	req, err := stdhttp.NewRequest(stdhttp.MethodGet, baseURL+"/", nil)
 	if err != nil {
 		t.Fatalf("new request: %v", err)
 	}
 	req.AddCookie(cookie)
 	res, err := client.Do(req)
 	if err != nil {
 		t.Fatalf("GET /: %v", err)
 	}
 	defer res.Body.Close()
 	if res.StatusCode != stdhttp.StatusOK {
 		t.Fatalf("GET /: want 200, got %d", res.StatusCode)
 	}
 	// io.ReadAll reports mid-stream read errors instead of returning a
 	// silently truncated page to the assertions.
 	body, err := io.ReadAll(res.Body)
 	if err != nil {
 		t.Fatalf("GET /: read body: %v", err)
 	}
 	return string(body)
 }
 // TestDashboard_HostRowSparklineRendersWithHistory seeds two daily
 // repo-size history rows for a host and expects the dashboard HTML to
 // contain the sparkline element together with a <polyline>.
 func TestDashboard_HostRowSparklineRendersWithHistory(t *testing.T) {
 	t.Parallel()
 	_, baseURL, st := newTestServerWithUI(t)
 	cookie := loginAsAdmin(t, st)
 	hostID := makeHost(t, st, "h-spark")
 	bg := context.Background()
 	// Two history points → polyline must render.
 	days := []string{"2026-05-05", "2026-05-06"}
 	for idx := range days {
 		size := int64(100 + idx*50)
 		stats := store.HostRepoStats{TotalSizeBytes: &size}
 		if err := st.UpsertHostRepoStatsHistory(bg, hostID, days[idx], stats, time.Now().UTC()); err != nil {
 			t.Fatalf("upsert %s: %v", days[idx], err)
 		}
 	}
 	html := getDashboard(t, baseURL, cookie)
 	hasSparkline := strings.Contains(html, `class="repo-sparkline"`)
 	hasPolyline := strings.Contains(html, `<polyline`)
 	if !hasSparkline {
 		t.Errorf("expected sparkline SVG in dashboard body (class=repo-sparkline missing)")
 	}
 	if !hasPolyline {
 		t.Errorf("expected <polyline> in dashboard body")
 	}
 }
 // TestDashboard_HostRowSparklineEmptyState renders the dashboard for a
 // host without any history rows and expects the sparkline element to be
 // present with an em-dash placeholder.
 func TestDashboard_HostRowSparklineEmptyState(t *testing.T) {
 	t.Parallel()
 	_, baseURL, st := newTestServerWithUI(t)
 	cookie := loginAsAdmin(t, st)
 	makeHost(t, st, "h-empty")
 	html := getDashboard(t, baseURL, cookie)
 	hasSparkline := strings.Contains(html, `class="repo-sparkline"`)
 	hasDash := strings.Contains(html, `>—<`)
 	if !hasSparkline {
 		t.Errorf("expected sparkline SVG element on dashboard")
 	}
 	if !hasDash {
 		t.Errorf("expected em-dash placeholder in empty sparkline cell")
 	}
 }
@@ -5,10 +5,8 @@ import (
 	"encoding/base64"
 	"encoding/json"
 	"errors"
 	"html/template"
 	"io/fs"
 	"log/slog"
 	"math"
 	stdhttp "net/http"
 	"net/url"
 	"sort"
@@ -25,8 +23,6 @@ import (
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ui"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ws"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/web/sparkline"
 	"gitea.dcglab.co.uk/steve/restic-manager/web"
 )
@@ -159,10 +155,6 @@ type dashboardPage struct {
 	// when it's already active). Pre-computed so the template stays
 	// dumb.
 	SortURL map[string]string
 	// UpdatesBehind is the count of online hosts whose agent_version
 	// trails the server. Surfaces as the dashboard "N hosts behind"
 	// hero tile and links to ?updates=behind.
 	UpdatesBehind int
 }
 // dashboardFilter holds the parsed query-string filter state.
@@ -173,10 +165,6 @@ type dashboardFilter struct {
 	Tag        string // mirrors ActiveTag for round-trip on links
 	Sort       string // column key (see sortDashboard)
 	Dir        string // "asc" | "desc"
 	// Updates narrows to hosts whose agent is behind the server's
 	// version. Only valid value today is "behind"; empty means no
 	// filter.
 	Updates string
 }
 // dashboardHostRow carries a host plus the per-row Run-now decision
@@ -192,17 +180,6 @@ type dashboardHostRow struct {
 	// NextRun is the next-fire time of RunAllScheduleID (when set),
 	// computed server-side from its cron. nil otherwise.
 	NextRun *time.Time
 	// UpdateAvailable is true when the host's agent has connected at
 	// least once AND its agent_version differs from the server's. Used
 	// by the host_row partial to render the update-available chip.
 	UpdateAvailable bool
 	// TargetVersion is the server's build version, surfaced in the
 	// chip's tooltip and label.
 	TargetVersion string
 	// RepoSparklineSVG is a server-rendered inline SVG showing the
 	// 30-day repo-size trend. Empty-state SVG (em-dash) is returned
 	// when no history rows exist for the host.
 	RepoSparklineSVG template.HTML
 }
 // pickRunAllSchedule returns the ID of the single schedule whose
@@ -278,11 +255,7 @@ func (s *Server) handleUIDashboard(w stdhttp.ResponseWriter, r *stdhttp.Request)
 	// calls per host — fine at fleet sizes we care about.
 	rows := make([]dashboardHostRow, 0, len(hosts))
 	for _, h := range hosts {
-		row := dashboardHostRow{
+		row := dashboardHostRow{Host: h}
 			Host:            h,
 			TargetVersion:   version.Version,
 			UpdateAvailable: h.AgentVersion != "" && h.AgentVersion != version.Version,
 		}
 		groups, gerr := s.deps.Store.ListSourceGroupsByHost(r.Context(), h.ID)
 		if gerr != nil {
 			slog.Warn("ui dashboard: list source groups", "host_id", h.ID, "err", gerr)
@@ -303,20 +276,6 @@ func (s *Server) handleUIDashboard(w stdhttp.ResponseWriter, r *stdhttp.Request)
 				}
 			}
 		}
 		since := time.Now().UTC().AddDate(0, 0, -30)
 		pts, herr := s.deps.Store.ListHostRepoStatsHistory(r.Context(), h.ID, since)
 		if herr != nil {
 			slog.Warn("ui dashboard: list repo history", "host_id", h.ID, "err", herr)
 		}
 		sparkPoints := make([]float64, len(pts))
 		for i, p := range pts {
 			if p.TotalSizeBytes == nil {
 				sparkPoints[i] = math.NaN()
 			} else {
 				sparkPoints[i] = float64(*p.TotalSizeBytes)
 			}
 		}
 		row.RepoSparklineSVG = sparkline.RenderSparkline(sparkPoints, 88, 20)
 		rows = append(rows, row)
 	}
@@ -330,13 +289,6 @@ func (s *Server) handleUIDashboard(w stdhttp.ResponseWriter, r *stdhttp.Request)
 		critOpenCount = len(crit)
 	}
 	updatesBehind := 0
 	for _, h := range allHosts {
 		if h.Status == "online" && h.AgentVersion != "" && h.AgentVersion != version.Version {
 			updatesBehind++
 		}
 	}
 	view := s.baseView(r, u)
 	view.Page = dashboardPage{
 		Hosts:         rows,
@@ -350,7 +302,6 @@ func (s *Server) handleUIDashboard(w stdhttp.ResponseWriter, r *stdhttp.Request)
 		Filter:        filter,
 		RefreshURL:    "/?" + filter.encode(),
 		SortURL:       buildDashboardSortURLs(filter),
 		UpdatesBehind: updatesBehind,
 	}
 	if err := s.deps.UI.Render(w, "dashboard", view); err != nil {
 		slog.Error("ui: render dashboard", "err", err)
@@ -369,7 +320,6 @@ func parseDashboardFilter(q url.Values) dashboardFilter {
 		Tag:        q.Get("tag"),
 		Sort:       q.Get("sort"),
 		Dir:        q.Get("dir"),
 		Updates:    q.Get("updates"),
 	}
 	if f.Sort == "" {
 		f.Sort = "name"
@@ -402,9 +352,6 @@ func (f dashboardFilter) encode() string {
 	if f.Dir != "" && f.Dir != "asc" {
 		v.Set("dir", f.Dir)
 	}
 	if f.Updates != "" {
 		v.Set("updates", f.Updates)
 	}
 	return v.Encode()
 }
@@ -455,11 +402,6 @@ func filterAndSortDashboardHosts(hosts []store.Host, f dashboardFilter) []store.
 				continue
 			}
 		}
 		if f.Updates == "behind" {
 			if h.AgentVersion == "" || h.AgentVersion == version.Version {
 				continue
 			}
 		}
 		out = append(out, h)
 	}
 	sortDashboardHosts(out, f.Sort, f.Dir)
@@ -867,20 +809,6 @@ type hostChromeData struct {
 	SourceGroupCount int
 	ScheduleCount    int
 	ScheduleVersion  int64 // host_schedule_version (latest desired)
 	// UpdateAvailable + TargetVersion drive the agent-out-of-date chip
 	// in the host detail header. UpdateAvailable is true iff the host
 	// has connected at least once AND its agent_version != server's.
 	UpdateAvailable bool
 	TargetVersion   string
 	// Online + UpdateInProgress drive the per-host "Update agent"
 	// button on host_detail. Online mirrors hub.Connected; pulled here
 	// so the button can disable when the host is unreachable.
 	Online           bool
 	UpdateInProgress bool
 	// CanAdmin is true when the viewing user has admin role; used to
 	// gate the "Update agent" button. Kept on the chrome struct so any
 	// page reusing host_chrome already has it for free.
 	CanAdmin bool
 	// KnownTags is the union of tags already in use across the fleet,
 	// used for autocomplete on the host-tags edit form. Cheap query.
 	KnownTags []string
@@ -906,14 +834,6 @@ type hostChromeData struct {
 // render the page with stale counts than 500 the whole tab.
 func (s *Server) loadHostChrome(r *stdhttp.Request, host store.Host, subtab, crumb string) hostChromeData {
 	d := hostChromeData{Host: host, SubTab: subtab, Crumb: crumb}
 	d.TargetVersion = version.Version
 	d.UpdateAvailable = host.AgentVersion != "" && host.AgentVersion != version.Version
 	if s.deps.Hub != nil {
 		d.Online = s.deps.Hub.Connected(host.ID)
 	}
 	if existing, _ := s.deps.Store.RunningUpdateJobForHost(r.Context(), host.ID); existing != "" {
 		d.UpdateInProgress = true
 	}
 	if groups, err := s.deps.Store.ListSourceGroupsByHost(r.Context(), host.ID); err == nil {
 		d.SourceGroupCount = len(groups)
 	} else {
@@ -1052,10 +972,8 @@ func (s *Server) handleUIHostDetail(w stdhttp.ResponseWriter, r *stdhttp.Request
 	view := s.baseView(r, u)
 	view.Title = host.Name + " · restic-manager"
 	chrome := s.loadHostChrome(r, *host, "snapshots", "snapshots")
 	chrome.CanAdmin = u.Role == string(store.RoleAdmin)
 	view.Page = hostDetailPage{
-		hostChromeData: chrome,
+		hostChromeData: s.loadHostChrome(r, *host, "snapshots", "snapshots"),
 		Snapshots:      shown,
 		SnapshotsShown: len(shown),
 		LegacyRestic:   !restic.Env{Version: host.ResticVersion}.AtLeastVersion(0, 17),
@@ -1,47 +0,0 @@
 package http
 import (
 	"log/slog"
 	stdhttp "net/http"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
 )
 // hostJobsPage is the page-data struct for /hosts/{id}/jobs.
 type hostJobsPage struct {
 	// hostChromeData is embedded, so its fields are promoted onto the
 	// page value handed to the template.
 	hostChromeData
 	// Jobs is the slice returned by Store.ListJobsByHost for the host.
 	Jobs []store.Job
 }
 // handleUIHostJobs renders the per-host jobs list page via the
 // "host_jobs" template. The handler itself only reads: it resolves the
 // UI user and the host, lists the host's jobs, and renders. A listing
 // or render failure is logged and answered with a plain 500.
 func (s *Server) handleUIHostJobs(w stdhttp.ResponseWriter, r *stdhttp.Request) {
 	user := s.requireUIUser(w, r)
 	if user == nil {
 		return
 	}
 	host, found := s.loadHostForUI(w, r)
 	if !found {
 		return
 	}
 	jobs, listErr := s.deps.Store.ListJobsByHost(r.Context(), host.ID, 100)
 	if listErr != nil {
 		slog.Error("ui host jobs: list", "host_id", host.ID, "err", listErr)
 		stdhttp.Error(w, "internal", stdhttp.StatusInternalServerError)
 		return
 	}
 	chrome := s.loadHostChrome(r, *host, "jobs", "jobs")
 	view := s.baseView(r, user)
 	view.Title = host.Name + " jobs · restic-manager"
 	view.Page = hostJobsPage{hostChromeData: chrome, Jobs: jobs}
 	renderErr := s.deps.UI.Render(w, "host_jobs", view)
 	if renderErr == nil {
 		return
 	}
 	slog.Error("ui: render host_jobs", "err", renderErr)
 	stdhttp.Error(w, "internal", stdhttp.StatusInternalServerError)
 }
@@ -1,85 +0,0 @@
 package http
 import (
 	"context"
 	"io"
 	stdhttp "net/http"
 	"strings"
 	"testing"
 	"time"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
 )
 // TestUIHostJobs_RendersList seeds one succeeded backup job and one
 // failed prune job for a fresh host, then checks that the rendered
 // /hosts/{id}/jobs page mentions both kinds, both statuses, both actor
 // kinds and the jobs-row CSS class.
 func TestUIHostJobs_RendersList(t *testing.T) {
 	t.Parallel()
 	_, baseURL, st := newTestServerWithUI(t)
 	cookie := loginAsAdmin(t, st)
 	hostID := makeHost(t, st, "h-jobs-render")
 	const (
 		backupJobID = "01HZZZZZZZZZZZZZZZZZZZZZ10"
 		pruneJobID  = "01HZZZZZZZZZZZZZZZZZZZZZ11"
 	)
 	now := time.Now().UTC()
 	ctx := context.Background()
 	// Older job: user-triggered backup, finished one minute after creation.
 	backup := store.Job{
 		ID: backupJobID, HostID: hostID, Kind: "backup",
 		ActorKind: "user", CreatedAt: now.Add(-time.Hour),
 	}
 	if err := st.CreateJob(ctx, backup); err != nil {
 		t.Fatalf("create job: %v", err)
 	}
 	if err := st.MarkJobFinished(ctx, backupJobID, "succeeded", 0, nil, "", now.Add(-time.Hour+time.Minute)); err != nil {
 		t.Fatalf("finish job: %v", err)
 	}
 	// Newer job: scheduled prune that failed with exit code 1.
 	prune := store.Job{
 		ID: pruneJobID, HostID: hostID, Kind: "prune",
 		ActorKind: "schedule", CreatedAt: now,
 	}
 	if err := st.CreateJob(ctx, prune); err != nil {
 		t.Fatalf("create job: %v", err)
 	}
 	if err := st.MarkJobFinished(ctx, pruneJobID, "failed", 1, nil, "boom", now.Add(time.Minute)); err != nil {
 		t.Fatalf("finish job: %v", err)
 	}
 	page := getHostJobsPage(t, baseURL, hostID, cookie)
 	needles := []string{"backup", "prune", "succeeded", "failed", "schedule", "user", `class="jobs-row`}
 	for i := range needles {
 		if strings.Contains(page, needles[i]) {
 			continue
 		}
 		t.Errorf("expected %q in body, missing", needles[i])
 	}
 }
 // TestUIHostJobs_EmptyState checks that a host with no jobs renders the
 // "No jobs yet." empty-state text on /hosts/{id}/jobs.
 func TestUIHostJobs_EmptyState(t *testing.T) {
 	t.Parallel()
 	_, baseURL, st := newTestServerWithUI(t)
 	cookie := loginAsAdmin(t, st)
 	hostID := makeHost(t, st, "h-jobs-empty")
 	const heading = "No jobs yet."
 	if page := getHostJobsPage(t, baseURL, hostID, cookie); !strings.Contains(page, heading) {
 		t.Error("expected empty-state heading")
 	}
 }
 // getHostJobsPage fetches /hosts/{id}/jobs with the given session cookie
 // and returns the response body as a string.
 //
 // Redirects are not followed (the client returns the first response
 // as-is), so a redirect surfaces as a non-200 status. Any request
 // error, non-200 status, or failure while reading the body aborts the
 // calling test via t.Fatalf.
 func getHostJobsPage(t *testing.T, baseURL, hostID string, cookie *stdhttp.Cookie) string {
 	t.Helper()
 	client := &stdhttp.Client{
 		CheckRedirect: func(_ *stdhttp.Request, _ []*stdhttp.Request) error {
 			return stdhttp.ErrUseLastResponse
 		},
 	}
 	req, err := stdhttp.NewRequest(stdhttp.MethodGet, baseURL+"/hosts/"+hostID+"/jobs", nil)
 	if err != nil {
 		t.Fatalf("new request: %v", err)
 	}
 	req.AddCookie(cookie)
 	res, err := client.Do(req)
 	if err != nil {
 		t.Fatalf("GET /hosts/%s/jobs: %v", hostID, err)
 	}
 	defer res.Body.Close()
 	if res.StatusCode != stdhttp.StatusOK {
 		t.Fatalf("GET /hosts/%s/jobs: want 200, got %d", hostID, res.StatusCode)
 	}
 	// A truncated body must fail loudly rather than be handed to the
 	// caller's substring assertions as if it were the whole page.
 	raw, err := io.ReadAll(res.Body)
 	if err != nil {
 		t.Fatalf("GET /hosts/%s/jobs: read body: %v", hostID, err)
 	}
 	return string(raw)
 }
@@ -1,12 +1,9 @@
 package http
 import (
 	"context"
 	"encoding/json"
 	"errors"
 	"html/template"
 	"log/slog"
 	"math"
 	stdhttp "net/http"
 	"strconv"
 	"strings"
@@ -16,7 +13,6 @@ import (
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ui"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/web/sparkline"
 )
 // ui_repo.go — HTML form-driven repo-tab handlers (connection,
@@ -31,15 +27,6 @@ import (
 //   POST /hosts/{id}/admin-credentials               — admin (prune) creds
 //   POST /hosts/{id}/admin-credentials/delete        — clear admin creds
 // repoTrendView is the data the repo_size_chart partial needs.
 // HostID + Range round-trip through the htmx range pills; ChartSVG
 // is pre-rendered server-side so the partial is just a wrapper.
 type repoTrendView struct {
 	// HostID is the host the chart was built for.
 	HostID   string
 	// Range is the range key after normalisation by buildRepoTrendView:
 	// "30d", "90d" or "1y".
 	Range    string
 	// ChartSVG is the markup returned by sparkline.RenderChart.
 	// NOTE(review): template.HTML is emitted without escaping, so this
 	// should only ever carry server-generated markup — confirm
 	// RenderChart never echoes caller-supplied strings.
 	ChartSVG template.HTML
 }
 // repoStatsView is a flat, pre-dereferenced projection of
 // store.HostRepoStats for use in templates. Nil pointer fields are
 // collapsed to zero/false and accompanied by a Has* sentinel so the
@@ -87,10 +74,6 @@ type hostRepoPage struct {
 	// Nil when no row exists yet (fresh hosts).
 	StatsView *repoStatsView
 	// Trend holds the pre-rendered chart fragment data for the
 	// 30/90/365-day repo-size + snapshot-count overlay chart.
 	Trend repoTrendView
 	// Snapshots-by-tag — map[group_name]count, plus an "untagged" row.
 	SnapshotsByTag    map[string]int
 	UntaggedSnapshots int
@@ -242,52 +225,9 @@ func (s *Server) loadHostRepoPage(r *stdhttp.Request, host store.Host) (*hostRep
 			}
 		}
 	}
 	p.Trend = s.buildRepoTrendView(r.Context(), host.ID, "30d")
 	return p, nil
 }
 // buildRepoTrendView assembles the chart-partial data for one host. It
 // backs both the initial page load (30d) and the htmx fragment endpoint
 // used for range switching.
 //
 // rangeKey selects the look-back window: "90d" → 90 days, "1y" → 365
 // days; any other value is normalised to "30d" → 30 days. A failure to
 // list history is logged as a warning and the chart is rendered from
 // whatever points were returned. Days whose size or snapshot count is
 // nil are plotted as NaN.
 func (s *Server) buildRepoTrendView(ctx context.Context, hostID, rangeKey string) repoTrendView {
 	days := 30
 	if rangeKey == "90d" {
 		days = 90
 	} else if rangeKey == "1y" {
 		days = 365
 	} else {
 		rangeKey = "30d"
 	}
 	cutoff := time.Now().UTC().AddDate(0, 0, -days)
 	history, err := s.deps.Store.ListHostRepoStatsHistory(ctx, hostID, cutoff)
 	if err != nil {
 		slog.Warn("ui repo trend: list history", "host_id", hostID, "err", err)
 	}
 	total := len(history)
 	labels := make([]time.Time, 0, total)
 	sizeSeries := make([]float64, 0, total)
 	countSeries := make([]float64, 0, total)
 	for _, pt := range history {
 		// NaN stands in for a missing measurement on that day.
 		size, count := math.NaN(), math.NaN()
 		if pt.TotalSizeBytes != nil {
 			size = float64(*pt.TotalSizeBytes)
 		}
 		if pt.SnapshotCount != nil {
 			count = float64(*pt.SnapshotCount)
 		}
 		labels = append(labels, pt.Day)
 		sizeSeries = append(sizeSeries, size)
 		countSeries = append(countSeries, count)
 	}
 	series := []sparkline.Series{
 		{Name: "size", Stroke: "#3b82f6", Axis: sparkline.AxisLeft, Format: sparkline.FormatBytes, Points: sizeSeries},
 		{Name: "snapshots", Stroke: "#f59e0b", Axis: sparkline.AxisRight, Format: sparkline.FormatCount, Points: countSeries},
 	}
 	return repoTrendView{
 		HostID:   hostID,
 		Range:    rangeKey,
 		ChartSVG: sparkline.RenderChart(series, labels, sparkline.ChartOpts{Width: 640, Height: 220}),
 	}
 }
 func (s *Server) handleUIHostRepo(w stdhttp.ResponseWriter, r *stdhttp.Request) {
 	u := s.requireUIUser(w, r)
 	if u == nil {
@@ -1,25 +0,0 @@
 // ui_repo_trend.go — htmx fragment endpoint for the repo-page
 // trend chart. Returns just the chart partial wrapped in
 // <div id="repo-trend-chart"> so htmx can outerHTML-swap it.
 //
 //	GET /hosts/{id}/repo/trend?range=30d|90d|1y
 package http
 import (
 	"log/slog"
 	stdhttp "net/http"
 	"github.com/go-chi/chi/v5"
 )
 // handleUIRepoTrend serves the htmx fragment for the repo-page trend
 // chart: it builds the trend view for the host named by the {id} URL
 // parameter and the "range" query parameter, then renders the
 // "repo_size_chart" partial.
 //
 // The handler returns without rendering when requireUIUser yields no
 // user. An unrecognised range is normalised by buildRepoTrendView. A
 // render failure is logged and answered with a 500, matching the other
 // UI handlers, so a broken partial no longer fails silently.
 func (s *Server) handleUIRepoTrend(w stdhttp.ResponseWriter, r *stdhttp.Request) {
 	u := s.requireUIUser(w, r)
 	if u == nil {
 		return
 	}
 	hostID := chi.URLParam(r, "id")
 	view := s.baseView(r, u)
 	view.Page = s.buildRepoTrendView(r.Context(), hostID, r.URL.Query().Get("range"))
 	if err := s.deps.UI.RenderPartial(w, "repo_size_chart", view); err != nil {
 		slog.Error("ui: render repo_size_chart", "host_id", hostID, "err", err)
 		stdhttp.Error(w, "internal", stdhttp.StatusInternalServerError)
 	}
 }
@@ -1,123 +0,0 @@
 package http
 import (
 	"context"
 	stdhttp "net/http"
 	"strings"
 	"testing"
 	"time"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
 )
 // getTrend fetches /hosts/{id}/repo/trend (adding ?range=<rangeKey> when
 // rangeKey is non-empty) with the given cookie and returns the body.
 //
 // Redirects are not followed. A request error or non-200 status aborts
 // the calling test. NOTE(review): the read loop stops on the first
 // error of any kind, so a mid-stream failure is indistinguishable from
 // end of body here.
 func getTrend(t *testing.T, baseURL, hostID, rangeKey string, cookie *stdhttp.Cookie) string {
 	t.Helper()
 	target := baseURL + "/hosts/" + hostID + "/repo/trend"
 	if rangeKey != "" {
 		target += "?range=" + rangeKey
 	}
 	req, err := stdhttp.NewRequest("GET", target, nil)
 	if err != nil {
 		t.Fatalf("new request: %v", err)
 	}
 	req.AddCookie(cookie)
 	noFollow := &stdhttp.Client{
 		CheckRedirect: func(_ *stdhttp.Request, _ []*stdhttp.Request) error {
 			return stdhttp.ErrUseLastResponse
 		},
 	}
 	res, err := noFollow.Do(req)
 	if err != nil {
 		t.Fatalf("GET %s: %v", target, err)
 	}
 	defer res.Body.Close()
 	if res.StatusCode != stdhttp.StatusOK {
 		t.Fatalf("GET %s: want 200, got %d", target, res.StatusCode)
 	}
 	collected := make([]byte, 0, 1<<20)
 	chunk := make([]byte, 4096)
 	for done := false; !done; {
 		n, rerr := res.Body.Read(chunk)
 		// Keep whatever was read even when the same call reports an error.
 		collected = append(collected, chunk[:n]...)
 		done = rerr != nil
 	}
 	return string(collected)
 }
 // TestUIRepoTrend_30dRange seeds five consecutive days of repo-stats
 // history ending today, requests the trend fragment with range=30d, and
 // checks for the chart class, the outer wrapper id and data-range="30d".
 func TestUIRepoTrend_30dRange(t *testing.T) {
 	t.Parallel()
 	_, baseURL, st := newTestServerWithUI(t)
 	cookie := loginAsAdmin(t, st)
 	hostID := makeHost(t, st, "h-trend")
 	ctx := context.Background()
 	now := time.Now().UTC()
 	for daysAgo := 0; daysAgo < 5; daysAgo++ {
 		day := now.AddDate(0, 0, -daysAgo).Format("2006-01-02")
 		sizeBytes := int64(1000 + daysAgo*100)
 		snapshots := int64(10 + daysAgo)
 		stats := store.HostRepoStats{TotalSizeBytes: &sizeBytes, SnapshotCount: &snapshots}
 		if err := st.UpsertHostRepoStatsHistory(ctx, hostID, day, stats, now); err != nil {
 			t.Fatalf("seed %s: %v", day, err)
 		}
 	}
 	fragment := getTrend(t, baseURL, hostID, "30d", cookie)
 	checks := []struct{ needle, failure string }{
 		{`class="repo-trend-chart"`, "expected repo-trend-chart SVG in fragment"},
 		{`id="repo-trend-chart"`, "expected outer wrapper id=repo-trend-chart"},
 		{`data-range="30d"`, "expected data-range=30d"},
 	}
 	for _, c := range checks {
 		if !strings.Contains(fragment, c.needle) {
 			t.Error(c.failure)
 		}
 	}
 }
 // TestUIRepoTrend_InvalidRangeFallsBackTo30d requests the trend fragment
 // with an unrecognised range value and checks the response is tagged
 // data-range="30d".
 func TestUIRepoTrend_InvalidRangeFallsBackTo30d(t *testing.T) {
 	t.Parallel()
 	_, baseURL, st := newTestServerWithUI(t)
 	cookie := loginAsAdmin(t, st)
 	hostID := makeHost(t, st, "h-trend2")
 	const marker = `data-range="30d"`
 	if fragment := getTrend(t, baseURL, hostID, "banana", cookie); !strings.Contains(fragment, marker) {
 		t.Errorf("expected data-range=30d on invalid range fallback")
 	}
 }
 // TestUIRepoPageRendersTrendPanel covers the full-page render path: it
 // seeds three days of history, fetches /hosts/{id}/repo via getRepoPage,
 // and checks that the chart id, the chart class and the "Trend" panel
 // heading are all present in the page.
 func TestUIRepoPageRendersTrendPanel(t *testing.T) {
 	t.Parallel()
 	_, baseURL, st := newTestServerWithUI(t)
 	cookie := loginAsAdmin(t, st)
 	hostID := makeHost(t, st, "h-trend-page")
 	ctx := context.Background()
 	now := time.Now().UTC()
 	for daysAgo := 0; daysAgo < 3; daysAgo++ {
 		day := now.AddDate(0, 0, -daysAgo).Format("2006-01-02")
 		sizeBytes := int64(2000 + daysAgo*200)
 		snapshots := int64(20 + daysAgo)
 		stats := store.HostRepoStats{TotalSizeBytes: &sizeBytes, SnapshotCount: &snapshots}
 		if err := st.UpsertHostRepoStatsHistory(ctx, hostID, day, stats, now); err != nil {
 			t.Fatalf("seed %s: %v", day, err)
 		}
 	}
 	page := getRepoPage(t, baseURL, hostID, cookie)
 	checks := []struct{ needle, failure string }{
 		{`id="repo-trend-chart"`, "expected id=\"repo-trend-chart\" in full-page render"},
 		{`class="repo-trend-chart"`, "expected class=\"repo-trend-chart\" in full-page render"},
 		{">Trend<", "expected panel heading '>Trend<' in full-page render"},
 	}
 	for _, c := range checks {
 		if !strings.Contains(page, c.needle) {
 			t.Error(c.failure)
 		}
 	}
 }
@@ -1,20 +0,0 @@
 package http
 import (
 	"encoding/json"
 	stdhttp "net/http"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
 )
 // handleVersion reports the server's build-time identifying constants
 // (set via -ldflags) as a JSON object with "version" and "commit" keys.
 // Public-band — no secrets surface here; the agent updater compares its
 // own agent_version byte-for-byte against the Version field to drive
 // the "out of date" signal.
 func (s *Server) handleVersion(w stdhttp.ResponseWriter, r *stdhttp.Request) {
 	payload := map[string]string{
 		"version": version.Version,
 		"commit":  version.Commit,
 	}
 	w.Header().Set("Content-Type", "application/json")
 	enc := json.NewEncoder(w)
 	// The encode error is deliberately dropped: nothing further can be
 	// sent to the client once writing the body has failed.
 	_ = enc.Encode(payload)
 }
@@ -1,42 +0,0 @@
 package http
 import (
 	"encoding/json"
 	stdhttp "net/http"
 	"testing"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
 )
 // TestVersionEndpoint overrides version.Version and version.Commit,
 // GETs /api/version from a test server, and checks that the JSON body
 // echoes both values with a 200 status.
 //
 // Deliberately NOT parallel: the test overwrites package-level
 // variables, and a parallel test would race with every other parallel
 // test that reads them. Serial tests never overlap with parallel ones,
 // and the cleanup restores the originals before the test ends.
 func TestVersionEndpoint(t *testing.T) {
 	prevV, prevC := version.Version, version.Commit
 	version.Version = "v9.9.9-test"
 	version.Commit = "abc1234"
 	t.Cleanup(func() {
 		version.Version = prevV
 		version.Commit = prevC
 	})
 	_, url, _ := newTestServerWithHub(t)
 	res, err := stdhttp.Get(url + "/api/version")
 	if err != nil {
 		t.Fatalf("get: %v", err)
 	}
 	defer res.Body.Close()
 	if res.StatusCode != stdhttp.StatusOK {
 		t.Fatalf("status: got %d want 200", res.StatusCode)
 	}
 	var body map[string]string
 	if err := json.NewDecoder(res.Body).Decode(&body); err != nil {
 		t.Fatalf("decode: %v", err)
 	}
 	if body["version"] != "v9.9.9-test" {
 		t.Fatalf("version: got %q", body["version"])
 	}
 	if body["commit"] != "abc1234" {
 		t.Fatalf("commit: got %q", body["commit"])
 	}
 }
@@ -75,28 +75,6 @@ func funcMap() template.FuncMap {
 			return *p
 		},
 		"sub": func(a, b int) int { return a - b },
 		// durationHuman formats the elapsed time between two *time.Time
 		// values as a short human string: "350ms", "4.2s", "2m 15s",
 		// "1h 4m". Returns "—" when either pointer is nil.
 		"durationHuman": func(start, end *time.Time) string {
 			if start == nil || end == nil {
 				return "—"
 			}
 			d := end.Sub(*start)
 			if d < 0 {
 				d = -d
 			}
 			if d < time.Second {
 				return fmt.Sprintf("%dms", d.Milliseconds())
 			}
 			if d < time.Minute {
 				return fmt.Sprintf("%.1fs", d.Seconds())
 			}
 			if d < time.Hour {
 				return fmt.Sprintf("%dm %ds", int(d.Minutes()), int(d.Seconds())%60)
 			}
 			return fmt.Sprintf("%dh %dm", int(d.Hours()), int(d.Minutes())%60)
 		},
 		// joinComma joins a slice with ", ". Used by the schedule list
 		// to render retention summaries.
 		"joinComma": func(parts []string) string { return strings.Join(parts, ", ") },
@@ -108,9 +108,6 @@ func New() (*Renderer, error) {
 		"templates/partials/tree_node.html",
 		"templates/partials/alert_row.html",
 		"templates/partials/crit_banner.html",
 		"templates/partials/fleet_update_inner.html",
 		"templates/partials/host_update_chip.html",
 		"templates/partials/repo_size_chart.html",
 	}
 	pageEntries, err := fs.Glob(web.FS, "templates/pages/*.html")
@@ -16,7 +16,6 @@ import (
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/auth"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
 )
 // HandlerDeps is the set of collaborators the agent WS handler needs.
@@ -27,9 +26,6 @@ type HandlerDeps struct {
 	// AlertEngine receives job-finished and host-online events so the
 	// alert engine can evaluate its rules. Optional; nil = no-op.
 	AlertEngine *alert.Engine
 	// UpdateWatcher reconciles in-flight agent-update dispatches against
 	// hello envelopes. Optional; nil = no-op.
 	UpdateWatcher *UpdateWatcher
 	// OnHello is called once per successful hello, after the host row
 	// has been touched and the conn registered. Used by the HTTP
 	// layer to push host_credentials down as a config.update before
@@ -151,9 +147,6 @@ func runAgentLoop(ctx context.Context, c *Conn, hostID string, deps HandlerDeps)
 	if deps.AlertEngine != nil {
 		deps.AlertEngine.NotifyHostOnline(hostID)
 	}
 	if deps.UpdateWatcher != nil {
 		deps.UpdateWatcher.OnHello(ctx, hostID, helloPayload.AgentVersion, version.Version)
 	}
 	deps.Hub.Register(hostID, c)
 	defer deps.Hub.Unregister(hostID, c)
@@ -227,17 +220,11 @@ func dispatchAgentMessage(ctx context.Context, c *Conn, hostID string, env api.E
 		// a *success* — restic's idempotent init returns that when the
 		// repo is already initialised, which is the happy path for
 		// onboarding against an existing repo.
-		if job, err := deps.Store.GetJob(ctx, p.JobID); err == nil && job != nil {
+		if job, err := deps.Store.GetJob(ctx, p.JobID); err == nil && job != nil &&
-			switch job.Kind {
+			job.Kind == string(api.JobInit) {
-			case string(api.JobInit):
+			status, errOut := repoStatusFromInit(string(p.Status), errMsg)
-				status, errOut := repoStatusFromInit(string(p.Status), errMsg)
+			if err := deps.Store.SetHostRepoStatus(ctx, hostID, status, errOut); err != nil {
-				if err := deps.Store.SetHostRepoStatus(ctx, hostID, status, errOut); err != nil {
+				slog.Warn("ws: set host repo status", "host_id", hostID, "err", err)
 					slog.Warn("ws: set host repo status", "host_id", hostID, "err", err)
 				}
 			case string(api.JobBackup):
 				if err := deps.Store.SetHostLastBackup(ctx, hostID, string(p.Status), p.FinishedAt); err != nil {
 					slog.Warn("ws: set host last backup", "host_id", hostID, "err", err)
 				}
 			}
 		}
 		if deps.JobHub != nil {
@@ -339,10 +326,6 @@ func dispatchAgentMessage(ctx context.Context, c *Conn, hostID string, env api.E
 		} else {
 			slog.Info("ws: repo stats refreshed", "host_id", hostID)
 		}
 		day := time.Now().UTC().Format("2006-01-02")
 		if err := deps.Store.UpsertHostRepoStatsHistory(ctx, hostID, day, patch, time.Now().UTC()); err != nil {
 			slog.Warn("ws: upsert host repo stats history", "host_id", hostID, "err", err)
 		}
 	case api.MsgCommandResult:
 		// TODO(P2): persist command.result acks for "did the agent
@@ -133,42 +133,3 @@ func TestRepoStatsReportPartialUpdate(t *testing.T) {
 		t.Errorf("LastCheckStatus: got %q want ok", got.LastCheckStatus)
 	}
 }
 // TestRepoStatsReportWritesHistoryRow dispatches a repo-stats message
 // for a seeded host and checks that exactly one history row exists
 // afterwards, dated today (UTC) and carrying the reported size and
 // snapshot count.
 func TestRepoStatsReportWritesHistoryRow(t *testing.T) {
 	t.Parallel()
 	s := openWSTestStore(t)
 	ctx := context.Background()
 	const hostID = "h-stats-history"
 	seedHostWS(t, s, hostID)
 	env, err := api.Marshal(api.MsgRepoStats, "", api.RepoStatsPayload{
 		TotalSizeBytes: int64ptrWS(12345),
 		SnapshotCount:  int64ptrWS(7),
 	})
 	if err != nil {
 		t.Fatalf("marshal: %v", err)
 	}
 	dispatchAgentMessage(ctx, nil, hostID, env, HandlerDeps{Store: s})
 	// A zero "since" asks for the host's whole history.
 	rows, err := s.ListHostRepoStatsHistory(ctx, hostID, time.Time{})
 	if err != nil {
 		t.Fatalf("list history: %v", err)
 	}
 	if len(rows) != 1 {
 		t.Fatalf("want 1 history row, got %d", len(rows))
 	}
 	row := rows[0]
 	wantDay := time.Now().UTC().Format("2006-01-02")
 	if got := row.Day.Format("2006-01-02"); got != wantDay {
 		t.Errorf("day: want %s, got %s", wantDay, got)
 	}
 	if size := row.TotalSizeBytes; size == nil || *size != 12345 {
 		t.Errorf("TotalSizeBytes: want 12345, got %v", row.TotalSizeBytes)
 	}
 	if count := row.SnapshotCount; count == nil || *count != 7 {
 		t.Errorf("SnapshotCount: want 7, got %v", row.SnapshotCount)
 	}
 }
@@ -1,184 +0,0 @@
 package ws
 import (
 	"context"
 	"fmt"
 	"log/slog"
 	"sync"
 	"time"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
 )
 // updateTimeout bounds how long the watcher waits for an agent to come
 // back with its new version after a command.update dispatch: sweep
 // fails any tracked entry whose age has reached this value. var (not
 // const) so tests can shrink it.
 var updateTimeout = 90 * time.Second
 // AlertRaiser is the slim subset of *alert.Engine the update watcher
 // touches. Defined here (not in the alert package) so the dependency
 // arrow points the right way.
 type AlertRaiser interface {
 	// RaiseUpdateFailed is called by the watcher's sweep for each update
 	// that timed out, with the host, the job, a human-readable reason
 	// and the sweep timestamp.
 	RaiseUpdateFailed(ctx context.Context, hostID, jobID, reason string, when time.Time)
 	// ResolveUpdateFailed is called by OnHello once a tracked update has
 	// been confirmed, so any open update_failed alert for the host can
 	// be resolved.
 	ResolveUpdateFailed(ctx context.Context, hostID string, when time.Time)
 }
 // UpdateWatcher tracks in-flight agent-update dispatches and reconciles
 // them against incoming hello envelopes. Entries land on Track and
 // resolve via OnHello (success path) or the periodic sweep (timeout).
 type UpdateWatcher struct {
 	// store is where job outcomes are persisted (MarkJobFinished).
 	store  *store.Store
 	// alerts receives raise/resolve calls; OnHello and sweep skip them when nil.
 	alerts AlertRaiser
 	jobHub *JobHub // optional — if nil, no fan-out to browser streams
 	// mu guards entries; Track, OnHello and sweep all take it before
 	// touching the map.
 	mu      sync.Mutex
 	entries map[string]*updateEntry // hostID → entry
 	// tickPeriod is the interval of the ticker driving Run's sweep; the
 	// constructor sets it to 5 seconds.
 	tickPeriod time.Duration
 }
 // updateEntry is the watcher's record of one in-flight update for a host.
 type updateEntry struct {
 	// jobID is the update job to mark succeeded or failed.
 	jobID     string
 	// startedAt is the time Track registered the entry; sweep measures
 	// the entry's age from it.
 	startedAt time.Time
 	// terminated is set once the entry has reached a terminal state so
 	// late OnHellos don't resurrect it.
 	terminated bool
 }
 // NewUpdateWatcher returns a watcher with an empty entry table and a
 // five-second sweep period. It is not started: call Run in a goroutine
 // to begin the periodic sweep. jobHub may be nil, in which case nothing
 // is fanned out to browser streams.
 func NewUpdateWatcher(st *store.Store, alerts AlertRaiser, jobHub *JobHub) *UpdateWatcher {
 	watcher := new(UpdateWatcher)
 	watcher.store = st
 	watcher.alerts = alerts
 	watcher.jobHub = jobHub
 	watcher.entries = map[string]*updateEntry{}
 	watcher.tickPeriod = 5 * time.Second
 	return watcher
 }
 // Track records a freshly dispatched update job for hostID, stamped
 // with the current time. Tracking the same host again replaces the
 // earlier entry (last-write-wins). Calling Track on a nil watcher is a
 // no-op.
 func (w *UpdateWatcher) Track(jobID, hostID string) {
 	if w == nil {
 		return
 	}
 	w.mu.Lock()
 	defer w.mu.Unlock()
 	w.entries[hostID] = &updateEntry{startedAt: time.Now(), jobID: jobID}
 }
 // OnHello is invoked by the WS handler once a successful hello has been
 // persisted. When the host has a live tracked update and agentVersion
 // equals targetVersion, the entry is removed, the job is marked
 // succeeded, a synthetic job.finished is published, and any open
 // update_failed alert is resolved. In every other case (untracked host,
 // already-terminated entry, version mismatch, nil watcher) it does
 // nothing, leaving a pending entry to the timeout sweep.
 func (w *UpdateWatcher) OnHello(ctx context.Context, hostID, agentVersion, targetVersion string) {
 	if w == nil {
 		return
 	}
 	var jobID string
 	// Claim the entry under the lock; the store and hub calls below run
 	// after it has been released.
 	claimed := func() bool {
 		w.mu.Lock()
 		defer w.mu.Unlock()
 		entry, tracked := w.entries[hostID]
 		if !tracked || entry.terminated || agentVersion != targetVersion {
 			// Untracked, already finished, or not the version we asked
 			// for — keep waiting.
 			return false
 		}
 		entry.terminated = true
 		jobID = entry.jobID
 		delete(w.entries, hostID)
 		return true
 	}()
 	if !claimed {
 		return
 	}
 	finishedAt := time.Now().UTC()
 	if err := w.store.MarkJobFinished(ctx, jobID, "succeeded", 0, nil, "", finishedAt); err != nil {
 		slog.Warn("ws update watcher: mark succeeded", "job_id", jobID, "host_id", hostID, "err", err)
 	}
 	w.publishJobFinished(jobID, api.JobSucceeded, 0, "", finishedAt)
 	if w.alerts == nil {
 		return
 	}
 	w.alerts.ResolveUpdateFailed(ctx, hostID, finishedAt)
 }
 // Run executes the timeout sweep once per tickPeriod until ctx is done,
 // then returns. It blocks, so callers run it in its own goroutine.
 // Calling Run on a nil watcher returns immediately.
 func (w *UpdateWatcher) Run(ctx context.Context) {
 	if w == nil {
 		return
 	}
 	ticker := time.NewTicker(w.tickPeriod)
 	defer ticker.Stop()
 	done := ctx.Done()
 	for {
 		select {
 		case tick := <-ticker.C:
 			// The tick's own timestamp is the sweep's notion of "now".
 			w.sweep(ctx, tick)
 		case <-done:
 			return
 		}
 	}
 }
 // sweep fails every tracked update whose age, measured at now, has
 // reached updateTimeout. Expired entries are removed from the table
 // under the lock; afterwards, for each one, the job is marked failed
 // with exit code -1, a synthetic job.finished is published, and (when
 // an AlertRaiser is configured) an update-failed alert is raised.
 //
 // now is supplied by the caller (Run passes the ticker time), so the
 // deadline logic can be driven by a synthetic clock.
 func (w *UpdateWatcher) sweep(ctx context.Context, now time.Time) {
 	type expired struct {
 		hostID string
 		jobID  string
 	}
 	var toFail []expired
 	w.mu.Lock()
 	for hostID, e := range w.entries {
 		if e.terminated {
 			continue
 		}
 		if now.Sub(e.startedAt) >= updateTimeout {
 			toFail = append(toFail, expired{hostID: hostID, jobID: e.jobID})
 			e.terminated = true
 			// Deleting the current key while ranging over a map is safe in Go.
 			delete(w.entries, hostID)
 		}
 	}
 	w.mu.Unlock()
 	if len(toFail) == 0 {
 		return
 	}
 	// Reason and timestamp are identical for every job expired by this sweep.
 	reason := fmt.Sprintf("timeout: agent did not reconnect within %s", updateTimeout)
 	stamp := now.UTC()
 	for _, x := range toFail {
 		if err := w.store.MarkJobFinished(ctx, x.jobID, "failed", -1, nil, reason, stamp); err != nil {
 			slog.Warn("ws update watcher: mark failed", "job_id", x.jobID, "host_id", x.hostID, "err", err)
 		}
 		w.publishJobFinished(x.jobID, api.JobFailed, -1, reason, stamp)
 		if w.alerts != nil {
 			w.alerts.RaiseUpdateFailed(ctx, x.hostID, x.jobID, reason, stamp)
 		}
 	}
 }
 // publishJobFinished broadcasts a synthetic job.finished envelope for
 // jobID through the JobHub so any browser still streaming the job sees
 // it terminate. The agent exits before it can send job.finished itself
 // (it is about to relaunch into the new binary), so without this
 // fan-out the /jobs/{id} page hangs until reload.
 //
 // Best-effort: with no hub configured nothing happens, and a marshal
 // failure is only logged — the DB-side state is already correct, this
 // is purely a UI wake-up.
 func (w *UpdateWatcher) publishJobFinished(jobID string, status api.JobStatus, exitCode int, errMsg string, finishedAt time.Time) {
 	hub := w.jobHub
 	if hub == nil {
 		return
 	}
 	env, err := api.Marshal(api.MsgJobFinished, "", api.JobFinishedPayload{
 		JobID:      jobID,
 		Status:     status,
 		ExitCode:   exitCode,
 		FinishedAt: finishedAt,
 		Error:      errMsg,
 	})
 	if err != nil {
 		slog.Warn("ws update watcher: marshal synthetic job.finished", "job_id", jobID, "err", err)
 		return
 	}
 	hub.Broadcast(jobID, env)
 }
@@ -1,230 +0,0 @@
 package ws
 import (
 	"context"
 	"sync"
 	"testing"
 	"time"
 	"github.com/oklog/ulid/v2"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
 )
 // fakeAlerts is a recording AlertRaiser for tests. Every call appends
 // to the slices below under mu, so assertions can inspect what the
 // watcher asked for.
 type fakeAlerts struct {
 	mu       sync.Mutex
 	raised   []string // hostIDs passed to RaiseUpdateFailed, in call order
 	resolved []string // hostIDs passed to ResolveUpdateFailed, in call order
 	reasons  []string // reason passed to RaiseUpdateFailed, index-aligned with raised
 }
 // RaiseUpdateFailed records the host and reason; the context, job ID
 // and timestamp are ignored.
 func (f *fakeAlerts) RaiseUpdateFailed(_ context.Context, hostID, _ /*jobID*/, reason string, _ time.Time) {
 	f.mu.Lock()
 	f.raised, f.reasons = append(f.raised, hostID), append(f.reasons, reason)
 	f.mu.Unlock()
 }
 // ResolveUpdateFailed records the host; the context and timestamp are
 // ignored.
 func (f *fakeAlerts) ResolveUpdateFailed(_ context.Context, hostID string, _ time.Time) {
 	f.mu.Lock()
 	f.resolved = append(f.resolved, hostID)
 	f.mu.Unlock()
 }
 // seedJob inserts a user-initiated "update" job for hostID under a
 // freshly generated ULID and returns that ID. A store error aborts the
 // calling test.
 func seedJob(t *testing.T, st *store.Store, hostID string) string {
 	t.Helper()
 	job := store.Job{
 		ID:        ulid.Make().String(),
 		HostID:    hostID,
 		Kind:      "update",
 		ActorKind: "user",
 		CreatedAt: time.Now().UTC(),
 	}
 	if err := st.CreateJob(context.Background(), job); err != nil {
 		t.Fatalf("create job: %v", err)
 	}
 	return job.ID
 }
 // TestUpdateWatcherOnHelloSuccess tracks an update, delivers a hello
 // whose agent version equals the target, and checks that the job is
 // marked succeeded, ResolveUpdateFailed ran once for the host, and
 // nothing was raised.
 func TestUpdateWatcherOnHelloSuccess(t *testing.T) {
 	st := openWSTestStore(t)
 	hostID := ulid.Make().String()
 	seedHostWS(t, st, hostID)
 	jobID := seedJob(t, st, hostID)
 	alerts := &fakeAlerts{}
 	watcher := NewUpdateWatcher(st, alerts, nil)
 	ctx := context.Background()
 	watcher.Track(jobID, hostID)
 	watcher.OnHello(ctx, hostID, "v2", "v2")
 	job, err := st.GetJob(ctx, jobID)
 	if err != nil {
 		t.Fatalf("get job: %v", err)
 	}
 	if job.Status != "succeeded" {
 		t.Fatalf("status: got %q want succeeded", job.Status)
 	}
 	alerts.mu.Lock()
 	defer alerts.mu.Unlock()
 	if resolvedOnce := len(alerts.resolved) == 1 && alerts.resolved[0] == hostID; !resolvedOnce {
 		t.Fatalf("resolve calls: %v", alerts.resolved)
 	}
 	if len(alerts.raised) > 0 {
 		t.Fatalf("unexpected raises: %v", alerts.raised)
 	}
 }
 func TestUpdateWatcherTimeout(t *testing.T) {
 	prev := updateTimeout
 	updateTimeout = 50 * time.Millisecond
 	t.Cleanup(func() { updateTimeout = prev })
 	st := openWSTestStore(t)
 	hostID := ulid.Make().String()
 	seedHostWS(t, st, hostID)
 	jobID := seedJob(t, st, hostID)
 	a := &fakeAlerts{}
 	w := NewUpdateWatcher(st, a, nil)
 	w.Track(jobID, hostID)
 	time.Sleep(80 * time.Millisecond)
 	w.sweep(context.Background(), time.Now())
 	job, err := st.GetJob(context.Background(), jobID)
 	if err != nil {
 		t.Fatalf("get job: %v", err)
 	}
 	if job.Status != "failed" {
 		t.Fatalf("status: got %q want failed", job.Status)
 	}
 	a.mu.Lock()
 	defer a.mu.Unlock()
 	if len(a.raised) != 1 || a.raised[0] != hostID {
 		t.Fatalf("raise calls: %v", a.raised)
 	}
 	if len(a.reasons) == 0 || a.reasons[0] == "" {
 		t.Fatalf("missing reason")
 	}
 }
 // TestUpdateWatcherMismatchedVersionNoOp checks that a hello reporting
 // a version other than the target leaves the tracked job un-finished
 // and triggers no alert calls.
 func TestUpdateWatcherMismatchedVersionNoOp(t *testing.T) {
 	st := openWSTestStore(t)
 	hostID := ulid.Make().String()
 	seedHostWS(t, st, hostID)
 	jobID := seedJob(t, st, hostID)
 	a := &fakeAlerts{}
 	w := NewUpdateWatcher(st, a, nil)
 	w.Track(jobID, hostID)
 	w.OnHello(context.Background(), hostID, "v1", "v2")
 	// Check the lookup error so a store failure reports itself instead
 	// of surfacing as a dereference of an unusable job value.
 	job, err := st.GetJob(context.Background(), jobID)
 	if err != nil {
 		t.Fatalf("get job: %v", err)
 	}
 	if job.Status == "succeeded" || job.Status == "failed" {
 		t.Fatalf("status flipped on mismatched hello: %q", job.Status)
 	}
 	a.mu.Lock()
 	defer a.mu.Unlock()
 	if len(a.raised) != 0 || len(a.resolved) != 0 {
 		t.Fatalf("unexpected alert calls raised=%v resolved=%v", a.raised, a.resolved)
 	}
 }
 func TestUpdateWatcherHelloAfterTimeoutIsNoOp(t *testing.T) {
 	prev := updateTimeout
 	updateTimeout = 50 * time.Millisecond
 	t.Cleanup(func() { updateTimeout = prev })
 	st := openWSTestStore(t)
 	hostID := ulid.Make().String()
 	seedHostWS(t, st, hostID)
 	jobID := seedJob(t, st, hostID)
 	a := &fakeAlerts{}
 	w := NewUpdateWatcher(st, a, nil)
 	w.Track(jobID, hostID)
 	time.Sleep(80 * time.Millisecond)
 	w.sweep(context.Background(), time.Now())
 	// Hello arrives after sweep — entry already gone, must be no-op.
 	w.OnHello(context.Background(), hostID, "v2", "v2")
 	job, _ := st.GetJob(context.Background(), jobID)
 	if job.Status != "failed" {
 		t.Fatalf("status flipped from failed → %q", job.Status)
 	}
 	a.mu.Lock()
 	defer a.mu.Unlock()
 	if len(a.resolved) != 0 {
 		t.Fatalf("late hello triggered ResolveUpdateFailed: %v", a.resolved)
 	}
 }
 // TestUpdateWatcherOnHelloBroadcastsJobFinished subscribes to the job on
 // a JobHub, confirms the update via a matching hello, and checks that a
 // job.finished envelope with status succeeded for that job arrives
 // within one second.
 func TestUpdateWatcherOnHelloBroadcastsJobFinished(t *testing.T) {
 	st := openWSTestStore(t)
 	hostID := ulid.Make().String()
 	seedHostWS(t, st, hostID)
 	jobID := seedJob(t, st, hostID)
 	hub := NewJobHub()
 	sub := hub.Register(jobID)
 	defer sub.unregister()
 	watcher := NewUpdateWatcher(st, &fakeAlerts{}, hub)
 	watcher.Track(jobID, hostID)
 	watcher.OnHello(context.Background(), hostID, "v2", "v2")
 	deadline := time.After(time.Second)
 	select {
 	case <-deadline:
 		t.Fatal("expected synthetic job.finished broadcast, got nothing")
 	case env := <-sub.ch:
 		if env.Type != api.MsgJobFinished {
 			t.Fatalf("envelope type: got %q want %q", env.Type, api.MsgJobFinished)
 		}
 		var finished api.JobFinishedPayload
 		if err := env.UnmarshalPayload(&finished); err != nil {
 			t.Fatalf("unmarshal payload: %v", err)
 		}
 		if matches := finished.JobID == jobID && finished.Status == api.JobSucceeded; !matches {
 			t.Fatalf("payload: got %+v", finished)
 		}
 	}
 }
 func TestUpdateWatcherTimeoutBroadcastsJobFinished(t *testing.T) {
 	prev := updateTimeout
 	updateTimeout = 50 * time.Millisecond
 	t.Cleanup(func() { updateTimeout = prev })
 	st := openWSTestStore(t)
 	hostID := ulid.Make().String()
 	seedHostWS(t, st, hostID)
 	jobID := seedJob(t, st, hostID)
 	hub := NewJobHub()
 	sub := hub.Register(jobID)
 	defer sub.unregister()
 	w := NewUpdateWatcher(st, &fakeAlerts{}, hub)
 	w.Track(jobID, hostID)
 	time.Sleep(80 * time.Millisecond)
 	w.sweep(context.Background(), time.Now())
 	select {
 	case env := <-sub.ch:
 		if env.Type != api.MsgJobFinished {
 			t.Fatalf("envelope type: got %q want %q", env.Type, api.MsgJobFinished)
 		}
 		var p api.JobFinishedPayload
 		if err := env.UnmarshalPayload(&p); err != nil {
 			t.Fatalf("unmarshal payload: %v", err)
 		}
 		if p.JobID != jobID || p.Status != api.JobFailed {
 			t.Fatalf("payload: got %+v", p)
 		}
 	case <-time.After(time.Second):
 		t.Fatal("expected synthetic job.finished broadcast, got nothing")
 	}
 }
@@ -77,56 +77,6 @@ func (s *Store) RaiseOrTouch(ctx context.Context, hostID, kind, dedupKey, severi
 	return id, true, nil
 }
 // RaiseOrTouchSystem is the host-less variant of RaiseOrTouch — the
 // alert row's host_id is stored as NULL, so the FK to hosts is bypassed.
 // Used by fleet-wide alerts (e.g. fleet_update_halted) where the
 // failure surface isn't pinned to a single host.
 //
 // Inside one transaction it looks for an unresolved, host-less alert
 // with the same kind and dedupKey. If one exists, its last_seen_at and
 // message are refreshed and the call returns (existingID, false, nil).
 // Otherwise a new row is inserted under a fresh ULID with created_at
 // and last_seen_at both set to when, and the call returns
 // (newID, true, nil). when is stored as UTC in RFC 3339 form with
 // nanoseconds.
 //
 // On any failure id is empty, didRaise is false, and err wraps the
 // underlying error with a "store: ..." prefix naming the failed step
 // (including commit, so callers can tell which step failed).
 func (s *Store) RaiseOrTouchSystem(ctx context.Context, kind, dedupKey, severity, message string, when time.Time) (id string, didRaise bool, err error) {
 	tx, err := s.db.BeginTx(ctx, nil)
 	if err != nil {
 		return "", false, fmt.Errorf("store: begin: %w", err)
 	}
 	// Safety net for early returns; the Rollback error is deliberately
 	// ignored because it is expected to fail once Commit has succeeded.
 	defer func() { _ = tx.Rollback() }()
 	whenStr := when.UTC().Format(time.RFC3339Nano)
 	row := tx.QueryRowContext(ctx,
 		`SELECT id FROM alerts
 		   WHERE host_id IS NULL AND kind = ? AND dedup_key = ? AND resolved_at IS NULL
 		   LIMIT 1`,
 		kind, dedupKey)
 	var existing string
 	switch err := row.Scan(&existing); {
 	case err == nil:
 		_, uerr := tx.ExecContext(ctx,
 			`UPDATE alerts SET last_seen_at = ?, message = ? WHERE id = ?`,
 			whenStr, message, existing)
 		if uerr != nil {
 			return "", false, fmt.Errorf("store: touch alert: %w", uerr)
 		}
 		if err := tx.Commit(); err != nil {
 			return "", false, fmt.Errorf("store: commit: %w", err)
 		}
 		return existing, false, nil
 	case errors.Is(err, sql.ErrNoRows):
 		// fall through to insert
 	default:
 		return "", false, fmt.Errorf("store: lookup alert: %w", err)
 	}
 	id = ulid.Make().String()
 	_, err = tx.ExecContext(ctx,
 		`INSERT INTO alerts (id, host_id, kind, dedup_key, severity, message, created_at, last_seen_at)
 		 VALUES (?, NULL, ?, ?, ?, ?, ?, ?)`,
 		id, kind, dedupKey, severity, message, whenStr, whenStr)
 	if err != nil {
 		return "", false, fmt.Errorf("store: insert alert: %w", err)
 	}
 	if err := tx.Commit(); err != nil {
 		return "", false, fmt.Errorf("store: commit: %w", err)
 	}
 	return id, true, nil
 }
 // refreshHostOpenAlertCount recomputes hosts.open_alert_count from the
 // alerts table for one host. Self-healing: idempotent and survives
 // out-of-order edits. Best-effort — errors are returned but callers
--- a/Show More
+++ b/Show More