ci(release): use DEV_TOKEN for registry login

The auto-issued GITHUB_TOKEN lacks write:package scope on this Gitea instance, so the v0.9.0 tag build failed at docker login. Switch to the user-level DEV_TOKEN secret which has the correct scope.
Merge pull request 'testing: bootstrap UI, agent reliability, NS-01..04 + alert username' (#18) from ns-batch-host-ops into main
2026-05-06 19:05:54 +01:00 · 2026-05-05 21:09:17 +00:00 · 2026-05-05 22:03:15 +01:00 · 2026-05-05 16:36:08 +00:00 · 2026-05-05 17:15:00 +01:00 · 2026-05-05 15:18:48 +01:00
146 changed files with 157 additions and 27684 deletions
@@ -1,32 +0,0 @@
-<!--
-Thanks for the PR! A few quick checks before submitting:
-
-* Did you open an issue first for non-trivial changes?
-* `make lint test` is green locally?
-* Commits are focused (one logical change per commit)?
-* No `Co-Authored-By` trailers (repo policy)?
-* No new dependencies without a one-line justification below?
--->
-
-## Summary
-
-<!-- One paragraph: what changed and why. -->
-
-## Test plan
-
-<!-- Bullet list of what you actually ran. Be specific.
-     - `make test` → green
-     - Manually exercised the new flow at /hosts/{id}/foo
-     - Smoke env: enrolled a fresh host, ran a backup end-to-end
--->
-
-## Notes for the reviewer
-
-<!-- Anything the reviewer needs to know that isn't obvious from the
-     diff: related issue, follow-up work that's intentionally not
-     in this PR, deferred concerns, design alternatives considered
-     and rejected. -->
-
-## Linked issues
-
-<!-- "Closes #123" / "Refs #456" / "Part of P5-06" -->
@@ -1,52 +0,0 @@
----
-name: Bug report
-about: Something isn't behaving the way the docs / code suggest it should
-title: "[bug] "
-labels: bug
----
-
-## What happened
-
-<!-- A clear description of the actual behaviour. Include the exact
-     UI surface, API endpoint, or CLI invocation involved. -->
-
-## What you expected
-
-<!-- What you thought would happen, and where that expectation came from
-     (docs page, command output, prior behaviour). -->
-
-## Steps to reproduce
-
-1.
-2.
-3.
-
-## Environment
-
-- restic-manager server version: <!-- `restic-manager-server --version` or footer of the UI -->
-- Agent version (if relevant): <!-- `restic-manager-agent --version` -->
-- restic version on affected host: <!-- `restic version` -->
-- Host OS: <!-- e.g. "Ubuntu 22.04 amd64" or "Windows Server 2022" -->
-- How was the server installed: <!-- docker compose / source build / other -->
-
-## Logs / output
-
-<details><summary>Server log (sanitised)</summary>
-
-```
-<!-- paste relevant lines; redact tokens, passwords, repo URLs -->
-```
-
-</details>
-
-<details><summary>Agent log (sanitised)</summary>
-
-```
-```
-
-</details>
-
-## Anything else
-
-<!-- Screenshots, related issues, recent changes you made before the
-     bug appeared, anything that might help. -->
@@ -1,34 +0,0 @@
----
-name: Feature request
-about: Suggest a new capability or change to existing behaviour
-title: "[feature] "
-labels: enhancement
----
-
-## What you're trying to do
-
-<!-- Describe the use case, not the proposed solution. Who is the
-     operator, what are they trying to accomplish, and what's
-     blocking them today? -->
-
-## Why the current behaviour falls short
-
-<!-- What does the system do today, and where does it stop short of
-     the use case above? -->
-
-## Proposed direction (optional)
-
-<!-- If you have a specific design in mind, describe it. Skip this
-     section if you'd rather leave it to the maintainer. -->
-
-## Scope check
-
-- [ ] I've read [`spec.md`](../spec.md) §2 (Goals & Non-Goals).
-- [ ] This isn't already on the roadmap in [`tasks.md`](../tasks.md).
-- [ ] This fits the project's "small fleet, one person operating"
-      target rather than enterprise / multi-tenant / SaaS use cases.
-
-## Anything else
-
-<!-- Related restic features, prior art in similar tools, links to
-     discussions you've had elsewhere. -->
@@ -2,34 +2,28 @@
 #
 # Notes for anyone editing this file:
 #
-# Custom runner image
-#   Every job runs inside `gitea.dcglab.co.uk/steve/ci-runner-go`
-#   (recipe: https://gitea.dcglab.co.uk/steve/ci/src/branch/main/images/ci-runner-go).
-#   That image already ships:
-#     * Go on PATH at /usr/local/go/bin (so `actions/setup-go` is
-#       redundant and intentionally NOT used here — the action would
-#       otherwise re-download Go on every job)
-#     * Node.js + npm (used by docs / e2e workflows)
-#     * Docker CLI, Buildx, Compose v2 (used by docker-build steps)
-#   When bumping the Go floor, push a new ci-runner-go image with
-#   the matching Go version and bump the date pin in IMAGE below.
-#
 # Self-hosted runner expectations
-#   Each runner host bind-mounts persistent volumes for
-#   /root/go/pkg/mod (GOMODCACHE), /root/.cache/go-build (GOCACHE),
-#   and /root/.cache/act (action clones) into every job container —
-#   regardless of which image the container is built from. As a
+#   The Gitea runners are provisioned out-of-band (the infra team owns
+#   the script). Each runner host bind-mounts persistent volumes for
+#   /root/go/pkg/mod (GOMODCACHE), /root/.cache/go-build (GOCACHE), and
+#   /root/.cache/act (action clones) into every job container. As a
 #   result:
-#     * Common GitHub actions (actions/checkout, actions/upload-artifact,
-#       golangci/golangci-lint-action) are pre-cloned into
-#       /root/.cache/act on the runner, so the per-job
-#       "git clone https://github.com/actions/..." step is a fetch,
-#       not a full clone.
+#     * `cache: true` on actions/setup-go is intentionally OMITTED — the
+#       action would otherwise tar/untar GOMODCACHE+GOCACHE through the
+#       Gitea cache backend on every job, undoing the host-volume cache
+#       and adding ~10s of redundant zstd round-trip per job.
+#     * Common GitHub actions (actions/checkout, actions/setup-go,
+#       actions/upload-artifact, golangci/golangci-lint-action) are
+#       pre-cloned into /root/.cache/act on the runner, so the per-job
+#       "git clone https://github.com/actions/..." step is a fetch, not
+#       a full clone.
 #     * golangci-lint is pre-installed at /usr/local/bin/golangci-lint
-#       on the runner host BUT that's outside the job's filesystem
-#       view; the golangci-lint-action below pins a specific version
-#       and re-downloads — that's fine (deterministic CI > marginal
-#       speed).
+#       on the runner (latest v2.x). The golangci-lint-action below
+#       still pins a specific version and re-downloads — that's fine
+#       (deterministic CI > marginal speed) but means the host-installed
+#       binary is currently unused. Drop the `version:` arg below to
+#       use the host-installed one if you want to trade determinism
+#       for speed.
 #
 # Build matrix
 #   Linux amd64 + arm64 + Windows amd64. CGO_ENABLED=0 throughout —
@@ -38,10 +32,10 @@
 #   binaries.
 #
 # Go version
-#   Anchored by the ci-runner-go image (currently Go 1.25.7). Floor
-#   is set by the heaviest dep (modernc.org/sqlite v1.50+ requires
-#   Go 1.23+; we run 1.25 so golangci-lint's Go-version compatibility
-#   check is happy — see the version pin in the lint job).
+#   The GO_VERSION env var anchors all three jobs. Floor is set by the
+#   heaviest dep (modernc.org/sqlite v1.50+ requires Go 1.23+ today;
+#   we run 1.25 so golangci-lint's Go-version compatibility check is
+#   happy — see the version pin in the lint job).
 #
 # upload-artifact
 #   Pinned at v3 historically; v3 was deprecated upstream. v4 should
@@ -54,12 +48,8 @@ on:
  pull_request:
    branches: [main]

-# Force bash as the default shell. With `container:` set on every
-# job, Gitea Actions otherwise picks `sh -e` and our `set -euo
-# pipefail` fails on dash with "Illegal option -o pipefail".
-defaults:
-  run:
-    shell: bash
+env:
+  GO_VERSION: "1.25"

 jobs:
  test:
@@ -70,7 +60,6 @@ jobs:
    # one runner. The third shard ("rest") covers everything else.
    name: Test (${{ matrix.name }})
    runs-on: ubuntu-latest
-    container: gitea.dcglab.co.uk/steve/ci-runner-go:2026-05-08
    strategy:
      fail-fast: false
      matrix:
@@ -84,6 +73,10 @@ jobs:
            packages: ""
    steps:
      - uses: actions/checkout@v4
+      - uses: actions/setup-go@v5
+        with:
+          go-version: ${{ env.GO_VERSION }}
+          # cache: true intentionally omitted — see header notes.
      - name: go vet
        run: go vet ./...
      - name: go test
@@ -105,9 +98,12 @@ jobs:
  lint:
    name: Lint
    runs-on: ubuntu-latest
-    container: gitea.dcglab.co.uk/steve/ci-runner-go:2026-05-08
    steps:
      - uses: actions/checkout@v4
+      - uses: actions/setup-go@v5
+        with:
+          go-version: ${{ env.GO_VERSION }}
+          # cache: true intentionally omitted — see header notes.
      - uses: golangci/golangci-lint-action@v7
        with:
          # Must be built against the same Go release as go.mod targets,
@@ -121,7 +117,6 @@ jobs:
  build:
    name: Build (${{ matrix.goos }}/${{ matrix.goarch }})
    runs-on: ubuntu-latest
-    container: gitea.dcglab.co.uk/steve/ci-runner-go:2026-05-08
    strategy:
      fail-fast: false
      matrix:
@@ -135,6 +130,10 @@ jobs:
            ext: ".exe"
    steps:
      - uses: actions/checkout@v4
+      - uses: actions/setup-go@v5
+        with:
+          go-version: ${{ env.GO_VERSION }}
+          # cache: true intentionally omitted — see header notes.
      - name: build server + agent
        env:
          GOOS: ${{ matrix.goos }}
@@ -1,109 +0,0 @@
-# P5-06 — End-to-end test suite.
-#
-# Spec : docs/superpowers/specs/2026-05-07-p5-oss-readiness-design.md
-# Stack: e2e/compose.e2e.yml (server + agent + rest-server + playwright)
-# Tests: e2e/playwright/tests/*.spec.ts
-#
-# Triggered on every PR into main and on workflow_dispatch. Runs
-# longer than the unit-test workflow (~3-4 minutes for a clean run);
-# kept separate so a slow e2e doesn't block the fast lint/test loop.
-#
-# Networking note: every interaction with the server (health probe,
-# Playwright) happens from a container on the compose `rmnet`
-# network, addressing the server as `http://server:8080`. We can't
-# rely on `127.0.0.1:8080` because Gitea's runner executes steps
-# inside its own container, where compose's host port-publish is
-# not visible.
-
-name: e2e
-
-on:
-  pull_request:
-    branches: [main]
-  workflow_dispatch:
-
-# Force bash as the default shell — see ci.yml header.
-defaults:
-  run:
-    shell: bash
-
-jobs:
-  e2e:
-    name: Playwright vs docker-compose
-    runs-on: ubuntu-latest
-    container: gitea.dcglab.co.uk/steve/ci-runner-go:2026-05-08
-    timeout-minutes: 15
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Build the e2e stack
-        # --profile test pulls in the playwright service which is
-        # otherwise gated. --pull refreshes base images so a bump
-        # to the Dockerfile's FROM tag (e.g. mcr.microsoft.com/
-        # playwright:vX.Y.Z-jammy) isn't masked by a stale runner
-        # cache that still has the old tag's layers.
-        run: docker compose --profile test -f e2e/compose.e2e.yml build --pull
-
-      - name: Bring up the stack
-        run: docker compose -f e2e/compose.e2e.yml up -d server rest-server source-fixture
-
-      - name: Wait for server health
-        run: |
-          set -eu
-          for i in $(seq 1 30); do
-            if docker run --rm --network e2e_rmnet curlimages/curl:8.10.1 \
-                  -fsS http://server:8080/api/version >/dev/null 2>&1; then
-              echo "server up"; exit 0
-            fi
-            sleep 2
-          done
-          echo "server didn't come up"; docker compose -f e2e/compose.e2e.yml logs server; exit 1
-
-      - name: Capture bootstrap token from server logs
-        id: bootstrap
-        run: |
-          set -eu
-          for i in $(seq 1 15); do
-            line=$(docker compose -f e2e/compose.e2e.yml logs server 2>&1 | grep -E 'bootstrap token' -A2 | grep -Eo '[a-zA-Z0-9_-]{40,}' | head -1 || true)
-            if [ -n "$line" ]; then
-              echo "RM_BOOTSTRAP_TOKEN=$line" >> "$GITHUB_ENV"
-              echo "got bootstrap token (${#line} chars)"
-              exit 0
-            fi
-            sleep 1
-          done
-          echo "bootstrap token not found in logs"
-          docker compose -f e2e/compose.e2e.yml logs server
-          exit 1
-
-      - name: Start the agent
-        run: docker compose -f e2e/compose.e2e.yml up -d agent
-
-      - name: Prepare report mounts
-        run: |
-          mkdir -p e2e/playwright/playwright-report e2e/playwright/test-results
-          chmod -R a+rwX e2e/playwright/playwright-report e2e/playwright/test-results
-
-      - name: Run Playwright tests
-        env:
-          RM_BOOTSTRAP_TOKEN: ${{ env.RM_BOOTSTRAP_TOKEN }}
-        run: docker compose -f e2e/compose.e2e.yml run --rm playwright
-
-      - name: Compose logs (on failure)
-        if: failure()
-        run: |
-          docker compose -f e2e/compose.e2e.yml logs --tail=200 server
-          docker compose -f e2e/compose.e2e.yml logs --tail=200 agent
-          docker compose -f e2e/compose.e2e.yml logs --tail=200 rest-server
-
-      - name: Upload Playwright report (on failure)
-        if: failure()
-        uses: actions/upload-artifact@v3
-        with:
-          name: playwright-report
-          path: e2e/playwright/playwright-report
-          retention-days: 7
-
-      - name: Tear down
-        if: always()
-        run: docker compose -f e2e/compose.e2e.yml down -v
@@ -37,16 +37,10 @@ env:
  REGISTRY: gitea.dcglab.co.uk
  IMAGE_NAME: ${{ gitea.repository }}

-# Force bash as the default shell — see ci.yml header.
-defaults:
-  run:
-    shell: bash
-
 jobs:
  image:
    name: Build + push image
    runs-on: ubuntu-latest
-    container: gitea.dcglab.co.uk/steve/ci-runner-go:2026-05-08
    steps:
      - uses: actions/checkout@v4

@@ -2,10 +2,6 @@
 /bin/
 /dist/

-# Generated mdBook output (source under docs/book/src is committed,
-# the rendered book/ directory is not).
-/docs/book/book/
-
 # Local data / runtime state
 /data/
 /certs/
@@ -38,7 +38,7 @@ but the **agent** is fetched by the install script from the server's
 **install script** are fetched from `<DataDir>/install/`. Plain
 `make build` doesn't touch any of those — the source-of-truth files
 in the working tree (`deploy/install/*`, `bin/restic-manager-agent`)
-must be copied into `$HOME/smoke/data/...` *and* the running agent
+must be copied into `/tmp/rm-smoke/data/...` *and* the running agent
 on this dev host needs replacing if the change touches agent code or
 the unit file.

@@ -53,13 +53,13 @@ asking the operator to test.**
 ```sh
 # 1. Restage what the install script serves (binary + unit + script).
 cp bin/restic-manager-agent \
-   $HOME/smoke/data/agent-binaries/restic-manager-agent-linux-amd64
+   /tmp/rm-smoke/data/agent-binaries/restic-manager-agent-linux-amd64
 cp deploy/install/install.sh \
-   $HOME/smoke/data/install/install.sh
+   /tmp/rm-smoke/data/install/install.sh
 cp deploy/install/install.ps1 \
-   $HOME/smoke/data/install/install.ps1
+   /tmp/rm-smoke/data/install/install.ps1
 cp deploy/install/restic-manager-agent.service \
-   $HOME/smoke/data/install/restic-manager-agent.service
+   /tmp/rm-smoke/data/install/restic-manager-agent.service

 # 2. Replace the running agent on this dev box and restart the
 #    service. Skip only when the change is server-side only AND
@@ -74,36 +74,15 @@ sudo -n systemctl restart restic-manager-agent
 # 3. The server runs from the working tree; restart it manually
 #    after a build that touches server code:
 pkill -f restic-manager-server
-RM_LISTEN=:8080 RM_DATA_DIR=$HOME/smoke/data \
+RM_LISTEN=:8080 RM_DATA_DIR=/tmp/rm-smoke/data \
 RM_BASE_URL=http://127.0.0.1:8080 \
-RM_SECRET_KEY_FILE=$HOME/smoke/data/secret.key \
+RM_SECRET_KEY_FILE=/tmp/rm-smoke/data/secret.key \
 RM_COOKIE_SECURE=false \
-./bin/restic-manager-server >> $HOME/smoke/server.log 2>&1 &
+./bin/restic-manager-server >> /tmp/rm-smoke/server.log 2>&1 &
 ```

-## Smoke server: use the Make targets, not raw `nohup`
-
-The smoke server runs as a transient `systemd --user` unit named
-`restic-manager-smoke.service` so it survives any sandbox or
-process-group boundary that would otherwise SIGTERM a backgrounded
-process. Use the Make targets:
-
-```
-make smoke-restart   # rebuild server + (re)launch as systemd --user unit
-make smoke-status    # systemctl --user status
-make smoke-logs      # tail $HOME/smoke/server.log
-make smoke-stop      # stop the unit
-make smoke-deploy    # full rebuild + restage agent assets + restart
-```
-
-`./bin/restic-manager-server &` from inside a Bash tool call gets
-reaped when the tool exits — don't do that. If the unit fails to
-start: `systemctl --user status restic-manager-smoke` and
-`$HOME/smoke/server.log` have the diagnosis.
-
-`smoke-deploy` does NOT touch `/usr/local/bin/restic-manager-agent`
-on this dev box; if your change requires the live agent here to
-update, run the agent restage block above by hand.
+A `make smoke-deploy` target that bundles all of this would be a
+good follow-up.

 ## Migrations: prefer column-level ALTERs over table rebuilds

@@ -1,69 +0,0 @@
-# Code of Conduct
-
-restic-manager is a small project run by one person. This Code of
-Conduct sets out the basic expectations for participating in the
-project's issue tracker, pull requests, and any other community
-spaces (chat, mailing lists) we may run in future.
-
-## Expected behaviour
-
-- **Be civil.** Disagreement is fine; rudeness is not. The same
-  comment can usually be made without making it personal.
-- **Assume good faith.** People asking what feels like a basic
-  question may be new to the project. People proposing what feels
-  like a duplicate idea may not have seen the prior discussion.
-  Point them to the right place politely.
-- **Stay on topic.** Issue threads are for the issue. Tangential
-  conversations belong in their own thread.
-- **Acknowledge the project's scope.** restic-manager is
-  intentionally small in scope (see `spec.md` §2). Reasonable
-  feature suggestions may still be declined for fit reasons.
-
-## Unacceptable behaviour
-
-- Harassment, threats, or insults — public or private.
-- Discriminatory comments based on age, body size, disability,
-  ethnicity, gender identity or expression, level of experience,
-  nationality, personal appearance, race, religion, sexual identity
-  or orientation.
-- Sustained disruption — derailing threads, ignoring repeated
-  requests to take a discussion elsewhere, brigading.
-- Publishing other people's private information without permission.
-
-## Reporting
-
-If someone in the project's spaces is behaving in a way that
-breaches this Code of Conduct, contact the maintainer directly
-through the contact details on their Gitea profile, or via the
-private security disclosure path documented in
-[SECURITY.md](./SECURITY.md). Reports stay confidential.
-
-The maintainer will review the report, gather context if needed,
-and respond. Possible outcomes include a private warning, a public
-clarification of expectations, a temporary or permanent ban from
-project spaces, or no action if the report doesn't hold up.
-
-There is no formal appeals process — this is a one-person project,
-not a foundation. If you think a decision was wrong you can say
-so, in writing, to the maintainer; that's it.
-
-## Scope
-
-This Code of Conduct applies to interactions in any space the
-project owns or operates: the Gitea repository (issues, pull
-requests, discussions, wiki), any chat channels we publish, and
-any conferences or events the project is officially represented at.
-
-It does not apply to:
-
-- Forks of the project that aren't being submitted back upstream.
-- Conversations between contributors that don't reference the
-  project.
-- Public criticism of the project itself.
-
-## Acknowledgement
-
-This document borrows shape and language from the
-[Contributor Covenant](https://www.contributor-covenant.org/) v2.1
-but is intentionally shorter and adapted to the project's
-single-maintainer reality.
@@ -1,168 +1,30 @@
-# Contributing to restic-manager
+# Contributing

-Thanks for your interest in restic-manager. This document covers how
-to set up a development environment, the conventions the project
-follows, and how patches make it from your machine into `main`.
+Thanks for your interest in contributing to restic-manager.

-## Project status and scope
+> This is a placeholder. The project is in pre-alpha (Phase 1 / MVP). A
+> full contributor guide will land alongside the Phase 5 OSS-readiness
+> work — see [`tasks.md`](./tasks.md) P5-02. Until then the notes below
+> apply.

-restic-manager is in pre-1.0. Core functionality (Phases 0–4) is
-landed; OSS-readiness polish is in progress. The top of
-[`tasks.md`](./tasks.md) tracks what's next; [`spec.md`](./spec.md)
-is the canonical design doc and the source of truth for any
-"why is it built this way" question.
+## Before opening a PR

-The project is **single-maintainer, hobbyist-scale, and licensed
-under [PolyForm Noncommercial 1.0.0](./LICENSE)**. That has two
-practical implications:
+1. Open an issue first for non-trivial changes — the design is still
+   moving (see [`spec.md`](./spec.md)) and unsolicited large PRs may
+   conflict with in-flight work.
+2. `make lint test` should pass.
+3. Match the existing code style — `gofumpt`, `goimports`, no comments
+   that just restate what the code does.
+4. Keep commits focused; one logical change per commit.

-1. Big PRs without prior discussion may be declined for fit
-   reasons even when they're correct — opening an issue first lets
-   us check alignment cheaply.
-2. Commercial use is not permitted by the license. Bug reports and
-   patches from operators of personal/community deployments are
-   very welcome.
+## Reporting security issues

-## Getting started
-
-### Prerequisites
-
-- Go 1.25 or newer (`go.mod` is the source of truth)
-- `make`
-- For the front-end CSS bundle: nothing extra — `make build`
-  downloads a pinned `tailwindcss` standalone binary into `bin/`.
-- For the docs site: nothing extra — `make docs` does the same trick
-  with `mdbook`.
-- For end-to-end tests: Docker + Docker Compose, plus `npx` for
-  Playwright.
-
-### One-time setup
-
-```sh
-git clone https://gitea.dcglab.co.uk/steve/restic-manager.git
-cd restic-manager
-make build          # compiles bin/restic-manager-{server,agent}
-make test           # full unit + integration test sweep
-make lint           # gofumpt + goimports + golangci-lint
-```
-
-### Running locally
-
-For most development, the [smoke environment](./docs/e2e-smoke.md)
-is the path of least resistance:
-
-```sh
-make smoke-restart  # rebuilds, launches as a systemd --user unit
-make smoke-logs     # tail of the server log
-```
-
-Then point a browser at `http://127.0.0.1:8080`. The first run
-prints a one-time bootstrap token to the log; use it to create the
-admin user.
-
-## Code conventions
-
-### Style
-
-- `gofumpt` for formatting; `goimports` for import grouping.
-  Both run via the pre-commit hook in this repo.
-- `golangci-lint` with `.golangci.yml` defaults; CI rejects on lint
-  errors.
-- UK English in identifiers, comments, log messages, and UI strings
-  (the misspell linter is configured for the UK locale — see
-  P3-X5 for the original sweep).
-- Comments explain **why**, not what; avoid restating the code.
-  A surprising invariant or an external constraint is worth
-  writing down. "Adds 1 to x" is not.
-- `slog` for structured logs. Never log secrets — and especially
-  never the merged-creds rest-server URL (see [`CLAUDE.md`](./CLAUDE.md)).
-
-### File and package layout
-
-- `cmd/server` and `cmd/agent` are the two binary entry points.
-- `internal/` holds everything that's not part of the public Go
-  API (which is none of it — restic-manager isn't a library).
-- Per-feature packages live under `internal/server/...` for the
-  control plane and `internal/agent/...` for the agent.
-- `web/templates/` are HTML templates rendered with the standard
-  library; embedded via `web.FS`.
-
-### Tests
-
-- Unit tests live alongside the code as `*_test.go`. Use the
-  in-process sqlite store (`store.Open(":memory:")`) when you need
-  state — there is no test mock layer to maintain.
-- HTTP handlers test through `httptest.NewServer` against the real
-  router; see `internal/server/http/auth_test.go` for the canonical
-  fixture pattern.
-- End-to-end tests live in `e2e/` and run against a Docker Compose
-  stack. See [`docs/e2e.md`](./docs/e2e.md).
-
-### Database migrations
-
-- Migrations are hand-rolled SQL in `internal/store/migrations/`
-  and embedded via `embed.FS`.
-- Prefer column-level `ALTER TABLE` over rebuilds — see
-  [`CLAUDE.md`](./CLAUDE.md) "Migrations" section for the FK-cascade
-  trap that bit migration 0007's first draft.
-
-## Workflow
-
-### Before opening a PR
-
-1. **Open an issue first** for non-trivial changes. The design is
-   still moving; an issue lets us agree on direction cheaply.
-2. Run `make lint test` locally — both must pass.
-3. Match existing code style (see above).
-4. Keep commits focused: one logical change per commit. Imperative
-   subject lines, body explaining why if it isn't obvious.
-5. Don't add `Co-Authored-By` trailers — repo policy. If you used
-   AI assistance in writing the patch, that's fine; we just don't
-   pollute every commit message with attribution boilerplate.
-
-### Pull requests
-
-PRs target `main`. CI runs lint + tests on Linux amd64/arm64 and
-Windows amd64; all three must be green to merge. Squash-merge is
-the default; the PR title becomes the merge-commit subject, so
-keep it short and informative.
-
-The PR template asks for:
-
-- A short description of what changed and why.
-- A test plan (commands run, scenarios verified).
-- Anything reviewers need to know to assess the change (related
-  issue, follow-up work, deferred concerns).
-
-### Reporting bugs
-
-Open an issue with:
-
-- restic-manager version (`server --version`) and agent version.
-- restic version on the affected host.
-- Steps to reproduce.
-- Server and agent logs (sanitise any tokens before pasting).
-
-Security-sensitive bugs go through the [SECURITY.md](./SECURITY.md)
-disclosure path instead — please don't open a public issue for
-them.
-
-### Suggesting features
-
-Open an issue describing the use case (not just the proposed
-solution). The roadmap in `tasks.md` shows where the project is
-heading; if the suggestion fits a future phase we'll wire it in
-there. If it falls outside the project's scope (multi-tenancy, SaaS,
-non-restic backends — see `spec.md` §2 non-goals) we'll say so
-early to save your time.
-
-## Code of conduct
-
-Project participation is governed by [CODE_OF_CONDUCT.md](./CODE_OF_CONDUCT.md).
-The short version: be civil; assume good faith; harassment is not
-tolerated.
+Please do **not** open a public issue for security problems. A
+`SECURITY.md` with a private disclosure path will be added in Phase 5
+(P5-05). Until then, contact the repository owner directly via the
+contact details on their Gitea profile.

 ## License

-By contributing you agree that your contributions are licensed
-under the [PolyForm Noncommercial 1.0.0](./LICENSE) license.
+By contributing you agree that your contributions are licensed under
+the [PolyForm Noncommercial 1.0.0](./LICENSE) license.
@@ -7,9 +7,7 @@ AGENT_BIN      := $(BIN_DIR)/restic-manager-agent
 VERSION        ?= $(shell git describe --tags --always --dirty 2>/dev/null || echo dev)
 COMMIT         ?= $(shell git rev-parse HEAD 2>/dev/null || echo none)
 DATE           ?= $(shell date -u +%Y-%m-%dT%H:%M:%SZ)
-VERSION_PKG    := gitea.dcglab.co.uk/steve/restic-manager/internal/version
-LDFLAGS        := -s -w -X main.version=$(VERSION) -X main.commit=$(COMMIT) -X main.date=$(DATE) \
-                  -X $(VERSION_PKG).Version=$(VERSION) -X $(VERSION_PKG).Commit=$(COMMIT)
+LDFLAGS        := -s -w -X main.version=$(VERSION) -X main.commit=$(COMMIT) -X main.date=$(DATE)
 GOFLAGS        := -trimpath
 DOCKER_IMAGE   ?= gitea.dcglab.co.uk/steve/restic-manager
 DOCKER_TAG     ?= dev
@@ -24,29 +22,7 @@ TAILWIND_URL      := https://github.com/tailwindlabs/tailwindcss/releases/downlo
 TAILWIND_INPUT    := web/styles/input.css
 TAILWIND_OUTPUT   := web/static/css/styles.css

-# mdBook for the docs site (P5-01). Single static binary, no
-# Rust toolchain — same pattern as Tailwind.
-MDBOOK_VERSION    ?= v0.4.51
-MDBOOK_OS         := $(shell uname -s | tr A-Z a-z)
-MDBOOK_TRIPLE     := $(shell uname -m)-unknown-$(if $(filter darwin,$(MDBOOK_OS)),apple-darwin,linux-gnu)
-MDBOOK_BIN        := $(BIN_DIR)/mdbook
-MDBOOK_TARBALL    := mdbook-$(MDBOOK_VERSION)-$(MDBOOK_TRIPLE).tar.gz
-MDBOOK_URL        := https://github.com/rust-lang/mdBook/releases/download/$(MDBOOK_VERSION)/$(MDBOOK_TARBALL)
-DOCS_BOOK_DIR     := docs/book
-DOCS_BOOK_OUT     := $(DOCS_BOOK_DIR)/book
-
-.PHONY: help build server agent test test-race lint fmt tidy clean run-server run-agent docker release tailwind tailwind-watch docs docs-watch setup hooks smoke-restart smoke-stop smoke-status smoke-logs smoke-deploy
-
-# ---- smoke-env tooling -------------------------------------------------
-# The smoke server runs as a transient user-systemd unit so it survives
-# bash-tool boundaries and reboots-of-the-shell. Use `make smoke-restart`
-# any time you've rebuilt the server. `make smoke-deploy` is the full
-# rebuild + restage + restart workflow described in CLAUDE.md.
-SMOKE_UNIT       := restic-manager-smoke
-SMOKE_DATA_DIR   := $(HOME)/smoke/data
-SMOKE_LOG_FILE   := $(HOME)/smoke/server.log
-SMOKE_BASE_URL   := http://127.0.0.1:8080
-SMOKE_LISTEN     := :8080
+.PHONY: help build server agent test test-race lint fmt tidy clean run-server run-agent docker release tailwind tailwind-watch setup hooks

 help:
 	@grep -E '^[a-zA-Z_-]+:.*?## ' $(MAKEFILE_LIST) | awk 'BEGIN{FS=":.*?## "};{printf "  \033[36m%-14s\033[0m %s\n",$$1,$$2}'
@@ -71,18 +47,6 @@ tailwind-watch: $(TAILWIND_BIN) ## Watch and rebuild on every save
 	@mkdir -p $$(dirname $(TAILWIND_OUTPUT))
 	$(TAILWIND_BIN) -c tailwind.config.js -i $(TAILWIND_INPUT) -o $(TAILWIND_OUTPUT) --watch

-$(MDBOOK_BIN):
-	@mkdir -p $(BIN_DIR)
-	@echo "==> downloading mdbook $(MDBOOK_VERSION) ($(MDBOOK_TRIPLE))"
-	curl -fsSL "$(MDBOOK_URL)" | tar -xz -C $(BIN_DIR) mdbook
-	@chmod +x $@
-
-docs: $(MDBOOK_BIN) ## Build the docs/book/ mdBook site into docs/book/book/
-	$(MDBOOK_BIN) build $(DOCS_BOOK_DIR)
-
-docs-watch: $(MDBOOK_BIN) ## Serve the docs site at http://127.0.0.1:3000 with live reload
-	$(MDBOOK_BIN) serve $(DOCS_BOOK_DIR) -n 127.0.0.1 -p 3000
-
 agent: ## Build the agent binary
 	@mkdir -p $(BIN_DIR)
 	CGO_ENABLED=0 go build $(GOFLAGS) -ldflags "$(LDFLAGS)" -o $(AGENT_BIN) ./cmd/agent
@@ -113,7 +77,7 @@ tidy: ## go mod tidy
 	go mod tidy

 clean: ## Remove build artifacts
-	rm -rf $(BIN_DIR) coverage.out coverage.html $(TAILWIND_OUTPUT) $(DOCS_BOOK_OUT)
+	rm -rf $(BIN_DIR) coverage.out coverage.html $(TAILWIND_OUTPUT)

 run-server: server ## Build and run the server
 	$(SERVER_BIN)
@@ -128,48 +92,6 @@ docker: ## Build the server Docker image
 	  --build-arg DATE=$(DATE) \
 	  -t $(DOCKER_IMAGE):$(DOCKER_TAG) .

-smoke-restart: server ## (Re)start the smoke server as a transient user-systemd unit
-	@systemctl --user reset-failed $(SMOKE_UNIT) >/dev/null 2>&1 || true
-	@systemctl --user stop $(SMOKE_UNIT) >/dev/null 2>&1 || true
-	@echo "==> launching $(SMOKE_UNIT)"
-	systemd-run --user --unit=$(SMOKE_UNIT) \
-	  --setenv=RM_LISTEN=$(SMOKE_LISTEN) \
-	  --setenv=RM_DATA_DIR=$(SMOKE_DATA_DIR) \
-	  --setenv=RM_BASE_URL=$(SMOKE_BASE_URL) \
-	  --setenv=RM_SECRET_KEY_FILE=$(SMOKE_DATA_DIR)/secret.key \
-	  --setenv=RM_COOKIE_SECURE=false \
-	  --property=StandardOutput=append:$(SMOKE_LOG_FILE) \
-	  --property=StandardError=append:$(SMOKE_LOG_FILE) \
-	  --property=Restart=on-failure \
-	  $(PWD)/$(SERVER_BIN)
-	@for i in 1 2 3 4 5; do \
-	  curl -fsS -o /dev/null $(SMOKE_BASE_URL)/api/version 2>/dev/null && \
-	    { echo "==> smoke server up: $$(curl -s $(SMOKE_BASE_URL)/api/version)"; exit 0; }; \
-	  sleep 1; \
-	done; \
-	echo "!! smoke server did not respond on $(SMOKE_BASE_URL) — check $(SMOKE_LOG_FILE)" >&2; \
-	systemctl --user status --no-pager $(SMOKE_UNIT) || true; \
-	exit 1
-
-smoke-stop: ## Stop the smoke server
-	systemctl --user stop $(SMOKE_UNIT) || true
-	@systemctl --user reset-failed $(SMOKE_UNIT) >/dev/null 2>&1 || true
-
-smoke-status: ## Show status of the smoke server
-	@systemctl --user status --no-pager $(SMOKE_UNIT) 2>&1 | head -20 || true
-
-smoke-logs: ## Tail the smoke server log
-	tail -50 $(SMOKE_LOG_FILE)
-
-smoke-deploy: build smoke-restart ## Rebuild + restage agent into smoke + restart server (full per-CLAUDE.md cycle)
-	@echo "==> restaging agent + install assets into $(SMOKE_DATA_DIR)"
-	cp $(AGENT_BIN) $(SMOKE_DATA_DIR)/agent-binaries/restic-manager-agent-linux-amd64
-	cp deploy/install/install.sh $(SMOKE_DATA_DIR)/install/install.sh
-	cp deploy/install/install.ps1 $(SMOKE_DATA_DIR)/install/install.ps1
-	cp deploy/install/restic-manager-agent.service $(SMOKE_DATA_DIR)/install/restic-manager-agent.service
-	@echo "==> NOTE: this dev box's installed agent at /usr/local/bin/restic-manager-agent is NOT updated by this target."
-	@echo "    Run the agent restage block in CLAUDE.md if your change touches agent code or the unit file."
-
 release: ## Cross-compile for all supported platforms
 	@mkdir -p $(BIN_DIR)
 	@for target in linux/amd64 linux/arm64 windows/amd64; do                          \
@@ -1,62 +1,36 @@
 # restic-manager

 Self-hosted, browser-based, single-pane-of-glass for managing
-[restic](https://restic.net) backups across a fleet of Linux and
-Windows endpoints.
+[restic](https://restic.net) backups across a fleet of Linux and Windows
+endpoints.

-> **Status:** pre-1.0, feature-complete for the original use
-> case. Phases 0–4 + 6 are landed (MVP, scheduling, restore,
-> RBAC + OIDC, observability); Phase 5 (OSS readiness — docs site,
-> contributor onboarding, end-to-end CI) is in flight. See
-> [`spec.md`](./spec.md) for the design and [`tasks.md`](./tasks.md)
-> for the live roadmap.
+> Status: pre-alpha. Phase 0 (project bootstrap) complete; Phase 1 (MVP) in
+> progress. See [`spec.md`](./spec.md) for the design and
+> [`tasks.md`](./tasks.md) for the roadmap.

-## What it does
+## What it does (target)

- Central visibility into backup state for every endpoint.
- Trigger any restic operation remotely (`backup`, `forget`,
-  `prune`, `check`, `unlock`, `snapshots`, `stats`, `diff`,
-  `restore`).
- Per-host schedules with named source groups + retention.
- Live job log streamed to the browser; downloadable as
-  text/NDJSON afterwards.
- Restore wizard: browse a snapshot's tree, pick paths, restore
-  in-place or to a new directory.
- Repo health surfacing (size, raw size, last check, lock state),
-  plus a 30/90-day repo-size trend.
- Alerting over webhook, ntfy, or SMTP.
- Cross-platform agent (Linux systemd + Windows SCM).
- Append-only-friendly: separate admin credential for prune.
- Optional Prometheus `/metrics` endpoint + sample Grafana
-  dashboard.
- Optional OIDC SSO (Authelia, Authentik, etc.).
+- Central visibility into backup state for every endpoint
+- Trigger any restic operation remotely (`backup`, `forget`, `prune`,
+  `check`, `unlock`, `snapshots`, `stats`, `diff`, `restore`)
+- Manage per-host backup schedules from the UI
+- Live job progress streamed back to the UI
+- Restore wizard (browse snapshots, pick paths, restore to original or
+  alternate host)
+- Repo health surfacing (size, dedup ratio, last check, lock state)
+- Alerting on failure or staleness
+- Cross-platform agent (Linux + Windows)
+- Ransomware-resistant repo access via append-only credentials

-## Screenshots
+## Architecture (one-line summary)

-| Sign in | Empty dashboard | Add host |
-|:-------:|:---------------:|:--------:|
-| ![Sign in](docs/screenshots/01-login.png) | ![Dashboard, fresh](docs/screenshots/02-dashboard-empty.png) | ![Add host](docs/screenshots/03-add-host.png) |
-
-| Alerts | Settings | Audit log |
-|:------:|:--------:|:---------:|
-| ![Alerts](docs/screenshots/04-alerts.png) | ![Settings](docs/screenshots/05-settings.png) | ![Audit log](docs/screenshots/06-audit.png) |
-
-(Screenshots from a fresh smoke install with no hosts. A populated
-fleet view and the live-log + restore wizard surfaces are part of
-the docs site under [`docs/book/`](./docs/book) — `make docs` to
-render locally.)
-
-## Architecture (one-line)
-
-A small Go control-plane in Docker, lightweight Go agents on each
-endpoint holding an outbound WebSocket to the control-plane, and
-a restic repository (rest-server, S3, B2, SFTP — anything restic
-speaks) that holds the actual backup data. **The control-plane
-never touches backup bytes.**
+A small Go control-plane on the Proxmox host, lightweight Go agents on each
+endpoint that hold an outbound WebSocket to the control-plane, and a
+`restic/rest-server` on Unraid that holds the actual backup data. The
+control-plane never touches backup bytes.

 Full architecture diagram and component breakdown:
-[`spec.md` §3](./spec.md), or the rendered version in the
-[docs site](./docs/book/src/concepts/architecture.md).
+[`spec.md` §3](./spec.md).

 ## Repository layout

@@ -64,63 +38,31 @@ Full architecture diagram and component breakdown:
 cmd/server/        control-plane binary
 cmd/agent/         endpoint agent binary
 internal/api       shared API types (REST + WS envelopes)
-internal/server/   HTTP, WS, UI handlers, alert engine
+internal/server/   HTTP, WS, UI handlers
 internal/agent/    service integration, restic runner, local scheduler
 internal/restic    restic CLI wrapper
 internal/store     SQLite persistence
-internal/crypto    secret encryption (AEAD)
+internal/crypto    secret encryption
 internal/auth      passwords, sessions, agent tokens
 web/               server-rendered templates + static assets
-deploy/            Dockerfile, docker-compose.yml, install scripts, Grafana dashboard
-docs/              prose docs + the mdBook site under docs/book
-e2e/               compose stack + Playwright tests for end-to-end CI
+deploy/            Dockerfile, docker-compose.yml, install scripts
+design/            UI wireframes (Phase 0 design pass)
 ```

-## Quickstart
-
-The reference deployment is a single Docker container fronted by
-your existing reverse proxy. See the [installation guide](docs/book/src/getting-started/install.md)
-for the full path; the very short version:
-
-```sh
-export RM_VERSION=v0.9.0    # pin a real tag
-export RM_BASE_URL=https://restic.example.com
-export RM_TRUSTED_PROXY=10.0.0.0/8
-docker compose -f deploy/docker-compose.yml up -d
-```
-
-The server prints a one-time bootstrap token to the log on first
-start. POST it to `/api/bootstrap` (or open `/bootstrap` in a
-browser) to create the admin user.
-
 ## Local development

-Requires Go 1.25+. The floor is set by `modernc.org/sqlite` v1.50.
+Requires Go 1.25+ (built and tested on 1.26). The floor is set by
+`modernc.org/sqlite` v1.50.

 ```sh
 make build           # builds cmd/server and cmd/agent into ./bin
 make test            # runs go test ./...
 make lint            # runs golangci-lint
-make smoke-restart   # systemd --user smoke server (see CLAUDE.md)
-make docs            # renders the mdBook site to docs/book/book/
+make run-server      # runs the server (dev defaults)
 ```

-End-to-end test harness against a Docker Compose stack with a
-sibling Linux agent: see [`docs/e2e.md`](docs/e2e.md). Runs in CI
-on every PR.
-
-## Documentation
-
- **Concepts and operator guides**: [docs site](docs/book/src/intro.md),
-  rendered with `make docs`.
- **Reverse-proxy setup**: [docs/reverse-proxy.md](docs/reverse-proxy.md).
- **Prometheus + Grafana**: [docs/prometheus.md](docs/prometheus.md).
- **End-to-end test harness**: [docs/e2e.md](docs/e2e.md).
- **Security policy**: [SECURITY.md](SECURITY.md).
- **Contributing**: [CONTRIBUTING.md](CONTRIBUTING.md).
-
 ## License

-[PolyForm Noncommercial 1.0.0](./LICENSE). Free for personal,
-hobby, research, educational, governmental, and other noncommercial
-use. Commercial use requires a separate license.
+PolyForm Noncommercial 1.0.0 — see [`LICENSE`](./LICENSE). Free for personal,
+hobby, research, educational, governmental, and other noncommercial use.
+Commercial use requires a separate license.
@@ -1,137 +0,0 @@
-# Security policy
-
-restic-manager handles credentials that grant access to backup
-repositories — losing them means an attacker can read or destroy a
-fleet's backups. We take security reports seriously even at this
-project's small scale.
-
-## Supported versions
-
-Pre-1.0, only `main` HEAD and the latest tagged release are supported.
-Backporting fixes to older tags is not currently offered.
-
-| Version            | Supported      |
-|--------------------|----------------|
-| `main` HEAD        | Yes            |
-| Latest released tag| Yes            |
-| Anything older     | No             |
-
-## Reporting a vulnerability
-
-**Please don't open a public issue for security problems.**
-
-Instead, use one of these private channels:
-
-1. **Gitea private message** to the repository owner. The
-   instance is at <https://gitea.dcglab.co.uk> and the owner's
-   profile (`steve`) has direct-message contact set up.
-2. **Email** to the address on the maintainer's Gitea profile.
-   Use a subject like `[SECURITY] restic-manager: <one-line summary>`
-   so it doesn't get lost. PGP optional — if you want to encrypt,
-   ask for a key first.
-
-If you don't get an acknowledgement within **3 working days**,
-please escalate through the other channel — solo maintainers do
-miss things, and the goal here is to fix the problem, not to
-preserve protocol.
-
-### What to include
-
- A description of the issue and the impact (what does an attacker
-  gain? confidentiality, integrity, availability?).
- Affected component (server, agent, install script, docs).
- Affected version (`restic-manager-server --version`).
- Reproduction steps if you have them. A working PoC is welcome
-  but not required — a credible threat model is enough.
- Whether you intend to publish a writeup, and any timing
-  preferences.
-
-### What we'll do
-
-1. Acknowledge receipt within 3 working days.
-2. Confirm or refute the issue, and agree a rough severity (CVSS
-   or just "this is bad / this isn't"). Asking clarifying
-   questions is normal at this stage — please don't read it as
-   foot-dragging.
-3. Develop a fix on a private branch, test it, and prepare a
-   release.
-4. Coordinate disclosure timing with you. The default is **30
-   days from confirmed report to public disclosure**, with a
-   patched release published before the disclosure date. Faster
-   if a workable PoC is already circulating; slower only by
-   mutual agreement.
-5. Credit the reporter in the release notes (or omit the credit
-   if you'd rather stay anonymous — your choice).
-
-## Scope
-
-In scope:
-
- The server binary (`cmd/server`) and any HTTP, WebSocket, or CLI
-  surface it exposes.
- The agent binary (`cmd/agent`) and the way it consumes commands
-  from the server.
- The install scripts (`deploy/install/install.sh`, `install.ps1`)
-  and the systemd unit shipped with them.
- The docker-compose reference deployment and the docker image we
-  publish.
- Any cryptographic primitive choice or implementation detail
-  (AEAD, token hashing, session handling, OIDC handshake).
- Documentation that, if followed, leads operators into an
-  insecure configuration.
-
-Out of scope (not because they aren't real problems, just not ones
-this report channel can act on):
-
- Vulnerabilities in restic itself — report those upstream at
-  <https://github.com/restic/restic>.
- Vulnerabilities in third-party dependencies that haven't yet been
-  patched upstream — report upstream first.
- Issues that require pre-authenticated admin access on the control
-  plane (admins can already do everything; that's not a privilege
-  escalation, that's the design).
- DoS via resource exhaustion on a deployment without the
-  recommended reverse proxy / rate limiting in front (see
-  `docs/reverse-proxy.md`).
- Social-engineering scenarios that don't have a technical hook
-  into the project's own surfaces.
-
-## Threat model summary
-
-For context (longer version in [`spec.md`](./spec.md) §11):
-
- The server is **HTTP-only**; TLS termination, ACME, HSTS, and
-  edge rate-limiting are the reverse proxy's job.
- Credentials are encrypted at rest with an AEAD key loaded from
-  `RM_SECRET_KEY_FILE`. The same key encrypts agent secrets that
-  travel to the agent over the WS channel.
- Agents authenticate with bearer tokens issued at enrolment and
-  hashed at rest. Compromise of the server DB does **not** leak
-  bearer tokens in plaintext, but does leak the hashes (which is
-  enough to log in *as* the agent until the operator revokes —
-  see [NS-01 / NS-02](./tasks.md) for the revoke + regenerate
-  flows).
- The control plane intentionally **never touches backup bytes** —
-  the agent runs `restic` directly against the repo. A
-  compromised control plane can dispatch new jobs but cannot
-  exfiltrate snapshot contents in-band.
- Append-only credentials are first-class. Forget/prune jobs use a
-  separate, admin-marked credential that the server only pushes
-  for the duration of a maintenance dispatch.
-
-## Hardening checklist for operators
-
- Run behind a TLS-terminating reverse proxy (Caddy/nginx/Traefik).
- Set `RM_TRUSTED_PROXY` to the proxy's CIDR so request IPs aren't
-  spoofable.
- Back up `RM_SECRET_KEY_FILE` separately from the database.
-  Without it the encrypted creds are unrecoverable.
- Use append-only credentials for the everyday backup path; only
-  the optional admin credential should have write/forget/prune
-  power.
- Disable users (don't delete) when staff change roles — bearer
-  tokens stay valid until rotated.
- Watch the alert and audit-log views during enrolment of new
-  hosts.
-
-Thanks for helping keep restic-manager users safe.
@@ -1,8 +0,0 @@
-# The ask!
-
-I have numerous servers deployed out in a lab, mainly Linux but some Windows
-All have restic installed on them
-I need to build a browser-based management service that allows me to have a central single-pane-of-glass to monitor and manage all the endpoints
-All endpoints will be enabled for SSH (unless other methods are better?)
-
-Plan out how we would go about this please?
@@ -148,7 +148,6 @@ func run() error {
 		resticBin:                 resticBin,
 		resticVer:                 snap.ResticVersion,
 		resticSupportsNoOwnership: resticSupportsNoOwnership,
-		serverURL:                 cfg.ServerURL,
 		secrets:                   sec,
 		scheduler:                 scheduler.New(),
 	}
@@ -215,7 +214,6 @@ type dispatcher struct {
 	resticBin                 string
 	resticVer                 string // e.g. "0.17.1"; empty if restic isn't installed yet
 	resticSupportsNoOwnership bool   // captured at startup from `restic restore --help`
-	serverURL                 string // base URL of the server (used by the self-update fetch)
 	secrets                   *secrets.Store
 	scheduler                 *scheduler.Scheduler

@@ -397,12 +395,10 @@ func (d *dispatcher) handle(ctx context.Context, env api.Envelope, tx wsclient.S
 				"up_kbps", up, "down_kbps", down)
 		}

-	case api.MsgCommandUpdate:
-		var p api.CommandUpdatePayload
-		if err := env.UnmarshalPayload(&p); err != nil {
-			return fmt.Errorf("command.update: %w", err)
-		}
-		go d.runUpdate(ctx, p, tx)
+	case api.MsgAgentUpdateAvail:
+		var p api.AgentUpdateAvailablePayload
+		_ = env.UnmarshalPayload(&p)
+		slog.Info("ws agent: update available", "version", p.LatestVersion, "url", p.PackageURL)

 	default:
 		slog.Debug("ws agent: ignored message", "type", env.Type)
@@ -1,65 +0,0 @@
-package main
-
-import (
-	"context"
-	"fmt"
-	"log/slog"
-	"time"
-
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/agent/updater"
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/agent/wsclient"
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
-)
-
-// runUpdate handles a server-dispatched command.update. It logs progress
-// via log.stream so the live job page captures pre-restart state, then
-// calls the platform updater. On Linux the updater calls os.Exit; on
-// Windows it spawns a detached helper and returns, with the agent then
-// exiting.
-//
-// The terminal job state is set by the server, not the agent: success
-// is "agent re-hellos with matching version" rather than anything the
-// agent itself can assert. The only `job.finished` we send from here is
-// on the failure path, before any restart attempt.
-func (d *dispatcher) runUpdate(ctx context.Context, p api.CommandUpdatePayload, tx wsclient.Sender) {
-	logf := func(format string, args ...any) {
-		line := fmt.Sprintf(format, args...)
-		slog.Info("ws agent: update: " + line)
-		env, err := api.Marshal(api.MsgLogStream, "", api.LogStreamLine{
-			JobID:   p.JobID,
-			TS:      time.Now().UTC(),
-			Stream:  api.LogStdout,
-			Payload: line,
-		})
-		if err == nil {
-			_ = tx.Send(env)
-		}
-	}
-
-	startedEnv, err := api.Marshal(api.MsgJobStarted, "", api.JobStartedPayload{
-		JobID:     p.JobID,
-		Kind:      api.JobUpdate,
-		StartedAt: time.Now().UTC(),
-	})
-	if err == nil {
-		_ = tx.Send(startedEnv)
-	}
-
-	logf("fetching new binary from %s", d.serverURL)
-	if err := updater.Update(ctx, d.serverURL); err != nil {
-		logf("update failed: %v", err)
-		finishedEnv, mErr := api.Marshal(api.MsgJobFinished, "", api.JobFinishedPayload{
-			JobID:      p.JobID,
-			Status:     api.JobFailed,
-			FinishedAt: time.Now().UTC(),
-			Error:      err.Error(),
-		})
-		if mErr == nil {
-			_ = tx.Send(finishedEnv)
-		}
-		return
-	}
-	// Unreachable on Linux (Update calls os.Exit). On Windows control
-	// returns here while the detached helper does the swap-and-restart;
-	// the agent then exits cleanly so SCM hands off.
-}
@@ -17,10 +17,8 @@ import (
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/crypto"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/notification"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/config"
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/fleetupdate"
 	rmhttp "gitea.dcglab.co.uk/steve/restic-manager/internal/server/http"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/maintenance"
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/metrics"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/oidc"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ui"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ws"
@@ -90,11 +88,9 @@ func run() error {

 	hub := ws.NewHub()
 	jobHub := ws.NewJobHub()
-	metricsRegistry := metrics.NewRegistry()

 	notifHub := notification.NewHub(st, aead, cfg.BaseURL)
 	alertEngine := alert.NewEngine(st, notifHub)
-	updateWatcher := ws.NewUpdateWatcher(st, alertEngine, jobHub)

 	renderer, err := ui.New()
 	if err != nil {
@@ -120,11 +116,9 @@ func run() error {
 		JobHub:          jobHub,
 		AlertEngine:     alertEngine,
 		NotificationHub: notifHub,
-		UpdateWatcher:   updateWatcher,
 		UI:              renderer,
 		Version:         version,
 		OIDC:            oidcClient,
-		Metrics:         metricsRegistry,
 	}

 	// First-run bootstrap: if the users table is empty, mint a one-time
@@ -153,17 +147,10 @@ func run() error {

 	srv := rmhttp.New(deps)

-	// Fleet-update worker — built after the HTTP server because the
-	// dispatcher delegates back into srv.DispatchHostUpdate.
-	fleetWorker := fleetupdate.NewWorker(st, hub,
-		&serverDispatcher{srv: srv}, alertEngine)
-	srv.SetFleetWorker(fleetWorker)
-
 	ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
 	defer stop()

 	go alertEngine.Run(ctx)
-	go updateWatcher.Run(ctx)

 	errCh := make(chan error, 1)
 	go func() {
@@ -256,12 +243,3 @@ func run() error {
 	}
 	return nil
 }
-
-// serverDispatcher adapts the http.Server's DispatchHostUpdate method
-// to the fleetupdate.Dispatcher interface. Lives in main so the
-// http and fleetupdate packages don't need to know about each other.
-type serverDispatcher struct{ srv *rmhttp.Server }
-
-func (d *serverDispatcher) DispatchUpdate(ctx context.Context, hostID, actorUserID string) (string, string, error) {
-	return d.srv.DispatchHostUpdate(ctx, hostID, actorUserID)
-}
@@ -1,325 +0,0 @@
-{
-  "annotations": {
-    "list": [
-      {
-        "builtIn": 1,
-        "datasource": { "type": "grafana", "uid": "-- Grafana --" },
-        "enable": true,
-        "hide": true,
-        "iconColor": "rgba(0, 211, 255, 1)",
-        "name": "Annotations & Alerts",
-        "type": "dashboard"
-      }
-    ]
-  },
-  "description": "restic-manager fleet overview. Imports against any Prometheus data source.",
-  "editable": true,
-  "fiscalYearStartMonth": 0,
-  "graphTooltip": 0,
-  "id": null,
-  "links": [],
-  "liveNow": false,
-  "panels": [
-    {
-      "id": 1,
-      "title": "Fleet status",
-      "type": "stat",
-      "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
-      "gridPos": { "h": 6, "w": 6, "x": 0, "y": 0 },
-      "fieldConfig": {
-        "defaults": {
-          "color": { "mode": "thresholds" },
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              { "color": "red", "value": null },
-              { "color": "green", "value": 1 }
-            ]
-          },
-          "unit": "short"
-        },
-        "overrides": []
-      },
-      "options": {
-        "colorMode": "value",
-        "graphMode": "area",
-        "justifyMode": "auto",
-        "orientation": "auto",
-        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
-        "textMode": "auto"
-      },
-      "targets": [
-        {
-          "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
-          "expr": "rm_hosts_online",
-          "legendFormat": "online",
-          "refId": "A"
-        },
-        {
-          "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
-          "expr": "rm_hosts_total",
-          "legendFormat": "total",
-          "refId": "B"
-        }
-      ]
-    },
-    {
-      "id": 2,
-      "title": "Open alerts",
-      "type": "stat",
-      "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
-      "gridPos": { "h": 6, "w": 6, "x": 6, "y": 0 },
-      "fieldConfig": {
-        "defaults": {
-          "color": { "mode": "thresholds" },
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              { "color": "green", "value": null },
-              { "color": "yellow", "value": 1 },
-              { "color": "red", "value": 5 }
-            ]
-          },
-          "unit": "short"
-        },
-        "overrides": []
-      },
-      "options": {
-        "colorMode": "value",
-        "graphMode": "none",
-        "orientation": "horizontal",
-        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
-        "textMode": "auto"
-      },
-      "targets": [
-        {
-          "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
-          "expr": "sum by (severity) (rm_active_alerts)",
-          "legendFormat": "{{severity}}",
-          "refId": "A"
-        }
-      ]
-    },
-    {
-      "id": 3,
-      "title": "Backups failing (last reported run)",
-      "type": "stat",
-      "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
-      "gridPos": { "h": 6, "w": 6, "x": 12, "y": 0 },
-      "fieldConfig": {
-        "defaults": {
-          "color": { "mode": "thresholds" },
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              { "color": "green", "value": null },
-              { "color": "red", "value": 1 }
-            ]
-          },
-          "unit": "short"
-        },
-        "overrides": []
-      },
-      "options": {
-        "colorMode": "value",
-        "graphMode": "area",
-        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
-        "textMode": "auto"
-      },
-      "targets": [
-        {
-          "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
-          "expr": "count(rm_host_last_backup_success == 0)",
-          "legendFormat": "failing",
-          "refId": "A"
-        }
-      ]
-    },
-    {
-      "id": 4,
-      "title": "Hosts",
-      "type": "table",
-      "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
-      "gridPos": { "h": 10, "w": 24, "x": 0, "y": 6 },
-      "fieldConfig": {
-        "defaults": {
-          "custom": { "align": "auto", "displayMode": "auto" }
-        },
-        "overrides": [
-          {
-            "matcher": { "id": "byName", "options": "Value #B" },
-            "properties": [
-              { "id": "displayName", "value": "Last backup (s ago)" },
-              { "id": "unit", "value": "s" }
-            ]
-          },
-          {
-            "matcher": { "id": "byName", "options": "Value #C" },
-            "properties": [
-              { "id": "displayName", "value": "Repo size" },
-              { "id": "unit", "value": "bytes" }
-            ]
-          },
-          {
-            "matcher": { "id": "byName", "options": "Value #D" },
-            "properties": [
-              { "id": "displayName", "value": "Snapshots" }
-            ]
-          },
-          {
-            "matcher": { "id": "byName", "options": "Value #A" },
-            "properties": [
-              { "id": "displayName", "value": "Online" }
-            ]
-          },
-          {
-            "matcher": { "id": "byName", "options": "Value #E" },
-            "properties": [
-              { "id": "displayName", "value": "Open alerts" }
-            ]
-          }
-        ]
-      },
-      "options": { "showHeader": true },
-      "transformations": [
-        {
-          "id": "merge",
-          "options": {}
-        }
-      ],
-      "targets": [
-        {
-          "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
-          "expr": "rm_host_agent_online",
-          "format": "table",
-          "instant": true,
-          "refId": "A"
-        },
-        {
-          "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
-          "expr": "time() - rm_host_last_backup_timestamp_seconds",
-          "format": "table",
-          "instant": true,
-          "refId": "B"
-        },
-        {
-          "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
-          "expr": "rm_host_repo_size_bytes",
-          "format": "table",
-          "instant": true,
-          "refId": "C"
-        },
-        {
-          "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
-          "expr": "rm_host_snapshot_count",
-          "format": "table",
-          "instant": true,
-          "refId": "D"
-        },
-        {
-          "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
-          "expr": "rm_host_open_alerts",
-          "format": "table",
-          "instant": true,
-          "refId": "E"
-        }
-      ]
-    },
-    {
-      "id": 5,
-      "title": "Repo size over time",
-      "type": "timeseries",
-      "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
-      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 },
-      "fieldConfig": {
-        "defaults": {
-          "color": { "mode": "palette-classic" },
-          "custom": {
-            "axisLabel": "",
-            "drawStyle": "line",
-            "fillOpacity": 10,
-            "lineWidth": 1,
-            "pointSize": 5,
-            "showPoints": "never"
-          },
-          "unit": "bytes"
-        },
-        "overrides": []
-      },
-      "options": {
-        "legend": { "calcs": ["last"], "displayMode": "list", "placement": "bottom", "showLegend": true },
-        "tooltip": { "mode": "multi", "sort": "desc" }
-      },
-      "targets": [
-        {
-          "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
-          "expr": "rm_host_repo_size_bytes",
-          "legendFormat": "{{host}}",
-          "refId": "A"
-        }
-      ]
-    },
-    {
-      "id": 6,
-      "title": "Job duration p95 (last 1h, by kind)",
-      "type": "timeseries",
-      "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
-      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 },
-      "fieldConfig": {
-        "defaults": {
-          "color": { "mode": "palette-classic" },
-          "custom": {
-            "drawStyle": "line",
-            "fillOpacity": 5,
-            "lineWidth": 1,
-            "pointSize": 4,
-            "showPoints": "never"
-          },
-          "unit": "s"
-        },
-        "overrides": []
-      },
-      "options": {
-        "legend": { "calcs": ["last"], "displayMode": "list", "placement": "bottom", "showLegend": true },
-        "tooltip": { "mode": "multi", "sort": "desc" }
-      },
-      "targets": [
-        {
-          "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
-          "expr": "histogram_quantile(0.95, sum by (kind, le) (rate(rm_job_duration_seconds_bucket[1h])))",
-          "legendFormat": "{{kind}}",
-          "refId": "A"
-        }
-      ]
-    }
-  ],
-  "refresh": "30s",
-  "schemaVersion": 39,
-  "style": "dark",
-  "tags": ["restic-manager", "backups"],
-  "templating": {
-    "list": [
-      {
-        "current": {},
-        "hide": 0,
-        "includeAll": false,
-        "label": "Prometheus",
-        "multi": false,
-        "name": "DS_PROMETHEUS",
-        "options": [],
-        "query": "prometheus",
-        "refresh": 1,
-        "regex": "",
-        "skipUrlSync": false,
-        "type": "datasource"
-      }
-    ]
-  },
-  "time": { "from": "now-6h", "to": "now" },
-  "timepicker": {},
-  "timezone": "",
-  "title": "restic-manager — fleet",
-  "uid": "rm-fleet-overview",
-  "version": 1,
-  "weekStart": ""
-}
@@ -52,12 +52,7 @@ ProtectSystem=full
 # whenever a new SecretsKey is minted, so we need a targeted
 # write-exemption for that dir. No exemption for the rest of /etc:
 # the agent has no business editing /etc/passwd, /etc/sudoers, etc.
-#
-# /usr/local/bin is writable so the self-update flow (P6-01) can
-# atomic-rename a fresh binary over the running one. Permitting the
-# whole directory (rather than just the binary path) is required
-# because os.Rename needs write access to the parent directory.
-ReadWritePaths=/etc/restic-manager /usr/local/bin
+ReadWritePaths=/etc/restic-manager
 ProtectHostname=true
 ProtectKernelTunables=true
 ProtectKernelModules=true
@@ -1,19 +0,0 @@
-[book]
-title = "restic-manager"
-description = "Self-hosted control plane for restic backups across a fleet of Linux and Windows endpoints."
-authors = ["Steve Cliff"]
-language = "en-GB"
-multilingual = false
-src = "src"
-
-[output.html]
-default-theme = "ayu"
-preferred-dark-theme = "ayu"
-git-repository-url = "https://gitea.dcglab.co.uk/steve/restic-manager"
-git-repository-icon = "fa-code-fork"
-edit-url-template = "https://gitea.dcglab.co.uk/steve/restic-manager/_edit/main/docs/book/{path}"
-no-section-label = false
-
-[output.html.fold]
-enable = true
-level = 2
@@ -1,40 +0,0 @@
-# Summary
-
-[Introduction](./intro.md)
-
-# Getting started
-
- [Installing the server](./getting-started/install.md)
- [Enrolling your first host](./getting-started/enrolling-hosts.md)
- [Running behind a reverse proxy](./getting-started/reverse-proxy.md)
-
-# Concepts
-
- [Architecture](./concepts/architecture.md)
- [Credentials and how they flow](./concepts/credentials.md)
- [Schedules and source groups](./concepts/schedules-and-source-groups.md)
- [Repo maintenance](./concepts/repo-maintenance.md)
-
-# Operations
-
- [Backups and restores](./operations/backups-and-restores.md)
- [Alerts and notifications](./operations/alerts.md)
- [Observability with Prometheus](./operations/observability.md)
- [Updating agents](./operations/updates.md)
-
-# Security
-
- [Threat model](./security/threat-model.md)
- [Hardening checklist](./security/hardening.md)
- [Reporting vulnerabilities](./security/disclosure.md)
-
-# Reference
-
- [Environment variables](./reference/env-vars.md)
- [HTTP endpoints](./reference/http-endpoints.md)
-
---
-
-[Contributing](./contributing.md)
-[Roadmap](./roadmap.md)
-[License](./license.md)
@@ -1,121 +0,0 @@
-# Architecture
-
-## Components
-
-```
-┌────────────────────────────────────────────────────────────┐
-│  Server (control plane, single process)                    │
-│   * chi-based HTTP API + HTMX server-rendered UI           │
-│   * WebSocket hub for agent fan-out + browser fan-out      │
-│   * SQLite store (modernc.org/sqlite, pure Go)             │
-│   * AEAD encryption helpers                                │
-│   * Alert engine + notification hub                        │
-└────────────┬───────────────────────────────────┬───────────┘
-             │ outbound WS only                   │ HTTP(S)
-             │                                    │
-┌────────────▼─────────────┐         ┌────────────▼─────────────┐
-│  Agent (per host)        │         │  Browser (operator)      │
-│   * coder/websocket      │         │   * htmx + a tiny bit    │
-│   * cron for schedules   │         │     of vanilla JS for    │
-│   * restic wrapper       │         │     live job updates     │
-│   * sysinfo collector    │         └──────────────────────────┘
-└────────────┬─────────────┘
-             │ subprocess: restic ...
-             │
-┌────────────▼─────────────────────────────────────────────────┐
-│  restic repository (rest-server, S3, B2, SFTP, local …)      │
-│  Backup data flows directly here. Server never touches it.   │
-└──────────────────────────────────────────────────────────────┘
-```
-
-## Why outbound-only WebSockets?
-
-The agent dials the server on `/ws/agent` with a bearer token. The
-server doesn't initiate connections to the agent. Three reasons:
-
-1. **Firewall friendliness.** Nothing on the endpoint needs an
-   inbound port; this works behind the typical "branch office NAT"
-   without router config.
-2. **Single auth point.** The bearer token is the only credential
-   that crosses the boundary; the agent never accepts an
-   incoming socket.
-3. **Reconnect semantics are simpler.** When the connection drops
-   (NAT timeout, server restart, transient network glitch) the
-   agent backs off and re-dials; the server marks the host
-   offline after 90s and lets the alert engine raise a stale-host
-   alert.
-
-## Why SQLite?
-
-SQLite fits because high availability is an explicit non-goal. A small
-control plane managing twelve endpoints does not need replication
-or a separate database tier. SQLite gives us:
-
- A single file to back up (plus the secret key).
- Hand-rolled migrations under `internal/store/migrations/` —
-  no migration framework lock-in.
- `WAL` mode plus per-connection foreign-key enforcement.
-
-The migration files define the entire schema; there's no ORM or
-query-builder layer between Go code and SQL.
-
-## Why the agent runs `restic` itself, not via the server
-
-The control plane never holds backup bytes in flight. That's
-deliberate:
-
- A compromised control plane cannot exfiltrate snapshot
-  contents in-band — at worst it can dispatch new backup or
-  forget jobs (audit-logged) but the data path is between the
-  agent and the repository.
- The same agent process can target whichever transport restic
-  natively supports (rest-server, S3, B2, SFTP, local), no
-  separate mux on the server side.
-
-## Job lifecycle
-
-```
-            ┌──────────────────────┐
-operator →  │ POST /hosts/{id}/    │
-            │       run-backup     │
-            └──────────┬───────────┘
-                       │   1. INSERT INTO jobs (status='queued')
-                       │   2. dispatch command.run over WS
-                       ▼
-            ┌──────────────────────┐
-            │ Agent dispatches     │
-            │ restic subprocess    │
-            └──────────┬───────────┘
-                       │
-                       │   3. job.started   ───▶ store.MarkJobStarted
-                       │   4. job.progress  ───▶ JobHub broadcast (live UI)
-                       │   5. log.stream    ───▶ append to job_logs
-                       │   6. job.finished  ───▶ store.MarkJobFinished
-                       │                          + alert engine eval
-                       │                          + (P6) metrics histogram
-                       ▼
-                  terminal: succeeded | failed | cancelled
-```
-
-Operators see live updates because the browser subscribes to
-`/api/jobs/{id}/stream`, and the WS handler broadcasts each
-agent-emitted envelope to all live subscribers in addition to
-persisting it.
-
-## What scheduling looks like
-
- The agent runs a local `robfig/cron/v3` instance.
- The server pushes the desired schedule set to the agent on
-  hello + after every CRUD change.
- When the agent's cron fires, it sends `schedule.fire` to the
-  server. The server creates a job row, sends `command.run` back,
-  and the agent dispatches a normal backup.
- If the WS drops between fire and run, the server queues the
-  schedule firing into `pending_runs` and drains on agent
-  reconnect — no missed scheduled backups due to network blips.
-
-For everything that isn't a backup (forget, prune, check), the
-server runs a 60-second maintenance ticker against
-`host_repo_maintenance` rows and dispatches the relevant command
-when a cadence is due. The agent's local cron only handles
-backups.
@@ -1,98 +0,0 @@
-# Credentials and how they flow
-
-restic-manager handles three credential surfaces:
-
-1. **Operator credentials** — the username + password (or OIDC
-   identity) that logs into the UI.
-2. **Agent bearer tokens** — issued at enrolment, used by the
-   agent to authenticate its WebSocket to the server.
-3. **Repo credentials** — the rest-server / S3 / B2 / SFTP
-   credentials the agent passes to `restic` itself.
-
-Each has a different threat model and storage strategy.
-
-## Operator credentials
-
- Local users are stored in `users` with a bcrypt password hash.
- Sessions are random tokens minted at login, stored hashed in
-  the `sessions` table, expired after 24h. Cookie is HttpOnly,
-  SameSite=Lax, and Secure (when `RM_COOKIE_SECURE=true`,
-  default).
- OIDC users carry `auth_source='oidc'` and an `oidc_subject`
-  pinning their IdP identity. Local password login is rejected
-  for OIDC users.
- Disabling a user soft-deletes them via `disabled_at` —
-  pre-existing sessions are invalidated on the next request.
-
-## Agent bearer tokens
-
- Minted at enrolment, hashed at rest with `auth.HashToken`.
- The plaintext token only exists in memory at enrolment time
-  and on the agent's filesystem (`/etc/restic-manager/agent.yaml`,
-  mode `0600`, owned by the service user).
- Compromise of the server DB leaks the hashes, which is enough
-  to *log in as that agent* until you revoke. Compromise of the
-  agent host leaks the plaintext (via the config file) — same
-  end result.
- Rotation: re-enrol the host. Today there's no in-place rotate;
-  the operator deletes the host (which cascades, including
-  revoking the bearer hash) and re-runs the install command.
-
-## Repo credentials
-
-This is the credential that ultimately matters for backup
-integrity. restic-manager keeps two slots per host:
-
- **The everyday credential** (`host_credentials.kind = ''`).
-  Append-only-friendly: this is the one your backup schedule
-  uses. It can write but not delete or forget.
- **The admin credential** (`host_credentials.kind = 'admin'`).
-  Has full delete rights. Only pushed to the agent transiently
-  while a `prune` or `forget` job is dispatching, and discarded
-  by the agent after the job ends.
-
-### Encryption flow
-
-1. Operator types the credential into the UI or the install form.
-2. Server AEAD-encrypts the cred (`crypto.AEAD.Encrypt`) using the
-   key in `RM_SECRET_KEY_FILE`. The plaintext is dropped from
-   memory.
-3. Encrypted blob is stored in `host_credentials.cred_blob`.
-4. When the agent connects, the server decrypts the blob and
-   sends the **plaintext** down the WebSocket inside a
-   `config.update` envelope.
-5. The agent stores the plaintext in its in-memory secrets store
-   for the lifetime of the process; it's reloaded fresh on every
-   server-side push.
-6. When a job runs, the agent merges the credential into the
-   restic environment (`restic.Env.RepoURL` stays bare; the
-   `user:pass@…` form is built only inside `envSlice()` at the
-   moment of `exec.Command`).
-
-The merged form is **never logged**. The slog package's structured
-output gets `restic.RedactURL()` for any URL it has cause to
-mention.
-
-### Why push plaintext over the wire?
-
-The transport itself is the trust boundary: the WebSocket runs
-inside the same TLS-terminated reverse-proxy connection your
-browser uses, and the agent has already authenticated with its
-bearer token. Re-encrypting the payload on top of that would just
-move the key-management problem somewhere else.
-
-If your reverse proxy isn't TLS-terminated, the deployment is
-already broken — see [Hardening](../security/hardening.md).
-
-## Setup tokens (admin-driven)
-
-When an admin creates a new user, the server mints a one-time
-setup link valid for 1 hour. The hash is stored; the raw token
-is shown to the admin once. The user opens the link, sets a
-password, and is dropped into a session. Expired tokens are
-swept on the alert engine's 60s tick.
-
-Same pattern for enrolment tokens: the raw token only exists in
-memory at mint time, and the install snippet is the operator's
-only chance to capture it. If you lose it, regenerate via the
-**Add host** page (NS-02).
@@ -1,85 +0,0 @@
-# Repo maintenance
-
-Backups go in; without maintenance, repos grow forever and
-eventually fall over. restic-manager runs three maintenance
-operations on a per-host cadence:
-
-| Command  | What it does                                                | Default cadence |
-|----------|-------------------------------------------------------------|-----------------|
-| `forget` | Marks snapshots eligible for removal per the retention policy attached to each source group. Cheap; runs append-only. | Daily after the last backup of the day |
-| `prune`  | Reclaims space from the repo. Requires the **admin** credential (write+delete). | Weekly, off-peak |
-| `check`  | Verifies repo integrity. Sub-options surface lock state. | Weekly, with `--read-data-subset N%` to sample pack files |
-
-A new field on each host row, `host_repo_maintenance`, holds the
-cron expressions and last-fire anchors. The maintenance ticker on
-the server runs every 60s, finds hosts whose next-fire is due,
-and dispatches the right command. The agent's local cron is
-**only** for backups.
-
-## Why server-side and not agent-side?
-
-The agent's cron knows about backups because backups are
-per-source-group. Maintenance is per-repo, not per-source-group,
-so doing it server-side keeps the per-host wiring simple:
-
- One ticker, not N agent crons to keep in sync.
- Cancelling a maintenance dispatch is just "don't dispatch the
-  next one" — no agent-side state to clean up.
- Skipping offline hosts is trivial (no queue; only scheduled
-  *backups* queue into `pending_runs`).
-
-## Forget and the multi-group payload
-
-A single `forget` job can target several source groups at once.
-The wire envelope (`ForgetGroups`) carries one entry per group,
-each with its retention policy. The agent runs N
-`restic forget --tag <name> --keep-...` invocations in sequence,
-streams their output, and reports a single terminal status.
-
-## Prune and the admin credential
-
-Prune mutates the repo. The everyday append-only credential
-**cannot** prune — that's the whole point of append-only.
-restic-manager keeps a second slot per host (`kind = 'admin'`)
-for the credential that can.
-
-When a prune is dispatched (cadence-driven or operator-driven):
-
-1. Server pushes the admin credential to the agent in a fresh
-   `config.update`.
-2. Agent runs `restic prune` with the merged credential.
-3. Job finishes; agent discards the admin credential from its
-   in-memory secrets store.
-
-The server never logs the merged URL (see
-[Credentials](./credentials.md)).
-
-## Check and lock state
-
-`restic check` warns about stale locks when it finds them. The
-agent ships every check's output back as a `repo.stats` envelope
-and a stream of log lines; if a stale lock is detected, the
-**Repo** page surfaces a banner with an **Unlock** button. The
-operator-only `unlock` command runs `restic unlock` and clears
-the banner.
-
-`unlock` has no cadence — it's a manual action, never automatic.
-Auto-unlocking would mask the cause (probably a previously
-crashed long-running operation) and risk corrupting an
-operation the operator has merely lost track of.
-
-## Repo stats
-
-After every backup, check, prune, and unlock, the agent runs
-`restic stats --json --mode raw-data` and ships the result as a
-`repo.stats` envelope. The server stores this in
-`host_repo_stats` (latest only) and `host_repo_stats_history`
-(one row per host per day, last-write-wins per column — a
-prune-only patch never nulls a backup-time size).
-
-The host detail page surfaces:
-
- Total size + raw size in the vitals strip.
- Last-check timestamp + colour-coded status.
- Last-prune timestamp.
- 30/90-day repo size trend chart.
@@ -1,105 +0,0 @@
-# Schedules and source groups
-
-Two related but separable ideas:
-
- A **source group** is a named bundle of "what to back up":
-  include paths, exclude patterns, retention policy, retry
-  configuration, optional pre/post hooks. The group's name is
-  used as the restic snapshot tag, so retention can target it
-  with `restic forget --tag <name>`.
- A **schedule** is a cron expression that, when it fires,
-  triggers a backup of one or more source groups on a host.
-
-Decoupling them means you can have one schedule covering several
-groups (e.g. `0 1 * * *` running both `system` and `data`), and
-each group has its own retention without duplicating policy
-across schedules.
-
-## Source group anatomy
-
-```yaml
-name: data
-includes:
-  - /var/lib/postgresql
-  - /home
-excludes:
-  - /home/*/.cache
-  - /home/*/Downloads
-retention:
-  keep_last: 7
-  keep_daily: 14
-  keep_weekly: 4
-  keep_monthly: 6
-retry_max: 3
-retry_backoff_seconds: 600
-pre_hook: |
-  pg_dump -U postgres -F c -f /var/lib/postgresql/dumps/all.dump
-post_hook: |
-  rm -f /var/lib/postgresql/dumps/all.dump
-```
-
-### Conflict detection
-
-If your retention policy says `keep_hourly: 24` but no schedule
-points at this group sub-daily, the UI surfaces a
-**conflict-dimension banner** ("`hourly` won't be honoured —
-no schedule fires more often than once a day"). The flag is
-stored on the source group (`conflict_dimension`) and refreshed
-whenever a schedule or group changes.
-
-### Hooks
-
-`pre_hook` and `post_hook` run on the agent host inside
-`/bin/sh -c` (`cmd.exe /C` on Windows). Output is streamed back
-to the live job log as `hook(<phase>): …` lines.
-
- A non-zero `pre_hook` exit aborts the backup.
- `post_hook` always runs, with `RM_JOB_STATUS=succeeded|failed`
-  in the environment. Use this for cleanup that must happen
-  whether the backup worked or not.
- Hooks only run for `kind=backup` jobs. They do not run for
-  `forget`, `prune`, `check`, etc.
- AEAD-encrypted at rest at the HTTP layer; the agent receives
-  plaintext over the WS channel.
-
-A "host default" pair of hooks lives on the host itself; a
-source group's own hooks override them when set.
-
-## Schedule anatomy
-
-```yaml
-cron: "0 2 * * *"
-enabled: true
-source_group_ids:
-  - <gid for "data">
-  - <gid for "system">
-```
-
-Slim by design: a schedule says **when** and **which groups**.
-Everything else (paths, retention, hooks) lives on the groups.
-
-The agent's local cron fires the schedule. If the WebSocket is
-down at fire time, the server queues the firing into
-`pending_runs` and drains it on the next agent reconnect — a
-short network blip won't lose the backup.
-
-### Last / next run
-
-The schedules tab shows "next" (computed by parsing the cron
-expression with `robfig/cron/v3`) and "last" (the latest
-`actor_kind=schedule` job in the `jobs` table) for every
-schedule. The dashboard host row also surfaces `next 12h ago/from
-now` when a single covering schedule is the run-now candidate.
-
-## Bandwidth limits
-
-Two places set restic's `--limit-upload` / `--limit-download`:
-
-1. **Host-wide caps** on the host row (`bandwidth_up_kbps`,
-   `bandwidth_down_kbps`). Pushed to the agent on hello and
-   after `PUT /api/hosts/{id}/bandwidth`. Apply to every restic
-   invocation on the host.
-2. **Per-job overrides** on the per-source-group Run-now form.
-   Win over host caps for the lifetime of that one job.
-
-If neither is set, restic runs unthrottled.
@@ -1,17 +0,0 @@
-# Contributing
-
-Full contributor guide:
-[`CONTRIBUTING.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/CONTRIBUTING.md)
-in the repository root.
-
-The short version:
-
- Open an issue first for non-trivial changes; the design is
-  still moving and unsolicited large PRs may conflict with
-  in-flight work.
- `make lint test` must pass.
- One logical change per commit, no `Co-Authored-By` trailers.
- UK English in identifiers and comments; comments explain the
-  **why** not the **what**.
-
-Code of conduct: [`CODE_OF_CONDUCT.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/CODE_OF_CONDUCT.md).
@@ -1,113 +0,0 @@
-# Enrolling your first host
-
-The control plane only knows about hosts you've explicitly
-enrolled. Two paths exist:
-
-1. **Token-based enrolment** — admin generates a token, pastes it
-   into an install command on the host. The host appears immediately,
-   already mapped to the desired repo.
-2. **Announce-and-approve** — the agent runs without a token,
-   "announces" itself to the server, and a human in the UI accepts
-   the announcement.
-
-Token-based is the default and what most operators want; the
-announce flow exists for the case where you can't easily paste a
-secret onto the host (auto-imaged endpoints, scripted bring-ups
-from a config repo).
-
-## Token-based enrolment
-
-### From the UI
-
-1. Click **+ Add host** on the dashboard.
-2. Fill in the hostname, the restic repo URL, and the repo
-   credentials. The credentials are AEAD-encrypted at the server
-   immediately; what you paste is what the agent receives.
-3. Optionally pick the initial source paths — these become the
-   first source group on the host.
-4. Submit. The server mints a one-time token and shows you a
-   copy-pasteable install snippet.
-
-### On the host (Linux)
-
-```sh
-curl -fsSL https://restic.example.com/install/install.sh | \
-    sudo RM_SERVER=https://restic.example.com \
-         RM_ENROL_TOKEN=<token> \
-         bash
-```
-
-The script:
-
-1. Detects architecture (`amd64` or `arm64`).
-2. Downloads the agent binary from `/agent/binary?os=…&arch=…`.
-3. Drops the systemd unit at
-   `/etc/systemd/system/restic-manager-agent.service`.
-4. Runs the agent in `-enrol` mode, which posts the token and
-   stores the persistent bearer it gets back.
-5. Enables and starts the unit.
-
-Within seconds the host should appear on the dashboard as
-**online**.
-
-### On the host (Windows)
-
-```pwsh
-$env:RM_SERVER  = "https://restic.example.com"
-$env:RM_ENROL_TOKEN = "<token>"
-iwr -useb $env:RM_SERVER/install/install.ps1 | iex
-```
-
-Equivalent shape: registers a Windows service via the SCM
-(see P2-16 for details), runs `-enrol`, starts the service.
-
-## Recovering a lost token
-
-Tokens are single-use and short-lived (1h). If you closed the tab
-before pasting the install command, head to the **Add host** page —
-outstanding tokens are listed there with a **Regenerate** button.
-Regenerating revokes the old token's hash and mints a fresh raw
-token while preserving the original repo credentials and initial
-paths. (NS-02 in `tasks.md` if you want the design rationale.)
-
-## Announce-and-approve
-
-If the host can reach the server but you don't want to paste a
-secret on it, run the agent in `-announce` mode:
-
-```sh
-restic-manager-agent -announce \
-                     -server https://restic.example.com \
-                     -hostname myhost
-```
-
-The host appears in the **Pending hosts** panel on the dashboard
-with its hostname, OS, arch, and the source IP that announced it.
-Click **Accept**, fill in the repo URL + credentials, and the
-server pushes the bearer over the still-open WebSocket. No
-back-and-forth round trip.
-
-If you don't accept within an hour the announcement is swept.
-
-## What happens on the agent
-
-After enrolment, the agent:
-
-1. Connects via WebSocket to `/ws/agent` with its bearer token.
-2. Sends a `hello` envelope with its OS, arch, agent version,
-   restic version, and protocol version.
-3. Receives a `config.update` carrying its decrypted repo
-   credentials and any source-group paths.
-4. Sits idle, sending a heartbeat every 30s. Operator-driven
-   "Run now" actions arrive as `command.run` envelopes; scheduled
-   jobs are driven by the agent's local cron.
-
-## Auto-init of the repository
-
-The first time a backup runs, the agent invokes `restic init`
-against the repo you configured at enrolment. If the repo already
-exists (`config file already exists`) the agent treats it as a
-success and proceeds. The host's repo status (`unknown` →
-`ready` / `init_failed`) is surfaced under the vitals strip on
-the host detail page; if init fails, save fresh credentials in
-the **Repo** tab to retry.
@@ -1,92 +0,0 @@
-# Installing the server
-
-The reference deployment is a single Docker container fronted by
-your existing reverse proxy. The image bundles the server binary,
-the cross-compiled agent binaries, and the install scripts.
-
-## Prerequisites
-
- A Linux host with Docker and Docker Compose.
- A reverse proxy in front (Caddy, nginx, Traefik) terminating
-  TLS on a public hostname. The server itself is HTTP-only by
-  design — see [Reverse proxy](./reverse-proxy.md) for why.
- A persistent volume for the server's data directory.
-
-## Quick start
-
-The reference compose file lives at
-[`deploy/docker-compose.yml`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/deploy/docker-compose.yml):
-
-```yaml
-services:
-  restic-manager:
-    image: gitea.dcglab.co.uk/steve/restic-manager:${RM_VERSION:-latest}
-    restart: unless-stopped
-    environment:
-      RM_LISTEN: ":8080"
-      RM_DATA_DIR: "/data"
-      RM_BASE_URL: "https://restic.example.com"
-      # Trust your reverse proxy's CIDR so X-Forwarded-* are honoured.
-      RM_TRUSTED_PROXY: "10.0.0.0/8"
-    volumes:
-      - rm-data:/data
-    ports:
-      # Bind localhost only — your reverse proxy is the public face.
-      - "127.0.0.1:8080:8080"
-
-volumes:
-  rm-data:
-```
-
-Bring it up:
-
-```sh
-docker compose up -d
-docker compose logs -f restic-manager
-```
-
-The first run prints a one-time **bootstrap token** to the log. Use
-it within an hour or it expires; if you miss the window the
-container prints it again on the next start as long as no admin user
-exists.
-
-## First-run admin setup
-
-Open `https://restic.example.com/bootstrap` (or whatever your
-public URL is). Paste the bootstrap token, pick a username and a
-password (≥ 12 characters), and submit. You'll land in the
-dashboard logged in as the new admin.
-
-If you'd rather curl it, the equivalent is:
-
-```sh
-curl -X POST https://restic.example.com/api/bootstrap \
-     -H 'Content-Type: application/json' \
-     -d '{"token":"<token-from-log>","username":"admin","password":"<≥12 chars>"}'
-```
-
-## Backing up the secret key
-
-Inside the data volume, `secret.key` holds the AEAD key used to
-encrypt every credential at rest. **Back it up separately from
-the database.** Without it, encrypted credentials in the database
-are unrecoverable; you'd have to re-enrol every host.
-
-A simple working approach: copy `secret.key` to your password
-manager or to a separately-backed-up secrets vault the day you
-install. It doesn't change.
-
-## Updating the server
-
-```sh
-# Pin a new version in your compose file (.env or docker-compose.yml),
-# then:
-docker compose pull
-docker compose up -d
-```
-
-Migrations run automatically on startup; the server will refuse to
-start if a migration fails (better to bail than to half-migrate).
-
-For the agent self-update story, see
-[Updating agents](../operations/updates.md).
@@ -1,95 +0,0 @@
-# Running behind a reverse proxy
-
-The restic-manager server is HTTP-only by design. TLS termination,
-public hostname, ACME, HSTS, and edge-level rate limiting all
-belong to a reverse proxy you already operate outside this project.
-
-## What the proxy must forward
-
-The server reads four headers when (and only when) the immediate
-peer matches `RM_TRUSTED_PROXY`:
-
-| Header                 | Value                                              | Why |
-|------------------------|----------------------------------------------------|-----|
-| `X-Forwarded-For`      | The original client IP                             | Rate-limit keys, audit log entries, OIDC redirect-URI checks. |
-| `X-Forwarded-Proto`    | `https`                                            | Used for absolute URLs (e.g. OIDC redirect URIs). |
-| `Host`                 | The public hostname clients use                    | Cookies are scoped to this; `RM_BASE_URL` must match. |
-| `Connection` / `Upgrade` | Pass through unchanged                           | `/ws/agent` and `/api/jobs/{id}/stream` are WebSockets; without `Upgrade: websocket` they fail. |
-
-Set `RM_TRUSTED_PROXY` to the CIDR (or comma-separated list of
-CIDRs) the proxy connects from. Anything outside that range has
-its `X-Forwarded-*` headers ignored, so a stray request that
-bypasses the proxy can't spoof the client IP.
-
-## Caddy
-
-```caddyfile
-restic.example.com {
-    encode zstd gzip
-    reverse_proxy 127.0.0.1:8080 {
-        header_up X-Real-IP {remote_host}
-    }
-}
-```
-
-Caddy adds `X-Forwarded-For` / `X-Forwarded-Proto` automatically
-and passes WebSocket headers through by default, so this is the
-whole config.
-
-## nginx
-
-```nginx
-server {
-    listen 443 ssl http2;
-    server_name restic.example.com;
-
-    ssl_certificate     /etc/letsencrypt/live/restic.example.com/fullchain.pem;
-    ssl_certificate_key /etc/letsencrypt/live/restic.example.com/privkey.pem;
-
-    location / {
-        proxy_pass         http://127.0.0.1:8080;
-        proxy_http_version 1.1;
-        proxy_set_header   Host              $host;
-        proxy_set_header   X-Forwarded-For   $proxy_add_x_forwarded_for;
-        proxy_set_header   X-Forwarded-Proto https;
-
-        # WebSocket upgrade
-        proxy_set_header   Upgrade           $http_upgrade;
-        proxy_set_header   Connection        "upgrade";
-
-        # Long-lived agent WS — disable read timeout for this surface.
-        proxy_read_timeout 86400s;
-    }
-}
-```
-
-## Traefik
-
-```yaml
-http:
-  routers:
-    restic-manager:
-      rule: "Host(`restic.example.com`)"
-      entryPoints: [websecure]
-      tls:
-        certResolver: letsencrypt
-      service: restic-manager
-
-  services:
-    restic-manager:
-      loadBalancer:
-        servers:
-          - url: "http://restic-manager:8080"
-        passHostHeader: true
-```
-
-Traefik forwards WebSocket upgrades and the standard
-`X-Forwarded-*` set out of the box.
-
-## Verification
-
-After bringing the proxy up, the audit log should show your real
-client IP for an interactive login (not the proxy's local
-address). If you see `127.0.0.1` or the proxy's container IP, your
-`RM_TRUSTED_PROXY` is wrong or `X-Forwarded-For` isn't being
-forwarded.
@@ -1,86 +0,0 @@
-# restic-manager
-
-restic-manager is a self-hosted, browser-based, single-pane-of-glass
-for managing [restic](https://restic.net) backups across a fleet of
-Linux and Windows endpoints. It's designed for **small fleets** —
-the original target was twelve endpoints — and **one operator**.
-
-## What it does
-
- Centralised view of every endpoint's last backup, repo size,
-  snapshot count, and recent jobs.
- Trigger any restic operation remotely (`backup`, `forget`, `prune`,
-  `check`, `unlock`, `snapshots`, `stats`, `diff`, `restore`).
- Per-host backup schedules with source groups (named bundles of
-  paths + retention policy).
- Live job log streamed to the browser; downloadable as text or NDJSON.
- Restore wizard with snapshot tree browse + path selection.
- Repo-level health surfacing (size, raw size, last-check, lock
-  state) plus a 30/90-day size trend.
- Alerting over webhook, ntfy, or SMTP.
- Cross-platform agent (Linux + Windows).
- Append-only-credential-friendly with a separate admin credential
-  for forget/prune.
-
-## What it isn't
-
- **Not a SaaS.** Single-instance, single-tenant, by design.
- **Not a replacement for restic** — it's a control plane. The agent
-  shells out to a real `restic` binary.
- **Not highly available.** SQLite, single process; if you need
-  HA backups, you're shopping in the wrong aisle.
- **Not a multi-protocol backup tool.** restic only.
-
-## How it fits together
-
-```
-┌──────────────────────────────────────────────┐
-│  Server (control plane, Docker)              │
-│   - REST + WebSocket API                     │
-│   - SQLite store                             │
-│   - Embedded HTMX UI                         │
-└──────────┬─────────────────────────┬─────────┘
-           │ outbound WS              │ HTTP(S)
-           │                          │
-┌──────────▼──────────┐    ┌──────────▼─────────┐
-│  Agent (per host)   │    │  Browser (operator) │
-│   - restic wrapper  │    └─────────────────────┘
-│   - cron for sched. │
-└──────────┬──────────┘
-           │ restic
-┌──────────▼──────────────────────────────────┐
-│  rest-server / S3 / SFTP / local repo       │
-│  (the actual backup data — server never     │
-│   touches it)                               │
-└─────────────────────────────────────────────┘
-```
-
-The control plane is a Go binary that runs in Docker. Each endpoint
-runs a small Go agent that holds an outbound WebSocket to the
-control plane. Backup data flows directly between the agent and the
-restic repository — the control plane never sees a snapshot byte.
-
-## Where to start
-
- [Installing the server](./getting-started/install.md) walks
-  through the Docker-based reference deployment.
- [Enrolling your first host](./getting-started/enrolling-hosts.md)
-  covers the install scripts and the announce-and-approve flow.
- [Architecture](./concepts/architecture.md) is the right read if
-  you want to know why something is the way it is before running
-  the install.
-
-## Project status
-
-Pre-1.0 but feature-complete for the original use case. Phases
-0–4 are landed (MVP, scheduling, restore, RBAC + OIDC); Phase 5
-(this docs site, contributor onboarding, end-to-end CI) is in
-flight. See [`tasks.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/tasks.md)
-for the live roadmap and [`spec.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/spec.md)
-for the canonical design doc.
-
-## License
-
-[PolyForm Noncommercial 1.0.0](https://polyformproject.org/licenses/noncommercial/1.0.0/).
-Personal and community deployments welcome; commercial use
-requires a separate license.
@@ -1,39 +0,0 @@
-# License
-
-restic-manager is licensed under
-[**PolyForm Noncommercial 1.0.0**](https://polyformproject.org/licenses/noncommercial/1.0.0/).
-The full text lives at
-[`LICENSE`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/LICENSE)
-in the repository root.
-
-## What this means
-
- **Personal, hobbyist, educational, charitable, and similar
-  noncommercial use** is fully permitted, including modification
-  and redistribution.
- **Commercial use is not permitted** without a separate
-  license. The maintainer is not currently offering one — if
-  you need commercial rights, open an issue to start the
-  conversation.
- The license is permissive about everything except commercial
-  use: you can fork, modify, deploy in your home/lab, and
-  contribute back.
-
-## Why this license
-
-The PolyForm Noncommercial license was chosen because:
-
- It's a real, legal, plainly-worded license (not a custom
-  half-written variant).
- It permits the realistic uses for a hobby project (the
-  maintainer's homelab, a friend's fleet, a charity's IT
-  closet) without inviting commercial vendors to repackage
-  the work.
- It's compatible with the project staying small and
-  maintainable — the maintainer doesn't want to be on the hook
-  for SLA-grade commercial support.
-
-## Contributions
-
-By contributing, you agree your contributions are licensed
-under the same PolyForm Noncommercial 1.0.0 license.
@@ -1,73 +0,0 @@
-# Alerts and notifications
-
-restic-manager raises alerts on conditions that need human
-attention. The alert engine evaluates rules on a 60s tick and
-on every job-finished / host-online event.
-
-## Built-in alert kinds
-
-| Kind                | Trigger | Severity |
-|---------------------|---------|----------|
-| `backup_failed`     | A backup job ends in `failed` or `cancelled` | warning |
-| `forget_failed`     | A forget job ends in `failed` | warning |
-| `prune_failed`      | A prune job ends in `failed` | critical |
-| `check_failed`      | A check job ends in `failed` | critical |
-| `agent_offline`     | A host has been offline more than 90s past its heartbeat cadence | warning |
-| `stale_schedule`    | A schedule's "last run" is more than 1.5 × its interval ago | warning |
-| `update_failed`     | An agent self-update reported a failure, or the agent didn't reconnect within 90s | warning |
-| `fleet_update_halted` | The rolling fleet-update worker stopped on a failure | critical |
-
-Each alert has a `dedup_key` so re-firing the same condition
-just bumps `last_seen_at` — the operator gets one row per
-condition, not a thousand.
-
-## Lifecycle
-
-```
-raised  ──acknowledge──▶  acknowledged  ──resolve──▶  resolved
-   │                          │
-   └────────auto-resolve──────┘
-   (e.g. agent_offline auto-resolves on agent_online)
-```
-
- **Acknowledge** says "I've seen this, stop notifying about it".
- **Resolve** says "the underlying condition is gone".
- Some alerts auto-resolve when the condition clears
-  (`agent_offline` is the canonical example).
-
-## Notification channels
-
-Configure under **Settings → Notifications**. Each channel can
-subscribe to all alerts or filter by severity.
-
-### Webhook
-
-Posts a JSON envelope to a URL of your choice. Useful for
-piping into Slack via an Incoming Webhook URL or into your own
-alerting tooling.
-
-### ntfy
-
-Pushes a plain-text alert to an [ntfy.sh](https://ntfy.sh/)
-topic. Configure the topic URL; optional bearer token if you
-self-host with auth.
-
-### SMTP
-
-Plain SMTP (with optional TLS). Configure host, port,
-username, password, and the recipient list.
-
-## Test fire
-
-Each channel exposes a **Test fire** button that dispatches a
-single synthetic alert through the channel without touching the
-alert engine. Use this when you've added a channel and want to
-verify connectivity before the next real failure happens.
-
-## What gets logged
-
-Every alert raise / acknowledge / resolve writes an audit log
-entry. The audit log UI at **Settings → Audit log** filters by
-user, action, target, and time range — useful for the
-post-incident "who clicked acknowledge on the prune-failure
-alert" question.
@@ -1,73 +0,0 @@
-# Backups and restores
-
-## Running a backup
-
-Three ways to trigger one:
-
-1. **Scheduled** — the agent's local cron fires at the time set
-   on the schedule.
-2. **Run-now** — operator clicks **Run now** on the host detail
-   right rail. Posts to `/hosts/{id}/run-backup` (defaults to all
-   source groups) or to a per-group form for finer control.
-3. **API** — `POST /api/hosts/{id}/jobs` with the appropriate
-   payload. Same audit + dispatch path.
-
-In every case the server creates a `jobs` row, broadcasts a
-`command.run` to the host, and lands the operator on the live
-job log page (HTMX `HX-Redirect`).
-
-## Cancelling a job
-
-Any running job — backup, forget, prune, restore, anything —
-exposes a **Cancel** button on its detail page. The server
-broadcasts `command.cancel`, and the agent kills the running
-restic subprocess via context cancel: SIGTERM first, SIGKILL
-after a 5s grace (`cmd.Cancel` + `cmd.WaitDelay`). On Windows the
-SIGTERM step is replaced with `os.Kill` because Windows can't
-deliver SIGTERM. Result: a cancelled job typically lands as
-`cancelled` within a couple of hundred milliseconds, or after the
-5s grace period if restic doesn't exit on SIGTERM.
-
-## Restore wizard
-
-Restoring a file or path goes through a four-step wizard at
-`/hosts/{id}/restore`:
-
-1. **Pick a snapshot.** Search by id or by date; the page is
-   pre-populated when you launched the wizard from a snapshot row.
-2. **Browse the snapshot tree.** Lazy-loaded children via the
-   `MsgTreeList` synchronous WS RPC; results are cached
-   per-wizard-session for 30 minutes. Pick the absolute paths
-   you want.
-3. **Choose a target.** Either **In place** (overwrites the
-   live filesystem; requires you to type the hostname to
-   confirm) or **New directory** (default
-   `$HOME/rm-restore/<job-id>/`; agent expands `$HOME` /
-   `${HOME}` / `~/` and creates the directory chain).
-4. **Review and submit.** Server mints a job, dispatches
-   `command.run` with a `RestorePayload`, and `HX-Redirect`s to
-   the live job log.
-
-`--no-ownership` is gated on restic ≥ 0.17 (the flag was added
-in that release). Hosts running 0.16 don't get the flag and
-restore as the running user instead.
-
-## Snapshot diff
-
-Enter two snapshot ids in the **Diff** form on the host detail
-page to start a `JobDiff` job that runs `restic diff <a> <b>`.
-Output streams to the standard live job log. Useful when
-investigating a suspiciously-sized backup.
-
-## Job log artefacts
-
-Every job's log is persisted in `job_logs` (one row per line),
-not just streamed in-memory. That gives you:
-
- A live view at `/jobs/{id}` while the job runs.
- Two download formats from the same page header dropdown:
-  - **txt** — one line per row, `HH:MM:SS.mmm  TAG  payload`.
-  - **ndjson** — one self-contained JSON object per line
-    (`{seq, ts, stream, payload}`), perfect for `jq`.
-
-Downloads work whether the job is running or finished —
-the source is the DB, not the live socket.
@@ -1,61 +0,0 @@
-# Observability with Prometheus
-
-restic-manager can expose a Prometheus scrape endpoint at
-`GET /metrics`. The endpoint is **opt-in** — without an explicit
-auth gate it isn't even mounted, so a forgotten config can't
-accidentally publish fleet state.
-
-The full reference lives at
-[`docs/prometheus.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/docs/prometheus.md);
-the short version follows.
-
-## Enable the endpoint
-
-Set at least one of:
-
- `RM_METRICS_TOKEN` — `Authorization: Bearer <token>` required.
- `RM_METRICS_TRUSTED_CIDR` — restricts source IPs (comma-separated list of CIDRs).
-
-When both are set, a scrape must pass both checks. The token is
-compared in constant time; the CIDR check honours
-`X-Forwarded-For` only when the immediate hop matches
-`RM_TRUSTED_PROXY`.
-
-## Metrics emitted
-
- **Server gauges**: `rm_hosts_total`, `rm_hosts_online`,
-  `rm_active_alerts{severity}`, `rm_build_info{...}`.
- **Per-host gauges**: `rm_host_agent_online`,
-  `rm_host_last_backup_timestamp_seconds`,
-  `rm_host_last_backup_success`, `rm_host_repo_size_bytes`,
-  `rm_host_snapshot_count`, `rm_host_open_alerts`,
-  `rm_host_repo_status`.
- **Histogram**:
-  `rm_job_duration_seconds{kind,status,le=…}` (buckets
-  `1, 5, 30, 60, 300, 1800, 3600, 21600, 86400, +Inf`).
-
-The histogram is held in memory only. Prometheus persists the
-scrapes; if you need durable history at hourly resolution,
-that's Prometheus's job.
-
-## Sample Grafana dashboard
-
-[`deploy/grafana/restic-manager-dashboard.json`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/deploy/grafana/restic-manager-dashboard.json)
-imports through Grafana's **+ → Import → Upload JSON file**.
-Six panels:
-
-1. Fleet status (online / total).
-2. Open alerts by severity.
-3. Backups failing on most-recent run.
-4. Hosts table — last backup, repo size, snapshots, open alerts.
-5. Repo size over time, one line per host.
-6. Job-duration p95 over a 1h window per kind.
-
-## Alerting
-
-restic-manager already has a built-in alert engine
-([Alerts](./alerts.md)). The dashboard intentionally doesn't
-duplicate it as Prometheus alert rules. If you want
-Prometheus-side alerts on top, write your own based on the
-metrics above — `rm_host_last_backup_success == 0`,
-`time() - rm_host_last_backup_timestamp_seconds > <max age>`,
-or whatever suits your environment.
@@ -1,50 +0,0 @@
-# Updating agents
-
-Server updates are a `docker compose pull && docker compose up -d`
-away. Agents update via the control plane.
-
-## Single-host update
-
-Each host's detail page shows an **Update agent** button when
-the agent's reported version is older than the server's. The
-button:
-
-1. Dispatches a `command.update` to that host.
-2. The agent fetches the appropriate binary from
-   `$RM_SERVER/agent/binary?os=…&arch=…` to
-   `<binary-path>.new`.
-3. Copies the running binary to `<binary-path>.old` (one
-   revision back, in case rollback is needed).
-4. Atomic-renames `.new` over the running binary.
-5. Exits cleanly. systemd's `Restart=always` (or Windows SCM)
-   brings the process back on the new binary.
-
-A 90-second timer on the server side waits for a hello at the
-target version and marks the update succeeded — or, if the
-agent doesn't reconnect at the expected version in time, marks
-the update **failed** and raises an `update_failed` alert.
-
-## Fleet update
-
-The admin-only **Settings → Fleet update** page drives a rolling
-update across every host in the fleet:
-
- One host at a time.
- Wait for hello-with-target-version (max 95s).
- On any host failing, **halt** the rollout, raise a
-  `fleet_update_halted` alert, leave the rest of the fleet on
-  the old version. No surprise mass-failures.
-
-You can cancel an in-progress fleet update; the worker stops
-after the current host finishes.
-
-## TLS and corruption
-
-Updates rely on the reverse proxy's TLS to detect corruption in
-transit. There's no separate sha256 verification step — we
-chose the simpler model on the basis that the same TLS already
-gates every other byte the server hands to the agent.
-
-If you'd like a separate signature step before applying updates,
-that's a future-phase enhancement (see `tasks.md` Phase 6
-candidates).
@@ -1,58 +0,0 @@
-# Environment variables
-
-The server reads its configuration from environment variables
-(canonical) with an optional YAML overlay. Env wins over YAML so
-operators can tweak a single setting without rewriting the file.
-
-## Server
-
-| Variable                  | Default                          | Meaning |
-|---------------------------|----------------------------------|---------|
-| `RM_LISTEN`               | `:8080`                          | TCP listener for the HTTP server. |
-| `RM_DATA_DIR`             | `/data`                          | Persistent state directory (SQLite, secret key, agent assets). |
-| `RM_BASE_URL`             | (none)                           | Public URL clients use; required for OIDC redirects + cookie scope. |
-| `RM_SECRET_KEY_FILE`      | `${RM_DATA_DIR}/secret.key`      | Path to the AEAD key file. Auto-generated on first run. |
-| `RM_COOKIE_SECURE`        | `true`                           | Set `false` only for local HTTP testing. Controls `Secure` on session cookies. |
-| `RM_TRUSTED_PROXY`        | (none)                           | Comma-separated CIDRs trusted for `X-Forwarded-*`. |
-| `RM_BUNDLED_ASSETS_DIR`   | `/opt/restic-manager/dist`       | Read-only path with bundled agent binaries + install scripts (the docker image bakes them here). |
-| `RM_METRICS_TOKEN`        | (off)                            | When set, `GET /metrics` requires `Authorization: Bearer <token>`. |
-| `RM_METRICS_TRUSTED_CIDR` | (off)                            | When set, `GET /metrics` restricts source IPs to the listed CIDRs (comma-separated). |
-
-OIDC variables (all optional; empty issuer disables OIDC):
-
-| Variable                       | Meaning |
-|--------------------------------|---------|
-| `RM_OIDC_ISSUER`               | OIDC discovery URL (e.g. `https://auth.example.com`). |
-| `RM_OIDC_CLIENT_ID`            | Client ID registered with the IdP. |
-| `RM_OIDC_CLIENT_SECRET`        | Client secret (or use `RM_OIDC_CLIENT_SECRET_FILE`). |
-| `RM_OIDC_CLIENT_SECRET_FILE`   | Path to a file holding the client secret. |
-| `RM_OIDC_DISPLAY_NAME`         | Button label on the login page (e.g. "Authelia"). |
-| `RM_OIDC_ROLE_CLAIM`           | Token claim that carries roles (default `groups`). |
-| `RM_OIDC_ROLE_MAPPING`         | `idp-group=role` entries, comma-separated (e.g. `rm-admin=admin,rm-ops=operator`). |
-| `RM_OIDC_REDIRECT_URL`         | Override for the redirect URL; defaults to `${RM_BASE_URL}/auth/oidc/callback`. |
-
-## Agent
-
-| Variable             | Default | Meaning |
-|----------------------|---------|---------|
-| `RM_AGENT_CONFIG`    | `/etc/restic-manager/agent.yaml` (Linux) | Config file path. |
-
-The agent's other settings live in the YAML file (server URL,
-bearer token, optional cert pin). The install script writes that
-file for you at enrolment.
-
-## Build-time
-
-The Makefile threads `-ldflags` from `git describe` into the
-`internal/version` package so `--version` and the dashboard
-footer show the right values:
-
-```
-X gitea.dcglab.co.uk/steve/restic-manager/internal/version.Version=$(VERSION)
-X gitea.dcglab.co.uk/steve/restic-manager/internal/version.Commit=$(COMMIT)
-```
-
-If you build with `go build` directly (no Makefile), `Version`
-falls back to `dev` and the agent-update comparison falls back
-to "always equal". Source-build deployments can still run; they
-just don't participate in the self-update flow.
@@ -1,82 +0,0 @@
-# HTTP endpoints
-
-A non-exhaustive map of the surfaces the control plane exposes.
-All `/api/*` routes return JSON; all other paths render HTML
-(server-rendered with HTMX in the loop).
-
-The canonical wiring lives at
-[`internal/server/http/server.go`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/internal/server/http/server.go);
-when in doubt, read the routes block there.
-
-## Public (no auth)
-
-| Method | Path                       | Purpose |
-|--------|----------------------------|---------|
-| GET    | `/healthz`                 | Liveness probe. Returns 204. |
-| POST   | `/api/auth/login`          | Local-user login. JSON body: `{username, password}`. |
-| POST   | `/api/auth/logout`         | Invalidate the session cookie. |
-| POST   | `/api/bootstrap`           | First-run admin creation. Accepts the token printed at first start. |
-| POST   | `/api/agents/enroll`       | Token-based agent enrolment. |
-| POST   | `/api/agents/announce`     | Announce-and-approve agent enrolment. |
-| GET    | `/agent/binary?os=&arch=`  | Serves the agent binary for the install scripts. |
-| GET    | `/install/*`               | Serves the Linux + Windows install scripts and the systemd unit. |
-| GET    | `/api/version`             | Build version + commit JSON. |
-| GET    | `/metrics`                 | Prometheus exposition (only when opted-in via `RM_METRICS_TOKEN` / `RM_METRICS_TRUSTED_CIDR`). |
-| GET    | `/login`, `/setup`, `/bootstrap` | UI pages. |
-
-## Authenticated (any role)
-
-| Method | Path                                     | Purpose |
-|--------|------------------------------------------|---------|
-| GET    | `/`                                      | Dashboard. |
-| GET    | `/hosts/{id}`                            | Host detail. |
-| GET    | `/hosts/{id}/repo`                       | Repo tab. |
-| GET    | `/hosts/{id}/jobs`                       | Jobs tab. |
-| GET    | `/hosts/{id}/sources`                    | Source groups list. |
-| GET    | `/hosts/{id}/schedules`                  | Schedules list. |
-| GET    | `/jobs/{id}`                             | Live job log. |
-| GET    | `/api/hosts`, `/api/fleet/summary`       | JSON list + summary. |
-| GET    | `/api/jobs/{id}/stream`                  | WebSocket subscription to a job's live log. |
-| GET    | `/api/jobs/{id}/log.{txt,ndjson}`        | Persisted log download. |
-
-## Operator role and above
-
-| Method | Path                                  | Purpose |
-|--------|---------------------------------------|---------|
-| POST   | `/hosts/{id}/run-backup`              | Run-now (HTMX form-post). |
-| POST   | `/hosts/{id}/sources/{gid}/run-now`   | Per-source-group run-now. |
-| POST   | `/hosts/{id}/repo/{prune,check,unlock,reinit,probe}` | Maintenance actions. |
-| POST   | `/api/hosts/{id}/snapshots/diff`      | Snapshot-diff job. |
-| POST   | `/hosts/{id}/restore`                 | Restore wizard submit. |
-| POST   | `/api/jobs/{id}/cancel`               | Cancel a running job. |
-| POST   | `/hosts/{id}/tags`                    | Update host tags. |
-| POST   | `/hosts/{id}/sources` and friends     | Source-group CRUD. |
-| POST   | `/hosts/{id}/schedules` and friends   | Schedule CRUD. |
-| POST   | `/hosts/{id}/repo/credentials`, `/admin-credentials` | Credential update. |
-
-## Admin role only
-
-| Method | Path                                  | Purpose |
-|--------|---------------------------------------|---------|
-| POST   | `/hosts/new`                          | Mint enrolment token (Add host). |
-| POST   | `/hosts/{id}/delete`                  | Delete + cascade. |
-| POST   | `/hosts/{id}/update`                  | Dispatch a single agent update. |
-| GET/POST | `/settings/users/...`                | User management. |
-| POST   | `/settings/notifications/...`         | Notification channel CRUD + test fire. |
-| POST   | `/settings/fleet-update/...`          | Fleet-update worker. |
-
-## WebSocket
-
-| Path                           | Who connects | Auth |
-|--------------------------------|--------------|------|
-| `/ws/agent`                    | Agent        | Bearer token issued at enrolment. |
-| `/ws/agent/pending`            | Agent (announce flow) | Pending-id query param. |
-| `/api/jobs/{id}/stream`        | Browser      | Session cookie. |
-
-## RBAC enforcement
-
-Routes are grouped into chi route-groups by required role
-(`viewer < operator < admin`); the `requireRole` middleware in
-`internal/server/http/middleware.go` is the bouncer. Sessions
-re-validate `disabled_at` on every request, so a disabled user's
-cookie stops working immediately.
@@ -1,32 +0,0 @@
-# Roadmap
-
-The live roadmap is in
-[`tasks.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/tasks.md).
-Phases ship in order; items inside a phase ship as the
-opportunity arises.
-
-## Status snapshot
-
-| Phase | Theme                                            | Status |
-|-------|--------------------------------------------------|--------|
-| 0     | Project bootstrap                                | ✅ done |
-| 1     | MVP: enrolment, visibility, on-demand backup     | ✅ done |
-| 2     | Scheduling, retention, repo operations           | ✅ done |
-| 3     | Restore, alerts, audit                           | ✅ done |
-| 4     | RBAC, OIDC, host tags                            | ✅ done |
-| 5     | OSS readiness                                    | 🚧 in flight (this docs site is part of it) |
-| 6     | Update delivery + observability polish           | ✅ done |
-
-## What's not on the roadmap
-
-The non-goals list in [`spec.md` §2](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/spec.md):
-
- Replacing restic itself or providing custom repo formats
- Managing non-restic backup tools
- Multi-tenancy / SaaS deployment
- High availability of the control plane (SQLite, single-instance)
- Mobile-native apps (responsive web only)
-
-If something there is critical to your use case, restic-manager
-isn't the right tool. That's not a closed door — it's a
-deliberate scope decision so the project stays maintainable.
@@ -1,35 +0,0 @@
-# Reporting vulnerabilities
-
-The full disclosure policy lives in
-[`SECURITY.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/SECURITY.md)
-at the repo root. The short version:
-
- **Don't open a public issue.**
- Send a Gitea private message to `steve` on
-  <https://gitea.dcglab.co.uk>, or email the address on the
-  maintainer's profile, with a subject like
-  `[SECURITY] restic-manager: <one-line summary>`.
- Expect an acknowledgement within 3 working days; escalate
-  through the other channel if you don't get one.
- Default disclosure window is **30 days from confirmed report
-  to public disclosure**, faster if a PoC is already
-  circulating, slower only by mutual agreement.
-
-## What to include
-
-A description of the issue and the impact, the affected
-component (server / agent / install script / docs), the version,
-and reproduction steps. A working PoC is welcome but not
-required — a credible threat model is enough.
-
-## In scope vs. out of scope
-
-See the full policy. Quick highlights:
-
- **In scope:** server, agent, install scripts, docker image,
-  docker-compose reference, crypto choices, docs that lead to
-  insecure configs.
- **Out of scope:** restic itself (report upstream), unpatched
-  third-party deps (report upstream first), abuse by an
-  already-authenticated admin (admins are designed to have full
-  power), DoS on deployments without the recommended reverse
-  proxy.
@@ -1,72 +0,0 @@
-# Hardening checklist
-
-A baseline for new deployments. Most of these are defaults; the
-list is here to make audit easy.
-
-## Server
-
- [ ] Reverse proxy in front, TLS terminating at the proxy
-      (Caddy/nginx/Traefik).
- [ ] `RM_TRUSTED_PROXY` set to the proxy's CIDR.
- [ ] `RM_BASE_URL` matches the public hostname and the cookie
-      scope you want.
- [ ] `RM_COOKIE_SECURE=true` (the default; only set `false`
-      for local HTTP testing).
- [ ] HTTP listener bound to **localhost** in the compose file,
-      not `0.0.0.0`. The reverse proxy is the only thing that
-      should reach it.
- [ ] `secret.key` backed up separately from the database.
- [ ] Bootstrap token consumed and the printed log line scrubbed
-      from any log archive.
-
-## Authentication
-
- [ ] Admin user has a password ≥ 12 characters (the floor).
- [ ] OIDC enabled if you have an IdP — local password auth
-      stays as a break-glass.
- [ ] Disabled (not deleted) any users who change roles or leave
-      so their session is invalidated immediately.
- [ ] The last-admin guard isn't tripped — there's always at
-      least one enabled admin user.
-
-## Repo credentials
-
- [ ] Append-only credential set as the everyday cred for every
-      host.
- [ ] Admin credential set only where prune cadence is enabled.
- [ ] No credentials reused across hosts. Each host should have
-      its own credential pair so a single host compromise has a
-      single blast radius.
- [ ] If using rest-server, `--append-only` flag is on for the
-      everyday user; the prune user is a separate identity.
-
-## Agent
-
- [ ] Agent runs as `root` (Linux) or `LocalSystem` (Windows)
-      **only when** the source paths require it. Otherwise pin
-      a service user that has read access to what's backed up
-      and nothing else.
- [ ] systemd unit's sandboxing flags are intact
-      (`NoNewPrivileges`, `Protect*`, `MemoryDenyWriteExecute`).
- [ ] Agent's config file `/etc/restic-manager/agent.yaml` is
-      mode `0600` and owned by the service user. The bearer
-      token lives in there.
-
-## Operations
-
- [ ] Alerts wired to a real channel (webhook into Slack,
-      ntfy topic, SMTP) — not just sitting in the UI.
- [ ] Test-fire each notification channel after configuring.
- [ ] Audit-log retention is long enough to cover the operator's
-      incident-response window.
- [ ] Prometheus endpoint, if enabled, gated by token AND CIDR
-      where practical (default is opt-in / off).
-
-## Recovery
-
- [ ] A documented procedure for rotating a leaked agent bearer
-      (delete + re-enrol the host).
- [ ] A test-restore done at least once, end-to-end, before
-      relying on the system in anger.
- [ ] `secret.key` and the SQLite database covered by separate
-      backup paths so neither alone reconstitutes the other.
@@ -1,110 +0,0 @@
-# Threat model
-
-This page documents what restic-manager defends against, what it
-doesn't, and the trust assumptions a deployment is making. The
-canonical version lives in [`spec.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/spec.md)
-§11; the summary here is shaped for operators rather than
-implementers.
-
-## Trust boundaries
-
-```
-┌──────────────────────────────────────────┐
-│  TRUSTED zone                            │
-│  ┌─────────────┐    ┌──────────────┐     │
-│  │  Operator's │    │   Reverse    │     │
-│  │   browser   │◄──►│    proxy     │     │  TLS terminates here
-│  └─────────────┘    └──────┬───────┘     │
-└────────────────────────────┼─────────────┘
-                             │ HTTP, plaintext
-                             │ (loopback or trusted LAN)
-┌────────────────────────────▼─────────────┐
-│  Server (control plane)                  │
-└────────────┬─────────────────────────────┘
-             │ outbound WebSocket (TLS to clients via proxy)
-             │ — bearer-authenticated
-┌────────────▼──────────────┐
-│  Agent (per host)         │  ◄── attacker model: assume one
-└────────────┬──────────────┘       endpoint can be compromised
-             │ subprocess
-             ▼
-   restic ──▶ repository (rest-server / S3 / SFTP / …)
-```
-
-## What we defend against
-
-### Network attacker between operator and server
-
- HTTPS via the reverse proxy is the only operator-facing surface
-  on a sane deployment.
- `RM_COOKIE_SECURE=true` (default) means the session cookie
-  refuses to ride a non-HTTPS connection.
- `RM_TRUSTED_PROXY` gates whether `X-Forwarded-*` is honoured;
-  a request that bypasses the proxy can't spoof the client IP.
-
-### Compromised agent host
-
- The agent's bearer token can dispatch commands **only on its
-  own host**. It can't read other hosts' state, dispatch jobs
-  on other hosts, or escalate within the control plane.
- If you suspect a host compromise:
-  1. Delete the agent's host row via **Hosts → Delete** (the
-     delete cascades to the agent's bearer hash).
-  2. Rotate the repo credential at the rest-server / object
-     store side.
-  3. Review the audit log; it lists every action that bearer
-     ever drove.
-
-### DB compromise without the secret key
-
- Repo credentials are AEAD-encrypted at rest. A DB dump alone
-  doesn't expose them.
- Agent bearer **hashes** are leaked; that's enough to
-  authenticate as any agent until you revoke. A rotation
-  procedure is just "delete + re-enrol" today.
- Operator passwords are bcrypt-hashed; OIDC users have no
-  password to leak.
- Session tokens are hashed; an attacker can't replay a
-  session from a DB dump.
-
-### DB compromise WITH the secret key
-
-The attacker can decrypt every credential. Treat
-`secret.key` with the same care as a password manager database.
-Back it up to a separate vault, not to the same Docker volume
-as the database.
-
-### Forget/prune as a DoS vector
-
- The everyday backup credential cannot prune (append-only).
- The admin credential is only pushed to the agent at the
-  moment of dispatch and discarded after the job ends.
- Compromise of a single agent host does **not** grant prune
-  rights — at worst the attacker gets fresh write access until
-  the credential is rotated.
-
-### Operator-side typo or bad copy-paste
-
- Repo credentials are stored encrypted; mis-typed creds fail
-  fast on the next `restic` invocation rather than silently
-  corrupting state.
- NS-03 added auto-init: the first dispatched job after creds
-  change runs `restic init`, surfaces the error eagerly under
-  the host's vitals strip if the creds are bad, and resets the
-  host's `repo_status` so the operator can retry without
-  hunting through job logs.
-
-## What we don't defend against
-
- **Insider threat at the maintainer level.** A malicious
-  maintainer can publish a backdoored container; SBOM /
-  signing infrastructure (Phase 6 candidate) would help here
-  but isn't shipped today.
- **Supply chain.** We pin module versions (`go.sum`) and
-  pin the Tailwind binary's release tag, but a compromise in
-  one of those upstreams would land here.
- **Side-channel via restic itself.** A bug in restic that
-  enables snapshot-content disclosure is restic's problem; the
-  control plane doesn't see snapshot bytes either way.
- **DoS via resource exhaustion** without the recommended
-  reverse-proxy / rate-limit in front. Don't expose the
-  server's HTTP port to the public internet directly.
@@ -1,120 +0,0 @@
-# End-to-end test harness
-
-The e2e harness stands up the full production-shaped stack
-(server + agent + rest-server) in Docker Compose and drives it
-through Playwright. CI runs it on every PR; operators can run it
-locally too.
-
-## Files
-
-```
-e2e/
-├── compose.e2e.yml         compose stack: server + rest-server + agent
-├── Dockerfile.agent        Linux container for the agent (alpine + restic)
-├── agent-entrypoint.sh     decides between announce / token-enrol / run
-└── playwright/
-    ├── package.json
-    ├── playwright.config.ts
-    └── tests/
-        ├── lib/server.ts   bootstrap, login, accept, poll helpers
-        └── smoke.spec.ts   happy-path: enrol → backup → succeeded
-```
-
-## Local run
-
-Prerequisites: Docker + Docker Compose, and `npx` for Playwright.
-
-```sh
-# 1. Build + bring up the stack (server, rest-server, source data).
-docker compose -f e2e/compose.e2e.yml up --build -d server rest-server source-fixture
-
-# 2. Wait for the server, then scrape the bootstrap token from the log.
-until curl -fsS http://127.0.0.1:8080/api/version >/dev/null; do sleep 1; done
-RM_BOOTSTRAP_TOKEN=$(docker compose -f e2e/compose.e2e.yml logs server \
-    | grep -Eo '[a-zA-Z0-9_-]{40,}' | head -1)
-export RM_BOOTSTRAP_TOKEN
-
-# 3. Start the agent (it announces against the running server).
-docker compose -f e2e/compose.e2e.yml up -d agent
-
-# 4. Install + run Playwright.
-cd e2e/playwright
-npm install
-npx playwright install --with-deps chromium
-npx playwright test
-```
-
-When the test passes you'll see:
-
-```
-Running 2 tests using 1 worker
-  ✓  smoke: enrol-via-announce → backup › happy path completes in under a minute (47s)
-  ✓  smoke: scrape /metrics › metrics endpoint exposes the host gauge (180ms)
-
-  2 passed (47.5s)
-```
-
-Tear-down:
-
-```sh
-docker compose -f e2e/compose.e2e.yml down -v
-```
-
-`-v` removes the named volumes too — important between runs because
-the rest-server volume holds an initialised repo and the
-agent-config volume holds a stale bearer.
-
-## What the test exercises
-
-1. **Bootstrap.** Posts the admin-creation request to
-   `/api/bootstrap` with the token scraped from the server log.
-2. **Login (UI).** Drives the login form via Playwright; verifies
-   the dashboard loads with a session cookie set.
-3. **Pending host appears.** Polls the dashboard for the inline
-   accept form generated by the announcing agent; reads the
-   pending-id out of its action URL.
-4. **Accept.** POSTs `/api/pending-hosts/{id}/accept` with the
-   rest-server URL + repo password. The server mints a Host row
-   + bearer + AEAD-encrypted creds and pushes the bearer down
-   the still-open pending WebSocket.
-5. **Online + auto-init.** Polls `/api/hosts` until the new host
-   is `status=online`. Auto-init runs as part of this — the
-   first dispatched job after creds save is `restic init`.
-6. **Run backup.** Submits the host detail page's `Run now`
-   form; expects `HX-Redirect` to the live job page.
-7. **Verify.** Polls `/api/hosts` until the host's
-   `last_backup_status` flips to `succeeded`.
-8. **Metrics.** Scrapes `/metrics` and asserts the
-   server-gauge + build-info lines are present (the compose
-   stack opens the endpoint via `RM_METRICS_TRUSTED_CIDR=0.0.0.0/0`).
-
-## CI workflow
-
-[`.gitea/workflows/e2e.yml`](../.gitea/workflows/e2e.yml) runs the
-suite on every PR into `main`. On failure it dumps the last 200
-lines of each container log as a workflow annotation and uploads
-the Playwright HTML report as an artefact.
-
-## When tests fail
-
- **Pending host never appears.** Agent container probably
-  couldn't reach the server. Check `docker compose logs agent`
-  for connection errors and `docker compose logs server` for
-  any 4xx on `/api/agents/announce`.
- **Backup hangs in `running`.** The agent shells out to
-  `restic`; check the live job log at
-  `http://127.0.0.1:8080/jobs/<id>` (still up after a
-  failed test as long as you didn't `down -v`).
- **`RM_BOOTSTRAP_TOKEN not set`.** The server log scrape
-  matched the wrong line or the token regex is too tight. The
-  server prints the token on a line starting with `    ` (four
-  spaces) inside a banner; widen the regex if your server log
-  format changes.
-
-## Adding new tests
-
-The harness is intentionally flat — one `*.spec.ts` per
-scenario. Reuse the helpers in `lib/server.ts` and avoid
-duplicating bootstrap / login boilerplate. Heavy fixtures
-(custom users, OIDC IdP) belong in their own compose override
-file rather than complicating `compose.e2e.yml`.
@@ -1,139 +0,0 @@
-# Prometheus + Grafana
-
-restic-manager exposes a Prometheus scrape endpoint at `GET /metrics`.
-The endpoint is **opt-in** — it is not mounted at all unless you set
-at least one of the auth gates below. Once enabled, it serves the
-standard `text/plain` exposition format that every Prometheus
-release since 2.x parses without configuration.
-
-A sample Grafana dashboard lives at
-`deploy/grafana/restic-manager-dashboard.json`.
-
-## Enable the endpoint
-
-Two switches, both off by default. If both are set, both must pass
-(token AND source-IP); if only one is set, that gate alone
-authorises a scrape.
-
-| Env var                    | YAML key               | Effect |
-|----------------------------|------------------------|--------|
-| `RM_METRICS_TOKEN`         | `metrics_token`        | Requires `Authorization: Bearer <token>`. Compared in constant time. |
-| `RM_METRICS_TRUSTED_CIDR`  | `metrics_trusted_cidrs` (list) | Restricts the source IP to one of the listed CIDRs. Comma-separated in env, list in YAML. Honours `X-Forwarded-For` only when the immediate hop matches `RM_TRUSTED_PROXY`. |
-
-When neither is set, `GET /metrics` returns 404 — the route is not
-registered with the chi router so a forgotten config can't
-accidentally publish fleet state.
-
-### Example: Docker
-
-```yaml
-services:
-  restic-manager:
-    image: gitea.dcglab.co.uk/steve/restic-manager:latest
-    environment:
-      RM_METRICS_TOKEN_FILE: /run/secrets/rm_metrics_token
-      RM_METRICS_TRUSTED_CIDR: "10.0.0.0/8"
-    secrets:
-      - rm_metrics_token
-```
-
-(Note: the `RM_METRICS_TOKEN_FILE` variable shown above is not
-currently supported — the `_FILE` convention is on the roadmap.
-Until it lands, set `RM_METRICS_TOKEN` directly instead.)
-
-## Prometheus scrape config
-
-Drop into your `prometheus.yml`:
-
-```yaml
-scrape_configs:
-  - job_name: restic-manager
-    metrics_path: /metrics
-    scheme: https            # via your reverse proxy
-    static_configs:
-      - targets: ['restic.example.com']
-    authorization:
-      type: Bearer
-      credentials_file: /etc/prometheus/secrets/rm_metrics_token
-```
-
-If you don't run a TLS-terminating proxy in front, drop `scheme:
-https` (the server is HTTP-only — see `docs/reverse-proxy.md`).
-
-## Metric reference
-
-All names are `rm_`-prefixed. Per-host metrics carry a `host_id`
-label (the stable ULID, immune to renames) and a `host` label
-(the human-readable name).
-
-### Server gauges
-
-| Name                  | Labels                             | Description |
-|-----------------------|------------------------------------|-------------|
-| `rm_hosts_total`      | —                                  | Total number of enrolled hosts (excludes pending announces). |
-| `rm_hosts_online`     | —                                  | Number of hosts with `status='online'`. |
-| `rm_active_alerts`    | `severity` ∈ {info, warning, critical} | Open alerts by severity. |
-| `rm_build_info`       | `version, commit, go_version`      | Always 1; pure label-bag for joining. |
-
-### Per-host gauges
-
-| Name                                       | Description |
-|--------------------------------------------|-------------|
-| `rm_host_agent_online`                     | 1 if the agent is currently online, 0 otherwise. |
-| `rm_host_last_backup_timestamp_seconds`    | Unix timestamp of the host's most recent backup. **Omitted** for hosts with no backup yet. |
-| `rm_host_last_backup_success`              | 1 if the most recent backup succeeded, 0 otherwise. **Omitted** for hosts with no backup yet. |
-| `rm_host_repo_size_bytes`                  | Latest reported repo size from `restic stats --mode raw-data`. **Omitted** when unknown. |
-| `rm_host_snapshot_count`                   | Number of restic snapshots known on the host's repo. |
-| `rm_host_open_alerts`                      | Number of currently open alerts attached to this host. |
-| `rm_host_repo_status`                      | Always 1; the `status` label carries `unknown` / `ready` / `init_failed`. |
-
-### Job duration histogram
-
-```
-rm_job_duration_seconds_bucket{kind, status, le}
-rm_job_duration_seconds_sum{kind, status}
-rm_job_duration_seconds_count{kind, status}
-```
-
-`kind` ∈ {backup, forget, prune, check, unlock, restore, diff, init, update}.
-`status` ∈ {succeeded, failed, cancelled}.
-
-Buckets (seconds):
-
-```
-1, 5, 30, 60, 300, 1800, 3600, 21600, 86400, +Inf
-1s   5s  30s  1m  5m   30m   1h    6h    24h
-```
-
-The histogram is in-memory only — values reset on process restart.
-Operators who want durable history should let Prometheus persist
-the scrapes; restic-manager itself is a control plane, not a
-metrics database.
-
-## Grafana dashboard
-
-Import `deploy/grafana/restic-manager-dashboard.json`:
-
-1. In Grafana, **+ → Import → Upload JSON file**.
-2. Pick the Prometheus data source you scrape with.
-3. The dashboard's six panels populate from the metrics above:
-   * **Fleet status** — online/total stat panel.
-   * **Open alerts** — by severity.
-   * **Hosts** — per-host table (last backup, repo size, snapshots, alerts).
-   * **Repo size over time** — one line per host.
-   * **Backups failing** — count of hosts whose last backup didn't succeed.
-   * **Job duration p95** — `histogram_quantile(0.95, …)` over a 1h window per kind.
-
-Alerting is intentionally not configured in the dashboard — the
-control plane already has alerts (P3-05) with native channels for
-webhook, ntfy, and SMTP. Re-implementing them in Prometheus would
-just duplicate state. If you do want Prom-side alerts, copy the
-recording rules into your usual location.
-
-## Cardinality
-
-Per scrape: O(hosts) gauge rows + O(kinds × statuses × buckets)
-histogram rows. A 100-host fleet emits roughly 700 host rows (7
-per-host gauges × 100 hosts) + 270 histogram bucket rows (9 kinds ×
-3 statuses × 10 buckets), plus one `_sum` and one `_count` row per
-kind/status pair — well below any practical limit. There are no
-`job_id` labels (cardinality bomb avoidance) and no per-source-group
-labels.
@@ -1,259 +0,0 @@
-# P2 Completion Implementation Plan
-
-> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
-
-**Goal:** Close every remaining P2 task in `tasks.md`: P2R-09 (auto-init UX), P2R-10/11/12 (hooks), P2R-13 (bandwidth wiring + per-job override), P2R-14 (schedule next/last run), P2-16 (Windows svc), P2-17 (`install.ps1`), P2-18 (announce-and-approve).
-
-**Architecture:** Server stays HTTP+WS; agent stays a single binary that auto-restages via `make build`. Hooks live on `source_groups` (and host-level defaults). Announce-and-approve adds a separate WS path (`/ws/agent/pending`) and a Pending hosts panel; token-flow stays default. Windows service support uses `golang.org/x/sys/windows/svc` behind a `//go:build windows` tag — Linux builds untouched. **Operator is away — make best guesses on small UX choices, but commit each item separately so the choices are reviewable.**
-
-**Tech Stack:** Go 1.23+, chi router, modernc/sqlite, `coder/websocket`, `robfig/cron/v3`, HTMX + Tailwind, `golang.org/x/sys/windows/svc`, Ed25519 (stdlib).
-
---
-
-## Pre-flight
-
- [ ] **Run baseline:** `go vet ./... && go build ./... && go test ./...` — must be green before starting. Restage agent + restart server (per CLAUDE.md restage block) so smoke env is warm.
-
-## Order of execution
-
-Smallest blast-radius first. UI polish → bandwidth → next/last → hooks → announce → Windows. Commit and restage at each task boundary. Run `go vet ./... && go test ./...` before every commit.
-
---
-
-## Task 1 — P2R-13a: Wire bandwidth caps into restic invocations
-
-**Files:**
- Modify: `internal/restic/runner.go` (add `LimitUploadKBps`, `LimitDownloadKBps` to `Env` or to a per-call options struct already present; emit `--limit-upload N`/`--limit-download N` on `restic backup|forget|prune|check|restore`)
- Modify: `internal/agent/runner/*.go` — pass host-wide caps into the runner. Caps come from `agent.config.Config` or are pushed via `config.update`. Decision: ship caps in the existing `config.update` envelope as new fields `bandwidth_up_kbps`, `bandwidth_down_kbps`. Server pushes on hello + on `PUT /api/hosts/{id}/bandwidth`.
- Modify: `internal/api/messages.go` — extend `ConfigUpdatePayload` with the two int pointers.
- Modify: `internal/server/ws/handler.go` (or wherever hello/config push lives) — include caps in the pushed config.
- Modify: `internal/server/http/host_bandwidth.go` — after `SetHostBandwidth`, fan out a `config.update` to the connected agent (mirror the credentials-edit path).
- Test: `internal/restic/runner_test.go` — assert flag injection.
- Test: `internal/server/ws/*_test.go` — assert config.update carries caps on hello and on edit.
-
- [ ] **Step 1.1** Add `LimitUploadKBps *int`, `LimitDownloadKBps *int` to whatever per-host config the runner already consults. Existing pattern is `restic.Env{}`; extend it.
- [ ] **Step 1.2** Failing test in `internal/restic/runner_test.go`: build a backup command with `LimitUploadKBps=1024`, assert the resulting argv contains `--limit-upload 1024`.
- [ ] **Step 1.3** Implement: prepend the flags in argv builders for `backup`, `forget`, `prune`, `check`, `restore`. Skip when nil/<=0.
- [ ] **Step 1.4** Wire `config.update` payload — server reads `Host.BandwidthUpKBps`/`DownKBps`, includes them in the existing `ConfigUpdatePayload` push on hello and on bandwidth edit (mirror cred-edit fan-out in `internal/server/http/host_credentials.go`).
- [ ] **Step 1.5** Agent applies caps: store in the in-memory dispatcher state on `config.update`, attach to every restic call.
- [ ] **Step 1.6** `go vet ./... && go test ./... && make build && <restage block>`. Commit:
-```
-agent+server: apply host bandwidth caps to restic invocations
-```
-
-## Task 2 — P2R-13b: Per-job override on Run-now confirm dialog
-
-**Decision:** A small numeric input on the per-source-group Run-now button (and dashboard Run-all). Operator is away — keep it minimal: two optional inputs (up/down KB/s) on the dispatch endpoint; UI shows a `<details>` "Limit bandwidth for this run" disclosure with two number inputs.
-
-**Files:**
- Modify: `internal/server/http/sources.go` (or wherever the per-group Run-now POST lives) — accept optional `bandwidth_up_kbps`/`bandwidth_down_kbps` form fields, pass through.
- Modify: dispatch path (`internal/server/dispatch_*.go` or `ws/handler.go` job-dispatch core) — accept overrides, include in the `command.run` payload.
- Modify: `internal/api/messages.go` — `CommandRunPayload` gains optional caps that take precedence over host-wide caps when present.
- Modify: agent dispatcher — use the payload override if present; otherwise fall back to the config caps.
- Modify: `web/templates/pages/host_sources.html` (and the schedules Run-now form) — `<details>` block.
- Test: HTTP test for the new form fields; agent runner test for override precedence.
-
- [ ] **Step 2.1** Failing test: POST to per-group Run-now with `bandwidth_up_kbps=512` → assert dispatched payload carries 512.
- [ ] **Step 2.2** Implement endpoint changes + payload extension.
- [ ] **Step 2.3** Agent override precedence test (payload wins over config).
- [ ] **Step 2.4** UI `<details>` blocks (one per Run-now form).
- [ ] **Step 2.5** Playwright spot-check via `:8080` smoke env: open Sources tab, expand the Run-now disclosure, fire with limit=128, then open the live job log and confirm the agent's restic argv (read `/tmp/rm-smoke/server.log` for the dispatched command — it logs argv) shows `--limit-upload 128`.
- [ ] **Step 2.6** Commit.
-
-## Task 3 — P2R-14: Schedule "next run" / "last run"
-
-**Files:**
- Modify: `internal/store/schedules.go` — add `NextRunAt(time.Time)` derivation helper and `LatestScheduledJobAt(host_id, schedule_id) (time.Time, error)` (or a single batched fetch for all schedules of a host).
- Modify: dashboard host row (`web/templates/partials/host_row.html`) — show "Next: …" and "Last: …" when there's a single covering schedule (already detected in slice 5).
- Modify: `web/templates/pages/host_schedules.html` — add Next/Last columns to the schedules table.
- Modify: relevant page handlers (`internal/server/http/ui_schedules.go`, dashboard handler) — populate the data.
- Test: `schedules_test.go` for next-run derivation (parse cron, compute next from a fixed `now`).
-
- [ ] **Step 3.1** Add `NextRun(cronExpr string, from time.Time) (time.Time, error)` helper using `robfig/cron/v3`'s `Parse(...).Next(from)`. Test with three crons.
- [ ] **Step 3.2** Add `LatestJobByActorKindForSchedule(host_id, schedule_id) (time.Time, status, error)` query against `jobs` (filter `actor_kind='schedule'` AND `schedule_id=?`, ORDER BY `started_at` DESC LIMIT 1).
- [ ] **Step 3.3** Wire schedules-page handler to populate Next/Last per row; render relative time + ISO tooltip (mirror existing `formatRelTime` template helper if it exists; otherwise use a simple "5m ago" helper).
- [ ] **Step 3.4** Wire dashboard row: when single covering schedule, surface "Next: 03:00" / "Last: 8h ago — succeeded".
- [ ] **Step 3.5** Playwright spot-check: a host with a schedule shows Next/Last; pause it → Next becomes "—" / "(paused)".
- [ ] **Step 3.6** Commit.
-
-## Task 4 — P2R-09: Auto-init UX polish
-
-**Files:**
- Modify: `web/templates/pages/host_repo.html` — danger-zone re-init button + two-step confirm (type the host name).
- Modify: `internal/server/http/ui_repo.go` (or new `repo_reinit.go`) — `POST /hosts/{id}/repo/reinit` admin-only, audit-logged. Server runs `restic init --force` (or wipes-then-inits — pick the safer of the two; restic doesn't truly wipe a repo, the operator must clear the bucket. **Best guess:** dispatch a normal `init` job with a flag that re-runs even if the repo claims to exist; if restic refuses, surface "the repo on the remote already has data — clear it manually before re-init" via the job log).
- Modify: host detail page header / vitals strip — surface init result line. Use the existing latest-`init`-job query to render "repo ready · initialised <relative time> ago" or "init failed · job N · retry".
- Test: HTTP test for re-init endpoint (auth, audit, host-name confirm); template test that the result line renders for both states.
-
- [ ] **Step 4.1** Add helper: `LatestJobByKind(host_id, "init")` — already exists from P2R-06 (`store.LatestJobByKind`). Reuse.
- [ ] **Step 4.2** Render init line into vitals strip; show "init failed" amber when latest init failed.
- [ ] **Step 4.3** Implement `POST /hosts/{id}/repo/reinit` handler — admin role check, requires a `confirm_hostname` form field that must equal `host.Name`, returns 400 otherwise. Dispatches a fresh `init` job.
- [ ] **Step 4.4** Add danger-zone re-init form to `host_repo.html` (currently disabled per slice 4). Two-step confirm with the typed hostname.
- [ ] **Step 4.5** Playwright: visit `/hosts/{id}/repo`, click re-init, type wrong hostname → blocked; type right hostname → dispatches init job → returns to live log.
- [ ] **Step 4.6** Commit.
-
-## Task 5 — P2R-10: Hook schema (migration 0010)
-
-**Files:**
- Create: `internal/store/migrations/0010_hooks.sql`
-  - `ALTER TABLE source_groups ADD COLUMN pre_hook BLOB;`  (AEAD ciphertext, NULLable)
-  - `ALTER TABLE source_groups ADD COLUMN post_hook BLOB;`
-  - `ALTER TABLE hosts ADD COLUMN pre_hook_default BLOB;`
-  - `ALTER TABLE hosts ADD COLUMN post_hook_default BLOB;`
-  - All four are AEAD ciphertext (existing `crypto.AEAD`); BLOB column type.
- Modify: `internal/store/types.go` — add `PreHook *string` (decrypted), `PostHook *string` to `SourceGroup`; same to `Host`.
- Modify: `internal/store/sources.go` + `internal/store/hosts.go` — getters/setters encrypt on write, decrypt on read. Pass `crypto.AEAD` through (pattern mirrors `host_credentials.go`).
- Test: encrypt/decrypt round-trip; setting `nil` clears the column.
-
- [ ] **Step 5.1** Write migration SQL. Column-level ALTERs only (per CLAUDE.md).
- [ ] **Step 5.2** Update store types + getters/setters with AEAD encrypt/decrypt. Mirror `internal/store/host_credentials.go` patterns exactly.
- [ ] **Step 5.3** Round-trip test: set hook on a source group; reload; assert plaintext returned. Set nil; assert nil after reload.
- [ ] **Step 5.4** `go vet && go test`. Commit.
-
-## Task 6 — P2R-11: Agent execution of hooks
-
-**Files:**
- Modify: `internal/api/messages.go` — `ConfigUpdatePayload` (or the per-source-group bundle inside `ScheduleSetPayload`) carries `PreHook`, `PostHook` plaintext (server has decrypted by then; wire is authenticated WS, same trust boundary as repo creds).
- Modify: agent dispatcher — for `kind=backup` only:
-  - Run `pre_hook` (if present) via `os/exec` with the host shell (`/bin/sh -c` on Linux, `cmd.exe /C` on Windows). Capture stdout+stderr → JobLog with `hook:` prefix. Non-zero exit aborts the backup, marks the job failed with `pre_hook` error.
-  - Run `post_hook` (if present) **always** after the backup, with `RM_JOB_STATUS=succeeded|failed` env var. Capture into JobLog, prefix `hook:`. Non-zero exit on post_hook does NOT change job status (warning logged).
- Skip both for `kind` ∈ {forget, prune, check, unlock, init} per spec.md §14.3.
- Test: dispatcher test with a `pre_hook` that exits 1 → backup not started; `post_hook` always runs and sees `RM_JOB_STATUS`.
-
- [ ] **Step 6.1** Plumb hooks through `ScheduleSetPayload` source-group bundle + per-group Run-now `command.run` payload (override host-default with group hook if both present). Server-side resolution: host default if group hook is empty.
- [ ] **Step 6.2** Agent dispatcher: factor hook execution into `internal/agent/runner/hooks.go`. Use `exec.CommandContext`, set env, plumb output to existing JobLog stream with `Source: "hook"` (or prefix the log lines `hook: …`).
- [ ] **Step 6.3** Failing test in `internal/agent/runner/runner_test.go` (create file if absent): `pre_hook=/bin/false` → job fails with `pre_hook failed (exit 1)` and the actual restic backup never runs (assert via mock-restic shim).
- [ ] **Step 6.4** Test: `post_hook` runs even when backup fails; receives `RM_JOB_STATUS=failed`.
- [ ] **Step 6.5** Test: hooks skipped on `forget`/`prune`/`check`/`unlock` jobs.
- [ ] **Step 6.6** `go vet && go test && make build && <restage block>`. Commit.
-
-## Task 7 — P2R-12: Hook editor UI
-
-**Files:**
- Modify: `web/templates/pages/source_group_edit.html` (new or extend existing source-group form) — `<textarea>` for pre_hook, `<textarea>` for post_hook, with the warning banner: "this hook runs as the agent service user (root on Linux; LocalSystem on Windows)".
- Modify: source-group HTTP handler (`internal/server/http/sources.go`) — accept hook fields on POST/PUT, encrypt-and-persist via store.
- Create: host-default hook editor. A new "Settings" tab section on host detail was considered, but that tab is still inert (per P1-25), as was a new sub-tab. **Decision:** add `pre_hook_default` / `post_hook_default` to the Repo page under a new "Hooks" section, since Settings is still inert.
- Modify: source-group form admin-only check; post-only edit allowed by operators? **Decision:** admin-only edit per spec; render but disable for operators.
- Modify: audit-log writer — emit `source_group.hook_updated` and `host.default_hook_updated` events (without the hook body).
- Test: HTTP test for create + update; admin-only enforcement; audit row written without secret.
-
- [ ] **Step 7.1** Source-group form extension + handler wiring.
- [ ] **Step 7.2** Repo page Hooks section (host defaults).
- [ ] **Step 7.3** Audit entries.
- [ ] **Step 7.4** Playwright: as admin, set a `pre_hook` of `echo hello`, fire Run-now, open live log, confirm `hook: hello` line appears.
- [ ] **Step 7.5** Commit.
-
-## Task 8 — P2-18a: Announce schema + endpoint
-
-**Files:**
- Create: `internal/store/migrations/0011_pending_hosts.sql`
-  ```sql
-  CREATE TABLE pending_hosts (
-    id                 TEXT PRIMARY KEY,
-    hostname           TEXT NOT NULL,
-    os                 TEXT NOT NULL,
-    arch               TEXT NOT NULL,
-    agent_version      TEXT NOT NULL,
-    restic_version     TEXT NOT NULL,
-    public_key         BLOB NOT NULL,             -- 32-byte Ed25519
-    fingerprint        TEXT NOT NULL,             -- "SHA256:hex"
-    announced_from_ip  TEXT NOT NULL,
-    first_seen_at      TEXT NOT NULL,
-    last_seen_at       TEXT NOT NULL,
-    expires_at         TEXT NOT NULL
-  );
-  CREATE INDEX pending_hosts_expires ON pending_hosts(expires_at);
-  CREATE INDEX pending_hosts_fingerprint ON pending_hosts(fingerprint);
-  ```
- Create: `internal/store/pending_hosts.go` — `CreatePendingHost`, `GetPendingHostByFingerprint`, `ListPendingHosts`, `DeletePendingHost`, `TouchPendingHost`, `DeleteExpiredPendingHosts`.
- Create: `internal/server/http/announce.go` — `POST /api/agents/announce` accepts `{hostname, os, arch, agent_version, restic_version, public_key (base64)}`. Validates protocol_version implicitly via `agent_version` check. Token-bucket rate limit per source IP (10/min). Global cap 100 pending rows. Returns `{fingerprint, pending_id, hostname_collision: bool}`.
- Test: `announce_test.go` — happy path; rate limit; cap; collision flag.
-
- [ ] **Step 8.1** Migration + store layer + tests.
- [ ] **Step 8.2** Endpoint + tests (use a fake clock + in-process token bucket).
- [ ] **Step 8.3** Commit.
-
-## Task 9 — P2-18b: Pending WS + accept/reject
-
-**Files:**
- Create: `internal/server/ws/pending.go` — `GET /ws/agent/pending` upgrade. Server issues a 32-byte nonce; agent signs it with its Ed25519 private key; server verifies against the `public_key` stored on the pending row keyed by the supplied `pending_id`. If valid, hold the connection open; on accept, push a single `enrolled` message containing `{bearer_token, repo_credentials_aead_blob}` and close cleanly. On reject, close with code 4001 + reason "rejected".
- Create: `internal/server/http/pending.go` — admin-only `POST /api/pending-hosts/{id}/accept` (atomically: mint bearer, AEAD-encrypt the admin-supplied repo creds passed in the form, promote pending row → real `hosts` row, push `enrolled` to the open WS, audit-log) and `POST /api/pending-hosts/{id}/reject` (delete row + close socket).
- Modify: server `main.go` route registration.
- Test: integration test — fake agent opens pending WS, admin POST /accept, agent receives bearer.
-
- [ ] **Step 9.1** Pending WS handler with nonce-sign verify.
- [ ] **Step 9.2** Accept/reject endpoints. Accept reuses the existing token-consume path internally (mints persistent bearer from `crypto.RandomToken`-style helper, inserts host row + `host_credentials`).
- [ ] **Step 9.3** Tests.
- [ ] **Step 9.4** Commit.
-
-## Task 10 — P2-18c: Agent announce path
-
-**Files:**
- Modify: `cmd/agent/main.go` — when `RM_TOKEN` is unset, switch to announce mode instead of erroring out. `RM_SERVER` still required.
- Create: `internal/agent/announce/announce.go` — generate-or-load Ed25519 keypair (persisted as a file alongside `secrets.enc`, mode 0600). POST `/api/agents/announce`. Open `/ws/agent/pending`. Wait. On `enrolled` message, persist bearer to `agent.yaml`, persist repo creds via existing secrets store, exit announce mode and reconnect via the normal WS path.
- Modify: `deploy/install/install.sh` — when `RM_TOKEN` is missing, run agent in announce mode and `journalctl --follow` until the agent prints the fingerprint, print it to the operator's terminal in big copy-friendly format, then keep following until enrolled.
- Test: end-to-end test in `internal/server/...` using a fake agent.
-
- [ ] **Step 10.1** Keypair generation + persistence.
- [ ] **Step 10.2** Announce client + pending WS client; print `SHA256:…` fingerprint to stdout in a banner.
- [ ] **Step 10.3** Install script branch.
- [ ] **Step 10.4** Playwright: register a host via announce mode (run agent locally with no RM_TOKEN), log into UI, see Pending hosts panel with the fingerprint, click Accept, confirm host appears.
- [ ] **Step 10.5** Commit.
-
-## Task 11 — P2-18d: Pending hosts UI panel
-
-**Files:**
- Modify: `web/templates/pages/dashboard.html` — add Pending hosts panel above the host list when any pending rows exist.
- Modify: dashboard handler — `Store.ListPendingHosts(now)` (auto-skips expired).
- Add buttons → POST `/api/pending-hosts/{id}/accept` and `/reject` via HTMX.
- Background sweeper for `DeleteExpiredPendingHosts` every 60s (mirror the existing offline-sweeper goroutine pattern).
-
- [ ] **Step 11.1** Sweeper goroutine.
- [ ] **Step 11.2** Dashboard handler + template.
- [ ] **Step 11.3** Accept form must include the same repo URL/user/pw fields as the token-mint form (admin still supplies repo creds at accept time).
- [ ] **Step 11.4** Playwright sweep.
- [ ] **Step 11.5** Commit.
-
-## Task 12 — P2-16: Windows service integration
-
-**Decision:** Cannot test on Windows from WSL. Goal is a clean compile under `GOOS=windows GOARCH=amd64` and code that follows the canonical `golang.org/x/sys/windows/svc/example` pattern. Untestable beyond compile + manual review; mark in commit message.
-
-**Files:**
- Create: `internal/agent/service/service_windows.go` (build tag `//go:build windows`) — implements `svc.Handler`. `Execute` starts the agent's main loop in a goroutine, listens for `svc.Stop`/`svc.Shutdown`, cancels ctx, waits.
- Create: `internal/agent/service/service_other.go` (build tag `//go:build !windows`) — stub `RunService` that just runs the agent loop in the foreground.
- Create: `internal/agent/service/install_windows.go` — `Install`, `Uninstall`, `Start`, `Stop` thin wrappers around `mgr` package.
- Modify: `cmd/agent/main.go` — sub-commands: `install`, `uninstall`, `start`, `stop`, `run` (default). `run` delegates to `service.Run()` which on Windows checks `svc.IsWindowsService()` and dispatches accordingly.
- Test: `internal/agent/service/service_windows_test.go` (build-tagged) for argv parsing only — actual SCM interaction can't be tested in CI.
-
- [ ] **Step 12.1** Implement the svc.Handler shell.
- [ ] **Step 12.2** Install/uninstall wrappers (use `mgr.ConnectLocal()`, `m.CreateService(name, exepath, mgr.Config{...}, "run")`).
- [ ] **Step 12.3** Cross-compile check: `GOOS=windows GOARCH=amd64 go build ./cmd/agent` must succeed.
- [ ] **Step 12.4** Commit with note "untested on Windows; compile-verified only".
-
-## Task 13 — P2-17: install.ps1
-
-**Files:**
- Create: `deploy/install/install.ps1` — PowerShell 5.1+ compatible. Checks admin elevation. Downloads agent binary from `$RM_SERVER/agent/binary?os=windows&arch=amd64`. Drops it at `C:\Program Files\restic-manager\restic-manager-agent.exe`. Runs `restic-manager-agent.exe install` (registers service). Starts it. Detects existing tasks named `*restic*` via `Get-ScheduledTask` and prints them — does not auto-disable. Writes `C:\ProgramData\restic-manager\agent.yaml` with `RM_SERVER` + `RM_TOKEN` (or no token if announce-mode).
- Modify: `internal/server/http/install.go` (or wherever install scripts are served) to also serve `/install/install.ps1`.
- Modify: CLAUDE.md restage block to also stage `install.ps1`.
-
- [ ] **Step 13.1** Write the script.
- [ ] **Step 13.2** Wire serving + restage.
- [ ] **Step 13.3** Smoke parse (requires pwsh on PATH): run `pwsh -NoProfile -Command "Get-Command -Syntax (Get-ChildItem deploy/install/install.ps1)"`; alternatively, do a `Set-StrictMode`-style parse-only check via `pwsh -c "$null = [scriptblock]::Create((Get-Content deploy/install/install.ps1 -Raw))"`. Skip both if no pwsh is available — note in commit.
- [ ] **Step 13.4** Commit.
-
-## Task 14 — Final integration sweep
-
- [ ] **Step 14.1** `go vet ./... && go test ./... -race`. Full build. Restage. Restart server.
- [ ] **Step 14.2** Playwright walkthrough on `:8080`: login → dashboard shows pending-hosts empty state → create source group → set a `pre_hook` → Run-now with bandwidth override → confirm hook fires + bandwidth applied → schedules tab shows next/last → repo page shows init-OK line → re-init flow gated by typed hostname.
- [ ] **Step 14.3** Update `tasks.md`: tick P2R-09, P2R-10, P2R-11, P2R-12, P2R-13, P2R-14, P2-16, P2-17, P2-18 done. Update Phase 2 acceptance line items as satisfied.
- [ ] **Step 14.4** Open PR `p2-completion → main` with a summary of every item closed.
-
---
-
-## Decisions made on the operator's behalf (away)
-
-1. **Bandwidth UI for per-job override:** small `<details>` disclosure under each Run-now button. Simpler than a modal; matches the rest of the app's progressive-disclosure style.
-2. **Re-init UX:** server dispatches a fresh `init` job; if restic refuses because the repo already exists, surfaces the error in the job log and instructs the operator to clear the remote bucket. We don't try to forcibly wipe — too dangerous, and the agent doesn't have credentials to wipe S3/B2/etc generically.
-3. **Hooks editor lives on the Repo page (host defaults) + on the source-group edit form (per-group override).** Skips inventing a new "Settings" tab since that surface is still inert.
-4. **Announce flow:** admin still supplies repo creds at accept time (same form as the token-mint flow). The pending row only carries identity-of-the-endpoint material, never repo creds.
-5. **Windows service:** compile-verified only; untested. Commit message will say so.
@@ -1,131 +0,0 @@
-# P5-03 implementation plan — Docker-only release
-
-Spec: `docs/superpowers/specs/2026-05-05-p5-03-docker-only-release.md`.
-
-Branch: `p5-03-docker-release`. Do not auto-open a PR (see CLAUDE.md
-memory: CI runs are expensive on the self-hosted cluster).
-
---
-
-## Slice 1 — Server config + handler fallback
-
-**Goal:** server can serve agent binaries / install scripts from a
-read-only "bundled assets" path when `<DataDir>` doesn't have them.
-
-1. `internal/server/config/config.go` (or wherever `Cfg` lives) gains
-   a `BundledAssetsDir string` field, defaulting to
-   `/opt/restic-manager/dist`. Wire from `RM_BUNDLED_ASSETS_DIR` env
-   var, mirroring the existing env-var conventions.
-2. `internal/server/http/agent_assets.go`:
-   - `handleAgentBinary`: try `<DataDir>/agent-binaries/<name>`
-     first; on `os.Stat` ENOENT, try
-     `<BundledAssetsDir>/agent-binaries/<name>`; on second ENOENT,
-     existing 404.
-   - `handleInstallAsset`: same dual-path, with `install/` subpath.
-3. Tests in `internal/server/http/agent_assets_test.go` (new file):
-   - DataDir hit serves DataDir bytes.
-   - DataDir miss + bundled hit serves bundled bytes.
-   - DataDir hit shadows bundled.
-   - Both miss → 404 + existing error envelope.
-   - Path-traversal still rejected for `install/*` (regression check).
-
-**Verify:** `go vet ./...` + `go test ./internal/server/http/...`.
-
---
-
-## Slice 2 — Version ldflags on both binaries
-
-1. `cmd/server/main.go`: keep `var version`, add
-   `var commit = "none"` and `var date = "unknown"`. Surface via
-   existing version-log line.
-2. `cmd/agent/main.go`: same three vars. Agent already reports
-   `agent_version` in the WS hello — extend to include commit if
-   it's already plumbed through `internal/api`; otherwise leave the
-   commit out of the wire and just log it on startup.
-3. `Makefile`: extend the `make build` `-ldflags` to set all three
-   from `git describe --tags --always` + `git rev-parse HEAD` +
-   UTC timestamp. Source-build users get real values, not "dev".
-4. `deploy/Dockerfile.server`: add `ARG COMMIT=none` and
-   `ARG DATE=unknown`; pass through `-ldflags`.
-
-**Verify:** `make build && ./bin/restic-manager-server -version`
-(or whatever the existing flag is) prints non-`dev` values.
-
---
-
-## Slice 3 — Dockerfile bakes agents + install assets
-
-1. Build stage cross-compiles three agents:
-
-   ```dockerfile
-   RUN go build -trimpath -ldflags="-s -w \
-         -X main.version=${VERSION} -X main.commit=${COMMIT} -X main.date=${DATE}" \
-       -o /out/agent/restic-manager-agent-linux-amd64 ./cmd/agent
-   ENV GOARCH=arm64
-   RUN go build ... -o /out/agent/restic-manager-agent-linux-arm64 ./cmd/agent
-   ENV GOOS=windows GOARCH=amd64
-   RUN go build ... -o /out/agent/restic-manager-agent-windows-amd64.exe ./cmd/agent
-   ```
-
-   (Reset `GOOS`/`GOARCH` between layers via `ENV`. Server build
-   stays at `GOOS=linux GOARCH=$TARGETARCH`.)
-
-2. Final stage `COPY --from=build`:
-   - `/out/restic-manager-server` → `/usr/local/bin/`
-   - `/out/agent/*` → `/opt/restic-manager/dist/agent-binaries/`
-   - `deploy/install/install.sh` →
-     `/opt/restic-manager/dist/install/install.sh`
-   - `deploy/install/install.ps1` →
-     `/opt/restic-manager/dist/install/install.ps1`
-   - `deploy/install/restic-manager-agent.service` →
-     `/opt/restic-manager/dist/install/restic-manager-agent.service`
-
-3. Set `--chmod=0755` on the agent binaries and `install.sh`,
-   `--chmod=0644` on the unit file and `install.ps1`. Distroless
-   final stage runs as `nonroot`; bundled assets are readable by
-   anyone (mode `o+r`), so the user switch doesn't break reads.
-
-**Verify:**
-```sh
-docker build -f deploy/Dockerfile.server -t rm:dev .
-docker run --rm -d -p 18080:8080 \
-    -e RM_LISTEN=:8080 -e RM_DATA_DIR=/data \
-    -e RM_BASE_URL=http://127.0.0.1:18080 \
-    -v rm-test:/data rm:dev
-curl -fsSL "http://127.0.0.1:18080/agent/binary?os=linux&arch=amd64" | wc -c
-curl -fsSL "http://127.0.0.1:18080/install/install.sh" | head -1
-```
-
-Both should succeed against a fresh volume (no operator staging).
-
---
-
-## Slice 4 — Release workflow
-
-`.gitea/workflows/release.yml` per the spec. Two jobs:
-
-1. **`image`**: checkout → setup-qemu → setup-buildx → login → compute
-   tags → buildx build+push.
-2. (Future) `release-notes`: stub left as a TODO comment for now.
-   Operator can hand-write release notes via the Gitea UI on first
-   cut.
-
-The `compute tags` shell step is the only non-trivial bit; tested
-inline by running the script with mocked `GITHUB_REF_TYPE` /
-`GITHUB_REF_NAME` env vars before committing.
-
-**Verify on first dispatch:** trigger `workflow_dispatch` from the
-Gitea UI, check the runner produces `:snapshot-<sha>` and pushes
-multi-arch.
-
---
-
-## Slice 5 — Tasks.md + commit + push
-
-1. `tasks.md`: tick P5-03; add a one-line note that goreleaser was
-   dropped in favour of Docker-only after a 2026-05-05 design pass
-   (link the spec).
-2. `git add -A && git commit -m "p5-03: docker-only release path"`
-   (no Co-Authored-By trailer — CLAUDE.md rule).
-3. `git push -u origin p5-03-docker-release`.
-4. **Stop.** Do not open a PR. Wait for operator review.
@@ -1,61 +0,0 @@
-# Plan — P6-04 + P6-05 Prometheus metrics + Grafana dashboard
-
-Spec: `docs/superpowers/specs/2026-05-07-p6-04-05-prometheus-metrics-design.md`
-
-## Step 1 — Config wiring
-
- Add fields to `internal/server/config/config.go`:
-  - `MetricsToken string` (yaml `metrics_token`)
-  - `MetricsTrustedCIDRs []string` (yaml `metrics_trusted_cidrs`)
-  - method `(c Config) MetricsAuthEnabled() bool` returning true iff at least one of the two is configured.
- Env loading: `RM_METRICS_TOKEN` and `RM_METRICS_TRUSTED_CIDR` (comma-separated list of CIDRs).
- `validate()` extension: ensure each CIDR parses (reuse the same `netip.ParsePrefix` pattern that already validates `TrustedProxies`).
- Tests: extend `config_test.go` covering both env vars + happy/sad CIDR.
-
-## Step 2 — `internal/server/metrics` package
-
- `Registry` struct: `sync.Mutex`, `map[jobKey]*histogramState` where `jobKey = struct{kind,status string}`.
- `ObserveJob(kind, status string, dur time.Duration)` — clamps negative durations to 0; locks; bumps the right bucket + sum + count.
- `Snapshot() Snapshot` — copies state under lock; returns plain value type.
- `Snapshot` carries `Histogram` rows (kind, status, buckets, sum, count) and accepts the rest from the caller (host rows, alert counts, build info).
- `Render(w io.Writer, s Snapshot) error` — emits standard text exposition with stable line ordering. No external dep; manual escape of `\` `"` `\n` in label values per the Prom format spec.
- Unit tests: golden render, concurrent observe, bucket boundaries.
-
-## Step 3 — HTTP handler
-
- New `internal/server/http/metrics.go`:
-  - `(s *Server) handleMetrics(w, r)` — calls `authoriseMetricsScrape`, then `gatherSnapshot(ctx)` then `metrics.Render`.
-  - `authoriseMetricsScrape(r, cfg) (ok bool, status int)` — pure helper; bearer token compared with `subtle.ConstantTimeCompare`; CIDR check on `r.RemoteAddr` first, then `X-Forwarded-For` if a trusted proxy fronted us (mirror `realIP`'s logic; the simplest path is to call the same `chi/middleware.RealIP`-aware lookup that the existing handlers use).
-  - `gatherSnapshot(ctx)` — assembles the snapshot from `Store.ListHosts`, `Store.ListAlerts({Status:"open"})`, the metrics registry, and `version.Version`/`version.Commit`/`runtime.Version()`.
- Route mounted in `server.go` only if `s.deps.Cfg.MetricsAuthEnabled()`.
- `Deps` grows a `Metrics *metrics.Registry` field; nil-tolerant in handlers.
-
-## Step 4 — Hook job-finished
-
- `internal/server/ws/handler.go`:
-  - `HandlerDeps` grows `Metrics *metrics.Registry`.
-  - In the `MsgJobFinished` branch, after the `GetJob` lookup we already do, observe `(job.Kind, p.Status.String(), p.FinishedAt.Sub(deref(job.StartedAt)))`. Skip if `job.StartedAt` is nil (rare race).
- `cmd/server` wires the registry into both `Deps` and `HandlerDeps` from a single instance.
-
-## Step 5 — Tests
-
- `internal/server/metrics/registry_test.go` — observe + snapshot determinism.
- `internal/server/metrics/render_test.go` — golden output for a fixed snapshot.
- `internal/server/http/metrics_test.go` — auth matrix (six cases per the spec) using the existing `newTestServer` fixture pattern. Render snapshot includes ≥1 host so we exercise the gather path end-to-end.
-
-## Step 6 — Docs + dashboard (P6-05)
-
- `docs/prometheus.md` — enable + scrape config + metric reference + dashboard import.
- `deploy/grafana/restic-manager-dashboard.json` — six-panel dashboard. Hand-authored against Grafana 11 dashboard schema (uid, schemaVersion, panels with `targets[].expr`, datasource as variable). Validated by importing into Grafana — but since we can't run Grafana in CI, the structural sniff test is just that the JSON parses and contains the expected panel titles + datasource variable.
-
-## Step 7 — Tasks.md + verification
-
- Strike P6-04, P6-05 in `tasks.md`; add an "as shipped" note mirroring the prior P6 entries.
- Run `go vet ./...`, `go test ./...`, `make build`.
- Push branch (no PR per standing instruction).
-
-## Risk register
-
- **CIDR check for proxied scrapes** — easy to mis-implement, easy to mis-document. The handler test must exercise both "direct hit" and "X-Forwarded-For" paths.
- **Histogram lock contention** — every job finish takes the mutex. Job throughput is low (a few/min/host max), and `ObserveJob` is a couple of map lookups; no risk in practice.
- **Dashboard JSON drift** — Grafana versions evolve. Pinning `schemaVersion` and using only well-supported panel types (timeseries, stat, table) keeps the import working across recent versions.
@@ -1,473 +0,0 @@
-# P3 — Alerts (design)
-
-> Phase 3 sub-spec covering the alerts engine, notification channels, and UI
-> (P3-05 / P3-06 / P3-07).
->
-> Wireframe: `_diag/p3-alerts-wireframe/wireframe.html`. Screenshots in the
-> same directory. Spec brainstorm ran 2026-05-04; user approved all ten
-> design decisions before this spec was written.
-
-## Scope locked
-
-Brainstorm decisions (in order asked):
-
-1. **Rule model.** Hardcoded rule set, no operator-tunable thresholds in v1.
-   The engine knows about each rule type internally; per-rule config can land
-   later if/when an operator asks.
-2. **Rule set.** Six rules: `backup_failed`, `forget_failed`, `prune_failed`,
-   `check_failed`, `stale_schedule`, `agent_offline`.
-3. **Engine cadence.** Hybrid. Event hooks at the existing
-   `MarkJobFinished` and offline-sweeper sites for the immediate triggers;
-   one 60-second ticker handles stale-schedule detection and auto-resolution.
-4. **Resolution.** Auto-resolve when the underlying condition clears + manual
-   Resolve at any time. Acknowledge is a separate "I've seen it" intermediate
-   state that does NOT close the alert.
-5. **v1 channels.** Webhook + native ntfy + SMTP. Apprise deferred (the
-   channel plumbing accepts new kinds without reshaping). SMTP added as
-   a first-class channel post-brainstorm because the use case — overnight
-   alerts the operator wants to read in the morning rather than be pinged
-   about at 03:00 — is poorly served by ntfy's push model and clumsy via
-   a webhook → email-gateway bridge.
-6. **Channel scope.** Global only. No per-host or per-severity routing in v1.
-7. **Notification body.** Structured JSON for webhooks, formatted
-   title+body+click-URL for ntfy, plus a per-channel "Send test notification"
-   button with inline result feedback.
-8. **Deduplication.** Open-alert uniqueness on `(host_id, kind)` with a
-   `last_seen_at` bump on every confirming tick. One notification per
-   occurrence; the UI shows "still happening · Ns ago" while a rule keeps
-   matching.
-9. **Alert UI.** Top-level `/alerts` page (the existing nav stub becomes
-   real). Per-host vitals "Open alerts" cell links to `/alerts?host_id=...`.
-   Channel CRUD lives at `/settings/notifications`.
-10. **Delivery semantics.** Best-effort fire-and-forget with a 5s timeout
-    per notification. Failures are logged but not retried. The alert row in
-    the DB is the source of truth.
-
-## Architecture
-
-The subsystem is three loosely-coupled units behind one `AlertEngine`
-goroutine:
-
-```
-                                 ┌───────────────────────────┐
-   event hooks ─────────────────►│                           │
-                                 │   AlertEngine             │ ──► raise/resolve
-   60s ticker ──────────────────►│   (rule evaluation)       │     alert row
-                                 │                           │
-                                 └────────────┬──────────────┘
-                                              │
-                                              ▼
-                                  ┌──────────────────────┐
-                                  │   notification.Hub   │
-                                  │   (fire-and-forget)  │
-                                  └──┬────────┬──────────┘
-                                     │        │
-                              ┌──────▼──┐  ┌──▼──────┐
-                              │ Webhook │  │  Ntfy   │  …future channels
-                              └─────────┘  └─────────┘
-```
-
-### Component boundaries
-
-| Component                                | Purpose                                                                                  | Depends on                             |
-| ---------------------------------------- | ---------------------------------------------------------------------------------------- | -------------------------------------- |
-| `internal/alert.Engine`                  | Owns the rule evaluation. Exposes `OnJobFinished`, `OnHostOffline`, `OnHostOnline` event hooks; runs a 60s ticker for stale-schedule + auto-resolution sweeps. Persists raises/resolves through the store. | store, notification.Hub, slog          |
-| `internal/alert.Rule` + per-rule files   | Each of the six rules is a small struct with `Kind() string`, `Severity() string`, `MessageFor(ctx) string`. The engine iterates over a registered slice. | store models                           |
-| `internal/notification.Hub`              | Receives "alert raised/resolved/test" events; fans out to enabled channels in parallel; logs results to a new `notification_log` table.        | store, channel adapters                |
-| `internal/notification.Channel` (iface)  | Single method `Send(ctx, payload) error` with a 5s context for HTTP channels, 10s for SMTP. Three impls in v1: `webhookChannel`, `ntfyChannel`, `smtpChannel`. | http.Client; net/smtp + crypto/tls for SMTP |
-| `internal/store/alerts.go`               | CRUD on `alerts` table: `RaiseOrTouch(host_id, kind, severity, message)`, `Acknowledge(id, user)`, `Resolve(id, by user)`, `AutoResolve(host_id, kind)`, `ListAlerts(filter)`, plus the `last_seen_at` bump. | sqlite                                 |
-| `internal/store/notification_channels.go` | CRUD on `notification_channels` (new table) + `notification_log` (new table).            | sqlite, crypto.AEAD (for secrets)      |
-| `internal/server/http/ui_alerts.go`      | `/alerts` page handler + filter parsing + ack/resolve form actions.                      | store                                  |
-| `internal/server/http/ui_notifications.go` | `/settings/notifications` page + channel CRUD + "Send test" handler.                   | store, notification.Hub                |
-
-### Engine event shape
-
-The engine runs as one goroutine per server process started in
-`cmd/server/main.go`. It exposes a small set of channels other code writes to:
-
-```go
-type Engine struct {
-    store *store.Store
-    hub   *notification.Hub
-
-    // Event channels (buffered, drop-on-full with a slog warning to keep
-    // hot paths non-blocking). The engine drains them on its own
-    // goroutine, evaluates the rule, and acts.
-    jobFinished chan jobFinishedEvent  // from store.MarkJobFinished hook
-    hostOffline chan string            // host_id; from offline sweeper
-    hostOnline  chan string            // host_id; from ws handler hello
-
-    // 60s ticker drives stale-schedule + auto-resolution sweeps.
-    tick *time.Ticker
-}
-```
-
-The hot-path call sites (`store.MarkJobFinished`, `ws.handler` offline
-sweep, `ws.handler` hello) push to these channels via a tiny
-`Engine.Notify*` method that does a non-blocking send. The engine's own
-goroutine handles every match — keeps mutation off the hot path.
-
-### Rule catalogue
-
-| Kind                | Severity | Trigger                                                                 | Auto-resolve when                                  |
-| ------------------- | -------- | ----------------------------------------------------------------------- | -------------------------------------------------- |
-| `backup_failed`     | warning  | `MarkJobFinished` with kind=backup, status=failed                       | next backup for the same host succeeds             |
-| `forget_failed`     | warning  | `MarkJobFinished` with kind=forget, status=failed                       | next forget for the same host succeeds             |
-| `prune_failed`      | warning  | `MarkJobFinished` with kind=prune, status=failed                        | next prune for the same host succeeds              |
-| `check_failed`      | critical | `MarkJobFinished` with kind=check, status=failed OR errors_found        | next check for the same host succeeds without errors |
-| `stale_schedule`    | warning  | 60s ticker: a schedule's next-fire time is more than 5 minutes in the past with no matching job since | next job for that schedule succeeds OR schedule deleted |
-| `agent_offline`     | warning  | offline-sweeper marks the host offline AND the host has been offline > 15 min (engine checks `last_seen_at`) | hostOnline event for that host                     |
-
-The 15-minute floor on `agent_offline` exists so a 30-second blip during
-agent restart doesn't generate a notification storm. The store's existing
-offline sweeper (`hosts.last_seen_at` with 90s threshold) already marks the
-host offline; the engine sees the event but waits for the 15-minute floor
-before raising.
-
-### Dedup + last_seen_at
-
-`store.RaiseOrTouch(host_id, kind, severity, message)`:
-
-```sql
-SELECT id, last_seen_at FROM alerts
- WHERE host_id = ? AND kind = ? AND resolved_at IS NULL
- LIMIT 1;
-```
-
- Found: `UPDATE alerts SET last_seen_at = ?, message = ? WHERE id = ?`,
-  return `(id, didRaise=false)`.
- Not found: `INSERT INTO alerts (id, host_id, kind, severity, message,
-  created_at, last_seen_at) VALUES (?, ?, ?, ?, ?, ?, ?)`, return
-  `(id, didRaise=true)`.
-
-The engine fires a notification through the Hub only when `didRaise=true`.
-Touch-only events keep the row's `last_seen_at` fresh so the UI can render
-"still happening · Ns ago" without spamming the operator's phone.
-
-### Notification payload shapes
-
-**Webhook** — a single JSON envelope per event:
-
-```json
-{
-  "event":     "alert.raised",
-  "alert_id":  "01KQT...",
-  "severity":  "warning",
-  "kind":      "backup_failed",
-  "host_id":   "01KQ...",
-  "host_name": "alfa-01",
-  "message":   "Backup 'system-config' failed: rest-server returned 401",
-  "raised_at": "2026-05-04T15:42:01Z",
-  "link":      "https://restic-manager.example/alerts/01KQT..."
-}
-```
-
-`event` is one of `alert.raised | alert.acknowledged | alert.resolved |
-alert.test`. The same envelope shape is reused across events — operators
-build one bridge, switch on `event` and `severity`.
-
-**SMTP** — single-recipient plain-text email per channel. The channel
-config carries the SMTP server credentials and a `to` address; one
-channel = one recipient (or one distribution-list address). Operators
-who want multiple recipients add multiple channels — keeps the config
-flat and the failure modes per-recipient.
-
-Subject pattern is hardcoded (no per-channel template in v1):
-
-```
-Subject: [restic-manager] [<severity>] <host_name>: <kind>
-From: <configured-from-address>
-To: <configured-to-address>
-Date: <RFC 5322>
-Message-ID: <alert_id@<server-host>>
-
-<message line — same string the webhook/ntfy gets>
-
-—
-Raised at: 2026-05-04T15:42:01Z
-Severity:  warning
-Host:      alfa-01
-Kind:      backup_failed
-
-Open in restic-manager:
-https://restic-manager.example/alerts/01KQT...
-
-(This message was sent by restic-manager. Acknowledge or resolve in the UI.)
-```
-
-The body is plain text only in v1 — no HTML alternative — both because
-the data is already structured well enough as text and because HTML
-email opens a long tail of rendering / sanitisation concerns. The
-`Message-ID` includes the alert id so a thread-aware client can group
-related events (raised → acknowledged → resolved) together.
-
-Encryption:
- **STARTTLS** (default, port 587). Opportunistic upgrade. Used by most
-  operator-facing relays.
- **Implicit TLS** (port 465). Connect-then-TLS-handshake.
- **None** (port 25). Plain. Hidden behind a "Yes I understand" warning
-  on the form because the password goes over the wire.
-
-Auth:
- **PLAIN** (RFC 4616) over TLS. Default and almost always what's wanted.
- **CRAM-MD5** (RFC 2195). Offered if the server advertises it, no UI
-  toggle — automatic.
- No OAuth2 / XOAUTH2 in v1; that's a real next step if Gmail-without-
-  app-passwords becomes a recurring ask.
-
-Per-message timeout is 10s (vs 5s for HTTP channels) — STARTTLS
-handshake + DATA over a slow link can legitimately take that long.
-
-**Ntfy** — uses the standard publish format:
-
-```
-POST /<topic> HTTP/1.1
-Host: <server>
-Authorization: Bearer <access-token>   (if configured)
-Title: [warning] alfa-01 backup failed
-Priority: 4
-Tags: warning,backup_failed
-Click: https://restic-manager.example/alerts/01KQT...
-
-Backup 'system-config' failed: rest-server returned 401
-```
-
-Severity → priority mapping:
-
-| Severity  | Priority |
-| --------- | -------- |
-| info      | 3 (default) |
-| warning   | 4 (high)    |
-| critical  | 5 (urgent)  |
-
-The per-channel `default_priority` setting overrides this mapping for
-non-critical alerts; critical always goes urgent regardless.
-
-### Test notification
-
-`POST /api/notifications/{channel_id}/test` builds a synthetic event
-(severity=info, kind=test_notification, message="Test from
-restic-manager", link to the channel's edit page) and runs it through the
-real send path. Returns `{ok: bool, latency_ms: int, status_code?: int,
-error?: string}`. UI renders the green ✓ / red ✗ feedback inline.
-
-## Routes added
-
-| Method  | Path                                                  | Purpose                                                       |
-| ------- | ----------------------------------------------------- | ------------------------------------------------------------- |
-| GET     | `/alerts`                                             | Fleet alerts list with filters (`?status=open&severity=warning&host_id=...&q=...`) |
-| POST    | `/alerts/{id}/acknowledge`                            | Mark alert acknowledged (HTMX form)                           |
-| POST    | `/alerts/{id}/resolve`                                | Manual resolve (HTMX form)                                    |
-| GET     | `/settings/notifications`                             | Channel list page                                             |
-| GET     | `/settings/notifications/new`                         | Channel kind picker + empty form                              |
-| POST    | `/settings/notifications/new`                         | Validate + create + redirect                                  |
-| GET     | `/settings/notifications/{id}/edit`                   | Channel edit form                                             |
-| POST    | `/settings/notifications/{id}/edit`                   | Validate + update                                             |
-| POST    | `/settings/notifications/{id}/delete`                 | Delete channel (typed-confirm name in the form)               |
-| POST    | `/api/notifications/{id}/test`                        | Fire test notification, return JSON result                    |
-| GET     | `/api/alerts`                                         | JSON list (mirrors the UI filters) for future REST callers    |
-
-## Data model
-
-### Migration 0013 — alerts.last_seen_at
-
-```sql
-ALTER TABLE alerts ADD COLUMN last_seen_at TEXT;
-UPDATE alerts SET last_seen_at = created_at WHERE last_seen_at IS NULL;
-```
-
-Existing alerts (currently zero in production — nothing writes them yet)
-get `last_seen_at = created_at`. The column stays nullable so that rows
-written before the engine started bumping `last_seen_at` remain valid.
-
-### Migration 0014 — notification_channels + notification_log
-
-```sql
-CREATE TABLE notification_channels (
-  id              TEXT PRIMARY KEY,
-  kind            TEXT NOT NULL CHECK (kind IN ('webhook', 'ntfy', 'smtp')),
-  name            TEXT NOT NULL,
-  enabled         INTEGER NOT NULL DEFAULT 1 CHECK (enabled IN (0, 1)),
-  config          BLOB NOT NULL,        -- AEAD-encrypted JSON; per-kind shape
-  default_priority TEXT,                -- ntfy only; null for webhook + smtp
-  created_at      TEXT NOT NULL,
-  updated_at      TEXT NOT NULL,
-  last_fired_at   TEXT
-);
-
-CREATE INDEX notification_channels_enabled ON notification_channels(enabled) WHERE enabled = 1;
-
-CREATE TABLE notification_log (
-  id           TEXT PRIMARY KEY,
-  channel_id   TEXT NOT NULL REFERENCES notification_channels(id) ON DELETE CASCADE,
-  alert_id     TEXT REFERENCES alerts(id) ON DELETE SET NULL,
-  event        TEXT NOT NULL,           -- alert.raised | alert.acknowledged | alert.resolved | alert.test
-  ok           INTEGER NOT NULL CHECK (ok IN (0, 1)),
-  status_code  INTEGER,
-  latency_ms   INTEGER,
-  error        TEXT,
-  fired_at     TEXT NOT NULL
-);
-
-CREATE INDEX notification_log_channel ON notification_log(channel_id, fired_at DESC);
-CREATE INDEX notification_log_alert ON notification_log(alert_id);
-```
-
-`config` is an AEAD-encrypted JSON blob — bearer tokens for webhooks,
-access tokens for ntfy, and SMTP passwords live there. Per-kind config shapes:
-
-```go
-type webhookConfig struct {
-    URL          string `json:"url"`
-    BearerToken  string `json:"bearer_token,omitempty"`
-    HeaderName   string `json:"header_name,omitempty"`
-    HeaderValue  string `json:"header_value,omitempty"`
-}
-
-type ntfyConfig struct {
-    ServerURL    string `json:"server_url"`     // default https://ntfy.sh
-    Topic        string `json:"topic"`
-    AccessToken  string `json:"access_token,omitempty"`
-}
-
-type smtpConfig struct {
-    Host       string `json:"host"`         // e.g. smtp.example.com
-    Port       int    `json:"port"`         // default 587 (STARTTLS), 465 (TLS), 25 (none)
-    Encryption string `json:"encryption"`   // "starttls" | "tls" | "none"
-    Username   string `json:"username"`
-    Password   string `json:"password"`     // sensitive — AEAD-encrypted with the rest of config
-    From       string `json:"from"`         // RFC 5322 address; "alerts@example.com" or "Restic-Manager <alerts@…>"
-    To         string `json:"to"`           // single recipient or distribution-list address; v1 = one channel = one to-line
-}
-```
-
-### Engine state
-
-The engine itself is stateless beyond the channels it owns; all
-persisted state is in the existing `alerts` table + the new
-`notification_log` table. A process restart re-evaluates from scratch:
-on next tick the stale-schedule + auto-resolution sweeps catch up with
-whatever happened during the downtime. No outbox to drain.
-
-## UI templates
-
-| Template                                  | Purpose                                                |
-| ----------------------------------------- | ------------------------------------------------------ |
-| `web/templates/pages/alerts.html`         | Fleet alerts page                                      |
-| `web/templates/partials/alert_row.html`   | One alert row (used by both list and detail-fragment swap) |
-| `web/templates/pages/settings.html`       | Settings shell with Notifications / Users / Auth sub-tabs |
-| `web/templates/pages/notifications.html`  | Channel list (Notifications sub-tab body)              |
-| `web/templates/pages/notification_edit.html` | Channel kind picker + per-kind form + test button + payload preview |
-| `web/templates/partials/crit_banner.html` | Dashboard top-of-page banner                           |
-| `web/templates/partials/nav.html`         | Existing — gain a `data-alerts-count` attribute on the Alerts tab so the badge auto-updates |
-
-The Settings shell + Notifications sub-tab is the new chrome the wireframe
-introduced; Users + Authentication tabs are placeholder links that 404 in
-v1 (or render a "Lands later" notice). Same pattern P2R-02 used for
-inert sub-tabs.
-
-## Tests (target coverage)
-
- `internal/alert/engine_test.go` — rule firing per kind: backup_failed
-  raises on `MarkJobFinished(kind=backup, status=failed)`; touch-only on
-  the second failure for the same host (no second notification);
-  auto-resolve on next success.
- `internal/alert/agent_offline_test.go` — `OnHostOffline` is accepted but
-  raises nothing until the 15-min floor has passed; `OnHostOnline` clears
-  the alert.
- `internal/alert/stale_schedule_test.go` — synthetic schedule whose next
-  fire is in the past triggers; resets when a job lands.
- `internal/notification/webhook_test.go` — payload shape pinned;
-  authorisation header sent when bearer set; custom header echoed; 5s
-  timeout enforced; error in `notification_log`.
- `internal/notification/ntfy_test.go` — title/priority/tags/click headers
-  match the severity mapping; access token sent as `Authorization: Bearer
-  <token>`; default priority overridden by severity for critical.
- `internal/notification/smtp_test.go` — round-trip against a local
-  in-process fake SMTP server (hand-rolled, since Go's `net/smtp` is
-  client-only; or MailHog if convenient):
-  STARTTLS handshake completes against a self-signed cert; PLAIN auth
-  uses configured creds; subject + from + to + body bytes match the
-  spec'd format; Message-ID contains the alert id; 10s timeout enforced;
-  failure path (auth refused) lands in `notification_log` with the
-  server's error string.
- `internal/server/http/ui_alerts_test.go` — page renders with filters
-  applied; ack/resolve POSTs flip the row + write audit; HX-Redirect
-  bounces back to the filtered list.
- `internal/server/http/ui_notifications_test.go` — CRUD happy paths,
-  validation re-render, secrets-encrypted-at-rest assertion (load row,
-  decrypt, compare), test-button hits the real send path against a
-  test http.Server.
- Migration 0013 + 0014 round-trip tested via `store.Open` on a fresh
-  db.
-
-## Playwright sweep
-
-End-of-phase sweep mirrors the P2R-02 / P3-restore pattern:
-
-1. Login → `/alerts` (initially empty) → see "All clear · last alert
-   never" empty state.
-2. Trigger a fake-failed-backup via `POST /api/hosts/{id}/jobs` against a
-   host with a deliberately-wrong rest-server URL. Wait for the
-   `backup_failed` alert to appear in the list within ~2s of the job
-   finishing.
-3. Acknowledge → row tints + ack actor visible.
-4. Take the agent offline (`systemctl stop`); wait 15 min OR mock
-   `last_seen_at` to 16 min ago via the test harness; confirm
-   `agent_offline` alert raises once.
-5. Restart the agent → `agent_offline` auto-resolves; `backup_failed` is
-   still open.
-6. Configure a webhook channel pointing at a local test sink; click "Send
-   test" → green ✓.
-7. Configure a ntfy channel pointing at a local sink → click "Send test"
-   → green ✓.
-8. Configure an SMTP channel pointing at a local MailHog (Docker, port
-   1025, no TLS for the local-only sweep) → click "Send test" → green ✓
-   → MailHog UI at :8025 shows the test email with the right subject
-   and Message-ID.
-9. Trigger a fresh failed backup → all three channels receive the
-   notification (verified from sink logs + MailHog inbox);
-   `notification_log` has three rows `event=alert.raised, ok=true`.
-10. Manually Resolve the open `backup_failed`; confirm all three channels
-    receive `event=alert.resolved`.
-11. Critical-severity test: trigger `check_failed` (mocked) → dashboard
-    banner appears; clicking it lands on `/alerts?severity=critical&status=open`.
-12. Empty the alerts again → banner disappears.
-
-Screenshots into `_diag/p3-alerts-sweep/`. End-to-end clean, zero console
-errors, before handing back.
-
-## What does NOT change
-
- Existing chrome/templates beyond the small additions noted above.
- Existing `alerts.severity` CHECK (`info`/`warning`/`critical`) — already
-  the right shape; no migration needed for that.
- Audit log writer pattern — engine writes audit rows for ack/resolve
-  the same way every other state-changing handler does.
- The agent. Alerts are entirely a server concern; the agent doesn't
-  know they exist.
-
-## Open questions / explicit non-goals
-
- **Per-rule cooldowns / re-raise on long-running issues.** Out of scope
-  (brainstorm question 8 ruled this out). Operators see "still happening"
-  in the UI; they don't get a reminder ping.
- **SMTP HTML emails.** v1 is plain text only — operators wanting rich
-  rendering can deploy a webhook → mail-merge bridge, or wait for a v2
-  template engine. The Message-ID threading + plain text body should be
-  enough for almost every overnight-digest workflow.
- **SMTP OAuth2 / XOAUTH2.** Out of scope. Gmail / Microsoft 365 with
-  modern OAuth requires an `app password` workaround in v1. Native
-  XOAUTH2 lands when an operator asks (or when Google starts refusing
-  app passwords for non-business accounts in earnest).
- **Multi-recipient SMTP channels.** A channel = one `To`. Operators
-  wanting multiple recipients add multiple channels. Keeps failure
-  attribution per-recipient.
- **Apprise sidecar integration.** Deferred per brainstorm. The
-  `Channel` interface accepts a third impl without reshaping when we get
-  there.
- **Per-host or per-severity channel routing.** Out of scope. Likely
-  next step if operators ask: a `min_severity` field on the channel row.
- **Snooze / mute.** Out of scope. Acknowledge is the closest analogue;
-  full silence-windows would need a new table and is YAGNI for v1.
- **PagerDuty / OpsGenie.** Both have webhook receivers; operators wire
-  them via the webhook channel today.
- **Alert "rules" UI.** No CRUD; the rule set is hardcoded.
@@ -1,342 +0,0 @@
-# P3 — Restore (design)
-
-> Phase 3 sub-spec covering single-host restore (P3-01, P3-02, P3-03, P3-09).
-> P3-04 (cross-host restore) is deferred to a new "Future / unscheduled"
-> section in `tasks.md` — disaster recovery is already covered by re-enrolling
-> a replacement host with the same repo credentials.
->
-> Wireframe: `_diag/p3-restore-wizard/wireframe.html`. Screenshot:
-> `_diag/p3-restore-wizard/01-full-wizard.png`.
-
-## Scope locked
-
-Brainstorm decisions (in order asked):
-
-1. **In-place vs new-directory.** Default is a new directory under
-   `/var/restic-restore/<job-id>/`. A "Restore in place (overwrite original
-   paths)" toggle is gated by typed-confirmation of the host name, mirroring
-   the repo re-init pattern.
-2. **Path-selection granularity.** Tree browser as the path selector,
-   lazy-loaded via `restic ls --json <snapshot> <path>` per directory expansion.
-3. **Cross-host restore (P3-04).** Out of scope this phase. Move to
-   "Future / unscheduled" in `tasks.md`. The disaster-recovery case is covered
-   by the standard enrolment flow: stand up a replacement host, paste the
-   original repo creds at enrolment, snapshots reappear, restore is
-   same-host.
-4. **Snapshot diff (P3-09).** Diff-as-a-job. New `JobDiff` JobKind dispatched
-   like every other agent operation. Output streams as `log.stream` and
-   renders on the live job log page.
-5. **Wizard entry points.** Top-level "Restore" button on host detail
-   (`/hosts/{id}/restore`, opens wizard at step 1) plus a per-snapshot
-   Restore action on snapshot rows (`/hosts/{id}/snapshots/{sid}/restore`,
-   skips step 1).
-6. **Wizard interaction model.** Single-page, sections progressively enable;
-   tree-browser nodes lazy-load via HTMX partials. No `restore_drafts` table.
-7. **Tree-browser data path.** Synchronous WS RPC (`tree.list` ↔
-   `tree.list.result`, correlation-ID) plus a per-wizard-session in-memory
-   cache keyed by `{snapshot_id, path}` with ~30-min TTL.
-8. **Restore progress UI.** Restore-specific job-page variant: files-restored
-   / bytes-restored / throughput / ETA / current-file display, driven by
-   restic restore's JSON status events surfaced through `job.progress`.
-9. **Permissions/ownership.** Policy, not toggle. In-place restore preserves
-   original ownership; new-directory restore drops ownership
-   (`--no-ownership`).
-10. **Concurrency.** Single-flight per host (one job at a time across all
-    kinds). Plus a real cancel-job feature: `command.cancel` envelope, agent
-    kills the `restic` subprocess via context cancel (SIGTERM, SIGKILL after
-    grace), server transitions the job to `cancelled`. The "Cancel" button
-    already in the `job_detail` template becomes real for any running job
-    kind.
-11. **Audit + safety.** Audit row on every restore dispatch (`host.restore`
-    with snapshot ID, paths, target, in-place flag). Recent-restores panel
-    on the host page surfacing the latest restore job alongside last-backup
-    and last-init signals. Role gate deferred to P4-03.
-
-## Architecture
-
-Restore composes from existing primitives plus three new pieces:
-
- **New JobKind values**: `JobRestore`, `JobDiff`. Dispatcher cases mirror
-  the prune/check pattern. Agent-side handlers wrap `restic.RunRestore` and
-  `restic.RunDiff` (new methods on the `restic` package).
- **New WS RPC**: `tree.list` request (`{snapshot_id, path}`) ↔
-  `tree.list.result` reply (`{entries: [{name, type, size}], ...}` or
-  `{error}`). Reuses existing correlation-ID infrastructure from P1-09. No
-  `jobs` row.
- **New cancel surface**: `command.cancel` request (`{job_id}`), agent
-  cancels the running subprocess context, returns `command.ack` + `job.finished`
-  with status `cancelled`. Server endpoint `POST /api/jobs/{id}/cancel`
-  bridges UI button → WS envelope.
-
-Everything else (job lifecycle, log streaming, progress envelope, snapshot
-listing, audit log writer, host_chrome partial, danger-zone typed-confirmation)
-already exists and is reused verbatim.
-
-### Component boundaries
-
-| Component                          | Purpose                                              | Depends on                                |
-| ---------------------------------- | ---------------------------------------------------- | ----------------------------------------- |
-| `internal/restic.RunRestore`        | Run `restic restore` with paths + target + ownership | `restic.Env`                              |
-| `internal/restic.RunDiff`           | Run `restic diff --json a b`                         | `restic.Env`                              |
-| `internal/agent/runner` cases       | Dispatch `JobRestore` / `JobDiff` jobs               | `restic.Run*`, hooks (skipped: backup-only) |
-| `internal/agent/runner` cancel hook | Wire WS `command.cancel` → ctx.CancelFunc per job   | runner job map                            |
-| `internal/agent/runner` tree-list   | Sync RPC handler: `restic ls --json` for one path   | `restic.Env`                              |
-| `internal/server/ws/cancel.go`      | Validate + send `command.cancel` envelope            | hub.Send, store.UpdateJobStatus           |
-| `internal/server/ws/tree.go`        | RPC mediator: `tree.list` request → reply, with cache | hub.SendRPC, in-memory cache              |
-| `internal/server/http/restore.go`   | Wizard routes + dispatch endpoint                    | store, ws, audit                          |
-| `internal/server/http/diff.go`      | Snapshot-diff dispatch endpoint                      | store, ws                                 |
-| `internal/server/http/cancel.go`    | `POST /api/jobs/{id}/cancel`                         | ws                                        |
-| `web/templates/pages/host_restore.html` | Wizard page                                      | host_chrome partial                       |
-| `web/templates/partials/tree_node.html` | Lazy-loaded tree node fragment for HTMX swap     | —                                         |
-| `web/templates/pages/job_detail.html` | Restore-kind progress widget (variant)             | existing job_detail                       |
-
-### Data flow — wizard happy path
-
-```
-operator
-  ├─ GET /hosts/{id}/restore
-  │     server renders wizard shell, snapshot table from store.ListSnapshotsByHost
-  │
-  ├─ click snapshot row (or arrives via /hosts/{id}/snapshots/{sid}/restore)
-  │     wizard advances to step 2, snapshot summary card rendered
-  │
-  ├─ expand a tree node (chevron click)
-  │     HTMX GET /hosts/{id}/restore/tree?snapshot={sid}&path=/etc
-  │       server checks per-session cache (keyed by sid+path)
-  │         hit  → render tree_node fragment from cache
-  │         miss → hub.SendRPC(host_id, "tree.list", {sid, path}) → wait reply
-  │                cache result, render tree_node fragment
-  │
-  ├─ tick file/dir checkboxes (form state, no round-trip)
-  │
-  ├─ pick target radio (and optionally type host name to unlock in-place)
-  │
-  └─ POST /hosts/{id}/restore  (form submit)
-        server validates: ≥1 path, target mode, in-place ⇒ host name match
-        write audit row host.restore
-        store.CreateJob{kind=restore, payload={snapshot_id, paths, target, in_place}}
-        hub.Send(host_id, "command.run", {job_id, kind=restore, payload})
-        HX-Redirect: /jobs/{job_id}
-```
-
-### Data flow — agent restore execution
-
-```
-agent.runner receives command.run kind=restore
-  ├─ check single-flight: if r.activeJobID != "" → reply busy
-  │   (server queues to pending_runs only for kind=backup; restore returns busy)
-  ├─ allocate ctx, ctxCancel — store cancelFunc against job_id in r.cancels
-  ├─ sendStarted(job_id, JobRestore, now)
-  ├─ build target path: if in_place → "/" else "/var/restic-restore/<job_id>/"
-  ├─ build flags: paths from payload, --no-ownership when !in_place
-  ├─ restic.RunRestore(ctx, env, snapshot_id, paths, target, in_place):
-  │   restic restore <sid> --target <path> [--no-ownership] -- <p1> <p2> ...
-  │   parse stdout JSON: forward "status" → job.progress (1Hz throttle), "summary" → final
-  ├─ on success: sendFinished(job_id, succeeded, exit=0)
-  ├─ on ctx.Err() == context.Canceled: sendFinished(job_id, cancelled, exit=130)
-  └─ delete cancel func from r.cancels
-```
-
-### Data flow — cancel
-
-```
-operator clicks Cancel on /jobs/{id} (running)
-  POST /api/jobs/{id}/cancel
-    server: lookup job, ensure status=running, find host
-    hub.Send(host_id, "command.cancel", {job_id})
-  → agent.runner receives command.cancel
-       cancelFunc, ok := r.cancels[job_id]
-       ok && cancelFunc()
-       → restic subprocess context done → exec.Cmd kills via SIGTERM
-       → if still alive after 5s grace → SIGKILL
-       → runner sendFinished(job_id, cancelled, exit=130)
-  → server receives job.finished status=cancelled, persists, broadcasts
-  → browser refresh shows cancelled state
-```
-
-The cancel surface is independently useful for any kind (prune/check/backup) —
-not gated to restore. The button already in `job_detail.html` becomes real.
-
-### Tree-list RPC details
-
-New WS message types (added to `internal/api/messages.go`):
-
-```
-type TreeListRequestPayload struct {
-    SnapshotID string `json:"snapshot_id"`
-    Path       string `json:"path"`
-}
-
-type TreeListEntry struct {
-    Name string `json:"name"`
-    Type string `json:"type"`        // "dir" | "file" | "symlink"
-    Size int64  `json:"size,omitempty"`
-}
-
-type TreeListResultPayload struct {
-    SnapshotID string          `json:"snapshot_id"`
-    Path       string          `json:"path"`
-    Entries    []TreeListEntry `json:"entries,omitempty"`
-    Error      string          `json:"error,omitempty"`
-}
-```
-
-Server-side mediator (`ws.SendRPC`) takes a request envelope, registers the
-correlation ID in a pending map, sends, blocks on a per-call channel until
-the matching reply arrives (or 30s timeout). The pattern is small enough
-to inline in `internal/server/ws/rpc.go` as a generic helper — future
-synchronous RPCs reuse it.
-
-In-memory cache: `map[sessionID]map[cacheKey]TreeListResultPayload` with
-`cacheKey = snapshot_id + "\x00" + path`. Session ID minted per wizard
-load (HTTP-only cookie scoped to `/hosts/{id}/restore/tree`, lifetime 30
-min). On wizard close (browser navigation away) the entry expires
-naturally. No persistence, no migration.
-
-Agent handler runs `restic ls --json <sid> <path>` (non-recursive — restic
-defaults to recursive but `restic ls` accepts `--long` and a path filter;
-parse output line-by-line and emit only direct children of `path`). 60s
-context timeout, mirroring existing `restic snapshots` invocation.
-
-### Restore payload
-
-`api.CommandRunPayload` gains a nested optional `restore` field:
-
-```
-type RestorePayload struct {
-    SnapshotID    string   `json:"snapshot_id"`
-    Paths         []string `json:"paths"`           // absolute paths inside the snapshot
-    InPlace       bool     `json:"in_place"`
-    TargetDir     string   `json:"target_dir"`      // empty when in_place=true
-    PreserveOwner bool     `json:"preserve_owner"`  // mirrors policy: in_place=>true, else=>false
-}
-```
-
-The payload is set by the server when dispatching `JobRestore` and ignored
-on every other kind. Wire-shape test pinned in `wire_test.go`.
-
-### Diff payload
-
-`api.CommandRunPayload` gains:
-
-```
-type DiffPayload struct {
-    SnapshotA string `json:"snapshot_a"`
-    SnapshotB string `json:"snapshot_b"`
-}
-```
-
-Set on `JobDiff`. Output is plain `restic diff --json <a> <b>` forwarded as
-`log.stream` lines. Job page renders unchanged — operator reads the diff
-output directly.
-
-### Recent-restores panel
-
-A small panel rendered on the host detail page below the existing init-status
-line:
-
-```
-last restore: succeeded 2h ago · job f73ab4c1… · 3 files to /var/restic-restore/...
-```
-
-Backed by a `store.LatestJobByKind(host_id, JobRestore)` call (reusing
-the existing `store.LatestJobByKind` query already used for init/forget/prune/check
-in P2R-06). One template addition in `host_chrome.html` next to the
-`InitStatus` block.
-
-## Routes added
-
-| Method  | Path                                                      | Purpose                                                     |
-| ------- | --------------------------------------------------------- | ----------------------------------------------------------- |
-| GET     | `/hosts/{id}/restore`                                     | Wizard shell (step 1 = snapshot picker)                     |
-| GET     | `/hosts/{id}/snapshots/{sid}/restore`                     | Wizard shell with snapshot pre-selected (skips step 1)      |
-| GET     | `/hosts/{id}/restore/tree`                                | HTMX partial: tree node listing for `?snapshot=&path=`      |
-| POST    | `/hosts/{id}/restore`                                     | Validate + dispatch restore job, redirect to live job page  |
-| POST    | `/api/hosts/{id}/snapshots/diff`                          | Dispatch a diff job for `{snapshot_a, snapshot_b}`          |
-| POST    | `/api/jobs/{id}/cancel`                                   | Send `command.cancel` to host, transition job → cancelled   |
-
-## Migrations
-
-None. Restore + diff piggyback on the existing `jobs` table (their `kind` is
-new but the schema already accepts arbitrary kind strings — there's no
-CHECK constraint on `kind`). The cancel feature uses the existing
-`JobCancelled` terminal status. The tree-list cache lives in process memory.
-
-## Tests (target coverage)
-
- `internal/restic/restore_test.go` — `RunRestore` invocation builds the
-  expected argv (paths, --target, --no-ownership flag presence, in-place
-  variant); JSON status parsing → `BackupStatus`-shaped progress envelopes.
- `internal/restic/diff_test.go` — `RunDiff` argv shape and JSON forwarding.
- `internal/agent/runner/restore_test.go` — happy path, cancel mid-run
-  produces `cancelled` finished, in-place vs new-directory dispatch,
-  single-flight rejects when another job is running.
- `internal/agent/runner/tree_test.go` — `tree.list` handler returns
-  direct children for a synthetic restic ls output, surfaces error on
-  missing snapshot.
- `internal/server/ws/rpc_test.go` — `SendRPC` correlation matching,
-  timeout, concurrent calls.
- `internal/server/http/restore_test.go` — wizard renders with snapshots,
-  POST validates ≥1 path + in-place host-name match, audit row written,
-  job dispatched with correct payload, in-place without typed-confirm
-  re-renders form with input intact and an error.
- `internal/server/http/diff_test.go` — POST dispatches `JobDiff`,
-  snapshot IDs validated against the host's snapshot list.
- `internal/server/http/cancel_test.go` — POST cancel happy path
-  (running → cancelled), 4xx for non-running jobs, 4xx when host offline.
- `internal/server/http/restore_e2e_test.go` — happy path: GET wizard,
-  expand `/etc` (HTMX call returns expected fragment), submit, follow
-  HX-Redirect to job page, see status.
- `web/templates/pages/host_restore_test.go` (template-render test) —
-  wizard renders all four sections; in-place card disabled until typed
-  confirm.
-
-## Playwright iteration / sweep
-
-A Playwright sweep at the end (mirroring P2R-02 Slice 6) runs against the
-local smoke server with a real agent enrolled. Steps:
-
-1. Login → navigate to alfa-01 host → click Restore.
-2. Wizard step 1: pick the most recent snapshot.
-3. Wizard step 2: expand a directory two levels, tick three files,
-   verify tally updates.
-4. Wizard step 3: leave default new-directory.
-5. Wizard step 4: dispatch.
-6. Land on live job page, see progress widget animating, see log lines.
-7. Click Cancel mid-flight, verify status transitions to cancelled and
-   the agent's subprocess actually died (log line `signal: killed` or exit
-   130).
-8. Repeat with in-place mode: type host name, dispatch, verify red
-   primary button, verify files actually overwritten on host.
-9. Snapshot diff: navigate to snapshots, pick two, dispatch diff, see
-   diff output streamed.
-10. Screenshots into `_diag/p3-restore-sweep/`.
-
-End-to-end clean, zero console errors, before handing back.
-
-## What does NOT change
-
- `host_chrome.html` only grows the recent-restores line; sub-tab list
-  unchanged (Restore is a top-level button on the host page, not a sub-tab).
- `enrollment.go`, schedule reconciliation, source-group CRUD, repo
-  maintenance ticker, hook execution — none of these are touched.
- The CLAUDE.md restage block applies as-is when the agent binary changes
-  (it does — runner gains restore/diff/cancel/tree handlers). The unit
-  file does not change.
-
-## Open questions / explicit non-goals
-
- **Restore preview / dry-run.** Restic doesn't have a dry-run for restore.
-  Out of scope.
- **Resumable restore.** Restic restore is idempotent per-file but not
-  resumable mid-stream from where it left off. If a restore is cancelled,
-  the operator re-runs (files already written are overwritten). No state
-  to track.
- **Restore to a glob/pattern (e.g. `*.conf`).** Out of scope; the tree
-  picker requires explicit ticks. Power users can edit the URL or use the
-  CLI.
- **Bandwidth caps for restore.** Honoured automatically — restic's
-  `--limit-download` is part of `restic.Env` already (P2R-13) and applies
-  to restore unchanged.
- **Pre/post hooks for restore.** Hooks today gate only `kind=backup`
-  (P2R-11). Out of scope.
@@ -1,340 +0,0 @@
-# P4-03 / P4-04 — RBAC + User Management Design
-
-> **Date:** 2026-05-05
-> **Status:** brainstorm complete; ready for plan
-> **Closes:** P4-03 (RBAC enforcement at API layer), P4-04 (User management UI)
-
-## Goal
-
-Enforce role-based access control at the HTTP layer (currently every authenticated user has admin powers) and ship the operator-facing screens for managing users, roles, and password lifecycle.
-
-## Architecture
-
-Two coupled subsystems landing in one PR:
-
-1. **RBAC enforcement** — chi route-group middleware that gates each subtree by minimum role. Fail-closed default (admin) so a forgotten declaration doesn't accidentally widen access.
-2. **User management** — `/settings/users` sub-tab with list / add / edit / disable. Setup-link flow for new users (1-hour-expiry single-use token). Self-service password change at `/settings/account`.
-
-The audit log already records actor + user_id on every mutation; new endpoints fold in naturally.
-
-## Role taxonomy
-
-Locked. Three roles, hierarchical (admin ⊇ operator ⊇ viewer):
-
-| Action | admin | operator | viewer |
-|---|:-:|:-:|:-:|
-| View dashboard / alerts / audit / hosts | ✓ | ✓ | ✓ |
-| Trigger Run-now / Restore / Snapshot diff | ✓ | ✓ | ✗ |
-| Acknowledge / resolve alerts | ✓ | ✓ | ✗ |
-| Edit schedules / source groups / retention / hooks | ✓ | ✓ | ✗ |
-| Add / remove hosts (enrolment, accept/reject pending) | ✓ | ✓ | ✗ |
-| Cancel running jobs | ✓ | ✓ | ✗ |
-| Edit repo credentials | ✓ | ✓ | ✗ |
-| Edit notification channels | ✓ | ✗ | ✗ |
-| Manage users | ✓ | ✗ | ✗ |
-| Self password change (`/settings/account`) | ✓ | ✓ | ✓ |
-
-The role enum already exists in the schema (`CHECK (role IN ('admin','operator','viewer'))`) and in `internal/store/types.go`. Bootstrap creates the first user as admin. Zero migration needed for existing installs.
-
-## Schema changes
-
-All column-level ALTERs (CLAUDE.md prefers these over rebuilds; safe under `foreign_keys=ON`).
-
-### Migration 0017 — `users` extensions
-
-```sql
-ALTER TABLE users ADD COLUMN email TEXT;
-ALTER TABLE users ADD COLUMN disabled_at TEXT;
-ALTER TABLE users ADD COLUMN must_change_password INTEGER NOT NULL DEFAULT 0;
-
--- Username case-insensitive lookup. Existing rows are kept as-is;
--- normalisation only applies to new INSERTs (handled in Go).
-CREATE UNIQUE INDEX users_username_lower ON users(LOWER(username));
-```
-
-### Migration 0018 — `user_setup_tokens`
-
-```sql
-CREATE TABLE user_setup_tokens (
-  user_id     TEXT PRIMARY KEY REFERENCES users(id) ON DELETE CASCADE,
-  token_hash  TEXT NOT NULL,           -- sha256(raw_token), hex
-  expires_at  TEXT NOT NULL,
-  created_at  TEXT NOT NULL,
-  created_by  TEXT NOT NULL REFERENCES users(id) ON DELETE SET NULL
-);
-
-CREATE INDEX user_setup_tokens_expires ON user_setup_tokens(expires_at);
-```
-
-`user_id` is PRIMARY KEY, not just FOREIGN KEY — only one outstanding setup token per user. Regenerating supersedes the old via `INSERT OR REPLACE`.
-
-## RBAC enforcement
-
-### Middleware
-
-```go
-// requireRole returns chi middleware that 403s any request whose
-// session-resolved user doesn't meet the minimum role. Roles are
-// hierarchical: admin > operator > viewer.
-func (s *Server) requireRole(min store.Role) func(http.Handler) http.Handler
-```
-
-Hierarchy implemented as a small helper:
-
-```go
-func roleAtLeast(have, min store.Role) bool {
-    rank := map[store.Role]int{
-        store.RoleViewer:   1,
-        store.RoleOperator: 2,
-        store.RoleAdmin:    3,
-    }
-    return rank[have] >= rank[min]
-}
-```
-
-### Route grouping in `server.go`
-
-The existing `/api` and UI routes get re-grouped into three role bands plus a self-service group:
-
-```
-/api/* viewer-readable    — GET endpoints anyone authenticated can hit
-/api/* operator+          — mutating endpoints up to host/source-group/schedule level
-/api/* admin-only         — /api/users/*, channel CRUD
-/api/account              — self-service password change
-
-/audit, /alerts, /hosts/{id}, etc.   — viewer
-/hosts/{id}/run, /alerts/{id}/ack    — operator
-/settings/users/*, /settings/notifications/* — admin
-/settings/account                    — viewer (any authenticated)
-```
-
-Default at the bottom of `routes()` is admin (fail-closed). Any future endpoint that doesn't get explicitly placed lands in admin-only, surfacing the missing declaration as a permission error rather than a silent bypass.
-
-### Per-handler nuance
-
-No existing case warrants a handler-level check on top of the route gate. The closest candidate is the split between `GET /settings/users/{id}/edit` (admin-only) and `POST /api/account/password` (viewer-OK), but these are separate routes, so the split-by-route already covers it; no per-handler overrides are expected in v1.
-
-### Out of scope of role middleware
-
- `/ws/agent` and `/api/agents/*` — agent bearer-token auth, separate chain
- `/healthz` — unauthenticated
- `/login`, `/logout`, `/bootstrap` — public
-
-### 403 handling
-
- JSON endpoints: `{"error":"forbidden","code":"insufficient_role"}` with HTTP 403
- HTML endpoints: render a small "You don't have permission" panel inside the chrome (so the user keeps their nav and can move away), HTTP 403
- **No audit row on 403** — too noisy with normal users hitting URLs they don't have access to
-
-### Session re-validation
-
-Sessions need to honour `disabled_at` and current role on every request, not just at login. The session-validation middleware reads the user row each request (single PK lookup, fast in SQLite). If `disabled_at IS NOT NULL`, the session is invalidated and the request 401s. This makes "disable user" and "force logout" effectively immediate.
-
-Cost: one SELECT per authenticated request. SQLite handles this comfortably for the fleet sizes this codebase targets.
-
-## Setup-token flow (replacing temp passwords)
-
-### Add user
-
-1. Admin clicks **+ Add user** on `/settings/users`
-2. Form: username (required, lowercase-normalised), email (optional, validated), role (admin/operator/viewer)
-3. Server:
-   - Validates username uniqueness (case-insensitive). On collision with a *disabled* user, return a 409 with `{"existing_user_id": "...", "disabled": true}` so the UI can pivot to a "re-enable existing user" prompt
-   - On collision with an enabled user: 409 with a plain "username taken" error
-   - Creates user row with `password_hash = ""`, `must_change_password = 1`, `disabled_at = NULL`
-   - Generates 32 random bytes, hex-encodes → raw token (64 chars). Stores `sha256(token)` hex in `user_setup_tokens`. `expires_at = now + 1h`
-   - Audit: `user.created`, payload `{"username": "...", "role": "...", "with_setup_token": true}`
-4. Server returns the admin to a one-time setup-link page: `/settings/users/{id}/setup-link`
-   - Shows the URL `http(s)://<base>/setup?token=<raw>` with a Copy button
-   - Countdown timer (live JS) showing time-to-expiry
-   - Warning: "This is the only time you'll see this link. If you lose it, regenerate from the user edit page."
-   - "Done" button → `/settings/users`
-
-The raw token is **never persisted** server-side. Lost tokens require regeneration.
-
-### Setup landing page (public, no auth required)
-
-1. User clicks the link, lands on `/setup?token=<raw>`
-2. Server hashes the token, looks up `user_setup_tokens` row, validates `expires_at > now`
-3. On invalid / expired: render an error page with a "Contact your administrator" message. Audit: `user.setup_token.expired` (no actor).
-4. On valid: render a password-set form: `new password + confirm`. Submit:
-   - Validates password meets policy (min 12 chars, no other constraints in v1 — same as bootstrap path)
-   - Hashes via `auth.HashPassword` (existing helper)
-   - Updates `users.password_hash`, sets `must_change_password = 0`
-   - Deletes the `user_setup_tokens` row (single-use)
-   - Logs the user in via the existing session helper
-   - Audit: `user.setup_completed`, payload `{"user_id": "..."}`
-   - Redirect to `/`
-
-### Regenerate setup link (admin)
-
-`/settings/users/{id}/edit` shows a "Regenerate setup link" button when `must_change_password = 1`. Clicking it:
-
-1. Generates a new token + hash, INSERT OR REPLACE on `user_setup_tokens`
-2. Returns the admin to the same one-time link page as the add-user flow
-3. Audit: `user.setup_token.regenerated`
-
-### Cleanup
-
-Expired tokens linger in the DB until cleaned. Add a cheap sweep on the existing maintenance ticker: `DELETE FROM user_setup_tokens WHERE expires_at < ?`. Runs at the same cadence as the alert engine tick (60s). No new ticker needed.
-
-## Self-service password change
-
-`/settings/account`
-
- Accessible to every authenticated user (any role)
- Form: `current password + new password + confirm`
- Server validates current password (re-uses login bcrypt comparison), updates hash, audits `user.password_changed`
- Special case: if `must_change_password = 1`, the current-password field is hidden / not required (this covers a possible future "admin reset password" path, should we ever add one — the current setup-token path doesn't use it)
-
-The bootstrap user's password change uses this same page (no special case for "first admin").
-
-## User list / management UI
-
-### `/settings/users` (admin-only)
-
-```
-Settings · Users [3]
-─────────────────────────────────────────────────
-[ + Add user ]                       [ ] Show disabled
-
-USERNAME       EMAIL              ROLE      LAST LOGIN     STATUS
-alice          alice@example.com  admin     2 mins ago     enabled
-bob            —                  operator  3 days ago     enabled
-charlie        c@example.com      viewer    never          setup pending  ← if has open setup token
-diane          d@example.com      operator  1 month ago    disabled       ← only when "Show disabled"
-
-Actions per row: Edit · (Re-enable | Disable)
-```
-
- "setup pending" badge for users with `must_change_password=1` — clicking the row goes to edit, which surfaces the regenerate-link button prominently
- "Show disabled" is a checkbox querystring filter (`?show_disabled=1`)
- Sort columns: clickable like the audit log (username, role, last_login). Reuse the same pattern (server-side sort + URL builder + glyph)
-
-### `/settings/users/new` (admin-only)
-
-Single form: `username + email (optional) + role`. On submit, the admin either lands on the setup-link page (success) or is returned to the form with an inline "username exists, re-enable existing?" panel (collision with a disabled user) or a red error (collision with an enabled user).
-
-### `/settings/users/{id}/edit` (admin-only)
-
- Display-only block: id, created_at, last_login_at, status
- **Editable**: email, role
- **Buttons**:
-  - "Regenerate setup link" — only when `must_change_password = 1`
-  - "Disable user" — flips `disabled_at`; rejected if last enabled admin (server-side check). Confirmation modal with typed name to confirm.
-  - "Re-enable user" — clears `disabled_at`. No confirmation.
-  - "Force logout" — separate from disable; just kills the session but keeps the user enabled. Useful for "I think Bob's session was hijacked" without locking him out.
- Cancel / Save buttons at the bottom
-
-### `/settings/users/{id}/setup-link` (admin-only)
-
-Renders the one-time link with copy button + countdown. Shown after add-user and after regenerate. Reload of this URL after the token is consumed: 410 Gone with a clear message.
-
-### `/settings/account` (any authenticated)
-
-Self-service password change. Form-only page; no nav under Settings since most users will only see this one Settings page in v1.
-
-## API surface
-
-```
-GET    /api/users                        admin   — list (with ?show_disabled=1 filter)
-POST   /api/users                        admin   — create user, returns user_id + setup_url
-GET    /api/users/{id}                   admin   — read
-PATCH  /api/users/{id}                   admin   — update email, role
-POST   /api/users/{id}/disable           admin   — set disabled_at; rejects last-admin
-POST   /api/users/{id}/enable            admin   — clear disabled_at
-POST   /api/users/{id}/regenerate-setup  admin   — new token, returns setup_url
-POST   /api/users/{id}/force-logout      admin   — kill all sessions for this user
-
-POST   /api/account/password             any auth — self password change
-GET    /setup                            public — landing page (HTML form)
-POST   /setup                            public — submit new password
-```
-
-UI routes mirror the API but at `/settings/users/...`.
-
-## Last-admin self-protection
-
-Two operations that could lock everyone out are guarded:
-
- **Disable user**: rejected if the user is admin AND there are no other enabled admins
- **Demote admin to operator/viewer**: same check
-
-Server-side enforcement (a single `SELECT COUNT(*) FROM users WHERE role='admin' AND disabled_at IS NULL`). UI hint: the edit page disables the role dropdown's non-admin options and the Disable button when the user is the last admin, with a tooltip explaining why.
-
-The bootstrap admin is just a regular admin row; this check covers it.
-
-## Audit actions
-
-New action strings introduced:
-
- `user.created`
- `user.updated` (email / role change)
- `user.disabled`
- `user.enabled`
- `user.password_changed`
- `user.setup_completed`
- `user.setup_token.regenerated`
- `user.setup_token.expired` (system-driven, on cleanup sweep)
- `user.force_logout`
-
-All target_kind = `user`, target_id = the affected user's id. Existing payload conventions apply.
-
-## Ordering / dependencies
-
-Slices in approximate landing order (writing-plans will firm this up):
-
-1. **A. Schema** — migrations 0017 + 0018, `Role` helper updates, store API extensions (email, disabled_at, must_change_password, setup_token CRUD, lowercase username constraints)
-2. **B. RBAC middleware** — `requireRole` + `roleAtLeast`, route re-grouping in server.go, 403 rendering for HTML + JSON
-3. **C. Session re-validation** — extend the existing session middleware to re-read user state per request, kick disabled users
-4. **D. Setup-token flow** — `/setup` GET+POST, the one-time link page after add-user
-5. **E. User CRUD API** — handlers + handlers' tests
-6. **F. UI** — `/settings/users` list, add, edit, setup-link page, account page
-7. **G. Sweep** — Playwright walk through the full lifecycle (add → setup link → user signs in → admin disables → user gets kicked → admin re-enables → user signs back in)
-
-Each slice can land as its own commit on the branch. RBAC middleware (B) goes in *before* user CRUD so we don't ship an open `/api/users/*` even briefly.
-
-## Test strategy
-
- **Store**: `Set/GetSetupToken`, `EnableUser`/`DisableUser`, last-admin guard, lowercase-username uniqueness, expired-token cleanup
- **HTTP middleware**: `roleAtLeast` truth table; viewer hitting an operator route returns 403; disabled user gets 401 mid-session
- **Setup flow integration**: create user → fetch setup URL → land on `/setup?token=...` → POST password → user can log in → token row gone
- **UI**: existing Playwright sweep pattern, screenshots into `_diag/p4-03-04-sweep/`
-
-## Out of scope (deferred)
-
- **OIDC** (P4-05) — adds a parallel auth chain. This PR keeps the surface for it (role taxonomy, session middleware) but doesn't wire it.
- **Email-the-setup-link** — explicitly deferred. Easy follow-up because the SMTP channel client from P3-06 is already there.
- **Hard delete** — disable-only in v1; can add a typed-confirm "purge" later if it turns out to be needed.
- **Password complexity / rotation policy** — current minimum (12 chars) and no rotation; tighten later if/when policy demands.
- **Lockout on failed login** — a brute-force protection layer is its own task and orthogonal to RBAC.
- **Audit on 403** — not in v1; revisit if compliance asks for it.
-
-## Risks / gotchas to watch
-
- **Existing tests** that assume "any logged-in user can hit any endpoint" will break. Audit the test fixtures: most use `loginAsAdmin`, which is fine; any tests currently exercising specific operator/viewer paths need explicit role assignment. (Quick grep suggests there aren't many — bootstrap-only.)
- **Bootstrap user normalisation** — the existing admin row's username is whatever it was set to at first run. The new lowercase-uniqueness index uses `LOWER(username)`, which makes the existing row implicitly lowercase-keyed for lookups. No data migration needed.
- **Session middleware re-read cost** — one SELECT per authenticated request. SQLite WAL handles this fine at expected fleet sizes; if it ever shows up on a profile we add a small in-memory cache keyed by session id with a 30s TTL.
- **403 vs 401 distinction** — make sure unauthenticated requests still get 401 (login redirect) and authenticated-but-insufficient get 403. The middleware should compose: auth-required first, role-required second.
-
-## Acceptance
-
- [ ] An admin can add a user, copy the setup link, the new user can land on `/setup?token=...`, set a password, and reach `/`
- [ ] An expired token (>1h) on `/setup?token=...` shows the "contact your administrator" page
- [ ] Admin regenerates the link, old token is invalid, new token works
- [ ] Operator user can trigger Run-now but cannot reach `/settings/users` (403) and the Users tab in Settings is hidden in their nav
- [ ] Viewer user gets 403 on Run-now, 200 on dashboard / alerts / audit
- [ ] Admin disables a user mid-session — the user's next request is 401 and they're redirected to login
- [ ] Admin cannot disable themselves if they are the last enabled admin (server returns 409, UI button is greyed)
- [ ] Self-service password change at `/settings/account` works for every role
- [ ] All existing tests pass; new test suite covers role middleware, setup-token lifecycle, last-admin guard
-
-## Self-review notes
-
- ✅ All sections concrete, no TBD / TODO
- ✅ Schema migrations are column-level (CLAUDE.md compliance)
- ✅ Audit action vocabulary listed in one place; no string typos to drift
- ✅ Out-of-scope list explicit so reviewers can challenge what we *aren't* doing
- ✅ Last-admin guard handled both server-side and UI-hinted
- ✅ Token storage hashes the secret server-side; raw is shown to admin once and never again
- ✅ Session re-validation cost noted with a fallback if it shows up on a profile
@@ -1,215 +0,0 @@
-# P4-05 — OIDC Login Design
-
-> **Date:** 2026-05-05
-> **Status:** brainstorm complete; ready for plan
-> **Closes:** P4-05 (OIDC login)
-
-## Goal
-
-Wire OpenID Connect authentication as a sign-in path alongside the existing local-user system, so a deployment that already has an IdP (Authelia, Authentik, Keycloak, Okta, Auth0, etc.) can use it for restic-manager logins.
-
-## Architecture
-
-OIDC sits on top of the local-user system rather than replacing it. The first time a user signs in via OIDC the server **just-in-time provisions** a local user row marked `auth_source='oidc'`, with role derived from the IdP's configured role claim (`role_claim`, default `groups`). Subsequent sign-ins look up the same row by stable `oidc_subject` and refresh role + email from the latest claims. Once the row exists it behaves like any other local user — admin can disable it, force-logout, see it in audit logs, etc. — except password-login is rejected because there's no password.
-
-The Authorization Code flow (with PKCE) is implemented against the discovered well-known config of a single configured issuer. Front-channel logout: clicking Sign out drops the local session + redirects the browser to the IdP's `end_session_endpoint` (when advertised). Back-channel logout deferred.
-
-## Locked decisions
-
-| Decision | Pick |
-|---|---|
-| User lifecycle | **B** — JIT-provision local rows on first OIDC login (`auth_source='oidc'`, `oidc_subject`) |
-| Role mapping config | **A** — YAML/env, claim name configurable (default `groups`, matching Authelia / Keycloak / Authentik), default = deny on no-match |
-| Username source | `preferred_username`, fallback to `email` |
-| Username collision with existing local user | **Refuse** with clear remediation message |
-| Provider config | **Single provider** — `providers:` array can come later |
-| Login page layout | SSO button **above** password form; password form labelled "or sign in with a local account" |
-| OIDC users + password login | **Disabled** — `auth_source='oidc'` rows have empty `password_hash`; password form rejects them |
-| Logout shape | **Front-channel only** — drop session + redirect to `end_session_endpoint` when advertised |
-| Role re-evaluation | **At login only** — claims read at the OIDC callback; admin can disable mid-session locally |
-
-## Schema changes
-
-Migration 0019 — `users` extensions for OIDC bookkeeping:
-
-```sql
-ALTER TABLE users ADD COLUMN auth_source TEXT NOT NULL DEFAULT 'local'
-  CHECK (auth_source IN ('local', 'oidc'));
-ALTER TABLE users ADD COLUMN oidc_subject TEXT;
-
-CREATE UNIQUE INDEX users_oidc_subject ON users(oidc_subject)
-  WHERE oidc_subject IS NOT NULL;
-```
-
-Both column-level ALTERs (CLAUDE.md preference). The unique partial index defends the JIT-lookup invariant (one row per IdP subject) without blocking multiple rows with NULL oidc_subject (the local users).
-
-## Configuration
-
-```yaml
-# server config — extend existing config struct
-oidc:
-  issuer:        https://auth.example.com    # well-known config discovered from this
-  client_id:     restic-manager
-  client_secret: ${RM_OIDC_CLIENT_SECRET}    # or via _FILE
-  display_name:  Authelia                    # button label "Sign in with <display_name>"; default "SSO"
-  scopes:        [openid, profile, email, groups]
-  role_claim:    groups                      # default if absent (matches Authelia / Keycloak / Authentik)
-  role_mapping:
-    rm-admins:    admin
-    rm-operators: operator
-    rm-viewers:   viewer
-  # Optional — auto-derived from BaseURL if absent.
-  redirect_url:  https://rm.example.com/auth/oidc/callback
-```
-
-Env-var overrides: `RM_OIDC_ISSUER`, `RM_OIDC_CLIENT_ID`, `RM_OIDC_CLIENT_SECRET`, `RM_OIDC_CLIENT_SECRET_FILE`. Mapping is YAML-only (env doesn't fit a multi-key string→string map cleanly).
-
-When `oidc.issuer` is empty or missing, OIDC is disabled (current behaviour). No restart-toggle UI; this is a deploy-time setting.
-
-## Auth flow
-
-### Login start
-
-`GET /auth/oidc/login` — only mounted when OIDC is configured.
-
-1. Generate `state` (32 random bytes, base64url) and `code_verifier` (64 random bytes, unpadded base64url — 86 characters, within the 43–128 range RFC 7636 allows); compute `code_challenge = base64url(sha256(code_verifier))`, also unpadded.
-2. Store `(state, code_verifier, created_at)` in a new ephemeral table (or in memory with a 5-minute TTL — see "State table" below for the trade-off).
-3. Redirect to `<authorization_endpoint>?response_type=code&client_id=...&redirect_uri=...&scope=...&state=...&code_challenge=...&code_challenge_method=S256`.
-
-### Callback
-
-`GET /auth/oidc/callback?code=...&state=...` — also OIDC-only mount.
-
-1. Validate `state` against the stored value (one-shot — delete row on read). Reject if missing/expired/already used.
-2. Exchange `code` + `code_verifier` for tokens at `token_endpoint`.
-3. Validate the `id_token` JWT: signature against the JWKS endpoint, `iss`, `aud`, `exp`, `iat`, `nonce` (if used).
-4. Extract `sub`, `preferred_username`, `email`, and the configured `role_claim` (default `groups`).
-5. Pick username: `preferred_username` if non-empty, else `email`. Lowercase / trim per the existing local-user rules.
-6. Pick role: first match in `role_mapping` against the array of role-claim values. **No match → deny with a clear error page**, no row created.
-7. Look up user by `oidc_subject`. Three cases:
-   - **Found** — refresh `email`, `role`, `last_login_at`. Don't touch `username` (changing it would break audit trails; if the IdP changes the username, that's an operator concern). Log `user.oidc_login`.
-   - **Not found, username free** — INSERT row with `auth_source='oidc'`, `oidc_subject=<sub>`, `password_hash=''`, `must_change_password=0`. Log `user.created` with payload `{"auth_source":"oidc"}` + `user.oidc_login`.
-   - **Not found, username taken by a local user** — render an error page: "This OIDC user (`<sub>`) wants to sign in as `alice`, but a local user with that name already exists. Ask your administrator to either rename / remove the local user, or exclude this user from the OIDC mapping." 403, no row created. Log `user.oidc_login_blocked`.
-8. Drop a session cookie + `MarkUserLogin` (the existing helper).
-9. Redirect to `/`.
-
-### Logout
-
-`POST /logout` (existing handler) — augmented:
-
-1. Look up the session before deletion (we need the user row to know if they're an OIDC user).
-2. Delete the session as today.
-3. If the user is `auth_source='oidc'` AND the discovered `end_session_endpoint` is non-empty → 303 to `<end_session_endpoint>?id_token_hint=<id_token>&post_logout_redirect_uri=<base>/login`. Otherwise → existing 303 to `/login`.
-
-We need to keep the latest `id_token` per session to drive `id_token_hint`. Stash it in a new `sessions.id_token TEXT` column (one column-level ALTER on migration 0019 alongside the user columns), populated only for OIDC sessions.
-
-## State table
-
-Two reasonable shapes for the short-lived state used during the OAuth round-trip:
-
- **In-memory map** with a 5-minute TTL sweeper. Simpler, but multi-process deployments lose it (no multi-process today, but Phase 5 OSS readiness might add).
- **`oidc_state` table** — `(state_hash PK, code_verifier, created_at)`, swept on the same 60s alert-engine tick that already handles setup-token cleanup.
-
-I'll go with the **table**. Costs ~3 lines in the existing cleanup tick, behaves correctly under restarts, and survives a future scale-out. Migration 0019 includes:
-
-```sql
-CREATE TABLE oidc_state (
-  state_hash    TEXT PRIMARY KEY,    -- sha256(state) hex; raw state never persisted
-  code_verifier TEXT NOT NULL,
-  created_at    TEXT NOT NULL
-);
-CREATE INDEX oidc_state_created ON oidc_state(created_at);
-```
-
-## Login-page UI
-
-`/login` template branches based on `view.OIDCEnabled`:
-
- **OIDC off** → current layout (just the password form).
 **OIDC on** → a `Sign in with <provider name>` button at the top, then a faint divider line, then the existing password form labelled "Or sign in with a local account". Provider name comes from a new optional config `oidc.display_name` (defaults to "SSO").
-
-Failed-OIDC redirects (no role match, username collision, IdP error) land on `/login?oidc_error=<reason>` with a small banner above the buttons.
-
-## Audit actions
-
-New entries in the action vocabulary:
-
- `user.oidc_login` (target_kind=user, target_id=user_id, payload `{"sub":"…"}`)
- `user.oidc_login_blocked` (target_kind=user, target_id=oidc_subject when no row was created, payload `{"username":"…", "reason":"username_taken|no_role_match|other"}`)
- `user.created` already exists; OIDC's first-time provisioning fires this with payload `{"auth_source":"oidc"}` so the audit log distinguishes admin-created from JIT-provisioned rows.
-
-## User-management UI changes
-
-Small additions, not new screens:
-
- **Users list** — Status column adds a small `oidc` chip when `auth_source='oidc'` so admin can see at a glance which rows came from JIT-provisioning. Sortable by auth_source via the same sortable-headers pattern (lands as a small follow-up if anyone asks; out of scope for v1).
- **Add user form** — disabled when OIDC is the only auth path, with a hint: "User provisioning is handled by your OIDC provider; users appear here on first sign-in." Configurable later via a `oidc.disable_local_users` flag if that becomes a real ask. Out of scope for v1; both paths stay open.
- **Edit user form** — when `auth_source='oidc'`:
-  - Username field disabled (the username is fixed at first OIDC sign-in and is never refreshed from the IdP; renaming it locally would break audit trails)
-  - Role dropdown disabled, with a hint: "Role is managed by your OIDC provider's role-claim mapping. Edit `oidc.role_mapping` in server config to change."
-  - Email field disabled (refreshed from IdP on each login)
-  - **Disable / Enable / Force logout** still work — disabling an OIDC user kicks their session and rejects future OIDC logins ("user disabled by administrator")
-  - **Regenerate setup link** hidden — there's no setup token for OIDC users
- **Login UI** — password form rejects users with `auth_source='oidc'` ("This account uses single sign-on. Click the SSO button above.")
-
-## Middleware / handler changes
-
- **Routes**: new public-band entries `GET /auth/oidc/login`, `GET /auth/oidc/callback`. Skipped entirely when OIDC isn't configured (`s.deps.OIDC == nil`).
- **Logout handler** augmented to fetch the user row + decide between local logout (303 → `/login`) and OIDC logout (303 → `end_session_endpoint`).
- **Login handler** rejects `auth_source='oidc'` users with the SSO-prompt error.
- **Last-admin guard** — already covers OIDC users naturally because they live in the `users` table. The role-from-claims path could create a "every admin gets demoted to operator" situation if the IdP's claim mapping is wrong; the guard rejects that demotion at the moment it'd be applied (returns the user to the login page with `oidc_error=role_change_blocked` and audit entry; admin must fix the mapping or promote a local admin first).
-
-## Implementation outline
-
-1. **Schema** — migration 0019 (users.auth_source + oidc_subject, sessions.id_token, oidc_state table)
-2. **Config** — extend `internal/server/config` with the OIDC block + env-var overrides; load JWKS lazily
-3. **Discovery + JWKS** — small helper that fetches `<issuer>/.well-known/openid-configuration` once at startup, caches `authorization_endpoint`, `token_endpoint`, `end_session_endpoint`, `jwks_uri`. JWKS refreshed on first failed verification.
-4. **Login start handler** — `/auth/oidc/login`
-5. **Callback handler** — `/auth/oidc/callback`, with the four claim-resolution branches
-6. **Logout handler augmentation** — branch on `auth_source`
-7. **Login form rejection** — local-user password form rejects OIDC accounts
-8. **State cleanup** — extend the alert engine's existing cleanup tick
-9. **UI** — `oidc` chip on users list, disabled fields on edit-form for OIDC users, login page SSO button + error banner
-10. **Tests** — config parse tests; happy-path callback test using a fake IdP (httptest server with a hand-rolled discovery doc + JWKS); username-collision test; no-role-match test; logout test
-11. **Sweep** — full Playwright walk against an actual IdP (Authelia in a Docker container) — admin gets in via OIDC, role mapping works, logout redirects through IdP, OIDC user can't password-login
-
-## Test strategy
-
-The IdP is the hard part to test cleanly. Two layers:
-
- **Unit / integration tests** use a stub OIDC provider built into the test harness — `httptest.Server` exposing `.well-known/openid-configuration`, a token endpoint that signs minted JWTs with a test ECDSA key, and a JWKS endpoint serving the public key. This covers every code path without a real IdP. Pattern: each test mints its own claims and runs the callback against the stub.
- **Smoke env** runs against a real Authelia container (existing `compose.smoke.yaml`-style file or one-liner `docker run`) for the final sweep — confirms the discovery doc isn't being misread, real JWT verification works, real `end_session_endpoint` redirect works.
-
-## Out of scope (deferred)
-
- **Multi-provider** support (`providers:` array)
 **Back-channel logout** (OpenID Connect Back-Channel Logout 1.0) — schema isn't blocked from adding it later
- **UI-driven role mapping** (config-only in v1)
- **Refresh tokens / mid-session role re-evaluation** — login-only refresh in v1
- **`oidc.disable_local_users`** flag — both paths stay open in v1
- **OIDC user dashboard chip / badges** beyond the small `oidc` indicator on the users list
- **Per-user "auth source" filter on the users list** — sortable headers cover most of the use case
-
-## Risks / gotchas
-
- **JWKS key rotation** — refresh on first failed verification is the standard fix; document the cache TTL (1h) in the config block.
- **Clock skew** — accept `iat`/`exp` with a 60s leeway; matches what most OIDC libraries do.
- **End-session 404 / not advertised** — degrade gracefully; just drop the session and 303 to `/login`. Don't 500 the logout because the IdP doesn't implement RP-initiated logout.
- **Username changes at the IdP** — silently keep the local username (matches our locked decision: subject is the stable key, username is display-only). Document.
 **Role claim is sometimes a string, sometimes an array, sometimes a comma-separated string** depending on IdP — normalise into `[]string` before mapping. Authelia/Keycloak emit arrays; some custom setups emit a single string or a comma-separated string; handle all three.
- **Authelia `sub` is an opaque UUID, not the username** (Authelia 4.39+ default for new clients). Don't assume `sub` is human-readable; it's stable but display value is `preferred_username` or `email`. The locked design already keys lookups on `sub` and uses `preferred_username` for the display username, so this is just a correctness note.
- **`end_session_endpoint` may not be published** (Authelia doesn't advertise it for many configs). The locked logout flow already degrades to "drop session + redirect to /login" when the discovery doc lacks it; no extra config needed.
- **Password-form bypass for OIDC users via /api/auth/login (JSON)** — same rejection rule applies, not just the HTML form.
-
-## Acceptance
-
 [ ] An OIDC user with `groups: ["rm-admins"]` (the default `role_claim`) can sign in, becomes an admin, is visible in `/settings/users` with an `oidc` chip
- [ ] Same user signing in again resolves to the same row (no duplicate)
 [ ] Same user with `groups: ["something-else"]` is denied, lands on `/login?oidc_error=no_role_match` with a banner, no row created
- [ ] OIDC user can't password-login through `/login` or `/api/auth/login`
- [ ] Admin disables an OIDC user → next OIDC login is rejected, existing session bounced (existing disable-mid-session)
- [ ] Sign out as an OIDC user → 303 to IdP's end-session URL (when advertised); no end-session URL → 303 to `/login`
- [ ] OIDC config absent → password login works exactly as today (zero behavioural change)
- [ ] Username collision: a local `alice` exists, OIDC user with `preferred_username=alice` and a different `sub` → blocked at sign-in with the clear error page
- [ ] Last-admin guard refuses to demote the only enabled admin even if the IdP's role mapping says otherwise
- [ ] All existing tests pass; new test suite covers the four claim-resolution branches and logout
@@ -1,229 +0,0 @@
-# P5-03 — Docker-only release path
-
-**Status:** approved 2026-05-05. Pivots P5-03 away from `goreleaser` +
-binary archives toward a single Docker image as the only public
-deliverable.
-
-## Goal
-
-One artifact per tag: the `restic-manager` server image, multi-arch
-(linux amd64 + arm64), published to the Gitea container registry of
-this self-hosted instance. The image bakes in cross-compiled agent
-binaries (linux amd64, linux arm64, windows amd64), the install
-scripts, and the systemd unit at a read-only image path. The running
-server distributes those agents and scripts via its existing
-`/agent/binary` and `/install/*` endpoints; operators on N hosts never
-download a release artifact directly.
-
-Source builds via `make build` remain a first-class path for anyone
-who wants binaries.
-
-## Non-goals
-
- Standalone binary archives (`.tar.gz`, `.zip`) on the release page.
- darwin / windows-arm64 agent targets — neither is service-tested.
- `goreleaser`. Not used.
- `cosign`, `SBOM`, `in-toto`, `minisign`. Re-promote when we ship
-  binaries outside an image (Phase 6 candidate).
- GHCR / GitHub mirror. Single source of truth = Gitea.
-
-## Decisions captured (with one-line rationale)
-
-| ID | Decision | Why |
-|----|----------|-----|
-| D1 | One artifact: server Docker image | Architecture already routes agent distribution through the server (`/agent/binary`); release surface should mirror that. |
-| D2 | Trigger: `tag-push` (`v*.*.*`) **plus** `workflow_dispatch` | Tag for real cuts; dispatch for snapshot iteration without polluting tag history. |
-| D3 | Build matrix: linux amd64+arm64 server image; agent cross-compiles for linux amd64+arm64+windows amd64 | Mirrors the existing CI build matrix; nothing ships that hasn't been service-tested. |
-| D4 | Image-baked, separate path (`/opt/restic-manager/dist/`); HTTP handler reads `<DataDir>/...` first, falls back to `/opt/...` | Volume stays purely operator state; image content is immutable per tag; eliminates the smoke-env "stale agent" footgun in production. |
-| D5 | Tag fan-out: `vX.Y.Z`, `X.Y`, `X`, `latest` — but `latest` is held back until `v1.0.0` | Standard rolling-minor pattern; pre-1.0 forces explicit pinning. |
-| D6 | Snapshot tag: `:snapshot-<shortsha>`, never moves `latest` | Operator can never accidentally pull an unblessed build. |
-| D7 | Version embedding via `-ldflags`: `main.version`, `main.commit`, `main.date` on both `cmd/server` and `cmd/agent` | Server already had `version`; add `commit`/`date` to both for parity and traceability. |
-| D8 | Registry: Gitea container registry on this instance, under `<host>/<owner>/restic-manager` | One source of truth, no external creds. |
-| D9 | Integrity: a `SHA256SUMS` file + the manifest digest in the release notes; nothing else | Image is the unit of trust; pull-by-digest is the verification primitive. |
-| D10 | P1-31 (signed binaries) stays deferred | Re-promote the day we ship binaries outside an image. |
-
-## Image layout
-
-Multi-stage Dockerfile (extends today's `deploy/Dockerfile.server`):
-
-```
-build stage (golang:1.25-alpine):
-    cross-compile cmd/server for $TARGETARCH (linux)
-    cross-compile cmd/agent for linux/amd64
-    cross-compile cmd/agent for linux/arm64
-    cross-compile cmd/agent for windows/amd64
-    (CGO_ENABLED=0 throughout — pure-Go SQLite)
-
-final stage (gcr.io/distroless/static-debian12:nonroot):
-    /usr/local/bin/restic-manager-server                   (matches image arch)
-    /opt/restic-manager/dist/agent-binaries/
-        restic-manager-agent-linux-amd64
-        restic-manager-agent-linux-arm64
-        restic-manager-agent-windows-amd64.exe
-    /opt/restic-manager/dist/install/
-        install.sh
-        install.ps1
-        restic-manager-agent.service
-```
-
-`/opt/restic-manager/dist/` is owned by `root:root`, mode `0755` for
-directories, `0755` for `install.sh` (script must be executable when
-the install path uses `curl ... | sh` semantics) and `0644` for the
-unit file and `install.ps1`. The agent binaries are mode `0755`.
-
-`<DataDir>` keeps holding only operator state: `restic-manager.db`,
-`secret.key`, `secrets.enc`, `audit/`, `tls/`. Nothing the image
-owns gets written into the volume.
-
-## Server-side handler change
-
-`internal/server/http/agent_assets.go` today reads from
-`<DataDir>/agent-binaries/<name>` and `<DataDir>/install/<name>`.
-
-Change: if the file isn't present in `<DataDir>`, fall back to
-`/opt/restic-manager/dist/<subpath>/<name>`. The fallback path is a
-new server-config field defaulted to `/opt/restic-manager/dist`,
-overridable via `RM_BUNDLED_ASSETS_DIR` for tests and source-build
-deployments. If neither path resolves, return 404 (existing
-`binary_not_published` / `not_found` body unchanged).
-
-This means:
- A fresh container without any operator-staged overrides serves the
-  baked-in agents. No first-run setup needed.
- An operator can still drop a custom-built agent into
-  `<DataDir>/agent-binaries/` to override the image's copy (handy for
-  pre-release agent testing without rebuilding the server image).
- Source-build dev (`bin/restic-manager-server` running out of the
-  working tree) still works exactly as today — the fallback dir is
-  configurable, and the `<DataDir>` path remains the primary lookup.
-
-Tests cover four cases: (a) DataDir hit, (b) fallback hit, (c) DataDir
-hit shadows fallback, (d) neither — 404.
-
-## Versioning
-
-Both binaries grow `commit` and `date` ldflag-targets next to the
-existing `version`:
-
-```go
-var (
-    version = "dev"
-    commit  = "none"
-    date    = "unknown"
-)
-```
-
-Dockerfile gains `ARG VERSION`, `ARG COMMIT`, `ARG DATE`, all
-`""`-defaulted; the `go build` line passes them via `-ldflags`. The
-release workflow fills them from `${{ gitea.ref_name }}`,
-`${{ gitea.sha }}`, and a UTC ISO-8601 timestamp.
-
-Snapshot builds (workflow_dispatch) compute
-`VERSION=0.0.0-snapshot-${SHORTSHA}` and tag the image as
-`:snapshot-${SHORTSHA}` only. They never touch `latest` or any
-`vX.Y.Z` tag.
-
-## Workflow (`.gitea/workflows/release.yml`)
-
-```yaml
-name: Release
-
-on:
-  push:
-    tags: ['v[0-9]+.[0-9]+.[0-9]+']
-  workflow_dispatch:
-
-env:
-  IMAGE: gitea.dcglab.co.uk/${{ gitea.repository }}
-
-jobs:
-  image:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-      - uses: docker/setup-qemu-action@v3
-      - uses: docker/setup-buildx-action@v3
-      - uses: docker/login-action@v3
-        with:
-          registry: gitea.dcglab.co.uk
-          username: ${{ gitea.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-      - name: compute tags
-        id: meta
-        run: |
-          # tag-push  → :vX.Y.Z, :X.Y, :X (only :latest if X >= 1)
-          # dispatch  → :snapshot-<shortsha>
-          ...
-      - uses: docker/build-push-action@v6
-        with:
-          context: .
-          file: deploy/Dockerfile.server
-          platforms: linux/amd64,linux/arm64
-          push: true
-          tags: ${{ steps.meta.outputs.tags }}
-          build-args: |
-            VERSION=${{ steps.meta.outputs.version }}
-            COMMIT=${{ gitea.sha }}
-            DATE=${{ steps.meta.outputs.date }}
-```
-
-The `compute tags` step:
-
- For `push:tags`: extract `vMAJOR.MINOR.PATCH`. Always emit
-  `:vMAJOR.MINOR.PATCH`, `:MAJOR.MINOR`, `:MAJOR`. Emit `:latest`
-  only when `MAJOR >= 1`.
- For `workflow_dispatch`: emit `:snapshot-<shortsha>`. Nothing else.
-
-No release-asset upload step yet — the GHCR-equivalent registry push
-is the deliverable. A future iteration may attach a `SHA256SUMS` file
-to a Gitea release object once `tea release create` is wired in;
-that's not in scope for the first cut.
-
-## Tests / verification
-
-1. `go vet ./...` (CLAUDE.md rule, runs locally pre-commit).
-2. `go test ./internal/server/http/...` covers the new fallback
-   logic.
-3. Local manual smoke: `docker build -f deploy/Dockerfile.server .`
-   produces an image; `docker run --rm <image>` starts the server;
-   `curl 'http://127.0.0.1:8080/agent/binary?os=linux&arch=amd64'`
-   serves bytes; `curl http://127.0.0.1:8080/install/install.sh`
-   serves the script.
-4. Release workflow itself is exercised on first tag-push; until
-   then, `workflow_dispatch` is the smoke test.
-
-## Operator-facing changes
-
- `README.md` install snippet becomes
-  `docker run -v rm-data:/var/lib/restic-manager ...
-  gitea.dcglab.co.uk/<owner>/restic-manager:vX.Y.Z`. Pre-1.0
-  releases are pinned by exact tag; no `:latest` is published.
- The CLAUDE.md "restage" block is dev-only (smoke env runs the
-  server out of `bin/`). Production users on the image never see
-  it.
- `RM_BUNDLED_ASSETS_DIR` is documented in the server config
-  reference (defaults to `/opt/restic-manager/dist`).
-
-## Risks / footguns
-
- **Image size growth.** Three agent binaries (~15-20 MB each
-  stripped) add ~50 MB. Acceptable; we're already shipping a
-  distroless server. Watch the trajectory once Phase 4 alerting is
-  in.
- **Dockerfile cross-compile multiplies build time** on the runner.
-  Pure-Go means each leg is just a `go build`; total stage time
-  should stay under 60s on the self-hosted runner.
- **`ARG VERSION` leakage.** The current Dockerfile already accepts
-  `ARG VERSION=dev`; we're tightening, not loosening.
- **Operator overriding `<DataDir>/agent-binaries/<name>`** with a
-  stale binary will silently shadow the image's copy. Documented in
-  the server config reference; this is a feature (lets operators
-  hot-patch a pre-release agent) not a bug.
-
-## Out of scope (tracked for follow-up)
-
- Cosign / SBOM / in-toto provenance — defer to Phase 6 with the rest
-  of the supply-chain hardening.
- GHCR mirror — defer until P5-01 docs site goes public.
- `tea release create` integration — pending until we have something
-  worth attaching beyond the image digest.
@@ -1,448 +0,0 @@
-# P6-01 + P6-02 — Agent self-update + fleet update
-
-Status: design approved 2026-05-06.
-Scope: P6-01 (agent self-update mechanism) and P6-02 (dashboard
-version reporting + fleet update UI). One spec, one branch — the
-two tasks are tightly coupled (P6-02 is the operator surface for
-the mechanism P6-01 ships).
-
-## 1. Background
-
-P5-03 pivoted release distribution to a single multi-arch server
-Docker image, with cross-compiled agent binaries baked under
-`/opt/restic-manager/dist/agent-binaries/` and served via
-`GET /agent/binary?os=…&arch=…`. The plumbing already does
-dual-path lookup: `<DataDir>/agent-binaries/<name>` overrides the
-image-baked copy, so an operator can hot-patch a pre-release agent
-without rebuilding the image.
-
-That makes the server the natural distribution point for agent
-upgrades. "Update agent" collapses to "re-fetch from your own
-server" — no apt repo, no Chocolatey, no third-party signing infra,
-and version pinning is automatic because the server only ever
-serves the agent that matches its own release.
-
-This spec wires up the update mechanism end-to-end and the
-operator surface that drives it.
-
-## 2. Decisions
-
-| # | Decision | Rationale |
-|---|----------|-----------|
-| 1 | Operator-driven only — no auto-update | Matches the rest of the app's job-dispatch model; avoids "bad release upgrades every host instantly"; auto-update can be added later as a setting flip if asked |
-| 2 | Linux: just exit, let systemd restart. Windows: detached helper script. | Linux supports rename-while-open; Windows holds an exclusive lock on the running .exe |
-| 3 | M1 (keep `agent.old` on disk) + M2 (rolling fleet update with halt-on-fail). Skip M3 (auto-rollback watchdog). | M1 is ~5 lines, M2 falls naturally out of P6-02's UI, M3 is a lot of plumbing for "shipped a binary that doesn't start" |
-| 4 | Skip sha256 digest verification for v1 | TLS already covers the corruption-in-transit threat; image-tampering is image-build's problem, not the agent's |
-| 5 | Exact string version match for "out of date" | With server-bundled binaries there's exactly one canonical version per server image — anything else is out of date by definition |
-| 6 | WS envelope only, no `restic-manager-agent update` CLI subcommand | YAGNI; no concrete consumer; the underlying logic is reusable when one appears |
-
-## 3. Wire protocol
-
-### 3.1 Server → agent: `command.update`
-
-```
-{
-  "type": "command.update",
-  "id": "<envelope id>",
-  "payload": {
-    "job_id": "<ulid>"
-  }
-}
-```
-
-No `os` / `arch` / `version` in the payload — the agent already
-knows its own build target and fetches from its configured server
-URL via the existing `/agent/binary` handler. Including a target
-version would also tempt the agent into version-comparison logic;
-keep that on the server side.
-
-### 3.2 Job lifecycle (server-driven)
-
-The agent has limited ability to report on its own restart, so the
-job state machine lives on the server:
-
- **queued → running** when the envelope is dispatched.
- **running → succeeded** when the agent re-hellos with
-  `agent_version == server.Version` after dispatch and within
-  the timeout. Audit `host.update_succeeded`.
- **running → failed (timeout)** if 90 seconds pass without a
-  hello carrying the matching version. Audit `host.update_failed`.
-  Raise alert kind `update_failed` (reuses P3-05 alert engine).
-  This single transition covers both the "agent never came back
-  at all" case and the "agent came back at the wrong version"
-  case — see §6.2 for why we don't transition immediately on a
-  mismatched hello.
-
-Migration 0021 widens the `jobs.kind` CHECK constraint to include
-`update`. Same column-level pattern as 0012 (where 0012 added
-`restore` and `diff`).
-
-## 4. Agent-side execution
-
-Lives in `internal/agent/updater`, build-tag split:
-
- `updater_unix.go` — Linux + any future POSIX target.
- `updater_windows.go` — Windows-only, uses the helper-script
-  pattern.
- `updater.go` — shared `Update(ctx, serverURL string) error`
-  interface and the HTTP fetch/streaming code (no platform deps).
-
-### 4.1 Linux flow
-
-1. Receive `command.update` from the WS dispatcher.
-2. Resolve own binary via `os.Executable()` and `filepath.Abs`.
-   Refuse if the resolved path is `/proc/self/exe` or otherwise
-   not a real file (defence in depth — shouldn't happen under
-   systemd, but bail loudly if it does).
-3. `GET <server>/agent/binary?os=linux&arch=<runtime.GOARCH>`,
-   stream to `<binary>.new` in the same directory as the running
-   binary (same filesystem ⇒ atomic rename).
-4. fsync the file, `os.Chmod(0755)`.
-5. Copy current binary to `<binary>.old` (overwrite if it
-   exists). M1 — one-revision rollback target.
-6. `os.Rename(<binary>.new, <binary>)`.
-7. Close the WS connection cleanly (sends close frame so the
-   server transitions the connection to `disconnected` rather
-   than waiting for the heartbeat-miss sweep).
-8. `os.Exit(0)`. Systemd's `Restart=always` (already in the unit)
-   brings up the new binary within seconds.
-
-### 4.2 Windows flow
-
-The .exe is exclusively locked by the OS while running, so steps
-5–6 above can't happen in-process. Use a detached helper:
-
-1. Steps 1–4 the same — fetch into `<binary>.exe.new`, fsync.
-2. Write `update.cmd` to a tmp path with the orchestration:
-   ```
-   timeout /t 3 /nobreak >nul
-   copy /Y "<binary>.exe" "<binary>.exe.old"
-   sc stop restic-manager-agent
-   :wait
-   sc query restic-manager-agent | find "STOPPED" >nul
-   if errorlevel 1 (timeout /t 1 /nobreak >nul & goto wait)
-   move /Y "<binary>.exe.new" "<binary>.exe"
-   sc start restic-manager-agent
-   del "%~f0"
-   ```
-3. `CreateProcess` it detached
-   (`DETACHED_PROCESS | CREATE_NO_WINDOW`, no parent handles).
-4. Close WS, `os.Exit(0)`. SCM sees clean stop and waits — does
-   *not* try to restart, because `sc stop` is the helper's job,
-   not a crash. (`Restart=always` semantics differ between
-   systemd and SCM. SCM treats clean-exit-after-stop as
-   intentional and does not auto-restart; only crashes restart.
-   That's why the helper script needs the explicit `sc start`
-   at the end.)
-
-### 4.3 Service-user assumption
-
-Both Linux (`User=root` per the existing unit) and Windows
-(`LocalSystem` by default) can write the binary path directly. If
-the agent ever moves to a non-root service user, the updater
-breaks — would need either a setuid helper or an out-of-process
-update service. Add a `// NOTE:` comment in the updater package
-flagging this; not a v1 blocker.
-
-## 5. Server build version
-
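-New package `internal/version` exposing two package-level string
-variables (they must be `var`, not `const`, so that `-ldflags -X`
-can set them at link time):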
-
-```
-package version
-
-var (
-    Version = "dev"
-    Commit  = ""
-)
-```
-
-Wired via `-ldflags` in the Makefile:
-
-```
-GO_LDFLAGS = -X gitea.dcglab.co.uk/steve/restic-manager/internal/version.Version=$(VERSION) \
-             -X gitea.dcglab.co.uk/steve/restic-manager/internal/version.Commit=$(COMMIT)
-
-VERSION := $(shell git describe --tags --always --dirty)
-COMMIT  := $(shell git rev-parse --short HEAD)
-```
-
-Both `cmd/server` and `cmd/agent` link the same package, so an
-agent's `agent_version` (sent in the hello payload, already wired
-since P1-11) is comparable byte-for-byte to the server's
-`version.Version`.
-
-`make build` already does what's needed for source builds. The
-Phase 2 work in this spec is the Docker release path — confirm
-during plan execution that `.gitea/workflows/release.yml` passes
-`VERSION` and `COMMIT` into the Docker `--build-arg` chain so the
-in-image binaries embed the same string the image is tagged with.
-If not, add the wiring.
-
-Dirty/dev builds (`v1.2.3-dirty`) won't match clean server builds,
-so every dev environment will show every host as out-of-date. This
-is acceptable — the chip is a noop in dev, real ops always run
-tagged builds.
-
-A new `GET /api/version` endpoint returns
-`{"version": "...", "commit": "..."}`. Used by the dashboard
-header tile and by `/settings/fleet-update`. Public-band — exposes
-no secrets, lets the install scripts surface it too.
-
-## 6. P6-01 server endpoints
-
-### 6.1 `POST /api/hosts/{id}/update`
-
-Admin-only. Refuses (with structured error code) when:
-
- Host is offline (`host_offline`).
- Host's `agent_version == server.Version` (`already_up_to_date`).
- An update job for this host is already running (`update_in_progress`).
-
-Happy path: creates `jobs` row with `kind=update`, dispatches
-`command.update` envelope, audit-logs `host.update_dispatched`,
-returns `{"job_id": "..."}`.
-
-UI form-post variant on `/hosts/{id}/update` returns
-`HX-Redirect` to the live job log.
-
-### 6.2 Hello handler integration
-
-The existing `onAgentHello` (P1-11) already upserts
-`agent_version`. Extend it: after the upsert, look for any
-`update` job for this host with `status='running'`. If one
-exists:
-
- `agent_version == server.Version` → mark job `succeeded`,
-  audit `host.update_succeeded`.
- `agent_version != server.Version` → leave the job running so
-  the timeout path catches it as a rollback failure (don't fail
-  immediately — gives the agent one chance to come back, restart,
-  hello again with the right version).
-
-Adds a small in-memory map of pending updates so the timeout
-goroutine knows when to give up. Persisted state lives in the
-`jobs` table; the in-memory map is just for the timer.
-
-## 7. P6-02 fleet update
-
-### 7.1 Schema
-
-Migration 0022, additive only (two new tables, no changes to
-existing tables):
-
-```
-CREATE TABLE fleet_updates (
-  id              TEXT PRIMARY KEY,
-  started_at      TEXT NOT NULL,
-  started_by_user_id TEXT NOT NULL REFERENCES users(id),
-  target_version  TEXT NOT NULL,
-  status          TEXT NOT NULL CHECK (status IN ('running','completed','halted','cancelled')),
-  current_host_id TEXT REFERENCES hosts(id),
-  halted_reason   TEXT,
-  completed_at    TEXT
-);
-
-CREATE TABLE fleet_update_hosts (
-  fleet_update_id TEXT NOT NULL REFERENCES fleet_updates(id) ON DELETE CASCADE,
-  host_id         TEXT NOT NULL REFERENCES hosts(id) ON DELETE CASCADE,
-  status          TEXT NOT NULL CHECK (status IN ('pending','running','succeeded','failed','skipped')),
-  job_id          TEXT REFERENCES jobs(id),
-  failed_reason   TEXT,
-  PRIMARY KEY (fleet_update_id, host_id)
-);
-```
-
-### 7.2 Worker loop
-
-A single in-process goroutine — at most one fleet update may run
-at a time (enforced via a `sync.Mutex` + a precondition check on
-`POST /api/fleet/update`).
-
-```
-for each pending fleet_update_hosts row in dispatch order:
-    set fleet_updates.current_host_id = row.host_id
-    set fleet_update_hosts.status = 'running'
-    if host.agent_version == server.Version:
-        # Already updated since we built the list — skip.
-        set status = 'skipped'; continue
-    if !host.online:
-        # Offline since we built the list — halt.
-        halt(reason="host went offline")
-        return
-    dispatch_update_for_host(host)  # reuses 6.1 logic
-    wait_up_to_90s_for_hello_with_matching_version()
-    if matched:
-        set status = 'succeeded'; continue
-    else:
-        set status = 'failed', failed_reason = "..."
-        halt(reason="update failed on host X")
-        return
-set fleet_updates.status = 'completed', completed_at = now
-```
-
-Halt: set `fleet_updates.status = 'halted'`, raise an alert kind
-`fleet_update_halted`, audit `fleet.update_halted` with the host
-id and reason. Subsequent hosts stay `pending` so the operator can
-see what was queued and decide whether to resume (resume = start a
-new fleet update with the still-out-of-date subset).
-
-Cancel: admin-only `POST /api/fleet-updates/{id}/cancel`. Sets
-`status='cancelled'`. The currently-dispatched host's update job
-keeps running (the agent is already mid-restart) — cancel only
-prevents the *next* host from being picked. Audit
-`fleet.update_cancelled`.
-
-### 7.3 UI surfaces
-
-**Per-host chip (host_row partial + host detail chrome):**
-
-`out of date · v1.2.2 → v1.2.3` — amber-accented, mirrors `.tag`
-token shape. Only rendered when:
-
-```
-host.agent_version != "" && host.agent_version != server.Version
-```
-
-Empty `agent_version` (host enrolled but never connected) renders
-nothing rather than "out of date" — we don't know what version
-they have.
-
-**Dashboard summary tile:**
-
-The hero strip already has tiles. Add an "Updates" tile:
-`N hosts behind` linking to `/?updates=behind` (extends NS-04's
-filter machinery — adds an `updates` query param alongside
-`status`/`repo_status`/`tag`). Hidden when N == 0.
-
-**Per-host Update button on `/hosts/{id}`:**
-
-Right-rail, admin-only. Disabled with hover tooltip when host
-offline / already up to date / update in progress. POSTs to
-`/hosts/{id}/update`, `HX-Redirect` to the live job log.
-
-**Fleet update page `/settings/fleet-update`:**
-
-Admin-only. Two states:
-
- **Idle**: lists out-of-date online hosts (table: hostname,
-  current version, target version, last seen). Big "Start rolling
-  update" button behind a typed-confirm dialog (operator types
-  the host count, e.g. `12`, to enable the button — same shape as
-  the host-delete confirm).
- **Running/halted/completed**: shows the currently-active
-  fleet_update row + per-host progress list. Polls every 3s (htmx
-  trigger conditional on `document.visibilityState === 'visible'`,
-  same pattern as the alerts page). Renders:
-  ```
-  Updated 3/12 · currently updating <hostname>
-  Halted on <hostname>: <reason> · job log →
-  ```
-
-Audit actions: `fleet.update_started`, `fleet.update_completed`,
-`fleet.update_halted`, `fleet.update_cancelled`.
-
-### 7.4 Alert engine integration
-
-P3-05's alert engine already supports kind-based registration. Add
-two new kinds:
-
- `update_failed` — per-host, raised on individual update failure.
-  Auto-resolves when the host re-hellos with the matching version.
- `fleet_update_halted` — global, raised on fleet halt. Auto-resolves
-  when a subsequent fleet update completes successfully.
-
-## 8. RBAC
-
-| Endpoint | Role |
-|----------|------|
-| `POST /api/hosts/{id}/update` | admin |
-| `POST /api/fleet/update` | admin |
-| `POST /api/fleet-updates/{id}/cancel` | admin |
-| `GET /api/fleet-updates/{id}` | admin (status polling) |
-| `GET /api/version` | public |
-
-Operator and viewer see the "out of date" chip but no update
-buttons. Mirrors the existing pattern: read affordances are
-visible to all roles, write affordances are gated.
-
-## 9. Testing
-
-### 9.1 Unit
-
- `internal/agent/updater`: fake-`/agent/binary` HTTP server +
-  tmp "running binary" file, assert post-state — binary swapped,
-  `.old` present, no leftover `.new`. Linux path only (Windows
-  helper covered by build-tag compile-only).
- `internal/server/http`: `POST /api/hosts/{id}/update` happy
-  path, refuses-when-offline, refuses-when-up-to-date,
-  refuses-when-update-in-progress, RBAC enforcement, audit row
-  written.
- Hello handler: agent reconnects with matching version after
-  `update` job dispatch → marks job `succeeded`, drops the
-  in-memory pending entry. Mismatched version → no-op (timeout
-  catches it).
- Timeout path: synthetic `update` job + 90s elapsed →
-  marks `failed`, raises alert.
- Fleet worker: table-driven over the loop's state machine —
-  success-then-success, success-then-timeout-halts,
-  cancel-mid-flight, no-online-out-of-date-hosts-completes-immediately,
-  host-disappears-from-list-mid-loop-skips.
-
-### 9.2 Smoke validation (per CLAUDE.md restage block)
-
-1. Build server + agent at version A. Restage. Enrol a host;
-   confirm `agent_version=A`.
-2. Bump version to B (`make build VERSION=B`), rebuild server
-   only, restart server. Dashboard shows host as out-of-date with
-   `A → B` chip. Updates tile reads "1 host behind".
-3. Rebuild agent at B, restage `<DataDir>/agent-binaries/`. Click
-   **Update agent** on host detail. Agent fetches, swaps, exits;
-   systemd restarts it; hello-back at B → job `succeeded`, chip
-   gone, tile clears.
-4. Rollback path: leave `<DataDir>/agent-binaries/` at A, server
-   at B, click Update — agent fetches A, swaps to A, restarts at
-   A; hello says A != B; server marks job `failed` after 90s with
-   reason "agent reconnected at version A, expected B".
-5. Fleet update: spin up two smoke hosts both out-of-date, fire
-   **Start rolling update**, watch progress page tick host 1 →
-   host 2 → completed.
-6. Halt path: replace one of the `<DataDir>/agent-binaries/`
-   files with `/bin/false`. Run fleet update. First host gets
-   broken binary, fails to come back up, fleet update halts at
-   host 1 after 90s, alert raised, host 2 left as `pending`.
-
-Step 6 validates M2 end-to-end — the rolling halt is the actual
-safety guarantee, not a nice-to-have.
-
-## 10. Out of scope
-
- sha256 digest verification (deferred — see decision 4).
- `restic-manager-agent update` CLI subcommand (deferred —
-  decision 6).
- Auto-update (deferred — decision 1).
- Auto-rollback watchdog M3 (deferred — decision 3).
- Migrating the agent off `User=root` (separate hardening track).
- Cross-version protocol-compatibility checks beyond the existing
-  `protocol_version` handshake (P1-11). If the new agent's
-  `protocol_version` is incompatible with the server, the
-  existing handshake rejects it; the update job will then
-  correctly time out and be marked failed.
-
-## 11. Migration plan
-
-1. `internal/version` package + Makefile ldflags wiring.
-2. Migration 0021 (jobs.kind widening) + 0022 (fleet_updates
-   tables).
-3. `internal/agent/updater` package, Linux first.
-4. WS envelope wiring + `command.update` dispatcher.
-5. `POST /api/hosts/{id}/update` + hello-handler integration +
-   timeout goroutine.
-6. UI: chip + per-host update button + dashboard tile + filter.
-7. Fleet update worker + page.
-8. Windows updater path.
-9. Alert engine kinds.
-10. Smoke validation per §9.2.
-
-Each step is independently testable; commits should land at each
-boundary so a failed Windows path (8) doesn't block the rest of
-the work.
@@ -1,223 +0,0 @@
-# P6-03 — Repo size trend graphs
-
-Sparkline on the dashboard host row + full chart on the host repo
-page, both showing repo growth over time. Closes the last
-operator-visibility gap in Phase 6 alongside Prometheus metrics
-(P6-04).
-
-## Goals
-
- Operators can see at a glance whether a host's repo is growing,
-  stable, or shrinking, without leaving the dashboard.
- A second screen on the repo page exposes the same data over a
-  longer window with a snapshot-count overlay so retention
-  behaviour can be eyeballed against size.
- Zero new client-side dependencies; matches the existing
-  HTMX + server-rendered idiom used everywhere else in the UI.
-
-## Non-goals
-
- No backfill of historical data. Trend lights up with whatever
-  the agents report from the day this ships.
- No per-source-group breakdown — repo-level only.
- No alerting on growth rate (deferred to a future ticket if a
-  user asks for it).
- No JSON API surface. Prometheus exposure is P6-04, separate.
-
-## Decisions taken in brainstorming
-
- **Metrics:** `total_size_bytes` (sparkline + chart) and
-  `snapshot_count` (chart only). Raw size dropped as redundant.
- **Cadence:** one row per `(host_id, UTC date)`, last-write-wins
-  per column. Bounded at ~365 rows/host/year regardless of job
-  frequency.
- **Backfill:** none. Pure forward-fill from launch day.
- **Rendering:** server-rendered inline SVG, no JS library.
- **Spans:** sparkline fixed at 30 days; chart has `30d | 90d | 1y`
-  range selector, server-rendered swap.
-
-## Schema
-
-New migration `internal/store/migrations/0023_host_repo_stats_history.sql`:
-
-```sql
-CREATE TABLE host_repo_stats_history (
-  host_id           TEXT NOT NULL REFERENCES hosts(id) ON DELETE CASCADE,
-  day               TEXT NOT NULL,        -- 'YYYY-MM-DD' UTC
-  total_size_bytes  INTEGER,              -- nullable; partial patches don't overwrite
-  snapshot_count    INTEGER,              -- nullable
-  recorded_at       TEXT NOT NULL,        -- RFC3339Nano of last write touching this row
-  PRIMARY KEY (host_id, day)
-);
-CREATE INDEX host_repo_stats_history_host_day
-  ON host_repo_stats_history(host_id, day DESC);
-```
-
-FK cascade matches every other host-scoped table; deleting a host
-through `Store.DeleteHost` (NS-01) wipes its history automatically.
-
-## Write path
-
-Hook the existing `MsgRepoStats` handler in
-`internal/server/ws/handler.go` (around line 319). After the
-existing `UpsertHostRepoStats(ctx, hostID, patch)` call, append:
-
-```go
-day := time.Now().UTC().Format("2006-01-02")
-if err := deps.Store.UpsertHostRepoStatsHistory(ctx, hostID, day, patch); err != nil {
-    slog.Warn("ws: upsert host repo stats history", "host_id", hostID, "err", err)
-}
-```
-
-A history-write failure is logged and dropped — never blocks the
-main upsert. The partial-update contract that
-`UpsertHostRepoStats` already implements is preserved at the
-history layer:
-
-```sql
-INSERT INTO host_repo_stats_history (host_id, day, total_size_bytes, snapshot_count, recorded_at)
-VALUES (?, ?, ?, ?, ?)
-ON CONFLICT(host_id, day) DO UPDATE SET
-  total_size_bytes = COALESCE(excluded.total_size_bytes, host_repo_stats_history.total_size_bytes),
-  snapshot_count   = COALESCE(excluded.snapshot_count,   host_repo_stats_history.snapshot_count),
-  recorded_at      = excluded.recorded_at;
-```
-
-This is critical: the agent's prune handler in
-`internal/agent/runner/runner.go:318` emits a stats patch that
-only carries `LastPruneAt`. Without `COALESCE`, that prune ack
-would null out a `total_size_bytes` we'd already captured from a
-backup earlier the same day.
-
-## Read path
-
-Two new helpers in `internal/store/host_repo_stats_history.go`:
-
-```go
-type RepoStatsHistoryPoint struct {
-    Day            time.Time   // 00:00:00 UTC
-    TotalSizeBytes *int64
-    SnapshotCount  *int64
-}
-
-func (s *Store) ListHostRepoStatsHistory(
-    ctx context.Context, hostID string, since time.Time,
-) ([]RepoStatsHistoryPoint, error)
-```
-
-Returns rows ordered by `day` ascending where at least one metric
-is non-null. The renderer connects available points with a
-straight line — there is no explicit gap representation. A host
-that was offline for a week shows a single segment spanning the
-gap, which is the right visual: the repo state didn't change.
-
-## Rendering
-
-New package `internal/web/sparkline`. Pure Go, no template
-dependency:
-
-```go
-type Series struct {
-    Name   string
-    Points []float64    // nil-points represented as math.NaN
-    Stroke string       // CSS color
-}
-
-func RenderSparkline(points []float64, width, height int) template.HTML
-func RenderChart(series []Series, days []time.Time, opts ChartOpts) template.HTML
-```
-
-`RenderChart` produces a 600×220 SVG with:
-
- Light horizontal gridlines (4 bands).
- Two y-axes: bytes (left, blue) and count (right, amber). Each
-  series is normalised against its own axis.
- X-axis labels at start, midpoint, and end of the window.
- Per-point `<circle>` with a `<title>` for hover tooltips —
-  accessible by default, no JS.
- Empty state: faint dashed baseline + centered "no data yet"
-  text.
-
-Sparkline is 80×20, single blue polyline, single `<title>` on the
-group element showing `"current → 30d ago"`.
-
-Two new partials:
-
- `web/templates/partials/repo_size_sparkline.html`
- `web/templates/partials/repo_size_chart.html`
-
-Both call into the renderer with the appropriate opts. No
-inline `<style>` — colours come from existing Tailwind palette
-classes already used elsewhere (`text-blue-500`, `text-amber-500`).
-
-## UI placement
-
-### Dashboard host row
-
-`web/templates/partials/host_row.html` gains one `<td>` between
-the existing "Repo size" cell and "Snapshots" cell. Width ≈ 88px.
-Cell renders the sparkline partial; if `len(points) < 2` the cell
-shows "—" centred (matches the existing no-data idiom for
-last-backup time in the same partial).
-
-The dashboard's existing 5-second htmx live-refresh
-(`hx-trigger="every 5s ..."` from NS-04) re-renders this cell
-along with the rest of the row. No extra polling.
-
-### Host repo page
-
-`web/templates/pages/host_repo.html` gains a "Trend" panel
-inserted between the existing summary panel and the maintenance
-panel. Panel contains:
-
- Range pills `30d | 90d | 1y` (anchor links with
-  `hx-get="/hosts/{id}/repo/trend?range=…"` and
-  `hx-target="#repo-trend-chart" hx-swap="outerHTML"`).
- The chart partial wrapped in `<div id="repo-trend-chart">`.
- A small legend strip below the chart.
-
-## Endpoints
-
- `GET /hosts/{id}/repo/trend?range=30d|90d|1y` — admin/operator,
-  htmx fragment, returns the chart partial. Auth reuses the
-  existing host-scoped middleware on the `/hosts/{id}` family.
-  Invalid `range` falls back to 30d.
-
-No new admin-only surface — anyone with read access to the host
-can see the trend.
-
-## Testing
-
- `internal/store/host_repo_stats_history_test.go` — upsert
-  merges partial patches without nulling; ordering; since-day
-  filter; cascade on host delete.
- `internal/web/sparkline/sparkline_test.go` — golden SVG files
-  for: empty input, single point, full 30-day series, mixed
-  null points. Goldens live under `testdata/`.
- `internal/server/http/ui_repo_test.go` — trend panel renders
-  with seeded history; range selector swaps server-side; empty
-  state.
- `internal/server/http/ui_dashboard_test.go` — host row sparkline
-  cell present and renders SVG when points exist, "—" when not.
- Smoke after build: dashboard row shows sparkline once two days
-  of data exist; repo page chart toggles cleanly between ranges.
-
-## Migration / rollout
-
- Schema migration is additive — no risk to existing tables.
- Write path is best-effort; on schema issue the main repo-stats
-  upsert is unaffected.
- No agent change required, so no fleet update needed.
-
-## Acceptance
-
- After two days of operation, the dashboard sparkline shows a
-  visible line for any host that has run a backup or
-  maintenance op on both days.
- Host repo page renders the trend panel with the snapshot-count
-  overlay; range selector switches view without a full page
-  reload.
- `go test ./...` and `go vet ./...` clean.
- Smoke env exercise: backup → sparkline updates; range pills
-  swap; FK cascade verified by deleting a host and checking the
-  history table.
@@ -1,175 +0,0 @@
-# P6-04 + P6-05 — Prometheus `/metrics` + Grafana dashboard
-
-Date: 2026-05-07
-Author: Claude (autonomous, sensible-defaults brief from operator)
-Tasks: P6-04 (M), P6-05 (S)
-
-## Problem
-
-The control plane already knows everything a backup operator needs
-to monitor — last-backup timestamp + status, repo size, snapshot
-count, agent online, open alerts, build version — but it surfaces
-those only through the dashboard HTML and a few JSON endpoints. To
-plug into the operator's existing observability stack we need a
-plain Prometheus exposition endpoint and a Grafana dashboard JSON
-that reads from it.
-
-## Goals
-
- `GET /metrics` emits standard Prometheus text-format with the
-  per-host, server, and job-duration metrics enumerated in the
-  task entry (P6-04 in `tasks.md`).
- Endpoint is opt-in and gated by a bearer token and/or an IP
-  allow-list — never publicly readable by default.
- No new third-party dependency (`prometheus/client_golang` is not
-  pulled in). The exposition format is small and stable enough to
-  emit by hand; matches the repo's "no Tailwind/Node" style.
- Sample Grafana dashboard committed to the repo so a stranger can
-  drop it into a Grafana instance and get a working view.
-
-## Non-goals
-
- OpenMetrics. The legacy Prometheus text format (with
-  `# HELP`/`# TYPE` lines) is what every prom server still parses
-  and what every example online demonstrates — pick the boring
-  option.
- Pushgateway or remote-write integration.
- Per-job metric cardinality (no `job_id` labels — that would
-  make the histogram explode).
- Alerting rules. Operators already have alerts inside
-  restic-manager (P3-05); duplicating them in Prometheus is a
-  YAGNI hazard. The dashboard is read-only.
-
-## Auth
-
-Two switches, both off by default. If neither is set the route
-isn't mounted at all (404 from the chi router) — this avoids any
-accidental "wide-open scrape endpoint" deployment.
-
-| env var | type | meaning |
-| --- | --- | --- |
-| `RM_METRICS_TOKEN` | string | If set, callers must send `Authorization: Bearer <token>`. Compared with `crypto/subtle.ConstantTimeCompare`. |
-| `RM_METRICS_TRUSTED_CIDR` | comma-CIDR | If set, callers must hit from a source IP inside one of these CIDRs. Reuses the existing `RM_TRUSTED_PROXY` semantics for honouring `X-Forwarded-For`. |
-
-If both are set, both must pass (AND, not OR — a token leak doesn't grant access from outside the network, and a trusted-network compromise doesn't grant access without the token). If only one is set, that one alone gates access.
-
-YAML overlay mirrors env: `metrics_token`, `metrics_trusted_cidrs`.
-
-## Metrics
-
-All metric names are prefixed `rm_`. Help text is concise.
-
-### Per-host gauges (one row per `host_id`)
-
-```
-rm_host_agent_online{host_id,host}                     1 if status='online' else 0
-rm_host_last_backup_timestamp_seconds{host_id,host}    unix seconds; omitted if no backup yet
-rm_host_last_backup_success{host_id,host}              1 if last_backup_status='succeeded' else 0; omitted if no backup yet
-rm_host_repo_size_bytes{host_id,host}                  total_size from latest repo stats; omitted if unknown
-rm_host_snapshot_count{host_id,host}                   integer
-rm_host_open_alerts{host_id,host}                      count of open + un-resolved alerts attached to this host
-rm_host_repo_status{host_id,host,status}               1, with status ∈ {unknown,ready,init_failed} (info-style, exactly one row per host)
-```
-
-`host` label is `hosts.name` for human readability; `host_id` is
-the stable ULID for joining across renames.
-
-### Server gauges
-
-```
-rm_hosts_total                              count of hosts (excludes pending)
-rm_hosts_online                             count of hosts with status='online'
-rm_active_alerts{severity}                  count of open alerts by severity ∈ {info,warning,critical}
-rm_build_info{version,commit,go_version}    always 1; pure label-bag for joining
-```
-
-### Job duration histogram
-
-```
-rm_job_duration_seconds_bucket{kind,status,le=...}
-rm_job_duration_seconds_sum{kind,status}
-rm_job_duration_seconds_count{kind,status}
-```
-
-`kind` ∈ {backup,forget,prune,check,unlock,restore,diff,init,update}
-(every JobKind we currently dispatch). `status` ∈
-{succeeded,failed,cancelled}. Buckets cover the realistic range —
-short admin commands (unlock, init) finish in seconds; backups can
-be hours:
-
-```
-1, 5, 30, 60, 300, 1800, 3600, 21600, 86400, +Inf
-   (1s   5s  30s  1m   5m  30m   1h    6h   24h)
-```
-
-In-memory only. Reset on process restart — operators who want
-durable history scrape into Prom and let it persist.
-
-## Architecture
-
-New package `internal/server/metrics`:
-
- `Registry` — owns the histogram state (sync.Mutex + map keyed by
-  `kind+status`). `ObserveJob(kind, status string, dur time.Duration)`
-  is the only mutator. Lookups via `Snapshot()` are read-only and
-  copy out.
- `Render(w io.Writer, snapshot Snapshot)` — emits the full
-  exposition body. The snapshot is supplied by the HTTP handler
-  pulling from `Store` on each scrape; the package itself has no
-  store dependency, which keeps it trivially unit-testable.
-
-New file `internal/server/http/metrics.go`:
-
- `handleMetrics(w, r)` — auth check (bearer + CIDR), pull current
-  fleet snapshot from `Store`, ask `metrics.Render` to emit.
- Auth helper `authoriseMetricsScrape(r)` — pure function over
-  request + config; tested directly.
-
-Wiring:
-
- `cmd/server` constructs the `metrics.Registry` once and threads
-  it into both `Deps` (for the HTTP layer) and `ws.HandlerDeps`
-  (so the job-finished branch can call `ObserveJob`).
- `ws/handler.go` MsgJobFinished branch grows a single line:
-  `if deps.Metrics != nil { deps.Metrics.ObserveJob(job.Kind, p.Status, p.FinishedAt.Sub(job.StartedAt)) }`.
-  Falls back gracefully if the registry was never wired (tests).
-
-Route registration in `server.go`:
-
-```go
-if s.deps.Cfg.MetricsAuthEnabled() {
-    r.Get("/metrics", s.handleMetrics)
-}
-```
-
-## Cardinality + cost
-
-Per scrape: O(hosts) gauge rows + O(kinds × statuses × buckets) histogram rows. For a 100-host fleet that's ~700 host rows (7 gauges × 100 hosts) + at most ~270 histogram bucket rows (9 kinds × 3 statuses × 10 buckets, plus up to 54 `_sum`/`_count` rows) per scrape — well under any practical limit. The store reads we issue per scrape are: `ListHosts` (already exists, one query), `ListAlerts` filtered by open status (one query), `GetHostRepoStats` already projected onto `Host` via `repo_size_bytes`. No N+1.
-
-A 10s scrape interval against a 100-host fleet is cheap: each scrape is a couple of indexed sqlite reads + a small string render. We're nowhere near a place where caching the snapshot would be worthwhile.
-
-## Documentation (P6-05)
-
- `docs/prometheus.md` — sibling to the existing `docs/reverse-proxy.md`. Sections: enabling the endpoint (env vars), Prometheus scrape config snippet (with bearer + tls), the metric reference table (copy-pasted from this spec), the dashboard import instructions.
- `deploy/grafana/restic-manager-dashboard.json` — Grafana 11+ dashboard JSON. Single Prometheus datasource variable, six panels:
-  1. **Fleet status** — stat panel showing `rm_hosts_online / rm_hosts_total` + a sparkline.
-  2. **Open alerts** — stat panel by severity (`sum by (severity) (rm_active_alerts)`).
-  3. **Hosts** — table of `host`, `online`, `last_backup` (relative time via `time() - rm_host_last_backup_timestamp_seconds`), `repo_size`, `snapshots`.
-  4. **Repo size over time** — time series, one line per host, `rm_host_repo_size_bytes`.
-  5. **Backups failing** — time series counting hosts where `rm_host_last_backup_success == 0`.
-  6. **Job duration p95** — `histogram_quantile(0.95, sum by (kind, le) (rate(rm_job_duration_seconds_bucket[1h])))` over a 1h window.
-
-Dashboard is committed as plain JSON; an operator imports it through the Grafana UI ("+ → Import → upload JSON file") or Grafana provisioning.
-
-## Testing
-
- Unit tests for `metrics.Render` against a fixed snapshot — golden-file style. Pin the exact line ordering (sorted by metric name + label set) so diffs stay tractable.
- Unit tests for `metrics.Registry.ObserveJob` — concurrent writes, bucket boundary correctness, snapshot independence.
- Handler tests for `/metrics` covering: no auth configured → 404; token configured + missing → 401; token configured + correct → 200 + body sniff; CIDR configured + wrong source → 401; CIDR configured + right source → 200; both configured → require both.
- End-to-end smoke verification deferred to manual operator walk-through; full Playwright pass is P5-06's job.
-
-## Out of scope, explicitly
-
- Per-job latency tracking with `job_id` labels (cardinality bomb).
- Restore-specific metrics (P3 surfaces are still settling).
- Histograms keyed on host (kind × status × buckets × hosts is a Prom anti-pattern).
- Auto-discovery / file-SD generators for Prometheus.
@@ -1,42 +0,0 @@
-# Build a Linux container that runs the restic-manager agent against a
-# sibling rest-server in the e2e compose stack. Used only by tests
-# (e2e/compose.e2e.yml + .gitea/workflows/e2e.yml).
-#
-# Two stages:
-#   1. golang:alpine to build the agent binary.
-#   2. alpine:3.20 with the `restic` package + the built binary.
-#
-# Base images are pinned by version tag (not by digest) for CI reproducibility.
-
-FROM golang:1.25-alpine AS build
-WORKDIR /src
-
-ENV CGO_ENABLED=0 \
-    GOFLAGS="-trimpath"
-
-COPY go.mod go.sum* ./
-RUN go mod download
-
-COPY . .
-ARG VERSION=e2e
-RUN go build -ldflags="-s -w -X gitea.dcglab.co.uk/steve/restic-manager/internal/version.Version=${VERSION}" \
-        -o /out/restic-manager-agent ./cmd/agent
-
-FROM alpine:3.20
-RUN apk add --no-cache restic ca-certificates curl
-COPY --from=build /out/restic-manager-agent /usr/local/bin/restic-manager-agent
-
-# Agents normally run as root because backup paths often need it. The
-# e2e fixture only backs up paths under /source (the volume the compose
-# stack mounts), which we own, so this container would tolerate a
-# non-root user — but staying root keeps parity with the production install.
-USER root
-
-# The agent needs a writable directory for its config + secrets store.
-RUN mkdir -p /etc/restic-manager /var/lib/restic-manager-agent
-ENV RM_AGENT_CONFIG=/etc/restic-manager/agent.yaml
-
-# The compose entrypoint sets the announce URL via env.
-COPY e2e/agent-entrypoint.sh /usr/local/bin/entrypoint.sh
-RUN chmod +x /usr/local/bin/entrypoint.sh
-ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
@@ -1,21 +0,0 @@
-# Playwright runner for the e2e suite. Built and run by
-# e2e/compose.e2e.yml so the test process sits on the same docker
-# network as the server, agent, and rest-server. The previous setup
-# ran Playwright on the workflow runner host and reached the server
-# via 127.0.0.1:8080; that fails on Gitea's act-style runners
-# because the workflow steps execute inside a runner container,
-# not on the host where compose publishes its ports.
-
-FROM mcr.microsoft.com/playwright:v1.59.1-jammy
-
-WORKDIR /work
-
-# Install npm deps in a separate layer keyed off package.json so
-# changes to specs don't bust the dep cache.
-COPY e2e/playwright/package.json /work/package.json
-RUN npm install --no-audit --no-fund
-
-COPY e2e/playwright/ /work/
-
-ENV CI=1
-ENTRYPOINT ["npx", "playwright", "test"]
@@ -1,27 +0,0 @@
-#!/bin/sh
-# Entrypoint for the e2e agent container.
-#
-# Three states:
-#   1. Already enrolled (agent.yaml has an `agent_token:` line): run the agent.
-#   2. Token supplied via $RM_ENROL_TOKEN: enrol then run.
-#   3. Otherwise: announce against $RM_SERVER and wait for an admin to
-#      accept us. The announce flow blocks until accepted, then drops
-#      straight into the normal run loop, so this is the test-friendly
-#      path.
-set -eu
-
-CFG="${RM_AGENT_CONFIG:-/etc/restic-manager/agent.yaml}"
-SERVER="${RM_SERVER:?set RM_SERVER}"
-
-if [ -f "$CFG" ] && grep -q '^agent_token:' "$CFG"; then
-    exec restic-manager-agent -config "$CFG"
-fi
-
-if [ -n "${RM_ENROL_TOKEN:-}" ]; then
-    exec restic-manager-agent -config "$CFG" \
-        -enroll-server "$SERVER" \
-        -enroll-token "$RM_ENROL_TOKEN"
-fi
-
-# Announce-and-approve: blocks until an admin accepts, then runs.
-exec restic-manager-agent -config "$CFG" -enroll-server "$SERVER"
@@ -1,108 +0,0 @@
-# End-to-end test stack — used by .gitea/workflows/e2e.yml and by
-# operators who want to run the Playwright suite locally.
-#
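-# Three core services (plus the profile-gated playwright runner and the one-shot source-fixture init container defined below):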
-#   * server      — restic-manager built from the working tree
-#   * agent       — restic-manager agent built from the working tree
-#                   (announces; Playwright accepts it during the test)
-#   * rest-server — the actual restic backend, sibling of the agent
-#
-# Run from the repo root:
-#   docker compose -f e2e/compose.e2e.yml up --build --abort-on-container-exit
-
-services:
-  rest-server:
-    image: restic/rest-server:0.13.0
-    environment:
-      DATA_DIR: /data
-      OPTIONS: "--no-auth"
-    volumes:
-      - rest-data:/data
-    networks: [rmnet]
-
-  server:
-    build:
-      context: ..
-      dockerfile: deploy/Dockerfile.server
-      args:
-        VERSION: e2e
-    environment:
-      RM_LISTEN: ":8080"
-      RM_DATA_DIR: "/data"
-      RM_BASE_URL: "http://server:8080"
-      RM_COOKIE_SECURE: "false"
-      # Trust every source address for the metrics endpoint during the
-      # test, so one of the Playwright assertions can scrape it.
-      RM_METRICS_TRUSTED_CIDR: "0.0.0.0/0"
-    volumes:
-      - server-data:/data
-    ports:
-      - "127.0.0.1:8080:8080"
-    healthcheck:
-      test: ["CMD", "/usr/local/bin/restic-manager-server", "--version"]
-      interval: 2s
-      timeout: 2s
-      retries: 30
-    networks: [rmnet]
-
-  agent:
-    build:
-      context: ..
-      dockerfile: e2e/Dockerfile.agent
-      args:
-        VERSION: e2e
-    environment:
-      RM_SERVER: "http://server:8080"
-    depends_on:
-      - server
-    volumes:
-      # Source paths the agent backs up. The source-fixture service
-      # pre-populates this volume with a few files so snapshots aren't empty.
-      - source-data:/source
-      - agent-config:/etc/restic-manager
-      - agent-state:/var/lib/restic-manager-agent
-    networks: [rmnet]
-
-  # Playwright test runner. Profile-gated so `compose up` doesn't
-  # start it; CI runs it via `compose run --rm playwright`. Lives on
-  # rmnet so it can reach the server via its compose-network DNS
-  # name rather than depending on host port-publish (which doesn't
-  # work on Gitea's container-based runners).
-  playwright:
-    profiles: [test]
-    build:
-      context: ..
-      dockerfile: e2e/Dockerfile.playwright
-    environment:
-      RM_BASE_URL: "http://server:8080"
-      RM_BOOTSTRAP_TOKEN: "${RM_BOOTSTRAP_TOKEN:-}"
-    volumes:
-      - ./playwright/playwright-report:/work/playwright-report
-      - ./playwright/test-results:/work/test-results
-    depends_on:
-      - server
-      - agent
-    networks: [rmnet]
-
-  # One-shot init container that drops a couple of files into the
-  # source volume so backups have something to snapshot.
-  source-fixture:
-    image: alpine:3.20
-    command: >
-      sh -c 'mkdir -p /source && echo "hello world" > /source/hello.txt &&
-             echo "another file" > /source/two.txt && sleep 0.2'
-    volumes:
-      - source-data:/source
-    networks: [rmnet]
-    restart: "no"
-
-volumes:
-  server-data:
-  rest-data:
-  source-data:
-  agent-config:
-  agent-state:
-
-networks:
-  rmnet:
-    driver: bridge
@@ -1,14 +0,0 @@
-{
-  "name": "restic-manager-e2e",
-  "version": "0.0.0",
-  "private": true,
-  "type": "module",
-  "scripts": {
-    "test": "playwright test",
-    "test:headed": "playwright test --headed",
-    "test:debug": "PWDEBUG=1 playwright test"
-  },
-  "devDependencies": {
-    "@playwright/test": "1.59.1"
-  }
-}
@@ -1,31 +0,0 @@
-import { defineConfig, devices } from '@playwright/test';
-
-// Single-target Chromium config: the e2e suite is narrow (smoke
-// the production-shaped flow against the docker-compose stack).
-// Cross-browser matrix doesn't add signal — what we're verifying is
-// the server's HTML and the agent's WebSocket handshake, neither of
-// which depends on browser engine.
-
-const baseURL = process.env.RM_BASE_URL ?? 'http://127.0.0.1:8080';
-
-export default defineConfig({
-    testDir: './tests',
-    timeout: 60_000,
-    expect: { timeout: 10_000 },
-    fullyParallel: false,
-    retries: process.env.CI ? 1 : 0,
-    workers: 1,
-    reporter: [['list'], ['html', { open: 'never' }]],
-    use: {
-        baseURL,
-        trace: 'retain-on-failure',
-        screenshot: 'only-on-failure',
-        video: 'retain-on-failure',
-    },
-    projects: [
-        {
-            name: 'chromium',
-            use: { ...devices['Desktop Chrome'] },
-        },
-    ],
-});
@@ -1,114 +0,0 @@
-// Helpers used by every test. The shape favours the JSON API for
-// reads + accept/dispatch (deterministic, easy to assert) and the
-// browser for human-facing surfaces (login form, dashboard render).
-
-import { APIRequestContext, expect, Page } from '@playwright/test';
-
-export const baseURL = process.env.RM_BASE_URL ?? 'http://127.0.0.1:8080';
-
-export interface HostJSON {
-    id: string;
-    name: string;
-    status: string;
-    last_backup_status?: string;
-}
-
-export async function readBootstrapToken(): Promise<string> {
-    const tok = process.env.RM_BOOTSTRAP_TOKEN;
-    if (!tok) {
-        throw new Error('RM_BOOTSTRAP_TOKEN not set — the harness scrapes it from server logs');
-    }
-    return tok;
-}
-
-export async function bootstrapAdmin(
-    request: APIRequestContext,
-    {
-        username = 'admin',
-        password = 'e2e-test-password-1234',
-    }: { username?: string; password?: string } = {},
-): Promise<{ username: string; password: string }> {
-    const token = await readBootstrapToken();
-    const res = await request.post(`${baseURL}/api/bootstrap`, {
-        data: { token, username, password },
-    });
-    if (!res.ok() && res.status() !== 409 /* already bootstrapped */) {
-        throw new Error(`bootstrap: ${res.status()} ${await res.text()}`);
-    }
-    return { username, password };
-}
-
-export async function loginViaUI(page: Page, username: string, password: string): Promise<void> {
-    await page.goto(`${baseURL}/login`);
-    await page.locator('#login-username').fill(username);
-    await page.locator('#login-password').fill(password);
-    await Promise.all([
-        page.waitForURL(new RegExp(`^${baseURL}/?$`)),
-        page.locator('form[action="/login"] button[type="submit"]').click(),
-    ]);
-}
-
-/**
- * Waits (up to 60s) for a pending host's inline accept form to become
- * visible, then extracts the pending-id from the form's action URL.
- */
-export async function waitForPendingHostID(page: Page): Promise<string> {
-    const formLocator = page.locator('form[action^="/api/pending-hosts/"][action$="/accept"]').first();
-    await expect(formLocator).toBeVisible({ timeout: 60_000 });
-    const action = await formLocator.getAttribute('action');
-    if (!action) throw new Error('pending host form has no action attribute');
-    const m = action.match(/\/api\/pending-hosts\/([^/]+)\/accept/);
-    if (!m) throw new Error(`unexpected action URL: ${action}`);
-    return m[1];
-}
-
-export async function acceptPending(
-    request: APIRequestContext,
-    cookie: string,
-    pendingID: string,
-    repo: { url: string; username?: string; password: string },
-): Promise<void> {
-    const res = await request.post(`${baseURL}/api/pending-hosts/${pendingID}/accept`, {
-        headers: { cookie, 'content-type': 'application/json' },
-        data: {
-            repo_url: repo.url,
-            repo_username: repo.username ?? '',
-            repo_password: repo.password,
-        },
-    });
-    if (!res.ok()) {
-        throw new Error(`accept: ${res.status()} ${await res.text()}`);
-    }
-}
-
-export async function listHosts(request: APIRequestContext, cookie: string): Promise<HostJSON[]> {
-    const res = await request.get(`${baseURL}/api/hosts`, { headers: { cookie } });
-    if (!res.ok()) throw new Error(`list hosts: ${res.status()} ${await res.text()}`);
-    const body = (await res.json()) as { items?: HostJSON[]; hosts?: HostJSON[] };
-    return body.items ?? body.hosts ?? [];
-}
-
-export async function waitForHostStatus(
-    request: APIRequestContext,
-    cookie: string,
-    matcher: (h: HostJSON) => boolean,
-    timeoutMs = 60_000,
-): Promise<HostJSON> {
-    const deadline = Date.now() + timeoutMs;
-    let last: HostJSON | undefined;
-    while (Date.now() < deadline) {
-        const hosts = await listHosts(request, cookie);
-        const hit = hosts.find(matcher);
-        if (hit) return hit;
-        last = hosts[0];
-        await new Promise((r) => setTimeout(r, 1_000));
-    }
-    throw new Error(`waitForHostStatus: timeout. Last seen: ${JSON.stringify(last)}`);
-}
-
-export async function getSessionCookie(page: Page): Promise<string> {
-    const cookies = await page.context().cookies();
-    const c = cookies.find((c) => c.name === 'rm_session');
-    if (!c) throw new Error('rm_session cookie not set after login');
-    return `${c.name}=${c.value}`;
-}
@@ -1,80 +0,0 @@
-// End-to-end smoke: bootstrap → accept pending host → run backup → see succeeded.
-//
-// The compose stack stands up a server, a sibling rest-server, and an
-// agent in announce-and-approve mode. This test drives the operator
-// path through the UI (login + dashboard) and the API
-// (accept + run-now + poll for terminal) — UI for the human surfaces,
-// API for the deterministic ones.
-
-import { test, expect } from '@playwright/test';
-import {
-    baseURL,
-    bootstrapAdmin,
-    loginViaUI,
-    waitForPendingHostID,
-    acceptPending,
-    waitForHostStatus,
-    getSessionCookie,
-} from './lib/server';
-
-test.describe('smoke: enrol-via-announce → backup', () => {
-    test('happy path completes in under a minute', async ({ page, request }) => {
-        const { username, password } = await bootstrapAdmin(request);
-        await loginViaUI(page, username, password);
-
-        // Dashboard renders.
-        await expect(page.locator('main')).toContainText(/host|fleet|pending/i, { timeout: 10_000 });
-
-        // Pending host appears (the agent container has been
-        // announcing since startup).
-        const pendingID = await waitForPendingHostID(page);
-        const cookie = await getSessionCookie(page);
-
-        // Accept with the rest-server creds. compose's rest-server runs
-        // --no-auth, so any credentials work; restic still demands a
-        // password to encrypt the repo.
-        await acceptPending(request, cookie, pendingID, {
-            url: 'rest:http://rest-server:8000/',
-            password: 'e2e-repo-password',
-        });
-
-        // Wait for the host to come online + auto-init to land.
-        const onlineHost = await waitForHostStatus(
-            request, cookie,
-            (h) => h.status === 'online',
-            60_000,
-        );
-        expect(onlineHost.id).toBeTruthy();
-
-        // Trigger a backup via the UI form-post (HX-Redirect to /jobs/{id}).
-        await page.goto(`${baseURL}/hosts/${onlineHost.id}`);
-        await Promise.all([
-            page.waitForURL(/\/jobs\//),
-            page.locator('form[action$="/run-backup"] button[type="submit"]').first().click(),
-        ]);
-
-        // Wait for the host's last_backup_status to flip to 'succeeded'.
-        // The job page itself is harder to assert on (it uses
-        // server-pushed updates and a reload-on-finish pattern); the
-        // host record is the source of truth and is what the dashboard
-        // surfaces.
-        const finishedHost = await waitForHostStatus(
-            request, cookie,
-            (h) => h.id === onlineHost.id && h.last_backup_status === 'succeeded',
-            120_000,
-        );
-        expect(finishedHost.last_backup_status).toBe('succeeded');
-    });
-});
-
-test.describe('smoke: scrape /metrics', () => {
-    test('metrics endpoint exposes the host gauge', async ({ request }) => {
-        // Compose sets RM_METRICS_TRUSTED_CIDR=0.0.0.0/0 so the
-        // endpoint is open to the test runner.
-        const res = await request.get(`${baseURL}/metrics`);
-        expect(res.status()).toBe(200);
-        const body = await res.text();
-        expect(body).toContain('rm_hosts_total');
-        expect(body).toContain('rm_build_info{');
-    });
-});
@@ -2,14 +2,10 @@ package runner

 import (
 	"context"
-	"errors"
 	"os"
-	"os/exec"
 	"path/filepath"
 	"sync"
-	"syscall"
 	"testing"
-	"time"

 	"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/restic"
@@ -47,22 +43,13 @@ func (s *fakeSender) snapshot() []api.Envelope {
 // setupScript writes a shell script (without shebang) to a temp dir,
 // names it "restic", makes it executable, and returns the path.
 //
-// Writes to "<path>.tmp" then renames into place. The rename is the
-// usual guard against ETXTBSY: under -race + many t.Parallel tests,
-// a fork-from-another-goroutine can inherit the writable fd from
+// Writes to "<path>.tmp" then renames into place. The rename is what
+// makes this race-free: under -race + many t.Parallel tests, a
+// fork-from-another-goroutine can inherit the writable fd from
 // os.WriteFile before close completes, and exec'ing the file then
-// returns ETXTBSY ("text file busy"). The renamed dirent points at
-// an inode that has no writable fd open anywhere — exec is safe on
-// a vanilla filesystem.
-//
-// On overlayfs (every job that runs inside a `container:` block on
-// our Gitea runner), the rename can briefly leak ETXTBSY anyway —
-// the upper layer's "writable inode" bookkeeping lags the userspace
-// close. To make the helper deterministic across environments, we
-// probe-exec the file with a benign argument until exec succeeds,
-// then return. Each script body has a `case "$1" in ... esac` shape
-// where unknown args fall through to a clean exit, so the probe is
-// a no-op from the test's point of view.
+// returns ETXTBSY ("text file busy"). Once the rename lands, the
+// final path is a fresh dirent pointing at an inode that has no
+// writable fd open anywhere — exec is safe.
 func setupScript(t *testing.T, body string) string {
 	t.Helper()
 	dir := t.TempDir()
@@ -74,21 +61,7 @@ func setupScript(t *testing.T, body string) string {
 	if err := os.Rename(tmp, final); err != nil {
 		t.Fatalf("setupScript: rename: %v", err)
 	}
-
-	deadline := time.Now().Add(3 * time.Second)
-	for {
-		err := exec.Command(final, "__rm_probe__").Run()
-		if err == nil {
-			return final
-		}
-		if !errors.Is(err, syscall.ETXTBSY) {
-			t.Fatalf("setupScript: probe exec: %v", err)
-		}
-		if time.Now().After(deadline) {
-			t.Fatalf("setupScript: %s still ETXTBSY after 3s", final)
-		}
-		time.Sleep(10 * time.Millisecond)
-	}
+	return final
 }

 // firstEnvOfType returns the first envelope with the given type, or
@@ -1,100 +0,0 @@
-// Package updater carries the agent's self-update logic.
-//
-// The flow is operator-driven: the server dispatches a command.update
-// WS envelope, the agent fetches a fresh binary from the server's
-// /agent/binary endpoint, atomic-renames it over the running binary
-// (Linux) or hands off to a detached helper script (Windows), and
-// exits cleanly so the service manager restarts under the new
-// binary. See docs/superpowers/specs/2026-05-06-p6-01-02-...
-//
-// Platform-specific code is build-tagged into updater_unix.go /
-// updater_windows.go. This file holds the shared HTTP fetch + path
-// helpers + the test seam.
-package updater
-
-import (
-	"context"
-	"fmt"
-	"io"
-	"net/http"
-	"os"
-	"path/filepath"
-	"runtime"
-	"time"
-)
-
-// fetch downloads the new binary into <binaryPath>.new, fsyncs, chmods.
-// Returns the path of the staged file (always binaryPath + ".new").
-func fetch(ctx context.Context, serverURL, binaryPath string) (string, error) {
-	url := fmt.Sprintf("%s/agent/binary?os=%s&arch=%s", serverURL, runtime.GOOS, runtime.GOARCH)
-	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
-	if err != nil {
-		return "", err
-	}
-	c := &http.Client{Timeout: 5 * time.Minute}
-	res, err := c.Do(req)
-	if err != nil {
-		return "", err
-	}
-	defer func() { _ = res.Body.Close() }()
-	if res.StatusCode != http.StatusOK {
-		return "", fmt.Errorf("agent binary fetch: %s", res.Status)
-	}
-
-	stagePath := binaryPath + ".new"
-	f, err := os.OpenFile(stagePath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o755)
-	if err != nil {
-		return "", err
-	}
-	if _, copyErr := io.Copy(f, res.Body); copyErr != nil {
-		_ = f.Close()
-		_ = os.Remove(stagePath)
-		return "", copyErr
-	}
-	if syncErr := f.Sync(); syncErr != nil {
-		_ = f.Close()
-		_ = os.Remove(stagePath)
-		return "", syncErr
-	}
-	if closeErr := f.Close(); closeErr != nil {
-		_ = os.Remove(stagePath)
-		return "", closeErr
-	}
-	if err := os.Chmod(stagePath, 0o755); err != nil {
-		_ = os.Remove(stagePath)
-		return "", err
-	}
-	return stagePath, nil
-}
-
-// resolveOwnBinary returns the absolute path of the running binary.
-// It rejects a result of /proc/self/exe (which os.Executable can return
-// on some systems) because that path cannot be used as a rename target.
-func resolveOwnBinary() (string, error) {
-	p, err := os.Executable()
-	if err != nil {
-		return "", err
-	}
-	abs, err := filepath.Abs(p)
-	if err != nil {
-		return "", err
-	}
-	if abs == "/proc/self/exe" {
-		return "", fmt.Errorf("cannot resolve own binary path (/proc/self/exe)")
-	}
-	return abs, nil
-}
-
-// UpdateForTest is the platform-neutral test seam. In production the
-// platform-specific Update fetches, swaps, then exits the process.
-// UpdateForTest stops short of the exit so unit tests can assert on
-// file state.
-func UpdateForTest(serverURL, binaryPath string) error {
-	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
-	defer cancel()
-	stage, err := fetch(ctx, serverURL, binaryPath)
-	if err != nil {
-		return err
-	}
-	return swap(stage, binaryPath)
-}
@@ -1,87 +0,0 @@
-//go:build !windows
-
-package updater
-
-import (
-	"bytes"
-	"io"
-	"net/http"
-	"net/http/httptest"
-	"os"
-	"path/filepath"
-	"runtime"
-	"testing"
-)
-
-// TestUpdate_LinuxAtomicSwap stages a fake "running binary" file, runs
-// UpdateForTest against a fake /agent/binary server, and asserts that
-// the binary was swapped, .old preserves the previous bytes, and .new
-// was renamed away.
-func TestUpdate_LinuxAtomicSwap(t *testing.T) {
-	tmp := t.TempDir()
-	binPath := filepath.Join(tmp, "agent")
-	if err := os.WriteFile(binPath, []byte("OLD"), 0o755); err != nil {
-		t.Fatal(err)
-	}
-	newBytes := []byte("NEW BINARY CONTENTS")
-
-	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-		if r.URL.Path != "/agent/binary" {
-			http.NotFound(w, r)
-			return
-		}
-		gotOS, gotArch := r.URL.Query().Get("os"), r.URL.Query().Get("arch")
-		if gotOS != runtime.GOOS || gotArch != runtime.GOARCH {
-			t.Errorf("query mismatch: got os=%s arch=%s want %s/%s",
-				gotOS, gotArch, runtime.GOOS, runtime.GOARCH)
-		}
-		_, _ = io.Copy(w, bytes.NewReader(newBytes))
-	}))
-	defer srv.Close()
-
-	if err := UpdateForTest(srv.URL, binPath); err != nil {
-		t.Fatalf("update: %v", err)
-	}
-
-	got, err := os.ReadFile(binPath)
-	if err != nil {
-		t.Fatal(err)
-	}
-	if string(got) != string(newBytes) {
-		t.Fatalf("binary contents: got %q want %q", got, newBytes)
-	}
-	old, err := os.ReadFile(binPath + ".old")
-	if err != nil {
-		t.Fatalf("agent.old missing: %v", err)
-	}
-	if string(old) != "OLD" {
-		t.Fatalf("agent.old contents: got %q want %q", old, "OLD")
-	}
-	if _, err := os.Stat(binPath + ".new"); !os.IsNotExist(err) {
-		t.Fatalf("agent.new should be absent after swap, got err=%v", err)
-	}
-}
-
-// TestUpdate_FetchHTTPError surfaces the server's status when the
-// binary is not published for this os/arch.
-func TestUpdate_FetchHTTPError(t *testing.T) {
-	tmp := t.TempDir()
-	binPath := filepath.Join(tmp, "agent")
-	if err := os.WriteFile(binPath, []byte("OLD"), 0o755); err != nil {
-		t.Fatal(err)
-	}
-
-	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-		http.Error(w, `{"error":"binary_not_published"}`, http.StatusNotFound)
-	}))
-	defer srv.Close()
-
-	err := UpdateForTest(srv.URL, binPath)
-	if err == nil {
-		t.Fatal("expected error, got nil")
-	}
-	got, _ := os.ReadFile(binPath)
-	if string(got) != "OLD" {
-		t.Fatalf("binary should not have changed, got %q", got)
-	}
-}
@@ -1,73 +0,0 @@
-//go:build !windows
-
-package updater
-
-import (
-	"context"
-	"fmt"
-	"io"
-	"log/slog"
-	"os"
-	"time"
-)
-
-// Update fetches the new binary, swaps it in, then exits so systemd
-// restarts the process under the new binary. The caller should close
-// the WS connection cleanly (so the server transitions the host to
-// disconnected immediately rather than waiting for the heartbeat
-// sweep) before invoking.
-//
-// Service-user assumption: the agent runs as root under the
-// systemd-shipped unit, which can write the binary path directly.
-// If the agent ever moves to a non-root service user, this breaks —
-// would need a setuid helper or an out-of-process update service.
-func Update(ctx context.Context, serverURL string) error {
-	binPath, err := resolveOwnBinary()
-	if err != nil {
-		return err
-	}
-	stage, err := fetch(ctx, serverURL, binPath)
-	if err != nil {
-		return err
-	}
-	if err := swap(stage, binPath); err != nil {
-		return err
-	}
-	slog.Info("agent self-update: binary swapped, exiting for systemd restart",
-		"binary", binPath)
-	// Give logger / WS close-frame a moment to flush, then exit.
-	time.Sleep(200 * time.Millisecond)
-	os.Exit(0)
-	return nil // unreachable
-}
-
-// swap copies the running binary to <bin>.old (M1 — keep one revision
-// back for hand-rolled rollback), then atomic-renames the staged
-// binary into place. Linux supports rename-while-open so this works
-// even though the running process holds the source open.
-func swap(stagePath, binPath string) error {
-	src, err := os.Open(binPath)
-	if err != nil {
-		return fmt.Errorf("open running binary: %w", err)
-	}
-	defer func() { _ = src.Close() }()
-	dst, err := os.OpenFile(binPath+".old", os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o755)
-	if err != nil {
-		return fmt.Errorf("open .old: %w", err)
-	}
-	if _, err := io.Copy(dst, src); err != nil {
-		_ = dst.Close()
-		return fmt.Errorf("copy to .old: %w", err)
-	}
-	if err := dst.Sync(); err != nil {
-		_ = dst.Close()
-		return err
-	}
-	if err := dst.Close(); err != nil {
-		return err
-	}
-	if err := os.Rename(stagePath, binPath); err != nil {
-		return fmt.Errorf("rename .new over running binary: %w", err)
-	}
-	return nil
-}
@@ -1,73 +0,0 @@
-//go:build windows
-
-package updater
-
-import (
-	"context"
-	"fmt"
-	"log/slog"
-	"os"
-	"os/exec"
-	"path/filepath"
-	"syscall"
-	"time"
-)
-
-// helperScript is rendered with fmt.Sprintf, args order:
-//
-//	%[1]s — running binary path (source for the .old copy)
-//	%[2]s — .old path
-//	%[3]s — staged .new path
-//	%[4]s — running binary path (rename target)
-const helperScript = `@echo off
-timeout /t 3 /nobreak >nul
-copy /Y "%[1]s" "%[2]s"
-sc stop restic-manager-agent
-:wait
-sc query restic-manager-agent | find "STOPPED" >nul
-if errorlevel 1 (timeout /t 1 /nobreak >nul & goto wait)
-move /Y "%[3]s" "%[4]s"
-sc start restic-manager-agent
-del "%%~f0"
-`
-
-// Update on Windows can't overwrite the running .exe in-process
-// (exclusive file lock), so we stage the new binary, write a small
-// detached helper script that waits, stops the service, swaps the
-// binary, and starts the service, then exit cleanly. SCM treats
-// clean exits after sc stop as intentional and does not auto-restart;
-// the helper's final sc start handles that.
-func Update(ctx context.Context, serverURL string) error {
-	binPath, err := resolveOwnBinary()
-	if err != nil {
-		return err
-	}
-	stage, err := fetch(ctx, serverURL, binPath)
-	if err != nil {
-		return err
-	}
-	helperPath := filepath.Join(filepath.Dir(binPath), "agent-update.cmd")
-	body := fmt.Sprintf(helperScript, binPath, binPath+".old", stage, binPath)
-	if err := os.WriteFile(helperPath, []byte(body), 0o755); err != nil {
-		return err
-	}
-	cmd := exec.Command("cmd.exe", "/c", helperPath)
-	cmd.SysProcAttr = &syscall.SysProcAttr{
-		HideWindow:    true,
-		CreationFlags: 0x00000008 | 0x08000000, // DETACHED_PROCESS | CREATE_NO_WINDOW
-	}
-	if err := cmd.Start(); err != nil {
-		return err
-	}
-	slog.Info("agent self-update: helper spawned, exiting cleanly",
-		"binary", binPath, "helper", helperPath)
-	time.Sleep(200 * time.Millisecond)
-	os.Exit(0)
-	return nil // unreachable
-}
-
-// swap is unused on Windows — the helper script does the swap.
-// Defined to satisfy the build (UpdateForTest references it).
-func swap(_, _ string) error {
-	return fmt.Errorf("updater.swap not implemented on Windows; use the helper script via Update")
-}
@@ -1,63 +0,0 @@
-package alert
-
-import (
-	"context"
-	"fmt"
-	"log/slog"
-	"time"
-
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/notification"
-)
-
-// Alert-kind constants for P6 self-update flows.
-const (
-	// KindUpdateFailed is raised when an agent fails to come back with
-	// the expected version after a command.update dispatch (timeout or
-	// version-mismatch). Resolved by a subsequent matching hello.
-	KindUpdateFailed = "update_failed"
-
-	// KindFleetUpdateHalted is raised when the fleet-update worker
-	// stops mid-run because a host failed to update or went offline.
-	// Host-less alert (system-scoped). Manually resolved by an admin.
-	KindFleetUpdateHalted = "fleet_update_halted"
-)
-
-// RaiseUpdateFailed records a per-host update failure. dedupKey is the
-// hostID so a re-dispatch on the same host touches the existing alert
-// rather than spawning a duplicate.
-func (e *Engine) RaiseUpdateFailed(ctx context.Context, hostID, jobID, reason string, when time.Time) {
-	msg := fmt.Sprintf("Agent update failed (job %s): %s", jobID, reason)
-	e.raiseAndNotify(ctx, hostID, KindUpdateFailed, hostID, "warning", msg, when)
-}
-
-// ResolveUpdateFailed clears any open update_failed alert for hostID.
-// Called from the WS hello path when the agent reconnects with the
-// target version.
-func (e *Engine) ResolveUpdateFailed(ctx context.Context, hostID string, when time.Time) {
-	e.resolveAndNotify(ctx, hostID, KindUpdateFailed, hostID, when)
-}
-
-// RaiseFleetUpdateHalted is host-less — the fleet update is a
-// system-level concept. We persist it via the dedicated host-less
-// alert path so the alerts table's host_id column carries NULL.
-func (e *Engine) RaiseFleetUpdateHalted(ctx context.Context, fleetUpdateID, reason string, when time.Time) {
-	msg := fmt.Sprintf("Fleet update %s halted: %s", fleetUpdateID, reason)
-	id, didRaise, err := e.store.RaiseOrTouchSystem(ctx, KindFleetUpdateHalted, fleetUpdateID, "warning", msg, when)
-	if err != nil {
-		slog.Warn("alert: raise fleet_update_halted", "fu_id", fleetUpdateID, "err", err)
-		return
-	}
-	if !didRaise {
-		return
-	}
-	go e.hub.Dispatch(ctx, notification.Payload{
-		Event:    notification.EventRaised,
-		AlertID:  id,
-		Severity: "warning",
-		Kind:     KindFleetUpdateHalted,
-		HostID:   "",
-		HostName: "",
-		Message:  msg,
-		RaisedAt: when,
-	})
-}
@@ -63,7 +63,6 @@ const (
 	JobUnlock  JobKind = "unlock"
 	JobRestore JobKind = "restore"
 	JobDiff    JobKind = "diff"
-	JobUpdate  JobKind = "update"
 )

 // JobStatus is the lifecycle state of a job.
@@ -362,14 +361,13 @@ type ConfigUpdatePayload struct {
 	BandwidthDownKBps *int `json:"bandwidth_down_kbps,omitempty"`
 }

-// CommandUpdatePayload carries no operational data — the agent
-// already knows its own os/arch and fetches from its configured
-// server URL via /agent/binary. JobID is the server-issued id of
-// the update job; the agent echoes it on log.stream lines so the
-// live job log captures pre-restart progress, then either exits
-// (Linux) or hands off to a detached helper script (Windows).
-type CommandUpdatePayload struct {
-	JobID string `json:"job_id"`
+// AgentUpdateAvailablePayload — informational only; the agent does
+// NOT self-update. See spec.md §4.2 for the package-manager-based
+// update model.
+type AgentUpdateAvailablePayload struct {
+	LatestVersion string `json:"latest_version"`
+	PackageURL    string `json:"package_url"` // apt repo / choco source
+	Changelog     string `json:"changelog,omitempty"`
 }

 // TreeListRequestPayload is the body of a tree.list RPC. Used by the
@@ -29,12 +29,12 @@ const (

 // Server → agent message types.
 const (
-	MsgCommandRun    MessageType = "command.run"
-	MsgCommandCancel MessageType = "command.cancel"
-	MsgScheduleSet   MessageType = "schedule.set"
-	MsgConfigUpdate  MessageType = "config.update"
-	MsgCommandUpdate MessageType = "command.update"
-	MsgTreeList      MessageType = "tree.list" // sync RPC: list a snapshot's children
+	MsgCommandRun       MessageType = "command.run"
+	MsgCommandCancel    MessageType = "command.cancel"
+	MsgScheduleSet      MessageType = "schedule.set"
+	MsgConfigUpdate     MessageType = "config.update"
+	MsgAgentUpdateAvail MessageType = "agent.update.available"
+	MsgTreeList         MessageType = "tree.list" // sync RPC: list a snapshot's children
 )

 // Envelope is the framing for every WS message in either direction.
@@ -41,24 +41,6 @@ type Config struct {
 	// DataDir. Source-build deployments can override via
 	// RM_BUNDLED_ASSETS_DIR.
 	BundledAssetsDir string `yaml:"bundled_assets_dir"`
-
-	// MetricsToken, if set, gates the /metrics scrape endpoint
-	// behind an `Authorization: Bearer <token>` check (constant-time
-	// compare). When neither this nor MetricsTrustedCIDRs is set,
-	// the route is not mounted at all (the endpoint is opt-in).
-	MetricsToken string `yaml:"metrics_token"`
-
-	// MetricsTrustedCIDRs, if non-empty, gates /metrics so only
-	// callers from these networks may scrape. ANDed with
-	// MetricsToken when both are set.
-	MetricsTrustedCIDRs []string `yaml:"metrics_trusted_cidrs"`
-}
-
-// MetricsAuthEnabled reports whether the operator has opted into
-// exposing the Prometheus scrape endpoint by configuring at least
-// one auth gate.
-func (c Config) MetricsAuthEnabled() bool {
-	return c.MetricsToken != "" || len(c.MetricsTrustedCIDRs) > 0
 }

 // Load resolves config in this order:
@@ -111,19 +93,6 @@ func Load(yamlPath string) (Config, error) {
 	if v, ok := os.LookupEnv("RM_BUNDLED_ASSETS_DIR"); ok {
 		c.BundledAssetsDir = v
 	}
-	if v, ok := os.LookupEnv("RM_METRICS_TOKEN"); ok {
-		c.MetricsToken = v
-	}
-	if v, ok := os.LookupEnv("RM_METRICS_TRUSTED_CIDR"); ok {
-		parts := strings.Split(v, ",")
-		c.MetricsTrustedCIDRs = c.MetricsTrustedCIDRs[:0]
-		for _, p := range parts {
-			p = strings.TrimSpace(p)
-			if p != "" {
-				c.MetricsTrustedCIDRs = append(c.MetricsTrustedCIDRs, p)
-			}
-		}
-	}
 	if v, ok := os.LookupEnv("RM_TRUSTED_PROXY"); ok {
 		// Comma-separated CIDRs; allow whitespace for readability.
 		parts := strings.Split(v, ",")
@@ -168,10 +137,5 @@ func (c *Config) validate() error {
 			return fmt.Errorf("config: RM_TRUSTED_PROXY entry %q is not a valid CIDR: %w", cidr, err)
 		}
 	}
-	for _, cidr := range c.MetricsTrustedCIDRs {
-		if _, err := netip.ParsePrefix(cidr); err != nil {
-			return fmt.Errorf("config: RM_METRICS_TRUSTED_CIDR entry %q is not a valid CIDR: %w", cidr, err)
-		}
-	}
 	return nil
 }
@@ -98,45 +98,6 @@ func TestCookieSecureDefaultAndOverride(t *testing.T) {
 	}
 }

-func TestMetricsAuthGates(t *testing.T) {
-	t.Setenv("RM_LISTEN", ":8080")
-	t.Setenv("RM_DATA_DIR", "/tmp/x")
-
-	c, err := Load("")
-	if err != nil {
-		t.Fatalf("load: %v", err)
-	}
-	if c.MetricsAuthEnabled() {
-		t.Errorf("metrics endpoint should be off by default")
-	}
-
-	t.Setenv("RM_METRICS_TOKEN", "s3cr3t-token-with-enough-bytes")
-	t.Setenv("RM_METRICS_TRUSTED_CIDR", "10.0.0.0/8, 192.168.1.0/24")
-	c, err = Load("")
-	if err != nil {
-		t.Fatalf("load: %v", err)
-	}
-	if c.MetricsToken != "s3cr3t-token-with-enough-bytes" {
-		t.Errorf("token: %q", c.MetricsToken)
-	}
-	if got := c.MetricsTrustedCIDRs; len(got) != 2 || got[0] != "10.0.0.0/8" || got[1] != "192.168.1.0/24" {
-		t.Errorf("cidrs: %v", got)
-	}
-	if !c.MetricsAuthEnabled() {
-		t.Errorf("MetricsAuthEnabled should be true")
-	}
-}
-
-func TestMetricsTrustedCIDRRejectsGarbage(t *testing.T) {
-	t.Setenv("RM_LISTEN", ":8080")
-	t.Setenv("RM_DATA_DIR", "/tmp/x")
-	t.Setenv("RM_METRICS_TRUSTED_CIDR", "garbage")
-
-	if _, err := Load(""); err == nil {
-		t.Fatal("expected validation error, got nil")
-	}
-}
-
 func writeFile(path string, body []byte) error {
 	return writeFileImpl(path, body)
 }
@@ -1,221 +0,0 @@
-// Package fleetupdate drives a rolling, sequential agent self-update
-// over a list of hosts. One worker goroutine per Start() call (gated
-// at the store layer to at-most-one-running-fleet-update).
-package fleetupdate
-
-import (
-	"context"
-	"errors"
-	"fmt"
-	"log/slog"
-	"time"
-
-	"github.com/oklog/ulid/v2"
-
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
-)
-
-// Hub is the slim "is this host connected?" surface.
-type Hub interface {
-	Connected(hostID string) bool
-}
-
-// Dispatcher sends one command.update envelope. The implementer also
-// creates the jobs row, writes audit, and registers with the update
-// watcher. Pre-checks are the dispatcher's responsibility — the worker
-// passes through whatever error it returns.
-type Dispatcher interface {
-	DispatchUpdate(ctx context.Context, hostID string, actorUserID string) (jobID string, code string, err error)
-}
-
-// AlertRaiser is the slim view of the alert engine's host-less raise
-// path. Used to emit fleet_update_halted on first failure.
-type AlertRaiser interface {
-	RaiseFleetUpdateHalted(ctx context.Context, fleetUpdateID, reason string, when time.Time)
-}
-
-// Worker is the long-lived fleet-update orchestrator. There is at most
-// one *running* fleet update at a time (enforced by the store).
-type Worker struct {
-	store  *store.Store
-	hub    Hub
-	disp   Dispatcher
-	alerts AlertRaiser
-
-	// targetVersion is the version every dispatched agent is expected
-	// to come back with. Captured at Start time to avoid drift.
-	targetVersion string
-
-	// pollPeriod controls the cadence at which the worker re-reads the
-	// host row to check for the version transition. Exposed for tests.
-	pollPeriod time.Duration
-	// hostTimeout bounds how long the worker waits for one host to
-	// reach the target version before halting.
-	hostTimeout time.Duration
-}
-
-// NewWorker builds an unstarted worker. targetVersion is set on each
-// Start call; the values here are defaults.
-func NewWorker(st *store.Store, hub Hub, disp Dispatcher, alerts AlertRaiser) *Worker {
-	return &Worker{
-		store:       st,
-		hub:         hub,
-		disp:        disp,
-		alerts:      alerts,
-		pollPeriod:  1 * time.Second,
-		hostTimeout: 95 * time.Second,
-	}
-}
-
-// Start creates the parent + child rows, then spawns the per-host
-// worker goroutine. Returns the new fleet_update_id on success.
-// store.ErrFleetUpdateRunning bubbles up unchanged.
-func (w *Worker) Start(ctx context.Context, userID, targetVersion string, hostIDs []string) (string, error) {
-	if userID == "" || targetVersion == "" {
-		return "", errors.New("fleetupdate: userID and targetVersion required")
-	}
-	if len(hostIDs) == 0 {
-		return "", errors.New("fleetupdate: at least one host required")
-	}
-	fuID := ulid.Make().String()
-	now := time.Now().UTC()
-	if err := w.store.CreateFleetUpdate(ctx, store.FleetUpdate{
-		ID:              fuID,
-		StartedAt:       now,
-		StartedByUserID: userID,
-		TargetVersion:   targetVersion,
-		Status:          "running",
-	}, hostIDs); err != nil {
-		return "", err
-	}
-
-	// The goroutine outlives the request that started it; carry a
-	// detached context so an HTTP-handler ctx cancel doesn't abort
-	// the long roll.
-	bg := context.WithoutCancel(ctx)
-	go w.run(bg, fuID, userID, targetVersion)
-	return fuID, nil
-}
-
-// Cancel marks the fleet update cancelled. The running goroutine
-// observes the new status on its next pre-check and exits without
-// dispatching further hosts. The currently-dispatched job is left to
-// finish on its own — cancelling agent-side is out of scope for v1.
-func (w *Worker) Cancel(ctx context.Context, fuID string) error {
-	return w.store.CancelFleetUpdate(ctx, fuID, time.Now().UTC())
-}
-
-// run is the per-host loop. Halts on first failure; emits one alert
-// on transition.
-func (w *Worker) run(ctx context.Context, fuID, userID, targetVersion string) {
-	w.targetVersion = targetVersion
-
-	for {
-		// Check the parent row's status — picks up Cancel.
-		fu, err := w.store.ActiveFleetUpdate(ctx)
-		if err != nil {
-			slog.Warn("fleetupdate: read active", "fu_id", fuID, "err", err)
-			return
-		}
-		if fu == nil || fu.ID != fuID {
-			// Cancelled, halted, or completed externally. Done.
-			return
-		}
-
-		pending, err := w.store.ListPendingFleetUpdateHosts(ctx, fuID)
-		if err != nil {
-			slog.Warn("fleetupdate: list pending", "fu_id", fuID, "err", err)
-			return
-		}
-		if len(pending) == 0 {
-			now := time.Now().UTC()
-			if err := w.store.CompleteFleetUpdate(ctx, fuID, now); err != nil {
-				slog.Warn("fleetupdate: complete", "fu_id", fuID, "err", err)
-			}
-			return
-		}
-
-		next := pending[0]
-		w.processHost(ctx, fuID, userID, next)
-	}
-}
-
-// processHost handles one host slot. Marks it skipped, succeeded, or
-// failed (and halts the fleet on failure).
-func (w *Worker) processHost(ctx context.Context, fuID, userID string, slot store.FleetUpdateHost) {
-	hostID := slot.HostID
-	_ = w.store.SetFleetUpdateCurrentHost(ctx, fuID, hostID)
-
-	// Pre-flight: re-read the host. The dispatch path repeats most of
-	// these checks but doing them up-front lets us emit the right
-	// per-host status (skipped vs failed) without consuming a job row.
-	host, err := w.store.GetHost(ctx, hostID)
-	if err != nil || host == nil {
-		_ = w.store.SetFleetUpdateHostStatus(ctx, fuID, hostID, "skipped", "host not found", "")
-		return
-	}
-	if host.AgentVersion != "" && host.AgentVersion == w.targetVersion {
-		_ = w.store.SetFleetUpdateHostStatus(ctx, fuID, hostID, "skipped", "already at target version", "")
-		return
-	}
-	if !w.hub.Connected(hostID) {
-		reason := fmt.Sprintf("host went offline: %s", hostID)
-		_ = w.store.SetFleetUpdateHostStatus(ctx, fuID, hostID, "failed", reason, "")
-		w.halt(ctx, fuID, reason)
-		return
-	}
-
-	// Dispatch.
-	_ = w.store.SetFleetUpdateHostStatus(ctx, fuID, hostID, "running", "", "")
-	jobID, code, err := w.disp.DispatchUpdate(ctx, hostID, userID)
-	if err != nil || code != "" {
-		reason := dispatchErrorReason(code, err)
-		_ = w.store.SetFleetUpdateHostStatus(ctx, fuID, hostID, "failed", reason, jobID)
-		w.halt(ctx, fuID, reason)
-		return
-	}
-
-	// Poll until the host's recorded agent_version matches target, or
-	// timeout.
-	deadline := time.Now().Add(w.hostTimeout)
-	for time.Now().Before(deadline) {
-		// Honour cancellation between polls.
-		fu, err := w.store.ActiveFleetUpdate(ctx)
-		if err == nil && (fu == nil || fu.ID != fuID) {
-			// Cancelled mid-host; leave the slot in 'running' for the
-			// admin to inspect. No further dispatches.
-			return
-		}
-		time.Sleep(w.pollPeriod)
-		h, err := w.store.GetHost(ctx, hostID)
-		if err == nil && h != nil && h.AgentVersion == w.targetVersion {
-			if err := w.store.SetFleetUpdateHostStatus(ctx, fuID, hostID, "succeeded", "", jobID); err != nil {
-				slog.Warn("fleetupdate: set succeeded", "fu_id", fuID, "host_id", hostID, "err", err)
-			}
-			return
-		}
-	}
-	reason := fmt.Sprintf("timeout waiting for %s to reach %s", hostID, w.targetVersion)
-	_ = w.store.SetFleetUpdateHostStatus(ctx, fuID, hostID, "failed", reason, jobID)
-	w.halt(ctx, fuID, reason)
-}
-
-func (w *Worker) halt(ctx context.Context, fuID, reason string) {
-	now := time.Now().UTC()
-	if err := w.store.HaltFleetUpdate(ctx, fuID, reason, now); err != nil {
-		slog.Warn("fleetupdate: halt", "fu_id", fuID, "err", err)
-	}
-	if w.alerts != nil {
-		w.alerts.RaiseFleetUpdateHalted(ctx, fuID, reason, now)
-	}
-}
-
-func dispatchErrorReason(code string, err error) string {
-	if code != "" {
-		return "dispatch failed: " + code
-	}
-	if err != nil {
-		return err.Error()
-	}
-	return "dispatch failed"
-}
@@ -1,344 +0,0 @@
-package fleetupdate
-
-import (
-	"context"
-	"errors"
-	"path/filepath"
-	"sync"
-	"testing"
-	"time"
-
-	"github.com/oklog/ulid/v2"
-
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
-)
-
-type fakeHub struct {
-	mu     sync.Mutex
-	online map[string]bool
-}
-
-func (f *fakeHub) Connected(hostID string) bool {
-	f.mu.Lock()
-	defer f.mu.Unlock()
-	return f.online[hostID]
-}
-
-type fakeDispatcher struct {
-	mu    sync.Mutex
-	calls []string // host IDs
-	// after dispatch, set the host's agent_version to this on the
-	// store so the worker observes the version transition.
-	st         *store.Store
-	target     string
-	delayMS    int
-	failOnHost map[string]string // host → error code
-}
-
-func (f *fakeDispatcher) DispatchUpdate(ctx context.Context, hostID, _ string) (string, string, error) {
-	f.mu.Lock()
-	f.calls = append(f.calls, hostID)
-	if code, ok := f.failOnHost[hostID]; ok {
-		f.mu.Unlock()
-		return "", code, nil
-	}
-	st := f.st
-	target := f.target
-	delay := f.delayMS
-	f.mu.Unlock()
-
-	jobID := ulid.Make().String()
-	if st != nil {
-		_ = st.CreateJob(context.Background(), store.Job{
-			ID: jobID, HostID: hostID, Kind: "update",
-			ActorKind: "user", CreatedAt: time.Now().UTC(),
-		})
-	}
-	if st != nil && target != "" {
-		go func() {
-			if delay > 0 {
-				time.Sleep(time.Duration(delay) * time.Millisecond)
-			}
-			_ = st.MarkHostHello(context.Background(), hostID, target, "0.17", api.CurrentProtocolVersion, time.Now().UTC())
-		}()
-	}
-	return jobID, "", nil
-}
-
-type recAlert struct {
-	mu      sync.Mutex
-	reasons []string
-}
-
-func (r *recAlert) RaiseFleetUpdateHalted(_ context.Context, _ string, reason string, _ time.Time) {
-	r.mu.Lock()
-	r.reasons = append(r.reasons, reason)
-	r.mu.Unlock()
-}
-
-func openStore(t *testing.T) *store.Store {
-	t.Helper()
-	dir := t.TempDir()
-	st, err := store.Open(context.Background(), filepath.Join(dir, "rm.db"))
-	if err != nil {
-		t.Fatalf("open: %v", err)
-	}
-	t.Cleanup(func() { _ = st.Close() })
-	return st
-}
-
-func mustCreateAdmin(t *testing.T, st *store.Store) string {
-	t.Helper()
-	uid := ulid.Make().String()
-	if err := st.CreateUser(context.Background(), store.User{
-		ID: uid, Username: "u-" + uid[:6],
-		PasswordHash: "x", Role: store.RoleAdmin, CreatedAt: time.Now().UTC(),
-	}); err != nil {
-		t.Fatalf("user: %v", err)
-	}
-	return uid
-}
-
-func mustCreateHost(t *testing.T, st *store.Store, name, version string) string {
-	t.Helper()
-	hostID := ulid.Make().String()
-	if err := st.CreateHost(context.Background(), store.Host{
-		ID: hostID, Name: name, OS: "linux", Arch: "amd64",
-		EnrolledAt: time.Now().UTC(),
-	}, "deadbeef-"+hostID, ""); err != nil {
-		t.Fatalf("host: %v", err)
-	}
-	if version != "" {
-		if err := st.MarkHostHello(context.Background(), hostID, version, "0.17", api.CurrentProtocolVersion, time.Now().UTC()); err != nil {
-			t.Fatalf("hello: %v", err)
-		}
-	}
-	return hostID
-}
-
-func waitForStatus(t *testing.T, st *store.Store, fuID, want string, timeout time.Duration) *store.FleetUpdate {
-	t.Helper()
-	deadline := time.Now().Add(timeout)
-	for time.Now().Before(deadline) {
-		fu, _, err := st.GetFleetUpdate(context.Background(), fuID)
-		if err == nil && fu != nil && fu.Status == want {
-			return fu
-		}
-		time.Sleep(20 * time.Millisecond)
-	}
-	t.Fatalf("status never reached %q", want)
-	return nil
-}
-
-func TestWorkerTwoHostsBothSucceed(t *testing.T) {
-	st := openStore(t)
-	uid := mustCreateAdmin(t, st)
-	h1 := mustCreateHost(t, st, "h1", "v0")
-	h2 := mustCreateHost(t, st, "h2", "v0")
-
-	hub := &fakeHub{online: map[string]bool{h1: true, h2: true}}
-	disp := &fakeDispatcher{st: st, target: "v2", delayMS: 30}
-	alerts := &recAlert{}
-	w := NewWorker(st, hub, disp, alerts)
-	w.pollPeriod = 20 * time.Millisecond
-	w.hostTimeout = 2 * time.Second
-
-	fuID, err := w.Start(context.Background(), uid, "v2", []string{h1, h2})
-	if err != nil {
-		t.Fatalf("start: %v", err)
-	}
-	waitForStatus(t, st, fuID, "completed", 5*time.Second)
-	_, hosts, _ := st.GetFleetUpdate(context.Background(), fuID)
-	for _, h := range hosts {
-		if h.Status != "succeeded" {
-			t.Errorf("host %s status %q want succeeded", h.HostID, h.Status)
-		}
-	}
-	if n := len(alerts.reasons); n != 0 {
-		t.Errorf("unexpected halt alert: %v", alerts.reasons)
-	}
-}
-
-func TestWorkerSecondHostTimesOutHalts(t *testing.T) {
-	st := openStore(t)
-	uid := mustCreateAdmin(t, st)
-	h1 := mustCreateHost(t, st, "h1", "v0")
-	h2 := mustCreateHost(t, st, "h2", "v0")
-	h3 := mustCreateHost(t, st, "h3", "v0")
-
-	hub := &fakeHub{online: map[string]bool{h1: true, h2: true, h3: true}}
-	// h1 dispatches normally (transitions to v2). h2 dispatch returns
-	// success but never transitions.
-	disp := &fakeDispatcher{st: st, target: "v2", delayMS: 20, failOnHost: map[string]string{
-		h2: "", // not a code-failure; simulate by clearing target on this disp run
-	}}
-	// disp is not used for dispatch: the worker is given a
-	// perHostDispatcher that omits the auto-transition for h2.
-	_ = disp
-	customDisp := &perHostDispatcher{base: disp, st: st, target: "v2", noTransition: map[string]bool{h2: true}}
-
-	alerts := &recAlert{}
-	w := NewWorker(st, hub, customDisp, alerts)
-	w.pollPeriod = 20 * time.Millisecond
-	w.hostTimeout = 200 * time.Millisecond
-
-	fuID, err := w.Start(context.Background(), uid, "v2", []string{h1, h2, h3})
-	if err != nil {
-		t.Fatalf("start: %v", err)
-	}
-	waitForStatus(t, st, fuID, "halted", 3*time.Second)
-	_, hosts, _ := st.GetFleetUpdate(context.Background(), fuID)
-	gotStatus := map[string]string{}
-	for _, h := range hosts {
-		gotStatus[h.HostID] = h.Status
-	}
-	if gotStatus[h1] != "succeeded" {
-		t.Errorf("h1: %q", gotStatus[h1])
-	}
-	if gotStatus[h2] != "failed" {
-		t.Errorf("h2: %q", gotStatus[h2])
-	}
-	if gotStatus[h3] != "pending" {
-		t.Errorf("h3: %q", gotStatus[h3])
-	}
-	alerts.mu.Lock()
-	defer alerts.mu.Unlock()
-	if len(alerts.reasons) != 1 {
-		t.Errorf("alert reasons: %v", alerts.reasons)
-	}
-}
-
-// perHostDispatcher lets a test omit the auto-transition for selected
-// hosts so we can simulate timeout.
-type perHostDispatcher struct {
-	mu           sync.Mutex
-	base         *fakeDispatcher
-	st           *store.Store
-	target       string
-	noTransition map[string]bool
-}
-
-func (p *perHostDispatcher) DispatchUpdate(_ context.Context, hostID, _ string) (string, string, error) {
-	p.mu.Lock()
-	skip := p.noTransition[hostID]
-	p.mu.Unlock()
-	jobID := ulid.Make().String()
-	_ = p.st.CreateJob(context.Background(), store.Job{
-		ID: jobID, HostID: hostID, Kind: "update",
-		ActorKind: "user", CreatedAt: time.Now().UTC(),
-	})
-	if !skip {
-		go func() {
-			time.Sleep(20 * time.Millisecond)
-			_ = p.st.MarkHostHello(context.Background(), hostID, p.target, "0.17", api.CurrentProtocolVersion, time.Now().UTC())
-		}()
-	}
-	return jobID, "", nil
-}
-
-func TestWorkerHostOfflineHalts(t *testing.T) {
-	st := openStore(t)
-	uid := mustCreateAdmin(t, st)
-	h1 := mustCreateHost(t, st, "h1", "v0")
-	h2 := mustCreateHost(t, st, "h2", "v0")
-	hub := &fakeHub{online: map[string]bool{h1: false, h2: true}}
-	disp := &fakeDispatcher{st: st, target: "v2"}
-	alerts := &recAlert{}
-	w := NewWorker(st, hub, disp, alerts)
-	w.pollPeriod = 20 * time.Millisecond
-	w.hostTimeout = 500 * time.Millisecond
-
-	fuID, err := w.Start(context.Background(), uid, "v2", []string{h1, h2})
-	if err != nil {
-		t.Fatalf("start: %v", err)
-	}
-	waitForStatus(t, st, fuID, "halted", 2*time.Second)
-	_, hosts, _ := st.GetFleetUpdate(context.Background(), fuID)
-	if hosts[0].Status != "failed" {
-		t.Errorf("h1 status: %q", hosts[0].Status)
-	}
-	if hosts[1].Status != "pending" {
-		t.Errorf("h2 status: %q", hosts[1].Status)
-	}
-}
-
-func TestWorkerAlreadyAtTargetSkipped(t *testing.T) {
-	st := openStore(t)
-	uid := mustCreateAdmin(t, st)
-	h1 := mustCreateHost(t, st, "h1", "v2")
-	h2 := mustCreateHost(t, st, "h2", "v0")
-	hub := &fakeHub{online: map[string]bool{h1: true, h2: true}}
-	disp := &fakeDispatcher{st: st, target: "v2", delayMS: 20}
-	alerts := &recAlert{}
-	w := NewWorker(st, hub, disp, alerts)
-	w.pollPeriod = 20 * time.Millisecond
-	w.hostTimeout = 2 * time.Second
-
-	fuID, err := w.Start(context.Background(), uid, "v2", []string{h1, h2})
-	if err != nil {
-		t.Fatalf("start: %v", err)
-	}
-	waitForStatus(t, st, fuID, "completed", 4*time.Second)
-	_, hosts, _ := st.GetFleetUpdate(context.Background(), fuID)
-	want := map[string]string{h1: "skipped", h2: "succeeded"}
-	for _, h := range hosts {
-		if h.Status != want[h.HostID] {
-			t.Errorf("host %s: got %q want %q", h.HostID, h.Status, want[h.HostID])
-		}
-	}
-}
-
-func TestWorkerCancelMidRun(t *testing.T) {
-	st := openStore(t)
-	uid := mustCreateAdmin(t, st)
-	h1 := mustCreateHost(t, st, "h1", "v0")
-	h2 := mustCreateHost(t, st, "h2", "v0")
-	hub := &fakeHub{online: map[string]bool{h1: true, h2: true}}
-	// h1's transition is delayed long enough that we can cancel
-	// before it lands; h2 should never be touched.
-	disp := &fakeDispatcher{st: st, target: "v2", delayMS: 500}
-	alerts := &recAlert{}
-	w := NewWorker(st, hub, disp, alerts)
-	w.pollPeriod = 50 * time.Millisecond
-	w.hostTimeout = 5 * time.Second
-
-	fuID, err := w.Start(context.Background(), uid, "v2", []string{h1, h2})
-	if err != nil {
-		t.Fatalf("start: %v", err)
-	}
-	// Give the worker a moment to dispatch h1.
-	time.Sleep(100 * time.Millisecond)
-	if err := w.Cancel(context.Background(), fuID); err != nil {
-		t.Fatalf("cancel: %v", err)
-	}
-	waitForStatus(t, st, fuID, "cancelled", 2*time.Second)
-
-	// h2 should never be dispatched.
-	disp.mu.Lock()
-	defer disp.mu.Unlock()
-	for _, c := range disp.calls {
-		if c == h2 {
-			t.Errorf("h2 dispatched after cancel")
-		}
-	}
-}
-
-func TestWorkerStartWhileActiveErrors(t *testing.T) {
-	st := openStore(t)
-	uid := mustCreateAdmin(t, st)
-	h1 := mustCreateHost(t, st, "h1", "v0")
-	h2 := mustCreateHost(t, st, "h2", "v0")
-	hub := &fakeHub{online: map[string]bool{h1: true, h2: true}}
-	disp := &fakeDispatcher{st: st, target: "v2", delayMS: 5_000}
-	w := NewWorker(st, hub, disp, &recAlert{})
-	w.pollPeriod = 50 * time.Millisecond
-	w.hostTimeout = 2 * time.Second
-	if _, err := w.Start(context.Background(), uid, "v2", []string{h1}); err != nil {
-		t.Fatalf("first start: %v", err)
-	}
-	_, err := w.Start(context.Background(), uid, "v2", []string{h2})
-	if !errors.Is(err, store.ErrFleetUpdateRunning) {
-		t.Fatalf("err: %v want ErrFleetUpdateRunning", err)
-	}
-}
@@ -11,7 +11,6 @@ import (
 	"time"

 	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
 )

 func makeFilterHosts() []store.Host {
@@ -99,23 +98,6 @@ func TestSortDashboardHostsColumns(t *testing.T) {
 	}
 }

-// TestFilterAndSortDashboardUpdatesBehind: ?updates=behind narrows
-// to hosts whose agent_version is non-empty AND != server's version.
-func TestFilterAndSortDashboardUpdatesBehind(t *testing.T) {
-	t.Parallel()
-	hosts := []store.Host{
-		{ID: "01a", Name: "alpha", AgentVersion: "v0.0.1", Status: "online"},
-		{ID: "01b", Name: "bravo", AgentVersion: version.Version, Status: "online"},
-		{ID: "01c", Name: "charlie", AgentVersion: "", Status: "online"}, // never seen
-		{ID: "01d", Name: "delta", AgentVersion: "v0.0.1", Status: "offline"},
-	}
-	got := filterAndSortDashboardHosts(hosts, dashboardFilter{Updates: "behind", Sort: "name", Dir: "asc"})
-	// alpha + delta both behind; bravo (current) and charlie (empty) excluded.
-	if len(got) != 2 || got[0].Name != "alpha" || got[1].Name != "delta" {
-		t.Errorf("updates=behind: got %v", namesOf(got))
-	}
-}
-
 // TestParseDashboardFilterDefaults: empty query gives sort=name asc.
 func TestParseDashboardFilterDefaults(t *testing.T) {
 	t.Parallel()
@@ -1,379 +0,0 @@
-// fleet_update.go — admin-only fleet rolling-update endpoints + page.
-//
-// Surface:
-//   - POST /api/fleet/update          → starts a fleet update (JSON)
-//   - POST /api/fleet-updates/{id}/cancel
-//   - GET  /api/fleet-updates/{id}    → JSON parent + per-host array
-//   - GET  /settings/fleet-update     → admin UI page
-//   - GET  /settings/fleet-update/partial → htmx polling fragment
-//
-// All routes are mounted in the admin band (see routes()).
-package http
-
-import (
-	"context"
-	"encoding/json"
-	"errors"
-	"log/slog"
-	stdhttp "net/http"
-	"time"
-
-	"github.com/go-chi/chi/v5"
-	"github.com/oklog/ulid/v2"
-
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
-)
-
-// fleetUpdateStartReq is the JSON body for POST /api/fleet/update.
-// Both fields are optional: empty target_version defaults to the
-// server's current version, empty host_ids derives the out-of-date
-// online subset.
-type fleetUpdateStartReq struct {
-	TargetVersion string   `json:"target_version,omitempty"`
-	HostIDs       []string `json:"host_ids,omitempty"`
-}
-
-// fleetUpdateHostView is one row in the JSON response for GET
-// /api/fleet-updates/{id}. Hostname is hydrated from the store so
-// callers don't need a second round-trip per host.
-type fleetUpdateHostView struct {
-	HostID       string `json:"host_id"`
-	HostName     string `json:"host_name,omitempty"`
-	Position     int    `json:"position"`
-	Status       string `json:"status"`
-	JobID        string `json:"job_id,omitempty"`
-	FailedReason string `json:"failed_reason,omitempty"`
-}
-
-// fleetUpdateView is the JSON projection of the parent + children.
-type fleetUpdateView struct {
-	ID              string                `json:"id"`
-	StartedAt       string                `json:"started_at"`
-	StartedByUserID string                `json:"started_by_user_id"`
-	TargetVersion   string                `json:"target_version"`
-	Status          string                `json:"status"`
-	CurrentHostID   string                `json:"current_host_id,omitempty"`
-	HaltedReason    string                `json:"halted_reason,omitempty"`
-	CompletedAt     *string               `json:"completed_at,omitempty"`
-	Hosts           []fleetUpdateHostView `json:"hosts"`
-}
-
-// fleetUpdatePage backs both the full /settings/fleet-update page
-// and the partial polled fragment. Idle / Active are mutually
-// exclusive: if Active is non-nil, render the progress view.
-type fleetUpdatePage struct {
-	// Idle-state fields.
-	OutOfDateHosts []store.Host // online hosts whose version != target
-	TargetVersion  string
-
-	// Active-state fields. Nil when no fleet update has ever run.
-	Active     *store.FleetUpdate
-	ActiveRows []fleetUpdateHostView
-
-	// Common.
-	HostNames map[string]string
-	// PollURL is the partial endpoint htmx polls every few seconds.
-	PollURL string
-}
-
-// handleAPIFleetUpdateStart is POST /api/fleet/update.
-func (s *Server) handleAPIFleetUpdateStart(w stdhttp.ResponseWriter, r *stdhttp.Request) {
-	user, ok := s.requireUser(r)
-	if !ok {
-		writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorised", "")
-		return
-	}
-	if s.deps.FleetWorker == nil {
-		writeJSONError(w, stdhttp.StatusServiceUnavailable, "fleet_worker_unavailable", "")
-		return
-	}
-	var body fleetUpdateStartReq
-	// Empty body is fine — both fields are optional.
-	if r.ContentLength != 0 {
-		if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
-			writeJSONError(w, stdhttp.StatusBadRequest, "invalid_json", err.Error())
-			return
-		}
-	}
-	target := body.TargetVersion
-	if target == "" {
-		target = version.Version
-	}
-	hostIDs := body.HostIDs
-	if len(hostIDs) == 0 {
-		derived, err := s.deriveOutOfDateOnlineHostIDs(r.Context(), target)
-		if err != nil {
-			writeJSONError(w, stdhttp.StatusInternalServerError, "internal", err.Error())
-			return
-		}
-		hostIDs = derived
-	}
-	if len(hostIDs) == 0 {
-		writeJSONError(w, stdhttp.StatusConflict, "no_hosts_eligible",
-			"no online hosts are out of date")
-		return
-	}
-
-	fuID, err := s.deps.FleetWorker.Start(r.Context(), user.ID, target, hostIDs)
-	if err != nil {
-		if errors.Is(err, store.ErrFleetUpdateRunning) {
-			writeJSONError(w, stdhttp.StatusConflict, "fleet_update_in_progress", err.Error())
-			return
-		}
-		writeJSONError(w, stdhttp.StatusInternalServerError, "internal", err.Error())
-		return
-	}
-
-	auditPayload, _ := json.Marshal(map[string]any{
-		"fleet_update_id": fuID,
-		"target_version":  target,
-		"host_count":      len(hostIDs),
-	})
-	_ = s.deps.Store.AppendAudit(r.Context(), store.AuditEntry{
-		ID: ulid.Make().String(), UserID: &user.ID, Actor: "user",
-		Action:     "fleet.update_started",
-		TargetKind: ptr("fleet_update"), TargetID: &fuID,
-		TS:      time.Now().UTC(),
-		Payload: auditPayload,
-	})
-
-	writeJSON(w, stdhttp.StatusAccepted, map[string]string{"fleet_update_id": fuID})
-}
-
-// handleAPIFleetUpdateCancel is POST /api/fleet-updates/{id}/cancel.
-func (s *Server) handleAPIFleetUpdateCancel(w stdhttp.ResponseWriter, r *stdhttp.Request) {
-	user, ok := s.requireUser(r)
-	if !ok {
-		writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorised", "")
-		return
-	}
-	if s.deps.FleetWorker == nil {
-		writeJSONError(w, stdhttp.StatusServiceUnavailable, "fleet_worker_unavailable", "")
-		return
-	}
-	fuID := chi.URLParam(r, "id")
-	if fuID == "" {
-		writeJSONError(w, stdhttp.StatusBadRequest, "missing_id", "")
-		return
-	}
-	fu, _, err := s.deps.Store.GetFleetUpdate(r.Context(), fuID)
-	if err != nil {
-		if errors.Is(err, store.ErrNotFound) {
-			writeJSONError(w, stdhttp.StatusNotFound, "fleet_update_not_found", "")
-			return
-		}
-		writeJSONError(w, stdhttp.StatusInternalServerError, "internal", err.Error())
-		return
-	}
-	if fu.Status != "running" {
-		writeJSONError(w, stdhttp.StatusConflict, "fleet_update_not_running",
-			"fleet update is not in the running state")
-		return
-	}
-	if err := s.deps.FleetWorker.Cancel(r.Context(), fuID); err != nil {
-		writeJSONError(w, stdhttp.StatusInternalServerError, "internal", err.Error())
-		return
-	}
-	_ = s.deps.Store.AppendAudit(r.Context(), store.AuditEntry{
-		ID: ulid.Make().String(), UserID: &user.ID, Actor: "user",
-		Action:     "fleet.update_cancelled",
-		TargetKind: ptr("fleet_update"), TargetID: &fuID,
-		TS: time.Now().UTC(),
-	})
-	w.WriteHeader(stdhttp.StatusNoContent)
-}
-
-// handleAPIFleetUpdateGet is GET /api/fleet-updates/{id}.
-func (s *Server) handleAPIFleetUpdateGet(w stdhttp.ResponseWriter, r *stdhttp.Request) {
-	if _, ok := s.requireUser(r); !ok {
-		writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorised", "")
-		return
-	}
-	fuID := chi.URLParam(r, "id")
-	fu, hosts, err := s.deps.Store.GetFleetUpdate(r.Context(), fuID)
-	if err != nil {
-		if errors.Is(err, store.ErrNotFound) {
-			writeJSONError(w, stdhttp.StatusNotFound, "fleet_update_not_found", "")
-			return
-		}
-		writeJSONError(w, stdhttp.StatusInternalServerError, "internal", err.Error())
-		return
-	}
-	names := s.hostNameMap(r)
-	view := fleetUpdateView{
-		ID:              fu.ID,
-		StartedAt:       fu.StartedAt.UTC().Format(time.RFC3339Nano),
-		StartedByUserID: fu.StartedByUserID,
-		TargetVersion:   fu.TargetVersion,
-		Status:          fu.Status,
-		CurrentHostID:   fu.CurrentHostID,
-		HaltedReason:    fu.HaltedReason,
-		Hosts:           make([]fleetUpdateHostView, 0, len(hosts)),
-	}
-	if fu.CompletedAt != nil {
-		s := fu.CompletedAt.UTC().Format(time.RFC3339Nano)
-		view.CompletedAt = &s
-	}
-	for _, h := range hosts {
-		view.Hosts = append(view.Hosts, fleetUpdateHostView{
-			HostID:       h.HostID,
-			HostName:     names[h.HostID],
-			Position:     h.Position,
-			Status:       h.Status,
-			JobID:        h.JobID,
-			FailedReason: h.FailedReason,
-		})
-	}
-	writeJSON(w, stdhttp.StatusOK, view)
-}
-
-// handleUIFleetUpdate renders /settings/fleet-update.
-func (s *Server) handleUIFleetUpdate(w stdhttp.ResponseWriter, r *stdhttp.Request) {
-	u := s.requireUIUser(w, r)
-	if u == nil {
-		return
-	}
-	page, err := s.buildFleetUpdatePage(r)
-	if err != nil {
-		slog.Error("ui fleet update: build page", "err", err)
-		stdhttp.Error(w, "internal", stdhttp.StatusInternalServerError)
-		return
-	}
-	view := s.baseView(r, u)
-	view.Title = "Fleet update · restic-manager"
-	view.Active = "settings"
-	view.Page = page
-	if err := s.deps.UI.Render(w, "fleet_update", view); err != nil {
-		slog.Error("ui fleet update: render", "err", err)
-	}
-}
-
-// handleUIFleetUpdatePartial renders just the inner panel for htmx
-// auto-refresh polling — same data, no chrome.
-func (s *Server) handleUIFleetUpdatePartial(w stdhttp.ResponseWriter, r *stdhttp.Request) {
-	u := s.requireUIUser(w, r)
-	if u == nil {
-		return
-	}
-	page, err := s.buildFleetUpdatePage(r)
-	if err != nil {
-		slog.Error("ui fleet update partial: build page", "err", err)
-		stdhttp.Error(w, "internal", stdhttp.StatusInternalServerError)
-		return
-	}
-	view := s.baseView(r, u)
-	view.Page = page
-	if err := s.deps.UI.RenderPartial(w, "fleet_update_inner", view); err != nil {
-		slog.Error("ui fleet update partial: render", "err", err)
-	}
-}
-
-// buildFleetUpdatePage assembles the data both /settings/fleet-update
-// and its partial render against. Resolves the most-recent fleet
-// update (active OR completed/cancelled/halted) so the page can show
-// the last roll's result instead of disappearing into "idle" the
-// instant a roll finishes.
-func (s *Server) buildFleetUpdatePage(r *stdhttp.Request) (fleetUpdatePage, error) {
-	page := fleetUpdatePage{
-		TargetVersion: version.Version,
-		HostNames:     map[string]string{},
-		PollURL:       "/settings/fleet-update/partial",
-	}
-	hosts, err := s.deps.Store.ListHosts(r.Context())
-	if err != nil {
-		return page, err
-	}
-	for _, h := range hosts {
-		page.HostNames[h.ID] = h.Name
-	}
-
-	active, err := s.deps.Store.ActiveFleetUpdate(r.Context())
-	if err != nil {
-		return page, err
-	}
-	mostRecent := active
-	if mostRecent == nil {
-		// Fall back to the most recent terminal row so the page can
-		// show "completed" / "halted" / "cancelled" once the worker
-		// finishes. One small bespoke query — keeps the page from
-		// flashing back to "idle" the instant a roll wraps up.
-		var id string
-		err := s.deps.Store.DB().QueryRowContext(r.Context(),
-			`SELECT id FROM fleet_updates ORDER BY started_at DESC LIMIT 1`).
-			Scan(&id)
-		if err == nil {
-			fu, _, gerr := s.deps.Store.GetFleetUpdate(r.Context(), id)
-			if gerr == nil {
-				mostRecent = fu
-			}
-		}
-	}
-
-	if mostRecent != nil {
-		_, rows, gerr := s.deps.Store.GetFleetUpdate(r.Context(), mostRecent.ID)
-		if gerr == nil {
-			page.Active = mostRecent
-			page.ActiveRows = make([]fleetUpdateHostView, 0, len(rows))
-			for _, hr := range rows {
-				page.ActiveRows = append(page.ActiveRows, fleetUpdateHostView{
-					HostID:       hr.HostID,
-					HostName:     page.HostNames[hr.HostID],
-					Position:     hr.Position,
-					Status:       hr.Status,
-					JobID:        hr.JobID,
-					FailedReason: hr.FailedReason,
-				})
-			}
-		}
-	}
-
-	// Idle list (or "still out of date" reference even when an active
-	// roll is running — cheap to compute, harmless to attach).
-	for _, h := range hosts {
-		if h.Status != "online" {
-			continue
-		}
-		if h.AgentVersion == "" || h.AgentVersion == page.TargetVersion {
-			continue
-		}
-		page.OutOfDateHosts = append(page.OutOfDateHosts, h)
-	}
-	return page, nil
-}
-
-// deriveOutOfDateOnlineHostIDs returns the list of host IDs that
-// (a) are online (Hub.Connected) and (b) have an agent_version that's
-// non-empty AND != target. Used by the start endpoint when the caller
-// omits host_ids.
-func (s *Server) deriveOutOfDateOnlineHostIDs(ctx context.Context, target string) ([]string, error) {
-	hosts, err := s.deps.Store.ListHosts(ctx)
-	if err != nil {
-		return nil, err
-	}
-	out := []string{}
-	for _, h := range hosts {
-		if h.AgentVersion == "" || h.AgentVersion == target {
-			continue
-		}
-		if !s.deps.Hub.Connected(h.ID) {
-			continue
-		}
-		out = append(out, h.ID)
-	}
-	return out, nil
-}
-
-// hostNameMap returns hostID → name; used to hydrate fleet-update
-// JSON responses.
-func (s *Server) hostNameMap(r *stdhttp.Request) map[string]string {
-	out := map[string]string{}
-	hosts, err := s.deps.Store.ListHosts(r.Context())
-	if err != nil {
-		return out
-	}
-	for _, h := range hosts {
-		out[h.ID] = h.Name
-	}
-	return out
-}
@@ -1,334 +0,0 @@
-// fleet_update_test.go — coverage for the P6-15 fleet-update HTTP
-// surface: start/cancel/get JSON endpoints + RBAC.
-package http
-
-import (
-	"bytes"
-	"context"
-	"encoding/json"
-	stdhttp "net/http"
-	"sync"
-	"testing"
-	"time"
-
-	"github.com/oklog/ulid/v2"
-
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ws"
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
-)
-
-// fakeFleetWorker stands in for *fleetupdate.Worker in HTTP tests.
-// It records what was passed to Start/Cancel and lets tests inject
-// canned errors. Satisfies the FleetWorker interface in
-// host_update.go.
-type fakeFleetWorker struct {
-	mu sync.Mutex
-
-	startCalls []fakeStartCall
-	startID    string
-	startErr   error
-
-	cancelCalls []string
-	cancelErr   error
-}
-
-type fakeStartCall struct {
-	UserID  string
-	Target  string
-	HostIDs []string
-}
-
-func (f *fakeFleetWorker) Start(_ context.Context, userID, target string, hostIDs []string) (string, error) {
-	f.mu.Lock()
-	defer f.mu.Unlock()
-	f.startCalls = append(f.startCalls, fakeStartCall{userID, target, append([]string(nil), hostIDs...)})
-	if f.startErr != nil {
-		return "", f.startErr
-	}
-	return f.startID, nil
-}
-
-func (f *fakeFleetWorker) Cancel(_ context.Context, id string) error {
-	f.mu.Lock()
-	defer f.mu.Unlock()
-	f.cancelCalls = append(f.cancelCalls, id)
-	return f.cancelErr
-}
-
-// helloOnlineHost is the smallest setup that lets the dispatch /
-// derivation logic see a host as "online + version mismatch".
-// Returns the host id.
-func helloOnlineHost(t *testing.T, srv *Server, st *store.Store, name, agentVer string) string {
-	t.Helper()
-	id := makeHost(t, st, name)
-	if err := st.MarkHostHello(context.Background(), id, agentVer, "0.17", api.CurrentProtocolVersion, time.Now().UTC()); err != nil {
-		t.Fatalf("mark hello: %v", err)
-	}
-	// Mark connected on the hub so deriveOutOfDateOnlineHostIDs
-	// considers it online without needing a real WS handshake. The
-	// Conn has a nil websocket pointer — tests never call Send on it.
-	srv.deps.Hub.Register(id, ws.NewConn(id, nil))
-	return id
-}
-
-func TestFleetUpdateStartHappyPath(t *testing.T) {
-	t.Parallel()
-	srv, ts, st := rawTestServer(t)
-	worker := &fakeFleetWorker{startID: ulid.Make().String()}
-	srv.deps.FleetWorker = worker
-
-	cookie, uid := loginAsAdminWithID(t, st)
-	hostID := helloOnlineHost(t, srv, st, "fu-host", "v0")
-
-	body := map[string]any{"host_ids": []string{hostID}}
-	raw, _ := json.Marshal(body)
-	req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/fleet/update", bytes.NewReader(raw))
-	req.AddCookie(cookie)
-	req.Header.Set("Content-Type", "application/json")
-	res, err := stdhttp.DefaultClient.Do(req)
-	if err != nil {
-		t.Fatalf("do: %v", err)
-	}
-	defer res.Body.Close()
-	if res.StatusCode != stdhttp.StatusAccepted {
-		t.Fatalf("status: got %d, want 202", res.StatusCode)
-	}
-	var out struct {
-		FleetUpdateID string `json:"fleet_update_id"`
-	}
-	if err := json.NewDecoder(res.Body).Decode(&out); err != nil {
-		t.Fatalf("decode: %v", err)
-	}
-	if out.FleetUpdateID != worker.startID {
-		t.Fatalf("fleet_update_id: got %q, want %q", out.FleetUpdateID, worker.startID)
-	}
-	worker.mu.Lock()
-	if len(worker.startCalls) != 1 || worker.startCalls[0].UserID != uid {
-		t.Fatalf("start calls: %+v", worker.startCalls)
-	}
-	if got := worker.startCalls[0].HostIDs; len(got) != 1 || got[0] != hostID {
-		t.Fatalf("host_ids: %v", got)
-	}
-	worker.mu.Unlock()
-
-	// Audit row.
-	var n int
-	if err := st.DB().QueryRow(
-		`SELECT COUNT(*) FROM audit_log WHERE action = 'fleet.update_started' AND target_id = ?`,
-		out.FleetUpdateID).Scan(&n); err != nil {
-		t.Fatalf("audit count: %v", err)
-	}
-	if n != 1 {
-		t.Fatalf("audit rows: got %d, want 1", n)
-	}
-}
-
-func TestFleetUpdateStartConflictWhenAlreadyRunning(t *testing.T) {
-	t.Parallel()
-	srv, ts, st := rawTestServer(t)
-	worker := &fakeFleetWorker{startErr: store.ErrFleetUpdateRunning}
-	srv.deps.FleetWorker = worker
-	cookie := loginAsAdmin(t, st)
-	_ = helloOnlineHost(t, srv, st, "fu-host", "v0")
-
-	req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/fleet/update", bytes.NewReader([]byte(`{}`)))
-	req.AddCookie(cookie)
-	req.Header.Set("Content-Type", "application/json")
-	res, err := stdhttp.DefaultClient.Do(req)
-	if err != nil {
-		t.Fatalf("do: %v", err)
-	}
-	defer res.Body.Close()
-	if res.StatusCode != stdhttp.StatusConflict {
-		t.Fatalf("status: got %d, want 409", res.StatusCode)
-	}
-	body := readJSONError(t, res.Body)
-	if body.Code != "fleet_update_in_progress" {
-		t.Fatalf("code: %q", body.Code)
-	}
-}
-
-func TestFleetUpdateStartDerivesHostIDsWhenEmpty(t *testing.T) {
-	t.Parallel()
-	srv, ts, st := rawTestServer(t)
-	worker := &fakeFleetWorker{startID: ulid.Make().String()}
-	srv.deps.FleetWorker = worker
-	cookie := loginAsAdmin(t, st)
-
-	// Two online + out-of-date, one online + at-target, one offline.
-	a := helloOnlineHost(t, srv, st, "behind-a", "v0")
-	b := helloOnlineHost(t, srv, st, "behind-b", "v0")
-	_ = helloOnlineHost(t, srv, st, "uptodate", version.Version)
-	offlineID := makeHost(t, st, "offline-host")
-	if err := st.MarkHostHello(context.Background(), offlineID, "v0", "0.17", api.CurrentProtocolVersion, time.Now().UTC()); err != nil {
-		t.Fatalf("mark hello: %v", err)
-	}
-	// Don't MarkOnline → derivation should skip.
-
-	req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/fleet/update", bytes.NewReader([]byte(`{}`)))
-	req.AddCookie(cookie)
-	req.Header.Set("Content-Type", "application/json")
-	res, err := stdhttp.DefaultClient.Do(req)
-	if err != nil {
-		t.Fatalf("do: %v", err)
-	}
-	defer res.Body.Close()
-	if res.StatusCode != stdhttp.StatusAccepted {
-		t.Fatalf("status: got %d, want 202", res.StatusCode)
-	}
-	worker.mu.Lock()
-	defer worker.mu.Unlock()
-	if len(worker.startCalls) != 1 {
-		t.Fatalf("start calls: %d", len(worker.startCalls))
-	}
-	got := worker.startCalls[0].HostIDs
-	want := map[string]bool{a: true, b: true}
-	if len(got) != 2 || !want[got[0]] || !want[got[1]] {
-		t.Fatalf("derived host_ids: got %v, want both of %v", got, []string{a, b})
-	}
-}
-
-func TestFleetUpdateCancelHappyPath(t *testing.T) {
-	t.Parallel()
-	srv, ts, st := rawTestServer(t)
-	worker := &fakeFleetWorker{}
-	srv.deps.FleetWorker = worker
-	cookie := loginAsAdmin(t, st)
-
-	// Seed a running fleet update directly.
-	fuID := ulid.Make().String()
-	uid := ulid.Make().String()
-	if err := st.CreateUser(context.Background(), store.User{
-		ID: uid, Username: "starter", PasswordHash: "x",
-		Role: store.RoleAdmin, CreatedAt: time.Now().UTC(),
-	}); err != nil {
-		t.Fatalf("seed user: %v", err)
-	}
-	hostID := makeHost(t, st, "fu-cancel-host")
-	if err := st.CreateFleetUpdate(context.Background(),
-		store.FleetUpdate{ID: fuID, StartedByUserID: uid, TargetVersion: "v1"},
-		[]string{hostID}); err != nil {
-		t.Fatalf("seed fleet update: %v", err)
-	}
-
-	req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/fleet-updates/"+fuID+"/cancel", nil)
-	req.AddCookie(cookie)
-	res, err := stdhttp.DefaultClient.Do(req)
-	if err != nil {
-		t.Fatalf("do: %v", err)
-	}
-	defer res.Body.Close()
-	if res.StatusCode != stdhttp.StatusNoContent {
-		t.Fatalf("status: got %d, want 204", res.StatusCode)
-	}
-	worker.mu.Lock()
-	if len(worker.cancelCalls) != 1 || worker.cancelCalls[0] != fuID {
-		t.Fatalf("cancel calls: %v", worker.cancelCalls)
-	}
-	worker.mu.Unlock()
-}
-
-func TestFleetUpdateCancelNotRunning(t *testing.T) {
-	t.Parallel()
-	srv, ts, st := rawTestServer(t)
-	srv.deps.FleetWorker = &fakeFleetWorker{}
-	cookie := loginAsAdmin(t, st)
-
-	// Seed + complete one so it's no longer running.
-	fuID := ulid.Make().String()
-	uid := ulid.Make().String()
-	_ = st.CreateUser(context.Background(), store.User{
-		ID: uid, Username: "starter2", PasswordHash: "x",
-		Role: store.RoleAdmin, CreatedAt: time.Now().UTC(),
-	})
-	hostID := makeHost(t, st, "fu-done-host")
-	_ = st.CreateFleetUpdate(context.Background(),
-		store.FleetUpdate{ID: fuID, StartedByUserID: uid, TargetVersion: "v1"},
-		[]string{hostID})
-	if err := st.CompleteFleetUpdate(context.Background(), fuID, time.Now().UTC()); err != nil {
-		t.Fatalf("complete: %v", err)
-	}
-
-	req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/fleet-updates/"+fuID+"/cancel", nil)
-	req.AddCookie(cookie)
-	res, err := stdhttp.DefaultClient.Do(req)
-	if err != nil {
-		t.Fatalf("do: %v", err)
-	}
-	defer res.Body.Close()
-	if res.StatusCode != stdhttp.StatusConflict {
-		t.Fatalf("status: got %d, want 409", res.StatusCode)
-	}
-	body := readJSONError(t, res.Body)
-	if body.Code != "fleet_update_not_running" {
-		t.Fatalf("code: %q", body.Code)
-	}
-}
-
-func TestFleetUpdateGetHydrates(t *testing.T) {
-	t.Parallel()
-	_, ts, st := rawTestServer(t)
-	cookie := loginAsAdmin(t, st)
-
-	uid := ulid.Make().String()
-	_ = st.CreateUser(context.Background(), store.User{
-		ID: uid, Username: "starter3", PasswordHash: "x",
-		Role: store.RoleAdmin, CreatedAt: time.Now().UTC(),
-	})
-	hostID := makeHost(t, st, "fu-get-host")
-	fuID := ulid.Make().String()
-	if err := st.CreateFleetUpdate(context.Background(),
-		store.FleetUpdate{ID: fuID, StartedByUserID: uid, TargetVersion: "v1.2.3"},
-		[]string{hostID}); err != nil {
-		t.Fatalf("seed: %v", err)
-	}
-
-	req, _ := stdhttp.NewRequest("GET", ts.URL+"/api/fleet-updates/"+fuID, nil)
-	req.AddCookie(cookie)
-	res, err := stdhttp.DefaultClient.Do(req)
-	if err != nil {
-		t.Fatalf("do: %v", err)
-	}
-	defer res.Body.Close()
-	if res.StatusCode != stdhttp.StatusOK {
-		t.Fatalf("status: got %d, want 200", res.StatusCode)
-	}
-	var got fleetUpdateView
-	if err := json.NewDecoder(res.Body).Decode(&got); err != nil {
-		t.Fatalf("decode: %v", err)
-	}
-	if got.ID != fuID || got.TargetVersion != "v1.2.3" || got.Status != "running" {
-		t.Fatalf("parent: %+v", got)
-	}
-	if len(got.Hosts) != 1 || got.Hosts[0].HostID != hostID || got.Hosts[0].HostName != "fu-get-host" {
-		t.Fatalf("hosts: %+v", got.Hosts)
-	}
-}
-
-func TestFleetUpdateRBAC(t *testing.T) {
-	t.Parallel()
-	_, ts, st := rawTestServer(t)
-
-	for _, role := range []store.Role{store.RoleViewer, store.RoleOperator} {
-		role := role
-		t.Run(string(role), func(t *testing.T) {
-			cookie := loginAsRole(t, st, role)
-			req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/fleet/update", bytes.NewReader([]byte(`{}`)))
-			req.AddCookie(cookie)
-			req.Header.Set("Content-Type", "application/json")
-			res, err := stdhttp.DefaultClient.Do(req)
-			if err != nil {
-				t.Fatalf("do: %v", err)
-			}
-			defer res.Body.Close()
-			if res.StatusCode != stdhttp.StatusForbidden {
-				t.Fatalf("status: got %d, want 403", res.StatusCode)
-			}
-		})
-	}
-}
-
-// Sanity check that fakeFleetWorker satisfies the FleetWorker iface.
-var _ FleetWorker = (*fakeFleetWorker)(nil)
@@ -1,217 +0,0 @@
-package http
-
-import (
-	"context"
-	"encoding/json"
-	stdhttp "net/http"
-	"time"
-
-	"github.com/go-chi/chi/v5"
-	"github.com/oklog/ulid/v2"
-
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
-)
-
-// UpdateWatcher is the slim view of the ws.updateWatcher this package
-// uses for tracking in-flight update dispatches. Defined as an
-// interface so a test can inject a stub.
-type UpdateWatcher interface {
-	Track(jobID, hostID string)
-}
-
-// FleetWorker is the slim view of the fleetupdate.Worker this package
-// uses. Kept here for forward compatibility with P6-15 — the host
-// update endpoint itself does not use it.
-type FleetWorker interface {
-	Start(ctx context.Context, userID, targetVersion string, hostIDs []string) (string, error)
-	Cancel(ctx context.Context, fleetUpdateID string) error
-}
-
-// dispatchHostUpdateResult communicates structured outcomes from the
-// shared dispatch path so both the HTTP handler and the fleet worker
-// can format errors in their own idiom.
-type dispatchHostUpdateResult struct {
-	JobID  string
-	Code   string // "" on success
-	Status int    // HTTP status the JSON handler should use on error
-	Msg    string // human-readable detail (optional)
-}
-
-// dispatchHostUpdate is the shared "send command.update to one host"
-// path. It performs every pre-check (host exists, online, version
-// mismatch, no in-flight update) and on success creates the jobs row,
-// audits, dispatches the WS envelope, and tracks the watcher entry.
-//
-// Pre-checks are returned as structured codes rather than HTTP errors
-// so the fleet worker can map them onto its own per-host status enum
-// without parsing strings.
-func (s *Server) dispatchHostUpdate(ctx context.Context, hostID string, actorKind string, actorID *string) dispatchHostUpdateResult {
-	host, err := s.deps.Store.GetHost(ctx, hostID)
-	if err != nil || host == nil {
-		return dispatchHostUpdateResult{Code: "host_not_found", Status: stdhttp.StatusNotFound}
-	}
-	if !s.deps.Hub.Connected(host.ID) {
-		return dispatchHostUpdateResult{
-			Code: "host_offline", Status: stdhttp.StatusConflict,
-			Msg: "agent is not currently connected",
-		}
-	}
-	if host.AgentVersion != "" && host.AgentVersion == version.Version {
-		return dispatchHostUpdateResult{
-			Code: "already_up_to_date", Status: stdhttp.StatusConflict,
-			Msg: "agent already running version " + version.Version,
-		}
-	}
-	existing, err := s.deps.Store.RunningUpdateJobForHost(ctx, hostID)
-	if err != nil {
-		return dispatchHostUpdateResult{Code: "internal", Status: stdhttp.StatusInternalServerError, Msg: err.Error()}
-	}
-	if existing != "" {
-		return dispatchHostUpdateResult{
-			Code: "update_in_progress", Status: stdhttp.StatusConflict,
-			Msg:   "an update job is already in flight for this host",
-			JobID: existing,
-		}
-	}
-
-	jobID := ulid.Make().String()
-	now := time.Now().UTC()
-	if err := s.deps.Store.CreateJob(ctx, store.Job{
-		ID: jobID, HostID: hostID, Kind: "update",
-		ActorKind: actorKind, ActorID: actorID,
-		CreatedAt: now,
-	}); err != nil {
-		return dispatchHostUpdateResult{Code: "internal", Status: stdhttp.StatusInternalServerError, Msg: err.Error()}
-	}
-	env, err := api.Marshal(api.MsgCommandUpdate, ulid.Make().String(), api.CommandUpdatePayload{
-		JobID: jobID,
-	})
-	if err != nil {
-		return dispatchHostUpdateResult{Code: "internal", Status: stdhttp.StatusInternalServerError, Msg: err.Error()}
-	}
-	if err := s.deps.Hub.Send(ctx, hostID, env); err != nil {
-		// Roll the job to failed so we don't leak a queued row.
-		_ = s.deps.Store.MarkJobFinished(ctx, jobID, "failed", -1, nil, err.Error(), time.Now().UTC())
-		return dispatchHostUpdateResult{
-			Code: "host_offline", Status: stdhttp.StatusConflict, Msg: err.Error(),
-		}
-	}
-	if s.deps.UpdateWatcher != nil {
-		s.deps.UpdateWatcher.Track(jobID, hostID)
-	}
-
-	auditPayload, _ := json.Marshal(map[string]string{
-		"job_id":         jobID,
-		"target_version": version.Version,
-	})
-	_ = s.deps.Store.AppendAudit(ctx, store.AuditEntry{
-		ID:         ulid.Make().String(),
-		UserID:     actorID,
-		Actor:      actorKind,
-		Action:     "host.update_dispatched",
-		TargetKind: ptr("host"),
-		TargetID:   &hostID,
-		TS:         now,
-		Payload:    auditPayload,
-	})
-
-	return dispatchHostUpdateResult{JobID: jobID}
-}
-
-// handleHostUpdate is POST /api/hosts/{id}/update — JSON, admin-only.
-func (s *Server) handleHostUpdate(w stdhttp.ResponseWriter, r *stdhttp.Request) {
-	user, ok := s.requireUser(r)
-	if !ok {
-		writeJSONError(w, stdhttp.StatusUnauthorized, "unauthorised", "")
-		return
-	}
-	hostID := chi.URLParam(r, "id")
-	if hostID == "" {
-		writeJSONError(w, stdhttp.StatusBadRequest, "missing_host_id", "")
-		return
-	}
-	actor := "user"
-	var actorID *string
-	if user != nil {
-		actorID = &user.ID
-	}
-	res := s.dispatchHostUpdate(r.Context(), hostID, actor, actorID)
-	if res.Code != "" {
-		writeJSONError(w, res.Status, res.Code, res.Msg)
-		return
-	}
-	writeJSON(w, stdhttp.StatusAccepted, map[string]string{"job_id": res.JobID})
-}
-
-// handleHostUpdateForm is the HTMX-friendly POST /hosts/{id}/update
-// variant. On success it sets HX-Redirect to the job detail page; on
-// pre-check failures it renders an inline error banner.
-func (s *Server) handleHostUpdateForm(w stdhttp.ResponseWriter, r *stdhttp.Request) {
-	user, ok := s.requireUser(r)
-	if !ok {
-		stdhttp.Error(w, "unauthorised", stdhttp.StatusUnauthorized)
-		return
-	}
-	hostID := chi.URLParam(r, "id")
-	if hostID == "" {
-		stdhttp.Error(w, "missing host_id", stdhttp.StatusBadRequest)
-		return
-	}
-	actor := "user"
-	var actorID *string
-	if user != nil {
-		actorID = &user.ID
-	}
-	res := s.dispatchHostUpdate(r.Context(), hostID, actor, actorID)
-	if res.Code != "" {
-		// Inline banner for HTMX swaps. Mirrors what host_credentials
-		// returns on validation errors — small text/html fragment.
-		w.Header().Set("Content-Type", "text/html; charset=utf-8")
-		w.WriteHeader(res.Status)
-		msg := hostUpdateErrorMessage(res.Code, res.Msg)
-		_, _ = w.Write([]byte(`<div class="banner banner-error" role="alert">` + htmlEscape(msg) + `</div>`))
-		return
-	}
-	w.Header().Set("HX-Redirect", "/jobs/"+res.JobID)
-	w.WriteHeader(stdhttp.StatusOK)
-}
-
-func hostUpdateErrorMessage(code, msg string) string {
-	switch code {
-	case "host_not_found":
-		return "Host not found."
-	case "host_offline":
-		return "Agent is offline; can't deliver the update command."
-	case "already_up_to_date":
-		return "Agent is already running the current version."
-	case "update_in_progress":
-		return "An update is already in progress for this host."
-	}
-	if msg != "" {
-		return msg
-	}
-	return "Update dispatch failed."
-}
-
-// htmlEscape is a minimal HTML-attr-safe escaper. Avoids pulling html/template
-// for a one-shot inline banner.
-func htmlEscape(s string) string {
-	out := make([]byte, 0, len(s))
-	for i := 0; i < len(s); i++ {
-		switch s[i] {
-		case '&':
-			out = append(out, []byte("&amp;")...)
-		case '<':
-			out = append(out, []byte("&lt;")...)
-		case '>':
-			out = append(out, []byte("&gt;")...)
-		case '"':
-			out = append(out, []byte("&quot;")...)
-		default:
-			out = append(out, s[i])
-		}
-	}
-	return string(out)
-}
@@ -1,270 +0,0 @@
-// host_update_test.go — covers POST /api/hosts/{id}/update.
-package http
-
-import (
-	"context"
-	"encoding/json"
-	"io"
-	stdhttp "net/http"
-	"strings"
-	"sync"
-	"testing"
-	"time"
-
-	"github.com/coder/websocket"
-	"github.com/oklog/ulid/v2"
-
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
-)
-
-// stubWatcher records Track calls so tests can assert the watcher was
-// notified.
-type stubWatcher struct {
-	mu      sync.Mutex
-	tracked []string // hostIDs
-}
-
-func (s *stubWatcher) Track(_, hostID string) {
-	s.mu.Lock()
-	defer s.mu.Unlock()
-	s.tracked = append(s.tracked, hostID)
-}
-
-func TestHostUpdateHappyPath(t *testing.T) {
-	t.Parallel()
-	srv, ts, st := rawTestServer(t)
-	watcher := &stubWatcher{}
-	srv.deps.UpdateWatcher = watcher
-	hostID, token := enrolHostForWS(t, srv, st, "upd-host")
-	c := agentDial(t, srv, ts, hostID, token)
-	sendHello(t, c, "upd-host")
-	_ = drainUntil(t, c, api.MsgScheduleSet)
-
-	// Force a version mismatch so the dispatch isn't short-circuited.
-	if err := st.MarkHostHello(context.Background(), hostID, "v0", "0.17", api.CurrentProtocolVersion, time.Now().UTC()); err != nil {
-		t.Fatalf("mark hello: %v", err)
-	}
-
-	cookie := loginAsAdmin(t, st)
-	req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/hosts/"+hostID+"/update", nil)
-	req.AddCookie(cookie)
-	res, err := stdhttp.DefaultClient.Do(req)
-	if err != nil {
-		t.Fatalf("do: %v", err)
-	}
-	defer res.Body.Close()
-	if res.StatusCode != stdhttp.StatusAccepted {
-		t.Fatalf("status: got %d, want 202", res.StatusCode)
-	}
-	var out struct {
-		JobID string `json:"job_id"`
-	}
-	if err := json.NewDecoder(res.Body).Decode(&out); err != nil {
-		t.Fatalf("decode: %v", err)
-	}
-	if out.JobID == "" {
-		t.Fatal("missing job_id in response")
-	}
-
-	// command.update envelope arrives.
-	deadline := time.Now().Add(2 * time.Second)
-	var got api.Envelope
-	for time.Now().Before(deadline) {
-		ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
-		mt, raw, rerr := c.Read(ctx)
-		cancel()
-		if rerr != nil {
-			break
-		}
-		if mt != websocket.MessageText {
-			continue
-		}
-		if !strings.Contains(string(raw), `"command.update"`) {
-			continue
-		}
-		_ = json.Unmarshal(raw, &got)
-		break
-	}
-	if got.Type != api.MsgCommandUpdate {
-		t.Fatal("never received command.update envelope")
-	}
-	var cp api.CommandUpdatePayload
-	if err := got.UnmarshalPayload(&cp); err != nil {
-		t.Fatalf("payload: %v", err)
-	}
-	if cp.JobID != out.JobID {
-		t.Fatalf("payload job_id: got %q want %q", cp.JobID, out.JobID)
-	}
-
-	// Watcher tracked.
-	watcher.mu.Lock()
-	defer watcher.mu.Unlock()
-	if len(watcher.tracked) != 1 || watcher.tracked[0] != hostID {
-		t.Fatalf("watcher tracked: %v", watcher.tracked)
-	}
-
-	// Audit row exists.
-	var n int
-	if err := st.DB().QueryRow(
-		`SELECT COUNT(*) FROM audit_log WHERE action = 'host.update_dispatched' AND target_id = ?`,
-		hostID).Scan(&n); err != nil {
-		t.Fatalf("audit count: %v", err)
-	}
-	if n != 1 {
-		t.Fatalf("audit rows: got %d, want 1", n)
-	}
-}
-
-func TestHostUpdateNotFound(t *testing.T) {
-	t.Parallel()
-	_, ts, st := rawTestServer(t)
-	cookie := loginAsAdmin(t, st)
-	req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/hosts/no-such/update", nil)
-	req.AddCookie(cookie)
-	res, err := stdhttp.DefaultClient.Do(req)
-	if err != nil {
-		t.Fatalf("do: %v", err)
-	}
-	defer res.Body.Close()
-	if res.StatusCode != stdhttp.StatusNotFound {
-		t.Fatalf("status: got %d want 404", res.StatusCode)
-	}
-}
-
-func TestHostUpdateOffline(t *testing.T) {
-	t.Parallel()
-	_, ts, st := rawTestServer(t)
-	hostID := ulid.Make().String()
-	if err := st.CreateHost(context.Background(), store.Host{
-		ID: hostID, Name: "off", OS: "linux", Arch: "amd64",
-		EnrolledAt: time.Now().UTC(),
-	}, "deadbeef", ""); err != nil {
-		t.Fatalf("create: %v", err)
-	}
-	cookie := loginAsAdmin(t, st)
-	req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/hosts/"+hostID+"/update", nil)
-	req.AddCookie(cookie)
-	res, err := stdhttp.DefaultClient.Do(req)
-	if err != nil {
-		t.Fatalf("do: %v", err)
-	}
-	defer res.Body.Close()
-	if res.StatusCode != stdhttp.StatusConflict {
-		t.Fatalf("status: got %d want 409", res.StatusCode)
-	}
-	body := readJSONError(t, res.Body)
-	if body.Code != "host_offline" {
-		t.Fatalf("code: %q", body.Code)
-	}
-}
-
-func TestHostUpdateAlreadyUpToDate(t *testing.T) {
-	t.Parallel()
-	srv, ts, st := rawTestServer(t)
-	hostID, token := enrolHostForWS(t, srv, st, "uptodate-host")
-	c := agentDial(t, srv, ts, hostID, token)
-	sendHello(t, c, "uptodate-host")
-	_ = drainUntil(t, c, api.MsgScheduleSet)
-
-	// Force agent_version == version.Version.
-	if err := st.MarkHostHello(context.Background(), hostID, version.Version, "0.17", api.CurrentProtocolVersion, time.Now().UTC()); err != nil {
-		t.Fatalf("mark hello: %v", err)
-	}
-
-	cookie := loginAsAdmin(t, st)
-	req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/hosts/"+hostID+"/update", nil)
-	req.AddCookie(cookie)
-	res, err := stdhttp.DefaultClient.Do(req)
-	if err != nil {
-		t.Fatalf("do: %v", err)
-	}
-	defer res.Body.Close()
-	if res.StatusCode != stdhttp.StatusConflict {
-		t.Fatalf("status: got %d want 409", res.StatusCode)
-	}
-	body := readJSONError(t, res.Body)
-	if body.Code != "already_up_to_date" {
-		t.Fatalf("code: %q", body.Code)
-	}
-}
-
-func TestHostUpdateInProgress(t *testing.T) {
-	t.Parallel()
-	srv, ts, st := rawTestServer(t)
-	hostID, token := enrolHostForWS(t, srv, st, "inprog-host")
-	c := agentDial(t, srv, ts, hostID, token)
-	sendHello(t, c, "inprog-host")
-	_ = drainUntil(t, c, api.MsgScheduleSet)
-	if err := st.MarkHostHello(context.Background(), hostID, "v0", "0.17", api.CurrentProtocolVersion, time.Now().UTC()); err != nil {
-		t.Fatalf("mark hello: %v", err)
-	}
-
-	// Pre-seed an in-flight update job.
-	jobID := ulid.Make().String()
-	if err := st.CreateJob(context.Background(), store.Job{
-		ID: jobID, HostID: hostID, Kind: "update",
-		ActorKind: "user", CreatedAt: time.Now().UTC(),
-	}); err != nil {
-		t.Fatalf("seed job: %v", err)
-	}
-
-	cookie := loginAsAdmin(t, st)
-	req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/hosts/"+hostID+"/update", nil)
-	req.AddCookie(cookie)
-	res, err := stdhttp.DefaultClient.Do(req)
-	if err != nil {
-		t.Fatalf("do: %v", err)
-	}
-	defer res.Body.Close()
-	if res.StatusCode != stdhttp.StatusConflict {
-		t.Fatalf("status: got %d want 409", res.StatusCode)
-	}
-	body := readJSONError(t, res.Body)
-	if body.Code != "update_in_progress" {
-		t.Fatalf("code: %q", body.Code)
-	}
-}
-
-func TestHostUpdateRBAC(t *testing.T) {
-	t.Parallel()
-	_, ts, st := rawTestServer(t)
-	hostID := ulid.Make().String()
-	if err := st.CreateHost(context.Background(), store.Host{
-		ID: hostID, Name: "rbac-host", OS: "linux", Arch: "amd64",
-		EnrolledAt: time.Now().UTC(),
-	}, "deadbeef", ""); err != nil {
-		t.Fatalf("create: %v", err)
-	}
-	for _, role := range []store.Role{store.RoleViewer, store.RoleOperator} {
-		role := role
-		t.Run(string(role), func(t *testing.T) {
-			cookie := loginAsRole(t, st, role)
-			req, _ := stdhttp.NewRequest("POST", ts.URL+"/api/hosts/"+hostID+"/update", nil)
-			req.AddCookie(cookie)
-			res, err := stdhttp.DefaultClient.Do(req)
-			if err != nil {
-				t.Fatalf("do: %v", err)
-			}
-			defer res.Body.Close()
-			if res.StatusCode != stdhttp.StatusForbidden {
-				t.Fatalf("status for %s: got %d want 403", role, res.StatusCode)
-			}
-		})
-	}
-}
-
-type jsonErrBody struct {
-	Code    string `json:"code"`
-	Message string `json:"message,omitempty"`
-}
-
-func readJSONError(t *testing.T, body io.Reader) jsonErrBody {
-	t.Helper()
-	var out jsonErrBody
-	if err := json.NewDecoder(body).Decode(&out); err != nil {
-		t.Fatalf("decode error body: %v", err)
-	}
-	return out
-}
@@ -4,7 +4,6 @@ import (
 	stdhttp "net/http"

 	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
 )

 // hostView is the JSON projection of a Host row. Same shape as the
@@ -28,8 +27,6 @@ type hostView struct {
 	RepoSizeBytes    int64    `json:"repo_size_bytes"`
 	SnapshotCount    int      `json:"snapshot_count"`
 	OpenAlertCount   int      `json:"open_alert_count"`
-	UpdateAvailable  bool     `json:"update_available"`
-	TargetVersion    string   `json:"target_version,omitempty"`
 }

 // handleListHosts returns the full fleet as JSON. Authenticated; the
@@ -88,8 +85,6 @@ func hostToView(h store.Host) hostView {
 		RepoSizeBytes:    h.RepoSizeBytes,
 		SnapshotCount:    h.SnapshotCount,
 		OpenAlertCount:   h.OpenAlertCount,
-		TargetVersion:    version.Version,
-		UpdateAvailable:  h.AgentVersion != "" && h.AgentVersion != version.Version,
 	}
 	if v.Tags == nil {
 		v.Tags = []string{}
@@ -1,185 +0,0 @@
-package http
-
-import (
-	"context"
-	"crypto/subtle"
-	"net"
-	"net/http"
-	"net/netip"
-	"runtime"
-	"strings"
-
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/config"
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/metrics"
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
-)
-
-// handleMetrics serves the Prometheus exposition body. The route is
-// only mounted when the operator has opted in via RM_METRICS_TOKEN
-// or RM_METRICS_TRUSTED_CIDR (see Server.New + Cfg.MetricsAuthEnabled).
-func (s *Server) handleMetrics(w http.ResponseWriter, r *http.Request) {
-	if !authoriseMetricsScrape(r, s.deps.Cfg) {
-		// 401 with no body; Prom respects this and surfaces the failed
-		// scrape. WWW-Authenticate hints at bearer when the operator
-		// actually configured a token.
-		if s.deps.Cfg.MetricsToken != "" {
-			w.Header().Set("WWW-Authenticate", `Bearer realm="restic-manager metrics"`)
-		}
-		w.WriteHeader(http.StatusUnauthorized)
-		return
-	}
-
-	snap, err := s.gatherMetricsSnapshot(r.Context())
-	if err != nil {
-		http.Error(w, "snapshot: "+err.Error(), http.StatusInternalServerError)
-		return
-	}
-
-	// 0.0.4 is the long-stable text-format version Prometheus accepts
-	// without negotiation; OpenMetrics is intentionally not used here.
-	w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
-	if err := metrics.Render(w, snap); err != nil {
-		// Body is partially written; nothing useful we can do beyond
-		// dropping the connection (chi's recoverer will log).
-		return
-	}
-}
-
-// authoriseMetricsScrape applies bearer + CIDR gates per the spec.
-// AND semantics when both are configured; either alone is sufficient
-// when only it is configured.
-func authoriseMetricsScrape(r *http.Request, cfg config.Config) bool {
-	tokenOK := true
-	if cfg.MetricsToken != "" {
-		tokenOK = false
-		hdr := r.Header.Get("Authorization")
-		const prefix = "Bearer "
-		if strings.HasPrefix(hdr, prefix) {
-			got := []byte(strings.TrimPrefix(hdr, prefix))
-			want := []byte(cfg.MetricsToken)
-			if subtle.ConstantTimeCompare(got, want) == 1 {
-				tokenOK = true
-			}
-		}
-	}
-
-	cidrOK := true
-	if len(cfg.MetricsTrustedCIDRs) > 0 {
-		cidrOK = false
-		ip := callerIP(r, cfg.TrustedProxies)
-		if ip.IsValid() {
-			for _, c := range cfg.MetricsTrustedCIDRs {
-				prefix, err := netip.ParsePrefix(c)
-				if err != nil {
-					continue
-				}
-				if prefix.Contains(ip) {
-					cidrOK = true
-					break
-				}
-			}
-		}
-	}
-	return tokenOK && cidrOK
-}
-
-// callerIP resolves the client IP. When the request hit the server
-// directly we use RemoteAddr; when the immediate hop is a trusted
-// proxy we honour the right-most untrusted X-Forwarded-For entry
-// (mirrors how realIP middlewares typically resolve).
-func callerIP(r *http.Request, trustedProxies []string) netip.Addr {
-	host, _, err := net.SplitHostPort(r.RemoteAddr)
-	if err != nil {
-		host = r.RemoteAddr
-	}
-	directAddr, err := netip.ParseAddr(host)
-	if err != nil {
-		return netip.Addr{}
-	}
-
-	if !addrInAnyCIDR(directAddr, trustedProxies) {
-		return directAddr
-	}
-
-	xff := r.Header.Get("X-Forwarded-For")
-	if xff == "" {
-		return directAddr
-	}
-	parts := strings.Split(xff, ",")
-	// Walk right→left, skipping trusted proxies, until we land on the
-	// first untrusted hop — that's the genuine client.
-	for i := len(parts) - 1; i >= 0; i-- {
-		p := strings.TrimSpace(parts[i])
-		a, err := netip.ParseAddr(p)
-		if err != nil {
-			continue
-		}
-		if addrInAnyCIDR(a, trustedProxies) {
-			continue
-		}
-		return a
-	}
-	return directAddr
-}
-
-func addrInAnyCIDR(a netip.Addr, cidrs []string) bool {
-	for _, c := range cidrs {
-		pre, err := netip.ParsePrefix(c)
-		if err != nil {
-			continue
-		}
-		if pre.Contains(a) {
-			return true
-		}
-	}
-	return false
-}
-
-// gatherMetricsSnapshot pulls the data the renderer needs. One
-// indexed query per per-host or fleet-wide read; no N+1.
-func (s *Server) gatherMetricsSnapshot(ctx context.Context) (metrics.Snapshot, error) {
-	hosts, err := s.deps.Store.ListHosts(ctx)
-	if err != nil {
-		return metrics.Snapshot{}, err
-	}
-	hostRows := make([]metrics.HostRow, 0, len(hosts))
-	for _, h := range hosts {
-		row := metrics.HostRow{
-			ID:             h.ID,
-			Name:           h.Name,
-			Online:         h.Status == "online",
-			SnapshotCount:  h.SnapshotCount,
-			OpenAlertCount: h.OpenAlertCount,
-			RepoStatus:     h.RepoStatus,
-		}
-		if h.LastBackupAt != nil {
-			ts := h.LastBackupAt.Unix()
-			row.LastBackupUnix = &ts
-		}
-		if h.LastBackupStatus != nil {
-			ok := *h.LastBackupStatus == "succeeded"
-			row.LastBackupSucceeded = &ok
-		}
-		if h.RepoSizeBytes > 0 {
-			sz := h.RepoSizeBytes
-			row.RepoSizeBytes = &sz
-		}
-		hostRows = append(hostRows, row)
-	}
-
-	open, err := s.deps.Store.ListAlerts(ctx, store.AlertFilter{Status: "open"})
-	if err != nil {
-		return metrics.Snapshot{}, err
-	}
-	bySeverity := map[string]int{"info": 0, "warning": 0, "critical": 0}
-	for _, a := range open {
-		bySeverity[a.Severity]++
-	}
-
-	reg := s.deps.Metrics
-	if reg == nil {
-		reg = metrics.NewRegistry() // empty histogram block
-	}
-	return reg.SnapshotWith(hostRows, bySeverity, version.Version, version.Commit, runtime.Version()), nil
-}
@@ -1,209 +0,0 @@
-package http
-
-import (
-	"context"
-	"io"
-	stdhttp "net/http"
-	"net/http/httptest"
-	"path/filepath"
-	"strings"
-	"testing"
-
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/crypto"
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/config"
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/metrics"
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
-)
-
-// newMetricsServer builds a Server with metrics enabled per cfg.
-// Returns (URL, registry) so tests can both observe job durations
-// directly and exercise the HTTP gate.
-func newMetricsServer(t *testing.T, cfg config.Config) (string, *metrics.Registry, *store.Store) {
-	t.Helper()
-	dir := t.TempDir()
-
-	st, err := store.Open(context.Background(), filepath.Join(dir, "rm.db"))
-	if err != nil {
-		t.Fatalf("store: %v", err)
-	}
-	t.Cleanup(func() { _ = st.Close() })
-
-	keyPath := filepath.Join(dir, "secret.key")
-	if err := crypto.GenerateKeyFile(keyPath); err != nil {
-		t.Fatalf("genkey: %v", err)
-	}
-	key, _ := crypto.LoadKeyFromFile(keyPath)
-	aead, _ := crypto.NewAEAD(key)
-
-	cfg.Listen = ":0"
-	cfg.DataDir = dir
-	cfg.SecretKeyFile = keyPath
-
-	reg := metrics.NewRegistry()
-	deps := Deps{
-		Cfg:     cfg,
-		Store:   st,
-		AEAD:    aead,
-		Metrics: reg,
-	}
-	s := New(deps)
-	ts := httptest.NewServer(s.srv.Handler)
-	t.Cleanup(ts.Close)
-	return ts.URL, reg, st
-}
-
-func TestMetricsRouteNotMountedByDefault(t *testing.T) {
-	t.Parallel()
-	url, _, _ := newMetricsServer(t, config.Config{})
-	res, err := stdhttp.Get(url + "/metrics")
-	if err != nil {
-		t.Fatalf("GET: %v", err)
-	}
-	defer res.Body.Close()
-	if res.StatusCode != stdhttp.StatusNotFound {
-		t.Errorf("status: got %d, want 404 (route should not be mounted)", res.StatusCode)
-	}
-}
-
-func TestMetricsTokenRequired(t *testing.T) {
-	t.Parallel()
-	url, _, _ := newMetricsServer(t, config.Config{
-		MetricsToken: "the-token",
-	})
-
-	// Missing token.
-	res, err := stdhttp.Get(url + "/metrics")
-	if err != nil {
-		t.Fatalf("GET: %v", err)
-	}
-	defer res.Body.Close()
-	if res.StatusCode != stdhttp.StatusUnauthorized {
-		t.Errorf("no token: got %d", res.StatusCode)
-	}
-	if !strings.Contains(res.Header.Get("WWW-Authenticate"), "Bearer") {
-		t.Errorf("WWW-Authenticate hint missing: %q", res.Header.Get("WWW-Authenticate"))
-	}
-
-	// Wrong token.
-	req, _ := stdhttp.NewRequest(stdhttp.MethodGet, url+"/metrics", nil)
-	req.Header.Set("Authorization", "Bearer not-the-token")
-	res2, err := stdhttp.DefaultClient.Do(req)
-	if err != nil {
-		t.Fatalf("GET: %v", err)
-	}
-	defer res2.Body.Close()
-	if res2.StatusCode != stdhttp.StatusUnauthorized {
-		t.Errorf("wrong token: got %d", res2.StatusCode)
-	}
-
-	// Right token.
-	req3, _ := stdhttp.NewRequest(stdhttp.MethodGet, url+"/metrics", nil)
-	req3.Header.Set("Authorization", "Bearer the-token")
-	res3, err3 := stdhttp.DefaultClient.Do(req3)
-	if err3 != nil {
-		t.Fatalf("GET: %v", err3)
-	}
-	defer res3.Body.Close()
-	if res3.StatusCode != stdhttp.StatusOK {
-		t.Errorf("right token: got %d", res3.StatusCode)
-	}
-	if ct := res3.Header.Get("Content-Type"); !strings.HasPrefix(ct, "text/plain") {
-		t.Errorf("content-type: %q", ct)
-	}
-}
-
-func TestMetricsCIDRGate(t *testing.T) {
-	t.Parallel()
-	// 127.0.0.1 is what httptest hits with; pick a CIDR that excludes it
-	// to assert the "wrong source" branch.
-	url, _, _ := newMetricsServer(t, config.Config{
-		MetricsTrustedCIDRs: []string{"10.0.0.0/8"},
-	})
-	res, err := stdhttp.Get(url + "/metrics")
-	if err != nil {
-		t.Fatalf("GET: %v", err)
-	}
-	defer res.Body.Close()
-	if res.StatusCode != stdhttp.StatusUnauthorized {
-		t.Errorf("loopback hitting non-matching CIDR: got %d, want 401", res.StatusCode)
-	}
-
-	// Now allow loopback.
-	url2, _, _ := newMetricsServer(t, config.Config{
-		MetricsTrustedCIDRs: []string{"127.0.0.0/8"},
-	})
-	res2, err := stdhttp.Get(url2 + "/metrics")
-	if err != nil {
-		t.Fatalf("GET: %v", err)
-	}
-	defer res2.Body.Close()
-	if res2.StatusCode != stdhttp.StatusOK {
-		t.Errorf("loopback in allow CIDR: got %d, want 200", res2.StatusCode)
-	}
-}
-
-func TestMetricsTokenAndCIDRBothRequired(t *testing.T) {
-	t.Parallel()
-	url, _, _ := newMetricsServer(t, config.Config{
-		MetricsToken:        "the-token",
-		MetricsTrustedCIDRs: []string{"127.0.0.0/8"},
-	})
-	// Token only — CIDR ok (loopback) but token missing.
-	res, err := stdhttp.Get(url + "/metrics")
-	if err != nil {
-		t.Fatalf("GET: %v", err)
-	}
-	defer res.Body.Close()
-	if res.StatusCode != stdhttp.StatusUnauthorized {
-		t.Errorf("missing token but in CIDR: got %d", res.StatusCode)
-	}
-
-	// Both right.
-	req, _ := stdhttp.NewRequest(stdhttp.MethodGet, url+"/metrics", nil)
-	req.Header.Set("Authorization", "Bearer the-token")
-	res2, err := stdhttp.DefaultClient.Do(req)
-	if err != nil {
-		t.Fatalf("GET: %v", err)
-	}
-	defer res2.Body.Close()
-	if res2.StatusCode != stdhttp.StatusOK {
-		t.Errorf("both right: got %d", res2.StatusCode)
-	}
-}
-
-func readAll(t *testing.T, r io.Reader) string {
-	t.Helper()
-	b, err := io.ReadAll(r)
-	if err != nil {
-		t.Fatalf("read: %v", err)
-	}
-	return string(b)
-}
-
-func TestMetricsBodyContainsExpectedLines(t *testing.T) {
-	t.Parallel()
-	url, reg, _ := newMetricsServer(t, config.Config{
-		MetricsToken: "the-token",
-	})
-	reg.ObserveJob("backup", "succeeded", 0) // produce one histogram row
-
-	req, _ := stdhttp.NewRequest(stdhttp.MethodGet, url+"/metrics", nil)
-	req.Header.Set("Authorization", "Bearer the-token")
-	res, err := stdhttp.DefaultClient.Do(req)
-	if err != nil {
-		t.Fatalf("GET: %v", err)
-	}
-	defer res.Body.Close()
-	body := readAll(t, res.Body)
-	for _, want := range []string{
-		"rm_hosts_total",
-		"rm_hosts_online",
-		`rm_active_alerts{severity="critical"}`,
-		"rm_build_info{",
-		"rm_job_duration_seconds_count{kind=\"backup\",status=\"succeeded\"}",
-	} {
-		if !strings.Contains(body, want) {
-			t.Errorf("body missing %q\n--- body ---\n%s", want, body)
-		}
-	}
-}
@@ -17,7 +17,6 @@ import (
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/crypto"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/notification"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/config"
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/metrics"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/oidc"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ui"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ws"
@@ -40,13 +39,6 @@ type Deps struct {
 	// NotificationHub (optional, wired in G1) is used by the test-fire
 	// endpoint to dispatch a single synthetic payload through a channel.
 	NotificationHub *notification.Hub
-	// UpdateWatcher tracks in-flight agent self-update dispatches and
-	// reconciles them against incoming hello envelopes. Optional;
-	// nil = no-op (handlers degrade by skipping the Track call).
-	UpdateWatcher UpdateWatcher
-	// FleetWorker drives the rolling fleet-update worker. Optional;
-	// nil = fleet update endpoints (P6-15) report unavailable.
-	FleetWorker FleetWorker
 	// Version is the binary's build version, surfaced in the chrome.
 	// Empty falls back to "dev".
 	Version string
@@ -57,12 +49,6 @@ type Deps struct {
 	// OIDC (optional). Non-nil when the operator has configured an
 	// IdP — handlers under /auth/oidc/* are mounted only when set.
 	OIDC *oidc.Client
-	// Metrics (optional). When non-nil the WS job-finished branch
-	// records job durations and the /metrics handler can pull a
-	// histogram snapshot. Independent of MetricsAuthEnabled — the
-	// recorder runs even if the scrape endpoint is gated off, so a
-	// later config flip doesn't lose the running window.
-	Metrics *metrics.Registry
 }

 // Server is the running HTTP server.
@@ -137,25 +123,16 @@ func (s *Server) routes(r chi.Router) {
 	r.Post("/api/agents/announce", s.handleAnnounce)
 	r.Get("/agent/binary", s.handleAgentBinary)
 	r.Get("/install/*", s.handleInstallAsset)
-	r.Get("/api/version", s.handleVersion)
-	if s.deps.Cfg.MetricsAuthEnabled() {
-		r.Get("/metrics", s.handleMetrics)
-	}
 	if s.deps.Hub != nil {
-		hd := ws.HandlerDeps{
+		r.Mount("/ws/agent", ws.AgentHandler(ws.HandlerDeps{
 			Hub:            s.deps.Hub,
 			Store:          s.deps.Store,
 			JobHub:         s.deps.JobHub,
 			AlertEngine:    s.deps.AlertEngine,
-			Metrics:        s.deps.Metrics,
 			OnHello:        s.onAgentHello,
 			OnScheduleAck:  s.applyScheduleAck,
 			OnScheduleFire: s.dispatchScheduledJob,
-		}
-		if w, ok := s.deps.UpdateWatcher.(*ws.UpdateWatcher); ok && w != nil {
-			hd.UpdateWatcher = w
-		}
-		r.Mount("/ws/agent", ws.AgentHandler(hd))
+		}))
 	}
 	r.Get("/ws/agent/pending", s.handlePendingWS)
 	r.Mount("/static/", staticHandler())
@@ -206,9 +183,7 @@ func (s *Server) routes(r chi.Router) {
 			r.Get("/hosts/{id}/sources", s.handleUIHostSources)
 			r.Get("/hosts/{id}/sources/new", s.handleUISourceGroupNewGet)
 			r.Get("/hosts/{id}/sources/{gid}/edit", s.handleUISourceGroupEditGet)
-			r.Get("/hosts/{id}/jobs", s.handleUIHostJobs)
 			r.Get("/hosts/{id}/repo", s.handleUIHostRepo)
-			r.Get("/hosts/{id}/repo/trend", s.handleUIRepoTrend)
 			r.Get("/hosts/{id}/schedules", s.handleUISchedulesList)
 			r.Get("/hosts/{id}/schedules/new", s.handleUIScheduleNewGet)
 			r.Get("/hosts/{id}/schedules/{sid}/edit", s.handleUIScheduleEditGet)
@@ -295,14 +270,6 @@ func (s *Server) routes(r chi.Router) {
 	r.Group(func(r chi.Router) {
 		r.Use(s.requireRole(store.RoleAdmin))

-		r.Post("/api/hosts/{id}/update", s.handleHostUpdate)
-		r.Post("/hosts/{id}/update", s.handleHostUpdateForm)
-
-		// Fleet update (P6-15): rolling update across many hosts.
-		r.Post("/api/fleet/update", s.handleAPIFleetUpdateStart)
-		r.Post("/api/fleet-updates/{id}/cancel", s.handleAPIFleetUpdateCancel)
-		r.Get("/api/fleet-updates/{id}", s.handleAPIFleetUpdateGet)
-
 		r.Get("/api/users", s.handleAPIUsersList)
 		r.Post("/api/users", s.handleAPIUserCreate)
 		r.Get("/api/users/{id}", s.handleAPIUserGet)
@@ -316,8 +283,6 @@ func (s *Server) routes(r chi.Router) {
 		if s.deps.UI != nil {
 			r.Post("/hosts/{id}/delete", s.handleUIHostDelete)
 			r.Get("/settings", s.handleUISettings)
-			r.Get("/settings/fleet-update", s.handleUIFleetUpdate)
-			r.Get("/settings/fleet-update/partial", s.handleUIFleetUpdatePartial)
 			r.Get("/settings/users", s.handleUIUsersList)
 			r.Get("/settings/users/new", s.handleUIUserNewGet)
 			r.Post("/settings/users/new", s.handleUIUserNewPost)
@@ -356,27 +321,6 @@ func (s *Server) Shutdown(ctx context.Context) error {
 	return s.srv.Shutdown(ctx)
 }

-// SetFleetWorker installs the fleet-update worker post-construction.
-// Used to break the wiring loop in cmd/server (the worker depends on a
-// dispatcher that delegates back into the server's host-update path).
-func (s *Server) SetFleetWorker(fw FleetWorker) { s.deps.FleetWorker = fw }
-
-// DispatchHostUpdate is the public entry point for callers (the fleet
-// worker) that need to drive the same dispatch path the HTTP handler
-// uses, without going through HTTP. Returns the structured result so
-// the caller can map error codes to its own status enum.
-func (s *Server) DispatchHostUpdate(ctx context.Context, hostID, actorUserID string) (jobID string, code string, err error) {
-	var actorID *string
-	if actorUserID != "" {
-		actorID = &actorUserID
-	}
-	res := s.dispatchHostUpdate(ctx, hostID, "user", actorID)
-	if res.Code != "" {
-		return res.JobID, res.Code, nil
-	}
-	return res.JobID, "", nil
-}
-
 // Addr returns the configured listen address. Useful in tests when
 // the caller passes :0 to get a random port.
 func (s *Server) Addr() string { return s.srv.Addr }
@@ -1,83 +0,0 @@
-package http
-
-import (
-	"context"
-	stdhttp "net/http"
-	"strings"
-	"testing"
-	"time"
-
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
-)
-
-func getDashboard(t *testing.T, baseURL string, cookie *stdhttp.Cookie) string {
-	t.Helper()
-	client := &stdhttp.Client{
-		CheckRedirect: func(_ *stdhttp.Request, _ []*stdhttp.Request) error {
-			return stdhttp.ErrUseLastResponse
-		},
-	}
-	req, err := stdhttp.NewRequest("GET", baseURL+"/", nil)
-	if err != nil {
-		t.Fatalf("new request: %v", err)
-	}
-	req.AddCookie(cookie)
-	res, err := client.Do(req)
-	if err != nil {
-		t.Fatalf("GET /: %v", err)
-	}
-	defer res.Body.Close()
-	if res.StatusCode != stdhttp.StatusOK {
-		t.Fatalf("GET /: want 200, got %d", res.StatusCode)
-	}
-	body := make([]byte, 0, 1<<20)
-	buf := make([]byte, 4096)
-	for {
-		n, rerr := res.Body.Read(buf)
-		body = append(body, buf[:n]...)
-		if rerr != nil {
-			break
-		}
-	}
-	return string(body)
-}
-
-func TestDashboard_HostRowSparklineRendersWithHistory(t *testing.T) {
-	t.Parallel()
-	_, baseURL, st := newTestServerWithUI(t)
-	cookie := loginAsAdmin(t, st)
-	hostID := makeHost(t, st, "h-spark")
-	ctx := context.Background()
-
-	// Two history points → polyline must render.
-	for i, day := range []string{"2026-05-05", "2026-05-06"} {
-		v := int64(100 + i*50)
-		if err := st.UpsertHostRepoStatsHistory(ctx, hostID, day,
-			store.HostRepoStats{TotalSizeBytes: &v}, time.Now().UTC()); err != nil {
-			t.Fatalf("upsert %s: %v", day, err)
-		}
-	}
-
-	body := getDashboard(t, baseURL, cookie)
-	if !strings.Contains(body, `class="repo-sparkline"`) {
-		t.Errorf("expected sparkline SVG in dashboard body (class=repo-sparkline missing)")
-	}
-	if !strings.Contains(body, `<polyline`) {
-		t.Errorf("expected <polyline> in dashboard body")
-	}
-}
-
-func TestDashboard_HostRowSparklineEmptyState(t *testing.T) {
-	t.Parallel()
-	_, baseURL, st := newTestServerWithUI(t)
-	cookie := loginAsAdmin(t, st)
-	makeHost(t, st, "h-empty")
-
-	body := getDashboard(t, baseURL, cookie)
-	if !strings.Contains(body, `class="repo-sparkline"`) {
-		t.Errorf("expected sparkline SVG element on dashboard")
-	}
-	if !strings.Contains(body, `>—<`) {
-		t.Errorf("expected em-dash placeholder in empty sparkline cell")
-	}
-}
@@ -5,10 +5,8 @@ import (
 	"encoding/base64"
 	"encoding/json"
 	"errors"
-	"html/template"
 	"io/fs"
 	"log/slog"
-	"math"
 	stdhttp "net/http"
 	"net/url"
 	"sort"
@@ -25,8 +23,6 @@ import (
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ui"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ws"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/web/sparkline"
 	"gitea.dcglab.co.uk/steve/restic-manager/web"
 )

@@ -159,10 +155,6 @@ type dashboardPage struct {
 	// when it's already active). Pre-computed so the template stays
 	// dumb.
 	SortURL map[string]string
-	// UpdatesBehind is the count of online hosts whose agent_version
-	// trails the server. Surfaces as the dashboard "N hosts behind"
-	// hero tile and links to ?updates=behind.
-	UpdatesBehind int
 }

 // dashboardFilter holds the parsed query-string filter state.
@@ -173,10 +165,6 @@ type dashboardFilter struct {
 	Tag        string // mirrors ActiveTag for round-trip on links
 	Sort       string // column key (see sortDashboard)
 	Dir        string // "asc" | "desc"
-	// Updates narrows to hosts whose agent is behind the server's
-	// version. Only valid value today is "behind"; empty means no
-	// filter.
-	Updates string
 }

 // dashboardHostRow carries a host plus the per-row Run-now decision
@@ -192,17 +180,6 @@ type dashboardHostRow struct {
 	// NextRun is the next-fire time of RunAllScheduleID (when set),
 	// computed server-side from its cron. nil otherwise.
 	NextRun *time.Time
-	// UpdateAvailable is true when the host's agent has connected at
-	// least once AND its agent_version differs from the server's. Used
-	// by the host_row partial to render the update-available chip.
-	UpdateAvailable bool
-	// TargetVersion is the server's build version, surfaced in the
-	// chip's tooltip and label.
-	TargetVersion string
-	// RepoSparklineSVG is a server-rendered inline SVG showing the
-	// 30-day repo-size trend. Empty-state SVG (em-dash) is returned
-	// when no history rows exist for the host.
-	RepoSparklineSVG template.HTML
 }

 // pickRunAllSchedule returns the ID of the single schedule whose
@@ -278,11 +255,7 @@ func (s *Server) handleUIDashboard(w stdhttp.ResponseWriter, r *stdhttp.Request)
 	// calls per host — fine at fleet sizes we care about.
 	rows := make([]dashboardHostRow, 0, len(hosts))
 	for _, h := range hosts {
-		row := dashboardHostRow{
-			Host:            h,
-			TargetVersion:   version.Version,
-			UpdateAvailable: h.AgentVersion != "" && h.AgentVersion != version.Version,
-		}
+		row := dashboardHostRow{Host: h}
 		groups, gerr := s.deps.Store.ListSourceGroupsByHost(r.Context(), h.ID)
 		if gerr != nil {
 			slog.Warn("ui dashboard: list source groups", "host_id", h.ID, "err", gerr)
@@ -303,20 +276,6 @@ func (s *Server) handleUIDashboard(w stdhttp.ResponseWriter, r *stdhttp.Request)
 				}
 			}
 		}
-		since := time.Now().UTC().AddDate(0, 0, -30)
-		pts, herr := s.deps.Store.ListHostRepoStatsHistory(r.Context(), h.ID, since)
-		if herr != nil {
-			slog.Warn("ui dashboard: list repo history", "host_id", h.ID, "err", herr)
-		}
-		sparkPoints := make([]float64, len(pts))
-		for i, p := range pts {
-			if p.TotalSizeBytes == nil {
-				sparkPoints[i] = math.NaN()
-			} else {
-				sparkPoints[i] = float64(*p.TotalSizeBytes)
-			}
-		}
-		row.RepoSparklineSVG = sparkline.RenderSparkline(sparkPoints, 88, 20)
 		rows = append(rows, row)
 	}

@@ -330,13 +289,6 @@ func (s *Server) handleUIDashboard(w stdhttp.ResponseWriter, r *stdhttp.Request)
 		critOpenCount = len(crit)
 	}

-	updatesBehind := 0
-	for _, h := range allHosts {
-		if h.Status == "online" && h.AgentVersion != "" && h.AgentVersion != version.Version {
-			updatesBehind++
-		}
-	}
-
 	view := s.baseView(r, u)
 	view.Page = dashboardPage{
 		Hosts:         rows,
@@ -350,7 +302,6 @@ func (s *Server) handleUIDashboard(w stdhttp.ResponseWriter, r *stdhttp.Request)
 		Filter:        filter,
 		RefreshURL:    "/?" + filter.encode(),
 		SortURL:       buildDashboardSortURLs(filter),
-		UpdatesBehind: updatesBehind,
 	}
 	if err := s.deps.UI.Render(w, "dashboard", view); err != nil {
 		slog.Error("ui: render dashboard", "err", err)
@@ -369,7 +320,6 @@ func parseDashboardFilter(q url.Values) dashboardFilter {
 		Tag:        q.Get("tag"),
 		Sort:       q.Get("sort"),
 		Dir:        q.Get("dir"),
-		Updates:    q.Get("updates"),
 	}
 	if f.Sort == "" {
 		f.Sort = "name"
@@ -402,9 +352,6 @@ func (f dashboardFilter) encode() string {
 	if f.Dir != "" && f.Dir != "asc" {
 		v.Set("dir", f.Dir)
 	}
-	if f.Updates != "" {
-		v.Set("updates", f.Updates)
-	}
 	return v.Encode()
 }

@@ -455,11 +402,6 @@ func filterAndSortDashboardHosts(hosts []store.Host, f dashboardFilter) []store.
 				continue
 			}
 		}
-		if f.Updates == "behind" {
-			if h.AgentVersion == "" || h.AgentVersion == version.Version {
-				continue
-			}
-		}
 		out = append(out, h)
 	}
 	sortDashboardHosts(out, f.Sort, f.Dir)
@@ -867,20 +809,6 @@ type hostChromeData struct {
 	SourceGroupCount int
 	ScheduleCount    int
 	ScheduleVersion  int64 // host_schedule_version (latest desired)
-	// UpdateAvailable + TargetVersion drive the agent-out-of-date chip
-	// in the host detail header. UpdateAvailable is true iff the host
-	// has connected at least once AND its agent_version != server's.
-	UpdateAvailable bool
-	TargetVersion   string
-	// Online + UpdateInProgress drive the per-host "Update agent"
-	// button on host_detail. Online mirrors hub.Connected; pulled here
-	// so the button can disable when the host is unreachable.
-	Online           bool
-	UpdateInProgress bool
-	// CanAdmin is true when the viewing user has admin role; used to
-	// gate the "Update agent" button. Kept on the chrome struct so any
-	// page reusing host_chrome already has it for free.
-	CanAdmin bool
 	// KnownTags is the union of tags already in use across the fleet,
 	// used for autocomplete on the host-tags edit form. Cheap query.
 	KnownTags []string
@@ -906,14 +834,6 @@ type hostChromeData struct {
 // render the page with stale counts than 500 the whole tab.
 func (s *Server) loadHostChrome(r *stdhttp.Request, host store.Host, subtab, crumb string) hostChromeData {
 	d := hostChromeData{Host: host, SubTab: subtab, Crumb: crumb}
-	d.TargetVersion = version.Version
-	d.UpdateAvailable = host.AgentVersion != "" && host.AgentVersion != version.Version
-	if s.deps.Hub != nil {
-		d.Online = s.deps.Hub.Connected(host.ID)
-	}
-	if existing, _ := s.deps.Store.RunningUpdateJobForHost(r.Context(), host.ID); existing != "" {
-		d.UpdateInProgress = true
-	}
 	if groups, err := s.deps.Store.ListSourceGroupsByHost(r.Context(), host.ID); err == nil {
 		d.SourceGroupCount = len(groups)
 	} else {
@@ -1052,10 +972,8 @@ func (s *Server) handleUIHostDetail(w stdhttp.ResponseWriter, r *stdhttp.Request

 	view := s.baseView(r, u)
 	view.Title = host.Name + " · restic-manager"
-	chrome := s.loadHostChrome(r, *host, "snapshots", "snapshots")
-	chrome.CanAdmin = u.Role == string(store.RoleAdmin)
 	view.Page = hostDetailPage{
-		hostChromeData: chrome,
+		hostChromeData: s.loadHostChrome(r, *host, "snapshots", "snapshots"),
 		Snapshots:      shown,
 		SnapshotsShown: len(shown),
 		LegacyRestic:   !restic.Env{Version: host.ResticVersion}.AtLeastVersion(0, 17),
@@ -1,47 +0,0 @@
-package http
-
-import (
-	"log/slog"
-	stdhttp "net/http"
-
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
-)
-
-// hostJobsPage is the page-data struct for /hosts/{id}/jobs.
-type hostJobsPage struct {
-	hostChromeData
-	Jobs []store.Job
-}
-
-// handleUIHostJobs renders the per-host jobs list. Read-only — no
-// actions, just a click-through to the existing /jobs/{id} detail
-// page for any row.
-func (s *Server) handleUIHostJobs(w stdhttp.ResponseWriter, r *stdhttp.Request) {
-	u := s.requireUIUser(w, r)
-	if u == nil {
-		return
-	}
-	host, ok := s.loadHostForUI(w, r)
-	if !ok {
-		return
-	}
-
-	jobs, err := s.deps.Store.ListJobsByHost(r.Context(), host.ID, 100)
-	if err != nil {
-		slog.Error("ui host jobs: list", "host_id", host.ID, "err", err)
-		stdhttp.Error(w, "internal", stdhttp.StatusInternalServerError)
-		return
-	}
-
-	page := hostJobsPage{
-		hostChromeData: s.loadHostChrome(r, *host, "jobs", "jobs"),
-		Jobs:           jobs,
-	}
-	view := s.baseView(r, u)
-	view.Title = host.Name + " jobs · restic-manager"
-	view.Page = page
-	if err := s.deps.UI.Render(w, "host_jobs", view); err != nil {
-		slog.Error("ui: render host_jobs", "err", err)
-		stdhttp.Error(w, "internal", stdhttp.StatusInternalServerError)
-	}
-}
@@ -1,85 +0,0 @@
-package http
-
-import (
-	"context"
-	"io"
-	stdhttp "net/http"
-	"strings"
-	"testing"
-	"time"
-
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
-)
-
-func TestUIHostJobs_RendersList(t *testing.T) {
-	t.Parallel()
-	_, baseURL, st := newTestServerWithUI(t)
-	cookie := loginAsAdmin(t, st)
-	hostID := makeHost(t, st, "h-jobs-render")
-
-	// Two jobs with distinct kinds + statuses.
-	now := time.Now().UTC()
-	ctx := context.Background()
-	if err := st.CreateJob(ctx, store.Job{
-		ID: "01HZZZZZZZZZZZZZZZZZZZZZ10", HostID: hostID, Kind: "backup",
-		ActorKind: "user", CreatedAt: now.Add(-time.Hour),
-	}); err != nil {
-		t.Fatalf("create job: %v", err)
-	}
-	if err := st.MarkJobFinished(ctx, "01HZZZZZZZZZZZZZZZZZZZZZ10", "succeeded", 0, nil, "", now.Add(-time.Hour+time.Minute)); err != nil {
-		t.Fatalf("finish job: %v", err)
-	}
-	if err := st.CreateJob(ctx, store.Job{
-		ID: "01HZZZZZZZZZZZZZZZZZZZZZ11", HostID: hostID, Kind: "prune",
-		ActorKind: "schedule", CreatedAt: now,
-	}); err != nil {
-		t.Fatalf("create job: %v", err)
-	}
-	if err := st.MarkJobFinished(ctx, "01HZZZZZZZZZZZZZZZZZZZZZ11", "failed", 1, nil, "boom", now.Add(time.Minute)); err != nil {
-		t.Fatalf("finish job: %v", err)
-	}
-
-	body := getHostJobsPage(t, baseURL, hostID, cookie)
-	for _, want := range []string{"backup", "prune", "succeeded", "failed", "schedule", "user", `class="jobs-row`} {
-		if !strings.Contains(body, want) {
-			t.Errorf("expected %q in body, missing", want)
-		}
-	}
-}
-
-func TestUIHostJobs_EmptyState(t *testing.T) {
-	t.Parallel()
-	_, baseURL, st := newTestServerWithUI(t)
-	cookie := loginAsAdmin(t, st)
-	hostID := makeHost(t, st, "h-jobs-empty")
-
-	body := getHostJobsPage(t, baseURL, hostID, cookie)
-	if !strings.Contains(body, "No jobs yet.") {
-		t.Error("expected empty-state heading")
-	}
-}
-
-// getHostJobsPage fetches /hosts/{id}/jobs and returns the body string.
-func getHostJobsPage(t *testing.T, baseURL, hostID string, cookie *stdhttp.Cookie) string {
-	t.Helper()
-	client := &stdhttp.Client{
-		CheckRedirect: func(_ *stdhttp.Request, _ []*stdhttp.Request) error {
-			return stdhttp.ErrUseLastResponse
-		},
-	}
-	req, err := stdhttp.NewRequest("GET", baseURL+"/hosts/"+hostID+"/jobs", nil)
-	if err != nil {
-		t.Fatalf("new request: %v", err)
-	}
-	req.AddCookie(cookie)
-	res, err := client.Do(req)
-	if err != nil {
-		t.Fatalf("GET /hosts/%s/jobs: %v", hostID, err)
-	}
-	defer res.Body.Close()
-	if res.StatusCode != stdhttp.StatusOK {
-		t.Fatalf("GET /hosts/%s/jobs: want 200, got %d", hostID, res.StatusCode)
-	}
-	raw, _ := io.ReadAll(res.Body)
-	return string(raw)
-}
@@ -1,12 +1,9 @@
 package http

 import (
-	"context"
 	"encoding/json"
 	"errors"
-	"html/template"
 	"log/slog"
-	"math"
 	stdhttp "net/http"
 	"strconv"
 	"strings"
@@ -16,7 +13,6 @@ import (

 	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ui"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
-	"gitea.dcglab.co.uk/steve/restic-manager/internal/web/sparkline"
 )

 // ui_repo.go — HTML form-driven repo-tab handlers (connection,
@@ -31,15 +27,6 @@ import (
 //   POST /hosts/{id}/admin-credentials               — admin (prune) creds
 //   POST /hosts/{id}/admin-credentials/delete        — clear admin creds

-// repoTrendView is the data the repo_size_chart partial needs.
-// HostID + Range round-trip through the htmx range pills; ChartSVG
-// is pre-rendered server-side so the partial is just a wrapper.
-type repoTrendView struct {
-	HostID   string
-	Range    string
-	ChartSVG template.HTML
-}
-
 // repoStatsView is a flat, pre-dereferenced projection of
 // store.HostRepoStats for use in templates. Nil pointer fields are
 // collapsed to zero/false and accompanied by a Has* sentinel so the
@@ -87,10 +74,6 @@ type hostRepoPage struct {
 	// Nil when no row exists yet (fresh hosts).
 	StatsView *repoStatsView

-	// Trend holds the pre-rendered chart fragment data for the
-	// 30/90/365-day repo-size + snapshot-count overlay chart.
-	Trend repoTrendView
-
 	// Snapshots-by-tag — map[group_name]count, plus an "untagged" row.
 	SnapshotsByTag    map[string]int
 	UntaggedSnapshots int
@@ -242,52 +225,9 @@ func (s *Server) loadHostRepoPage(r *stdhttp.Request, host store.Host) (*hostRep
 			}
 		}
 	}
-	p.Trend = s.buildRepoTrendView(r.Context(), host.ID, "30d")
-
 	return p, nil
 }

-// buildRepoTrendView builds the chart-partial data for a host. Used
-// both by the page-load (initial 30d render) and the htmx fragment
-// endpoint (range switching). An invalid rangeKey falls back to "30d".
-func (s *Server) buildRepoTrendView(ctx context.Context, hostID, rangeKey string) repoTrendView {
-	days := 30
-	switch rangeKey {
-	case "90d":
-		days = 90
-	case "1y":
-		days = 365
-	default:
-		rangeKey = "30d"
-	}
-	since := time.Now().UTC().AddDate(0, 0, -days)
-	pts, err := s.deps.Store.ListHostRepoStatsHistory(ctx, hostID, since)
-	if err != nil {
-		slog.Warn("ui repo trend: list history", "host_id", hostID, "err", err)
-	}
-	sizes := make([]float64, len(pts))
-	counts := make([]float64, len(pts))
-	dayList := make([]time.Time, len(pts))
-	for i, p := range pts {
-		dayList[i] = p.Day
-		if p.TotalSizeBytes == nil {
-			sizes[i] = math.NaN()
-		} else {
-			sizes[i] = float64(*p.TotalSizeBytes)
-		}
-		if p.SnapshotCount == nil {
-			counts[i] = math.NaN()
-		} else {
-			counts[i] = float64(*p.SnapshotCount)
-		}
-	}
-	chartSVG := sparkline.RenderChart([]sparkline.Series{
-		{Name: "size", Stroke: "#3b82f6", Axis: sparkline.AxisLeft, Format: sparkline.FormatBytes, Points: sizes},
-		{Name: "snapshots", Stroke: "#f59e0b", Axis: sparkline.AxisRight, Format: sparkline.FormatCount, Points: counts},
-	}, dayList, sparkline.ChartOpts{Width: 640, Height: 220})
-	return repoTrendView{HostID: hostID, Range: rangeKey, ChartSVG: chartSVG}
-}
-
 func (s *Server) handleUIHostRepo(w stdhttp.ResponseWriter, r *stdhttp.Request) {
 	u := s.requireUIUser(w, r)
 	if u == nil {
--- a/Show More
+++ b/Show More