Merge pull request 'De-flake TestDrainPendingSerializesPerHost (CI stability)' (#33 ) from fix-flaky-server-http-tests into main

Reviewed-on: #33
test(pending-drain): de-flake TestDrainPendingSerializesPerHost
2026-06-16 15:44:47 +01:00 · 2026-06-16 13:29:47 +01:00 · 2026-06-16 07:32:00 +01:00 · 2026-06-15 23:07:43 +01:00 · 2026-06-15 23:01:03 +01:00 · 2026-06-15 23:00:56 +01:00
317 changed files with 34812 additions and 2460 deletions
@@ -0,0 +1,32 @@
+<!--
+Thanks for the PR! A few quick checks before submitting:
+
+* Did you open an issue first for non-trivial changes?
+* `make lint test` is green locally?
+* Commits are focused (one logical change per commit)?
+* No `Co-Authored-By` trailers (repo policy)?
+* No new dependencies without a one-line justification below?
+-->
+
+## Summary
+
+<!-- One paragraph: what changed and why. -->
+
+## Test plan
+
+<!-- Bullet list of what you actually ran. Be specific.
+     - `make test` → green
+     - Manually exercised the new flow at /hosts/{id}/foo
+     - Smoke env: enrolled a fresh host, ran a backup end-to-end
+-->
+
+## Notes for the reviewer
+
+<!-- Anything the reviewer needs to know that isn't obvious from the
+     diff: related issue, follow-up work that's intentionally not
+     in this PR, deferred concerns, design alternatives considered
+     and rejected. -->
+
+## Linked issues
+
+<!-- "Closes #123" / "Refs #456" / "Part of P5-06" -->
@@ -0,0 +1,52 @@
+---
+name: Bug report
+about: Something isn't behaving the way the docs / code suggest it should
+title: "[bug] "
+labels: bug
+---
+
+## What happened
+
+<!-- A clear description of the actual behaviour. Include the exact
+     UI surface, API endpoint, or CLI invocation involved. -->
+
+## What you expected
+
+<!-- What you thought would happen, and where that expectation came from
+     (docs page, command output, prior behaviour). -->
+
+## Steps to reproduce
+
+1.
+2.
+3.
+
+## Environment
+
+- restic-manager server version: <!-- `restic-manager-server --version` or footer of the UI -->
+- Agent version (if relevant): <!-- `restic-manager-agent --version` -->
+- restic version on affected host: <!-- `restic version` -->
+- Host OS: <!-- e.g. "Ubuntu 22.04 amd64" or "Windows Server 2022" -->
+- How was the server installed: <!-- docker compose / source build / other -->
+
+## Logs / output
+
+<details><summary>Server log (sanitised)</summary>
+
+```
+<!-- paste relevant lines; redact tokens, passwords, repo URLs -->
+```
+
+</details>
+
+<details><summary>Agent log (sanitised)</summary>
+
+```
+```
+
+</details>
+
+## Anything else
+
+<!-- Screenshots, related issues, recent changes you made before the
+     bug appeared, anything that might help. -->
@@ -0,0 +1,34 @@
+---
+name: Feature request
+about: Suggest a new capability or change to existing behaviour
+title: "[feature] "
+labels: enhancement
+---
+
+## What you're trying to do
+
+<!-- Describe the use case, not the proposed solution. Who is the
+     operator, what are they trying to accomplish, and what's
+     blocking them today? -->
+
+## Why the current behaviour falls short
+
+<!-- What does the system do today, and where does it stop short of
+     the use case above? -->
+
+## Proposed direction (optional)
+
+<!-- If you have a specific design in mind, describe it. Skip this
+     section if you'd rather leave it to the maintainer. -->
+
+## Scope check
+
+- [ ] I've read [`spec.md`](../spec.md) §2 (Goals & Non-Goals).
+- [ ] This isn't already on the roadmap in [`tasks.md`](../tasks.md).
+- [ ] This fits the project's "small fleet, one person operating"
+      target rather than enterprise / multi-tenant / SaaS use cases.
+
+## Anything else
+
+<!-- Related restic features, prior art in similar tools, links to
+     discussions you've had elsewhere. -->
@@ -2,28 +2,34 @@
 #
 # Notes for anyone editing this file:
 #
+# Custom runner image
+#   Every job in this file runs inside `docker.dcglab.co.uk/ci-runner-go`
+#   (pulled from the zot registry using the ZOT_USERNAME / ZOT_PASSWORD
+#   secrets; recipe:
+#   https://gitea.dcglab.co.uk/steve/ci/src/branch/main/images/ci-runner-go).
+#   That image already ships:
+#     * Go on PATH at /usr/local/go/bin (so `actions/setup-go` is
+#       redundant and intentionally NOT used here — the action would
+#       otherwise re-download Go on every job)
+#     * Node.js + npm (used by docs / e2e workflows)
+#     * Docker CLI, Buildx, Compose v2 (used by docker-build steps)
+#   When bumping the Go floor, push a new ci-runner-go image with
+#   the matching Go version and bump the date tag on every job's
+#   `container.image` below (the pin is repeated per job; there is
+#   no shared variable).
+#
 # Self-hosted runner expectations
-#   The Gitea runners are provisioned out-of-band (the infra team owns
-#   the script). Each runner host bind-mounts persistent volumes for
-#   /root/go/pkg/mod (GOMODCACHE), /root/.cache/go-build (GOCACHE), and
-#   /root/.cache/act (action clones) into every job container. As a
+#   Each runner host bind-mounts persistent volumes for
+#   /root/go/pkg/mod (GOMODCACHE), /root/.cache/go-build (GOCACHE),
+#   and /root/.cache/act (action clones) into every job container —
+#   regardless of which image the container is built from. As a
 #   result:
-#     * `cache: true` on actions/setup-go is intentionally OMITTED — the
-#       action would otherwise tar/untar GOMODCACHE+GOCACHE through the
-#       Gitea cache backend on every job, undoing the host-volume cache
-#       and adding ~10s of redundant zstd round-trip per job.
-#     * Common GitHub actions (actions/checkout, actions/setup-go,
-#       actions/upload-artifact, golangci/golangci-lint-action) are
-#       pre-cloned into /root/.cache/act on the runner, so the per-job
-#       "git clone https://github.com/actions/..." step is a fetch, not
-#       a full clone.
+#     * Common GitHub actions (actions/checkout, actions/upload-artifact,
+#       golangci/golangci-lint-action) are pre-cloned into
+#       /root/.cache/act on the runner, so the per-job
+#       "git clone https://github.com/actions/..." step is a fetch,
+#       not a full clone.
 #     * golangci-lint is pre-installed at /usr/local/bin/golangci-lint
-#       on the runner (latest v2.x). The golangci-lint-action below
-#       still pins a specific version and re-downloads — that's fine
-#       (deterministic CI > marginal speed) but means the host-installed
-#       binary is currently unused. Drop the `version:` arg below to
-#       use the host-installed one if you want to trade determinism
-#       for speed.
+#       on the runner host BUT that's outside the job's filesystem
+#       view; the golangci-lint-action below pins a specific version
+#       and re-downloads — that's fine (deterministic CI > marginal
+#       speed).
 #
 # Build matrix
 #   Linux amd64 + arm64 + Windows amd64. CGO_ENABLED=0 throughout —
@@ -32,10 +38,10 @@
 #   binaries.
 #
 # Go version
-#   The GO_VERSION env var anchors all three jobs. Floor is set by the
-#   heaviest dep (modernc.org/sqlite v1.50+ requires Go 1.23+ today;
-#   we run 1.25 so golangci-lint's Go-version compatibility check is
-#   happy — see the version pin in the lint job).
+#   Anchored by the ci-runner-go image (currently Go 1.25.7). Floor
+#   is set by the heaviest dep (modernc.org/sqlite v1.50+ requires
+#   Go 1.23+; we run 1.25 so golangci-lint's Go-version compatibility
+#   check is happy — see the version pin in the lint job).
 #
 # upload-artifact
 #   Pinned at v3 historically; v3 was deprecated upstream. v4 should
@@ -48,35 +54,68 @@ on:
  pull_request:
    branches: [main]

-env:
-  GO_VERSION: "1.25"
+# Force bash as the default shell. With `container:` set on every
+# job, Gitea Actions otherwise picks `sh -e` and our `set -euo
+# pipefail` fails on dash with "Illegal option -o pipefail".
+defaults:
+  run:
+    shell: bash

 jobs:
  test:
-    name: Test (linux/amd64)
+    # Sharded by package group. server/http and store are the two
+    # heavy packages (~156s and ~75s in CI respectively under
+    # `-race`); pulling them onto their own runners lets each shard
+    # have all CPUs to itself instead of CPU-starving each other on
+    # one runner. The third shard ("rest") covers everything else.
+    name: Test (${{ matrix.name }})
    runs-on: ubuntu-latest
+    container:
+      image: docker.dcglab.co.uk/ci-runner-go:2026-05-15
+      credentials:
+        username: ${{ secrets.ZOT_USERNAME }}
+        password: ${{ secrets.ZOT_PASSWORD }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - name: server-http
+            packages: ./internal/server/http/...
+          - name: store
+            packages: ./internal/store/...
+          - name: rest
+            # Computed at runtime — see the "go test" step below.
+            packages: ""
    steps:
      - uses: actions/checkout@v4
-      - uses: actions/setup-go@v5
-        with:
-          go-version: ${{ env.GO_VERSION }}
-          # cache: true intentionally omitted — see header notes.
      - name: go vet
        run: go vet ./...
      - name: go test
-        run: go test -race -coverprofile=coverage.out ./...
+        run: |
+          set -euo pipefail
+          if [ -n "${{ matrix.packages }}" ]; then
+            pkgs="${{ matrix.packages }}"
+          else
+            # "rest" shard: everything except the dedicated shards.
+            pkgs=$(go list ./... \
+              | grep -v '/internal/server/http$' \
+              | grep -v '/internal/store$')
+          fi
+          # shellcheck disable=SC2086
+          go test -race -coverprofile=coverage.out $pkgs
      - name: coverage summary
        run: go tool cover -func=coverage.out | tail -1

  lint:
    name: Lint
    runs-on: ubuntu-latest
+    container:
+      image: docker.dcglab.co.uk/ci-runner-go:2026-05-15
+      credentials:
+        username: ${{ secrets.ZOT_USERNAME }}
+        password: ${{ secrets.ZOT_PASSWORD }}
    steps:
      - uses: actions/checkout@v4
-      - uses: actions/setup-go@v5
-        with:
-          go-version: ${{ env.GO_VERSION }}
-          # cache: true intentionally omitted — see header notes.
      - uses: golangci/golangci-lint-action@v7
        with:
          # Must be built against the same Go release as go.mod targets,
@@ -90,6 +129,11 @@ jobs:
  build:
    name: Build (${{ matrix.goos }}/${{ matrix.goarch }})
    runs-on: ubuntu-latest
+    container:
+      image: docker.dcglab.co.uk/ci-runner-go:2026-05-15
+      credentials:
+        username: ${{ secrets.ZOT_USERNAME }}
+        password: ${{ secrets.ZOT_PASSWORD }}
    strategy:
      fail-fast: false
      matrix:
@@ -103,10 +147,6 @@ jobs:
            ext: ".exe"
    steps:
      - uses: actions/checkout@v4
-      - uses: actions/setup-go@v5
-        with:
-          go-version: ${{ env.GO_VERSION }}
-          # cache: true intentionally omitted — see header notes.
      - name: build server + agent
        env:
          GOOS: ${{ matrix.goos }}
@@ -0,0 +1,133 @@
+# P5-06 — End-to-end test suite.
+#
+# Spec : docs/superpowers/specs/2026-05-07-p5-oss-readiness-design.md
+# Stack: e2e/compose.e2e.yml (server + agent + rest-server + playwright)
+# Tests: e2e/playwright/tests/*.spec.ts
+#
+# Triggered on every PR into main and on workflow_dispatch. Runs
+# longer than the unit-test workflow (~3-4 minutes for a clean run);
+# kept separate so a slow e2e doesn't block the fast lint/test loop.
+#
+# Networking note: every interaction with the server (health probe,
+# Playwright) happens from a container on the compose `rmnet`
+# network, addressing the server as `http://server:8080`. We can't
+# rely on `127.0.0.1:8080` because Gitea's runner executes steps
+# inside its own container, where compose's host port-publish is
+# not visible.
+
+name: e2e
+
+on:
+  pull_request:
+    branches: [main]
+  workflow_dispatch:
+
+# Force bash as the default shell — see ci.yml header.
+defaults:
+  run:
+    shell: bash
+
+jobs:
+  e2e:
+    name: Playwright vs docker-compose
+    runs-on: ubuntu-latest
+    container: gitea.dcglab.co.uk/steve/ci-runner-go:2026-05-08
+    timeout-minutes: 15
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Build the e2e stack
+        # --profile test pulls in the playwright service which is
+        # otherwise gated. --pull refreshes base images so a bump
+        # to the Dockerfile's FROM tag (e.g. mcr.microsoft.com/
+        # playwright:vX.Y.Z-jammy) isn't masked by a stale runner
+        # cache that still has the old tag's layers.
+        run: docker compose --profile test -f e2e/compose.e2e.yml build --pull
+
+      - name: Bring up the stack
+        run: docker compose -f e2e/compose.e2e.yml up -d server rest-server source-fixture
+
+      - name: Wait for server health
+        run: |
+          set -eu
+          for i in $(seq 1 30); do
+            if docker run --rm --network e2e_rmnet curlimages/curl:8.10.1 \
+                  -fsS http://server:8080/api/version >/dev/null 2>&1; then
+              echo "server up"; exit 0
+            fi
+            sleep 2
+          done
+          echo "server didn't come up"; docker compose -f e2e/compose.e2e.yml logs server; exit 1
+
+      - name: Capture bootstrap token from server logs
+        id: bootstrap
+        run: |
+          set -eu
+          for i in $(seq 1 15); do
+            line=$(docker compose -f e2e/compose.e2e.yml logs server 2>&1 | grep -E 'bootstrap token' -A2 | grep -Eo '[a-zA-Z0-9_-]{40,}' | head -1 || true)
+            if [ -n "$line" ]; then
+              echo "RM_BOOTSTRAP_TOKEN=$line" >> "$GITHUB_ENV"
+              echo "got bootstrap token (${#line} chars)"
+              exit 0
+            fi
+            sleep 1
+          done
+          echo "bootstrap token not found in logs"
+          docker compose -f e2e/compose.e2e.yml logs server
+          exit 1
+
+      - name: Start the agent
+        run: docker compose -f e2e/compose.e2e.yml up -d agent
+
+      - name: Run Playwright tests
+        id: playwright
+        env:
+          RM_BOOTSTRAP_TOKEN: ${{ env.RM_BOOTSTRAP_TOKEN }}
+        # --name gives the container a stable, known name so the
+        # next step can docker cp out of it before tear-down. We
+        # deliberately drop --rm so the container survives the test
+        # exit; the tear-down step removes it.
+        run: docker compose -f e2e/compose.e2e.yml run --name e2e-pw playwright
+
+      - name: Extract Playwright report
+        if: always() && steps.playwright.outcome != 'skipped'
+        run: |
+          mkdir -p e2e/playwright/playwright-report e2e/playwright/test-results
+          docker cp e2e-pw:/work/playwright-report/. e2e/playwright/playwright-report/ || true
+          docker cp e2e-pw:/work/test-results/. e2e/playwright/test-results/ || true
+
+      - name: Show Playwright failure context (on failure)
+        if: failure()
+        run: |
+          set +e
+          shopt -s nullglob globstar
+          for f in e2e/playwright/test-results/**/error-context.md; do
+            echo "::group::$f"
+            cat "$f"
+            echo "::endgroup::"
+          done
+          echo "Failure attachments (download via the playwright-report artifact):"
+          find e2e/playwright/test-results \( -name '*.png' -o -name '*.webm' -o -name 'trace.zip' \) -printf '  %p\n' | sort
+
+      - name: Compose logs (on failure)
+        if: failure()
+        run: |
+          docker compose -f e2e/compose.e2e.yml logs --tail=200 server
+          docker compose -f e2e/compose.e2e.yml logs --tail=200 agent
+          docker compose -f e2e/compose.e2e.yml logs --tail=200 rest-server
+
+      - name: Upload Playwright report (on failure)
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: playwright-report
+          path: |
+            e2e/playwright/playwright-report
+            e2e/playwright/test-results
+          retention-days: 7
+
+      - name: Tear down
+        if: always()
+        run: |
+          docker rm -f e2e-pw 2>/dev/null || true
+          docker compose -f e2e/compose.e2e.yml down -v
@@ -0,0 +1,111 @@
+# Release workflow — P5-03 (docker-only release path).
+#
+# Spec : docs/superpowers/specs/2026-05-05-p5-03-docker-only-release.md
+# Plan : docs/superpowers/plans/2026-05-05-p5-03-docker-only-release.md
+#
+# What it does
+#   * Triggered by either:
+#       - tag push matching v[0-9]+.[0-9]+.[0-9]+ (real release), or
+#       - workflow_dispatch (snapshot iteration without tagging).
+#   * Cross-builds a multi-arch (linux/amd64,linux/arm64) image of the
+#     server, with three agent binaries (linux amd64+arm64, windows amd64)
+#     plus install.sh / install.ps1 / the systemd unit baked in under
+#     /opt/restic-manager/dist (the read-only fallback path the server
+#     handlers use when <DataDir>/... is empty).
+#   * Pushes to zot OCI registry (docker.dcglab.co.uk).
+#
+# Tag fan-out
+#   * tag push: :vX.Y.Z, :X.Y, :X
+#   * tag push and X >= 1: also :latest
+#   * workflow_dispatch: only :snapshot-<shortsha>; nothing else moves.
+
+name: Release
+
+on:
+  push:
+    tags:
+      - 'v[0-9]+.[0-9]+.[0-9]+'
+  workflow_dispatch:
+
+env:
+  REGISTRY: docker.dcglab.co.uk
+  IMAGE_NAME: restic-manager
+
+# Force bash as the default shell — see ci.yml header.
+defaults:
+  run:
+    shell: bash
+
+jobs:
+  image:
+    name: Build + push image
+    runs-on: ubuntu-latest
+    container:
+      image: docker.dcglab.co.uk/ci-runner-go:2026-05-15
+      credentials:
+        username: ${{ secrets.ZOT_USERNAME }}
+        password: ${{ secrets.ZOT_PASSWORD }}
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: docker/setup-qemu-action@v3
+      - uses: docker/setup-buildx-action@v3
+
+      - name: Log in to zot registry
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ secrets.ZOT_USERNAME }}
+          password: ${{ secrets.ZOT_PASSWORD }}
+
+      - name: Compute tags + version
+        id: meta
+        shell: bash
+        run: |
+          set -euo pipefail
+          REG="${REGISTRY}/${IMAGE_NAME}"
+          DATE="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
+          SHORT_SHA="${GITHUB_SHA::7}"
+
+          if [ "${GITHUB_EVENT_NAME}" = "push" ] && [ "${GITHUB_REF_TYPE}" = "tag" ]; then
+            TAG="${GITHUB_REF_NAME}"            # vX.Y.Z
+            VER="${TAG#v}"                       # X.Y.Z
+            MAJOR="${VER%%.*}"
+            MINOR="${VER#${MAJOR}.}"; MINOR="${MINOR%%.*}"
+
+            TAGS="${REG}:${TAG}"
+            TAGS="${TAGS},${REG}:${MAJOR}.${MINOR}"
+            TAGS="${TAGS},${REG}:${MAJOR}"
+            # Pre-1.0 holds back :latest by design; operators must
+            # pin a version explicitly until v1.0.0.
+            if [ "${MAJOR}" -ge 1 ]; then
+              TAGS="${TAGS},${REG}:latest"
+            fi
+            VERSION="${TAG}"
+          else
+            TAGS="${REG}:snapshot-${SHORT_SHA}"
+            VERSION="0.0.0-snapshot-${SHORT_SHA}"
+          fi
+
+          {
+            echo "tags=${TAGS}"
+            echo "version=${VERSION}"
+            echo "date=${DATE}"
+          } >> "${GITHUB_OUTPUT}"
+
+      - name: Build + push
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          file: deploy/Dockerfile.server
+          platforms: linux/amd64,linux/arm64
+          push: true
+          tags: ${{ steps.meta.outputs.tags }}
+          build-args: |
+            VERSION=${{ steps.meta.outputs.version }}
+            COMMIT=${{ gitea.sha }}
+            DATE=${{ steps.meta.outputs.date }}
+          labels: |
+            org.opencontainers.image.version=${{ steps.meta.outputs.version }}
+            org.opencontainers.image.revision=${{ gitea.sha }}
+            org.opencontainers.image.created=${{ steps.meta.outputs.date }}
@@ -2,6 +2,10 @@
 /bin/
 /dist/

+# Generated mdBook output (source under docs/book/src is committed,
+# the rendered book/ directory is not).
+/docs/book/book/
+
 # Local data / runtime state
 /data/
 /certs/
@@ -26,7 +30,25 @@ coverage.html
 .env.local
 *.local

+# Local docker-compose for the dev/test bench. Has host-specific IPs,
+# hostnames, and ports — never committed; the canonical reference
+# deployment lives in deploy/.
+/compose.yaml
+/compose.override.yaml
+
 # Local diagnostic helpers (never shipped). Go's build tooling already
 # skips paths beginning with _ or ., but ignore explicitly so nothing
 # checked in here can leak into a release tarball.
 /_diag/
+
+# Dev-only one-shot binaries (cmd/_*) — never shipped. Go's build
+# tooling already skips paths starting with _, but ignore explicitly
+# so an accidental `git add cmd/.` can't sneak them into a release.
+/cmd/_*/
+
+# Local-only planning / scratch — never committed.
+/ask.md
+/docs/superpowers/
+
+# Claude Code agent worktrees (transient, harness-created).
+/.claude/worktrees/
@@ -26,7 +26,7 @@ linters:
        - name: exported
          arguments: ["disableStutteringCheck"]
    misspell:
-      locale: US
+      locale: UK
  exclusions:
    rules:
      - path: _test\.go
@@ -0,0 +1,127 @@
+# Changelog
+
+All notable changes to this project are documented here.
+The format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
+and the project follows [Semantic Versioning](https://semver.org/).
+
+## [Unreleased]
+
+## [1.1.0] - 2026-06-15
+
+### Added
+
+- **Always-On vs intermittent host mode.** A host can now be marked as
+  not always-on — for laptops/workstations that legitimately sleep,
+  travel, or shut down outside hours. An intermittent host no longer
+  raises "agent offline" alerts when it disappears; instead it shows a
+  calm "asleep" state in the UI ("asleep · last seen … · will catch up
+  on return") and is covered by a longer-horizon staleness alert (raised
+  only when it has an enabled schedule and no successful backup in 7
+  days). When such a host reconnects, the server waits a short settle
+  window and then automatically dispatches any scheduled backup whose
+  window elapsed while it was asleep. Toggle per host from the host
+  detail page (operator-band, audited as `host.mode_updated`). New and
+  existing hosts default to always-on, so current fleets are unaffected.
+
+### Changed
+
+- Host-detail header redesign: tags and presence are grouped into
+  labelled, boxed pills with click-to-edit; presence shows a `24x7` /
+  `Free` chip; the agent "out of date" indicator is simplified (the full
+  version detail remains in the Agent-update panel and on hover).
+- Relative timestamps ("2h ago") now tick client-side, so a tab left
+  open no longer shows a stale value as wall-clock time moves on.
+- Release and CI container images are now published to and pulled from
+  the zot OCI registry (`docker.dcglab.co.uk`).
+
+## [1.0.1] - 2026-05-09
+
+### Fixed
+
+- Build version is now single-sourced from `internal/version`, and the
+  server Dockerfile's ldflags were corrected so docker-built binaries
+  report their real version. Previously `internal/version.Version` stayed
+  at its "dev" default in docker images, which made every host look
+  permanently out-of-date to the update logic.
+
+## [1.0.0] - 2026-05-09
+
+First tagged release. Six development phases brought the project from
+empty repo to a self-hostable, multi-host restic backup orchestrator
+with a web UI, JSON API, and self-updating agent fleet.
+
+### Phase 1 — MVP: enrolment, visibility, on-demand backup
+
+- HTTP server, SQLite store with migrations, AEAD-encrypted
+  credentials at rest, Argon2id password hashing, session cookies.
+- WebSocket transport between server and agents (heartbeat, hello,
+  schedule fan-out, job log streaming).
+- Agent install path for Linux (systemd unit + `install.sh`); one-time
+  enrolment tokens with embedded repo credentials.
+- Run-now backup execution end-to-end, snapshot listing.
+- Server-side encrypted repo creds pushed to the agent on hello.
+
+### Phase 2 — Scheduling, retention, repo operations
+
+- Source groups (paths + excludes + pre/post hooks + bandwidth caps)
+  decoupled from schedules; a schedule fires a source group.
+- Cron-style schedules with retention policies, server-driven
+  reconciliation push and ack.
+- `restic forget`, `prune`, `check`, `unlock` automation; periodic
+  maintenance ticker with per-host stagger.
+- Pending-runs queue with backpressure (`max_concurrent_jobs` per
+  host).
+- Repo stats panel on the host detail page (size, last-check, last-
+  prune, stale-lock banner).
+- Auto-init of repos on first onboard with credential-failure surface
+  on the host detail page.
+- Announce-and-approve enrolment path for hosts that don't have a
+  pre-minted token (Ed25519 fingerprint, operator approves).
+- Windows agent: SCM service integration + `install.ps1` installer.
+- Cross-platform alt-enrolment (announce flow on Windows).
+
+### Phase 3 — Restore, alerts, audit
+
+- Restore wizard: pick a snapshot, pick paths, pick a target
+  (in-place / new directory), live progress.
+- Snapshot diff against parent.
+- Alert engine: per-source-group dedup, severity tiers, ack / resolve.
+- Live-refresh alerts table with severity cues.
+- Audit log UI with filters, sort, CSV export, payload-detail modal.
+
+### Phase 4 — RBAC, OIDC, host tags
+
+- Role-based access control: viewer / operator / admin.
+- User management UI (invite, role change, disable, password reset).
+- Generic OIDC SSO with JIT user provisioning + role mapping.
+- Per-host tags with chip-row filter on the dashboard.
+
+### Phase 5 — OSS readiness
+
+- mdBook-rendered docs site at `docs/book/`.
+- Contributor onboarding (CONTRIBUTING.md, security policy, license).
+- Docker-only release pipeline + reference deployment compose file.
+- Playwright e2e harness covering the smoke runbook.
+
+### Phase 6 — Update delivery + observability
+
+- Agent self-update: server-side channel pin per host, signed binary
+  fetch via the WS transport, atomic swap with rollback on failure.
+- Fleet-wide update orchestration with per-host stagger and an admin
+  pause switch.
+- Prometheus `/metrics` endpoint + Grafana dashboard JSON.
+- Repo size trend per host (90-day rolling) on the host detail page.
+
+### Cross-cutting
+
+- Live dashboard with column sort, filters, free-text host search,
+  background-tab-aware live refresh (5s cadence).
+- Pure-Go binary with embedded UI, no Node/CGO at runtime.
+- Reproducible `-trimpath -ldflags="-s -w"` builds for
+  linux/amd64, linux/arm64, windows/amd64.
+- Sharded CI (server-http / store / rest), pre-commit hooks (gofumpt,
+  go vet, golangci-lint).
+- Threat model published (`docs/threat-model.md`).
+
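+[Unreleased]: https://gitea.dcglab.co.uk/steve/restic-manager/compare/v1.1.0...HEAD
+[1.1.0]: https://gitea.dcglab.co.uk/steve/restic-manager/compare/v1.0.1...v1.1.0
+[1.0.1]: https://gitea.dcglab.co.uk/steve/restic-manager/compare/v1.0.0...v1.0.1
+[1.0.0]: https://gitea.dcglab.co.uk/steve/restic-manager/releases/tag/v1.0.0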
@@ -2,6 +2,19 @@

 Project-specific rules for Claude when working in this repo.

+## Commands
+
+If the user types any of the following commands, follow the instructions in the table.
+
+| Command | Action |
+| --- | --- |
+| :release | trigger a subagent to commit (if needed), push (if needed), raise a PR, and wait for the PR checks to pass or fail. If they fail, report back. If they pass, merge into main |
+
+## Repo
+
+The repo lives inside a Gitea instance; `tea` CLI is available for use by agents
+
+
 ## Run `go vet` before every commit

 CI runs `go vet ./...` and will fail the build on any vet error.
@@ -25,7 +38,7 @@ but the **agent** is fetched by the install script from the server's
 **install script** are fetched from `<DataDir>/install/`. Plain
 `make build` doesn't touch any of those — the source-of-truth files
 in the working tree (`deploy/install/*`, `bin/restic-manager-agent`)
-must be copied into `/tmp/rm-smoke/data/...` *and* the running agent
+must be copied into `$HOME/smoke/data/...` *and* the running agent
 on this dev host needs replacing if the change touches agent code or
 the unit file.

@@ -40,11 +53,13 @@ asking the operator to test.**
 ```sh
 # 1. Restage what the install script serves (binary + unit + script).
 cp bin/restic-manager-agent \
-   /tmp/rm-smoke/data/agent-binaries/restic-manager-agent-linux-amd64
+   $HOME/smoke/data/agent-binaries/restic-manager-agent-linux-amd64
 cp deploy/install/install.sh \
-   /tmp/rm-smoke/data/install/install.sh
+   $HOME/smoke/data/install/install.sh
+cp deploy/install/install.ps1 \
+   $HOME/smoke/data/install/install.ps1
 cp deploy/install/restic-manager-agent.service \
-   /tmp/rm-smoke/data/install/restic-manager-agent.service
+   $HOME/smoke/data/install/restic-manager-agent.service

 # 2. Replace the running agent on this dev box and restart the
 #    service. Skip only when the change is server-side only AND
@@ -59,15 +74,36 @@ sudo -n systemctl restart restic-manager-agent
 # 3. The server runs from the working tree; restart it manually
 #    after a build that touches server code:
 pkill -f restic-manager-server
-RM_LISTEN=:8080 RM_DATA_DIR=/tmp/rm-smoke/data \
+RM_LISTEN=:8080 RM_DATA_DIR=$HOME/smoke/data \
 RM_BASE_URL=http://127.0.0.1:8080 \
-RM_SECRET_KEY_FILE=/tmp/rm-smoke/data/secret.key \
+RM_SECRET_KEY_FILE=$HOME/smoke/data/secret.key \
 RM_COOKIE_SECURE=false \
-./bin/restic-manager-server >> /tmp/rm-smoke/server.log 2>&1 &
+./bin/restic-manager-server >> $HOME/smoke/server.log 2>&1 &
 ```

-A `make smoke-deploy` target that bundles all of this would be a
-good follow-up.
+## Smoke server: use the Make targets, not raw `nohup`
+
+The smoke server runs as a transient `systemd --user` unit named
+`restic-manager-smoke.service` so it survives any sandbox or
+process-group boundary that would otherwise SIGTERM a backgrounded
+process. Use the Make targets:
+
+```
+make smoke-restart   # rebuild server + (re)launch as systemd --user unit
+make smoke-status    # systemctl --user status
+make smoke-logs      # tail $HOME/smoke/server.log
+make smoke-stop      # stop the unit
+make smoke-deploy    # full rebuild + restage agent assets + restart
+```
+
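+`./bin/restic-manager-server &` from inside a Bash tool call gets
+reaped when the tool exits — don't do that. This supersedes the
+manual `pkill` + background launch shown in step 3 of the restage
+block above; use `make smoke-restart` instead. If the unit fails to
+start, check `systemctl --user status restic-manager-smoke` and
+`$HOME/smoke/server.log` for the diagnosis.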
+
+`smoke-deploy` does NOT touch `/usr/local/bin/restic-manager-agent`
+on this dev box; if your change requires the live agent here to
+update, run the agent restage block above by hand.

 ## Migrations: prefer column-level ALTERs over table rebuilds

@@ -0,0 +1,69 @@
+# Code of Conduct
+
+restic-manager is a small project run by one person. This Code of
+Conduct sets out the basic expectations for participating in the
+project's issue tracker, pull requests, and any other community
+spaces (chat, mailing lists) we may run in future.
+
+## Expected behaviour
+
+- **Be civil.** Disagreement is fine; rudeness is not. The same
+  comment can usually be made without making it personal.
+- **Assume good faith.** People asking what feels like a basic
+  question may be new to the project. People proposing what feels
+  like a duplicate idea may not have seen the prior discussion.
+  Point them to the right place politely.
+- **Stay on topic.** Issue threads are for the issue. Tangential
+  conversations belong in their own thread.
+- **Acknowledge the project's scope.** restic-manager is
+  intentionally small in scope (see `spec.md` §2). Reasonable
+  feature suggestions may still be declined for fit reasons.
+
+## Unacceptable behaviour
+
+- Harassment, threats, or insults — public or private.
+- Discriminatory comments based on age, body size, disability,
+  ethnicity, gender identity or expression, level of experience,
+  nationality, personal appearance, race, religion, sexual identity
+  or orientation.
+- Sustained disruption — derailing threads, ignoring repeated
+  requests to take a discussion elsewhere, brigading.
+- Publishing other people's private information without permission.
+
+## Reporting
+
+If someone in the project's spaces is behaving in a way that
+breaches this Code of Conduct, contact the maintainer directly
+through the contact details on their Gitea profile, or via the
+private security disclosure path documented in
+[SECURITY.md](./SECURITY.md). Reports stay confidential.
+
+The maintainer will review the report, gather context if needed,
+and respond. Possible outcomes include a private warning, a public
+clarification of expectations, a temporary or permanent ban from
+project spaces, or no action if the report doesn't hold up.
+
+There is no formal appeals process — this is a one-person project,
+not a foundation. If you think a decision was wrong you can say
+so, in writing, to the maintainer; that's it.
+
+## Scope
+
+This Code of Conduct applies to interactions in any space the
+project owns or operates: the Gitea repository (issues, pull
+requests, discussions, wiki), any chat channels we publish, and
+any conferences or events the project is officially represented at.
+
+It does not apply to:
+
+- Forks of the project that aren't being submitted back upstream.
+- Conversations between contributors that don't reference the
+  project.
+- Public criticism of the project itself.
+
+## Acknowledgement
+
+This document borrows shape and language from the
+[Contributor Covenant](https://www.contributor-covenant.org/) v2.1
+but is intentionally shorter and adapted to the project's
+single-maintainer reality.
@@ -1,30 +1,168 @@
-# Contributing
+# Contributing to restic-manager

-Thanks for your interest in contributing to restic-manager.
+Thanks for your interest in restic-manager. This document covers how
+to set up a development environment, the conventions the project
+follows, and how patches make it from your machine into `main`.

-> This is a placeholder. The project is in pre-alpha (Phase 1 / MVP). A
-> full contributor guide will land alongside the Phase 5 OSS-readiness
-> work — see [`tasks.md`](./tasks.md) P5-02. Until then the notes below
-> apply.
+## Project status and scope

-## Before opening a PR
+restic-manager is in pre-1.0. Core functionality (Phases 0–4) is
+landed; OSS-readiness polish is in progress. The top of
+[`tasks.md`](./tasks.md) tracks what's next; [`spec.md`](./spec.md)
+is the canonical design doc and the source of truth for any
+"why is it built this way" question.

-1. Open an issue first for non-trivial changes — the design is still
-   moving (see [`spec.md`](./spec.md)) and unsolicited large PRs may
-   conflict with in-flight work.
-2. `make lint test` should pass.
-3. Match the existing code style — `gofumpt`, `goimports`, no comments
-   that just restate what the code does.
-4. Keep commits focused; one logical change per commit.
+The project is **single-maintainer, hobbyist-scale, and licensed
+under [PolyForm Noncommercial 1.0.0](./LICENSE)**. That has two
+practical implications:

-## Reporting security issues
+1. Big PRs without prior discussion may be declined for fit
+   reasons even when they're correct — opening an issue first lets
+   us check alignment cheaply.
+2. Commercial use is not permitted by the license. Bug reports and
+   patches from operators of personal/community deployments are
+   very welcome.

-Please do **not** open a public issue for security problems. A
-`SECURITY.md` with a private disclosure path will be added in Phase 5
-(P5-05). Until then, contact the repository owner directly via the
-contact details on their gitea profile.
+## Getting started
+
+### Prerequisites
+
+- Go 1.25 or newer (`go.mod` is the source of truth)
+- `make`
+- For the front-end CSS bundle: nothing extra — `make build`
+  downloads a pinned `tailwindcss` standalone binary into `bin/`.
+- For the docs site: nothing extra — `make docs` does the same trick
+  with `mdbook`.
+- For end-to-end tests: Docker + Docker Compose, plus `npx` for
+  Playwright.
+
+### One-time setup
+
+```sh
+git clone https://gitea.dcglab.co.uk/steve/restic-manager.git
+cd restic-manager
+make build          # compiles bin/restic-manager-{server,agent}
+make test           # full unit + integration test sweep
+make lint           # gofumpt + goimports + golangci-lint
+```
+
+### Running locally
+
+For most development, the [smoke environment](./docs/e2e-smoke.md)
+is the path of least resistance:
+
+```sh
+make smoke-restart  # rebuilds, launches as a systemd --user unit
+make smoke-logs     # tail of the server log
+```
+
+Then point a browser at `http://127.0.0.1:8080`. The first run
+prints a one-time bootstrap token to the log; use it to create the
+admin user.
+
+## Code conventions
+
+### Style
+
+- `gofumpt` for formatting; `goimports` for import grouping.
+  Both run via the pre-commit hook in this repo.
+- `golangci-lint` with `.golangci.yml` defaults; CI rejects on lint
+  errors.
+- UK English in identifiers, comments, log messages, and UI strings
+  (the misspell linter is configured for the UK locale — see
+  P3-X5 for the original sweep).
+- Comments explain **why**, not what; avoid restating the code.
+  A surprising invariant or an external constraint is worth
+  writing down. "Adds 1 to x" is not.
+- `slog` for structured logs. Never log secrets — and especially
+  never the merged-creds rest-server URL (see [`CLAUDE.md`](./CLAUDE.md)).
+
+### File and package layout
+
+- `cmd/server` and `cmd/agent` are the two binary entry points.
+- `internal/` holds everything that's not part of the public Go
+  API (which is none of it — restic-manager isn't a library).
+- Per-feature packages live under `internal/server/...` for the
+  control plane and `internal/agent/...` for the agent.
+- `web/templates/` are HTML templates rendered with the standard
+  library; embedded via `web.FS`.
+
+### Tests
+
+- Unit tests live alongside the code as `*_test.go`. Use the
+  in-process sqlite store (`store.Open(":memory:")`) when you need
+  state — there is no test mock layer to maintain.
+- HTTP handlers test through `httptest.NewServer` against the real
+  router; see `internal/server/http/auth_test.go` for the canonical
+  fixture pattern.
+- End-to-end tests live in `e2e/` and run against a Docker Compose
+  stack. See [`docs/e2e.md`](./docs/e2e.md).
+
+### Database migrations
+
+- Migrations are hand-rolled SQL in `internal/store/migrations/`
+  and embedded via `embed.FS`.
+- Prefer column-level `ALTER TABLE` over rebuilds — see
+  [`CLAUDE.md`](./CLAUDE.md) "Migrations" section for the FK-cascade
+  trap that bit migration 0007's first draft.
+
+## Workflow
+
+### Before opening a PR
+
+1. **Open an issue first** for non-trivial changes. The design is
+   still moving; an issue lets us agree on direction cheaply.
+2. Run `make lint test` locally — both must pass.
+3. Match existing code style (see above).
+4. Keep commits focused: one logical change per commit. Imperative
+   subject lines, body explaining why if it isn't obvious.
+5. Don't add `Co-Authored-By` trailers — repo policy. If you used
+   AI assistance in writing the patch, that's fine; we just don't
+   pollute every commit message with attribution boilerplate.
+
+### Pull requests
+
+PRs target `main`. CI runs lint + tests on Linux amd64/arm64 and
+Windows amd64; all three must be green to merge. Squash-merge is
+the default; the PR title becomes the merge-commit subject, so
+keep it short and informative.
+
+The PR template asks for:
+
+- A short description of what changed and why.
+- A test plan (commands run, scenarios verified).
+- Anything reviewers need to know to assess the change (related
+  issue, follow-up work, deferred concerns).
+
+### Reporting bugs
+
+Open an issue with:
+
+- restic-manager version (`restic-manager-server --version`) and
+  agent version.
+- restic version on the affected host.
+- Steps to reproduce.
+- Server and agent logs (sanitise any tokens before pasting).
+
+Security-sensitive bugs go through the [SECURITY.md](./SECURITY.md)
+disclosure path instead — please don't open a public issue for
+them.
+
+### Suggesting features
+
+Open an issue describing the use case (not just the proposed
+solution). The roadmap in `tasks.md` shows where the project is
+heading; if the suggestion fits a future phase we'll wire it in
+there. If it falls outside the project's scope (multi-tenancy, SaaS,
+non-restic backends — see `spec.md` §2 non-goals) we'll say so
+early to save your time.
+
+## Code of conduct
+
+Project participation is governed by [CODE_OF_CONDUCT.md](./CODE_OF_CONDUCT.md).
+The short version: be civil; assume good faith; harassment is not
+tolerated.

 ## License

-By contributing you agree that your contributions are licensed under
-the [PolyForm Noncommercial 1.0.0](./LICENSE) license.
+By contributing you agree that your contributions are licensed
+under the [PolyForm Noncommercial 1.0.0](./LICENSE) license.
@@ -5,9 +5,15 @@ BIN_DIR        := bin
 SERVER_BIN     := $(BIN_DIR)/restic-manager-server
 AGENT_BIN      := $(BIN_DIR)/restic-manager-agent
 VERSION        ?= $(shell git describe --tags --always --dirty 2>/dev/null || echo dev)
-LDFLAGS        := -s -w -X main.version=$(VERSION)
+COMMIT         ?= $(shell git rev-parse HEAD 2>/dev/null || echo none)
+DATE           ?= $(shell date -u +%Y-%m-%dT%H:%M:%SZ)
+VERSION_PKG    := gitea.dcglab.co.uk/steve/restic-manager/internal/version
+LDFLAGS        := -s -w \
+                  -X $(VERSION_PKG).Version=$(VERSION) \
+                  -X $(VERSION_PKG).Commit=$(COMMIT) \
+                  -X $(VERSION_PKG).Date=$(DATE)
 GOFLAGS        := -trimpath
-DOCKER_IMAGE   ?= ghcr.io/dcglab/restic-manager
+DOCKER_IMAGE   ?= gitea.dcglab.co.uk/steve/restic-manager
 DOCKER_TAG     ?= dev

 # Tailwind standalone CLI — single binary, no Node toolchain.
@@ -20,7 +26,29 @@ TAILWIND_URL      := https://github.com/tailwindlabs/tailwindcss/releases/downlo
 TAILWIND_INPUT    := web/styles/input.css
 TAILWIND_OUTPUT   := web/static/css/styles.css

-.PHONY: help build server agent test test-race lint fmt tidy clean run-server run-agent docker release tailwind tailwind-watch setup hooks
+# mdBook for the docs site (P5-01). Single static binary, no
+# Rust toolchain — same pattern as Tailwind.
+#
+# MDBOOK_TRIPLE has to match the upstream release asset name exactly.
+# Apple targets carry no "unknown" vendor field (e.g.
+# aarch64-apple-darwin), and macOS reports Apple Silicon as "arm64"
+# where the Rust triple says "aarch64", so the arch is normalised first.
+# NOTE(review): confirm upstream publishes a *-unknown-linux-gnu asset
+# for every Linux arch you build docs on.
+MDBOOK_VERSION    ?= v0.4.51
+MDBOOK_OS         := $(shell uname -s | tr A-Z a-z)
+MDBOOK_ARCH       := $(patsubst arm64,aarch64,$(shell uname -m))
+MDBOOK_TRIPLE     := $(MDBOOK_ARCH)-$(if $(filter darwin,$(MDBOOK_OS)),apple-darwin,unknown-linux-gnu)
+MDBOOK_BIN        := $(BIN_DIR)/mdbook
+MDBOOK_TARBALL    := mdbook-$(MDBOOK_VERSION)-$(MDBOOK_TRIPLE).tar.gz
+MDBOOK_URL        := https://github.com/rust-lang/mdBook/releases/download/$(MDBOOK_VERSION)/$(MDBOOK_TARBALL)
+DOCS_BOOK_DIR     := docs/book
+DOCS_BOOK_OUT     := $(DOCS_BOOK_DIR)/book
+
+.PHONY: help build server agent test test-race lint fmt tidy clean run-server run-agent docker release tailwind tailwind-watch docs docs-watch setup hooks smoke-restart smoke-stop smoke-status smoke-logs smoke-deploy
+
+# ---- smoke-env tooling -------------------------------------------------
+# The smoke server runs as a transient user-systemd unit so it survives
+# bash-tool boundaries and reboots-of-the-shell. Use `make smoke-restart`
+# any time you've rebuilt the server. `make smoke-deploy` is the full
+# rebuild + restage + restart workflow described in CLAUDE.md.
+SMOKE_UNIT       := restic-manager-smoke
+SMOKE_DATA_DIR   := $(HOME)/smoke/data
+SMOKE_LOG_FILE   := $(HOME)/smoke/server.log
+SMOKE_BASE_URL   := http://127.0.0.1:8080
+SMOKE_LISTEN     := :8080

 help:
 	@grep -E '^[a-zA-Z_-]+:.*?## ' $(MAKEFILE_LIST) | awk 'BEGIN{FS=":.*?## "};{printf "  \033[36m%-14s\033[0m %s\n",$$1,$$2}'
@@ -45,6 +73,18 @@ tailwind-watch: $(TAILWIND_BIN) ## Watch and rebuild on every save
 	@mkdir -p $$(dirname $(TAILWIND_OUTPUT))
 	$(TAILWIND_BIN) -c tailwind.config.js -i $(TAILWIND_INPUT) -o $(TAILWIND_OUTPUT) --watch

+$(MDBOOK_BIN):
+	@mkdir -p $(BIN_DIR)
+	@echo "==> downloading mdbook $(MDBOOK_VERSION) ($(MDBOOK_TRIPLE))"
+	curl -fsSL "$(MDBOOK_URL)" | tar -xz -C $(BIN_DIR) mdbook
+	@chmod +x $@
+
+docs: $(MDBOOK_BIN) ## Build the docs/book/ mdBook site into docs/book/book/
+	$(MDBOOK_BIN) build $(DOCS_BOOK_DIR)
+
+docs-watch: $(MDBOOK_BIN) ## Serve the docs site at http://127.0.0.1:3000 with live reload
+	$(MDBOOK_BIN) serve $(DOCS_BOOK_DIR) -n 127.0.0.1 -p 3000
+
 agent: ## Build the agent binary
 	@mkdir -p $(BIN_DIR)
 	CGO_ENABLED=0 go build $(GOFLAGS) -ldflags "$(LDFLAGS)" -o $(AGENT_BIN) ./cmd/agent
@@ -75,7 +115,7 @@ tidy: ## go mod tidy
 	go mod tidy

 clean: ## Remove build artifacts
-	rm -rf $(BIN_DIR) coverage.out coverage.html $(TAILWIND_OUTPUT)
+	rm -rf $(BIN_DIR) coverage.out coverage.html $(TAILWIND_OUTPUT) $(DOCS_BOOK_OUT)

 run-server: server ## Build and run the server
 	$(SERVER_BIN)
@@ -84,7 +124,53 @@ run-agent: agent ## Build and run the agent
 	$(AGENT_BIN)

 docker: ## Build the server Docker image
-	docker build -f deploy/Dockerfile.server --build-arg VERSION=$(VERSION) -t $(DOCKER_IMAGE):$(DOCKER_TAG) .
+	docker build -f deploy/Dockerfile.server \
+	  --build-arg VERSION=$(VERSION) \
+	  --build-arg COMMIT=$(COMMIT) \
+	  --build-arg DATE=$(DATE) \
+	  -t $(DOCKER_IMAGE):$(DOCKER_TAG) .
+
+# Stops any previous unit, relaunches the freshly built server through
+# systemd-run, then polls /api/version once a second (five attempts)
+# before giving up and dumping the unit status.
+#
+# The binary path is anchored on $(CURDIR), which make keeps in step
+# with `make -C <dir>`; $(PWD) is only whatever the calling environment
+# exported and can point at the wrong directory.
+smoke-restart: server ## (Re)start the smoke server as a transient user-systemd unit
+	@systemctl --user reset-failed $(SMOKE_UNIT) >/dev/null 2>&1 || true
+	@systemctl --user stop $(SMOKE_UNIT) >/dev/null 2>&1 || true
+	@echo "==> launching $(SMOKE_UNIT)"
+	systemd-run --user --unit=$(SMOKE_UNIT) \
+	  --setenv=RM_LISTEN=$(SMOKE_LISTEN) \
+	  --setenv=RM_DATA_DIR=$(SMOKE_DATA_DIR) \
+	  --setenv=RM_BASE_URL=$(SMOKE_BASE_URL) \
+	  --setenv=RM_SECRET_KEY_FILE=$(SMOKE_DATA_DIR)/secret.key \
+	  --setenv=RM_COOKIE_SECURE=false \
+	  --property=StandardOutput=append:$(SMOKE_LOG_FILE) \
+	  --property=StandardError=append:$(SMOKE_LOG_FILE) \
+	  --property=Restart=on-failure \
+	  $(CURDIR)/$(SERVER_BIN)
+	@for i in 1 2 3 4 5; do \
+	  curl -fsS -o /dev/null $(SMOKE_BASE_URL)/api/version 2>/dev/null && \
+	    { echo "==> smoke server up: $$(curl -s $(SMOKE_BASE_URL)/api/version)"; exit 0; }; \
+	  sleep 1; \
+	done; \
+	echo "!! smoke server did not respond on $(SMOKE_BASE_URL) — check $(SMOKE_LOG_FILE)" >&2; \
+	systemctl --user status --no-pager $(SMOKE_UNIT) || true; \
+	exit 1
+
+smoke-stop: ## Stop the smoke server
+	systemctl --user stop $(SMOKE_UNIT) || true
+	@systemctl --user reset-failed $(SMOKE_UNIT) >/dev/null 2>&1 || true
+
+smoke-status: ## Show status of the smoke server
+	@systemctl --user status --no-pager $(SMOKE_UNIT) 2>&1 | head -20 || true
+
+smoke-logs: ## Tail the smoke server log
+	tail -50 $(SMOKE_LOG_FILE)
+
+# NOTE: make runs the prerequisites (build, then smoke-restart) before
+# this recipe, so the server has already been restarted by the time the
+# agent binary and install assets below are copied into
+# $(SMOKE_DATA_DIR).
+smoke-deploy: build smoke-restart ## Rebuild + restage agent into smoke + restart server (full per-CLAUDE.md cycle)
+	@echo "==> restaging agent + install assets into $(SMOKE_DATA_DIR)"
+	cp $(AGENT_BIN) $(SMOKE_DATA_DIR)/agent-binaries/restic-manager-agent-linux-amd64
+	cp deploy/install/install.sh $(SMOKE_DATA_DIR)/install/install.sh
+	cp deploy/install/install.ps1 $(SMOKE_DATA_DIR)/install/install.ps1
+	cp deploy/install/restic-manager-agent.service $(SMOKE_DATA_DIR)/install/restic-manager-agent.service
+	@echo "==> NOTE: this dev box's installed agent at /usr/local/bin/restic-manager-agent is NOT updated by this target."
+	@echo "    Run the agent restage block in CLAUDE.md if your change touches agent code or the unit file."

 release: ## Cross-compile for all supported platforms
 	@mkdir -p $(BIN_DIR)
@@ -1,36 +1,62 @@
 # restic-manager

 Self-hosted, browser-based, single-pane-of-glass for managing
-[restic](https://restic.net) backups across a fleet of Linux and Windows
-endpoints.
+[restic](https://restic.net) backups across a fleet of Linux and
+Windows endpoints.

-> Status: pre-alpha. Phase 0 (project bootstrap) complete; Phase 1 (MVP) in
-> progress. See [`spec.md`](./spec.md) for the design and
-> [`tasks.md`](./tasks.md) for the roadmap.
+> **Status:** pre-1.0, feature-complete for the original use
+> case. Phases 0–4 + 6 are landed (MVP, scheduling, restore,
+> RBAC + OIDC, observability); Phase 5 (OSS readiness — docs site,
+> contributor onboarding, end-to-end CI) is in flight. See
+> [`spec.md`](./spec.md) for the design and [`tasks.md`](./tasks.md)
+> for the live roadmap.

-## What it does (target)
+## What it does

- Central visibility into backup state for every endpoint
- Trigger any restic operation remotely (`backup`, `forget`, `prune`,
-  `check`, `unlock`, `snapshots`, `stats`, `diff`, `restore`)
- Manage per-host backup schedules from the UI
- Live job progress streamed back to the UI
- Restore wizard (browse snapshots, pick paths, restore to original or
-  alternate host)
- Repo health surfacing (size, dedup ratio, last check, lock state)
- Alerting on failure or staleness
- Cross-platform agent (Linux + Windows)
- Ransomware-resistant repo access via append-only credentials
+- Central visibility into backup state for every endpoint.
+- Trigger any restic operation remotely (`backup`, `forget`,
+  `prune`, `check`, `unlock`, `snapshots`, `stats`, `diff`,
+  `restore`).
+- Per-host schedules with named source groups + retention.
+- Live job log streamed to the browser; downloadable as
+  text/NDJSON afterwards.
+- Restore wizard: browse a snapshot's tree, pick paths, restore
+  in-place or to a new directory.
+- Repo health surfacing (size, raw size, last check, lock state),
+  plus a 30/90-day repo-size trend.
+- Alerting over webhook, ntfy, or SMTP.
+- Cross-platform agent (Linux systemd + Windows SCM).
+- Append-only-friendly: separate admin credential for prune.
+- Optional Prometheus `/metrics` endpoint + sample Grafana
+  dashboard.
+- Optional OIDC SSO (Authelia, Authentik, etc.).

-## Architecture (one-line summary)
+## Screenshots

-A small Go control-plane on the Proxmox host, lightweight Go agents on each
-endpoint that hold an outbound WebSocket to the control-plane, and a
-`restic/rest-server` on Unraid that holds the actual backup data. The
-control-plane never touches backup bytes.
+| Sign in | Empty dashboard | Add host |
+|:-------:|:---------------:|:--------:|
+| ![Sign in](docs/screenshots/01-login.png) | ![Dashboard, fresh](docs/screenshots/02-dashboard-empty.png) | ![Add host](docs/screenshots/03-add-host.png) |
+
+| Alerts | Settings | Audit log |
+|:------:|:--------:|:---------:|
+| ![Alerts](docs/screenshots/04-alerts.png) | ![Settings](docs/screenshots/05-settings.png) | ![Audit log](docs/screenshots/06-audit.png) |
+
+(Screenshots from a fresh smoke install with no hosts. A populated
+fleet view and the live-log + restore wizard surfaces are part of
+the docs site under [`docs/book/`](./docs/book) — `make docs` to
+render locally.)
+
+## Architecture (one-line)
+
+A small Go control-plane in Docker, lightweight Go agents on each
+endpoint holding an outbound WebSocket to the control-plane, and
+a restic repository (rest-server, S3, B2, SFTP — anything restic
+speaks) that holds the actual backup data. **The control-plane
+never touches backup bytes.**

 Full architecture diagram and component breakdown:
-[`spec.md` §3](./spec.md).
+[`spec.md` §3](./spec.md), or the rendered version in the
+[docs site](./docs/book/src/concepts/architecture.md).

 ## Repository layout

@@ -38,31 +64,63 @@ Full architecture diagram and component breakdown:
 cmd/server/        control-plane binary
 cmd/agent/         endpoint agent binary
 internal/api       shared API types (REST + WS envelopes)
-internal/server/   HTTP, WS, UI handlers
+internal/server/   HTTP, WS, UI handlers, alert engine
 internal/agent/    service integration, restic runner, local scheduler
 internal/restic    restic CLI wrapper
 internal/store     SQLite persistence
-internal/crypto    secret encryption
+internal/crypto    secret encryption (AEAD)
 internal/auth      passwords, sessions, agent tokens
 web/               server-rendered templates + static assets
-deploy/            Dockerfile, docker-compose.yml, install scripts
-design/            UI wireframes (Phase 0 design pass)
+deploy/            Dockerfile, docker-compose.yml, install scripts, Grafana dashboard
+docs/              prose docs + the mdBook site under docs/book
+e2e/               compose stack + Playwright tests for end-to-end CI
 ```

+## Quickstart
+
+The reference deployment is a single Docker container fronted by
+your existing reverse proxy. See the [installation guide](docs/book/src/getting-started/install.md)
+for the full path; the very short version:
+
+```sh
+export RM_VERSION=v0.9.0    # pin a real tag
+export RM_BASE_URL=https://restic.example.com
+export RM_TRUSTED_PROXY=10.0.0.0/8
+docker compose -f deploy/docker-compose.yml up -d
+```
+
+The server prints a one-time bootstrap token to the log on first
+start. POST it to `/api/bootstrap` (or open `/bootstrap` in a
+browser) to create the admin user.
+
 ## Local development

-Requires Go 1.25+ (built and tested on 1.26). The floor is set by
-`modernc.org/sqlite` v1.50.
+Requires Go 1.25+. The floor is set by `modernc.org/sqlite` v1.50.

 ```sh
 make build           # builds cmd/server and cmd/agent into ./bin
 make test            # runs go test ./...
 make lint            # runs golangci-lint
-make run-server      # runs the server (dev defaults)
+make smoke-restart   # systemd --user smoke server (see CLAUDE.md)
+make docs            # renders the mdBook site to docs/book/book/
 ```

+End-to-end test harness against a Docker Compose stack with a
+sibling Linux agent: see [`docs/e2e.md`](docs/e2e.md). Runs in CI
+on every PR.
+
+## Documentation
+
+- **Concepts and operator guides**: [docs site](docs/book/src/intro.md),
+  rendered with `make docs`.
+- **Reverse-proxy setup**: [docs/reverse-proxy.md](docs/reverse-proxy.md).
+- **Prometheus + Grafana**: [docs/prometheus.md](docs/prometheus.md).
+- **End-to-end test harness**: [docs/e2e.md](docs/e2e.md).
+- **Security policy**: [SECURITY.md](SECURITY.md).
+- **Contributing**: [CONTRIBUTING.md](CONTRIBUTING.md).
+
 ## License

-PolyForm Noncommercial 1.0.0 — see [`LICENSE`](./LICENSE). Free for personal,
-hobby, research, educational, governmental, and other noncommercial use.
-Commercial use requires a separate license.
+[PolyForm Noncommercial 1.0.0](./LICENSE). Free for personal,
+hobby, research, educational, governmental, and other noncommercial
+use. Commercial use requires a separate license.
@@ -0,0 +1,137 @@
+# Security policy
+
+restic-manager handles credentials that grant access to backup
+repositories — losing them means an attacker can read or destroy a
+fleet's backups. We take security reports seriously even at this
+project's small scale.
+
+## Supported versions
+
+Pre-1.0, only `main` HEAD and the latest tagged release are
+supported. Backporting fixes to older tags is not currently offered.
+
+| Version            | Supported      |
+|--------------------|----------------|
+| `main` HEAD        | Yes            |
+| Latest released tag| Yes            |
+| Anything older     | No             |
+
+## Reporting a vulnerability
+
+**Please don't open a public issue for security problems.**
+
+Instead, use one of these private channels:
+
+1. **Gitea private message** to the repository owner. The
+   instance is at <https://gitea.dcglab.co.uk> and the owner's
+   profile (`steve`) has direct-message contact set up.
+2. **Email** to the address on the maintainer's Gitea profile.
+   Use a subject like `[SECURITY] restic-manager: <one-line summary>`
+   so it doesn't get lost. PGP optional — if you want to encrypt,
+   ask for a key first.
+
+If you don't get an acknowledgement within **3 working days**,
+please escalate through the other channel — solo maintainers do
+miss things, and the goal here is to fix the problem, not to
+preserve protocol.
+
+### What to include
+
+- A description of the issue and the impact (what does an attacker
+  gain? confidentiality, integrity, availability?).
+- Affected component (server, agent, install script, docs).
+- Affected version (`restic-manager-server --version`).
+- Reproduction steps if you have them. A working PoC is welcome
+  but not required — a credible threat model is enough.
+- Whether you intend to publish a writeup, and any timing
+  preferences.
+
+### What we'll do
+
+1. Acknowledge receipt within 3 working days.
+2. Confirm or refute the issue, and agree a rough severity (CVSS
+   or just "this is bad / this isn't"). Asking clarifying
+   questions is normal at this stage — please don't read it as
+   foot-dragging.
+3. Develop a fix on a private branch, test it, and prepare a
+   release.
+4. Coordinate disclosure timing with you. The default is **30
+   days from confirmed report to public disclosure**, with a
+   patched release published before the disclosure date. Faster
+   if a workable PoC is already circulating; slower only by
+   mutual agreement.
+5. Credit the reporter in the release notes (or omit the credit
+   if you'd rather stay anonymous — your choice).
+
+## Scope
+
+In scope:
+
+- The server binary (`cmd/server`) and any HTTP, WebSocket, or CLI
+  surface it exposes.
+- The agent binary (`cmd/agent`) and the way it consumes commands
+  from the server.
+- The install scripts (`deploy/install/install.sh`, `install.ps1`)
+  and the systemd unit shipped with them.
+- The docker-compose reference deployment and the docker image we
+  publish.
+- Any cryptographic primitive choice or implementation detail
+  (AEAD, token hashing, session handling, OIDC handshake).
+- Documentation that, if followed, leads operators into an
+  insecure configuration.
+
+Out of scope (not because they aren't real problems, just not ones
+this report channel can act on):
+
+- Vulnerabilities in restic itself — report those upstream at
+  <https://github.com/restic/restic>.
+- Vulnerabilities in third-party dependencies that haven't yet been
+  patched upstream — report upstream first.
+- Issues that require pre-authenticated admin access on the control
+  plane (admins can already do everything; that's not a privilege
+  escalation, that's the design).
+- DoS via resource exhaustion on a deployment without the
+  recommended reverse proxy / rate limiting in front (see
+  `docs/reverse-proxy.md`).
+- Social-engineering scenarios that don't have a technical hook
+  into the project's own surfaces.
+
+## Threat model summary
+
+For context (longer version in [`spec.md`](./spec.md) §11):
+
+- The server is **HTTP-only**; TLS termination, ACME, HSTS, and
+  edge rate-limiting are the reverse proxy's job.
+- Credentials are encrypted at rest with an AEAD key loaded from
+  `RM_SECRET_KEY_FILE`. The same key encrypts agent secrets that
+  travel to the agent over the WS channel.
+- Agents authenticate with bearer tokens issued at enrolment and
+  hashed at rest. Compromise of the server DB does **not** leak
+  bearer tokens in plaintext, but does leak the hashes (which is
+  enough to log in *as* the agent until the operator revokes —
+  see [NS-01 / NS-02](./tasks.md) for the revoke + regenerate
+  flows).
+- The control plane intentionally **never touches backup bytes** —
+  the agent runs `restic` directly against the repo. A
+  compromised control plane can dispatch new jobs but cannot
+  exfiltrate snapshot contents in-band.
+- Append-only credentials are first-class. Forget/prune jobs use a
+  separate, admin-marked credential that the server only pushes
+  for the duration of a maintenance dispatch.
+
+## Hardening checklist for operators
+
+- Run behind a TLS-terminating reverse proxy (Caddy/nginx/Traefik).
+- Set `RM_TRUSTED_PROXY` to the proxy's CIDR so request IPs aren't
+  spoofable.
+- Back up `RM_SECRET_KEY_FILE` separately from the database.
+  Without it the encrypted creds are unrecoverable.
+- Use append-only credentials for the everyday backup path; only
+  the optional admin credential should have write/forget/prune
+  power.
+- Disable users (don't delete) when staff change roles — bearer
+  tokens stay valid until rotated.
+- Watch the alert and audit-log views during enrolment of new
+  hosts.
+
+Thanks for helping keep restic-manager users safe.
@@ -1,8 +0,0 @@
-# The ask!
-
-I have numerous servers deployed out in a lab, mainly Linux but some Windows
-All have restic installed on them
-I need to build a browser based management service that allows me to have a central single-plane-of-glass to monitor and manage all teh endpoints
-All endpoints will be enabled for SSH (unless other methods are better?)
-
-Plan out how we would go about this please?
@@ -0,0 +1,262 @@
+// announce.go — agent-side announce-and-approve enrolment (P2-18c).
+//
+// Run path: when the agent has no AgentToken set but RM_SERVER is
+// configured (and no -enroll-token was supplied), main() switches
+// into announce mode:
+//  1. Load (or mint+persist) an Ed25519 keypair in agent.yaml.
+//  2. POST {hostname, os, arch, agent_version, restic_version,
+//     public_key} to /api/agents/announce.
+//  3. Print the fingerprint to stderr in a copy-friendly banner so
+//     the operator can compare it against the dashboard.
+//  4. Open /ws/agent/pending?pending_id=…, sign the nonce with our
+//     private key, wait for an `enrolled` message.
+//  5. On enrolled: persist the bearer + repo creds, return; main()
+//     then drops into the normal WS run loop with the new bearer.
+//  6. On reject: server closes the socket with code 4001; we exit
+//     with a clear message.
+package main
+
+import (
+	"context"
+	"crypto/ed25519"
+	"crypto/rand"
+	"encoding/base64"
+	"encoding/json"
+	"errors"
+	"fmt"
+	stdhttp "net/http"
+	"os"
+	"strings"
+	"time"
+
+	"github.com/coder/websocket"
+
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/agent/config"
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/agent/secrets"
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/agent/sysinfo"
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
+)
+
+// announceRequest mirrors the server's announceRequest. Duplicated
+// here so cmd/agent stays decoupled from the http package.
+//
+// It is the JSON body doAnnounce POSTs to /api/agents/announce.
+type announceRequest struct {
+	// Hostname, OS, Arch and ResticVersion are copied from the
+	// sysinfo snapshot collected at announce time.
+	Hostname      string `json:"hostname"`
+	OS            string `json:"os"`
+	Arch          string `json:"arch"`
+	// AgentVersion is the agentVersion argument passed to doAnnounce.
+	AgentVersion  string `json:"agent_version"`
+	ResticVersion string `json:"restic_version"`
+	// PublicKey is the agent's Ed25519 public key, standard base64.
+	PublicKey     string `json:"public_key"`
+}
+
+// announceResponse is the JSON body doAnnounce decodes from a 200
+// reply to the announce POST.
+type announceResponse struct {
+	// PendingID is appended as the pending_id query parameter when
+	// dialling /ws/agent/pending, and is shown in the banner.
+	PendingID         string `json:"pending_id"`
+	// NOTE(review): the banner prints the fingerprint derived locally
+	// from the public key, not this field — confirm whether the two
+	// are meant to be compared.
+	Fingerprint       string `json:"fingerprint"`
+	// HostnameCollision, when true, adds a duplicate-hostname warning
+	// to the banner.
+	HostnameCollision bool   `json:"hostname_collision"`
+}
+
+// pendingNonceMessage is the first frame the agent reads from the
+// pending websocket after dialling.
+type pendingNonceMessage struct {
+	// Type is decoded but not inspected before the nonce is signed.
+	Type  string `json:"type"`
+	// Nonce is standard base64; the decoded bytes are what gets signed.
+	Nonce string `json:"nonce"`
+}
+
+// pendingSignedMessage is the agent's reply to the nonce frame, sent
+// as a websocket text message.
+type pendingSignedMessage struct {
+	// Type is set to "signed_nonce" by doAnnounce.
+	Type      string `json:"type"`
+	// Signature is the Ed25519 signature over the decoded nonce,
+	// standard base64.
+	Signature string `json:"signature"`
+}
+
+// pendingEnrolledMessage is the frame doAnnounce blocks on after
+// sending the signed nonce. It is rejected unless Type is "enrolled"
+// and Bearer is non-empty.
+type pendingEnrolledMessage struct {
+	Type   string `json:"type"`
+	// HostID and Bearer are the values doAnnounce goes on to persist.
+	HostID string `json:"host_id"`
+	Bearer string `json:"bearer"`
+}
+
+// doAnnounce runs the full announce → wait-for-accept flow. On
+// success, persists the bearer + host_id into cfg + writes secrets
+// for the repo creds the admin supplied at accept time. Returns
+// only after the bearer has landed (or on hard error / reject).
+func doAnnounce(serverURL string, cfg *config.Config, agentVersion string) error {
+	ctx, cancel := context.WithTimeout(context.Background(), 24*time.Hour)
+	defer cancel()
+
+	// Ensure we have a keypair.
+	priv, pub, err := loadOrMintAnnounceKey(cfg)
+	if err != nil {
+		return fmt.Errorf("announce: keypair: %w", err)
+	}
+	fingerprint := store.FingerprintForKey(pub)
+
+	snap, err := sysinfo.Collect(ctx, cfg.ResticPath)
+	if err != nil {
+		return fmt.Errorf("announce: sysinfo: %w", err)
+	}
+
+	// POST /api/agents/announce.
+	body, _ := json.Marshal(announceRequest{
+		Hostname: snap.Hostname, OS: string(snap.OS), Arch: string(snap.Arch),
+		AgentVersion: agentVersion, ResticVersion: snap.ResticVersion,
+		PublicKey: base64.StdEncoding.EncodeToString(pub),
+	})
+	req, _ := stdhttp.NewRequestWithContext(ctx, "POST",
+		strings.TrimRight(serverURL, "/")+"/api/agents/announce",
+		strings.NewReader(string(body)))
+	req.Header.Set("Content-Type", "application/json")
+	res, err := stdhttp.DefaultClient.Do(req)
+	if err != nil {
+		return fmt.Errorf("announce: POST: %w", err)
+	}
+	rawBody := readAllShort(res)
+	_ = res.Body.Close()
+	if res.StatusCode != stdhttp.StatusOK {
+		return fmt.Errorf("announce: server returned %d: %s", res.StatusCode, rawBody)
+	}
+	var ar announceResponse
+	if err := json.Unmarshal(rawBody, &ar); err != nil {
+		return fmt.Errorf("announce: parse response: %w", err)
+	}
+
+	// Print the fingerprint banner.
+	fmt.Fprintln(os.Stderr, strings.Repeat("=", 64))
+	fmt.Fprintln(os.Stderr, "  Restic-manager: announce-and-approve enrolment")
+	fmt.Fprintln(os.Stderr, "")
+	fmt.Fprintln(os.Stderr, "  Hostname    : "+snap.Hostname)
+	fmt.Fprintln(os.Stderr, "  Server      : "+serverURL)
+	fmt.Fprintln(os.Stderr, "  Pending ID  : "+ar.PendingID)
+	fmt.Fprintln(os.Stderr, "  Fingerprint : "+fingerprint)
+	if ar.HostnameCollision {
+		fmt.Fprintln(os.Stderr, "")
+		fmt.Fprintln(os.Stderr, "  WARNING: another pending host already uses this hostname.")
+		fmt.Fprintln(os.Stderr, "  Confirm the fingerprint above matches what you see in the UI.")
+	}
+	fmt.Fprintln(os.Stderr, "")
+	fmt.Fprintln(os.Stderr, "  Compare the fingerprint with the one in the UI before accepting.")
+	fmt.Fprintln(os.Stderr, "  Waiting for an admin to accept (1 hour timeout)…")
+	fmt.Fprintln(os.Stderr, strings.Repeat("=", 64))
+
+	// Open /ws/agent/pending and run the nonce-sign handshake.
+	wsURL := wsURLFromHTTP(serverURL) + "/ws/agent/pending?pending_id=" + ar.PendingID
+	dialCtx, dialCancel := context.WithTimeout(ctx, 30*time.Second)
+	c, dialRes, err := websocket.Dial(dialCtx, wsURL, nil)
+	dialCancel()
+	if err != nil {
+		return fmt.Errorf("announce: dial pending ws: %w", err)
+	}
+	if dialRes != nil && dialRes.Body != nil {
+		_ = dialRes.Body.Close()
+	}
+	defer func() { _ = c.CloseNow() }()
+
+	// Read nonce.
+	rctx, rcancel := context.WithTimeout(ctx, 30*time.Second)
+	_, raw, err := c.Read(rctx)
+	rcancel()
+	if err != nil {
+		return fmt.Errorf("announce: read nonce: %w", err)
+	}
+	var nm pendingNonceMessage
+	if err := json.Unmarshal(raw, &nm); err != nil {
+		return fmt.Errorf("announce: parse nonce: %w", err)
+	}
+	nonce, err := base64.StdEncoding.DecodeString(nm.Nonce)
+	if err != nil {
+		return fmt.Errorf("announce: decode nonce: %w", err)
+	}
+	sig := ed25519.Sign(priv, nonce)
+	reply, _ := json.Marshal(pendingSignedMessage{
+		Type: "signed_nonce", Signature: base64.StdEncoding.EncodeToString(sig),
+	})
+	wctx, wcancel := context.WithTimeout(ctx, 10*time.Second)
+	if err := c.Write(wctx, websocket.MessageText, reply); err != nil {
+		wcancel()
+		return fmt.Errorf("announce: write signed nonce: %w", err)
+	}
+	wcancel()
+
+	// Block until enrolled (or reject / disconnect).
+	rctx2, rcancel2 := context.WithTimeout(ctx, 1*time.Hour)
+	defer rcancel2()
+	_, raw2, err := c.Read(rctx2)
+	if err != nil {
+		// CloseError with our reject code 4001 = admin rejected.
+		var ce websocket.CloseError
+		if errors.As(err, &ce) && ce.Code == 4001 {
+			return errors.New("announce: rejected by admin")
+		}
+		return fmt.Errorf("announce: wait for enrolled: %w", err)
+	}
+	var em pendingEnrolledMessage
+	if err := json.Unmarshal(raw2, &em); err != nil {
+		return fmt.Errorf("announce: parse enrolled: %w", err)
+	}
+	if em.Type != "enrolled" || em.Bearer == "" {
+		return fmt.Errorf("announce: bad enrolled payload: %s", raw2)
+	}
+
+	// Persist the bearer + host_id.
+	cfg.ServerURL = serverURL
+	cfg.HostID = em.HostID
+	cfg.AgentToken = em.Bearer
+	if err := cfg.EnsureSecretsKey(); err != nil {
+		return fmt.Errorf("announce: mint secrets key: %w", err)
+	}
+	// Note: repo creds aren't pushed in the enrolled message — the
+	// server pushes them via `config.update` on first WS hello. The
+	// secrets store will start empty and fill in then.
+	if err := cfg.Save(); err != nil {
+		return fmt.Errorf("announce: save config: %w", err)
+	}
+	// Touch the secrets store so it exists with the right perms.
+	keyBytes, _ := cfg.SecretsKeyBytes()
+	if _, err := secrets.New(cfg.ResolvedSecretsPath(), keyBytes); err != nil {
+		return fmt.Errorf("announce: open secrets store: %w", err)
+	}
+	fmt.Fprintln(os.Stderr, "Accepted. Bearer persisted; reconnecting via the standard WS.")
+	return nil
+}
+
+// loadOrMintAnnounceKey yields the ed25519 keypair used for the
+// announce handshake, in (private, public) order.
+//
+// When cfg.AnnounceKey is empty a fresh keypair is generated, the
+// base64-encoded private key is stored on cfg, and cfg.Save is called
+// before returning. Otherwise the stored value is base64-decoded,
+// length-checked against ed25519.PrivateKeySize, and the public key is
+// derived from the private key.
+func loadOrMintAnnounceKey(cfg *config.Config) (ed25519.PrivateKey, ed25519.PublicKey, error) {
+	if cfg.AnnounceKey == "" {
+		// Nothing persisted yet: mint a keypair and write it to the config.
+		pub, priv, err := ed25519.GenerateKey(rand.Reader)
+		if err != nil {
+			return nil, nil, fmt.Errorf("generate keypair: %w", err)
+		}
+		cfg.AnnounceKey = base64.StdEncoding.EncodeToString(priv)
+		if err := cfg.Save(); err != nil {
+			return nil, nil, fmt.Errorf("persist AnnounceKey: %w", err)
+		}
+		return priv, pub, nil
+	}
+
+	decoded, err := base64.StdEncoding.DecodeString(cfg.AnnounceKey)
+	if err != nil {
+		return nil, nil, fmt.Errorf("decode AnnounceKey: %w", err)
+	}
+	if got := len(decoded); got != ed25519.PrivateKeySize {
+		return nil, nil, fmt.Errorf("AnnounceKey must be %d bytes, got %d",
+			ed25519.PrivateKeySize, got)
+	}
+	priv := ed25519.PrivateKey(decoded)
+	return priv, priv.Public().(ed25519.PublicKey), nil
+}
+
+// wsURLFromHTTP maps an http:// or https:// URL onto its WebSocket
+// counterpart (ws:// or wss://). Matching is a case-sensitive prefix
+// check; anything without one of those two prefixes is returned as-is.
+func wsURLFromHTTP(httpURL string) string {
+	swaps := [...]struct{ from, to string }{
+		{from: "https://", to: "wss://"},
+		{from: "http://", to: "ws://"},
+	}
+	for _, s := range swaps {
+		if strings.HasPrefix(httpURL, s.from) {
+			return s.to + httpURL[len(s.from):]
+		}
+	}
+	return httpURL
+}
+
+// readAllShort reads at most 64KB of the response body and returns
+// what it got. The announce response is small; the cap guards against
+// pathological server replies.
+//
+// A single Read call is allowed to return fewer bytes than are
+// available, so we keep reading until the buffer is full or the body
+// reports EOF / an error. Read errors are not surfaced: the caller
+// gets whatever bytes arrived before the failure (best-effort, as the
+// result is only used for JSON parsing and error messages).
+func readAllShort(res *stdhttp.Response) []byte {
+	buf := make([]byte, 64*1024)
+	filled := 0
+	for filled < len(buf) {
+		n, err := res.Body.Read(buf[filled:])
+		filled += n
+		if err != nil {
+			// io.EOF or a transport error: either way, stop here.
+			break
+		}
+	}
+	return buf[:filled]
+}
@@ -9,6 +9,7 @@ import (
 	"os"
 	"os/signal"
 	"strconv"
+	"sync"
 	"syscall"
 	"time"

@@ -16,14 +17,14 @@ import (
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/agent/runner"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/agent/scheduler"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/agent/secrets"
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/agent/service"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/agent/sysinfo"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/agent/wsclient"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/restic"
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
 )

-var version = "dev"
-
 func main() {
 	if err := run(); err != nil {
 		slog.Error("agent fatal", "err", err)
@@ -32,6 +33,27 @@ func main() {
 }

 func run() error {
+	// Optional first positional verb for SCM control on Windows.
+	// `restic-manager-agent install|uninstall|start|stop` route into
+	// the service package; everything else falls through to the
+	// flag-driven default (which is what systemd / interactive runs
+	// hit). On non-Windows builds these verbs return a clear error.
+	if len(os.Args) > 1 {
+		switch os.Args[1] {
+		case "install":
+			return service.Install()
+		case "uninstall":
+			return service.Uninstall()
+		case "start":
+			return service.Start()
+		case "stop":
+			return service.Stop()
+		case "run":
+			// Strip the verb so flag.Parse sees the rest unchanged.
+			os.Args = append([]string{os.Args[0]}, os.Args[2:]...)
+		}
+	}
+
 	configPath := flag.String("config", config.DefaultPath(), "path to agent.yaml")
 	enrollServer := flag.String("enroll-server", "", "server URL (used with -enroll-token to perform first-run enrollment)")
 	enrollToken := flag.String("enroll-token", "", "one-time enrollment token (operator copies this from the UI)")
@@ -39,7 +61,7 @@ func run() error {
 	flag.Parse()

 	if *showVersion {
-		fmt.Println("restic-manager-agent", version)
+		fmt.Printf("restic-manager-agent %s (commit %s, built %s)\n", version.Version, version.Commit, version.Date)
 		return nil
 	}

@@ -55,11 +77,20 @@ func run() error {
 		if *enrollServer == "" {
 			return errors.New("enrollment: -enroll-server is required with -enroll-token")
 		}
-		return doEnroll(*enrollServer, *enrollToken, cfg, version)
+		return doEnroll(*enrollServer, *enrollToken, cfg, version.Version)
+	}
+
+	// Announce-and-approve: -enroll-server set, no token, agent not
+	// yet enrolled. Run the announce flow inline; on success the cfg
+	// has the bearer + host_id and we drop into the normal run loop.
+	if !cfg.Enrolled() && *enrollServer != "" {
+		if err := doAnnounce(*enrollServer, cfg, version.Version); err != nil {
+			return fmt.Errorf("announce: %w", err)
+		}
 	}

 	if !cfg.Enrolled() {
-		return fmt.Errorf("agent is not enrolled; run with -enroll-server and -enroll-token first (config %q)", *configPath)
+		return fmt.Errorf("agent is not enrolled; run with -enroll-server (and either -enroll-token or wait for admin to accept the announce) first (config %q)", *configPath)
 	}

 	ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
@@ -70,7 +101,7 @@ func run() error {
 		return fmt.Errorf("sysinfo: %w", err)
 	}
 	slog.Info("agent starting",
-		"version", version,
+		"version", version.Version,
 		"host_id", cfg.HostID,
 		"server", cfg.ServerURL,
 		"restic_version", snap.ResticVersion,
@@ -79,6 +110,12 @@ func run() error {

 	resticBin, _ := restic.Locate(cfg.ResticPath) // empty is fine; commands fail with a clear error later

+	// Probe the actual restic binary for restore-flag support. We used
+	// to gate --no-ownership on a SemVer comparison (added in 0.17),
+	// but a restic 0.18.1 build was observed in the wild that still
+	// rejects the flag. The help text is the only reliable signal.
+	resticSupportsNoOwnership := restic.SupportsRestoreNoOwnership(ctx, resticBin)
+
 	// Open the secrets store. If the agent is enrolled but has no
 	// secrets key yet (legacy YAML), mint one and migrate any
 	// plaintext repo fields into the encrypted blob.
@@ -94,7 +131,7 @@ func run() error {
 		CertPinSHA256: cfg.CertPinSHA256,
 		HelloPayload: api.HelloPayload{
 			ProtocolVersion: snap.ProtocolVersion,
-			AgentVersion:    version,
+			AgentVersion:    version.Version,
 			ResticVersion:   snap.ResticVersion,
 			Hostname:        snap.Hostname,
 			OS:              snap.OS,
@@ -103,9 +140,12 @@ func run() error {
 	}

 	d := &dispatcher{
-		resticBin: resticBin,
-		secrets:   sec,
-		scheduler: scheduler.New(),
+		resticBin:                 resticBin,
+		resticVer:                 snap.ResticVersion,
+		resticSupportsNoOwnership: resticSupportsNoOwnership,
+		serverURL:                 cfg.ServerURL,
+		secrets:                   sec,
+		scheduler:                 scheduler.New(),
 	}
 	if err := wsclient.Run(ctx, wsCfg, d.handle); err != nil {
 		return fmt.Errorf("ws run: %w", err)
@@ -167,9 +207,59 @@ func openSecretsStore(cfg *config.Config) (*secrets.Store, error) {
 // secrets store on each job — config.update writes through to disk,
 // so a job dispatched in the same session sees the latest values.
 type dispatcher struct {
-	resticBin string
-	secrets   *secrets.Store
-	scheduler *scheduler.Scheduler
+	resticBin                 string
+	resticVer                 string // e.g. "0.17.1"; empty if restic isn't installed yet
+	resticSupportsNoOwnership bool   // captured at startup from `restic restore --help`
+	serverURL                 string // base URL of the server (used by the self-update fetch)
+	secrets                   *secrets.Store
+	scheduler                 *scheduler.Scheduler
+
+	// Bandwidth caps in KB/s pushed via config.update. Mutated under
+	// bwMu by the config.update handler; read by runJob when building
+	// the runner. <=0 means "no cap" (do not pass --limit-* to restic).
+	// Per-job overrides on CommandRunPayload take precedence.
+	bwMu       sync.Mutex
+	bwUpKBps   int
+	bwDownKBps int
+
+	// Per-running-job cancellation handles. Populated when runJob
+	// spawns the goroutine, removed when it returns. Looked up by
+	// the command.cancel handler (server → agent) to abort an
+	// in-flight restic invocation.
+	cancelMu sync.Mutex
+	cancels  map[string]context.CancelFunc
+}
+
+// trackJob records cancel as the cancellation handle for jobID and
+// hands back a function that removes that entry again. The cancels map
+// is created lazily on first use. The returned function is meant to be
+// deferred by the job goroutine so the entry is dropped however the
+// job ends.
+func (d *dispatcher) trackJob(jobID string, cancel context.CancelFunc) func() {
+	d.cancelMu.Lock()
+	defer d.cancelMu.Unlock()
+
+	if d.cancels == nil {
+		d.cancels = make(map[string]context.CancelFunc)
+	}
+	d.cancels[jobID] = cancel
+
+	untrack := func() {
+		d.cancelMu.Lock()
+		defer d.cancelMu.Unlock()
+		delete(d.cancels, jobID)
+	}
+	return untrack
+}
+
+// cancelJob looks up the cancel func registered for jobID and, when
+// one exists, invokes it. The return value reports whether the job was
+// known to this dispatcher. The runner is expected to translate the
+// resulting context.Canceled into a JobCancelled status in its
+// job.finished envelope (see runner.sendFinished).
+func (d *dispatcher) cancelJob(jobID string) bool {
+	d.cancelMu.Lock()
+	fire, known := d.cancels[jobID]
+	d.cancelMu.Unlock()
+
+	// The cancel func is called after the lock is released.
+	if known {
+		fire()
+	}
+	return known
 }

 func (d *dispatcher) handle(ctx context.Context, env api.Envelope, tx wsclient.Sender) error {
@@ -182,8 +272,29 @@ func (d *dispatcher) handle(ctx context.Context, env api.Envelope, tx wsclient.S
 		return d.runJob(ctx, p, tx)

 	case api.MsgCommandCancel:
-		// TODO(P2): cancellation requires keeping a job→cancelFunc map.
-		slog.Info("ws agent: command.cancel received (cancellation lands in P2)", "id", env.ID)
+		var p api.CommandCancelPayload
+		if err := env.UnmarshalPayload(&p); err != nil {
+			return fmt.Errorf("command.cancel: %w", err)
+		}
+		if d.cancelJob(p.JobID) {
+			slog.Info("ws agent: command.cancel applied", "job_id", p.JobID)
+		} else {
+			// Job already finished or was never seen on this agent.
+			// Not an error — operator may have raced cancel against
+			// natural completion. Server-side state is authoritative.
+			slog.Info("ws agent: command.cancel for unknown job (already finished?)", "job_id", p.JobID)
+		}
+
+	case api.MsgTreeList:
+		// Synchronous RPC for the restore wizard's tree browser. The
+		// server has serialised access; we just run restic ls and reply
+		// with the same envelope ID. Run in a goroutine so the WS read
+		// loop keeps draining.
+		var p api.TreeListRequestPayload
+		if err := env.UnmarshalPayload(&p); err != nil {
+			return fmt.Errorf("tree.list: %w", err)
+		}
+		go d.handleTreeList(ctx, env.ID, p, tx)

 	case api.MsgScheduleSet:
 		var p api.ScheduleSetPayload
@@ -263,10 +374,30 @@ func (d *dispatcher) handle(ctx context.Context, env api.Envelope, tx wsclient.S
 			slog.Warn("ws agent: unknown config.update slot, ignoring", "slot", p.Slot)
 		}

-	case api.MsgAgentUpdateAvail:
-		var p api.AgentUpdateAvailablePayload
-		_ = env.UnmarshalPayload(&p)
-		slog.Info("ws agent: update available", "version", p.LatestVersion, "url", p.PackageURL)
+		// Bandwidth caps ride independently of the slot — they're host-
+		// wide and apply to every restic invocation regardless of which
+		// credentials slot the job uses. nil pointer = no change in this
+		// push; non-nil = set to that value (≤0 clears the cap).
+		if p.BandwidthUpKBps != nil || p.BandwidthDownKBps != nil {
+			d.bwMu.Lock()
+			if p.BandwidthUpKBps != nil {
+				d.bwUpKBps = *p.BandwidthUpKBps
+			}
+			if p.BandwidthDownKBps != nil {
+				d.bwDownKBps = *p.BandwidthDownKBps
+			}
+			up, down := d.bwUpKBps, d.bwDownKBps
+			d.bwMu.Unlock()
+			slog.Info("ws agent: bandwidth caps updated",
+				"up_kbps", up, "down_kbps", down)
+		}
+
+	case api.MsgCommandUpdate:
+		var p api.CommandUpdatePayload
+		if err := env.UnmarshalPayload(&p); err != nil {
+			return fmt.Errorf("command.update: %w", err)
+		}
+		go d.runUpdate(ctx, p, tx)

 	default:
 		slog.Debug("ws agent: ignored message", "type", env.Type)
@@ -274,17 +405,113 @@ func (d *dispatcher) handle(ctx context.Context, env api.Envelope, tx wsclient.S
 	return nil
 }

+// handleTreeList serves one tree.list request: it lists the children
+// of p.Path inside snapshot p.SnapshotID via the restic wrapper and
+// sends a tree.list.result envelope that reuses the request's envelope
+// ID (reqID) for correlation. Every failure — no restic binary,
+// unreadable or empty credentials, a failed listing — is reported in
+// the result's Error field instead of as a transport-level error, so
+// the server-side waiter has a message to show.
+func (d *dispatcher) handleTreeList(ctx context.Context, reqID string, p api.TreeListRequestPayload, tx wsclient.Sender) {
+	// respond stamps the request's snapshot/path onto the result and
+	// sends it; the send itself is best-effort.
+	respond := func(result api.TreeListResultPayload) {
+		result.SnapshotID = p.SnapshotID
+		result.Path = p.Path
+		out, err := api.Marshal(api.MsgTreeListResult, reqID, result)
+		if err != nil {
+			slog.Warn("ws agent: marshal tree.list.result", "err", err)
+			return
+		}
+		_ = tx.Send(out)
+	}
+	// fail is shorthand for an error-only result.
+	fail := func(msg string) {
+		respond(api.TreeListResultPayload{Error: msg})
+	}
+
+	if d.resticBin == "" {
+		fail("restic binary not located on this agent")
+		return
+	}
+	creds, err := d.secrets.Load()
+	switch {
+	case err != nil:
+		fail("load credentials: " + err.Error())
+		return
+	case creds.Empty():
+		fail("repo credentials not configured")
+		return
+	}
+
+	// Snapshot the host-wide bandwidth caps under the lock.
+	d.bwMu.Lock()
+	upKBps, downKBps := d.bwUpKBps, d.bwDownKBps
+	d.bwMu.Unlock()
+
+	repo := restic.Env{
+		Bin:               d.resticBin,
+		RepoURL:           creds.URL,
+		RepoUsername:      creds.Username,
+		RepoPassword:      creds.Password,
+		LimitUploadKBps:   upKBps,
+		LimitDownloadKBps: downKBps,
+	}
+
+	// 60s ceiling matches snapshots/stats — restic ls on a single
+	// directory is normally sub-second; if the repo is unreachable we
+	// want to surface the failure rather than block the wizard.
+	listCtx, cancel := context.WithTimeout(ctx, 60*time.Second)
+	defer cancel()
+
+	children, err := repo.ListTreeChildren(listCtx, p.SnapshotID, p.Path)
+	if err != nil {
+		fail(err.Error())
+		return
+	}
+
+	// Convert the restic-side entries into their wire representation.
+	wire := make([]api.TreeListEntry, len(children))
+	for i, child := range children {
+		wire[i] = api.TreeListEntry{
+			Name: child.Name,
+			Type: child.Type,
+			Size: child.Size,
+		}
+	}
+	respond(api.TreeListResultPayload{Entries: wire})
+}
+
+// failJob reports a command.run that could not be started locally
+// (missing restic binary, missing credentials, malformed payload) by
+// sending a synthetic job.started followed by a job.finished with
+// status failed, exit code -1 and errMsg as the error text. Both
+// envelopes use p.JobID as their envelope ID and share one timestamp.
+//
+// Without this pair the server never learns the job is dead: the row
+// stays "running", the live stream waits on agent output forever, and
+// a later command.cancel hits a job_id the agent never registered.
+// Marshal failures skip the affected envelope; sends are best-effort.
+func failJob(p api.CommandRunPayload, tx wsclient.Sender, errMsg string) {
+	stamp := time.Now().UTC()
+
+	started := api.JobStartedPayload{
+		JobID: p.JobID, Kind: p.Kind, StartedAt: stamp,
+	}
+	startedEnv, err := api.Marshal(api.MsgJobStarted, p.JobID, started)
+	if err == nil {
+		_ = tx.Send(startedEnv)
+	}
+
+	finished := api.JobFinishedPayload{
+		JobID:      p.JobID,
+		Status:     api.JobFailed,
+		ExitCode:   -1,
+		FinishedAt: stamp,
+		Error:      errMsg,
+	}
+	finEnv, err := api.Marshal(api.MsgJobFinished, p.JobID, finished)
+	if err == nil {
+		_ = tx.Send(finEnv)
+	}
+}
+
 // runJob spawns a runner for one job. We launch a goroutine so the
 // WS read loop keeps draining messages while restic chugs along.
 func (d *dispatcher) runJob(ctx context.Context, p api.CommandRunPayload, tx wsclient.Sender) error {
 	if d.resticBin == "" {
+		failJob(p, tx, "restic binary not located on this agent")
 		return fmt.Errorf("restic binary not located on this agent")
 	}
 	creds, err := d.secrets.Load()
 	if err != nil {
+		failJob(p, tx, "load repo credentials: "+err.Error())
 		return fmt.Errorf("load repo credentials: %w", err)
 	}
 	if creds.Empty() {
+		failJob(p, tx, "repo credentials not configured (waiting for server config.update push)")
 		return fmt.Errorf("repo credentials not configured (waiting for server config.update push)")
 	}
 	// r is the everyday runner — bound to the host's repo
@@ -295,13 +522,48 @@ func (d *dispatcher) runJob(ctx context.Context, p api.CommandRunPayload, tx wsc
 	// not on r). If you find yourself adding a new JobKind that
 	// needs delete authority, mirror the JobPrune pattern below
 	// — don't try to overload r.
+	// Resolve bandwidth caps: per-job override (if set) wins over the
+	// host-wide caps last pushed via config.update. <=0 means no cap.
+	d.bwMu.Lock()
+	upKBps, downKBps := d.bwUpKBps, d.bwDownKBps
+	d.bwMu.Unlock()
+	if p.BandwidthUpKBps != nil {
+		upKBps = *p.BandwidthUpKBps
+	}
+	if p.BandwidthDownKBps != nil {
+		downKBps = *p.BandwidthDownKBps
+	}
+
 	r := runner.New(runner.Config{
-		ResticBin:    d.resticBin,
-		RepoURL:      creds.URL,
-		RepoUsername: creds.Username,
-		RepoPassword: creds.Password,
+		ResticBin:                  d.resticBin,
+		ResticVersion:              d.resticVer,
+		RepoURL:                    creds.URL,
+		RepoUsername:               creds.Username,
+		RepoPassword:               creds.Password,
+		SupportsRestoreNoOwnership: d.resticSupportsNoOwnership,
+		LimitUploadKBps:            upKBps,
+		LimitDownloadKBps:          downKBps,
 	}, tx, time.Second)

+	// spawn wraps the kind-specific goroutine: derives a per-job
+	// cancellable context from the connection-scoped ctx, registers
+	// the cancel func so command.cancel can fire it, deregisters on
+	// completion. Per-job ctx means canceling one job doesn't kill
+	// any other in-flight invocations.
+	spawn := func(name string, fn func(ctx context.Context) error) {
+		jobCtx, cancel := context.WithCancel(ctx)
+		cleanup := d.trackJob(p.JobID, cancel)
+		go func() {
+			defer cleanup()
+			defer cancel() // release ctx resources on goroutine exit
+			if err := fn(jobCtx); err != nil {
+				slog.Warn("agent: "+name+" job failed", "job_id", p.JobID, "err", err)
+				return
+			}
+			slog.Info("agent: "+name+" job complete", "job_id", p.JobID)
+		}()
+	}
+
 	switch p.Kind {
 	case api.JobBackup:
 		// Includes/Excludes/Tag come from the source group resolved
@@ -318,22 +580,15 @@ func (d *dispatcher) runJob(ctx context.Context, p api.CommandRunPayload, tx wsc
 		}
 		slog.Info("agent: accepting backup job",
 			"job_id", p.JobID, "paths", paths, "excludes", p.Excludes, "tag", p.Tag)
-		go func() {
-			if err := r.RunBackup(ctx, p.JobID, paths, p.Excludes, tags); err != nil {
-				slog.Warn("agent: backup job failed", "job_id", p.JobID, "err", err)
-				return
-			}
-			slog.Info("agent: backup job complete", "job_id", p.JobID)
-		}()
+		hooks := runner.BackupHooks{Pre: p.PreHook, Post: p.PostHook}
+		spawn("backup", func(jobCtx context.Context) error {
+			return r.RunBackup(jobCtx, p.JobID, paths, p.Excludes, tags, hooks)
+		})
 	case api.JobInit:
 		slog.Info("agent: accepting init job", "job_id", p.JobID)
-		go func() {
-			if err := r.RunInit(ctx, p.JobID); err != nil {
-				slog.Warn("agent: init job failed", "job_id", p.JobID, "err", err)
-				return
-			}
-			slog.Info("agent: init job complete", "job_id", p.JobID)
-		}()
+		spawn("init", func(jobCtx context.Context) error {
+			return r.RunInit(jobCtx, p.JobID)
+		})
 	case api.JobForget:
 		if len(p.ForgetGroups) == 0 {
 			// Hard-error rather than fall back to a single-policy form:
@@ -343,6 +598,7 @@ func (d *dispatcher) runJob(ctx context.Context, p api.CommandRunPayload, tx wsc
 			// policy fallback was specced but skipped — see the
 			// Phase 5 plan rationale and version.go's lockstep-deploy
 			// note for why.
+			failJob(p, tx, "forget: command.run carried no forget_groups (server didn't populate them)")
 			return fmt.Errorf("forget: command.run carried no forget_groups (server didn't populate them)")
 		}
 		groups := make([]restic.ForgetGroup, 0, len(p.ForgetGroups))
@@ -360,13 +616,9 @@ func (d *dispatcher) runJob(ctx context.Context, p api.CommandRunPayload, tx wsc
 			})
 		}
 		slog.Info("agent: accepting forget job", "job_id", p.JobID, "groups", len(groups))
-		go func() {
-			if err := r.RunForget(ctx, p.JobID, groups); err != nil {
-				slog.Warn("agent: forget job failed", "job_id", p.JobID, "err", err)
-				return
-			}
-			slog.Info("agent: forget job complete", "job_id", p.JobID)
-		}()
+		spawn("forget", func(jobCtx context.Context) error {
+			return r.RunForget(jobCtx, p.JobID, groups)
+		})
 	case api.JobPrune:
 		// Prune may require admin creds (delete authority on rest-server).
 		runCreds := creds
@@ -381,36 +633,66 @@ func (d *dispatcher) runJob(ctx context.Context, p api.CommandRunPayload, tx wsc
 			runCreds = ac
 		}
 		prr := runner.New(runner.Config{
-			ResticBin:    d.resticBin,
-			RepoURL:      runCreds.URL,
-			RepoUsername: runCreds.Username,
-			RepoPassword: runCreds.Password,
+			ResticBin:                  d.resticBin,
+			ResticVersion:              d.resticVer,
+			RepoURL:                    runCreds.URL,
+			RepoUsername:               runCreds.Username,
+			RepoPassword:               runCreds.Password,
+			SupportsRestoreNoOwnership: d.resticSupportsNoOwnership,
+			LimitUploadKBps:            upKBps,
+			LimitDownloadKBps:          downKBps,
 		}, tx, time.Second)
 		slog.Info("agent: accepting prune job", "job_id", p.JobID, "admin_creds", p.RequiresAdminCreds)
-		go func() {
-			if err := prr.RunPrune(ctx, p.JobID); err != nil {
-				slog.Warn("agent: prune job failed", "job_id", p.JobID, "err", err)
-			}
-		}()
+		spawn("prune", func(jobCtx context.Context) error {
+			return prr.RunPrune(jobCtx, p.JobID)
+		})
 	case api.JobCheck:
 		subset := 0
 		if len(p.Args) > 0 {
 			subset, _ = strconv.Atoi(p.Args[0])
 		}
 		slog.Info("agent: accepting check job", "job_id", p.JobID, "subset_pct", subset)
-		go func() {
-			if err := r.RunCheck(ctx, p.JobID, subset); err != nil {
-				slog.Warn("agent: check job failed", "job_id", p.JobID, "err", err)
-			}
-		}()
+		spawn("check", func(jobCtx context.Context) error {
+			return r.RunCheck(jobCtx, p.JobID, subset)
+		})
 	case api.JobUnlock:
 		slog.Info("agent: accepting unlock job", "job_id", p.JobID)
-		go func() {
-			if err := r.RunUnlock(ctx, p.JobID); err != nil {
-				slog.Warn("agent: unlock job failed", "job_id", p.JobID, "err", err)
-			}
-		}()
+		spawn("unlock", func(jobCtx context.Context) error {
+			return r.RunUnlock(jobCtx, p.JobID)
+		})
+	case api.JobRestore:
+		if p.Restore == nil {
+			failJob(p, tx, "restore: command.run carried no restore payload")
+			return fmt.Errorf("restore: command.run carried no restore payload")
+		}
+		rp := *p.Restore
+		if rp.SnapshotID == "" {
+			failJob(p, tx, "restore: snapshot_id is required")
+			return fmt.Errorf("restore: snapshot_id is required")
+		}
+		if !rp.InPlace && rp.TargetDir == "" {
+			failJob(p, tx, "restore: target_dir required for non-in-place restore")
+			return fmt.Errorf("restore: target_dir required for non-in-place restore")
+		}
+		slog.Info("agent: accepting restore job",
+			"job_id", p.JobID, "snapshot_id", rp.SnapshotID,
+			"paths", rp.Paths, "in_place", rp.InPlace, "target", rp.TargetDir)
+		spawn("restore", func(jobCtx context.Context) error {
+			return r.RunRestore(jobCtx, p.JobID, rp.SnapshotID, rp.Paths, rp.InPlace, rp.TargetDir)
+		})
+	case api.JobDiff:
+		if p.Diff == nil || p.Diff.SnapshotA == "" || p.Diff.SnapshotB == "" {
+			failJob(p, tx, "diff: command.run carried incomplete diff payload")
+			return fmt.Errorf("diff: command.run carried incomplete diff payload")
+		}
+		dp := *p.Diff
+		slog.Info("agent: accepting diff job",
+			"job_id", p.JobID, "a", dp.SnapshotA, "b", dp.SnapshotB)
+		spawn("diff", func(jobCtx context.Context) error {
+			return r.RunDiff(jobCtx, p.JobID, dp.SnapshotA, dp.SnapshotB)
+		})
 	default:
+		failJob(p, tx, fmt.Sprintf("kind %q not implemented on this agent", p.Kind))
 		return fmt.Errorf("kind %q not implemented yet (Phase 2 lands the rest)", p.Kind)
 	}
 	return nil
@@ -0,0 +1,65 @@
+package main
+
+import (
+	"context"
+	"fmt"
+	"log/slog"
+	"time"
+
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/agent/updater"
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/agent/wsclient"
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
+)
+
+// runUpdate services a server-dispatched command.update. It announces
+// the job with job.started, streams progress lines as log.stream
+// envelopes (also mirrored to slog) so the live job page captures the
+// pre-restart state, and then invokes the platform updater against
+// d.serverURL. On Linux the updater calls os.Exit; on Windows it
+// spawns a detached helper and returns, with the agent then exiting.
+//
+// The agent never reports success itself: the server decides the
+// terminal state when the agent re-hellos with the matching version.
+// The only job.finished sent from here is the failure case, before
+// any restart has been attempted.
+func (d *dispatcher) runUpdate(ctx context.Context, p api.CommandUpdatePayload, tx wsclient.Sender) {
+	// progress logs one formatted line locally and forwards it to the
+	// server as a stdout log.stream line for this job (best-effort).
+	progress := func(format string, args ...any) {
+		line := fmt.Sprintf(format, args...)
+		slog.Info("ws agent: update: " + line)
+		if env, err := api.Marshal(api.MsgLogStream, "", api.LogStreamLine{
+			JobID:   p.JobID,
+			TS:      time.Now().UTC(),
+			Stream:  api.LogStdout,
+			Payload: line,
+		}); err == nil {
+			_ = tx.Send(env)
+		}
+	}
+
+	if startedEnv, err := api.Marshal(api.MsgJobStarted, "", api.JobStartedPayload{
+		JobID:     p.JobID,
+		Kind:      api.JobUpdate,
+		StartedAt: time.Now().UTC(),
+	}); err == nil {
+		_ = tx.Send(startedEnv)
+	}
+
+	progress("fetching new binary from %s", d.serverURL)
+	err := updater.Update(ctx, d.serverURL)
+	if err == nil {
+		// Unreachable on Linux (Update calls os.Exit). On Windows control
+		// returns here while the detached helper does the swap-and-restart;
+		// the agent then exits cleanly so SCM hands off.
+		return
+	}
+
+	// Failure before any restart: log it and close the job out as failed.
+	progress("update failed: %v", err)
+	if finishedEnv, mErr := api.Marshal(api.MsgJobFinished, "", api.JobFinishedPayload{
+		JobID:      p.JobID,
+		Status:     api.JobFailed,
+		FinishedAt: time.Now().UTC(),
+		Error:      err.Error(),
+	}); mErr == nil {
+		_ = tx.Send(finishedEnv)
+	}
+}
@@ -9,21 +9,26 @@ import (
 	"os"
 	"os/signal"
 	"path/filepath"
+	"strings"
 	"syscall"
 	"time"

+	"gitea.dcglab.co.uk/steve/restic-manager/internal/alert"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/auth"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/crypto"
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/notification"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/config"
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/fleetupdate"
 	rmhttp "gitea.dcglab.co.uk/steve/restic-manager/internal/server/http"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/maintenance"
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/metrics"
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/oidc"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ui"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ws"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
 )

-var version = "dev"
-
 func main() {
 	if err := run(); err != nil {
 		slog.Error("server fatal", "err", err)
@@ -37,7 +42,7 @@ func run() error {
 	flag.Parse()

 	if *showVersion {
-		fmt.Println("restic-manager-server", version)
+		fmt.Printf("restic-manager-server %s (commit %s, built %s)\n", version.Version, version.Commit, version.Date)
 		return nil
 	}

@@ -81,20 +86,41 @@ func run() error {

 	hub := ws.NewHub()
 	jobHub := ws.NewJobHub()
+	metricsRegistry := metrics.NewRegistry()
+
+	notifHub := notification.NewHub(st, aead, cfg.BaseURL)
+	alertEngine := alert.NewEngine(st, notifHub)
+	updateWatcher := ws.NewUpdateWatcher(st, alertEngine, jobHub)

 	renderer, err := ui.New()
 	if err != nil {
 		return fmt.Errorf("ui: %w", err)
 	}

+	var oidcClient *oidc.Client
+	if cfg.OIDC != nil {
+		ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+		defer cancel()
+		oidcClient, err = oidc.New(ctx, cfg.OIDC, cfg.BaseURL)
+		if err != nil {
+			return fmt.Errorf("oidc: %w", err)
+		}
+		slog.Info("oidc enabled", "issuer", cfg.OIDC.Issuer, "display", cfg.OIDC.DisplayName)
+	}
+
 	deps := rmhttp.Deps{
-		Cfg:     cfg,
-		Store:   st,
-		AEAD:    aead,
-		Hub:     hub,
-		JobHub:  jobHub,
-		UI:      renderer,
-		Version: version,
+		Cfg:             cfg,
+		Store:           st,
+		AEAD:            aead,
+		Hub:             hub,
+		JobHub:          jobHub,
+		AlertEngine:     alertEngine,
+		NotificationHub: notifHub,
+		UpdateWatcher:   updateWatcher,
+		UI:              renderer,
+		Version:         version.Version,
+		OIDC:            oidcClient,
+		Metrics:         metricsRegistry,
 	}

 	// First-run bootstrap: if the users table is empty, mint a one-time
@@ -115,20 +141,38 @@ func run() error {
 		// text exactly once; we hash it into BootstrapToken on the
 		// server-side handler.
 		fmt.Fprintln(os.Stderr, "================================================================")
-		fmt.Fprintln(os.Stderr, "  FIRST RUN — bootstrap token (use within 1 hour, then it's gone):")
+		fmt.Fprintln(os.Stderr, "  FIRST RUN — no admin user exists yet.")
+		if cfg.BaseURL != "" {
+			fmt.Fprintln(os.Stderr, "  Open this URL in a browser to create the first administrator:")
+			fmt.Fprintln(os.Stderr, "    "+strings.TrimRight(cfg.BaseURL, "/")+"/bootstrap")
+		} else {
+			fmt.Fprintln(os.Stderr, "  Open the server URL in a browser; you'll be sent to /bootstrap.")
+			fmt.Fprintln(os.Stderr, "  (Set RM_BASE_URL to have a clickable link printed here.)")
+		}
+		fmt.Fprintln(os.Stderr, "")
+		fmt.Fprintln(os.Stderr, "  Headless? POST {token, username, password} to /api/bootstrap")
+		fmt.Fprintln(os.Stderr, "  with this one-shot bootstrap token (valid until first user exists):")
 		fmt.Fprintln(os.Stderr, "    "+token)
-		fmt.Fprintln(os.Stderr, "  POST it to /api/bootstrap with {token, username, password}.")
 		fmt.Fprintln(os.Stderr, "================================================================")
 	}

 	srv := rmhttp.New(deps)

+	// Fleet-update worker — built after the HTTP server because the
+	// dispatcher delegates back into srv.DispatchHostUpdate.
+	fleetWorker := fleetupdate.NewWorker(st, hub,
+		&serverDispatcher{srv: srv}, alertEngine)
+	srv.SetFleetWorker(fleetWorker)
+
 	ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
 	defer stop()

+	go alertEngine.Run(ctx)
+	go updateWatcher.Run(ctx)
+
 	errCh := make(chan error, 1)
 	go func() {
-		slog.Info("server listening", "addr", cfg.Listen, "version", version)
+		slog.Info("server listening", "addr", cfg.Listen, "version", version.Version)
 		errCh <- srv.Start()
 	}()

@@ -156,6 +200,10 @@ func run() error {
 	// shouldn't, but the queue exists either way).
 	pendingDrainTick := time.NewTicker(30 * time.Second)
 	defer pendingDrainTick.Stop()
+	// Pending-hosts expiry sweeper: drops announce rows past their 1h
+	// ceiling so the dashboard panel doesn't accumulate stale entries.
+	pendingExpiryTick := time.NewTicker(60 * time.Second)
+	defer pendingExpiryTick.Stop()
 	mt := maintenance.New(st)
 	go func() {
 		for {
@@ -171,11 +219,19 @@ func run() error {
 				}
 			case <-offlineTick.C:
 				cutoff := time.Now().Add(-90 * time.Second)
-				if n, err := st.MarkHostsOfflineStale(ctx, cutoff); err == nil && n > 0 {
-					slog.Info("marked hosts offline (stale heartbeat)", "n", n)
+				if ids, err := st.MarkHostsOfflineStaleReturnIDs(ctx, cutoff); err == nil && len(ids) > 0 {
+					slog.Info("marked hosts offline (stale heartbeat)", "n", len(ids))
+					for _, id := range ids {
+						alertEngine.NotifyHostOffline(id)
+					}
 				}
 			case <-pendingDrainTick.C:
 				srv.DrainAllDue(ctx)
+				srv.RunCatchupsDue(ctx)
+			case <-pendingExpiryTick.C:
+				if n, err := st.DeleteExpiredPendingHosts(ctx, time.Now().UTC()); err == nil && n > 0 {
+					slog.Info("expired pending hosts swept", "n", n)
+				}
 			case <-maintenanceTick.C:
 				decisions, err := mt.Decide(ctx, time.Now().UTC())
 				if err != nil {
@@ -206,3 +262,12 @@ func run() error {
 	}
 	return nil
 }
+
+// serverDispatcher adapts the http.Server's DispatchHostUpdate method
+// to the fleetupdate.Dispatcher interface. Lives in main so the
+// http and fleetupdate packages don't need to know about each other.
+type serverDispatcher struct{ srv *rmhttp.Server }
+
+// DispatchUpdate forwards ctx, hostID and actorUserID unchanged to the
+// wrapped server's DispatchHostUpdate and returns its three results
+// as-is. NOTE(review): the meaning of the two string results is
+// defined by DispatchHostUpdate, which is not visible here.
+func (d *serverDispatcher) DispatchUpdate(ctx context.Context, hostID, actorUserID string) (string, string, error) {
+	return d.srv.DispatchHostUpdate(ctx, hostID, actorUserID)
+}
@@ -1,14 +1,17 @@
 # syntax=docker/dockerfile:1.7

 # ---- Build stage --------------------------------------------------------
-FROM golang:1.25-alpine AS build
+# Cross-compiles:
+#   * the server binary for the image's TARGETARCH (linux/amd64 or arm64),
+#   * three agent binaries (linux/amd64, linux/arm64, windows/amd64) that
+#     the running server hands out via /agent/binary.
+# Pure-Go SQLite (modernc.org/sqlite) means CGO stays off; static binaries
+# run on distroless/static.
+FROM --platform=$BUILDPLATFORM golang:1.25-alpine AS build

 WORKDIR /src

-# Pure-Go SQLite (modernc.org/sqlite) means we can keep CGO off and build a
-# fully static binary that runs on distroless/static.
 ENV CGO_ENABLED=0 \
-    GOOS=linux \
    GOFLAGS="-trimpath"

 # Cache module downloads in a separate layer.
@@ -18,9 +21,45 @@ RUN go mod download
 COPY . .

 ARG VERSION=dev
-RUN go build -ldflags="-s -w -X main.version=${VERSION}" \
-    -o /out/restic-manager-server \
-    ./cmd/server
+ARG COMMIT=none
+ARG DATE=unknown
+ARG TARGETOS
+ARG TARGETARCH
+
+ENV VERSION_PKG="gitea.dcglab.co.uk/steve/restic-manager/internal/version"
+ENV LDFLAGS="-s -w \
+    -X ${VERSION_PKG}.Version=${VERSION} \
+    -X ${VERSION_PKG}.Commit=${COMMIT} \
+    -X ${VERSION_PKG}.Date=${DATE}"
+
+# Server: built for the image's runtime arch.
+RUN GOOS=${TARGETOS} GOARCH=${TARGETARCH} \
+    go build -ldflags="${LDFLAGS}" \
+        -o /out/restic-manager-server \
+        ./cmd/server
+
+# Empty /data skeleton so the runtime image carries an existing,
+# nonroot-owned mount point. Docker copies that ownership onto a
+# named volume the first time it's created, which avoids the
+# "permission denied" trap on /data/secret.key when the operator
+# uses a default `volumes: { rm-data: {} }` declaration.
+RUN mkdir -p /out/data
+
+# Agents: identical across image arches — an arm64 server image still
+# ships an amd64 agent binary for amd64 endpoints to download.
+RUN mkdir -p /out/agent-binaries && \
+    GOOS=linux GOARCH=amd64 \
+        go build -ldflags="${LDFLAGS}" \
+            -o /out/agent-binaries/restic-manager-agent-linux-amd64 \
+            ./cmd/agent && \
+    GOOS=linux GOARCH=arm64 \
+        go build -ldflags="${LDFLAGS}" \
+            -o /out/agent-binaries/restic-manager-agent-linux-arm64 \
+            ./cmd/agent && \
+    GOOS=windows GOARCH=amd64 \
+        go build -ldflags="${LDFLAGS}" \
+            -o /out/agent-binaries/restic-manager-agent-windows-amd64.exe \
+            ./cmd/agent

 # ---- Runtime stage ------------------------------------------------------
 FROM gcr.io/distroless/static-debian12:nonroot
@@ -31,7 +70,22 @@ LABEL org.opencontainers.image.licenses="PolyForm-Noncommercial-1.0.0"
 USER nonroot:nonroot
 WORKDIR /

+# Server binary on PATH.
 COPY --from=build /out/restic-manager-server /usr/local/bin/restic-manager-server

+# Image-baked bundled assets (P5-03). Read-only; the /agent/binary and
+# /install/* handlers fall back here when <DataDir>/... is empty, so a
+# fresh container Just Works without first-run staging. Operators can
+# still drop a custom build under <DataDir>/agent-binaries/<name> to
+# override per-host.
+COPY --from=build --chmod=0755 /out/agent-binaries/ /opt/restic-manager/dist/agent-binaries/
+COPY --chmod=0755 deploy/install/install.sh /opt/restic-manager/dist/install/install.sh
+COPY --chmod=0644 deploy/install/install.ps1 /opt/restic-manager/dist/install/install.ps1
+COPY --chmod=0644 deploy/install/restic-manager-agent.service /opt/restic-manager/dist/install/restic-manager-agent.service
+
+# Pre-created data dir owned by nonroot so a fresh named volume
+# inherits the right ownership.
+COPY --from=build --chown=nonroot:nonroot /out/data /data
+
 EXPOSE 8443
 ENTRYPOINT ["/usr/local/bin/restic-manager-server"]
@@ -1,21 +1,52 @@
 # Reference deployment for the restic-manager control plane.
-# Mirrors spec.md §10.1. Adjust image tag and RM_BASE_URL for your env.
+# Mirrors spec.md §10.1 and the P5-07 reference deployment.
 #
-# The server speaks plain HTTP. Front it with a TLS-terminating
-# reverse proxy (Caddy/Traefik/nginx). RM_TRUSTED_PROXY must contain
-# the proxy's IP/CIDR so X-Forwarded-* headers are honoured.
+# Scope: this compose stands up the server only. TLS termination and
+# the public hostname belong to a reverse proxy that lives outside
+# this stack (Caddy, Traefik, nginx, HAProxy, your existing edge —
+# whatever you already operate). See `docs/reverse-proxy.md` for the
+# headers + CIDRs that proxy needs to forward.
+#
+# Architecture:
+#   * The server speaks plain HTTP on :8080.
+#   * The agent binaries + install scripts ship inside the image under
+#     /opt/restic-manager/dist/, so /agent/binary and /install/*
+#     serve out of the box without first-run staging.
+#   * The named volume holds *only* operator state (sqlite,
+#     secrets.enc, audit log, the AEAD key). Image upgrades replace
+#     the agents/scripts; the volume is untouched.
+#   * Pre-1.0 releases never publish :latest — pin to an exact
+#     vX.Y.Z tag and bump deliberately.
+#
+# Before first start:
+#   1. Pick a version: export RM_VERSION=vX.Y.Z (or substitute below).
+#   2. Set RM_BASE_URL to the public HTTPS URL the external proxy
+#      serves on.
+#   3. Set RM_TRUSTED_PROXY to the IP/CIDR the proxy connects from
+#      (the X-Forwarded-* headers are honoured only when the immediate
+#      peer matches one of these).
+
 services:
  restic-manager:
-    image: ghcr.io/dcglab/restic-manager:latest
+    image: gitea.dcglab.co.uk/steve/restic-manager:${RM_VERSION:?set RM_VERSION to a vX.Y.Z tag}
    restart: unless-stopped
-    # Bind to localhost only — the proxy is what the public reaches.
+    # Bind to localhost only — your reverse proxy reaches the server
+    # over loopback (or, if it runs in a separate compose / on
+    # another host, swap this for an internal docker network or a
+    # private LAN bind).
    ports:
      - "127.0.0.1:8080:8080"
    volumes:
-      - ./data:/data
+      - rm-data:/data
    environment:
      - RM_DATA_DIR=/data
      - RM_LISTEN=:8080
-      - RM_BASE_URL=https://restic.lab.example
+      - RM_BASE_URL=${RM_BASE_URL:?set RM_BASE_URL to the public https URL}
      - RM_SECRET_KEY_FILE=/data/secret.key
-      - RM_TRUSTED_PROXY=172.16.0.0/12
+      - RM_TRUSTED_PROXY=${RM_TRUSTED_PROXY:?set RM_TRUSTED_PROXY to the proxy CIDR}
+      # Cookies are Secure by default; keep that. Override only for
+      # local-HTTP smoke tests.
+      # - RM_COOKIE_SECURE=true
+
+volumes:
+  rm-data:
@@ -0,0 +1,325 @@
+{
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": { "type": "grafana", "uid": "-- Grafana --" },
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "name": "Annotations & Alerts",
+        "type": "dashboard"
+      }
+    ]
+  },
+  "description": "restic-manager fleet overview. Imports against any Prometheus data source.",
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 0,
+  "id": null,
+  "links": [],
+  "liveNow": false,
+  "panels": [
+    {
+      "id": 1,
+      "title": "Fleet status",
+      "type": "stat",
+      "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
+      "gridPos": { "h": 6, "w": 6, "x": 0, "y": 0 },
+      "fieldConfig": {
+        "defaults": {
+          "color": { "mode": "thresholds" },
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "color": "red", "value": null },
+              { "color": "green", "value": 1 }
+            ]
+          },
+          "unit": "short"
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
+        "textMode": "auto"
+      },
+      "targets": [
+        {
+          "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
+          "expr": "rm_hosts_online",
+          "legendFormat": "online",
+          "refId": "A"
+        },
+        {
+          "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
+          "expr": "rm_hosts_total",
+          "legendFormat": "total",
+          "refId": "B"
+        }
+      ]
+    },
+    {
+      "id": 2,
+      "title": "Open alerts",
+      "type": "stat",
+      "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
+      "gridPos": { "h": 6, "w": 6, "x": 6, "y": 0 },
+      "fieldConfig": {
+        "defaults": {
+          "color": { "mode": "thresholds" },
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "color": "green", "value": null },
+              { "color": "yellow", "value": 1 },
+              { "color": "red", "value": 5 }
+            ]
+          },
+          "unit": "short"
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "none",
+        "orientation": "horizontal",
+        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
+        "textMode": "auto"
+      },
+      "targets": [
+        {
+          "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
+          "expr": "sum by (severity) (rm_active_alerts)",
+          "legendFormat": "{{severity}}",
+          "refId": "A"
+        }
+      ]
+    },
+    {
+      "id": 3,
+      "title": "Backups failing (last reported run)",
+      "type": "stat",
+      "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
+      "gridPos": { "h": 6, "w": 6, "x": 12, "y": 0 },
+      "fieldConfig": {
+        "defaults": {
+          "color": { "mode": "thresholds" },
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "color": "green", "value": null },
+              { "color": "red", "value": 1 }
+            ]
+          },
+          "unit": "short"
+        },
+        "overrides": []
+      },
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
+        "textMode": "auto"
+      },
+      "targets": [
+        {
+          "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
+          "expr": "count(rm_host_last_backup_success == 0)",
+          "legendFormat": "failing",
+          "refId": "A"
+        }
+      ]
+    },
+    {
+      "id": 4,
+      "title": "Hosts",
+      "type": "table",
+      "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
+      "gridPos": { "h": 10, "w": 24, "x": 0, "y": 6 },
+      "fieldConfig": {
+        "defaults": {
+          "custom": { "align": "auto", "displayMode": "auto" }
+        },
+        "overrides": [
+          {
+            "matcher": { "id": "byName", "options": "Value #B" },
+            "properties": [
+              { "id": "displayName", "value": "Last backup (s ago)" },
+              { "id": "unit", "value": "s" }
+            ]
+          },
+          {
+            "matcher": { "id": "byName", "options": "Value #C" },
+            "properties": [
+              { "id": "displayName", "value": "Repo size" },
+              { "id": "unit", "value": "bytes" }
+            ]
+          },
+          {
+            "matcher": { "id": "byName", "options": "Value #D" },
+            "properties": [
+              { "id": "displayName", "value": "Snapshots" }
+            ]
+          },
+          {
+            "matcher": { "id": "byName", "options": "Value #A" },
+            "properties": [
+              { "id": "displayName", "value": "Online" }
+            ]
+          },
+          {
+            "matcher": { "id": "byName", "options": "Value #E" },
+            "properties": [
+              { "id": "displayName", "value": "Open alerts" }
+            ]
+          }
+        ]
+      },
+      "options": { "showHeader": true },
+      "transformations": [
+        {
+          "id": "merge",
+          "options": {}
+        }
+      ],
+      "targets": [
+        {
+          "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
+          "expr": "rm_host_agent_online",
+          "format": "table",
+          "instant": true,
+          "refId": "A"
+        },
+        {
+          "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
+          "expr": "time() - rm_host_last_backup_timestamp_seconds",
+          "format": "table",
+          "instant": true,
+          "refId": "B"
+        },
+        {
+          "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
+          "expr": "rm_host_repo_size_bytes",
+          "format": "table",
+          "instant": true,
+          "refId": "C"
+        },
+        {
+          "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
+          "expr": "rm_host_snapshot_count",
+          "format": "table",
+          "instant": true,
+          "refId": "D"
+        },
+        {
+          "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
+          "expr": "rm_host_open_alerts",
+          "format": "table",
+          "instant": true,
+          "refId": "E"
+        }
+      ]
+    },
+    {
+      "id": 5,
+      "title": "Repo size over time",
+      "type": "timeseries",
+      "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 },
+      "fieldConfig": {
+        "defaults": {
+          "color": { "mode": "palette-classic" },
+          "custom": {
+            "axisLabel": "",
+            "drawStyle": "line",
+            "fillOpacity": 10,
+            "lineWidth": 1,
+            "pointSize": 5,
+            "showPoints": "never"
+          },
+          "unit": "bytes"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": { "calcs": ["last"], "displayMode": "list", "placement": "bottom", "showLegend": true },
+        "tooltip": { "mode": "multi", "sort": "desc" }
+      },
+      "targets": [
+        {
+          "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
+          "expr": "rm_host_repo_size_bytes",
+          "legendFormat": "{{host}}",
+          "refId": "A"
+        }
+      ]
+    },
+    {
+      "id": 6,
+      "title": "Job duration p95 (last 1h, by kind)",
+      "type": "timeseries",
+      "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 },
+      "fieldConfig": {
+        "defaults": {
+          "color": { "mode": "palette-classic" },
+          "custom": {
+            "drawStyle": "line",
+            "fillOpacity": 5,
+            "lineWidth": 1,
+            "pointSize": 4,
+            "showPoints": "never"
+          },
+          "unit": "s"
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": { "calcs": ["last"], "displayMode": "list", "placement": "bottom", "showLegend": true },
+        "tooltip": { "mode": "multi", "sort": "desc" }
+      },
+      "targets": [
+        {
+          "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
+          "expr": "histogram_quantile(0.95, sum by (kind, le) (rate(rm_job_duration_seconds_bucket[1h])))",
+          "legendFormat": "{{kind}}",
+          "refId": "A"
+        }
+      ]
+    }
+  ],
+  "refresh": "30s",
+  "schemaVersion": 39,
+  "style": "dark",
+  "tags": ["restic-manager", "backups"],
+  "templating": {
+    "list": [
+      {
+        "current": {},
+        "hide": 0,
+        "includeAll": false,
+        "label": "Prometheus",
+        "multi": false,
+        "name": "DS_PROMETHEUS",
+        "options": [],
+        "query": "prometheus",
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": false,
+        "type": "datasource"
+      }
+    ]
+  },
+  "time": { "from": "now-6h", "to": "now" },
+  "timepicker": {},
+  "timezone": "",
+  "title": "restic-manager — fleet",
+  "uid": "rm-fleet-overview",
+  "version": 1,
+  "weekStart": ""
+}
@@ -0,0 +1,133 @@
+# install.ps1 — Windows installer for the restic-manager agent (P2-17).
+#
+# Usage (Run as administrator):
+#   $env:RM_SERVER = "https://restic.lab.example"
+#   $env:RM_TOKEN  = "<one-time-token>"  # omit for announce-and-approve
+#   iwr "$env:RM_SERVER/install/install.ps1" -UseBasicParsing | iex
+#
+# What it does:
+#   1. checks for admin elevation
+#   2. downloads the matching agent binary from the server
+#   3. lays down C:\Program Files\restic-manager\ and
+#      C:\ProgramData\restic-manager\ (config + state)
+#   4. enrolls (token flow if RM_TOKEN set, otherwise announce flow)
+#      by spawning the agent with the right CLI flags and waits
+#      until config is written
+#   5. registers the agent as a Windows service via the agent's own
+#      `install` subcommand (which uses the SCM API) and starts it
+#   6. surfaces (but does NOT disable) any existing scheduled tasks
+#      whose name contains "restic" so the operator can decide
+#
+# Idempotent — safe to re-run.
+
+[CmdletBinding()]
+param(
+  [string]$Server = $env:RM_SERVER,
+  [string]$Token  = $env:RM_TOKEN,
+  [string]$InstallDir = 'C:\Program Files\restic-manager',
+  [string]$DataDir    = 'C:\ProgramData\restic-manager'
+)
+
+$ErrorActionPreference = 'Stop'
+
+# Returns $true when the current Windows identity is a member of the
+# built-in Administrators role, $false otherwise.
+function Test-Admin {
+  $adminRole   = [System.Security.Principal.WindowsBuiltInRole]::Administrator
+  $currentUser = [System.Security.Principal.WindowsIdentity]::GetCurrent()
+  $principal   = New-Object System.Security.Principal.WindowsPrincipal($currentUser)
+  return $principal.IsInRole($adminRole)
+}
+
+# Maps $env:PROCESSOR_ARCHITECTURE onto the arch names used in the
+# agent download URL ('amd64' / 'arm64'); throws for anything else.
+function Detect-Arch {
+  $procArch = $env:PROCESSOR_ARCHITECTURE
+  if ($procArch -eq 'AMD64') { return 'amd64' }
+  if ($procArch -eq 'ARM64') { return 'arm64' }
+  throw "unsupported PROCESSOR_ARCHITECTURE: $procArch"
+}
+
+# Lists scheduled tasks whose TaskName or TaskPath matches 'restic' and
+# prints a ready-to-paste Disable-ScheduledTask command for each one.
+# Read-only: nothing is disabled here. If enumeration throws, the
+# output is reduced to a hint to check the Task Scheduler UI.
+function Detect-ResticTasks {
+  Write-Host ''
+  Write-Host '— Existing restic-named scheduled tasks (review manually) —'
+  try {
+    $tasks = Get-ScheduledTask -ErrorAction SilentlyContinue |
+      Where-Object { $_.TaskName -match 'restic' -or $_.TaskPath -match 'restic' }
+    if ($tasks) {
+      foreach ($t in $tasks) {
+        Write-Host "  * $($t.TaskPath)$($t.TaskName)  state=$($t.State)"
+        Write-Host "    Disable with:  Disable-ScheduledTask -TaskName '$($t.TaskName)' -TaskPath '$($t.TaskPath)'"
+      }
+    } else {
+      Write-Host '  (none found)'
+    }
+  } catch {
+    Write-Host '  (Get-ScheduledTask failed; review the Task Scheduler UI manually)'
+  }
+  Write-Host ''
+}
+
+# --- preflight -------------------------------------------------------
+
+if (-not (Test-Admin)) {
+  throw 'install.ps1: must be run from an elevated PowerShell (Run as administrator).'
+}
+if (-not $Server) {
+  throw 'install.ps1: -Server (or $env:RM_SERVER) is required, e.g. https://restic.lab.example'
+}
+
+$arch = Detect-Arch
+Write-Host "install.ps1: server=$Server arch=$arch"
+
+# --- directories -----------------------------------------------------
+
+New-Item -ItemType Directory -Force -Path $InstallDir | Out-Null
+New-Item -ItemType Directory -Force -Path $DataDir    | Out-Null
+
+# --- download agent --------------------------------------------------
+
+$agentExe = Join-Path $InstallDir 'restic-manager-agent.exe'
+$tmpExe   = "$agentExe.tmp"
+$dlURL    = "$Server/agent/binary?os=windows&arch=$arch"
+Write-Host "install.ps1: downloading $dlURL"
+Invoke-WebRequest -UseBasicParsing -Uri $dlURL -OutFile $tmpExe
+# Atomic-ish replace: stop service if running so the .exe isn't busy.
+try { Stop-Service -Name 'restic-manager-agent' -ErrorAction SilentlyContinue } catch {}
+Move-Item -Force -Path $tmpExe -Destination $agentExe
+
+# --- enroll / announce -----------------------------------------------
+
+# Build the agent's enrolment command line. The array is deliberately
+# not named $args: that is a PowerShell automatic variable and should
+# not be assigned to.
+$cfgPath = Join-Path $DataDir 'agent.yaml'
+$agentArgs = @('-config', $cfgPath, '-enroll-server', $Server)
+if ($Token) {
+  $agentArgs += @('-enroll-token', $Token)
+  Write-Host 'install.ps1: enrolling with one-time token'
+} else {
+  Write-Host 'install.ps1: no RM_TOKEN — running announce-and-approve flow.'
+  Write-Host '  The fingerprint will print below. Compare it with the dashboard before clicking Accept.'
+}
+# Native executables report failure through their exit code rather
+# than an exception, so $LASTEXITCODE is checked explicitly.
+& $agentExe @agentArgs
+if ($LASTEXITCODE -ne 0) {
+  throw "install.ps1: agent enrolment failed (exit $LASTEXITCODE)"
+}
+
+# --- install + start service ----------------------------------------
+
+# The 'install' subcommand registers the service via the SCM. If
+# already registered, it errors loudly — re-run with -Force only if
+# you've manually verified.
+#
+# A native executable reports failure through its exit code, not a
+# PowerShell exception, so $LASTEXITCODE is checked explicitly; the
+# catch only covers a failure to launch the executable at all. Either
+# way the failure is reported and the script carries on, which keeps
+# re-runs on an already-registered host working.
+try {
+  & $agentExe install
+  if ($LASTEXITCODE -ne 0) {
+    Write-Host "install.ps1: 'install' exited with code $LASTEXITCODE; service may already be registered; continuing."
+  }
+} catch {
+  Write-Host "install.ps1: service may already be registered ($_); continuing."
+}
+# Start-Service is a cmdlet, so with $ErrorActionPreference = 'Stop'
+# its failures do surface as exceptions and the catch applies.
+try {
+  Start-Service -Name 'restic-manager-agent'
+} catch {
+  Write-Host "install.ps1: Start-Service failed ($_); check Event Viewer."
+}
+
+Detect-ResticTasks
+
+Write-Host ''
+Write-Host 'install.ps1: done.'
+Write-Host "  config : $cfgPath"
+Write-Host "  binary : $agentExe"
+Write-Host "  service: restic-manager-agent  (Get-Service to inspect)"
@@ -49,6 +49,11 @@ detect_arch() {
 ensure_dirs() {
  install -d -m 0700 -o root -g root "$RM_CONFIG_DIR"
  install -d -m 0700 -o root -g root "$RM_STATE_DIR"
+  # Default new-directory restore target: /root/rm-restore (the agent
+  # runs as root, so this is its $HOME). With the current unit
+  # (ProtectSystem=full rather than strict) the agent can mkdir
+  # anywhere outside the read-only system trees, so this is just a
+  # courtesy pre-create so the wizard's default lands in a tidy spot.
+  install -d -m 0700 -o root -g root /root/rm-restore
 }

 detect_existing_schedulers() {
@@ -33,12 +33,31 @@ CapabilityBoundingSet=CAP_DAC_READ_SEARCH CAP_DAC_OVERRIDE CAP_FOWNER CAP_CHOWN
 AmbientCapabilities=CAP_DAC_READ_SEARCH CAP_DAC_OVERRIDE CAP_FOWNER CAP_CHOWN

 # Hardening — blocks privilege escalation even from root, and
-# confines writes / network / kernel access to what restic actually
-# needs. Filesystem reads stay open: that's the whole job.
+# confines kernel / namespace / privilege surface. Filesystem reads
+# stay open (that's the whole job) and restore writes are
+# unrestricted: a backup tool whose entire purpose is "put files
+# back where they belong" can't have ProtectHome=read-only or
+# ProtectSystem=strict without breaking on the first cross-user
+# restore. ProtectSystem=full keeps /usr, /boot, /efi and /etc
+# read-only (bar the ReadWritePaths exemptions below) so a
+# compromised agent can't swap out /usr/bin/restic or drop a kernel
+# module, while leaving /home, /root, /var, /opt, /srv, /tmp etc.
+# writable for arbitrary restore targets. The agent is treated as a
+# high-trust component (it runs operator hooks as root and holds
+# repo credentials); the residual hardening is about kernel + privesc
+# protection, not write confinement.
 NoNewPrivileges=true
-ProtectSystem=strict
-ReadWritePaths=/etc/restic-manager /var/lib/restic-manager
-ProtectHome=read-only
+ProtectSystem=full
+# ProtectSystem=full mounts /usr, /boot, /efi *and* /etc read-only.
+# The agent rewrites /etc/restic-manager/agent.yaml on enrolment and
+# whenever a new SecretsKey is minted, so we need a targeted
+# write-exemption for that dir. No exemption for the rest of /etc:
+# the agent has no business editing /etc/passwd, /etc/sudoers, etc.
+#
+# /usr/local/bin is writable so the self-update flow (P6-01) can
+# atomic-rename a fresh binary over the running one. Permitting the
+# whole directory (rather than just the binary path) is required
+# because os.Rename rewrites the parent directory's entries, so the
+# directory itself must be writable.
+ReadWritePaths=/etc/restic-manager /usr/local/bin
 ProtectHostname=true
 ProtectKernelTunables=true
 ProtectKernelModules=true
@@ -0,0 +1,249 @@
+# Onboarding a new host — agent instructions
+
+How an automation agent (with a username + password for the
+restic-manager server) brings a new host fully online.
+
+The flow is two roles:
+
+- **Controller side**: the agent calls JSON APIs on the
+  restic-manager server. Needs network reach to the server, plus
+  username/password.
+- **Target side**: the host being onboarded runs the install
+  script, which calls back to the server with the one-time token.
+
+If the agent is *both* sides (e.g. it can SSH into the target),
+it does steps 1–2 against the server, step 3 on the target, and
+step 4 (verification) against the server again. If the agent only
+controls the server, it stops at step 2 and hands the install
+snippet to whoever owns the target.
+
+---
+
+## Conventions
+
+- Base URL: `$RM_SERVER` (e.g. `https://restic.lab.example`).
+- Session cookie jar: persist `rm_session` between calls.
+- All request/response bodies are JSON unless noted.
+- On any non-2xx, response body is
+  `{"code": "...", "message": "..."}`.
+
+---
+
+## 1. Login
+
+```
+POST $RM_SERVER/api/auth/login
+Content-Type: application/json
+
+{"username": "...", "password": "..."}
+```
+
+→ 200 with `{"user_id": "...", "role": "..."}` and a `Set-Cookie:
+rm_session=...` (HttpOnly, 24h TTL). Persist the cookie; reuse
+it on every subsequent call.
+
+Required role for the next step: **operator** or **admin**.
+A viewer-only login can read but cannot mint tokens.
+
+Session expires at 24h. On 401 from a later call, re-login.
+
+---
+
+## 2. Mint an enrolment token
+
+```
+POST $RM_SERVER/api/enrollment-tokens
+Cookie: rm_session=...
+Content-Type: application/json
+
+{
+  "hostname":      "newhost.example",
+  "tags":          ["prod", "london"],          // optional
+  "repo_url":      "rest:https://rest.example/newhost",
+  "repo_username": "...",                        // optional, for rest-server / S3
+  "repo_password": "...",                        // optional
+  "initial_paths": ["/etc", "/home", "/var/lib"] // optional; default source group
+}
+```
+
+→ 200 with:
+
+```json
+{ "token": "<RAW_ONE_TIME_TOKEN>", "expires_at": "2026-05-09T..." }
+```
+
+**Capture `token` immediately — the server only stores its hash
+and will never return the raw value again.** TTL is 1 hour.
+
+The repo creds you provided are encrypted under the token hash
+and pre-attached to the host. The agent will fetch and store
+them at enrol-time; you will not need to push them again.
+
+If you lose the token before the install runs, mint a new one
+(the existing one becomes irrelevant; you can leave it to expire
+or revoke it via the UI).
+
+---
+
+## 3. Install on the target host
+
+The install script is hosted by the server itself. Running on the
+target:
+
+### Linux
+
+```
+curl -fsSL $RM_SERVER/install/install.sh | \
+  sudo RM_SERVER=$RM_SERVER RM_TOKEN=<RAW_ONE_TIME_TOKEN> bash
+```
+
+What it does, end-to-end:
+
+1. detects arch (amd64 / arm64)
+2. downloads `$RM_SERVER/agent/binary?os=linux&arch=<arch>` to
+   `/usr/local/bin/restic-manager-agent`
+3. creates `/etc/restic-manager/` and `/var/lib/restic-manager/`
+   (root:root, 0700)
+4. calls `POST /api/agents/enroll` with the token; server returns
+   the persistent agent bearer + `host_id`, written to
+   `/etc/restic-manager/agent.env`
+5. installs the systemd unit, `daemon-reload`, `enable --now`
+6. surfaces any pre-existing restic cron/timer entries so the
+   operator can decide whether to disable them (script does
+   *not* touch them automatically)
+
+The script is idempotent. Re-running on an already-enrolled host
+is a no-op unless `RM_FORCE_REENROLL=1`.
+
+The agent runs as **root** by design — fleet backup needs to
+read every file on the system. See
+`deploy/install/restic-manager-agent.service` for rationale.
+
+### Windows
+
+```
+iwr $RM_SERVER/install/install.ps1 -UseBasicParsing | iex
+# (or download + run; needs an elevated PowerShell)
+# Required env: $env:RM_SERVER, $env:RM_TOKEN
+```
+
+Same flow, lays down a Windows service instead of a systemd unit.
+
+### Manual / non-script enrolment
+
+If the install script can't be used, the wire-level enrol call is:
+
+```
+POST $RM_SERVER/api/agents/enroll
+Content-Type: application/json
+
+{
+  "token":          "<RAW_ONE_TIME_TOKEN>",
+  "hostname":       "newhost.example",
+  "os":             "linux",                  // linux | windows
+  "arch":           "amd64",                  // amd64 | arm64
+  "agent_version":  "...",
+  "restic_version": "..."
+}
+```
+
+→ 200 with
+`{"host_id": "...", "agent_token": "...", "cert_pin_sha256": "..."}`.
+
+The agent_token goes into `/etc/restic-manager/agent.env` as
+`RM_AGENT_TOKEN=...`; subsequent agent → server traffic uses
+`Authorization: Bearer $RM_AGENT_TOKEN`.
+
+---
+
+## 4. Verify the host is healthy
+
+Poll until both conditions are true. Cap at ~5 minutes.
+
+```
+GET $RM_SERVER/api/hosts
+Cookie: rm_session=...
+```
+
+→ array of host objects. Find the one with the matching hostname
+and check:
+
+- `"status": "online"` — agent connected to the WS heartbeat
+- `"repo_status": "ready"` — `restic init` (or existing-config
+  detection) completed successfully
+
+If `repo_status` settles on `"init_failed"`, the repo creds are
+wrong or the repo URL is unreachable from the target. Inspect
+the matching job log:
+
+```
+GET $RM_SERVER/api/hosts/<host_id>/jobs   (most recent init job)
+GET $RM_SERVER/api/jobs/<job_id>          (full output)
+```
+
+Fix the creds with a creds-update call (see Settings → Repo on
+the UI for the exact route — currently form-only) or revoke the
+host and start over.
+
+---
+
+## 5. (Optional) configure schedules
+
+A new host gets one default source group covering `initial_paths`
+(or `/etc`,`/home` if you didn't pass any) and **no schedule**.
+Backups won't run until either:
+
+- a schedule is attached (cron expression, retention, etc.), or
+- you trigger an on-demand run via the source-group "Run now"
+  endpoint.
+
+These are not yet exposed cleanly as JSON-only routes; if the
+agent needs them, look at `internal/server/http/schedules*.go`
+and `internal/server/http/source_groups*.go` — most are JSON-
+capable, some are form-only with HTML 303 responses.
+
+---
+
+## Failure modes — quick reference
+
+| Symptom | Likely cause | Fix |
+|---|---|---|
+| `401` on `/api/enrollment-tokens` | session expired or viewer role | re-login as operator+ |
+| install.sh fails at "enrol": HTTP 410 | token expired (>1h) or already used | mint a fresh token |
+| Host shows `status=offline` after install | systemd unit didn't start; firewall blocks WS | `systemctl status restic-manager-agent`, check `$RM_SERVER` reachability |
+| `repo_status=init_failed` | bad repo creds or URL | inspect init job log; fix creds; retry probe via `/hosts/{id}/repo/probe` |
+| Token list grows with stale rows | normal — they expire at 1h | optional cleanup via `/hosts/enrollment-tokens/{hash}/revoke` |
+
+---
+
+## Minimum reproducible script
+
+```bash
+#!/usr/bin/env bash
+set -euo pipefail
+: "${RM_SERVER:?}" "${RM_USER:?}" "${RM_PASS:?}" "${RM_HOSTNAME:?}" \
+  "${RM_REPO_URL:?}" "${RM_REPO_USER:?}" "${RM_REPO_PASS:?}"
+
+JAR=$(mktemp)
+trap 'rm -f "$JAR"' EXIT
+
+# 1. login
+curl -fsS -c "$JAR" -H 'Content-Type: application/json' \
+  -d "{\"username\":\"$RM_USER\",\"password\":\"$RM_PASS\"}" \
+  "$RM_SERVER/api/auth/login" >/dev/null
+
+# 2. mint token
+TOKEN=$(curl -fsS -b "$JAR" -H 'Content-Type: application/json' \
+  -d "$(jq -nc \
+        --arg h "$RM_HOSTNAME" --arg u "$RM_REPO_USER" \
+        --arg p "$RM_REPO_PASS" --arg r "$RM_REPO_URL" \
+        '{hostname:$h, repo_url:$r, repo_username:$u, repo_password:$p}')" \
+  "$RM_SERVER/api/enrollment-tokens" | jq -r .token)
+
+# 3. emit the install snippet for the target machine
+cat <<EOF
+Run on $RM_HOSTNAME (as root):
+
+  curl -fsSL $RM_SERVER/install/install.sh | \\
+    sudo RM_SERVER=$RM_SERVER RM_TOKEN=$TOKEN bash
+EOF
+```
@@ -0,0 +1,19 @@
+[book]
+title = "restic-manager"
+description = "Self-hosted control plane for restic backups across a fleet of Linux and Windows endpoints."
+authors = ["Steve Cliff"]
+language = "en-GB"
+multilingual = false
+src = "src"
+
+[output.html]
+default-theme = "ayu"
+preferred-dark-theme = "ayu"
+git-repository-url = "https://gitea.dcglab.co.uk/steve/restic-manager"
+git-repository-icon = "fa-code-fork"
+edit-url-template = "https://gitea.dcglab.co.uk/steve/restic-manager/_edit/main/docs/book/{path}"
+no-section-label = false
+
+[output.html.fold]
+enable = true
+level = 2
@@ -0,0 +1,40 @@
+# Summary
+
+[Introduction](./intro.md)
+
+# Getting started
+
+- [Installing the server](./getting-started/install.md)
+- [Enrolling your first host](./getting-started/enrolling-hosts.md)
+- [Running behind a reverse proxy](./getting-started/reverse-proxy.md)
+
+# Concepts
+
+- [Architecture](./concepts/architecture.md)
+- [Credentials and how they flow](./concepts/credentials.md)
+- [Schedules and source groups](./concepts/schedules-and-source-groups.md)
+- [Repo maintenance](./concepts/repo-maintenance.md)
+
+# Operations
+
+- [Backups and restores](./operations/backups-and-restores.md)
+- [Alerts and notifications](./operations/alerts.md)
+- [Observability with Prometheus](./operations/observability.md)
+- [Updating agents](./operations/updates.md)
+
+# Security
+
+- [Threat model](./security/threat-model.md)
+- [Hardening checklist](./security/hardening.md)
+- [Reporting vulnerabilities](./security/disclosure.md)
+
+# Reference
+
+- [Environment variables](./reference/env-vars.md)
+- [HTTP endpoints](./reference/http-endpoints.md)
+
+---
+
+[Contributing](./contributing.md)
+[Roadmap](./roadmap.md)
+[License](./license.md)
@@ -0,0 +1,121 @@
+# Architecture
+
+## Components
+
+```
+┌────────────────────────────────────────────────────────────┐
+│  Server (control plane, single process)                    │
+│   * chi-based HTTP API + HTMX server-rendered UI           │
+│   * WebSocket hub for agent fan-out + browser fan-out      │
+│   * SQLite store (modernc.org/sqlite, pure Go)             │
+│   * AEAD encryption helpers                                │
+│   * Alert engine + notification hub                        │
+└────────────┬───────────────────────────────────┬───────────┘
+             │ outbound WS only                   │ HTTP(S)
+             │                                    │
+┌────────────▼─────────────┐         ┌────────────▼─────────────┐
+│  Agent (per host)        │         │  Browser (operator)      │
+│   * coder/websocket      │         │   * htmx + a tiny bit    │
+│   * cron for schedules   │         │     of vanilla JS for    │
+│   * restic wrapper       │         │     live job updates     │
+│   * sysinfo collector    │         └──────────────────────────┘
+└────────────┬─────────────┘
+             │ subprocess: restic ...
+             │
+┌────────────▼─────────────────────────────────────────────────┐
+│  restic repository (rest-server, S3, B2, SFTP, local …)      │
+│  Backup data flows directly here. Server never touches it.   │
+└──────────────────────────────────────────────────────────────┘
+```
+
+## Why outbound-only WebSockets?
+
+The agent dials the server on `/ws/agent` with a bearer token. The
+server doesn't initiate connections to the agent. Three reasons:
+
+1. **Firewall friendliness.** Nothing on the endpoint needs an
+   inbound port; this works behind the typical "branch office NAT"
+   without router config.
+2. **Single auth point.** The bearer token is the only credential
+   that crosses the boundary; the agent never accepts an
+   incoming socket.
+3. **Reconnect semantics are simpler.** When the connection drops
+   (NAT timeout, server restart, transient network glitch) the
+   agent backs off and re-dials; the server marks the host
+   offline after 90s and lets the alert engine raise a stale-host
+   alert.
+
+## Why SQLite?
+
+SQLite suits the project because high availability is an explicit
+non-goal. A small control plane managing twelve endpoints does not
+need replication or a separate database tier. SQLite gives us:
+
+- A single file to back up (plus the secret key).
+- Hand-rolled migrations under `internal/store/migrations/` —
+  no migration framework lock-in.
+- `WAL` mode plus per-connection foreign-key enforcement.
+
+The migrations define the entire schema; there's no ORM or
+query-builder layer between Go code and SQL.
+
+## Why the agent runs `restic` itself, not via the server
+
+The control plane never holds backup bytes in flight. That's
+deliberate:
+
+- A compromised control plane cannot exfiltrate snapshot
+  contents in-band — at worst it can dispatch new backup or
+  forget jobs (audit-logged) but the data path is between the
+  agent and the repository.
+- The same agent process can target whichever transport restic
+  natively supports (rest-server, S3, B2, SFTP, local), no
+  separate mux on the server side.
+
+## Job lifecycle
+
+```
+            ┌──────────────────────┐
+operator →  │ POST /hosts/{id}/    │
+            │       run-backup     │
+            └──────────┬───────────┘
+                       │   1. INSERT INTO jobs (status='queued')
+                       │   2. dispatch command.run over WS
+                       ▼
+            ┌──────────────────────┐
+            │ Agent dispatches     │
+            │ restic subprocess    │
+            └──────────┬───────────┘
+                       │
+                       │   3. job.started   ───▶ store.MarkJobStarted
+                       │   4. job.progress  ───▶ JobHub broadcast (live UI)
+                       │   5. log.stream    ───▶ append to job_logs
+                       │   6. job.finished  ───▶ store.MarkJobFinished
+                       │                          + alert engine eval
+                       │                          + (P6) metrics histogram
+                       ▼
+                  terminal: succeeded | failed | cancelled
+```
+
+Operators see live updates because the browser subscribes to
+`/api/jobs/{id}/stream`, and the WS handler broadcasts each
+agent-emitted envelope to all live subscribers in addition to
+persisting it.
+
+## What scheduling looks like
+
+- The agent runs a local `robfig/cron/v3` instance.
+- The server pushes the desired schedule set to the agent on
+  hello + after every CRUD change.
+- When the agent's cron fires, it sends `schedule.fire` to the
+  server. The server creates a job row, sends `command.run` back,
+  and the agent dispatches a normal backup.
+- If the WS drops between fire and run, the server queues the
+  schedule firing into `pending_runs` and drains on agent
+  reconnect — no missed scheduled backups due to network blips.
+
+For everything that isn't a backup (forget, prune, check), the
+server runs a 60-second maintenance ticker against
+`host_repo_maintenance` rows and dispatches the relevant command
+when a cadence is due. The agent's local cron only handles
+backups.
@@ -0,0 +1,98 @@
+# Credentials and how they flow
+
+restic-manager handles three credential surfaces:
+
+1. **Operator credentials** — the username + password (or OIDC
+   identity) that logs into the UI.
+2. **Agent bearer tokens** — issued at enrolment, used by the
+   agent to authenticate its WebSocket to the server.
+3. **Repo credentials** — the rest-server / S3 / B2 / SFTP
+   credentials the agent passes to `restic` itself.
+
+Each has a different threat model and storage strategy.
+
+## Operator credentials
+
+- Local users are stored in `users` with a bcrypt password hash.
+- Sessions are random tokens minted at login, stored hashed in
+  the `sessions` table, expired after 24h. Cookie is HttpOnly,
+  SameSite=Lax, and Secure (when `RM_COOKIE_SECURE=true`,
+  default).
+- OIDC users carry `auth_source='oidc'` and an `oidc_subject`
+  pinning their IdP identity. Local password login is rejected
+  for OIDC users.
+- Disabling a user soft-deletes them via `disabled_at` —
+  pre-existing sessions are invalidated on the next request.
+
+## Agent bearer tokens
+
+- Minted at enrolment, hashed at rest with `auth.HashToken`.
+- The plaintext token only exists in memory at enrolment time
+  and on the agent's filesystem (`/etc/restic-manager/agent.yaml`,
+  mode `0600`, owned by the service user).
+- Compromise of the server DB leaks the hashes, which is enough
+  to *log in as that agent* until you revoke. Compromise of the
+  agent host leaks the plaintext (via the config file) — same
+  end result.
+- Rotation: re-enrol the host. Today there's no in-place rotate;
+  the operator deletes the host (which cascades, including
+  revoking the bearer hash) and re-runs the install command.
+
+## Repo credentials
+
+This is the credential that ultimately matters for backup
+integrity. restic-manager keeps two slots per host:
+
+- **The everyday credential** (`host_credentials.kind = ''`).
+  Append-only-friendly: this is the one your backup schedule
+  uses. It can write but not delete or forget.
+- **The admin credential** (`host_credentials.kind = 'admin'`).
+  Has full delete rights. Only pushed to the agent transiently
+  while a `prune` or `forget` job is dispatching, and discarded
+  by the agent after the job ends.
+
+### Encryption flow
+
+1. Operator types the credential into the UI or the install form.
+2. Server AEAD-encrypts the cred (`crypto.AEAD.Encrypt`) using the
+   key in `RM_SECRET_KEY_FILE`. The plaintext is dropped from
+   memory.
+3. Encrypted blob is stored in `host_credentials.cred_blob`.
+4. When the agent connects, the server decrypts the blob and
+   sends the **plaintext** down the WebSocket inside a
+   `config.update` envelope.
+5. The agent stores the plaintext in its in-memory secrets store
+   for the lifetime of the process; it's reloaded fresh on every
+   server-side push.
+6. When a job runs, the agent merges the credential into the
+   restic environment (`restic.Env.RepoURL` stays bare; the
+   `user:pass@…` form is built only inside `envSlice()` at the
+   moment of `exec.Command`).
+
+The merged form is **never logged**. Any URL that reaches the
+structured (`slog`) log output is first passed through
+`restic.RedactURL()`.
+
+### Why push plaintext over the wire?
+
+The transport itself is the trust boundary: the WebSocket runs
+inside the same TLS-terminated reverse-proxy connection your
+browser uses, and the agent has already authenticated with its
+bearer token. Re-encrypting the payload on top of that would just
+move the key-management problem somewhere else.
+
+If your reverse proxy isn't terminating TLS, the deployment is
+already broken — see [Hardening](../security/hardening.md).
+
+## Setup tokens (admin-driven)
+
+When an admin creates a new user, the server mints a one-time
+setup link valid for 1 hour. The hash is stored; the raw token
+is shown to the admin once. The user opens the link, sets a
+password, and is dropped into a session. Expired tokens are
+swept on the alert engine's 60s tick.
+
+Same pattern for enrolment tokens: the raw token only exists in
+memory at mint time, and the install snippet is the operator's
+only chance to capture it. If you lose it, regenerate via the
+**Add host** page (NS-02).
@@ -0,0 +1,85 @@
+# Repo maintenance
+
+Backups go in; without maintenance, repos grow forever and
+eventually fall over. restic-manager runs three maintenance
+operations on a per-host cadence:
+
+| Command  | What it does                                                | Default cadence |
+|----------|-------------------------------------------------------------|-----------------|
+| `forget` | Marks snapshots eligible for removal per the retention policy attached to each source group. Cheap, but like `prune` it needs the **admin** credential — the everyday append-only credential cannot forget. | Daily after the last backup of the day |
+| `prune`  | Reclaims space from the repo. Requires the **admin** credential (write+delete). | Weekly, off-peak |
+| `check`  | Verifies repo integrity. Sub-options surface lock state. | Weekly, with `--read-data-subset N%` to sample pack files |
+
+A per-host row in the `host_repo_maintenance` table holds the
+cron expressions and last-fire anchors. The maintenance ticker on
+the server runs every 60s, finds hosts whose next-fire is due,
+and dispatches the right command. The agent's local cron is
+**only** for backups.
+
+## Why server-side and not agent-side?
+
+The agent's cron knows about backups because backups are
+per-source-group. Maintenance is per-repo, not per-source-group,
+so doing it server-side keeps the per-host wiring simple:
+
+- One ticker, not N agent crons to keep in sync.
+- Cancelling a maintenance dispatch is just "don't dispatch the
+  next one" — no agent-side state to clean up.
+- Skipping offline hosts is trivial (no queue; only scheduled
+  *backups* queue into `pending_runs`).
+
+## Forget and the multi-group payload
+
+A single `forget` job can target several source groups at once.
+The wire envelope (`ForgetGroups`) carries one entry per group,
+each with its retention policy. The agent runs N
+`restic forget --tag <name> --keep-...` invocations in sequence,
+streams their output, and reports a single terminal status.
+
+## Prune and the admin credential
+
+Prune mutates the repo. The everyday append-only credential
+**cannot** prune — that's the whole point of append-only.
+restic-manager keeps a second slot per host (`kind = 'admin'`)
+for the credential that can.
+
+When a prune is dispatched (cadence-driven or operator-driven):
+
+1. Server pushes the admin credential to the agent in a fresh
+   `config.update`.
+2. Agent runs `restic prune` with the merged credential.
+3. Job finishes; agent discards the admin credential from its
+   in-memory secrets store.
+
+The server never logs the merged URL (see
+[Credentials](./credentials.md)).
+
+## Check and lock state
+
+`restic check` warns about stale locks when it finds them. The
+agent ships every check's output back as a `repo.stats` envelope
+and a stream of log lines; if a stale lock is detected, the
+**Repo** page surfaces a banner with an **Unlock** button. The
+operator-only `unlock` command runs `restic unlock` and clears
+the banner.
+
+`unlock` has no cadence — it's a manual action, never automatic.
+Auto-unlocking would mask the cause (probably a previously
+crashed long-running operation) and risk corrupting an
+operation the operator has merely lost track of.
+
+## Repo stats
+
+After every backup, check, prune, and unlock, the agent runs
+`restic stats --json --mode raw-data` and ships the result as a
+`repo.stats` envelope. The server stores this in
+`host_repo_stats` (latest only) and `host_repo_stats_history`
+(one row per host per day, last-write-wins per column — a
+prune-only patch never nulls a backup-time size).
+
+The host detail page surfaces:
+
+- Total size + raw size in the vitals strip.
+- Last-check timestamp + colour-coded status.
+- Last-prune timestamp.
+- 30/90-day repo size trend chart.
@@ -0,0 +1,105 @@
+# Schedules and source groups
+
+Two related but separable ideas:
+
+- A **source group** is a named bundle of "what to back up":
+  include paths, exclude patterns, retention policy, retry
+  configuration, optional pre/post hooks. The group's name is
+  used as the restic snapshot tag, so retention can target it
+  with `restic forget --tag <name>`.
+- A **schedule** is a cron expression that, when it fires,
+  triggers a backup of one or more source groups on a host.
+
+Decoupling them means you can have one schedule covering several
+groups (e.g. `0 1 * * *` running both `system` and `data`), and
+each group has its own retention without duplicating policy
+across schedules.
+
+## Source group anatomy
+
+```yaml
+name: data
+includes:
+  - /var/lib/postgresql
+  - /home
+excludes:
+  - /home/*/.cache
+  - /home/*/Downloads
+retention:
+  keep_last: 7
+  keep_daily: 14
+  keep_weekly: 4
+  keep_monthly: 6
+retry_max: 3
+retry_backoff_seconds: 600
+pre_hook: |
+  pg_dump -U postgres -F c -f /var/lib/postgresql/dumps/all.dump
+post_hook: |
+  rm -f /var/lib/postgresql/dumps/all.dump
+```
+
+### Conflict detection
+
+If your retention policy says `keep_hourly: 24` but no schedule
+points at this group sub-daily, the UI surfaces a
+**conflict-dimension banner** ("`hourly` won't be honoured —
+no schedule fires more often than once a day"). The flag is
+stored on the source group (`conflict_dimension`) and refreshed
+whenever a schedule or group changes.
+
+### Hooks
+
+`pre_hook` and `post_hook` run on the agent host inside
+`/bin/sh -c` (`cmd.exe /C` on Windows). Output is streamed back
+to the live job log as `hook(<phase>): …` lines.
+
+- A non-zero `pre_hook` exit aborts the backup.
+- `post_hook` always runs, with `RM_JOB_STATUS=succeeded|failed`
+  in the environment. Use this for cleanup that must happen
+  whether the backup worked or not.
+- Hooks only run for `kind=backup` jobs. They do not run for
+  `forget`, `prune`, `check`, etc.
+- Hook bodies are AEAD-encrypted at rest (encryption happens at
+  the HTTP layer); the agent receives plaintext over the WS channel.
+
+A "host default" pair of hooks lives on the host itself; a
+source group's own hooks override them when set.
+
+## Schedule anatomy
+
+```yaml
+cron: "0 2 * * *"
+enabled: true
+source_group_ids:
+  - <gid for "data">
+  - <gid for "system">
+```
+
+Slim by design: a schedule says **when** and **which groups**.
+Everything else (paths, retention, hooks) lives on the groups.
+
+The agent's local cron fires the schedule. If the WebSocket is
+down at fire time, the server queues the firing into
+`pending_runs` and drains it on the next agent reconnect — a
+short network blip won't lose the backup.
+
+### Last / next run
+
+The schedules tab shows "next" (computed by parsing the cron
+expression with `robfig/cron/v3`) and "last" (the latest
+`actor_kind=schedule` job in the `jobs` table) for every
+schedule. The dashboard host row also surfaces `next 12h ago/from
+now` when a single covering schedule is the run-now candidate.
+
+## Bandwidth limits
+
+Two places set restic's `--limit-upload` / `--limit-download`:
+
+1. **Host-wide caps** on the host row (`bandwidth_up_kbps`,
+   `bandwidth_down_kbps`). Pushed to the agent on hello and
+   after `PUT /api/hosts/{id}/bandwidth`. Apply to every restic
+   invocation on the host.
+2. **Per-job overrides** on the per-source-group Run-now form.
+   Win over host caps for the lifetime of that one job.
+
+If neither is set, restic runs unthrottled.
@@ -0,0 +1,17 @@
+# Contributing
+
+Full contributor guide:
+[`CONTRIBUTING.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/CONTRIBUTING.md)
+in the repository root.
+
+The short version:
+
+- Open an issue first for non-trivial changes; the design is
+  still moving and unsolicited large PRs may conflict with
+  in-flight work.
+- `make lint test` must pass.
+- One logical change per commit, no `Co-Authored-By` trailers.
+- UK English in identifiers and comments; comments explain the
+  **why** not the **what**.
+
+Code of conduct: [`CODE_OF_CONDUCT.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/CODE_OF_CONDUCT.md).
@@ -0,0 +1,113 @@
+# Enrolling your first host
+
+The control plane only knows about hosts you've explicitly
+enrolled. Two paths exist:
+
+1. **Token-based enrolment** — admin generates a token, pastes it
+   into an install command on the host. The host appears immediately,
+   already mapped to the desired repo.
+2. **Announce-and-approve** — the agent runs without a token,
+   "announces" itself to the server, and a human in the UI accepts
+   the announcement.
+
+Token-based is the default and what most operators want; the
+announce flow exists for the case where you can't easily paste a
+secret onto the host (auto-imaged endpoints, scripted bring-ups
+from a config repo).
+
+## Token-based enrolment
+
+### From the UI
+
+1. Click **+ Add host** on the dashboard.
+2. Fill in the hostname, the restic repo URL, and the repo
+   credentials. The credentials are AEAD-encrypted at the server
+   immediately; what you paste is what the agent receives.
+3. Optionally pick the initial source paths — these become the
+   first source group on the host.
+4. Submit. The server mints a one-time token and shows you a
+   copy-pasteable install snippet.
+
+### On the host (Linux)
+
+```sh
+curl -fsSL https://restic.example.com/install/install.sh | \
+    sudo RM_SERVER=https://restic.example.com \
+         RM_ENROL_TOKEN=<token> \
+         bash
+```
+
+The script:
+
+1. Detects architecture (`amd64` or `arm64`).
+2. Downloads the agent binary from `/agent/binary?os=…&arch=…`.
+3. Drops the systemd unit at
+   `/etc/systemd/system/restic-manager-agent.service`.
+4. Runs the agent in `-enrol` mode, which posts the token and
+   stores the persistent bearer it gets back.
+5. Enables and starts the unit.
+
+Within seconds the host should appear on the dashboard as
+**online**.
+
+### On the host (Windows)
+
+```pwsh
+$env:RM_SERVER  = "https://restic.example.com"
+$env:RM_ENROL_TOKEN = "<token>"
+iwr -useb $env:RM_SERVER/install/install.ps1 | iex
+```
+
+Equivalent shape: registers a Windows service via the SCM
+(see P2-16 for details), runs `-enrol`, starts the service.
+
+## Recovering a lost token
+
+Tokens are single-use and short-lived (1h). If you closed the tab
+before pasting the install command, head to the **Add host** page —
+outstanding tokens are listed there with a **Regenerate** button.
+Regenerating revokes the old token's hash and mints a fresh raw
+token while preserving the original repo credentials and initial
+paths. (NS-02 in `tasks.md` if you want the design rationale.)
+
+## Announce-and-approve
+
+If the host can reach the server but you don't want to paste a
+secret on it, run the agent in `-announce` mode:
+
+```sh
+restic-manager-agent -announce \
+                     -server https://restic.example.com \
+                     -hostname myhost
+```
+
+The host appears in the **Pending hosts** panel on the dashboard
+with its hostname, OS, arch, and the source IP that announced it.
+Click **Accept**, fill in the repo URL + credentials, and the
+server pushes the bearer over the still-open WebSocket. No
+back-and-forth round trip.
+
+If you don't accept within an hour the announcement is swept.
+
+## What happens on the agent
+
+After enrolment, the agent:
+
+1. Connects via WebSocket to `/ws/agent` with its bearer token.
+2. Sends a `hello` envelope with its OS, arch, agent version,
+   restic version, and protocol version.
+3. Receives a `config.update` carrying its repo credentials
+   (decrypted by the server and sent as plaintext inside the
+   TLS-protected WebSocket) and any source-group paths.
+4. Sits idle, sending a heartbeat every 30s. Operator-driven
+   "Run now" actions arrive as `command.run` envelopes; scheduled
+   jobs are driven by the agent's local cron.
+
+## Auto-init of the repository
+
+The first time a backup runs, the agent invokes `restic init`
+against the repo you configured at enrolment. If the repo already
+exists (`config file already exists`) the agent treats it as a
+success and proceeds. The host's repo status (`unknown` →
+`ready` / `init_failed`) is surfaced under the vitals strip on
+the host detail page; if init fails, save fresh credentials in
+the **Repo** tab to retry.
@@ -0,0 +1,92 @@
+# Installing the server
+
+The reference deployment is a single Docker container fronted by
+your existing reverse proxy. The image bundles the server binary,
+the cross-compiled agent binaries, and the install scripts.
+
+## Prerequisites
+
+- A Linux host with Docker and Docker Compose.
+- A reverse proxy in front (Caddy, nginx, Traefik) terminating
+  TLS on a public hostname. The server itself is HTTP-only by
+  design — see [Reverse proxy](./reverse-proxy.md) for why.
+- A persistent volume for the server's data directory.
+
+## Quick start
+
+The reference compose file lives at
+[`deploy/docker-compose.yml`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/deploy/docker-compose.yml):
+
+```yaml
+services:
+  restic-manager:
+    image: gitea.dcglab.co.uk/steve/restic-manager:${RM_VERSION:-latest}
+    restart: unless-stopped
+    environment:
+      RM_LISTEN: ":8080"
+      RM_DATA_DIR: "/data"
+      RM_BASE_URL: "https://restic.example.com"
+      # Trust your reverse proxy's CIDR so X-Forwarded-* are honoured.
+      RM_TRUSTED_PROXY: "10.0.0.0/8"
+    volumes:
+      - rm-data:/data
+    ports:
+      # Bind localhost only — your reverse proxy is the public face.
+      - "127.0.0.1:8080:8080"
+
+volumes:
+  rm-data:
+```
+
+Bring it up:
+
+```sh
+docker compose up -d
+docker compose logs -f restic-manager
+```
+
+The first run prints a one-time **bootstrap token** to the log. Use
+it within an hour or it expires; if you miss the window, the
+container prints it again on the next start as long as no admin
+user exists.
+
+## First-run admin setup
+
+Open `https://restic.example.com/bootstrap` (or whatever your
+public URL is). Paste the bootstrap token, pick a username and a
+password (≥ 12 characters), and submit. You'll land in the
+dashboard logged in as the new admin.
+
+If you'd rather curl it, the equivalent is:
+
+```sh
+curl -X POST https://restic.example.com/api/bootstrap \
+     -H 'Content-Type: application/json' \
+     -d '{"token":"<token-from-log>","username":"admin","password":"<≥12 chars>"}'
+```
+
+## Backing up the secret key
+
+Inside the data volume, `secret.key` holds the AEAD key used to
+encrypt every credential at rest. **Back it up separately from
+the database.** Without it, encrypted credentials in the database
+are unrecoverable; you'd have to re-enrol every host.
+
+A simple working approach: copy `secret.key` to your password
+manager or to a separately-backed-up secrets vault the day you
+install. It doesn't change.
+
+## Updating the server
+
+```sh
+# Pin a new version in your compose file (.env or docker-compose.yml),
+# then:
+docker compose pull
+docker compose up -d
+```
+
+Migrations run automatically on startup; the server will refuse to
+start if a migration fails (better to bail than to half-migrate).
+
+For the agent self-update story, see
+[Updating agents](../operations/updates.md).
@@ -0,0 +1,95 @@
+# Running behind a reverse proxy
+
+The restic-manager server is HTTP-only by design. TLS termination,
+public hostname, ACME, HSTS, and edge-level rate limiting all
+belong to a reverse proxy you already operate outside this project.
+
+## What the proxy must forward
+
+The server reads four headers when (and only when) the immediate
+peer matches `RM_TRUSTED_PROXY`:
+
+| Header                 | Value                                              | Why |
+|------------------------|----------------------------------------------------|-----|
+| `X-Forwarded-For`      | The original client IP                             | Rate-limit keys, audit log entries, OIDC redirect-URI checks. |
+| `X-Forwarded-Proto`    | `https`                                            | Used for absolute URLs (e.g. OIDC redirect URIs). |
+| `Host`                 | The public hostname clients use                    | Cookies are scoped to this; `RM_BASE_URL` must match. |
+| `Connection` / `Upgrade` | Pass through unchanged                           | `/ws/agent` and `/api/jobs/{id}/stream` are WebSockets; without `Upgrade: websocket` they fail. |
+
+Set `RM_TRUSTED_PROXY` to the CIDR (or comma-separated list of
+CIDRs) the proxy connects from. Anything outside that range has
+its `X-Forwarded-*` headers ignored, so a stray request that
+bypasses the proxy can't spoof the client IP.
+
+## Caddy
+
+```caddyfile
+restic.example.com {
+    encode zstd gzip
+    reverse_proxy 127.0.0.1:8080 {
+        header_up X-Real-IP {remote_host}
+    }
+}
+```
+
+Caddy adds `X-Forwarded-For` / `X-Forwarded-Proto` automatically
+and passes WebSocket headers through by default, so this is the
+whole config.
+
+## nginx
+
+```nginx
+server {
+    listen 443 ssl http2;
+    server_name restic.example.com;
+
+    ssl_certificate     /etc/letsencrypt/live/restic.example.com/fullchain.pem;
+    ssl_certificate_key /etc/letsencrypt/live/restic.example.com/privkey.pem;
+
+    location / {
+        proxy_pass         http://127.0.0.1:8080;
+        proxy_http_version 1.1;
+        proxy_set_header   Host              $host;
+        proxy_set_header   X-Forwarded-For   $proxy_add_x_forwarded_for;
+        proxy_set_header   X-Forwarded-Proto https;
+
+        # WebSocket upgrade
+        proxy_set_header   Upgrade           $http_upgrade;
+        proxy_set_header   Connection        "upgrade";
+
+        # Long-lived agent WS — raise the read timeout to 24h for this surface.
+        proxy_read_timeout 86400s;
+    }
+}
+```
+
+## Traefik
+
+```yaml
+http:
+  routers:
+    restic-manager:
+      rule: "Host(`restic.example.com`)"
+      entryPoints: [websecure]
+      tls:
+        certResolver: letsencrypt
+      service: restic-manager
+
+  services:
+    restic-manager:
+      loadBalancer:
+        servers:
+          - url: "http://restic-manager:8080"
+        passHostHeader: true
+```
+
+Traefik forwards WebSocket upgrades and the standard
+`X-Forwarded-*` set out of the box.
+
+## Verification
+
+After bringing the proxy up, the audit log should show your real
+client IP for an interactive login (not the proxy's local
+address). If you see `127.0.0.1` or the proxy's container IP, your
+`RM_TRUSTED_PROXY` is wrong or `X-Forwarded-For` isn't being
+forwarded.
@@ -0,0 +1,86 @@
+# restic-manager
+
+restic-manager is a self-hosted, browser-based, single-pane-of-glass
+for managing [restic](https://restic.net) backups across a fleet of
+Linux and Windows endpoints. It's designed for **small fleets** —
+the original target was twelve endpoints — and **one operator**.
+
+## What it does
+
+- Centralised view of every endpoint's last backup, repo size,
+  snapshot count, and recent jobs.
+- Trigger any restic operation remotely (`backup`, `forget`, `prune`,
+  `check`, `unlock`, `snapshots`, `stats`, `diff`, `restore`).
+- Per-host backup schedules with source groups (named bundles of
+  paths + retention policy).
+- Live job log streamed to the browser; downloadable as text or NDJSON.
+- Restore wizard with snapshot tree browse + path selection.
+- Repo-level health surfacing (size, raw size, last-check, lock
+  state) plus a 30/90-day size trend.
+- Alerting over webhook, ntfy, or SMTP.
+- Cross-platform agent (Linux + Windows).
+- Append-only-credential-friendly with a separate admin credential
+  for forget/prune.
+
+## What it isn't
+
+- **Not a SaaS.** Single-instance, single-tenant, by design.
+- **Not a replacement for restic** — it's a control plane. The agent
+  shells out to a real `restic` binary.
+- **Not highly available.** SQLite, single process; if you need
+  HA backups, you're shopping in the wrong aisle.
+- **Not a multi-protocol backup tool.** restic only.
+
+## How it fits together
+
+```
+┌──────────────────────────────────────────────┐
+│  Server (control plane, Docker)              │
+│   - REST + WebSocket API                     │
+│   - SQLite store                             │
+│   - Embedded HTMX UI                         │
+└──────────┬─────────────────────────┬─────────┘
+           │ outbound WS              │ HTTP(S)
+           │                          │
+┌──────────▼──────────┐    ┌──────────▼─────────┐
+│  Agent (per host)   │    │  Browser (operator) │
+│   - restic wrapper  │    └─────────────────────┘
+│   - cron for sched. │
+└──────────┬──────────┘
+           │ restic
+┌──────────▼──────────────────────────────────┐
+│  rest-server / S3 / SFTP / local repo       │
+│  (the actual backup data — server never     │
+│   touches it)                               │
+└─────────────────────────────────────────────┘
+```
+
+The control plane is a Go binary that runs in Docker. Each endpoint
+runs a small Go agent that holds an outbound WebSocket to the
+control plane. Backup data flows directly between the agent and the
+restic repository — the control plane never sees a snapshot byte.
+
+## Where to start
+
+- [Installing the server](./getting-started/install.md) walks
+  through the Docker-based reference deployment.
+- [Enrolling your first host](./getting-started/enrolling-hosts.md)
+  covers the install scripts and the announce-and-approve flow.
+- [Architecture](./concepts/architecture.md) is the right read if
+  you want to know why something is the way it is before running
+  the install.
+
+## Project status
+
+Pre-1.0 but feature-complete for the original use case. Phases
+0–4 are landed (MVP, scheduling, restore, RBAC + OIDC); Phase 5
+(this docs site, contributor onboarding, end-to-end CI) is in
+flight. See [`tasks.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/tasks.md)
+for the live roadmap and [`spec.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/spec.md)
+for the canonical design doc.
+
+## License
+
+[PolyForm Noncommercial 1.0.0](https://polyformproject.org/licenses/noncommercial/1.0.0/).
+Personal and community deployments welcome; commercial use
+requires a separate license.
@@ -0,0 +1,39 @@
+# License
+
+restic-manager is licensed under
+[**PolyForm Noncommercial 1.0.0**](https://polyformproject.org/licenses/noncommercial/1.0.0/).
+The full text lives at
+[`LICENSE`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/LICENSE)
+in the repository root.
+
+## What this means
+
+- **Personal, hobbyist, educational, charitable, and similar
+  noncommercial use** is fully permitted, including modification
+  and redistribution.
+- **Commercial use is not permitted** without a separate
+  license. The maintainer is not currently offering one — if
+  you need commercial rights, open an issue to start the
+  conversation.
+- The license is permissive about everything except commercial
+  use: you can fork, modify, deploy in your home/lab, and
+  contribute back.
+
+## Why this license
+
+The PolyForm Noncommercial license was chosen because:
+
+- It's a real, legal, plainly-worded license (not a custom
+  half-written variant).
+- It permits the realistic uses for a hobby project (the
+  maintainer's homelab, a friend's fleet, a charity's IT
+  closet) without inviting commercial vendors to repackage
+  the work.
+- It's compatible with the project staying small and
+  maintainable — the maintainer doesn't want to be on the hook
+  for SLA-grade commercial support.
+
+## Contributions
+
+By contributing, you agree your contributions are licensed
+under the same PolyForm Noncommercial 1.0.0 license.
@@ -0,0 +1,73 @@
+# Alerts and notifications
+
+restic-manager raises alerts on conditions that need human
+attention. The alert engine evaluates rules on a 60s tick and
+on every job-finished / host-online event.
+
+## Built-in alert kinds
+
+| Kind                | Trigger | Severity |
+|---------------------|---------|----------|
+| `backup_failed`     | A backup job ends in `failed` or `cancelled` | warning |
+| `forget_failed`     | A forget job ends in `failed` | warning |
+| `prune_failed`      | A prune job ends in `failed` | critical |
+| `check_failed`      | A check job ends in `failed` | critical |
+| `agent_offline`     | A host has been offline more than 90s past its heartbeat cadence | warning |
+| `stale_schedule`    | A schedule's "last run" is more than 1.5 × its interval ago | warning |
+| `update_failed`     | An agent self-update reported a failure, or the agent didn't reconnect within 90s | warning |
+| `fleet_update_halted`| The rolling fleet-update worker stopped on a failure | critical |
+
+Each alert has a `dedup_key` so re-firing the same condition
+just bumps `last_seen_at` — the operator gets one row per
+condition, not a thousand.
+
+## Lifecycle
+
+```
+raised  ──acknowledge──▶  acknowledged  ──resolve──▶  resolved
+   │                          │
+   └────────auto-resolve──────┘
+   (e.g. agent_offline auto-resolves on agent_online)
+```
+
+- **Acknowledge** says "I've seen this, stop notifying about it".
+- **Resolve** says "the underlying condition is gone".
+- Some alerts auto-resolve when the condition clears
+  (`agent_offline` is the canonical example).
+
+## Notification channels
+
+Configure under **Settings → Notifications**. Each channel can
+subscribe to all alerts or filter by severity.
+
+### Webhook
+
+Posts a JSON envelope to a URL of your choice. Useful for
+piping into Slack via an Incoming Webhook URL or into your own
+alerting tooling.
+
+### ntfy
+
+Pushes a plain-text alert to an [ntfy.sh](https://ntfy.sh/)
+topic. Configure the topic URL; optional bearer token if you
+self-host with auth.
+
+### SMTP
+
+Plain SMTP (with optional TLS). Configure host, port,
+username, password, and the recipient list.
+
+## Test fire
+
+Each channel exposes a **Test fire** button that dispatches a
+single synthetic alert through the channel without touching the
+alert engine. Use this when you've added a channel and want to
+verify connectivity before the next real failure happens.
+
+## What gets logged
+
+Every alert raise / acknowledge / resolve writes an audit log
+entry. The audit log UI at **Settings → Audit log** filters by
+user, action, target, and time range — useful for the
+post-incident "who clicked acknowledge on the prune-failure
+alert" question.
@@ -0,0 +1,73 @@
+# Backups and restores
+
+## Running a backup
+
+Three ways to trigger one:
+
+1. **Scheduled** — the agent's local cron fires at the time set
+   on the schedule.
+2. **Run-now** — operator clicks **Run now** on the host detail
+   right rail. Posts to `/hosts/{id}/run-backup` (defaults to all
+   source groups) or to a per-group form for finer control.
+3. **API** — `POST /api/hosts/{id}/jobs` with the appropriate
+   payload. Same audit + dispatch path.
+
+In every case the server creates a `jobs` row, broadcasts a
+`command.run` to the host, and lands the operator on the live
+job log page (HTMX `HX-Redirect`).
+
+## Cancelling a job
+
+Any running job — backup, forget, prune, restore, anything —
+exposes a **Cancel** button on its detail page. The server
+broadcasts `command.cancel`, and the agent kills the running
+restic subprocess via context cancel: SIGTERM first, SIGKILL
+after a 5s grace (`cmd.Cancel` + `cmd.WaitDelay`). On Windows the
+SIGTERM step is replaced with `os.Kill` because Windows can't
+deliver SIGTERM. Result: a cancelled job typically lands as
+`cancelled` within a couple of hundred milliseconds; the 5s grace
+before the hard kill is the worst case.
+
+## Restore wizard
+
+Restoring a file or path goes through a four-step wizard at
+`/hosts/{id}/restore`:
+
+1. **Pick a snapshot.** Search by id or by date; the page is
+   pre-populated when you launched the wizard from a snapshot row.
+2. **Browse the snapshot tree.** Lazy-loaded children via the
+   `MsgTreeList` synchronous WS RPC; results are cached
+   per-wizard-session for 30 minutes. Pick the absolute paths
+   you want.
+3. **Choose a target.** Either **In place** (overwrites the
+   live filesystem; requires you to type the hostname to
+   confirm) or **New directory** (default
+   `$HOME/rm-restore/<job-id>/`; agent expands `$HOME` /
+   `${HOME}` / `~/` and creates the directory chain).
+4. **Review and submit.** Server mints a job, dispatches
+   `command.run` with a `RestorePayload`, and `HX-Redirect`s to
+   the live job log.
+
+`--no-ownership` is gated on restic ≥ 0.17 (the flag was added
+in that release). Hosts running 0.16 don't get the flag and
+restore as the running user instead.
+
+## Snapshot diff
+
+Enter two snapshot ids in the **Diff** form on the host detail page
+to start a `JobDiff` job that runs `restic diff <a> <b>`. Output streams
+to the standard live job log. Useful when investigating a
+suspiciously-sized backup.
+
+## Job log artefacts
+
+Every job's log is persisted in `job_logs` (one row per line),
+not just streamed in-memory. That gives you:
+
+- A live view at `/jobs/{id}` while the job runs.
+- Two download formats from the same page header dropdown:
+  - **txt** — one line per row, `HH:MM:SS.mmm  TAG  payload`.
+  - **ndjson** — one self-contained JSON object per line
+    (`{seq, ts, stream, payload}`), perfect for `jq`.
+
+Downloads work whether the job is running or finished —
+the source is the DB, not the live socket.
@@ -0,0 +1,61 @@
+# Observability with Prometheus
+
+restic-manager can expose a Prometheus scrape endpoint at
+`GET /metrics`. The endpoint is **opt-in** — without an explicit
+auth gate it isn't even mounted, so a forgotten config can't
+accidentally publish fleet state.
+
+The full reference lives at
+[`docs/prometheus.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/docs/prometheus.md);
+the short version follows.
+
+## Enable the endpoint
+
+Set at least one of:
+
+- `RM_METRICS_TOKEN` — `Authorization: Bearer <token>` required.
+- `RM_METRICS_TRUSTED_CIDR` — restricts source IPs (comma-separated CIDR list).
+
+When both are set, a scrape must pass both gates. The token is
+compared in constant time; the CIDR check honours `X-Forwarded-For`
+only when the immediate hop matches `RM_TRUSTED_PROXY`.
+
+## Metrics emitted
+
+- **Server gauges**: `rm_hosts_total`, `rm_hosts_online`,
+  `rm_active_alerts{severity}`, `rm_build_info{...}`.
+- **Per-host gauges**: `rm_host_agent_online`,
+  `rm_host_last_backup_timestamp_seconds`,
+  `rm_host_last_backup_success`, `rm_host_repo_size_bytes`,
+  `rm_host_snapshot_count`, `rm_host_open_alerts`,
+  `rm_host_repo_status`.
+- **Histogram**:
+  `rm_job_duration_seconds{kind,status,le=…}` (buckets
+  `1, 5, 30, 60, 300, 1800, 3600, 21600, 86400, +Inf`).
+
+In-memory histogram only. Prometheus persists the scrapes; if
+you need durable history at hourly resolution that's
+Prometheus's job.
+
+## Sample Grafana dashboard
+
+[`deploy/grafana/restic-manager-dashboard.json`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/deploy/grafana/restic-manager-dashboard.json)
+imports through Grafana's **+ → Import → Upload JSON file**.
+Six panels:
+
+1. Fleet status (online / total).
+2. Open alerts by severity.
+3. Backups failing on most-recent run.
+4. Hosts table — last backup, repo size, snapshots, open alerts.
+5. Repo size over time, one line per host.
+6. Job-duration p95 over a 1h window per kind.
+
+## Alerting
+
+restic-manager already has a built-in alert engine
+([Alerts](./alerts.md)). The dashboard intentionally doesn't
+duplicate it as Prometheus alert rules. If you want
+Prometheus-side alerts on top, write your own based on the
+metrics above — `rm_host_last_backup_success == 0`,
+`time() - rm_host_last_backup_timestamp_seconds > <max age>`,
+or whatever suits your environment.
@@ -0,0 +1,50 @@
+# Updating agents
+
+Server updates are a `docker compose pull && docker compose up -d`
+away. Agents update via the control plane.
+
+## Single-host update
+
+Each host's detail page shows an **Update agent** button when
+the agent's reported version is older than the server's. The
+button:
+
+1. Dispatches a `command.update` to that host.
+2. The agent fetches the appropriate binary from
+   `$RM_SERVER/agent/binary?os=…&arch=…` to
+   `<binary-path>.new`.
+3. Copies the running binary to `<binary-path>.old` (one
+   revision back, in case rollback is needed).
+4. Atomic-renames `.new` over the running binary.
+5. Exits cleanly. systemd's `Restart=always` (or Windows SCM)
+   brings the process back on the new binary.
+
+A 90-second timer on the server side waits for a hello at the
+target version and marks the update succeeded — or, if the
+agent doesn't reconnect at the expected version in time, marks
+the update **failed** and raises an `update_failed` alert.
+
+## Fleet update
+
+The admin-only **Settings → Fleet update** page drives a rolling
+update across every host in the fleet:
+
+- One host at a time.
+- Wait for hello-with-target-version (max 95s).
+- On any host failing, **halt** the rollout, raise a
+  `fleet_update_halted` alert, leave the rest of the fleet on
+  the old version. No surprise mass-failures.
+
+You can cancel an in-progress fleet update; the worker stops
+after the current host finishes.
+
+## TLS and corruption
+
+Updates rely on the reverse proxy's TLS to detect corruption in
+transit. There's no separate sha256 verification step — we
+chose the simpler model on the basis that the same TLS already
+gates every other byte the server hands to the agent.
+
+If you'd like a separate signature step before applying updates,
+that's a future-phase enhancement (see `tasks.md` Phase 6
+candidates).
@@ -0,0 +1,58 @@
+# Environment variables
+
+The server reads its configuration from environment variables
+(canonical) with an optional YAML overlay. Env wins over YAML so
+operators can tweak a single setting without rewriting the file.
+
+## Server
+
+| Variable                  | Default                          | Meaning |
+|---------------------------|----------------------------------|---------|
+| `RM_LISTEN`               | `:8080`                          | TCP listener for the HTTP server. |
+| `RM_DATA_DIR`             | `/data`                          | Persistent state directory (SQLite, secret key, agent assets). |
+| `RM_BASE_URL`             | (none)                           | Public URL clients use; required for OIDC redirects + cookie scope. |
+| `RM_SECRET_KEY_FILE`      | `${RM_DATA_DIR}/secret.key`      | Path to the AEAD key file. Auto-generated on first run. |
+| `RM_COOKIE_SECURE`        | `true`                           | Set `false` only for local HTTP testing. Controls `Secure` on session cookies. |
+| `RM_TRUSTED_PROXY`        | (none)                           | Comma-separated CIDRs trusted for `X-Forwarded-*`. |
+| `RM_BUNDLED_ASSETS_DIR`   | `/opt/restic-manager/dist`       | Read-only path with bundled agent binaries + install scripts (the docker image bakes them here). |
+| `RM_METRICS_TOKEN`        | (off)                            | When set, `GET /metrics` requires `Authorization: Bearer <token>`. |
+| `RM_METRICS_TRUSTED_CIDR` | (off)                            | When set, `GET /metrics` restricts source IPs (comma-separated CIDR list). |
+
+OIDC variables (all optional; empty issuer disables OIDC):
+
+| Variable                       | Meaning |
+|--------------------------------|---------|
+| `RM_OIDC_ISSUER`               | OIDC discovery URL (e.g. `https://auth.example.com`). |
+| `RM_OIDC_CLIENT_ID`            | Client ID registered with the IdP. |
+| `RM_OIDC_CLIENT_SECRET`        | Client secret (or use `RM_OIDC_CLIENT_SECRET_FILE`). |
+| `RM_OIDC_CLIENT_SECRET_FILE`   | Path to a file holding the client secret. |
+| `RM_OIDC_DISPLAY_NAME`         | Button label on the login page (e.g. "Authelia"). |
+| `RM_OIDC_ROLE_CLAIM`           | Token claim that carries roles (default `groups`). |
+| `RM_OIDC_ROLE_MAPPING`         | `idp-group=role` entries, comma-separated (e.g. `rm-admin=admin,rm-ops=operator`). |
+| `RM_OIDC_REDIRECT_URL`         | Override for the redirect URL; defaults to `${RM_BASE_URL}/auth/oidc/callback`. |
+
+## Agent
+
+| Variable             | Default | Meaning |
+|----------------------|---------|---------|
+| `RM_AGENT_CONFIG`    | `/etc/restic-manager/agent.yaml` (Linux) | Config file path. |
+
+The agent's other settings live in the YAML file (server URL,
+bearer token, optional cert pin). The install script writes that
+file for you at enrolment.
+
+## Build-time
+
+The Makefile threads `-ldflags` from `git describe` into the
+`internal/version` package so `--version` and the dashboard
+footer show the right values:
+
+```
+-X gitea.dcglab.co.uk/steve/restic-manager/internal/version.Version=$(VERSION)
+-X gitea.dcglab.co.uk/steve/restic-manager/internal/version.Commit=$(COMMIT)
+```
+
+If you build with `go build` directly (no Makefile), `Version`
+falls back to `dev` and the agent-update comparison falls back
+to "always equal". Source-build deployments can still run; they
+just don't participate in the self-update flow.
@@ -0,0 +1,82 @@
+# HTTP endpoints
+
+A non-exhaustive map of the surfaces the control plane exposes.
+All `/api/*` routes return JSON; all other paths render HTML
+(server-rendered with HTMX in the loop).
+
+The canonical wiring lives at
+[`internal/server/http/server.go`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/internal/server/http/server.go);
+when in doubt, read the routes block there.
+
+## Public (no auth)
+
+| Method | Path                       | Purpose |
+|--------|----------------------------|---------|
+| GET    | `/healthz`                 | Liveness probe. Returns 204. |
+| POST   | `/api/auth/login`          | Local-user login. JSON body: `{username, password}`. |
+| POST   | `/api/auth/logout`         | Invalidate the session cookie. |
+| POST   | `/api/bootstrap`           | First-run admin creation. Accepts the token printed at first start. |
+| POST   | `/api/agents/enroll`       | Token-based agent enrolment. |
+| POST   | `/api/agents/announce`     | Announce-and-approve agent enrolment. |
+| GET    | `/agent/binary?os=&arch=`  | Serves the agent binary for the install scripts. |
+| GET    | `/install/*`               | Serves the Linux + Windows install scripts and the systemd unit. |
+| GET    | `/api/version`             | Build version + commit JSON. |
+| GET    | `/metrics`                 | Prometheus exposition (only when opted-in via `RM_METRICS_TOKEN` / `RM_METRICS_TRUSTED_CIDR`). |
+| GET    | `/login`, `/setup`, `/bootstrap` | UI pages. |
+
+## Authenticated (any role)
+
+| Method | Path                                     | Purpose |
+|--------|------------------------------------------|---------|
+| GET    | `/`                                      | Dashboard. |
+| GET    | `/hosts/{id}`                            | Host detail. |
+| GET    | `/hosts/{id}/repo`                       | Repo tab. |
+| GET    | `/hosts/{id}/jobs`                       | Jobs tab. |
+| GET    | `/hosts/{id}/sources`                    | Source groups list. |
+| GET    | `/hosts/{id}/schedules`                  | Schedules list. |
+| GET    | `/jobs/{id}`                             | Live job log. |
+| GET    | `/api/hosts`, `/api/fleet/summary`       | JSON list + summary. |
+| GET    | `/api/jobs/{id}/stream`                  | WebSocket subscription to a job's live log. |
+| GET    | `/api/jobs/{id}/log.{txt,ndjson}`        | Persisted log download. |
+
+## Operator role and above
+
+| Method | Path                                  | Purpose |
+|--------|---------------------------------------|---------|
+| POST   | `/hosts/{id}/run-backup`              | Run-now (HTMX form-post). |
+| POST   | `/hosts/{id}/sources/{gid}/run-now`   | Per-source-group run-now. |
+| POST   | `/hosts/{id}/repo/{prune,check,unlock,reinit,probe}` | Maintenance actions. |
+| POST   | `/api/hosts/{id}/snapshots/diff`      | Snapshot-diff job. |
+| POST   | `/hosts/{id}/restore`                 | Restore wizard submit. |
+| POST   | `/api/jobs/{id}/cancel`               | Cancel a running job. |
+| POST   | `/hosts/{id}/tags`                    | Update host tags. |
+| POST   | `/hosts/{id}/sources` and friends     | Source-group CRUD. |
+| POST   | `/hosts/{id}/schedules` and friends   | Schedule CRUD. |
+| POST   | `/hosts/{id}/repo/credentials`, `/admin-credentials` | Credential update. |
+
+## Admin role only
+
+| Method | Path                                  | Purpose |
+|--------|---------------------------------------|---------|
+| POST   | `/hosts/new`                          | Mint enrolment token (Add host). |
+| POST   | `/hosts/{id}/delete`                  | Delete + cascade. |
+| POST   | `/hosts/{id}/update`                  | Dispatch a single agent update. |
+| GET/POST | `/settings/users/...`                | User management. |
+| POST   | `/settings/notifications/...`         | Notification channel CRUD + test fire. |
+| POST   | `/settings/fleet-update/...`          | Fleet-update worker. |
+
+## WebSocket
+
+| Path                           | Who connects | Auth |
+|--------------------------------|--------------|------|
+| `/ws/agent`                    | Agent        | Bearer token issued at enrolment. |
+| `/ws/agent/pending`            | Agent (announce flow) | Pending-id query param. |
+| `/api/jobs/{id}/stream`        | Browser      | Session cookie. |
+
+## RBAC enforcement
+
+Routes are grouped into chi route-groups by required role
+(`viewer < operator < admin`); the `requireRole` middleware in
+`internal/server/http/middleware.go` is the bouncer. Sessions
+re-validate `disabled_at` on every request, so a disabled user's
+cookie stops working immediately.
@@ -0,0 +1,32 @@
+# Roadmap
+
+The live roadmap is in
+[`tasks.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/tasks.md).
+Phases ship in order; items inside a phase ship as the
+opportunity arises.
+
+## Status snapshot
+
+| Phase | Theme                                            | Status |
+|-------|--------------------------------------------------|--------|
+| 0     | Project bootstrap                                | ✅ done |
+| 1     | MVP: enrolment, visibility, on-demand backup     | ✅ done |
+| 2     | Scheduling, retention, repo operations           | ✅ done |
+| 3     | Restore, alerts, audit                           | ✅ done |
+| 4     | RBAC, OIDC, host tags                            | ✅ done |
+| 5     | OSS readiness                                    | 🚧 in flight (this docs site is part of it) |
+| 6     | Update delivery + observability polish           | ✅ done |
+
+## What's not on the roadmap
+
+The non-goals list in [`spec.md` §2](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/spec.md):
+
+- Replacing restic itself or providing custom repo formats
+- Managing non-restic backup tools
+- Multi-tenancy / SaaS deployment
+- High availability of the control plane (SQLite, single-instance)
+- Mobile-native apps (responsive web only)
+
+If something there is critical to your use case, restic-manager
+isn't the right tool. That's not a closed door — it's a
+deliberate scope decision so the project stays maintainable.
@@ -0,0 +1,35 @@
+# Reporting vulnerabilities
+
+The full disclosure policy lives in
+[`SECURITY.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/SECURITY.md)
+at the repo root. The short version:
+
+- **Don't open a public issue.**
+- Send a Gitea private message to `steve` on
+  <https://gitea.dcglab.co.uk>, or email the address on the
+  maintainer's profile, with a subject like
+  `[SECURITY] restic-manager: <one-line summary>`.
+- Expect an acknowledgement within 3 working days; escalate
+  through the other channel if you don't get one.
+- Default disclosure window is **30 days from confirmed report
+  to public disclosure**, faster if a PoC is already
+  circulating, slower only by mutual agreement.
+
+## What to include
+
+A description of the issue and the impact, the affected
+component (server / agent / install script / docs), the version,
+and reproduction steps. A working PoC is welcome but not
+required — a credible threat model is enough.
+
+## In scope vs. out of scope
+
+See the full policy. Quick highlights:
+
+- **In scope:** server, agent, install scripts, docker image,
+  docker-compose reference, crypto choices, docs that lead to
+  insecure configs.
+- **Out of scope:** restic itself (report upstream), unpatched
+  third-party deps (report upstream first), abuse by an
+  already-authenticated admin (admins are designed to have full
+  power), DoS on deployments without the recommended reverse proxy.
@@ -0,0 +1,72 @@
+# Hardening checklist
+
+A baseline for new deployments. Most of these are defaults; the
+list is here to make audit easy.
+
+## Server
+
+- [ ] Reverse proxy in front, TLS terminating at the proxy
+      (Caddy/nginx/Traefik).
+- [ ] `RM_TRUSTED_PROXY` set to the proxy's CIDR.
+- [ ] `RM_BASE_URL` matches the public hostname and the cookie
+      scope you want.
+- [ ] `RM_COOKIE_SECURE=true` (the default; only set `false`
+      for local HTTP testing).
+- [ ] HTTP listener bound to **localhost** in the compose file,
+      not `0.0.0.0`. The reverse proxy is the only thing that
+      should reach it.
+- [ ] `secret.key` backed up separately from the database.
+- [ ] Bootstrap token consumed and the printed log line scrubbed
+      from any log archive.
+
+## Authentication
+
+- [ ] Admin user has a password ≥ 12 characters (the floor).
+- [ ] OIDC enabled if you have an IdP — local password auth
+      stays as a break-glass.
+- [ ] Disabled (not deleted) any users who change roles or leave
+      so their session is invalidated immediately.
+- [ ] The last-admin guard isn't tripped — there's always at
+      least one enabled admin user.
+
+## Repo credentials
+
+- [ ] Append-only credential set as the everyday cred for every
+      host.
+- [ ] Admin credential set only where prune cadence is enabled.
+- [ ] No credentials reused across hosts. Each host should have
+      its own credential pair so a single host compromise has a
+      single blast radius.
+- [ ] If using rest-server, `--append-only` flag is on for the
+      everyday user; the prune user is a separate identity.
+
+## Agent
+
+- [ ] Agent runs as `root` (Linux) or `LocalSystem` (Windows)
+      **only when** the source paths require it. Otherwise pin
+      a service user that has read access to what's backed up
+      and nothing else.
+- [ ] systemd unit's sandboxing flags are intact
+      (`NoNewPrivileges`, `Protect*`, `MemoryDenyWriteExecute`).
+- [ ] Agent's config file `/etc/restic-manager/agent.yaml` is
+      mode `0600` and owned by the service user. The bearer
+      token lives in there.
+
+## Operations
+
+- [ ] Alerts wired to a real channel (webhook into Slack,
+      ntfy topic, SMTP) — not just sitting in the UI.
+- [ ] Test-fire each notification channel after configuring.
+- [ ] Audit-log retention is long enough to cover the operator's
+      incident-response window.
+- [ ] Prometheus endpoint, if enabled, gated by token AND CIDR
+      where practical (default is opt-in / off).
+
+## Recovery
+
+- [ ] A documented procedure for rotating a leaked agent bearer
+      (delete + re-enrol the host).
+- [ ] A test-restore done at least once, end-to-end, before
+      relying on the system in anger.
+- [ ] `secret.key` and the SQLite database covered by separate
+      backup paths so neither alone reconstitutes the other.
@@ -0,0 +1,110 @@
+# Threat model
+
+This page documents what restic-manager defends against, what it
+doesn't, and the trust assumptions a deployment is making. The
+canonical version lives in [`spec.md`](https://gitea.dcglab.co.uk/steve/restic-manager/src/branch/main/spec.md)
+§11; the summary here is shaped for operators rather than
+implementers.
+
+## Trust boundaries
+
+```
+┌──────────────────────────────────────────┐
+│  TRUSTED zone                            │
+│  ┌─────────────┐    ┌──────────────┐     │
+│  │  Operator's │    │   Reverse    │     │
+│  │   browser   │◄──►│    proxy     │     │  TLS terminates here
+│  └─────────────┘    └──────┬───────┘     │
+└────────────────────────────┼─────────────┘
+                             │ HTTP, plaintext
+                             │ (loopback or trusted LAN)
+┌────────────────────────────▼─────────────┐
+│  Server (control plane)                  │
+└────────────┬─────────────────────────────┘
+             │ outbound WebSocket (TLS to clients via proxy)
+             │ — bearer-authenticated
+┌────────────▼──────────────┐
+│  Agent (per host)         │  ◄── attacker model: assume one
+└────────────┬──────────────┘       endpoint can be compromised
+             │ subprocess
+             ▼
+   restic ──▶ repository (rest-server / S3 / SFTP / …)
+```
+
+## What we defend against
+
+### Network attacker between operator and server
+
+- HTTPS via the reverse proxy is the only operator-facing surface
+  on a sane deployment.
+- `RM_COOKIE_SECURE=true` (default) means the session cookie
+  refuses to ride a non-HTTPS connection.
+- `RM_TRUSTED_PROXY` gates whether `X-Forwarded-*` is honoured;
+  a request that bypasses the proxy can't spoof the client IP.
+
+### Compromised agent host
+
+- The agent's bearer token can dispatch commands **only on its
+  own host**. It can't read other hosts' state, dispatch jobs
+  on other hosts, or escalate within the control plane.
+- If you suspect a host compromise:
+  1. Delete the agent's host row via **Hosts → Delete**
+     (the delete cascades to the bearer hash).
+  2. Rotate the repo credential at the rest-server / object
+     store side.
+  3. Review the audit log, which lists every action that
+     bearer ever drove.
+
+### DB compromise without the secret key
+
+- Repo credentials are AEAD-encrypted at rest. A DB dump alone
+  doesn't expose them.
+- Agent bearer **hashes** are leaked; that's enough to
+  authenticate as any agent until you revoke. A rotation
+  procedure is just "delete + re-enrol" today.
+- Operator passwords are bcrypt-hashed; OIDC users have no
+  password to leak.
+- Session tokens are hashed; an attacker can't replay a
+  session from a DB dump.
+
+### DB compromise WITH the secret key
+
+The attacker can decrypt every credential. Treat
+`secret.key` with the same care as a password manager database.
+Back it up to a separate vault, not to the same Docker volume
+as the database.
+
+### Forget/prune as a DoS vector
+
+- The everyday backup credential cannot prune (append-only).
+- The admin credential is only pushed to the agent at the
+  moment of dispatch and discarded after the job ends.
+- Compromise of a single agent host does **not** grant prune
+  rights — at worst the attacker gets fresh write access until
+  the credential is rotated.
+
+### Operator-side typo or bad copy-paste
+
+- Repo credentials are stored encrypted; mis-typed creds fail
+  fast on the next `restic` invocation rather than silently
+  corrupting state.
+- NS-03 added auto-init: the first dispatched job after creds
+  change runs `restic init`, surfaces the error eagerly under
+  the host's vitals strip if the creds are bad, and resets the
+  host's `repo_status` so the operator can retry without
+  hunting through job logs.
+
+## What we don't defend against
+
+- **Insider threat at the maintainer level.** A malicious
+  maintainer can publish a backdoored container; SBOM /
+  signing infrastructure (Phase 6 candidate) would help here
+  but isn't shipped today.
+- **Supply chain.** We pin module versions (`go.sum`) and
+  pin the Tailwind binary's release tag, but a compromise in
+  one of those upstreams would land here.
+- **Side-channel via restic itself.** A bug in restic that
+  enables snapshot-content disclosure is restic's problem; the
+  control plane doesn't see snapshot bytes either way.
+- **DoS via resource exhaustion** without the recommended
+  reverse-proxy / rate-limit in front. Don't expose the
+  server's HTTP port to the public internet directly.
@@ -0,0 +1,120 @@
+# End-to-end test harness
+
+The e2e harness stands up the full production-shaped stack
+(server + agent + rest-server) in Docker Compose and drives it
+through Playwright. CI runs it on every PR; operators can run it
+locally too.
+
+## Files
+
+```
+e2e/
+├── compose.e2e.yml         compose stack: server + rest-server + agent
+├── Dockerfile.agent        Linux container for the agent (alpine + restic)
+├── agent-entrypoint.sh     decides between announce / token-enrol / run
+└── playwright/
+    ├── package.json
+    ├── playwright.config.ts
+    └── tests/
+        ├── lib/server.ts   bootstrap, login, accept, poll helpers
+        └── smoke.spec.ts   happy-path: enrol → backup → succeeded
+```
+
+## Local run
+
+Prerequisites: Docker + Docker Compose, and Node.js (`npm` / `npx`) for Playwright.
+
+```sh
+# 1. Build + bring up the stack (server, rest-server, source data).
+docker compose -f e2e/compose.e2e.yml up --build -d server rest-server source-fixture
+
+# 2. Wait for the server, then scrape the bootstrap token from the log.
+until curl -fsS http://127.0.0.1:8080/api/version >/dev/null; do sleep 1; done
+RM_BOOTSTRAP_TOKEN=$(docker compose -f e2e/compose.e2e.yml logs server \
+    | grep -Eo '[a-zA-Z0-9_-]{40,}' | head -1)
+export RM_BOOTSTRAP_TOKEN
+
+# 3. Start the agent (it announces against the running server).
+docker compose -f e2e/compose.e2e.yml up -d agent
+
+# 4. Install + run Playwright.
+cd e2e/playwright
+npm install
+npx playwright install --with-deps chromium
+npx playwright test
+```
+
+When the test passes you'll see:
+
+```
+Running 2 tests using 1 worker
+  ✓  smoke: enrol-via-announce → backup › happy path completes in under a minute (47s)
+  ✓  smoke: scrape /metrics › metrics endpoint exposes the host gauge (180ms)
+
+  2 passed (47.5s)
+```
+
+Tear-down:
+
+```sh
+docker compose -f e2e/compose.e2e.yml down -v
+```
+
+`-v` removes the named volumes too — important between runs because
+the rest-server volume holds an initialised repo and the
+agent-config volume holds a stale bearer.
+
+## What the test exercises
+
+1. **Bootstrap.** Posts the admin-creation request to
+   `/api/bootstrap` with the token scraped from the server log.
+2. **Login (UI).** Drives the login form via Playwright; verifies
+   the dashboard loads with a session cookie set.
+3. **Pending host appears.** Polls the dashboard for the inline
+   accept form generated by the announcing agent; reads the
+   pending-id out of its action URL.
+4. **Accept.** POSTs `/api/pending-hosts/{id}/accept` with the
+   rest-server URL + repo password. The server mints a Host row
+   + bearer + AEAD-encrypted creds and pushes the bearer down
+   the still-open pending WebSocket.
+5. **Online + auto-init.** Polls `/api/hosts` until the new host
+   is `status=online`. Auto-init runs as part of this — the
+   first dispatched job after creds save is `restic init`.
+6. **Run backup.** Submits the host detail page's `Run now`
+   form; expects `HX-Redirect` to the live job page.
+7. **Verify.** Polls `/api/hosts` until the host's
+   `last_backup_status` flips to `succeeded`.
+8. **Metrics.** Scrapes `/metrics` and asserts the
+   server-gauge + build-info lines are present (the compose
+   stack opens the endpoint via `RM_METRICS_TRUSTED_CIDR=0.0.0.0/0`).
+
+## CI workflow
+
+[`.gitea/workflows/e2e.yml`](../.gitea/workflows/e2e.yml) runs the
+suite on every PR into `main`. On failure it dumps the last 200
+lines of each container log as a workflow annotation and uploads
+the Playwright HTML report as an artefact.
+
+## When tests fail
+
+- **Pending host never appears.** Agent container probably
+  couldn't reach the server. Check `docker compose logs agent`
+  for connection errors and `docker compose logs server` for
+  any 4xx on `/api/agents/announce`.
+- **Backup hangs in `running`.** The agent shells out to
+  `restic`; check the live job log at
+  `http://127.0.0.1:8080/jobs/<id>` (still up after a
+  failed test as long as you didn't `down -v`).
+- **`RM_BOOTSTRAP_TOKEN not set`.** The server log scrape
+  matched the wrong line or the token regex is too tight. The
+  server prints the token on a line starting with `    ` (four
+  spaces) inside a banner; widen the regex if your server log
+  format changes.
+
+## Adding new tests
+
+The harness is intentionally flat — one `*.spec.ts` per
+scenario. Reuse the helpers in `lib/server.ts` and avoid
+duplicating bootstrap / login boilerplate. Heavy fixtures
+(custom users, OIDC IdP) belong in their own compose override
+file rather than complicating `compose.e2e.yml`.
@@ -0,0 +1,139 @@
+# Prometheus + Grafana
+
+restic-manager exposes a Prometheus scrape endpoint at `GET /metrics`.
+The endpoint is **opt-in** — it is not mounted at all unless you set
+at least one of the auth gates below. Once enabled, it serves the
+standard `text/plain` exposition format that every Prometheus
+release since 2.x parses without configuration.
+
+A sample Grafana dashboard lives at
+`deploy/grafana/restic-manager-dashboard.json`.
+
+## Enable the endpoint
+
+Two switches, both off by default. If both are set, both must pass
+(token AND source-IP); if only one is set, that gate alone
+authorises a scrape.
+
+| Env var                    | YAML key               | Effect |
+|----------------------------|------------------------|--------|
+| `RM_METRICS_TOKEN`         | `metrics_token`        | Requires `Authorization: Bearer <token>`. Compared in constant time. |
+| `RM_METRICS_TRUSTED_CIDR`  | `metrics_trusted_cidrs` (list) | Restricts the source IP to one of the listed CIDRs. Comma-separated in env, list in YAML. Honours `X-Forwarded-For` only when the immediate hop matches `RM_TRUSTED_PROXY`. |
+
+When neither is set, `GET /metrics` returns 404 — the route is not
+registered with the chi router so a forgotten config can't
+accidentally publish fleet state.
+
+### Example: Docker
+
+```yaml
+services:
+  restic-manager:
+    image: gitea.dcglab.co.uk/steve/restic-manager:latest
+    environment:
+      RM_METRICS_TOKEN_FILE: /run/secrets/rm_metrics_token
+      RM_METRICS_TRUSTED_CIDR: "10.0.0.0/8"
+    secrets:
+      - rm_metrics_token
+```
+
+**Note:** the `RM_METRICS_TOKEN_FILE` variable shown above is not
+currently supported — the `_FILE` convention is on the roadmap.
+Until it lands, set `RM_METRICS_TOKEN` directly.
+
+## Prometheus scrape config
+
+Drop into your `prometheus.yml`:
+
+```yaml
+scrape_configs:
+  - job_name: restic-manager
+    metrics_path: /metrics
+    scheme: https            # via your reverse proxy
+    static_configs:
+      - targets: ['restic.example.com']
+    authorization:
+      type: Bearer
+      credentials_file: /etc/prometheus/secrets/rm_metrics_token
+```
+
+If you don't run a TLS-terminating proxy in front, drop `scheme:
+https` (the server is HTTP-only — see `docs/reverse-proxy.md`).
+
+## Metric reference
+
+All names are `rm_`-prefixed. Per-host metrics carry a `host_id`
+label (the stable ULID, immune to renames) and a `host` label
+(the human-readable name).
+
+### Server gauges
+
+| Name                  | Labels                             | Description |
+|-----------------------|------------------------------------|-------------|
+| `rm_hosts_total`      | —                                  | Total number of enrolled hosts (excludes pending announces). |
+| `rm_hosts_online`     | —                                  | Number of hosts with `status='online'`. |
+| `rm_active_alerts`    | `severity` ∈ {info, warning, critical} | Open alerts by severity. |
+| `rm_build_info`       | `version, commit, go_version`      | Always 1; pure label-bag for joining. |
+
+### Per-host gauges
+
+| Name                                       | Description |
+|--------------------------------------------|-------------|
+| `rm_host_agent_online`                     | 1 if the agent is currently online, 0 otherwise. |
+| `rm_host_last_backup_timestamp_seconds`    | Unix timestamp of the host's most recent backup. **Omitted** for hosts with no backup yet. |
+| `rm_host_last_backup_success`              | 1 if the most recent backup succeeded, 0 otherwise. **Omitted** for hosts with no backup yet. |
+| `rm_host_repo_size_bytes`                  | Latest reported repo size from `restic stats --mode raw-data`. **Omitted** when unknown. |
+| `rm_host_snapshot_count`                   | Number of restic snapshots known on the host's repo. |
+| `rm_host_open_alerts`                      | Number of currently open alerts attached to this host. |
+| `rm_host_repo_status`                      | Always 1; the `status` label carries `unknown` / `ready` / `init_failed`. |
+
+### Job duration histogram
+
+```
+rm_job_duration_seconds_bucket{kind, status, le}
+rm_job_duration_seconds_sum{kind, status}
+rm_job_duration_seconds_count{kind, status}
+```
+
+`kind` ∈ {backup, forget, prune, check, unlock, restore, diff, init, update}.
+`status` ∈ {succeeded, failed, cancelled}.
+
+Buckets (seconds):
+
+```
+1, 5, 30, 60, 300, 1800, 3600, 21600, 86400, +Inf
+1s   5s  30s  1m  5m   30m   1h    6h    24h
+```
+
+The histogram is in-memory only — values reset on process restart.
+Operators who want durable history should let Prometheus persist
+the scrapes; restic-manager itself is a control plane, not a
+metrics database.
+
+## Grafana dashboard
+
+Import `deploy/grafana/restic-manager-dashboard.json`:
+
+1. In Grafana, **+ → Import → Upload JSON file**.
+2. Pick the Prometheus data source you scrape with.
+3. The dashboard's six panels populate from the metrics above:
+   * **Fleet status** — online/total stat panel.
+   * **Open alerts** — by severity.
+   * **Hosts** — per-host table (last backup, repo size, snapshots, alerts).
+   * **Repo size over time** — one line per host.
+   * **Backups failing** — count of hosts whose last backup didn't succeed.
+   * **Job duration p95** — `histogram_quantile(0.95, …)` over a 1h window per kind.
+
+Alerting is intentionally not configured in the dashboard — the
+control plane already has alerts (P3-05) with native channels for
+webhook, ntfy, and SMTP. Re-implementing them in Prometheus would
+just duplicate state. If you do want Prom-side alerts, copy the
+recording rules into your usual location.
+
+## Cardinality
+
+Per scrape: O(hosts) gauge rows + O(kinds × statuses × buckets)
+histogram rows. A 100-host fleet emits roughly 700 host rows (7
+per-host gauges × 100 hosts) + 270 histogram bucket rows (9 kinds ×
+3 statuses × 10 buckets, plus one `_sum` and one `_count` row per
+kind/status pair) — well below any practical limit. There are no
+`job_id` labels (cardinality bomb avoidance) and no per-source-group
+labels.
@@ -0,0 +1,113 @@
+# Running behind a reverse proxy
+
+The restic-manager server is HTTP-only by design (see `spec.md` §11):
+TLS termination, public hostname, ACME, HSTS, and edge-level rate
+limiting all belong to a reverse proxy that you already operate
+outside this project. The reference compose in `deploy/docker-compose.yml`
+stands up *only* the server; this page covers what your proxy needs
+to do to make the rest of it work.
+
+## What the proxy must forward
+
+The server reads four headers when (and only when) the immediate peer
+matches `RM_TRUSTED_PROXY`:
+
+| Header              | Value                                                    | Why |
+|---------------------|----------------------------------------------------------|-----|
+| `X-Forwarded-For`   | The original client IP (single value, or comma chain)    | Rate-limit keys, audit log entries, and OIDC redirect-URI checks all use the real client IP. |
+| `X-Forwarded-Proto` | `https`                                                  | The server emits absolute URLs (e.g. OIDC redirect URIs) using this. |
+| `Host`              | The public hostname clients use                          | Cookies are scoped to this; `RM_BASE_URL` must match. |
+| `Connection`/`Upgrade` | Pass through unchanged                                | The agent connects on `/ws/agent` and the live-log viewer connects on `/api/jobs/{id}/stream` — both are WebSockets and need `Upgrade: websocket` to survive the hop. |
+
+Set `RM_TRUSTED_PROXY` to the CIDR (or comma-separated list of CIDRs)
+the proxy connects from. Anything outside that range has its
+`X-Forwarded-*` headers ignored, so a stray request that bypasses the
+proxy can't spoof the client IP.
+
+## Example: Caddy
+
+```caddyfile
+restic.example.com {
+    # Caddy's default reverse_proxy preserves Host, sets
+    # X-Forwarded-For/Proto, and passes Connection: upgrade through,
+    # so a single directive covers HTTP + WebSocket.
+    reverse_proxy 127.0.0.1:8080
+
+    encode zstd gzip
+}
+```
+
+`RM_TRUSTED_PROXY=127.0.0.1/32` if Caddy and the server share the
+host; the docker-bridge CIDR (commonly `172.16.0.0/12`) if Caddy
+runs in another container on the default bridge network.
+
+## Example: nginx
+
+```nginx
+server {
+    listen 443 ssl http2;
+    server_name restic.example.com;
+
+    ssl_certificate     /etc/ssl/restic.example.com.fullchain.pem;
+    ssl_certificate_key /etc/ssl/restic.example.com.key.pem;
+
+    location / {
+        proxy_pass         http://127.0.0.1:8080;
+        proxy_http_version 1.1;
+
+        # WebSocket support — agent + live-log endpoints need this.
+        proxy_set_header   Upgrade           $http_upgrade;
+        proxy_set_header   Connection        $connection_upgrade;
+
+        # Trusted-proxy headers.
+        proxy_set_header   Host              $host;
+        proxy_set_header   X-Forwarded-For   $proxy_add_x_forwarded_for;
+        proxy_set_header   X-Forwarded-Proto https;
+
+        # Live job logs are long-running streams. Bump read timeouts
+        # so nginx doesn't drop them mid-backup.
+        proxy_read_timeout 1h;
+        proxy_send_timeout 1h;
+    }
+}
+
+# Standard websocket upgrade map (define once at the http {} level).
+map $http_upgrade $connection_upgrade {
+    default upgrade;
+    ''      close;
+}
+```
+
+`RM_TRUSTED_PROXY` for the same-host case: `127.0.0.1/32`.
+
+## Example: Traefik (label-based)
+
+```yaml
+labels:
+  - "traefik.enable=true"
+  - "traefik.http.routers.restic-manager.rule=Host(`restic.example.com`)"
+  - "traefik.http.routers.restic-manager.entrypoints=websecure"
+  - "traefik.http.routers.restic-manager.tls.certresolver=letsencrypt"
+  - "traefik.http.services.restic-manager.loadbalancer.server.port=8080"
+```
+
+Traefik handles `X-Forwarded-*` and `Connection: upgrade` by default.
+`RM_TRUSTED_PROXY` should be the docker network the Traefik container
+shares with the server (commonly `172.16.0.0/12` for the default
+bridge, or whatever your overlay network's CIDR is).
+
+## Sanity-checking the wiring
+
+After bringing the stack up:
+
+1. `curl -fsS https://restic.example.com/healthz` — should return 200.
+2. The login page should report HTTPS in the address bar; cookies
+   set after login should carry the `Secure` flag.
+3. Check the server log for the `config resolved` line:
+   `trusted_proxies` must include the IP/CIDR your proxy actually
+   connects from.
+4. Enrol a test agent — the WebSocket handshake hitting `/ws/agent`
+   confirms `Upgrade` is being forwarded correctly.
+
+If any of those fail, the proxy is the first place to look — the
+server itself is intentionally minimal.
@@ -0,0 +1,223 @@
+# Always-On vs Intermittent host mode
+
+**Date:** 2026-06-15
+**Branch:** `feat-laptop-host-mode`
+**Status:** Design — awaiting review
+
+## Problem
+
+The server currently assumes every host should be present 24×7. When an
+agent stops heartbeating for 90s it is flipped to `offline`, and after 15
+minutes that raises a `warning` alert. This is correct for a server, but
+wrong for a host that legitimately comes and goes — a workstation or
+laptop that sleeps overnight, travels, or is shut down on weekends. Such
+a host generates noise alerts every time it is closed, and — more
+importantly — there is **no mechanism to catch up a backup it missed
+while it was away.**
+
+Two distinct facts make the catch-up gap real:
+
+- **Backup cron runs on the agent, locally.** The agent fires
+  `MsgScheduleFire`; the server only dispatches in response. If the host
+  is asleep, the agent process is suspended, so the cron tick never
+  fires and no `MsgScheduleFire` is ever sent.
+- Therefore the existing `pending_runs` retry queue **does not** cover
+  this case. `pending_runs` only gets a row when a schedule *fired* but
+  the agent was momentarily disconnected at dispatch time. A window
+  missed entirely during sleep never enqueues anything.
+
+## Goal
+
+Let an operator mark a host as **not** always-on. Such a host:
+
+1. Does **not** raise offline/agent-down alerts when it is not visible.
+2. Renders a distinct, calm "asleep" state in the UI instead of the
+   alarming red "offline".
+3. When it reconnects, after a short settle delay, the server checks
+   whether it missed a scheduled backup and — if so — triggers a
+   catch-up backup automatically.
+4. Still raises a *staleness* alert if it has genuinely gone too long
+   without any backup (a host left in a drawer). This is the only
+   alert covering an asleep host: while the agent is offline no job
+   runs, so there is no failure to detect — staleness is the safety
+   net for "no backups are happening at all."
+5. Leaves normal job-failure alerting untouched: a backup that
+   actually runs (scheduled or catch-up) and fails alerts as it does
+   today. Failures can only occur while the agent is online and
+   executing restic.
+
+Default behaviour is unchanged for the entire existing fleet.
+
+## Decisions (from brainstorming)
+
+- **Setting shape:** a single boolean `Always On` checkbox per host,
+  **default ON**. Checked = today's 24×7 server semantics. Unchecked =
+  intermittent host. Opt-in only; zero behaviour change for current and
+  future hosts unless explicitly toggled.
+- **Overdue trigger:** evaluated on **reconnect + behind schedule**
+  (not a continuous always-evaluating sweep).
+- **Alert policy for intermittent hosts:** suppress offline alerts;
+  keep a long-threshold **staleness** alert; keep job-failure alerts.
+- **Staleness threshold:** **7 days**, a global constant for v1. May
+  become per-host configurable later — out of scope now.
+- **Catch-up granularity:** **per enabled schedule.** A host with a
+  daily and a weekly schedule catches up only whichever is actually
+  behind.
+- **UI vocabulary:** not-visible intermittent host shows a grey
+  `asleep` state; detail line reads
+  `asleep · last seen <relTime> · will catch up on return`.
+- **Chip:** chip and checkbox highlight the **same** truth (24×7). Show
+  a chip for **Always-On** hosts; **no** chip for intermittent.
+
+## Architecture
+
+The change is deliberately a thin policy + presentation layer over the
+existing online/offline state machine. We do **not** add a new `status`
+enum value or alter heartbeat / `last_seen_at` tracking. "Asleep" is a
+reinterpretation of `status='offline' AND NOT always_on`.
+
+### 1. Data model
+
+- **Migration `0024_hosts_always_on.sql`:**
+  ```sql
+  ALTER TABLE hosts ADD COLUMN always_on INTEGER NOT NULL DEFAULT 1;
+  ```
+  Column-level ALTER per the repo's migration rules. Default `1` means
+  every existing row is Always-On — no behaviour change on upgrade.
+- `store/types.go`: add `AlwaysOn bool` to the `Host` struct; thread it
+  through every host SELECT scan and the host insert/update paths.
+- New store helper `SetHostAlwaysOn(ctx, hostID, bool) error`.
+
+### 2. Online/offline mechanics — UNCHANGED
+
+The 30s offline sweeper (`cmd/server/main.go:220`) still flips an unseen
+host to `status='offline'` and still calls
+`alertEngine.NotifyHostOffline(id)`. `TouchHost` / `MarkHostHello`
+behaviour is untouched. The intermittent distinction is applied
+*downstream* of this state, in the alert engine and the templates.
+
+### 3. Alert behaviour
+
+All changes key off `host.AlwaysOn`, which the engine already has access
+to via the host row it loads.
+
+- **Suppress offline alert** (`alert/engine.go` `handleHostOffline()`
+  and the 60s `tick()`): when `!host.AlwaysOn`, do not raise
+  `agent_offline`.
+- **Resolve-on-toggle:** when a host is switched server→intermittent and
+  has an open `agent_offline` alert, auto-resolve it. (Handled in the
+  mode-change handler, fanning through the normal resolve path so
+  channels/audit fire as usual.)
+- **Staleness alert** — wire up the currently-dead `KindStaleSchedule`
+  constant, **for intermittent hosts only.** On the 60s tick, for each
+  host where `!AlwaysOn` AND the host has ≥1 enabled schedule AND
+  `LastBackupAt != nil` AND `now - LastBackupAt > 7*24h`: raise a
+  `warning` `stale_schedule` alert (dedup key `""`, one per host).
+  Auto-resolves when `LastBackupAt` advances past the threshold (i.e.
+  any successful backup, including the catch-up). Always-On hosts'
+  `stale_schedule` remains a no-op (unchanged, out of scope).
+  - If `LastBackupAt == nil` (intermittent host enrolled but never
+    backed up): no staleness alert in v1 — there is no baseline to
+    measure against, and onboarding probe state (`repo_status`) already
+    covers "never successfully set up."
+- **Job-failure alerts:** untouched. A catch-up backup that runs and
+  fails alerts exactly like any other backup.
+
+### 4. Catch-up on reconnect
+
+A new small component — the **catch-up scheduler** — lives server-side
+alongside the existing ticks.
+
+- **Arm:** on agent hello (`server/ws/handler.go` hello path /
+  `onAgentHello`), if the host is `!AlwaysOn`, record
+  `catchupDueAt[hostID] = now + 60s` in an in-memory map. Re-arming on a
+  subsequent hello just overwrites the timestamp (debounce — rapid
+  flapping does not stack catch-ups). In-memory is acceptable: catch-up
+  is best-effort and a server restart simply re-arms on the next hello.
+- **Fire:** reuse the existing 30s server tick. For each due entry
+  (`catchupDueAt <= now`):
+  1. Re-verify the agent is still connected (`Hub.Connected(hostID)`).
+     If it bounced back offline within the settle window, drop the entry
+     (it will re-arm on the next hello).
+  2. Skip if a backup is already running or queued for the host
+     (`current_job_id` set, or a relevant `pending_runs` row exists) —
+     avoid double-firing alongside a normal dispatch or pending drain.
+  3. For each **enabled** schedule on the host, compute overdue:
+     ```
+     overdue := sched.Next(host.LastBackupAt) <= now
+     ```
+     using `robfig/cron/v3` (already a dependency) to parse
+     `Schedule.CronExpr`. `Next(lastBackup)` is the first fire strictly
+     after the last successful backup; if that moment has already
+     passed, the window was missed → overdue. (If `LastBackupAt` is nil,
+     treat as overdue so a never-backed-up intermittent host with a
+     schedule gets its first run on connect.)
+  4. For each overdue schedule, dispatch its source-groups via the
+     existing `dispatchBackupForGroupCore()`.
+  5. Clear the entry.
+
+Net latency is ~60–90s after wake (60s settle + up to one 30s tick).
+This path is independent of and complementary to the `pending_runs`
+drain, which continues to handle the fired-but-not-sent case.
+
+### 5. UI
+
+- **CSS:** new grey `dot-asleep` token in `web/styles/input.css`,
+  visually distinct from red `dot-offline`.
+- **`partials/host_row.html` and `partials/host_chrome.html`:** when
+  `!AlwaysOn && status=='offline'`, render the grey dot + label
+  `asleep`; the detail/last-seen line reads
+  `asleep · last seen <relTime> · will catch up on return`. All other
+  states unchanged.
+- **24×7 chip:** on the host detail header, render a small
+  `Always On` / `24×7` chip **only when `AlwaysOn` is true**. No chip
+  for intermittent hosts. (Chip and checkbox highlight the same fact.)
+- **Toggle:** an `Always On` checkbox (default checked) on the host edit
+  surface. Operator-band `POST` (mirrors existing host-edit handlers),
+  audited as `host.mode_updated`. On save, if switching to intermittent,
+  trigger the resolve-on-toggle path for any open `agent_offline` alert.
+
+## Error handling & edge cases
+
+- **Toggle server→intermittent while offline+alerting:** open
+  `agent_offline` alert auto-resolved on save.
+- **Toggle intermittent→server while asleep:** host resumes normal
+  offline/alert semantics; it will alert per the 15-minute floor once
+  the sweeper/tick next evaluates it.
+- **No enabled schedules:** no catch-up and no staleness alert — there
+  is no backup expectation to measure against.
+- **Catch-up vs in-flight work:** guarded by the running/queued check
+  (§4 "Catch-up on reconnect", Fire step 2), so catch-up never races a
+  normal dispatch or pending drain.
+- **Agent flaps during settle window:** entry dropped if not connected
+  at fire time; re-armed on the next hello.
+
+## Testing
+
+- **Alert engine (unit):**
+  - offline alert suppressed when `!AlwaysOn`.
+  - staleness alert raised when intermittent + schedule + last backup >
+    7d; not raised for Always-On hosts; not raised when last backup is
+    recent; not raised when no enabled schedule.
+  - staleness alert auto-resolves after a backup advances `LastBackupAt`.
+  - server→intermittent toggle resolves an open `agent_offline` alert.
+- **Overdue computation (unit, table-driven):** `(cronExpr,
+  lastBackupAt, now) → overdue?` including nil-last-backup and
+  daily/weekly cases.
+- **Catch-up scheduler (unit):** fires only when still connected; skips
+  when a backup is running/queued; dispatches only overdue schedules.
+- **UI (render test):** asleep state + 24×7 chip render under the right
+  conditions; offline state for Always-On hosts unchanged.
+- `go vet ./...` and full `go test ./...` green before merge.
+
+## Out of scope
+
+- Per-host staleness thresholds (global 7d constant for v1).
+- Continuous (non-reconnect) overdue evaluation.
+- Agent-side catch-up cron — the server is the reliable arbiter.
+- Wiring `stale_schedule` for Always-On hosts (separate concern).
+
+## Task tracking
+
+Add an entry to `tasks.md` under "Next steps from testing" (or a new
+small section) once the plan is approved, per the repo's tasks.md
+source-of-truth rule.
@@ -0,0 +1,126 @@
+# Threat model
+
+A short, structured walkthrough of the assets restic-manager
+protects, the actors that interact with it, the attack surfaces
+exposed, and the mitigations in place. This document is written for
+operators considering a deployment and for contributors evaluating
+security-sensitive changes. It is **not** a formal certification —
+restic-manager has not been third-party audited.
+
+Last reviewed: **2026-05-09** (against v1.0.0).
+
+---
+
+## 1. Assets
+
+In rough order of sensitivity:
+
+| Asset | Why it matters |
+|---|---|
+| **Restic repository passwords** | Decrypt every backup in the repo. Server holds them encrypted at rest; agents need plaintext at backup-time. |
+| **Repository URLs with embedded credentials** (e.g. `rest:https://user:pass@host/repo`) | Same as above — read access to the repo is leak-equivalent to the password. |
+| **Agent bearer tokens** | Long-lived credentials authenticating each agent → server WS. Compromise lets an attacker impersonate that host (push fake snapshots, ack fake schedule versions, exfiltrate repo creds the server pushes back). |
+| **Server session cookies** | Browser-side session for human operators. Compromise = full UI access at the user's role for the cookie's TTL (24h). |
+| **Database secret key** | Wraps every encrypted-at-rest field (repo creds, agent enrolment payloads). Disclosure of the key file makes the stored repo creds — and therefore the backups — decryptable; rotation requires re-pushing creds to every agent. |
+| **Bootstrap / setup tokens** | One-shot, time-limited; mint admin or invited-user accounts. |
+| **Audit log** | Append-only record of admin actions; read-only via UI. Not tamper-evident against OS-level access to the SQLite file (see §3.8). |
+| **Backup data on the wire** | Restic itself encrypts on the agent before sending — see "out of scope". |
+
+---
+
+## 2. Actors
+
+| Actor | Trust |
+|---|---|
+| **Anonymous internet** | Untrusted. Should not reach the server unless proxied behind auth (see deployment guide). |
+| **Authenticated viewer** | Read-only on hosts/jobs/alerts/audit. |
+| **Authenticated operator** | Add/remove hosts, edit schedules, run backups/restores, mint enrolment tokens, ack alerts. |
+| **Authenticated admin** | All of the above plus user management, role changes, and fleet update controls. Admins do **not** get secret-key visibility (see below). |
+| **Agent** | Trusted to backup-and-report on its own host only. Cannot read other hosts' creds. Bearer-authenticated. |
+| **Restic backend (rest-server / S3 / B2 / etc.)** | Out of scope for this document — assumed to authenticate the credentials presented and not collude. |
+
+---
+
+## 3. Attack surfaces and mitigations
+
+### 3.1 First-run bootstrap
+
+- **Surface**: `/bootstrap` UI + `/api/bootstrap` JSON endpoint.
+- **Risk**: race between server start and admin creation — an attacker who reaches the server first can claim admin.
+- **Mitigations**:
+  - Bootstrap token printed to stderr exactly once; held in memory, not persisted.
+  - The UI form on `/bootstrap` uses the in-memory token automatically (no token field for the operator to type or expose).
+  - Both surfaces self-disable the moment any user row exists (`CountUsers > 0`).
+  - Token is also blanked from process memory after success (defence in depth).
+- **Residual risk**: if an operator brings up the server on the public internet before reaching the bootstrap page, an attacker reaching `/bootstrap` first wins. **Recommendation**: bring the server up behind an existing trusted network or with the listener bound to `127.0.0.1` until first-run is complete.
+
+### 3.2 Local user accounts
+
+- **Surface**: `/login`, `/api/auth/login`.
+- **Mitigations**: Argon2id password hashing with per-deployment params; constant-time password compare; session-cookie minting via `crypto/rand`; session rows hash-only (raw token only in cookie).
+- **Rate limiting**: Currently not in place at the application layer — the project assumes a reverse proxy enforces login throttling. **Recommendation**: front the server with `caddy`/`nginx` rate-limit rules in production.
+- **Password policy**: 12-character minimum on bootstrap and user-setup paths; no maximum, no rotation, no history. Sufficient for self-hosted ops; tighten in policy if a deployment requires it.
+
+### 3.3 OIDC SSO
+
+- **Surface**: `/auth/oidc/*` — generic OIDC client, JIT user provisioning.
+- **Mitigations**: state + nonce per flow; role mapping is server-configured (claims trusted only to identify the user, not pick role); user-disabled gate runs after IdP success.
+- **Residual risk**: misconfigured role-mapping rules can promote any IdP user to admin. **Recommendation**: review `cfg.OIDC.RoleMappings` carefully.
+
+### 3.4 Agent enrolment
+
+- **Surface**: `/api/agents/enroll` (token-authenticated), `/api/agents/announce` (anonymous, then operator-approves).
+- **Mitigations**:
+  - Token path: one-shot, hashed at rest, 1h TTL; agent receives a fresh long-lived bearer in the response.
+  - Announce path: agent supplies an Ed25519 public key; operator sees a fingerprint to confirm out-of-band before accepting.
+  - Bearer tokens are SHA-256 hashed in the DB.
+- **Residual risk**: an attacker on the network between operator and target host who intercepts the install snippet can enrol *as* the target. The install script must be served over TLS in production (the docker-only deployment is TLS-by-default; bare-metal deployers must configure their own).
+
+### 3.5 Agent → server WebSocket
+
+- **Surface**: persistent WS authenticated by agent bearer.
+- **Mitigations**: bearer is presented per-connection; server pins the agent fingerprint for the announce flow; messages are envelope-typed and rejected if shape-invalid.
+- **No payload-level signing** today — TLS is the integrity boundary. A man-in-the-middle with a valid cert chain could swap messages. **Recommendation**: pin the server cert via `RM_SERVER_CERT_PIN_SHA256` if running over a network you don't fully control.
+
+### 3.6 Repo credential lifecycle
+
+- Stored encrypted at rest under the AEAD secret key.
+- Pushed to the agent over the WS on hello, on creds change, and on demand.
+- Agent persists them encrypted (per-host secret key derived from a value known only to the agent).
+- Logged surfaces use `restic.RedactURL()` to strip `user:pass@` from URLs before they reach `slog`.
+- Plaintext form is constructed only at `exec.Command` time inside the agent, never stored on a struct field that could be slogged.
+
+### 3.7 Restore
+
+- Operators can restore to any path the agent (running as root) can write.
+- Cross-host restore (host A's snapshot → host C) is **deferred** — see F-01. The current single-host restore does not require granting any cross-host privileges.
+
+### 3.8 Audit log
+
+- Append-only writes from the application; SQLite enforces no schema-level immutability.
+- A compromise of the SQLite file (via OS-level access) can edit the audit log. **Recommendation**: ship audit entries to an append-only sink (syslog / Loki / Splunk) if tamper-evidence beyond the OS boundary is required.
+
+### 3.9 Self-update channel (P6)
+
+- Agents fetch new binaries via the WS transport from the server.
+- Binaries are signature-checked by the agent against a key embedded in the existing agent (see `internal/fleetupdate/`).
+- **Residual risk**: a server compromise lets the attacker push code to every agent (running as root). The signing-key compromise window is the same as the server compromise window because both live on the server. Splitting the signing key onto a separate signer is future work (not v1).
+
+---
+
+## 4. Out of scope
+
+- **Restic itself** — its repository format, encryption, and backend protocol are upstream-trusted.
+- **The host OS** — root compromise of a host obviously compromises that host's backups.
+- **The backup destination** — restic-manager assumes the rest-server / object-store / SFTP target enforces its own auth.
+- **Side-channel attacks** on the server process (RAM dump, process tracing).
+- **Physical access** to the server's disk.
+
+---
+
+## 5. Reporting
+
+Found something we missed? See `SECURITY.md` for the disclosure
+process. Coordinated disclosure preferred; the project is
+maintained by a small team and we'll respond as quickly as we
+reasonably can.
@@ -0,0 +1,42 @@
+# Build a Linux container that runs the restic-manager agent against a
+# sibling rest-server in the e2e compose stack. Used only by tests
+# (e2e/compose.e2e.yml + .gitea/workflows/e2e.yml).
+#
+# Two stages:
+#   1. golang:1.25-alpine to build the agent binary.
+#   2. alpine:3.20 with the `restic` package + the built binary.
+#
+# Base images are pinned to explicit version tags for CI reproducibility.
+
+FROM golang:1.25-alpine AS build
+WORKDIR /src
+
+ENV CGO_ENABLED=0 \
+    GOFLAGS="-trimpath"
+
+COPY go.mod go.sum* ./
+RUN go mod download
+
+COPY . .
+ARG VERSION=e2e
+RUN go build -ldflags="-s -w -X gitea.dcglab.co.uk/steve/restic-manager/internal/version.Version=${VERSION}" \
+        -o /out/restic-manager-agent ./cmd/agent
+
+FROM alpine:3.20
+RUN apk add --no-cache restic ca-certificates curl
+COPY --from=build /out/restic-manager-agent /usr/local/bin/restic-manager-agent
+
+# Agents normally run as root because backup paths often need it. The
+# e2e fixture only backs up paths under /data which we own, so this
+# container would tolerate a non-root user — but staying root keeps
+# parity with the production install.
+USER root
+
+# The agent needs a writable directory for its config + secrets store.
+RUN mkdir -p /etc/restic-manager /var/lib/restic-manager
+ENV RM_AGENT_CONFIG=/etc/restic-manager/agent.yaml
+
+# The compose entrypoint sets the announce URL via env.
+COPY e2e/agent-entrypoint.sh /usr/local/bin/entrypoint.sh
+RUN chmod +x /usr/local/bin/entrypoint.sh
+ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
@@ -0,0 +1,21 @@
+# Playwright runner for the e2e suite. Built and run by
+# e2e/compose.e2e.yml so the test process sits on the same docker
+# network as the server, agent, and rest-server. The previous setup
+# ran Playwright on the workflow runner host and reached the server
+# via 127.0.0.1:8080; that fails on Gitea's act-style runners
+# because the workflow steps execute inside a runner container,
+# not on the host where compose publishes its ports.
+
+FROM mcr.microsoft.com/playwright:v1.59.1-jammy
+
+WORKDIR /work
+
+# Install npm deps in a separate layer keyed off package.json so
+# changes to specs don't bust the dep cache.
+COPY e2e/playwright/package.json /work/package.json
+RUN npm install --no-audit --no-fund
+
+COPY e2e/playwright/ /work/
+
+ENV CI=1
+ENTRYPOINT ["npx", "playwright", "test"]
@@ -0,0 +1,27 @@
+#!/bin/sh
+# Entrypoint for the e2e agent container.
+#
+# Three states:
+#   1. Already enrolled (agent.yaml has a bearer): run the agent.
+#   2. Token supplied via $RM_ENROL_TOKEN: enrol then run.
+#   3. Otherwise: announce against $RM_SERVER and wait for an admin to
+#      accept us. The announce flow blocks until accepted, then drops
+#      straight into the normal run loop, so this is the test-friendly
+#      path.
+set -eu
+
+CFG="${RM_AGENT_CONFIG:-/etc/restic-manager/agent.yaml}"
+SERVER="${RM_SERVER:?set RM_SERVER}"
+
+if [ -f "$CFG" ] && grep -q '^agent_token:' "$CFG"; then
+    exec restic-manager-agent -config "$CFG"
+fi
+
+if [ -n "${RM_ENROL_TOKEN:-}" ]; then
+    exec restic-manager-agent -config "$CFG" \
+        -enroll-server "$SERVER" \
+        -enroll-token "$RM_ENROL_TOKEN"
+fi
+
+# Announce-and-approve: blocks until an admin accepts, then runs.
+exec restic-manager-agent -config "$CFG" -enroll-server "$SERVER"
@@ -0,0 +1,113 @@
+# End-to-end test stack — used by .gitea/workflows/e2e.yml and by
+# operators who want to run the Playwright suite locally.
+#
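+# Three long-running services plus two helpers:
+#   * server         — restic-manager built from the working tree
+#   * agent          — restic-manager agent built from the working tree
+#                      (announces; Playwright accepts it during the test)
+#   * rest-server    — the actual restic backend, sibling of the agent
+#   * playwright     — the test runner, gated behind the `test` profile
+#   * source-fixture — one-shot container that seeds the source volume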
+#
+# Run from the repo root:
+#   docker compose -f e2e/compose.e2e.yml up --build --abort-on-container-exit
+
+services:
+  rest-server:
+    image: restic/rest-server:0.13.0
+    environment:
+      DATA_DIR: /data
+      OPTIONS: "--no-auth"
+    volumes:
+      - rest-data:/data
+    networks: [rmnet]
+
+  server:
+    build:
+      context: ..
+      dockerfile: deploy/Dockerfile.server
+      args:
+        VERSION: e2e
+    environment:
+      RM_LISTEN: ":8080"
+      RM_DATA_DIR: "/data"
+      RM_BASE_URL: "http://server:8080"
+      RM_COOKIE_SECURE: "false"
+      # Bind the metrics endpoint loose for the test, so one of the
+      # Playwright assertions can exercise it.
+      RM_METRICS_TRUSTED_CIDR: "0.0.0.0/0"
+    volumes:
+      - server-data:/data
+    ports:
+      - "127.0.0.1:8080:8080"
+    healthcheck:
+      test: ["CMD", "/usr/local/bin/restic-manager-server", "--version"]
+      interval: 2s
+      timeout: 2s
+      retries: 30
+    networks: [rmnet]
+
+  agent:
+    build:
+      context: ..
+      dockerfile: e2e/Dockerfile.agent
+      args:
+        VERSION: e2e
+    environment:
+      RM_SERVER: "http://server:8080"
+    depends_on:
+      - server
+    volumes:
+      # Source paths the agent backs up. Compose pre-populates this
+      # with a few files so the snapshot list isn't empty.
+      - source-data:/source
+      - agent-config:/etc/restic-manager
+      - agent-state:/var/lib/restic-manager
+    networks: [rmnet]
+
+  # Playwright test runner. Profile-gated so `compose up` doesn't
+  # start it; CI invokes it via `compose run` and `docker cp`s the
+  # report+traces out (see .gitea/workflows/e2e.yml). Lives on
+  # rmnet so it can reach the server via its compose-network DNS
+  # name rather than depending on host port-publish (which doesn't
+  # work on Gitea's container-based runners).
+  #
+  # Reports are NOT bind-mounted: when the runner job itself runs
+  # inside a container, `./playwright/...` resolves to a path that
+  # only exists inside the runner container, so the host docker
+  # daemon would silently mount an empty dir. Instead the report
+  # stays inside the playwright container and the workflow extracts
+  # it via `docker cp` before tearing down.
+  playwright:
+    profiles: [test]
+    build:
+      context: ..
+      dockerfile: e2e/Dockerfile.playwright
+    environment:
+      RM_BASE_URL: "http://server:8080"
+      RM_BOOTSTRAP_TOKEN: "${RM_BOOTSTRAP_TOKEN:-}"
+    depends_on:
+      - server
+      - agent
+    networks: [rmnet]
+
+  # One-shot init container that drops a couple of files into the
+  # source volume so backups have something to snapshot.
+  source-fixture:
+    image: alpine:3.20
+    command: >
+      sh -c 'mkdir -p /source && echo "hello world" > /source/hello.txt &&
+             echo "another file" > /source/two.txt && sleep 0.2'
+    volumes:
+      - source-data:/source
+    networks: [rmnet]
+    restart: "no"
+
+volumes:
+  server-data:
+  rest-data:
+  source-data:
+  agent-config:
+  agent-state:
+
+networks:
+  rmnet:
+    driver: bridge
@@ -0,0 +1,14 @@
+{
+  "name": "restic-manager-e2e",
+  "version": "0.0.0",
+  "private": true,
+  "type": "module",
+  "scripts": {
+    "test": "playwright test",
+    "test:headed": "playwright test --headed",
+    "test:debug": "PWDEBUG=1 playwright test"
+  },
+  "devDependencies": {
+    "@playwright/test": "1.59.1"
+  }
+}
@@ -0,0 +1,35 @@
+import { defineConfig, devices } from '@playwright/test';
+
+// Single-target Chromium config: the e2e suite is narrow (smoke
+// the production-shaped flow against the docker-compose stack).
+// Cross-browser matrix doesn't add signal — what we're verifying is
+// the server's HTML and the agent's WebSocket handshake, neither of
+// which depends on browser engine.
+
+const baseURL = process.env.RM_BASE_URL ?? 'http://127.0.0.1:8080';
+
+export default defineConfig({
+    testDir: './tests',
+    // 4 minutes — the smoke test waits for: enrolment + bootstrap
+    // (~5s), auto-init landing (~10s), backup completion (~120s
+    // budget). 60s is far too tight in CI; 4m gives headroom even
+    // on a contended runner without masking real regressions.
+    timeout: 240_000,
+    expect: { timeout: 10_000 },
+    fullyParallel: false,
+    retries: process.env.CI ? 1 : 0,
+    workers: 1,
+    reporter: [['list'], ['html', { open: 'never' }]],
+    use: {
+        baseURL,
+        trace: 'retain-on-failure',
+        screenshot: 'only-on-failure',
+        video: 'retain-on-failure',
+    },
+    projects: [
+        {
+            name: 'chromium',
+            use: { ...devices['Desktop Chrome'] },
+        },
+    ],
+});
@@ -0,0 +1,152 @@
+// Helpers used by every test. The shape favours the JSON API for
+// reads + accept/dispatch (deterministic, easy to assert) and the
+// browser for human-facing surfaces (login form, dashboard render).
+
+import { APIRequestContext, expect, Page } from '@playwright/test';
+
+export const baseURL = process.env.RM_BASE_URL ?? 'http://127.0.0.1:8080';
+
+// Shape of one host entry as these helpers read it from
+// GET /api/hosts. Only the fields the e2e suite inspects are listed.
+export interface HostJSON {
+    id: string;
+    name: string;
+    // The smoke test waits for this to equal 'online'.
+    status: string;
+    // The smoke test waits for this to equal 'ready'.
+    repo_status?: string;
+    // The smoke test waits for this to equal 'succeeded'.
+    last_backup_status?: string;
+}
+
+/**
+ * Returns the bootstrap token from the RM_BOOTSTRAP_TOKEN environment
+ * variable. Throws when the variable is unset or empty.
+ */
+export async function readBootstrapToken(): Promise<string> {
+    const { RM_BOOTSTRAP_TOKEN: token } = process.env;
+    if (token) {
+        return token;
+    }
+    throw new Error('RM_BOOTSTRAP_TOKEN not set — the harness scrapes it from server logs');
+}
+
+/**
+ * Creates the first admin account through POST /api/bootstrap using
+ * the token from readBootstrapToken, and returns the credentials that
+ * were submitted. A 409 response (already bootstrapped) is accepted
+ * as success; any other non-OK response throws.
+ */
+export async function bootstrapAdmin(
+    request: APIRequestContext,
+    {
+        username = 'admin',
+        password = 'e2e-test-password-1234',
+    }: { username?: string; password?: string } = {},
+): Promise<{ username: string; password: string }> {
+    const token = await readBootstrapToken();
+    const response = await request.post(`${baseURL}/api/bootstrap`, {
+        data: { token, username, password },
+    });
+    const alreadyBootstrapped = response.status() === 409;
+    if (!(response.ok() || alreadyBootstrapped)) {
+        throw new Error(`bootstrap: ${response.status()} ${await response.text()}`);
+    }
+    return { username, password };
+}
+
+/**
+ * Logs in through the HTML login form and resolves once the browser
+ * has navigated to the site root (baseURL with an optional trailing
+ * slash).
+ */
+export async function loginViaUI(page: Page, username: string, password: string): Promise<void> {
+    await page.goto(`${baseURL}/login`);
+    await page.locator('#login-username').fill(username);
+    await page.locator('#login-password').fill(password);
+    // baseURL is interpolated into a RegExp, so escape its
+    // metacharacters: unescaped, the dots in "127.0.0.1" would match
+    // any character.
+    const escapedBase = baseURL.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+    await Promise.all([
+        page.waitForURL(new RegExp(`^${escapedBase}/?$`)),
+        page.locator('form[action="/login"] button[type="submit"]').click(),
+    ]);
+}
+
+/**
+ * Waits (up to 60s) for the first pending-host accept form to become
+ * visible on the current page, then returns the pending-id embedded
+ * in that form's action URL. Throws when the form has no action or
+ * the action does not have the expected shape.
+ */
+export async function waitForPendingHostID(page: Page): Promise<string> {
+    const acceptForm = page
+        .locator('form[action^="/api/pending-hosts/"][action$="/accept"]')
+        .first();
+    await expect(acceptForm).toBeVisible({ timeout: 60_000 });
+
+    const actionURL = await acceptForm.getAttribute('action');
+    if (!actionURL) {
+        throw new Error('pending host form has no action attribute');
+    }
+    const match = /\/api\/pending-hosts\/([^/]+)\/accept/.exec(actionURL);
+    if (match === null) {
+        throw new Error(`unexpected action URL: ${actionURL}`);
+    }
+    return match[1];
+}
+
+/**
+ * Accepts a pending host through the JSON API, attaching the given
+ * repository credentials. A missing repo username is sent as the
+ * empty string. Throws on any non-OK response.
+ */
+export async function acceptPending(
+    request: APIRequestContext,
+    cookie: string,
+    pendingID: string,
+    repo: { url: string; username?: string; password: string },
+): Promise<void> {
+    const payload = {
+        repo_url: repo.url,
+        repo_username: repo.username ?? '',
+        repo_password: repo.password,
+    };
+    const response = await request.post(`${baseURL}/api/pending-hosts/${pendingID}/accept`, {
+        headers: { cookie, 'content-type': 'application/json' },
+        data: payload,
+    });
+    if (response.ok()) {
+        return;
+    }
+    throw new Error(`accept: ${response.status()} ${await response.text()}`);
+}
+
+/**
+ * Fetches GET /api/hosts and returns the host list. The list is read
+ * from the response's `items` key, falling back to `hosts`, and
+ * finally to an empty array. Throws on any non-OK response.
+ */
+export async function listHosts(request: APIRequestContext, cookie: string): Promise<HostJSON[]> {
+    const response = await request.get(`${baseURL}/api/hosts`, { headers: { cookie } });
+    if (!response.ok()) {
+        throw new Error(`list hosts: ${response.status()} ${await response.text()}`);
+    }
+    const payload = (await response.json()) as { items?: HostJSON[]; hosts?: HostJSON[] };
+    if (payload.items != null) {
+        return payload.items;
+    }
+    return payload.hosts ?? [];
+}
+
+/**
+ * Polls listHosts once a second until some host satisfies `matcher`,
+ * and returns that host. Throws after `timeoutMs`; the error carries
+ * the first host of the most recent listing for diagnostics.
+ */
+export async function waitForHostStatus(
+    request: APIRequestContext,
+    cookie: string,
+    matcher: (h: HostJSON) => boolean,
+    timeoutMs = 60_000,
+): Promise<HostJSON> {
+    const giveUpAt = Date.now() + timeoutMs;
+    let lastSeen: HostJSON | undefined;
+    for (; Date.now() < giveUpAt; ) {
+        const current = await listHosts(request, cookie);
+        const matched = current.find(matcher);
+        if (matched) {
+            return matched;
+        }
+        lastSeen = current[0];
+        await new Promise((resolve) => setTimeout(resolve, 1_000));
+    }
+    throw new Error(`waitForHostStatus: timeout. Last seen: ${JSON.stringify(lastSeen)}`);
+}
+
+/**
+ * Creates a source group on the given host and returns its id. The
+ * id is read from the response's top-level `id`, falling back to
+ * `group.id`. Retention is sent empty and retries are disabled.
+ * Throws on a non-OK response or when no id can be found.
+ */
+export async function createSourceGroup(
+    request: APIRequestContext,
+    cookie: string,
+    hostID: string,
+    body: { name: string; includes: string[]; excludes?: string[] },
+): Promise<string> {
+    const payload = {
+        name: body.name,
+        includes: body.includes,
+        excludes: body.excludes ?? [],
+        retention_policy: {},
+        retry_max: 0,
+        retry_backoff_seconds: 0,
+    };
+    const response = await request.post(`${baseURL}/api/hosts/${hostID}/source-groups`, {
+        headers: { cookie, 'content-type': 'application/json' },
+        data: payload,
+    });
+    if (!response.ok()) {
+        throw new Error(`createSourceGroup: ${response.status()} ${await response.text()}`);
+    }
+    const parsed = (await response.json()) as { id?: string; group?: { id?: string } };
+    const groupID = parsed.id ?? parsed.group?.id;
+    if (groupID) {
+        return groupID;
+    }
+    throw new Error(`createSourceGroup: no id in response: ${JSON.stringify(parsed)}`);
+}
+
+/**
+ * Requests an immediate run of one source group on one host by
+ * POSTing to its /run endpoint. Throws on any non-OK response.
+ */
+export async function runSourceGroup(
+    request: APIRequestContext,
+    cookie: string,
+    hostID: string,
+    groupID: string,
+): Promise<void> {
+    const runURL = `${baseURL}/api/hosts/${hostID}/source-groups/${groupID}/run`;
+    const response = await request.post(runURL, { headers: { cookie } });
+    if (response.ok()) {
+        return;
+    }
+    throw new Error(`runSourceGroup: ${response.status()} ${await response.text()}`);
+}
+
+/**
+ * Returns the browser context's `rm_session` cookie formatted as a
+ * `name=value` string suitable for a Cookie header. Throws when the
+ * cookie is absent.
+ */
+export async function getSessionCookie(page: Page): Promise<string> {
+    const jar = await page.context().cookies();
+    for (const entry of jar) {
+        if (entry.name === 'rm_session') {
+            return `${entry.name}=${entry.value}`;
+        }
+    }
+    throw new Error('rm_session cookie not set after login');
+}
@@ -0,0 +1,90 @@
+// End-to-end smoke: bootstrap → accept pending host → run backup → see succeeded.
+//
+// The compose stack stands up a server, a sibling rest-server, and an
+// agent in announce-and-approve mode. This test drives the operator
+// path through the UI (login + dashboard) and the API
+// (accept + run-now + poll for terminal) — UI for the human surfaces,
+// API for the deterministic ones.
+
+import { test, expect } from '@playwright/test';
+import {
+    baseURL,
+    bootstrapAdmin,
+    loginViaUI,
+    waitForPendingHostID,
+    acceptPending,
+    waitForHostStatus,
+    createSourceGroup,
+    runSourceGroup,
+    getSessionCookie,
+} from './lib/server';
+
+// Single end-to-end scenario. The steps are strictly ordered: each
+// one consumes state produced by the previous one (admin account →
+// session cookie → accepted host → source group → backup result).
+test.describe('smoke: enrol-via-announce → backup', () => {
+    test('happy path: enrol → accept → backup → succeeded', async ({ page, request }) => {
+        const { username, password } = await bootstrapAdmin(request);
+        await loginViaUI(page, username, password);
+
+        // Dashboard renders.
+        await expect(page.locator('main')).toContainText(/host|fleet|pending/i, { timeout: 10_000 });
+
+        // Pending host appears (the agent container has been
+        // announcing since startup).
+        const pendingID = await waitForPendingHostID(page);
+        // Reuse the browser's session for the API calls below.
+        const cookie = await getSessionCookie(page);
+
+        // Accept with the rest-server creds. compose's rest-server runs
+        // --no-auth, so any credentials work; restic still demands a
+        // password to encrypt the repo.
+        await acceptPending(request, cookie, pendingID, {
+            url: 'rest:http://rest-server:8000/',
+            password: 'e2e-repo-password',
+        });
+
+        // Wait for the host to come online AND for auto-init to
+        // finish. Coming online happens as soon as the agent's
+        // bearer-authed WS attaches (~1s after accept); repo_status
+        // flips to 'ready' once the auto-init job completes (a
+        // couple of seconds later). Loading the host page before
+        // that leaves the Run-backup button disabled because the
+        // server-rendered HTML reflects the still-in-progress init,
+        // and the page has no live-refresh on that field.
+        const readyHost = await waitForHostStatus(
+            request, cookie,
+            (h) => h.status === 'online' && h.repo_status === 'ready',
+            90_000,
+        );
+        expect(readyHost.id).toBeTruthy();
+
+        // Per-host Run-now is gone; backups are dispatched per
+        // source-group now. Create one that maps to the agent's
+        // /source mount, then kick it via the JSON API.
+        const groupID = await createSourceGroup(request, cookie, readyHost.id, {
+            name: 'default',
+            includes: ['/source'],
+        });
+        await runSourceGroup(request, cookie, readyHost.id, groupID);
+
+        // Wait for the host's last_backup_status to flip to 'succeeded'.
+        // The host record is the source of truth: it's what the
+        // dashboard projects from job-completion events on the WS
+        // channel.
+        const finishedHost = await waitForHostStatus(
+            request, cookie,
+            (h) => h.id === readyHost.id && h.last_backup_status === 'succeeded',
+            120_000,
+        );
+        expect(finishedHost.last_backup_status).toBe('succeeded');
+    });
+});
+
+test.describe('smoke: scrape /metrics', () => {
+    test('metrics endpoint exposes the host gauge', async ({ request }) => {
+        // The compose stack sets RM_METRICS_TRUSTED_CIDR=0.0.0.0/0,
+        // which leaves the endpoint reachable from the test runner.
+        const response = await request.get(`${baseURL}/metrics`);
+        expect(response.status()).toBe(200);
+
+        // Both series names must appear somewhere in the exposition.
+        const exposition = await response.text();
+        for (const needle of ['rm_hosts_total', 'rm_build_info{']) {
+            expect(exposition).toContain(needle);
+        }
+    });
+});
@@ -3,22 +3,26 @@ module gitea.dcglab.co.uk/steve/restic-manager
 go 1.25.0

 require (
+	github.com/coder/websocket v1.8.14
+	github.com/coreos/go-oidc/v3 v3.18.0
 	github.com/go-chi/chi/v5 v5.2.5
+	github.com/golang-jwt/jwt/v5 v5.3.1
 	github.com/oklog/ulid/v2 v2.1.1
+	github.com/robfig/cron/v3 v3.0.1
 	golang.org/x/crypto v0.50.0
+	golang.org/x/oauth2 v0.36.0
+	golang.org/x/sys v0.43.0
 	gopkg.in/yaml.v3 v3.0.1
 	modernc.org/sqlite v1.50.0
 )

 require (
-	github.com/coder/websocket v1.8.14 // indirect
 	github.com/dustin/go-humanize v1.0.1 // indirect
+	github.com/go-jose/go-jose/v4 v4.1.4 // indirect
 	github.com/google/uuid v1.6.0 // indirect
 	github.com/mattn/go-isatty v0.0.20 // indirect
 	github.com/ncruces/go-strftime v1.0.0 // indirect
 	github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
-	github.com/robfig/cron/v3 v3.0.1 // indirect
-	golang.org/x/sys v0.43.0 // indirect
 	modernc.org/libc v1.72.0 // indirect
 	modernc.org/mathutil v1.7.1 // indirect
 	modernc.org/memory v1.11.0 // indirect
@@ -1,9 +1,15 @@
 github.com/coder/websocket v1.8.14 h1:9L0p0iKiNOibykf283eHkKUHHrpG7f65OE3BhhO7v9g=
 github.com/coder/websocket v1.8.14/go.mod h1:NX3SzP+inril6yawo5CQXx8+fk145lPDC6pumgx0mVg=
+github.com/coreos/go-oidc/v3 v3.18.0 h1:V9orjXynvu5wiC9SemFTWnG4F45v403aIcjWo0d41+A=
+github.com/coreos/go-oidc/v3 v3.18.0/go.mod h1:DYCf24+ncYi+XkIH97GY1+dqoRlbaSI26KVTCI9SrY4=
 github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
 github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
 github.com/go-chi/chi/v5 v5.2.5 h1:Eg4myHZBjyvJmAFjFvWgrqDTXFyOzjj7YIm3L3mu6Ug=
 github.com/go-chi/chi/v5 v5.2.5/go.mod h1:X7Gx4mteadT3eDOMTsXzmI4/rwUpOwBHLpAfupzFJP0=
+github.com/go-jose/go-jose/v4 v4.1.4 h1:moDMcTHmvE6Groj34emNPLs/qtYXRVcd6S7NHbHz3kA=
+github.com/go-jose/go-jose/v4 v4.1.4/go.mod h1:x4oUasVrzR7071A4TnHLGSPpNOm2a21K9Kf04k1rs08=
+github.com/golang-jwt/jwt/v5 v5.3.1 h1:kYf81DTWFe7t+1VvL7eS+jKFVWaUnK9cB1qbwn63YCY=
+github.com/golang-jwt/jwt/v5 v5.3.1/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE=
 github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs=
 github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
 github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
@@ -25,6 +31,8 @@ golang.org/x/crypto v0.50.0 h1:zO47/JPrL6vsNkINmLoo/PH1gcxpls50DNogFvB5ZGI=
 golang.org/x/crypto v0.50.0/go.mod h1:3muZ7vA7PBCE6xgPX7nkzzjiUq87kRItoJQM1Yo8S+Q=
 golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8=
 golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w=
+golang.org/x/oauth2 v0.36.0 h1:peZ/1z27fi9hUOFCAZaHyrpWG5lwe0RJEEEeH0ThlIs=
+golang.org/x/oauth2 v0.36.0/go.mod h1:YDBUJMTkDnJS+A4BP4eZBjCqtokkg1hODuPjwiGPO7Q=
 golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
 golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
 golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
@@ -62,6 +62,13 @@ type Config struct {
 	LegacyRepoURL      string `yaml:"repo_url,omitempty"`
 	LegacyRepoPassword string `yaml:"repo_password,omitempty"`

+	// AnnounceKey is the base64-encoded Ed25519 private key used by
+	// announce-and-approve enrolment (P2-18). Generated on first
+	// announce, persisted so the agent can re-attach to the same
+	// pending row across restarts. 64 bytes when decoded.
+	// Empty for token-flow enrolments.
+	AnnounceKey string `yaml:"announce_key,omitempty"`
+
 	// path is the file we loaded from. Used by Save.
 	path string `yaml:"-"`
 }
@@ -0,0 +1,81 @@
+package runner
+
+import (
+	"context"
+	"strings"
+	"testing"
+	"time"
+
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
+)
+
+// (fakeSender is defined in runner_test.go; it's already lock-protected
+// because the runner's stdout + stderr pump goroutines call Send
+// concurrently. The original local 'safeSender' here was a workaround
+// from before fakeSender itself grew the mutex.)
+
+// TestRunBackupCanceledMidRunReportsCanceled spawns a backup against
+// a fake restic that sleeps for 30 seconds, cancels the context after
+// a short delay, and confirms the resulting job.finished envelope
+// reports status=canceled (not failed), exit code 130, and an empty
+// error string.
+func TestRunBackupCanceledMidRunReportsCanceled(t *testing.T) {
+	t.Parallel()
+
+	// Fake restic: replace the shell with a long sleep via `exec` so the
+	// process tree is one process — SIGTERM goes directly to sleep and
+	// it exits. Without `exec`, the shell stays in the foreground while
+	// sleep is its child; SIGTERM-to-shell may or may not propagate to
+	// sleep depending on the shell, leading to the WaitDelay-then-
+	// SIGKILL fallback path firing — slower and noisier.
+	bin := setupScript(t, `exec sleep 30`)
+
+	tx := &fakeSender{}
+	r := New(Config{ResticBin: bin}, tx, 0)
+
+	ctx, cancel := context.WithCancel(context.Background())
+	// Buffered so the goroutine can finish its send even if the test
+	// has already failed on the timeout branch below.
+	done := make(chan error, 1)
+	go func() {
+		done <- r.RunBackup(ctx, "job-cancel", []string{"/tmp/x"}, nil, nil, BackupHooks{})
+	}()
+
+	// Wait long enough for the subprocess to actually start before
+	// canceling. Without this, exec.CommandContext can race the
+	// kill against Start and produce a different error path.
+	time.Sleep(150 * time.Millisecond)
+	cancel()
+
+	// RunBackup's own return value is not inspected; only the
+	// envelopes it sent are asserted on.
+	select {
+	case <-done:
+	case <-time.After(15 * time.Second):
+		t.Fatal("RunBackup did not return within 15s of cancel")
+	}
+
+	// Locate the job.finished envelope and check its status.
+	envs := tx.snapshot()
+	var finEnv api.Envelope
+	var found bool
+	for _, e := range envs {
+		if e.Type == api.MsgJobFinished {
+			finEnv = e
+			found = true
+			break
+		}
+	}
+	if !found {
+		t.Fatal("no job.finished envelope was sent")
+	}
+	var fin api.JobFinishedPayload
+	if err := finEnv.UnmarshalPayload(&fin); err != nil {
+		t.Fatalf("unmarshal: %v", err)
+	}
+	if fin.Status != api.JobCancelled {
+		t.Fatalf("status: got %q, want %q", fin.Status, api.JobCancelled)
+	}
+	if fin.ExitCode != 130 {
+		t.Errorf("exit_code: got %d, want 130 (POSIX cancel convention)", fin.ExitCode)
+	}
+	// The error message should be empty for canceled jobs (see runner.sendFinished).
+	// NOTE(review): strings.HasPrefix with an empty prefix is always
+	// true, so the first operand is always false and only
+	// fin.Error != "" decides. Presumably it is kept so the strings
+	// import stays referenced — confirm before simplifying.
+	if !strings.HasPrefix(fin.Error, "") || fin.Error != "" {
+		t.Errorf("error: got %q, want empty for canceled jobs", fin.Error)
+	}
+}
@@ -0,0 +1,106 @@
+// hooks.go — pre/post backup hooks for the agent runner (P2R-11).
+//
+// Hooks fire only for backup jobs (the runner's other kinds —
+// init/forget/prune/check/unlock — call shell scripts that touch
+// repo internals; running operator hooks for those would be
+// surprising). Hook bodies arrive plaintext on the wire (server
+// decrypted before the WS push). The agent never persists them
+// to disk; they live in memory for the lifetime of one job.
+//
+// Failure semantics:
+//   - pre_hook non-zero exit aborts the backup: the runner returns
+//     the error, the job is recorded as failed, and the actual
+//     restic invocation never runs.
+//   - post_hook non-zero exit is logged with a warning prefix in
+//     the job log but does NOT change the job status — the operator
+//     wants the backup result preserved even if the cleanup step
+//     misbehaved.
+//
+// Streaming: each line of the hook's stdout/stderr is shipped as a
+// log.stream envelope with payload prefixed `hook(<phase>): ` so the
+// live log viewer can visually separate it from restic's own output.
+package runner
+
+import (
+	"bufio"
+	"context"
+	"fmt"
+	"io"
+	"os/exec"
+	"runtime"
+	"sync/atomic"
+	"time"
+
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
+)
+
+// runHook executes script via the host shell (see defaultShell) and
+// streams its stdout and stderr through pumpHookLines.
+//
+// The child's environment is built from scratch: a fixed PATH plus
+// RM_JOB_ID and RM_HOOK_PHASE, and RM_JOB_STATUS when status is
+// non-empty (empty for pre-hooks; the final job status — "succeeded"
+// or "failed" — for post-hooks). Nothing is inherited from the
+// agent's own environment.
+//
+// An empty script is a no-op and returns nil. Otherwise an error is
+// returned when a pipe cannot be created, the shell cannot be
+// started, or cmd.Wait reports a failure (typically a non-zero
+// exit). ctx cancellation kills the subprocess.
+func (r *Runner) runHook(ctx context.Context, jobID, phase, script, status string, seq *atomic.Int64) error {
+	if script == "" {
+		return nil
+	}
+	shell, flag := defaultShell()
+	cmd := exec.CommandContext(ctx, shell, flag, script)
+	// Assigning cmd.Env replaces the inherited environment entirely.
+	cmd.Env = []string{
+		"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
+	}
+	if status != "" {
+		cmd.Env = append(cmd.Env, "RM_JOB_STATUS="+status)
+	}
+	cmd.Env = append(cmd.Env, "RM_JOB_ID="+jobID, "RM_HOOK_PHASE="+phase)
+
+	// Both pipes must be obtained before Start.
+	stdout, err := cmd.StdoutPipe()
+	if err != nil {
+		return fmt.Errorf("hook %s: stdout pipe: %w", phase, err)
+	}
+	stderr, err := cmd.StderrPipe()
+	if err != nil {
+		return fmt.Errorf("hook %s: stderr pipe: %w", phase, err)
+	}
+	if err := cmd.Start(); err != nil {
+		return fmt.Errorf("hook %s: start: %w", phase, err)
+	}
+	// One pump per stream. Wait for both to finish reading before
+	// calling cmd.Wait, because Wait closes the pipes.
+	done := make(chan struct{}, 2)
+	go func() { r.pumpHookLines(stdout, "stdout", phase, jobID, seq); done <- struct{}{} }()
+	go func() { r.pumpHookLines(stderr, "stderr", phase, jobID, seq); done <- struct{}{} }()
+	<-done
+	<-done
+	if werr := cmd.Wait(); werr != nil {
+		return fmt.Errorf("hook %s exited non-zero: %w", phase, werr)
+	}
+	return nil
+}
+
+// pumpHookLines reads rd line by line and ships each line as a
+// log.stream envelope whose payload is prefixed with
+// "hook(<phase>): " so the live log can visually separate hook output
+// from restic's own. stream names the source ("stdout" or "stderr")
+// and each line takes the next value of seq.
+//
+// A line that cannot be marshalled is skipped, and Send errors are
+// deliberately discarded: a log-shipping problem must not fail the
+// hook itself.
+//
+// If the scanner stops early (for example on a line longer than its
+// 256 KiB limit) the rest of rd is drained and discarded. runHook
+// waits for both pumps before calling cmd.Wait, so leaving the pipe
+// unread would block the hook on a full pipe and stall the job.
+func (r *Runner) pumpHookLines(rd io.Reader, stream, phase, jobID string, seq *atomic.Int64) {
+	scanner := bufio.NewScanner(rd)
+	scanner.Buffer(make([]byte, 0, 64*1024), 256*1024)
+	prefix := "hook(" + phase + "): "
+	for scanner.Scan() {
+		env, err := api.Marshal(api.MsgLogStream, "", api.LogStreamLine{
+			JobID:   jobID,
+			Seq:     seq.Add(1),
+			TS:      time.Now().UTC(),
+			Stream:  api.LogStream(stream),
+			Payload: prefix + scanner.Text(),
+		})
+		if err != nil {
+			// Nothing useful to send; keep pumping the remaining lines.
+			continue
+		}
+		_ = r.tx.Send(env)
+	}
+	if scanner.Err() != nil {
+		// Keep the child unblocked; the discarded bytes are lost to
+		// the log but the hook can still run to completion.
+		_, _ = io.Copy(io.Discard, rd)
+	}
+}
+
+// defaultShell picks the bootstrap interpreter for hook scripts and
+// the flag that makes it run a single command string, i.e. the pair
+// used as `<shell> <flag> "<script>"`: cmd.exe /C on Windows and
+// /bin/sh -c everywhere else. Hook authors who want PowerShell, bash,
+// or anything else invoke it from inside the script body.
+func defaultShell() (string, string) {
+	switch runtime.GOOS {
+	case "windows":
+		return "cmd.exe", "/C"
+	default:
+		return "/bin/sh", "-c"
+	}
+}
@@ -0,0 +1,90 @@
+// hooks_test.go — pre/post backup hook semantics (P2R-11).
+package runner
+
+import (
+	"context"
+	"strings"
+	"testing"
+
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
+)
+
+// TestPreHookFailureAbortsBackup: pre_hook exits 1 → restic never
+// runs, job is recorded failed with the hook's error.
+//
+// Payload decode errors fail the test outright. A silently ignored
+// decode failure would leave the payload zero-valued and let the
+// "restic must not have run" scan pass without inspecting anything.
+func TestPreHookFailureAbortsBackup(t *testing.T) {
+	t.Parallel()
+	// The fake restic prints a marker on every invocation. If restic
+	// was called we'll see "restic-was-here" in the captured log.
+	bin := setupScript(t, `echo "restic-was-here"`)
+	tx := &fakeSender{}
+	r := New(Config{ResticBin: bin}, tx, 0)
+
+	err := r.RunBackup(context.Background(), "job-pre",
+		[]string{"/etc"}, nil, []string{"tag"},
+		BackupHooks{Pre: "exit 1"})
+	if err == nil {
+		t.Fatal("expected RunBackup to return an error from failed pre_hook")
+	}
+	if !strings.Contains(err.Error(), "pre_hook failed") {
+		t.Fatalf("error message: %q (want 'pre_hook failed')", err)
+	}
+	// job.finished arrived with status=failed.
+	finEnv := firstEnvOfType(t, tx.envs, api.MsgJobFinished)
+	var fin api.JobFinishedPayload
+	if uerr := finEnv.UnmarshalPayload(&fin); uerr != nil {
+		t.Fatalf("unmarshal job.finished: %v", uerr)
+	}
+	if fin.Status != api.JobFailed {
+		t.Fatalf("status: %q, want failed", fin.Status)
+	}
+	// restic must NOT have run.
+	for _, env := range tx.envs {
+		if env.Type != api.MsgLogStream {
+			continue
+		}
+		var l api.LogStreamLine
+		if uerr := env.UnmarshalPayload(&l); uerr != nil {
+			t.Fatalf("unmarshal log.stream: %v", uerr)
+		}
+		if strings.Contains(l.Payload, "restic-was-here") {
+			t.Fatal("restic was invoked despite pre_hook failure")
+		}
+	}
+}
+
+// TestPostHookRunsAfterBackup: post_hook fires after a successful
+// backup and receives RM_JOB_STATUS=succeeded in the env.
+//
+// The hook echoes its environment, and the test looks for that line
+// among the log.stream envelopes. A payload that fails to decode
+// fails the test immediately instead of being skipped, so a broken
+// envelope is reported as such rather than as "output not found".
+func TestPostHookRunsAfterBackup(t *testing.T) {
+	t.Parallel()
+	// Fake restic: canned JSON for the subcommands this test expects
+	// the runner to issue, success for everything else.
+	bin := setupScript(t, `
+case "$1" in
+  backup)    echo '{"message_type":"summary","snapshot_id":"abc"}' ;;
+  snapshots) echo '[]' ;;
+  stats)     echo '{"total_size":0,"total_uncompressed_size":0,"snapshots_count":0,"total_file_count":0,"total_blob_count":0}' ;;
+  *)         exit 0 ;;
+esac
+`)
+	tx := &fakeSender{}
+	r := New(Config{ResticBin: bin}, tx, 0)
+
+	post := `echo "post-status=$RM_JOB_STATUS phase=$RM_HOOK_PHASE"`
+	if err := r.RunBackup(context.Background(), "job-post",
+		[]string{"/etc"}, nil, nil, BackupHooks{Post: post}); err != nil {
+		t.Fatalf("RunBackup: %v", err)
+	}
+
+	// Walk log.stream envelopes; one of them should be the post-hook
+	// line with the expected status.
+	var found bool
+	for _, env := range tx.envs {
+		if env.Type != api.MsgLogStream {
+			continue
+		}
+		var l api.LogStreamLine
+		if err := env.UnmarshalPayload(&l); err != nil {
+			t.Fatalf("unmarshal log.stream: %v", err)
+		}
+		if strings.Contains(l.Payload, "post-status=succeeded") &&
+			strings.Contains(l.Payload, "phase=post") {
+			found = true
+			break
+		}
+	}
+	if !found {
+		t.Fatal("post_hook output not found in log.stream envelopes")
+	}
+}
@@ -0,0 +1,266 @@
+package runner
+
+import (
+	"context"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
+)
+
+// TestRunRestoreShipsExpectedEnvelopes: a fake restic emits two
+// restore status lines and a summary; the runner must ship started →
+// progress → finished in that order and finish the job successfully.
+func TestRunRestoreShipsExpectedEnvelopes(t *testing.T) {
+	t.Parallel()
+
+	bin := setupScript(t, `
+case "$1" in
+  restore)
+    echo '{"message_type":"status","seconds_elapsed":1,"percent_done":0.5,"total_files":10,"files_restored":5,"total_bytes":1000,"bytes_restored":500}'
+    echo '{"message_type":"status","seconds_elapsed":2,"percent_done":1.0,"total_files":10,"files_restored":10,"total_bytes":1000,"bytes_restored":1000}'
+    echo '{"message_type":"summary","seconds_elapsed":2,"total_files":10,"files_restored":10,"total_bytes":1000,"bytes_restored":1000}'
+    ;;
+  *)
+    echo "unknown: $*" ;;
+esac
+`)
+
+	tx := &fakeSender{}
+	r := New(Config{ResticBin: bin}, tx, 0)
+
+	if err := r.RunRestore(context.Background(), "job-r1", "f3a7b2c1",
+		[]string{"/etc/nginx/sites-available/alfa.conf"},
+		false, "/tmp/restore-out"); err != nil {
+		t.Fatalf("RunRestore: %v", err)
+	}
+
+	// Compare the FIRST position of each landmark: started < progress < finished.
+	order := envelopeOrder(tx.envs)
+	wants := []api.MessageType{api.MsgJobStarted, api.MsgJobProgress, api.MsgJobFinished}
+	positions := map[api.MessageType]int{}
+	for i, mt := range order {
+		if _, seen := positions[mt]; !seen {
+			positions[mt] = i
+		}
+	}
+	for i := 0; i < len(wants)-1; i++ {
+		a, b := wants[i], wants[i+1]
+		pa, aOK := positions[a]
+		pb, bOK := positions[b]
+		if !aOK {
+			t.Fatalf("envelope %q not found in %v", a, order)
+		}
+		if !bOK {
+			t.Fatalf("envelope %q not found in %v", b, order)
+		}
+		if pa >= pb {
+			t.Fatalf("expected %q before %q (positions %d, %d)", a, b, pa, pb)
+		}
+	}
+
+	// Started carries the right kind.
+	startEnv := firstEnvOfType(t, tx.envs, api.MsgJobStarted)
+	var startP api.JobStartedPayload
+	if err := startEnv.UnmarshalPayload(&startP); err != nil {
+		t.Fatalf("unmarshal started: %v", err)
+	}
+	if startP.Kind != api.JobRestore {
+		t.Fatalf("kind: got %q want %q", startP.Kind, api.JobRestore)
+	}
+
+	// Finished is succeeded.
+	finEnv := firstEnvOfType(t, tx.envs, api.MsgJobFinished)
+	var finP api.JobFinishedPayload
+	if err := finEnv.UnmarshalPayload(&finP); err != nil {
+		t.Fatalf("unmarshal finished: %v", err)
+	}
+	if finP.Status != api.JobSucceeded {
+		t.Fatalf("status: got %q want %q", finP.Status, api.JobSucceeded)
+	}
+	// Inspect the FIRST progress envelope (firstEnvOfType), not the last one.
+	progEnv := firstEnvOfType(t, tx.envs, api.MsgJobProgress)
+	var progP api.JobProgressPayload
+	if err := progEnv.UnmarshalPayload(&progP); err != nil {
+		t.Fatalf("unmarshal progress: %v", err)
+	}
+	// With progressMinPeriod=0 the first status line (50%) is shipped
+	// immediately, so only assert that the counters are non-zero.
+	if progP.PercentDone <= 0 {
+		t.Fatalf("expected non-zero progress, got %v", progP.PercentDone)
+	}
+	if progP.FilesDone <= 0 || progP.TotalFiles <= 0 {
+		t.Fatalf("expected file counters set, got %+v", progP)
+	}
+}
+
+// TestRunRestoreInPlaceArgvHasNoNoOwnership checks, indirectly, that an
+// in-place restore is invoked with --target / and without
+// --no-ownership. The fake restic prints its arguments on stderr, the
+// runner forwards stderr as log.stream, and the test inspects that
+// forwarded line instead of the real argv.
+func TestRunRestoreInPlaceArgvHasNoNoOwnership(t *testing.T) {
+	t.Parallel()
+
+	bin := setupScript(t, `
+case "$1" in
+  restore)
+    # Print all args on stderr so they're forwarded as log.stream.
+    echo "argv: $*" 1>&2
+    echo '{"message_type":"summary","seconds_elapsed":0,"total_files":0,"files_restored":0,"total_bytes":0,"bytes_restored":0}'
+    ;;
+esac
+`)
+
+	tx := &fakeSender{}
+	r := New(Config{ResticBin: bin}, tx, 0)
+	err := r.RunRestore(context.Background(), "job-r2", "abc", nil, true, "")
+	if err != nil {
+		t.Fatalf("RunRestore: %v", err)
+	}
+
+	// Pick the first stderr log line that carries the echoed argv.
+	argv := ""
+	for _, env := range tx.envs {
+		if env.Type != api.MsgLogStream {
+			continue
+		}
+		var line api.LogStreamLine
+		_ = env.UnmarshalPayload(&line)
+		if line.Stream != api.LogStderr || !strings.HasPrefix(line.Payload, "argv:") {
+			continue
+		}
+		argv = line.Payload
+		break
+	}
+	if argv == "" {
+		t.Fatal("never captured argv echo from fake restic")
+	}
+	if strings.Contains(argv, "--no-ownership") {
+		t.Errorf("in-place restore should NOT pass --no-ownership; got argv=%q", argv)
+	}
+	if !strings.Contains(argv, "--target /") {
+		t.Errorf("in-place restore should pass --target /; got argv=%q", argv)
+	}
+}
+
+// TestRunRestoreNewDirArgvShape: a restore into a new directory must
+// pass --target <dir> and one --include per selected path. With the
+// default Config used here, --no-ownership must be absent (the flag
+// arrived in restic 0.17 and older versions reject it — restore.go
+// carries the longer explanation).
+func TestRunRestoreNewDirArgvShape(t *testing.T) {
+	t.Parallel()
+
+	bin := setupScript(t, `
+case "$1" in
+  restore)
+    echo "argv: $*" 1>&2
+    echo '{"message_type":"summary","seconds_elapsed":0,"total_files":0,"files_restored":0,"total_bytes":0,"bytes_restored":0}'
+    ;;
+esac
+`)
+	tx := &fakeSender{}
+	r := New(Config{ResticBin: bin}, tx, 0)
+	err := r.RunRestore(context.Background(), "job-r3", "abc",
+		[]string{"/etc/foo"}, false, "/tmp/restore-out")
+	if err != nil {
+		t.Fatalf("RunRestore: %v", err)
+	}
+
+	// Pick the first stderr log line that carries the echoed argv.
+	argv := ""
+	for _, env := range tx.envs {
+		if env.Type != api.MsgLogStream {
+			continue
+		}
+		var line api.LogStreamLine
+		_ = env.UnmarshalPayload(&line)
+		if line.Stream != api.LogStderr || !strings.HasPrefix(line.Payload, "argv:") {
+			continue
+		}
+		argv = line.Payload
+		break
+	}
+	if argv == "" {
+		t.Fatal("no argv echo")
+	}
+	if strings.Contains(argv, "--no-ownership") {
+		t.Errorf("restic 0.16 doesn't accept --no-ownership; got argv=%q", argv)
+	}
+	if !strings.Contains(argv, "--target /tmp/restore-out") {
+		t.Errorf("expected --target /tmp/restore-out; got argv=%q", argv)
+	}
+	if !strings.Contains(argv, "--include /etc/foo") {
+		t.Errorf("expected --include /etc/foo; got argv=%q", argv)
+	}
+}
+
+// TestRunRestoreNewDirAutoCreatesTarget: restoring into a directory
+// chain that does not exist yet must succeed and leave that directory
+// on disk, so operators don't have to pre-create the per-job subdir.
+func TestRunRestoreNewDirAutoCreatesTarget(t *testing.T) {
+	t.Parallel()
+	bin := setupScript(t, `
+case "$1" in
+  restore)
+    echo '{"message_type":"summary","seconds_elapsed":0,"total_files":0,"files_restored":0,"total_bytes":0,"bytes_restored":0}'
+    ;;
+esac
+`)
+	tx := &fakeSender{}
+	r := New(Config{ResticBin: bin}, tx, 0)
+
+	// Three levels below a fresh temp dir; none of them exist yet.
+	target := filepath.Join(t.TempDir(), "deep", "deeper", "deepest")
+	runErr := r.RunRestore(context.Background(), "job-rmkdir", "abc",
+		[]string{"/etc/foo"}, false, target)
+	if runErr != nil {
+		t.Fatalf("RunRestore: %v", runErr)
+	}
+
+	info, statErr := os.Stat(target)
+	if statErr != nil {
+		t.Fatalf("expected target dir to exist: %v", statErr)
+	}
+	if !info.IsDir() {
+		t.Fatalf("expected directory, got %v", info.Mode())
+	}
+}
+
+// TestRunDiffShipsLogLines: diff output is forwarded as log.stream,
+// bracketed by job.started (kind=diff) and a succeeded job.finished.
+func TestRunDiffShipsLogLines(t *testing.T) {
+	t.Parallel()
+	bin := setupScript(t, `
+case "$1" in
+  diff)
+    echo '{"message_type":"change","path":"/etc/nginx/nginx.conf","modifier":"M"}'
+    echo '{"message_type":"statistics","added":{"files":0,"dirs":0}}'
+    ;;
+esac
+`)
+	tx := &fakeSender{}
+	r := New(Config{ResticBin: bin}, tx, 0)
+	if err := r.RunDiff(context.Background(), "job-d1", "snap-a", "snap-b"); err != nil {
+		t.Fatalf("RunDiff: %v", err)
+	}
+
+	// Decode failures are reported rather than ignored: a zero-valued
+	// payload would otherwise produce a misleading kind/status message.
+	startEnv := firstEnvOfType(t, tx.envs, api.MsgJobStarted)
+	var startP api.JobStartedPayload
+	if err := startEnv.UnmarshalPayload(&startP); err != nil {
+		t.Fatalf("unmarshal started: %v", err)
+	}
+	if startP.Kind != api.JobDiff {
+		t.Fatalf("kind: got %q want %q", startP.Kind, api.JobDiff)
+	}
+	finEnv := firstEnvOfType(t, tx.envs, api.MsgJobFinished)
+	var finP api.JobFinishedPayload
+	if err := finEnv.UnmarshalPayload(&finP); err != nil {
+		t.Fatalf("unmarshal finished: %v", err)
+	}
+	if finP.Status != api.JobSucceeded {
+		t.Fatalf("status: %q", finP.Status)
+	}
+	// At least one log line should carry the change payload.
+	var sawChange bool
+	for _, e := range tx.envs {
+		if e.Type != api.MsgLogStream {
+			continue
+		}
+		var p api.LogStreamLine
+		if err := e.UnmarshalPayload(&p); err != nil {
+			t.Fatalf("unmarshal log.stream: %v", err)
+		}
+		if strings.Contains(p.Payload, `"message_type":"change"`) {
+			sawChange = true
+			break // one match is all the assertion needs
+		}
+	}
+	if !sawChange {
+		t.Fatal("never saw a change log line in diff output")
+	}
+}
@@ -26,10 +26,22 @@ type Sender interface {
 // from the agent's config file (server-pushed config.update payloads
 // override these in memory).
 type Config struct {
-	ResticBin    string
-	RepoURL      string
-	RepoUsername string
-	RepoPassword string
+	ResticBin     string
+	ResticVersion string // e.g. "0.17.1" — empty if unknown
+	RepoURL       string
+	RepoUsername  string
+	RepoPassword  string
+
+	// SupportsRestoreNoOwnership comes from a startup probe of
+	// `restic restore --help`; gates the new-dir-restore flag without
+	// relying on version sniffing.
+	SupportsRestoreNoOwnership bool
+
+	// Bandwidth caps in KB/s applied to every restic invocation.
+	// <=0 means "no cap". Per-job override: callers that build a
+	// runner per-dispatch can pass the override value here directly.
+	LimitUploadKBps   int
+	LimitDownloadKBps int
 }

 // Runner owns the restic invocations.
@@ -54,10 +66,14 @@ func New(cfg Config, tx Sender, progressMinPeriod time.Duration) *Runner {
 // resticEnv builds the shared restic.Env from r.cfg.
 func (r *Runner) resticEnv() restic.Env {
 	return restic.Env{
-		Bin:          r.cfg.ResticBin,
-		RepoURL:      r.cfg.RepoURL,
-		RepoUsername: r.cfg.RepoUsername,
-		RepoPassword: r.cfg.RepoPassword,
+		Bin:                        r.cfg.ResticBin,
+		Version:                    r.cfg.ResticVersion,
+		RepoURL:                    r.cfg.RepoURL,
+		RepoUsername:               r.cfg.RepoUsername,
+		RepoPassword:               r.cfg.RepoPassword,
+		SupportsRestoreNoOwnership: r.cfg.SupportsRestoreNoOwnership,
+		LimitUploadKBps:            r.cfg.LimitUploadKBps,
+		LimitDownloadKBps:          r.cfg.LimitDownloadKBps,
 	}
 }

@@ -87,8 +103,10 @@ func (r *Runner) streamHandler(jobID string, seq *atomic.Int64) restic.LineHandl
 }

 // sendFinished ships a job.finished envelope. err==nil → succeeded;
-// otherwise failed. statsBlob is forwarded as JobFinishedPayload.Stats.
-func (r *Runner) sendFinished(jobID string, finishedAt time.Time, err error, statsBlob json.RawMessage) {
+// otherwise failed (or canceled if ctx was canceled — operator
+// hit the Cancel button or the agent is shutting down).
+// statsBlob is forwarded as JobFinishedPayload.Stats.
+func (r *Runner) sendFinished(ctx context.Context, jobID string, finishedAt time.Time, err error, statsBlob json.RawMessage) {
 	status := api.JobSucceeded
 	exit := 0
 	errMsg := ""
@@ -96,6 +114,16 @@ func (r *Runner) sendFinished(jobID string, finishedAt time.Time, err error, sta
 		status = api.JobFailed
 		exit = -1
 		errMsg = err.Error()
+		// If the context was canceled, the failure is operator-driven
+		// (or shutdown). Surface as JobCancelled so the UI shows a
+		// neutral "canceled" state rather than a red "failed" one.
+		// exec.CommandContext returns the process's exit error on
+		// ctx-cancel, which we'd otherwise rebadge as failed.
+		if ctxErr := ctx.Err(); ctxErr != nil {
+			status = api.JobCancelled
+			exit = 130  // 128+SIGINT(2): the shell's exit status for an interrupted process
+			errMsg = "" // no need to surface the underlying restic error
+		}
 	}
 	finEnv, _ := api.Marshal(api.MsgJobFinished, jobID, api.JobFinishedPayload{
 		JobID:      jobID,
@@ -108,16 +136,35 @@ func (r *Runner) sendFinished(jobID string, finishedAt time.Time, err error, sta
 	_ = r.tx.Send(finEnv)
 }

+// BackupHooks carries the optional pre/post shell snippets RunBackup
+// runs around restic. An empty field skips that phase. Values are
+// resolved server-side (group → host default) before dispatch; the
+// agent just executes whatever arrives in the payload.
+type BackupHooks struct {
+	Pre  string
+	Post string
+}
+
 // RunBackup executes a backup job and reports back via the sender.
 // Returns nil on a clean (or "incomplete-but-snapshot-created") finish.
-func (r *Runner) RunBackup(ctx context.Context, jobID string, paths, excludes, tags []string) error {
+func (r *Runner) RunBackup(ctx context.Context, jobID string, paths, excludes, tags []string, hooks BackupHooks) error {
 	startedAt := time.Now().UTC()
 	r.sendStarted(jobID, api.JobBackup, startedAt)

-	env := r.resticEnv()
-
 	var seq atomic.Int64
-	lastProgress := time.Now()
+
+	// pre_hook: non-zero exit aborts the backup. The job is recorded
+	// as failed with the hook's error and restic never runs.
+	if hooks.Pre != "" {
+		if err := r.runHook(ctx, jobID, "pre", hooks.Pre, "", &seq); err != nil {
+			finishedAt := time.Now().UTC()
+			r.sendFinished(ctx, jobID, finishedAt, err, nil)
+			return fmt.Errorf("pre_hook failed: %w", err)
+		}
+	}
+
+	env := r.resticEnv()
+	lastProgress := time.Time{} // zero time → first status event always emits

 	handle := func(stream string, line string, ev any) {
 		// Throttled progress events come from restic's `status` JSON.
@@ -165,7 +212,21 @@ func (r *Runner) RunBackup(ctx context.Context, jobID string, paths, excludes, t
 	if summary != nil {
 		statsBlob, _ = json.Marshal(summary)
 	}
-	r.sendFinished(jobID, finishedAt, err, statsBlob)
+
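+	// post_hook: always runs regardless of backup outcome. Receives
+	// RM_JOB_STATUS=succeeded|failed in env (a canceled backup is
+	// reported to the hook as "failed", even though job.finished then
+	// says canceled). Non-zero exit is logged but does not change the
+	// recorded job status.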
+	if hooks.Post != "" {
+		status := "succeeded"
+		if err != nil {
+			status = "failed"
+		}
+		if perr := r.runHook(ctx, jobID, "post", hooks.Post, status, &seq); perr != nil {
+			slog.Warn("runner: post_hook exited non-zero", "job_id", jobID, "err", perr)
+		}
+	}
+
+	r.sendFinished(ctx, jobID, finishedAt, err, statsBlob)

 	// On a successful backup, refresh the server's snapshot projection.
 	// We do this *after* job.finished so the UI sees the job land first;
@@ -199,7 +260,7 @@ func (r *Runner) RunInit(ctx context.Context, jobID string) error {
 	var seq atomic.Int64
 	err := env.RunInit(ctx, r.streamHandler(jobID, &seq))
 	finishedAt := time.Now().UTC()
-	r.sendFinished(jobID, finishedAt, err, nil)
+	r.sendFinished(ctx, jobID, finishedAt, err, nil)
 	if err != nil {
 		return fmt.Errorf("runner init: %w", err)
 	}
@@ -221,7 +282,7 @@ func (r *Runner) RunForget(ctx context.Context, jobID string, groups []restic.Fo
 	var seq atomic.Int64
 	err := env.RunForget(ctx, groups, r.streamHandler(jobID, &seq))
 	finishedAt := time.Now().UTC()
-	r.sendFinished(jobID, finishedAt, err, nil)
+	r.sendFinished(ctx, jobID, finishedAt, err, nil)

 	// Refresh the server's snapshot projection — forget rewrites the
 	// index so the host's snapshot list almost certainly shrunk.
@@ -259,7 +320,7 @@ func (r *Runner) RunPrune(ctx context.Context, jobID string) error {
 		}
 	}

-	r.sendFinished(jobID, finishedAt, err, nil)
+	r.sendFinished(ctx, jobID, finishedAt, err, nil)

 	if err != nil {
 		return fmt.Errorf("runner prune: %w", err)
@@ -298,7 +359,7 @@ func (r *Runner) RunCheck(ctx context.Context, jobID string, subsetPct int) erro
 		slog.Warn("runner: stats.report after check failed", "job_id", jobID, "err", rerr)
 	}

-	r.sendFinished(jobID, finishedAt, err, nil)
+	r.sendFinished(ctx, jobID, finishedAt, err, nil)

 	if err != nil {
 		return fmt.Errorf("runner check: %w", err)
@@ -306,6 +367,102 @@ func (r *Runner) RunCheck(ctx context.Context, jobID string, subsetPct int) erro
 	return nil
 }

+// RunRestore executes a restic restore job and reports back via the
+// sender. paths is the operator-selected file/dir list to restore;
+// snapshotID, inPlace and targetDir are handed to restic.Env.RunRestore
+// unchanged. inPlace=true restores at "/" preserving uid/gid/mode;
+// inPlace=false restores into targetDir.
+// NOTE(review): whether --no-ownership is added for inPlace=false is
+// decided inside restic.Env (see Config.SupportsRestoreNoOwnership);
+// the runner tests expect the flag to be absent when that is false.
+//
+// Envelopes sent: job.started, then for every handler callback either
+// a log.stream line (anything that is not a restore status event) or
+// a throttled job.progress (status events, at most one per
+// progressMinPeriod; the raw status line is never logged because it
+// would drown the log on a fast restore), and finally job.finished
+// carrying the JSON-encoded summary when restic produced one.
+//
+// Returns nil on success, otherwise the restic error wrapped with
+// "runner restore:".
+func (r *Runner) RunRestore(ctx context.Context, jobID, snapshotID string, paths []string, inPlace bool, targetDir string) error {
+	r.sendStarted(jobID, api.JobRestore, time.Now().UTC())
+
+	env := r.resticEnv()
+	var seq atomic.Int64
+	var lastProgress time.Time // zero time → first status event always emits
+
+	handle := func(stream string, line string, ev any) {
+		status, isStatus := ev.(restic.RestoreStatus)
+		if !isStatus {
+			// Ordinary output: forward verbatim as a log line.
+			ts := time.Now().UTC()
+			logEnv, _ := api.Marshal(api.MsgLogStream, "", api.LogStreamLine{
+				JobID:   jobID,
+				Seq:     seq.Add(1),
+				TS:      ts,
+				Stream:  api.LogStream(stream),
+				Payload: line,
+			})
+			_ = r.tx.Send(logEnv)
+			return
+		}
+
+		// Status event: drop it if the previous progress is too recent.
+		if time.Since(lastProgress) < r.progressMinPeriod {
+			return
+		}
+		lastProgress = time.Now()
+		progress := api.JobProgressPayload{
+			JobID:         jobID,
+			PercentDone:   status.PercentDone,
+			FilesDone:     status.FilesRestored,
+			TotalFiles:    status.TotalFiles,
+			BytesDone:     status.BytesRestored,
+			TotalBytes:    status.TotalBytes,
+			ETASeconds:    estimateETA(status.BytesRestored, status.TotalBytes, status.SecondsElapsed),
+			ThroughputBps: throughput(status.BytesRestored, status.SecondsElapsed),
+		}
+		progEnv, _ := api.Marshal(api.MsgJobProgress, jobID, progress)
+		_ = r.tx.Send(progEnv)
+	}
+
+	summary, runErr := env.RunRestore(ctx, snapshotID, paths, inPlace, targetDir, handle)
+	finishedAt := time.Now().UTC()
+
+	var statsBlob json.RawMessage
+	if summary != nil {
+		statsBlob, _ = json.Marshal(summary)
+	}
+	r.sendFinished(ctx, jobID, finishedAt, runErr, statsBlob)
+	if runErr == nil {
+		return nil
+	}
+	return fmt.Errorf("runner restore: %w", runErr)
+}
+
+// estimateETA computes an ETA in seconds by linear extrapolation from
+// the bytes restored so far and the seconds elapsed. Restic restore's
+// --json doesn't emit an ETA field of its own (unlike backup), so we
+// approximate.
+//
+// Returns 0 when there is not enough data to extrapolate: any input
+// is <= 0, or bytesDone has already reached totalBytes. The result is
+// truncated toward zero.
+func estimateETA(bytesDone, totalBytes, secondsElapsed int64) int64 {
+	if bytesDone <= 0 || totalBytes <= 0 || secondsElapsed <= 0 || bytesDone >= totalBytes {
+		return 0
+	}
+	// Both operands are strictly positive here, so the rate is too;
+	// no separate zero-rate guard is needed before dividing.
+	rate := float64(bytesDone) / float64(secondsElapsed)
+	return int64(float64(totalBytes-bytesDone) / rate)
+}
+
+// RunDiff runs `restic diff --json <a> <b>` for the two snapshots and
+// forwards its output as log.stream lines, bracketed by job.started
+// and job.finished. Diff is purely informational: no snapshot-list
+// refresh and no stats update follow it.
+//
+// Returns nil on success, otherwise the restic error wrapped with
+// "runner diff:".
+func (r *Runner) RunDiff(ctx context.Context, jobID, snapshotA, snapshotB string) error {
+	r.sendStarted(jobID, api.JobDiff, time.Now().UTC())
+
+	env := r.resticEnv()
+	var seq atomic.Int64
+	runErr := env.RunDiff(ctx, snapshotA, snapshotB, r.streamHandler(jobID, &seq))
+	r.sendFinished(ctx, jobID, time.Now().UTC(), runErr, nil)
+
+	if runErr == nil {
+		return nil
+	}
+	return fmt.Errorf("runner diff: %w", runErr)
+}
+
 // RunUnlock executes a `restic unlock` job. On success it ships a
 // repo.stats envelope with LockPresent=false so the UI banner clears.
 func (r *Runner) RunUnlock(ctx context.Context, jobID string) error {
@@ -325,7 +482,7 @@ func (r *Runner) RunUnlock(ctx context.Context, jobID string) error {
 		}
 	}

-	r.sendFinished(jobID, finishedAt, err, nil)
+	r.sendFinished(ctx, jobID, finishedAt, err, nil)

 	if err != nil {
 		return fmt.Errorf("runner unlock: %w", err)
@@ -2,32 +2,67 @@ package runner

 import (
 	"context"
+	"errors"
 	"os"
+	"os/exec"
 	"path/filepath"
+	"sync"
+	"syscall"
 	"testing"
+	"time"

 	"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
 	"gitea.dcglab.co.uk/steve/restic-manager/internal/restic"
 )

-// fakeSender collects sent envelopes for assertions.
-type fakeSender struct{ envs []api.Envelope }
+// fakeSender collects sent envelopes for assertions. Lock-protected
+// because the runner's pumpStdout / pumpStderr goroutines call Send
+// concurrently — without the mutex, -race in CI flags every test
+// that exercises a Run* method with both pumps active.
+type fakeSender struct {
+	mu   sync.Mutex
+	envs []api.Envelope
+}

 func (s *fakeSender) Send(e api.Envelope) error {
+	s.mu.Lock()
 	s.envs = append(s.envs, e)
+	s.mu.Unlock()
 	return nil
 }

+// snapshot returns a copy of the captured envelopes safe to read
+// without holding the lock. Tests use this when iterating envs while
+// other goroutines may still be writing — though in practice all
+// runner Run* methods join their pumps before returning, so callers
+// can also read .envs directly post-return.
+func (s *fakeSender) snapshot() []api.Envelope {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	out := make([]api.Envelope, len(s.envs))
+	copy(out, s.envs)
+	return out
+}
+
 // setupScript writes a shell script (without shebang) to a temp dir,
 // names it "restic", makes it executable, and returns the path.
 //
-// Writes to "<path>.tmp" then renames into place. The rename is what
-// makes this race-free: under -race + many t.Parallel tests, a
-// fork-from-another-goroutine can inherit the writable fd from
+// Writes to "<path>.tmp" then renames into place. The rename is the
+// usual guard against ETXTBSY: under -race + many t.Parallel tests,
+// a fork-from-another-goroutine can inherit the writable fd from
 // os.WriteFile before close completes, and exec'ing the file then
-// returns ETXTBSY ("text file busy"). Once the rename lands, the
-// final path is a fresh dirent pointing at an inode that has no
-// writable fd open anywhere — exec is safe.
+// returns ETXTBSY ("text file busy"). The renamed dirent points at
+// an inode that has no writable fd open anywhere — exec is safe on
+// a vanilla filesystem.
+//
+// On overlayfs (every job that runs inside a `container:` block on
+// our Gitea runner), the rename can briefly leak ETXTBSY anyway —
+// the upper layer's "writable inode" bookkeeping lags the userspace
+// close. To make the helper deterministic across environments, we
+// probe-exec the file with a benign argument until exec succeeds,
+// then return. The probe relies on every script body exiting 0 for an
+// unrecognised first argument: most are a `case "$1" in ... esac`
+// with no failing default branch, the rest are a bare `echo`. A
+// script that exits non-zero on unknown args would fail the probe.
 func setupScript(t *testing.T, body string) string {
 	t.Helper()
 	dir := t.TempDir()
@@ -39,7 +74,21 @@ func setupScript(t *testing.T, body string) string {
 	if err := os.Rename(tmp, final); err != nil {
 		t.Fatalf("setupScript: rename: %v", err)
 	}
-	return final
+
+	deadline := time.Now().Add(3 * time.Second)
+	for {
+		err := exec.Command(final, "__rm_probe__").Run()
+		if err == nil {
+			return final
+		}
+		if !errors.Is(err, syscall.ETXTBSY) {
+			t.Fatalf("setupScript: probe exec: %v", err)
+		}
+		if time.Now().After(deadline) {
+			t.Fatalf("setupScript: %s still ETXTBSY after 3s", final)
+		}
+		time.Sleep(10 * time.Millisecond)
+	}
 }

 // firstEnvOfType returns the first envelope with the given type, or
@@ -320,7 +369,7 @@ esac
 // still produces job.started and job.finished envelopes.
 func TestRunInitShipsStartedAndFinished(t *testing.T) {
 	t.Parallel()
-	bin := setupScript(t, `echo "initialized repository"`)
+	bin := setupScript(t, `echo "initialised repository"`)
 	tx := &fakeSender{}
 	r := New(Config{ResticBin: bin}, tx, 0)
 	if err := r.RunInit(context.Background(), "job-init"); err != nil {
@@ -110,7 +110,7 @@ func (s *Scheduler) Apply(payload api.ScheduleSetPayload, tx Sender) {
 		"received", len(payload.Schedules), "active", added)

 	// Ack outside the lock — Send() shouldn't take long, but holding
-	// s.mu across an external call would needlessly serialize other
+	// s.mu across an external call would needlessly serialise other
 	// callers (e.g. a future Status() inspection from the UI).
 	ackEnv, err := api.Marshal(api.MsgScheduleAck, "", api.ScheduleAckPayload{
 		Version:   payload.Version,
@@ -21,7 +21,7 @@ import (

 // additionalData binds ciphertexts to the agent-secrets context, so a
 // blob lifted from one role's file can't be replayed into another's
-// row in some unrelated table that uses the same key. (Defense in
+// row in some unrelated table that uses the same key. (Defence in
 // depth — the key is per-host today, but cheap to be careful.)
 const additionalData = "rm-agent-repo-creds-v1"

@@ -0,0 +1,103 @@
+//go:build windows
+
+// install_windows.go — thin wrappers around the Service Control
+// Manager via golang.org/x/sys/windows/svc/mgr. Used by the agent's
+// `install` / `uninstall` / `start` / `stop` subcommands.
+//
+// UNTESTED in CI. Mirrors the canonical example shape; if you need
+// to extend this, prefer copying from x/sys/windows/svc/example
+// over inventing new patterns.
+package service
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+
+	"golang.org/x/sys/windows/svc/mgr"
+)
+
+// Install registers ServiceName with the SCM, pointing it at the
+// currently-running binary (absolute path) invoked with a "run"
+// argument. The service is created as auto-start and runs as
+// LocalSystem (default). Installing over an existing service is an
+// error; the caller must uninstall first.
+func Install() error {
+	binPath, err := os.Executable()
+	if err != nil {
+		return fmt.Errorf("install: locate executable: %w", err)
+	}
+	if binPath, err = filepath.Abs(binPath); err != nil {
+		return fmt.Errorf("install: absolutise path: %w", err)
+	}
+
+	scm, err := mgr.Connect()
+	if err != nil {
+		return fmt.Errorf("install: connect SCM: %w", err)
+	}
+	defer scm.Disconnect()
+
+	// A successful open means the name is already taken.
+	if prior, openErr := scm.OpenService(ServiceName); openErr == nil {
+		_ = prior.Close()
+		return fmt.Errorf("service %q already installed; uninstall first", ServiceName)
+	}
+
+	cfg := mgr.Config{
+		StartType:   mgr.StartAutomatic,
+		DisplayName: "Restic-manager agent",
+		Description: "Backs up this host on the schedule the central restic-manager dictates.",
+	}
+	created, err := scm.CreateService(ServiceName, binPath, cfg, "run")
+	if err != nil {
+		return fmt.Errorf("install: create service: %w", err)
+	}
+	defer created.Close()
+	return nil
+}
+
+// Uninstall deletes ServiceName from the SCM. The caller is expected
+// to have stopped the service already; if it is still running, the
+// SCM's error is returned (wrapped).
+func Uninstall() error {
+	scm, err := mgr.Connect()
+	if err != nil {
+		return fmt.Errorf("uninstall: connect SCM: %w", err)
+	}
+	defer scm.Disconnect()
+
+	target, err := scm.OpenService(ServiceName)
+	if err != nil {
+		return fmt.Errorf("uninstall: open service: %w", err)
+	}
+	defer target.Close()
+
+	err = target.Delete()
+	if err != nil {
+		return fmt.Errorf("uninstall: delete service: %w", err)
+	}
+	return nil
+}
+
+// Start asks the SCM to start the installed service. Starting a
+// service that is already running is NOT a silent no-op: whatever
+// error the SCM reports is returned to the caller.
+//
+// Errors are wrapped with the failing step ("start: ...") in the same
+// style as Install and Uninstall; the underlying error stays
+// reachable through errors.Is / errors.As.
+func Start() error {
+	m, err := mgr.Connect()
+	if err != nil {
+		return fmt.Errorf("start: connect SCM: %w", err)
+	}
+	defer m.Disconnect()
+	s, err := m.OpenService(ServiceName)
+	if err != nil {
+		return fmt.Errorf("start: open service: %w", err)
+	}
+	defer s.Close()
+	if err := s.Start(); err != nil {
+		return fmt.Errorf("start: start service: %w", err)
+	}
+	return nil
+}
+
+// Stop sends a stop control to the installed service. It returns as
+// soon as the control request has been issued and does not wait for
+// the service to finish stopping.
+//
+// Errors are wrapped with the failing step ("stop: ...") in the same
+// style as Install and Uninstall; the underlying error stays
+// reachable through errors.Is / errors.As.
+func Stop() error {
+	m, err := mgr.Connect()
+	if err != nil {
+		return fmt.Errorf("stop: connect SCM: %w", err)
+	}
+	defer m.Disconnect()
+	s, err := m.OpenService(ServiceName)
+	if err != nil {
+		return fmt.Errorf("stop: open service: %w", err)
+	}
+	defer s.Close()
+	// The returned status is not needed; only the error matters here.
+	if _, err := s.Control(0x00000001); err != nil { // SERVICE_CONTROL_STOP
+		return fmt.Errorf("stop: send stop control: %w", err)
+	}
+	return nil
+}
@@ -0,0 +1,44 @@
+//go:build !windows
+
+// service_other.go — non-Windows fallback for the service package.
+// Linux uses systemd to wrap the agent; the binary itself just runs
+// in the foreground. Run() therefore just executes the agent loop
+// and returns. install/uninstall sub-commands return a clear error
+// directing the operator at the install.sh + systemd unit shipped
+// in deploy/install/.
+package service
+
+import (
+	"context"
+	"errors"
+)
+
+// AgentRun is the agent main-loop entry point that main hands to Run.
+// The signature matches the Windows variant so the call site is
+// portable.
+type AgentRun func(ctx context.Context) error
+
+// Run executes agentRun in the foreground with a fresh cancellable
+// context and returns whatever agentRun returns. On Unix the systemd
+// unit (or whatever runs us) supplies the lifecycle; the context is
+// cancelled only once agentRun has returned.
+func Run(agentRun AgentRun) error {
+	loopCtx, release := context.WithCancel(context.Background())
+	defer release()
+	return agentRun(loopCtx)
+}
+
+// errUnsupported builds the error that every service-management verb
+// returns on non-Windows builds; verb is spliced into the message.
+func errUnsupported(verb string) error {
+	msg := "service " + verb + " is Windows-only; use the systemd unit on Linux"
+	return errors.New(msg)
+}
+
+// Install registers the agent as a service. Windows-only; on Unix
+// the systemd unit covers this, so an error pointing there is
+// returned.
+func Install() error {
+	return errUnsupported("install")
+}
+
+// Uninstall is the inverse of Install. Windows-only.
+func Uninstall() error {
+	return errUnsupported("uninstall")
+}
+
+// Start asks the OS service manager to start the installed service.
+// Windows-only.
+func Start() error {
+	return errUnsupported("start")
+}
+
+// Stop sends a stop signal to the installed service. Windows-only.
+func Stop() error {
+	return errUnsupported("stop")
+}
@@ -0,0 +1,93 @@
+//go:build windows
+
+// service_windows.go — Service Control Manager integration for the
+// agent on Windows (P2-16). Implements the svc.Handler interface so
+// `restic-manager-agent run` works under both interactive and SCM
+// contexts. install/uninstall live in install_windows.go.
+//
+// UNTESTED on Windows in this repo's CI (the runners are Linux).
+// The shape mirrors the canonical example in
+// golang.org/x/sys/windows/svc/example. Treat any deviation from
+// that example as suspicious.
+package service
+
+import (
+	"context"
+	"errors"
+	"log/slog"
+
+	"golang.org/x/sys/windows/svc"
+)
+
+// ServiceName is the SCM identifier for the agent service.
+const ServiceName = "restic-manager-agent"
+
+// AgentRun is the function the service handler calls to start the
+// agent's main loop. Pass cmd/agent's run-loop entry point at the
+// call site so this package stays free of cross-cmd imports.
+type AgentRun func(ctx context.Context) error
+
+// Run starts the agent in whichever mode the process was launched in.
+// Under Windows service control it hands over to the SCM dispatcher
+// (svc.Run with a handler wrapping agentRun); otherwise it runs the
+// agent loop in the foreground with a cancellable context, e.g. for
+// `restic-manager-agent run` from a console while debugging. A failure
+// to detect the mode is returned as-is.
+func Run(agentRun AgentRun) error {
+	underSCM, err := svc.IsWindowsService()
+	switch {
+	case err != nil:
+		return err
+	case underSCM:
+		return svc.Run(ServiceName, &handler{run: agentRun})
+	}
+
+	// Interactive launch: run the loop directly.
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	return agentRun(ctx)
+}
+
+// handler implements svc.Handler. Execute starts the agent loop in a
+// goroutine, reports Running, then waits for either an SCM Stop /
+// Shutdown (cancels ctx, then waits for the loop) or the loop exiting
+// by itself. Exit code is 1 for any error except context.Canceled.
+type handler struct {
+	run AgentRun
+}
+
+func (h *handler) Execute(_ []string, req <-chan svc.ChangeRequest, status chan<- svc.Status) (bool, uint32) {
+	const accepted = svc.AcceptStop | svc.AcceptShutdown
+	status <- svc.Status{State: svc.StartPending}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	doneCh := make(chan error, 1)
+	go func() {
+		doneCh <- h.run(ctx)
+	}()
+	status <- svc.Status{State: svc.Running, Accepts: accepted}
+
+	for {
+		select {
+		case c := <-req:
+			switch c.Cmd {
+			case svc.Interrogate:
+				status <- c.CurrentStatus
+			case svc.Stop, svc.Shutdown:
+				slog.Info("svc: stop requested")
+				cancel()
+				status <- svc.Status{State: svc.StopPending}
+				if err := <-doneCh; err != nil && !errors.Is(err, context.Canceled) {
+					slog.Warn("svc: agent loop exited with error", "err", err)
+					return false, 1
+				}
+				return false, 0
+			}
+		case err := <-doneCh:
+			// Agent loop returned without an SCM stop request (e.g. a
+			// fatal error). Report the exit code; the service stops.
+			if err != nil && !errors.Is(err, context.Canceled) {
+				slog.Warn("svc: agent loop exited unexpectedly", "err", err)
+				return false, 1
+			}
+			return false, 0
+		}
+	}
+}
@@ -76,5 +76,5 @@ func detectResticVersion(ctx context.Context, override string) (string, error) {
 	if len(parts) >= 2 && parts[0] == "restic" {
 		return parts[1], nil
 	}
-	return "", fmt.Errorf("sysinfo: unrecognized restic version output: %q", first)
+	return "", fmt.Errorf("sysinfo: unrecognised restic version output: %q", first)
 }
@@ -0,0 +1,100 @@
+// Package updater carries the agent's self-update logic.
+//
+// The flow is operator-driven: the server dispatches a command.update
+// WS envelope, the agent fetches a fresh binary from the server's
+// /agent/binary endpoint, atomic-renames it over the running binary
+// (Linux) or hands off to a detached helper script (Windows), and
+// exits cleanly so the service manager restarts under the new
+// binary. See docs/superpowers/specs/2026-05-06-p6-01-02-...
+//
+// Platform-specific code is build-tagged into updater_unix.go /
+// updater_windows.go. This file holds the shared HTTP fetch + path
+// helpers + the test seam.
+package updater
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"net/http"
+	"os"
+	"path/filepath"
+	"runtime"
+	"time"
+)
+
+// fetch downloads the agent binary for this process's GOOS/GOARCH from
+// <serverURL>/agent/binary into <binaryPath>.new, fsyncs it and chmods
+// it to 0o755. It returns the path of the staged file (always
+// binaryPath + ".new").
+//
+// Any failure after the staging file has been created removes that
+// file, so a failed fetch never leaves a partial binary behind. A 200
+// response with an empty body is rejected: swapping a zero-byte file
+// into place would leave the host with an agent that cannot start.
+// Underlying errors are wrapped with %w so callers can still use
+// errors.Is / errors.As on them.
+func fetch(ctx context.Context, serverURL, binaryPath string) (string, error) {
+	url := fmt.Sprintf("%s/agent/binary?os=%s&arch=%s", serverURL, runtime.GOOS, runtime.GOARCH)
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
+	if err != nil {
+		return "", fmt.Errorf("agent binary fetch: build request: %w", err)
+	}
+	// The client timeout bounds the whole exchange (including the body
+	// read) even when ctx carries no deadline.
+	c := &http.Client{Timeout: 5 * time.Minute}
+	res, err := c.Do(req)
+	if err != nil {
+		return "", fmt.Errorf("agent binary fetch: %w", err)
+	}
+	defer func() { _ = res.Body.Close() }()
+	if res.StatusCode != http.StatusOK {
+		return "", fmt.Errorf("agent binary fetch: %s", res.Status)
+	}
+
+	stagePath := binaryPath + ".new"
+	f, err := os.OpenFile(stagePath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o755)
+	if err != nil {
+		return "", fmt.Errorf("agent binary fetch: create staging file: %w", err)
+	}
+	// Single cleanup point for every failure below. Closing an
+	// already-closed file only yields an error we deliberately ignore.
+	staged := false
+	defer func() {
+		if !staged {
+			_ = f.Close()
+			_ = os.Remove(stagePath)
+		}
+	}()
+
+	n, err := io.Copy(f, res.Body)
+	if err != nil {
+		return "", fmt.Errorf("agent binary fetch: download: %w", err)
+	}
+	if n == 0 {
+		return "", fmt.Errorf("agent binary fetch: server returned an empty body")
+	}
+	if err := f.Sync(); err != nil {
+		return "", fmt.Errorf("agent binary fetch: sync staging file: %w", err)
+	}
+	if err := f.Close(); err != nil {
+		return "", fmt.Errorf("agent binary fetch: close staging file: %w", err)
+	}
+	// OpenFile's mode is filtered by the umask; chmod makes it exact.
+	if err := os.Chmod(stagePath, 0o755); err != nil {
+		return "", fmt.Errorf("agent binary fetch: chmod staging file: %w", err)
+	}
+	staged = true
+	return stagePath, nil
+}
+
+// resolveOwnBinary returns the absolute path of the running binary.
+// Refuses /proc/self/exe — that's what os.Executable returns on some
+// systems but the path can't be renamed across.
+func resolveOwnBinary() (string, error) {
+	p, err := os.Executable()
+	if err != nil {
+		return "", err
+	}
+	abs, err := filepath.Abs(p)
+	if err != nil {
+		return "", err
+	}
+	if abs == "/proc/self/exe" {
+		return "", fmt.Errorf("cannot resolve own binary path (/proc/self/exe)")
+	}
+	return abs, nil
+}
+
+// UpdateForTest is the platform-neutral test seam. In production the
+// platform-specific Update fetches, swaps, then exits the process.
+// UpdateForTest stops short of the exit so unit tests can assert on
+// file state.
+func UpdateForTest(serverURL, binaryPath string) error {
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
+	defer cancel()
+	stage, err := fetch(ctx, serverURL, binaryPath)
+	if err != nil {
+		return err
+	}
+	return swap(stage, binaryPath)
+}
@@ -0,0 +1,87 @@
+//go:build !windows
+
+package updater
+
+import (
+	"bytes"
+	"io"
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"path/filepath"
+	"runtime"
+	"testing"
+)
+
+// TestUpdate_LinuxAtomicSwap stages a fake "running binary" file, runs
+// UpdateForTest against a fake /agent/binary server, and asserts that
+// the binary was swapped, .old preserves the previous bytes, and .new
+// was renamed away.
+func TestUpdate_LinuxAtomicSwap(t *testing.T) {
+	tmp := t.TempDir()
+	binPath := filepath.Join(tmp, "agent")
+	if err := os.WriteFile(binPath, []byte("OLD"), 0o755); err != nil {
+		t.Fatal(err)
+	}
+	newBytes := []byte("NEW BINARY CONTENTS")
+
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path != "/agent/binary" {
+			http.NotFound(w, r)
+			return
+		}
+		gotOS, gotArch := r.URL.Query().Get("os"), r.URL.Query().Get("arch")
+		if gotOS != runtime.GOOS || gotArch != runtime.GOARCH {
+			t.Errorf("query mismatch: got os=%s arch=%s want %s/%s",
+				gotOS, gotArch, runtime.GOOS, runtime.GOARCH)
+		}
+		_, _ = io.Copy(w, bytes.NewReader(newBytes))
+	}))
+	defer srv.Close()
+
+	if err := UpdateForTest(srv.URL, binPath); err != nil {
+		t.Fatalf("update: %v", err)
+	}
+
+	got, err := os.ReadFile(binPath)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if string(got) != string(newBytes) {
+		t.Fatalf("binary contents: got %q want %q", got, newBytes)
+	}
+	old, err := os.ReadFile(binPath + ".old")
+	if err != nil {
+		t.Fatalf("agent.old missing: %v", err)
+	}
+	if string(old) != "OLD" {
+		t.Fatalf("agent.old contents: got %q want %q", old, "OLD")
+	}
+	if _, err := os.Stat(binPath + ".new"); !os.IsNotExist(err) {
+		t.Fatalf("agent.new should be absent after swap, got err=%v", err)
+	}
+}
+
+// TestUpdate_FetchHTTPError surfaces the server's status when the
+// binary is not published for this os/arch.
+func TestUpdate_FetchHTTPError(t *testing.T) {
+	tmp := t.TempDir()
+	binPath := filepath.Join(tmp, "agent")
+	if err := os.WriteFile(binPath, []byte("OLD"), 0o755); err != nil {
+		t.Fatal(err)
+	}
+
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		http.Error(w, `{"error":"binary_not_published"}`, http.StatusNotFound)
+	}))
+	defer srv.Close()
+
+	err := UpdateForTest(srv.URL, binPath)
+	if err == nil {
+		t.Fatal("expected error, got nil")
+	}
+	got, _ := os.ReadFile(binPath)
+	if string(got) != "OLD" {
+		t.Fatalf("binary should not have changed, got %q", got)
+	}
+}
@@ -0,0 +1,73 @@
+//go:build !windows
+
+package updater
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"log/slog"
+	"os"
+	"time"
+)
+
+// Update fetches the new binary, swaps it in, then exits so systemd
+// restarts the process under the new binary. The caller should close
+// the WS connection cleanly (so the server transitions the host to
+// disconnected immediately rather than waiting for the heartbeat
+// sweep) before invoking.
+//
+// On success this function does not return: the process exits with
+// status 0. On failure it returns the error from resolving the binary
+// path, fetching, or swapping; if the swap fails, the staged file is
+// removed so a failed update leaves nothing behind next to the live
+// binary.
+//
+// Service-user assumption: the agent runs as root under the
+// systemd-shipped unit, which can write the binary path directly.
+// If the agent ever moves to a non-root service user, this breaks —
+// would need a setuid helper or an out-of-process update service.
+func Update(ctx context.Context, serverURL string) error {
+	binPath, err := resolveOwnBinary()
+	if err != nil {
+		return err
+	}
+	stage, err := fetch(ctx, serverURL, binPath)
+	if err != nil {
+		return err
+	}
+	if err := swap(stage, binPath); err != nil {
+		// Best-effort cleanup: a successful swap renames the staged file
+		// away, so after a failure it is still on disk and must not be
+		// left lying around.
+		_ = os.Remove(stage)
+		return err
+	}
+	slog.Info("agent self-update: binary swapped, exiting for systemd restart",
+		"binary", binPath)
+	// Give logger / WS close-frame a moment to flush, then exit.
+	time.Sleep(200 * time.Millisecond)
+	os.Exit(0)
+	return nil // unreachable
+}
+
+// swap copies the running binary to <bin>.old (M1 — keep one revision
+// back for hand-rolled rollback), then atomic-renames the staged
+// binary into place. Linux supports rename-while-open so this works
+// even though the running process holds the source open.
+//
+// The .old copy is fsynced and closed before the rename, so the
+// rollback copy is complete before the live binary is replaced. Every
+// failure is wrapped with the step that failed; the underlying error
+// stays reachable through errors.Is / errors.As.
+func swap(stagePath, binPath string) error {
+	src, err := os.Open(binPath)
+	if err != nil {
+		return fmt.Errorf("open running binary: %w", err)
+	}
+	defer func() { _ = src.Close() }()
+
+	dst, err := os.OpenFile(binPath+".old", os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o755)
+	if err != nil {
+		return fmt.Errorf("open .old: %w", err)
+	}
+	if _, err := io.Copy(dst, src); err != nil {
+		_ = dst.Close()
+		return fmt.Errorf("copy to .old: %w", err)
+	}
+	if err := dst.Sync(); err != nil {
+		_ = dst.Close()
+		return fmt.Errorf("sync .old: %w", err)
+	}
+	if err := dst.Close(); err != nil {
+		return fmt.Errorf("close .old: %w", err)
+	}
+
+	if err := os.Rename(stagePath, binPath); err != nil {
+		return fmt.Errorf("rename .new over running binary: %w", err)
+	}
+	return nil
+}
@@ -0,0 +1,73 @@
+//go:build windows
+
+package updater
+
+import (
+	"context"
+	"fmt"
+	"log/slog"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"syscall"
+	"time"
+)
+
+// helperScript is the batch file Update writes next to the agent
+// binary and launches detached. In order, the script: waits 3 seconds,
+// copies the running binary to the .old path, stops the
+// restic-manager-agent service, polls `sc query` once a second until
+// its output contains "STOPPED", moves the staged binary over the
+// running binary path, starts the service again, and deletes itself.
+//
+// It is rendered with fmt.Sprintf, args order:
+//
+//	%[1]s — running binary path (source for the .old copy)
+//	%[2]s — .old path
+//	%[3]s — staged .new path
+//	%[4]s — running binary path (rename target)
+//
+// The doubled percent in `%%~f0` is a Sprintf escape: the rendered
+// script contains `%~f0`, which cmd.exe expands to the script's own
+// path.
+const helperScript = `@echo off
+timeout /t 3 /nobreak >nul
+copy /Y "%[1]s" "%[2]s"
+sc stop restic-manager-agent
+:wait
+sc query restic-manager-agent | find "STOPPED" >nul
+if errorlevel 1 (timeout /t 1 /nobreak >nul & goto wait)
+move /Y "%[3]s" "%[4]s"
+sc start restic-manager-agent
+del "%%~f0"
+`
+
+// Update on Windows can't overwrite the running .exe in-process
+// (exclusive file lock), so we stage the new binary, write a small
+// detached helper script that waits, stops the service, swaps the
+// binary, and starts the service, then exit cleanly. SCM treats
+// clean exits after sc stop as intentional and does not auto-restart;
+// the helper's final sc start handles that.
+func Update(ctx context.Context, serverURL string) error {
+	binPath, err := resolveOwnBinary()
+	if err != nil {
+		return err
+	}
+	stage, err := fetch(ctx, serverURL, binPath)
+	if err != nil {
+		return err
+	}
+	helperPath := filepath.Join(filepath.Dir(binPath), "agent-update.cmd")
+	body := fmt.Sprintf(helperScript, binPath, binPath+".old", stage, binPath)
+	if err := os.WriteFile(helperPath, []byte(body), 0o755); err != nil {
+		return err
+	}
+	cmd := exec.Command("cmd.exe", "/c", helperPath)
+	cmd.SysProcAttr = &syscall.SysProcAttr{
+		HideWindow:    true,
+		CreationFlags: 0x00000008 | 0x08000000, // DETACHED_PROCESS | CREATE_NO_WINDOW
+	}
+	if err := cmd.Start(); err != nil {
+		return err
+	}
+	slog.Info("agent self-update: helper spawned, exiting cleanly",
+		"binary", binPath, "helper", helperPath)
+	time.Sleep(200 * time.Millisecond)
+	os.Exit(0)
+	return nil // unreachable
+}
+
+// swap exists on Windows only so the package builds (UpdateForTest
+// references it). The real swap is performed by the helper script, so
+// this stub ignores both paths and always returns an error.
+func swap(stagePath, binPath string) error {
+	_, _ = stagePath, binPath
+	return fmt.Errorf("updater.swap not implemented on Windows; use the helper script via Update")
+}
@@ -40,7 +40,7 @@ type Config struct {
 // Sender is what handlers use to push agent → server messages
 // (job.progress, job.finished, log.stream, command.result, …).
 // Returned by the WS client to the dispatch handler. Write operations
-// serialize behind a single mutex on the conn; concurrent calls are
+// serialise behind a single mutex on the conn; concurrent calls are
 // safe.
 type Sender interface {
 	Send(env api.Envelope) error
@@ -0,0 +1,284 @@
+// Package alert evaluates the hardcoded rule set and persists raises
+// / acknowledges / resolves. Three event sources feed it:
+//   - JobFinishedEvent — pushed when a job lands a terminal state
+//     (the existing MarkJobFinished site)
+//   - HostOfflineEvent / HostOnlineEvent — pushed by the offline
+//     sweeper and by the ws hello handler
+//   - 60s ticker (internal) — drives stale-schedule + auto-resolve
+//
+// All output goes through store.RaiseOrTouch / Acknowledge / Resolve
+// and the notification.Hub. The engine is one goroutine started at
+// boot; non-blocking sends from hot paths.
+package alert
+
+import (
+	"context"
+	"fmt"
+	"log/slog"
+	"sync"
+	"time"
+
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/notification"
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
+)
+
+// staleBackupThreshold is how long an intermittent host may go without
+// a successful backup before we raise a stale_schedule alert: seven
+// days. Global constant for v1 (may become per-host later). Only
+// intermittent hosts are evaluated — always-on hosts' stale_schedule
+// stays a no-op.
+const staleBackupThreshold = 7 * 24 * time.Hour
+
+// JobFinishedEvent carries everything the engine needs to evaluate
+// the failed-X rules. Pushed via Engine.NotifyJobFinished from the
+// MarkJobFinished site.
+type JobFinishedEvent struct {
+	// HostID is the host the job ran on; alerts are raised against it.
+	HostID string
+	// JobID identifies the job; it appears in the alert message.
+	JobID         string
+	Kind          string // backup | forget | prune | check | unlock | restore | diff
+	Status        string // succeeded | failed | cancelled
+	SourceGroupID string // dedup key for backup/forget/prune/check; empty otherwise
+	// When is the timestamp passed to the store on raise / resolve.
+	When time.Time
+}
+
+// Engine evaluates hardcoded alert rules and dispatches via notification.Hub.
+// Events arrive on the buffered channels below through the Notify*
+// methods and are consumed by Run.
+type Engine struct {
+	store *store.Store
+	hub   *notification.Hub
+
+	// Inbound event queues, drained by Run.
+	jobs     chan JobFinishedEvent
+	hostDown chan string // host_id
+	hostUp   chan string // host_id
+
+	// agentOfflineFloor is the duration a host must be offline before
+	// we raise. Configurable for tests; default 15m.
+	agentOfflineFloor time.Duration
+	// tickPeriod is the interval of Run's periodic sweep.
+	tickPeriod time.Duration
+
+	// done is closed (once, guarded by closeOnce) when Run returns.
+	closeOnce sync.Once
+	done      chan struct{}
+}
+
+// NewEngine builds the engine. agentOfflineFloor + tickPeriod default
+// to 15min and 60s respectively when zero.
+func NewEngine(st *store.Store, hub *notification.Hub) *Engine {
+	return &Engine{
+		store:             st,
+		hub:               hub,
+		jobs:              make(chan JobFinishedEvent, 32),
+		hostDown:          make(chan string, 32),
+		hostUp:            make(chan string, 32),
+		agentOfflineFloor: 15 * time.Minute,
+		tickPeriod:        60 * time.Second,
+		done:              make(chan struct{}),
+	}
+}
+
+// Run is the engine's event loop: it dispatches job-finished,
+// host-offline and host-online events to their handlers and calls tick
+// every tickPeriod. When ctx is cancelled it closes the done channel
+// (once) and returns. It blocks, so start it in its own goroutine.
+func (e *Engine) Run(ctx context.Context) {
+	ticker := time.NewTicker(e.tickPeriod)
+	defer ticker.Stop()
+
+	cancelled := ctx.Done()
+	for {
+		select {
+		case now := <-ticker.C:
+			e.tick(ctx, now)
+		case id := <-e.hostUp:
+			e.handleHostOnline(ctx, id)
+		case id := <-e.hostDown:
+			e.handleHostOffline(ctx, id)
+		case job := <-e.jobs:
+			e.handleJobFinished(ctx, job)
+		case <-cancelled:
+			e.closeOnce.Do(func() { close(e.done) })
+			return
+		}
+	}
+}
+
+// NotifyJobFinished queues a job-finished event for the engine. It is
+// the hot-path hook called from MarkJobFinished's caller
+// (ws.handler.dispatchAgentMessage) and never blocks: if the jobs
+// queue is full the event is dropped and a warning is logged.
+func (e *Engine) NotifyJobFinished(ev JobFinishedEvent) {
+	queued := false
+	select {
+	case e.jobs <- ev:
+		queued = true
+	default:
+	}
+	if queued {
+		return
+	}
+	slog.Warn("alert: jobs channel full; dropping event", "kind", ev.Kind, "host_id", ev.HostID)
+}
+
+// NotifyHostOffline queues a host-offline event for hostID. It never
+// blocks: if the hostDown queue is full the event is dropped and a
+// warning is logged.
+func (e *Engine) NotifyHostOffline(hostID string) {
+	queued := false
+	select {
+	case e.hostDown <- hostID:
+		queued = true
+	default:
+	}
+	if queued {
+		return
+	}
+	slog.Warn("alert: hostDown channel full; dropping", "host_id", hostID)
+}
+
+// NotifyHostOnline queues a host-online event for hostID. It never
+// blocks: if the hostUp queue is full the event is dropped and a
+// warning is logged.
+func (e *Engine) NotifyHostOnline(hostID string) {
+	queued := false
+	select {
+	case e.hostUp <- hostID:
+		queued = true
+	default:
+	}
+	if queued {
+		return
+	}
+	slog.Warn("alert: hostUp channel full; dropping", "host_id", hostID)
+}
+
+// handleJobFinished maps a finished job onto the failed-X alert rules.
+// A failed backup/forget/prune/check raises the matching alert; a
+// succeeded one resolves it, and a succeeded backup additionally
+// resolves the host's stale_schedule alert. Any other job kind, and any
+// other status (e.g. cancelled), is ignored.
+func (e *Engine) handleJobFinished(ctx context.Context, ev JobFinishedEvent) {
+	// rule pairs the alert kind with the severity it is raised at.
+	type rule struct{ kind, severity string }
+	rules := map[string]rule{
+		"backup": {KindBackupFailed, "warning"},
+		"forget": {KindForgetFailed, "warning"},
+		"prune":  {KindPruneFailed, "warning"},
+		"check":  {KindCheckFailed, "critical"},
+	}
+	// Job kinds without a rule (init, unlock, restore, diff) produce
+	// no alerts in v1.
+	matched, ok := rules[ev.Kind]
+	if !ok {
+		return
+	}
+	isBackup := ev.Kind == "backup"
+
+	// Backups are deduplicated per source group (each group is its own
+	// restic run, hence its own failure surface). forget/prune/check
+	// are repo-scoped, so their key stays empty: one alert per host
+	// per kind.
+	var dedupKey string
+	if isBackup {
+		dedupKey = ev.SourceGroupID
+	}
+
+	if ev.Status == "failed" {
+		msg := fmt.Sprintf("%s job %s failed", ev.Kind, ev.JobID)
+		e.raiseAndNotify(ctx, ev.HostID, matched.kind, dedupKey, matched.severity, msg, ev.When)
+		return
+	}
+	if ev.Status != "succeeded" {
+		return
+	}
+	e.resolveAndNotify(ctx, ev.HostID, matched.kind, dedupKey, ev.When)
+	if isBackup {
+		// A fresh backup clears staleness for intermittent hosts.
+		e.resolveAndNotify(ctx, ev.HostID, KindStaleSchedule, "", ev.When)
+	}
+}
+
+// handleHostOffline evaluates the agent_offline rule for a host that
+// has just been reported offline. It raises a "warning" alert only
+// when the host is always-on, has connected at least once, and was
+// last seen at least agentOfflineFloor ago.
+//
+// A failed host lookup is logged and otherwise ignored, matching how
+// tick reports store errors.
+func (e *Engine) handleHostOffline(ctx context.Context, hostID string) {
+	host, err := e.store.GetHost(ctx, hostID)
+	if err != nil {
+		slog.Warn("alert: host offline lookup", "host_id", hostID, "err", err)
+		return
+	}
+	// Intermittent hosts (laptops) legitimately disappear — never raise
+	// agent_offline for them. The stale_schedule sweep in tick() is the
+	// only staleness signal for these hosts.
+	if !host.AlwaysOn {
+		return
+	}
+	// A nil last_seen_at means the host enrolled but never connected;
+	// there is nothing to measure the floor against, so we don't raise
+	// immediately on enrolment.
+	if host.LastSeenAt == nil {
+		return
+	}
+	// Take one clock reading so the floor check, the message and the
+	// alert timestamp all agree.
+	now := time.Now().UTC()
+	offlineFor := now.Sub(*host.LastSeenAt)
+	if offlineFor < e.agentOfflineFloor {
+		return
+	}
+	e.raiseAndNotify(ctx, hostID, KindAgentOffline, "", "warning",
+		fmt.Sprintf("Agent offline for %s (threshold %s)",
+			roundDur(offlineFor), e.agentOfflineFloor),
+		now)
+}
+
+// handleHostOnline resolves the host's agent_offline alert, stamped
+// with the current UTC time.
+func (e *Engine) handleHostOnline(ctx context.Context, hostID string) {
+	resolvedAt := time.Now().UTC()
+	e.resolveAndNotify(ctx, hostID, KindAgentOffline, "", resolvedAt)
+}
+
+// tick is the periodic sweep driven by Run's ticker. It does three
+// things:
+//  1. Housekeeping: deletes expired setup tokens and expired OIDC
+//     state (the latter with a five-minute grace before now).
+//  2. Always-on hosts: re-raises agent_offline for every offline host
+//     whose last_seen_at is at least agentOfflineFloor old, catching
+//     hosts that crossed the floor between explicit events.
+//  3. Intermittent hosts: raises stale_schedule when the last backup is
+//     at least staleBackupThreshold old and the host has an enabled
+//     schedule.
+//
+// The sweep only raises; it never resolves.
+func (e *Engine) tick(ctx context.Context, now time.Time) {
+	// User-management cleanup rides on this loop for now: setup tokens
+	// expire after 1h and this is the cheapest existing 60s loop. If
+	// more housekeeping accumulates, move it to a dedicated loop.
+	if _, err := e.store.CleanupExpiredSetupTokens(ctx, now); err != nil {
+		slog.Warn("alert: cleanup expired setup tokens", "err", err)
+	}
+	oidcCutoff := now.Add(-5 * time.Minute)
+	if _, err := e.store.CleanupExpiredOIDCState(ctx, oidcCutoff); err != nil {
+		slog.Warn("alert: cleanup expired oidc state", "err", err)
+	}
+
+	hosts, err := e.store.ListHosts(ctx)
+	if err != nil {
+		slog.Warn("alert: tick list hosts", "err", err)
+		return
+	}
+
+	for i := range hosts {
+		h := hosts[i]
+
+		if h.AlwaysOn {
+			// Always-on hosts: agent_offline re-evaluation only.
+			if h.Status != "offline" || h.LastSeenAt == nil {
+				continue
+			}
+			if gone := now.Sub(*h.LastSeenAt); gone >= e.agentOfflineFloor {
+				msg := fmt.Sprintf("Agent offline for %s (threshold %s)",
+					roundDur(gone), e.agentOfflineFloor)
+				e.raiseAndNotify(ctx, h.ID, KindAgentOffline, "", "warning", msg, now)
+			}
+			continue
+		}
+
+		// Intermittent hosts: agent_offline is suppressed entirely. A
+		// host that has never backed up has no baseline to be stale
+		// against — onboarding/repo_status covers that case.
+		if h.LastBackupAt == nil {
+			continue
+		}
+		sinceBackup := now.Sub(*h.LastBackupAt)
+		if sinceBackup < staleBackupThreshold {
+			continue
+		}
+		// Only query schedules once the host is known to be stale.
+		scheduled, err := e.hostHasEnabledSchedule(ctx, h.ID)
+		switch {
+		case err != nil:
+			slog.Warn("alert: tick list schedules", "host_id", h.ID, "err", err)
+		case scheduled:
+			// Resolution happens in handleJobFinished on a successful
+			// backup (and ResolveOnModeChange on toggle), not here.
+			msg := fmt.Sprintf("No backup in %s (threshold %s)",
+				roundDur(sinceBackup), staleBackupThreshold)
+			e.raiseAndNotify(ctx, h.ID, KindStaleSchedule, "", "warning", msg, now)
+		}
+	}
+}
+
+// roundDur formats d for alert messages. Anything shorter than one
+// minute (including zero and negative values) is rendered as "less
+// than a minute"; everything else is rounded to the nearest minute and
+// printed in time.Duration's usual form, e.g. "2h0m0s".
+func roundDur(d time.Duration) string {
+	if d >= time.Minute {
+		return d.Round(time.Minute).String()
+	}
+	return "less than a minute"
+}
+
+// hostHasEnabledSchedule reports whether any of the host's schedules is
+// enabled — the precondition for a stale_schedule alert, since a host
+// with no enabled schedule has no backup expectation to be measured
+// against. A store error is returned as (false, err).
+func (e *Engine) hostHasEnabledSchedule(ctx context.Context, hostID string) (bool, error) {
+	list, err := e.store.ListSchedulesByHost(ctx, hostID)
+	if err != nil {
+		return false, err
+	}
+	found := false
+	for i := 0; i < len(list) && !found; i++ {
+		found = list[i].Enabled
+	}
+	return found, nil
+}
@@ -0,0 +1,255 @@
+package alert
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	"github.com/oklog/ulid/v2"
+
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
+)
+
+// TestIntermittentHostSuppressesOfflineAlert checks that handleHostOffline
+// does NOT raise agent_offline for a host with AlwaysOn=false.
+func TestIntermittentHostSuppressesOfflineAlert(t *testing.T) {
+	t.Parallel()
+	eng, st, hostID := setupEngine(t)
+	ctx := context.Background()
+
+	// Make the host intermittent.
+	if err := st.SetHostAlwaysOn(ctx, hostID, false); err != nil {
+		t.Fatalf("SetHostAlwaysOn: %v", err)
+	}
+
+	// Give it a stale last_seen_at well past the floor.
+	if _, err := st.DB().Exec(
+		`UPDATE hosts SET last_seen_at = ?, status = ? WHERE id = ?`,
+		time.Now().UTC().Add(-2*time.Hour).Format(time.RFC3339Nano),
+		"offline",
+		hostID,
+	); err != nil {
+		t.Fatalf("update last_seen_at: %v", err)
+	}
+
+	eng.handleHostOffline(ctx, hostID)
+
+	open, _ := st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
+	if len(open) != 0 {
+		t.Fatalf("expected 0 open alerts for intermittent host; got %d: %+v", len(open), open)
+	}
+}
+
+// TestAlwaysOnHostStillRaisesOfflineAlert checks that always-on hosts still
+// get an agent_offline alert when offline past the floor.
+func TestAlwaysOnHostStillRaisesOfflineAlert(t *testing.T) {
+	t.Parallel()
+	eng, st, hostID := setupEngine(t)
+	ctx := context.Background()
+
+	// always_on=true is the default, but be explicit.
+	if err := st.SetHostAlwaysOn(ctx, hostID, true); err != nil {
+		t.Fatalf("SetHostAlwaysOn: %v", err)
+	}
+
+	// Give it a stale last_seen_at well past the 15m floor.
+	if _, err := st.DB().Exec(
+		`UPDATE hosts SET last_seen_at = ?, status = ? WHERE id = ?`,
+		time.Now().UTC().Add(-2*time.Hour).Format(time.RFC3339Nano),
+		"offline",
+		hostID,
+	); err != nil {
+		t.Fatalf("update last_seen_at: %v", err)
+	}
+
+	eng.handleHostOffline(ctx, hostID)
+
+	open, _ := st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
+	if len(open) != 1 || open[0].Kind != KindAgentOffline {
+		t.Fatalf("expected 1 agent_offline alert; got %d: %+v", len(open), open)
+	}
+}
+
+// TestStalenessAlertForIntermittentHost checks that tick raises stale_schedule
+// for an intermittent host whose last backup is older than 7 days AND has an
+// enabled schedule. Also verifies that a succeeded backup clears the alert.
+func TestStalenessAlertForIntermittentHost(t *testing.T) {
+	t.Parallel()
+	eng, st, hostID := setupEngine(t)
+	ctx := context.Background()
+
+	// Make intermittent.
+	if err := st.SetHostAlwaysOn(ctx, hostID, false); err != nil {
+		t.Fatalf("SetHostAlwaysOn: %v", err)
+	}
+
+	// Create a source group to attach the schedule to.
+	sgID := ulid.Make().String()
+	if err := st.CreateSourceGroup(ctx, &store.SourceGroup{
+		ID:       sgID,
+		HostID:   hostID,
+		Name:     "default",
+		Includes: []string{"/home"},
+	}); err != nil {
+		t.Fatalf("CreateSourceGroup: %v", err)
+	}
+
+	// Create an enabled schedule pointing at the source group.
+	schedID := ulid.Make().String()
+	if err := st.CreateSchedule(ctx, &store.Schedule{
+		ID:             schedID,
+		HostID:         hostID,
+		CronExpr:       "0 2 * * *",
+		Enabled:        true,
+		SourceGroupIDs: []string{sgID},
+	}); err != nil {
+		t.Fatalf("CreateSchedule: %v", err)
+	}
+
+	// Set last_backup_at to 8 days ago.
+	eightDaysAgo := time.Now().UTC().Add(-8 * 24 * time.Hour)
+	if err := st.SetHostLastBackup(ctx, hostID, "succeeded", eightDaysAgo); err != nil {
+		t.Fatalf("SetHostLastBackup: %v", err)
+	}
+
+	eng.tick(ctx, time.Now().UTC())
+
+	open, _ := st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
+	var staleCount int
+	for _, a := range open {
+		if a.Kind == KindStaleSchedule {
+			staleCount++
+		}
+	}
+	if staleCount != 1 {
+		t.Fatalf("expected 1 stale_schedule alert after tick; got %d (all open: %+v)", staleCount, open)
+	}
+
+	// A succeeded backup should clear the stale_schedule alert.
+	eng.handleJobFinished(ctx, JobFinishedEvent{
+		HostID:        hostID,
+		JobID:         ulid.Make().String(),
+		Kind:          "backup",
+		Status:        "succeeded",
+		SourceGroupID: sgID,
+		When:          time.Now().UTC(),
+	})
+
+	open, _ = st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
+	for _, a := range open {
+		if a.Kind == KindStaleSchedule {
+			t.Fatalf("expected stale_schedule to be resolved after backup succeeded; still open: %+v", a)
+		}
+	}
+}
+
+// TestNoStalenessWithoutEnabledSchedule verifies that a stale last
+// backup alone is not enough: with no enabled schedule on the
+// intermittent host, tick must not raise stale_schedule.
+func TestNoStalenessWithoutEnabledSchedule(t *testing.T) {
+	t.Parallel()
+	eng, st, hostID := setupEngine(t)
+	ctx := context.Background()
+
+	// Intermittent mode.
+	if err := st.SetHostAlwaysOn(ctx, hostID, false); err != nil {
+		t.Fatalf("SetHostAlwaysOn: %v", err)
+	}
+
+	// Stale backup (eight days old), but deliberately no schedule.
+	lastBackup := time.Now().UTC().Add(-8 * 24 * time.Hour)
+	if err := st.SetHostLastBackup(ctx, hostID, "succeeded", lastBackup); err != nil {
+		t.Fatalf("SetHostLastBackup: %v", err)
+	}
+
+	eng.tick(ctx, time.Now().UTC())
+
+	filter := store.AlertFilter{Status: "open", HostID: hostID}
+	open, _ := st.ListAlerts(ctx, filter)
+	for _, a := range open {
+		if a.Kind != KindStaleSchedule {
+			continue
+		}
+		t.Fatalf("expected no stale_schedule without an enabled schedule; got: %+v", a)
+	}
+}
+
+// TestResolveOnModeChangeClearsOfflineAlert checks that ResolveOnModeChange
+// clears an open agent_offline alert when a host's mode is toggled.
+func TestResolveOnModeChangeClearsOfflineAlert(t *testing.T) {
+	t.Parallel()
+	eng, st, hostID := setupEngine(t)
+	ctx := context.Background()
+
+	// Make always-on and set it offline with a stale last_seen_at.
+	if err := st.SetHostAlwaysOn(ctx, hostID, true); err != nil {
+		t.Fatalf("SetHostAlwaysOn: %v", err)
+	}
+	if _, err := st.DB().Exec(
+		`UPDATE hosts SET last_seen_at = ?, status = ? WHERE id = ?`,
+		time.Now().UTC().Add(-2*time.Hour).Format(time.RFC3339Nano),
+		"offline",
+		hostID,
+	); err != nil {
+		t.Fatalf("update last_seen_at: %v", err)
+	}
+
+	// Raise the offline alert.
+	eng.handleHostOffline(ctx, hostID)
+
+	open, _ := st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
+	if len(open) != 1 || open[0].Kind != KindAgentOffline {
+		t.Fatalf("expected 1 agent_offline alert before mode change; got %d: %+v", len(open), open)
+	}
+
+	// Toggle mode — should clear the alert.
+	eng.ResolveOnModeChange(ctx, hostID, time.Now().UTC())
+
+	open, _ = st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
+	for _, a := range open {
+		if a.Kind == KindAgentOffline {
+			t.Fatalf("expected agent_offline to be resolved after mode change; still open: %+v", a)
+		}
+	}
+}
+
+// TestNoStalenessWhenNeverBackedUp verifies that an intermittent host
+// with an enabled schedule but no recorded backup (nil LastBackupAt)
+// does not get a stale_schedule alert from tick.
+func TestNoStalenessWhenNeverBackedUp(t *testing.T) {
+	t.Parallel()
+	eng, st, hostID := setupEngine(t)
+	ctx := context.Background()
+
+	// Intermittent mode.
+	if err := st.SetHostAlwaysOn(ctx, hostID, false); err != nil {
+		t.Fatalf("SetHostAlwaysOn: %v", err)
+	}
+
+	// Source group plus enabled schedule; LastBackupAt is left unset
+	// on purpose.
+	sgID := ulid.Make().String()
+	group := &store.SourceGroup{
+		ID:       sgID,
+		HostID:   hostID,
+		Name:     "default",
+		Includes: []string{"/home"},
+	}
+	if err := st.CreateSourceGroup(ctx, group); err != nil {
+		t.Fatalf("CreateSourceGroup: %v", err)
+	}
+
+	schedule := &store.Schedule{
+		ID:             ulid.Make().String(),
+		HostID:         hostID,
+		CronExpr:       "0 2 * * *",
+		Enabled:        true,
+		SourceGroupIDs: []string{sgID},
+	}
+	if err := st.CreateSchedule(ctx, schedule); err != nil {
+		t.Fatalf("CreateSchedule: %v", err)
+	}
+
+	eng.tick(ctx, time.Now().UTC())
+
+	filter := store.AlertFilter{Status: "open", HostID: hostID}
+	open, _ := st.ListAlerts(ctx, filter)
+	for _, a := range open {
+		if a.Kind != KindStaleSchedule {
+			continue
+		}
+		t.Fatalf("expected no stale_schedule when never backed up; got: %+v", a)
+	}
+}
@@ -0,0 +1,174 @@
+package alert
+
+import (
+	"context"
+	"fmt"
+	"log/slog"
+	"time"
+
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/notification"
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
+)
+
+// Alert kind constants — keep in lockstep with the engine logic and
+// the UI tag-colour table. Each value is the string handed to the
+// store as the alert's kind (see raiseAndNotify / resolveAndNotify)
+// and compared against Alert.Kind when matching rows.
+const (
+	// KindBackupFailed is raised when a backup job finishes with
+	// status "failed" and resolved on next backup success.
+	KindBackupFailed = "backup_failed"
+
+	// KindForgetFailed mirrors KindBackupFailed for forget jobs.
+	KindForgetFailed = "forget_failed"
+
+	// KindPruneFailed mirrors KindBackupFailed for prune jobs.
+	KindPruneFailed = "prune_failed"
+
+	// KindCheckFailed is raised at "critical" severity (repository
+	// integrity is at risk) when a check job fails.
+	KindCheckFailed = "check_failed"
+
+	// KindStaleSchedule is raised for intermittent (non-always-on) hosts
+	// when their last successful backup is older than staleBackupThreshold
+	// (7 days) and they have at least one enabled schedule. Resolved on
+	// backup success or when the host is switched to always-on mode.
+	// ResolveOnModeChange also clears it on any always-on toggle.
+	KindStaleSchedule = "stale_schedule"
+
+	// KindAgentOffline is raised when a host's last_seen_at is older
+	// than the 15-minute floor and resolved when the host reconnects.
+	// ResolveOnModeChange also clears it on any always-on toggle.
+	KindAgentOffline = "agent_offline"
+)
+
+// raiseAndNotify is the standard raise pattern: store.RaiseOrTouch
+// deduplicates, and notification.Hub.Dispatch fires only on the first
+// raise (didRaise=true). Subsequent occurrences of the same open alert
+// are "touched" (last_seen_at bumped) without a second notification.
+//
+// Store errors are logged and swallowed. The host name in the payload
+// falls back to hostID when the host lookup fails or yields no row.
+// Dispatch runs on its own goroutine with a context detached from the
+// caller's cancellation (as Acknowledge and Resolve do), so a caller
+// whose ctx ends right after the raise does not cut the notification
+// short.
+func (e *Engine) raiseAndNotify(ctx context.Context, hostID, kind, dedupKey, severity, message string, when time.Time) {
+	id, didRaise, err := e.store.RaiseOrTouch(ctx, hostID, kind, dedupKey, severity, message, when)
+	if err != nil {
+		slog.Warn("alert: raise", "kind", kind, "host_id", hostID, "dedup_key", dedupKey, "err", err)
+		return
+	}
+	if !didRaise {
+		return
+	}
+	hostName := hostID
+	// Guard against a nil host as well as an error, matching alertPayload.
+	if host, err := e.store.GetHost(ctx, hostID); err == nil && host != nil {
+		hostName = host.Name
+	}
+	go e.hub.Dispatch(context.WithoutCancel(ctx), notification.Payload{
+		Event:    notification.EventRaised,
+		AlertID:  id,
+		Severity: severity,
+		Kind:     kind,
+		HostID:   hostID,
+		HostName: hostName,
+		Message:  message,
+		RaisedAt: when,
+	})
+}
+
+// Acknowledge marks the alert acknowledged in the store and then fans
+// out alert.acknowledged through the hub. A store failure on the
+// update is returned to the caller and nothing is dispatched. Once the
+// update has succeeded the notification is best-effort: if the alert
+// cannot be re-read, the dispatch is skipped and nil is still returned.
+func (e *Engine) Acknowledge(ctx context.Context, alertID, userID string, when time.Time) error {
+	err := e.store.Acknowledge(ctx, alertID, userID, when)
+	if err != nil {
+		return err
+	}
+	acked, getErr := e.store.GetAlert(ctx, alertID)
+	if getErr == nil && acked != nil {
+		// Detach from the caller's cancellation so the fan-out is not
+		// tied to the lifetime of the calling request.
+		payload := alertPayload(ctx, e.store, notification.EventAcknowledged, acked)
+		go e.hub.Dispatch(context.WithoutCancel(ctx), payload)
+	}
+	return nil
+}
+
+// Resolve marks the alert resolved and fans out alert.resolved.
+// The alert is read before the store update; a failed or empty read is
+// ignored and only suppresses the notification. A failure of the
+// update itself is returned and nothing is dispatched.
+func (e *Engine) Resolve(ctx context.Context, alertID string, when time.Time) error {
+	snapshot, _ := e.store.GetAlert(ctx, alertID)
+
+	err := e.store.Resolve(ctx, alertID, when)
+	if err != nil {
+		return err
+	}
+
+	if snapshot != nil {
+		payload := alertPayload(ctx, e.store, notification.EventResolved, snapshot)
+		go e.hub.Dispatch(context.WithoutCancel(ctx), payload)
+	}
+	return nil
+}
+
+// alertPayload converts a stored Alert into a notification Payload for
+// event ev. Host-less alerts (nil HostID) get empty HostID/HostName.
+// Otherwise HostName is the host's Name when the lookup succeeds and
+// the raw host ID when it does not.
+func alertPayload(ctx context.Context, st *store.Store, ev notification.Event, a *store.Alert) notification.Payload {
+	payload := notification.Payload{
+		Event:    ev,
+		AlertID:  a.ID,
+		Severity: a.Severity,
+		Kind:     a.Kind,
+		Message:  a.Message,
+		RaisedAt: a.CreatedAt,
+	}
+	if a.HostID == nil {
+		return payload
+	}
+
+	payload.HostID = *a.HostID
+	// Start from the ID so the payload always names the host somehow.
+	payload.HostName = payload.HostID
+	if host, err := st.GetHost(ctx, payload.HostID); err == nil && host != nil {
+		payload.HostName = host.Name
+	}
+	return payload
+}
+
+// ResolveOnModeChange wipes the mode-dependent alerts (agent_offline,
+// then stale_schedule) for a host whose always-on flag was just
+// toggled. The next 60s tick re-raises whichever still applies under
+// the new mode, so the call is self-correcting. Safe to invoke from
+// the HTTP layer (it only touches the store + hub).
+func (e *Engine) ResolveOnModeChange(ctx context.Context, hostID string, when time.Time) {
+	modeDependent := []string{KindAgentOffline, KindStaleSchedule}
+	for _, kind := range modeDependent {
+		// These kinds are raised with an empty dedup key.
+		e.resolveAndNotify(ctx, hostID, kind, "", when)
+	}
+}
+
+// resolveAndNotify clears the open (or acknowledged) alert matching
+// (host_id, kind, dedup_key) via store.AutoResolve, then fires
+// alert.resolved for every matching row that was listed as open or
+// acknowledged just before the resolve. Best-effort — errors are
+// logged but do not propagate.
+//
+// Dispatch uses a context detached from the caller's cancellation (as
+// Acknowledge and Resolve do), because this path is reachable from
+// request-scoped callers such as ResolveOnModeChange.
+func (e *Engine) resolveAndNotify(ctx context.Context, hostID, kind, dedupKey string, when time.Time) {
+	// Collect the candidate rows before resolving so there is something
+	// to notify about afterwards.
+	open, err := e.store.ListAlerts(ctx, store.AlertFilter{
+		Status: "open", HostID: hostID,
+	})
+	if err != nil {
+		slog.Warn("alert: list open before auto-resolve", "kind", kind, "host_id", hostID, "dedup_key", dedupKey, "err", err)
+		return
+	}
+	openAcked, err := e.store.ListAlerts(ctx, store.AlertFilter{
+		Status: "acknowledged", HostID: hostID,
+	})
+	if err != nil {
+		// Carry on with whatever was returned: the resolve below still
+		// runs, acknowledged rows may just miss their notification.
+		slog.Warn("alert: list acknowledged before auto-resolve", "kind", kind, "host_id", hostID, "dedup_key", dedupKey, "err", err)
+	}
+	all := append(open, openAcked...)
+	if err := e.store.AutoResolve(ctx, hostID, kind, dedupKey, when); err != nil {
+		slog.Warn("alert: auto-resolve", "kind", kind, "host_id", hostID, "dedup_key", dedupKey, "err", err)
+		return
+	}
+	// Fall back to the host ID when the host cannot be loaded.
+	host, _ := e.store.GetHost(ctx, hostID)
+	hostName := hostID
+	if host != nil {
+		hostName = host.Name
+	}
+	detached := context.WithoutCancel(ctx)
+	for _, a := range all {
+		// The listings are per-host; narrow to the (kind, dedup_key) pair.
+		if a.Kind != kind || a.DedupKey != dedupKey {
+			continue
+		}
+		go e.hub.Dispatch(detached, notification.Payload{
+			Event:    notification.EventResolved,
+			AlertID:  a.ID,
+			Severity: a.Severity,
+			Kind:     a.Kind,
+			HostID:   hostID,
+			HostName: hostName,
+			Message:  fmt.Sprintf("Auto-resolved (%s)", kind),
+			RaisedAt: when,
+		})
+	}
+}
@@ -0,0 +1,125 @@
+package alert
+
+import (
+	"context"
+	"path/filepath"
+	"testing"
+	"time"
+
+	"github.com/oklog/ulid/v2"
+
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/crypto"
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/notification"
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
+)
+
+// setupEngine builds an Engine backed by a fresh store in t.TempDir(),
+// a notification hub keyed by a freshly generated secret, and one
+// enrolled host ("alfa-01"). It returns the engine, the store and the
+// host's ID. Any setup failure aborts the test immediately, so later
+// assertions never run against a nil store or a half-built engine.
+// The store is closed via t.Cleanup.
+func setupEngine(t *testing.T) (*Engine, *store.Store, string) {
+	t.Helper()
+	ctx := context.Background()
+	dir := t.TempDir()
+
+	st, err := store.Open(ctx, filepath.Join(dir, "rm.db"))
+	if err != nil {
+		t.Fatalf("open store: %v", err)
+	}
+	t.Cleanup(func() { _ = st.Close() })
+
+	keyPath := filepath.Join(dir, "secret.key")
+	if err := crypto.GenerateKeyFile(keyPath); err != nil {
+		t.Fatalf("generate key file: %v", err)
+	}
+	key, err := crypto.LoadKeyFromFile(keyPath)
+	if err != nil {
+		t.Fatalf("load key: %v", err)
+	}
+	aead, err := crypto.NewAEAD(key)
+	if err != nil {
+		t.Fatalf("new aead: %v", err)
+	}
+
+	hub := notification.NewHub(st, aead, "https://rm.example")
+	eng := NewEngine(st, hub)
+
+	hostID := ulid.Make().String()
+	if err := st.CreateHost(ctx, store.Host{
+		ID: hostID, Name: "alfa-01", OS: "linux", Arch: "amd64",
+		EnrolledAt: time.Now().UTC(),
+	}, "deadbeef", ""); err != nil {
+		t.Fatalf("create host: %v", err)
+	}
+	return eng, st, hostID
+}
+
+// TestEngineBackupFailedRaisesThenResolves walks a backup_failed alert
+// through its lifecycle: the first failed backup raises one open alert,
+// a second failure leaves the open count at one, and a succeeded backup
+// leaves no open alerts. Listing errors fail the test so the final
+// "zero open" check cannot pass on a failed query.
+func TestEngineBackupFailedRaisesThenResolves(t *testing.T) {
+	t.Parallel()
+	eng, st, hostID := setupEngine(t)
+	ctx := context.Background()
+
+	eng.handleJobFinished(ctx, JobFinishedEvent{
+		HostID: hostID, JobID: "j1", Kind: "backup", Status: "failed",
+		When: time.Now().UTC(),
+	})
+	open, err := st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
+	if err != nil {
+		t.Fatalf("ListAlerts after first failure: %v", err)
+	}
+	if len(open) != 1 || open[0].Kind != KindBackupFailed {
+		t.Fatalf("expected one backup_failed open; got %+v", open)
+	}
+
+	// Second failed job should TOUCH (not raise a fresh row).
+	eng.handleJobFinished(ctx, JobFinishedEvent{
+		HostID: hostID, JobID: "j2", Kind: "backup", Status: "failed",
+		When: time.Now().UTC().Add(time.Minute),
+	})
+	open, err = st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
+	if err != nil {
+		t.Fatalf("ListAlerts after second failure: %v", err)
+	}
+	if len(open) != 1 {
+		t.Fatalf("expected dedup to stay at 1 open; got %d", len(open))
+	}
+
+	// Success auto-resolves.
+	eng.handleJobFinished(ctx, JobFinishedEvent{
+		HostID: hostID, JobID: "j3", Kind: "backup", Status: "succeeded",
+		When: time.Now().UTC().Add(2 * time.Minute),
+	})
+	open, err = st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
+	if err != nil {
+		t.Fatalf("ListAlerts after success: %v", err)
+	}
+	if len(open) != 0 {
+		t.Fatalf("expected zero open after success; got %d", len(open))
+	}
+}
+
+// TestEngineCheckFailedSeverityCritical verifies that a failed check job
+// leaves exactly one open alert for the host and that its severity is
+// "critical".
+func TestEngineCheckFailedSeverityCritical(t *testing.T) {
+	t.Parallel()
+	eng, st, hostID := setupEngine(t)
+	ctx := context.Background()
+
+	failedCheck := JobFinishedEvent{
+		HostID: hostID,
+		Kind:   "check",
+		Status: "failed",
+		When:   time.Now().UTC(),
+	}
+	eng.handleJobFinished(ctx, failedCheck)
+
+	alerts, _ := st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
+	if len(alerts) == 1 && alerts[0].Severity == "critical" {
+		return
+	}
+	t.Fatalf("got %+v", alerts)
+}
+
+// TestEngineAgentOfflineRespects15MinFloor forces the host's
+// last_seen_at to 20 minutes ago, then checks that handleHostOffline
+// raises exactly one agent_offline alert and that handleHostOnline
+// leaves no open alerts. Listing errors fail the test so the final
+// "zero open" check cannot pass on a failed query.
+func TestEngineAgentOfflineRespects15MinFloor(t *testing.T) {
+	t.Parallel()
+	eng, st, hostID := setupEngine(t)
+	ctx := context.Background()
+	// Host's last_seen_at defaulted to NULL via CreateHost (enrolled but never
+	// seen). Force a stale value for the test by direct DB update.
+	if _, err := st.DB().Exec(
+		`UPDATE hosts SET last_seen_at = ? WHERE id = ?`,
+		time.Now().UTC().Add(-20*time.Minute).Format(time.RFC3339Nano), hostID,
+	); err != nil {
+		t.Fatalf("update last_seen_at: %v", err)
+	}
+	eng.handleHostOffline(ctx, hostID)
+	open, err := st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
+	if err != nil {
+		t.Fatalf("ListAlerts after offline: %v", err)
+	}
+	if len(open) != 1 {
+		t.Fatalf("expected agent_offline raised; got %d", len(open))
+	}
+	// Make sure the single open row is the kind under test.
+	if open[0].Kind != KindAgentOffline {
+		t.Fatalf("expected agent_offline raised; got %+v", open[0])
+	}
+
+	// Bring back online — should auto-resolve.
+	eng.handleHostOnline(ctx, hostID)
+	open, err = st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
+	if err != nil {
+		t.Fatalf("ListAlerts after online: %v", err)
+	}
+	if len(open) != 0 {
+		t.Fatalf("expected agent_offline resolved; got %d", len(open))
+	}
+}
+
+// TestEngineAgentOfflineUnderFloorNoRaise checks that handleHostOffline
+// raises nothing for a host whose last_seen_at was never set. A listing
+// error fails the test, since the only assertion is "zero open alerts"
+// and would otherwise pass on a failed query.
+func TestEngineAgentOfflineUnderFloorNoRaise(t *testing.T) {
+	t.Parallel()
+	eng, st, hostID := setupEngine(t)
+	ctx := context.Background()
+	// last_seen_at is NULL from CreateHost (never touched). A nil
+	// last_seen_at means the host was enrolled but never connected —
+	// treat that as "now" for the floor check so we don't raise
+	// immediately. handleHostOffline must skip the raise.
+	eng.handleHostOffline(ctx, hostID)
+	open, err := st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
+	if err != nil {
+		t.Fatalf("ListAlerts: %v", err)
+	}
+	if len(open) != 0 {
+		t.Fatalf("expected no raise within 15-min floor; got %d", len(open))
+	}
+}
@@ -0,0 +1,63 @@
+package alert
+
+import (
+	"context"
+	"fmt"
+	"log/slog"
+	"time"
+
+	"gitea.dcglab.co.uk/steve/restic-manager/internal/notification"
+)
+
+// Alert-kind constants for P6 self-update flows.
+const (
+	// KindUpdateFailed is raised when an agent fails to come back with
+	// the expected version after a command.update dispatch (timeout or
+	// version-mismatch). Resolved by a subsequent matching hello.
+	// Raised at "warning" severity with the host ID as dedup key
+	// (see RaiseUpdateFailed).
+	KindUpdateFailed = "update_failed"
+
+	// KindFleetUpdateHalted is raised when the fleet-update worker
+	// stops mid-run because a host failed to update or went offline.
+	// Host-less alert (system-scoped). Manually resolved by an admin.
+	// Raised at "warning" severity with the fleet update ID as dedup
+	// key (see RaiseFleetUpdateHalted).
+	KindFleetUpdateHalted = "fleet_update_halted"
+)
+
+// RaiseUpdateFailed raises (or touches) the update_failed alert for
+// hostID at "warning" severity. The host ID doubles as the dedup key,
+// so a re-dispatch on the same host touches the existing alert rather
+// than spawning a duplicate. jobID and reason are folded into the
+// alert message.
+func (e *Engine) RaiseUpdateFailed(ctx context.Context, hostID, jobID, reason string, when time.Time) {
+	dedupKey := hostID
+	e.raiseAndNotify(ctx, hostID, KindUpdateFailed, dedupKey, "warning",
+		fmt.Sprintf("Agent update failed (job %s): %s", jobID, reason), when)
+}
+
+// ResolveUpdateFailed auto-resolves the update_failed alert for hostID,
+// if one is outstanding. Called from the WS hello path when the agent
+// reconnects with the target version.
+func (e *Engine) ResolveUpdateFailed(ctx context.Context, hostID string, when time.Time) {
+	// update_failed alerts are keyed by host, so the dedup key is the host ID.
+	dedupKey := hostID
+	e.resolveAndNotify(ctx, hostID, KindUpdateFailed, dedupKey, when)
+}
+
+// RaiseFleetUpdateHalted is host-less — the fleet update is a
+// system-level concept. We persist it via the dedicated host-less
+// alert path so the alerts table's host_id column carries NULL.
+//
+// The fleet update ID is the dedup key, and the alert is raised at
+// "warning" severity. A notification goes out only on the first raise;
+// store errors are logged and swallowed. Dispatch runs with a context
+// detached from the caller's cancellation (as Acknowledge and Resolve
+// do), so the notification is not cut short when the caller's ctx ends.
+func (e *Engine) RaiseFleetUpdateHalted(ctx context.Context, fleetUpdateID, reason string, when time.Time) {
+	msg := fmt.Sprintf("Fleet update %s halted: %s", fleetUpdateID, reason)
+	id, didRaise, err := e.store.RaiseOrTouchSystem(ctx, KindFleetUpdateHalted, fleetUpdateID, "warning", msg, when)
+	if err != nil {
+		slog.Warn("alert: raise fleet_update_halted", "fu_id", fleetUpdateID, "err", err)
+		return
+	}
+	if !didRaise {
+		// Existing alert was only touched; no second notification.
+		return
+	}
+	go e.hub.Dispatch(context.WithoutCancel(ctx), notification.Payload{
+		Event:    notification.EventRaised,
+		AlertID:  id,
+		Severity: "warning",
+		Kind:     KindFleetUpdateHalted,
+		HostID:   "",
+		HostName: "",
+		Message:  msg,
+		RaisedAt: when,
+	})
+}
@@ -52,14 +52,18 @@ type JobKind string

 // Allowed JobKind values. backup is operator/cron driven; init runs
 // once per host on first connect; forget/prune/check fire from the
-// server-side maintenance ticker; unlock is operator-only.
+// server-side maintenance ticker; unlock and restore are operator-
+// only; diff is operator-only and read-only.
 const (
-	JobBackup JobKind = "backup"
-	JobInit   JobKind = "init"
-	JobForget JobKind = "forget"
-	JobPrune  JobKind = "prune"
-	JobCheck  JobKind = "check"
-	JobUnlock JobKind = "unlock"
+	JobBackup  JobKind = "backup"
+	JobInit    JobKind = "init"
+	JobForget  JobKind = "forget"
+	JobPrune   JobKind = "prune"
+	JobCheck   JobKind = "check"
+	JobUnlock  JobKind = "unlock"
+	JobRestore JobKind = "restore"
+	JobDiff    JobKind = "diff"
+	JobUpdate  JobKind = "update"
 )

 // JobStatus is the lifecycle state of a job.
@@ -130,6 +134,48 @@ type CommandRunPayload struct {
 	Tag                string        `json:"tag,omitempty"`
 	ForgetGroups       []ForgetGroup `json:"forget_groups,omitempty"`
 	RequiresAdminCreds bool          `json:"requires_admin_creds,omitempty"`
+
+	// Per-job bandwidth caps in KB/s. When nil, the agent uses the
+	// host-wide caps it received via config.update. When non-nil,
+	// the override wins for this job only — even a non-nil pointer
+	// to zero means "no cap for this job" (caller's explicit choice).
+	BandwidthUpKBps   *int `json:"bandwidth_up_kbps,omitempty"`
+	BandwidthDownKBps *int `json:"bandwidth_down_kbps,omitempty"`
+
+	// Hooks run only for kind=backup. Server resolves source-group
+	// hook → host default → empty before dispatching, so the agent
+	// just executes whatever is here.
+	PreHook  string `json:"pre_hook,omitempty"`
+	PostHook string `json:"post_hook,omitempty"`
+
+	// Restore is populated only for kind=restore. See RestorePayload
+	// for the shape; nil for every other kind.
+	Restore *RestorePayload `json:"restore,omitempty"`
+
+	// Diff is populated only for kind=diff. See DiffPayload for
+	// shape; nil for every other kind.
+	Diff *DiffPayload `json:"diff,omitempty"`
+}
+
+// RestorePayload carries restore-specific arguments on a JobRestore
+// command.run. Paths are absolute paths inside the snapshot (same
+// shape restic accepts as positional args). When InPlace is true the
+// agent restores at root (`--target /`) and preserves uid/gid/mode;
+// otherwise it restores into TargetDir with --no-ownership so the
+// operator can inspect the files as the agent user.
+type RestorePayload struct {
+	SnapshotID string   `json:"snapshot_id"`
+	Paths      []string `json:"paths"`
+	InPlace    bool     `json:"in_place"`
+	TargetDir  string   `json:"target_dir,omitempty"` // ignored when in_place=true
+}
+
+// DiffPayload carries snapshot-diff arguments on a JobDiff command.run.
+// SnapshotA / SnapshotB may be either short or long IDs; restic
+// accepts both.
+type DiffPayload struct {
+	SnapshotA string `json:"snapshot_a"`
+	SnapshotB string `json:"snapshot_b"`
 }

 // CommandCancelPayload is the server → agent cancel signal.
@@ -306,13 +352,56 @@ type ConfigUpdatePayload struct {
 	RepoCredential string `json:"repo_credential,omitempty"` // sensitive (for rest server basic auth)
 	HookShell      string `json:"hook_shell,omitempty"`
 	Slot           string `json:"slot,omitempty"`
+
+	// Bandwidth caps in KB/s. Pointer semantics so the server can
+	// disambiguate "no change in this push" (nil → omitted on the
+	// wire) from "explicitly clear the cap" (zero or negative value).
+	// Applied to every restic invocation as --limit-upload /
+	// --limit-download. Per-job overrides ride on CommandRunPayload.
+	BandwidthUpKBps   *int `json:"bandwidth_up_kbps,omitempty"`
+	BandwidthDownKBps *int `json:"bandwidth_down_kbps,omitempty"`
 }

-// AgentUpdateAvailablePayload — informational only; the agent does
-// NOT self-update. See spec.md §4.2 for the package-manager-based
-// update model.
-type AgentUpdateAvailablePayload struct {
-	LatestVersion string `json:"latest_version"`
-	PackageURL    string `json:"package_url"` // apt repo / choco source
-	Changelog     string `json:"changelog,omitempty"`
+// CommandUpdatePayload carries no operational data — the agent
+// already knows its own os/arch and fetches from its configured
+// server URL via /agent/binary. JobID is the server-issued id of
+// the update job; the agent echoes it on log.stream lines so the
+// live job log captures pre-restart progress, then either exits
+// (Linux) or hands off to a detached helper script (Windows).
+type CommandUpdatePayload struct {
+	JobID string `json:"job_id"`
+}
+
+// TreeListRequestPayload is the body of a tree.list RPC. Used by the
+// restore wizard to lazy-load directory contents from a snapshot.
+//
+// The exchange is synchronous: the server marshals MsgTreeList with a
+// fresh Envelope.ID, sends to the agent, blocks on a channel keyed by
+// that ID. The agent runs `restic ls --json <SnapshotID> <Path>`,
+// emits direct children, and replies with MsgTreeListResult carrying
+// the same ID. The server-side handler matches on ID and forwards to
+// the waiting channel. See internal/server/ws/rpc.go for the helper.
+type TreeListRequestPayload struct {
+	SnapshotID string `json:"snapshot_id"`
+	Path       string `json:"path"` // absolute path inside the snapshot, "/" for root
+}
+
+// TreeListEntry is one direct child returned by a tree.list call.
+// Type is "dir" | "file" | "symlink"; size is best-effort (zero on
+// directories and symlinks).
+type TreeListEntry struct {
+	Name string `json:"name"`
+	Type string `json:"type"`
+	Size int64  `json:"size,omitempty"`
+}
+
+// TreeListResultPayload is the reply to a tree.list. Error is set
+// when the agent couldn't fulfil the request (missing snapshot,
+// path doesn't exist, restic invocation failed); Entries is empty in
+// that case. A successful empty directory has Error="" + nil Entries.
+type TreeListResultPayload struct {
+	SnapshotID string          `json:"snapshot_id"`
+	Path       string          `json:"path"`
+	Entries    []TreeListEntry `json:"entries,omitempty"`
+	Error      string          `json:"error,omitempty"`
 }
@@ -12,27 +12,29 @@ type MessageType string

 // Agent → server message types.
 const (
-	MsgHello         MessageType = "hello"
-	MsgHeartbeat     MessageType = "heartbeat"
-	MsgJobStarted    MessageType = "job.started"
-	MsgJobProgress   MessageType = "job.progress"
-	MsgJobFinished   MessageType = "job.finished"
-	MsgSnapshotsRpt  MessageType = "snapshots.report"
-	MsgRepoStats     MessageType = "repo.stats"
-	MsgLogStream     MessageType = "log.stream"
-	MsgScheduleAck   MessageType = "schedule.ack"
-	MsgScheduleFire  MessageType = "schedule.fire"  // agent: a local cron entry fired, please dispatch a job
-	MsgCommandResult MessageType = "command.result" // ack for command.run
-	MsgError         MessageType = "error"
+	MsgHello          MessageType = "hello"
+	MsgHeartbeat      MessageType = "heartbeat"
+	MsgJobStarted     MessageType = "job.started"
+	MsgJobProgress    MessageType = "job.progress"
+	MsgJobFinished    MessageType = "job.finished"
+	MsgSnapshotsRpt   MessageType = "snapshots.report"
+	MsgRepoStats      MessageType = "repo.stats"
+	MsgLogStream      MessageType = "log.stream"
+	MsgScheduleAck    MessageType = "schedule.ack"
+	MsgScheduleFire   MessageType = "schedule.fire"    // agent: a local cron entry fired, please dispatch a job
+	MsgCommandResult  MessageType = "command.result"   // ack for command.run
+	MsgTreeListResult MessageType = "tree.list.result" // reply to a server-driven tree.list
+	MsgError          MessageType = "error"
 )

 // Server → agent message types.
 const (
-	MsgCommandRun       MessageType = "command.run"
-	MsgCommandCancel    MessageType = "command.cancel"
-	MsgScheduleSet      MessageType = "schedule.set"
-	MsgConfigUpdate     MessageType = "config.update"
-	MsgAgentUpdateAvail MessageType = "agent.update.available"
+	MsgCommandRun    MessageType = "command.run"
+	MsgCommandCancel MessageType = "command.cancel"
+	MsgScheduleSet   MessageType = "schedule.set"
+	MsgConfigUpdate  MessageType = "config.update"
+	MsgCommandUpdate MessageType = "command.update"
+	MsgTreeList      MessageType = "tree.list" // sync RPC: list a snapshot's children
 )

 // Envelope is the framing for every WS message in either direction.
@@ -76,7 +78,7 @@ type ErrorCode string
 const (
 	ErrProtocolTooOld ErrorCode = "protocol_too_old"
 	ErrProtocolTooNew ErrorCode = "protocol_too_new"
-	ErrUnauthorized   ErrorCode = "unauthorized"
+	ErrUnauthorized   ErrorCode = "unauthorised"
 	ErrBadRequest     ErrorCode = "bad_request"
 	ErrInternal       ErrorCode = "internal"
 )
@@ -9,6 +9,7 @@ import (
 	"errors"
 	"fmt"
 	"strings"
+	"testing"

 	"golang.org/x/crypto/argon2"
 )
@@ -27,22 +28,38 @@ const (
 	defaultKeyLen     = 32
 )

+// Cheap params used only when the binary is a `go test` binary
+// (testing.Testing() == true). Argon2id at production params costs
+// 300–500 ms per hash and dominates wall time on CI runners under
+// `-race`. Tests don't need real KDF strength — VerifyPassword reads
+// params from the encoded hash, so verifying a cheap-params hash
+// works the same way.
+const (
+	testMemoryKiB  = 8
+	testIterations = 1
+	testParallel   = 1
+)
+
 // HashPassword returns an argon2id-encoded string of the form
 //
 //	$argon2id$v=19$m=...,t=...,p=...$<salt>$<hash>
 //
 // safe to store in a TEXT column. The salt is freshly random per call.
 func HashPassword(password string) (string, error) {
+	mem, iter, par := uint32(defaultMemoryKiB), uint32(defaultIterations), uint8(defaultParallel)
+	if testing.Testing() {
+		mem, iter, par = testMemoryKiB, testIterations, testParallel
+	}
 	salt := make([]byte, defaultSaltLen)
 	if _, err := rand.Read(salt); err != nil {
 		return "", fmt.Errorf("auth: read salt: %w", err)
 	}
 	hash := argon2.IDKey([]byte(password), salt,
-		defaultIterations, defaultMemoryKiB, defaultParallel, defaultKeyLen)
+		iter, mem, par, defaultKeyLen)

 	return fmt.Sprintf("$argon2id$v=%d$m=%d,t=%d,p=%d$%s$%s",
 		argon2.Version,
-		defaultMemoryKiB, defaultIterations, defaultParallel,
+		mem, iter, par,
 		base64.RawStdEncoding.EncodeToString(salt),
 		base64.RawStdEncoding.EncodeToString(hash),
 	), nil
@@ -56,7 +73,7 @@ func VerifyPassword(encoded, password string) error {
 	parts := strings.Split(encoded, "$")
 	// "$argon2id$v=...$m=...,t=...,p=...$<salt>$<hash>" → 6 parts (leading empty)
 	if len(parts) != 6 || parts[1] != "argon2id" {
-		return errors.New("auth: unrecognized hash format")
+		return errors.New("auth: unrecognised hash format")
 	}
 	var version int
 	if _, err := fmt.Sscanf(parts[2], "v=%d", &version); err != nil {
@@ -2,7 +2,7 @@
 // passwords, REST-server credentials, hook bodies, and any other
 // secret that lands in the SQLite store.
 //
-// The threat model is "defense in depth against a stolen DB file" —
+// The threat model is "defence in depth against a stolen DB file" —
 // not "an attacker with code execution can't read secrets at runtime."
 // We need the encryption key at runtime to do any actual work, so
 // anyone with a memory dump of the running server can extract it.
@@ -0,0 +1,20 @@
+package notification
+
+import (
+	"context"
+	"time"
+)
+
+// Channel is the per-kind transport. Implementations live in
+// webhook.go / ntfy.go / smtp.go. Send must respect ctx (5s for HTTP,
+// 10s for SMTP) and never panic.
+//
+// NOTE(review): the interface itself does not enforce the 5s/10s
+// budgets — confirm whether the dispatcher or each implementation
+// applies them.
+type Channel interface {
+	// Kind returns the kind string ("webhook", "ntfy", "smtp"). Used
+	// for log enrichment and dispatcher routing.
+	Kind() string
+
+	// Send delivers one payload. Returns (statusCode, latency, err).
+	// statusCode is HTTP for HTTP channels, the SMTP final-line code
+	// (e.g. 250) for SMTP, 0 if the call didn't reach a wire response.
+	// latency is presumably the elapsed time of the delivery attempt —
+	// TODO confirm against the implementations.
+	Send(ctx context.Context, p Payload) (statusCode int, latency time.Duration, err error)
+}
--- a/Show More
+++ b/Show More