Compare commits
287 Commits
main
..
1af02f4495
| Author | SHA1 | Date | |
|---|---|---|---|
| 1af02f4495 | |||
| 06d63b29ff | |||
| 82931684eb | |||
| 39dcda4e9e | |||
| 1b9b23f205 | |||
| c4dc9e9119 | |||
| 7011510092 | |||
| 42eeabea9a | |||
| 7b390e9e5e | |||
| afd15c6990 | |||
| 2562b2c7b5 | |||
| 8be551349c | |||
| a48df77f40 | |||
| 70769f0841 | |||
| ea74965830 | |||
| 9c209a952e | |||
| 871490b9d4 | |||
| d317d2e561 | |||
| 00bfef0aee | |||
| 363bdff85b | |||
| 20425b3360 | |||
| 9c098e773b | |||
| 711d5e964c | |||
| 39657355be | |||
| 0bd075c2a3 | |||
| 83d97a27cc | |||
| ccaccd840a | |||
| 94441a5371 | |||
| 3fa7be51a5 | |||
| 6fd2a2ff77 | |||
| d413896302 | |||
| 74cf24c28b | |||
| 22bcf69e6c | |||
| fe1ed49977 | |||
| d24856866e | |||
| 731f01a63e | |||
| c80ca90efb | |||
| c32acc0332 | |||
| 505a2d7a79 | |||
| 3800b34a2b | |||
| b91fe56c83 | |||
| d6f6d19bff | |||
| 7cc17813a9 | |||
| 5ee58979fa | |||
| 4d90f72575 | |||
| 3173f85b97 | |||
| 962a5affea | |||
| 885439b048 | |||
| c62d7d3ac3 | |||
| 86598d6357 | |||
| c55a75355a | |||
| f56844b5c6 | |||
| 878c82a328 | |||
| e7d891c4fc | |||
| 5c844ad9b7 | |||
| 6006cad992 | |||
| 7f8bd13a07 | |||
| 805380f52d | |||
| c2581e56e8 | |||
| dc89997307 | |||
| cdbd8eeb88 | |||
| bc19ad8804 | |||
| 814e49cb93 | |||
| 4b48925edf | |||
| 36fd9050fe | |||
| 89d4458866 | |||
| 191f0f1c55 | |||
| 00b926b0a3 | |||
| dfff6d1ef9 | |||
| 0415a96e27 | |||
| d85e82110f | |||
| d2cc4a802e | |||
| c34a76393c | |||
| 6ccc6c8c5e | |||
| b0a5a76925 | |||
| 88f1959a6a | |||
| cae4147df6 | |||
| dbb8550936 | |||
| 90bcddb27e | |||
| cd3c13e2c6 | |||
| a74dc33c1c | |||
| a985d45daa | |||
| 57a13f0759 | |||
| 8d4c4426b0 | |||
| cbdd94ca12 | |||
| c1e974aad9 | |||
| 95aee73e2c | |||
| f87ba29836 | |||
| 2073898c10 | |||
| 37a25beb14 | |||
| f0828782c1 | |||
| 12391abef0 | |||
| 2c090171e5 | |||
| bd08d8ca14 | |||
| a7e53e0a64 | |||
| ca170fedc5 | |||
| c9f230ce1d | |||
| 282258e837 | |||
| 4eab42a9c3 | |||
| 03e5ec31f1 | |||
| 6fd16ace81 | |||
| ba425c9766 | |||
| 1d0d994bc4 | |||
| 489f831fc7 | |||
| 3f36bcd0b0 | |||
| cb3260b89c | |||
| 8813e93317 | |||
| 9860b412f7 | |||
| 1618094a26 | |||
| dd53c9e497 | |||
| 84814b1386 | |||
| a45c801884 | |||
| 7792aadb94 | |||
| 2eac324cec | |||
| 3cdaee63d4 | |||
| 7f2a9964db | |||
| feaeff217d | |||
| cffad4b4f3 | |||
| 84e121bb9c | |||
| c5b884a22b | |||
| 3d99306cea | |||
| 6466f8c759 | |||
| 9be3cead8e | |||
| ee410fcf95 | |||
| e0fbb8c980 | |||
| 371fe734f3 | |||
| d373d19647 | |||
| cd38b40516 | |||
| de6939b3f6 | |||
| 873821b871 | |||
| 8c42b00228 | |||
| cb4695e09a | |||
| f38930e2e6 | |||
| 16e71a0708 | |||
| a6ac9ee71d | |||
| a99864c649 | |||
| f0a323ef91 | |||
| c22fb24f5b | |||
| 6688b3f88a | |||
| 69fc89143d | |||
| b5a0aa4667 | |||
| f24dfa5214 | |||
| 640b64710e | |||
| e6d965d7a5 | |||
| 4b70939ab5 | |||
| 518c29ddb3 | |||
| 6165e34f6f | |||
| 64861a5fb8 | |||
| 28d5043eb0 | |||
| e4031d26fa | |||
| 02250670c1 | |||
| 8e06bc7924 | |||
| f0dfa689fe | |||
| a2398d0b66 | |||
| e22b41d452 | |||
| 1111124573 | |||
| 6e47efc146 | |||
| 265b4b6c5d | |||
| 6d295bc9f6 | |||
| 9fa2ef48f0 | |||
| 454a2415dc | |||
| 0bd7a896c4 | |||
| bdabcfb68e | |||
| c691dc8a56 | |||
| 8ceb76c733 | |||
| d29475560d | |||
| bbdf631a01 | |||
| a3a53e3b87 | |||
| 567561a6a3 | |||
| a8e6c9d6d7 | |||
| 1d3661470f | |||
| 13c35b68d4 | |||
| c20375eaf5 | |||
| cce3cd8384 | |||
| 93ab0ae84f | |||
| 6589f23313 | |||
| ddc07609cb | |||
| 21d967a2cf | |||
| 24973bdc72 | |||
| cd510d2032 | |||
| a07d7fc53e | |||
| bc02fcb498 | |||
| d8dd21b5e0 | |||
| b054e7b987 | |||
| 99ef2b7a71 | |||
| b8c9c50a93 | |||
| 18cc90d54e | |||
| a1db4ce4f7 | |||
| 99b88d08c9 | |||
| 1629dc7146 | |||
| 0c9ea75046 | |||
| 3e337dfb3c | |||
| e64cf25c0e | |||
| 2794d5a821 | |||
| c47cc682e0 | |||
| e7e11454a8 | |||
| 77a8590e3a | |||
| 46ec123f95 | |||
| b35f1736f7 | |||
| a8aff2c62b | |||
| 1ae567021a | |||
| 81a00202d0 | |||
| dafae84149 | |||
| d3c354cd97 | |||
| 1f600fa849 | |||
| 212fd3e400 | |||
| c9be9040d9 | |||
| 7fd29427a0 | |||
| 49fd3f4441 | |||
| f3eaf511be | |||
| 2caf7f1193 | |||
| 4ad0b5147a | |||
| f97f67eb67 | |||
| bc77081366 | |||
| 87655cf0e4 | |||
| de6d51eeb1 | |||
| 212ddfe226 | |||
| b640775a61 | |||
| 13f58537ad | |||
| a24eee4c68 | |||
| 0ae62261e3 | |||
| dd7b37a5c1 | |||
| 694d9d9bf3 | |||
| 2d40002355 | |||
| e871b05b38 | |||
| 18a9f6624e | |||
| 2a8dd1eba2 | |||
| fab99b4a38 | |||
| ffba7371c5 | |||
| 4035c44be3 | |||
| d62b173712 | |||
| 8b91d3037c | |||
| 64d2fcf7a3 | |||
| 67ca769686 | |||
| dede74fd3a | |||
| 0ed9c3d1ec | |||
| a535822ff3 | |||
| 21841e38c4 | |||
| e968abc042 | |||
| 713bc4a2bb | |||
| d000fe7ec1 | |||
| 337dcc0f0f | |||
| 813158b3d6 | |||
| 5667cdf13a | |||
| 666af41f46 | |||
| 7a7cac588c | |||
| fdecde0d5c | |||
| f62a90b4b3 | |||
| 1b947f5a2c | |||
| c565a7abd1 | |||
| 7e49b62e0e | |||
| e0037f0026 | |||
| 72d8081b0d | |||
| 8a05969953 | |||
| 148e61b33b | |||
| 160d788bae | |||
| 6450bf1b88 | |||
| 946b6db137 | |||
| 4b075840a1 | |||
| ee3ee241ea | |||
| 12b72e7dde | |||
| bd434bd1d0 | |||
| 26a2b85e13 | |||
| dad8c7fe99 | |||
| ee16bc7ce7 | |||
| 229f89fee2 | |||
| 136e1a1d8f | |||
| f9c2351ab6 | |||
| 81c7825937 | |||
| b6cfa99413 | |||
| 2418e585db | |||
| 5d1951ad94 | |||
| ec276dbc91 | |||
| 0ba56ed30d | |||
| e58917106d | |||
| 6c9558c703 | |||
| 3904a78f14 | |||
| 41a4043af3 | |||
| 77a305d064 | |||
| 95b49ecab9 | |||
| e8eccd20c2 | |||
| f34773b505 | |||
| 84fd31ccaa | |||
| c275f4ff4c | |||
| 595546afb9 | |||
| c9368de904 | |||
| 7612687a14 |
+37
-50
@@ -2,34 +2,28 @@
|
||||
#
|
||||
# Notes for anyone editing this file:
|
||||
#
|
||||
# Custom runner image
|
||||
# Every job runs inside `gitea.dcglab.co.uk/steve/ci-runner-go`
|
||||
# (recipe: https://gitea.dcglab.co.uk/steve/ci/src/branch/main/images/ci-runner-go).
|
||||
# That image already ships:
|
||||
# * Go on PATH at /usr/local/go/bin (so `actions/setup-go` is
|
||||
# redundant and intentionally NOT used here — the action would
|
||||
# otherwise re-download Go on every job)
|
||||
# * Node.js + npm (used by docs / e2e workflows)
|
||||
# * Docker CLI, Buildx, Compose v2 (used by docker-build steps)
|
||||
# When bumping the Go floor, push a new ci-runner-go image with
|
||||
# the matching Go version and bump the date pin in IMAGE below.
|
||||
#
|
||||
# Self-hosted runner expectations
|
||||
# Each runner host bind-mounts persistent volumes for
|
||||
# /root/go/pkg/mod (GOMODCACHE), /root/.cache/go-build (GOCACHE),
|
||||
# and /root/.cache/act (action clones) into every job container —
|
||||
# regardless of which image the container is built from. As a
|
||||
# The Gitea runners are provisioned out-of-band (the infra team owns
|
||||
# the script). Each runner host bind-mounts persistent volumes for
|
||||
# /root/go/pkg/mod (GOMODCACHE), /root/.cache/go-build (GOCACHE), and
|
||||
# /root/.cache/act (action clones) into every job container. As a
|
||||
# result:
|
||||
# * Common GitHub actions (actions/checkout, actions/upload-artifact,
|
||||
# golangci/golangci-lint-action) are pre-cloned into
|
||||
# /root/.cache/act on the runner, so the per-job
|
||||
# "git clone https://github.com/actions/..." step is a fetch,
|
||||
# not a full clone.
|
||||
# * `cache: true` on actions/setup-go is intentionally OMITTED — the
|
||||
# action would otherwise tar/untar GOMODCACHE+GOCACHE through the
|
||||
# Gitea cache backend on every job, undoing the host-volume cache
|
||||
# and adding ~10s of redundant zstd round-trip per job.
|
||||
# * Common GitHub actions (actions/checkout, actions/setup-go,
|
||||
# actions/upload-artifact, golangci/golangci-lint-action) are
|
||||
# pre-cloned into /root/.cache/act on the runner, so the per-job
|
||||
# "git clone https://github.com/actions/..." step is a fetch, not
|
||||
# a full clone.
|
||||
# * golangci-lint is pre-installed at /usr/local/bin/golangci-lint
|
||||
# on the runner host BUT that's outside the job's filesystem
|
||||
# view; the golangci-lint-action below pins a specific version
|
||||
# and re-downloads — that's fine (deterministic CI > marginal
|
||||
# speed).
|
||||
# on the runner (latest v2.x). The golangci-lint-action below
|
||||
# still pins a specific version and re-downloads — that's fine
|
||||
# (deterministic CI > marginal speed) but means the host-installed
|
||||
# binary is currently unused. Drop the `version:` arg below to
|
||||
# use the host-installed one if you want to trade determinism
|
||||
# for speed.
|
||||
#
|
||||
# Build matrix
|
||||
# Linux amd64 + arm64 + Windows amd64. CGO_ENABLED=0 throughout —
|
||||
@@ -38,10 +32,10 @@
|
||||
# binaries.
|
||||
#
|
||||
# Go version
|
||||
# Anchored by the ci-runner-go image (currently Go 1.25.7). Floor
|
||||
# is set by the heaviest dep (modernc.org/sqlite v1.50+ requires
|
||||
# Go 1.23+; we run 1.25 so golangci-lint's Go-version compatibility
|
||||
# check is happy — see the version pin in the lint job).
|
||||
# The GO_VERSION env var anchors all three jobs. Floor is set by the
|
||||
# heaviest dep (modernc.org/sqlite v1.50+ requires Go 1.23+ today;
|
||||
# we run 1.25 so golangci-lint's Go-version compatibility check is
|
||||
# happy — see the version pin in the lint job).
|
||||
#
|
||||
# upload-artifact
|
||||
# Pinned at v3 historically; v3 was deprecated upstream. v4 should
|
||||
@@ -54,12 +48,8 @@ on:
|
||||
pull_request:
|
||||
branches: [main]
|
||||
|
||||
# Force bash as the default shell. With `container:` set on every
|
||||
# job, Gitea Actions otherwise picks `sh -e` and our `set -euo
|
||||
# pipefail` fails on dash with "Illegal option -o pipefail".
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
env:
|
||||
GO_VERSION: "1.25"
|
||||
|
||||
jobs:
|
||||
test:
|
||||
@@ -70,11 +60,6 @@ jobs:
|
||||
# one runner. The third shard ("rest") covers everything else.
|
||||
name: Test (${{ matrix.name }})
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
image: docker.dcglab.co.uk/ci-runner-go:2026-05-15
|
||||
credentials:
|
||||
username: ${{ secrets.ZOT_USERNAME }}
|
||||
password: ${{ secrets.ZOT_PASSWORD }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
@@ -88,6 +73,10 @@ jobs:
|
||||
packages: ""
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: ${{ env.GO_VERSION }}
|
||||
# cache: true intentionally omitted — see header notes.
|
||||
- name: go vet
|
||||
run: go vet ./...
|
||||
- name: go test
|
||||
@@ -109,13 +98,12 @@ jobs:
|
||||
lint:
|
||||
name: Lint
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
image: docker.dcglab.co.uk/ci-runner-go:2026-05-15
|
||||
credentials:
|
||||
username: ${{ secrets.ZOT_USERNAME }}
|
||||
password: ${{ secrets.ZOT_PASSWORD }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: ${{ env.GO_VERSION }}
|
||||
# cache: true intentionally omitted — see header notes.
|
||||
- uses: golangci/golangci-lint-action@v7
|
||||
with:
|
||||
# Must be built against the same Go release as go.mod targets,
|
||||
@@ -129,11 +117,6 @@ jobs:
|
||||
build:
|
||||
name: Build (${{ matrix.goos }}/${{ matrix.goarch }})
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
image: docker.dcglab.co.uk/ci-runner-go:2026-05-15
|
||||
credentials:
|
||||
username: ${{ secrets.ZOT_USERNAME }}
|
||||
password: ${{ secrets.ZOT_PASSWORD }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
@@ -147,6 +130,10 @@ jobs:
|
||||
ext: ".exe"
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: ${{ env.GO_VERSION }}
|
||||
# cache: true intentionally omitted — see header notes.
|
||||
- name: build server + agent
|
||||
env:
|
||||
GOOS: ${{ matrix.goos }}
|
||||
|
||||
+10
-45
@@ -22,27 +22,16 @@ on:
|
||||
branches: [main]
|
||||
workflow_dispatch:
|
||||
|
||||
# Force bash as the default shell — see ci.yml header.
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
jobs:
|
||||
e2e:
|
||||
name: Playwright vs docker-compose
|
||||
runs-on: ubuntu-latest
|
||||
container: gitea.dcglab.co.uk/steve/ci-runner-go:2026-05-08
|
||||
timeout-minutes: 15
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Build the e2e stack
|
||||
# --profile test pulls in the playwright service which is
|
||||
# otherwise gated. --pull refreshes base images so a bump
|
||||
# to the Dockerfile's FROM tag (e.g. mcr.microsoft.com/
|
||||
# playwright:vX.Y.Z-jammy) isn't masked by a stale runner
|
||||
# cache that still has the old tag's layers.
|
||||
run: docker compose --profile test -f e2e/compose.e2e.yml build --pull
|
||||
run: docker compose -f e2e/compose.e2e.yml build
|
||||
|
||||
- name: Bring up the stack
|
||||
run: docker compose -f e2e/compose.e2e.yml up -d server rest-server source-fixture
|
||||
@@ -79,35 +68,15 @@ jobs:
|
||||
- name: Start the agent
|
||||
run: docker compose -f e2e/compose.e2e.yml up -d agent
|
||||
|
||||
- name: Run Playwright tests
|
||||
id: playwright
|
||||
env:
|
||||
RM_BOOTSTRAP_TOKEN: ${{ env.RM_BOOTSTRAP_TOKEN }}
|
||||
# --name pins a stable container ID so the next step can
|
||||
# docker cp out of it before tear-down. We deliberately
|
||||
# drop --rm so the container survives the test exit; the
|
||||
# tear-down step removes it.
|
||||
run: docker compose -f e2e/compose.e2e.yml run --name e2e-pw playwright
|
||||
|
||||
- name: Extract Playwright report
|
||||
if: always() && steps.playwright.outcome != 'skipped'
|
||||
- name: Prepare report mounts
|
||||
run: |
|
||||
mkdir -p e2e/playwright/playwright-report e2e/playwright/test-results
|
||||
docker cp e2e-pw:/work/playwright-report/. e2e/playwright/playwright-report/ || true
|
||||
docker cp e2e-pw:/work/test-results/. e2e/playwright/test-results/ || true
|
||||
chmod -R a+rwX e2e/playwright/playwright-report e2e/playwright/test-results
|
||||
|
||||
- name: Show Playwright failure context (on failure)
|
||||
if: failure()
|
||||
run: |
|
||||
set +e
|
||||
shopt -s nullglob globstar
|
||||
for f in e2e/playwright/test-results/**/error-context.md; do
|
||||
echo "::group::$f"
|
||||
cat "$f"
|
||||
echo "::endgroup::"
|
||||
done
|
||||
echo "Failure attachments (download via the playwright-report artifact):"
|
||||
find e2e/playwright/test-results \( -name '*.png' -o -name '*.webm' -o -name 'trace.zip' \) -printf ' %p\n' | sort
|
||||
- name: Run Playwright tests
|
||||
env:
|
||||
RM_BOOTSTRAP_TOKEN: ${{ env.RM_BOOTSTRAP_TOKEN }}
|
||||
run: docker compose -f e2e/compose.e2e.yml run --rm playwright
|
||||
|
||||
- name: Compose logs (on failure)
|
||||
if: failure()
|
||||
@@ -118,16 +87,12 @@ jobs:
|
||||
|
||||
- name: Upload Playwright report (on failure)
|
||||
if: failure()
|
||||
uses: actions/upload-artifact@v4
|
||||
uses: actions/upload-artifact@v3
|
||||
with:
|
||||
name: playwright-report
|
||||
path: |
|
||||
e2e/playwright/playwright-report
|
||||
e2e/playwright/test-results
|
||||
path: e2e/playwright/playwright-report
|
||||
retention-days: 7
|
||||
|
||||
- name: Tear down
|
||||
if: always()
|
||||
run: |
|
||||
docker rm -f e2e-pw 2>/dev/null || true
|
||||
docker compose -f e2e/compose.e2e.yml down -v
|
||||
run: docker compose -f e2e/compose.e2e.yml down -v
|
||||
|
||||
@@ -12,12 +12,18 @@
|
||||
# plus install.sh / install.ps1 / the systemd unit baked in under
|
||||
# /opt/restic-manager/dist (the read-only fallback path the server
|
||||
# handlers use when <DataDir>/... is empty).
|
||||
# * Pushes to zot OCI registry (docker.dcglab.co.uk).
|
||||
# * Pushes to this Gitea instance's container registry under
|
||||
# <gitea-host>/<owner>/restic-manager.
|
||||
#
|
||||
# Tag fan-out
|
||||
# * tag push: :vX.Y.Z, :X.Y, :X
|
||||
# * tag push and X >= 1: also :latest
|
||||
# * workflow_dispatch: only :snapshot-<shortsha>; nothing else moves.
|
||||
#
|
||||
# Why no goreleaser
|
||||
# The architecture already routes agent distribution through the
|
||||
# server's /agent/binary endpoint. The image is the only deliverable;
|
||||
# binary archives would just be a second source of truth.
|
||||
|
||||
name: Release
|
||||
|
||||
@@ -28,35 +34,25 @@ on:
|
||||
workflow_dispatch:
|
||||
|
||||
env:
|
||||
REGISTRY: docker.dcglab.co.uk
|
||||
IMAGE_NAME: restic-manager
|
||||
|
||||
# Force bash as the default shell — see ci.yml header.
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
REGISTRY: gitea.dcglab.co.uk
|
||||
IMAGE_NAME: ${{ gitea.repository }}
|
||||
|
||||
jobs:
|
||||
image:
|
||||
name: Build + push image
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
image: docker.dcglab.co.uk/ci-runner-go:2026-05-15
|
||||
credentials:
|
||||
username: ${{ secrets.ZOT_USERNAME }}
|
||||
password: ${{ secrets.ZOT_PASSWORD }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- uses: docker/setup-qemu-action@v3
|
||||
- uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Log in to zot registry
|
||||
- name: Log in to Gitea registry
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ${{ env.REGISTRY }}
|
||||
username: ${{ secrets.ZOT_USERNAME }}
|
||||
password: ${{ secrets.ZOT_PASSWORD }}
|
||||
username: ${{ gitea.actor }}
|
||||
password: ${{ secrets.DEV_TOKEN }}
|
||||
|
||||
- name: Compute tags + version
|
||||
id: meta
|
||||
|
||||
@@ -45,10 +45,3 @@ coverage.html
|
||||
# tooling already skips paths starting with _, but ignore explicitly
|
||||
# so an accidental `git add cmd/.` can't sneak them into a release.
|
||||
/cmd/_*/
|
||||
|
||||
# Local-only planning / scratch — never committed.
|
||||
/ask.md
|
||||
/docs/superpowers/
|
||||
|
||||
# Claude Code agent worktrees (transient, harness-created).
|
||||
/.claude/worktrees/
|
||||
|
||||
-127
@@ -1,127 +0,0 @@
|
||||
# Changelog
|
||||
|
||||
All notable changes to this project are documented here.
|
||||
The format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
||||
and the project follows [Semantic Versioning](https://semver.org/).
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
## [1.1.0] - 2026-06-15
|
||||
|
||||
### Added
|
||||
|
||||
- **Always-On vs intermittent host mode.** A host can now be marked as
|
||||
not always-on — for laptops/workstations that legitimately sleep,
|
||||
travel, or shut down outside hours. An intermittent host no longer
|
||||
raises "agent offline" alerts when it disappears; instead it shows a
|
||||
calm "asleep" state in the UI ("asleep · last seen … · will catch up
|
||||
on return") and is covered by a longer-horizon staleness alert (raised
|
||||
only when it has an enabled schedule and no successful backup in 7
|
||||
days). When such a host reconnects, the server waits a short settle
|
||||
window and then automatically dispatches any scheduled backup whose
|
||||
window elapsed while it was asleep. Toggle per host from the host
|
||||
detail page (operator-band, audited as `host.mode_updated`). New and
|
||||
existing hosts default to always-on, so current fleets are unaffected.
|
||||
|
||||
### Changed
|
||||
|
||||
- Host-detail header redesign: tags and presence are grouped into
|
||||
labelled, boxed pills with click-to-edit; presence shows a `24x7` /
|
||||
`Free` chip; the agent "out of date" indicator is simplified (the full
|
||||
version detail remains in the Agent-update panel and on hover).
|
||||
- Relative timestamps ("2h ago") now tick client-side, so a tab left
|
||||
open no longer shows a stale value as wall-clock time moves on.
|
||||
- Release and CI container images are now published to and pulled from
|
||||
the zot OCI registry (`docker.dcglab.co.uk`).
|
||||
|
||||
## [1.0.1] - 2026-05-09
|
||||
|
||||
### Fixed
|
||||
|
||||
- Build version is now single-sourced from `internal/version`, and the
|
||||
server Dockerfile's ldflags were corrected so docker-built binaries
|
||||
report their real version. Previously `internal/version.Version` stayed
|
||||
at its "dev" default in docker images, which made every host look
|
||||
permanently out-of-date to the update logic.
|
||||
|
||||
## [1.0.0] - 2026-05-09
|
||||
|
||||
First tagged release. Six development phases brought the project from
|
||||
empty repo to a self-hostable, multi-tenant restic backup orchestrator
|
||||
with a web UI, JSON API, and self-updating agent fleet.
|
||||
|
||||
### Phase 1 — MVP: enrolment, visibility, on-demand backup
|
||||
|
||||
- HTTP server, SQLite store with migrations, AEAD-encrypted
|
||||
credentials at rest, Argon2id password hashing, session cookies.
|
||||
- WebSocket transport between server and agents (heartbeat, hello,
|
||||
schedule fan-out, job log streaming).
|
||||
- Agent install path for Linux (systemd unit + `install.sh`); one-time
|
||||
enrolment tokens with embedded repo credentials.
|
||||
- Run-now backup execution end-to-end, snapshot listing.
|
||||
- Server-side encrypted repo creds pushed to the agent on hello.
|
||||
|
||||
### Phase 2 — Scheduling, retention, repo operations
|
||||
|
||||
- Source groups (paths + excludes + pre/post hooks + bandwidth caps)
|
||||
decoupled from schedules; a schedule fires a source group.
|
||||
- Cron-style schedules with retention policies, server-driven
|
||||
reconciliation push and ack.
|
||||
- `restic forget`, `prune`, `check`, `unlock` automation; periodic
|
||||
maintenance ticker with per-host stagger.
|
||||
- Pending-runs queue with backpressure (`max_concurrent_jobs` per
|
||||
host).
|
||||
- Repo stats panel on the host detail page (size, last-check, last-
|
||||
prune, stale-lock banner).
|
||||
- Auto-init of repos on first onboard with credential-failure surface
|
||||
on the host detail page.
|
||||
- Announce-and-approve enrolment path for hosts that don't have a
|
||||
pre-minted token (Ed25519 fingerprint, operator approves).
|
||||
- Windows agent: SCM service integration + `install.ps1` installer.
|
||||
- Cross-platform alt-enrolment (announce flow on Windows).
|
||||
|
||||
### Phase 3 — Restore, alerts, audit
|
||||
|
||||
- Restore wizard: pick a snapshot, pick paths, pick a target
|
||||
(in-place / new directory), live progress.
|
||||
- Snapshot diff against parent.
|
||||
- Alert engine: per-source-group dedup, severity tiers, ack / resolve.
|
||||
- Live-refresh alerts table with severity cues.
|
||||
- Audit log UI with filters, sort, CSV export, payload-detail modal.
|
||||
|
||||
### Phase 4 — RBAC, OIDC, host tags
|
||||
|
||||
- Role-based access control: viewer / operator / admin.
|
||||
- User management UI (invite, role change, disable, password reset).
|
||||
- Generic OIDC SSO with JIT user provisioning + role mapping.
|
||||
- Per-host tags with chip-row filter on the dashboard.
|
||||
|
||||
### Phase 5 — OSS readiness
|
||||
|
||||
- mdBook-rendered docs site at `docs/book/`.
|
||||
- Contributor onboarding (CONTRIBUTING.md, security policy, license).
|
||||
- Docker-only release pipeline + reference deployment compose file.
|
||||
- Playwright e2e harness covering the smoke runbook.
|
||||
|
||||
### Phase 6 — Update delivery + observability
|
||||
|
||||
- Agent self-update: server-side channel pin per host, signed binary
|
||||
fetch via the WS transport, atomic swap with rollback on failure.
|
||||
- Fleet-wide update orchestration with per-host stagger and an admin
|
||||
pause switch.
|
||||
- Prometheus `/metrics` endpoint + Grafana dashboard JSON.
|
||||
- Repo size trend per host (90-day rolling) on the host detail page.
|
||||
|
||||
### Cross-cutting
|
||||
|
||||
- Live dashboard with column sort, filters, free-text host search,
|
||||
background-tab-aware live refresh (5s cadence).
|
||||
- Pure-Go binary with embedded UI, no Node/CGO at runtime.
|
||||
- Reproducible `-trimpath -ldflags="-s -w"` builds for
|
||||
linux/amd64, linux/arm64, windows/amd64.
|
||||
- Sharded CI (server-http / store / rest), pre-commit hooks (gofumpt,
|
||||
go vet, golangci-lint).
|
||||
- Threat model published (`docs/threat-model.md`).
|
||||
|
||||
[Unreleased]: https://gitea.dcglab.co.uk/steve/restic-manager/compare/v1.1.0...HEAD
|
||||
[1.0.0]: https://gitea.dcglab.co.uk/steve/restic-manager/releases/tag/v1.0.0
|
||||
@@ -8,10 +8,8 @@ VERSION ?= $(shell git describe --tags --always --dirty 2>/dev/null || ec
|
||||
COMMIT ?= $(shell git rev-parse HEAD 2>/dev/null || echo none)
|
||||
DATE ?= $(shell date -u +%Y-%m-%dT%H:%M:%SZ)
|
||||
VERSION_PKG := gitea.dcglab.co.uk/steve/restic-manager/internal/version
|
||||
LDFLAGS := -s -w \
|
||||
-X $(VERSION_PKG).Version=$(VERSION) \
|
||||
-X $(VERSION_PKG).Commit=$(COMMIT) \
|
||||
-X $(VERSION_PKG).Date=$(DATE)
|
||||
LDFLAGS := -s -w -X main.version=$(VERSION) -X main.commit=$(COMMIT) -X main.date=$(DATE) \
|
||||
-X $(VERSION_PKG).Version=$(VERSION) -X $(VERSION_PKG).Commit=$(COMMIT)
|
||||
GOFLAGS := -trimpath
|
||||
DOCKER_IMAGE ?= gitea.dcglab.co.uk/steve/restic-manager
|
||||
DOCKER_TAG ?= dev
|
||||
|
||||
@@ -0,0 +1,8 @@
|
||||
# The ask!
|
||||
|
||||
I have numerous servers deployed out in a lab, mainly Linux but some Windows.
|
||||
All have restic installed on them
|
||||
I need to build a browser-based management service that gives me a central single pane of glass to monitor and manage all the endpoints.
|
||||
All endpoints will be enabled for SSH (unless other methods are better?)
|
||||
|
||||
Please plan out how we would go about this.
|
||||
+11
-6
@@ -22,7 +22,12 @@ import (
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/agent/wsclient"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/restic"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
|
||||
)
|
||||
|
||||
var (
|
||||
version = "dev"
|
||||
commit = "none"
|
||||
date = "unknown"
|
||||
)
|
||||
|
||||
func main() {
|
||||
@@ -61,7 +66,7 @@ func run() error {
|
||||
flag.Parse()
|
||||
|
||||
if *showVersion {
|
||||
fmt.Printf("restic-manager-agent %s (commit %s, built %s)\n", version.Version, version.Commit, version.Date)
|
||||
fmt.Printf("restic-manager-agent %s (commit %s, built %s)\n", version, commit, date)
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -77,14 +82,14 @@ func run() error {
|
||||
if *enrollServer == "" {
|
||||
return errors.New("enrollment: -enroll-server is required with -enroll-token")
|
||||
}
|
||||
return doEnroll(*enrollServer, *enrollToken, cfg, version.Version)
|
||||
return doEnroll(*enrollServer, *enrollToken, cfg, version)
|
||||
}
|
||||
|
||||
// Announce-and-approve: -enroll-server set, no token, agent not
|
||||
// yet enrolled. Run the announce flow inline; on success the cfg
|
||||
// has the bearer + host_id and we drop into the normal run loop.
|
||||
if !cfg.Enrolled() && *enrollServer != "" {
|
||||
if err := doAnnounce(*enrollServer, cfg, version.Version); err != nil {
|
||||
if err := doAnnounce(*enrollServer, cfg, version); err != nil {
|
||||
return fmt.Errorf("announce: %w", err)
|
||||
}
|
||||
}
|
||||
@@ -101,7 +106,7 @@ func run() error {
|
||||
return fmt.Errorf("sysinfo: %w", err)
|
||||
}
|
||||
slog.Info("agent starting",
|
||||
"version", version.Version,
|
||||
"version", version,
|
||||
"host_id", cfg.HostID,
|
||||
"server", cfg.ServerURL,
|
||||
"restic_version", snap.ResticVersion,
|
||||
@@ -131,7 +136,7 @@ func run() error {
|
||||
CertPinSHA256: cfg.CertPinSHA256,
|
||||
HelloPayload: api.HelloPayload{
|
||||
ProtocolVersion: snap.ProtocolVersion,
|
||||
AgentVersion: version.Version,
|
||||
AgentVersion: version,
|
||||
ResticVersion: snap.ResticVersion,
|
||||
Hostname: snap.Hostname,
|
||||
OS: snap.OS,
|
||||
|
||||
+11
-20
@@ -9,7 +9,6 @@ import (
|
||||
"os"
|
||||
"os/signal"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
@@ -21,12 +20,16 @@ import (
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/fleetupdate"
|
||||
rmhttp "gitea.dcglab.co.uk/steve/restic-manager/internal/server/http"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/maintenance"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/metrics"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/oidc"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ui"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ws"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
|
||||
)
|
||||
|
||||
var (
|
||||
version = "dev"
|
||||
commit = "none"
|
||||
date = "unknown"
|
||||
)
|
||||
|
||||
func main() {
|
||||
@@ -42,7 +45,7 @@ func run() error {
|
||||
flag.Parse()
|
||||
|
||||
if *showVersion {
|
||||
fmt.Printf("restic-manager-server %s (commit %s, built %s)\n", version.Version, version.Commit, version.Date)
|
||||
fmt.Printf("restic-manager-server %s (commit %s, built %s)\n", version, commit, date)
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -86,7 +89,6 @@ func run() error {
|
||||
|
||||
hub := ws.NewHub()
|
||||
jobHub := ws.NewJobHub()
|
||||
metricsRegistry := metrics.NewRegistry()
|
||||
|
||||
notifHub := notification.NewHub(st, aead, cfg.BaseURL)
|
||||
alertEngine := alert.NewEngine(st, notifHub)
|
||||
@@ -118,9 +120,8 @@ func run() error {
|
||||
NotificationHub: notifHub,
|
||||
UpdateWatcher: updateWatcher,
|
||||
UI: renderer,
|
||||
Version: version.Version,
|
||||
Version: version,
|
||||
OIDC: oidcClient,
|
||||
Metrics: metricsRegistry,
|
||||
}
|
||||
|
||||
// First-run bootstrap: if the users table is empty, mint a one-time
|
||||
@@ -141,18 +142,9 @@ func run() error {
|
||||
// text exactly once; we hash it into BootstrapToken on the
|
||||
// server-side handler.
|
||||
fmt.Fprintln(os.Stderr, "================================================================")
|
||||
fmt.Fprintln(os.Stderr, " FIRST RUN — no admin user exists yet.")
|
||||
if cfg.BaseURL != "" {
|
||||
fmt.Fprintln(os.Stderr, " Open this URL in a browser to create the first administrator:")
|
||||
fmt.Fprintln(os.Stderr, " "+strings.TrimRight(cfg.BaseURL, "/")+"/bootstrap")
|
||||
} else {
|
||||
fmt.Fprintln(os.Stderr, " Open the server URL in a browser; you'll be sent to /bootstrap.")
|
||||
fmt.Fprintln(os.Stderr, " (Set RM_BASE_URL to have a clickable link printed here.)")
|
||||
}
|
||||
fmt.Fprintln(os.Stderr, "")
|
||||
fmt.Fprintln(os.Stderr, " Headless? POST {token, username, password} to /api/bootstrap")
|
||||
fmt.Fprintln(os.Stderr, " with this one-shot bootstrap token (valid until first user exists):")
|
||||
fmt.Fprintln(os.Stderr, " FIRST RUN — bootstrap token (use within 1 hour, then it's gone):")
|
||||
fmt.Fprintln(os.Stderr, " "+token)
|
||||
fmt.Fprintln(os.Stderr, " POST it to /api/bootstrap with {token, username, password}.")
|
||||
fmt.Fprintln(os.Stderr, "================================================================")
|
||||
}
|
||||
|
||||
@@ -172,7 +164,7 @@ func run() error {
|
||||
|
||||
errCh := make(chan error, 1)
|
||||
go func() {
|
||||
slog.Info("server listening", "addr", cfg.Listen, "version", version.Version)
|
||||
slog.Info("server listening", "addr", cfg.Listen, "version", version)
|
||||
errCh <- srv.Start()
|
||||
}()
|
||||
|
||||
@@ -227,7 +219,6 @@ func run() error {
|
||||
}
|
||||
case <-pendingDrainTick.C:
|
||||
srv.DrainAllDue(ctx)
|
||||
srv.RunCatchupsDue(ctx)
|
||||
case <-pendingExpiryTick.C:
|
||||
if n, err := st.DeleteExpiredPendingHosts(ctx, time.Now().UTC()); err == nil && n > 0 {
|
||||
slog.Info("expired pending hosts swept", "n", n)
|
||||
|
||||
@@ -26,11 +26,7 @@ ARG DATE=unknown
|
||||
ARG TARGETOS
|
||||
ARG TARGETARCH
|
||||
|
||||
ENV VERSION_PKG="gitea.dcglab.co.uk/steve/restic-manager/internal/version"
|
||||
ENV LDFLAGS="-s -w \
|
||||
-X ${VERSION_PKG}.Version=${VERSION} \
|
||||
-X ${VERSION_PKG}.Commit=${COMMIT} \
|
||||
-X ${VERSION_PKG}.Date=${DATE}"
|
||||
ENV LDFLAGS="-s -w -X main.version=${VERSION} -X main.commit=${COMMIT} -X main.date=${DATE}"
|
||||
|
||||
# Server: built for the image's runtime arch.
|
||||
RUN GOOS=${TARGETOS} GOARCH=${TARGETARCH} \
|
||||
|
||||
@@ -1,325 +0,0 @@
|
||||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": { "type": "grafana", "uid": "-- Grafana --" },
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"description": "restic-manager fleet overview. Imports against any Prometheus data source.",
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"liveNow": false,
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "Fleet status",
|
||||
"type": "stat",
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"gridPos": { "h": 6, "w": 6, "x": 0, "y": 0 },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "green", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"expr": "rm_hosts_online",
|
||||
"legendFormat": "online",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"expr": "rm_hosts_total",
|
||||
"legendFormat": "total",
|
||||
"refId": "B"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "Open alerts",
|
||||
"type": "stat",
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"gridPos": { "h": 6, "w": 6, "x": 6, "y": 0 },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 5 }
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"expr": "sum by (severity) (rm_active_alerts)",
|
||||
"legendFormat": "{{severity}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "Backups failing (last reported run)",
|
||||
"type": "stat",
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"gridPos": { "h": 6, "w": 6, "x": 12, "y": 0 },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "red", "value": 1 }
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "auto"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"expr": "count(rm_host_last_backup_success == 0)",
|
||||
"legendFormat": "failing",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"title": "Hosts",
|
||||
"type": "table",
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"gridPos": { "h": 10, "w": 24, "x": 0, "y": 6 },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": { "align": "auto", "displayMode": "auto" }
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Value #B" },
|
||||
"properties": [
|
||||
{ "id": "displayName", "value": "Last backup (s ago)" },
|
||||
{ "id": "unit", "value": "s" }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Value #C" },
|
||||
"properties": [
|
||||
{ "id": "displayName", "value": "Repo size" },
|
||||
{ "id": "unit", "value": "bytes" }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Value #D" },
|
||||
"properties": [
|
||||
{ "id": "displayName", "value": "Snapshots" }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Value #A" },
|
||||
"properties": [
|
||||
{ "id": "displayName", "value": "Online" }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "Value #E" },
|
||||
"properties": [
|
||||
{ "id": "displayName", "value": "Open alerts" }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": { "showHeader": true },
|
||||
"transformations": [
|
||||
{
|
||||
"id": "merge",
|
||||
"options": {}
|
||||
}
|
||||
],
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"expr": "rm_host_agent_online",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"expr": "time() - rm_host_last_backup_timestamp_seconds",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"expr": "rm_host_repo_size_bytes",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"refId": "C"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"expr": "rm_host_snapshot_count",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"refId": "D"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"expr": "rm_host_open_alerts",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"refId": "E"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"title": "Repo size over time",
|
||||
"type": "timeseries",
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisLabel": "",
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"showPoints": "never"
|
||||
},
|
||||
"unit": "bytes"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": { "calcs": ["last"], "displayMode": "list", "placement": "bottom", "showLegend": true },
|
||||
"tooltip": { "mode": "multi", "sort": "desc" }
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"expr": "rm_host_repo_size_bytes",
|
||||
"legendFormat": "{{host}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"title": "Job duration p95 (last 1h, by kind)",
|
||||
"type": "timeseries",
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 5,
|
||||
"lineWidth": 1,
|
||||
"pointSize": 4,
|
||||
"showPoints": "never"
|
||||
},
|
||||
"unit": "s"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": { "calcs": ["last"], "displayMode": "list", "placement": "bottom", "showLegend": true },
|
||||
"tooltip": { "mode": "multi", "sort": "desc" }
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"expr": "histogram_quantile(0.95, sum by (kind, le) (rate(rm_job_duration_seconds_bucket[1h])))",
|
||||
"legendFormat": "{{kind}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 39,
|
||||
"style": "dark",
|
||||
"tags": ["restic-manager", "backups"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {},
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "Prometheus",
|
||||
"multi": false,
|
||||
"name": "DS_PROMETHEUS",
|
||||
"options": [],
|
||||
"query": "prometheus",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"type": "datasource"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": { "from": "now-6h", "to": "now" },
|
||||
"timepicker": {},
|
||||
"timezone": "",
|
||||
"title": "restic-manager — fleet",
|
||||
"uid": "rm-fleet-overview",
|
||||
"version": 1,
|
||||
"weekStart": ""
|
||||
}
|
||||
@@ -1,249 +0,0 @@
|
||||
# Onboarding a new host — agent instructions
|
||||
|
||||
How an automation agent (with a username + password for the
|
||||
restic-manager server) brings a new host fully online.
|
||||
|
||||
The flow is two roles:
|
||||
|
||||
- **Controller side**: the agent calls JSON APIs on the
|
||||
restic-manager server. Needs network reach to the server, plus
|
||||
username/password.
|
||||
- **Target side**: the host being onboarded runs the install
|
||||
script, which calls back to the server with the one-time token.
|
||||
|
||||
If the agent is *both* sides (e.g. it can SSH into the target),
|
||||
it does steps 1–2 and 4 against the server and step 3 against the
|
||||
target. If the agent only controls the server, it stops at
|
||||
step 2 and hands the install snippet to whoever owns the target.
|
||||
|
||||
---
|
||||
|
||||
## Conventions
|
||||
|
||||
- Base URL: `$RM_SERVER` (e.g. `https://restic.lab.example`).
|
||||
- Session cookie jar: persist `rm_session` between calls.
|
||||
- All request/response bodies are JSON unless noted.
|
||||
- On any non-2xx, response body is
|
||||
`{"code": "...", "message": "..."}`.
|
||||
|
||||
---
|
||||
|
||||
## 1. Login
|
||||
|
||||
```
|
||||
POST $RM_SERVER/api/auth/login
|
||||
Content-Type: application/json
|
||||
|
||||
{"username": "...", "password": "..."}
|
||||
```
|
||||
|
||||
→ 200 with `{"user_id": "...", "role": "..."}` and a `Set-Cookie:
|
||||
rm_session=...` (HttpOnly, 24h TTL). Persist the cookie; reuse
|
||||
it on every subsequent call.
|
||||
|
||||
Required role for the next step: **operator** or **admin**.
|
||||
A viewer-only login can read but cannot mint tokens.
|
||||
|
||||
Session expires at 24h. On 401 from a later call, re-login.
|
||||
|
||||
---
|
||||
|
||||
## 2. Mint an enrolment token
|
||||
|
||||
```
|
||||
POST $RM_SERVER/api/enrollment-tokens
|
||||
Cookie: rm_session=...
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"hostname": "newhost.example",
|
||||
"tags": ["prod", "london"], // optional
|
||||
"repo_url": "rest:https://rest.example/newhost",
|
||||
"repo_username": "...", // optional, for rest-server / S3
|
||||
"repo_password": "...", // optional
|
||||
"initial_paths": ["/etc", "/home", "/var/lib"] // optional; default source group
|
||||
}
|
||||
```
|
||||
|
||||
→ 200 with:
|
||||
|
||||
```json
|
||||
{ "token": "<RAW_ONE_TIME_TOKEN>", "expires_at": "2026-05-09T..." }
|
||||
```
|
||||
|
||||
**Capture `token` immediately — the server only stores its hash
|
||||
and will never return the raw value again.** TTL is 1 hour.
|
||||
|
||||
The repo creds you provided are encrypted under the token hash
|
||||
and pre-attached to the host. The restic-manager agent on the target will fetch and store
|
||||
them at enrol-time; you will not need to push them again.
|
||||
|
||||
If you lose the token before the install runs, mint a new one
|
||||
(the existing one becomes irrelevant; you can leave it to expire
|
||||
or revoke it via the UI).
|
||||
|
||||
---
|
||||
|
||||
## 3. Install on the target host
|
||||
|
||||
The install script is hosted by the server itself. Running on the
|
||||
target:
|
||||
|
||||
### Linux
|
||||
|
||||
```
|
||||
curl -fsSL $RM_SERVER/install/install.sh | \
|
||||
sudo RM_SERVER=$RM_SERVER RM_TOKEN=<RAW_ONE_TIME_TOKEN> bash
|
||||
```
|
||||
|
||||
What it does, end-to-end:
|
||||
|
||||
1. detects arch (amd64 / arm64)
|
||||
2. downloads `$RM_SERVER/agent/binary?os=linux&arch=<arch>` to
|
||||
`/usr/local/bin/restic-manager-agent`
|
||||
3. creates `/etc/restic-manager/` and `/var/lib/restic-manager/`
|
||||
(root:root, 0700)
|
||||
4. calls `POST /api/agents/enroll` with the token; server returns
|
||||
the persistent agent bearer + `host_id`, written to
|
||||
`/etc/restic-manager/agent.env`
|
||||
5. installs the systemd unit, `daemon-reload`, `enable --now`
|
||||
6. surfaces any pre-existing restic cron/timer entries so the
|
||||
operator can decide whether to disable them (script does
|
||||
*not* touch them automatically)
|
||||
|
||||
The script is idempotent. Re-running on an already-enrolled host
|
||||
is a no-op unless `RM_FORCE_REENROLL=1`.
|
||||
|
||||
The agent runs as **root** by design — fleet backup needs to
|
||||
read every file on the system. See
|
||||
`deploy/install/restic-manager-agent.service` for rationale.
|
||||
|
||||
### Windows
|
||||
|
||||
```
|
||||
iwr $RM_SERVER/install/install.ps1 -UseBasicParsing | iex
|
||||
# (or download + run; needs an elevated PowerShell)
|
||||
# Required env: $env:RM_SERVER, $env:RM_TOKEN
|
||||
```
|
||||
|
||||
Same flow, lays down a Windows service instead of a systemd unit.
|
||||
|
||||
### Manual / non-script enrolment
|
||||
|
||||
If the install script can't be used, the wire-level enrol call is:
|
||||
|
||||
```
|
||||
POST $RM_SERVER/api/agents/enroll
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"token": "<RAW_ONE_TIME_TOKEN>",
|
||||
"hostname": "newhost.example",
|
||||
"os": "linux", // linux | windows
|
||||
"arch": "amd64", // amd64 | arm64
|
||||
"agent_version": "...",
|
||||
"restic_version": "..."
|
||||
}
|
||||
```
|
||||
|
||||
→ 200 with
|
||||
`{"host_id": "...", "agent_token": "...", "cert_pin_sha256": "..."}`.
|
||||
|
||||
The agent_token goes into `/etc/restic-manager/agent.env` as
|
||||
`RM_AGENT_TOKEN=...`; subsequent agent → server traffic uses
|
||||
`Authorization: Bearer $RM_AGENT_TOKEN`.
|
||||
|
||||
---
|
||||
|
||||
## 4. Verify the host is healthy
|
||||
|
||||
Poll until both conditions are true. Cap at ~5 minutes.
|
||||
|
||||
```
|
||||
GET $RM_SERVER/api/hosts
|
||||
Cookie: rm_session=...
|
||||
```
|
||||
|
||||
→ array of host objects. Find the one with the matching hostname
|
||||
and check:
|
||||
|
||||
- `"status": "online"` — agent connected to the WS heartbeat
|
||||
- `"repo_status": "ready"` — `restic init` (or existing-config
|
||||
detection) completed successfully
|
||||
|
||||
If `repo_status` settles on `"init_failed"`, the repo creds are
|
||||
wrong or the repo URL is unreachable from the target. Inspect
|
||||
the matching job log:
|
||||
|
||||
```
|
||||
GET $RM_SERVER/api/hosts/<host_id>/jobs (most recent init job)
|
||||
GET $RM_SERVER/api/jobs/<job_id> (full output)
|
||||
```
|
||||
|
||||
Fix the creds with a creds-update call (see Settings → Repo on
|
||||
the UI for the exact route — currently form-only) or revoke the
|
||||
host and start over.
|
||||
|
||||
---
|
||||
|
||||
## 5. (Optional) configure schedules
|
||||
|
||||
A new host gets one default source group covering `initial_paths`
|
||||
(or `/etc`,`/home` if you didn't pass any) and **no schedule**.
|
||||
Backups won't run until either:
|
||||
|
||||
- a schedule is attached (cron expression, retention, etc.), or
|
||||
- you trigger an on-demand run via the source-group "Run now"
|
||||
endpoint.
|
||||
|
||||
These are not yet exposed cleanly as JSON-only routes; if the
|
||||
agent needs them, look at `internal/server/http/schedules*.go`
|
||||
and `internal/server/http/source_groups*.go` — most are JSON-
|
||||
capable, some are form-only with HTML 303 responses.
|
||||
|
||||
---
|
||||
|
||||
## Failure modes — quick reference
|
||||
|
||||
| Symptom | Likely cause | Fix |
|
||||
|---|---|---|
|
||||
| `401` on `/api/enrollment-tokens` | session expired or viewer role | re-login as operator+ |
|
||||
| install.sh fails at "enrol": HTTP 410 | token expired (>1h) or already used | mint a fresh token |
|
||||
| Host shows `status=offline` after install | systemd unit didn't start; firewall blocks WS | `systemctl status restic-manager-agent`, check `$RM_SERVER` reachability |
|
||||
| `repo_status=init_failed` | bad repo creds or URL | inspect init job log; fix creds; retry probe via `/hosts/{id}/repo/probe` |
|
||||
| Token list grows with stale rows | normal — they expire at 1h | optional cleanup via `/hosts/enrollment-tokens/{hash}/revoke` |
|
||||
|
||||
---
|
||||
|
||||
## Minimum reproducible script
|
||||
|
||||
```bash
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
: "${RM_SERVER:?}" "${RM_USER:?}" "${RM_PASS:?}" "${RM_HOSTNAME:?}" \
|
||||
"${RM_REPO_URL:?}" "${RM_REPO_USER:?}" "${RM_REPO_PASS:?}"
|
||||
|
||||
JAR=$(mktemp)
|
||||
trap 'rm -f "$JAR"' EXIT
|
||||
|
||||
# 1. login
|
||||
curl -fsS -c "$JAR" -H 'Content-Type: application/json' \
|
||||
-d "{\"username\":\"$RM_USER\",\"password\":\"$RM_PASS\"}" \
|
||||
"$RM_SERVER/api/auth/login" >/dev/null
|
||||
|
||||
# 2. mint token
|
||||
TOKEN=$(curl -fsS -b "$JAR" -H 'Content-Type: application/json' \
|
||||
-d "$(jq -nc \
|
||||
--arg h "$RM_HOSTNAME" --arg u "$RM_REPO_USER" \
|
||||
--arg p "$RM_REPO_PASS" --arg r "$RM_REPO_URL" \
|
||||
'{hostname:$h, repo_url:$r, repo_username:$u, repo_password:$p}')" \
|
||||
"$RM_SERVER/api/enrollment-tokens" | jq -r .token)
|
||||
|
||||
# 3. emit the install snippet for the target machine
|
||||
cat <<EOF
|
||||
Run on $RM_HOSTNAME (as root):
|
||||
|
||||
curl -fsSL $RM_SERVER/install/install.sh | \\
|
||||
sudo RM_SERVER=$RM_SERVER RM_TOKEN=$TOKEN bash
|
||||
EOF
|
||||
```
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,139 +0,0 @@
|
||||
# Prometheus + Grafana
|
||||
|
||||
restic-manager exposes a Prometheus scrape endpoint at `GET /metrics`.
|
||||
The endpoint is **opt-in** — it is not mounted at all unless you set
|
||||
at least one of the auth gates below. Once enabled, it serves the
|
||||
standard `text/plain` exposition format that every Prometheus
|
||||
release since 2.x parses without configuration.
|
||||
|
||||
A sample Grafana dashboard lives at
|
||||
`deploy/grafana/restic-manager-dashboard.json`.
|
||||
|
||||
## Enable the endpoint
|
||||
|
||||
Two switches, both off by default. If both are set, both must pass
|
||||
(token AND source-IP); if only one is set, that gate alone
|
||||
authorises a scrape.
|
||||
|
||||
| Env var | YAML key | Effect |
|
||||
|----------------------------|------------------------|--------|
|
||||
| `RM_METRICS_TOKEN` | `metrics_token` | Requires `Authorization: Bearer <token>`. Compared in constant time. |
|
||||
| `RM_METRICS_TRUSTED_CIDR` | `metrics_trusted_cidrs` (list) | Restricts the source IP to one of the listed CIDRs. Comma-separated in env, list in YAML. Honours `X-Forwarded-For` only when the immediate hop matches `RM_TRUSTED_PROXY`. |
|
||||
|
||||
When neither is set, `GET /metrics` returns 404 — the route is not
|
||||
registered with the chi router so a forgotten config can't
|
||||
accidentally publish fleet state.
|
||||
|
||||
### Example: Docker
|
||||
|
||||
```yaml
|
||||
services:
|
||||
restic-manager:
|
||||
image: gitea.dcglab.co.uk/steve/restic-manager:latest
|
||||
environment:
|
||||
RM_METRICS_TOKEN_FILE: /run/secrets/rm_metrics_token
|
||||
RM_METRICS_TRUSTED_CIDR: "10.0.0.0/8"
|
||||
secrets:
|
||||
- rm_metrics_token
|
||||
```
|
||||
|
||||
(Note: `RM_METRICS_TOKEN_FILE`, as used in the example above, is not currently supported — set
|
||||
`RM_METRICS_TOKEN` directly. The `_FILE` convention is on the
|
||||
roadmap.)
|
||||
|
||||
## Prometheus scrape config
|
||||
|
||||
Drop into your `prometheus.yml`:
|
||||
|
||||
```yaml
|
||||
scrape_configs:
|
||||
- job_name: restic-manager
|
||||
metrics_path: /metrics
|
||||
scheme: https # via your reverse proxy
|
||||
static_configs:
|
||||
- targets: ['restic.example.com']
|
||||
authorization:
|
||||
type: Bearer
|
||||
credentials_file: /etc/prometheus/secrets/rm_metrics_token
|
||||
```
|
||||
|
||||
If you don't run a TLS-terminating proxy in front, drop `scheme:
|
||||
https` (the server is HTTP-only — see `docs/reverse-proxy.md`).
|
||||
|
||||
## Metric reference
|
||||
|
||||
All names are `rm_`-prefixed. Per-host metrics carry a `host_id`
|
||||
label (the stable ULID, immune to renames) and a `host` label
|
||||
(the human-readable name).
|
||||
|
||||
### Server gauges
|
||||
|
||||
| Name | Labels | Description |
|
||||
|-----------------------|------------------------------------|-------------|
|
||||
| `rm_hosts_total` | — | Total number of enrolled hosts (excludes pending announces). |
|
||||
| `rm_hosts_online` | — | Number of hosts with `status='online'`. |
|
||||
| `rm_active_alerts` | `severity` ∈ {info, warning, critical} | Open alerts by severity. |
|
||||
| `rm_build_info` | `version, commit, go_version` | Always 1; pure label-bag for joining. |
|
||||
|
||||
### Per-host gauges
|
||||
|
||||
| Name | Description |
|
||||
|--------------------------------------------|-------------|
|
||||
| `rm_host_agent_online` | 1 if the agent is currently online, 0 otherwise. |
|
||||
| `rm_host_last_backup_timestamp_seconds` | Unix timestamp of the host's most recent backup. **Omitted** for hosts with no backup yet. |
|
||||
| `rm_host_last_backup_success` | 1 if the most recent backup succeeded, 0 otherwise. **Omitted** for hosts with no backup yet. |
|
||||
| `rm_host_repo_size_bytes` | Latest reported repo size from `restic stats --mode raw-data`. **Omitted** when unknown. |
|
||||
| `rm_host_snapshot_count` | Number of restic snapshots known on the host's repo. |
|
||||
| `rm_host_open_alerts` | Number of currently open alerts attached to this host. |
|
||||
| `rm_host_repo_status` | Always 1; the `status` label carries `unknown` / `ready` / `init_failed`. |
|
||||
|
||||
### Job duration histogram
|
||||
|
||||
```
|
||||
rm_job_duration_seconds_bucket{kind, status, le}
|
||||
rm_job_duration_seconds_sum{kind, status}
|
||||
rm_job_duration_seconds_count{kind, status}
|
||||
```
|
||||
|
||||
`kind` ∈ {backup, forget, prune, check, unlock, restore, diff, init, update}.
|
||||
`status` ∈ {succeeded, failed, cancelled}.
|
||||
|
||||
Buckets (seconds):
|
||||
|
||||
```
|
||||
1, 5, 30, 60, 300, 1800, 3600, 21600, 86400, +Inf
|
||||
1s 5s 30s 1m 5m 30m 1h 6h 24h
|
||||
```
|
||||
|
||||
The histogram is in-memory only — values reset on process restart.
|
||||
Operators who want durable history should let Prometheus persist
|
||||
the scrapes; restic-manager itself is a control plane, not a
|
||||
metrics database.
|
||||
|
||||
## Grafana dashboard
|
||||
|
||||
Import `deploy/grafana/restic-manager-dashboard.json`:
|
||||
|
||||
1. In Grafana, **+ → Import → Upload JSON file**.
|
||||
2. Pick the Prometheus data source you scrape with.
|
||||
3. The dashboard's six panels populate from the metrics above:
|
||||
* **Fleet status** — online/total stat panel.
|
||||
* **Open alerts** — by severity.
|
||||
* **Hosts** — per-host table (last backup, repo size, snapshots, alerts).
|
||||
* **Repo size over time** — one line per host.
|
||||
* **Backups failing** — count of hosts whose last backup didn't succeed.
|
||||
* **Job duration p95** — `histogram_quantile(0.95, …)` over a 1h window per kind.
|
||||
|
||||
Alerting is intentionally not configured in the dashboard — the
|
||||
control plane already has alerts (P3-05) with native channels for
|
||||
webhook, ntfy, and SMTP. Re-implementing them in Prometheus would
|
||||
just duplicate state. If you do want Prom-side alerts, write your own
|
||||
alerting rules against the metrics documented above and keep them in your usual rules location.
|
||||
|
||||
## Cardinality
|
||||
|
||||
Per scrape: O(hosts) gauge rows + O(kinds × statuses × buckets)
|
||||
histogram rows. A 100-host fleet emits roughly 700 host rows + 270
|
||||
histogram bucket rows (9 kinds × 3 statuses × 10 buckets; `_sum` and `_count` add 54 more) — well below any practical limit. There are no
|
||||
`job_id` labels (cardinality bomb avoidance) and no per-source-group
|
||||
labels.
|
||||
@@ -1,223 +0,0 @@
|
||||
# Always-On vs Intermittent host mode
|
||||
|
||||
**Date:** 2026-06-15
|
||||
**Branch:** `feat-laptop-host-mode`
|
||||
**Status:** Design — awaiting review
|
||||
|
||||
## Problem
|
||||
|
||||
The server currently assumes every host should be present 24×7. When an
|
||||
agent stops heartbeating for 90s it is flipped to `offline`, and after 15
|
||||
minutes that raises a `warning` alert. This is correct for a server, but
|
||||
wrong for a host that legitimately comes and goes — a workstation or
|
||||
laptop that sleeps overnight, travels, or is shut down on weekends. Such
|
||||
a host generates noise alerts every time it is closed, and — more
|
||||
importantly — there is **no mechanism to catch up a backup it missed
|
||||
while it was away.**
|
||||
|
||||
Two distinct facts make the catch-up gap real:
|
||||
|
||||
- **Backup cron runs on the agent, locally.** The agent fires
|
||||
`MsgScheduleFire`; the server only dispatches in response. If the host
|
||||
is asleep, the agent process is suspended, so the cron tick never
|
||||
fires and no `MsgScheduleFire` is ever sent.
|
||||
- Therefore the existing `pending_runs` retry queue **does not** cover
|
||||
this case. `pending_runs` only gets a row when a schedule *fired* but
|
||||
the agent was momentarily disconnected at dispatch time. A window
|
||||
missed entirely during sleep never enqueues anything.
|
||||
|
||||
## Goal
|
||||
|
||||
Let an operator mark a host as **not** always-on. Such a host:
|
||||
|
||||
1. Does **not** raise offline/agent-down alerts when it is not visible.
|
||||
2. Renders a distinct, calm "asleep" state in the UI instead of the
|
||||
alarming red "offline".
|
||||
3. When it reconnects, after a short settle delay, the server checks
|
||||
whether it missed a scheduled backup and — if so — triggers a
|
||||
catch-up backup automatically.
|
||||
4. Still raises a *staleness* alert if it has genuinely gone too long
|
||||
without any backup (a host left in a drawer). This is the only
|
||||
alert covering an asleep host: while the agent is offline no job
|
||||
runs, so there is no failure to detect — staleness is the safety
|
||||
net for "no backups are happening at all."
|
||||
5. Leaves normal job-failure alerting untouched: a backup that
|
||||
actually runs (scheduled or catch-up) and fails alerts as it does
|
||||
today. Failures can only occur while the agent is online and
|
||||
executing restic.
|
||||
|
||||
Default behaviour is unchanged for the entire existing fleet.
|
||||
|
||||
## Decisions (from brainstorming)
|
||||
|
||||
- **Setting shape:** a single boolean `Always On` checkbox per host,
|
||||
**default ON**. Checked = today's 24×7 server semantics. Unchecked =
|
||||
intermittent host. Opt-in only; zero behaviour change for current and
|
||||
future hosts unless explicitly toggled.
|
||||
- **Overdue trigger:** evaluated on **reconnect + behind schedule**
|
||||
(not a continuous always-evaluating sweep).
|
||||
- **Alert policy for intermittent hosts:** suppress offline alerts;
|
||||
keep a long-threshold **staleness** alert; keep job-failure alerts.
|
||||
- **Staleness threshold:** **7 days**, a global constant for v1. May
|
||||
become per-host configurable later — out of scope now.
|
||||
- **Catch-up granularity:** **per enabled schedule.** A host with a
|
||||
daily and a weekly schedule catches up only whichever is actually
|
||||
behind.
|
||||
- **UI vocabulary:** not-visible intermittent host shows a grey
|
||||
`asleep` state; detail line reads
|
||||
`asleep · last seen <relTime> · will catch up on return`.
|
||||
- **Chip:** chip and checkbox highlight the **same** truth (24×7). Show
|
||||
a chip for **Always-On** hosts; **no** chip for intermittent.
|
||||
|
||||
## Architecture
|
||||
|
||||
The change is deliberately a thin policy + presentation layer over the
|
||||
existing online/offline state machine. We do **not** add a new `status`
|
||||
enum value or alter heartbeat / `last_seen_at` tracking. "Asleep" is a
|
||||
reinterpretation of `status='offline' AND NOT always_on`.
|
||||
|
||||
### 1. Data model
|
||||
|
||||
- **Migration `0024_hosts_always_on.sql`:**
|
||||
```sql
|
||||
ALTER TABLE hosts ADD COLUMN always_on INTEGER NOT NULL DEFAULT 1;
|
||||
```
|
||||
Column-level ALTER per the repo's migration rules. Default `1` means
|
||||
every existing row is Always-On — no behaviour change on upgrade.
|
||||
- `store/types.go`: add `AlwaysOn bool` to the `Host` struct; thread it
|
||||
through every host SELECT scan and the host insert/update paths.
|
||||
- New store helper `SetHostAlwaysOn(ctx, hostID, bool) error`.
|
||||
|
||||
### 2. Online/offline mechanics — UNCHANGED
|
||||
|
||||
The 30s offline sweeper (`cmd/server/main.go:220`) still flips an unseen
|
||||
host to `status='offline'` and still calls
|
||||
`alertEngine.NotifyHostOffline(id)`. `TouchHost` / `MarkHostHello`
|
||||
behaviour is untouched. The intermittent distinction is applied
|
||||
*downstream* of this state, in the alert engine and the templates.
|
||||
|
||||
### 3. Alert behaviour
|
||||
|
||||
All changes key off `host.AlwaysOn`, which the engine already has access
|
||||
to via the host row it loads.
|
||||
|
||||
- **Suppress offline alert** (`alert/engine.go` `handleHostOffline()`
|
||||
and the 60s `tick()`): when `!host.AlwaysOn`, do not raise
|
||||
`agent_offline`.
|
||||
- **Resolve-on-toggle:** when a host is switched Always-On→intermittent and
|
||||
has an open `agent_offline` alert, auto-resolve it. (Handled in the
|
||||
mode-change handler, fanning through the normal resolve path so
|
||||
channels/audit fire as usual.)
|
||||
- **Staleness alert** — wire up the currently-dead `KindStaleSchedule`
|
||||
constant, **for intermittent hosts only.** On the 60s tick, for each
|
||||
host where `!AlwaysOn` AND the host has ≥1 enabled schedule AND
|
||||
`LastBackupAt != nil` AND `now - LastBackupAt > 7*24h`: raise a
|
||||
`warning` `stale_schedule` alert (dedup key `""`, one per host).
|
||||
Auto-resolves when `LastBackupAt` advances past the threshold (i.e.
|
||||
any successful backup, including the catch-up). Always-On hosts'
|
||||
`stale_schedule` remains a no-op (unchanged, out of scope).
|
||||
- If `LastBackupAt == nil` (intermittent host enrolled but never
|
||||
backed up): no staleness alert in v1 — there is no baseline to
|
||||
measure against, and onboarding probe state (`repo_status`) already
|
||||
covers "never successfully set up."
|
||||
- **Job-failure alerts:** untouched. A catch-up backup that runs and
|
||||
fails alerts exactly like any other backup.
|
||||
|
||||
### 4. Catch-up on reconnect
|
||||
|
||||
A new small component — the **catch-up scheduler** — lives server-side
|
||||
alongside the existing ticks.
|
||||
|
||||
- **Arm:** on agent hello (`server/ws/handler.go` hello path /
|
||||
`onAgentHello`), if the host is `!AlwaysOn`, record
|
||||
`catchupDueAt[hostID] = now + 60s` in an in-memory map. Re-arming on a
|
||||
subsequent hello just overwrites the timestamp (debounce — rapid
|
||||
flapping does not stack catch-ups). In-memory is acceptable: catch-up
|
||||
is best-effort and a server restart simply re-arms on the next hello.
|
||||
- **Fire:** reuse the existing 30s server tick. For each due entry
|
||||
(`catchupDueAt <= now`):
|
||||
1. Re-verify the agent is still connected (`Hub.Connected(hostID)`).
|
||||
If it bounced back offline within the settle window, drop the entry
|
||||
(it will re-arm on the next hello).
|
||||
2. Skip if a backup is already running or queued for the host
|
||||
(`current_job_id` set, or a relevant `pending_runs` row exists) —
|
||||
avoid double-firing alongside a normal dispatch or pending drain.
|
||||
3. For each **enabled** schedule on the host, compute overdue:
|
||||
```
|
||||
overdue := sched.Next(host.LastBackupAt) <= now
|
||||
```
|
||||
using `robfig/cron/v3` (already a dependency) to parse
|
||||
`Schedule.CronExpr`. `Next(lastBackup)` is the first fire strictly
|
||||
after the last successful backup; if that moment has already
|
||||
passed, the window was missed → overdue. (If `LastBackupAt` is nil,
|
||||
treat as overdue so a never-backed-up intermittent host with a
|
||||
schedule gets its first run on connect.)
|
||||
4. For each overdue schedule, dispatch its source-groups via the
|
||||
existing `dispatchBackupForGroupCore()`.
|
||||
5. Clear the entry.
|
||||
|
||||
Net latency is ~60–90s after wake (60s settle + up to one 30s tick).
|
||||
This path is independent of and complementary to the `pending_runs`
|
||||
drain, which continues to handle the fired-but-not-sent case.
|
||||
|
||||
### 5. UI
|
||||
|
||||
- **CSS:** new grey `dot-asleep` token in `web/styles/input.css`,
|
||||
visually distinct from red `dot-offline`.
|
||||
- **`partials/host_row.html` and `partials/host_chrome.html`:** when
|
||||
`!AlwaysOn && status=='offline'`, render the grey dot + label
|
||||
`asleep`; the detail/last-seen line reads
|
||||
`asleep · last seen <relTime> · will catch up on return`. All other
|
||||
states unchanged.
|
||||
- **24×7 chip:** on the host detail header, render a small
|
||||
`Always On` / `24×7` chip **only when `AlwaysOn` is true**. No chip
|
||||
for intermittent hosts. (Chip and checkbox highlight the same fact.)
|
||||
- **Toggle:** an `Always On` checkbox (default checked) on the host edit
|
||||
surface. Operator-band `POST` (mirrors existing host-edit handlers),
|
||||
audited as `host.mode_updated`. On save, if switching to intermittent,
|
||||
trigger the resolve-on-toggle path for any open `agent_offline` alert.
|
||||
|
||||
## Error handling & edge cases
|
||||
|
||||
- **Toggle server→intermittent while offline+alerting:** open
|
||||
`agent_offline` alert auto-resolved on save.
|
||||
- **Toggle intermittent→server while asleep:** host resumes normal
|
||||
offline/alert semantics; it will alert per the 15-minute floor once
|
||||
the sweeper/tick next evaluates it.
|
||||
- **No enabled schedules:** no catch-up and no staleness alert — there
|
||||
is no backup expectation to measure against.
|
||||
- **Catch-up vs in-flight work:** guarded by the running/queued check in
|
||||
§4 step 2 (Fire) so catch-up never races a normal dispatch or pending drain.
|
||||
- **Agent flaps during settle window:** entry dropped if not connected
|
||||
at fire time; re-armed on the next hello.
|
||||
|
||||
## Testing
|
||||
|
||||
- **Alert engine (unit):**
|
||||
- offline alert suppressed when `!AlwaysOn`.
|
||||
- staleness alert raised when intermittent + schedule + last backup >
|
||||
7d; not raised for Always-On hosts; not raised when last backup is
|
||||
recent; not raised when no enabled schedule.
|
||||
- staleness alert auto-resolves after a backup advances `LastBackupAt`.
|
||||
- server→intermittent toggle resolves an open `agent_offline` alert.
|
||||
- **Overdue computation (unit, table-driven):** `(cronExpr,
|
||||
lastBackupAt, now) → overdue?` including nil-last-backup and
|
||||
daily/weekly cases.
|
||||
- **Catch-up scheduler (unit):** fires only when still connected; skips
|
||||
when a backup is running/queued; dispatches only overdue schedules.
|
||||
- **UI (render test):** asleep state + 24×7 chip render under the right
|
||||
conditions; offline state for Always-On hosts unchanged.
|
||||
- `go vet ./...` and full `go test ./...` green before merge.
|
||||
|
||||
## Out of scope
|
||||
|
||||
- Per-host staleness thresholds (global 7d constant for v1).
|
||||
- Continuous (non-reconnect) overdue evaluation.
|
||||
- Agent-side catch-up cron — the server is the reliable arbiter.
|
||||
- Wiring `stale_schedule` for Always-On hosts (separate concern).
|
||||
|
||||
## Task tracking
|
||||
|
||||
Add an entry to `tasks.md` under "Next steps from testing" (or a new
|
||||
small section) once the plan is approved, per the repo's tasks.md
|
||||
source-of-truth rule.
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,259 @@
|
||||
# P2 Completion Implementation Plan
|
||||
|
||||
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
|
||||
|
||||
**Goal:** Close every remaining P2 task in `tasks.md`: P2R-09 (auto-init UX), P2R-10/11/12 (hooks), P2R-13 (bandwidth wiring + per-job override), P2R-14 (schedule next/last run), P2-16 (Windows svc), P2-17 (`install.ps1`), P2-18 (announce-and-approve).
|
||||
|
||||
**Architecture:** Server stays HTTP+WS; agent stays a single binary that auto-restages via `make build`. Hooks live on `source_groups` (and host-level defaults). Announce-and-approve adds a separate WS path (`/ws/agent/pending`) and a Pending hosts panel; token-flow stays default. Windows service support uses `golang.org/x/sys/windows/svc` behind a `//go:build windows` tag — Linux builds untouched. **Operator is away — make best guesses on small UX choices, but commit each item separately so the choices are reviewable.**
|
||||
|
||||
**Tech Stack:** Go 1.23+, chi router, modernc/sqlite, `coder/websocket`, `robfig/cron/v3`, HTMX + Tailwind, `golang.org/x/sys/windows/svc`, Ed25519 (stdlib).
|
||||
|
||||
---
|
||||
|
||||
## Pre-flight
|
||||
|
||||
- [ ] **Run baseline:** `go vet ./... && go build ./... && go test ./...` — must be green before starting. Restage agent + restart server (per CLAUDE.md restage block) so smoke env is warm.
|
||||
|
||||
## Order of execution
|
||||
|
||||
Smallest blast-radius first. Bandwidth → next/last → auto-init UI polish → hooks → announce → Windows. Commit and restage at each task boundary. Run `go vet ./... && go test ./...` before every commit.
|
||||
|
||||
---
|
||||
|
||||
## Task 1 — P2R-13a: Wire bandwidth caps into restic invocations
|
||||
|
||||
**Files:**
|
||||
- Modify: `internal/restic/runner.go` (add `LimitUploadKBps`, `LimitDownloadKBps` to `Env` or to a per-call options struct already present; emit `--limit-upload N`/`--limit-download N` on `restic backup|forget|prune|check|restore`)
|
||||
- Modify: `internal/agent/runner/*.go` — pass host-wide caps into the runner. Caps come from `agent.config.Config` or are pushed via `config.update`. Decision: ship caps in the existing `config.update` envelope as new fields `bandwidth_up_kbps`, `bandwidth_down_kbps`. Server pushes on hello + on `PUT /api/hosts/{id}/bandwidth`.
|
||||
- Modify: `internal/api/messages.go` — extend `ConfigUpdatePayload` with the two int pointers.
|
||||
- Modify: `internal/server/ws/handler.go` (or wherever hello/config push lives) — include caps in the pushed config.
|
||||
- Modify: `internal/server/http/host_bandwidth.go` — after `SetHostBandwidth`, fan out a `config.update` to the connected agent (mirror the credentials-edit path).
|
||||
- Test: `internal/restic/runner_test.go` — assert flag injection.
|
||||
- Test: `internal/server/ws/*_test.go` — assert config.update carries caps on hello and on edit.
|
||||
|
||||
- [ ] **Step 1.1** Add `LimitUploadKBps *int`, `LimitDownloadKBps *int` to whatever per-host config the runner already consults. Existing pattern is `restic.Env{}`; extend it.
|
||||
- [ ] **Step 1.2** Failing test in `internal/restic/runner_test.go`: build a backup command with `LimitUploadKBps=1024`, assert the resulting argv contains `--limit-upload 1024`.
|
||||
- [ ] **Step 1.3** Implement: prepend the flags in argv builders for `backup`, `forget`, `prune`, `check`, `restore`. Skip when nil/<=0.
|
||||
- [ ] **Step 1.4** Wire `config.update` payload — server reads `Host.BandwidthUpKBps`/`DownKBps`, includes them in the existing `ConfigUpdatePayload` push on hello and on bandwidth edit (mirror cred-edit fan-out in `internal/server/http/host_credentials.go`).
|
||||
- [ ] **Step 1.5** Agent applies caps: store in the in-memory dispatcher state on `config.update`, attach to every restic call.
|
||||
- [ ] **Step 1.6** `go vet ./... && go test ./... && make build && <restage block>`. Commit:
|
||||
```
|
||||
agent+server: apply host bandwidth caps to restic invocations
|
||||
```
|
||||
|
||||
## Task 2 — P2R-13b: Per-job override on Run-now confirm dialog
|
||||
|
||||
**Decision:** A small numeric input on the per-source-group Run-now button (and dashboard Run-all). Operator is away — keep it minimal: two optional inputs (up/down KB/s) on the dispatch endpoint; UI shows a `<details>` "Limit bandwidth for this run" disclosure with two number inputs.
|
||||
|
||||
**Files:**
|
||||
- Modify: `internal/server/http/sources.go` (or wherever the per-group Run-now POST lives) — accept optional `bandwidth_up_kbps`/`bandwidth_down_kbps` form fields, pass through.
|
||||
- Modify: dispatch path (`internal/server/dispatch_*.go` or `ws/handler.go` job-dispatch core) — accept overrides, include in the `command.run` payload.
|
||||
- Modify: `internal/api/messages.go` — `CommandRunPayload` gains optional caps that take precedence over host-wide caps when present.
|
||||
- Modify: agent dispatcher — use the payload override if present; otherwise fall back to the config caps.
|
||||
- Modify: `web/templates/pages/host_sources.html` (and the schedules Run-now form) — `<details>` block.
|
||||
- Test: HTTP test for the new form fields; agent runner test for override precedence.
|
||||
|
||||
- [ ] **Step 2.1** Failing test: POST to per-group Run-now with `bandwidth_up_kbps=512` → assert dispatched payload carries 512.
|
||||
- [ ] **Step 2.2** Implement endpoint changes + payload extension.
|
||||
- [ ] **Step 2.3** Agent override precedence test (payload wins over config).
|
||||
- [ ] **Step 2.4** UI `<details>` blocks (one per Run-now form).
|
||||
- [ ] **Step 2.5** Playwright spot-check via `:8080` smoke env: open Sources tab, expand the Run-now disclosure, fire with limit=128, then open the live job log and confirm the agent's restic argv (read `/tmp/rm-smoke/server.log` for the dispatched command — it logs argv) shows `--limit-upload 128`.
|
||||
- [ ] **Step 2.6** Commit.
|
||||
|
||||
## Task 3 — P2R-14: Schedule "next run" / "last run"
|
||||
|
||||
**Files:**
|
||||
- Modify: `internal/store/schedules.go` — add `NextRun(cronExpr string, from time.Time) (time.Time, error)` derivation helper and `LatestJobByActorKindForSchedule(host_id, schedule_id) (time.Time, status, error)` (or a single batched fetch for all schedules of a host).
|
||||
- Modify: dashboard host row (`web/templates/partials/host_row.html`) — show "Next: …" and "Last: …" when there's a single covering schedule (already detected in slice 5).
|
||||
- Modify: `web/templates/pages/host_schedules.html` — add Next/Last columns to the schedules table.
|
||||
- Modify: relevant page handlers (`internal/server/http/ui_schedules.go`, dashboard handler) — populate the data.
|
||||
- Test: `schedules_test.go` for next-run derivation (parse cron, compute next from a fixed `now`).
|
||||
|
||||
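- [ ] **Step 3.1** Add `NextRun(cronExpr string, from time.Time) (time.Time, error)` helper using `robfig/cron/v3`: parse the expression (e.g. `cron.ParseStandard(cronExpr)`, or whichever parser the scheduler already uses), return the parse error if any, otherwise return `sched.Next(from)`. Test with three crons.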
|
||||
- [ ] **Step 3.2** Add `LatestJobByActorKindForSchedule(host_id, schedule_id) (time.Time, status, error)` query against `jobs` (filter `actor_kind='schedule'` AND `schedule_id=?`, ORDER BY `started_at` DESC LIMIT 1).
|
||||
- [ ] **Step 3.3** Wire schedules-page handler to populate Next/Last per row; render relative time + ISO tooltip (mirror existing `formatRelTime` template helper if it exists; otherwise use a simple "5m ago" helper).
|
||||
- [ ] **Step 3.4** Wire dashboard row: when single covering schedule, surface "Next: 03:00" / "Last: 8h ago — succeeded".
|
||||
- [ ] **Step 3.5** Playwright spot-check: a host with a schedule shows Next/Last; pause it → Next becomes "—" / "(paused)".
|
||||
- [ ] **Step 3.6** Commit.
|
||||
|
||||
## Task 4 — P2R-09: Auto-init UX polish
|
||||
|
||||
**Files:**
|
||||
- Modify: `web/templates/pages/host_repo.html` — danger-zone re-init button + two-step confirm (type the host name).
|
||||
- Modify: `internal/server/http/ui_repo.go` (or new `repo_reinit.go`) — `POST /hosts/{id}/repo/reinit` admin-only, audit-logged. Options considered: have the server force a re-init, or wipe-then-init — pick the safer of the two. Note that `restic init` has no `--force` flag and restic doesn't truly wipe a repo; the operator must clear the bucket. **Best guess:** dispatch a normal `init` job with a flag that re-runs even if the repo claims to exist; if restic refuses, surface "the repo on the remote already has data — clear it manually before re-init" via the job log.
|
||||
- Modify: host detail page header / vitals strip — surface init result line. Use the existing latest-`init`-job query to render "repo ready · initialised <relative time> ago" or "init failed · job N · retry".
|
||||
- Test: HTTP test for re-init endpoint (auth, audit, host-name confirm); template test that the result line renders for both states.
|
||||
|
||||
- [ ] **Step 4.1** Add helper: `LatestJobByKind(host_id, "init")` — already exists from P2R-06 (`store.LatestJobByKind`). Reuse.
|
||||
- [ ] **Step 4.2** Render init line into vitals strip; show "init failed" amber when latest init failed.
|
||||
- [ ] **Step 4.3** Implement `POST /hosts/{id}/repo/reinit` handler — admin role check, requires a `confirm_hostname` form field that must equal `host.Name`, returns 400 otherwise. Dispatches a fresh `init` job.
|
||||
- [ ] **Step 4.4** Add danger-zone re-init form to `host_repo.html` (currently disabled per slice 4). Two-step confirm with the typed hostname.
|
||||
- [ ] **Step 4.5** Playwright: visit `/hosts/{id}/repo`, click re-init, type wrong hostname → blocked; type right hostname → dispatches init job → returns to live log.
|
||||
- [ ] **Step 4.6** Commit.
|
||||
|
||||
## Task 5 — P2R-10: Hook schema (migration 0010)
|
||||
|
||||
**Files:**
|
||||
- Create: `internal/store/migrations/0010_hooks.sql`
|
||||
- `ALTER TABLE source_groups ADD COLUMN pre_hook BLOB;` (AEAD ciphertext, NULLable)
|
||||
- `ALTER TABLE source_groups ADD COLUMN post_hook BLOB;`
|
||||
- `ALTER TABLE hosts ADD COLUMN pre_hook_default BLOB;`
|
||||
- `ALTER TABLE hosts ADD COLUMN post_hook_default BLOB;`
|
||||
- All four are AEAD ciphertext (existing `crypto.AEAD`); BLOB column type.
|
||||
- Modify: `internal/store/types.go` — add `PreHook *string` (decrypted), `PostHook *string` to `SourceGroup`; same to `Host`.
|
||||
- Modify: `internal/store/sources.go` + `internal/store/hosts.go` — getters/setters encrypt on write, decrypt on read. Pass `crypto.AEAD` through (pattern mirrors `host_credentials.go`).
|
||||
- Test: encrypt/decrypt round-trip; setting `nil` clears the column.
|
||||
|
||||
- [ ] **Step 5.1** Write migration SQL. Column-level ALTERs only (per CLAUDE.md).
|
||||
- [ ] **Step 5.2** Update store types + getters/setters with AEAD encrypt/decrypt. Mirror `internal/store/host_credentials.go` patterns exactly.
|
||||
- [ ] **Step 5.3** Round-trip test: set hook on a source group; reload; assert plaintext returned. Set nil; assert nil after reload.
|
||||
- [ ] **Step 5.4** `go vet && go test`. Commit.
|
||||
|
||||
## Task 6 — P2R-11: Agent execution of hooks
|
||||
|
||||
**Files:**
|
||||
- Modify: `internal/api/messages.go` — `ConfigUpdatePayload` (or the per-source-group bundle inside `ScheduleSetPayload`) carries `PreHook`, `PostHook` plaintext (server has decrypted by then; wire is authenticated WS, same trust boundary as repo creds).
|
||||
- Modify: agent dispatcher — for `kind=backup` only:
|
||||
- Run `pre_hook` (if present) via `os/exec` with the host shell (`/bin/sh -c` on Linux, `cmd.exe /C` on Windows). Capture stdout+stderr → JobLog with `hook:` prefix. Non-zero exit aborts the backup, marks the job failed with `pre_hook` error.
|
||||
- Run `post_hook` (if present) **always** after the backup, with `RM_JOB_STATUS=succeeded|failed` env var. Capture into JobLog, prefix `hook:`. Non-zero exit on post_hook does NOT change job status (warning logged).
|
||||
- Skip both for `kind` ∈ {forget, prune, check, unlock, init} per spec.md §14.3.
|
||||
- Test: dispatcher test with a `pre_hook` that exits 1 → backup not started; `post_hook` always runs and sees `RM_JOB_STATUS`.
|
||||
|
||||
- [ ] **Step 6.1** Plumb hooks through `ScheduleSetPayload` source-group bundle + per-group Run-now `command.run` payload (override host-default with group hook if both present). Server-side resolution: host default if group hook is empty.
|
||||
- [ ] **Step 6.2** Agent dispatcher: factor hook execution into `internal/agent/runner/hooks.go`. Use `exec.CommandContext`, set env, plumb output to existing JobLog stream with `Source: "hook"` (or prefix the log lines `hook: …`).
|
||||
- [ ] **Step 6.3** Failing test in `internal/agent/runner/runner_test.go` (create file if absent): `pre_hook=/bin/false` → job fails with `pre_hook failed (exit 1)` and the actual restic backup never runs (assert via mock-restic shim).
|
||||
- [ ] **Step 6.4** Test: `post_hook` runs even when backup fails; receives `RM_JOB_STATUS=failed`.
|
||||
- [ ] **Step 6.5** Test: hooks skipped on `forget`/`prune`/`check`/`unlock`/`init` jobs.
|
||||
- [ ] **Step 6.6** `go vet && go test && make build && <restage block>`. Commit.
|
||||
|
||||
## Task 7 — P2R-12: Hook editor UI
|
||||
|
||||
**Files:**
|
||||
- Modify: `web/templates/pages/source_group_edit.html` (new or extend existing source-group form) — `<textarea>` for pre_hook, `<textarea>` for post_hook, with the warning banner: "this hook runs as the agent service user (root on Linux; LocalSystem on Windows)".
|
||||
- Modify: source-group HTTP handler (`internal/server/http/sources.go`) — accept hook fields on POST/PUT, encrypt-and-persist via store.
|
||||
- Modify: `web/templates/pages/host_repo.html` — host-level hook defaults. A new "Settings" tab section on host detail was considered, but that surface is currently inert per P1-25. **Decision:** add `pre_hook_default` / `post_hook_default` to the Repo page under a new "Hooks" section since Settings is still inert.
|
||||
- Modify: source-group form — add an admin-only check. Open question: should operators be allowed a post-only edit? **Decision:** admin-only edit per spec; render the fields but disable them for operators.
|
||||
- Modify: audit-log writer — emit `source_group.hook_updated` and `host.default_hook_updated` events (without the hook body).
|
||||
- Test: HTTP test for create + update; admin-only enforcement; audit row written without secret.
|
||||
|
||||
- [ ] **Step 7.1** Source-group form extension + handler wiring.
|
||||
- [ ] **Step 7.2** Repo page Hooks section (host defaults).
|
||||
- [ ] **Step 7.3** Audit entries.
|
||||
- [ ] **Step 7.4** Playwright: as admin, set a `pre_hook` of `echo hello`, fire Run-now, open live log, confirm `hook: hello` line appears.
|
||||
- [ ] **Step 7.5** Commit.
|
||||
|
||||
## Task 8 — P2-18a: Announce schema + endpoint
|
||||
|
||||
**Files:**
|
||||
- Create: `internal/store/migrations/0011_pending_hosts.sql`
|
||||
```sql
|
||||
CREATE TABLE pending_hosts (
|
||||
id TEXT PRIMARY KEY,
|
||||
hostname TEXT NOT NULL,
|
||||
os TEXT NOT NULL,
|
||||
arch TEXT NOT NULL,
|
||||
agent_version TEXT NOT NULL,
|
||||
restic_version TEXT NOT NULL,
|
||||
public_key BLOB NOT NULL, -- 32-byte Ed25519
|
||||
fingerprint TEXT NOT NULL, -- "SHA256:hex"
|
||||
announced_from_ip TEXT NOT NULL,
|
||||
first_seen_at TEXT NOT NULL,
|
||||
last_seen_at TEXT NOT NULL,
|
||||
expires_at TEXT NOT NULL
|
||||
);
|
||||
CREATE INDEX pending_hosts_expires ON pending_hosts(expires_at);
|
||||
CREATE INDEX pending_hosts_fingerprint ON pending_hosts(fingerprint);
|
||||
```
|
||||
- Create: `internal/store/pending_hosts.go` — `CreatePendingHost`, `GetPendingHostByFingerprint`, `ListPendingHosts`, `DeletePendingHost`, `TouchPendingHost`, `DeleteExpiredPendingHosts`.
|
||||
- Create: `internal/server/http/announce.go` — `POST /api/agents/announce` accepts `{hostname, os, arch, agent_version, restic_version, public_key (base64)}`. Validates protocol_version implicitly via `agent_version` check. Token-bucket rate limit per source IP (10/min). Global cap 100 pending rows. Returns `{fingerprint, pending_id, hostname_collision: bool}`.
|
||||
- Test: `announce_test.go` — happy path; rate limit; cap; collision flag.
|
||||
|
||||
- [ ] **Step 8.1** Migration + store layer + tests.
|
||||
- [ ] **Step 8.2** Endpoint + tests (use a fake clock + in-process token bucket).
|
||||
- [ ] **Step 8.3** Commit.
|
||||
|
||||
## Task 9 — P2-18b: Pending WS + accept/reject
|
||||
|
||||
**Files:**
|
||||
- Create: `internal/server/ws/pending.go` — `GET /ws/agent/pending` upgrade. Server issues a 32-byte nonce; agent signs it with its Ed25519 private key; server verifies against the `public_key` stored on the pending row keyed by the supplied `pending_id`. If valid, hold the connection open; on accept, push a single `enrolled` message containing `{bearer_token, repo_credentials_aead_blob}` and close cleanly. On reject, close with code 4001 + reason "rejected".
|
||||
- Create: `internal/server/http/pending.go` — admin-only `POST /api/pending-hosts/{id}/accept` (atomically: mint bearer, encrypt the admin-supplied repo creds (passed in the form) for storage, promote pending row → real `hosts` row, push `enrolled` to the open WS, audit-log) and `POST /api/pending-hosts/{id}/reject` (delete row + close socket).
|
||||
- Modify: server `main.go` route registration.
|
||||
- Test: integration test — fake agent opens pending WS, admin POST /accept, agent receives bearer.
|
||||
|
||||
- [ ] **Step 9.1** Pending WS handler with nonce-sign verify.
|
||||
- [ ] **Step 9.2** Accept/reject endpoints. Accept reuses the existing token-consume path internally (mints persistent bearer from `crypto.RandomToken`-style helper, inserts host row + `host_credentials`).
|
||||
- [ ] **Step 9.3** Tests.
|
||||
- [ ] **Step 9.4** Commit.
|
||||
|
||||
## Task 10 — P2-18c: Agent announce path
|
||||
|
||||
**Files:**
|
||||
- Modify: `cmd/agent/main.go` — when `RM_TOKEN` is unset, switch to announce mode instead of erroring out. `RM_SERVER` still required.
|
||||
- Create: `internal/agent/announce/announce.go` — generate-or-load Ed25519 keypair (persisted as a file alongside `secrets.enc`, mode 0600). POST `/api/agents/announce`. Open `/ws/agent/pending`. Wait. On `enrolled` message, persist bearer to `agent.yaml`, persist repo creds via existing secrets store, exit announce mode and reconnect via the normal WS path.
|
||||
- Modify: `deploy/install/install.sh` — when `RM_TOKEN` is missing, run agent in announce mode and `journalctl --follow` until the agent prints the fingerprint, print it to the operator's terminal in big copy-friendly format, then keep following until enrolled.
|
||||
- Test: end-to-end test in `internal/server/...` using a fake agent.
|
||||
|
||||
- [ ] **Step 10.1** Keypair generation + persistence.
|
||||
- [ ] **Step 10.2** Announce client + pending WS client; print `SHA256:…` fingerprint to stdout in a banner.
|
||||
- [ ] **Step 10.3** Install script branch.
|
||||
- [ ] **Step 10.4** Playwright: register a host via announce mode (run agent locally with no RM_TOKEN), log into UI, see Pending hosts panel with the fingerprint, click Accept, confirm host appears.
|
||||
- [ ] **Step 10.5** Commit.
|
||||
|
||||
## Task 11 — P2-18d: Pending hosts UI panel
|
||||
|
||||
**Files:**
|
||||
- Modify: `web/templates/pages/dashboard.html` — add Pending hosts panel above the host list when any pending rows exist.
|
||||
- Modify: dashboard handler — `Store.ListPendingHosts(now)` (auto-skips expired).
|
||||
- Add buttons → POST `/api/pending-hosts/{id}/accept` and `/reject` via HTMX.
|
||||
- Background sweeper for `DeleteExpiredPendingHosts` every 60s (mirror the existing offline-sweeper goroutine pattern).
|
||||
|
||||
- [ ] **Step 11.1** Sweeper goroutine.
|
||||
- [ ] **Step 11.2** Dashboard handler + template.
|
||||
- [ ] **Step 11.3** Accept form must include the same repo URL/user/pw fields as the token-mint form (admin still supplies repo creds at accept time).
|
||||
- [ ] **Step 11.4** Playwright sweep.
|
||||
- [ ] **Step 11.5** Commit.
|
||||
|
||||
## Task 12 — P2-16: Windows service integration
|
||||
|
||||
**Decision:** Cannot test on Windows from WSL. Goal is a clean compile under `GOOS=windows GOARCH=amd64` and code that follows the canonical `golang.org/x/sys/windows/svc/example` pattern. Untestable beyond compile + manual review; mark in commit message.
|
||||
|
||||
**Files:**
|
||||
- Create: `internal/agent/service/service_windows.go` (build tag `//go:build windows`) — implements `svc.Handler`. `Execute` starts the agent's main loop in a goroutine, listens for `svc.Stop`/`svc.Shutdown`, cancels ctx, waits.
|
||||
- Create: `internal/agent/service/service_other.go` (build tag `//go:build !windows`) — stub `Run` (the same entry point `cmd/agent/main.go` calls as `service.Run()`) that just runs the agent loop in the foreground.
|
||||
- Create: `internal/agent/service/install_windows.go` — `Install`, `Uninstall`, `Start`, `Stop` thin wrappers around `mgr` package.
|
||||
- Modify: `cmd/agent/main.go` — sub-commands: `install`, `uninstall`, `start`, `stop`, `run` (default). `run` delegates to `service.Run()` which on Windows checks `svc.IsWindowsService()` and dispatches accordingly.
|
||||
- Test: `internal/agent/service/service_windows_test.go` (build-tagged) for argv parsing only — actual SCM interaction can't be tested in CI.
|
||||
|
||||
- [ ] **Step 12.1** Implement the svc.Handler shell.
|
||||
- [ ] **Step 12.2** Install/uninstall wrappers (use `mgr.Connect()`, `m.CreateService(name, exepath, mgr.Config{...}, "run")`).
|
||||
- [ ] **Step 12.3** Cross-compile check: `GOOS=windows GOARCH=amd64 go build ./cmd/agent` must succeed.
|
||||
- [ ] **Step 12.4** Commit with note "untested on Windows; compile-verified only".
|
||||
|
||||
## Task 13 — P2-17: install.ps1
|
||||
|
||||
**Files:**
|
||||
- Create: `deploy/install/install.ps1` — PowerShell 5.1+ compatible. Checks admin elevation. Downloads agent binary from `$RM_SERVER/agent/binary?os=windows&arch=amd64`. Drops it at `C:\Program Files\restic-manager\restic-manager-agent.exe`. Runs `restic-manager-agent.exe install` (registers service). Starts it. Detects existing tasks named `*restic*` via `Get-ScheduledTask` and prints them — does not auto-disable. Writes `C:\ProgramData\restic-manager\agent.yaml` with `RM_SERVER` + `RM_TOKEN` (or no token if announce-mode).
|
||||
- Modify: `internal/server/http/install.go` (or wherever install scripts are served) to also serve `/install/install.ps1`.
|
||||
- Modify: CLAUDE.md restage block to also stage `install.ps1`.
|
||||
|
||||
- [ ] **Step 13.1** Write the script.
|
||||
- [ ] **Step 13.2** Wire serving + restage.
|
||||
- [ ] **Step 13.3** Smoke parse (requires `pwsh` on PATH): `pwsh -NoProfile -Command 'Get-Command -Syntax (Get-ChildItem deploy/install/install.ps1)'`, or alternatively a parse-only check via `pwsh -NoProfile -Command '$null = [scriptblock]::Create((Get-Content deploy/install/install.ps1 -Raw))'` (single-quoted so the calling shell does not expand `$null`). Skip if no pwsh available — note in commit.
|
||||
- [ ] **Step 13.4** Commit.
|
||||
|
||||
## Task 14 — Final integration sweep
|
||||
|
||||
- [ ] **Step 14.1** `go vet ./... && go test ./... -race`. Full build. Restage. Restart server.
|
||||
- [ ] **Step 14.2** Playwright walkthrough on `:8080`: login → dashboard shows pending-hosts empty state → create source group → set a `pre_hook` → Run-now with bandwidth override → confirm hook fires + bandwidth applied → schedules tab shows next/last → repo page shows init-OK line → re-init flow gated by typed hostname.
|
||||
- [ ] **Step 14.3** Update `tasks.md`: tick P2R-09, P2R-10, P2R-11, P2R-12, P2R-13, P2R-14, P2-16, P2-17, P2-18 done. Update Phase 2 acceptance line items as satisfied.
|
||||
- [ ] **Step 14.4** Open PR `p2-completion → main` with a summary of every item closed.
|
||||
|
||||
---
|
||||
|
||||
## Decisions made on the operator's behalf (away)
|
||||
|
||||
1. **Bandwidth UI for per-job override:** small `<details>` disclosure under each Run-now button. Simpler than a modal; matches the rest of the app's progressive-disclosure style.
|
||||
2. **Re-init UX:** server dispatches a fresh `init` job; if restic refuses because the repo already exists, surfaces the error in the job log and instructs the operator to clear the remote bucket. We don't try to forcibly wipe — too dangerous, and the agent doesn't have credentials to wipe S3/B2/etc generically.
|
||||
3. **Hooks editor lives on the Repo page (host defaults) + on the source-group edit form (per-group override).** Skips inventing a new "Settings" tab since that surface is still inert.
|
||||
4. **Announce flow:** admin still supplies repo creds at accept time (same form as the token-mint flow). The pending row only carries identity-of-the-endpoint material, never repo creds.
|
||||
5. **Windows service:** compile-verified only; untested. Commit message will say so.
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,131 @@
|
||||
# P5-03 implementation plan — Docker-only release
|
||||
|
||||
Spec: `docs/superpowers/specs/2026-05-05-p5-03-docker-only-release.md`.
|
||||
|
||||
Branch: `p5-03-docker-release`. Do not auto-open a PR (see CLAUDE.md
|
||||
memory: CI runs are expensive on the self-hosted cluster).
|
||||
|
||||
---
|
||||
|
||||
## Slice 1 — Server config + handler fallback
|
||||
|
||||
**Goal:** server can serve agent binaries / install scripts from a
|
||||
read-only "bundled assets" path when `<DataDir>` doesn't have them.
|
||||
|
||||
1. `internal/server/config/config.go` (or wherever `Cfg` lives) gains
|
||||
a `BundledAssetsDir string` field, defaulting to
|
||||
`/opt/restic-manager/dist`. Wire from `RM_BUNDLED_ASSETS_DIR` env
|
||||
var, mirroring the existing env-var conventions.
|
||||
2. `internal/server/http/agent_assets.go`:
|
||||
- `handleAgentBinary`: try `<DataDir>/agent-binaries/<name>`
|
||||
first; on `os.Stat` ENOENT, try
|
||||
`<BundledAssetsDir>/agent-binaries/<name>`; on second ENOENT,
|
||||
existing 404.
|
||||
- `handleInstallAsset`: same dual-path, with `install/` subpath.
|
||||
3. Tests in `internal/server/http/agent_assets_test.go` (new file):
|
||||
- DataDir hit serves DataDir bytes.
|
||||
- DataDir miss + bundled hit serves bundled bytes.
|
||||
- DataDir hit shadows bundled.
|
||||
- Both miss → 404 + existing error envelope.
|
||||
- Path-traversal still rejected for `install/*` (regression check).
|
||||
|
||||
**Verify:** `go vet ./...` + `go test ./internal/server/http/...`.
|
||||
|
||||
---
|
||||
|
||||
## Slice 2 — Version ldflags on both binaries
|
||||
|
||||
1. `cmd/server/main.go`: keep `var version`, add
|
||||
`var commit = "none"` and `var date = "unknown"`. Surface via
|
||||
existing version-log line.
|
||||
2. `cmd/agent/main.go`: same three vars. Agent already reports
|
||||
`agent_version` in the WS hello — extend to include commit if
|
||||
it's already plumbed through `internal/api`; otherwise leave the
|
||||
commit out of the wire and just log it on startup.
|
||||
3. `Makefile`: extend the `make build` `-ldflags` to set all three
|
||||
from `git describe --tags --always` + `git rev-parse HEAD` +
|
||||
UTC timestamp. Source-build users get real values, not "dev".
|
||||
4. `deploy/Dockerfile.server`: add `ARG COMMIT=none` and
|
||||
`ARG DATE=unknown`; pass through `-ldflags`.
|
||||
|
||||
**Verify:** `make build && ./bin/restic-manager-server -version`
|
||||
(or whatever the existing flag is) prints non-`dev` values.
|
||||
|
||||
---
|
||||
|
||||
## Slice 3 — Dockerfile bakes agents + install assets
|
||||
|
||||
1. Build stage cross-compiles three agents:
|
||||
|
||||
```dockerfile
|
||||
RUN go build -trimpath -ldflags="-s -w \
|
||||
-X main.version=${VERSION} -X main.commit=${COMMIT} -X main.date=${DATE}" \
|
||||
-o /out/agent/restic-manager-agent-linux-amd64 ./cmd/agent
|
||||
ENV GOARCH=arm64
|
||||
RUN go build ... -o /out/agent/restic-manager-agent-linux-arm64 ./cmd/agent
|
||||
ENV GOOS=windows GOARCH=amd64
|
||||
RUN go build ... -o /out/agent/restic-manager-agent-windows-amd64.exe ./cmd/agent
|
||||
```
|
||||
|
||||
(Reset `GOOS`/`GOARCH` between layers via `ENV`. Server build
|
||||
stays at `GOOS=linux GOARCH=$TARGETARCH`.)
|
||||
|
||||
2. Final stage `COPY --from=build`:
|
||||
- `/out/restic-manager-server` → `/usr/local/bin/`
|
||||
- `/out/agent/*` → `/opt/restic-manager/dist/agent-binaries/`
|
||||
- `deploy/install/install.sh` →
|
||||
`/opt/restic-manager/dist/install/install.sh`
|
||||
- `deploy/install/install.ps1` →
|
||||
`/opt/restic-manager/dist/install/install.ps1`
|
||||
- `deploy/install/restic-manager-agent.service` →
|
||||
`/opt/restic-manager/dist/install/restic-manager-agent.service`
|
||||
|
||||
3. Set `--chmod=0755` on the agent binaries and `install.sh`,
|
||||
`--chmod=0644` on the unit file and `install.ps1`. Distroless
|
||||
final stage runs as `nonroot`; bundled assets are readable by
|
||||
anyone (mode `o+r`), so the user switch doesn't break reads.
|
||||
|
||||
**Verify:**
|
||||
```sh
|
||||
docker build -f deploy/Dockerfile.server -t rm:dev .
|
||||
docker run --rm -d -p 18080:8080 \
|
||||
-e RM_LISTEN=:8080 -e RM_DATA_DIR=/data \
|
||||
-e RM_BASE_URL=http://127.0.0.1:18080 \
|
||||
-v rm-test:/data rm:dev
|
||||
curl -fsSL "http://127.0.0.1:18080/agent/binary?os=linux&arch=amd64" | wc -c
|
||||
curl -fsSL "http://127.0.0.1:18080/install/install.sh" | head -1
|
||||
```
|
||||
|
||||
Both should succeed against a fresh volume (no operator staging).
|
||||
|
||||
---
|
||||
|
||||
## Slice 4 — Release workflow
|
||||
|
||||
`.gitea/workflows/release.yml` per the spec. Two jobs:
|
||||
|
||||
1. **`image`**: checkout → setup-qemu → setup-buildx → login → compute
|
||||
tags → buildx build+push.
|
||||
2. (Future) `release-notes`: stub left as a TODO comment for now.
|
||||
Operator can hand-write release notes via the Gitea UI on first
|
||||
cut.
|
||||
|
||||
The `compute tags` shell step is the only non-trivial bit; tested
|
||||
inline by running the script with mocked `GITHUB_REF_TYPE` /
|
||||
`GITHUB_REF_NAME` env vars before committing.
|
||||
|
||||
**Verify on first dispatch:** trigger `workflow_dispatch` from the
|
||||
Gitea UI, check the runner produces `:snapshot-<sha>` and pushes
|
||||
multi-arch.
|
||||
|
||||
---
|
||||
|
||||
## Slice 5 — Tasks.md + commit + push
|
||||
|
||||
1. `tasks.md`: tick P5-03; add a one-line note that goreleaser was
|
||||
dropped in favour of Docker-only after a 2026-05-05 design pass
|
||||
(link the spec).
|
||||
2. `git add -A && git commit -m "p5-03: docker-only release path"`
|
||||
(no Co-Authored-By trailer — CLAUDE.md rule).
|
||||
3. `git push -u origin p5-03-docker-release`.
|
||||
4. **Stop.** Do not open a PR. Wait for operator review.
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,473 @@
|
||||
# P3 — Alerts (design)
|
||||
|
||||
> Phase 3 sub-spec covering the alerts engine, notification channels, and UI
|
||||
> (P3-05 / P3-06 / P3-07).
|
||||
>
|
||||
> Wireframe: `_diag/p3-alerts-wireframe/wireframe.html`. Screenshots in the
|
||||
> same directory. Spec brainstorm ran 2026-05-04; user approved all ten
|
||||
> design decisions before this spec was written.
|
||||
|
||||
## Scope locked
|
||||
|
||||
Brainstorm decisions (in order asked):
|
||||
|
||||
1. **Rule model.** Hardcoded rule set, no operator-tunable thresholds in v1.
|
||||
The engine knows about each rule type internally; per-rule config can land
|
||||
later if/when an operator asks.
|
||||
2. **Rule set.** Six rules: `backup_failed`, `forget_failed`, `prune_failed`,
|
||||
`check_failed`, `stale_schedule`, `agent_offline`.
|
||||
3. **Engine cadence.** Hybrid. Event hooks at the existing
|
||||
`MarkJobFinished` and offline-sweeper sites for the immediate triggers;
|
||||
one 60-second ticker handles stale-schedule detection and auto-resolution.
|
||||
4. **Resolution.** Auto-resolve when the underlying condition clears + manual
|
||||
Resolve at any time. Acknowledge is a separate "I've seen it" intermediate
|
||||
state that does NOT close the alert.
|
||||
5. **v1 channels.** Webhook + native ntfy + SMTP. Apprise deferred (the
|
||||
channel plumbing accepts new kinds without reshaping). SMTP added as
|
||||
a first-class channel post-brainstorm because the use case — overnight
|
||||
alerts the operator wants to read in the morning rather than be pinged
|
||||
on at 03:00 — is poorly served by ntfy's push model and clumsy via
|
||||
webhook → email-gateway.
|
||||
6. **Channel scope.** Global only. No per-host or per-severity routing in v1.
|
||||
7. **Notification body.** Structured JSON for webhooks, formatted
|
||||
title+body+click-URL for ntfy, plain-text email for SMTP, plus a per-channel "Send test notification"
|
||||
button with inline result feedback.
|
||||
8. **Deduplication.** Open-alert uniqueness on `(host_id, kind)` with a
|
||||
`last_seen_at` bump on every confirming tick. One notification per
|
||||
occurrence; the UI shows "still happening · Ns ago" while a rule keeps
|
||||
matching.
|
||||
9. **Alert UI.** Top-level `/alerts` page (the existing nav stub becomes
|
||||
real). Per-host vitals "Open alerts" cell links to `/alerts?host_id=...`.
|
||||
Channel CRUD lives at `/settings/notifications`.
|
||||
10. **Delivery semantics.** Best-effort fire-and-forget with a 5s timeout
|
||||
per notification (10s for SMTP). Failures are logged but not retried. The alert row in
|
||||
the DB is the source of truth.
|
||||
|
||||
## Architecture
|
||||
|
||||
The subsystem is three loosely-coupled units behind one `AlertEngine`
|
||||
goroutine:
|
||||
|
||||
```
|
||||
┌───────────────────────────┐
|
||||
event hooks ─────────────────►│ │
|
||||
│ AlertEngine │ ──► raise/resolve
|
||||
60s ticker ──────────────────►│ (rule evaluation) │ alert row
|
||||
│ │
|
||||
└────────────┬──────────────┘
|
||||
│
|
||||
▼
|
||||
┌──────────────────────┐
|
||||
│ notification.Hub │
|
||||
│ (fire-and-forget) │
|
||||
└──┬────────┬──────────┘
|
||||
│ │
|
||||
┌──────▼──┐ ┌──▼──────┐
|
||||
│ Webhook │ │ Ntfy │ …SMTP + future channels
|
||||
└─────────┘ └─────────┘
|
||||
```
|
||||
|
||||
### Component boundaries
|
||||
|
||||
| Component | Purpose | Depends on |
|
||||
| ---------------------------------------- | ---------------------------------------------------------------------------------------- | -------------------------------------- |
|
||||
| `internal/alert.Engine` | Owns the rule evaluation. Exposes `OnJobFinished`, `OnHostOffline`, `OnHostOnline` event hooks; runs a 60s ticker for stale-schedule + auto-resolution sweeps. Persists raises/resolves through the store. | store, notification.Hub, slog |
|
||||
| `internal/alert.Rule` + per-rule files | Each of the six rules is a small struct with `Kind() string`, `Severity() string`, `MessageFor(ctx) string`. The engine iterates over a registered slice. | store models |
|
||||
| `internal/notification.Hub` | Receives "alert raised/resolved/test" events; fans out to enabled channels in parallel; logs results to a new `notification_log` table. | store, channel adapters |
|
||||
| `internal/notification.Channel` (iface) | Single method `Send(ctx, payload) error` with a 5s context for HTTP channels, 10s for SMTP. Three impls in v1: `webhookChannel`, `ntfyChannel`, `smtpChannel`. | http.Client; net/smtp + crypto/tls for SMTP |
|
||||
| `internal/store/alerts.go` | CRUD on `alerts` table: `RaiseOrTouch(host_id, kind, severity, message)`, `Acknowledge(id, user)`, `Resolve(id, by user)`, `AutoResolve(host_id, kind)`, `ListAlerts(filter)`, plus the `last_seen_at` bump. | sqlite |
|
||||
| `internal/store/notification_channels.go` | CRUD on `notification_channels` (new table) + `notification_log` (new table). | sqlite, crypto.AEAD (for secrets) |
|
||||
| `internal/server/http/ui_alerts.go` | `/alerts` page handler + filter parsing + ack/resolve form actions. | store |
|
||||
| `internal/server/http/ui_notifications.go` | `/settings/notifications` page + channel CRUD + "Send test" handler. | store, notification.Hub |
|
||||
|
||||
### Engine event shape
|
||||
|
||||
The engine runs as one goroutine per server process started in
|
||||
`cmd/server/main.go`. It exposes a small set of channels other code writes to:
|
||||
|
||||
```go
|
||||
type Engine struct {
|
||||
store *store.Store
|
||||
hub *notification.Hub
|
||||
|
||||
// Event channels (buffered, drop-on-full with a slog warning to keep
|
||||
// hot paths non-blocking). The engine drains them on its own
|
||||
// goroutine, evaluates the rule, and acts.
|
||||
jobFinished chan jobFinishedEvent // from store.MarkJobFinished hook
|
||||
hostOffline chan string // host_id; from offline sweeper
|
||||
hostOnline chan string // host_id; from ws handler hello
|
||||
|
||||
// 60s ticker drives stale-schedule + auto-resolution sweeps.
|
||||
tick *time.Ticker
|
||||
}
|
||||
```
|
||||
|
||||
The hot-path call sites (`store.MarkJobFinished`, `ws.handler` offline
|
||||
sweep, `ws.handler` hello) push to these channels via a tiny
|
||||
`Engine.On*` hook (`OnJobFinished`, `OnHostOffline`, `OnHostOnline`) that does a non-blocking send. The engine's own
|
||||
goroutine handles every match — keeps mutation off the hot path.
|
||||
|
||||
### Rule catalogue
|
||||
|
||||
| Kind | Severity | Trigger | Auto-resolve when |
|
||||
| ------------------- | -------- | ----------------------------------------------------------------------- | -------------------------------------------------- |
|
||||
| `backup_failed` | warning | `MarkJobFinished` with kind=backup, status=failed | next backup for the same host succeeds |
|
||||
| `forget_failed` | warning | `MarkJobFinished` with kind=forget, status=failed | next forget for the same host succeeds |
|
||||
| `prune_failed` | warning | `MarkJobFinished` with kind=prune, status=failed | next prune for the same host succeeds |
|
||||
| `check_failed` | critical | `MarkJobFinished` with kind=check, status=failed OR errors_found | next check for the same host succeeds without errors |
|
||||
| `stale_schedule` | warning | 60s ticker: a schedule's next-fire time is more than 5 minutes in the past with no matching job since | next job for that schedule succeeds OR schedule deleted |
|
||||
| `agent_offline` | warning | offline-sweeper marks the host offline AND the host has been offline > 15 min (engine checks `last_seen_at`) | hostOnline event for that host |
|
||||
|
||||
The 15-minute floor on `agent_offline` exists so a 30-second blip during
|
||||
agent restart doesn't generate a notification storm. The store's existing
|
||||
offline sweeper (`hosts.last_seen_at` with 90s threshold) already marks the
|
||||
host offline; the engine sees the event but waits for the threshold before
|
||||
raising.
|
||||
|
||||
### Dedup + last_seen_at
|
||||
|
||||
`store.RaiseOrTouch(host_id, kind, severity, message)`:
|
||||
|
||||
```sql
|
||||
SELECT id, last_seen_at FROM alerts
|
||||
WHERE host_id = ? AND kind = ? AND resolved_at IS NULL
|
||||
LIMIT 1;
|
||||
```
|
||||
|
||||
- Found: `UPDATE alerts SET last_seen_at = ?, message = ? WHERE id = ?`,
|
||||
return `(id, didRaise=false)`.
|
||||
- Not found: `INSERT INTO alerts (id, host_id, kind, severity, message,
|
||||
created_at, last_seen_at) VALUES (?, ?, ?, ?, ?, ?, ?)`, return
|
||||
`(id, didRaise=true)`.
|
||||
|
||||
The engine fires a notification through the Hub only when `didRaise=true`.
|
||||
Touch-only events keep the row's `last_seen_at` fresh so the UI can render
|
||||
"still happening · Ns ago" without spamming the operator's phone.
|
||||
|
||||
### Notification payload shapes
|
||||
|
||||
**Webhook** — a single JSON envelope per event:
|
||||
|
||||
```json
|
||||
{
|
||||
"event": "alert.raised",
|
||||
"alert_id": "01KQT...",
|
||||
"severity": "warning",
|
||||
"kind": "backup_failed",
|
||||
"host_id": "01KQ...",
|
||||
"host_name": "alfa-01",
|
||||
"message": "Backup 'system-config' failed: rest-server returned 401",
|
||||
"raised_at": "2026-05-04T15:42:01Z",
|
||||
"link": "https://restic-manager.example/alerts/01KQT..."
|
||||
}
|
||||
```
|
||||
|
||||
`event` is one of `alert.raised | alert.acknowledged | alert.resolved |
|
||||
alert.test`. The same envelope shape is reused across events — operators
|
||||
build one bridge, switch on `event` and `severity`.
|
||||
|
||||
**SMTP** — single-recipient plain-text email per channel. The channel
|
||||
config carries the SMTP server credentials and a `to` address; one
|
||||
channel = one recipient (or one distribution-list address). Operators
|
||||
who want multiple recipients add multiple channels — keeps the config
|
||||
flat and the failure modes per-recipient.
|
||||
|
||||
Subject pattern is hardcoded (no per-channel template in v1):
|
||||
|
||||
```
|
||||
Subject: [restic-manager] [<severity>] <host_name>: <kind>
|
||||
From: <configured-from-address>
|
||||
To: <configured-to-address>
|
||||
Date: <RFC 5322>
|
||||
Message-ID: <alert_id@<server-host>>
|
||||
|
||||
<message line — same string the webhook/ntfy gets>
|
||||
|
||||
—
|
||||
Raised at: 2026-05-04T15:42:01Z
|
||||
Severity: warning
|
||||
Host: alfa-01
|
||||
Kind: backup_failed
|
||||
|
||||
Open in restic-manager:
|
||||
https://restic-manager.example/alerts/01KQT...
|
||||
|
||||
(This message was sent by restic-manager. Acknowledge or resolve in the UI.)
|
||||
```
|
||||
|
||||
The body is plain text only in v1 — no HTML alternative — both because
|
||||
the data is already structured well enough as text and because HTML
|
||||
email opens a long tail of rendering / sanitisation concerns. The
|
||||
`Message-ID` includes the alert id so a thread-aware client can group
|
||||
related events (raised → acknowledged → resolved) together.
|
||||
|
||||
Encryption:
|
||||
- **STARTTLS** (default, port 587). Opportunistic upgrade. Most
|
||||
operator-facing relays.
|
||||
- **Implicit TLS** (port 465). Connect-then-TLS-handshake.
|
||||
- **None** (port 25). Plain. Hidden behind a "Yes I understand" warning
|
||||
on the form because the password goes over the wire.
|
||||
|
||||
Auth:
|
||||
- **PLAIN** (RFC 4616) over TLS. Default and almost always what's wanted.
|
||||
- **CRAM-MD5** (RFC 2195). Offered if the server advertises it, no UI
|
||||
toggle — automatic.
|
||||
- No OAuth2 / XOAUTH2 in v1; that's a real next step if Gmail-without-
|
||||
app-passwords becomes a recurring ask.
|
||||
|
||||
Per-message timeout is 10s (vs 5s for HTTP channels) — STARTTLS
|
||||
handshake + DATA over a slow link can legitimately take that long.
|
||||
|
||||
**Ntfy** — uses the standard publish format:
|
||||
|
||||
```
|
||||
POST /<topic> HTTP/1.1
|
||||
Host: <server>
|
||||
Authorization: Bearer <access-token> (if configured)
|
||||
Title: [warning] alfa-01 backup failed
|
||||
Priority: 4
|
||||
Tags: warning,backup_failed
|
||||
Click: https://restic-manager.example/alerts/01KQT...
|
||||
|
||||
Backup 'system-config' failed: rest-server returned 401
|
||||
```
|
||||
|
||||
Severity → priority mapping:
|
||||
|
||||
| Severity | Priority |
|
||||
| --------- | -------- |
|
||||
| info | 3 (default) |
|
||||
| warning | 4 (high) |
|
||||
| critical | 5 (urgent) |
|
||||
|
||||
Per-channel `default_priority` setting overrides for non-critical alerts;
|
||||
critical always goes urgent regardless.
|
||||
|
||||
### Test notification
|
||||
|
||||
`POST /api/notifications/{channel_id}/test` builds a synthetic event
|
||||
(severity=info, kind=test_notification, message="Test from
|
||||
restic-manager", link to the channel's edit page) and runs it through the
|
||||
real send path. Returns `{ok: bool, latency_ms: int, status_code?: int,
|
||||
error?: string}`. UI renders the green ✓ / red ✗ feedback inline.
|
||||
|
||||
## Routes added
|
||||
|
||||
| Method | Path | Purpose |
|
||||
| ------- | ----------------------------------------------------- | ------------------------------------------------------------- |
|
||||
| GET | `/alerts` | Fleet alerts list with filters (`?status=open&severity=warning&host_id=...&q=...`) |
|
||||
| POST | `/alerts/{id}/acknowledge` | Mark alert acknowledged (HTMX form) |
|
||||
| POST | `/alerts/{id}/resolve` | Manual resolve (HTMX form) |
|
||||
| GET | `/settings/notifications` | Channel list page |
|
||||
| GET | `/settings/notifications/new` | Channel kind picker + empty form |
|
||||
| POST | `/settings/notifications/new` | Validate + create + redirect |
|
||||
| GET | `/settings/notifications/{id}/edit` | Channel edit form |
|
||||
| POST | `/settings/notifications/{id}/edit` | Validate + update |
|
||||
| POST | `/settings/notifications/{id}/delete` | Delete channel (typed-confirm name in the form) |
|
||||
| POST | `/api/notifications/{id}/test` | Fire test notification, return JSON result |
|
||||
| GET | `/api/alerts` | JSON list (mirrors the UI filters) for future REST callers |
|
||||
|
||||
## Data model
|
||||
|
||||
### Migration 0013 — alerts.last_seen_at
|
||||
|
||||
```sql
|
||||
ALTER TABLE alerts ADD COLUMN last_seen_at TEXT;
|
||||
UPDATE alerts SET last_seen_at = created_at WHERE last_seen_at IS NULL;
|
||||
```
|
||||
|
||||
Existing alerts (currently zero in production — nothing writes them yet)
|
||||
get `last_seen_at = created_at`. Column is nullable for backwards-compat
|
||||
with rows from the alert-engine-pre-bump period.
|
||||
|
||||
### Migration 0014 — notification_channels + notification_log
|
||||
|
||||
```sql
|
||||
CREATE TABLE notification_channels (
|
||||
id TEXT PRIMARY KEY,
|
||||
kind TEXT NOT NULL CHECK (kind IN ('webhook', 'ntfy', 'smtp')),
|
||||
name TEXT NOT NULL,
|
||||
enabled INTEGER NOT NULL DEFAULT 1 CHECK (enabled IN (0, 1)),
|
||||
config BLOB NOT NULL, -- AEAD-encrypted JSON; per-kind shape
|
||||
default_priority TEXT, -- ntfy only; null for webhook + smtp
|
||||
created_at TEXT NOT NULL,
|
||||
updated_at TEXT NOT NULL,
|
||||
last_fired_at TEXT
|
||||
);
|
||||
|
||||
CREATE INDEX notification_channels_enabled ON notification_channels(enabled) WHERE enabled = 1;
|
||||
|
||||
CREATE TABLE notification_log (
|
||||
id TEXT PRIMARY KEY,
|
||||
channel_id TEXT NOT NULL REFERENCES notification_channels(id) ON DELETE CASCADE,
|
||||
alert_id TEXT REFERENCES alerts(id) ON DELETE SET NULL,
|
||||
event TEXT NOT NULL, -- alert.raised | alert.acknowledged | alert.resolved | alert.test
|
||||
ok INTEGER NOT NULL CHECK (ok IN (0, 1)),
|
||||
status_code INTEGER,
|
||||
latency_ms INTEGER,
|
||||
error TEXT,
|
||||
fired_at TEXT NOT NULL
|
||||
);
|
||||
|
||||
CREATE INDEX notification_log_channel ON notification_log(channel_id, fired_at DESC);
|
||||
CREATE INDEX notification_log_alert ON notification_log(alert_id);
|
||||
```
|
||||
|
||||
`config` is an AEAD-encrypted JSON blob — bearer tokens for webhooks and
|
||||
access tokens for ntfy live there, as does the SMTP password. Per-kind config shapes:
|
||||
|
||||
```go
|
||||
type webhookConfig struct {
|
||||
URL string `json:"url"`
|
||||
BearerToken string `json:"bearer_token,omitempty"`
|
||||
HeaderName string `json:"header_name,omitempty"`
|
||||
HeaderValue string `json:"header_value,omitempty"`
|
||||
}
|
||||
|
||||
type ntfyConfig struct {
|
||||
ServerURL string `json:"server_url"` // default https://ntfy.sh
|
||||
Topic string `json:"topic"`
|
||||
AccessToken string `json:"access_token,omitempty"`
|
||||
}
|
||||
|
||||
type smtpConfig struct {
|
||||
Host string `json:"host"` // e.g. smtp.example.com
|
||||
Port int `json:"port"` // default 587 (STARTTLS), 465 (TLS), 25 (none)
|
||||
Encryption string `json:"encryption"` // "starttls" | "tls" | "none"
|
||||
Username string `json:"username"`
|
||||
Password string `json:"password"` // sensitive — AEAD-encrypted with the rest of config
|
||||
From string `json:"from"` // RFC 5322 address; "alerts@example.com" or "Restic-Manager <alerts@…>"
|
||||
To string `json:"to"` // single recipient or distribution-list address; v1 = one channel = one to-line
|
||||
}
|
||||
```
|
||||
|
||||
### Engine state
|
||||
|
||||
The engine itself is stateless beyond the channels it owns; all
|
||||
persisted state is in the existing `alerts` table + the new
|
||||
`notification_log` table. A process restart re-evaluates from scratch:
|
||||
on next tick the stale-schedule + auto-resolution sweeps catch up with
|
||||
whatever happened during the downtime. No outbox to drain.
|
||||
|
||||
## UI templates
|
||||
|
||||
| Template | Purpose |
|
||||
| ----------------------------------------- | ------------------------------------------------------ |
|
||||
| `web/templates/pages/alerts.html` | Fleet alerts page |
|
||||
| `web/templates/partials/alert_row.html` | One alert row (used by both list and detail-fragment swap) |
|
||||
| `web/templates/pages/settings.html` | Settings shell with Notifications / Users / Auth sub-tabs |
|
||||
| `web/templates/pages/notifications.html` | Channel list (Notifications sub-tab body) |
|
||||
| `web/templates/pages/notification_edit.html` | Channel kind picker + per-kind form + test button + payload preview |
|
||||
| `web/templates/partials/crit_banner.html` | Dashboard top-of-page banner |
|
||||
| `web/templates/partials/nav.html` | Existing — gain a `data-alerts-count` attribute on the Alerts tab so the badge auto-updates |
|
||||
|
||||
The Settings shell + Notifications sub-tab is the new chrome the wireframe
|
||||
introduced; Users + Authentication tabs are placeholder links that 404 in
|
||||
v1 (or render a "Lands later" notice). Same pattern P2R-02 used for
|
||||
inert sub-tabs.
|
||||
|
||||
## Tests (target coverage)
|
||||
|
||||
- `internal/alert/engine_test.go` — rule firing per kind: backup_failed
|
||||
raises on `MarkJobFinished(kind=backup, status=failed)`; touch-only on
|
||||
the second failure for the same host (no second notification);
|
||||
auto-resolve on next success.
|
||||
- `internal/alert/agent_offline_test.go` — `OnHostOffline` emits without
|
||||
raising until the 15-min floor; `OnHostOnline` clears the alert.
|
||||
- `internal/alert/stale_schedule_test.go` — synthetic schedule whose next
|
||||
fire is in the past triggers; resets when a job lands.
|
||||
- `internal/notification/webhook_test.go` — payload shape pinned;
|
||||
authorisation header sent when bearer set; custom header echoed; 5s
|
||||
timeout enforced; error in `notification_log`.
|
||||
- `internal/notification/ntfy_test.go` — title/priority/tags/click headers
|
||||
match the severity mapping; access token sent as `Authorization: Bearer
|
||||
<token>`; default priority overridden by severity for critical.
|
||||
- `internal/notification/smtp_test.go` — round-trip against a local
|
||||
`net/smtp.NewServer`-style fake (or `mailhog`/MailHog if convenient):
|
||||
STARTTLS handshake completes against a self-signed cert; PLAIN auth
|
||||
uses configured creds; subject + from + to + body bytes match the
|
||||
spec'd format; Message-ID contains the alert id; 10s timeout enforced;
|
||||
failure path (auth refused) lands in `notification_log` with the
|
||||
server's error string.
|
||||
- `internal/server/http/ui_alerts_test.go` — page renders with filters
|
||||
applied; ack/resolve POSTs flip the row + write audit; HX-Redirect
|
||||
bounces back to the filtered list.
|
||||
- `internal/server/http/ui_notifications_test.go` — CRUD happy paths,
|
||||
validation re-render, secrets-encrypted-at-rest assertion (load row,
|
||||
decrypt, compare), test-button hits the real send path against a
|
||||
test http.Server.
|
||||
- Migration 0013 + 0014 round-trip tested via `store.Open` on a fresh
|
||||
db.
|
||||
|
||||
## Playwright sweep
|
||||
|
||||
End-of-phase sweep mirrors the P2R-02 / P3-restore pattern:
|
||||
|
||||
1. Login → `/alerts` (initially empty) → see "All clear · last alert
|
||||
never" empty state.
|
||||
2. Trigger a fake-failed-backup via `POST /api/hosts/{id}/jobs` against a
|
||||
host with a deliberately-wrong rest-server URL. Wait for the
|
||||
`backup_failed` alert to appear in the list within ~2s of the job
|
||||
finishing.
|
||||
3. Acknowledge → row tints + ack actor visible.
|
||||
4. Take the agent offline (`systemctl stop`); wait 15 min OR mock
|
||||
`last_seen_at` to 16 min ago via the test harness; confirm
|
||||
`agent_offline` alert raises once.
|
||||
5. Restart the agent → `agent_offline` auto-resolves; `backup_failed` is
|
||||
still open.
|
||||
6. Configure a webhook channel pointing at a local test sink; click "Send
|
||||
test" → green ✓.
|
||||
7. Configure a ntfy channel pointing at a local sink → click "Send test"
|
||||
→ green ✓.
|
||||
8. Configure an SMTP channel pointing at a local MailHog (Docker, port
|
||||
1025, no TLS for the local-only sweep) → click "Send test" → green ✓
|
||||
→ MailHog UI at :8025 shows the test email with the right subject
|
||||
and Message-ID.
|
||||
9. Trigger a fresh failed backup → all three channels receive the
|
||||
notification (verified from sink logs + MailHog inbox);
|
||||
`notification_log` has three rows `event=alert.raised, ok=true`.
|
||||
10. Manually Resolve the open `backup_failed`; confirm all three channels
|
||||
receive `event=alert.resolved`.
|
||||
11. Critical-severity test: trigger `check_failed` (mocked) → dashboard
|
||||
banner appears; clicking it lands on `/alerts?severity=critical&status=open`.
|
||||
12. Empty the alerts again → banner disappears.
|
||||
|
||||
Screenshots into `_diag/p3-alerts-sweep/`. End-to-end clean, zero console
|
||||
errors, before handing back.
|
||||
|
||||
## What does NOT change
|
||||
|
||||
- Existing chrome/templates beyond the small additions noted above.
|
||||
- Existing `alerts.severity` CHECK (`info`/`warning`/`critical`) — already
|
||||
the right shape; no migration needed for that.
|
||||
- Audit log writer pattern — engine writes audit rows for ack/resolve
|
||||
the same way every other state-changing handler does.
|
||||
- The agent. Alerts are entirely a server concern; the agent doesn't
|
||||
know they exist.
|
||||
|
||||
## Open questions / explicit non-goals
|
||||
|
||||
- **Per-rule cooldowns / re-raise on long-running issues.** Out of scope
|
||||
(brainstorm question 8 ruled this out). Operators see "still happening"
|
||||
in the UI; they don't get a reminder ping.
|
||||
- **SMTP HTML emails.** v1 is plain text only — operators wanting rich
|
||||
rendering can deploy a webhook → mail-merge bridge, or wait for a v2
|
||||
template engine. The Message-ID threading + plain text body should be
|
||||
enough for almost every overnight-digest workflow.
|
||||
- **SMTP OAuth2 / XOAUTH2.** Out of scope. Gmail / Microsoft 365 with
|
||||
modern OAuth requires an `app password` workaround in v1. Native
|
||||
XOAUTH2 lands when an operator asks (or when Google starts refusing
|
||||
app passwords for non-business accounts in earnest).
|
||||
- **Multi-recipient SMTP channels.** A channel = one `To`. Operators
|
||||
wanting multiple recipients add multiple channels. Keeps failure
|
||||
attribution per-recipient.
|
||||
- **Apprise sidecar integration.** Deferred per brainstorm. The
|
||||
`Channel` interface accepts a fourth impl without reshaping when we get
|
||||
there.
|
||||
- **Per-host or per-severity channel routing.** Out of scope. Likely
|
||||
next step if operators ask: a `min_severity` field on the channel row.
|
||||
- **Snooze / mute.** Out of scope. Acknowledge is the closest analogue;
|
||||
full silence-windows would need a new table and is YAGNI for v1.
|
||||
- **PagerDuty / OpsGenie.** Both have webhook receivers; operators wire
|
||||
them via the webhook channel today.
|
||||
- **Alert "rules" UI.** No CRUD; the rule set is hardcoded.
|
||||
@@ -0,0 +1,342 @@
|
||||
# P3 — Restore (design)
|
||||
|
||||
> Phase 3 sub-spec covering single-host restore (P3-01, P3-02, P3-03, P3-09).
|
||||
> P3-04 (cross-host restore) is deferred to a new "Future / unscheduled"
|
||||
> section in `tasks.md` — disaster recovery is already covered by re-enrolling
|
||||
> a replacement host with the same repo credentials.
|
||||
>
|
||||
> Wireframe: `_diag/p3-restore-wizard/wireframe.html`. Screenshot:
|
||||
> `_diag/p3-restore-wizard/01-full-wizard.png`.
|
||||
|
||||
## Scope locked
|
||||
|
||||
Brainstorm decisions (in order asked):
|
||||
|
||||
1. **In-place vs new-directory.** Default is a new directory under
|
||||
`/var/restic-restore/<job-id>/`. A "Restore in place (overwrite original
|
||||
paths)" toggle is gated by typed-confirmation of the host name, mirroring
|
||||
the repo re-init pattern.
|
||||
2. **Path-selection granularity.** Tree browser as the path selector,
|
||||
lazy-loaded via `restic ls --json <snapshot> <path>` per directory expansion.
|
||||
3. **Cross-host restore (P3-04).** Out of scope this phase. Move to
|
||||
"Future / unscheduled" in `tasks.md`. The disaster-recovery case is covered
|
||||
by the standard enrolment flow: stand up a replacement host, paste the
|
||||
original repo creds at enrolment, snapshots reappear, restore is
|
||||
same-host.
|
||||
4. **Snapshot diff (P3-09).** Diff-as-a-job. New `JobDiff` JobKind dispatched
|
||||
like every other agent operation. Output streams as `log.stream` and
|
||||
renders on the live job log page.
|
||||
5. **Wizard entry points.** Top-level "Restore" button on host detail
|
||||
(`/hosts/{id}/restore`, opens wizard at step 1) plus a per-snapshot
|
||||
Restore action on snapshot rows (`/hosts/{id}/snapshots/{sid}/restore`,
|
||||
skips step 1).
|
||||
6. **Wizard interaction model.** Single-page, sections progressively enable;
|
||||
tree-browser nodes lazy-load via HTMX partials. No `restore_drafts` table.
|
||||
7. **Tree-browser data path.** Synchronous WS RPC (`tree.list` ↔
|
||||
`tree.list.result`, correlation-ID) plus a per-wizard-session in-memory
|
||||
cache keyed by `{snapshot_id, path}` with ~30-min TTL.
|
||||
8. **Restore progress UI.** Restore-specific job-page variant: files-restored
|
||||
/ bytes-restored / throughput / ETA / current-file display, driven by
|
||||
restic restore's JSON status events surfaced through `job.progress`.
|
||||
9. **Permissions/ownership.** Policy, not toggle. In-place restore preserves
|
||||
original ownership; new-directory restore drops ownership
|
||||
(`--no-ownership`).
|
||||
10. **Concurrency.** Single-flight per host (one job at a time across all
|
||||
kinds). Plus a real cancel-job feature: `command.cancel` envelope, agent
|
||||
kills the `restic` subprocess via context cancel (SIGTERM, SIGKILL after
|
||||
grace), server transitions the job to `cancelled`. The "Cancel" button
|
||||
already in the `job_detail` template becomes real for any running job
|
||||
kind.
|
||||
11. **Audit + safety.** Audit row on every restore dispatch (`host.restore`
|
||||
with snapshot ID, paths, target, in-place flag). Recent-restores panel
|
||||
on the host page surfacing the latest restore job alongside last-backup
|
||||
and last-init signals. Role gate deferred to P4-03.
|
||||
|
||||
## Architecture
|
||||
|
||||
Restore composes from existing primitives plus three new pieces:
|
||||
|
||||
- **New JobKind values**: `JobRestore`, `JobDiff`. Dispatcher cases mirror
|
||||
the prune/check pattern. Agent-side handlers wrap `restic.RunRestore` and
|
||||
`restic.RunDiff` (new methods on the `restic` package).
|
||||
- **New WS RPC**: `tree.list` request (`{snapshot_id, path}`) ↔
|
||||
`tree.list.result` reply (`{entries: [{name, type, size}], ...}` or
|
||||
`{error}`). Reuses existing correlation-ID infrastructure from P1-09. No
|
||||
`jobs` row.
|
||||
- **New cancel surface**: `command.cancel` request (`{job_id}`), agent
|
||||
cancels the running subprocess context, returns `command.ack` + `job.finished`
|
||||
with status `cancelled`. Server endpoint `POST /api/jobs/{id}/cancel`
|
||||
bridges UI button → WS envelope.
|
||||
|
||||
Everything else (job lifecycle, log streaming, progress envelope, snapshot
|
||||
listing, audit log writer, host_chrome partial, danger-zone typed-confirmation)
|
||||
already exists and is reused verbatim.
|
||||
|
||||
### Component boundaries
|
||||
|
||||
| Component | Purpose | Depends on |
|
||||
| ---------------------------------- | ---------------------------------------------------- | ----------------------------------------- |
|
||||
| `internal/restic.RunRestore` | Run `restic restore` with paths + target + ownership | `restic.Env` |
|
||||
| `internal/restic.RunDiff` | Run `restic diff --json a b` | `restic.Env` |
|
||||
| `internal/agent/runner` cases | Dispatch `JobRestore` / `JobDiff` jobs | `restic.Run*`, hooks (skipped: backup-only) |
|
||||
| `internal/agent/runner` cancel hook | Wire WS `command.cancel` → ctx.CancelFunc per job | runner job map |
|
||||
| `internal/agent/runner` tree-list | Sync RPC handler: `restic ls --json` for one path | `restic.Env` |
|
||||
| `internal/server/ws/cancel.go` | Validate + send `command.cancel` envelope | hub.Send, store.UpdateJobStatus |
|
||||
| `internal/server/ws/tree.go` | RPC mediator: `tree.list` request → reply, with cache | hub.SendRPC, in-memory cache |
|
||||
| `internal/server/http/restore.go` | Wizard routes + dispatch endpoint | store, ws, audit |
|
||||
| `internal/server/http/diff.go` | Snapshot-diff dispatch endpoint | store, ws |
|
||||
| `internal/server/http/cancel.go` | `POST /api/jobs/{id}/cancel` | ws |
|
||||
| `web/templates/pages/host_restore.html` | Wizard page | host_chrome partial |
|
||||
| `web/templates/partials/tree_node.html` | Lazy-loaded tree node fragment for HTMX swap | — |
|
||||
| `web/templates/pages/job_detail.html` | Restore-kind progress widget (variant) | existing job_detail |
|
||||
|
||||
### Data flow — wizard happy path
|
||||
|
||||
```
|
||||
operator
|
||||
├─ GET /hosts/{id}/restore
|
||||
│ server renders wizard shell, snapshot table from store.ListSnapshotsByHost
|
||||
│
|
||||
├─ click snapshot row (or arrives via /hosts/{id}/snapshots/{sid}/restore)
|
||||
│ wizard advances to step 2, snapshot summary card rendered
|
||||
│
|
||||
├─ expand a tree node (chevron click)
|
||||
│ HTMX GET /hosts/{id}/restore/tree?snapshot={sid}&path=/etc
|
||||
│ server checks per-session cache (keyed by sid+path)
|
||||
│ hit → render tree_node fragment from cache
|
||||
│ miss → hub.SendRPC(host_id, "tree.list", {sid, path}) → wait reply
|
||||
│ cache result, render tree_node fragment
|
||||
│
|
||||
├─ tick file/dir checkboxes (form state, no round-trip)
|
||||
│
|
||||
├─ pick target radio (and optionally type host name to unlock in-place)
|
||||
│
|
||||
└─ POST /hosts/{id}/restore (form submit)
|
||||
server validates: ≥1 path, target mode, in-place ⇒ host name match
|
||||
write audit row host.restore
|
||||
store.CreateJob{kind=restore, payload={snapshot_id, paths, target, in_place}}
|
||||
hub.Send(host_id, "command.run", {job_id, kind=restore, payload})
|
||||
HX-Redirect: /jobs/{job_id}
|
||||
```
|
||||
|
||||
### Data flow — agent restore execution
|
||||
|
||||
```
|
||||
agent.runner receives command.run kind=restore
|
||||
├─ check single-flight: if r.activeJobID != "" → reply busy
|
||||
│ (server queues to pending_runs only for kind=backup; restore returns busy)
|
||||
├─ allocate ctx, ctxCancel — store cancelFunc against job_id in r.cancels
|
||||
├─ sendStarted(job_id, JobRestore, now)
|
||||
├─ build target path: if in_place → "/" else "/var/restic-restore/<job_id>/"
|
||||
├─ build flags: paths from payload, --no-ownership when !in_place
|
||||
├─ restic.RunRestore(ctx, env, snapshot_id, paths, target, in_place):
|
||||
│ restic restore <sid> --target <path> [--no-ownership] --include <p1> --include <p2> ...
|
||||
│ parse stdout JSON: forward "status" → job.progress (1Hz throttle), "summary" → final
|
||||
├─ on success: sendFinished(job_id, succeeded, exit=0)
|
||||
├─ on ctx.Err() == context.Canceled: sendFinished(job_id, cancelled, exit=130)
|
||||
└─ delete cancel func from r.cancels
|
||||
```
|
||||
|
||||
### Data flow — cancel
|
||||
|
||||
```
|
||||
operator clicks Cancel on /jobs/{id} (running)
|
||||
POST /api/jobs/{id}/cancel
|
||||
server: lookup job, ensure status=running, find host
|
||||
hub.Send(host_id, "command.cancel", {job_id})
|
||||
→ agent.runner receives command.cancel
|
||||
cancelFunc, ok := r.cancels[job_id]
|
||||
ok && cancelFunc()
|
||||
→ restic subprocess context done → exec.Cmd sends SIGTERM (custom `cmd.Cancel`; the default would be SIGKILL)
|
||||
→ if still alive after 5s grace (`cmd.WaitDelay`) → SIGKILL
|
||||
→ runner sendFinished(job_id, cancelled, exit=130)
|
||||
→ server receives job.finished status=cancelled, persists, broadcasts
|
||||
→ browser refresh shows cancelled state
|
||||
```
|
||||
|
||||
The cancel surface is independently useful for any kind (prune/check/backup) —
|
||||
not gated to restore. The button already in `job_detail.html` becomes real.
|
||||
|
||||
### Tree-list RPC details
|
||||
|
||||
New WS message types (added to `internal/api/messages.go`):
|
||||
|
||||
```
|
||||
type TreeListRequestPayload struct {
|
||||
SnapshotID string `json:"snapshot_id"`
|
||||
Path string `json:"path"`
|
||||
}
|
||||
|
||||
type TreeListEntry struct {
|
||||
Name string `json:"name"`
|
||||
Type string `json:"type"` // "dir" | "file" | "symlink"
|
||||
Size int64 `json:"size,omitempty"`
|
||||
}
|
||||
|
||||
type TreeListResultPayload struct {
|
||||
SnapshotID string `json:"snapshot_id"`
|
||||
Path string `json:"path"`
|
||||
Entries []TreeListEntry `json:"entries,omitempty"`
|
||||
Error string `json:"error,omitempty"`
|
||||
}
|
||||
```
|
||||
|
||||
Server-side mediator (`ws.SendRPC`) takes a request envelope, registers the
|
||||
correlation ID in a pending map, sends, blocks on a per-call channel until
|
||||
the matching reply arrives (or 30s timeout). The pattern is small enough
|
||||
to inline in `internal/server/ws/rpc.go` as a generic helper — future
|
||||
synchronous RPCs reuse it.
|
||||
|
||||
In-memory cache: `map[sessionID]map[cacheKey]TreeListResultPayload` with
|
||||
`cacheKey = snapshot_id + "\x00" + path`. Session ID minted per wizard
|
||||
load (HTTP-only cookie scoped to `/hosts/{id}/restore/tree`, lifetime 30
|
||||
min). On wizard close (browser navigation away) the entry expires
|
||||
naturally. No persistence, no migration.
|
||||
|
||||
Agent handler runs `restic ls --json <sid> <path>` (non-recursive — with a
|
||||
directory argument and no `--recursive`, `restic ls` does not descend into subdirectories;
|
||||
parse output line-by-line and emit only direct children of `path`). 60s
|
||||
context timeout, mirroring existing `restic snapshots` invocation.
|
||||
|
||||
### Restore payload
|
||||
|
||||
`api.CommandRunPayload` gains a nested optional `restore` field:
|
||||
|
||||
```
|
||||
type RestorePayload struct {
|
||||
SnapshotID string `json:"snapshot_id"`
|
||||
Paths []string `json:"paths"` // absolute paths inside the snapshot
|
||||
InPlace bool `json:"in_place"`
|
||||
TargetDir string `json:"target_dir"` // empty when in_place=true
|
||||
PreserveOwner bool `json:"preserve_owner"` // mirrors policy: in_place=>true, else=>false
|
||||
}
|
||||
```
|
||||
|
||||
The payload is set by the server when dispatching `JobRestore` and ignored
|
||||
on every other kind. Wire-shape test pinned in `wire_test.go`.
|
||||
|
||||
### Diff payload
|
||||
|
||||
`api.CommandRunPayload` gains:
|
||||
|
||||
```
|
||||
type DiffPayload struct {
|
||||
SnapshotA string `json:"snapshot_a"`
|
||||
SnapshotB string `json:"snapshot_b"`
|
||||
}
|
||||
```
|
||||
|
||||
Set on `JobDiff`. Output is plain `restic diff --json <a> <b>` forwarded as
|
||||
`log.stream` lines. Job page renders unchanged — operator reads the diff
|
||||
output directly.
|
||||
|
||||
### Recent-restores panel
|
||||
|
||||
A small panel rendered on the host detail page below the existing init-status
|
||||
line:
|
||||
|
||||
```
|
||||
last restore: succeeded 2h ago · job f73ab4c1… · 3 files to /var/restic-restore/...
|
||||
```
|
||||
|
||||
Backed by a `store.LatestJobByKind(host_id, JobRestore)` call (reusing
|
||||
the existing `store.LatestJobByKind` already used for init/forget/prune/check
|
||||
in P2R-06). One template addition in `host_chrome.html` next to the
|
||||
`InitStatus` block.
|
||||
|
||||
## Routes added
|
||||
|
||||
| Method | Path | Purpose |
|
||||
| ------- | --------------------------------------------------------- | ----------------------------------------------------------- |
|
||||
| GET | `/hosts/{id}/restore` | Wizard shell (step 1 = snapshot picker) |
|
||||
| GET | `/hosts/{id}/snapshots/{sid}/restore` | Wizard shell with snapshot pre-selected (skips step 1) |
|
||||
| GET | `/hosts/{id}/restore/tree` | HTMX partial: tree node listing for `?snapshot=&path=` |
|
||||
| POST | `/hosts/{id}/restore` | Validate + dispatch restore job, redirect to live job page |
|
||||
| POST | `/api/hosts/{id}/snapshots/diff` | Dispatch a diff job for `{snapshot_a, snapshot_b}` |
|
||||
| POST | `/api/jobs/{id}/cancel` | Send `command.cancel` to host, transition job → cancelled |
|
||||
|
||||
## Migrations
|
||||
|
||||
None. Restore + diff piggyback on the existing `jobs` table (their `kind` is
|
||||
new but the schema already accepts arbitrary kind strings — there's no
|
||||
CHECK constraint on `kind`). The cancel feature uses the existing
|
||||
`JobCancelled` terminal status. The tree-list cache lives in process memory.
|
||||
|
||||
## Tests (target coverage)
|
||||
|
||||
- `internal/restic/restore_test.go` — `RunRestore` invocation builds the
|
||||
expected argv (paths, --target, --no-ownership flag presence, in-place
|
||||
variant); JSON status parsing → `BackupStatus`-shaped progress envelopes.
|
||||
- `internal/restic/diff_test.go` — `RunDiff` argv shape and JSON forwarding.
|
||||
- `internal/agent/runner/restore_test.go` — happy path, cancel mid-run
|
||||
produces `cancelled` finished, in-place vs new-directory dispatch,
|
||||
single-flight rejects when another job is running.
|
||||
- `internal/agent/runner/tree_test.go` — `tree.list` handler returns
|
||||
direct children for a synthetic restic ls output, surfaces error on
|
||||
missing snapshot.
|
||||
- `internal/server/ws/rpc_test.go` — `SendRPC` correlation matching,
|
||||
timeout, concurrent calls.
|
||||
- `internal/server/http/restore_test.go` — wizard renders with snapshots,
|
||||
POST validates ≥1 path + in-place host-name match, audit row written,
|
||||
job dispatched with correct payload, in-place without typed-confirm
|
||||
re-renders form with input intact and an error.
|
||||
- `internal/server/http/diff_test.go` — POST dispatches `JobDiff`,
|
||||
snapshot IDs validated against the host's snapshot list.
|
||||
- `internal/server/http/cancel_test.go` — POST cancel happy path
|
||||
(running → cancelled), 4xx for non-running jobs, 4xx when host offline.
|
||||
- `internal/server/http/restore_e2e_test.go` — happy path: GET wizard,
|
||||
expand `/etc` (HTMX call returns expected fragment), submit, follow
|
||||
HX-Redirect to job page, see status.
|
||||
- `web/templates/pages/host_restore_test.go` (template-render test) —
|
||||
wizard renders all four sections; in-place card disabled until typed
|
||||
confirm.
|
||||
|
||||
## Playwright iteration / sweep
|
||||
|
||||
A Playwright sweep at the end (mirroring P2R-02 Slice 6) runs against the
|
||||
local smoke server with a real agent enrolled. Steps:
|
||||
|
||||
1. Login → navigate to alfa-01 host → click Restore.
|
||||
2. Wizard step 1: pick the most recent snapshot.
|
||||
3. Wizard step 2: expand a directory two levels, tick three files,
|
||||
verify tally updates.
|
||||
4. Wizard step 3: leave default new-directory.
|
||||
5. Wizard step 4: dispatch.
|
||||
6. Land on live job page, see progress widget animating, see log lines.
|
||||
7. Click Cancel mid-flight, verify status transitions to cancelled and
|
||||
the agent's subprocess actually died (log line `signal: terminated` / `signal: killed`, or exit
|
||||
130).
|
||||
8. Repeat with in-place mode: type host name, dispatch, verify red
|
||||
primary button, verify files actually overwritten on host.
|
||||
9. Snapshot diff: navigate to snapshots, pick two, dispatch diff, see
|
||||
diff output streamed.
|
||||
10. Screenshots into `_diag/p3-restore-sweep/`.
|
||||
|
||||
End-to-end clean, zero console errors, before handing back.
|
||||
|
||||
## What does NOT change
|
||||
|
||||
- `host_chrome.html` only grows the recent-restores line; sub-tab list
|
||||
unchanged (Restore is a top-level button on the host page, not a sub-tab).
|
||||
- `enrollment.go`, schedule reconciliation, source-group CRUD, repo
|
||||
maintenance ticker, hook execution — none of these are touched.
|
||||
- The CLAUDE.md restage block applies as-is when the agent binary changes
|
||||
(it does — runner gains restore/diff/cancel/tree handlers). The unit
|
||||
file does not change.
|
||||
|
||||
## Open questions / explicit non-goals
|
||||
|
||||
- **Restore preview / dry-run.** Only newer restic releases (0.17+) offer `restore --dry-run`; the wizard does not surface it.
|
||||
Out of scope.
|
||||
- **Resumable restore.** Restic restore is idempotent per-file but not
|
||||
resumable mid-stream from where it left off. If a restore is cancelled,
|
||||
the operator re-runs (files already written are overwritten). No state
|
||||
to track.
|
||||
- **Restore to a glob/pattern (e.g. `*.conf`).** Out of scope; the tree
|
||||
picker requires explicit ticks. Power users can edit the URL or use the
|
||||
CLI.
|
||||
- **Bandwidth caps for restore.** Honoured automatically — restic's
|
||||
`--limit-download` is part of `restic.Env` already (P2R-13) and applies
|
||||
to restore unchanged.
|
||||
- **Pre/post hooks for restore.** Hooks today gate only `kind=backup`
|
||||
(P2R-11). Out of scope.
|
||||
@@ -0,0 +1,340 @@
|
||||
# P4-03 / P4-04 — RBAC + User Management Design
|
||||
|
||||
> **Date:** 2026-05-05
|
||||
> **Status:** brainstorm complete; ready for plan
|
||||
> **Closes:** P4-03 (RBAC enforcement at API layer), P4-04 (User management UI)
|
||||
|
||||
## Goal
|
||||
|
||||
Enforce role-based access control at the HTTP layer (currently every authenticated user has admin powers) and ship the operator-facing screens for managing users, roles, and password lifecycle.
|
||||
|
||||
## Architecture
|
||||
|
||||
Two coupled subsystems landing in one PR:
|
||||
|
||||
1. **RBAC enforcement** — chi route-group middleware that gates each subtree by minimum role. Fail-closed default (admin) so a forgotten declaration doesn't accidentally widen access.
|
||||
2. **User management** — `/settings/users` sub-tab with list / add / edit / disable. Setup-link flow for new users (1-hour-expiry single-use token). Self-service password change at `/settings/account`.
|
||||
|
||||
The audit log already records actor + user_id on every mutation; new endpoints fold in naturally.
|
||||
|
||||
## Role taxonomy
|
||||
|
||||
Locked. Three roles, hierarchical (admin ⊇ operator ⊇ viewer):
|
||||
|
||||
| Action | admin | operator | viewer |
|
||||
|---|:-:|:-:|:-:|
|
||||
| View dashboard / alerts / audit / hosts | ✓ | ✓ | ✓ |
|
||||
| Trigger Run-now / Restore / Snapshot diff | ✓ | ✓ | ✗ |
|
||||
| Acknowledge / resolve alerts | ✓ | ✓ | ✗ |
|
||||
| Edit schedules / source groups / retention / hooks | ✓ | ✓ | ✗ |
|
||||
| Add / remove hosts (enrolment, accept/reject pending) | ✓ | ✓ | ✗ |
|
||||
| Cancel running jobs | ✓ | ✓ | ✗ |
|
||||
| Edit repo credentials | ✓ | ✓ | ✗ |
|
||||
| Edit notification channels | ✓ | ✗ | ✗ |
|
||||
| Manage users | ✓ | ✗ | ✗ |
|
||||
| Self password change (`/settings/account`) | ✓ | ✓ | ✓ |
|
||||
|
||||
The role enum already exists in the schema (`CHECK (role IN ('admin','operator','viewer'))`) and in `internal/store/types.go`. Bootstrap creates the first user as admin. Zero migration needed for existing installs.
|
||||
|
||||
## Schema changes
|
||||
|
||||
All column-level ALTERs (CLAUDE.md prefers these over rebuilds; safe under `foreign_keys=ON`).
|
||||
|
||||
### Migration 0017 — `users` extensions
|
||||
|
||||
```sql
|
||||
ALTER TABLE users ADD COLUMN email TEXT;
|
||||
ALTER TABLE users ADD COLUMN disabled_at TEXT;
|
||||
ALTER TABLE users ADD COLUMN must_change_password INTEGER NOT NULL DEFAULT 0;
|
||||
|
||||
-- Username case-insensitive lookup. Existing rows are kept as-is;
|
||||
-- normalisation only applies to new INSERTs (handled in Go).
|
||||
CREATE UNIQUE INDEX users_username_lower ON users(LOWER(username));
|
||||
```
|
||||
|
||||
### Migration 0018 — `user_setup_tokens`
|
||||
|
||||
```sql
|
||||
CREATE TABLE user_setup_tokens (
|
||||
user_id TEXT PRIMARY KEY REFERENCES users(id) ON DELETE CASCADE,
|
||||
token_hash TEXT NOT NULL, -- sha256(raw_token), hex
|
||||
expires_at TEXT NOT NULL,
|
||||
created_at TEXT NOT NULL,
|
||||
created_by TEXT REFERENCES users(id) ON DELETE SET NULL -- nullable: NOT NULL would make ON DELETE SET NULL fail
|
||||
);
|
||||
|
||||
CREATE INDEX user_setup_tokens_expires ON user_setup_tokens(expires_at);
|
||||
```
|
||||
|
||||
`user_id` is PRIMARY KEY, not just FOREIGN KEY — only one outstanding setup token per user. Regenerating supersedes the old via `INSERT OR REPLACE`.
|
||||
|
||||
## RBAC enforcement
|
||||
|
||||
### Middleware
|
||||
|
||||
```go
|
||||
// requireRole returns chi middleware that 403s any request whose
|
||||
// session-resolved user doesn't meet the minimum role. Roles are
|
||||
// hierarchical: admin > operator > viewer.
|
||||
func (s *Server) requireRole(min store.Role) func(http.Handler) http.Handler
|
||||
```
|
||||
|
||||
Hierarchy implemented as a small helper:
|
||||
|
||||
```go
|
||||
func roleAtLeast(have, min store.Role) bool {
|
||||
rank := map[store.Role]int{
|
||||
store.RoleViewer: 1,
|
||||
store.RoleOperator: 2,
|
||||
store.RoleAdmin: 3,
|
||||
}
|
||||
return rank[have] >= rank[min]
|
||||
}
|
||||
```
|
||||
|
||||
### Route grouping in `server.go`
|
||||
|
||||
The existing `/api` and UI routes get re-grouped into three role bands plus a self-service group:
|
||||
|
||||
```
|
||||
/api/* viewer-readable — GET endpoints anyone authenticated can hit
|
||||
/api/* operator+ — mutating endpoints up to host/source-group/schedule level
|
||||
/api/* admin-only — /api/users/*, channel CRUD
|
||||
/api/account — self-service password change
|
||||
|
||||
/audit, /alerts, /hosts/{id}, etc. — viewer
|
||||
/hosts/{id}/run, /alerts/{id}/ack — operator
|
||||
/settings/users/*, /settings/notifications/* — admin
|
||||
/settings/account — viewer (any authenticated)
|
||||
```
|
||||
|
||||
Default at the bottom of `routes()` is admin (fail-closed). Any future endpoint that doesn't get explicitly placed lands in admin-only, surfacing the missing declaration as a permission error rather than a silent bypass.
|
||||
|
||||
### Per-handler nuance
|
||||
|
||||
One case might look like it warrants a handler-level check on top of the route gate: `GET /settings/users/{id}/edit` is admin-only, while `POST /api/account/password` is open to any authenticated user. The split-by-route already covers this; no per-handler overrides are expected in v1.
|
||||
|
||||
### Out of scope of role middleware
|
||||
|
||||
- `/ws/agent` and `/api/agents/*` — agent bearer-token auth, separate chain
|
||||
- `/healthz` — unauthenticated
|
||||
- `/login`, `/logout`, `/bootstrap` — public
|
||||
|
||||
### 403 handling
|
||||
|
||||
- JSON endpoints: `{"error":"forbidden","code":"insufficient_role"}` with HTTP 403
|
||||
- HTML endpoints: render a small "You don't have permission" panel inside the chrome (so the user keeps their nav and can move away), HTTP 403
|
||||
- **No audit row on 403** — too noisy with normal users hitting URLs they don't have access to
|
||||
|
||||
### Session re-validation
|
||||
|
||||
Sessions need to honour `disabled_at` and current role on every request, not just at login. The session-validation middleware reads the user row each request (single PK lookup, fast in SQLite). If `disabled_at IS NOT NULL`, the session is invalidated and the request 401s. This makes "disable user" and "force logout" effectively immediate.
|
||||
|
||||
Cost: one SELECT per authenticated request. SQLite handles this comfortably for the fleet sizes this codebase targets.
|
||||
|
||||
## Setup-token flow (replacing temp passwords)
|
||||
|
||||
### Add user
|
||||
|
||||
1. Admin clicks **+ Add user** on `/settings/users`
|
||||
2. Form: username (required, lowercase-normalised), email (optional, validated), role (admin/operator/viewer)
|
||||
3. Server:
|
||||
- Validates username uniqueness (case-insensitive). On collision with a *disabled* user, return a 409 with `{"existing_user_id": "...", "disabled": true}` so the UI can pivot to a "re-enable existing user" prompt
|
||||
- On collision with an enabled user: 409 with a plain "username taken" error
|
||||
- Creates user row with `password_hash = ""`, `must_change_password = 1`, `disabled_at = NULL`
|
||||
- Generates 32 random bytes, hex-encodes → raw token (64 chars). Stores `sha256(token)` hex in `user_setup_tokens`. `expires_at = now + 1h`
|
||||
- Audit: `user.created`, payload `{"username": "...", "role": "...", "with_setup_token": true}`
|
||||
4. Server returns the admin to a one-time setup-link page: `/settings/users/{id}/setup-link`
|
||||
- Shows the URL `http(s)://<base>/setup?token=<raw>` with a Copy button
|
||||
- Countdown timer (live JS) showing time-to-expiry
|
||||
- Warning: "This is the only time you'll see this link. If you lose it, regenerate from the user edit page."
|
||||
- "Done" button → `/settings/users`
|
||||
|
||||
The raw token is **never persisted** server-side. Lost tokens require regeneration.
|
||||
|
||||
### Setup landing page (public, no auth required)
|
||||
|
||||
1. User clicks the link, lands on `/setup?token=<raw>`
|
||||
2. Server hashes the token, looks up `user_setup_tokens` row, validates `expires_at > now`
|
||||
3. On invalid / expired: render an error page with a "Contact your administrator" message. Audit: `user.setup_token.expired` (no actor).
|
||||
4. On valid: render a password-set form: `new password + confirm`. Submit:
|
||||
- Validates password meets policy (min 12 chars, no other constraints in v1 — same as bootstrap path)
|
||||
- Hashes via `auth.HashPassword` (existing helper)
|
||||
- Updates `users.password_hash`, sets `must_change_password = 0`
|
||||
- Deletes the `user_setup_tokens` row (single-use)
|
||||
- Logs the user in via the existing session helper
|
||||
- Audit: `user.setup_completed`, payload `{"user_id": "..."}`
|
||||
- Redirect to `/`
|
||||
|
||||
### Regenerate setup link (admin)
|
||||
|
||||
`/settings/users/{id}/edit` shows a "Regenerate setup link" button when `must_change_password = 1`. Clicking it:
|
||||
|
||||
1. Generates a new token + hash, INSERT OR REPLACE on `user_setup_tokens`
|
||||
2. Returns the admin to the same one-time link page as the add-user flow
|
||||
3. Audit: `user.setup_token.regenerated`
|
||||
|
||||
### Cleanup
|
||||
|
||||
Expired tokens linger in the DB until cleaned. Add a cheap sweep on the existing maintenance ticker: `DELETE FROM user_setup_tokens WHERE expires_at < ?`. Runs at the same cadence as the alert engine tick (60s). No new ticker needed.
|
||||
|
||||
## Self-service password change
|
||||
|
||||
`/settings/account`
|
||||
|
||||
- Accessible to every authenticated user (any role)
|
||||
- Form: `current password + new password + confirm`
|
||||
- Server validates current password (re-uses login bcrypt comparison), updates hash, audits `user.password_changed`
|
||||
- Special case: if `must_change_password = 1`, the current-password field is hidden / not required (covers the legacy "admin reset password" path if we ever add one — current setup-token path doesn't use this)
|
||||
|
||||
The bootstrap user's password change uses this same page (no special case for "first admin").
|
||||
|
||||
## User list / management UI
|
||||
|
||||
### `/settings/users` (admin-only)
|
||||
|
||||
```
|
||||
Settings · Users [3]
|
||||
─────────────────────────────────────────────────
|
||||
[ + Add user ] [ ] Show disabled
|
||||
|
||||
USERNAME EMAIL ROLE LAST LOGIN STATUS
|
||||
alice alice@example.com admin 2 mins ago enabled
|
||||
bob — operator 3 days ago enabled
|
||||
charlie c@example.com viewer never setup pending ← if has open setup token
|
||||
diane d@example.com operator 1 month ago disabled ← only when "Show disabled"
|
||||
|
||||
Actions per row: Edit · (Re-enable | Disable)
|
||||
```
|
||||
|
||||
- "setup pending" badge for users with `must_change_password=1` — clicking the row goes to edit, which surfaces the regenerate-link button prominently
|
||||
- "Show disabled" is a checkbox querystring filter (`?show_disabled=1`)
|
||||
- Sort columns: clickable like the audit log (username, role, last_login). Reuse the same pattern (server-side sort + URL builder + glyph)
|
||||
|
||||
### `/settings/users/new` (admin-only)
|
||||
|
||||
Single form: `username + email (optional) + role`. On submit, the admin either lands on the setup-link page (success) or is returned to the form with an inline "username exists, re-enable existing?" panel (collision with a disabled user) or a red error (collision with an enabled user).
|
||||
|
||||
### `/settings/users/{id}/edit` (admin-only)
|
||||
|
||||
- Display-only block: id, created_at, last_login_at, status
|
||||
- **Editable**: email, role
|
||||
- **Buttons**:
|
||||
- "Regenerate setup link" — only when `must_change_password = 1`
|
||||
- "Disable user" — flips `disabled_at`; rejected if last enabled admin (server-side check). Confirmation modal with typed name to confirm.
|
||||
- "Re-enable user" — clears `disabled_at`. No confirmation.
|
||||
- "Force logout" — separate from disable; just kills the session but keeps the user enabled. Useful for "I think Bob's session was hijacked" without locking him out.
|
||||
- Cancel / Save buttons at the bottom
|
||||
|
||||
### `/settings/users/{id}/setup-link` (admin-only)
|
||||
|
||||
Renders the one-time link with copy button + countdown. Shown after add-user and after regenerate. Reload of this URL after the token is consumed: 410 Gone with a clear message.
|
||||
|
||||
### `/settings/account` (any authenticated)
|
||||
|
||||
Self-service password change. Form-only page; no nav under Settings since most users will only see this one Settings page in v1.
|
||||
|
||||
## API surface
|
||||
|
||||
```
|
||||
GET /api/users admin — list (with ?show_disabled=1 filter)
|
||||
POST /api/users admin — create user, returns user_id + setup_url
|
||||
GET /api/users/{id} admin — read
|
||||
PATCH /api/users/{id} admin — update email, role
|
||||
POST /api/users/{id}/disable admin — set disabled_at; rejects last-admin
|
||||
POST /api/users/{id}/enable admin — clear disabled_at
|
||||
POST /api/users/{id}/regenerate-setup admin — new token, returns setup_url
|
||||
POST /api/users/{id}/force-logout admin — kill all sessions for this user
|
||||
|
||||
POST /api/account/password any auth — self password change
|
||||
GET /setup public — landing page (HTML form)
|
||||
POST /setup public — submit new password
|
||||
```
|
||||
|
||||
UI routes mirror the API but at `/settings/users/...`.
|
||||
|
||||
## Last-admin self-protection
|
||||
|
||||
Two operations that could lock everyone out are guarded:
|
||||
|
||||
- **Disable user**: rejected if the user is admin AND there are no other enabled admins
|
||||
- **Demote admin to operator/viewer**: same check
|
||||
|
||||
Server-side enforcement (single SELECT on `COUNT(*) FROM users WHERE role='admin' AND disabled_at IS NULL`). UI hint: edit page disables the role dropdown's non-admin options + disable button when the user is the last admin, with a tooltip explaining why.
|
||||
|
||||
The bootstrap admin is just a regular admin row; this check covers it.
|
||||
|
||||
## Audit actions
|
||||
|
||||
New action strings introduced:
|
||||
|
||||
- `user.created`
|
||||
- `user.updated` (email / role change)
|
||||
- `user.disabled`
|
||||
- `user.enabled`
|
||||
- `user.password_changed`
|
||||
- `user.setup_completed`
|
||||
- `user.setup_token.regenerated`
|
||||
- `user.setup_token.expired` (system-driven, on cleanup sweep)
|
||||
- `user.force_logout`
|
||||
|
||||
All target_kind = `user`, target_id = the affected user's id. Existing payload conventions apply.
|
||||
|
||||
## Ordering / dependencies
|
||||
|
||||
Slices in approximate landing order (writing-plans will firm this up):
|
||||
|
||||
1. **A. Schema** — migrations 0017 + 0018, `Role` helper updates, store API extensions (email, disabled_at, must_change_password, setup_token CRUD, lowercase username constraints)
|
||||
2. **B. RBAC middleware** — `requireRole` + `roleAtLeast`, route re-grouping in server.go, 403 rendering for HTML + JSON
|
||||
3. **C. Session re-validation** — extend the existing session middleware to re-read user state per request, kick disabled users
|
||||
4. **D. Setup-token flow** — `/setup` GET+POST, the one-time link page after add-user
|
||||
5. **E. User CRUD API** — handlers + handlers' tests
|
||||
6. **F. UI** — `/settings/users` list, add, edit, setup-link page, account page
|
||||
7. **G. Sweep** — Playwright walk through the full lifecycle (add → setup link → user signs in → admin disables → user gets kicked → admin re-enables → user signs back in)
|
||||
|
||||
Each slice can land as its own commit on the branch. RBAC middleware (B) goes in *before* user CRUD so we don't ship an open `/api/users/*` even briefly.
|
||||
|
||||
## Test strategy
|
||||
|
||||
- **Store**: `Set/GetSetupToken`, `EnableUser`/`DisableUser`, last-admin guard, lowercase-username uniqueness, expired-token cleanup
|
||||
- **HTTP middleware**: `roleAtLeast` truth table; viewer hitting an operator route returns 403; disabled user gets 401 mid-session
|
||||
- **Setup flow integration**: create user → fetch setup URL → land on `/setup?token=...` → POST password → user can log in → token row gone
|
||||
- **UI**: existing Playwright sweep pattern, screenshots into `_diag/p4-03-04-sweep/`
|
||||
|
||||
## Out of scope (deferred)
|
||||
|
||||
- **OIDC** (P4-05) — adds a parallel auth chain. This PR keeps the surface for it (role taxonomy, session middleware) but doesn't wire it.
|
||||
- **Email-the-setup-link** — explicitly deferred. Easy follow-up because the SMTP channel client from P3-06 is already there.
|
||||
- **Hard delete** — disable-only in v1; can add a typed-confirm "purge" later if it turns out to be needed.
|
||||
- **Password complexity / rotation policy** — current minimum (12 chars) and no rotation; tighten later if/when policy demands.
|
||||
- **Lockout on failed login** — a brute-force protection layer is its own task and orthogonal to RBAC.
|
||||
- **Audit on 403** — not in v1; revisit if compliance asks for it.
|
||||
|
||||
## Risks / gotchas to watch
|
||||
|
||||
- **Existing tests** that assume "any logged-in user can hit any endpoint" will break. Audit the test fixtures: most use `loginAsAdmin`, which is fine; any tests currently exercising specific operator/viewer paths need explicit role assignment. (Quick grep suggests there aren't many — bootstrap-only.)
|
||||
- **Bootstrap user normalisation** — the existing admin row's username is whatever it was set to at first run. The new lowercase-uniqueness index uses `LOWER(username)`, which makes the existing row implicitly lowercase-keyed for lookups. No data migration needed.
|
||||
- **Session middleware re-read cost** — one SELECT per authenticated request. SQLite WAL handles this fine at expected fleet sizes; if it ever shows up on a profile we add a small in-memory cache keyed by session id with a 30s TTL.
|
||||
- **403 vs 401 distinction** — make sure unauthenticated requests still get 401 (login redirect) and authenticated-but-insufficient get 403. The middleware should compose: auth-required first, role-required second.
|
||||
|
||||
## Acceptance
|
||||
|
||||
- [ ] An admin can add a user, copy the setup link, the new user can land on `/setup?token=...`, set a password, and reach `/`
|
||||
- [ ] An expired token (>1h) on `/setup?token=...` shows the "contact your administrator" page
|
||||
- [ ] Admin regenerates the link, old token is invalid, new token works
|
||||
- [ ] Operator user can trigger Run-now but cannot reach `/settings/users` (403) and the Users tab in Settings is hidden in their nav
|
||||
- [ ] Viewer user gets 403 on Run-now, 200 on dashboard / alerts / audit
|
||||
- [ ] Admin disables a user mid-session — the user's next request is 401 and they're redirected to login
|
||||
- [ ] Admin cannot disable themselves if they are the last enabled admin (server returns 409, UI button is greyed)
|
||||
- [ ] Self-service password change at `/settings/account` works for every role
|
||||
- [ ] All existing tests pass; new test suite covers role middleware, setup-token lifecycle, last-admin guard
|
||||
|
||||
## Self-review notes
|
||||
|
||||
- ✅ All sections concrete, no TBD / TODO
|
||||
- ✅ Schema migrations are column-level (CLAUDE.md compliance)
|
||||
- ✅ Audit action vocabulary listed in one place; no string typos to drift
|
||||
- ✅ Out-of-scope list explicit so reviewers can challenge what we *aren't* doing
|
||||
- ✅ Last-admin guard handled both server-side and UI-hinted
|
||||
- ✅ Token storage hashes the secret server-side; raw is shown to admin once and never again
|
||||
- ✅ Session re-validation cost noted with a fallback if it shows up on a profile
|
||||
@@ -0,0 +1,215 @@
|
||||
# P4-05 — OIDC Login Design
|
||||
|
||||
> **Date:** 2026-05-05
|
||||
> **Status:** brainstorm complete; ready for plan
|
||||
> **Closes:** P4-05 (OIDC login)
|
||||
|
||||
## Goal
|
||||
|
||||
Wire OpenID Connect authentication as a sign-in path alongside the existing local-user system, so a deployment that already has an IdP (Authelia, Authentik, Keycloak, Okta, Auth0, etc.) can use it for restic-manager logins.
|
||||
|
||||
## Architecture
|
||||
|
||||
OIDC sits on top of the local-user system rather than replacing it. The first time a user signs in via OIDC the server **just-in-time provisions** a local user row marked `auth_source='oidc'`, with role derived from the IdP's configured role claim (`role_claim`, default `groups`). Subsequent sign-ins look up the same row by stable `oidc_subject` and refresh role + email from the latest claims. Once the row exists it behaves like any other local user — admin can disable it, force-logout, see it in audit logs, etc. — except password-login is rejected because there's no password.
|
||||
|
||||
The Authorization Code flow (with PKCE) is implemented against the discovered well-known config of a single configured issuer. Front-channel logout: clicking Sign out drops the local session + redirects the browser to the IdP's `end_session_endpoint` (when advertised). Back-channel logout deferred.
|
||||
|
||||
## Locked decisions
|
||||
|
||||
| Decision | Pick |
|
||||
|---|---|
|
||||
| User lifecycle | **B** — JIT-provision local rows on first OIDC login (`auth_source='oidc'`, `oidc_subject`) |
|
||||
| Role mapping config | **A** — YAML/env, claim name configurable (default `groups`, matching Authelia / Keycloak / Authentik), default = deny on no-match |
|
||||
| Username source | `preferred_username`, fallback to `email` |
|
||||
| Username collision with existing local user | **Refuse** with clear remediation message |
|
||||
| Provider config | **Single provider** — `providers:` array can come later |
|
||||
| Login page layout | SSO button **above** password form; password form labelled "or sign in with a local account" |
|
||||
| OIDC users + password login | **Disabled** — `auth_source='oidc'` rows have empty `password_hash`; password form rejects them |
|
||||
| Logout shape | **Front-channel only** — drop session + redirect to `end_session_endpoint` when advertised |
|
||||
| Role re-evaluation | **At login only** — claims read at the OIDC callback; admin can disable mid-session locally |
|
||||
|
||||
## Schema changes
|
||||
|
||||
Migration 0019 — `users` extensions for OIDC bookkeeping:
|
||||
|
||||
```sql
|
||||
ALTER TABLE users ADD COLUMN auth_source TEXT NOT NULL DEFAULT 'local'
|
||||
CHECK (auth_source IN ('local', 'oidc'));
|
||||
ALTER TABLE users ADD COLUMN oidc_subject TEXT;
|
||||
|
||||
CREATE UNIQUE INDEX users_oidc_subject ON users(oidc_subject)
|
||||
WHERE oidc_subject IS NOT NULL;
|
||||
```
|
||||
|
||||
Both column-level ALTERs (CLAUDE.md preference). The unique partial index defends the JIT-lookup invariant (one row per IdP subject) without blocking multiple rows with NULL oidc_subject (the local users).
|
||||
|
||||
## Configuration
|
||||
|
||||
```yaml
|
||||
# server config — extend existing config struct
|
||||
oidc:
|
||||
issuer: https://auth.example.com # well-known config discovered from this
|
||||
client_id: restic-manager
|
||||
client_secret: ${RM_OIDC_CLIENT_SECRET} # or via _FILE
|
||||
display_name: Authelia # button label "Sign in with <display_name>"; default "SSO"
|
||||
scopes: [openid, profile, email, groups]
|
||||
role_claim: groups # default if absent (matches Authelia / Keycloak / Authentik)
|
||||
role_mapping:
|
||||
rm-admins: admin
|
||||
rm-operators: operator
|
||||
rm-viewers: viewer
|
||||
# Optional — auto-derived from BaseURL if absent.
|
||||
redirect_url: https://rm.example.com/auth/oidc/callback
|
||||
```
|
||||
|
||||
Env-var overrides: `RM_OIDC_ISSUER`, `RM_OIDC_CLIENT_ID`, `RM_OIDC_CLIENT_SECRET`, `RM_OIDC_CLIENT_SECRET_FILE`. Mapping is YAML-only (env doesn't fit a multi-key string→string map cleanly).
|
||||
|
||||
When `oidc.issuer` is empty or missing, OIDC is disabled (current behaviour). No restart-toggle UI; this is a deploy-time setting.
|
||||
|
||||
## Auth flow
|
||||
|
||||
### Login start
|
||||
|
||||
`GET /auth/oidc/login` — only mounted when OIDC is configured.
|
||||
|
||||
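1. Generate `state` (32 random bytes, base64url) and `code_verifier` (64 random bytes, base64url without padding — 86 characters, inside RFC 7636's 43–128 limit); compute `code_challenge = base64url(sha256(code_verifier))`, also without padding, as the `S256` method requires.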
|
||||
2. Store `(state, code_verifier, created_at)` in a new ephemeral table (or in memory with a 5-minute TTL — see "State table" below for the trade-off).
|
||||
3. Redirect to `<authorization_endpoint>?response_type=code&client_id=...&redirect_uri=...&scope=...&state=...&code_challenge=...&code_challenge_method=S256`.
|
||||
|
||||
### Callback
|
||||
|
||||
`GET /auth/oidc/callback?code=...&state=...` — also OIDC-only mount.
|
||||
|
||||
1. Validate `state` against the stored value (one-shot — delete row on read). Reject if missing/expired/already used.
|
||||
2. Exchange `code` + `code_verifier` for tokens at `token_endpoint`.
|
||||
3. Validate the `id_token` JWT: signature against the JWKS endpoint, `iss`, `aud`, `exp`, `iat`, `nonce` (if used).
|
||||
4. Extract `sub`, `preferred_username`, `email`, and the configured `role_claim` (default `groups`).
|
||||
5. Pick username: `preferred_username` if non-empty, else `email`. Lowercase / trim per the existing local-user rules.
|
||||
6. Pick role: first match in `role_mapping` against the array of role-claim values. **No match → deny with a clear error page**, no row created.
|
||||
7. Look up user by `oidc_subject`. Three cases:
|
||||
- **Found** — refresh `email`, `role`, `last_login_at`. Don't touch `username` (changing it would break audit trails; if the IdP changes the username, that's an operator concern). Log `user.oidc_login`.
|
||||
- **Not found, username free** — INSERT row with `auth_source='oidc'`, `oidc_subject=<sub>`, `password_hash=''`, `must_change_password=0`. Log `user.created` with payload `{"auth_source":"oidc"}` + `user.oidc_login`.
|
||||
- **Not found, username taken by a local user** — render an error page: "This OIDC user (`<sub>`) wants to sign in as `alice`, but a local user with that name already exists. Ask your administrator to either rename / remove the local user, or exclude this user from the OIDC mapping." 403, no row created. Log `user.oidc_login_blocked`.
|
||||
8. Drop a session cookie + `MarkUserLogin` (the existing helper).
|
||||
9. Redirect to `/`.
|
||||
|
||||
### Logout
|
||||
|
||||
`POST /logout` (existing handler) — augmented:
|
||||
|
||||
1. Look up the session before deletion (we need the user row to know if they're an OIDC user).
|
||||
2. Delete the session as today.
|
||||
3. If the user is `auth_source='oidc'` AND the discovered `end_session_endpoint` is non-empty → 303 to `<end_session_endpoint>?id_token_hint=<id_token>&post_logout_redirect_uri=<base>/login`. Otherwise → existing 303 to `/login`.
|
||||
|
||||
We need to keep the latest `id_token` per session to drive `id_token_hint`. Stash it in a new `sessions.id_token TEXT` column (one column-level ALTER on migration 0019 alongside the user columns), populated only for OIDC sessions.
|
||||
|
||||
## State table
|
||||
|
||||
Two reasonable shapes for the short-lived state used during the OAuth round-trip:
|
||||
|
||||
- **In-memory map** with a 5-minute TTL sweeper. Simpler, but multi-process deployments lose it (there is no multi-process deployment today, but Phase 5 OSS readiness might add one).
|
||||
- **`oidc_state` table** — `(state_hash PK, code_verifier, created_at)`, swept on the same 60s alert-engine tick that already handles setup-token cleanup.
|
||||
|
||||
I'll go with the **table**. Costs ~3 lines in the existing cleanup tick, behaves correctly under restarts, and survives a future scale-out. Migration 0019 includes:
|
||||
|
||||
```sql
|
||||
CREATE TABLE oidc_state (
|
||||
state_hash TEXT PRIMARY KEY, -- sha256(state) hex; raw state never persisted
|
||||
code_verifier TEXT NOT NULL,
|
||||
created_at TEXT NOT NULL
|
||||
);
|
||||
CREATE INDEX oidc_state_created ON oidc_state(created_at);
|
||||
```
|
||||
|
||||
## Login-page UI
|
||||
|
||||
`/login` template branches based on `view.OIDCEnabled`:
|
||||
|
||||
- **OIDC off** → current layout (just the password form).
|
||||
- **OIDC on** → a `Sign in with <provider name>` button at the top, then a faint divider line, then the existing password form labelled "Or sign in with a local account". Provider name comes from a new optional config `oidc.display_name` (defaults to "SSO").
|
||||
|
||||
Failed-OIDC redirects (no role match, username collision, IdP error) land on `/login?oidc_error=<reason>` with a small banner above the buttons.
|
||||
|
||||
## Audit actions
|
||||
|
||||
New entries in the action vocabulary:
|
||||
|
||||
- `user.oidc_login` (target_kind=user, target_id=user_id, payload `{"sub":"…"}`)
|
||||
- `user.oidc_login_blocked` (target_kind=user, target_id=oidc_subject when no row was created, payload `{"username":"…", "reason":"username_taken|no_role_match|other"}`)
|
||||
- `user.created` already exists; OIDC's first-time provisioning fires this with payload `{"auth_source":"oidc"}` so the audit log distinguishes admin-created from JIT-provisioned rows.
|
||||
|
||||
## User-management UI changes
|
||||
|
||||
Small additions, not new screens:
|
||||
|
||||
- **Users list** — Status column adds a small `oidc` chip when `auth_source='oidc'` so admin can see at a glance which rows came from JIT-provisioning. Sortable by auth_source via the same sortable-headers pattern (lands as a small follow-up if anyone asks; out of scope for v1).
|
||||
- **Add user form** — disabled when OIDC is the only auth path, with a hint: "User provisioning is handled by your OIDC provider; users appear here on first sign-in." Configurable later via a `oidc.disable_local_users` flag if that becomes a real ask. Out of scope for v1; both paths stay open.
|
||||
- **Edit user form** — when `auth_source='oidc'`:
|
||||
- Username field disabled (the username is fixed at first OIDC login and deliberately never refreshed from the IdP, so audit trails stay stable)
|
||||
- Role dropdown disabled, with a hint: "Role is managed by your OIDC provider's role-claim mapping. Edit the mapping in server config to change."
|
||||
- Email field disabled (refreshed from IdP on each login)
|
||||
- **Disable / Enable / Force logout** still work — disabling an OIDC user kicks their session and rejects future OIDC logins ("user disabled by administrator")
|
||||
- **Regenerate setup link** hidden — there's no setup token for OIDC users
|
||||
- **Login UI** — password form rejects users with `auth_source='oidc'` ("This account uses single sign-on. Click the SSO button above.")
|
||||
|
||||
## Middleware / handler changes
|
||||
|
||||
- **Routes**: new public-band entries `GET /auth/oidc/login`, `GET /auth/oidc/callback`. Skipped entirely when OIDC isn't configured (`s.deps.OIDC == nil`).
|
||||
- **Logout handler** augmented to fetch the user row + decide between local logout (303 → `/login`) and OIDC logout (303 → `end_session_endpoint`).
|
||||
- **Login handler** rejects `auth_source='oidc'` users with the SSO-prompt error.
|
||||
- **Last-admin guard** — already covers OIDC users naturally because they live in the `users` table. The role-from-claims path could create a "every admin gets demoted to operator" situation if the IdP's claim mapping is wrong; the guard rejects that demotion at the moment it'd be applied (returns the user to the login page with `oidc_error=role_change_blocked` and audit entry; admin must fix the mapping or promote a local admin first).
|
||||
|
||||
## Implementation outline
|
||||
|
||||
1. **Schema** — migration 0019 (users.auth_source + oidc_subject, sessions.id_token, oidc_state table)
|
||||
2. **Config** — extend `internal/server/config` with the OIDC block + env-var overrides; load JWKS lazily
|
||||
3. **Discovery + JWKS** — small helper that fetches `<issuer>/.well-known/openid-configuration` once at startup, caches `authorization_endpoint`, `token_endpoint`, `end_session_endpoint`, `jwks_uri`. JWKS refreshed on first failed verification.
|
||||
4. **Login start handler** — `/auth/oidc/login`
|
||||
5. **Callback handler** — `/auth/oidc/callback`, with the four claim-resolution branches
|
||||
6. **Logout handler augmentation** — branch on `auth_source`
|
||||
7. **Login form rejection** — local-user password form rejects OIDC accounts
|
||||
8. **State cleanup** — extend the alert engine's existing cleanup tick
|
||||
9. **UI** — `oidc` chip on users list, disabled fields on edit-form for OIDC users, login page SSO button + error banner
|
||||
10. **Tests** — config parse tests; happy-path callback test using a fake IdP (httptest server with a hand-rolled discovery doc + JWKS); username-collision test; no-role-match test; logout test
|
||||
11. **Sweep** — full Playwright walk against an actual IdP (Authelia in a Docker container) — admin gets in via OIDC, role mapping works, logout redirects through IdP, OIDC user can't password-login
|
||||
|
||||
## Test strategy
|
||||
|
||||
The IdP is the hard part to test cleanly. Two layers:
|
||||
|
||||
- **Unit / integration tests** use a stub OIDC provider built into the test harness — `httptest.Server` exposing `.well-known/openid-configuration`, a token endpoint that signs minted JWTs with a test ECDSA key, and a JWKS endpoint serving the public key. This covers every code path without a real IdP. Pattern: each test mints its own claims and runs the callback against the stub.
|
||||
- **Smoke env** runs against a real Authelia container (existing `compose.smoke.yaml`-style file or one-liner `docker run`) for the final sweep — confirms the discovery doc isn't being misread, real JWT verification works, real `end_session_endpoint` redirect works.
|
||||
|
||||
## Out of scope (deferred)
|
||||
|
||||
- **Multi-provider** support (`providers:` array)
|
||||
- **Back-channel logout** (OpenID Connect Back-Channel Logout 1.0) — schema isn't blocked from adding it later
|
||||
- **UI-driven role mapping** (config-only in v1)
|
||||
- **Refresh tokens / mid-session role re-evaluation** — login-only refresh in v1
|
||||
- **`oidc.disable_local_users`** flag — both paths stay open in v1
|
||||
- **OIDC user dashboard chip / badges** beyond the small `oidc` indicator on the users list
|
||||
- **Per-user "auth source" filter on the users list** — sortable headers cover most of the use case
|
||||
|
||||
## Risks / gotchas
|
||||
|
||||
- **JWKS key rotation** — refresh on first failed verification is the standard fix; document the cache TTL (1h) in the config block.
|
||||
- **Clock skew** — accept `iat`/`exp` with a 60s leeway; matches what most OIDC libraries do.
|
||||
- **End-session 404 / not advertised** — degrade gracefully; just drop the session and 303 to `/login`. Don't 500 the logout because the IdP doesn't implement RP-initiated logout.
|
||||
- **Username changes at the IdP** — silently keep the local username (matches our locked decision: subject is the stable key, username is display-only). Document.
|
||||
- **Role claim is sometimes a string, sometimes an array, sometimes a comma-separated string** depending on IdP — normalise into `[]string` before mapping. Authelia/Keycloak emit arrays; some custom setups emit strings; handle both.
|
||||
- **Authelia `sub` is an opaque UUID, not the username** (Authelia 4.39+ default for new clients). Don't assume `sub` is human-readable; it's stable but display value is `preferred_username` or `email`. The locked design already keys lookups on `sub` and uses `preferred_username` for the display username, so this is just a correctness note.
|
||||
- **`end_session_endpoint` may not be published** (Authelia doesn't advertise it for many configs). The locked logout flow already degrades to "drop session + redirect to /login" when the discovery doc lacks it; no extra config needed.
|
||||
- **Password-form bypass for OIDC users via /api/auth/login (JSON)** — same rejection rule applies, not just the HTML form.
|
||||
|
||||
## Acceptance
|
||||
|
||||
- [ ] An OIDC user with `groups: ["rm-admins"]` (the default `role_claim`) can sign in, becomes an admin, is visible in `/settings/users` with an `oidc` chip
|
||||
- [ ] Same user signing in again resolves to the same row (no duplicate)
|
||||
- [ ] Same user with `groups: ["something-else"]` is denied, lands on `/login?oidc_error=no_role_match` with a banner, no row created
|
||||
- [ ] OIDC user can't password-login through `/login` or `/api/auth/login`
|
||||
- [ ] Admin disables an OIDC user → next OIDC login is rejected, existing session bounced (existing disable-mid-session)
|
||||
- [ ] Sign out as an OIDC user → 303 to IdP's end-session URL (when advertised); no end-session URL → 303 to `/login`
|
||||
- [ ] OIDC config absent → password login works exactly as today (zero behavioural change)
|
||||
- [ ] Username collision: a local `alice` exists, OIDC user with `preferred_username=alice` and a different `sub` → blocked at sign-in with the clear error page
|
||||
- [ ] Last-admin guard refuses to demote the only enabled admin even if the IdP's role mapping says otherwise
|
||||
- [ ] All existing tests pass; new test suite covers the four claim-resolution branches and logout
|
||||
@@ -0,0 +1,229 @@
|
||||
# P5-03 — Docker-only release path
|
||||
|
||||
**Status:** approved 2026-05-05. Pivots P5-03 away from `goreleaser` +
|
||||
binary archives toward a single Docker image as the only public
|
||||
deliverable.
|
||||
|
||||
## Goal
|
||||
|
||||
One artifact per tag: the `restic-manager` server image, multi-arch
|
||||
(linux amd64 + arm64), published to the Gitea container registry of
|
||||
this self-hosted instance. The image bakes in cross-compiled agent
|
||||
binaries (linux amd64, linux arm64, windows amd64), the install
|
||||
scripts, and the systemd unit at a read-only image path. The running
|
||||
server distributes those agents and scripts via its existing
|
||||
`/agent/binary` and `/install/*` endpoints; operators on N hosts never
|
||||
download a release artifact directly.
|
||||
|
||||
Source builds via `make build` remain a first-class path for anyone
|
||||
who wants binaries.
|
||||
|
||||
## Non-goals
|
||||
|
||||
- Standalone binary archives (`.tar.gz`, `.zip`) on the release page.
|
||||
- darwin / windows-arm64 agent targets — neither is service-tested.
|
||||
- `goreleaser`. Not used.
|
||||
- `cosign`, `SBOM`, `in-toto`, `minisign`. Re-promote when we ship
|
||||
binaries outside an image (Phase 6 candidate).
|
||||
- GHCR / GitHub mirror. Single source of truth = Gitea.
|
||||
|
||||
## Decisions captured (with one-line rationale)
|
||||
|
||||
| ID | Decision | Why |
|
||||
|----|----------|-----|
|
||||
| D1 | One artifact: server Docker image | Architecture already routes agent distribution through the server (`/agent/binary`); release surface should mirror that. |
|
||||
| D2 | Trigger: `tag-push` (`v*.*.*`) **plus** `workflow_dispatch` | Tag for real cuts; dispatch for snapshot iteration without polluting tag history. |
|
||||
| D3 | Build matrix: linux amd64+arm64 server image; agent cross-compiles for linux amd64+arm64+windows amd64 | Mirrors the existing CI build matrix; nothing ships that hasn't been service-tested. |
|
||||
| D4 | Image-baked, separate path (`/opt/restic-manager/dist/`); HTTP handler reads `<DataDir>/...` first, falls back to `/opt/...` | Volume stays purely operator state; image content is immutable per tag; eliminates the smoke-env "stale agent" footgun in production. |
|
||||
| D5 | Tag fan-out: `vX.Y.Z`, `X.Y`, `X`, `latest` — but `latest` is held back until `v1.0.0` | Standard rolling-minor pattern; pre-1.0 forces explicit pinning. |
|
||||
| D6 | Snapshot tag: `:snapshot-<shortsha>`, never moves `latest` | Operator can never accidentally pull an unblessed build. |
|
||||
| D7 | Version embedding via `-ldflags`: `main.version`, `main.commit`, `main.date` on both `cmd/server` and `cmd/agent` | Server already had `version`; add `commit`/`date` to both for parity and traceability. |
|
||||
| D8 | Registry: Gitea container registry on this instance, under `<host>/<owner>/restic-manager` | One source of truth, no external creds. |
|
||||
| D9 | Integrity: a `SHA256SUMS` file + the manifest digest in the release notes; nothing else | Image is the unit of trust; pull-by-digest is the verification primitive. |
|
||||
| D10 | P1-31 (signed binaries) stays deferred | Re-promote the day we ship binaries outside an image. |
|
||||
|
||||
## Image layout
|
||||
|
||||
Multi-stage Dockerfile (extends today's `deploy/Dockerfile.server`):
|
||||
|
||||
```
|
||||
build stage (golang:1.25-alpine):
|
||||
cross-compile cmd/server for $TARGETARCH (linux)
|
||||
cross-compile cmd/agent for linux/amd64
|
||||
cross-compile cmd/agent for linux/arm64
|
||||
cross-compile cmd/agent for windows/amd64
|
||||
(CGO_ENABLED=0 throughout — pure-Go SQLite)
|
||||
|
||||
final stage (gcr.io/distroless/static-debian12:nonroot):
|
||||
/usr/local/bin/restic-manager-server (matches image arch)
|
||||
/opt/restic-manager/dist/agent-binaries/
|
||||
restic-manager-agent-linux-amd64
|
||||
restic-manager-agent-linux-arm64
|
||||
restic-manager-agent-windows-amd64.exe
|
||||
/opt/restic-manager/dist/install/
|
||||
install.sh
|
||||
install.ps1
|
||||
restic-manager-agent.service
|
||||
```
|
||||
|
||||
`/opt/restic-manager/dist/` is owned by `root:root`, mode `0755` for
|
||||
directories, `0755` for `install.sh` (script must be executable when
|
||||
the install path uses `curl ... | sh` semantics) and `0644` for the
|
||||
unit file and `install.ps1`. The agent binaries are mode `0755`.
|
||||
|
||||
`<DataDir>` keeps holding only operator state: `restic-manager.db`,
|
||||
`secret.key`, `secrets.enc`, `audit/`, `tls/`. Nothing the image
|
||||
owns gets written into the volume.
|
||||
|
||||
## Server-side handler change
|
||||
|
||||
`internal/server/http/agent_assets.go` today reads from
|
||||
`<DataDir>/agent-binaries/<name>` and `<DataDir>/install/<name>`.
|
||||
|
||||
Change: if the file isn't present in `<DataDir>`, fall back to
|
||||
`/opt/restic-manager/dist/<subpath>/<name>`. The fallback path is a
|
||||
new server-config field defaulted to `/opt/restic-manager/dist`,
|
||||
overridable via `RM_BUNDLED_ASSETS_DIR` for tests and source-build
|
||||
deployments. If neither path resolves, return 404 (existing
|
||||
`binary_not_published` / `not_found` body unchanged).
|
||||
|
||||
This means:
|
||||
- A fresh container without any operator-staged overrides serves the
|
||||
baked-in agents. No first-run setup needed.
|
||||
- An operator can still drop a custom-built agent into
|
||||
`<DataDir>/agent-binaries/` to override the image's copy (handy for
|
||||
pre-release agent testing without rebuilding the server image).
|
||||
- Source-build dev (`bin/restic-manager-server` running out of the
|
||||
working tree) still works exactly as today — the fallback dir is
|
||||
configurable, and the `<DataDir>` path remains the primary lookup.
|
||||
|
||||
Tests cover four cases: (a) DataDir hit, (b) fallback hit, (c) DataDir
|
||||
hit shadows fallback, (d) neither — 404.
|
||||
|
||||
## Versioning
|
||||
|
||||
Both binaries grow `commit` and `date` ldflag-targets next to the
|
||||
existing `version`:
|
||||
|
||||
```go
|
||||
var (
|
||||
version = "dev"
|
||||
commit = "none"
|
||||
date = "unknown"
|
||||
)
|
||||
```
|
||||
|
||||
Dockerfile gains `ARG VERSION`, `ARG COMMIT`, `ARG DATE`,
|
||||
defaulted to the Go fallbacks (`dev`, `none`, `unknown`) — an empty `-X` value would overwrite them with `""`; the `go build` line passes them via `-ldflags`. The
|
||||
release workflow fills them from `${{ gitea.ref_name }}`,
|
||||
`${{ gitea.sha }}`, and a UTC ISO-8601 timestamp.
|
||||
|
||||
Snapshot builds (workflow_dispatch) compute
|
||||
`VERSION=0.0.0-snapshot-${SHORTSHA}` and tag the image as
|
||||
`:snapshot-${SHORTSHA}` only. They never touch `latest` or any
|
||||
`vX.Y.Z` tag.
|
||||
|
||||
## Workflow (`.gitea/workflows/release.yml`)
|
||||
|
||||
```yaml
|
||||
name: Release
|
||||
|
||||
on:
|
||||
push:
|
||||
tags: ['v[0-9]+.[0-9]+.[0-9]+']
|
||||
workflow_dispatch:
|
||||
|
||||
env:
|
||||
IMAGE: gitea.dcglab.co.uk/${{ gitea.repository }}
|
||||
|
||||
jobs:
|
||||
image:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: docker/setup-qemu-action@v3
|
||||
- uses: docker/setup-buildx-action@v3
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
registry: gitea.dcglab.co.uk
|
||||
username: ${{ gitea.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
- name: compute tags
|
||||
id: meta
|
||||
run: |
|
||||
# tag-push → :vX.Y.Z, :X.Y, :X (only :latest if X >= 1)
|
||||
# dispatch → :snapshot-<shortsha>
|
||||
...
|
||||
- uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: .
|
||||
file: deploy/Dockerfile.server
|
||||
platforms: linux/amd64,linux/arm64
|
||||
push: true
|
||||
tags: ${{ steps.meta.outputs.tags }}
|
||||
build-args: |
|
||||
VERSION=${{ steps.meta.outputs.version }}
|
||||
COMMIT=${{ gitea.sha }}
|
||||
DATE=${{ steps.meta.outputs.date }}
|
||||
```
|
||||
|
||||
The `compute tags` step:
|
||||
|
||||
- For `push:tags`: extract `vMAJOR.MINOR.PATCH`. Always emit
|
||||
`:vMAJOR.MINOR.PATCH`, `:MAJOR.MINOR`, `:MAJOR`. Emit `:latest`
|
||||
only when `MAJOR >= 1`.
|
||||
- For `workflow_dispatch`: emit `:snapshot-<shortsha>`. Nothing else.
|
||||
|
||||
No release-asset upload step yet — the GHCR-equivalent registry push
|
||||
is the deliverable. A future iteration may attach a `SHA256SUMS` file
|
||||
to a Gitea release object once `tea release create` is wired in;
|
||||
that's not in scope for the first cut.
|
||||
|
||||
## Tests / verification
|
||||
|
||||
1. `go vet ./...` (CLAUDE.md rule, runs locally pre-commit).
|
||||
2. `go test ./internal/server/http/...` covers the new fallback
|
||||
logic.
|
||||
3. Local manual smoke: `docker build -f deploy/Dockerfile.server .`
|
||||
produces an image; `docker run --rm <image>` starts the server;
|
||||
`curl 'http://127.0.0.1:8080/agent/binary?os=linux&arch=amd64'`
|
||||
serves bytes; `curl http://127.0.0.1:8080/install/install.sh`
|
||||
serves the script.
|
||||
4. Release workflow itself is exercised on first tag-push; until
|
||||
then, `workflow_dispatch` is the smoke test.
|
||||
|
||||
## Operator-facing changes
|
||||
|
||||
- `README.md` install snippet becomes
|
||||
`docker run -v rm-data:/var/lib/restic-manager ...
|
||||
gitea.dcglab.co.uk/<owner>/restic-manager:vX.Y.Z`. Pre-1.0
|
||||
releases are pinned by exact tag; no `:latest` is published.
|
||||
- The CLAUDE.md "restage" block is dev-only (smoke env runs the
|
||||
server out of `bin/`). Production users on the image never see
|
||||
it.
|
||||
- `RM_BUNDLED_ASSETS_DIR` is documented in the server config
|
||||
reference (defaults to `/opt/restic-manager/dist`).
|
||||
|
||||
## Risks / footguns
|
||||
|
||||
- **Image size growth.** Three agent binaries (~15-20 MB each
|
||||
stripped) add ~50 MB. Acceptable; we're already shipping a
|
||||
distroless server. Watch the trajectory once Phase 4 alerting is
|
||||
in.
|
||||
- **Dockerfile cross-compile multiplies build time** on the runner.
|
||||
Pure-Go means each leg is just a `go build`; total stage time
|
||||
should stay under 60s on the self-hosted runner.
|
||||
- **`ARG VERSION` leakage.** The current Dockerfile already accepts
|
||||
`ARG VERSION=dev`; we're tightening, not loosening.
|
||||
- **Operator overriding `<DataDir>/agent-binaries/<name>`** with a
|
||||
stale binary will silently shadow the image's copy. Documented in
|
||||
the server config reference; this is a feature (lets operators
|
||||
hot-patch a pre-release agent) not a bug.
|
||||
|
||||
## Out of scope (tracked for follow-up)
|
||||
|
||||
- Cosign / SBOM / in-toto provenance — defer to Phase 6 with the rest
|
||||
of the supply-chain hardening.
|
||||
- GHCR mirror — defer until P5-01 docs site goes public.
|
||||
- `tea release create` integration — pending until we have something
|
||||
worth attaching beyond the image digest.
|
||||
@@ -0,0 +1,448 @@
|
||||
# P6-01 + P6-02 — Agent self-update + fleet update
|
||||
|
||||
Status: design approved 2026-05-06.
|
||||
Scope: P6-01 (agent self-update mechanism) and P6-02 (dashboard
|
||||
version reporting + fleet update UI). One spec, one branch — the
|
||||
two tasks are tightly coupled (P6-02 is the operator surface for
|
||||
the mechanism P6-01 ships).
|
||||
|
||||
## 1. Background
|
||||
|
||||
P5-03 pivoted release distribution to a single multi-arch server
|
||||
Docker image, with cross-compiled agent binaries baked under
|
||||
`/opt/restic-manager/dist/agent-binaries/` and served via
|
||||
`GET /agent/binary?os=…&arch=…`. The plumbing already does
|
||||
dual-path lookup: `<DataDir>/agent-binaries/<name>` overrides the
|
||||
image-baked copy, so an operator can hot-patch a pre-release agent
|
||||
without rebuilding the image.
|
||||
|
||||
That makes the server the natural distribution point for agent
|
||||
upgrades. "Update agent" collapses to "re-fetch from your own
|
||||
server" — no apt repo, no Chocolatey, no third-party signing infra,
|
||||
and version pinning is automatic because the server only ever
|
||||
serves the agent that matches its own release.
|
||||
|
||||
This spec wires up the update mechanism end-to-end and the
|
||||
operator surface that drives it.
|
||||
|
||||
## 2. Decisions
|
||||
|
||||
| # | Decision | Rationale |
|
||||
|---|----------|-----------|
|
||||
| 1 | Operator-driven only — no auto-update | Matches the rest of the app's job-dispatch model; avoids "bad release upgrades every host instantly"; auto-update can be added later as a setting flip if asked |
|
||||
| 2 | Linux: just exit, let systemd restart. Windows: detached helper script. | Linux supports rename-while-open; Windows holds an exclusive lock on the running .exe |
|
||||
| 3 | M1 (keep `agent.old` on disk) + M2 (rolling fleet update with halt-on-fail). Skip M3 (auto-rollback watchdog). | M1 is ~5 lines, M2 falls naturally out of P6-02's UI, M3 is a lot of plumbing for "shipped a binary that doesn't start" |
|
||||
| 4 | Skip sha256 digest verification for v1 | TLS already covers the corruption-in-transit threat; image-tampering is image-build's problem, not the agent's |
|
||||
| 5 | Exact string version match for "out of date" | With server-bundled binaries there's exactly one canonical version per server image — anything else is out of date by definition |
|
||||
| 6 | WS envelope only, no `restic-manager-agent update` CLI subcommand | YAGNI; no concrete consumer; the underlying logic is reusable when one appears |
|
||||
|
||||
## 3. Wire protocol
|
||||
|
||||
### 3.1 Server → agent: `command.update`
|
||||
|
||||
```
|
||||
{
|
||||
"type": "command.update",
|
||||
"id": "<envelope id>",
|
||||
"payload": {
|
||||
"job_id": "<ulid>"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
No `os` / `arch` / `version` in the payload — the agent already
|
||||
knows its own build target and fetches from its configured server
|
||||
URL via the existing `/agent/binary` handler. Including a target
|
||||
version would also tempt the agent into version-comparison logic;
|
||||
keep that on the server side.
|
||||
|
||||
### 3.2 Job lifecycle (server-driven)
|
||||
|
||||
The agent has limited ability to report on its own restart, so the
|
||||
job state machine lives on the server:
|
||||
|
||||
- **queued → running** when the envelope is dispatched.
|
||||
- **running → succeeded** when the agent re-hellos with
|
||||
`agent_version == server.Version` after dispatch and within
|
||||
the timeout. Audit `host.update_succeeded`.
|
||||
- **running → failed (timeout)** if 90 seconds pass without a
|
||||
hello carrying the matching version. Audit `host.update_failed`.
|
||||
Raise alert kind `update_failed` (reuses P3-05 alert engine).
|
||||
This single transition covers both the "agent never came back
|
||||
at all" case and the "agent came back at the wrong version"
|
||||
case — see §6.2 for why we don't transition immediately on a
|
||||
mismatched hello.
|
||||
|
||||
Migration 0021 widens the `jobs.kind` CHECK constraint to include
|
||||
`update`. Same column-level pattern as 0012 (where 0012 added
|
||||
`restore` and `diff`).
|
||||
|
||||
## 4. Agent-side execution
|
||||
|
||||
Lives in `internal/agent/updater`, build-tag split:
|
||||
|
||||
- `updater_unix.go` — Linux + any future POSIX target.
|
||||
- `updater_windows.go` — Windows-only, uses the helper-script
|
||||
pattern.
|
||||
- `updater.go` — shared `Update(ctx, serverURL string) error`
|
||||
interface and the HTTP fetch/streaming code (no platform deps).
|
||||
|
||||
### 4.1 Linux flow
|
||||
|
||||
1. Receive `command.update` from the WS dispatcher.
|
||||
2. Resolve own binary via `os.Executable()` and `filepath.Abs`.
|
||||
Refuse if the resolved path is `/proc/self/exe` or otherwise
|
||||
not a real file (defence in depth — shouldn't happen under
|
||||
systemd, but bail loudly if it does).
|
||||
3. `GET <server>/agent/binary?os=linux&arch=<runtime.GOARCH>`,
|
||||
stream to `<binary>.new` in the same directory as the running
|
||||
binary (same filesystem ⇒ atomic rename).
|
||||
4. fsync the file, `os.Chmod(0755)`.
|
||||
5. Copy current binary to `<binary>.old` (overwrite if it
|
||||
exists). M1 — one-revision rollback target.
|
||||
6. `os.Rename(<binary>.new, <binary>)`.
|
||||
7. Close the WS connection cleanly (sends close frame so the
|
||||
server transitions the connection to `disconnected` rather
|
||||
than waiting for the heartbeat-miss sweep).
|
||||
8. `os.Exit(0)`. Systemd's `Restart=always` (already in the unit)
|
||||
brings up the new binary within seconds.
|
||||
|
||||
### 4.2 Windows flow
|
||||
|
||||
The .exe is exclusively locked by the OS while running, so steps
|
||||
5–6 above can't happen in-process. Use a detached helper:
|
||||
|
||||
1. Steps 1–4 as on Linux, but requesting `os=windows` — fetch into `<binary>.exe.new`, fsync.
|
||||
2. Write `update.cmd` to a tmp path with the orchestration:
|
||||
```
|
||||
timeout /t 3 /nobreak >nul
|
||||
copy /Y "<binary>.exe" "<binary>.exe.old"
|
||||
sc stop restic-manager-agent
|
||||
:wait
|
||||
sc query restic-manager-agent | find "STOPPED" >nul
|
||||
if errorlevel 1 (timeout /t 1 /nobreak >nul & goto wait)
|
||||
move /Y "<binary>.exe.new" "<binary>.exe"
|
||||
sc start restic-manager-agent
|
||||
del "%~f0"
|
||||
```
|
||||
3. `CreateProcess` it detached
|
||||
(`DETACHED_PROCESS | CREATE_NO_WINDOW`, no parent handles).
|
||||
4. Close WS, `os.Exit(0)`. SCM sees clean stop and waits — does
|
||||
*not* try to restart, because `sc stop` is the helper's job,
|
||||
not a crash. (`Restart=always` semantics differ between
|
||||
systemd and SCM. SCM treats clean-exit-after-stop as
|
||||
intentional and does not auto-restart; only crashes restart.
|
||||
That's why the helper script needs the explicit `sc start`
|
||||
at the end.)
|
||||
|
||||
### 4.3 Service-user assumption
|
||||
|
||||
Both Linux (`User=root` per the existing unit) and Windows
|
||||
(`LocalSystem` by default) can write the binary path directly. If
|
||||
the agent ever moves to a non-root service user, the updater
|
||||
breaks — would need either a setuid helper or an out-of-process
|
||||
update service. Add a `// NOTE:` comment in the updater package
|
||||
flagging this; not a v1 blocker.
|
||||
|
||||
## 5. Server build version
|
||||
|
||||
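New package `internal/version` exposing two package-level string variables (they must be `var`, not `const`, so that `-ldflags -X` can overwrite them at link time):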
|
||||
|
||||
```
|
||||
package version
|
||||
|
||||
var (
|
||||
Version = "dev"
|
||||
Commit = ""
|
||||
)
|
||||
```
|
||||
|
||||
Wired via `-ldflags` in the Makefile:
|
||||
|
||||
```
|
||||
GO_LDFLAGS = -X gitea.dcglab.co.uk/steve/restic-manager/internal/version.Version=$(VERSION) \
|
||||
-X gitea.dcglab.co.uk/steve/restic-manager/internal/version.Commit=$(COMMIT)
|
||||
|
||||
VERSION := $(shell git describe --tags --always --dirty)
|
||||
COMMIT := $(shell git rev-parse --short HEAD)
|
||||
```
|
||||
|
||||
Both `cmd/server` and `cmd/agent` link the same package, so an
|
||||
agent's `agent_version` (sent in the hello payload, already wired
|
||||
since P1-11) is comparable byte-for-byte to the server's
|
||||
`version.Version`.
|
||||
|
||||
`make build` already does what's needed for source builds. The
|
||||
Phase 2 work in this spec is the Docker release path — confirm
|
||||
during plan execution that `.gitea/workflows/release.yml` passes
|
||||
`VERSION` and `COMMIT` into the Docker `--build-arg` chain so the
|
||||
in-image binaries embed the same string the image is tagged with.
|
||||
If not, add the wiring.
|
||||
|
||||
Dirty/dev builds (`v1.2.3-dirty`) won't match clean server builds,
|
||||
so every dev environment will show every host as out-of-date. This
|
||||
is acceptable — the chip is a noop in dev, real ops always run
|
||||
tagged builds.
|
||||
|
||||
A new `GET /api/version` endpoint returns
|
||||
`{"version": "...", "commit": "..."}`. Used by the dashboard
|
||||
header tile and by `/settings/fleet-update`. Public-band — exposes
|
||||
no secrets, lets the install scripts surface it too.
|
||||
|
||||
## 6. P6-01 server endpoints
|
||||
|
||||
### 6.1 `POST /api/hosts/{id}/update`
|
||||
|
||||
Admin-only. Refuses (with structured error code) when:
|
||||
|
||||
- Host is offline (`host_offline`).
|
||||
- Host's `agent_version == server.Version` (`already_up_to_date`).
|
||||
- An update job for this host is already running (`update_in_progress`).
|
||||
|
||||
Happy path: creates `jobs` row with `kind=update`, dispatches
|
||||
`command.update` envelope, audit-logs `host.update_dispatched`,
|
||||
returns `{"job_id": "..."}`.
|
||||
|
||||
UI form-post variant on `/hosts/{id}/update` returns
|
||||
`HX-Redirect` to the live job log.
|
||||
|
||||
### 6.2 Hello handler integration
|
||||
|
||||
The existing `onAgentHello` (P1-11) already upserts
|
||||
`agent_version`. Extend it: after the upsert, look for any
|
||||
`update` job for this host with `status='running'`. If one
|
||||
exists:
|
||||
|
||||
- `agent_version == server.Version` → mark job `succeeded`,
|
||||
audit `host.update_succeeded`.
|
||||
- `agent_version != server.Version` → leave the job running so
|
||||
the timeout path catches it as a rollback failure (don't fail
|
||||
immediately — gives the agent one chance to come back, restart,
|
||||
hello again with the right version).
|
||||
|
||||
Adds a small in-memory map of pending updates so the timeout
|
||||
goroutine knows when to give up. Persisted state lives in the
|
||||
`jobs` table; the in-memory map is just for the timer.
|
||||
|
||||
## 7. P6-02 fleet update
|
||||
|
||||
### 7.1 Schema
|
||||
|
||||
Migration 0022, purely additive — two new tables, no changes to existing ones:
|
||||
|
||||
```
|
||||
CREATE TABLE fleet_updates (
|
||||
id TEXT PRIMARY KEY,
|
||||
started_at TEXT NOT NULL,
|
||||
started_by_user_id TEXT NOT NULL REFERENCES users(id),
|
||||
target_version TEXT NOT NULL,
|
||||
status TEXT NOT NULL CHECK (status IN ('running','completed','halted','cancelled')),
|
||||
current_host_id TEXT REFERENCES hosts(id),
|
||||
halted_reason TEXT,
|
||||
completed_at TEXT
|
||||
);
|
||||
|
||||
CREATE TABLE fleet_update_hosts (
|
||||
fleet_update_id TEXT NOT NULL REFERENCES fleet_updates(id) ON DELETE CASCADE,
|
||||
host_id TEXT NOT NULL REFERENCES hosts(id) ON DELETE CASCADE,
|
||||
status TEXT NOT NULL CHECK (status IN ('pending','running','succeeded','failed','skipped')),
|
||||
job_id TEXT REFERENCES jobs(id),
|
||||
failed_reason TEXT,
|
||||
PRIMARY KEY (fleet_update_id, host_id)
|
||||
);
|
||||
```
|
||||
|
||||
### 7.2 Worker loop
|
||||
|
||||
A single in-process goroutine — at most one fleet update may run
|
||||
at a time (enforced via a `sync.Mutex` + a precondition check on
|
||||
`POST /api/fleet/update`).
|
||||
|
||||
```
|
||||
for each pending fleet_update_hosts row in dispatch order:
|
||||
set fleet_updates.current_host_id = row.host_id
|
||||
set fleet_update_hosts.status = 'running'
|
||||
if host.agent_version == server.Version:
|
||||
# Already updated since we built the list — skip.
|
||||
set status = 'skipped'; continue
|
||||
if !host.online:
|
||||
# Offline since we built the list — halt.
|
||||
halt(reason="host went offline")
|
||||
return
|
||||
dispatch_update_for_host(host) # reuses 6.1 logic
|
||||
wait_up_to_90s_for_hello_with_matching_version()
|
||||
if matched:
|
||||
set status = 'succeeded'; continue
|
||||
else:
|
||||
set status = 'failed', failed_reason = "..."
|
||||
halt(reason="update failed on host X")
|
||||
return
|
||||
set fleet_updates.status = 'completed', completed_at = now
|
||||
```
|
||||
|
||||
Halt: set `fleet_updates.status = 'halted'`, raise an alert of kind
|
||||
`fleet_update_halted`, audit `fleet.update_halted` with the host
|
||||
id and reason. Subsequent hosts stay `pending` so the operator can
|
||||
see what was queued and decide whether to resume (resume = start a
|
||||
new fleet update with the still-out-of-date subset).
|
||||
|
||||
Cancel: admin-only `POST /api/fleet-updates/{id}/cancel`. Sets
|
||||
`status='cancelled'`. The currently-dispatched host's update job
|
||||
keeps running (the agent is already mid-restart) — cancel only
|
||||
prevents the *next* host from being picked. Audit
|
||||
`fleet.update_cancelled`.
|
||||
|
||||
### 7.3 UI surfaces
|
||||
|
||||
**Per-host chip (host_row partial + host detail chrome):**
|
||||
|
||||
`out of date · v1.2.2 → v1.2.3` — amber-accented, mirrors `.tag`
|
||||
token shape. Only rendered when:
|
||||
|
||||
```
|
||||
host.agent_version != "" && host.agent_version != server.Version
|
||||
```
|
||||
|
||||
Empty `agent_version` (host enrolled but never connected) renders
|
||||
nothing rather than "out of date" — we don't know what version
|
||||
they have.
|
||||
|
||||
**Dashboard summary tile:**
|
||||
|
||||
The hero strip already has tiles. Add an "Updates" tile:
|
||||
`N hosts behind` linking to `/?updates=behind` (extends NS-04's
|
||||
filter machinery — adds an `updates` query param alongside
|
||||
`status`/`repo_status`/`tag`). Hidden when N == 0.
|
||||
|
||||
**Per-host Update button on `/hosts/{id}`:**
|
||||
|
||||
Right-rail, admin-only. Disabled with hover tooltip when host
|
||||
offline / already up to date / update in progress. POSTs to
|
||||
`/hosts/{id}/update`, `HX-Redirect` to the live job log.
|
||||
|
||||
**Fleet update page `/settings/fleet-update`:**
|
||||
|
||||
Admin-only. Two states:
|
||||
|
||||
- **Idle**: lists out-of-date online hosts (table: hostname,
|
||||
current version, target version, last seen). Big "Start rolling
|
||||
update" button behind a typed-confirm dialog (operator types
|
||||
the host count, e.g. `12`, to enable the button — same shape as
|
||||
the host-delete confirm).
|
||||
- **Running/halted/completed**: shows the currently-active
|
||||
fleet_update row + per-host progress list. Polls every 3s (htmx
|
||||
trigger conditional on `document.visibilityState === 'visible'`,
|
||||
same pattern as the alerts page). Renders:
|
||||
```
|
||||
Updated 3/12 · currently updating <hostname>
|
||||
Halted on <hostname>: <reason> · job log →
|
||||
```
|
||||
|
||||
Audit actions: `fleet.update_started`, `fleet.update_completed`,
|
||||
`fleet.update_halted`, `fleet.update_cancelled`.
|
||||
|
||||
### 7.4 Alert engine integration
|
||||
|
||||
P3-05's alert engine already supports kind-based registration. Add
|
||||
two new kinds:
|
||||
|
||||
- `update_failed` — per-host, raised on individual update failure.
|
||||
Auto-resolves when the host re-hellos with the matching version.
|
||||
- `fleet_update_halted` — global, raised on fleet halt. Auto-resolves
|
||||
when a subsequent fleet update completes successfully.
|
||||
|
||||
## 8. RBAC
|
||||
|
||||
| Endpoint | Role |
|
||||
|----------|------|
|
||||
| `POST /api/hosts/{id}/update` | admin |
|
||||
| `POST /api/fleet/update` | admin |
|
||||
| `POST /api/fleet-updates/{id}/cancel` | admin |
|
||||
| `GET /api/fleet-updates/{id}` | admin (status polling) |
|
||||
| `GET /api/version` | public |
|
||||
|
||||
Operator and viewer see the "out of date" chip but no update
|
||||
buttons. Mirrors the existing pattern: read affordances are
|
||||
visible to all roles, write affordances are gated.
|
||||
|
||||
## 9. Testing
|
||||
|
||||
### 9.1 Unit
|
||||
|
||||
- `internal/agent/updater`: fake-`/agent/binary` HTTP server +
|
||||
tmp "running binary" file, assert post-state — binary swapped,
|
||||
`.old` present, no leftover `.new`. Linux path only (Windows
|
||||
helper covered by build-tag compile-only).
|
||||
- `internal/server/http`: `POST /api/hosts/{id}/update` happy
|
||||
path, refuses-when-offline, refuses-when-up-to-date,
|
||||
refuses-when-update-in-progress, RBAC enforcement, audit row
|
||||
written.
|
||||
- Hello handler: agent reconnects with matching version after
|
||||
`update` job dispatch → marks job `succeeded`, drops the
|
||||
in-memory pending entry. Mismatched version → no-op (timeout
|
||||
catches it).
|
||||
- Timeout path: synthetic `update` job + 90s elapsed →
|
||||
marks `failed`, raises alert.
|
||||
- Fleet worker: table-driven over the loop's state machine —
|
||||
success-then-success, success-then-timeout-halts,
|
||||
cancel-mid-flight, no-online-out-of-date-hosts-completes-immediately,
|
||||
host-disappears-from-list-mid-loop-skips.
|
||||
|
||||
### 9.2 Smoke validation (per CLAUDE.md restage block)
|
||||
|
||||
1. Build server + agent at version A. Restage. Enrol a host;
|
||||
confirm `agent_version=A`.
|
||||
2. Bump version to B (`make build VERSION=B`), rebuild server
|
||||
only, restart server. Dashboard shows host as out-of-date with
|
||||
`A → B` chip. Updates tile reads "1 host behind".
|
||||
3. Rebuild agent at B, restage `<DataDir>/agent-binaries/`. Click
|
||||
**Update agent** on host detail. Agent fetches, swaps, exits;
|
||||
systemd restarts it; hello-back at B → job `succeeded`, chip
|
||||
gone, tile clears.
|
||||
4. Rollback path: leave `<DataDir>/agent-binaries/` at A, server
|
||||
at B, click Update — agent fetches A, swaps to A, restarts at
|
||||
A; hello says A != B; server marks job `failed` after 90s with
|
||||
reason "agent reconnected at version A, expected B".
|
||||
5. Fleet update: spin up two smoke hosts both out-of-date, fire
|
||||
**Start rolling update**, watch progress page tick host 1 →
|
||||
host 2 → completed.
|
||||
6. Halt path: replace one of the `<DataDir>/agent-binaries/`
|
||||
files with `/bin/false`. Run fleet update. First host gets
|
||||
broken binary, fails to come back up, fleet update halts at
|
||||
host 1 after 90s, alert raised, host 2 left as `pending`.
|
||||
|
||||
Step 6 validates M2 end-to-end — the rolling halt is the actual
|
||||
safety guarantee, not a nice-to-have.
|
||||
|
||||
## 10. Out of scope
|
||||
|
||||
- sha256 digest verification (deferred — see decision 4).
|
||||
- `restic-manager-agent update` CLI subcommand (deferred —
|
||||
decision 6).
|
||||
- Auto-update (deferred — decision 1).
|
||||
- Auto-rollback watchdog M3 (deferred — decision 3).
|
||||
- Migrating the agent off `User=root` (separate hardening track).
|
||||
- Cross-version protocol-compatibility checks beyond the existing
|
||||
`protocol_version` handshake (P1-11). If the new agent's
|
||||
`protocol_version` is incompatible with the server, the
|
||||
existing handshake rejects it; the update job will then
|
||||
correctly time out and be marked failed.
|
||||
|
||||
## 11. Migration plan
|
||||
|
||||
1. `internal/version` package + Makefile ldflags wiring.
|
||||
2. Migration 0021 (jobs.kind widening) + 0022 (fleet_updates
|
||||
tables).
|
||||
3. `internal/agent/updater` package, Linux first.
|
||||
4. WS envelope wiring + `command.update` dispatcher.
|
||||
5. `POST /api/hosts/{id}/update` + hello-handler integration +
|
||||
timeout goroutine.
|
||||
6. UI: chip + per-host update button + dashboard tile + filter.
|
||||
7. Fleet update worker + page.
|
||||
8. Windows updater path.
|
||||
9. Alert engine kinds.
|
||||
10. Smoke validation per §9.2.
|
||||
|
||||
Each step is independently testable; commits should land at each
|
||||
boundary so a failed Windows path (8) doesn't block the rest of
|
||||
the work.
|
||||
@@ -0,0 +1,223 @@
|
||||
# P6-03 — Repo size trend graphs
|
||||
|
||||
Sparkline on the dashboard host row + full chart on the host repo
|
||||
page, both showing repo growth over time. Closes the last
|
||||
operator-visibility gap in Phase 6 alongside Prometheus metrics
|
||||
(P6-04).
|
||||
|
||||
## Goals
|
||||
|
||||
- Operators can see at a glance whether a host's repo is growing,
|
||||
stable, or shrinking, without leaving the dashboard.
|
||||
- A second screen on the repo page exposes the same data over a
|
||||
longer window with a snapshot-count overlay so retention
|
||||
behaviour can be eyeballed against size.
|
||||
- Zero new client-side dependencies; matches the existing
|
||||
HTMX + server-rendered idiom used everywhere else in the UI.
|
||||
|
||||
## Non-goals
|
||||
|
||||
- No backfill of historical data. Trend lights up with whatever
|
||||
the agents report from the day this ships.
|
||||
- No per-source-group breakdown — repo-level only.
|
||||
- No alerting on growth rate (deferred to a future ticket if a
|
||||
user asks).
|
||||
- No JSON API surface. Prometheus exposure is P6-04, separate.
|
||||
|
||||
## Decisions taken in brainstorming
|
||||
|
||||
- **Metrics:** `total_size_bytes` (sparkline + chart) and
|
||||
`snapshot_count` (chart only). Raw size dropped as redundant.
|
||||
- **Cadence:** one row per `(host_id, UTC date)`, last-write-wins
|
||||
per column. Bounded at ~365 rows/host/year regardless of job
|
||||
frequency.
|
||||
- **Backfill:** none. Pure forward-fill from launch day.
|
||||
- **Rendering:** server-rendered inline SVG, no JS library.
|
||||
- **Spans:** sparkline fixed at 30 days; chart has `30d | 90d | 1y`
|
||||
range selector, server-rendered swap.
|
||||
|
||||
## Schema
|
||||
|
||||
New migration `internal/store/migrations/0023_host_repo_stats_history.sql`:
|
||||
|
||||
```sql
|
||||
CREATE TABLE host_repo_stats_history (
|
||||
host_id TEXT NOT NULL REFERENCES hosts(id) ON DELETE CASCADE,
|
||||
day TEXT NOT NULL, -- 'YYYY-MM-DD' UTC
|
||||
total_size_bytes INTEGER, -- nullable; partial patches don't overwrite
|
||||
snapshot_count INTEGER, -- nullable
|
||||
recorded_at TEXT NOT NULL, -- RFC3339Nano of last write touching this row
|
||||
PRIMARY KEY (host_id, day)
|
||||
);
|
||||
CREATE INDEX host_repo_stats_history_host_day
|
||||
ON host_repo_stats_history(host_id, day DESC);
|
||||
```
|
||||
|
||||
FK cascade matches every other host-scoped table; deleting a host
|
||||
through `Store.DeleteHost` (NS-01) wipes its history automatically.
|
||||
|
||||
## Write path
|
||||
|
||||
Hook the existing `MsgRepoStats` handler in
|
||||
`internal/server/ws/handler.go` (around line 319). After the
|
||||
existing `UpsertHostRepoStats(ctx, hostID, patch)` call, append:
|
||||
|
||||
```go
|
||||
day := time.Now().UTC().Format("2006-01-02")
|
||||
if err := deps.Store.UpsertHostRepoStatsHistory(ctx, hostID, day, patch); err != nil {
|
||||
slog.Warn("ws: upsert host repo stats history", "host_id", hostID, "err", err)
|
||||
}
|
||||
```
|
||||
|
||||
A history-write failure is logged and dropped — never blocks the
|
||||
main upsert. The partial-update contract that
|
||||
`UpsertHostRepoStats` already implements is preserved at the
|
||||
history layer:
|
||||
|
||||
```sql
|
||||
INSERT INTO host_repo_stats_history (host_id, day, total_size_bytes, snapshot_count, recorded_at)
|
||||
VALUES (?, ?, ?, ?, ?)
|
||||
ON CONFLICT(host_id, day) DO UPDATE SET
|
||||
total_size_bytes = COALESCE(excluded.total_size_bytes, host_repo_stats_history.total_size_bytes),
|
||||
snapshot_count = COALESCE(excluded.snapshot_count, host_repo_stats_history.snapshot_count),
|
||||
recorded_at = excluded.recorded_at;
|
||||
```
|
||||
|
||||
This is critical: the agent's prune handler in
|
||||
`internal/agent/runner/runner.go:318` emits a stats patch that
|
||||
only carries `LastPruneAt`. Without `COALESCE`, that prune ack
|
||||
would null out a `total_size_bytes` we'd already captured from a
|
||||
backup earlier the same day.
|
||||
|
||||
## Read path
|
||||
|
||||
Two new helpers in `internal/store/host_repo_stats_history.go` — the `UpsertHostRepoStatsHistory` writer used in the write path above, and the reader below:
|
||||
|
||||
```go
|
||||
type RepoStatsHistoryPoint struct {
|
||||
Day time.Time // 00:00:00 UTC
|
||||
TotalSizeBytes *int64
|
||||
SnapshotCount *int64
|
||||
}
|
||||
|
||||
func (s *Store) ListHostRepoStatsHistory(
|
||||
ctx context.Context, hostID string, since time.Time,
|
||||
) ([]RepoStatsHistoryPoint, error)
|
||||
```
|
||||
|
||||
Returns rows ordered by `day` ascending where at least one metric
|
||||
is non-null. The renderer connects available points with a
|
||||
straight line — there is no explicit gap representation. A host
|
||||
that was offline for a week shows a single segment spanning the
|
||||
gap, which is the right visual: the repo state didn't change.
|
||||
|
||||
## Rendering
|
||||
|
||||
New package `internal/web/sparkline`. Pure Go, no template
|
||||
dependency:
|
||||
|
||||
```go
|
||||
type Series struct {
|
||||
Name string
|
||||
Points []float64 // nil-points represented as math.NaN
|
||||
Stroke string // CSS color
|
||||
}
|
||||
|
||||
func RenderSparkline(points []float64, width, height int) template.HTML
|
||||
func RenderChart(series []Series, days []time.Time, opts ChartOpts) template.HTML
|
||||
```
|
||||
|
||||
`RenderChart` produces a 600×220 SVG with:
|
||||
|
||||
- Light horizontal gridlines (4 bands).
|
||||
- Two y-axes: bytes (left, blue) and count (right, amber). Each
|
||||
series is normalised against its own axis.
|
||||
- X-axis labels at start, midpoint, and end of the window.
|
||||
- Per-point `<circle>` with a `<title>` for hover tooltips —
|
||||
accessible by default, no JS.
|
||||
- Empty state: faint dashed baseline + centered "no data yet"
|
||||
text.
|
||||
|
||||
Sparkline is 80×20, single blue polyline, single `<title>` on the
|
||||
group element showing `"current → 30d ago"`.
|
||||
|
||||
Two new partials:
|
||||
|
||||
- `web/templates/partials/repo_size_sparkline.html`
|
||||
- `web/templates/partials/repo_size_chart.html`
|
||||
|
||||
Both call into the renderer with the appropriate opts. No
|
||||
inline `<style>` — colours come from existing Tailwind palette
|
||||
classes already used elsewhere (`text-blue-500`, `text-amber-500`).
|
||||
|
||||
## UI placement
|
||||
|
||||
### Dashboard host row
|
||||
|
||||
`web/templates/partials/host_row.html` gains one `<td>` between
|
||||
the existing "Repo size" cell and "Snapshots" cell. Width ≈ 88px.
|
||||
Cell renders the sparkline partial; if `len(points) < 2` the cell
|
||||
shows "—" centred (matches the existing no-data idiom for
|
||||
last-backup time in the same partial).
|
||||
|
||||
The dashboard's existing 5-second htmx live-refresh
|
||||
(`hx-trigger="every 5s ..."` from NS-04) re-renders this cell
|
||||
along with the rest of the row. No extra polling.
|
||||
|
||||
### Host repo page
|
||||
|
||||
`web/templates/pages/host_repo.html` gains a "Trend" panel
|
||||
inserted between the existing summary panel and the maintenance
|
||||
panel. Panel contains:
|
||||
|
||||
- Range pills `30d | 90d | 1y` (anchor links with
|
||||
`hx-get="/hosts/{id}/repo/trend?range=…"` and
|
||||
`hx-target="#repo-trend-chart" hx-swap="outerHTML"`).
|
||||
- The chart partial wrapped in `<div id="repo-trend-chart">`.
|
||||
- A small legend strip below the chart.
|
||||
|
||||
## Endpoints
|
||||
|
||||
- `GET /hosts/{id}/repo/trend?range=30d|90d|1y` — admin/operator,
|
||||
htmx fragment, returns the chart partial. Auth reuses the
|
||||
existing host-scoped middleware on the `/hosts/{id}` family.
|
||||
Invalid `range` falls back to 30d.
|
||||
|
||||
No new admin-only surface — anyone with read access to the host
|
||||
can see the trend.
|
||||
|
||||
## Testing
|
||||
|
||||
- `internal/store/host_repo_stats_history_test.go` — upsert
|
||||
merges partial patches without nulling; ordering; since-day
|
||||
filter; cascade on host delete.
|
||||
- `internal/web/sparkline/sparkline_test.go` — golden SVG files
|
||||
for: empty input, single point, full 30-day series, mixed
|
||||
null points. Goldens live under `testdata/`.
|
||||
- `internal/server/http/ui_repo_test.go` — trend panel renders
|
||||
with seeded history; range selector swaps server-side; empty
|
||||
state.
|
||||
- `internal/server/http/ui_dashboard_test.go` — host row sparkline
|
||||
cell present and renders SVG when points exist, "—" when not.
|
||||
- Smoke after build: dashboard row shows sparkline once two days
|
||||
of data exist; repo page chart toggles cleanly between ranges.
|
||||
|
||||
## Migration / rollout
|
||||
|
||||
- Schema migration is additive — no risk to existing tables.
|
||||
- Write path is best-effort; on schema issue the main repo-stats
|
||||
upsert is unaffected.
|
||||
- No agent change required, so no fleet update needed.
|
||||
|
||||
## Acceptance
|
||||
|
||||
- After two days of operation, the dashboard sparkline shows a
|
||||
visible line for any host that has run a backup or
|
||||
maintenance op on both days.
|
||||
- Host repo page renders the trend panel with the snapshot-count
|
||||
overlay; range selector switches view without a full page
|
||||
reload.
|
||||
- `go test ./...` and `go vet ./...` clean.
|
||||
- Smoke env exercise: backup → sparkline updates; range pills
|
||||
swap; FK cascade verified by deleting a host and checking the
|
||||
history table.
|
||||
@@ -1,126 +0,0 @@
|
||||
# Threat model
|
||||
|
||||
A short, structured walkthrough of the assets restic-manager
|
||||
protects, the actors that interact with it, the attack surfaces
|
||||
exposed, and the mitigations in place. This document is written for
|
||||
operators considering a deployment and for contributors evaluating
|
||||
security-sensitive changes. It is **not** a formal certification —
|
||||
restic-manager has not been third-party audited.
|
||||
|
||||
Last reviewed: **2026-05-09** (against v1.0.0).
|
||||
|
||||
---
|
||||
|
||||
## 1. Assets
|
||||
|
||||
In rough order of sensitivity:
|
||||
|
||||
| Asset | Why it matters |
|
||||
|---|---|
|
||||
| **Restic repository passwords** | Decrypt every backup in the repo. Server holds them encrypted at rest; agents need plaintext at backup-time. |
|
||||
| **Repository URLs with embedded credentials** (e.g. `rest:https://user:pass@host/repo`) | Same as above — read access to the repo is leak-equivalent to the password. |
|
||||
| **Agent bearer tokens** | Long-lived credentials authenticating each agent → server WS. Compromise lets an attacker impersonate that host (push fake snapshots, ack fake schedule versions, exfiltrate repo creds the server pushes back). |
|
||||
| **Server session cookies** | Browser-side session for human operators. Compromise = full UI access at the user's role for the cookie's TTL (24h). |
|
||||
| **Database secret key** | Wraps every encrypted-at-rest field (repo creds, agent enrolment payloads). Disclosure of the key file lets an attacker decrypt the stored repo credentials, and with them the backups; rotation requires re-pushing creds to every agent. |
|
||||
| **Bootstrap / setup tokens** | One-shot, time-limited; mint admin or invited-user accounts. |
|
||||
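| **Audit log** | Record of admin actions; append-only from the application and read-only via the UI, but not tamper-proof against OS-level access to the database file (see §3.8). |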
|
||||
| **Backup data on the wire** | Restic itself encrypts on the agent before sending — see "out of scope". |
|
||||
|
||||
---
|
||||
|
||||
## 2. Actors
|
||||
|
||||
| Actor | Trust |
|
||||
|---|---|
|
||||
| **Anonymous internet** | Untrusted. Should not reach the server unless proxied behind auth (see deployment guide). |
|
||||
| **Authenticated viewer** | Read-only on hosts/jobs/alerts/audit. |
|
||||
| **Authenticated operator** | Add/remove hosts, edit schedules, run backups/restores, mint enrolment tokens, ack alerts. |
|
||||
| **Authenticated admin** | All of the above plus user management, role changes, and fleet update controls. Admins do **not** get secret-key visibility (see below). |
|
||||
| **Agent** | Trusted to backup-and-report on its own host only. Cannot read other hosts' creds. Bearer-authenticated. |
|
||||
| **Restic backend (rest-server / S3 / B2 / etc.)** | Out of scope for this document — assumed to authenticate the credentials presented and not collude. |
|
||||
|
||||
---
|
||||
|
||||
## 3. Attack surfaces and mitigations
|
||||
|
||||
### 3.1 First-run bootstrap
|
||||
|
||||
- **Surface**: `/bootstrap` UI + `/api/bootstrap` JSON endpoint.
|
||||
- **Risk**: race between server start and admin creation — an attacker who reaches the server first can claim admin.
|
||||
- **Mitigations**:
|
||||
- Bootstrap token printed to stderr exactly once; held in memory, not persisted.
|
||||
- The UI form on `/bootstrap` uses the in-memory token automatically (no token field for the operator to type or expose).
|
||||
- Both surfaces self-disable the moment any user row exists (`CountUsers > 0`).
|
||||
- Token is also blanked from process memory after success (defence in depth).
|
||||
- **Residual risk**: if an operator brings up the server on the public internet before reaching the bootstrap page, an attacker reaching `/bootstrap` first wins. **Recommendation**: bring the server up behind an existing trusted network or with the listener bound to `127.0.0.1` until first-run is complete.
|
||||
|
||||
### 3.2 Local user accounts
|
||||
|
||||
- **Surface**: `/login`, `/api/auth/login`.
|
||||
- **Mitigations**: Argon2id password hashing with per-deployment params; constant-time password compare; session-cookie minting via `crypto/rand`; session rows store only a hash of the token (the raw token exists only in the cookie).
|
||||
- **Rate limiting**: Currently not in place at the application layer — the project assumes a reverse proxy enforces login throttling. **Recommendation**: front the server with `caddy`/`nginx` rate-limit rules in production.
|
||||
- **Password policy**: 12-character minimum on bootstrap and user-setup paths; no maximum, no rotation, no history. Sufficient for self-hosted ops; tighten in policy if a deployment requires it.
|
||||
|
||||
### 3.3 OIDC SSO
|
||||
|
||||
- **Surface**: `/auth/oidc/*` — generic OIDC client, JIT user provisioning.
|
||||
- **Mitigations**: state + nonce per flow; role mapping is server-configured (claims trusted only to identify the user, not pick role); user-disabled gate runs after IdP success.
|
||||
- **Residual risk**: misconfigured role-mapping rules can promote any IdP user to admin. **Recommendation**: review `cfg.OIDC.RoleMappings` carefully.
|
||||
|
||||
### 3.4 Agent enrolment
|
||||
|
||||
- **Surface**: `/api/agents/enroll` (token-authenticated), `/api/agents/announce` (anonymous, then operator-approves).
|
||||
- **Mitigations**:
|
||||
- Token path: one-shot, hashed at rest, 1h TTL; agent receives a fresh long-lived bearer in the response.
|
||||
- Announce path: agent supplies an Ed25519 public key; operator sees a fingerprint to confirm out-of-band before accepting.
|
||||
- Bearer tokens are SHA-256 hashed in the DB.
|
||||
- **Residual risk**: an attacker on the network between operator and target host who intercepts the install snippet can enrol *as* the target. The install script must be served over TLS in production (the docker-only deployment enables TLS by default; bare-metal deployers must configure their own).
|
||||
|
||||
### 3.5 Agent → server WebSocket
|
||||
|
||||
- **Surface**: persistent WS authenticated by agent bearer.
|
||||
- **Mitigations**: bearer is presented per-connection; server pins the agent fingerprint for the announce flow; messages are envelope-typed and rejected if shape-invalid.
|
||||
- **No payload-level signing** today — TLS is the integrity boundary. A man-in-the-middle with a valid cert chain could swap messages. **Recommendation**: pin the server cert via `RM_SERVER_CERT_PIN_SHA256` if running over a network you don't fully control.
|
||||
|
||||
### 3.6 Repo credential lifecycle
|
||||
|
||||
- Stored encrypted at rest under the AEAD secret key.
|
||||
- Pushed to the agent over the WS on hello, on creds change, and on demand.
|
||||
- Agent persists them encrypted (per-host secret key derived from a value known only to the agent).
|
||||
- Logged surfaces use `restic.RedactURL()` to strip `user:pass@` from URLs before they reach `slog`.
|
||||
- Plaintext form is constructed only at `exec.Command` time inside the agent, never stored on a struct field that could end up in `slog` output.
|
||||
|
||||
### 3.7 Restore
|
||||
|
||||
- Operators can restore to any path the agent (running as root) can write.
|
||||
- Cross-host restore (host A's snapshot → host C) is **deferred** — see F-01. The current single-host restore does not require granting any cross-host privileges.
|
||||
|
||||
### 3.8 Audit log
|
||||
|
||||
- Append-only writes from the application; SQLite enforces no schema-level immutability.
|
||||
- A compromise of the SQLite file (via OS-level access) can edit the audit log. **Recommendation**: ship audit entries to an append-only sink (syslog / Loki / Splunk) if tamper-evidence beyond the OS boundary is required.
|
||||
|
||||
### 3.9 Self-update channel (P6)
|
||||
|
||||
- Agents fetch new binaries via the WS transport from the server.
|
||||
- Binaries are signature-checked by the agent against a key embedded in the existing agent (see `internal/fleetupdate/`).
|
||||
- **Residual risk**: a server compromise lets the attacker push code to every agent (running as root). The signing-key compromise window is the same as the server compromise window because both live on the server. Splitting the signing key onto a separate signer is future work (not v1).
|
||||
|
||||
---
|
||||
|
||||
## 4. Out of scope
|
||||
|
||||
- **Restic itself** — its repository format, encryption, and backend protocol are upstream-trusted.
|
||||
- **The host OS** — root compromise of a host obviously compromises that host's backups.
|
||||
- **The backup destination** — restic-manager assumes the rest-server / object-store / SFTP target enforces its own auth.
|
||||
- **Side-channel attacks** on the server process (RAM dump, process tracing).
|
||||
- **Physical access** to the server's disk.
|
||||
|
||||
---
|
||||
|
||||
## 5. Reporting
|
||||
|
||||
Found something we missed? See `SECURITY.md` for the disclosure
|
||||
process. Coordinated disclosure preferred; the project is
|
||||
maintained by a small team and we'll respond as quickly as we
|
||||
reasonably can.
|
||||
@@ -33,7 +33,7 @@ COPY --from=build /out/restic-manager-agent /usr/local/bin/restic-manager-agent
|
||||
USER root
|
||||
|
||||
# The agent needs a writable directory for its config + secrets store.
|
||||
RUN mkdir -p /etc/restic-manager /var/lib/restic-manager
|
||||
RUN mkdir -p /etc/restic-manager /var/lib/restic-manager-agent
|
||||
ENV RM_AGENT_CONFIG=/etc/restic-manager/agent.yaml
|
||||
|
||||
# The compose entrypoint sets the announce URL via env.
|
||||
|
||||
+5
-10
@@ -60,22 +60,14 @@ services:
|
||||
# with a few files so the snapshot list isn't empty.
|
||||
- source-data:/source
|
||||
- agent-config:/etc/restic-manager
|
||||
- agent-state:/var/lib/restic-manager
|
||||
- agent-state:/var/lib/restic-manager-agent
|
||||
networks: [rmnet]
|
||||
|
||||
# Playwright test runner. Profile-gated so `compose up` doesn't
|
||||
# start it; CI invokes it via `compose run` and `docker cp`s the
|
||||
# report+traces out (see .gitea/workflows/e2e.yml). Lives on
|
||||
# start it; CI runs it via `compose run --rm playwright`. Lives on
|
||||
# rmnet so it can reach the server via its compose-network DNS
|
||||
# name rather than depending on host port-publish (which doesn't
|
||||
# work on Gitea's container-based runners).
|
||||
#
|
||||
# Reports are NOT bind-mounted: when the runner job itself runs
|
||||
# inside a container, `./playwright/...` resolves to a path that
|
||||
# only exists inside the runner container, so the host docker
|
||||
# daemon would silently mount an empty dir. Instead the report
|
||||
# stays inside the playwright container and the workflow extracts
|
||||
# it via `docker cp` before tearing down.
|
||||
playwright:
|
||||
profiles: [test]
|
||||
build:
|
||||
@@ -84,6 +76,9 @@ services:
|
||||
environment:
|
||||
RM_BASE_URL: "http://server:8080"
|
||||
RM_BOOTSTRAP_TOKEN: "${RM_BOOTSTRAP_TOKEN:-}"
|
||||
volumes:
|
||||
- ./playwright/playwright-report:/work/playwright-report
|
||||
- ./playwright/test-results:/work/test-results
|
||||
depends_on:
|
||||
- server
|
||||
- agent
|
||||
|
||||
@@ -10,11 +10,7 @@ const baseURL = process.env.RM_BASE_URL ?? 'http://127.0.0.1:8080';
|
||||
|
||||
export default defineConfig({
|
||||
testDir: './tests',
|
||||
// 4 minutes — the smoke test waits for: enrolment + bootstrap
|
||||
// (~5s), auto-init landing (~10s), backup completion (~120s
|
||||
// budget). 60s is far too tight in CI; 4m gives headroom even
|
||||
// on a contended runner without masking real regressions.
|
||||
timeout: 240_000,
|
||||
timeout: 60_000,
|
||||
expect: { timeout: 10_000 },
|
||||
fullyParallel: false,
|
||||
retries: process.env.CI ? 1 : 0,
|
||||
|
||||
@@ -10,7 +10,6 @@ export interface HostJSON {
|
||||
id: string;
|
||||
name: string;
|
||||
status: string;
|
||||
repo_status?: string;
|
||||
last_backup_status?: string;
|
||||
}
|
||||
|
||||
@@ -107,43 +106,6 @@ export async function waitForHostStatus(
|
||||
throw new Error(`waitForHostStatus: timeout. Last seen: ${JSON.stringify(last)}`);
|
||||
}
|
||||
|
||||
export async function createSourceGroup(
|
||||
request: APIRequestContext,
|
||||
cookie: string,
|
||||
hostID: string,
|
||||
body: { name: string; includes: string[]; excludes?: string[] },
|
||||
): Promise<string> {
|
||||
const res = await request.post(`${baseURL}/api/hosts/${hostID}/source-groups`, {
|
||||
headers: { cookie, 'content-type': 'application/json' },
|
||||
data: {
|
||||
name: body.name,
|
||||
includes: body.includes,
|
||||
excludes: body.excludes ?? [],
|
||||
retention_policy: {},
|
||||
retry_max: 0,
|
||||
retry_backoff_seconds: 0,
|
||||
},
|
||||
});
|
||||
if (!res.ok()) throw new Error(`createSourceGroup: ${res.status()} ${await res.text()}`);
|
||||
const created = (await res.json()) as { id?: string; group?: { id?: string } };
|
||||
const id = created.id ?? created.group?.id;
|
||||
if (!id) throw new Error(`createSourceGroup: no id in response: ${JSON.stringify(created)}`);
|
||||
return id;
|
||||
}
|
||||
|
||||
export async function runSourceGroup(
|
||||
request: APIRequestContext,
|
||||
cookie: string,
|
||||
hostID: string,
|
||||
groupID: string,
|
||||
): Promise<void> {
|
||||
const res = await request.post(
|
||||
`${baseURL}/api/hosts/${hostID}/source-groups/${groupID}/run`,
|
||||
{ headers: { cookie } },
|
||||
);
|
||||
if (!res.ok()) throw new Error(`runSourceGroup: ${res.status()} ${await res.text()}`);
|
||||
}
|
||||
|
||||
export async function getSessionCookie(page: Page): Promise<string> {
|
||||
const cookies = await page.context().cookies();
|
||||
const c = cookies.find((c) => c.name === 'rm_session');
|
||||
|
||||
@@ -14,13 +14,11 @@ import {
|
||||
waitForPendingHostID,
|
||||
acceptPending,
|
||||
waitForHostStatus,
|
||||
createSourceGroup,
|
||||
runSourceGroup,
|
||||
getSessionCookie,
|
||||
} from './lib/server';
|
||||
|
||||
test.describe('smoke: enrol-via-announce → backup', () => {
|
||||
test('happy path: enrol → accept → backup → succeeded', async ({ page, request }) => {
|
||||
test('happy path completes in under a minute', async ({ page, request }) => {
|
||||
const { username, password } = await bootstrapAdmin(request);
|
||||
await loginViaUI(page, username, password);
|
||||
|
||||
@@ -40,37 +38,29 @@ test.describe('smoke: enrol-via-announce → backup', () => {
|
||||
password: 'e2e-repo-password',
|
||||
});
|
||||
|
||||
// Wait for the host to come online AND for auto-init to
|
||||
// finish. Coming online happens as soon as the agent's
|
||||
// bearer-authed WS attaches (~1s after accept); repo_status
|
||||
// flips to 'ready' once the auto-init job completes (a
|
||||
// couple of seconds later). Loading the host page before
|
||||
// that leaves the Run-backup button disabled because the
|
||||
// server-rendered HTML reflects the still-in-progress init,
|
||||
// and the page has no live-refresh on that field.
|
||||
const readyHost = await waitForHostStatus(
|
||||
// Wait for the host to come online + auto-init to land.
|
||||
const onlineHost = await waitForHostStatus(
|
||||
request, cookie,
|
||||
(h) => h.status === 'online' && h.repo_status === 'ready',
|
||||
90_000,
|
||||
(h) => h.status === 'online',
|
||||
60_000,
|
||||
);
|
||||
expect(readyHost.id).toBeTruthy();
|
||||
expect(onlineHost.id).toBeTruthy();
|
||||
|
||||
// Per-host Run-now is gone; backups are dispatched per
|
||||
// source-group now. Create one that maps to the agent's
|
||||
// /source mount, then kick it via the JSON API.
|
||||
const groupID = await createSourceGroup(request, cookie, readyHost.id, {
|
||||
name: 'default',
|
||||
includes: ['/source'],
|
||||
});
|
||||
await runSourceGroup(request, cookie, readyHost.id, groupID);
|
||||
// Trigger a backup via the UI form-post (HX-Redirect to /jobs/{id}).
|
||||
await page.goto(`${baseURL}/hosts/${onlineHost.id}`);
|
||||
await Promise.all([
|
||||
page.waitForURL(/\/jobs\//),
|
||||
page.locator('form[action$="/run-backup"] button[type="submit"]').first().click(),
|
||||
]);
|
||||
|
||||
// Wait for the host's last_backup_status to flip to 'succeeded'.
|
||||
// The host record is the source of truth: it's what the
|
||||
// dashboard projects from job-completion events on the WS
|
||||
// channel.
|
||||
// The job page itself is harder to assert on (it uses
|
||||
// server-pushed updates and a reload-on-finish pattern); the
|
||||
// host record is the source of truth and is what the dashboard
|
||||
// surfaces.
|
||||
const finishedHost = await waitForHostStatus(
|
||||
request, cookie,
|
||||
(h) => h.id === readyHost.id && h.last_backup_status === 'succeeded',
|
||||
(h) => h.id === onlineHost.id && h.last_backup_status === 'succeeded',
|
||||
120_000,
|
||||
);
|
||||
expect(finishedHost.last_backup_status).toBe('succeeded');
|
||||
@@ -78,9 +68,12 @@ test.describe('smoke: enrol-via-announce → backup', () => {
|
||||
});
|
||||
|
||||
test.describe('smoke: scrape /metrics', () => {
|
||||
test('metrics endpoint exposes the host gauge', async ({ request }) => {
|
||||
// Compose sets RM_METRICS_TRUSTED_CIDR=0.0.0.0/0 so the
|
||||
// endpoint is open to the test runner.
|
||||
// The /metrics endpoint is documented (RM_METRICS_TOKEN /
|
||||
// RM_METRICS_TRUSTED_CIDR, gauges rm_hosts_total / rm_build_info)
|
||||
// but not yet implemented in the server. Skipping until the
|
||||
// Prometheus exposition lands; tracked separately from this
|
||||
// e2e harness.
|
||||
test.skip('metrics endpoint exposes the host gauge', async ({ request }) => {
|
||||
const res = await request.get(`${baseURL}/metrics`);
|
||||
expect(res.status()).toBe(200);
|
||||
const body = await res.text();
|
||||
|
||||
@@ -2,14 +2,10 @@ package runner
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"sync"
|
||||
"syscall"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/restic"
|
||||
@@ -47,22 +43,13 @@ func (s *fakeSender) snapshot() []api.Envelope {
|
||||
// setupScript writes a shell script (without shebang) to a temp dir,
|
||||
// names it "restic", makes it executable, and returns the path.
|
||||
//
|
||||
// Writes to "<path>.tmp" then renames into place. The rename is the
|
||||
// usual guard against ETXTBSY: under -race + many t.Parallel tests,
|
||||
// a fork-from-another-goroutine can inherit the writable fd from
|
||||
// Writes to "<path>.tmp" then renames into place. The rename is what
|
||||
// makes this race-free: under -race + many t.Parallel tests, a
|
||||
// fork-from-another-goroutine can inherit the writable fd from
|
||||
// os.WriteFile before close completes, and exec'ing the file then
|
||||
// returns ETXTBSY ("text file busy"). The renamed dirent points at
|
||||
// an inode that has no writable fd open anywhere — exec is safe on
|
||||
// a vanilla filesystem.
|
||||
//
|
||||
// On overlayfs (every job that runs inside a `container:` block on
|
||||
// our Gitea runner), the rename can briefly leak ETXTBSY anyway —
|
||||
// the upper layer's "writable inode" bookkeeping lags the userspace
|
||||
// close. To make the helper deterministic across environments, we
|
||||
// probe-exec the file with a benign argument until exec succeeds,
|
||||
// then return. Each script body has a `case "$1" in ... esac` shape
|
||||
// where unknown args fall through to a clean exit, so the probe is
|
||||
// a no-op from the test's point of view.
|
||||
// returns ETXTBSY ("text file busy"). Once the rename lands, the
|
||||
// final path is a fresh dirent pointing at an inode that has no
|
||||
// writable fd open anywhere — exec is safe.
|
||||
func setupScript(t *testing.T, body string) string {
|
||||
t.Helper()
|
||||
dir := t.TempDir()
|
||||
@@ -74,22 +61,8 @@ func setupScript(t *testing.T, body string) string {
|
||||
if err := os.Rename(tmp, final); err != nil {
|
||||
t.Fatalf("setupScript: rename: %v", err)
|
||||
}
|
||||
|
||||
deadline := time.Now().Add(3 * time.Second)
|
||||
for {
|
||||
err := exec.Command(final, "__rm_probe__").Run()
|
||||
if err == nil {
|
||||
return final
|
||||
}
|
||||
if !errors.Is(err, syscall.ETXTBSY) {
|
||||
t.Fatalf("setupScript: probe exec: %v", err)
|
||||
}
|
||||
if time.Now().After(deadline) {
|
||||
t.Fatalf("setupScript: %s still ETXTBSY after 3s", final)
|
||||
}
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
}
|
||||
}
|
||||
|
||||
// firstEnvOfType returns the first envelope with the given type, or
|
||||
// fails the test if none is found.
|
||||
|
||||
@@ -22,12 +22,6 @@ import (
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||
)
|
||||
|
||||
// staleBackupThreshold is how long an intermittent host may go without
|
||||
// a successful backup before we raise a stale_schedule alert. Global
|
||||
// constant for v1 (may become per-host later). Only intermittent hosts
|
||||
// are evaluated — always-on hosts' stale_schedule stays a no-op.
|
||||
const staleBackupThreshold = 7 * 24 * time.Hour
|
||||
|
||||
// JobFinishedEvent carries everything the engine needs to evaluate
|
||||
// the failed-X rules. Pushed via Engine.NotifyJobFinished from the
|
||||
// MarkJobFinished site.
|
||||
@@ -155,10 +149,6 @@ func (e *Engine) handleJobFinished(ctx context.Context, ev JobFinishedEvent) {
|
||||
fmt.Sprintf("%s job %s failed", ev.Kind, ev.JobID), ev.When)
|
||||
case "succeeded":
|
||||
e.resolveAndNotify(ctx, ev.HostID, kind, dedupKey, ev.When)
|
||||
if ev.Kind == "backup" {
|
||||
// A fresh backup clears staleness for intermittent hosts.
|
||||
e.resolveAndNotify(ctx, ev.HostID, KindStaleSchedule, "", ev.When)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -167,12 +157,6 @@ func (e *Engine) handleHostOffline(ctx context.Context, hostID string) {
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
// Intermittent hosts (laptops) legitimately disappear — never raise
|
||||
// agent_offline for them. The stale_schedule sweep in tick() is the
|
||||
// only staleness signal for these hosts.
|
||||
if !host.AlwaysOn {
|
||||
return
|
||||
}
|
||||
// Apply the 15-min floor — raise only when last_seen_at is older
|
||||
// than agentOfflineFloor. A nil last_seen_at (host enrolled but
|
||||
// never connected) is treated as "now" so we don't raise
|
||||
@@ -196,9 +180,11 @@ func (e *Engine) handleHostOnline(ctx context.Context, hostID string) {
|
||||
// tick is the 60-second sweep. Responsibilities:
|
||||
// 1. Re-evaluate agent_offline for every offline host that may have
|
||||
// crossed the floor between explicit events.
|
||||
// 2. Stale-schedule detection for intermittent hosts — raises
|
||||
// stale_schedule when LastBackupAt is older than 7 days and the
|
||||
// host has an enabled schedule. Always-on hosts are excluded.
|
||||
// 2. Stale-schedule detection — declared in the spec but intentionally
|
||||
// left as a no-op in v1. The precise "expected to have fired but
|
||||
// didn't" trigger requires a store helper that lands in a later
|
||||
// task. The KindStaleSchedule constant is exported so UI code can
|
||||
// reference the tag string today.
|
||||
func (e *Engine) tick(ctx context.Context, now time.Time) {
|
||||
// User-management cleanup piggy-backed here for now. Setup tokens
|
||||
// have a 1h expiry; the alert engine tick is the cheapest existing
|
||||
@@ -217,35 +203,6 @@ func (e *Engine) tick(ctx context.Context, now time.Time) {
|
||||
return
|
||||
}
|
||||
for _, h := range hosts {
|
||||
// Intermittent hosts: suppress agent_offline entirely; instead
|
||||
// raise stale_schedule when they have gone too long with no
|
||||
// successful backup AND they have at least one enabled schedule
|
||||
// to be measured against. A nil LastBackupAt (never backed up)
|
||||
// has no baseline — onboarding/repo_status covers that case.
|
||||
if !h.AlwaysOn {
|
||||
if h.LastBackupAt == nil {
|
||||
continue
|
||||
}
|
||||
if now.Sub(*h.LastBackupAt) < staleBackupThreshold {
|
||||
continue
|
||||
}
|
||||
hasEnabled, err := e.hostHasEnabledSchedule(ctx, h.ID)
|
||||
if err != nil {
|
||||
slog.Warn("alert: tick list schedules", "host_id", h.ID, "err", err)
|
||||
continue
|
||||
}
|
||||
if !hasEnabled {
|
||||
continue
|
||||
}
|
||||
e.raiseAndNotify(ctx, h.ID, KindStaleSchedule, "", "warning",
|
||||
fmt.Sprintf("No backup in %s (threshold %s)",
|
||||
roundDur(now.Sub(*h.LastBackupAt)), staleBackupThreshold), now)
|
||||
// Resolution is handled in handleJobFinished on a successful
|
||||
// backup (and ResolveOnModeChange on toggle) — the tick only
|
||||
// raises, it does not auto-resolve.
|
||||
continue
|
||||
}
|
||||
// Always-on hosts: existing agent_offline re-evaluation.
|
||||
if h.Status != "offline" || h.LastSeenAt == nil {
|
||||
continue
|
||||
}
|
||||
@@ -255,6 +212,7 @@ func (e *Engine) tick(ctx context.Context, now time.Time) {
|
||||
roundDur(now.Sub(*h.LastSeenAt)), e.agentOfflineFloor), now)
|
||||
}
|
||||
}
|
||||
// Stale-schedule sweep — no-op in v1. See KindStaleSchedule doc comment.
|
||||
}
|
||||
|
||||
// roundDur returns a human-readable duration string, rounding to the
|
||||
@@ -266,19 +224,3 @@ func roundDur(d time.Duration) string {
|
||||
}
|
||||
return d.Round(time.Minute).String()
|
||||
}
|
||||
|
||||
// hostHasEnabledSchedule reports whether the host has at least one
|
||||
// enabled backup schedule — the precondition for a stale_schedule
|
||||
// alert (no schedule = no backup expectation to measure against).
|
||||
func (e *Engine) hostHasEnabledSchedule(ctx context.Context, hostID string) (bool, error) {
|
||||
schedules, err := e.store.ListSchedulesByHost(ctx, hostID)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
for _, sc := range schedules {
|
||||
if sc.Enabled {
|
||||
return true, nil
|
||||
}
|
||||
}
|
||||
return false, nil
|
||||
}
|
||||
|
||||
@@ -1,255 +0,0 @@
|
||||
package alert
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/oklog/ulid/v2"
|
||||
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||
)
|
||||
|
||||
// TestIntermittentHostSuppressesOfflineAlert checks that handleHostOffline
|
||||
// does NOT raise agent_offline for a host with AlwaysOn=false.
|
||||
func TestIntermittentHostSuppressesOfflineAlert(t *testing.T) {
|
||||
t.Parallel()
|
||||
eng, st, hostID := setupEngine(t)
|
||||
ctx := context.Background()
|
||||
|
||||
// Make the host intermittent.
|
||||
if err := st.SetHostAlwaysOn(ctx, hostID, false); err != nil {
|
||||
t.Fatalf("SetHostAlwaysOn: %v", err)
|
||||
}
|
||||
|
||||
// Give it a stale last_seen_at well past the floor.
|
||||
if _, err := st.DB().Exec(
|
||||
`UPDATE hosts SET last_seen_at = ?, status = ? WHERE id = ?`,
|
||||
time.Now().UTC().Add(-2*time.Hour).Format(time.RFC3339Nano),
|
||||
"offline",
|
||||
hostID,
|
||||
); err != nil {
|
||||
t.Fatalf("update last_seen_at: %v", err)
|
||||
}
|
||||
|
||||
eng.handleHostOffline(ctx, hostID)
|
||||
|
||||
open, _ := st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
|
||||
if len(open) != 0 {
|
||||
t.Fatalf("expected 0 open alerts for intermittent host; got %d: %+v", len(open), open)
|
||||
}
|
||||
}
|
||||
|
||||
// TestAlwaysOnHostStillRaisesOfflineAlert checks that always-on hosts still
|
||||
// get an agent_offline alert when offline past the floor.
|
||||
func TestAlwaysOnHostStillRaisesOfflineAlert(t *testing.T) {
|
||||
t.Parallel()
|
||||
eng, st, hostID := setupEngine(t)
|
||||
ctx := context.Background()
|
||||
|
||||
// always_on=true is the default, but be explicit.
|
||||
if err := st.SetHostAlwaysOn(ctx, hostID, true); err != nil {
|
||||
t.Fatalf("SetHostAlwaysOn: %v", err)
|
||||
}
|
||||
|
||||
// Give it a stale last_seen_at well past the 15m floor.
|
||||
if _, err := st.DB().Exec(
|
||||
`UPDATE hosts SET last_seen_at = ?, status = ? WHERE id = ?`,
|
||||
time.Now().UTC().Add(-2*time.Hour).Format(time.RFC3339Nano),
|
||||
"offline",
|
||||
hostID,
|
||||
); err != nil {
|
||||
t.Fatalf("update last_seen_at: %v", err)
|
||||
}
|
||||
|
||||
eng.handleHostOffline(ctx, hostID)
|
||||
|
||||
open, _ := st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
|
||||
if len(open) != 1 || open[0].Kind != KindAgentOffline {
|
||||
t.Fatalf("expected 1 agent_offline alert; got %d: %+v", len(open), open)
|
||||
}
|
||||
}
|
||||
|
||||
// TestStalenessAlertForIntermittentHost checks that tick raises stale_schedule
|
||||
// for an intermittent host whose last backup is older than 7 days AND has an
|
||||
// enabled schedule. Also verifies that a succeeded backup clears the alert.
|
||||
func TestStalenessAlertForIntermittentHost(t *testing.T) {
|
||||
t.Parallel()
|
||||
eng, st, hostID := setupEngine(t)
|
||||
ctx := context.Background()
|
||||
|
||||
// Make intermittent.
|
||||
if err := st.SetHostAlwaysOn(ctx, hostID, false); err != nil {
|
||||
t.Fatalf("SetHostAlwaysOn: %v", err)
|
||||
}
|
||||
|
||||
// Create a source group to attach the schedule to.
|
||||
sgID := ulid.Make().String()
|
||||
if err := st.CreateSourceGroup(ctx, &store.SourceGroup{
|
||||
ID: sgID,
|
||||
HostID: hostID,
|
||||
Name: "default",
|
||||
Includes: []string{"/home"},
|
||||
}); err != nil {
|
||||
t.Fatalf("CreateSourceGroup: %v", err)
|
||||
}
|
||||
|
||||
// Create an enabled schedule pointing at the source group.
|
||||
schedID := ulid.Make().String()
|
||||
if err := st.CreateSchedule(ctx, &store.Schedule{
|
||||
ID: schedID,
|
||||
HostID: hostID,
|
||||
CronExpr: "0 2 * * *",
|
||||
Enabled: true,
|
||||
SourceGroupIDs: []string{sgID},
|
||||
}); err != nil {
|
||||
t.Fatalf("CreateSchedule: %v", err)
|
||||
}
|
||||
|
||||
// Set last_backup_at to 8 days ago.
|
||||
eightDaysAgo := time.Now().UTC().Add(-8 * 24 * time.Hour)
|
||||
if err := st.SetHostLastBackup(ctx, hostID, "succeeded", eightDaysAgo); err != nil {
|
||||
t.Fatalf("SetHostLastBackup: %v", err)
|
||||
}
|
||||
|
||||
eng.tick(ctx, time.Now().UTC())
|
||||
|
||||
open, _ := st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
|
||||
var staleCount int
|
||||
for _, a := range open {
|
||||
if a.Kind == KindStaleSchedule {
|
||||
staleCount++
|
||||
}
|
||||
}
|
||||
if staleCount != 1 {
|
||||
t.Fatalf("expected 1 stale_schedule alert after tick; got %d (all open: %+v)", staleCount, open)
|
||||
}
|
||||
|
||||
// A succeeded backup should clear the stale_schedule alert.
|
||||
eng.handleJobFinished(ctx, JobFinishedEvent{
|
||||
HostID: hostID,
|
||||
JobID: ulid.Make().String(),
|
||||
Kind: "backup",
|
||||
Status: "succeeded",
|
||||
SourceGroupID: sgID,
|
||||
When: time.Now().UTC(),
|
||||
})
|
||||
|
||||
open, _ = st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
|
||||
for _, a := range open {
|
||||
if a.Kind == KindStaleSchedule {
|
||||
t.Fatalf("expected stale_schedule to be resolved after backup succeeded; still open: %+v", a)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestNoStalenessWithoutEnabledSchedule checks that no stale_schedule is
|
||||
// raised for an intermittent host with a stale backup but no enabled schedule.
|
||||
func TestNoStalenessWithoutEnabledSchedule(t *testing.T) {
|
||||
t.Parallel()
|
||||
eng, st, hostID := setupEngine(t)
|
||||
ctx := context.Background()
|
||||
|
||||
// Make intermittent.
|
||||
if err := st.SetHostAlwaysOn(ctx, hostID, false); err != nil {
|
||||
t.Fatalf("SetHostAlwaysOn: %v", err)
|
||||
}
|
||||
|
||||
// Set last_backup_at to 8 days ago — stale — but no schedule.
|
||||
eightDaysAgo := time.Now().UTC().Add(-8 * 24 * time.Hour)
|
||||
if err := st.SetHostLastBackup(ctx, hostID, "succeeded", eightDaysAgo); err != nil {
|
||||
t.Fatalf("SetHostLastBackup: %v", err)
|
||||
}
|
||||
|
||||
eng.tick(ctx, time.Now().UTC())
|
||||
|
||||
open, _ := st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
|
||||
for _, a := range open {
|
||||
if a.Kind == KindStaleSchedule {
|
||||
t.Fatalf("expected no stale_schedule without an enabled schedule; got: %+v", a)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestResolveOnModeChangeClearsOfflineAlert checks that ResolveOnModeChange
|
||||
// clears an open agent_offline alert when a host's mode is toggled.
|
||||
func TestResolveOnModeChangeClearsOfflineAlert(t *testing.T) {
|
||||
t.Parallel()
|
||||
eng, st, hostID := setupEngine(t)
|
||||
ctx := context.Background()
|
||||
|
||||
// Make always-on and set it offline with a stale last_seen_at.
|
||||
if err := st.SetHostAlwaysOn(ctx, hostID, true); err != nil {
|
||||
t.Fatalf("SetHostAlwaysOn: %v", err)
|
||||
}
|
||||
if _, err := st.DB().Exec(
|
||||
`UPDATE hosts SET last_seen_at = ?, status = ? WHERE id = ?`,
|
||||
time.Now().UTC().Add(-2*time.Hour).Format(time.RFC3339Nano),
|
||||
"offline",
|
||||
hostID,
|
||||
); err != nil {
|
||||
t.Fatalf("update last_seen_at: %v", err)
|
||||
}
|
||||
|
||||
// Raise the offline alert.
|
||||
eng.handleHostOffline(ctx, hostID)
|
||||
|
||||
open, _ := st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
|
||||
if len(open) != 1 || open[0].Kind != KindAgentOffline {
|
||||
t.Fatalf("expected 1 agent_offline alert before mode change; got %d: %+v", len(open), open)
|
||||
}
|
||||
|
||||
// Toggle mode — should clear the alert.
|
||||
eng.ResolveOnModeChange(ctx, hostID, time.Now().UTC())
|
||||
|
||||
open, _ = st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
|
||||
for _, a := range open {
|
||||
if a.Kind == KindAgentOffline {
|
||||
t.Fatalf("expected agent_offline to be resolved after mode change; still open: %+v", a)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestNoStalenessWhenNeverBackedUp checks that no stale_schedule alert is
|
||||
// raised for an intermittent host that has never backed up (nil LastBackupAt).
|
||||
func TestNoStalenessWhenNeverBackedUp(t *testing.T) {
|
||||
t.Parallel()
|
||||
eng, st, hostID := setupEngine(t)
|
||||
ctx := context.Background()
|
||||
|
||||
// Make intermittent.
|
||||
if err := st.SetHostAlwaysOn(ctx, hostID, false); err != nil {
|
||||
t.Fatalf("SetHostAlwaysOn: %v", err)
|
||||
}
|
||||
|
||||
// Create a source group and an enabled schedule — but do NOT set LastBackupAt.
|
||||
sgID := ulid.Make().String()
|
||||
if err := st.CreateSourceGroup(ctx, &store.SourceGroup{
|
||||
ID: sgID,
|
||||
HostID: hostID,
|
||||
Name: "default",
|
||||
Includes: []string{"/home"},
|
||||
}); err != nil {
|
||||
t.Fatalf("CreateSourceGroup: %v", err)
|
||||
}
|
||||
|
||||
schedID := ulid.Make().String()
|
||||
if err := st.CreateSchedule(ctx, &store.Schedule{
|
||||
ID: schedID,
|
||||
HostID: hostID,
|
||||
CronExpr: "0 2 * * *",
|
||||
Enabled: true,
|
||||
SourceGroupIDs: []string{sgID},
|
||||
}); err != nil {
|
||||
t.Fatalf("CreateSchedule: %v", err)
|
||||
}
|
||||
|
||||
eng.tick(ctx, time.Now().UTC())
|
||||
|
||||
open, _ := st.ListAlerts(ctx, store.AlertFilter{Status: "open", HostID: hostID})
|
||||
for _, a := range open {
|
||||
if a.Kind == KindStaleSchedule {
|
||||
t.Fatalf("expected no stale_schedule when never backed up; got: %+v", a)
|
||||
}
|
||||
}
|
||||
}
|
||||
+4
-14
@@ -27,10 +27,10 @@ const (
|
||||
// integrity is at risk) when a check job fails.
|
||||
KindCheckFailed = "check_failed"
|
||||
|
||||
// KindStaleSchedule is raised for intermittent (non-always-on) hosts
|
||||
// when their last successful backup is older than staleBackupThreshold
|
||||
// (7 days) and they have at least one enabled schedule. Resolved on
|
||||
// backup success or when the host is switched to always-on mode.
|
||||
// KindStaleSchedule is declared for completeness but intentionally
|
||||
// left as a no-op in v1. The precise "expected to have fired but
|
||||
// didn't" logic requires a store helper that lands in a follow-up
|
||||
// task. Ask the team before implementing.
|
||||
KindStaleSchedule = "stale_schedule"
|
||||
|
||||
// KindAgentOffline is raised when a host's last_seen_at is older
|
||||
@@ -122,16 +122,6 @@ func alertPayload(ctx context.Context, st *store.Store, ev notification.Event, a
|
||||
}
|
||||
}
|
||||
|
||||
// ResolveOnModeChange clears any open agent_offline and stale_schedule
|
||||
// alerts for a host whose always-on flag was just toggled. The next
|
||||
// 60s tick re-raises whichever still applies under the new mode, so
|
||||
// this is a self-correcting "wipe and let the sweep settle" call.
|
||||
// Safe to invoke from the HTTP layer (it only touches the store + hub).
|
||||
func (e *Engine) ResolveOnModeChange(ctx context.Context, hostID string, when time.Time) {
|
||||
e.resolveAndNotify(ctx, hostID, KindAgentOffline, "", when)
|
||||
e.resolveAndNotify(ctx, hostID, KindStaleSchedule, "", when)
|
||||
}
|
||||
|
||||
// resolveAndNotify clears the open (or acknowledged) alert matching
|
||||
// (host_id, kind, dedup_key) via store.AutoResolve, then fires
|
||||
// alert.resolved for the row(s) actually closed. Best-effort —
|
||||
|
||||
@@ -41,24 +41,6 @@ type Config struct {
|
||||
// DataDir. Source-build deployments can override via
|
||||
// RM_BUNDLED_ASSETS_DIR.
|
||||
BundledAssetsDir string `yaml:"bundled_assets_dir"`
|
||||
|
||||
// MetricsToken, if set, gates the /metrics scrape endpoint
|
||||
// behind a `Authorization: Bearer <token>` check (constant-time
|
||||
// compare). When neither this nor MetricsTrustedCIDRs is set,
|
||||
// the route is not mounted at all (the endpoint is opt-in).
|
||||
MetricsToken string `yaml:"metrics_token"`
|
||||
|
||||
// MetricsTrustedCIDRs, if non-empty, gates /metrics so only
|
||||
// callers from these networks may scrape. ANDed with
|
||||
// MetricsToken when both are set.
|
||||
MetricsTrustedCIDRs []string `yaml:"metrics_trusted_cidrs"`
|
||||
}
|
||||
|
||||
// MetricsAuthEnabled reports whether the operator has opted into
|
||||
// exposing the Prometheus scrape endpoint by configuring at least
|
||||
// one auth gate.
|
||||
func (c Config) MetricsAuthEnabled() bool {
|
||||
return c.MetricsToken != "" || len(c.MetricsTrustedCIDRs) > 0
|
||||
}
|
||||
|
||||
// Load resolves config in this order:
|
||||
@@ -111,19 +93,6 @@ func Load(yamlPath string) (Config, error) {
|
||||
if v, ok := os.LookupEnv("RM_BUNDLED_ASSETS_DIR"); ok {
|
||||
c.BundledAssetsDir = v
|
||||
}
|
||||
if v, ok := os.LookupEnv("RM_METRICS_TOKEN"); ok {
|
||||
c.MetricsToken = v
|
||||
}
|
||||
if v, ok := os.LookupEnv("RM_METRICS_TRUSTED_CIDR"); ok {
|
||||
parts := strings.Split(v, ",")
|
||||
c.MetricsTrustedCIDRs = c.MetricsTrustedCIDRs[:0]
|
||||
for _, p := range parts {
|
||||
p = strings.TrimSpace(p)
|
||||
if p != "" {
|
||||
c.MetricsTrustedCIDRs = append(c.MetricsTrustedCIDRs, p)
|
||||
}
|
||||
}
|
||||
}
|
||||
if v, ok := os.LookupEnv("RM_TRUSTED_PROXY"); ok {
|
||||
// Comma-separated CIDRs; allow whitespace for readability.
|
||||
parts := strings.Split(v, ",")
|
||||
@@ -168,10 +137,5 @@ func (c *Config) validate() error {
|
||||
return fmt.Errorf("config: RM_TRUSTED_PROXY entry %q is not a valid CIDR: %w", cidr, err)
|
||||
}
|
||||
}
|
||||
for _, cidr := range c.MetricsTrustedCIDRs {
|
||||
if _, err := netip.ParsePrefix(cidr); err != nil {
|
||||
return fmt.Errorf("config: RM_METRICS_TRUSTED_CIDR entry %q is not a valid CIDR: %w", cidr, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -98,45 +98,6 @@ func TestCookieSecureDefaultAndOverride(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestMetricsAuthGates(t *testing.T) {
|
||||
t.Setenv("RM_LISTEN", ":8080")
|
||||
t.Setenv("RM_DATA_DIR", "/tmp/x")
|
||||
|
||||
c, err := Load("")
|
||||
if err != nil {
|
||||
t.Fatalf("load: %v", err)
|
||||
}
|
||||
if c.MetricsAuthEnabled() {
|
||||
t.Errorf("metrics endpoint should be off by default")
|
||||
}
|
||||
|
||||
t.Setenv("RM_METRICS_TOKEN", "s3cr3t-token-with-enough-bytes")
|
||||
t.Setenv("RM_METRICS_TRUSTED_CIDR", "10.0.0.0/8, 192.168.1.0/24")
|
||||
c, err = Load("")
|
||||
if err != nil {
|
||||
t.Fatalf("load: %v", err)
|
||||
}
|
||||
if c.MetricsToken != "s3cr3t-token-with-enough-bytes" {
|
||||
t.Errorf("token: %q", c.MetricsToken)
|
||||
}
|
||||
if got := c.MetricsTrustedCIDRs; len(got) != 2 || got[0] != "10.0.0.0/8" || got[1] != "192.168.1.0/24" {
|
||||
t.Errorf("cidrs: %v", got)
|
||||
}
|
||||
if !c.MetricsAuthEnabled() {
|
||||
t.Errorf("MetricsAuthEnabled should be true")
|
||||
}
|
||||
}
|
||||
|
||||
func TestMetricsTrustedCIDRRejectsGarbage(t *testing.T) {
|
||||
t.Setenv("RM_LISTEN", ":8080")
|
||||
t.Setenv("RM_DATA_DIR", "/tmp/x")
|
||||
t.Setenv("RM_METRICS_TRUSTED_CIDR", "garbage")
|
||||
|
||||
if _, err := Load(""); err == nil {
|
||||
t.Fatal("expected validation error, got nil")
|
||||
}
|
||||
}
|
||||
|
||||
func writeFile(path string, body []byte) error {
|
||||
return writeFileImpl(path, body)
|
||||
}
|
||||
|
||||
@@ -1,141 +0,0 @@
|
||||
// catchup.go — server-side catch-up for intermittent (non-always-on)
|
||||
// hosts. When such a host reconnects we wait a short settle window,
|
||||
// then dispatch a backup for any schedule whose window elapsed while
|
||||
// the host was asleep. This is separate from pending_runs: a host that
|
||||
// was asleep never fired its local cron, so no pending row exists.
|
||||
package http
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log/slog"
|
||||
"time"
|
||||
)
|
||||
|
||||
// scheduleOverdue reports whether a schedule's most recent expected
|
||||
// fire is newer than the host's last successful backup — i.e. a window
|
||||
// passed with no backup. A nil lastBackup means "never backed up" and
|
||||
// is always overdue (provided the cron parses). An unparseable cron is
|
||||
// treated as not-overdue so a bad expression can never trigger a
|
||||
// surprise dispatch. Uses the same cronParser the agent's scheduler
|
||||
// and schedule validation use, so interpretation is identical.
|
||||
func scheduleOverdue(cronExpr string, lastBackup *time.Time, now time.Time) bool {
|
||||
sched, err := cronParser.Parse(cronExpr)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
if lastBackup == nil {
|
||||
return true
|
||||
}
|
||||
next := sched.Next(*lastBackup)
|
||||
return !next.After(now)
|
||||
}
|
||||
|
||||
// catchupSettle is how long after a reconnect we wait before evaluating
|
||||
// catch-up, so a laptop that wakes briefly and sleeps again doesn't
|
||||
// trigger a backup it can't finish. ~1 minute per the spec.
|
||||
const catchupSettle = 60 * time.Second
|
||||
|
||||
// ArmCatchup records that an intermittent host just reconnected and
|
||||
// should be evaluated for a missed backup after the settle window.
|
||||
// No-op for always-on hosts (caller passes only intermittent hosts).
|
||||
// Re-arming overwrites the timer (debounce — flapping doesn't stack).
|
||||
func (s *Server) ArmCatchup(hostID string, now time.Time) {
|
||||
s.catchupMu.Lock()
|
||||
defer s.catchupMu.Unlock()
|
||||
s.catchupDueAt[hostID] = now.Add(catchupSettle)
|
||||
}
|
||||
|
||||
// dueCatchups returns the hostIDs whose settle window has elapsed and
|
||||
// removes them from the map. Caller evaluates each.
|
||||
func (s *Server) dueCatchups(now time.Time) []string {
|
||||
s.catchupMu.Lock()
|
||||
defer s.catchupMu.Unlock()
|
||||
var due []string
|
||||
for id, at := range s.catchupDueAt {
|
||||
if !now.Before(at) {
|
||||
due = append(due, id)
|
||||
delete(s.catchupDueAt, id)
|
||||
}
|
||||
}
|
||||
return due
|
||||
}
|
||||
|
||||
// RunCatchupsDue is the tick entrypoint. For each host past its settle
|
||||
// window it dispatches a backup for every enabled schedule that is
|
||||
// overdue. Skips hosts that bounced back offline, that are already
|
||||
// running/queued a job, or that turned out to be always-on.
|
||||
func (s *Server) RunCatchupsDue(ctx context.Context) {
|
||||
if s.deps.Hub == nil {
|
||||
return
|
||||
}
|
||||
now := time.Now().UTC()
|
||||
for _, hostID := range s.dueCatchups(now) {
|
||||
s.runCatchup(ctx, hostID, now)
|
||||
}
|
||||
}
|
||||
|
||||
// runCatchup evaluates and dispatches catch-up backups for a single
|
||||
// host. Kept separate so RunCatchupsDue reads cleanly.
|
||||
func (s *Server) runCatchup(ctx context.Context, hostID string, now time.Time) {
|
||||
conn := s.deps.Hub.Conn(hostID)
|
||||
if conn == nil {
|
||||
return // bounced offline during the settle window; re-arms on next hello
|
||||
}
|
||||
host, err := s.deps.Store.GetHost(ctx, hostID)
|
||||
if err != nil {
|
||||
slog.Warn("catchup: load host", "host_id", hostID, "err", err)
|
||||
return
|
||||
}
|
||||
if host.AlwaysOn {
|
||||
return // mode flipped during settle window
|
||||
}
|
||||
// Skip if a backup is already queued or running for this host —
|
||||
// don't pile a catch-up on top of in-flight work. (hosts.current_job_id
|
||||
// is not maintained, so we check the jobs table directly.)
|
||||
active, err := s.deps.Store.HasActiveBackupJob(ctx, hostID)
|
||||
if err != nil {
|
||||
slog.Warn("catchup: check active backup", "host_id", hostID, "err", err)
|
||||
return
|
||||
}
|
||||
if active {
|
||||
return
|
||||
}
|
||||
schedules, err := s.deps.Store.ListSchedulesByHost(ctx, hostID)
|
||||
if err != nil {
|
||||
slog.Warn("catchup: list schedules", "host_id", hostID, "err", err)
|
||||
return
|
||||
}
|
||||
// NOTE: overdue is measured against host.LastBackupAt, which is the
|
||||
// most recent *successful backup of any schedule* on this host — not
|
||||
// a per-schedule timestamp. For the common intermittent host (a
|
||||
// single backup schedule) this is exact. With multiple schedules of
|
||||
// different cadences, a recent backup from one schedule can mask
|
||||
// another schedule's missed window. Acceptable for v1; revisit with
|
||||
// per-schedule last-success tracking if multi-cadence laptops appear.
|
||||
for _, sc := range schedules {
|
||||
if !sc.Enabled || len(sc.SourceGroupIDs) == 0 {
|
||||
continue
|
||||
}
|
||||
if !scheduleOverdue(sc.CronExpr, host.LastBackupAt, now) {
|
||||
continue
|
||||
}
|
||||
for _, gid := range sc.SourceGroupIDs {
|
||||
g, err := s.deps.Store.GetSourceGroup(ctx, hostID, gid)
|
||||
if err != nil {
|
||||
slog.Warn("catchup: load source group",
|
||||
"host_id", hostID, "schedule_id", sc.ID, "group_id", gid, "err", err)
|
||||
continue
|
||||
}
|
||||
if _, derr := s.dispatchBackupForGroupCore(ctx, conn, hostID, sc.ID, g, now); derr != nil {
|
||||
// Send failed for this group — host may have dropped
|
||||
// again. Earlier groups in this batch were already
|
||||
// dispatched; re-arm so a later reconnect re-evaluates
|
||||
// any still-overdue schedules.
|
||||
s.ArmCatchup(hostID, now)
|
||||
return
|
||||
}
|
||||
slog.Info("catchup: dispatched missed backup",
|
||||
"host_id", hostID, "schedule_id", sc.ID, "group", g.Name)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,246 +0,0 @@
|
||||
// catchup_scheduler_test.go — integration tests for the catch-up scheduler.
|
||||
package http
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/oklog/ulid/v2"
|
||||
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||
)
|
||||
|
||||
// TestRunCatchupDispatchesOverdue verifies four properties of the
|
||||
// catch-up scheduler in separate sub-tests sharing no state.
|
||||
func TestRunCatchupDispatchesOverdue(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
// --- 1. Overdue host with connected agent → backup dispatched -------
|
||||
t.Run("overdue_dispatch", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
srv, ts, st := rawTestServer(t)
|
||||
hostID, token := enrolHostForWS(t, srv, st, "catchup-overdue")
|
||||
|
||||
if err := st.SetHostAlwaysOn(context.Background(), hostID, false); err != nil {
|
||||
t.Fatalf("set always_on: %v", err)
|
||||
}
|
||||
// Last backup ~8 days ago → schedule overdue.
|
||||
eightDaysAgo := time.Now().UTC().Add(-8 * 24 * time.Hour)
|
||||
if err := st.SetHostLastBackup(context.Background(), hostID, "succeeded", eightDaysAgo); err != nil {
|
||||
t.Fatalf("set last backup: %v", err)
|
||||
}
|
||||
|
||||
if err := st.CreateJob(context.Background(), store.Job{
|
||||
ID: ulid.Make().String(), HostID: hostID, Kind: "init",
|
||||
ActorKind: "system", CreatedAt: time.Now().UTC(),
|
||||
}); err != nil {
|
||||
t.Fatalf("seed init: %v", err)
|
||||
}
|
||||
|
||||
gid := ulid.Make().String()
|
||||
if err := st.CreateSourceGroup(context.Background(), &store.SourceGroup{
|
||||
ID: gid, HostID: hostID, Name: "home", Includes: []string{"/home"},
|
||||
}); err != nil {
|
||||
t.Fatalf("source group: %v", err)
|
||||
}
|
||||
sid := ulid.Make().String()
|
||||
if err := st.CreateSchedule(context.Background(), &store.Schedule{
|
||||
ID: sid, HostID: hostID, CronExpr: "0 2 * * *", Enabled: true,
|
||||
SourceGroupIDs: []string{gid},
|
||||
}); err != nil {
|
||||
t.Fatalf("schedule: %v", err)
|
||||
}
|
||||
|
||||
c := agentDial(t, srv, ts, hostID, token)
|
||||
sendHello(t, c, "catchup-overdue")
|
||||
_ = drainUntil(t, c, api.MsgScheduleSet)
|
||||
|
||||
// Arm with a past time so the settle window is already elapsed.
|
||||
srv.ArmCatchup(hostID, time.Now().UTC().Add(-2*time.Minute))
|
||||
srv.RunCatchupsDue(context.Background())
|
||||
|
||||
// Give the dispatch goroutine a moment to write the job row.
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
|
||||
var n int
|
||||
if err := st.DB().QueryRow(
|
||||
`SELECT COUNT(*) FROM jobs WHERE host_id = ? AND kind = 'backup'`, hostID).Scan(&n); err != nil {
|
||||
t.Fatalf("count: %v", err)
|
||||
}
|
||||
if n < 1 {
|
||||
t.Errorf("overdue host: want ≥1 backup job, got %d", n)
|
||||
}
|
||||
})
|
||||
|
||||
// --- 2. Not overdue → no dispatch -----------------------------------
|
||||
t.Run("not_overdue_no_dispatch", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
srv, ts, st := rawTestServer(t)
|
||||
hostID, token := enrolHostForWS(t, srv, st, "catchup-notoverdue")
|
||||
|
||||
if err := st.SetHostAlwaysOn(context.Background(), hostID, false); err != nil {
|
||||
t.Fatalf("set always_on: %v", err)
|
||||
}
|
||||
// Last backup just now → not overdue.
|
||||
now := time.Now().UTC()
|
||||
if err := st.SetHostLastBackup(context.Background(), hostID, "succeeded", now); err != nil {
|
||||
t.Fatalf("set last backup: %v", err)
|
||||
}
|
||||
|
||||
if err := st.CreateJob(context.Background(), store.Job{
|
||||
ID: ulid.Make().String(), HostID: hostID, Kind: "init",
|
||||
ActorKind: "system", CreatedAt: now,
|
||||
}); err != nil {
|
||||
t.Fatalf("seed init: %v", err)
|
||||
}
|
||||
|
||||
gid := ulid.Make().String()
|
||||
if err := st.CreateSourceGroup(context.Background(), &store.SourceGroup{
|
||||
ID: gid, HostID: hostID, Name: "home", Includes: []string{"/home"},
|
||||
}); err != nil {
|
||||
t.Fatalf("source group: %v", err)
|
||||
}
|
||||
sid := ulid.Make().String()
|
||||
if err := st.CreateSchedule(context.Background(), &store.Schedule{
|
||||
ID: sid, HostID: hostID, CronExpr: "0 2 * * *", Enabled: true,
|
||||
SourceGroupIDs: []string{gid},
|
||||
}); err != nil {
|
||||
t.Fatalf("schedule: %v", err)
|
||||
}
|
||||
|
||||
c := agentDial(t, srv, ts, hostID, token)
|
||||
sendHello(t, c, "catchup-notoverdue")
|
||||
_ = drainUntil(t, c, api.MsgScheduleSet)
|
||||
|
||||
srv.ArmCatchup(hostID, time.Now().UTC().Add(-2*time.Minute))
|
||||
srv.RunCatchupsDue(context.Background())
|
||||
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
|
||||
var n int
|
||||
if err := st.DB().QueryRow(
|
||||
`SELECT COUNT(*) FROM jobs WHERE host_id = ? AND kind = 'backup'`, hostID).Scan(&n); err != nil {
|
||||
t.Fatalf("count: %v", err)
|
||||
}
|
||||
if n != 0 {
|
||||
t.Errorf("not-overdue host: want 0 backup jobs, got %d", n)
|
||||
}
|
||||
})
|
||||
|
||||
// --- 3. Active backup in flight → no new dispatch -------------------
|
||||
t.Run("active_backup_blocks_dispatch", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
srv, ts, st := rawTestServer(t)
|
||||
hostID, token := enrolHostForWS(t, srv, st, "catchup-active")
|
||||
|
||||
if err := st.SetHostAlwaysOn(context.Background(), hostID, false); err != nil {
|
||||
t.Fatalf("set always_on: %v", err)
|
||||
}
|
||||
eightDaysAgo := time.Now().UTC().Add(-8 * 24 * time.Hour)
|
||||
if err := st.SetHostLastBackup(context.Background(), hostID, "succeeded", eightDaysAgo); err != nil {
|
||||
t.Fatalf("set last backup: %v", err)
|
||||
}
|
||||
|
||||
if err := st.CreateJob(context.Background(), store.Job{
|
||||
ID: ulid.Make().String(), HostID: hostID, Kind: "init",
|
||||
ActorKind: "system", CreatedAt: time.Now().UTC(),
|
||||
}); err != nil {
|
||||
t.Fatalf("seed init: %v", err)
|
||||
}
|
||||
|
||||
gid := ulid.Make().String()
|
||||
if err := st.CreateSourceGroup(context.Background(), &store.SourceGroup{
|
||||
ID: gid, HostID: hostID, Name: "home", Includes: []string{"/home"},
|
||||
}); err != nil {
|
||||
t.Fatalf("source group: %v", err)
|
||||
}
|
||||
sid := ulid.Make().String()
|
||||
if err := st.CreateSchedule(context.Background(), &store.Schedule{
|
||||
ID: sid, HostID: hostID, CronExpr: "0 2 * * *", Enabled: true,
|
||||
SourceGroupIDs: []string{gid},
|
||||
}); err != nil {
|
||||
t.Fatalf("schedule: %v", err)
|
||||
}
|
||||
|
||||
// Seed a queued backup job — this is "already in flight".
|
||||
if err := st.CreateJob(context.Background(), store.Job{
|
||||
ID: ulid.Make().String(), HostID: hostID, Kind: "backup",
|
||||
ActorKind: "schedule", CreatedAt: time.Now().UTC(),
|
||||
}); err != nil {
|
||||
t.Fatalf("seed queued backup: %v", err)
|
||||
}
|
||||
|
||||
c := agentDial(t, srv, ts, hostID, token)
|
||||
sendHello(t, c, "catchup-active")
|
||||
_ = drainUntil(t, c, api.MsgScheduleSet)
|
||||
|
||||
srv.ArmCatchup(hostID, time.Now().UTC().Add(-2*time.Minute))
|
||||
srv.RunCatchupsDue(context.Background())
|
||||
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
|
||||
var n int
|
||||
if err := st.DB().QueryRow(
|
||||
`SELECT COUNT(*) FROM jobs WHERE host_id = ? AND kind = 'backup'`, hostID).Scan(&n); err != nil {
|
||||
t.Fatalf("count: %v", err)
|
||||
}
|
||||
// Count must still be exactly 1 — no second job added.
|
||||
if n != 1 {
|
||||
t.Errorf("active backup guard: want 1 job (the seeded one), got %d", n)
|
||||
}
|
||||
})
|
||||
|
||||
// --- 4. Disconnected host → no dispatch -----------------------------
|
||||
t.Run("disconnected_no_dispatch", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
srv, _, st := rawTestServer(t)
|
||||
hostID, _ := enrolHostForWS(t, srv, st, "catchup-disconnected")
|
||||
|
||||
if err := st.SetHostAlwaysOn(context.Background(), hostID, false); err != nil {
|
||||
t.Fatalf("set always_on: %v", err)
|
||||
}
|
||||
eightDaysAgo := time.Now().UTC().Add(-8 * 24 * time.Hour)
|
||||
if err := st.SetHostLastBackup(context.Background(), hostID, "succeeded", eightDaysAgo); err != nil {
|
||||
t.Fatalf("set last backup: %v", err)
|
||||
}
|
||||
|
||||
if err := st.CreateJob(context.Background(), store.Job{
|
||||
ID: ulid.Make().String(), HostID: hostID, Kind: "init",
|
||||
ActorKind: "system", CreatedAt: time.Now().UTC(),
|
||||
}); err != nil {
|
||||
t.Fatalf("seed init: %v", err)
|
||||
}
|
||||
|
||||
gid := ulid.Make().String()
|
||||
if err := st.CreateSourceGroup(context.Background(), &store.SourceGroup{
|
||||
ID: gid, HostID: hostID, Name: "home", Includes: []string{"/home"},
|
||||
}); err != nil {
|
||||
t.Fatalf("source group: %v", err)
|
||||
}
|
||||
sid := ulid.Make().String()
|
||||
if err := st.CreateSchedule(context.Background(), &store.Schedule{
|
||||
ID: sid, HostID: hostID, CronExpr: "0 2 * * *", Enabled: true,
|
||||
SourceGroupIDs: []string{gid},
|
||||
}); err != nil {
|
||||
t.Fatalf("schedule: %v", err)
|
||||
}
|
||||
|
||||
// Host is NOT connected — no agentDial.
|
||||
|
||||
srv.ArmCatchup(hostID, time.Now().UTC().Add(-2*time.Minute))
|
||||
srv.RunCatchupsDue(context.Background())
|
||||
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
|
||||
var n int
|
||||
if err := st.DB().QueryRow(
|
||||
`SELECT COUNT(*) FROM jobs WHERE host_id = ? AND kind = 'backup'`, hostID).Scan(&n); err != nil {
|
||||
t.Fatalf("count: %v", err)
|
||||
}
|
||||
if n != 0 {
|
||||
t.Errorf("disconnected host: want 0 backup jobs, got %d", n)
|
||||
}
|
||||
})
|
||||
}
|
||||
@@ -1,41 +0,0 @@
|
||||
package http
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestScheduleOverdue(t *testing.T) {
|
||||
mustParse := func(s string) time.Time {
|
||||
t.Helper()
|
||||
v, err := time.Parse(time.RFC3339, s)
|
||||
if err != nil {
|
||||
t.Fatalf("parse %q: %v", s, err)
|
||||
}
|
||||
return v
|
||||
}
|
||||
daily := "0 2 * * *" // 02:00 every day
|
||||
|
||||
cases := []struct {
|
||||
name string
|
||||
cron string
|
||||
lastBackup *time.Time
|
||||
now time.Time
|
||||
want bool
|
||||
}{
|
||||
{name: "never backed up is overdue", cron: daily, lastBackup: nil, now: mustParse("2026-06-15T09:00:00Z"), want: true},
|
||||
{name: "missed last nights window", cron: daily, lastBackup: ptrTime(mustParse("2026-06-13T02:05:00Z")), now: mustParse("2026-06-15T09:00:00Z"), want: true},
|
||||
{name: "backed up after the most recent window", cron: daily, lastBackup: ptrTime(mustParse("2026-06-15T02:05:00Z")), now: mustParse("2026-06-15T09:00:00Z"), want: false},
|
||||
{name: "unparseable cron is never overdue", cron: "not a cron", lastBackup: nil, now: mustParse("2026-06-15T09:00:00Z"), want: false},
|
||||
}
|
||||
for _, c := range cases {
|
||||
t.Run(c.name, func(t *testing.T) {
|
||||
got := scheduleOverdue(c.cron, c.lastBackup, c.now)
|
||||
if got != c.want {
|
||||
t.Fatalf("scheduleOverdue(%q, %v, %v) = %v, want %v", c.cron, c.lastBackup, c.now, got, c.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func ptrTime(t time.Time) *time.Time { return &t }
|
||||
@@ -483,12 +483,6 @@ func (s *Server) onAgentHello(ctx context.Context, hostID string, conn *ws.Conn)
|
||||
// and the drain may take seconds across many rows. A non-blocking
|
||||
// goroutine keeps the hello path snappy.
|
||||
go s.DrainPending(context.Background(), hostID)
|
||||
// Intermittent hosts that just reconnected may have slept through a
|
||||
// backup window. Arm a catch-up evaluation after a settle delay; the
|
||||
// pending-drain tick fires it. Always-on hosts never need this.
|
||||
if host, err := s.deps.Store.GetHost(ctx, hostID); err == nil && !host.AlwaysOn {
|
||||
s.ArmCatchup(hostID, time.Now().UTC())
|
||||
}
|
||||
}
|
||||
|
||||
// maybeAutoInit dispatches a `restic init` job iff the host has no
|
||||
|
||||
@@ -25,7 +25,6 @@ type hostView struct {
|
||||
CurrentJobID *string `json:"current_job_id,omitempty"`
|
||||
LastBackupAt *string `json:"last_backup_at,omitempty"`
|
||||
LastBackupStatus *string `json:"last_backup_status,omitempty"`
|
||||
RepoStatus string `json:"repo_status,omitempty"`
|
||||
RepoSizeBytes int64 `json:"repo_size_bytes"`
|
||||
SnapshotCount int `json:"snapshot_count"`
|
||||
OpenAlertCount int `json:"open_alert_count"`
|
||||
@@ -86,7 +85,6 @@ func hostToView(h store.Host) hostView {
|
||||
Tags: h.Tags,
|
||||
CurrentJobID: h.CurrentJobID,
|
||||
LastBackupStatus: h.LastBackupStatus,
|
||||
RepoStatus: h.RepoStatus,
|
||||
RepoSizeBytes: h.RepoSizeBytes,
|
||||
SnapshotCount: h.SnapshotCount,
|
||||
OpenAlertCount: h.OpenAlertCount,
|
||||
|
||||
@@ -1,185 +0,0 @@
|
||||
package http
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/subtle"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/netip"
|
||||
"runtime"
|
||||
"strings"
|
||||
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/config"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/metrics"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
|
||||
)
|
||||
|
||||
// handleMetrics serves the Prometheus exposition body. The route is
|
||||
// only mounted when the operator has opted in via RM_METRICS_TOKEN
|
||||
// or RM_METRICS_TRUSTED_CIDR (see Server.New + Cfg.MetricsAuthEnabled).
|
||||
func (s *Server) handleMetrics(w http.ResponseWriter, r *http.Request) {
|
||||
if !authoriseMetricsScrape(r, s.deps.Cfg) {
|
||||
// 401 with no body; Prom respects this and surfaces the failed
|
||||
// scrape. WWW-Authenticate hints at bearer when the operator
|
||||
// actually configured a token.
|
||||
if s.deps.Cfg.MetricsToken != "" {
|
||||
w.Header().Set("WWW-Authenticate", `Bearer realm="restic-manager metrics"`)
|
||||
}
|
||||
w.WriteHeader(http.StatusUnauthorized)
|
||||
return
|
||||
}
|
||||
|
||||
snap, err := s.gatherMetricsSnapshot(r.Context())
|
||||
if err != nil {
|
||||
http.Error(w, "snapshot: "+err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
// 0.0.4 is the long-stable text-format version Prometheus accepts
|
||||
// without negotiation; OpenMetrics is intentionally not used here.
|
||||
w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
|
||||
if err := metrics.Render(w, snap); err != nil {
|
||||
// Body is partially written; nothing useful we can do beyond
|
||||
// dropping the connection (chi's recoverer will log).
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// authoriseMetricsScrape applies bearer + CIDR gates per the spec.
|
||||
// AND semantics when both are configured; either alone is sufficient
|
||||
// when only it is configured.
|
||||
func authoriseMetricsScrape(r *http.Request, cfg config.Config) bool {
|
||||
tokenOK := true
|
||||
if cfg.MetricsToken != "" {
|
||||
tokenOK = false
|
||||
hdr := r.Header.Get("Authorization")
|
||||
const prefix = "Bearer "
|
||||
if strings.HasPrefix(hdr, prefix) {
|
||||
got := []byte(strings.TrimPrefix(hdr, prefix))
|
||||
want := []byte(cfg.MetricsToken)
|
||||
if subtle.ConstantTimeCompare(got, want) == 1 {
|
||||
tokenOK = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cidrOK := true
|
||||
if len(cfg.MetricsTrustedCIDRs) > 0 {
|
||||
cidrOK = false
|
||||
ip := callerIP(r, cfg.TrustedProxies)
|
||||
if ip.IsValid() {
|
||||
for _, c := range cfg.MetricsTrustedCIDRs {
|
||||
prefix, err := netip.ParsePrefix(c)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if prefix.Contains(ip) {
|
||||
cidrOK = true
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return tokenOK && cidrOK
|
||||
}
|
||||
|
||||
// callerIP resolves the client IP. When the request hit the server
|
||||
// directly we use RemoteAddr; when the immediate hop is a trusted
|
||||
// proxy we honour the right-most untrusted X-Forwarded-For entry
|
||||
// (mirrors how realIP middlewares typically resolve).
|
||||
func callerIP(r *http.Request, trustedProxies []string) netip.Addr {
|
||||
host, _, err := net.SplitHostPort(r.RemoteAddr)
|
||||
if err != nil {
|
||||
host = r.RemoteAddr
|
||||
}
|
||||
directAddr, err := netip.ParseAddr(host)
|
||||
if err != nil {
|
||||
return netip.Addr{}
|
||||
}
|
||||
|
||||
if !addrInAnyCIDR(directAddr, trustedProxies) {
|
||||
return directAddr
|
||||
}
|
||||
|
||||
xff := r.Header.Get("X-Forwarded-For")
|
||||
if xff == "" {
|
||||
return directAddr
|
||||
}
|
||||
parts := strings.Split(xff, ",")
|
||||
// Walk right→left, skipping trusted proxies, until we land on the
|
||||
// first untrusted hop — that's the genuine client.
|
||||
for i := len(parts) - 1; i >= 0; i-- {
|
||||
p := strings.TrimSpace(parts[i])
|
||||
a, err := netip.ParseAddr(p)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if addrInAnyCIDR(a, trustedProxies) {
|
||||
continue
|
||||
}
|
||||
return a
|
||||
}
|
||||
return directAddr
|
||||
}
|
||||
|
||||
func addrInAnyCIDR(a netip.Addr, cidrs []string) bool {
|
||||
for _, c := range cidrs {
|
||||
pre, err := netip.ParsePrefix(c)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if pre.Contains(a) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// gatherMetricsSnapshot pulls the data the renderer needs. One
|
||||
// indexed query per per-host or fleet-wide read; no N+1.
|
||||
func (s *Server) gatherMetricsSnapshot(ctx context.Context) (metrics.Snapshot, error) {
|
||||
hosts, err := s.deps.Store.ListHosts(ctx)
|
||||
if err != nil {
|
||||
return metrics.Snapshot{}, err
|
||||
}
|
||||
hostRows := make([]metrics.HostRow, 0, len(hosts))
|
||||
for _, h := range hosts {
|
||||
row := metrics.HostRow{
|
||||
ID: h.ID,
|
||||
Name: h.Name,
|
||||
Online: h.Status == "online",
|
||||
SnapshotCount: h.SnapshotCount,
|
||||
OpenAlertCount: h.OpenAlertCount,
|
||||
RepoStatus: h.RepoStatus,
|
||||
}
|
||||
if h.LastBackupAt != nil {
|
||||
ts := h.LastBackupAt.Unix()
|
||||
row.LastBackupUnix = &ts
|
||||
}
|
||||
if h.LastBackupStatus != nil {
|
||||
ok := *h.LastBackupStatus == "succeeded"
|
||||
row.LastBackupSucceeded = &ok
|
||||
}
|
||||
if h.RepoSizeBytes > 0 {
|
||||
sz := h.RepoSizeBytes
|
||||
row.RepoSizeBytes = &sz
|
||||
}
|
||||
hostRows = append(hostRows, row)
|
||||
}
|
||||
|
||||
open, err := s.deps.Store.ListAlerts(ctx, store.AlertFilter{Status: "open"})
|
||||
if err != nil {
|
||||
return metrics.Snapshot{}, err
|
||||
}
|
||||
bySeverity := map[string]int{"info": 0, "warning": 0, "critical": 0}
|
||||
for _, a := range open {
|
||||
bySeverity[a.Severity]++
|
||||
}
|
||||
|
||||
reg := s.deps.Metrics
|
||||
if reg == nil {
|
||||
reg = metrics.NewRegistry() // empty histogram block
|
||||
}
|
||||
return reg.SnapshotWith(hostRows, bySeverity, version.Version, version.Commit, runtime.Version()), nil
|
||||
}
|
||||
@@ -1,209 +0,0 @@
|
||||
package http
|
||||
|
||||
import (
|
||||
"context"
|
||||
"io"
|
||||
stdhttp "net/http"
|
||||
"net/http/httptest"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/crypto"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/config"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/metrics"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||
)
|
||||
|
||||
// newMetricsServer builds a Server with metrics enabled per cfg.
|
||||
// Returns (URL, registry) so tests can both observe job durations
|
||||
// directly and exercise the HTTP gate.
|
||||
func newMetricsServer(t *testing.T, cfg config.Config) (string, *metrics.Registry, *store.Store) {
|
||||
t.Helper()
|
||||
dir := t.TempDir()
|
||||
|
||||
st, err := store.Open(context.Background(), filepath.Join(dir, "rm.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("store: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { _ = st.Close() })
|
||||
|
||||
keyPath := filepath.Join(dir, "secret.key")
|
||||
if err := crypto.GenerateKeyFile(keyPath); err != nil {
|
||||
t.Fatalf("genkey: %v", err)
|
||||
}
|
||||
key, _ := crypto.LoadKeyFromFile(keyPath)
|
||||
aead, _ := crypto.NewAEAD(key)
|
||||
|
||||
cfg.Listen = ":0"
|
||||
cfg.DataDir = dir
|
||||
cfg.SecretKeyFile = keyPath
|
||||
|
||||
reg := metrics.NewRegistry()
|
||||
deps := Deps{
|
||||
Cfg: cfg,
|
||||
Store: st,
|
||||
AEAD: aead,
|
||||
Metrics: reg,
|
||||
}
|
||||
s := New(deps)
|
||||
ts := httptest.NewServer(s.srv.Handler)
|
||||
t.Cleanup(ts.Close)
|
||||
return ts.URL, reg, st
|
||||
}
|
||||
|
||||
func TestMetricsRouteNotMountedByDefault(t *testing.T) {
|
||||
t.Parallel()
|
||||
url, _, _ := newMetricsServer(t, config.Config{})
|
||||
res, err := stdhttp.Get(url + "/metrics")
|
||||
if err != nil {
|
||||
t.Fatalf("GET: %v", err)
|
||||
}
|
||||
defer res.Body.Close()
|
||||
if res.StatusCode != stdhttp.StatusNotFound {
|
||||
t.Errorf("status: got %d, want 404 (route should not be mounted)", res.StatusCode)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMetricsTokenRequired(t *testing.T) {
|
||||
t.Parallel()
|
||||
url, _, _ := newMetricsServer(t, config.Config{
|
||||
MetricsToken: "the-token",
|
||||
})
|
||||
|
||||
// Missing token.
|
||||
res, err := stdhttp.Get(url + "/metrics")
|
||||
if err != nil {
|
||||
t.Fatalf("GET: %v", err)
|
||||
}
|
||||
defer res.Body.Close()
|
||||
if res.StatusCode != stdhttp.StatusUnauthorized {
|
||||
t.Errorf("no token: got %d", res.StatusCode)
|
||||
}
|
||||
if !strings.Contains(res.Header.Get("WWW-Authenticate"), "Bearer") {
|
||||
t.Errorf("WWW-Authenticate hint missing: %q", res.Header.Get("WWW-Authenticate"))
|
||||
}
|
||||
|
||||
// Wrong token.
|
||||
req, _ := stdhttp.NewRequest(stdhttp.MethodGet, url+"/metrics", nil)
|
||||
req.Header.Set("Authorization", "Bearer not-the-token")
|
||||
res2, err := stdhttp.DefaultClient.Do(req)
|
||||
if err != nil {
|
||||
t.Fatalf("GET: %v", err)
|
||||
}
|
||||
defer res2.Body.Close()
|
||||
if res2.StatusCode != stdhttp.StatusUnauthorized {
|
||||
t.Errorf("wrong token: got %d", res2.StatusCode)
|
||||
}
|
||||
|
||||
// Right token.
|
||||
req3, _ := stdhttp.NewRequest(stdhttp.MethodGet, url+"/metrics", nil)
|
||||
req3.Header.Set("Authorization", "Bearer the-token")
|
||||
res3, err3 := stdhttp.DefaultClient.Do(req3)
|
||||
if err3 != nil {
|
||||
t.Fatalf("GET: %v", err3)
|
||||
}
|
||||
defer res3.Body.Close()
|
||||
if res3.StatusCode != stdhttp.StatusOK {
|
||||
t.Errorf("right token: got %d", res3.StatusCode)
|
||||
}
|
||||
if ct := res3.Header.Get("Content-Type"); !strings.HasPrefix(ct, "text/plain") {
|
||||
t.Errorf("content-type: %q", ct)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMetricsCIDRGate(t *testing.T) {
|
||||
t.Parallel()
|
||||
// 127.0.0.1 is what httptest hits with; pick a CIDR that excludes it
|
||||
// to assert the "wrong source" branch.
|
||||
url, _, _ := newMetricsServer(t, config.Config{
|
||||
MetricsTrustedCIDRs: []string{"10.0.0.0/8"},
|
||||
})
|
||||
res, err := stdhttp.Get(url + "/metrics")
|
||||
if err != nil {
|
||||
t.Fatalf("GET: %v", err)
|
||||
}
|
||||
defer res.Body.Close()
|
||||
if res.StatusCode != stdhttp.StatusUnauthorized {
|
||||
t.Errorf("loopback hitting non-matching CIDR: got %d, want 401", res.StatusCode)
|
||||
}
|
||||
|
||||
// Now allow loopback.
|
||||
url2, _, _ := newMetricsServer(t, config.Config{
|
||||
MetricsTrustedCIDRs: []string{"127.0.0.0/8"},
|
||||
})
|
||||
res2, err := stdhttp.Get(url2 + "/metrics")
|
||||
if err != nil {
|
||||
t.Fatalf("GET: %v", err)
|
||||
}
|
||||
defer res2.Body.Close()
|
||||
if res2.StatusCode != stdhttp.StatusOK {
|
||||
t.Errorf("loopback in allow CIDR: got %d, want 200", res2.StatusCode)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMetricsTokenAndCIDRBothRequired(t *testing.T) {
|
||||
t.Parallel()
|
||||
url, _, _ := newMetricsServer(t, config.Config{
|
||||
MetricsToken: "the-token",
|
||||
MetricsTrustedCIDRs: []string{"127.0.0.0/8"},
|
||||
})
|
||||
// Token only — CIDR ok (loopback) but token missing.
|
||||
res, err := stdhttp.Get(url + "/metrics")
|
||||
if err != nil {
|
||||
t.Fatalf("GET: %v", err)
|
||||
}
|
||||
defer res.Body.Close()
|
||||
if res.StatusCode != stdhttp.StatusUnauthorized {
|
||||
t.Errorf("missing token but in CIDR: got %d", res.StatusCode)
|
||||
}
|
||||
|
||||
// Both right.
|
||||
req, _ := stdhttp.NewRequest(stdhttp.MethodGet, url+"/metrics", nil)
|
||||
req.Header.Set("Authorization", "Bearer the-token")
|
||||
res2, err := stdhttp.DefaultClient.Do(req)
|
||||
if err != nil {
|
||||
t.Fatalf("GET: %v", err)
|
||||
}
|
||||
defer res2.Body.Close()
|
||||
if res2.StatusCode != stdhttp.StatusOK {
|
||||
t.Errorf("both right: got %d", res2.StatusCode)
|
||||
}
|
||||
}
|
||||
|
||||
func readAll(t *testing.T, r io.Reader) string {
|
||||
t.Helper()
|
||||
b, err := io.ReadAll(r)
|
||||
if err != nil {
|
||||
t.Fatalf("read: %v", err)
|
||||
}
|
||||
return string(b)
|
||||
}
|
||||
|
||||
func TestMetricsBodyContainsExpectedLines(t *testing.T) {
|
||||
t.Parallel()
|
||||
url, reg, _ := newMetricsServer(t, config.Config{
|
||||
MetricsToken: "the-token",
|
||||
})
|
||||
reg.ObserveJob("backup", "succeeded", 0) // produce one histogram row
|
||||
|
||||
req, _ := stdhttp.NewRequest(stdhttp.MethodGet, url+"/metrics", nil)
|
||||
req.Header.Set("Authorization", "Bearer the-token")
|
||||
res, err := stdhttp.DefaultClient.Do(req)
|
||||
if err != nil {
|
||||
t.Fatalf("GET: %v", err)
|
||||
}
|
||||
defer res.Body.Close()
|
||||
body := readAll(t, res.Body)
|
||||
for _, want := range []string{
|
||||
"rm_hosts_total",
|
||||
"rm_hosts_online",
|
||||
`rm_active_alerts{severity="critical"}`,
|
||||
"rm_build_info{",
|
||||
"rm_job_duration_seconds_count{kind=\"backup\",status=\"succeeded\"}",
|
||||
} {
|
||||
if !strings.Contains(body, want) {
|
||||
t.Errorf("body missing %q\n--- body ---\n%s", want, body)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -512,27 +512,11 @@ func TestDrainPendingSerializesPerHost(t *testing.T) {
|
||||
// Connect the agent so DrainPending can dispatch.
|
||||
c := agentDial(t, srv, ts, hostID, token)
|
||||
sendHello(t, c, "serialise-host")
|
||||
// Wait for the on-hello push to settle.
|
||||
// Drain the on-hello goroutine's pass first (no pending rows yet),
|
||||
// then wait for the schedule.set so the connection is fully settled.
|
||||
_ = drainUntil(t, c, api.MsgScheduleSet)
|
||||
|
||||
// A real agent is always in a read loop. Keep this test client
|
||||
// reading in the background for the rest of the test: without an
|
||||
// active reader the server-side conn can be dropped under parallel
|
||||
// load, which unregisters it from the hub and makes DrainPending
|
||||
// no-op (conn == nil) — the historical source of this test's
|
||||
// flakiness (it would observe 0 or a partial drain). The reader also
|
||||
// consumes the command.run envelopes our drains emit.
|
||||
readerCtx, stopReader := context.WithCancel(context.Background())
|
||||
defer stopReader()
|
||||
go func() {
|
||||
for {
|
||||
if _, _, err := c.Read(readerCtx); err != nil {
|
||||
return
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
// Insert 5 due pending rows.
|
||||
// Insert 5 pending rows now that the on-hello drain has already run.
|
||||
now := time.Now().UTC()
|
||||
for i := range 5 {
|
||||
pid := ulid.Make().String()
|
||||
@@ -549,8 +533,7 @@ func TestDrainPendingSerializesPerHost(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// Fire 10 concurrent DrainPending calls. The per-host mutex must
|
||||
// ensure each row is dispatched at most once (no double-dispatch).
|
||||
// Spawn 10 goroutines all calling DrainPending concurrently.
|
||||
var wg sync.WaitGroup
|
||||
for range 10 {
|
||||
wg.Add(1)
|
||||
@@ -561,26 +544,24 @@ func TestDrainPendingSerializesPerHost(t *testing.T) {
|
||||
}
|
||||
wg.Wait()
|
||||
|
||||
// Drain to completion. The fire-and-forget on-hello DrainPending
|
||||
// shares the same per-host mutex and can hold it during the burst,
|
||||
// leaving rows for a later pass — exactly how production drains
|
||||
// (repeatedly, via the 30s tick / on reconnect). Re-drain until the
|
||||
// queue is empty; because every drain is still serialised, each row
|
||||
// is dispatched at most once, so the exactly-5 job count below proves
|
||||
// there was no double-dispatch.
|
||||
deadline := time.Now().Add(5 * time.Second)
|
||||
for countPendingForHost(t, st, hostID) > 0 && time.Now().Before(deadline) {
|
||||
srv.DrainPending(context.Background(), hostID)
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
// Drain any envelopes the agent received so we don't block below.
|
||||
// We read with short timeouts and stop when the connection goes quiet.
|
||||
drainDeadline := time.Now().Add(500 * time.Millisecond)
|
||||
for time.Now().Before(drainDeadline) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
||||
_, _, err := c.Read(ctx)
|
||||
cancel()
|
||||
if err != nil {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// All 5 pending rows must be drained.
|
||||
// All 5 pending rows must be gone.
|
||||
if n := countPendingForHost(t, st, hostID); n != 0 {
|
||||
t.Errorf("pending rows after drain-to-completion: got %d, want 0", n)
|
||||
t.Errorf("pending rows after concurrent drain: got %d, want 0", n)
|
||||
}
|
||||
|
||||
// Exactly 5 backup job rows (one per pending row) — never more, which
|
||||
// would mean the per-host mutex failed to prevent double-dispatch.
|
||||
// Exactly 5 backup job rows (one per pending row), not 10+ from a race.
|
||||
var n int
|
||||
_ = st.DB().QueryRow(
|
||||
`SELECT COUNT(*) FROM jobs WHERE host_id = ? AND kind = 'backup' AND actor_kind = 'schedule'`,
|
||||
|
||||
@@ -17,7 +17,6 @@ import (
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/crypto"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/notification"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/config"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/metrics"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/oidc"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ui"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/ws"
|
||||
@@ -57,12 +56,6 @@ type Deps struct {
|
||||
// OIDC (optional). Non-nil when the operator has configured an
|
||||
// IdP — handlers under /auth/oidc/* are mounted only when set.
|
||||
OIDC *oidc.Client
|
||||
// Metrics (optional). When non-nil the WS job-finished branch
|
||||
// records job durations and the /metrics handler can pull a
|
||||
// histogram snapshot. Independent of MetricsAuthEnabled — the
|
||||
// recorder runs even if the scrape endpoint is gated off, so a
|
||||
// later config flip doesn't lose the running window.
|
||||
Metrics *metrics.Registry
|
||||
}
|
||||
|
||||
// Server is the running HTTP server.
|
||||
@@ -90,13 +83,6 @@ type Server struct {
|
||||
// directories (P3-X2). Pre-allocated in New so the lazy-init
|
||||
// race is impossible.
|
||||
treeCache *treeCache
|
||||
|
||||
// catchupDueAt tracks intermittent hosts that reconnected and are
|
||||
// in their settle window. Keyed hostID → earliest time to evaluate
|
||||
// catch-up. Best-effort + in-memory: a server restart simply re-arms
|
||||
// on the next hello. Guarded by catchupMu.
|
||||
catchupMu sync.Mutex
|
||||
catchupDueAt map[string]time.Time
|
||||
}
|
||||
|
||||
// New builds a configured but not-yet-started server.
|
||||
@@ -116,7 +102,6 @@ func New(deps Deps) *Server {
|
||||
announceRL: newAnnounceLimiter(),
|
||||
pendingHub: newPendingHub(),
|
||||
treeCache: newTreeCache(),
|
||||
catchupDueAt: make(map[string]time.Time),
|
||||
}
|
||||
s.routes(r)
|
||||
|
||||
@@ -146,16 +131,12 @@ func (s *Server) routes(r chi.Router) {
|
||||
r.Get("/agent/binary", s.handleAgentBinary)
|
||||
r.Get("/install/*", s.handleInstallAsset)
|
||||
r.Get("/api/version", s.handleVersion)
|
||||
if s.deps.Cfg.MetricsAuthEnabled() {
|
||||
r.Get("/metrics", s.handleMetrics)
|
||||
}
|
||||
if s.deps.Hub != nil {
|
||||
hd := ws.HandlerDeps{
|
||||
Hub: s.deps.Hub,
|
||||
Store: s.deps.Store,
|
||||
JobHub: s.deps.JobHub,
|
||||
AlertEngine: s.deps.AlertEngine,
|
||||
Metrics: s.deps.Metrics,
|
||||
OnHello: s.onAgentHello,
|
||||
OnScheduleAck: s.applyScheduleAck,
|
||||
OnScheduleFire: s.dispatchScheduledJob,
|
||||
@@ -287,7 +268,6 @@ func (s *Server) routes(r chi.Router) {
|
||||
r.Post("/hosts/{id}/repo/probe", s.handleUIRepoProbe)
|
||||
r.Post("/hosts/{id}/repo/hooks", s.handleUIRepoHooksSave)
|
||||
r.Post("/hosts/{id}/tags", s.handleUIHostTagsSave)
|
||||
r.Post("/hosts/{id}/mode", s.handleUIHostModeSave)
|
||||
r.Post("/hosts/{id}/admin-credentials", s.handleUIAdminCredentialsSave)
|
||||
r.Post("/hosts/{id}/admin-credentials/delete", s.handleUIAdminCredentialsDelete)
|
||||
r.Post("/hosts/{id}/schedules/new", s.handleUIScheduleSave)
|
||||
|
||||
@@ -49,14 +49,8 @@ func TestDashboard_HostRowSparklineRendersWithHistory(t *testing.T) {
|
||||
hostID := makeHost(t, st, "h-spark")
|
||||
ctx := context.Background()
|
||||
|
||||
// Two history points → polyline must render. Use dates relative to
|
||||
// now so the points always fall inside the dashboard's rolling
|
||||
// 30-day window (ui_handlers.go: since = now-30d); hard-coded dates
|
||||
// silently age out of the window and break this test over time.
|
||||
for i, day := range []string{
|
||||
time.Now().UTC().AddDate(0, 0, -2).Format("2006-01-02"),
|
||||
time.Now().UTC().AddDate(0, 0, -1).Format("2006-01-02"),
|
||||
} {
|
||||
// Two history points → polyline must render.
|
||||
for i, day := range []string{"2026-05-05", "2026-05-06"} {
|
||||
v := int64(100 + i*50)
|
||||
if err := st.UpsertHostRepoStatsHistory(ctx, hostID, day,
|
||||
store.HostRepoStats{TotalSizeBytes: &v}, time.Now().UTC()); err != nil {
|
||||
|
||||
@@ -983,43 +983,6 @@ func (s *Server) handleUIHostTagsSave(w stdhttp.ResponseWriter, r *stdhttp.Reque
|
||||
stdhttp.Redirect(w, r, "/hosts/"+hostID, stdhttp.StatusSeeOther)
|
||||
}
|
||||
|
||||
// handleUIHostModeSave flips a host's always-on flag. Checkbox present
|
||||
// in the form (value any) => always-on; absent => intermittent.
|
||||
// Operator-band; mounted in server.go. On change we clear open
|
||||
// offline/staleness alerts via the engine so the next sweep re-raises
|
||||
// only what still applies under the new mode.
|
||||
func (s *Server) handleUIHostModeSave(w stdhttp.ResponseWriter, r *stdhttp.Request) {
|
||||
u := s.requireUIUser(w, r)
|
||||
if u == nil {
|
||||
return
|
||||
}
|
||||
hostID := chi.URLParam(r, "id")
|
||||
if _, err := s.deps.Store.GetHost(r.Context(), hostID); err != nil {
|
||||
stdhttp.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
if err := r.ParseForm(); err != nil {
|
||||
stdhttp.Error(w, "bad request", stdhttp.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
alwaysOn := r.PostForm.Get("always_on") != ""
|
||||
if err := s.deps.Store.SetHostAlwaysOn(r.Context(), hostID, alwaysOn); err != nil {
|
||||
slog.Error("ui host mode: save", "host_id", hostID, "err", err)
|
||||
stdhttp.Error(w, "internal", stdhttp.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
if s.deps.AlertEngine != nil {
|
||||
s.deps.AlertEngine.ResolveOnModeChange(r.Context(), hostID, time.Now().UTC())
|
||||
}
|
||||
_ = s.deps.Store.AppendAudit(r.Context(), store.AuditEntry{
|
||||
ID: ulid.Make().String(), UserID: &u.ID, Actor: "user",
|
||||
Action: "host.mode_updated",
|
||||
TargetKind: ptr("host"), TargetID: &hostID,
|
||||
TS: time.Now().UTC(),
|
||||
})
|
||||
stdhttp.Redirect(w, r, "/hosts/"+hostID, stdhttp.StatusSeeOther)
|
||||
}
|
||||
|
||||
// normaliseTags splits a comma-separated string, lowercases each token,
|
||||
// trims whitespace, drops empties, and dedupes. Order is preserved
|
||||
// from first occurrence (so the user's typing order shows on screen).
|
||||
|
||||
@@ -1,88 +0,0 @@
|
||||
// ui_host_mode_test.go — covers handleUIHostModeSave: toggling a
|
||||
// host's always-on flag via POST /hosts/{id}/mode.
|
||||
package http
|
||||
|
||||
import (
|
||||
"context"
|
||||
stdhttp "net/http"
|
||||
"net/url"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestHostModeSaveToggle verifies the checkbox-absent ⇒ intermittent
|
||||
// and checkbox-present ⇒ always-on semantics, and that the audit row
|
||||
// lands for each request.
|
||||
func TestHostModeSaveToggle(t *testing.T) {
|
||||
t.Parallel()
|
||||
_, ts, st := rawTestServerWithUI(t)
|
||||
hostID, _ := enrolHostForUI(t, nil, st, "mode-toggle-host")
|
||||
|
||||
cookie := loginAsAdmin(t, st)
|
||||
|
||||
cli := &stdhttp.Client{
|
||||
CheckRedirect: func(*stdhttp.Request, []*stdhttp.Request) error {
|
||||
return stdhttp.ErrUseLastResponse
|
||||
},
|
||||
}
|
||||
|
||||
// --- POST with no always_on field => intermittent ---
|
||||
form := url.Values{}
|
||||
req, _ := stdhttp.NewRequest("POST", ts.URL+"/hosts/"+hostID+"/mode",
|
||||
strings.NewReader(form.Encode()))
|
||||
req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
|
||||
req.AddCookie(cookie)
|
||||
res, err := cli.Do(req)
|
||||
if err != nil {
|
||||
t.Fatalf("do: %v", err)
|
||||
}
|
||||
_ = res.Body.Close()
|
||||
if res.StatusCode != stdhttp.StatusSeeOther {
|
||||
t.Fatalf("status: got %d, want 303", res.StatusCode)
|
||||
}
|
||||
if loc := res.Header.Get("Location"); loc != "/hosts/"+hostID {
|
||||
t.Errorf("Location: got %q, want /hosts/%s", loc, hostID)
|
||||
}
|
||||
|
||||
got, err := st.GetHost(context.Background(), hostID)
|
||||
if err != nil {
|
||||
t.Fatalf("GetHost: %v", err)
|
||||
}
|
||||
if got.AlwaysOn {
|
||||
t.Errorf("AlwaysOn after empty form: got true, want false")
|
||||
}
|
||||
|
||||
// --- POST with always_on=on => always-on ---
|
||||
form2 := url.Values{"always_on": {"on"}}
|
||||
req2, _ := stdhttp.NewRequest("POST", ts.URL+"/hosts/"+hostID+"/mode",
|
||||
strings.NewReader(form2.Encode()))
|
||||
req2.Header.Set("Content-Type", "application/x-www-form-urlencoded")
|
||||
req2.AddCookie(cookie)
|
||||
res2, err := cli.Do(req2)
|
||||
if err != nil {
|
||||
t.Fatalf("do: %v", err)
|
||||
}
|
||||
_ = res2.Body.Close()
|
||||
if res2.StatusCode != stdhttp.StatusSeeOther {
|
||||
t.Fatalf("status: got %d, want 303", res2.StatusCode)
|
||||
}
|
||||
|
||||
got2, err := st.GetHost(context.Background(), hostID)
|
||||
if err != nil {
|
||||
t.Fatalf("GetHost: %v", err)
|
||||
}
|
||||
if !got2.AlwaysOn {
|
||||
t.Errorf("AlwaysOn after always_on=on: got false, want true")
|
||||
}
|
||||
|
||||
// Audit rows must exist (one per request).
|
||||
var n int
|
||||
if err := st.DB().QueryRow(
|
||||
`SELECT COUNT(*) FROM audit_log WHERE action = 'host.mode_updated' AND target_id = ?`,
|
||||
hostID).Scan(&n); err != nil {
|
||||
t.Fatalf("count audit: %v", err)
|
||||
}
|
||||
if n != 2 {
|
||||
t.Errorf("audit rows: got %d, want 2", n)
|
||||
}
|
||||
}
|
||||
@@ -1,301 +0,0 @@
|
||||
// Package metrics owns the in-process Prometheus exposition for
|
||||
// the control plane. It deliberately avoids prometheus/client_golang
|
||||
// — the legacy text format is small and stable, and the repo's house
|
||||
// style is to keep dependency surface minimal.
|
||||
//
|
||||
// Two halves:
|
||||
//
|
||||
// - Registry holds a job-duration histogram. Server hooks call
|
||||
// Registry.ObserveJob from the WS job-finished branch.
|
||||
//
|
||||
// - Render emits a complete /metrics body from a Snapshot. The
|
||||
// Snapshot is a plain value bag; the HTTP handler assembles it
|
||||
// from store reads + Registry.SnapshotWith at scrape time. This
|
||||
// keeps the package free of any database or HTTP dependency.
|
||||
package metrics
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// JobDurationBuckets lists the histogram's upper bounds in seconds,
// ascending. The range runs from quick admin commands (unlock, init,
// check) to backups lasting hours. The +Inf bucket is not listed; it is
// implicit.
var JobDurationBuckets = []float64{
	1, 5, 30, 60, // up to one minute
	300, 1800, 3600, // 5 min, 30 min, 1 h
	21600, 86400, // 6 h, 24 h
}
|
||||
|
||||
// Registry is the in-memory store for the job-duration histogram.
|
||||
// Concurrent observers and a single periodic snapshotter is the
|
||||
// expected access pattern; both are guarded by a mutex.
|
||||
type Registry struct {
|
||||
mu sync.Mutex
|
||||
jobs map[jobKey]*histogramState
|
||||
clock func() time.Time
|
||||
}
|
||||
|
||||
// jobKey identifies one histogram series: a job kind paired with its
// terminal status.
type jobKey struct {
	kind, status string
}
|
||||
|
||||
// histogramState is the running tally for one (kind, status) series.
type histogramState struct {
	// counts is cumulative: counts[i] is the number of observations
	// <= JobDurationBuckets[i]. The extra final slot,
	// counts[len(JobDurationBuckets)], is the implicit +Inf bucket. It
	// equals the total count and is kept for symmetry with the rendered
	// _bucket{le="+Inf"} line and as a sanity check.
	counts []uint64
	// sum is the total of all observed durations, in seconds.
	sum float64
	// count is the number of observations.
	count uint64
}
|
||||
|
||||
// NewRegistry builds an empty registry.
|
||||
func NewRegistry() *Registry {
|
||||
return &Registry{
|
||||
jobs: make(map[jobKey]*histogramState),
|
||||
clock: time.Now,
|
||||
}
|
||||
}
|
||||
|
||||
// ObserveJob records one job-duration sample. Negative durations
|
||||
// (clock-skew artefacts) are clamped to zero. Empty kind/status
|
||||
// strings are tolerated but degrade the dashboard — callers should
|
||||
// pass meaningful values.
|
||||
func (r *Registry) ObserveJob(kind, status string, dur time.Duration) {
|
||||
if r == nil {
|
||||
return
|
||||
}
|
||||
if dur < 0 {
|
||||
dur = 0
|
||||
}
|
||||
secs := dur.Seconds()
|
||||
|
||||
r.mu.Lock()
|
||||
defer r.mu.Unlock()
|
||||
k := jobKey{kind: kind, status: status}
|
||||
hs, ok := r.jobs[k]
|
||||
if !ok {
|
||||
hs = &histogramState{counts: make([]uint64, len(JobDurationBuckets)+1)}
|
||||
r.jobs[k] = hs
|
||||
}
|
||||
for i, ub := range JobDurationBuckets {
|
||||
if secs <= ub {
|
||||
hs.counts[i]++
|
||||
}
|
||||
}
|
||||
hs.counts[len(JobDurationBuckets)]++ // +Inf
|
||||
hs.sum += secs
|
||||
hs.count++
|
||||
}
|
||||
|
||||
// HistogramRow carries one (kind, status) histogram inside a Snapshot.
type HistogramRow struct {
	Kind   string
	Status string
	// Buckets holds cumulative counts, one per JobDurationBuckets upper
	// bound, followed by a final element for the +Inf total.
	Buckets []uint64
	// Sum is the total of the observed durations in seconds.
	Sum float64
	// Count is the number of observations.
	Count uint64
}
|
||||
|
||||
// snapshotJobs returns a deterministic, sorted copy of the
|
||||
// histogram state. Sort order: kind asc, status asc.
|
||||
func (r *Registry) snapshotJobs() []HistogramRow {
|
||||
if r == nil {
|
||||
return nil
|
||||
}
|
||||
r.mu.Lock()
|
||||
defer r.mu.Unlock()
|
||||
rows := make([]HistogramRow, 0, len(r.jobs))
|
||||
for k, hs := range r.jobs {
|
||||
buckets := make([]uint64, len(hs.counts))
|
||||
copy(buckets, hs.counts)
|
||||
rows = append(rows, HistogramRow{
|
||||
Kind: k.kind,
|
||||
Status: k.status,
|
||||
Buckets: buckets,
|
||||
Sum: hs.sum,
|
||||
Count: hs.count,
|
||||
})
|
||||
}
|
||||
sort.Slice(rows, func(i, j int) bool {
|
||||
if rows[i].Kind != rows[j].Kind {
|
||||
return rows[i].Kind < rows[j].Kind
|
||||
}
|
||||
return rows[i].Status < rows[j].Status
|
||||
})
|
||||
return rows
|
||||
}
|
||||
|
||||
// HostRow is the per-host data behind the per-host gauges. The pointer
// fields use nil for "no value", which lets the renderer leave a metric
// line out — for instance when a host has never run a backup.
type HostRow struct {
	ID     string
	Name   string
	Online bool

	LastBackupUnix      *int64 // nil = no backup yet
	LastBackupSucceeded *bool  // nil = no backup yet
	RepoSizeBytes       *int64 // nil = no stats yet

	SnapshotCount  int
	OpenAlertCount int
	RepoStatus     string // "unknown" | "ready" | "init_failed"
}
|
||||
|
||||
// Snapshot is a frozen view of the data needed to render /metrics.
|
||||
// Constructed by the HTTP handler from Store reads + Registry.snapshotJobs.
|
||||
type Snapshot struct {
|
||||
Hosts []HostRow
|
||||
HostsTotal int
|
||||
HostsOnline int
|
||||
AlertsBySeverity map[string]int // severity → count
|
||||
BuildVersion string
|
||||
BuildCommit string
|
||||
GoVersion string
|
||||
JobDurationRows []HistogramRow
|
||||
}
|
||||
|
||||
// SnapshotWith builds a Snapshot from raw inputs and the registry's
|
||||
// current job-duration state. Convenience for the HTTP handler.
|
||||
func (r *Registry) SnapshotWith(hosts []HostRow, alerts map[string]int, buildVer, commit, goVer string) Snapshot {
|
||||
online := 0
|
||||
for _, h := range hosts {
|
||||
if h.Online {
|
||||
online++
|
||||
}
|
||||
}
|
||||
return Snapshot{
|
||||
Hosts: hosts,
|
||||
HostsTotal: len(hosts),
|
||||
HostsOnline: online,
|
||||
AlertsBySeverity: alerts,
|
||||
BuildVersion: buildVer,
|
||||
BuildCommit: commit,
|
||||
GoVersion: goVer,
|
||||
JobDurationRows: r.snapshotJobs(),
|
||||
}
|
||||
}
|
||||
|
||||
// Render emits a complete Prometheus text-exposition body for s.
|
||||
// Output is deterministic: metric names appear in a fixed order and
|
||||
// labels within a metric are sorted by their first label value.
|
||||
func Render(w io.Writer, s Snapshot) error {
|
||||
var b strings.Builder
|
||||
|
||||
// --- Server gauges ---------------------------------------------------
|
||||
b.WriteString("# HELP rm_hosts_total Total number of enrolled hosts (excludes pending announces).\n")
|
||||
b.WriteString("# TYPE rm_hosts_total gauge\n")
|
||||
fmt.Fprintf(&b, "rm_hosts_total %d\n", s.HostsTotal)
|
||||
|
||||
b.WriteString("# HELP rm_hosts_online Number of hosts currently online (status='online').\n")
|
||||
b.WriteString("# TYPE rm_hosts_online gauge\n")
|
||||
fmt.Fprintf(&b, "rm_hosts_online %d\n", s.HostsOnline)
|
||||
|
||||
b.WriteString("# HELP rm_active_alerts Open alerts grouped by severity.\n")
|
||||
b.WriteString("# TYPE rm_active_alerts gauge\n")
|
||||
severities := []string{"info", "warning", "critical"}
|
||||
for _, sev := range severities {
|
||||
fmt.Fprintf(&b, "rm_active_alerts{severity=%q} %d\n", sev, s.AlertsBySeverity[sev])
|
||||
}
|
||||
|
||||
b.WriteString("# HELP rm_build_info Build identifying labels; value is always 1.\n")
|
||||
b.WriteString("# TYPE rm_build_info gauge\n")
|
||||
fmt.Fprintf(&b, "rm_build_info{version=%q,commit=%q,go_version=%q} 1\n",
|
||||
s.BuildVersion, s.BuildCommit, s.GoVersion)
|
||||
|
||||
// --- Per-host gauges -------------------------------------------------
|
||||
// Stable order: by host id.
|
||||
hosts := append([]HostRow(nil), s.Hosts...)
|
||||
sort.Slice(hosts, func(i, j int) bool { return hosts[i].ID < hosts[j].ID })
|
||||
|
||||
b.WriteString("# HELP rm_host_agent_online 1 if the agent is currently online, 0 otherwise.\n")
|
||||
b.WriteString("# TYPE rm_host_agent_online gauge\n")
|
||||
for _, h := range hosts {
|
||||
v := 0
|
||||
if h.Online {
|
||||
v = 1
|
||||
}
|
||||
fmt.Fprintf(&b, "rm_host_agent_online{host_id=%q,host=%q} %d\n",
|
||||
h.ID, h.Name, v)
|
||||
}
|
||||
|
||||
b.WriteString("# HELP rm_host_last_backup_timestamp_seconds Unix timestamp of the host's most recent backup. Omitted for hosts with no backup yet.\n")
|
||||
b.WriteString("# TYPE rm_host_last_backup_timestamp_seconds gauge\n")
|
||||
for _, h := range hosts {
|
||||
if h.LastBackupUnix == nil {
|
||||
continue
|
||||
}
|
||||
fmt.Fprintf(&b, "rm_host_last_backup_timestamp_seconds{host_id=%q,host=%q} %d\n",
|
||||
h.ID, h.Name, *h.LastBackupUnix)
|
||||
}
|
||||
|
||||
b.WriteString("# HELP rm_host_last_backup_success 1 if the host's most recent backup succeeded, 0 otherwise. Omitted for hosts with no backup yet.\n")
|
||||
b.WriteString("# TYPE rm_host_last_backup_success gauge\n")
|
||||
for _, h := range hosts {
|
||||
if h.LastBackupSucceeded == nil {
|
||||
continue
|
||||
}
|
||||
v := 0
|
||||
if *h.LastBackupSucceeded {
|
||||
v = 1
|
||||
}
|
||||
fmt.Fprintf(&b, "rm_host_last_backup_success{host_id=%q,host=%q} %d\n",
|
||||
h.ID, h.Name, v)
|
||||
}
|
||||
|
||||
b.WriteString("# HELP rm_host_repo_size_bytes Latest reported repo size from `restic stats --mode raw-data`. Omitted for hosts with no stats yet.\n")
|
||||
b.WriteString("# TYPE rm_host_repo_size_bytes gauge\n")
|
||||
for _, h := range hosts {
|
||||
if h.RepoSizeBytes == nil {
|
||||
continue
|
||||
}
|
||||
fmt.Fprintf(&b, "rm_host_repo_size_bytes{host_id=%q,host=%q} %d\n",
|
||||
h.ID, h.Name, *h.RepoSizeBytes)
|
||||
}
|
||||
|
||||
b.WriteString("# HELP rm_host_snapshot_count Number of restic snapshots known on the host's repo.\n")
|
||||
b.WriteString("# TYPE rm_host_snapshot_count gauge\n")
|
||||
for _, h := range hosts {
|
||||
fmt.Fprintf(&b, "rm_host_snapshot_count{host_id=%q,host=%q} %d\n",
|
||||
h.ID, h.Name, h.SnapshotCount)
|
||||
}
|
||||
|
||||
b.WriteString("# HELP rm_host_open_alerts Number of currently open alerts attached to this host.\n")
|
||||
b.WriteString("# TYPE rm_host_open_alerts gauge\n")
|
||||
for _, h := range hosts {
|
||||
fmt.Fprintf(&b, "rm_host_open_alerts{host_id=%q,host=%q} %d\n",
|
||||
h.ID, h.Name, h.OpenAlertCount)
|
||||
}
|
||||
|
||||
b.WriteString("# HELP rm_host_repo_status Repo readiness state for the host. Exactly one row per host with status label set.\n")
|
||||
b.WriteString("# TYPE rm_host_repo_status gauge\n")
|
||||
for _, h := range hosts {
|
||||
st := h.RepoStatus
|
||||
if st == "" {
|
||||
st = "unknown"
|
||||
}
|
||||
fmt.Fprintf(&b, "rm_host_repo_status{host_id=%q,host=%q,status=%q} 1\n",
|
||||
h.ID, h.Name, st)
|
||||
}
|
||||
|
||||
// --- Histogram -------------------------------------------------------
|
||||
b.WriteString("# HELP rm_job_duration_seconds End-to-end duration of completed jobs, by kind and terminal status.\n")
|
||||
b.WriteString("# TYPE rm_job_duration_seconds histogram\n")
|
||||
for _, row := range s.JobDurationRows {
|
||||
for i, ub := range JobDurationBuckets {
|
||||
fmt.Fprintf(&b, "rm_job_duration_seconds_bucket{kind=%q,status=%q,le=\"%g\"} %d\n",
|
||||
row.Kind, row.Status, ub, row.Buckets[i])
|
||||
}
|
||||
fmt.Fprintf(&b, "rm_job_duration_seconds_bucket{kind=%q,status=%q,le=\"+Inf\"} %d\n",
|
||||
row.Kind, row.Status, row.Buckets[len(JobDurationBuckets)])
|
||||
fmt.Fprintf(&b, "rm_job_duration_seconds_sum{kind=%q,status=%q} %g\n",
|
||||
row.Kind, row.Status, row.Sum)
|
||||
fmt.Fprintf(&b, "rm_job_duration_seconds_count{kind=%q,status=%q} %d\n",
|
||||
row.Kind, row.Status, row.Count)
|
||||
}
|
||||
|
||||
_, err := io.WriteString(w, b.String())
|
||||
return err
|
||||
}
|
||||
@@ -1,182 +0,0 @@
|
||||
package metrics
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"strings"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestObserveJobBuckets(t *testing.T) {
|
||||
r := NewRegistry()
|
||||
// Bucket boundaries: 1, 5, 30, 60, 300, 1800, 3600, 21600, 86400
|
||||
r.ObserveJob("backup", "succeeded", 500*time.Millisecond) // <= 1
|
||||
r.ObserveJob("backup", "succeeded", 30*time.Second) // == 30 (boundary)
|
||||
r.ObserveJob("backup", "succeeded", 90*time.Second) // > 60, <= 300
|
||||
r.ObserveJob("backup", "succeeded", 2*time.Hour) // > 3600 → 21600 bucket
|
||||
rows := r.snapshotJobs()
|
||||
if len(rows) != 1 {
|
||||
t.Fatalf("rows: %d", len(rows))
|
||||
}
|
||||
row := rows[0]
|
||||
if row.Count != 4 {
|
||||
t.Errorf("count: %d", row.Count)
|
||||
}
|
||||
wantSum := 0.5 + 30 + 90 + 7200.0
|
||||
if row.Sum != wantSum {
|
||||
t.Errorf("sum: got %v want %v", row.Sum, wantSum)
|
||||
}
|
||||
// Cumulative buckets:
|
||||
// le=1 → 1 (the 0.5s)
|
||||
// le=5 → 1
|
||||
// le=30 → 2 (boundary inclusive: 30s included)
|
||||
// le=60 → 2
|
||||
// le=300 → 3
|
||||
// le=1800 → 3
|
||||
// le=3600 → 3
|
||||
// le=21600 → 4
|
||||
// le=86400 → 4
|
||||
// le=+Inf → 4
|
||||
want := []uint64{1, 1, 2, 2, 3, 3, 3, 4, 4, 4}
|
||||
for i, w := range want {
|
||||
if row.Buckets[i] != w {
|
||||
t.Errorf("bucket[%d]=%d want %d", i, row.Buckets[i], w)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestObserveJobNegativeClampedToZero(t *testing.T) {
|
||||
r := NewRegistry()
|
||||
r.ObserveJob("backup", "succeeded", -5*time.Second)
|
||||
rows := r.snapshotJobs()
|
||||
if len(rows) != 1 || rows[0].Sum != 0 || rows[0].Count != 1 {
|
||||
t.Errorf("expected one zero-second observation, got %+v", rows)
|
||||
}
|
||||
}
|
||||
|
||||
func TestObserveJobConcurrent(t *testing.T) {
|
||||
r := NewRegistry()
|
||||
const goroutines = 16
|
||||
const each = 200
|
||||
var wg sync.WaitGroup
|
||||
for g := 0; g < goroutines; g++ {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for i := 0; i < each; i++ {
|
||||
r.ObserveJob("backup", "succeeded", time.Second)
|
||||
}
|
||||
}()
|
||||
}
|
||||
wg.Wait()
|
||||
rows := r.snapshotJobs()
|
||||
if len(rows) != 1 {
|
||||
t.Fatalf("rows: %d", len(rows))
|
||||
}
|
||||
if rows[0].Count != uint64(goroutines*each) {
|
||||
t.Errorf("count: got %d want %d", rows[0].Count, goroutines*each)
|
||||
}
|
||||
}
|
||||
|
||||
func TestObserveJobNilRegistryNoop(t *testing.T) {
|
||||
var r *Registry // nil
|
||||
r.ObserveJob("backup", "succeeded", time.Second)
|
||||
}
|
||||
|
||||
func TestRenderGolden(t *testing.T) {
|
||||
r := NewRegistry()
|
||||
r.ObserveJob("backup", "succeeded", 5*time.Second)
|
||||
r.ObserveJob("forget", "succeeded", 100*time.Millisecond)
|
||||
|
||||
pi64 := func(v int64) *int64 { return &v }
|
||||
pbool := func(v bool) *bool { return &v }
|
||||
|
||||
hosts := []HostRow{
|
||||
{
|
||||
ID: "01H0001", Name: "alpha",
|
||||
Online: true,
|
||||
LastBackupUnix: pi64(1700000000),
|
||||
LastBackupSucceeded: pbool(true),
|
||||
RepoSizeBytes: pi64(123456789),
|
||||
SnapshotCount: 42,
|
||||
OpenAlertCount: 0,
|
||||
RepoStatus: "ready",
|
||||
},
|
||||
{
|
||||
ID: "01H0002", Name: "bravo",
|
||||
Online: false,
|
||||
SnapshotCount: 0,
|
||||
OpenAlertCount: 1,
|
||||
RepoStatus: "init_failed",
|
||||
},
|
||||
}
|
||||
snap := r.SnapshotWith(hosts,
|
||||
map[string]int{"info": 0, "warning": 1, "critical": 0},
|
||||
"v1.2.3", "deadbeef", "go1.25.0")
|
||||
|
||||
var buf bytes.Buffer
|
||||
if err := Render(&buf, snap); err != nil {
|
||||
t.Fatalf("render: %v", err)
|
||||
}
|
||||
out := buf.String()
|
||||
|
||||
for _, want := range []string{
|
||||
"# HELP rm_hosts_total ",
|
||||
"rm_hosts_total 2\n",
|
||||
"rm_hosts_online 1\n",
|
||||
`rm_active_alerts{severity="warning"} 1`,
|
||||
`rm_active_alerts{severity="info"} 0`,
|
||||
`rm_active_alerts{severity="critical"} 0`,
|
||||
`rm_build_info{version="v1.2.3",commit="deadbeef",go_version="go1.25.0"} 1`,
|
||||
`rm_host_agent_online{host_id="01H0001",host="alpha"} 1`,
|
||||
`rm_host_agent_online{host_id="01H0002",host="bravo"} 0`,
|
||||
`rm_host_last_backup_timestamp_seconds{host_id="01H0001",host="alpha"} 1700000000`,
|
||||
`rm_host_last_backup_success{host_id="01H0001",host="alpha"} 1`,
|
||||
`rm_host_repo_size_bytes{host_id="01H0001",host="alpha"} 123456789`,
|
||||
`rm_host_snapshot_count{host_id="01H0001",host="alpha"} 42`,
|
||||
`rm_host_snapshot_count{host_id="01H0002",host="bravo"} 0`,
|
||||
`rm_host_open_alerts{host_id="01H0002",host="bravo"} 1`,
|
||||
`rm_host_repo_status{host_id="01H0001",host="alpha",status="ready"} 1`,
|
||||
`rm_host_repo_status{host_id="01H0002",host="bravo",status="init_failed"} 1`,
|
||||
`rm_job_duration_seconds_bucket{kind="backup",status="succeeded",le="1"} 0`,
|
||||
`rm_job_duration_seconds_bucket{kind="backup",status="succeeded",le="5"} 1`,
|
||||
`rm_job_duration_seconds_bucket{kind="backup",status="succeeded",le="+Inf"} 1`,
|
||||
`rm_job_duration_seconds_sum{kind="backup",status="succeeded"} 5`,
|
||||
`rm_job_duration_seconds_count{kind="backup",status="succeeded"} 1`,
|
||||
`rm_job_duration_seconds_bucket{kind="forget",status="succeeded",le="1"} 1`,
|
||||
} {
|
||||
if !strings.Contains(out, want) {
|
||||
t.Errorf("missing line:\n %s\n--- full output ---\n%s", want, out)
|
||||
}
|
||||
}
|
||||
|
||||
// bravo had no last backup → those metric lines must be absent for it.
|
||||
for _, ban := range []string{
|
||||
`rm_host_last_backup_timestamp_seconds{host_id="01H0002"`,
|
||||
`rm_host_last_backup_success{host_id="01H0002"`,
|
||||
`rm_host_repo_size_bytes{host_id="01H0002"`,
|
||||
} {
|
||||
if strings.Contains(out, ban) {
|
||||
t.Errorf("unexpected line for bravo: %q", ban)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestRenderEmptySnapshot(t *testing.T) {
|
||||
r := NewRegistry()
|
||||
snap := r.SnapshotWith(nil, nil, "dev", "", "go1.25.0")
|
||||
var buf bytes.Buffer
|
||||
if err := Render(&buf, snap); err != nil {
|
||||
t.Fatalf("render: %v", err)
|
||||
}
|
||||
out := buf.String()
|
||||
if !strings.Contains(out, "rm_hosts_total 0\n") {
|
||||
t.Errorf("missing zero-host gauge:\n%s", out)
|
||||
}
|
||||
// Histogram block has its HELP/TYPE but no rows. The HELP/TYPE
|
||||
// presence is correct and helps Prometheus pre-register the metric.
|
||||
if !strings.Contains(out, "# TYPE rm_job_duration_seconds histogram") {
|
||||
t.Errorf("histogram HELP/TYPE missing")
|
||||
}
|
||||
}
|
||||
@@ -221,40 +221,23 @@ func formatBytes(n int64) template.HTML {
|
||||
// "in 5m"-style. Accepts *time.Time or time.Time so templates can
|
||||
// pass either without fighting Go templates' lack of an address-of operator.
|
||||
// Anything else returns "—".
|
||||
//
|
||||
// The output is wrapped in a <time data-rel-ts="..."> element so a
|
||||
// small client-side ticker (see base.html) can refresh the label
|
||||
// without a full page reload — otherwise a long-open tab shows
|
||||
// timestamps frozen at render time.
|
||||
func formatRelTime(v any) template.HTML {
|
||||
func formatRelTime(v any) string {
|
||||
var t time.Time
|
||||
switch x := v.(type) {
|
||||
case time.Time:
|
||||
t = x
|
||||
case *time.Time:
|
||||
if x == nil {
|
||||
return template.HTML("—")
|
||||
return "—"
|
||||
}
|
||||
t = *x
|
||||
default:
|
||||
return template.HTML("—")
|
||||
return "—"
|
||||
}
|
||||
if t.IsZero() {
|
||||
return template.HTML("—")
|
||||
return "—"
|
||||
}
|
||||
label := relTimeLabel(time.Since(t))
|
||||
return template.HTML(fmt.Sprintf(
|
||||
`<time data-rel-ts="%s" title="%s">%s</time>`,
|
||||
t.UTC().Format(time.RFC3339Nano),
|
||||
t.UTC().Format("2006-01-02 15:04:05 UTC"),
|
||||
label,
|
||||
))
|
||||
}
|
||||
|
||||
// relTimeLabel turns a duration-since-now into the short human label
|
||||
// used by formatRelTime (and mirrored verbatim by the JS ticker, so
|
||||
// keep the two in sync if you change the buckets).
|
||||
func relTimeLabel(d time.Duration) string {
|
||||
d := time.Since(t)
|
||||
suffix := "ago"
|
||||
if d < 0 {
|
||||
d = -d
|
||||
|
||||
@@ -1,49 +0,0 @@
|
||||
package ui
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestFormatRelTimeWrapsInTickableTimeElement(t *testing.T) {
|
||||
// A long-open tab needs a stable anchor so the JS ticker can
|
||||
// refresh the label — see base.html.
|
||||
when := time.Now().Add(-3 * time.Hour)
|
||||
got := string(formatRelTime(when))
|
||||
if !strings.Contains(got, `<time data-rel-ts="`) {
|
||||
t.Errorf("missing data-rel-ts anchor in %q", got)
|
||||
}
|
||||
if !strings.Contains(got, "3h ago</time>") {
|
||||
t.Errorf("expected '3h ago' label, got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFormatRelTimeNilReturnsDash(t *testing.T) {
|
||||
var p *time.Time
|
||||
if string(formatRelTime(p)) != "—" {
|
||||
t.Errorf("nil should render as em-dash, got %q", formatRelTime(p))
|
||||
}
|
||||
if string(formatRelTime(time.Time{})) != "—" {
|
||||
t.Errorf("zero should render as em-dash")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRelTimeLabelBuckets(t *testing.T) {
|
||||
cases := []struct {
|
||||
d time.Duration
|
||||
want string
|
||||
}{
|
||||
{30 * time.Second, "30s ago"},
|
||||
{5 * time.Minute, "5m ago"},
|
||||
{2 * time.Hour, "2h ago"},
|
||||
{3 * 24 * time.Hour, "3d ago"},
|
||||
{2 * 7 * 24 * time.Hour, "2w ago"},
|
||||
{-5 * time.Minute, "5m from now"},
|
||||
}
|
||||
for _, c := range cases {
|
||||
if got := relTimeLabel(c.d); got != c.want {
|
||||
t.Errorf("relTimeLabel(%v) = %q, want %q", c.d, got, c.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -36,7 +36,7 @@ type ViewData struct {
|
||||
User *User
|
||||
|
||||
// Active is the slug of the currently active primary nav tab
|
||||
// ("dashboard" / "alerts" / "audit" / "settings").
|
||||
// ("dashboard" / "repos" / "alerts" / "audit" / "settings").
|
||||
// The nav partial highlights the matching tab.
|
||||
Active string
|
||||
|
||||
|
||||
@@ -15,7 +15,6 @@ import (
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/alert"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/api"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/auth"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/server/metrics"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/store"
|
||||
"gitea.dcglab.co.uk/steve/restic-manager/internal/version"
|
||||
)
|
||||
@@ -28,9 +27,6 @@ type HandlerDeps struct {
|
||||
// AlertEngine receives job-finished and host-online events so the
|
||||
// alert engine can evaluate its rules. Optional; nil = no-op.
|
||||
AlertEngine *alert.Engine
|
||||
// Metrics records job-duration observations on every terminal
|
||||
// status. Optional; nil = no-op (test fixtures pass nil).
|
||||
Metrics *metrics.Registry
|
||||
// UpdateWatcher reconciles in-flight agent-update dispatches against
|
||||
// hello envelopes. Optional; nil = no-op.
|
||||
UpdateWatcher *UpdateWatcher
|
||||
@@ -243,13 +239,6 @@ func dispatchAgentMessage(ctx context.Context, c *Conn, hostID string, env api.E
|
||||
slog.Warn("ws: set host last backup", "host_id", hostID, "err", err)
|
||||
}
|
||||
}
|
||||
// Job-duration histogram (P6-04). Skip when StartedAt is
|
||||
// missing (race: agent shipped finished without a started,
|
||||
// or the row predates this code).
|
||||
if deps.Metrics != nil && job.StartedAt != nil {
|
||||
deps.Metrics.ObserveJob(job.Kind, string(p.Status),
|
||||
p.FinishedAt.Sub(*job.StartedAt))
|
||||
}
|
||||
}
|
||||
if deps.JobHub != nil {
|
||||
deps.JobHub.Broadcast(p.JobID, env)
|
||||
|
||||
+4
-25
@@ -44,7 +44,7 @@ func (s *Store) LookupHostByAgentToken(ctx context.Context, tokenHash string) (*
|
||||
repo_size_bytes, snapshot_count, open_alert_count,
|
||||
applied_schedule_version, bandwidth_up_kbps, bandwidth_down_kbps,
|
||||
pre_hook_default, post_hook_default,
|
||||
repo_status, repo_status_error, always_on
|
||||
repo_status, repo_status_error
|
||||
FROM hosts WHERE agent_token_hash = ?`,
|
||||
tokenHash)
|
||||
return scanHost(row)
|
||||
@@ -59,7 +59,7 @@ func (s *Store) GetHost(ctx context.Context, id string) (*Host, error) {
|
||||
repo_size_bytes, snapshot_count, open_alert_count,
|
||||
applied_schedule_version, bandwidth_up_kbps, bandwidth_down_kbps,
|
||||
pre_hook_default, post_hook_default,
|
||||
repo_status, repo_status_error, always_on
|
||||
repo_status, repo_status_error
|
||||
FROM hosts WHERE id = ?`, id)
|
||||
return scanHost(row)
|
||||
}
|
||||
@@ -227,7 +227,7 @@ func (s *Store) ListHosts(ctx context.Context) ([]Host, error) {
|
||||
repo_size_bytes, snapshot_count, open_alert_count,
|
||||
applied_schedule_version, bandwidth_up_kbps, bandwidth_down_kbps,
|
||||
pre_hook_default, post_hook_default,
|
||||
repo_status, repo_status_error, always_on
|
||||
repo_status, repo_status_error
|
||||
FROM hosts ORDER BY name`)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("store: list hosts: %w", err)
|
||||
@@ -267,7 +267,6 @@ func scanHostRow(s hostScanner) (*Host, error) {
|
||||
tags string
|
||||
bwUp, bwDown sql.NullInt64
|
||||
preHook, postHook sql.NullString
|
||||
alwaysOn int
|
||||
)
|
||||
err := s.Scan(&h.ID, &h.Name, &h.OS, &h.Arch,
|
||||
&h.AgentVersion, &h.ResticVersion, &h.ProtocolVersion,
|
||||
@@ -276,7 +275,7 @@ func scanHostRow(s hostScanner) (*Host, error) {
|
||||
&h.RepoSizeBytes, &h.SnapshotCount, &h.OpenAlertCount,
|
||||
&h.AppliedScheduleVersion, &bwUp, &bwDown,
|
||||
&preHook, &postHook,
|
||||
&h.RepoStatus, &h.RepoStatusError, &alwaysOn)
|
||||
&h.RepoStatus, &h.RepoStatusError)
|
||||
if err != nil {
|
||||
if errors.Is(err, sql.ErrNoRows) {
|
||||
return nil, ErrNotFound
|
||||
@@ -331,7 +330,6 @@ func scanHostRow(s hostScanner) (*Host, error) {
|
||||
if postHook.Valid {
|
||||
h.PostHookDefault = postHook.String
|
||||
}
|
||||
h.AlwaysOn = alwaysOn != 0
|
||||
return &h, nil
|
||||
}
|
||||
|
||||
@@ -380,25 +378,6 @@ func (s *Store) SetHostTags(ctx context.Context, hostID string, tags []string) e
|
||||
return nil
|
||||
}
|
||||
|
||||
// SetHostAlwaysOn flips the host's always-on flag. true = 24x7 server
|
||||
// (default); false = intermittent host (laptop). See the
|
||||
// always-on-host-mode spec.
|
||||
func (s *Store) SetHostAlwaysOn(ctx context.Context, hostID string, alwaysOn bool) error {
|
||||
v := 0
|
||||
if alwaysOn {
|
||||
v = 1
|
||||
}
|
||||
res, err := s.db.ExecContext(ctx,
|
||||
`UPDATE hosts SET always_on = ? WHERE id = ?`, v, hostID)
|
||||
if err != nil {
|
||||
return fmt.Errorf("store: set host always_on: %w", err)
|
||||
}
|
||||
if n, _ := res.RowsAffected(); n == 0 {
|
||||
return ErrNotFound
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// DistinctHostTags returns the union of every tag in use across the
|
||||
// fleet, sorted. Powers the autocomplete on the host-tags editor and
|
||||
// the chip-row filter on the dashboard. Cheap at fleet sizes this
|
||||
|
||||
@@ -1,55 +0,0 @@
|
||||
package store
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestHostAlwaysOnDefaultAndToggle(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
st := openTestStore(t)
|
||||
|
||||
h := Host{
|
||||
ID: "h-always-on", Name: "lap", OS: "linux", Arch: "amd64",
|
||||
ProtocolVersion: 1, EnrolledAt: time.Now().UTC(),
|
||||
}
|
||||
if err := st.CreateHost(ctx, h, "tok-hash", "pin"); err != nil {
|
||||
t.Fatalf("create host: %v", err)
|
||||
}
|
||||
got, err := st.GetHost(ctx, h.ID)
|
||||
if err != nil {
|
||||
t.Fatalf("get host: %v", err)
|
||||
}
|
||||
if !got.AlwaysOn {
|
||||
t.Fatalf("new host should default to always_on=true, got false")
|
||||
}
|
||||
|
||||
if err := st.SetHostAlwaysOn(ctx, h.ID, false); err != nil {
|
||||
t.Fatalf("set always_on: %v", err)
|
||||
}
|
||||
got, err = st.GetHost(ctx, h.ID)
|
||||
if err != nil {
|
||||
t.Fatalf("get host 2: %v", err)
|
||||
}
|
||||
if got.AlwaysOn {
|
||||
t.Fatalf("expected always_on=false after toggle, got true")
|
||||
}
|
||||
|
||||
hosts, err := st.ListHosts(ctx)
|
||||
if err != nil {
|
||||
t.Fatalf("list hosts: %v", err)
|
||||
}
|
||||
if len(hosts) != 1 || hosts[0].AlwaysOn {
|
||||
t.Fatalf("ListHosts should report always_on=false, got %+v", hosts)
|
||||
}
|
||||
|
||||
// Verify the agent hot-path (LookupHostByAgentToken) also reflects the toggle.
|
||||
byToken, err := st.LookupHostByAgentToken(ctx, "tok-hash")
|
||||
if err != nil {
|
||||
t.Fatalf("lookup by agent token: %v", err)
|
||||
}
|
||||
if byToken.AlwaysOn {
|
||||
t.Fatalf("LookupHostByAgentToken: expected always_on=false after toggle, got true")
|
||||
}
|
||||
}
|
||||
@@ -270,22 +270,6 @@ func (s *Store) LatestJobByKind(ctx context.Context, hostID, kind string) (*Job,
|
||||
return &j, nil
|
||||
}
|
||||
|
||||
// HasActiveBackupJob reports whether the host has a backup job that is
|
||||
// still queued or running. The catch-up scheduler uses this to avoid
|
||||
// dispatching a duplicate backup alongside one already in flight
|
||||
// (hosts.current_job_id is not maintained, so this is the authoritative
|
||||
// in-flight check).
|
||||
func (s *Store) HasActiveBackupJob(ctx context.Context, hostID string) (bool, error) {
|
||||
var exists bool
|
||||
err := s.db.QueryRowContext(ctx,
|
||||
`SELECT EXISTS(SELECT 1 FROM jobs WHERE host_id = ? AND kind = 'backup' AND status IN ('queued','running'))`,
|
||||
hostID).Scan(&exists)
|
||||
if err != nil {
|
||||
return false, fmt.Errorf("store: has active backup job: %w", err)
|
||||
}
|
||||
return exists, nil
|
||||
}
|
||||
|
||||
// HasJobOfKind reports whether any job of the given kind exists for
|
||||
// this host, regardless of status. Used by the auto-init path on
|
||||
// agent hello to decide whether to dispatch a fresh `restic init` —
|
||||
|
||||
@@ -1,6 +0,0 @@
|
||||
-- 0024: distinguish always-on (24x7 server) hosts from intermittent
|
||||
-- hosts (laptops/workstations that legitimately sleep). Default 1 so
|
||||
-- every existing and future host keeps today's offline/alert
|
||||
-- semantics unless explicitly opted out. Column-level ALTER per the
|
||||
-- repo's migration rules (no table rebuild — hosts has inbound FKs).
|
||||
ALTER TABLE hosts ADD COLUMN always_on INTEGER NOT NULL DEFAULT 1;
|
||||
@@ -99,12 +99,6 @@ type Host struct {
|
||||
// agent-side message when RepoStatus == "init_failed".
|
||||
RepoStatus string
|
||||
RepoStatusError string
|
||||
|
||||
// AlwaysOn is true for 24x7 server hosts (the default). When false
|
||||
// the host is intermittent (laptop/workstation): offline alerts are
|
||||
// suppressed, the UI shows an "asleep" state, and a missed backup is
|
||||
// caught up ~1 min after reconnect. See the always-on-host-mode spec.
|
||||
AlwaysOn bool
|
||||
}
|
||||
|
||||
// Schedule is now intentionally slim: cron + which groups + enabled.
|
||||
|
||||
@@ -13,8 +13,4 @@ var (
|
||||
// Commit is the short git SHA. Informational only; surfaced via
|
||||
// /api/version but not used for any comparison.
|
||||
Commit = ""
|
||||
|
||||
// Date is the RFC3339 build timestamp. Informational only; printed
|
||||
// by `--version` but not used for any comparison.
|
||||
Date = "unknown"
|
||||
)
|
||||
|
||||
@@ -310,7 +310,7 @@ Sizes: **S** = under a day, **M** = 1–3 days, **L** = 3–7 days.
|
||||
> **Sweep verified (smoke env):** admin adds operator → setup link generated → curl-as-new-user fetches /setup (200, page shows username) → POSTs password → 303 to / + Set-Cookie → operator authenticated → 200 on /, 200 on /settings/account, **403 on /settings/users** (admin-only) → admin disables user → operator's next request is **401** + session row count drops to 0 → audit log shows `user.created` + `user.setup_completed` for the cycle. All 26 implementation tasks landed; full `go test ./...` green.
|
||||
- [x] **P4-05** (L) OIDC login (generic provider config, group → role mapping)
|
||||
|
||||
> **As shipped (2026-05-05):** Authorization Code + PKCE (S256) against any OIDC IdP advertising standard discovery. Config is YAML+env (`oidc.issuer`, `oidc.client_id`, `oidc.client_secret`/`_file`, `oidc.role_claim` default `groups`, `oidc.role_mapping`, `oidc.display_name`, `oidc.redirect_url`); empty issuer → OIDC disabled, no routes mounted. Migration 0019 adds `users.auth_source`/`oidc_subject` (partial unique index on `oidc_subject`), `sessions.id_token`, and a small `oidc_state` table for state+verifier round-trip (cleaned up every alert tick, 5 min TTL). Login page renders **Sign in with `<display_name>`** above the local form when OIDC is enabled; the SSO button kicks off a 303 to the IdP with state + S256 code_challenge persisted server-side. Callback verifies ID token, fetches `/userinfo` to merge claims (Authelia / many IdPs only put `sub` in the ID token and surface `preferred_username`/`email`/`groups` from userinfo), maps the first matching group to a role; **no match → deny banner**, no row created, audit `user.oidc_login_blocked`. Username-collision with an existing local user → same deny path with `username_taken`. New user → JIT-provisioned with `auth_source='oidc'`, `oidc_subject=<sub>`, `password_hash=''`. Returning user → looked up by `oidc_subject` (stable when usernames change at the IdP), role + email refreshed on every login. Local password login is rejected for `auth_source='oidc'` users. Logout posts to `/logout` and, when the IdP advertised `end_session_endpoint`, follows up with RP-initiated logout (carries `id_token_hint` + `post_logout_redirect_uri=BaseURL`); when not advertised (Authelia in our smoke env), the local session is cleared and the browser lands on `/login`. Users list shows a small **oidc** chip beside enabled/disabled; the edit page disables username/email/role for OIDC users (server-side guard mirrors UI, returns 403). Force-logout, disable, and the last-admin guard from P4-04 all still apply. 
**Live Authelia sweep verified all four paths against local auth:** rm-admin → admin role + JIT row + chip + readonly edit; rm-operator → operator JIT, 403 on `/settings/users`; rm-viewer → viewer JIT, 403 on `/hosts/new`; rm-other (group not in role_mapping) → no_role_match banner, no row created, audit logged. Returning rm-admin login resolved to the same row by sub. Screenshots in `_diag/p4-05-sweep/`. Out-of-scope and on Phase 6 candidate list: refresh tokens, back-channel logout, multiple providers, post-login PKCE for the cookie itself.
|
||||
> **As shipped (2026-05-05):** Authorization Code + PKCE (S256) against any OIDC IdP advertising standard discovery. Config is YAML+env (`oidc.issuer`, `oidc.client_id`, `oidc.client_secret`/`_file`, `oidc.role_claim` default `groups`, `oidc.role_mapping`, `oidc.display_name`, `oidc.redirect_url`); empty issuer → OIDC disabled, no routes mounted. Migration 0019 adds `users.auth_source`/`oidc_subject` (partial unique index on `oidc_subject`), `sessions.id_token`, and a small `oidc_state` table for state+verifier round-trip (cleaned up every alert tick, 5 min TTL). Login page renders **Sign in with `<display_name>`** above the local form when OIDC is enabled; the SSO button kicks off a 303 to the IdP with state + S256 code_challenge persisted server-side. Callback verifies ID token, fetches `/userinfo` to merge claims (Authelia / many IdPs only put `sub` in the ID token and surface `preferred_username`/`email`/`groups` from userinfo), maps the first matching group to a role; **no match → deny banner**, no row created, audit `user.oidc_login_blocked`. Username-collision with an existing local user → same deny path with `username_taken`. New user → JIT-provisioned with `auth_source='oidc'`, `oidc_subject=<sub>`, `password_hash=''`. Returning user → looked up by `oidc_subject` (stable when usernames change at the IdP), role + email refreshed on every login. Local password login is rejected for `auth_source='oidc'` users. Logout posts to `/logout` and, when the IdP advertised `end_session_endpoint`, follows up with RP-initiated logout (carries `id_token_hint` + `post_logout_redirect_uri=BaseURL`); when not advertised (Authelia in our smoke env), the local session is cleared and the browser lands on `/login`. Users list shows a small **oidc** chip beside enabled/disabled; the edit page disables username/email/role for OIDC users (server-side guard mirrors UI, returns 403). Force-logout, disable, and the last-admin guard from P4-04 all still apply. 
**Live Authelia sweep verified all four paths against `https://auth.dcglab.co.uk`:** rm-admin → admin role + JIT row + chip + readonly edit; rm-operator → operator JIT, 403 on `/settings/users`; rm-viewer → viewer JIT, 403 on `/hosts/new`; rm-other (group not in role_mapping) → no_role_match banner, no row created, audit logged. Returning rm-admin login resolved to the same row by sub. Screenshots in `_diag/p4-05-sweep/`. Out-of-scope and on Phase 6 candidate list: refresh tokens, back-channel logout, multiple providers, post-login PKCE for the cookie itself.
|
||||
|
||||
- [x] **P4-07** (S) Per-host tags + dashboard filtering by tag
|
||||
|
||||
@@ -432,45 +432,8 @@ Sizes: **S** = under a day, **M** = 1–3 days, **L** = 3–7 days.
|
||||
> swap, helper `buildRepoTrendView` shared between page-load and
|
||||
> fragment endpoint). No new dependencies, no client JS, no agent
|
||||
> change. CI green; in-browser smoke walk-through pending operator.
|
||||
- [x] **P6-04** (M) Prometheus `/metrics` endpoint: per-host gauges (last backup timestamp, last backup status, repo size, snapshot count, agent online), server gauges (active alerts, build info), job duration histograms; protected by bearer token or IP allow-list. _(Was P4-08.)_
|
||||
- [x] **P6-05** (S) Document Prometheus integration + sample Grafana dashboard JSON. _(Was P4-09.)_
|
||||
|
||||
> **As shipped (2026-05-07, branch `p6-04-05-prometheus-metrics`):**
|
||||
> Spec `docs/superpowers/specs/2026-05-07-p6-04-05-prometheus-metrics-design.md`,
|
||||
> plan `docs/superpowers/plans/2026-05-07-p6-04-05-prometheus-metrics.md`.
|
||||
> New `internal/server/metrics` package emits the legacy
|
||||
> `text/plain; version=0.0.4` exposition format directly — no
|
||||
> `prometheus/client_golang` dependency, matching the repo's
|
||||
> "no Tailwind, no Node" minimal-deps style. `/metrics` is **opt-in**:
|
||||
> `RM_METRICS_TOKEN` and/or `RM_METRICS_TRUSTED_CIDR` must be set or
|
||||
> the route isn't mounted at all (404). When both are set, both must
|
||||
> pass; either alone gates access. Token compare is constant-time.
|
||||
> CIDR check honours `X-Forwarded-For` only when the immediate hop
|
||||
> is a configured `RM_TRUSTED_PROXY` (mirrors the existing realIP
|
||||
> resolution).
|
||||
>
|
||||
> **Metrics:** per-host gauges (`rm_host_agent_online`,
|
||||
> `rm_host_last_backup_timestamp_seconds`, `rm_host_last_backup_success`,
|
||||
> `rm_host_repo_size_bytes`, `rm_host_snapshot_count`,
|
||||
> `rm_host_open_alerts`, `rm_host_repo_status`); server gauges
|
||||
> (`rm_hosts_total`, `rm_hosts_online`, `rm_active_alerts{severity}`,
|
||||
> `rm_build_info{version,commit,go_version}`); histogram
|
||||
> `rm_job_duration_seconds_bucket{kind,status,le}` with buckets
|
||||
> `1, 5, 30, 60, 300, 1800, 3600, 21600, 86400, +Inf`.
|
||||
> Histogram is in-memory; observations come from the existing
|
||||
> `MsgJobFinished` branch in `internal/server/ws/handler.go`.
|
||||
>
|
||||
> **Docs:** `docs/prometheus.md` covers enable + scrape config +
|
||||
> metric reference + dashboard import. **Dashboard:**
|
||||
> `deploy/grafana/restic-manager-dashboard.json` — six panels
|
||||
> (fleet status, open alerts, backups failing, hosts table, repo
|
||||
> size over time, job-duration p95). Schema 39, single Prometheus
|
||||
> datasource variable.
|
||||
>
|
||||
> **Tests:** golden-render + concurrent-observe + bucket-boundary
|
||||
> in the metrics package; auth matrix (no auth → 404; token
|
||||
> missing/wrong/right; CIDR matching/non-matching; token AND CIDR)
|
||||
> in the HTTP layer.
|
||||
- [ ] **P6-04** (M) Prometheus `/metrics` endpoint: per-host gauges (last backup timestamp, last backup status, repo size, snapshot count, agent online), server gauges (active alerts, build info), job duration histograms; protected by bearer token or IP allow-list. _(Was P4-08.)_
|
||||
- [ ] **P6-05** (S) Document Prometheus integration + sample Grafana dashboard JSON. _(Was P4-09.)_
|
||||
|
||||
### Phase 6 acceptance
|
||||
|
||||
@@ -480,11 +443,11 @@ Sizes: **S** = under a day, **M** = 1–3 days, **L** = 3–7 days.
|
||||
|
||||
## Cross-cutting / ongoing
|
||||
|
||||
- [x] **X-01** Keep CHANGELOG.md updated (Keep-a-Changelog format). ✅ Landed: `CHANGELOG.md` at the repo root with a v1.0.0 entry summarising what each phase shipped, plus an empty Unreleased section to accumulate changes after the tag. Updated on each release going forward.
|
||||
- [ ] **X-01** Keep CHANGELOG.md updated (Keep-a-Changelog format)
|
||||
- [ ] **X-02** Track restic version compatibility matrix
|
||||
- [ ] **X-03** Periodic dependency updates (`dependabot` or `renovate`)
|
||||
- [x] **X-04** Threat-model review at end of each phase. ✅ Landed: `docs/threat-model.md` covering assets, actors, attack surfaces (bootstrap, local accounts, OIDC, agent enrolment, agent ↔ server WS, credential lifecycle, restore, audit log, self-update channel), residual risks, and explicit out-of-scope items. Reviewed against v1.0.0 surface; refresh on each tagged release.
|
||||
- [x] **X-05** Proper first-run onboarding UI. ✅ Landed: bootstrap form already lives at `/bootstrap` and `/login` redirects to it when no users exist (so an operator hitting the server in a browser is guided into setup automatically — the form takes username + password only, no token field needed because the server holds the in-memory token and applies it server-side). Improvements added here: at first-run startup the server now prints a clickable `$RM_BASE_URL/bootstrap` URL (or a fallback message when `RM_BASE_URL` is unset) alongside the existing one-shot token for headless `/api/bootstrap` use; the bootstrap form's password field shows an explicit "Minimum 12 characters" hint so the rule is visible before submission instead of failing on submit.
|
||||
- [ ] **X-04** Threat-model review at end of each phase
|
||||
- [ ] **X-05** Proper first-run onboarding UI: admin shouldn't need to `curl` `/api/bootstrap` by hand. Render the bootstrap form on the same login page (extra "setup token" field shown only while no admin user exists, hidden after); on submit POST to `/api/bootstrap`, then drop straight into a session. Surface the one-time token from the server log somewhere copy-able (or print a clickable URL with the token in the query string at first-run). Also: relax the 12-char password floor for the first-run path or document it in the form so `admin` doesn't silently fail validation.
|
||||
|
||||
---
|
||||
|
||||
@@ -496,10 +459,6 @@ Sizes: **S** = under a day, **M** = 1–3 days, **L** = 3–7 days.
|
||||
- [x] **NS-01** Admin-driven host deletion. ✅ Landed: store `DeleteHost` (FK cascade revokes the agent bearer along with everything else), admin-band `POST /hosts/{id}/delete`, danger-zone form on host detail with hostname-confirm, audit `host.deleted`, live WS connection closed pre-delete. Original scope below for reference. No UI or API surface today — once a host is enrolled the only way to remove it is hand-editing SQLite, which then cascades through schedules/jobs/snapshots/source-groups via the FK chain. Needs: store-level `DeleteHost` + cascade audit, admin-band `DELETE /api/hosts/{id}` and form-post variant, confirm-modal on the host-detail page, audit entry, and a decision on whether to also revoke the agent's bearer (recommend: yes, so a re-installed host comes back through the normal pending-host accept flow).
|
||||
- [x] **NS-02** Recoverable enrollment-token UX. ✅ Landed: `Store.ListOutstandingEnrollmentTokens` + `DeleteEnrollmentToken`; outstanding-tokens panel on the Add-host page (short hash, redacted repo URL, created/expires) with per-row Regenerate (revokes old hash, mints fresh raw token preserving repo creds + initial paths, 303s to `/hosts/pending/{newToken}`) and Revoke (delete + audit). Audit actions `enrollment_token.regenerated` / `enrollment_token.revoked`. Original scope below. Today `POST /hosts/new` mints a token and 303s to `/hosts/pending/{token}`; if the operator closes that tab the install snippet is lost and there's no UI surface to find it again — the row sits in `enrollment_tokens` until TTL expiry, invisible. Needs: store-level `ListOutstandingEnrollmentTokens` returning `(token_hash, created_at, expires_at, repo_url_redacted, initial_paths, attached_host_id_or_null)`; a small list section on the Add-host page (and/or Settings) showing outstanding tokens with created/expires-in and the redacted repo URL; admin-band `POST /api/enrollment-tokens/{id}/regenerate` (revokes the old hash, mints a fresh raw token, re-uses the original attachments — same pattern as the user-setup-token regenerate flow) and `POST /api/enrollment-tokens/{id}/revoke`. Choose regenerate over "show original token" because we only persist hashes, never raw tokens.
|
||||
- [x] **NS-03** Auto-init repo on first onboard, surface credential failures eagerly. ✅ Landed: migration 0020 adds `hosts.repo_status` (`unknown`/`ready`/`init_failed`) + `repo_status_error`; WS handler projects every init job's terminal state onto the host row (with idempotent "config file already exists" → ready); creds-save handlers (UI + JSON API) reset status to `unknown` and dispatch a fresh init when the agent is online; new `/hosts/{id}/repo/probe` retry endpoint and a status banner on the repo page. Remainder of original scope below (surface credential failures eagerly). Today the operator types repo URL + creds during Add-host and the credentials are pushed to the agent on connect, but no `restic init`/probe runs until the first scheduled job — so a typo in the password or a wrong URL goes undetected for hours/days, manifesting as a silent missed-backup. Wanted behaviour: when the host completes enrolment (or when an admin saves new repo creds), the server dispatches a one-shot probe job that runs `restic cat config` (cheap, repo-existence + creds-validity in one call). On `Is there already a config file? unable to open config file` → run `restic init`. On success → mark the host's repo as ready. On any other error (network, auth, fingerprint) → surface a panel-level error on the host detail page and audit the failure, leaving the host in an "init pending" state with a "Retry" button. Needs: a new `JobKind` (or piggyback on an existing one) for the probe, server-side state on the host row (`repo_status` enum: `unknown`/`ready`/`init_pending`/`init_failed`), UI panel that shows the state, and clear copy on the Add-host page so the operator knows the save isn't fire-and-forget.
|
||||
- [x] **NS-05** Drop redundant `actions/setup-go` from `.gitea/workflows/ci.yml`. ✅ Already gone — verified `.gitea/workflows/ci.yml` has zero `actions/setup-go@v5` invocations and no `GO_VERSION` env; the file's header comment now documents that the runner image (`gitea.dcglab.co.uk/steve/ci-runner-go`) is the single source of truth for the Go version. Closing as done; no further code change needed.
|
||||
- [x] **NS-06** Remove the permanently-disabled "Run backup now" button from `web/templates/partials/host_chrome.html`. ✅ Landed: dropped the disabled tombstone button from the host header action row; only "Edit credentials" + the ⋯ menu remain. Per-source-group Run-now on `/hosts/{id}/sources` is the only path now. No e2e change needed — `smoke.spec.ts` does not assert on host_chrome's button row.
|
||||
- [x] **NS-07** Relative timestamps go stale on long-open tabs. ✅ Landed: `formatRelTime` now wraps its label in `<time data-rel-ts=…>` and both layouts (`base.html`, `chromeless.html`) carry a small ticker that re-renders every 30s, so a page rendered an hour ago no longer keeps showing "2h ago" when the wall-clock truth is "3h ago". Covered by `funcs_test.go`. The bug: every relative label was computed once at server render and never updated client-side, so a job-detail page left open drifted further from reality the longer it sat.
|
||||
- [x] **NS-08** Always-On vs intermittent host mode. ✅ Landed: a host can now be marked not-always-on (laptop/workstation) so it stops generating offline-alert noise when it legitimately sleeps. Migration 0024 adds `hosts.always_on` (default 1 = today's 24×7 behaviour; intermittent is strictly opt-in). The alert engine suppresses `agent_offline` for intermittent hosts and instead wires up the previously-dead `stale_schedule` alert for them — raised at a 7-day global threshold when the host has an enabled schedule and a stale last backup, resolved on the next successful backup. A new server-side catch-up scheduler (`internal/server/http/catchup.go`) arms on agent hello and fires from the existing 30s pending-drain tick: ~60s after an intermittent host reconnects it dispatches a backup for any enabled schedule whose window elapsed while asleep (overdue = `cron.Next(lastBackup) <= now`, reusing the shared `cronParser`), guarded against firing when the host bounced offline, flipped to always-on, or already has a job running. Overdue is measured against the per-host `LastBackupAt` (exact for the common single-schedule laptop; a known coarseness for multi-cadence hosts, documented in code). Operator toggle via `POST /hosts/{id}/mode` (audited `host.mode_updated`), which also clears open offline/staleness alerts so the next sweep re-settles. UI: intermittent offline hosts render a calm grey `asleep · <relTime> · will catch up on return` state (new `.dot-asleep`) instead of red "offline"; a `24×7` chip shows only for always-on hosts; a "presence" inline toggle on the host header. Design + plan in `docs/specs/2026-06-15-always-on-host-mode-design.md` and `docs/plans/2026-06-15-always-on-host-mode.md`. Spec §2 (online/offline mechanics) deliberately left untouched. Out of scope for v1: per-host staleness thresholds, continuous (non-reconnect) overdue evaluation, per-schedule last-success tracking.
|
||||
- [x] **NS-04** Dashboard parity with the alerts screen: live refresh, column sorting, filters. ✅ Landed: `/` now parses `q`/`status`/`repo_status`/`tag`/`sort`/`dir` query params (round-trip durable for bookmarks); table is wrapped in an `id="hosts-table"` htmx live-poll matching the alerts cadence (5s, gated on `document.visibilityState` and `localStorage.rm-dashboard-live`); filter row above the table with hostname free-text + status + repo_status selects + tag chips + clear; column headers (Host / OS · arch / Last backup / Repo size / Snapshots) are clickable links that toggle direction on the active column; pure-Go sort+filter pipeline covered by `dashboard_filter_test.go`. Original scope below (live refresh, column sorting, filters). The host list is currently a static render — operators have to reload to see new heartbeats / job state changes. Mirror the alerts pattern (`web/templates/pages/alerts.html` uses `hx-trigger="every 5s [document.visibilityState==='visible' && localStorage.getItem('rm-alerts-live')!=='off']"` plus a Live/Off toggle so background tabs and explicit-off don't burn server cycles). Add: server-side sort on every meaningful column (name, OS, last-backup time, last-backup status, agent online/offline, restic version, tags), and a small filter row above the table — at minimum free-text on hostname, status (online/offline/never-seen), and tag chips. Columns + filter state should round-trip through query string so a bookmarked / shared URL is durable. Re-use the `host_row` partial that already exists so the live-refresh swap is a clean OOB swap, not a full table re-render.
|
||||
|
||||
---
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -70,7 +70,6 @@
|
||||
.dot-online { background: var(--ok); box-shadow: 0 0 0 3px color-mix(in oklch, var(--ok), transparent 80%); }
|
||||
.dot-degraded { background: var(--warn); box-shadow: 0 0 0 3px color-mix(in oklch, var(--warn), transparent 80%); }
|
||||
.dot-offline { background: var(--off); }
|
||||
.dot-asleep { background: var(--ink-fade); opacity: 0.6; }
|
||||
.dot-failed { background: var(--bad); box-shadow: 0 0 0 3px color-mix(in oklch, var(--bad), transparent 80%); }
|
||||
.pulse { animation: rm-pulse 2.4s ease-in-out infinite; }
|
||||
@keyframes rm-pulse {
|
||||
@@ -196,17 +195,6 @@
|
||||
}
|
||||
.tag-removable .x { color: var(--ink-fade); cursor: pointer; padding-left: 2px; }
|
||||
|
||||
/* ---------- header meta groups (boxed tags / presence pills) ---------- */
|
||||
.meta-group {
|
||||
display: inline-flex; align-items: center; gap: 6px;
|
||||
font-size: 11px; line-height: 1; padding: 3px 9px;
|
||||
border: 1px solid var(--line); border-radius: 5px;
|
||||
background: color-mix(in oklch, var(--ink), transparent 95%);
|
||||
}
|
||||
.meta-group .meta-label { color: var(--ink-mute); }
|
||||
.meta-group .meta-val { color: var(--ink-mid); text-decoration: none; }
|
||||
.meta-group a.meta-val:hover { color: var(--ink); text-decoration: underline; }
|
||||
|
||||
/* ---------- form fields ---------- */
|
||||
.field-label { font-size: 12px; color: var(--ink-mid); margin-bottom: 6px; display: block; }
|
||||
.field-help { font-size: 12px; color: var(--ink-mute); margin-top: 6px; line-height: 1.55; }
|
||||
|
||||
@@ -20,37 +20,6 @@
|
||||
|
||||
{{template "toast" .}}
|
||||
|
||||
<script>
|
||||
// Tick <time data-rel-ts> labels so long-open tabs don't freeze
|
||||
// (e.g. a job page rendered an hour ago kept showing "2h ago" when
|
||||
// the truth was "3h ago"). Buckets must match relTimeLabel in
|
||||
// internal/server/ui/funcs.go.
|
||||
(function () {
|
||||
function label(ms) {
|
||||
var suffix = 'ago';
|
||||
if (ms < 0) { ms = -ms; suffix = 'from now'; }
|
||||
var s = Math.floor(ms / 1000);
|
||||
if (s < 60) return s + 's ' + suffix;
|
||||
var m = Math.floor(s / 60);
|
||||
if (m < 60) return m + 'm ' + suffix;
|
||||
var h = Math.floor(m / 60);
|
||||
if (h < 24) return h + 'h ' + suffix;
|
||||
var d = Math.floor(h / 24);
|
||||
if (d < 7) return d + 'd ' + suffix;
|
||||
return Math.floor(d / 7) + 'w ' + suffix;
|
||||
}
|
||||
function tick() {
|
||||
var now = Date.now();
|
||||
document.querySelectorAll('time[data-rel-ts]').forEach(function (el) {
|
||||
var t = Date.parse(el.getAttribute('data-rel-ts'));
|
||||
if (!isNaN(t)) el.textContent = label(now - t);
|
||||
});
|
||||
}
|
||||
tick();
|
||||
setInterval(tick, 30000);
|
||||
})();
|
||||
</script>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
{{end}}
|
||||
|
||||
@@ -11,34 +11,6 @@
|
||||
</head>
|
||||
<body class="min-h-screen flex flex-col">
|
||||
{{block "content" .}}{{end}}
|
||||
<script>
|
||||
// See base.html for rationale; chromeless pages (e.g. pending host)
|
||||
// also use the relTime helper, so they need the same ticker.
|
||||
(function () {
|
||||
function label(ms) {
|
||||
var suffix = 'ago';
|
||||
if (ms < 0) { ms = -ms; suffix = 'from now'; }
|
||||
var s = Math.floor(ms / 1000);
|
||||
if (s < 60) return s + 's ' + suffix;
|
||||
var m = Math.floor(s / 60);
|
||||
if (m < 60) return m + 'm ' + suffix;
|
||||
var h = Math.floor(m / 60);
|
||||
if (h < 24) return h + 'h ' + suffix;
|
||||
var d = Math.floor(h / 24);
|
||||
if (d < 7) return d + 'd ' + suffix;
|
||||
return Math.floor(d / 7) + 'w ' + suffix;
|
||||
}
|
||||
function tick() {
|
||||
var now = Date.now();
|
||||
document.querySelectorAll('time[data-rel-ts]').forEach(function (el) {
|
||||
var t = Date.parse(el.getAttribute('data-rel-ts'));
|
||||
if (!isNaN(t)) el.textContent = label(now - t);
|
||||
});
|
||||
}
|
||||
tick();
|
||||
setInterval(tick, 30000);
|
||||
})();
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
{{end}}
|
||||
|
||||
@@ -36,7 +36,6 @@
|
||||
<label class="field-label" for="bs-pw">Password</label>
|
||||
<input id="bs-pw" name="password" type="password" class="field"
|
||||
required minlength="12" autocomplete="new-password" />
|
||||
<div class="field-help">Minimum 12 characters.</div>
|
||||
</div>
|
||||
<div>
|
||||
<label class="field-label" for="bs-pw2">Confirm password</label>
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
|
||||
{{$page := .Page}}
|
||||
{{template "crit_banner" .Page}}
|
||||
{{if and (eq $page.HostCount 0) (eq (len $page.PendingHosts) 0)}}
|
||||
{{if eq $page.HostCount 0}}
|
||||
|
||||
{{/* ---------- empty state ---------- */}}
|
||||
<div class="pt-14 pb-24">
|
||||
|
||||
@@ -34,32 +34,17 @@
|
||||
{{else if eq $host.Status "degraded"}}
|
||||
<span class="dot dot-degraded"></span>
|
||||
{{else if eq $host.Status "offline"}}
|
||||
{{if $host.AlwaysOn}}
|
||||
<span class="dot dot-offline"></span>
|
||||
{{else}}
|
||||
<span class="dot dot-asleep"></span>
|
||||
{{end}}
|
||||
{{else}}
|
||||
<span class="dot dot-failed"></span>
|
||||
{{end}}
|
||||
<h1 class="mono text-[26px] font-medium tracking-[0.005em] text-ink">{{$host.Name}}</h1>
|
||||
<div class="flex items-center gap-2.5">
|
||||
{{/* tags group pill — click the "tags" label to edit; the tag
|
||||
values still filter the dashboard by that tag. */}}
|
||||
<span class="meta-group">
|
||||
<span class="meta-label cursor-pointer hover:text-ink"
|
||||
<div class="flex gap-1.5 items-center">
|
||||
{{range $host.Tags}}<a href="/?tag={{.}}" class="tag" title="filter dashboard by this tag">{{.}}</a>{{end}}
|
||||
<button type="button" class="text-ink-fade text-[11px] hover:text-ink-mid whitespace-nowrap"
|
||||
style="padding: 2px 8px; border: 1px dashed var(--line); border-radius: 3px; cursor: pointer;"
|
||||
onclick="document.getElementById('tags-edit-{{$host.ID}}').classList.toggle('hidden')"
|
||||
title="Edit tags">tags</span>
|
||||
{{range $host.Tags}}<a href="/?tag={{.}}" class="meta-val" title="filter dashboard by this tag">{{.}}</a>{{end}}
|
||||
{{if not $host.Tags}}<span class="meta-val">—</span>{{end}}
|
||||
</span>
|
||||
{{/* presence group pill — click anywhere to edit. */}}
|
||||
<span class="meta-group cursor-pointer"
|
||||
onclick="document.getElementById('mode-edit-{{$host.ID}}').classList.toggle('hidden')"
|
||||
title="Change presence mode">
|
||||
<span class="meta-label">presence</span>
|
||||
<span class="meta-val">{{if $host.AlwaysOn}}24x7{{else}}Free{{end}}</span>
|
||||
</span>
|
||||
title="Edit tags">{{if $host.Tags}}edit tags{{else}}add tags{{end}}</button>
|
||||
</div>
|
||||
{{if gt $page.ScheduleVersion 0}}
|
||||
<span class="mono text-[11px] text-ink-mute ml-2">
|
||||
@@ -95,24 +80,6 @@
|
||||
</div>
|
||||
<div class="field-help">Comma-separated. Lowercased automatically.</div>
|
||||
</form>
|
||||
{{/* Presence-mode editor — hidden by default; toggled by clicking the
|
||||
"presence" pill in the host header. Checkbox present => always-on (24×7);
|
||||
unchecked => intermittent (laptop): no offline alerts, shows
|
||||
"asleep", auto-catches-up a missed backup on reconnect. */}}
|
||||
<form id="mode-edit-{{$host.ID}}" method="post"
|
||||
action="/hosts/{{$host.ID}}/mode"
|
||||
class="hidden mt-3" style="max-width: 640px;">
|
||||
<label class="flex items-center gap-2 text-[12px] text-ink-mid">
|
||||
<input type="checkbox" name="always_on" value="on" {{if $host.AlwaysOn}}checked{{end}} />
|
||||
Always On — expected online 24×7
|
||||
</label>
|
||||
<div class="field-help">
|
||||
Uncheck for an intermittent host (laptop/workstation): it won't
|
||||
raise offline alerts when asleep, shows an "asleep" state, and
|
||||
catches up a missed backup ~1 minute after it reconnects.
|
||||
</div>
|
||||
<button type="submit" class="btn btn-primary mt-2 whitespace-nowrap">Save presence</button>
|
||||
</form>
|
||||
<div class="flex items-center gap-3 mt-3 text-[13px] text-ink-mute">
|
||||
<span class="mono text-ink-mid">{{$host.OS}}/{{$host.Arch}}</span>
|
||||
<span class="text-ink-fade">·</span>
|
||||
@@ -121,17 +88,14 @@
|
||||
<span>restic <span class="mono text-ink-mid">{{if $host.ResticVersion}}{{$host.ResticVersion}}{{else}}—{{end}}</span></span>
|
||||
<span class="text-ink-fade">·</span>
|
||||
{{if eq $host.Status "offline"}}
|
||||
{{if $host.AlwaysOn}}
|
||||
<span>last seen <span class="mono text-ink-mid">{{relTime $host.LastSeenAt}}</span></span>
|
||||
{{else}}
|
||||
<span>asleep · last seen <span class="mono text-ink-mid">{{relTime $host.LastSeenAt}}</span> · will catch up on return</span>
|
||||
{{end}}
|
||||
{{else}}
|
||||
<span>online · last heartbeat <span class="mono text-ink-mid">{{relTime $host.LastSeenAt}}</span></span>
|
||||
{{end}}
|
||||
</div>
|
||||
</div>
|
||||
<div class="flex items-center gap-2">
|
||||
<button class="btn" disabled title="per-source-group Run-now lives on the Sources tab">Run backup now</button>
|
||||
<button class="btn">Edit credentials</button>
|
||||
<button class="btn btn-ghost text-base px-2.5">⋯</button>
|
||||
</div>
|
||||
|
||||
@@ -8,11 +8,7 @@
|
||||
{{- else if eq $h.Status "degraded" -}}
|
||||
<span class="dot dot-degraded"></span>
|
||||
{{- else if eq $h.Status "offline" -}}
|
||||
{{- if $h.AlwaysOn -}}
|
||||
<span class="dot dot-offline"></span>
|
||||
{{- else -}}
|
||||
<span class="dot dot-asleep"></span>
|
||||
{{- end -}}
|
||||
{{- else -}}
|
||||
<span class="dot dot-failed"></span>
|
||||
{{- end -}}
|
||||
@@ -30,11 +26,7 @@
|
||||
{{- else if eq (deref $h.LastBackupStatus) "cancelled" -}}
|
||||
<span class="text-warn">cancelled</span> · <span class="mono">{{relTime $h.LastBackupAt}}</span>
|
||||
{{- else if eq $h.Status "offline" -}}
|
||||
{{- if $h.AlwaysOn -}}
|
||||
<span class="text-ink-mute">last seen <span class="mono">{{relTime $h.LastSeenAt}}</span></span>
|
||||
{{- else -}}
|
||||
<span class="text-ink-mute">asleep · <span class="mono">{{relTime $h.LastSeenAt}}</span> · will catch up on return</span>
|
||||
{{- end -}}
|
||||
{{- else -}}
|
||||
<span class="text-ink-fade italic">never run</span>
|
||||
{{- end -}}
|
||||
@@ -61,7 +53,7 @@
|
||||
</div>
|
||||
<div class="text-right row-action">
|
||||
{{- if eq $h.Status "offline" -}}
|
||||
<span class="mono text-xs text-ink-fade">{{if $h.AlwaysOn}}offline{{else}}asleep{{end}}</span>
|
||||
<span class="mono text-xs text-ink-fade">offline</span>
|
||||
{{- else if $h.CurrentJobID -}}
|
||||
<a href="/jobs/{{deref $h.CurrentJobID}}" class="btn btn-ghost">View job →</a>
|
||||
{{- else if .RunAllScheduleID -}}
|
||||
|
||||
@@ -7,5 +7,5 @@
|
||||
Hidden entirely when UpdateAvailable is false.
|
||||
*/}}
|
||||
{{define "host_update_chip"}}
|
||||
{{if .UpdateAvailable}}<span class="update-chip" title="Agent at {{.Host.AgentVersion}}; server at {{.TargetVersion}}">out of date</span>{{end}}
|
||||
{{if .UpdateAvailable}}<span class="update-chip" title="Agent at {{.Host.AgentVersion}}; server at {{.TargetVersion}}">out of date · {{.Host.AgentVersion}} → {{.TargetVersion}}</span>{{end}}
|
||||
{{end}}
|
||||
|
||||
@@ -25,6 +25,7 @@
|
||||
<div class="max-w-[1280px] mx-auto px-8 flex items-end justify-between">
|
||||
<nav class="flex items-end">
|
||||
<a href="/" class="nav-tab {{if eq .Active "dashboard"}}active{{end}}">Dashboard</a>
|
||||
<a href="/repos" class="nav-tab {{if eq .Active "repos"}}active{{end}}">Repos</a>
|
||||
<a href="/alerts" class="nav-tab {{if eq .Active "alerts"}}active{{end}}">Alerts{{if gt .OpenAlerts 0}} <span class="tag tag-critical mono ml-1">{{.OpenAlerts}}</span>{{end}}</a>
|
||||
<a href="/audit" class="nav-tab {{if eq .Active "audit"}}active{{end}}">Audit</a>
|
||||
<a href="/settings" class="nav-tab {{if eq .Active "settings"}}active{{end}}">Settings</a>
|
||||
|
||||
Reference in New Issue
Block a user